Loading cuda_fft.cu +104 −134 Original line number Diff line number Diff line #include <math.h> #include <stdlib.h> #include <stdio.h> #include <cufft.h> #include <cufftw.h> #include "w-stacking.h" #include "/cineca/prod/opt/compilers/hpc-sdk/2023/binary/Linux_ppc64le/23.1/math_libs/11.8/include/cufftmp/cufftMp.h" #include <mpi.h> #include <cuda_runtime.h> #include <complex.h> #include "cuComplex.h" /* #ifdef __CUDACC__ double __device__ #else double #endif */ #include "w-stacking.h" #include <time.h> #ifdef __CUDACC__ __global__ void write_grid( void cuda_fft( int num_w_planes, int grid_size_x, int grid_size_y, int xaxis, int yaxis, cufftDoubleComplex *fftwgrid, double *grid) double * grid, double * gridss, MPI_Comm comm) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } #endif #ifdef __CUDACC__ cudaError_t mmm; cufftResult_t status; /* #ifdef __CUDACC__ __global__ void write_gridss( int num_w_planes, int xaxis, int yaxis, cufftDoubleComplex *fftwgrid, double *gridss, int grid_size_x, int grid_size_y) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); cufftDoubleComplex *fftwgrid; fftwgrid = (cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*yaxis*grid_size_x); double norm = 1.0/(double)(grid_size_x*grid_size_y); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } #endif */ // Plan creation cufftHandle plan; status = cufftCreate(&plan); if (status != CUFFT_SUCCESS) {printf("!!! cufftCreate ERROR %d !!!\n", status);} cudaStream_t stream{}; cudaStreamCreate(&stream); status = cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm); if (status != CUFFT_SUCCESS) {printf("!!! cufftMpAttachComm ERROR %d !!!\n", status);} void cuda_fft( int num_w_planes, int grid_size_x, int grid_size_y, int xaxis, int yaxis, double * grid, double * gridss) { #ifdef __CUDACC__ status = cufftSetStream(plan, stream); if (status != CUFFT_SUCCESS) {printf("!!! cufftSetStream ERROR %d !!!\n", status);} cufftDoubleComplex *fftwgrid; cufftDoubleComplex *fftwgrid_g; cudaError_t mmm; cufftResult_t status; size_t workspace; status = cufftMakePlan2d(plan, grid_size_x, grid_size_y, CUFFT_Z2Z, &workspace); if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePlan2d ERROR %d !!!\n", status);} cudaDeviceSynchronize(); double * grid_g; //double * gridss_g; // Descriptor creation mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); mmm=cudaMalloc(&grid_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); //mmm=cudaMalloc(&gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); mmm=cudaMemcpy(grid_g, grid, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); cudaLibXtDesc *fftwgrid_g; cudaLibXtDesc *fftwgrid_g2; clock_t start, end, startcu, endcu; double cufft_exec_time; struct timespec begin, finish, begin0, begink, finishk, begincu, finishcu; status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);} cudaDeviceSynchronize(); status = cufftXtMalloc(plan, &fftwgrid_g2, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc 2 ERROR %d !!!\n", status);} cudaDeviceSynchronize(); // Define the CUDA set up //int Nth = NTHREADS; long Nbl = (long)(xaxis+yaxis*xaxis); //if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); cufftHandle plan; cufftCreate(&plan); cufftPlan1d(&plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1); printf("plan creation\n"); double norm = 1.0/(double)(grid_size_x*grid_size_y); long fftwindex = 0; long fftwindex2D = 0; double norm = 1.0/(double)(grid_size_x*grid_size_y); write_grid<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, grid_g); cudaDeviceSynchronize(); /* // Grid composition for (int iw=0; iw<num_w_planes; iw++) { Loading @@ -128,44 +90,42 @@ void cuda_fft( fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } */ //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137].x); //mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); //if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);} //printf("memcpy fftwgrid to fftwgrid_g\n"); cudaDeviceSynchronize(); mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize ERROR %d !!!\n", mmm);} status = cufftXtMemcpy(plan, fftwgrid_g, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy htd fftwgrid_g ERROR %d !!!\n", status);} printf("do the transform for each w-plane\n"); cudaDeviceSynchronize(); clock_gettime(CLOCK_MONOTONIC, &begincu); startcu = clock(); status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExecDescriptor ERROR %d !!!\n", status);} status=cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); endcu = clock(); clock_gettime(CLOCK_MONOTONIC, &finishcu); cufft_exec_time = ((double) (endcu-startcu)) / CLOCKS_PER_SEC; printf("Time for cuFFT plan execution is %f sec\n", cufft_exec_time); if (status != CUFFT_SUCCESS) {printf("cufftexec ERROR %d\n",status);} mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); //write_gridss<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, gridss_g, grid_size_x, grid_size_y); //cudaDeviceSynchronize(); status = cufftXtMemcpy(plan, fftwgrid_g2, fftwgrid_g, CUFFT_COPY_DEVICE_TO_DEVICE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);} //mmm=cudaMemcpy(gridss, gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);} mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);} cudaDeviceSynchronize(); status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_g2, CUFFT_COPY_DEVICE_TO_HOST); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dth fftwgrid ERROR %d !!!\n", status);} for (int iw=0; iw<num_w_planes; iw++) { // save the transformed w-plane for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) Loading @@ -176,13 +136,23 @@ void cuda_fft( gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } } } cufftDestroy(plan); mmm = cudaFree(fftwgrid_g); mmm = cudaFree(grid_g); //mmm = cudaFree(gridss_g); #endif status = cufftXtFree(fftwgrid_g); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtFree ERROR %d !!!\n", status);} status = cufftXtFree(fftwgrid_g2); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtFree ERROR %d !!!\n", status);} status = cufftDestroy(plan); if (status != CUFFT_SUCCESS) {printf("!!! cufftDestroy fftwgrid ERROR %d !!!\n", status);} cudaStreamDestroy(stream); cudaDeviceSynchronize(); #endif // __CUDACC__ } w-stacking-fftw.c +53 −42 Original line number Diff line number Diff line Loading @@ -8,6 +8,9 @@ #include <mpi.h> #ifdef USE_FFTW #include <fftw3-mpi.h> /*#ifdef CUDA_FFT #endif*/ #endif #endif #include <omp.h> Loading Loading @@ -53,6 +56,7 @@ int main(int argc, char * argv[]) FILE * pFile1; FILE * pFilereal; FILE * pFileimg; // Global filename to be composed char filename[1000]; Loading Loading @@ -80,6 +84,7 @@ int main(int argc, char * argv[]) char srank[4]; char timingfile[FILENAMELENGTH] = "timings.dat"; // Visibilities related variables double * uu; double * vv; Loading Loading @@ -129,6 +134,8 @@ int main(int argc, char * argv[]) // Initialize FITS image parameters #ifdef FITSIO fitsfile *fptreal; fitsfile *fptrimg; int status; Loading @@ -137,15 +144,15 @@ int main(int argc, char * argv[]) long naxis = 2; long naxes[2] = { grid_size_x, grid_size_y }; #endif // Internal profiling parameters clock_t start, end, start0, startk, endk, startis, endis; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time; clock_t start, end, start0, startk, endk, startis, endis, startcu, endcu; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time, timecufftmp; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, cufft_time1, phase_time1, image_time1; double writetime, writetime1; struct timespec begin, finish, begin0, begink, finishk, beginis, finishis; struct timespec begin, finish, begin0, begink, finishk, beginis, finishis, begincu, finishcu; double elapsed; // Number of sectors in which the mesh is divided along the v direction Loading Loading @@ -178,8 +185,16 @@ int main(int argc, char * argv[]) MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); if(rank == 0)printf("Running with %d MPI tasks\n",size); MPI_Barrier(MPI_COMM_WORLD); #ifdef USE_FFTW #ifndef CUDA_FFT #ifdef CUDA_FFT int ndevices; cudaGetDeviceCount(&ndevices); cudaSetDevice(rank % ndevices); printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices); #else // Initialize FFTW environment fftw_mpi_init(); #endif Loading Loading @@ -789,22 +804,19 @@ if(rank == 0){ #ifdef USE_FFTW #ifdef CUDA_FFT // FFT transform the data using cuFFT if(rank==0)printf("PERFORMING CUDA FFT\n"); clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); /* for (int isector=0; isector<size; isector++) { #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif if(isector == rank) { */ clock_gettime(CLOCK_MONOTONIC, &begincu); startcu = clock(); cuda_fft( num_w_planes, grid_size_x, Loading @@ -812,22 +824,16 @@ if(rank == 0){ xaxis, yaxis, grid, gridss); //} //} end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); cufft_time = ((double) (end - start)) / CLOCKS_PER_SEC; cufft_time1 = (finish.tv_sec - begin.tv_sec); cufft_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; gridss, MPI_COMM_WORLD); #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif endcu = clock(); clock_gettime(CLOCK_MONOTONIC, &finishcu); timecufftmp = ((double) (endcu-startcu)) / CLOCKS_PER_SEC; #else Loading Loading @@ -883,7 +889,7 @@ if(rank == 0){ endis = clock(); clock_gettime(CLOCK_MONOTONIC, &finishis); mpifftw_time = ((double) (endis-startis)) / CLOCKS_PER_SEC; printf("Time for FFTW MPI is %f sec\n", mpifftw_time); printf("Time for FFTW MPI plan execution is %f sec\n", mpifftw_time); Loading Loading @@ -1044,9 +1050,14 @@ if(rank == 0){ #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif if(rank == 0)printf("WRITING IMAGE\n"); #ifdef FITSIO long * fpixel = (long *) malloc(sizeof(long)*naxis); long * lpixel = (long *) malloc(sizeof(long)*naxis); #endif #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); Loading Loading @@ -1159,7 +1170,7 @@ if(rank == 0){ printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time); #ifdef USE_FFTW #ifdef CUDA_FFT printf("cuFFT time: %f sec\n",cufft_time); printf("cuFFTMp complete execution time: %f sec\n",timecufftmp); printf("Phase time: %f sec\n",phase_time); #else printf("FFTW time: %f sec\n", fftw_time); Loading @@ -1177,7 +1188,7 @@ if(rank == 0){ printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1); #ifdef USE_FFTW #ifdef CUDA_FFT printf("PcuFFT time: %f sec\n", cufft_time1); // printf("PcuFFT time: %f sec\n", cufft_time1); printf("PPhase time: %f sec\n", phase_time1); #else printf("PFFTW time: %f sec\n",fftw_time1); Loading w-stacking.h +3 −4 Original line number Diff line number Diff line Loading @@ -6,9 +6,7 @@ #define PI 3.14159265359 #define REAL_TYPE double //#include <cufft.h> //#include <cufftw.h> #include <mpi.h> Loading Loading @@ -70,6 +68,7 @@ void cuda_fft( int, int, double*, double*); double*, MPI_Comm); #endif Loading
cuda_fft.cu +104 −134 Original line number Diff line number Diff line #include <math.h> #include <stdlib.h> #include <stdio.h> #include <cufft.h> #include <cufftw.h> #include "w-stacking.h" #include "/cineca/prod/opt/compilers/hpc-sdk/2023/binary/Linux_ppc64le/23.1/math_libs/11.8/include/cufftmp/cufftMp.h" #include <mpi.h> #include <cuda_runtime.h> #include <complex.h> #include "cuComplex.h" /* #ifdef __CUDACC__ double __device__ #else double #endif */ #include "w-stacking.h" #include <time.h> #ifdef __CUDACC__ __global__ void write_grid( void cuda_fft( int num_w_planes, int grid_size_x, int grid_size_y, int xaxis, int yaxis, cufftDoubleComplex *fftwgrid, double *grid) double * grid, double * gridss, MPI_Comm comm) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } #endif #ifdef __CUDACC__ cudaError_t mmm; cufftResult_t status; /* #ifdef __CUDACC__ __global__ void write_gridss( int num_w_planes, int xaxis, int yaxis, cufftDoubleComplex *fftwgrid, double *gridss, int grid_size_x, int grid_size_y) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); cufftDoubleComplex *fftwgrid; fftwgrid = (cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*yaxis*grid_size_x); double norm = 1.0/(double)(grid_size_x*grid_size_y); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } #endif */ // Plan creation cufftHandle plan; status = cufftCreate(&plan); if (status != CUFFT_SUCCESS) {printf("!!! cufftCreate ERROR %d !!!\n", status);} cudaStream_t stream{}; cudaStreamCreate(&stream); status = cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm); if (status != CUFFT_SUCCESS) {printf("!!! cufftMpAttachComm ERROR %d !!!\n", status);} void cuda_fft( int num_w_planes, int grid_size_x, int grid_size_y, int xaxis, int yaxis, double * grid, double * gridss) { #ifdef __CUDACC__ status = cufftSetStream(plan, stream); if (status != CUFFT_SUCCESS) {printf("!!! cufftSetStream ERROR %d !!!\n", status);} cufftDoubleComplex *fftwgrid; cufftDoubleComplex *fftwgrid_g; cudaError_t mmm; cufftResult_t status; size_t workspace; status = cufftMakePlan2d(plan, grid_size_x, grid_size_y, CUFFT_Z2Z, &workspace); if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePlan2d ERROR %d !!!\n", status);} cudaDeviceSynchronize(); double * grid_g; //double * gridss_g; // Descriptor creation mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); mmm=cudaMalloc(&grid_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); //mmm=cudaMalloc(&gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); mmm=cudaMemcpy(grid_g, grid, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); cudaLibXtDesc *fftwgrid_g; cudaLibXtDesc *fftwgrid_g2; clock_t start, end, startcu, endcu; double cufft_exec_time; struct timespec begin, finish, begin0, begink, finishk, begincu, finishcu; status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);} cudaDeviceSynchronize(); status = cufftXtMalloc(plan, &fftwgrid_g2, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc 2 ERROR %d !!!\n", status);} cudaDeviceSynchronize(); // Define the CUDA set up //int Nth = NTHREADS; long Nbl = (long)(xaxis+yaxis*xaxis); //if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); cufftHandle plan; cufftCreate(&plan); cufftPlan1d(&plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1); printf("plan creation\n"); double norm = 1.0/(double)(grid_size_x*grid_size_y); long fftwindex = 0; long fftwindex2D = 0; double norm = 1.0/(double)(grid_size_x*grid_size_y); write_grid<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, grid_g); cudaDeviceSynchronize(); /* // Grid composition for (int iw=0; iw<num_w_planes; iw++) { Loading @@ -128,44 +90,42 @@ void cuda_fft( fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } */ //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137].x); //mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); //if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);} //printf("memcpy fftwgrid to fftwgrid_g\n"); cudaDeviceSynchronize(); mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize ERROR %d !!!\n", mmm);} status = cufftXtMemcpy(plan, fftwgrid_g, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy htd fftwgrid_g ERROR %d !!!\n", status);} printf("do the transform for each w-plane\n"); cudaDeviceSynchronize(); clock_gettime(CLOCK_MONOTONIC, &begincu); startcu = clock(); status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExecDescriptor ERROR %d !!!\n", status);} status=cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); endcu = clock(); clock_gettime(CLOCK_MONOTONIC, &finishcu); cufft_exec_time = ((double) (endcu-startcu)) / CLOCKS_PER_SEC; printf("Time for cuFFT plan execution is %f sec\n", cufft_exec_time); if (status != CUFFT_SUCCESS) {printf("cufftexec ERROR %d\n",status);} mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); //write_gridss<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, gridss_g, grid_size_x, grid_size_y); //cudaDeviceSynchronize(); status = cufftXtMemcpy(plan, fftwgrid_g2, fftwgrid_g, CUFFT_COPY_DEVICE_TO_DEVICE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);} //mmm=cudaMemcpy(gridss, gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);} mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);} cudaDeviceSynchronize(); status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_g2, CUFFT_COPY_DEVICE_TO_HOST); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dth fftwgrid ERROR %d !!!\n", status);} for (int iw=0; iw<num_w_planes; iw++) { // save the transformed w-plane for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) Loading @@ -176,13 +136,23 @@ void cuda_fft( gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } } } cufftDestroy(plan); mmm = cudaFree(fftwgrid_g); mmm = cudaFree(grid_g); //mmm = cudaFree(gridss_g); #endif status = cufftXtFree(fftwgrid_g); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtFree ERROR %d !!!\n", status);} status = cufftXtFree(fftwgrid_g2); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtFree ERROR %d !!!\n", status);} status = cufftDestroy(plan); if (status != CUFFT_SUCCESS) {printf("!!! cufftDestroy fftwgrid ERROR %d !!!\n", status);} cudaStreamDestroy(stream); cudaDeviceSynchronize(); #endif // __CUDACC__ }
w-stacking-fftw.c +53 −42 Original line number Diff line number Diff line Loading @@ -8,6 +8,9 @@ #include <mpi.h> #ifdef USE_FFTW #include <fftw3-mpi.h> /*#ifdef CUDA_FFT #endif*/ #endif #endif #include <omp.h> Loading Loading @@ -53,6 +56,7 @@ int main(int argc, char * argv[]) FILE * pFile1; FILE * pFilereal; FILE * pFileimg; // Global filename to be composed char filename[1000]; Loading Loading @@ -80,6 +84,7 @@ int main(int argc, char * argv[]) char srank[4]; char timingfile[FILENAMELENGTH] = "timings.dat"; // Visibilities related variables double * uu; double * vv; Loading Loading @@ -129,6 +134,8 @@ int main(int argc, char * argv[]) // Initialize FITS image parameters #ifdef FITSIO fitsfile *fptreal; fitsfile *fptrimg; int status; Loading @@ -137,15 +144,15 @@ int main(int argc, char * argv[]) long naxis = 2; long naxes[2] = { grid_size_x, grid_size_y }; #endif // Internal profiling parameters clock_t start, end, start0, startk, endk, startis, endis; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time; clock_t start, end, start0, startk, endk, startis, endis, startcu, endcu; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time, timecufftmp; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, cufft_time1, phase_time1, image_time1; double writetime, writetime1; struct timespec begin, finish, begin0, begink, finishk, beginis, finishis; struct timespec begin, finish, begin0, begink, finishk, beginis, finishis, begincu, finishcu; double elapsed; // Number of sectors in which the mesh is divided along the v direction Loading Loading @@ -178,8 +185,16 @@ int main(int argc, char * argv[]) MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); if(rank == 0)printf("Running with %d MPI tasks\n",size); MPI_Barrier(MPI_COMM_WORLD); #ifdef USE_FFTW #ifndef CUDA_FFT #ifdef CUDA_FFT int ndevices; cudaGetDeviceCount(&ndevices); cudaSetDevice(rank % ndevices); printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices); #else // Initialize FFTW environment fftw_mpi_init(); #endif Loading Loading @@ -789,22 +804,19 @@ if(rank == 0){ #ifdef USE_FFTW #ifdef CUDA_FFT // FFT transform the data using cuFFT if(rank==0)printf("PERFORMING CUDA FFT\n"); clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); /* for (int isector=0; isector<size; isector++) { #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif if(isector == rank) { */ clock_gettime(CLOCK_MONOTONIC, &begincu); startcu = clock(); cuda_fft( num_w_planes, grid_size_x, Loading @@ -812,22 +824,16 @@ if(rank == 0){ xaxis, yaxis, grid, gridss); //} //} end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); cufft_time = ((double) (end - start)) / CLOCKS_PER_SEC; cufft_time1 = (finish.tv_sec - begin.tv_sec); cufft_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; gridss, MPI_COMM_WORLD); #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif endcu = clock(); clock_gettime(CLOCK_MONOTONIC, &finishcu); timecufftmp = ((double) (endcu-startcu)) / CLOCKS_PER_SEC; #else Loading Loading @@ -883,7 +889,7 @@ if(rank == 0){ endis = clock(); clock_gettime(CLOCK_MONOTONIC, &finishis); mpifftw_time = ((double) (endis-startis)) / CLOCKS_PER_SEC; printf("Time for FFTW MPI is %f sec\n", mpifftw_time); printf("Time for FFTW MPI plan execution is %f sec\n", mpifftw_time); Loading Loading @@ -1044,9 +1050,14 @@ if(rank == 0){ #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif if(rank == 0)printf("WRITING IMAGE\n"); #ifdef FITSIO long * fpixel = (long *) malloc(sizeof(long)*naxis); long * lpixel = (long *) malloc(sizeof(long)*naxis); #endif #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); Loading Loading @@ -1159,7 +1170,7 @@ if(rank == 0){ printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time); #ifdef USE_FFTW #ifdef CUDA_FFT printf("cuFFT time: %f sec\n",cufft_time); printf("cuFFTMp complete execution time: %f sec\n",timecufftmp); printf("Phase time: %f sec\n",phase_time); #else printf("FFTW time: %f sec\n", fftw_time); Loading @@ -1177,7 +1188,7 @@ if(rank == 0){ printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1); #ifdef USE_FFTW #ifdef CUDA_FFT printf("PcuFFT time: %f sec\n", cufft_time1); // printf("PcuFFT time: %f sec\n", cufft_time1); printf("PPhase time: %f sec\n", phase_time1); #else printf("PFFTW time: %f sec\n",fftw_time1); Loading
w-stacking.h +3 −4 Original line number Diff line number Diff line Loading @@ -6,9 +6,7 @@ #define PI 3.14159265359 #define REAL_TYPE double //#include <cufft.h> //#include <cufftw.h> #include <mpi.h> Loading Loading @@ -70,6 +68,7 @@ void cuda_fft( int, int, double*, double*); double*, MPI_Comm); #endif