Loading Build/Makefile.M100 +3 −2 Original line number Diff line number Diff line Loading @@ -15,10 +15,11 @@ FITSIO_LIB= /m100_work/IscrC_CD-DLS/cfitsio-3.49 NVCC = nvcc NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda CUFFT_INCL= -I/cineca/prod/opt/compilers/cuda/10.1/none/include/ NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda -lcufft OMP= -fopenmp CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) $(FITSIO_INCL) CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) $(FITSIO_INCL) $(CUFFT_INCL) OPTIMIZE = $(OMP) -O3 -mtune=native Makefile +10 −8 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ include Build/Makefile.systype endif LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda -lcufft # create MPI code OPT += -DUSE_MPI Loading @@ -22,10 +22,10 @@ OPT += -DUSE_MPI # use FFTW (it can be switched on ONLY if MPI is active) ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) OPT += -DUSE_FFTW LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm -lcuda -lcudart -lcufft endif #OPT += -DNVIDIA OPT += -DNVIDIA # perform one-side communication (suggested) instead of reduce (only if MPI is active) #OPT += -DONE_SIDE # write the full 3D cube of gridded visibilities and its FFT transform Loading @@ -34,17 +34,19 @@ endif OPT += -DWRITE_IMAGE # perform w-stacking phase correction OPT += -DPHASE_ON # Support FFT using cuFFT OPT += -DCUDA_FFT # Support CFITSIO OPT += -DFITSIO # Perform true parallel images writing #OPT += -DPARALLELIO OPT += -DPARALLELIO ifeq (FITSIO,$(findstring FITSIO,$(OPT))) LIBS += -L$(FITSIO_LIB) -lcfitsio endif DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu cuda_fft.cu COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o cuda_fft.o w-stacking.c: w-stacking.cu cp w-stacking.cu w-stacking.c Loading Loading @@ -81,9 +83,9 @@ mpi: $(COBJ) $(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS) mpi_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu cuda_fft.cu $(NVLIB) $(CUFFT_INCL) $(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS) $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o cuda_fft.o $(NVLIB) $(CFLAGS) $(LIBS) clean: rm *.o Loading cuda_fft.cu +90 −24 Original line number Diff line number Diff line Loading @@ -17,7 +17,44 @@ double */ //__global__ void write_grid #ifdef __CUDACC__ __global__ void write_grid( int num_w_planes, int xaxis, int yaxis, cufftDoubleComplex *fftwgrid, double *grid) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } #endif /* #ifdef __CUDACC__ __global__ void write_gridss( int num_w_planes, int xaxis, int yaxis, cufftDoubleComplex *fftwgrid, double *gridss, int grid_size_x, int grid_size_y) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); double norm = 1.0/(double)(grid_size_x*grid_size_y); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } #endif */ Loading @@ -33,7 +70,6 @@ void cuda_fft( double * grid, double * gridss) { #ifdef __CUDACC__ cufftDoubleComplex *fftwgrid; Loading @@ -41,34 +77,47 @@ void cuda_fft( cudaError_t mmm; cufftResult_t status; double * grid_g; //double * gridss_g; // Define the CUDA set up int Nth = NTHREADS; long Nbl = (long)(num_w_planes*xaxis*yaxis/Nth) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; //long Nvis = num_points*freq_per_chan*polarizations; ///printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y); mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); mmm=cudaMalloc(&grid_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); //mmm=cudaMalloc(&gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); mmm=cudaMemcpy(grid_g, grid, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); clock_t start, end, startcu, endcu; double cufft_exec_time; struct timespec begin, finish, begin0, begink, finishk, begincu, finishcu; mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y); if (mmm != cudaSuccess) {printf("cudaMalloc error %d\n",mmm);} //mmm=cudaMemset(fftwgrid_g, 0.0, 2*grid_size_x*grid_size_y*sizeof(cufftDoubleComplex)); printf("malloc fftwgrid_g\n"); // Define the CUDA set up //int Nth = NTHREADS; long Nbl = (long)(xaxis+yaxis*xaxis); //if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); cufftHandle plan; cufftCreate(&plan); cufftPlan1d(&plan, grid_size_x*grid_size_y, CUFFT_Z2Z, 1); cufftPlan1d(&plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1); printf("plan creation\n"); double norm = 1.0/(double)(grid_size_x*grid_size_y); long fftwindex = 0; long fftwindex2D = 0; write_grid<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, grid_g); cudaDeviceSynchronize(); /* for (int iw=0; iw<num_w_planes; iw++) { printf("select the w-plane to transform\n"); printf("select the %d w-plane to transform\n", iw); for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) Loading @@ -79,28 +128,43 @@ void cuda_fft( fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } */ //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137].x); mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);} printf("memcpy fftwgrid to fftwgrid_g\n"); //mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); //if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);} //printf("memcpy fftwgrid to fftwgrid_g\n"); printf("do the transform for each w-plane\n"); clock_gettime(CLOCK_MONOTONIC, &begincu); startcu = clock(); status=cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) {printf("cufftexec error %d\n",status);} endcu = clock(); clock_gettime(CLOCK_MONOTONIC, &finishcu); cufft_exec_time = ((double) (endcu-startcu)) / CLOCKS_PER_SEC; printf("Time for cuFFT plan execution is %f sec\n", cufft_exec_time); if (status != CUFFT_SUCCESS) {printf("cufftexec ERROR %d\n",status);} cudaDeviceSynchronize(); mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);} printf("memcpy fftwgrid_g to fftwgrid\n"); //printf("fftwgrid[1260137].x post cuFFT %f\n", fftwgrid[1260137].x); //write_gridss<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, gridss_g, grid_size_x, grid_size_y); //cudaDeviceSynchronize(); //mmm=cudaMemcpy(gridss, gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);} for (int iw=0; iw<num_w_planes; iw++) { // save the transformed w-plane for (int iv=0; iv<yaxis; iv++) { Loading @@ -118,5 +182,7 @@ void cuda_fft( cufftDestroy(plan); mmm = cudaFree(fftwgrid_g); mmm = cudaFree(grid_g); //mmm = cudaFree(gridss_g); #endif } w-stacking-fftw.c +107 −17 Original line number Diff line number Diff line Loading @@ -104,8 +104,8 @@ int main(int argc, char * argv[]) double resolution; // Mesh related parameters: global size int grid_size_x = 2048; int grid_size_y = 2048; int grid_size_x = 4096; int grid_size_y = 4096; // Split Mesh size (auto-calculated) int local_grid_size_x; int local_grid_size_y; Loading @@ -113,7 +113,7 @@ int main(int argc, char * argv[]) int yaxis; // Number of planes in the w direction int num_w_planes = 8; int num_w_planes = 1; // Size of the convoutional kernel support int w_support = 7; Loading @@ -132,8 +132,8 @@ int main(int argc, char * argv[]) fitsfile *fptreal; fitsfile *fptrimg; int status; char testfitsreal[FILENAMELENGTH] = "parallel_np2_real.fits"; char testfitsimag[FILENAMELENGTH] = "parallel_np2_img.fits"; char testfitsreal[FILENAMELENGTH] = "parallel_np8_real.fits"; char testfitsimag[FILENAMELENGTH] = "parallel_np8_img.fits"; long naxis = 2; long naxes[2] = { grid_size_x, grid_size_y }; Loading @@ -141,11 +141,11 @@ int main(int argc, char * argv[]) // Internal profiling parameters clock_t start, end, start0, startk, endk; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; clock_t start, end, start0, startk, endk, startis, endis; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, cufft_time1, phase_time1, image_time1; double writetime, writetime1; struct timespec begin, finish, begin0, begink, finishk; struct timespec begin, finish, begin0, begink, finishk, beginis, finishis; double elapsed; // Number of sectors in which the mesh is divided along the v direction Loading Loading @@ -179,9 +179,11 @@ int main(int argc, char * argv[]) MPI_Comm_size(MPI_COMM_WORLD, &size); if(rank == 0)printf("Running with %d MPI tasks\n",size); #ifdef USE_FFTW #ifndef CUDA_FFT // Initialize FFTW environment fftw_mpi_init(); #endif #endif #else // If MPI is not used the two parameters are set for consistency rank = 0; Loading Loading @@ -786,9 +788,56 @@ if(rank == 0){ #ifdef USE_FFTW #ifdef CUDA_FFT // FFT transform the data using cuFFT if(rank==0)printf("PERFORMING CUDA FFT\n"); clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); /* for (int isector=0; isector<size; isector++) { #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif if(isector == rank) { */ cuda_fft( num_w_planes, grid_size_x, grid_size_y, xaxis, yaxis, grid, gridss); //} //} end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); cufft_time = ((double) (end - start)) / CLOCKS_PER_SEC; cufft_time1 = (finish.tv_sec - begin.tv_sec); cufft_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; #else // FFT transform the data (using distributed FFTW) if(rank == 0)printf("PERFORMING FFT\n"); //printf("Pre gridss[1] %f \n", gridss[1]); clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); fftw_plan plan; Loading Loading @@ -821,9 +870,24 @@ if(rank == 0){ } } // do the transform for each w-plane //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137][1]); clock_gettime(CLOCK_MONOTONIC, &beginis); startis = clock(); fftw_execute(plan); endis = clock(); clock_gettime(CLOCK_MONOTONIC, &finishis); mpifftw_time = ((double) (endis-startis)) / CLOCKS_PER_SEC; printf("Time for FFTW MPI is %f sec\n", mpifftw_time); //printf("fftwgrid[1260137].x post cuFFT %f\n", fftwgrid[1260137][0]); // save the transformed w-plane for (int iv=0; iv<yaxis; iv++) { Loading @@ -839,7 +903,7 @@ if(rank == 0){ } fftw_destroy_plan(plan); //printf("Post gridss[1] %f \n", gridss[1]); #ifdef USE_MPI MPI_Win_fence(0,slabwin); MPI_Barrier(MPI_COMM_WORLD); Loading @@ -852,6 +916,8 @@ if(rank == 0){ fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); #endif //CUDA_FFT #ifdef WRITE_DATA // Write results #ifdef USE_MPI Loading Loading @@ -927,8 +993,10 @@ if(rank == 0){ #endif #endif //WRITE_DATA #ifndef CUDA_FFT //#else fftw_free(fftwgrid); #endif // Phase correction clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); Loading @@ -944,7 +1012,8 @@ if(rank == 0){ phase_time1 = (finish.tv_sec - begin.tv_sec); phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; #ifdef WRITE_IMAGE clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); if(rank == 0) { #ifdef FITSIO Loading Loading @@ -1066,10 +1135,15 @@ if(rank == 0){ MPI_Barrier(MPI_COMM_WORLD); #endif end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); image_time = ((double) (end - start)) / CLOCKS_PER_SEC; image_time1 = (finish.tv_sec - begin0.tv_sec); image_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0; #endif //WRITE_IMAGE //#endif //CUDA_FFT #endif //USE_FFTW end = clock(); Loading @@ -1084,8 +1158,16 @@ if(rank == 0){ printf("Process time: %f sec\n",process_time); printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time); #ifdef USE_FFTW #ifdef CUDA_FFT printf("cuFFT time: %f sec\n",cufft_time); printf("Phase time: %f sec\n",phase_time); #else printf("FFTW time: %f sec\n", fftw_time); printf("Phase time: %f sec\n", phase_time); #endif #endif #ifdef WRITE_IMAGE printf("Write image time: %f sec\n", image_time); #endif printf("TOT time: %f sec\n",tot_time); if(num_threads > 1) Loading @@ -1094,8 +1176,16 @@ if(rank == 0){ printf("PProcess time: %f sec\n",process_time1); printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1); #ifdef USE_FFTW #ifdef CUDA_FFT printf("PcuFFT time: %f sec\n", cufft_time1); printf("PPhase time: %f sec\n", phase_time1); #else printf("PFFTW time: %f sec\n",fftw_time1); printf("PPhase time: %f sec\n",phase_time1); #endif #endif #ifdef WRITE_IMAGE printf("PWrite image time: %f sec\n", image_time1); #endif printf("PTOT time: %f sec\n",tot_time1); } Loading @@ -1106,9 +1196,9 @@ if(rank == 0){ pFile = fopen (timingfile,"w"); if (num_threads == 1) { fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,tot_time); fprintf(pFile, "%f %f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,image_time,tot_time); } else { fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,tot_time1); fprintf(pFile, "%f %f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,image_time1,tot_time1); } fclose(pFile); } Loading w-stacking.h +24 −1 Original line number Diff line number Diff line Loading @@ -5,6 +5,13 @@ #define NTHREADS 32 #define PI 3.14159265359 #define REAL_TYPE double //#include <cufft.h> //#include <cufftw.h> #ifdef __CUDACC__ extern "C" #endif Loading Loading @@ -49,4 +56,20 @@ void phase_correction( double, double, int); #ifdef __CUDACC__ extern "C" #endif void cuda_fft( int, int, int, int, int, double*, double*); #endif Loading
Build/Makefile.M100 +3 −2 Original line number Diff line number Diff line Loading @@ -15,10 +15,11 @@ FITSIO_LIB= /m100_work/IscrC_CD-DLS/cfitsio-3.49 NVCC = nvcc NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda CUFFT_INCL= -I/cineca/prod/opt/compilers/cuda/10.1/none/include/ NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda -lcufft OMP= -fopenmp CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) $(FITSIO_INCL) CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) $(FITSIO_INCL) $(CUFFT_INCL) OPTIMIZE = $(OMP) -O3 -mtune=native
Makefile +10 −8 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ include Build/Makefile.systype endif LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda -lcufft # create MPI code OPT += -DUSE_MPI Loading @@ -22,10 +22,10 @@ OPT += -DUSE_MPI # use FFTW (it can be switched on ONLY if MPI is active) ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) OPT += -DUSE_FFTW LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm -lcuda -lcudart -lcufft endif #OPT += -DNVIDIA OPT += -DNVIDIA # perform one-side communication (suggested) instead of reduce (only if MPI is active) #OPT += -DONE_SIDE # write the full 3D cube of gridded visibilities and its FFT transform Loading @@ -34,17 +34,19 @@ endif OPT += -DWRITE_IMAGE # perform w-stacking phase correction OPT += -DPHASE_ON # Support FFT using cuFFT OPT += -DCUDA_FFT # Support CFITSIO OPT += -DFITSIO # Perform true parallel images writing #OPT += -DPARALLELIO OPT += -DPARALLELIO ifeq (FITSIO,$(findstring FITSIO,$(OPT))) LIBS += -L$(FITSIO_LIB) -lcfitsio endif DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu cuda_fft.cu COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o cuda_fft.o w-stacking.c: w-stacking.cu cp w-stacking.cu w-stacking.c Loading Loading @@ -81,9 +83,9 @@ mpi: $(COBJ) $(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS) mpi_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu cuda_fft.cu $(NVLIB) $(CUFFT_INCL) $(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS) $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o cuda_fft.o $(NVLIB) $(CFLAGS) $(LIBS) clean: rm *.o Loading
cuda_fft.cu +90 −24 Original line number Diff line number Diff line Loading @@ -17,7 +17,44 @@ double */ //__global__ void write_grid #ifdef __CUDACC__ __global__ void write_grid( int num_w_planes, int xaxis, int yaxis, cufftDoubleComplex *fftwgrid, double *grid) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } #endif /* #ifdef __CUDACC__ __global__ void write_gridss( int num_w_planes, int xaxis, int yaxis, cufftDoubleComplex *fftwgrid, double *gridss, int grid_size_x, int grid_size_y) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); double norm = 1.0/(double)(grid_size_x*grid_size_y); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } #endif */ Loading @@ -33,7 +70,6 @@ void cuda_fft( double * grid, double * gridss) { #ifdef __CUDACC__ cufftDoubleComplex *fftwgrid; Loading @@ -41,34 +77,47 @@ void cuda_fft( cudaError_t mmm; cufftResult_t status; double * grid_g; //double * gridss_g; // Define the CUDA set up int Nth = NTHREADS; long Nbl = (long)(num_w_planes*xaxis*yaxis/Nth) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; //long Nvis = num_points*freq_per_chan*polarizations; ///printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y); mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); mmm=cudaMalloc(&grid_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); //mmm=cudaMalloc(&gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); mmm=cudaMemcpy(grid_g, grid, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); clock_t start, end, startcu, endcu; double cufft_exec_time; struct timespec begin, finish, begin0, begink, finishk, begincu, finishcu; mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y); if (mmm != cudaSuccess) {printf("cudaMalloc error %d\n",mmm);} //mmm=cudaMemset(fftwgrid_g, 0.0, 2*grid_size_x*grid_size_y*sizeof(cufftDoubleComplex)); printf("malloc fftwgrid_g\n"); // Define the CUDA set up //int Nth = NTHREADS; long Nbl = (long)(xaxis+yaxis*xaxis); //if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); cufftHandle plan; cufftCreate(&plan); cufftPlan1d(&plan, grid_size_x*grid_size_y, CUFFT_Z2Z, 1); cufftPlan1d(&plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1); printf("plan creation\n"); double norm = 1.0/(double)(grid_size_x*grid_size_y); long fftwindex = 0; long fftwindex2D = 0; write_grid<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, grid_g); cudaDeviceSynchronize(); /* for (int iw=0; iw<num_w_planes; iw++) { printf("select the w-plane to transform\n"); printf("select the %d w-plane to transform\n", iw); for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) Loading @@ -79,28 +128,43 @@ void cuda_fft( fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } */ //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137].x); mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);} printf("memcpy fftwgrid to fftwgrid_g\n"); //mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); //if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);} //printf("memcpy fftwgrid to fftwgrid_g\n"); printf("do the transform for each w-plane\n"); clock_gettime(CLOCK_MONOTONIC, &begincu); startcu = clock(); status=cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) {printf("cufftexec error %d\n",status);} endcu = clock(); clock_gettime(CLOCK_MONOTONIC, &finishcu); cufft_exec_time = ((double) (endcu-startcu)) / CLOCKS_PER_SEC; printf("Time for cuFFT plan execution is %f sec\n", cufft_exec_time); if (status != CUFFT_SUCCESS) {printf("cufftexec ERROR %d\n",status);} cudaDeviceSynchronize(); mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);} printf("memcpy fftwgrid_g to fftwgrid\n"); //printf("fftwgrid[1260137].x post cuFFT %f\n", fftwgrid[1260137].x); //write_gridss<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, gridss_g, grid_size_x, grid_size_y); //cudaDeviceSynchronize(); //mmm=cudaMemcpy(gridss, gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);} for (int iw=0; iw<num_w_planes; iw++) { // save the transformed w-plane for (int iv=0; iv<yaxis; iv++) { Loading @@ -118,5 +182,7 @@ void cuda_fft( cufftDestroy(plan); mmm = cudaFree(fftwgrid_g); mmm = cudaFree(grid_g); //mmm = cudaFree(gridss_g); #endif }
w-stacking-fftw.c +107 −17 Original line number Diff line number Diff line Loading @@ -104,8 +104,8 @@ int main(int argc, char * argv[]) double resolution; // Mesh related parameters: global size int grid_size_x = 2048; int grid_size_y = 2048; int grid_size_x = 4096; int grid_size_y = 4096; // Split Mesh size (auto-calculated) int local_grid_size_x; int local_grid_size_y; Loading @@ -113,7 +113,7 @@ int main(int argc, char * argv[]) int yaxis; // Number of planes in the w direction int num_w_planes = 8; int num_w_planes = 1; // Size of the convoutional kernel support int w_support = 7; Loading @@ -132,8 +132,8 @@ int main(int argc, char * argv[]) fitsfile *fptreal; fitsfile *fptrimg; int status; char testfitsreal[FILENAMELENGTH] = "parallel_np2_real.fits"; char testfitsimag[FILENAMELENGTH] = "parallel_np2_img.fits"; char testfitsreal[FILENAMELENGTH] = "parallel_np8_real.fits"; char testfitsimag[FILENAMELENGTH] = "parallel_np8_img.fits"; long naxis = 2; long naxes[2] = { grid_size_x, grid_size_y }; Loading @@ -141,11 +141,11 @@ int main(int argc, char * argv[]) // Internal profiling parameters clock_t start, end, start0, startk, endk; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; clock_t start, end, start0, startk, endk, startis, endis; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, cufft_time1, phase_time1, image_time1; double writetime, writetime1; struct timespec begin, finish, begin0, begink, finishk; struct timespec begin, finish, begin0, begink, finishk, beginis, finishis; double elapsed; // Number of sectors in which the mesh is divided along the v direction Loading Loading @@ -179,9 +179,11 @@ int main(int argc, char * argv[]) MPI_Comm_size(MPI_COMM_WORLD, &size); if(rank == 0)printf("Running with %d MPI tasks\n",size); #ifdef USE_FFTW #ifndef CUDA_FFT // Initialize FFTW environment fftw_mpi_init(); #endif #endif #else // If MPI is not used the two parameters are set for consistency rank = 0; Loading Loading @@ -786,9 +788,56 @@ if(rank == 0){ #ifdef USE_FFTW #ifdef CUDA_FFT // FFT transform the data using cuFFT if(rank==0)printf("PERFORMING CUDA FFT\n"); clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); /* for (int isector=0; isector<size; isector++) { #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif if(isector == rank) { */ cuda_fft( num_w_planes, grid_size_x, grid_size_y, xaxis, yaxis, grid, gridss); //} //} end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); cufft_time = ((double) (end - start)) / CLOCKS_PER_SEC; cufft_time1 = (finish.tv_sec - begin.tv_sec); cufft_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; #else // FFT transform the data (using distributed FFTW) if(rank == 0)printf("PERFORMING FFT\n"); //printf("Pre gridss[1] %f \n", gridss[1]); clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); fftw_plan plan; Loading Loading @@ -821,9 +870,24 @@ if(rank == 0){ } } // do the transform for each w-plane //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137][1]); clock_gettime(CLOCK_MONOTONIC, &beginis); startis = clock(); fftw_execute(plan); endis = clock(); clock_gettime(CLOCK_MONOTONIC, &finishis); mpifftw_time = ((double) (endis-startis)) / CLOCKS_PER_SEC; printf("Time for FFTW MPI is %f sec\n", mpifftw_time); //printf("fftwgrid[1260137].x post cuFFT %f\n", fftwgrid[1260137][0]); // save the transformed w-plane for (int iv=0; iv<yaxis; iv++) { Loading @@ -839,7 +903,7 @@ if(rank == 0){ } fftw_destroy_plan(plan); //printf("Post gridss[1] %f \n", gridss[1]); #ifdef USE_MPI MPI_Win_fence(0,slabwin); MPI_Barrier(MPI_COMM_WORLD); Loading @@ -852,6 +916,8 @@ if(rank == 0){ fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); #endif //CUDA_FFT #ifdef WRITE_DATA // Write results #ifdef USE_MPI Loading Loading @@ -927,8 +993,10 @@ if(rank == 0){ #endif #endif //WRITE_DATA #ifndef CUDA_FFT //#else fftw_free(fftwgrid); #endif // Phase correction clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); Loading @@ -944,7 +1012,8 @@ if(rank == 0){ phase_time1 = (finish.tv_sec - begin.tv_sec); phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; #ifdef WRITE_IMAGE clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); if(rank == 0) { #ifdef FITSIO Loading Loading @@ -1066,10 +1135,15 @@ if(rank == 0){ MPI_Barrier(MPI_COMM_WORLD); #endif end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); image_time = ((double) (end - start)) / CLOCKS_PER_SEC; image_time1 = (finish.tv_sec - begin0.tv_sec); image_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0; #endif //WRITE_IMAGE //#endif //CUDA_FFT #endif //USE_FFTW end = clock(); Loading @@ -1084,8 +1158,16 @@ if(rank == 0){ printf("Process time: %f sec\n",process_time); printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time); #ifdef USE_FFTW #ifdef CUDA_FFT printf("cuFFT time: %f sec\n",cufft_time); printf("Phase time: %f sec\n",phase_time); #else printf("FFTW time: %f sec\n", fftw_time); printf("Phase time: %f sec\n", phase_time); #endif #endif #ifdef WRITE_IMAGE printf("Write image time: %f sec\n", image_time); #endif printf("TOT time: %f sec\n",tot_time); if(num_threads > 1) Loading @@ -1094,8 +1176,16 @@ if(rank == 0){ printf("PProcess time: %f sec\n",process_time1); printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1); #ifdef USE_FFTW #ifdef CUDA_FFT printf("PcuFFT time: %f sec\n", cufft_time1); printf("PPhase time: %f sec\n", phase_time1); #else printf("PFFTW time: %f sec\n",fftw_time1); printf("PPhase time: %f sec\n",phase_time1); #endif #endif #ifdef WRITE_IMAGE printf("PWrite image time: %f sec\n", image_time1); #endif printf("PTOT time: %f sec\n",tot_time1); } Loading @@ -1106,9 +1196,9 @@ if(rank == 0){ pFile = fopen (timingfile,"w"); if (num_threads == 1) { fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,tot_time); fprintf(pFile, "%f %f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,image_time,tot_time); } else { fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,tot_time1); fprintf(pFile, "%f %f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,image_time1,tot_time1); } fclose(pFile); } Loading
w-stacking.h +24 −1 Original line number Diff line number Diff line Loading @@ -5,6 +5,13 @@ #define NTHREADS 32 #define PI 3.14159265359 #define REAL_TYPE double //#include <cufft.h> //#include <cufftw.h> #ifdef __CUDACC__ extern "C" #endif Loading Loading @@ -49,4 +56,20 @@ void phase_correction( double, double, int); #ifdef __CUDACC__ extern "C" #endif void cuda_fft( int, int, int, int, int, double*, double*); #endif