Commit fead89a6 authored by Emanuele De Rubeis's avatar Emanuele De Rubeis
Browse files

cuFFT implementation

parent a7e1860b
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -15,10 +15,11 @@ FITSIO_LIB= /m100_work/IscrC_CD-DLS/cfitsio-3.49

NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
CUFFT_INCL= -I/cineca/prod/opt/compilers/cuda/10.1/none/include/
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda -lcufft

OMP= -fopenmp

CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) $(FITSIO_INCL)
CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) $(FITSIO_INCL) $(CUFFT_INCL)

OPTIMIZE = $(OMP) -O3 -mtune=native
+10 −8
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ include Build/Makefile.systype
endif


LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart  -lcuda
LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda -lcufft

# create MPI code
OPT += -DUSE_MPI
@@ -22,10 +22,10 @@ OPT += -DUSE_MPI
# use FFTW (it can be switched on ONLY if MPI is active)
ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
   OPT += -DUSE_FFTW
	 LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
	 LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm -lcuda -lcudart -lcufft
endif

#OPT += -DNVIDIA
OPT += -DNVIDIA
# perform one-side communication (suggested) instead of reduce (only if MPI is active)
#OPT += -DONE_SIDE
# write the full 3D cube of gridded visibilities and its FFT transform
@@ -34,17 +34,19 @@ endif
OPT += -DWRITE_IMAGE
# perform w-stacking phase correction
OPT += -DPHASE_ON
# Support FFT using cuFFT
OPT += -DCUDA_FFT
# Support CFITSIO
OPT += -DFITSIO
# Perform true parallel images writing
#OPT += -DPARALLELIO
OPT += -DPARALLELIO

ifeq (FITSIO,$(findstring FITSIO,$(OPT)))
	LIBS += -L$(FITSIO_LIB) -lcfitsio
endif

DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu cuda_fft.cu
COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o cuda_fft.o

w-stacking.c: w-stacking.cu
	cp w-stacking.cu w-stacking.c
@@ -81,9 +83,9 @@ mpi: $(COBJ)
	$(MPICC) $(OPTIMIZE) -o w-stackingCfftw   $^  $(CFLAGS) $(LIBS)

mpi_cuda:
	$(NVCC)   $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
	$(NVCC)   $(NVFLAGS) -c w-stacking.cu phase_correction.cu cuda_fft.cu $(NVLIB) $(CUFFT_INCL)
	$(MPICC)  $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
	$(MPIC++) $(OPTIMIZE) $(OPT)   -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS)
	$(MPIC++) $(OPTIMIZE) $(OPT)   -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o cuda_fft.o $(NVLIB) $(CFLAGS) $(LIBS)

clean:
	rm *.o
+90 −24
Original line number Diff line number Diff line
@@ -17,7 +17,44 @@ double
*/


//__global__ void write_grid
#ifdef __CUDACC__
// Pack the interleaved real/imaginary double grid into a cufftDoubleComplex
// array ahead of the cuFFT transform.
//
// Expected launch layout (see cuda_fft): <<<Nbl, num_w_planes>>>, i.e.
// blockIdx.x indexes the 2D (u,v) cell and threadIdx.x indexes the w-plane
// — TODO confirm against the launch site, which uses Nbl = xaxis+yaxis*xaxis.
//
// Fixes vs. previous revision:
//  - the destination index ignored threadIdx.x, so every w-plane of a cell
//    raced on the same fftwgrid element and planes > 0 were lost; the
//    destination now uses the same flattened 3D index as the source.
//  - the launch allocates more blocks than the xaxis*yaxis cells of one
//    plane; out-of-range blocks now return early instead of writing out
//    of bounds.
__global__ void write_grid(
	int num_w_planes,
	int xaxis,
	int yaxis,
	cufftDoubleComplex *fftwgrid,
	double *grid)
{
	// Guard: grid is launched with Nbl = xaxis + yaxis*xaxis blocks,
	// which exceeds the xaxis*yaxis 2D cells of a plane.
	if ((long)blockIdx.x >= (long)xaxis*yaxis) return;

	// Flattened 3D index of this (u,v,w) cell.
	long cell3D = (long)blockIdx.x + (long)threadIdx.x*xaxis*yaxis;
	// grid stores (re,im) pairs as interleaved doubles.
	long src = 2*cell3D;

	fftwgrid[cell3D].x = grid[src];
	fftwgrid[cell3D].y = grid[src+1];
}
#endif


/*
#ifdef __CUDACC__
__global__ void write_gridss(
        int num_w_planes,
        int xaxis,
        int yaxis,
        cufftDoubleComplex *fftwgrid,
        double *gridss,
	int grid_size_x,
	int grid_size_y)
{
        long fftwindex2D = blockIdx.x;
        long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis);

	double norm = 1.0/(double)(grid_size_x*grid_size_y);

        gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x;
        gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y;
}
#endif
*/



@@ -33,7 +70,6 @@ void cuda_fft(
	double * grid,
	double * gridss)
{

#ifdef __CUDACC__

	cufftDoubleComplex *fftwgrid;
@@ -41,34 +77,47 @@ void cuda_fft(
	cudaError_t mmm;
	cufftResult_t status;

	double * grid_g;
	//double * gridss_g;


	// Define the CUDA set up
	int Nth = NTHREADS;
	long Nbl = (long)(num_w_planes*xaxis*yaxis/Nth) + 1;
	if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
	//long Nvis = num_points*freq_per_chan*polarizations;
	///printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
	fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y);
	mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis);
	mmm=cudaMalloc(&grid_g, sizeof(double)*2*num_w_planes*xaxis*yaxis);
	//mmm=cudaMalloc(&gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis);
	mmm=cudaMemcpy(grid_g, grid, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice);
	cudaDeviceSynchronize();


	clock_t start, end, startcu, endcu;
	double cufft_exec_time;
        struct timespec begin, finish, begin0, begink, finishk, begincu, finishcu;

	mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y);
	if (mmm != cudaSuccess) {printf("cudaMalloc error %d\n",mmm);}
	//mmm=cudaMemset(fftwgrid_g, 0.0, 2*grid_size_x*grid_size_y*sizeof(cufftDoubleComplex));
	printf("malloc fftwgrid_g\n");


	// Define the CUDA set up
	//int Nth = NTHREADS;
	long Nbl = (long)(xaxis+yaxis*xaxis);
	//if(NWORKERS == 1) {Nbl = 1; Nth = 1;};

	fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis);

        cufftHandle plan;
	cufftCreate(&plan);
        cufftPlan1d(&plan, grid_size_x*grid_size_y, CUFFT_Z2Z, 1);
        cufftPlan1d(&plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1);
	printf("plan creation\n");
        double norm = 1.0/(double)(grid_size_x*grid_size_y);
        long fftwindex = 0;
        long fftwindex2D = 0;


	write_grid<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, grid_g);
	cudaDeviceSynchronize();

/*

        for (int iw=0; iw<num_w_planes; iw++)
        {
            printf("select the w-plane to transform\n");
            printf("select the %d w-plane to transform\n", iw);
            for (int iv=0; iv<yaxis; iv++)
            {
               for (int iu=0; iu<xaxis; iu++)
@@ -79,28 +128,43 @@ void cuda_fft(
		   fftwgrid[fftwindex2D].y = grid[fftwindex+1];
               }
            }

*/
	    //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137].x);


	    mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyHostToDevice);
            if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);}
	    printf("memcpy fftwgrid to fftwgrid_g\n");
	    //mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice);
	    //if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);}
	    //printf("memcpy fftwgrid to fftwgrid_g\n");


            printf("do the transform for each w-plane\n");


	    clock_gettime(CLOCK_MONOTONIC, &begincu);
	    startcu = clock();

            status=cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);
	    if (status != CUFFT_SUCCESS) {printf("cufftexec error %d\n",status);}

	    endcu = clock();
	    clock_gettime(CLOCK_MONOTONIC, &finishcu);
	    cufft_exec_time = ((double) (endcu-startcu)) / CLOCKS_PER_SEC;
	    printf("Time for cuFFT plan execution is %f sec\n", cufft_exec_time);
	    if (status != CUFFT_SUCCESS) {printf("cufftexec ERROR %d\n",status);}
	    cudaDeviceSynchronize();


            mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyDeviceToHost);
	    if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);}
	    printf("memcpy fftwgrid_g to fftwgrid\n");
	    //printf("fftwgrid[1260137].x post cuFFT %f\n", fftwgrid[1260137].x);
	    //write_gridss<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, gridss_g, grid_size_x, grid_size_y);
	    //cudaDeviceSynchronize();

	    //mmm=cudaMemcpy(gridss, gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost);


            mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost);
	    if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);}


	for (int iw=0; iw<num_w_planes; iw++)
	{
            // save the transformed w-plane
            for (int iv=0; iv<yaxis; iv++)
            {
@@ -118,5 +182,7 @@ void cuda_fft(
        cufftDestroy(plan);

	mmm = cudaFree(fftwgrid_g);
	mmm = cudaFree(grid_g);
	//mmm = cudaFree(gridss_g);
#endif
}
+107 −17
Original line number Diff line number Diff line
@@ -104,8 +104,8 @@ int main(int argc, char * argv[])
	double resolution;

        // Mesh related parameters: global size
	int grid_size_x = 2048;
	int grid_size_y = 2048;
	int grid_size_x = 4096;
	int grid_size_y = 4096;
	// Split Mesh size (auto-calculated)
	int local_grid_size_x;
	int local_grid_size_y;
@@ -113,7 +113,7 @@ int main(int argc, char * argv[])
	int yaxis;

	// Number of planes in the w direction
	int num_w_planes = 8;
	int num_w_planes = 1;

	// Size of the convolutional kernel support
	int w_support = 7;
@@ -132,8 +132,8 @@ int main(int argc, char * argv[])
	fitsfile *fptreal;
	fitsfile *fptrimg;
	int status;
	char testfitsreal[FILENAMELENGTH] = "parallel_np2_real.fits";
	char testfitsimag[FILENAMELENGTH] = "parallel_np2_img.fits";
	char testfitsreal[FILENAMELENGTH] = "parallel_np8_real.fits";
	char testfitsimag[FILENAMELENGTH] = "parallel_np8_img.fits";

	long naxis = 2;
	long naxes[2] = { grid_size_x, grid_size_y };
@@ -141,11 +141,11 @@ int main(int argc, char * argv[])


	// Internal profiling parameters 
	clock_t start, end, start0, startk, endk;
	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
	double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1;
	clock_t start, end, start0, startk, endk, startis, endis;
	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time;
	double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, cufft_time1, phase_time1, image_time1;
	double writetime, writetime1;
	struct timespec begin, finish, begin0, begink, finishk;
	struct timespec begin, finish, begin0, begink, finishk, beginis, finishis;
	double elapsed;

        // Number of sectors in which the mesh is divided along the v direction
@@ -179,9 +179,11 @@ int main(int argc, char * argv[])
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	if(rank == 0)printf("Running with %d MPI tasks\n",size);
  #ifdef USE_FFTW
  #ifndef CUDA_FFT
	// Initialize FFTW environment
	fftw_mpi_init();
  #endif
  #endif
#else
	// If MPI is not used the two parameters are set for consistency
	rank = 0;
@@ -786,9 +788,56 @@ if(rank == 0){


#ifdef USE_FFTW


#ifdef CUDA_FFT

	// FFT transform the data using cuFFT
	if(rank==0)printf("PERFORMING CUDA FFT\n");
	clock_gettime(CLOCK_MONOTONIC, &begin);
	start = clock();
/*
        for (int isector=0; isector<size; isector++)
        {
            #ifdef USE_MPI
            MPI_Barrier(MPI_COMM_WORLD);
            #endif
            if(isector == rank)
            {
*/
	cuda_fft(
	num_w_planes,
	grid_size_x,
	grid_size_y,
	xaxis,
	yaxis,
	grid,
	gridss);

//}
//}





	end = clock();
	clock_gettime(CLOCK_MONOTONIC, &finish);
	cufft_time = ((double) (end - start)) / CLOCKS_PER_SEC;
        cufft_time1 = (finish.tv_sec - begin.tv_sec);
        cufft_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;




#else


	// FFT transform the data (using distributed FFTW)

	if(rank == 0)printf("PERFORMING FFT\n");
	//printf("Pre gridss[1] %f \n", gridss[1]);

        clock_gettime(CLOCK_MONOTONIC, &begin);
        start = clock();
        fftw_plan plan;
@@ -821,9 +870,24 @@ if(rank == 0){
	       }
	    }


            // do the transform for each w-plane
	    //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137][1]);


	    clock_gettime(CLOCK_MONOTONIC, &beginis);
	    startis = clock();

	    fftw_execute(plan);

	    endis = clock();
	    clock_gettime(CLOCK_MONOTONIC, &finishis);
	    mpifftw_time = ((double) (endis-startis)) / CLOCKS_PER_SEC;
	    printf("Time for FFTW MPI is %f sec\n", mpifftw_time);



	    //printf("fftwgrid[1260137].x post cuFFT %f\n", fftwgrid[1260137][0]);
	    // save the transformed w-plane
            for (int iv=0; iv<yaxis; iv++)
            {
@@ -839,7 +903,7 @@ if(rank == 0){
	}

        fftw_destroy_plan(plan);

	//printf("Post gridss[1] %f \n", gridss[1]);
        #ifdef USE_MPI
        MPI_Win_fence(0,slabwin);
	MPI_Barrier(MPI_COMM_WORLD);
@@ -852,6 +916,8 @@ if(rank == 0){
        fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
        clock_gettime(CLOCK_MONOTONIC, &begin);

#endif //CUDA_FFT

#ifdef WRITE_DATA
        // Write results
        #ifdef USE_MPI
@@ -927,8 +993,10 @@ if(rank == 0){
        #endif
#endif //WRITE_DATA

#ifndef CUDA_FFT
//#else
	fftw_free(fftwgrid);

#endif
	// Phase correction
        clock_gettime(CLOCK_MONOTONIC, &begin);
        start = clock();
@@ -944,7 +1012,8 @@ if(rank == 0){
        phase_time1 = (finish.tv_sec - begin.tv_sec);
        phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
#ifdef WRITE_IMAGE

        clock_gettime(CLOCK_MONOTONIC, &begin);
	start = clock();
        if(rank == 0)
        {
            #ifdef FITSIO
@@ -1066,10 +1135,15 @@ if(rank == 0){
	MPI_Barrier(MPI_COMM_WORLD);
        #endif

	end = clock();
	clock_gettime(CLOCK_MONOTONIC, &finish);
	image_time = ((double) (end - start)) / CLOCKS_PER_SEC;
	image_time1 = (finish.tv_sec - begin0.tv_sec);
	image_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0;

#endif //WRITE_IMAGE


//#endif //CUDA_FFT
#endif //USE_FFTW

	end = clock();
@@ -1084,8 +1158,16 @@ if(rank == 0){
          printf("Process time:  %f sec\n",process_time);
          printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time);
#ifdef USE_FFTW
#ifdef CUDA_FFT
          printf("cuFFT time:     %f sec\n",cufft_time);
          printf("Phase time:    %f sec\n",phase_time);
#else
	  printf("FFTW time:	%f sec\n", fftw_time);
	  printf("Phase time:	%f sec\n", phase_time);
#endif
#endif
#ifdef WRITE_IMAGE
	  printf("Write image time:	%f sec\n", image_time);
#endif
          printf("TOT time:      %f sec\n",tot_time);
	  if(num_threads > 1)
@@ -1094,8 +1176,16 @@ if(rank == 0){
            printf("PProcess time: %f sec\n",process_time1);
            printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1);
#ifdef USE_FFTW
#ifdef CUDA_FFT
	    printf("PcuFFT time:    %f sec\n", cufft_time1);
	    printf("PPhase time:    %f sec\n", phase_time1);
#else
            printf("PFFTW time:    %f sec\n",fftw_time1);
            printf("PPhase time:   %f sec\n",phase_time1);
#endif
#endif
#ifdef WRITE_IMAGE
	    printf("PWrite image time:	%f sec\n", image_time1);
#endif
            printf("PTOT time:     %f sec\n",tot_time1);
	  }
@@ -1106,9 +1196,9 @@ if(rank == 0){
	 pFile = fopen (timingfile,"w");
	 if (num_threads == 1)
         {
	   fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,tot_time);
	   fprintf(pFile, "%f %f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,image_time,tot_time);
	 } else {
	   fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,tot_time1);
	   fprintf(pFile, "%f %f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,image_time1,tot_time1);
	 }  
	 fclose(pFile);
	} 
+24 −1
Original line number Diff line number Diff line
@@ -5,6 +5,13 @@
#define NTHREADS 32
#define PI 3.14159265359
#define REAL_TYPE double

//#include <cufft.h>
//#include <cufftw.h>




#ifdef __CUDACC__
extern "C"
#endif
@@ -49,4 +56,20 @@ void phase_correction(
     double,
     double,
     int);



#ifdef __CUDACC__
extern "C"
#endif

// FFT-transform the gridded visibilities on the GPU via cuFFT.
// Parameter order matches the call site in w-stacking-fftw.c:
//   num_w_planes, grid_size_x, grid_size_y, xaxis, yaxis, grid, gridss
// Both buffers hold interleaved (re,im) doubles of length
// 2*num_w_planes*xaxis*yaxis (per the cudaMemcpy sizes inside cuda_fft).
void cuda_fft(
	int,      /* num_w_planes: number of w-stacking planes */
	int,      /* grid_size_x: global mesh size, x */
	int,      /* grid_size_y: global mesh size, y */
	int,      /* xaxis: local (per-task) grid size, x */
	int,      /* yaxis: local (per-task) grid size, y */
	double*,  /* grid: input, interleaved re/im values */
	double*); /* gridss: output, transformed planes */

#endif