Commit fead89a6 authored by Emanuele De Rubeis's avatar Emanuele De Rubeis
Browse files

cuFFT implementation

parent a7e1860b
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -15,10 +15,11 @@ FITSIO_LIB= /m100_work/IscrC_CD-DLS/cfitsio-3.49

NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
CUFFT_INCL= -I/cineca/prod/opt/compilers/cuda/10.1/none/include/
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda -lcufft

OMP= -fopenmp

CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) $(FITSIO_INCL)
CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) $(FITSIO_INCL) $(CUFFT_INCL)

OPTIMIZE = $(OMP) -O3 -mtune=native
+10 −8
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ include Build/Makefile.systype
endif


LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart  -lcuda
LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda -lcufft

# create MPI code
OPT += -DUSE_MPI
@@ -22,10 +22,10 @@ OPT += -DUSE_MPI
# use FFTW (it can be switched on ONLY if MPI is active)
ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
   OPT += -DUSE_FFTW
	 LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
	 LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm -lcuda -lcudart -lcufft
endif

#OPT += -DNVIDIA
OPT += -DNVIDIA
# perform one-side communication (suggested) instead of reduce (only if MPI is active)
#OPT += -DONE_SIDE
# write the full 3D cube of gridded visibilities and its FFT transform
@@ -34,17 +34,19 @@ endif
OPT += -DWRITE_IMAGE
# perform w-stacking phase correction
OPT += -DPHASE_ON
# Support FFT using cuFFT
OPT += -DCUDA_FFT
# Support CFITSIO
OPT += -DFITSIO
# Perform true parallel images writing
#OPT += -DPARALLELIO
OPT += -DPARALLELIO

ifeq (FITSIO,$(findstring FITSIO,$(OPT)))
	LIBS += -L$(FITSIO_LIB) -lcfitsio
endif

DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu cuda_fft.cu
COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o cuda_fft.o

w-stacking.c: w-stacking.cu
	cp w-stacking.cu w-stacking.c
@@ -81,9 +83,9 @@ mpi: $(COBJ)
	$(MPICC) $(OPTIMIZE) -o w-stackingCfftw   $^  $(CFLAGS) $(LIBS)

mpi_cuda:
	$(NVCC)   $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
	$(NVCC)   $(NVFLAGS) -c w-stacking.cu phase_correction.cu cuda_fft.cu $(NVLIB) $(CUFFT_INCL)
	$(MPICC)  $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
	$(MPIC++) $(OPTIMIZE) $(OPT)   -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS)
	$(MPIC++) $(OPTIMIZE) $(OPT)   -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o cuda_fft.o $(NVLIB) $(CFLAGS) $(LIBS)

clean:
	rm *.o
+90 −24
Original line number Diff line number Diff line
@@ -17,7 +17,44 @@ double
*/


//__global__ void write_grid
#ifdef __CUDACC__
// Pack the interleaved real/imaginary double grid into a cufftDoubleComplex
// array ahead of the cuFFT transform.
//
// Expected launch layout (see cuda_fft): <<<Nbl, num_w_planes>>>, i.e.
// blockIdx.x indexes the 2D (u,v) cell and threadIdx.x indexes the w-plane
// — TODO confirm against the launch site, which uses Nbl = xaxis+yaxis*xaxis.
//
// Fixes vs. previous revision:
//  - the destination index ignored threadIdx.x, so every w-plane of a cell
//    raced on the same fftwgrid element and planes > 0 were lost; the
//    destination now uses the same flattened 3D index as the source.
//  - the launch allocates more blocks than the xaxis*yaxis cells of one
//    plane; out-of-range blocks now return early instead of writing out
//    of bounds.
__global__ void write_grid(
	int num_w_planes,
	int xaxis,
	int yaxis,
	cufftDoubleComplex *fftwgrid,
	double *grid)
{
	// Guard: grid is launched with Nbl = xaxis + yaxis*xaxis blocks,
	// which exceeds the xaxis*yaxis 2D cells of a plane.
	if ((long)blockIdx.x >= (long)xaxis*yaxis) return;

	// Flattened 3D index of this (u,v,w) cell.
	long cell3D = (long)blockIdx.x + (long)threadIdx.x*xaxis*yaxis;
	// grid stores (re,im) pairs as interleaved doubles.
	long src = 2*cell3D;

	fftwgrid[cell3D].x = grid[src];
	fftwgrid[cell3D].y = grid[src+1];
}
#endif


/*
#ifdef __CUDACC__
__global__ void write_gridss(
        int num_w_planes,
        int xaxis,
        int yaxis,
        cufftDoubleComplex *fftwgrid,
        double *gridss,
	int grid_size_x,
	int grid_size_y)
{
        long fftwindex2D = blockIdx.x;
        long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis);

	double norm = 1.0/(double)(grid_size_x*grid_size_y);

        gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x;
        gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y;
}
#endif
*/



@@ -33,7 +70,6 @@ void cuda_fft(
	double * grid,
	double * gridss)
{

#ifdef __CUDACC__

	cufftDoubleComplex *fftwgrid;
@@ -41,34 +77,47 @@ void cuda_fft(
	cudaError_t mmm;
	cufftResult_t status;

	double * grid_g;
	//double * gridss_g;


	// Define the CUDA set up
	int Nth = NTHREADS;
	long Nbl = (long)(num_w_planes*xaxis*yaxis/Nth) + 1;
	if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
	//long Nvis = num_points*freq_per_chan*polarizations;
	///printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
	fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y);
	mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis);
	mmm=cudaMalloc(&grid_g, sizeof(double)*2*num_w_planes*xaxis*yaxis);
	//mmm=cudaMalloc(&gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis);
	mmm=cudaMemcpy(grid_g, grid, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice);
	cudaDeviceSynchronize();


	clock_t start, end, startcu, endcu;
	double cufft_exec_time;
        struct timespec begin, finish, begin0, begink, finishk, begincu, finishcu;

	mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y);
	if (mmm != cudaSuccess) {printf("cudaMalloc error %d\n",mmm);}
	//mmm=cudaMemset(fftwgrid_g, 0.0, 2*grid_size_x*grid_size_y*sizeof(cufftDoubleComplex));
	printf("malloc fftwgrid_g\n");


	// Define the CUDA set up
	//int Nth = NTHREADS;
	long Nbl = (long)(xaxis+yaxis*xaxis);
	//if(NWORKERS == 1) {Nbl = 1; Nth = 1;};

	fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis);

        cufftHandle plan;
	cufftCreate(&plan);
        cufftPlan1d(&plan, grid_size_x*grid_size_y, CUFFT_Z2Z, 1);
        cufftPlan1d(&plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1);
	printf("plan creation\n");
        double norm = 1.0/(double)(grid_size_x*grid_size_y);
        long fftwindex = 0;
        long fftwindex2D = 0;


	write_grid<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, grid_g);
	cudaDeviceSynchronize();

/*

        for (int iw=0; iw<num_w_planes; iw++)
        {
            printf("select the w-plane to transform\n");
            printf("select the %d w-plane to transform\n", iw);
            for (int iv=0; iv<yaxis; iv++)
            {
               for (int iu=0; iu<xaxis; iu++)
@@ -79,28 +128,43 @@ void cuda_fft(
		   fftwgrid[fftwindex2D].y = grid[fftwindex+1];
               }
            }

*/
	    //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137].x);


	    mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyHostToDevice);
            if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);}
	    printf("memcpy fftwgrid to fftwgrid_g\n");
	    //mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice);
	    //if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);}
	    //printf("memcpy fftwgrid to fftwgrid_g\n");


            printf("do the transform for each w-plane\n");


	    clock_gettime(CLOCK_MONOTONIC, &begincu);
	    startcu = clock();

            status=cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);
	    if (status != CUFFT_SUCCESS) {printf("cufftexec error %d\n",status);}

	    endcu = clock();
	    clock_gettime(CLOCK_MONOTONIC, &finishcu);
	    cufft_exec_time = ((double) (endcu-startcu)) / CLOCKS_PER_SEC;
	    printf("Time for cuFFT plan execution is %f sec\n", cufft_exec_time);
	    if (status != CUFFT_SUCCESS) {printf("cufftexec ERROR %d\n",status);}
	    cudaDeviceSynchronize();


            mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyDeviceToHost);
	    if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);}
	    printf("memcpy fftwgrid_g to fftwgrid\n");
	    //printf("fftwgrid[1260137].x post cuFFT %f\n", fftwgrid[1260137].x);
	    //write_gridss<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, gridss_g, grid_size_x, grid_size_y);
	    //cudaDeviceSynchronize();

	    //mmm=cudaMemcpy(gridss, gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost);


            mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost);
	    if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);}


	for (int iw=0; iw<num_w_planes; iw++)
	{
            // save the transformed w-plane
            for (int iv=0; iv<yaxis; iv++)
            {
@@ -118,5 +182,7 @@ void cuda_fft(
        cufftDestroy(plan);

	mmm = cudaFree(fftwgrid_g);
	mmm = cudaFree(grid_g);
	//mmm = cudaFree(gridss_g);
#endif
}
+107 −17
Original line number Diff line number Diff line
@@ -104,8 +104,8 @@ int main(int argc, char * argv[])
	double resolution;

        // Mesh related parameters: global size
	int grid_size_x = 2048;
	int grid_size_y = 2048;
	int grid_size_x = 4096;
	int grid_size_y = 4096;
	// Split Mesh size (auto-calculated)
	int local_grid_size_x;
	int local_grid_size_y;
@@ -113,7 +113,7 @@ int main(int argc, char * argv[])
	int yaxis;

	// Number of planes in the w direction
	int num_w_planes = 8;
	int num_w_planes = 1;

	// Size of the convolutional kernel support
	int w_support = 7;
@@ -132,8 +132,8 @@ int main(int argc, char * argv[])
	fitsfile *fptreal;
	fitsfile *fptrimg;
	int status;
	char testfitsreal[FILENAMELENGTH] = "parallel_np2_real.fits";
	char testfitsimag[FILENAMELENGTH] = "parallel_np2_img.fits";
	char testfitsreal[FILENAMELENGTH] = "parallel_np8_real.fits";
	char testfitsimag[FILENAMELENGTH] = "parallel_np8_img.fits";

	long naxis = 2;
	long naxes[2] = { grid_size_x, grid_size_y };
@@ -141,11 +141,11 @@ int main(int argc, char * argv[])


	// Internal profiling parameters 
	clock_t start, end, start0, startk, endk;
	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
	double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1;
	clock_t start, end, start0, startk, endk, startis, endis;
	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time;
	double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, cufft_time1, phase_time1, image_time1;
	double writetime, writetime1;
	struct timespec begin, finish, begin0, begink, finishk;
	struct timespec begin, finish, begin0, begink, finishk, beginis, finishis;
	double elapsed;

        // Number of sectors in which the mesh is divided along the v direction
@@ -179,9 +179,11 @@ int main(int argc, char * argv[])
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	if(rank == 0)printf("Running with %d MPI tasks\n",size);
  #ifdef USE_FFTW
  #ifndef CUDA_FFT
	// Initialize FFTW environment
	fftw_mpi_init();
  #endif
  #endif
#else
	// If MPI is not used the two parameters are set for consistency
	rank = 0;
@@ -786,9 +788,56 @@ if(rank == 0){


#ifdef USE_FFTW


#ifdef CUDA_FFT

	// FFT transform the data using cuFFT
	if(rank==0)printf("PERFORMING CUDA FFT\n");
	clock_gettime(CLOCK_MONOTONIC, &begin);
	start = clock();
/*
        for (int isector=0; isector<size; isector++)
        {
            #ifdef USE_MPI
            MPI_Barrier(MPI_COMM_WORLD);
            #endif
            if(isector == rank)
            {
*/
	cuda_fft(
	num_w_planes,
	grid_size_x,
	grid_size_y,
	xaxis,
	yaxis,
	grid,
	gridss);

//}
//}





	end = clock();
	clock_gettime(CLOCK_MONOTONIC, &finish);
	cufft_time = ((double) (end - start)) / CLOCKS_PER_SEC;
        cufft_time1 = (finish.tv_sec - begin.tv_sec);
        cufft_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;




#else


	// FFT transform the data (using distributed FFTW)

	if(rank == 0)printf("PERFORMING FFT\n");
	//printf("Pre gridss[1] %f \n", gridss[1]);

        clock_gettime(CLOCK_MONOTONIC, &begin);
        start = clock();
        fftw_plan plan;
@@ -821,9 +870,24 @@ if(rank == 0){
	       }
	    }


            // do the transform for each w-plane
	    //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137][1]);


	    clock_gettime(CLOCK_MONOTONIC, &beginis);
	    startis = clock();

	    fftw_execute(plan);

	    endis = clock();
	    clock_gettime(CLOCK_MONOTONIC, &finishis);
	    mpifftw_time = ((double) (endis-startis)) / CLOCKS_PER_SEC;
	    printf("Time for FFTW MPI is %f sec\n", mpifftw_time);



	    //printf("fftwgrid[1260137].x post cuFFT %f\n", fftwgrid[1260137][0]);
	    // save the transformed w-plane
            for (int iv=0; iv<yaxis; iv++)
            {
@@ -839,7 +903,7 @@ if(rank == 0){
	}

        fftw_destroy_plan(plan);

	//printf("Post gridss[1] %f \n", gridss[1]);
        #ifdef USE_MPI
        MPI_Win_fence(0,slabwin);
	MPI_Barrier(MPI_COMM_WORLD);
@@ -852,6 +916,8 @@ if(rank == 0){
        fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
        clock_gettime(CLOCK_MONOTONIC, &begin);

#endif //CUDA_FFT

#ifdef WRITE_DATA
        // Write results
        #ifdef USE_MPI
@@ -927,8 +993,10 @@ if(rank == 0){
        #endif
#endif //WRITE_DATA

#ifndef CUDA_FFT
//#else
	fftw_free(fftwgrid);

#endif
	// Phase correction
        clock_gettime(CLOCK_MONOTONIC, &begin);
        start = clock();
@@ -944,7 +1012,8 @@ if(rank == 0){
        phase_time1 = (finish.tv_sec - begin.tv_sec);
        phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
#ifdef WRITE_IMAGE

        clock_gettime(CLOCK_MONOTONIC, &begin);
	start = clock();
        if(rank == 0)
        {
            #ifdef FITSIO
@@ -1066,10 +1135,15 @@ if(rank == 0){
	MPI_Barrier(MPI_COMM_WORLD);
        #endif

	end = clock();
	clock_gettime(CLOCK_MONOTONIC, &finish);
	image_time = ((double) (end - start)) / CLOCKS_PER_SEC;
	image_time1 = (finish.tv_sec - begin0.tv_sec);
	image_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0;

#endif //WRITE_IMAGE


//#endif //CUDA_FFT
#endif //USE_FFTW

	end = clock();
@@ -1084,8 +1158,16 @@ if(rank == 0){
          printf("Process time:  %f sec\n",process_time);
          printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time);
#ifdef USE_FFTW
#ifdef CUDA_FFT
          printf("cuFFT time:     %f sec\n",cufft_time);
          printf("Phase time:    %f sec\n",phase_time);
#else
	  printf("FFTW time:	%f sec\n", fftw_time);
	  printf("Phase time:	%f sec\n", phase_time);
#endif
#endif
#ifdef WRITE_IMAGE
	  printf("Write image time:	%f sec\n", image_time);
#endif
          printf("TOT time:      %f sec\n",tot_time);
	  if(num_threads > 1)
@@ -1094,8 +1176,16 @@ if(rank == 0){
            printf("PProcess time: %f sec\n",process_time1);
            printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1);
#ifdef USE_FFTW
#ifdef CUDA_FFT
	    printf("PcuFFT time:    %f sec\n", cufft_time1);
	    printf("PPhase time:    %f sec\n", phase_time1);
#else
            printf("PFFTW time:    %f sec\n",fftw_time1);
            printf("PPhase time:   %f sec\n",phase_time1);
#endif
#endif
#ifdef WRITE_IMAGE
	    printf("PWrite image time:	%f sec\n", image_time1);
#endif
            printf("PTOT time:     %f sec\n",tot_time1);
	  }
@@ -1106,9 +1196,9 @@ if(rank == 0){
	 pFile = fopen (timingfile,"w");
	 if (num_threads == 1)
         {
	   fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,tot_time);
	   fprintf(pFile, "%f %f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,image_time,tot_time);
	 } else {
	   fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,tot_time1);
	   fprintf(pFile, "%f %f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,image_time1,tot_time1);
	 }  
	 fclose(pFile);
	} 
+24 −1
Original line number Diff line number Diff line
@@ -5,6 +5,13 @@
#define NTHREADS 32
#define PI 3.14159265359
#define REAL_TYPE double

//#include <cufft.h>
//#include <cufftw.h>




#ifdef __CUDACC__
extern "C"
#endif
@@ -49,4 +56,20 @@ void phase_correction(
     double,
     double,
     int);



#ifdef __CUDACC__
extern "C"
#endif

// FFT-transform the gridded visibilities on the GPU via cuFFT.
// Parameter order matches the call site in w-stacking-fftw.c:
//   num_w_planes, grid_size_x, grid_size_y, xaxis, yaxis, grid, gridss
// Both buffers hold interleaved (re,im) doubles of length
// 2*num_w_planes*xaxis*yaxis (per the cudaMemcpy sizes inside cuda_fft).
void cuda_fft(
	int,      /* num_w_planes: number of w-stacking planes */
	int,      /* grid_size_x: global mesh size, x */
	int,      /* grid_size_y: global mesh size, y */
	int,      /* xaxis: local (per-task) grid size, x */
	int,      /* yaxis: local (per-task) grid size, y */
	double*,  /* grid: input, interleaved re/im values */
	double*); /* gridss: output, transformed planes */

#endif