Compilation updates - CPU/GPU interoperability (c66c4671) · Commits · Claudio Gheller / HPC_Imaging

Makefile

+8 −8

Original line number	Diff line number	Diff line
		@@ -20,8 +20,8 @@ endif

		LINKER=$(MPICC)

		FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.5/x86_rome/include
		FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.5/x86_rome/lib
		FFTW_MPI_INC =
		FFTW_MPI_LIB =

		CFLAGS += -I./

		@@ -63,11 +63,11 @@ OPT += -DPHASE_ON
		# SELECT THE GRIDDING KERNEL: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL


		#OPT += -DGAUSS_HI_PRECISION
		OPT += -DGAUSS_HI_PRECISION

		#OPT += -DGAUSS

		OPT += -DKAISERBESSEL
		#OPT += -DKAISERBESSEL

		# ========================================================

		@@ -77,16 +77,16 @@ OPT += -DKAISERBESSEL
		#OPT += -DNVIDIA

		# use CUDA for GPUs
		#OPT += -DCUDACC
		OPT += -DCUDACC

		# use GPU acceleration via OMP
		#OPT += -DACCOMP

		# perform stacking on GPUs
		#OPT += -DGPU_STACKING
		OPT += -DGPU_STACKING

		# use NVIDIA GPU to perform the reduce
		#OPT += -DNCCL_REDUCE
		OPT += -DNCCL_REDUCE

		# use GPU to perform FFT
		#OPT += -DCUFFTMP
		@@ -109,7 +109,7 @@ endif
		#OPT += -DRCCL_REDUCE

		# FULL AMD GPU SUPPORT - Recommended for full AMD GPU code execution
		OPT += -DFULL_AMD
		#OPT += -DFULL_AMD
		ifeq (FULL_AMD,$(findstring FULL_AMD,$(OPT)))
		OPT += -DHIPCC -DRCCL_REDUCE -D__HIP_PLATFORM_AMD__
		endif

cuda_fft.cu

+13 −10

Original line number	Diff line number	Diff line
		@@ -30,7 +30,6 @@ __global__ void write_grid(
		fftwgrid[fftwindex2D].x = grid[fftwindex];
		fftwgrid[fftwindex2D].y = grid[fftwindex+1];
		}

		}


		@@ -88,8 +87,9 @@ void cuda_fft(

		cufftDoubleComplex *fftwgrid;


		// Alloco fftwgrid su GPU utilizzando cudaMalloc
		mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)2yaxis*xaxis);
		mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)yaxisxaxis);
		if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);}

		int Nth = 32;
		@@ -146,8 +146,11 @@ void cuda_fft(
		status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);}

		cudaStreamSynchronize(stream);

		//Copy the array to be transformed onto the descriptor structure array
		cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, 2xaxisyaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
		mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, xaxisyaxissizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
		if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 1 ERROR %d !!!\n", mmm);}

		//Perform the FFT
		status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);
		@@ -163,7 +166,8 @@ void cuda_fft(
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);}

		//Copy the result descriptor structure array again onto the original fftwgrid
		cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], 2xaxisyaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
		mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], xaxisyaxissizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
		if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 2 ERROR %d !!!\n", mmm);}

		//Write gridss starting from fftwgrid
		write_gridss<<<Nbl, Nth>>>(num_w_planes, xaxis, yaxis, fftwgrid, gridss, norm, iw);
		@@ -191,4 +195,3 @@ void cuda_fft(

		}
		#endif

fourier_transform.c

+1 −5

Original line number	Diff line number	Diff line
		@@ -104,8 +104,6 @@ void fftw_data ( void )
		// FFT transform the data using cuFFT
		if(rank==0)printf("PERFORMING CUDA FFT\n");

		MPI_Barrier(MPI_COMM_WORLD);

		double start = CPU_TIME_wt;


		@@ -120,8 +118,6 @@ void fftw_data ( void )
		rank,
		MPI_COMM_WORLD);

		MPI_Barrier(MPI_COMM_WORLD);

		timing_wt.cufftmp += CPU_TIME_wt - start;

		return;
		@@ -213,7 +209,7 @@ void write_fftw_data(){
		double* image_real = (double) calloc(xaxisyaxis,sizeof(double));
		double* image_imag = (double) calloc(xaxisyaxis,sizeof(double));

		#ifdef CUDACC
		#ifdef CUFFTMP
		phase_correction(gridss_gpu,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads,rank);
		#else
		phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads,rank);

gridding_nccl.cu

+22 −8

Original line number	Diff line number	Diff line
		@@ -95,10 +95,17 @@ void gridding_data(){

		cudaSetDevice(local_rank);

		cudaMalloc(&grid_gpu, 2param.num_w_planesxaxisyaxis sizeof(double));
		cudaMalloc(&gridss_gpu, 2param.num_w_planesxaxisyaxis sizeof(double));
		cudaStreamCreate(&stream_reduce);
		nnn = cudaMalloc(&grid_gpu, 2param.num_w_planesxaxisyaxis sizeof(double));
		if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &grid_gpu ERROR %d !!!\n", nnn);}

		nnn = cudaMalloc(&gridss_gpu, 2param.num_w_planesxaxisyaxis sizeof(double));
		if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &gridss_gpu ERROR %d !!!\n", nnn);}

		nnn = cudaStreamCreate(&stream_reduce);
		if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaStreamCreate &stream_reduce ERROR %d !!!\n", nnn);}

		cudaStreamCreate(&stream_stacking);
		if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaStreamCreate &stream_stacking ERROR %d !!!\n", nnn);}

		ncclCommInitRank(&comm, size, id, rank);

		@@ -263,7 +270,7 @@ void gridding_data(){

		// Go to next sector
		nnn = cudaMemset( gridss_gpu, 0.0, 2param.num_w_planesxaxisyaxis sizeof(double) );
		if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);}
		if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMemset ERROR %d !!!\n", nnn);}
		}

		free(memory);
		@@ -274,11 +281,18 @@ void gridding_data(){

		//cudaMemcpyAsync(grid, grid_gpu, 2param.num_w_planesxaxisyaxissizeof(double), cudaMemcpyDeviceToHost, stream_reduce);

		#if !defined(CUFFTMP)
		cudaMemcpyAsync(grid, grid_gpu, 2param.num_w_planesxaxisyaxissizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
		cudaStreamSynchronize(stream_reduce);
		#endif

		MPI_Barrier(MPI_COMM_WORLD);

		//cudaStreamSynchronize(stream_reduce);
		// cudaFree(grid_gpu);
		//cudaFree(gridss_gpu);
		#if !defined(CUFFTMP)
		cudaFree(grid_gpu);
		cudaFree(gridss_gpu);
		#endif

		cudaStreamDestroy(stream_reduce);
		cudaStreamDestroy(stream_stacking);

phase_correction.cu

+18 −1

Original line number	Diff line number	Diff line
		@@ -129,6 +129,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
		double * image_real_g;
		double * image_imag_g;

		#if !defined(CUFFTMP)
		double * gridss_g;

		mmm = cudaMalloc(&gridss_g, 2num_w_planesxaxisyaxissizeof(double));
		mmm = cudaMemcpy(gridss_g, gridss, 2num_w_planesxaxisyaxissizeof(double), cudaMemcpyHostToDevice);
		#endif

		mmm=cudaMalloc(&image_real_g, xaxisyaxissizeof(double));
		//printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm));
		mmm=cudaMalloc(&image_imag_g, xaxisyaxissizeof(double));
		@@ -144,7 +151,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
		phase_g <<<Nbl,Nth>>> (xaxis,
		yaxis,
		num_w_planes,
		#if defined(CUFFTMP)
		gridss,
		#else
		gridss_g,
		#endif
		image_real_g,
		image_imag_g,
		wmin,
		@@ -160,7 +171,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
		mmm = cudaMemcpy(image_imag, image_imag_g, xaxisyaxissizeof(double), cudaMemcpyDeviceToHost);
		//printf("CUDA ERROR 8 %s\n",cudaGetErrorString(mmm));

		mmm= cudaFree(gridss);
		#if !defined(CUFFTMP)
		cudaFree(gridss_g);
		#else
		cudaFree(gridss);
		#endif


		#else

		#ifndef ACCOMP