Commit c66c4671 authored by Emanuele De Rubeis's avatar Emanuele De Rubeis
Browse files

Compilation updates - CPU/GPU interoperability

parent 84e3702e
Loading
Loading
Loading
Loading
+8 −8
Original line number Diff line number Diff line
@@ -20,8 +20,8 @@ endif

LINKER=$(MPICC)

FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.5/x86_rome/include
FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.5/x86_rome/lib
FFTW_MPI_INC = 
FFTW_MPI_LIB = 

CFLAGS += -I./

@@ -63,11 +63,11 @@ OPT += -DPHASE_ON
# SELECT THE GRIDDING KERNEL: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL


#OPT += -DGAUSS_HI_PRECISION
OPT += -DGAUSS_HI_PRECISION

#OPT += -DGAUSS

OPT += -DKAISERBESSEL
#OPT += -DKAISERBESSEL

# ========================================================

@@ -77,16 +77,16 @@ OPT += -DKAISERBESSEL
#OPT += -DNVIDIA

# use CUDA for GPUs
#OPT += -DCUDACC
OPT += -DCUDACC

# use GPU acceleration via OMP 
#OPT += -DACCOMP

# perform stacking on GPUs
#OPT += -DGPU_STACKING
OPT += -DGPU_STACKING

# use NVIDIA GPU to perform the reduce
#OPT += -DNCCL_REDUCE
OPT += -DNCCL_REDUCE

# use GPU to perform FFT
#OPT += -DCUFFTMP
@@ -109,7 +109,7 @@ endif
#OPT += -DRCCL_REDUCE

# FULL AMD GPU SUPPORT - Recommended for full AMD GPU code execution
OPT += -DFULL_AMD
#OPT += -DFULL_AMD
ifeq (FULL_AMD,$(findstring FULL_AMD,$(OPT)))
OPT += -DHIPCC -DRCCL_REDUCE -D__HIP_PLATFORM_AMD__
endif
+13 −10
Original line number Diff line number Diff line
@@ -30,7 +30,6 @@ __global__ void write_grid(
      fftwgrid[fftwindex2D].x = grid[fftwindex];
      fftwgrid[fftwindex2D].y = grid[fftwindex+1];
    }  
  
}


@@ -88,8 +87,9 @@ void cuda_fft(

  cufftDoubleComplex *fftwgrid;

  
  // Alloco fftwgrid su GPU utilizzando cudaMalloc
  mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*2*yaxis*xaxis);
  mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*yaxis*xaxis);
  if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);}

  int Nth = 32;
@@ -146,8 +146,11 @@ void cuda_fft(
      status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE);
      if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);}

      cudaStreamSynchronize(stream);

      //Copy the array to be transformed onto the descriptor structure array 
      cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, 2*xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
      mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
      if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 1 ERROR %d !!!\n", mmm);}

      //Perform the FFT
      status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);
@@ -163,7 +166,8 @@ void cuda_fft(
      if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);}

      //Copy the result descriptor structure array again onto the original fftwgrid
      cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], 2*xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
      mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
      if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 2 ERROR %d !!!\n", mmm);}

      //Write gridss starting from fftwgrid
      write_gridss<<<Nbl, Nth>>>(num_w_planes, xaxis, yaxis, fftwgrid, gridss, norm, iw);
@@ -191,4 +195,3 @@ void cuda_fft(

}
#endif
  
+1 −5
Original line number Diff line number Diff line
@@ -104,8 +104,6 @@ void fftw_data ( void )
  // FFT transform the data using cuFFT                                                                                                    
  if(rank==0)printf("PERFORMING CUDA FFT\n");

  MPI_Barrier(MPI_COMM_WORLD);

  double start = CPU_TIME_wt;

  
@@ -120,8 +118,6 @@ void fftw_data ( void )
	   rank,
	   MPI_COMM_WORLD);

  MPI_Barrier(MPI_COMM_WORLD);
 
  timing_wt.cufftmp += CPU_TIME_wt - start;

  return;
@@ -213,7 +209,7 @@ void write_fftw_data(){
  double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));
  double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));

#ifdef CUDACC
#ifdef CUFFTMP
  phase_correction(gridss_gpu,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads,rank);
#else
  phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads,rank);
+22 −8
Original line number Diff line number Diff line
@@ -95,10 +95,17 @@ void gridding_data(){

  cudaSetDevice(local_rank);

  cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  cudaStreamCreate(&stream_reduce);
  nnn = cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &grid_gpu ERROR %d !!!\n", nnn);}

  nnn = cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &gridss_gpu ERROR %d !!!\n", nnn);}
  
  nnn = cudaStreamCreate(&stream_reduce);
  if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaStreamCreate &stream_reduce ERROR %d !!!\n", nnn);}

  cudaStreamCreate(&stream_stacking);
  if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaStreamCreate &stream_stacking ERROR %d !!!\n", nnn);}

  ncclCommInitRank(&comm, size, id, rank);

@@ -263,7 +270,7 @@ void gridding_data(){

	  // Go to next sector
	  nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
	  if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);}
	  if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMemset ERROR %d !!!\n", nnn);}
	}

      free(memory);
@@ -274,11 +281,18 @@ void gridding_data(){
  
  //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
  
#if !defined(CUFFTMP)
  cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
  cudaStreamSynchronize(stream_reduce);
#endif

  MPI_Barrier(MPI_COMM_WORLD);

  //cudaStreamSynchronize(stream_reduce);
  //  cudaFree(grid_gpu);
  //cudaFree(gridss_gpu);
#if !defined(CUFFTMP)
  cudaFree(grid_gpu);
  cudaFree(gridss_gpu);
#endif

  cudaStreamDestroy(stream_reduce);
  cudaStreamDestroy(stream_stacking);
  
+18 −1
Original line number Diff line number Diff line
@@ -129,6 +129,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
	double * image_real_g;
	double * image_imag_g;

#if !defined(CUFFTMP)
        double * gridss_g;

	mmm = cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double));
        mmm = cudaMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice);
#endif
	
	mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
	//printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm));
	mmm=cudaMalloc(&image_imag_g, xaxis*yaxis*sizeof(double));
@@ -144,7 +151,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
	phase_g <<<Nbl,Nth>>> (xaxis,
                               yaxis,
			       num_w_planes,
#if defined(CUFFTMP)
                               gridss,
#else
			       gridss_g,
#endif
                               image_real_g,
                               image_imag_g,
                               wmin,
@@ -160,7 +171,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
	mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
	//printf("CUDA ERROR 8 %s\n",cudaGetErrorString(mmm));

	mmm= cudaFree(gridss);
#if !defined(CUFFTMP)
	cudaFree(gridss_g);
#else
	cudaFree(gridss);
#endif


#else

#ifndef ACCOMP
Loading