Commit ad9f3647 authored by Emanuele De Rubeis
Browse files

CUDA casting bug fixed

parent c66c4671
Loading
Loading
Loading
Loading
+29 −8
Original line number Diff line number Diff line
@@ -9,8 +9,14 @@ OPT_PURE_MPI = -O4 -march=native -mavx -mavx2

OMP_GPU = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80

CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include
CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs
###CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include
###CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs


CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/include
CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/lib -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/targets/x86_64-linux/lib/stubs



FFTW_INCL=
FFTW_LIB=
@@ -19,18 +25,33 @@ FFTW_LIB=
##########################################################
#NVIDIA CUFFTMP

CUFFTMP_LIB  = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64
CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp
###CUFFTMP_LIB  = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64
###CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp

CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/math_libs/12.3/include/cufftmp
CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/math_libs/12.3/lib64


##########################################################

NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/
NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/
###NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/
###NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/

NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nvshmem/include
NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nvshmem/lib


##########################################################
#NVIDIA NCCL REDUCE

NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include
NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib
###NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include
###NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib


NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nccl/include
NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nccl/lib


##########################################################

NVC = nvc 
+0 −1
Original line number Diff line number Diff line
@@ -175,4 +175,3 @@ extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *grid

extern MPI_Comm MYMPI_COMM_WORLD;
extern MPI_Win  slabwin;
+0 −1
Original line number Diff line number Diff line
@@ -173,4 +173,3 @@ extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *grid

extern MPI_Comm MYMPI_COMM_WORLD;
extern MPI_Win  slabwin;
+6 −3
Original line number Diff line number Diff line
@@ -89,7 +89,10 @@ void cuda_fft(

  
  // Allocate fftwgrid on the GPU using cudaMalloc
  mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*yaxis*xaxis);

  long long unsigned size_finta_fft = (long long unsigned)((long long unsigned)xaxis*(long long unsigned)yaxis);

  mmm=cudaMalloc(&fftwgrid, (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)));
  if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);}

  int Nth = 32;
@@ -149,7 +152,7 @@ void cuda_fft(
      cudaStreamSynchronize(stream);

      //Copy the array to be transformed onto the descriptor structure array 
      mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
      mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)), cudaMemcpyDeviceToDevice);
      if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 1 ERROR %d !!!\n", mmm);}

      //Perform the FFT
@@ -166,7 +169,7 @@ void cuda_fft(
      if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);}

      //Copy the result descriptor structure array again onto the original fftwgrid
      mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
      mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)), cudaMemcpyDeviceToDevice);
      if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 2 ERROR %d !!!\n", mmm);}

      //Write gridss starting from fftwgrid
+28 −26
Original line number Diff line number Diff line
@@ -95,10 +95,12 @@ void gridding_data(){

  cudaSetDevice(local_rank);

  nnn = cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  long long unsigned size_finta = (long long unsigned)(2*(long long unsigned)param.num_w_planes*(long long unsigned)xaxis*(long long unsigned)yaxis); 
  
  nnn = cudaMalloc(&grid_gpu, (size_t)(size_finta*sizeof(double)));
  if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &grid_gpu ERROR %d !!!\n", nnn);}

  nnn = cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  nnn = cudaMalloc(&gridss_gpu, (size_t)(size_finta*sizeof(double)));
  if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &gridss_gpu ERROR %d !!!\n", nnn);}
  
  nnn = cudaStreamCreate(&stream_reduce);
@@ -203,7 +205,7 @@ void gridding_data(){
	    
     //We have to call different GPUs per MPI task!!! [GL]
#ifdef CUDACC
      wstack(param.num_w_planes,
      wstack((long long unsigned)param.num_w_planes,
             Nsec,
             metaData.freq_per_chan,
             metaData.polarisations,
@@ -216,8 +218,8 @@ void gridding_data(){
             dx,
             dw,
             param.w_support,
	     xaxis,
	     yaxis,
             (long long unsigned)xaxis,
             (long long unsigned)yaxis,
             gridss_gpu,
             param.num_threads,
             rank,
@@ -269,7 +271,7 @@ void gridding_data(){
	  timing_wt.reduce += CPU_TIME_wt - start;

	  // Go to next sector
	  nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
	  nnn = cudaMemset( gridss_gpu, 0.0, (size_t)(size_finta*sizeof(double)) );
	  if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMemset ERROR %d !!!\n", nnn);}
	}

@@ -282,7 +284,7 @@ void gridding_data(){
  //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
  
#if !defined(CUFFTMP)
  cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
  cudaMemcpyAsync(grid, grid_gpu, (size_t)(size_finta*sizeof(double)), cudaMemcpyDeviceToHost, stream_reduce);
  cudaStreamSynchronize(stream_reduce);
#endif

Loading