Loading Build/Makefile.leo +29 −8 Original line number Diff line number Diff line Loading @@ -9,8 +9,14 @@ OPT_PURE_MPI = -O4 -march=native -mavx -mavx2 OMP_GPU = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80 CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs ###CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include ###CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/include CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/lib -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/targets/x86_64-linux/lib/stubs FFTW_INCL= FFTW_LIB= Loading @@ -19,18 +25,33 @@ FFTW_LIB= ########################################################## #NVIDIA CUFFTMP CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64 CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp ###CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64 ###CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/math_libs/12.3/include/cufftmp CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/math_libs/12.3/lib64 ########################################################## NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/ NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/ ###NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/ ###NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/ NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nvshmem/include NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nvshmem/lib ########################################################## #NVIDIA NCCL REDUCE NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib ###NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include ###NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nccl/include NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nccl/lib ########################################################## NVC = nvc Loading allvars.h +0 −1 Original line number Diff line number Diff line Loading @@ -175,4 +175,3 @@ extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *grid extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin; allvars_nccl.h +0 −1 Original line number Diff line number Diff line Loading @@ -173,4 +173,3 @@ extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *grid extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin; cuda_fft.cu +6 −3 Original line number Diff line number Diff line Loading @@ -89,7 +89,10 @@ void cuda_fft( // Alloco fftwgrid su GPU utilizzando cudaMalloc mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*yaxis*xaxis); long long unsigned size_finta_fft = (long long unsigned)((long long unsigned)xaxis*(long long unsigned)yaxis); mmm=cudaMalloc(&fftwgrid, (size_t)(size_finta_fft*sizeof(cufftDoubleComplex))); if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);} int Nth = 32; Loading Loading @@ -149,7 +152,7 @@ void cuda_fft( cudaStreamSynchronize(stream); //Copy the array to be transformed onto the descriptor structure array mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)), cudaMemcpyDeviceToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 1 ERROR %d !!!\n", mmm);} //Perform the FFT Loading @@ -166,7 +169,7 @@ void cuda_fft( if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);} //Copy the result descriptor structure array again onto the original fftwgrid mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)), cudaMemcpyDeviceToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 2 ERROR %d !!!\n", mmm);} //Write gridss starting from fftwgrid Loading gridding_nccl.cu +28 −26 Original line number Diff line number Diff line Loading @@ -95,10 +95,12 @@ void gridding_data(){ cudaSetDevice(local_rank); nnn = cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); long long unsigned size_finta = (long long unsigned)(2*(long long unsigned)param.num_w_planes*(long long unsigned)xaxis*(long long unsigned)yaxis); nnn = cudaMalloc(&grid_gpu, (size_t)(size_finta*sizeof(double))); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &grid_gpu ERROR %d !!!\n", nnn);} nnn = cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); nnn = cudaMalloc(&gridss_gpu, (size_t)(size_finta*sizeof(double))); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &gridss_gpu ERROR %d !!!\n", nnn);} nnn = cudaStreamCreate(&stream_reduce); Loading Loading @@ -203,7 +205,7 @@ void gridding_data(){ //We have to call different GPUs per MPI task!!! [GL] #ifdef CUDACC wstack(param.num_w_planes, wstack((long long unsigned)param.num_w_planes, Nsec, metaData.freq_per_chan, metaData.polarisations, Loading @@ -216,8 +218,8 @@ void gridding_data(){ dx, dw, param.w_support, xaxis, yaxis, (long long unsigned)xaxis, (long long unsigned)yaxis, gridss_gpu, param.num_threads, rank, Loading Loading @@ -269,7 +271,7 @@ void gridding_data(){ timing_wt.reduce += CPU_TIME_wt - start; // Go to next sector nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); nnn = cudaMemset( gridss_gpu, 0.0, (size_t)(size_finta*sizeof(double)) ); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMemset ERROR %d !!!\n", nnn);} } Loading @@ -282,7 +284,7 @@ void gridding_data(){ //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); #if !defined(CUFFTMP) cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); cudaMemcpyAsync(grid, grid_gpu, (size_t)(size_finta*sizeof(double)), cudaMemcpyDeviceToHost, stream_reduce); cudaStreamSynchronize(stream_reduce); #endif Loading Loading
Build/Makefile.leo +29 −8 Original line number Diff line number Diff line Loading @@ -9,8 +9,14 @@ OPT_PURE_MPI = -O4 -march=native -mavx -mavx2 OMP_GPU = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80 CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs ###CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include ###CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/include CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/lib -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/targets/x86_64-linux/lib/stubs FFTW_INCL= FFTW_LIB= Loading @@ -19,18 +25,33 @@ FFTW_LIB= ########################################################## #NVIDIA CUFFTMP CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64 CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp ###CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64 ###CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/math_libs/12.3/include/cufftmp CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/math_libs/12.3/lib64 ########################################################## NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/ NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/ ###NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/ ###NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/ NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nvshmem/include NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nvshmem/lib ########################################################## #NVIDIA NCCL REDUCE NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib ###NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include ###NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nccl/include NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nccl/lib ########################################################## NVC = nvc Loading
allvars.h +0 −1 Original line number Diff line number Diff line Loading @@ -175,4 +175,3 @@ extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *grid extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin;
allvars_nccl.h +0 −1 Original line number Diff line number Diff line Loading @@ -173,4 +173,3 @@ extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *grid extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin;
cuda_fft.cu +6 −3 Original line number Diff line number Diff line Loading @@ -89,7 +89,10 @@ void cuda_fft( // Alloco fftwgrid su GPU utilizzando cudaMalloc mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*yaxis*xaxis); long long unsigned size_finta_fft = (long long unsigned)((long long unsigned)xaxis*(long long unsigned)yaxis); mmm=cudaMalloc(&fftwgrid, (size_t)(size_finta_fft*sizeof(cufftDoubleComplex))); if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);} int Nth = 32; Loading Loading @@ -149,7 +152,7 @@ void cuda_fft( cudaStreamSynchronize(stream); //Copy the array to be transformed onto the descriptor structure array mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)), cudaMemcpyDeviceToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 1 ERROR %d !!!\n", mmm);} //Perform the FFT Loading @@ -166,7 +169,7 @@ void cuda_fft( if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);} //Copy the result descriptor structure array again onto the original fftwgrid mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)), cudaMemcpyDeviceToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 2 ERROR %d !!!\n", mmm);} //Write gridss starting from fftwgrid Loading
gridding_nccl.cu +28 −26 Original line number Diff line number Diff line Loading @@ -95,10 +95,12 @@ void gridding_data(){ cudaSetDevice(local_rank); nnn = cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); long long unsigned size_finta = (long long unsigned)(2*(long long unsigned)param.num_w_planes*(long long unsigned)xaxis*(long long unsigned)yaxis); nnn = cudaMalloc(&grid_gpu, (size_t)(size_finta*sizeof(double))); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &grid_gpu ERROR %d !!!\n", nnn);} nnn = cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); nnn = cudaMalloc(&gridss_gpu, (size_t)(size_finta*sizeof(double))); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &gridss_gpu ERROR %d !!!\n", nnn);} nnn = cudaStreamCreate(&stream_reduce); Loading Loading @@ -203,7 +205,7 @@ void gridding_data(){ //We have to call different GPUs per MPI task!!! [GL] #ifdef CUDACC wstack(param.num_w_planes, wstack((long long unsigned)param.num_w_planes, Nsec, metaData.freq_per_chan, metaData.polarisations, Loading @@ -216,8 +218,8 @@ void gridding_data(){ dx, dw, param.w_support, xaxis, yaxis, (long long unsigned)xaxis, (long long unsigned)yaxis, gridss_gpu, param.num_threads, rank, Loading Loading @@ -269,7 +271,7 @@ void gridding_data(){ timing_wt.reduce += CPU_TIME_wt - start; // Go to next sector nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); nnn = cudaMemset( gridss_gpu, 0.0, (size_t)(size_finta*sizeof(double)) ); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMemset ERROR %d !!!\n", nnn);} } Loading @@ -282,7 +284,7 @@ void gridding_data(){ //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); #if !defined(CUFFTMP) cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); cudaMemcpyAsync(grid, grid_gpu, (size_t)(size_finta*sizeof(double)), cudaMemcpyDeviceToHost, stream_reduce); cudaStreamSynchronize(stream_reduce); #endif Loading