Loading Makefile +8 −8 Original line number Diff line number Diff line Loading @@ -20,8 +20,8 @@ endif LINKER=$(MPICC) FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.5/x86_rome/include FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.5/x86_rome/lib FFTW_MPI_INC = FFTW_MPI_LIB = CFLAGS += -I./ Loading Loading @@ -63,11 +63,11 @@ OPT += -DPHASE_ON # SELECT THE GRIDDING KERNEL: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL #OPT += -DGAUSS_HI_PRECISION OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS OPT += -DKAISERBESSEL #OPT += -DKAISERBESSEL # ======================================================== Loading @@ -77,16 +77,16 @@ OPT += -DKAISERBESSEL #OPT += -DNVIDIA # use CUDA for GPUs #OPT += -DCUDACC OPT += -DCUDACC # use GPU acceleration via OMP #OPT += -DACCOMP # perform stacking on GPUs #OPT += -DGPU_STACKING OPT += -DGPU_STACKING # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE OPT += -DNCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP Loading @@ -109,7 +109,7 @@ endif #OPT += -DRCCL_REDUCE # FULL AMD GPU SUPPORT - Recommended for full AMD GPU code execution OPT += -DFULL_AMD #OPT += -DFULL_AMD ifeq (FULL_AMD,$(findstring FULL_AMD,$(OPT))) OPT += -DHIPCC -DRCCL_REDUCE -D__HIP_PLATFORM_AMD__ endif Loading cuda_fft.cu +13 −10 Original line number Diff line number Diff line Loading @@ -30,7 +30,6 @@ __global__ void write_grid( fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } Loading Loading @@ -88,8 +87,9 @@ void cuda_fft( cufftDoubleComplex *fftwgrid; // Alloco fftwgrid su GPU utilizzando cudaMalloc mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*2*yaxis*xaxis); mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*yaxis*xaxis); if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);} int Nth = 32; Loading Loading @@ -146,8 +146,11 @@ void cuda_fft( status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);} cudaStreamSynchronize(stream); //Copy the array to be transformed onto the descriptor structure array cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, 2*xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 1 ERROR %d !!!\n", mmm);} //Perform the FFT status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); Loading @@ -163,7 +166,8 @@ void cuda_fft( if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);} //Copy the result descriptor structure array again onto the original fftwgrid cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], 2*xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 2 ERROR %d !!!\n", mmm);} //Write gridss starting from fftwgrid write_gridss<<<Nbl, Nth>>>(num_w_planes, xaxis, yaxis, fftwgrid, gridss, norm, iw); Loading Loading @@ -191,4 +195,3 @@ void cuda_fft( } #endif fourier_transform.c +1 −5 Original line number Diff line number Diff line Loading @@ -104,8 +104,6 @@ void fftw_data ( void ) // FFT transform the data using cuFFT if(rank==0)printf("PERFORMING CUDA FFT\n"); MPI_Barrier(MPI_COMM_WORLD); double start = CPU_TIME_wt; Loading @@ -120,8 +118,6 @@ void fftw_data ( void ) rank, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); timing_wt.cufftmp += CPU_TIME_wt - start; return; Loading Loading @@ -213,7 +209,7 @@ void write_fftw_data(){ double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); #ifdef CUDACC #ifdef CUFFTMP phase_correction(gridss_gpu,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads,rank); #else phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads,rank); Loading gridding_nccl.cu +22 −8 Original line number Diff line number Diff line Loading @@ -95,10 +95,17 @@ void gridding_data(){ cudaSetDevice(local_rank); cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); cudaStreamCreate(&stream_reduce); nnn = cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &grid_gpu ERROR %d !!!\n", nnn);} nnn = cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &gridss_gpu ERROR %d !!!\n", nnn);} nnn = cudaStreamCreate(&stream_reduce); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaStreamCreate &stream_reduce ERROR %d !!!\n", nnn);} cudaStreamCreate(&stream_stacking); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaStreamCreate &stream_stacking ERROR %d !!!\n", nnn);} ncclCommInitRank(&comm, size, id, rank); Loading Loading @@ -263,7 +270,7 @@ void gridding_data(){ // Go to next sector nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);} if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMemset ERROR %d !!!\n", nnn);} } free(memory); Loading @@ -274,11 +281,18 @@ void gridding_data(){ //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); #if !defined(CUFFTMP) cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); cudaStreamSynchronize(stream_reduce); #endif MPI_Barrier(MPI_COMM_WORLD); //cudaStreamSynchronize(stream_reduce); // cudaFree(grid_gpu); //cudaFree(gridss_gpu); #if !defined(CUFFTMP) cudaFree(grid_gpu); cudaFree(gridss_gpu); #endif cudaStreamDestroy(stream_reduce); cudaStreamDestroy(stream_stacking); Loading phase_correction.cu +18 −1 Original line number Diff line number Diff line Loading @@ -129,6 +129,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in double * image_real_g; double * image_imag_g; #if !defined(CUFFTMP) double * gridss_g; mmm = cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double)); mmm = cudaMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice); #endif mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double)); //printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm)); mmm=cudaMalloc(&image_imag_g, xaxis*yaxis*sizeof(double)); Loading @@ -144,7 +151,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in phase_g <<<Nbl,Nth>>> (xaxis, yaxis, num_w_planes, #if defined(CUFFTMP) gridss, #else gridss_g, #endif image_real_g, image_imag_g, wmin, Loading @@ -160,7 +171,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost); //printf("CUDA ERROR 8 %s\n",cudaGetErrorString(mmm)); mmm= cudaFree(gridss); #if !defined(CUFFTMP) cudaFree(gridss_g); #else cudaFree(gridss); #endif #else #ifndef ACCOMP Loading Loading
Makefile +8 −8 Original line number Diff line number Diff line Loading @@ -20,8 +20,8 @@ endif LINKER=$(MPICC) FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.5/x86_rome/include FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.5/x86_rome/lib FFTW_MPI_INC = FFTW_MPI_LIB = CFLAGS += -I./ Loading Loading @@ -63,11 +63,11 @@ OPT += -DPHASE_ON # SELECT THE GRIDDING KERNEL: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL #OPT += -DGAUSS_HI_PRECISION OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS OPT += -DKAISERBESSEL #OPT += -DKAISERBESSEL # ======================================================== Loading @@ -77,16 +77,16 @@ OPT += -DKAISERBESSEL #OPT += -DNVIDIA # use CUDA for GPUs #OPT += -DCUDACC OPT += -DCUDACC # use GPU acceleration via OMP #OPT += -DACCOMP # perform stacking on GPUs #OPT += -DGPU_STACKING OPT += -DGPU_STACKING # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE OPT += -DNCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP Loading @@ -109,7 +109,7 @@ endif #OPT += -DRCCL_REDUCE # FULL AMD GPU SUPPORT - Recommended for full AMD GPU code execution OPT += -DFULL_AMD #OPT += -DFULL_AMD ifeq (FULL_AMD,$(findstring FULL_AMD,$(OPT))) OPT += -DHIPCC -DRCCL_REDUCE -D__HIP_PLATFORM_AMD__ endif Loading
cuda_fft.cu +13 −10 Original line number Diff line number Diff line Loading @@ -30,7 +30,6 @@ __global__ void write_grid( fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } Loading Loading @@ -88,8 +87,9 @@ void cuda_fft( cufftDoubleComplex *fftwgrid; // Alloco fftwgrid su GPU utilizzando cudaMalloc mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*2*yaxis*xaxis); mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*yaxis*xaxis); if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);} int Nth = 32; Loading Loading @@ -146,8 +146,11 @@ void cuda_fft( status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);} cudaStreamSynchronize(stream); //Copy the array to be transformed onto the descriptor structure array cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, 2*xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 1 ERROR %d !!!\n", mmm);} //Perform the FFT status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); Loading @@ -163,7 +166,8 @@ void cuda_fft( if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);} //Copy the result descriptor structure array again onto the original fftwgrid cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], 2*xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 2 ERROR %d !!!\n", mmm);} //Write gridss starting from fftwgrid write_gridss<<<Nbl, Nth>>>(num_w_planes, xaxis, yaxis, fftwgrid, gridss, norm, iw); Loading Loading @@ -191,4 +195,3 @@ void cuda_fft( } #endif
fourier_transform.c +1 −5 Original line number Diff line number Diff line Loading @@ -104,8 +104,6 @@ void fftw_data ( void ) // FFT transform the data using cuFFT if(rank==0)printf("PERFORMING CUDA FFT\n"); MPI_Barrier(MPI_COMM_WORLD); double start = CPU_TIME_wt; Loading @@ -120,8 +118,6 @@ void fftw_data ( void ) rank, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); timing_wt.cufftmp += CPU_TIME_wt - start; return; Loading Loading @@ -213,7 +209,7 @@ void write_fftw_data(){ double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); #ifdef CUDACC #ifdef CUFFTMP phase_correction(gridss_gpu,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads,rank); #else phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads,rank); Loading
gridding_nccl.cu +22 −8 Original line number Diff line number Diff line Loading @@ -95,10 +95,17 @@ void gridding_data(){ cudaSetDevice(local_rank); cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); cudaStreamCreate(&stream_reduce); nnn = cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &grid_gpu ERROR %d !!!\n", nnn);} nnn = cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &gridss_gpu ERROR %d !!!\n", nnn);} nnn = cudaStreamCreate(&stream_reduce); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaStreamCreate &stream_reduce ERROR %d !!!\n", nnn);} cudaStreamCreate(&stream_stacking); if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaStreamCreate &stream_stacking ERROR %d !!!\n", nnn);} ncclCommInitRank(&comm, size, id, rank); Loading Loading @@ -263,7 +270,7 @@ void gridding_data(){ // Go to next sector nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);} if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMemset ERROR %d !!!\n", nnn);} } free(memory); Loading @@ -274,11 +281,18 @@ void gridding_data(){ //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); #if !defined(CUFFTMP) cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); cudaStreamSynchronize(stream_reduce); #endif MPI_Barrier(MPI_COMM_WORLD); //cudaStreamSynchronize(stream_reduce); // cudaFree(grid_gpu); //cudaFree(gridss_gpu); #if !defined(CUFFTMP) cudaFree(grid_gpu); cudaFree(gridss_gpu); #endif cudaStreamDestroy(stream_reduce); cudaStreamDestroy(stream_stacking); Loading
phase_correction.cu +18 −1 Original line number Diff line number Diff line Loading @@ -129,6 +129,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in double * image_real_g; double * image_imag_g; #if !defined(CUFFTMP) double * gridss_g; mmm = cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double)); mmm = cudaMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice); #endif mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double)); //printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm)); mmm=cudaMalloc(&image_imag_g, xaxis*yaxis*sizeof(double)); Loading @@ -144,7 +151,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in phase_g <<<Nbl,Nth>>> (xaxis, yaxis, num_w_planes, #if defined(CUFFTMP) gridss, #else gridss_g, #endif image_real_g, image_imag_g, wmin, Loading @@ -160,7 +171,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost); //printf("CUDA ERROR 8 %s\n",cudaGetErrorString(mmm)); mmm= cudaFree(gridss); #if !defined(CUFFTMP) cudaFree(gridss_g); #else cudaFree(gridss); #endif #else #ifndef ACCOMP Loading