Loading Makefile +25 −5 Original line number Diff line number Diff line Loading @@ -20,8 +20,8 @@ endif LINKER=$(MPICC) FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.3/x86_rome/include FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.3/x86_rome/lib FFTW_MPI_INC = FFTW_MPI_LIB = CFLAGS += -I./ Loading @@ -48,6 +48,24 @@ OPT += -DWRITE_IMAGE # perform w-stacking phase correction OPT += -DPHASE_ON # Support CFITSIO !!! Remember to add the path to the CFITSIO library to LD_LIBRARY_PATH #OPT += -DFITSIO # Perform true parallel images writing #OPT += -DPARALLELIO # Normalize uvw in case it is not done in the binMS OPT += -DNORMALIZE_UVW # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL #OPT += -DGAUSS_HI_PRECISION OPT += -DGAUSS #OPT += -DKAISERBESSEL #OPT += -DVERBOSE # ======================================================== # ACCELERATION # Loading @@ -58,13 +76,13 @@ OPT += -DPHASE_ON #OPT += -DCUDACC # use GPU acceleration via OMP OPT += -DACCOMP #OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE # use AMD GPU to perform the reduce OPT += -DRCCL_REDUCE #OPT += -DRCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP Loading @@ -74,6 +92,9 @@ OPT += -DRCCL_REDUCE # ======================================================== ifeq (FITSIO,$(findstring FITSIO,$(OPT))) LIBS += -L$(FITSIO_LIB) -lcfitsio endif DEPS = w-stacking.h main.c allvars.h Loading @@ -89,7 +110,6 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o Loading allvars.h +8 −0 Original line number Diff line number Diff line Loading @@ -14,6 +14,10 @@ #include <stdatomic.h> #include <mpi.h> #ifdef FITSIO #include "fitsio.h" #endif #if defined (_OPENMP) #include <omp.h> #endif Loading @@ -30,6 +34,10 @@ #include "w-stacking.h" #endif #if defined(CUDACC) #include <cuda.h> #endif #if defined(NVIDIA) 
#include <cuda_runtime.h> #endif Loading cuda_fft.cu 0 → 100644 +148 −0 Original line number Diff line number Diff line #include <stdlib.h> #include <stdio.h> #include <cufftMp.h> #include <mpi.h> #include <cuda_runtime.h> #include <complex.h> #include "cuComplex.h" #include "w-stacking.h" #include <time.h> #if defined(CUFFTMP) && !defined(USE_FFTW) void cuda_fft( int num_w_planes, int grid_size_x, int grid_size_y, int xaxis, int yaxis, double * grid, double * gridss, MPI_Comm comm) { #ifdef __CUDACC__ cudaError_t mmm; cufftResult_t status; cufftDoubleComplex *fftwgrid; fftwgrid = (cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*yaxis*grid_size_x); // Plan creation cufftHandle plan; status = cufftCreate(&plan); if (status != CUFFT_SUCCESS) {printf("!!! cufftCreate ERROR %d !!!\n", status);} cudaStream_t stream{}; cudaStreamCreate(&stream); status = cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm); if (status != CUFFT_SUCCESS) {printf("!!! cufftMpAttachComm ERROR %d !!!\n", status);} status = cufftSetStream(plan, stream); if (status != CUFFT_SUCCESS) {printf("!!! cufftSetStream ERROR %d !!!\n", status);} size_t workspace; status = cufftMakePlan2d(plan, grid_size_x, grid_size_y, CUFFT_Z2Z, &workspace); if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePlan2d ERROR %d !!!\n", status);} cudaDeviceSynchronize(); long fftwindex = 0; long fftwindex2D = 0; double norm = 1.0/(double)(grid_size_x*grid_size_y); // Grid composition for (int iw=0; iw<num_w_planes; iw++) { printf("select the %d w-plane to transform\n", iw); for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } cudaLibXtDesc *fftwgrid_g; cudaLibXtDesc *fftwgrid_g2; status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! 
cufftXtMalloc ERROR %d !!!\n", status);} status = cufftXtMalloc(plan, &fftwgrid_g2, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc 2 ERROR %d !!!\n", status);} cudaDeviceSynchronize(); mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize ERROR %d !!!\n", mmm);} status = cufftXtMemcpy(plan, fftwgrid_g, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy htd fftwgrid_g ERROR %d !!!\n", status);} cudaDeviceSynchronize(); status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExecDescriptor ERROR %d !!!\n", status);} mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); status = cufftXtMemcpy(plan, fftwgrid_g2, fftwgrid_g, CUFFT_COPY_DEVICE_TO_DEVICE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);} mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_g2, CUFFT_COPY_DEVICE_TO_HOST); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dth fftwgrid ERROR %d !!!\n", status);} cufftXtFree(fftwgrid_g); cufftXtFree(fftwgrid_g2); for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } } } status = cufftDestroy(plan); if (status != CUFFT_SUCCESS) {printf("!!! 
cufftDestroy fftwgrid ERROR %d !!!\n", status);} cudaStreamDestroy(stream); cudaDeviceSynchronize(); #endif // __CUDACC__ } #endif fourier_transform.c +26 −1 Original line number Diff line number Diff line Loading @@ -4,7 +4,7 @@ void fftw_data(){ #ifdef USE_FFTW #if defined(USE_FFTW) && !defined(CUFFTMP) // FFT transform the data (using distributed FFTW) if(rank == 0)printf("PERFORMING FFT\n"); Loading Loading @@ -83,6 +83,31 @@ void fftw_data(){ #endif #if defined(USE_FFTW) && defined(CUFFTMP) // FFT transform the data using cuFFT if(rank==0)printf("PERFORMING CUDA FFT\n"); MPI_Barrier(MPI_COMM_WORLD); double start = CPU_TIME_wt; cuda_fft( num_w_planes, grid_size_x, grid_size_y, xaxis, yaxis, grid, gridss, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); timing_wt.cufftmp += CPU_TIME_wt - start; #endif } void write_fftw_data(){ Loading gridding.c +54 −2 Original line number Diff line number Diff line Loading @@ -16,10 +16,62 @@ void gridding() if(rank == 0) printf("GRIDDING DATA\n"); // Create histograms and linked lists double start = CPU_TIME_wt; #ifdef NORMALIZE_UVW if (rank==0) printf("NORMALIZING DATA\n"); double minu = 1e20; double minv = 1e20; double minw = 1e20; double maxu = -1e20; double maxv = -1e20; double maxw = -1e20; for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) { minu = MIN(minu,data.uu[inorm]); minv = MIN(minv,data.vv[inorm]); minw = MIN(minw,data.ww[inorm]); maxu = MAX(maxu,data.uu[inorm]); maxv = MAX(maxv,data.vv[inorm]); maxw = MAX(maxw,data.ww[inorm]); } double minu_all; double minv_all; double minw_all; double maxu_all; double maxv_all; double maxw_all; MPI_Allreduce(&minu,&minu_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minv,&minv_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minw,&minw_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&maxu,&maxu_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxv,&maxv_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxw,&maxw_all,1, 
MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); double offset = 0.001; double ming = MAX(abs(minu_all),abs(minv_all)); double maxg = MAX(abs(maxu_all),abs(maxv_all)); maxg = MAX(maxg,ming); minw = minw_all; maxw = maxw_all; maxg = maxg+offset*maxg; for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) { data.uu[inorm] = (data.uu[inorm]+maxg)/(2.0*maxg); data.vv[inorm] = (data.vv[inorm]+maxg)/(2.0*maxg); data.ww[inorm] = (data.ww[inorm]-minw)/(maxw-minw); } #endif // Create histograms and linked lists // Initialize linked list initialize_array(); Loading Loading
Makefile +25 −5 Original line number Diff line number Diff line Loading @@ -20,8 +20,8 @@ endif LINKER=$(MPICC) FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.3/x86_rome/include FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.3/x86_rome/lib FFTW_MPI_INC = FFTW_MPI_LIB = CFLAGS += -I./ Loading @@ -48,6 +48,24 @@ OPT += -DWRITE_IMAGE # perform w-stacking phase correction OPT += -DPHASE_ON # Support CFITSIO !!! Remember to add the path to the CFITSIO library to LD_LIBRARY_PATH #OPT += -DFITSIO # Perform true parallel images writing #OPT += -DPARALLELIO # Normalize uvw in case it is not done in the binMS OPT += -DNORMALIZE_UVW # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL #OPT += -DGAUSS_HI_PRECISION OPT += -DGAUSS #OPT += -DKAISERBESSEL #OPT += -DVERBOSE # ======================================================== # ACCELERATION # Loading @@ -58,13 +76,13 @@ OPT += -DPHASE_ON #OPT += -DCUDACC # use GPU acceleration via OMP OPT += -DACCOMP #OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE # use AMD GPU to perform the reduce OPT += -DRCCL_REDUCE #OPT += -DRCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP Loading @@ -74,6 +92,9 @@ OPT += -DRCCL_REDUCE # ======================================================== ifeq (FITSIO,$(findstring FITSIO,$(OPT))) LIBS += -L$(FITSIO_LIB) -lcfitsio endif DEPS = w-stacking.h main.c allvars.h Loading @@ -89,7 +110,6 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o Loading
allvars.h +8 −0 Original line number Diff line number Diff line Loading @@ -14,6 +14,10 @@ #include <stdatomic.h> #include <mpi.h> #ifdef FITSIO #include "fitsio.h" #endif #if defined (_OPENMP) #include <omp.h> #endif Loading @@ -30,6 +34,10 @@ #include "w-stacking.h" #endif #if defined(CUDACC) #include <cuda.h> #endif #if defined(NVIDIA) #include <cuda_runtime.h> #endif Loading
// ===== cuda_fft.cu (new file) =====
#include <stdlib.h>
#include <stdio.h>
#include <cufftMp.h>
#include <mpi.h>
#include <cuda_runtime.h>
#include <complex.h>
#include "cuComplex.h"
#include "w-stacking.h"
#include <time.h>

/* Guard fix: the caller in fourier_transform.c invokes cuda_fft() under
 * (USE_FFTW && CUFFTMP); the original guard (CUFFTMP && !USE_FFTW) left the
 * symbol undefined in exactly that configuration, producing a link error.
 * Compile whenever CUFFTMP is requested. */
#if defined(CUFFTMP)

/* Distributed inverse 2D FFT of each w-plane of the gridded visibilities,
 * performed with cuFFTMp across the MPI communicator `comm`.
 *
 *   num_w_planes   number of w-stacking planes to transform
 *   grid_size_x/y  global grid dimensions (used for the cuFFT plan and
 *                  the 1/(Nx*Ny) normalization)
 *   xaxis/yaxis    per-rank slab dimensions — TODO confirm slab layout
 *                  matches cuFFTMp's expected INPLACE data decomposition
 *   grid           input:  interleaved re/im doubles, plane-major
 *   gridss         output: normalized inverse FFT, same layout as grid
 *   comm           MPI communicator shared by all participating ranks
 *
 * Errors from cuFFT/CUDA are reported on stdout; the routine does not
 * abort (matches the original best-effort error style). */
void cuda_fft(
        int num_w_planes,
        int grid_size_x,
        int grid_size_y,
        int xaxis,
        int yaxis,
        double *grid,
        double *gridss,
        MPI_Comm comm)
{
#ifdef __CUDACC__
    cudaError_t mmm;
    cufftResult_t status;

    /* Host staging buffer for ONE w-plane slab: xaxis*yaxis complex values.
     * (The original allocated 2*num_w_planes*yaxis*grid_size_x elements,
     * far more than is ever indexed: fftwindex2D stays in [0, xaxis*yaxis)
     * and the buffer is reused plane by plane.) */
    cufftDoubleComplex *fftwgrid =
        (cufftDoubleComplex *)malloc(sizeof(cufftDoubleComplex) *
                                     (size_t)xaxis * (size_t)yaxis);
    if (fftwgrid == NULL)
    {
        printf("!!! cuda_fft host allocation ERROR !!!\n");
        return;
    }

    // Plan creation: one distributed Z2Z plan, reused for every w-plane.
    cufftHandle plan;
    status = cufftCreate(&plan);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftCreate ERROR %d !!!\n", status);}

    cudaStream_t stream{};
    cudaStreamCreate(&stream);

    status = cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftMpAttachComm ERROR %d !!!\n", status);}

    status = cufftSetStream(plan, stream);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftSetStream ERROR %d !!!\n", status);}

    size_t workspace;
    status = cufftMakePlan2d(plan, grid_size_x, grid_size_y, CUFFT_Z2Z, &workspace);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePlan2d ERROR %d !!!\n", status);}
    cudaDeviceSynchronize();

    /* Device descriptors are plan-shaped and identical for every plane, so
     * allocate them ONCE here (the original re-allocated and freed them on
     * every loop iteration). */
    cudaLibXtDesc *fftwgrid_g;
    cudaLibXtDesc *fftwgrid_g2;
    status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);}
    status = cufftXtMalloc(plan, &fftwgrid_g2, CUFFT_XT_FORMAT_INPLACE);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc 2 ERROR %d !!!\n", status);}

    long fftwindex = 0;
    long fftwindex2D = 0;
    /* Convert to double BEFORE multiplying: grid_size_x*grid_size_y as an
     * int product can overflow for large grids. */
    double norm = 1.0 / ((double)grid_size_x * (double)grid_size_y);

    // Transform one w-plane at a time, reusing plan, stream and descriptors.
    for (int iw = 0; iw < num_w_planes; iw++)
    {
        printf("select the %d w-plane to transform\n", iw);

        // Pack the interleaved re/im slab of plane iw into complex layout.
        for (int iv = 0; iv < yaxis; iv++)
            for (int iu = 0; iu < xaxis; iu++)
            {
                fftwindex2D = iu + iv * xaxis;
                fftwindex = 2 * (fftwindex2D + iw * xaxis * yaxis);
                fftwgrid[fftwindex2D].x = grid[fftwindex];
                fftwgrid[fftwindex2D].y = grid[fftwindex + 1];
            }

        cudaDeviceSynchronize();
        mmm = cudaStreamSynchronize(stream);
        if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize ERROR %d !!!\n", mmm);}

        status = cufftXtMemcpy(plan, fftwgrid_g, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE);
        if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy htd fftwgrid_g ERROR %d !!!\n", status);}
        cudaDeviceSynchronize();

        // In-place distributed inverse transform of this plane.
        status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);
        if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExecDescriptor ERROR %d !!!\n", status);}
        mmm = cudaStreamSynchronize(stream);
        if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);}
        cudaDeviceSynchronize();

        /* NOTE(review): this device-to-device hop through fftwgrid_g2 was in
         * the original; it does not obviously change data format — confirm
         * whether copying fftwgrid_g straight back to host is sufficient. */
        status = cufftXtMemcpy(plan, fftwgrid_g2, fftwgrid_g, CUFFT_COPY_DEVICE_TO_DEVICE);
        if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);}
        mmm = cudaStreamSynchronize(stream);
        if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);}
        cudaDeviceSynchronize();

        status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_g2, CUFFT_COPY_DEVICE_TO_HOST);
        if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dth fftwgrid ERROR %d !!!\n", status);}

        // Unpack the normalized result back into the interleaved layout.
        for (int iv = 0; iv < yaxis; iv++)
            for (int iu = 0; iu < xaxis; iu++)
            {
                fftwindex2D = iu + iv * xaxis;
                fftwindex = 2 * (fftwindex2D + iw * xaxis * yaxis);
                gridss[fftwindex] = norm * fftwgrid[fftwindex2D].x;
                gridss[fftwindex + 1] = norm * fftwgrid[fftwindex2D].y;
            }
    }

    // Release device descriptors (once, after all planes are done).
    cufftXtFree(fftwgrid_g);
    cufftXtFree(fftwgrid_g2);

    status = cufftDestroy(plan);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftDestroy fftwgrid ERROR %d !!!\n", status);}
    cudaStreamDestroy(stream);
    cudaDeviceSynchronize();

    free(fftwgrid);  /* fix: the original leaked this host staging buffer */
#endif // __CUDACC__
}

#endif
fourier_transform.c +26 −1 Original line number Diff line number Diff line Loading @@ -4,7 +4,7 @@ void fftw_data(){ #ifdef USE_FFTW #if defined(USE_FFTW) && !defined(CUFFTMP) // FFT transform the data (using distributed FFTW) if(rank == 0)printf("PERFORMING FFT\n"); Loading Loading @@ -83,6 +83,31 @@ void fftw_data(){ #endif #if defined(USE_FFTW) && defined(CUFFTMP) // FFT transform the data using cuFFT if(rank==0)printf("PERFORMING CUDA FFT\n"); MPI_Barrier(MPI_COMM_WORLD); double start = CPU_TIME_wt; cuda_fft( num_w_planes, grid_size_x, grid_size_y, xaxis, yaxis, grid, gridss, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); timing_wt.cufftmp += CPU_TIME_wt - start; #endif } void write_fftw_data(){ Loading
gridding.c +54 −2 Original line number Diff line number Diff line Loading @@ -16,10 +16,62 @@ void gridding() if(rank == 0) printf("GRIDDING DATA\n"); // Create histograms and linked lists double start = CPU_TIME_wt; #ifdef NORMALIZE_UVW if (rank==0) printf("NORMALIZING DATA\n"); double minu = 1e20; double minv = 1e20; double minw = 1e20; double maxu = -1e20; double maxv = -1e20; double maxw = -1e20; for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) { minu = MIN(minu,data.uu[inorm]); minv = MIN(minv,data.vv[inorm]); minw = MIN(minw,data.ww[inorm]); maxu = MAX(maxu,data.uu[inorm]); maxv = MAX(maxv,data.vv[inorm]); maxw = MAX(maxw,data.ww[inorm]); } double minu_all; double minv_all; double minw_all; double maxu_all; double maxv_all; double maxw_all; MPI_Allreduce(&minu,&minu_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minv,&minv_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minw,&minw_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&maxu,&maxu_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxv,&maxv_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxw,&maxw_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); double offset = 0.001; double ming = MAX(abs(minu_all),abs(minv_all)); double maxg = MAX(abs(maxu_all),abs(maxv_all)); maxg = MAX(maxg,ming); minw = minw_all; maxw = maxw_all; maxg = maxg+offset*maxg; for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) { data.uu[inorm] = (data.uu[inorm]+maxg)/(2.0*maxg); data.vv[inorm] = (data.vv[inorm]+maxg)/(2.0*maxg); data.ww[inorm] = (data.ww[inorm]-minw)/(maxw-minw); } #endif // Create histograms and linked lists // Initialize linked list initialize_array(); Loading