Loading Build/Makefile.local +24 −0 Original line number Diff line number Diff line Loading @@ -13,10 +13,32 @@ CUDA_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/lib64 FFTW_INCL= FFTW_LIB= ########################################################## #NVIDIA CUFFTMP CUFFTMP_LIB = -L/.../lib64 CUFFTMP_INCL = -I/.../include/cufftmp ########################################################## ########################################################## #NVIDIA NCCL REDUCE NCCL_INC = -I/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/comm_libs/nccl/include NCCL_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/comm_libs/nccl/lib ########################################################## NVC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/compilers/bin/nvc NVFLAGS = -O4 -fast -march=native $(OMP_GPU) -mavx -mavx2 NVLIB = $(CUDA_INC) $(CUDA_LIB) -lcuda -lcudart NVLIB_2 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(CUFFT_INC) $(CUFFT_LIB) $(NVSHMEM_INC) $(NVSHMEM_LIB) -lnvshmem_host -lnvshmem_device -lcuda -lcudart -lcufftMp NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(NCCL_INC) $(NCCL_LIB) -lcuda -lcudart -lnccl NVCC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/bin/nvcc OPT_NVCC = -std=c++17 --generate-code arch=compute_86,code=sm_86 Loading @@ -25,6 +47,7 @@ CFLAGS += MPICHLIB = ########################################################## #AMD GPUs (DEFAULT = LUMI) Loading @@ -41,3 +64,4 @@ HIP_LIB= -L/opt/rocm-5.2.3/hip/lib AMDLIB = $(HIP_INCL) $(HIP_LIB) $(RCCL_INCL) $(RCCL_LIB) -D__HIP_PLATFORM_AMD__ -lamdhip64 -lrccl ########################################################### Makefile +20 −4 Original line number Diff line number Diff line Loading @@ -110,7 +110,7 @@ DEPS = w-stacking.h main.c allvars.h # ----- define which files will be compiled by MPICC # # these are the OBJS that will be compiled by C compiler if no acceleration (neither with OpenACC nor with OpenMP) is provided # these are the OBJS that will be compiled by C compiler if no acceleration (neither 
with CUDA nor with OpenMP) is provided CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o # these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on Loading @@ -136,6 +136,12 @@ OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cpp OBJ_RCCL_REDUCE = gridding_rccl.o # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT # DEPS_ACC_CUFFTMP = w-stacking.h cuda_fft.cu OBJ_ACC_CUFFTMP = cuda_fft.o # ----------------------------------------------------- # # end of OBJ definition Loading Loading @@ -240,6 +246,16 @@ $(OBJ_RCCL_REDUCE): $(DEPS_RCCL_REDUCE) OBJ += $(OBJ_RCCL_REDUCE) endif ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-fft LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LIBS=$(NVLIB) $(NVLIB_2) $(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c $^ $(LIBS) OBJ += $(OBJ_ACC_CUFFTMP) endif ################################################################################### Loading @@ -258,7 +274,7 @@ clean: rm -f phase_correction.c cleanall: rm -f $(EXEC)* rm -f $(EXEC)$(EXT) rm -f *.o rm -f w-stacking.c rm -f phase_correction.c README.md +38 −11 Original line number Diff line number Diff line # HPC_Imaging Development of a code for imaging enabled to exploit heterogeneous HPC resource To compile the code, feel free to activate and deactivate options in the Makefile. You will find the code options before and then the acceleration options. #Glacopo Free to activate/deactivate in the Makefile the flags with what you want to test You can simply run the code with the command: If you use GPUs for OpenMP and NCCL Reduce compile the CPU part with gcc, i.e. 
############################################ make w-stacking ############################################ It will redirect you to the file Build/Makefile.local, which is complete enough apart from different library paths; feel free to use it or to change SYSTYPE. My aim was to make compilation as simple as possible. When you use GPU offloading with OpenMP, please do not compile the CPU part with NVC. This can be easily fixed by setting the environment variables: ############################################ export OMPI_CC=gcc export OMPI_CXX=nvc++ Please avoid compiling the CPU part with nvc, especially if the Luca NUMA machinery is active. Anyway, this is just for stacking and phase correction offloading; currently Ring Reduce and NCCL Reduce are mutually exclusive. ########################################### In the case in which the default compiler is NVC. The Makefile is suited to understand which are the parts to be compiled with NVC for the OpenMP offloading. The final linker in this case will, however, be NVC/NVC++. The problem does not arise on AMD platforms, because you use clang/clang++ for both CPUs and GPUs. The extensions of the executable will be changed depending on the different acceleration options. To run the code, the data/paramfile.txt is available. Feel free to change the parameters, i.e. the path of visibilities, which reduce implementation to use, the number of pixels, the number of OpenMP threads and so on. 
Once you have compiled the code, run it simply with the command: If you want you can compile the cufftMp but it's not very efficient right now, so use the standard FFTW-MPI or FFTW-MPI/OpenMP FFT tagged simply as: ########################################### OPT += -DHYBRID_FFTW mpirun -np <n> <executable> data/paramfile.txt ########################################### Loading fourier_transform.c +71 −1 Original line number Diff line number Diff line Loading @@ -220,8 +220,31 @@ void write_fftw_data(){ #ifdef WRITE_IMAGE double start_image = CPU_TIME_wt; if(rank == 0) { #ifdef FITSIO printf("REMOVING RESIDUAL FITS FILE\n"); remove(testfitsreal); remove(testfitsimag); printf("FITS CREATION\n"); status = 0; fits_create_file(&fptrimg, testfitsimag, &status); fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status); fits_close_file(fptrimg, &status); status = 0; fits_create_file(&fptreal, testfitsreal, &status); fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status); fits_close_file(fptreal, &status); #endif file.pFilereal = fopen (out.fftfile2,"wb"); file.pFileimg = fopen (out.fftfile3,"wb"); fclose(file.pFilereal); Loading @@ -231,6 +254,31 @@ void write_fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); if(rank == 0)printf("WRITING IMAGE\n"); #ifdef FITSIO uint * fpixel = (uint *) malloc(sizeof(uint)*naxis); uint * lpixel = (uint *) malloc(sizeof(uint)*naxis); #endif #ifdef FITSIO fpixel[0] = 1; fpixel[1] = rank*yaxis+1; lpixel[0] = xaxis; lpixel[1] = (rank+1)*yaxis; status = 0; fits_open_image(&fptreal, testfitsreal, READWRITE, &status); fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status); fits_close_file(fptreal, &status); status = 0; fits_open_image(&fptrimg, testfitsimag, READWRITE, &status); fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status); fits_close_file(fptrimg, &status); #endif //FITSIO for (int isector=0; isector<size; isector++) { Loading @@ -238,7 +286,28 @@ void write_fftw_data(){ if(isector == rank) { 
printf("%d writing\n",isector); #ifdef FITSIO fpixel[0] = 1; fpixel[1] = isector*yaxis+1; lpixel[0] = xaxis; lpixel[1] = (isector+1)*yaxis; status = 0; fits_open_image(&fptreal, testfitsreal, READWRITE, &status); fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status); fits_close_file(fptreal, &status); status = 0; fits_open_image(&fptrimg, testfitsimag, READWRITE, &status); fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status); fits_close_file(fptrimg, &status); #endif //FITSIO file.pFilereal = fopen (out.fftfile2,"ab"); file.pFileimg = fopen (out.fftfile3,"ab"); Loading @@ -256,6 +325,7 @@ void write_fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); timing_wt.write += CPU_TIME_wt - start_image; #endif //WRITE_IMAGE Loading gridding_cpu.c +6 −5 Original line number Diff line number Diff line Loading @@ -145,7 +145,6 @@ void gridding_data() icount++; } double uumin = 1e20; double vvmin = 1e20; double uumax = -1e20; Loading Loading @@ -173,10 +172,11 @@ void gridding_data() vvmax = MAX( vvmax, my_vvmax ); } timing_wt.compose += CPU_TIME_wt - start; //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); timing_wt.compose += CPU_TIME_wt - start; // Make convolution on the grid #ifdef VERBOSE Loading Loading @@ -218,14 +218,14 @@ void gridding_data() printf("Processed sector %ld\n",isector); #endif start = CPU_TIME_wt; if( size > 1 ) { // Write grid in the corresponding remote slab int target_rank = (int)(isector % size); start = CPU_TIME_wt; if( param.reduce_method == REDUCE_MPI ) MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE, MPI_SUM, target_rank, MYMPI_COMM_WORLD); Loading @@ -248,6 +248,7 @@ void gridding_data() } timing_wt.reduce += CPU_TIME_wt - start; // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } Loading Loading
Build/Makefile.local +24 −0 Original line number Diff line number Diff line Loading @@ -13,10 +13,32 @@ CUDA_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/lib64 FFTW_INCL= FFTW_LIB= ########################################################## #NVIDIA CUFFTMP CUFFTMP_LIB = -L/.../lib64 CUFFTMP_INCL = -I/.../include/cufftmp ########################################################## ########################################################## #NVIDIA NCCL REDUCE NCCL_INC = -I/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/comm_libs/nccl/include NCCL_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/comm_libs/nccl/lib ########################################################## NVC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/compilers/bin/nvc NVFLAGS = -O4 -fast -march=native $(OMP_GPU) -mavx -mavx2 NVLIB = $(CUDA_INC) $(CUDA_LIB) -lcuda -lcudart NVLIB_2 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(CUFFT_INC) $(CUFFT_LIB) $(NVSHMEM_INC) $(NVSHMEM_LIB) -lnvshmem_host -lnvshmem_device -lcuda -lcudart -lcufftMp NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(NCCL_INC) $(NCCL_LIB) -lcuda -lcudart -lnccl NVCC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/bin/nvcc OPT_NVCC = -std=c++17 --generate-code arch=compute_86,code=sm_86 Loading @@ -25,6 +47,7 @@ CFLAGS += MPICHLIB = ########################################################## #AMD GPUs (DEFAULT = LUMI) Loading @@ -41,3 +64,4 @@ HIP_LIB= -L/opt/rocm-5.2.3/hip/lib AMDLIB = $(HIP_INCL) $(HIP_LIB) $(RCCL_INCL) $(RCCL_LIB) -D__HIP_PLATFORM_AMD__ -lamdhip64 -lrccl ###########################################################
Makefile +20 −4 Original line number Diff line number Diff line Loading @@ -110,7 +110,7 @@ DEPS = w-stacking.h main.c allvars.h # ----- define which files will be compiled by MPICC # # these are the OBJS that will be compiled by C compiler if no acceleration (neither with OpenACC nor with OpenMP) is provided # these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o # these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on Loading @@ -136,6 +136,12 @@ OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cpp OBJ_RCCL_REDUCE = gridding_rccl.o # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT # DEPS_ACC_CUFFTMP = w-stacking.h cuda_fft.cu OBJ_ACC_CUFFTMP = cuda_fft.o # ----------------------------------------------------- # # end of OBJ definition Loading Loading @@ -240,6 +246,16 @@ $(OBJ_RCCL_REDUCE): $(DEPS_RCCL_REDUCE) OBJ += $(OBJ_RCCL_REDUCE) endif ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-fft LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LIBS=$(NVLIB) $(NVLIB_2) $(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c $^ $(LIBS) OBJ += $(OBJ_ACC_CUFFTMP) endif ################################################################################### Loading @@ -258,7 +274,7 @@ clean: rm -f phase_correction.c cleanall: rm -f $(EXEC)* rm -f $(EXEC)$(EXT) rm -f *.o rm -f w-stacking.c rm -f phase_correction.c
README.md +38 −11 Original line number Diff line number Diff line # HPC_Imaging Development of a code for imaging enabled to exploit heterogeneous HPC resources To compile the code, feel free to activate and deactivate options in the Makefile. You will find the code options first, followed by the acceleration options. #Glacopo Free to activate/deactivate in the Makefile the flags with what you want to test You can simply run the code with the command: If you use GPUs for OpenMP and NCCL Reduce, compile the CPU part with gcc, i.e. ############################################ make w-stacking ############################################ It will redirect you to the file Build/Makefile.local, which is complete enough apart from different library paths; feel free to use it or to change SYSTYPE. My aim was to make compilation as simple as possible. When you use GPU offloading with OpenMP, please do not compile the CPU part with NVC. This can be easily fixed by setting the environment variables: ############################################ export OMPI_CC=gcc export OMPI_CXX=nvc++ Please avoid compiling the CPU part with nvc, especially if the Luca NUMA machinery is active. Anyway, this is just for stacking and phase correction offloading; currently Ring Reduce and NCCL Reduce are mutually exclusive. ########################################### In the case in which the default compiler is NVC. The Makefile is suited to understand which are the parts to be compiled with NVC for the OpenMP offloading. The final linker in this case will, however, be NVC/NVC++. The problem does not arise on AMD platforms, because you use clang/clang++ for both CPUs and GPUs. The extensions of the executable will be changed depending on the different acceleration options. To run the code, the data/paramfile.txt is available. Feel free to change the parameters, i.e. the path of visibilities, which reduce implementation to use, the number of pixels, the number of OpenMP threads and so on. 
Once you have compiled the code, run it simply with the command: If you want you can compile the cufftMp but it's not very efficient right now, so use the standard FFTW-MPI or FFTW-MPI/OpenMP FFT tagged simply as: ########################################### OPT += -DHYBRID_FFTW mpirun -np <n> <executable> data/paramfile.txt ########################################### Loading
fourier_transform.c +71 −1 Original line number Diff line number Diff line Loading @@ -220,8 +220,31 @@ void write_fftw_data(){ #ifdef WRITE_IMAGE double start_image = CPU_TIME_wt; if(rank == 0) { #ifdef FITSIO printf("REMOVING RESIDUAL FITS FILE\n"); remove(testfitsreal); remove(testfitsimag); printf("FITS CREATION\n"); status = 0; fits_create_file(&fptrimg, testfitsimag, &status); fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status); fits_close_file(fptrimg, &status); status = 0; fits_create_file(&fptreal, testfitsreal, &status); fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status); fits_close_file(fptreal, &status); #endif file.pFilereal = fopen (out.fftfile2,"wb"); file.pFileimg = fopen (out.fftfile3,"wb"); fclose(file.pFilereal); Loading @@ -231,6 +254,31 @@ void write_fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); if(rank == 0)printf("WRITING IMAGE\n"); #ifdef FITSIO uint * fpixel = (uint *) malloc(sizeof(uint)*naxis); uint * lpixel = (uint *) malloc(sizeof(uint)*naxis); #endif #ifdef FITSIO fpixel[0] = 1; fpixel[1] = rank*yaxis+1; lpixel[0] = xaxis; lpixel[1] = (rank+1)*yaxis; status = 0; fits_open_image(&fptreal, testfitsreal, READWRITE, &status); fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status); fits_close_file(fptreal, &status); status = 0; fits_open_image(&fptrimg, testfitsimag, READWRITE, &status); fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status); fits_close_file(fptrimg, &status); #endif //FITSIO for (int isector=0; isector<size; isector++) { Loading @@ -238,7 +286,28 @@ void write_fftw_data(){ if(isector == rank) { printf("%d writing\n",isector); #ifdef FITSIO fpixel[0] = 1; fpixel[1] = isector*yaxis+1; lpixel[0] = xaxis; lpixel[1] = (isector+1)*yaxis; status = 0; fits_open_image(&fptreal, testfitsreal, READWRITE, &status); fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status); fits_close_file(fptreal, &status); status = 0; fits_open_image(&fptrimg, testfitsimag, READWRITE, 
&status); fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status); fits_close_file(fptrimg, &status); #endif //FITSIO file.pFilereal = fopen (out.fftfile2,"ab"); file.pFileimg = fopen (out.fftfile3,"ab"); Loading @@ -256,6 +325,7 @@ void write_fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); timing_wt.write += CPU_TIME_wt - start_image; #endif //WRITE_IMAGE Loading
gridding_cpu.c +6 −5 Original line number Diff line number Diff line Loading @@ -145,7 +145,6 @@ void gridding_data() icount++; } double uumin = 1e20; double vvmin = 1e20; double uumax = -1e20; Loading Loading @@ -173,10 +172,11 @@ void gridding_data() vvmax = MAX( vvmax, my_vvmax ); } timing_wt.compose += CPU_TIME_wt - start; //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); timing_wt.compose += CPU_TIME_wt - start; // Make convolution on the grid #ifdef VERBOSE Loading Loading @@ -218,14 +218,14 @@ void gridding_data() printf("Processed sector %ld\n",isector); #endif start = CPU_TIME_wt; if( size > 1 ) { // Write grid in the corresponding remote slab int target_rank = (int)(isector % size); start = CPU_TIME_wt; if( param.reduce_method == REDUCE_MPI ) MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE, MPI_SUM, target_rank, MYMPI_COMM_WORLD); Loading @@ -248,6 +248,7 @@ void gridding_data() } timing_wt.reduce += CPU_TIME_wt - start; // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } Loading