Loading Makefile +60 −16 Original line number Diff line number Diff line Loading @@ -98,29 +98,66 @@ endif DEPS = w-stacking.h main.c allvars.h ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) && ifneq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif # ------------------------------------------------------- # # here we define which OBJ files have to be compiled by who; # in fact, depending on the GPU-acceleration being on or off, # and on having AMD/NVidia GPUs, things may be different # # ------------------------------------------------------ # ----- define which files will be compiled by MPICC # # these are the OBJS that will be compiled by C compiler if no acceleration (neither with OpenACC nor with OpenMP) is provided CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o # these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o # ----- define which files will be compiled by NVCC for Nvidia # DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu OBJ_ACC_CUDA = phase_correction.o w-stacking.o # ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD # DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o # ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is # offloaded on GPU DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cpp OBJ_RCCL_REDUCE = gridding_rccl.o # ----------------------------------------------------- # # end of OBJ definition # ---------------------------------------------------- ifeq (ACCOMP,$(findstring ACCOMP, $(OPT))) OBJ = $(CC_OBJ_ACC) else ifeq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = $(CC_OBJ_ACC) else OBJ = $(CC_OBJ_NOACC) endif ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT))) CFLAGS += $(FFTW_MPI_INC) FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm endif # define rules for sources that contains GPU code # ifneq (CUDACC,$(findstring CUDACC,$(OPT))) w-stacking.c: w-stacking.cu cp w-stacking.cu w-stacking.c Loading Loading @@ -153,8 +190,23 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) OBJ += $(OBJ_ACC_CUDA) endif #NVIDIA GPUs ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifneq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT))) ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) # >>>>> AMD GPUs ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(MPICC) FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) OBJ += $(OBJ_ACC_OMP) # >>>> NVIDIA GPUs else EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(NVC) FLAGS=$(NVFLAGS) $(CFLAGS) Loading @@ -162,17 +214,9 @@ LIBS=$(NVLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(NVC) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_ACC_OMP) endif #AMD GPUs ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(MPICC) FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) OBJ += $(OBJ_ACC_OMP) endif Loading gridding.c +50 −38 Original line number Diff line number Diff line Loading @@ -23,51 +23,63 @@ void gridding() if (rank==0) printf("NORMALIZING DATA\n"); double minu = 1e20; double minv = 1e20; double minw = 1e20; double maxu = -1e20; double maxv = -1e20; double maxw = -1e20; typedef struct { double u; double v; double w; } cmp_t; cmp_t getmin = { 1e20, 1e20, 1e20 }; cmp_t getmax = { 0 }; #pragma omp parallel num_threads(param.num_threads) { cmp_t mygetmin = { 1e20, 1e20, 1e20 }; cmp_t mygetmax = { 0 }; #pragma omp for for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) { minu = MIN(minu,data.uu[inorm]); minv = MIN(minv,data.vv[inorm]); minw = MIN(minw,data.ww[inorm]); maxu = MAX(maxu,data.uu[inorm]); maxv = MAX(maxv,data.vv[inorm]); maxw = MAX(maxw,data.ww[inorm]); mygetmin.u = MIN(mygetmin.u, data.uu[inorm]); mygetmin.v = MIN(mygetmin.v, data.vv[inorm]); mygetmin.w = MIN(mygetmin.w, data.ww[inorm]); mygetmax.u = MAX(mygetmax.u, data.uu[inorm]); mygetmax.v = MAX(mygetmax.v, data.vv[inorm]); mygetmax.w = MAX(mygetmax.w, data.ww[inorm]); } double minu_all; double minv_all; double minw_all; double maxu_all; double maxv_all; double maxw_all; #pragma omp critical (getmin_u) getmin.u = MIN( mygetmin.u, getmin.u ); #pragma omp critical (getmin_v) getmin.v = MIN( mygetmin.v, getmin.v ); #pragma omp critical (getmin_w) getmin.w = MIN( mygetmin.w, getmin.w ); #pragma omp critical (getmax_u) getmax.u = MAX( mygetmax.u, getmax.u ); #pragma omp critical (getmax_v) getmax.v = MAX( mygetmax.v, getmax.v ); #pragma omp critical (getmax_w) getmax.w = MAX( mygetmax.w, getmax.w ); } MPI_Allreduce(&minu,&minu_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minv,&minv_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minw,&minw_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&maxu,&maxu_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxv,&maxv_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxw,&maxw_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &getmin, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &getmax, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); double offset = 0.001; double ming = MAX(abs(minu_all),abs(minv_all)); double maxg = MAX(abs(maxu_all),abs(maxv_all)); double ming = MAX(fabs(getmin.u), fabs(getmin.v)); double maxg = MAX(fabs(getmax.u), fabs(getmax.v)); maxg = MAX(maxg,ming); minw = minw_all; maxw = maxw_all; maxg = maxg + offset*maxg; #pragma omp parallel for num_threads(param.num_threads) for (uint inorm=0; inorm < metaData.Nmeasures; inorm++) { data.uu[inorm] = (data.uu[inorm]+maxg) / (2.0*maxg); data.vv[inorm] = (data.vv[inorm]+maxg) / (2.0*maxg); data.ww[inorm] = (data.ww[inorm]-minw)/(maxw-minw); data.ww[inorm] = (data.ww[inorm]-getmin.w) / (getmax.w-getmin.w); } #endif // Create histograms and linked lists Loading Loading
Makefile +60 −16 Original line number Diff line number Diff line Loading @@ -98,29 +98,66 @@ endif DEPS = w-stacking.h main.c allvars.h ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) && ifneq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif # ------------------------------------------------------- # # here we define which OBJ files have to be compiled by who; # in fact, depending on the GPU-acceleration being on or off, # and on having AMD/NVidia GPUs, things may be different # # ------------------------------------------------------ # ----- define which files will be compiled by MPICC # # these are the OBJS that will be compiled by C compiler if no acceleration (neither with OpenACC nor with OpenMP) is provided CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o # these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o # ----- define which files will be compiled by NVCC for Nvidia # DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu OBJ_ACC_CUDA = phase_correction.o w-stacking.o # ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD # DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o # ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is # offloaded on GPU DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cpp OBJ_RCCL_REDUCE = gridding_rccl.o # ----------------------------------------------------- # # end of OBJ definition # ---------------------------------------------------- ifeq (ACCOMP,$(findstring ACCOMP, $(OPT))) OBJ = $(CC_OBJ_ACC) else ifeq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = $(CC_OBJ_ACC) else OBJ = $(CC_OBJ_NOACC) endif ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT))) CFLAGS += $(FFTW_MPI_INC) FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm endif # define rules for sources that contains GPU code # ifneq (CUDACC,$(findstring CUDACC,$(OPT))) w-stacking.c: w-stacking.cu cp w-stacking.cu w-stacking.c Loading Loading @@ -153,8 +190,23 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) OBJ += $(OBJ_ACC_CUDA) endif #NVIDIA GPUs ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifneq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT))) ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) # >>>>> AMD GPUs ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(MPICC) FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) OBJ += $(OBJ_ACC_OMP) # >>>> NVIDIA GPUs else EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(NVC) FLAGS=$(NVFLAGS) $(CFLAGS) Loading @@ -162,17 +214,9 @@ LIBS=$(NVLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(NVC) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_ACC_OMP) endif #AMD GPUs ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(MPICC) FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) OBJ += $(OBJ_ACC_OMP) endif Loading
gridding.c +50 −38 Original line number Diff line number Diff line Loading @@ -23,51 +23,63 @@ void gridding() if (rank==0) printf("NORMALIZING DATA\n"); double minu = 1e20; double minv = 1e20; double minw = 1e20; double maxu = -1e20; double maxv = -1e20; double maxw = -1e20; typedef struct { double u; double v; double w; } cmp_t; cmp_t getmin = { 1e20, 1e20, 1e20 }; cmp_t getmax = { 0 }; #pragma omp parallel num_threads(param.num_threads) { cmp_t mygetmin = { 1e20, 1e20, 1e20 }; cmp_t mygetmax = { 0 }; #pragma omp for for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) { minu = MIN(minu,data.uu[inorm]); minv = MIN(minv,data.vv[inorm]); minw = MIN(minw,data.ww[inorm]); maxu = MAX(maxu,data.uu[inorm]); maxv = MAX(maxv,data.vv[inorm]); maxw = MAX(maxw,data.ww[inorm]); mygetmin.u = MIN(mygetmin.u, data.uu[inorm]); mygetmin.v = MIN(mygetmin.v, data.vv[inorm]); mygetmin.w = MIN(mygetmin.w, data.ww[inorm]); mygetmax.u = MAX(mygetmax.u, data.uu[inorm]); mygetmax.v = MAX(mygetmax.v, data.vv[inorm]); mygetmax.w = MAX(mygetmax.w, data.ww[inorm]); } double minu_all; double minv_all; double minw_all; double maxu_all; double maxv_all; double maxw_all; #pragma omp critical (getmin_u) getmin.u = MIN( mygetmin.u, getmin.u ); #pragma omp critical (getmin_v) getmin.v = MIN( mygetmin.v, getmin.v ); #pragma omp critical (getmin_w) getmin.w = MIN( mygetmin.w, getmin.w ); #pragma omp critical (getmax_u) getmax.u = MAX( mygetmax.u, getmax.u ); #pragma omp critical (getmax_v) getmax.v = MAX( mygetmax.v, getmax.v ); #pragma omp critical (getmax_w) getmax.w = MAX( mygetmax.w, getmax.w ); } MPI_Allreduce(&minu,&minu_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minv,&minv_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minw,&minw_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&maxu,&maxu_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxv,&maxv_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxw,&maxw_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &getmin, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &getmax, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); double offset = 0.001; double ming = MAX(abs(minu_all),abs(minv_all)); double maxg = MAX(abs(maxu_all),abs(maxv_all)); double ming = MAX(fabs(getmin.u), fabs(getmin.v)); double maxg = MAX(fabs(getmax.u), fabs(getmax.v)); maxg = MAX(maxg,ming); minw = minw_all; maxw = maxw_all; maxg = maxg + offset*maxg; #pragma omp parallel for num_threads(param.num_threads) for (uint inorm=0; inorm < metaData.Nmeasures; inorm++) { data.uu[inorm] = (data.uu[inorm]+maxg) / (2.0*maxg); data.vv[inorm] = (data.vv[inorm]+maxg) / (2.0*maxg); data.ww[inorm] = (data.ww[inorm]-minw)/(maxw-minw); data.ww[inorm] = (data.ww[inorm]-getmin.w) / (getmax.w-getmin.w); } #endif // Create histograms and linked lists Loading