polishing (03d87005) · Commits · Claudio Gheller / HPC_Imaging

Makefile

+60 −16

Original line number	Diff line number	Diff line
		@@ -98,29 +98,66 @@ endif

		DEPS = w-stacking.h main.c allvars.h

		ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) && ifneq (CUDACC,$(findstring CUDACC, $(OPT)))
		OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
		else
		OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
		endif

		# -------------------------------------------------------
		#
		# here we define which OBJ files have to be compiled by who;
		# in fact, depending on the GPU-acceleration being on or off,
		# and on having AMD/NVidia GPUs, things may be different
		#
		# ------------------------------------------------------


		# ----- define which files will be compiled by MPICC
		#
		# these are the OBJS that will be compiled by C compiler if no acceleration (neither with OpenACC nor with OpenMP) is provided
		CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o

		# these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on
		CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o


		# ----- define which files will be compiled by NVCC for Nvidia
		#
		DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu
		OBJ_ACC_CUDA = phase_correction.o w-stacking.o

		# ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD
		#
		DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c
		OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o


		# ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is
		# offloaded on GPU
		DEPS_NCCL_REDUCE = gridding_nccl.cpp
		OBJ_NCCL_REDUCE = gridding_nccl.o

		DEPS_RCCL_REDUCE = gridding_rccl.cpp
		OBJ_RCCL_REDUCE = gridding_rccl.o

		# -----------------------------------------------------
		#
		# end of OBJ definition
		# ----------------------------------------------------


		ifeq (ACCOMP,$(findstring ACCOMP, $(OPT)))
		OBJ = $(CC_OBJ_ACC)
		else ifeq (CUDACC,$(findstring CUDACC, $(OPT)))
		OBJ = $(CC_OBJ_ACC)
		else
		OBJ = $(CC_OBJ_NOACC)
		endif

		ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT)))
		CFLAGS += $(FFTW_MPI_INC)
		FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm
		endif

		# define rules for sources that contains GPU code
		#

		ifneq (CUDACC,$(findstring CUDACC,$(OPT)))
		w-stacking.c: w-stacking.cu
		cp w-stacking.cu w-stacking.c
		@@ -153,8 +190,23 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
		OBJ += $(OBJ_ACC_CUDA)
		endif

		#NVIDIA GPUs
		ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifneq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT)))

		ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))

		# >>>>> AMD GPUs
		ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT)))

		EXEC_EXT := $(EXEC_EXT)_acc-omp
		LINKER=$(MPICC)
		FLAGS=$(OPTIMIZE_AMD) $(CFLAGS)
		LIBS=$(AMDLIB)
		$(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
		$(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS)
		OBJ += $(OBJ_ACC_OMP)

		# >>>> NVIDIA GPUs
		else

		EXEC_EXT := $(EXEC_EXT)_acc-omp
		LINKER=$(NVC)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		@@ -162,17 +214,9 @@ LIBS=$(NVLIB)
		$(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
		$(NVC) $(FLAGS) $(OPT) -c $^ $(LIBS)
		OBJ += $(OBJ_ACC_OMP)

		endif

		#AMD GPUs
		ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_acc-omp
		LINKER=$(MPICC)
		FLAGS=$(OPTIMIZE_AMD) $(CFLAGS)
		LIBS=$(AMDLIB)
		$(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
		$(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS)
		OBJ += $(OBJ_ACC_OMP)
		endif

gridding.c

+50 −38

Original line number	Diff line number	Diff line
		@@ -23,51 +23,63 @@ void gridding()
		if (rank==0)
		printf("NORMALIZING DATA\n");

		double minu = 1e20;
		double minv = 1e20;
		double minw = 1e20;
		double maxu = -1e20;
		double maxv = -1e20;
		double maxw = -1e20;
		typedef struct {
		double u;
		double v;
		double w; } cmp_t;


		cmp_t getmin = { 1e20, 1e20, 1e20 };
		cmp_t getmax = { 0 };

		#pragma omp parallel num_threads(param.num_threads)
		{
		cmp_t mygetmin = { 1e20, 1e20, 1e20 };
		cmp_t mygetmax = { 0 };

		#pragma omp for
		for (uint inorm=0; inorm<metaData.Nmeasures; inorm++)
		{
		minu = MIN(minu,data.uu[inorm]);
		minv = MIN(minv,data.vv[inorm]);
		minw = MIN(minw,data.ww[inorm]);
		maxu = MAX(maxu,data.uu[inorm]);
		maxv = MAX(maxv,data.vv[inorm]);
		maxw = MAX(maxw,data.ww[inorm]);
		mygetmin.u = MIN(mygetmin.u, data.uu[inorm]);
		mygetmin.v = MIN(mygetmin.v, data.vv[inorm]);
		mygetmin.w = MIN(mygetmin.w, data.ww[inorm]);

		mygetmax.u = MAX(mygetmax.u, data.uu[inorm]);
		mygetmax.v = MAX(mygetmax.v, data.vv[inorm]);
		mygetmax.w = MAX(mygetmax.w, data.ww[inorm]);
		}

		double minu_all;
		double minv_all;
		double minw_all;
		double maxu_all;
		double maxv_all;
		double maxw_all;
		#pragma omp critical (getmin_u)
		getmin.u = MIN( mygetmin.u, getmin.u );
		#pragma omp critical (getmin_v)
		getmin.v = MIN( mygetmin.v, getmin.v );
		#pragma omp critical (getmin_w)
		getmin.w = MIN( mygetmin.w, getmin.w );

		#pragma omp critical (getmax_u)
		getmax.u = MAX( mygetmax.u, getmax.u );
		#pragma omp critical (getmax_v)
		getmax.v = MAX( mygetmax.v, getmax.v );
		#pragma omp critical (getmax_w)
		getmax.w = MAX( mygetmax.w, getmax.w );
		}

		MPI_Allreduce(&minu,&minu_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
		MPI_Allreduce(&minv,&minv_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
		MPI_Allreduce(&minw,&minw_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
		MPI_Allreduce(&maxu,&maxu_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
		MPI_Allreduce(&maxv,&maxv_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
		MPI_Allreduce(&maxw,&maxw_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
		MPI_Allreduce(MPI_IN_PLACE, &getmin, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
		MPI_Allreduce(MPI_IN_PLACE, &getmax, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);

		double offset = 0.001;
		double ming = MAX(abs(minu_all),abs(minv_all));
		double maxg = MAX(abs(maxu_all),abs(maxv_all));
		double ming = MAX(fabs(getmin.u), fabs(getmin.v));
		double maxg = MAX(fabs(getmax.u), fabs(getmax.v));
		maxg = MAX(maxg,ming);
		minw = minw_all;
		maxw = maxw_all;
		maxg = maxg + offset*maxg;

		#pragma omp parallel for num_threads(param.num_threads)
		for (uint inorm=0; inorm < metaData.Nmeasures; inorm++)
		{
		data.uu[inorm] = (data.uu[inorm]+maxg) / (2.0*maxg);
		data.vv[inorm] = (data.vv[inorm]+maxg) / (2.0*maxg);
		data.ww[inorm] = (data.ww[inorm]-minw)/(maxw-minw);
		data.ww[inorm] = (data.ww[inorm]-getmin.w) / (getmax.w-getmin.w);
		}

		#endif

		// Create histograms and linked lists