Commit 03d87005 authored by Luca Tornatore's avatar Luca Tornatore
Browse files

polishing

parent 8689fa16
Loading
Loading
Loading
Loading
+60 −16
Original line number Diff line number Diff line
@@ -98,29 +98,66 @@ endif

DEPS = w-stacking.h  main.c allvars.h

ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) && ifneq (CUDACC,$(findstring CUDACC, $(OPT)))
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
else
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
endif

# -------------------------------------------------------
#
#  here we define which OBJ files have to be compiled by who;
#  in fact, depending on the GPU-acceleration being on or off,
#  and on having AMD/NVidia GPUs, things may be different
#
# ------------------------------------------------------


# ----- define which files will be compiled by MPICC
#
# these are the OBJS that will be compiled by C compiler if no acceleration (neither with OpenACC nor with OpenMP) is provided
CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o

# these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on
CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o


# ----- define which files will be compiled by NVCC for Nvidia
#
DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu
OBJ_ACC_CUDA = phase_correction.o w-stacking.o

# ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD
#
DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c
OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o


# ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is
#       offloaded on GPU
DEPS_NCCL_REDUCE = gridding_nccl.cpp
OBJ_NCCL_REDUCE = gridding_nccl.o

DEPS_RCCL_REDUCE = gridding_rccl.cpp
OBJ_RCCL_REDUCE = gridding_rccl.o

# -----------------------------------------------------
#
# end of OBJ definition
# ----------------------------------------------------


ifeq (ACCOMP,$(findstring ACCOMP, $(OPT)))
OBJ = $(CC_OBJ_ACC)
else ifeq (CUDACC,$(findstring CUDACC, $(OPT)))
OBJ = $(CC_OBJ_ACC)
else
OBJ = $(CC_OBJ_NOACC)
endif

ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT)))
CFLAGS += $(FFTW_MPI_INC)
FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm
endif

# define rules for sources that contains GPU code
#

ifneq (CUDACC,$(findstring CUDACC,$(OPT)))
w-stacking.c: w-stacking.cu
	cp w-stacking.cu w-stacking.c
@@ -153,8 +190,23 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
OBJ += $(OBJ_ACC_CUDA)
endif

#NVIDIA GPUs
ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifneq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT)))

ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))

# >>>>> AMD GPUs
ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT)))

EXEC_EXT := $(EXEC_EXT)_acc-omp
LINKER=$(MPICC)
FLAGS=$(OPTIMIZE_AMD) $(CFLAGS)
LIBS=$(AMDLIB) 
$(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
	$(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) 
OBJ += $(OBJ_ACC_OMP)

# >>>> NVIDIA GPUs
else

EXEC_EXT := $(EXEC_EXT)_acc-omp
LINKER=$(NVC)
FLAGS=$(NVFLAGS) $(CFLAGS)
@@ -162,17 +214,9 @@ LIBS=$(NVLIB)
$(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
	$(NVC) $(FLAGS) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_ACC_OMP)

endif

#AMD GPUs
ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-omp
LINKER=$(MPICC)
FLAGS=$(OPTIMIZE_AMD) $(CFLAGS)
LIBS=$(AMDLIB) 
$(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
	$(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) 
OBJ += $(OBJ_ACC_OMP)
endif


+50 −38
Original line number Diff line number Diff line
@@ -23,51 +23,63 @@ void gridding()
  if (rank==0)
    printf("NORMALIZING DATA\n");

  double minu = 1e20;
  double minv = 1e20;
  double minw = 1e20;
  double maxu = -1e20;
  double maxv = -1e20;
  double maxw = -1e20;
  typedef struct {
    double u;
    double v;
    double w; } cmp_t;

  
  cmp_t getmin = { 1e20, 1e20, 1e20 };
  cmp_t getmax = { 0 };

 #pragma omp parallel num_threads(param.num_threads)
  {
    cmp_t mygetmin = { 1e20, 1e20, 1e20 };
    cmp_t mygetmax = { 0 };

   #pragma omp for
    for (uint inorm=0; inorm<metaData.Nmeasures; inorm++)
      {
      minu = MIN(minu,data.uu[inorm]);
      minv = MIN(minv,data.vv[inorm]);
      minw = MIN(minw,data.ww[inorm]);
      maxu = MAX(maxu,data.uu[inorm]);
      maxv = MAX(maxv,data.vv[inorm]);
      maxw = MAX(maxw,data.ww[inorm]);
	mygetmin.u = MIN(mygetmin.u, data.uu[inorm]);
	mygetmin.v = MIN(mygetmin.v, data.vv[inorm]);
	mygetmin.w = MIN(mygetmin.w, data.ww[inorm]);
	
	mygetmax.u = MAX(mygetmax.u, data.uu[inorm]);
	mygetmax.v = MAX(mygetmax.v, data.vv[inorm]);
	mygetmax.w = MAX(mygetmax.w, data.ww[inorm]);
      }
    
  double minu_all;
  double minv_all;
  double minw_all;
  double maxu_all;
  double maxv_all;
  double maxw_all;
   #pragma omp critical (getmin_u)
    getmin.u = MIN( mygetmin.u, getmin.u );
   #pragma omp critical (getmin_v)
    getmin.v = MIN( mygetmin.v, getmin.v );
   #pragma omp critical (getmin_w)
    getmin.w = MIN( mygetmin.w, getmin.w );
    
   #pragma omp critical (getmax_u)
    getmax.u = MAX( mygetmax.u, getmax.u );
   #pragma omp critical (getmax_v)
    getmax.v = MAX( mygetmax.v, getmax.v );
   #pragma omp critical (getmax_w)
    getmax.w = MAX( mygetmax.w, getmax.w );
  }

  MPI_Allreduce(&minu,&minu_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&minv,&minv_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&minw,&minw_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&maxu,&maxu_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(&maxv,&maxv_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(&maxw,&maxw_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &getmin, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(MPI_IN_PLACE, &getmax, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);

  double offset = 0.001;
  double ming = MAX(abs(minu_all),abs(minv_all));
  double maxg = MAX(abs(maxu_all),abs(maxv_all));
  double ming = MAX(fabs(getmin.u), fabs(getmin.v));
  double maxg = MAX(fabs(getmax.u), fabs(getmax.v));
  maxg = MAX(maxg,ming);
  minw = minw_all;
  maxw = maxw_all;
  maxg = maxg + offset*maxg;

 #pragma omp parallel for num_threads(param.num_threads)
  for (uint inorm=0; inorm < metaData.Nmeasures; inorm++)
    {
      data.uu[inorm] = (data.uu[inorm]+maxg) / (2.0*maxg);
      data.vv[inorm] = (data.vv[inorm]+maxg) / (2.0*maxg);
      data.ww[inorm] = (data.ww[inorm]-minw)/(maxw-minw);
      data.ww[inorm] = (data.ww[inorm]-getmin.w) / (getmax.w-getmin.w);
    }

 #endif
  
  // Create histograms and linked lists