Commit 66b9e12b authored by Luca Tornatore's avatar Luca Tornatore
Browse files

try to fix the issue on overwriting of stacking arrays

parents e6a47938 98350996
Loading
Loading
Loading
Loading
+24 −0
Original line number Diff line number Diff line
@@ -13,10 +13,32 @@ CUDA_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/lib64
FFTW_INCL=
FFTW_LIB=


##########################################################
#NVIDIA CUFFTMP

CUFFTMP_LIB  = -L/.../lib64
CUFFTMP_INCL = -I/.../include/cufftmp
##########################################################


##########################################################
#NVIDIA NCCL REDUCE

NCCL_INC = -I/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/comm_libs/nccl/include
NCCL_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/comm_libs/nccl/lib
##########################################################

NVC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/compilers/bin/nvc 
NVFLAGS = -O4 -fast -march=native $(OMP_GPU) -mavx -mavx2  
NVLIB = $(CUDA_INC) $(CUDA_LIB) -lcuda -lcudart


NVLIB_2 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(CUFFT_INC) $(CUFFT_LIB) $(NVSHMEM_INC) $(NVSHMEM_LIB) -lnvshmem_host -lnvshmem_device -lcuda -lcudart -lcufftMp

NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(NCCL_INC) $(NCCL_LIB) -lcuda -lcudart -lnccl


NVCC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/bin/nvcc
OPT_NVCC   = -std=c++17 --generate-code arch=compute_86,code=sm_86 

@@ -25,6 +47,7 @@ CFLAGS +=
MPICHLIB =



##########################################################
#AMD GPUs (DEFAULT = LUMI)

@@ -41,3 +64,4 @@ HIP_LIB= -L/opt/rocm-5.2.3/hip/lib

AMDLIB = $(HIP_INCL) $(HIP_LIB) $(RCCL_INCL) $(RCCL_LIB) -D__HIP_PLATFORM_AMD__ -lamdhip64 -lrccl
###########################################################
+20 −4
Original line number Diff line number Diff line
@@ -110,7 +110,7 @@ DEPS = w-stacking.h main.c allvars.h

# ----- define which files will be compiled by MPICC
#
# these are the OBJS that will be compiled by C compiler if no acceleration (neither with OpenACC nor with OpenMP) is provided
# these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided
CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o

# these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on
@@ -136,6 +136,12 @@ OBJ_NCCL_REDUCE = gridding_nccl.o
DEPS_RCCL_REDUCE = gridding_rccl.cpp
OBJ_RCCL_REDUCE  = gridding_rccl.o

# ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
#
DEPS_ACC_CUFFTMP = w-stacking.h cuda_fft.cu
OBJ_ACC_CUFFTMP  = cuda_fft.o


# -----------------------------------------------------
#
# end of OBJ definition
@@ -240,6 +246,16 @@ $(OBJ_RCCL_REDUCE): $(DEPS_RCCL_REDUCE)
OBJ += $(OBJ_RCCL_REDUCE)
endif

ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-fft
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE)
LIBS=$(NVLIB) $(NVLIB_2)
$(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP)
	$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c $^ $(LIBS)
OBJ += $(OBJ_ACC_CUFFTMP)
endif


###################################################################################

@@ -258,7 +274,7 @@ clean:
	rm -f phase_correction.c

cleanall:
	rm -f $(EXEC)*
	rm -f $(EXEC)$(EXT)
	rm -f *.o
	rm -f w-stacking.c
	rm -f phase_correction.c
+38 −11
Original line number Diff line number Diff line
# HPC_Imaging

Development of a code for imaging, enabled to exploit heterogeneous HPC resources.
To compile the code, feel free to activate and deactivate options in the Makefile.
You will find the code options before and then the acceleration options.

#Glacopo
Feel free to activate/deactivate in the Makefile the flags for whatever you want to test.
You can simply run the code with the command:

If you use GPUs for OpenMP offloading and NCCL Reduce, compile the CPU part with gcc, i.e.
############################################

make w-stacking

############################################

It will redirect you to the file Build/Makefile.local, which is complete enough
apart from different library paths, feel free to use it or to change SYSTYPE.
My aim was to make compilation as simple as possible.

When you use GPU offloading with OpenMP, please do not compile the CPU part with NVC.
This can be easily fixed by setting the environment variable:

############################################

export OMPI_CC=gcc
export OMPI_CXX=nvc++

Please avoid compiling the CPU part with nvc, especially if the Luca NUMA machinery is active.
Anyway, this is just for stacking and phase-correction offloading; currently Ring Reduce and NCCL Reduce are mutually exclusive.
###########################################

In the case in which the default compiler is NVC, the Makefile is able to determine
which parts are to be compiled with NVC for the OpenMP offloading.
The final linker in this case will nevertheless be NVC/NVC++.

The problem does not arise on AMD platforms, because you use clang/clang++ for both CPUs
and GPUs.

The extensions of the executable will be changed depending on the different acceleration
options.

To run the code, the data/paramfile.txt is available. Feel free to change the parameters,
i.e. the path of the visibilities, which reduce implementation to use, the number of pixels,
the number of OpenMP threads, and so on.

Once you have compiled the code, run it simply with the command:

If you want you can compile the cufftMp but it's not very efficient right now, so use the standard FFTW-MPI
or FFTW-MPI/OpenMP FFT tagged simply as:
###########################################

OPT += -DHYBRID_FFTW
mpirun -np <n> <executable> data/paramfile.txt

###########################################



+71 −1
Original line number Diff line number Diff line
@@ -220,8 +220,31 @@ void write_fftw_data(){
  
 #ifdef WRITE_IMAGE

  double start_image = CPU_TIME_wt;
  
  if(rank == 0)
    {

     #ifdef FITSIO
      printf("REMOVING RESIDUAL FITS FILE\n");
      remove(testfitsreal);
      remove(testfitsimag);


      printf("FITS CREATION\n");
      status = 0;

      fits_create_file(&fptrimg, testfitsimag, &status);
      fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status);
      fits_close_file(fptrimg, &status);

      status = 0;

      fits_create_file(&fptreal, testfitsreal, &status);
      fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status);
      fits_close_file(fptreal, &status);
     #endif
      
      file.pFilereal = fopen (out.fftfile2,"wb");
      file.pFileimg = fopen (out.fftfile3,"wb");
      fclose(file.pFilereal);
@@ -231,6 +254,31 @@ void write_fftw_data(){
  MPI_Barrier(MPI_COMM_WORLD);

  if(rank == 0)printf("WRITING IMAGE\n");

 #ifdef FITSIO
  uint * fpixel = (uint *) malloc(sizeof(uint)*naxis);
  uint * lpixel = (uint *) malloc(sizeof(uint)*naxis);
 #endif

 #ifdef FITSIO

  fpixel[0] = 1;
  fpixel[1] = rank*yaxis+1;
  lpixel[0] = xaxis;
  lpixel[1] = (rank+1)*yaxis;

  status = 0;
  fits_open_image(&fptreal, testfitsreal, READWRITE, &status);
  fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status);
  fits_close_file(fptreal, &status);

  status = 0;
  fits_open_image(&fptrimg, testfitsimag, READWRITE, &status);
  fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status);
  fits_close_file(fptrimg, &status);

 #endif //FITSIO

  for (int isector=0; isector<size; isector++)
    {

@@ -238,7 +286,28 @@ void write_fftw_data(){
      
      if(isector == rank)
	{

	  printf("%d writing\n",isector);

	 #ifdef FITSIO

	  fpixel[0] = 1;
	  fpixel[1] = isector*yaxis+1;
	  lpixel[0] = xaxis;
	  lpixel[1] = (isector+1)*yaxis;

	  status = 0;
	  fits_open_image(&fptreal, testfitsreal, READWRITE, &status);
	  fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status);
	  fits_close_file(fptreal, &status);

	  status = 0;
	  fits_open_image(&fptrimg, testfitsimag, READWRITE, &status);
	  fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status);
	  fits_close_file(fptrimg, &status);

	 #endif //FITSIO

	  file.pFilereal = fopen (out.fftfile2,"ab");
	  file.pFileimg = fopen (out.fftfile3,"ab");

@@ -256,6 +325,7 @@ void write_fftw_data(){
 
  MPI_Barrier(MPI_COMM_WORLD);

  timing_wt.write += CPU_TIME_wt - start_image;

 #endif //WRITE_IMAGE

+6 −5
Original line number Diff line number Diff line
@@ -145,7 +145,6 @@ void gridding_data()
	  icount++;
	}
      

      double uumin = 1e20;
      double vvmin = 1e20;
      double uumax = -1e20;
@@ -173,10 +172,11 @@ void gridding_data()
	vvmax = MAX( vvmax, my_vvmax );
      }

      timing_wt.compose += CPU_TIME_wt - start;
      
      //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
      

      timing_wt.compose += CPU_TIME_wt - start;
      
      // Make convolution on the grid

     #ifdef VERBOSE
@@ -218,14 +218,14 @@ void gridding_data()
      printf("Processed sector %ld\n",isector);
     #endif

      start = CPU_TIME_wt;
    
      if( size > 1 )
	{
	  // Write grid in the corresponding remote slab
	  
	  int target_rank = (int)(isector % size);

	  start = CPU_TIME_wt;
	  
	  if( param.reduce_method == REDUCE_MPI )
	   
	    MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE, MPI_SUM, target_rank, MYMPI_COMM_WORLD);
@@ -248,6 +248,7 @@ void gridding_data()
	    }
	  
	  timing_wt.reduce += CPU_TIME_wt - start;

	  // Go to next sector
	  memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );	  
	}	
Loading