try to fix the issue on overwriting of stacking arrays (66b9e12b) · Commits · Claudio Gheller / HPC_Imaging

Build/Makefile.local

+24 −0

Original line number	Diff line number	Diff line
		@@ -13,10 +13,32 @@ CUDA_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/lib64
		FFTW_INCL=
		FFTW_LIB=


		##########################################################
		#NVIDIA CUFFTMP

		CUFFTMP_LIB = -L/.../lib64
		CUFFTMP_INCL = -I/.../include/cufftmp
		##########################################################


		##########################################################
		#NVIDIA NCCL REDUCE

		NCCL_INC = -I/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/comm_libs/nccl/include
		NCCL_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/comm_libs/nccl/lib
		##########################################################

		NVC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/compilers/bin/nvc
		NVFLAGS = -O4 -fast -march=native $(OMP_GPU) -mavx -mavx2
		NVLIB = $(CUDA_INC) $(CUDA_LIB) -lcuda -lcudart


		NVLIB_2 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(CUFFT_INC) $(CUFFT_LIB) $(NVSHMEM_INC) $(NVSHMEM_LIB) -lnvshmem_host -lnvshmem_device -lcuda -lcudart -lcufftMp

		NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(NCCL_INC) $(NCCL_LIB) -lcuda -lcudart -lnccl


		NVCC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/bin/nvcc
		OPT_NVCC = -std=c++17 --generate-code arch=compute_86,code=sm_86

		@@ -25,6 +47,7 @@ CFLAGS +=
		MPICHLIB =



		##########################################################
		#AMD GPUs (DEFAULT = LUMI)

		@@ -41,3 +64,4 @@ HIP_LIB= -L/opt/rocm-5.2.3/hip/lib

		AMDLIB = $(HIP_INCL) $(HIP_LIB) $(RCCL_INCL) $(RCCL_LIB) -D__HIP_PLATFORM_AMD__ -lamdhip64 -lrccl
		###########################################################

Makefile

+20 −4

Original line number	Diff line number	Diff line
		@@ -110,7 +110,7 @@ DEPS = w-stacking.h main.c allvars.h

		# ----- define which files will be compiled by MPICC
		#
		# these are the OBJS that will be compiled by C compiler if no acceleration (neither with OpenACC nor with OpenMP) is provided
		# these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided
		CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o

		# these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on
		@@ -136,6 +136,12 @@ OBJ_NCCL_REDUCE = gridding_nccl.o
		DEPS_RCCL_REDUCE = gridding_rccl.cpp
		OBJ_RCCL_REDUCE = gridding_rccl.o

		# ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
		#
		DEPS_ACC_CUFFTMP = w-stacking.h cuda_fft.cu
		OBJ_ACC_CUFFTMP = cuda_fft.o


		# -----------------------------------------------------
		#
		# end of OBJ definition
		@@ -240,6 +246,16 @@ $(OBJ_RCCL_REDUCE): $(DEPS_RCCL_REDUCE)
		OBJ += $(OBJ_RCCL_REDUCE)
		endif

		ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_acc-fft
		LINKER=$(MPIC++)
		FLAGS=$(OPTIMIZE)
		LIBS=$(NVLIB) $(NVLIB_2)
		$(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP)
		$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c $^ $(LIBS)
		OBJ += $(OBJ_ACC_CUFFTMP)
		endif


		###################################################################################

		@@ -258,7 +274,7 @@ clean:
		rm -f phase_correction.c

		cleanall:
		rm -f $(EXEC)*
		rm -f $(EXEC)$(EXT)
		rm -f *.o
		rm -f w-stacking.c
		rm -f phase_correction.c

README.md

+38 −11

Original line number	Diff line number	Diff line
		# HPC_Imaging

		Development of an imaging code able to exploit heterogeneous HPC resources.
		To compile the code, feel free to activate and deactivate options in the Makefile.
		You will find the code options before and then the acceleration options.

		#Glacopo
		Feel free to activate/deactivate in the Makefile the flags you want to test.
		You can simply run the code with the command:

		If you use GPUs for OpenMP and NCCL Reduce compile the CPU part with gcc, i.e.
		############################################

		make w-stacking

		############################################

		It will redirect you to the file Build/Makefile.local, which is complete enough
		apart from different library paths, feel free to use it or to change SYSTYPE.
		My aim was to make compilation as simple as possible.

		When you use GPU offloading with OpenMP, please do not compile the CPU part with NVC.
		This can be easily avoided by setting the following environment variables:

		############################################

		export OMPI_CC=gcc
		export OMPI_CXX=nvc++

		Please avoid compiling the CPU part with nvc, especially if the Luca NUMA machinery is active.
		Anyway, this is just for stacking and phase correction offloading; currently Ring Reduce and NCCL Reduce are mutually exclusive.
		###########################################

		If the default compiler is NVC, the Makefile is designed to determine
		which parts must be compiled with NVC for the OpenMP offloading.
		In this case, however, the final linker will be NVC/NVC++.

		The problem does not arise on AMD platforms, because you use clang/clang++ for both CPUs
		and GPUs.

		The extensions of the executable will be changed depending on the different acceleration
		options.

		To run the code, the data/paramfile.txt is available. Feel free to change the parameters,
		e.g. the path of the visibilities, which reduce implementation to use, the number of pixels,
		the number of OpenMP threads and so on.

		Once you have compiled the code, run it simply with the command:

		If you want, you can compile the cufftMp version, but it is not very efficient right now, so use the standard FFTW-MPI
		or the FFTW-MPI/OpenMP FFT, enabled simply with:
		###########################################

		OPT += -DHYBRID_FFTW
		mpirun -np <n> <executable> data/paramfile.txt

		###########################################

fourier_transform.c

+71 −1

Original line number	Diff line number	Diff line
		@@ -220,8 +220,31 @@ void write_fftw_data(){

		#ifdef WRITE_IMAGE

		double start_image = CPU_TIME_wt;

		if(rank == 0)
		{

		#ifdef FITSIO
		printf("REMOVING RESIDUAL FITS FILE\n");
		remove(testfitsreal);
		remove(testfitsimag);


		printf("FITS CREATION\n");
		status = 0;

		fits_create_file(&fptrimg, testfitsimag, &status);
		fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status);
		fits_close_file(fptrimg, &status);

		status = 0;

		fits_create_file(&fptreal, testfitsreal, &status);
		fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status);
		fits_close_file(fptreal, &status);
		#endif

		file.pFilereal = fopen (out.fftfile2,"wb");
		file.pFileimg = fopen (out.fftfile3,"wb");
		fclose(file.pFilereal);
		@@ -231,6 +254,31 @@ void write_fftw_data(){
		MPI_Barrier(MPI_COMM_WORLD);

		if(rank == 0)printf("WRITING IMAGE\n");

		#ifdef FITSIO
		uint * fpixel = (uint *) malloc(sizeof(uint)*naxis);
		uint * lpixel = (uint *) malloc(sizeof(uint)*naxis);
		#endif

		#ifdef FITSIO

		fpixel[0] = 1;
		fpixel[1] = rank*yaxis+1;
		lpixel[0] = xaxis;
		lpixel[1] = (rank+1)*yaxis;

		status = 0;
		fits_open_image(&fptreal, testfitsreal, READWRITE, &status);
		fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status);
		fits_close_file(fptreal, &status);

		status = 0;
		fits_open_image(&fptrimg, testfitsimag, READWRITE, &status);
		fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status);
		fits_close_file(fptrimg, &status);

		#endif //FITSIO

		for (int isector=0; isector<size; isector++)
		{

		@@ -238,7 +286,28 @@ void write_fftw_data(){

		if(isector == rank)
		{

		printf("%d writing\n",isector);

		#ifdef FITSIO

		fpixel[0] = 1;
		fpixel[1] = isector*yaxis+1;
		lpixel[0] = xaxis;
		lpixel[1] = (isector+1)*yaxis;

		status = 0;
		fits_open_image(&fptreal, testfitsreal, READWRITE, &status);
		fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status);
		fits_close_file(fptreal, &status);

		status = 0;
		fits_open_image(&fptrimg, testfitsimag, READWRITE, &status);
		fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status);
		fits_close_file(fptrimg, &status);

		#endif //FITSIO

		file.pFilereal = fopen (out.fftfile2,"ab");
		file.pFileimg = fopen (out.fftfile3,"ab");

		@@ -256,6 +325,7 @@ void write_fftw_data(){

		MPI_Barrier(MPI_COMM_WORLD);

		timing_wt.write += CPU_TIME_wt - start_image;

		#endif //WRITE_IMAGE

gridding_cpu.c

+6 −5

Original line number	Diff line number	Diff line
		@@ -145,7 +145,6 @@ void gridding_data()
		icount++;
		}


		double uumin = 1e20;
		double vvmin = 1e20;
		double uumax = -1e20;
		@@ -173,10 +172,11 @@ void gridding_data()
		vvmax = MAX( vvmax, my_vvmax );
		}

		timing_wt.compose += CPU_TIME_wt - start;

		//printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);


		timing_wt.compose += CPU_TIME_wt - start;

		// Make convolution on the grid

		#ifdef VERBOSE
		@@ -218,14 +218,14 @@ void gridding_data()
		printf("Processed sector %ld\n",isector);
		#endif

		start = CPU_TIME_wt;

		if( size > 1 )
		{
		// Write grid in the corresponding remote slab

		int target_rank = (int)(isector % size);

		start = CPU_TIME_wt;

		if( param.reduce_method == REDUCE_MPI )

		MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE, MPI_SUM, target_rank, MYMPI_COMM_WORLD);
		@@ -248,6 +248,7 @@ void gridding_data()
		}

		timing_wt.reduce += CPU_TIME_wt - start;

		// Go to next sector
		memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis*sizeof(double) );
		}