New implementation compatible with gcc and GPU info (668da3ac) · Commits · Claudio Gheller / HPC_Imaging

Build/Makefile.Magellanus

+16 −14

Original line number	Diff line number	Diff line
		@@ -4,29 +4,31 @@ CXX = g++-10
		MPICC = mpicc
		MPIC++ = mpiCC

		OPTIMIZE = -O3
		#-ffast-math -fopt-info-all-omp -fcf-protection=none -fno-stack-protector -foffload=nvptx-none


		GSL_INCL = -I/home/taffoni/sw/include #-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
		GSL_LIBS = -L/home/taffoni/sw/lib #-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
		GSL_INCL = -I/home/taffoni/sw/include
		GSL_LIBS = -L/home/taffoni/sw/lib

		FFTW_INCL= -I/home/taffoni/sw/include
		FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3

		#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
		MPI_LIB =
		MPI_INCL=
		#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
		MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
		HDF5_INCL =
		HDF5_LIB =

		OMP = -fopenmp

		OMP = -mp=multicore,gpu -Mprof -cuda
		#OMP = -fopenmp
		NVCC = nvcc
		NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11
		NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda

		CFLAGS += $(OPTIMIZE)
		CFLAGS += -I.
		CFLAGS += -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
		CFLAGS += $(FFTW_INCL) $(GSL_INCL)
		CFLAGS += $(FFTW_LIB) -lm

		CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)

		OPTIMIZE = $(OMP) -O3

		# OMP GPU SPECIFIC FLAGS
		#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math
		#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35
		#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none

Build/Makefile.Marconi

+5 −5

Original line number	Diff line number	Diff line
		@@ -5,16 +5,16 @@ MPICC = mpicc
		MPIC++ = mpiCC


		CFLAGS += -O3 -mcpu=native
		CFLAGS += -I.
		FFTW_INCL= -I/home/taffoni/sw/include
		FFTW_LIB= -L/home/taffoni/sw/lib

		LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm

		NVCC = nvcc
		NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
		NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda

		OMP= -fopenmp

		CFLAGS += -O3 -mtune=native
		CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)

		OPTIMIZE = $(OMP) -O3 -mtune=native

Makefile

+23 −13

Original line number	Diff line number	Diff line
		@@ -14,21 +14,22 @@ include Build/Makefile.systype
		endif




		LIBS = $(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda

		# create MPI code
		OPT += -DUSE_MPI

		#OPT += -DACCOMP
		# use FFTW (it can be switched on ONLY if MPI is active)
		ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
		OPT += -DUSE_FFTW
		LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
		endif

		#OPT += -DNVIDIA
		# perform one-side communication (suggested) instead of reduce (only if MPI is active)
		OPT += -DONE_SIDE
		# write the full 3D cube of gridded visibilities and its FFT transform
		#OPT += -DWRITE_DATA
		OPT += -DWRITE_DATA
		# write the final image
		OPT += -DWRITE_IMAGE
		# perform w-stacking phase correction
		@@ -46,27 +47,36 @@ phase_correction.c: phase_correction.cu

		ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
		%.o: %.c $(DEPS)
		$(MPICC) -c -o $@ $< $(CFLAGS) $(OPT)
		$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
		else
		%.o: %.c $(DEPS)
		$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
		$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
		endif

		serial: $(COBJ)
		$(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
		$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS)

		serial_omp: phase_correction.c
		$(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c $(CFLAGS) $(LIBS)

		simple_mpi: phase_correction.c
		$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)

		mpi_omp: phase_correction.c
		$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)

		serial_cuda:
		$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
		$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
		$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
		$(CC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
		$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm

		mpi: $(COBJ)
		$(MPICC) -o w-stackingCfftw $^ $(CFLAGS)
		$(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS)

		mpi_cuda:
		$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
		$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
		$(MPIC++) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS)
		$(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
		$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS)

		clean:
		rm *.o

phase_correction.cu

+16 −8

Original line number	Diff line number	Diff line
		@@ -18,7 +18,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
		for (int iw=0; iw<num_w_planes; iw++)
		{
		double wterm = (double)iw/dnum_w_planes;

		#ifdef ACCOMP
		#pragma omp target teams distribute parallel for \
		map(tofrom: image_real[0:xaxisyaxis], image_imag[0:xaxisyaxis]) \
		map (to: gridss[0:2num_w_planesxaxis*yaxis])
		#endif
		for (int iv=0; iv<yaxis; iv++)
		for (int iu=0; iu<xaxis; iu++)
		{
		@@ -53,10 +57,14 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
		s = pimag;

		//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
		#pragma omp atomic
		image_real[img_index] += pr-qs;
		#pragma omp atomic
		image_imag[img_index] += ps+qr;
		#else
		#pragma omp atomic
		image_real[img_index] += gridss[index];
		#pragma omp atomic
		image_imag[img_index] += gridss[index+1];
		#endif

w-stacking-fftw.c

+212 −163

Original line number	Diff line number	Diff line
		@@ -7,9 +7,15 @@
		#include <fftw3-mpi.h>
		#endif
		#endif
		#include <omp.h>
		#include <math.h>
		#include <time.h>
		#include <unistd.h>
		#ifdef ACCOMP
		#include "w-stacking_omp.h"
		#else
		#include "w-stacking.h"
		#endif
		#define PI 3.14159265359
		#define NUM_OF_SECTORS -1
		#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
		@@ -100,6 +106,8 @@ int main(int argc, char * argv[])
		int xaxis;
		int yaxis;
		int num_w_planes = 1;


		// DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization
		int w_support = 7;
		int num_threads;// = 4;
		@@ -115,11 +123,23 @@ int main(int argc, char * argv[])
		struct timespec begin, finish, begin0, begink, finishk;
		double elapsed;
		long nsectors;
		/* GT get nymber of threads exit if not given */
		if(argc == 1) {
		fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
		exit(1);
		}
		// Set the number of OpenMP threads
		num_threads = atoi(argv[1]);

		if ( num_threads == 0 )
		{
		fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]);
		fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
		exit(1);
		}

		clock_gettime(CLOCK_MONOTONIC, &begin0);
		start0 = clock();
		// Set the number of OpenMP threads
		num_threads = atoi(argv[1]);

		// Intialize MPI environment
		#ifdef USE_MPI
		@@ -134,8 +154,22 @@ int main(int argc, char * argv[])
		rank = 0;
		size = 1;
		#endif

		if(rank == 0)printf("Running with %d threads\n",num_threads);

		#ifdef ACCOMP
		if(rank == 0){
		if (0 == omp_get_num_devices()) {
		printf("No accelerator found ... exit\n");
		exit(255);
		}
		printf("Number of available GPUs %d\n", omp_get_num_devices());
		#ifdef NVIDIA
		prtAccelInfo();
		#endif
		}
		#endif

		// set the local size of the image
		local_grid_size_x = grid_size_x;
		nsectors = NUM_OF_SECTORS;
		@@ -211,6 +245,7 @@ int main(int argc, char * argv[])
		printf("N. visibilities on %d %ld\n",rank,Nvis);
		#endif


		// DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used
		// all the sizes are rescaled by the number of MPI tasks
		// Allocate arrays
		@@ -260,13 +295,13 @@ int main(int argc, char * argv[])
		setup_time1 = (finish.tv_sec - begin.tv_sec);
		setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;


		if(rank == 0)printf("GRIDDING DATA\n");

		// Create histograms and linked lists
		clock_gettime(CLOCK_MONOTONIC, &begin);
		start = clock();

		//CLAAA
		// Initialize linked list
		struct sectorlist ** sectorhead;
		sectorhead = (struct sectorlist *) malloc((nsectors+1) sizeof(struct sectorlist));
		@@ -335,20 +370,18 @@ int main(int argc, char * argv[])
		#ifndef USE_MPI
		double * gridtot = (double) calloc(2grid_size_xgrid_size_ynum_w_planes,sizeof(double));
		#endif

		double shift = (double)(dx*yaxis);

		// Open the MPI Memory Window for the slab
		#ifdef USE_MPI
		MPI_Win slabwin;
		MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
		MPI_Win_fence(0,slabwin);
		#endif

		#ifndef USE_MPI
		pFile1 = fopen (outfile1,"w");
		#endif


		// loop over files
		//
		kernel_time = 0.0;
		@@ -357,9 +390,9 @@ int main(int argc, char * argv[])
		reduce_time1 = 0.0;
		compose_time = 0.0;
		compose_time1 = 0.0;

		for (int ifiles=0; ifiles<ndatasets; ifiles++)
		{

		strcpy(filename,datapath_multi[ifiles]);
		printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
		strcat(filename,weightsfile);
		@@ -368,20 +401,21 @@ int main(int argc, char * argv[])
		fseek (pFile,startrowpolarisationssizeof(float),SEEK_SET);
		fread(weights,(Nweights)*sizeof(float),1,pFile);
		fclose(pFile);

		strcpy(filename,datapath);
		strcat(filename,visrealfile);
		//printf("Reading %s\n",filename);
		#ifdef VERBOSE
		printf("Reading %s\n",filename);
		#endif

		pFile = fopen (filename,"rb");
		fseek (pFile,startrowfreq_per_chanpolarisations*sizeof(float),SEEK_SET);
		fread(visreal,Nvis*sizeof(float),1,pFile);
		fclose(pFile);

		strcpy(filename,datapath);
		strcat(filename,visimgfile);
		//printf("Reading %s\n",filename);

		#ifdef VERBOSE
		printf("Reading %s\n",filename);
		#endif
		pFile = fopen (filename,"rb");
		fseek (pFile,startrowfreq_per_chanpolarisations*sizeof(float),SEEK_SET);
		fread(visimg,Nvis*sizeof(float),1,pFile);
		@@ -390,7 +424,6 @@ int main(int argc, char * argv[])
		#ifdef USE_MPI
		MPI_Barrier(MPI_COMM_WORLD);
		#endif

		// Declare temporary arrays for the masking
		double * uus;
		double * vvs;
		@@ -398,8 +431,8 @@ int main(int argc, char * argv[])
		float * visreals;
		float * visimgs;
		float * weightss;

		long isector;

		for (long isector_count=0; isector_count<nsectors; isector_count++)
		{
		clock_gettime(CLOCK_MONOTONIC, &begink);
		@@ -425,13 +458,15 @@ int main(int argc, char * argv[])
		//CLAAAA
		struct sectorlist * current;
		current = sectorhead[isector];



		while (current->index != -1)
		{
		long ilocal = current->index;
		//double vvh = vv[ilocal];
		//int binphi = (int)(vvh*nsectors);
		//if (binphi == isector \|\| boundary[ilocal] == isector)
		//{
		//if (binphi == isector \|\| boundary[ilocal] == isector) {
		uus[icount] = uu[ilocal];
		vvs[icount] = vv[ilocal]-isector*shift;
		wws[icount] = ww[ilocal];
		@@ -457,6 +492,7 @@ int main(int argc, char * argv[])
		compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
		compose_time1 += (finishk.tv_sec - begink.tv_sec);
		compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;

		#ifndef USE_MPI
		double uumin = 1e20;
		double vvmin = 1e20;
		@@ -483,6 +519,7 @@ int main(int argc, char * argv[])
		#endif
		clock_gettime(CLOCK_MONOTONIC, &begink);
		startk = clock();

		wstack(num_w_planes,
		Nsec,
		freq_per_chan,
		@@ -500,6 +537,18 @@ int main(int argc, char * argv[])
		yaxis,
		gridss,
		num_threads);


		/* int z =0 ;
		#pragma omp target map(to:test_i_gpu) map(from:z)
		{
		int x; // only accessible from accelerator
		x = 2;
		z = x + test_i_gpu;
		}*/



		clock_gettime(CLOCK_MONOTONIC, &finishk);
		endk = clock();
		kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;