From 7a25e80b716d909c8aac80662d4ce0d029242367 Mon Sep 17 00:00:00 2001 From: Giuliano Taffoni Date: Tue, 23 Nov 2021 20:50:44 +0100 Subject: [PATCH 1/4] OpenMP GPU implementation --- .gitignore | 2 + Build/Makefile.Macosx | 26 +++++++++++++ Build/Makefile.Magellanus | 34 +++++++++++++++++ Build/Makefile.Marconi | 20 ++++++++++ Build/Makefile.systype | 24 ++++++++++++ Makefile | 79 ++++++++++++++++++++++++--------------- phase_correction.cu | 16 +++++--- w-stacking-fftw.c | 67 ++++++++++++++++++++------------- w-stacking.cu | 32 ++++++++++++---- w-stacking.h | 2 +- 10 files changed, 233 insertions(+), 69 deletions(-) create mode 100644 Build/Makefile.Macosx create mode 100644 Build/Makefile.Magellanus create mode 100644 Build/Makefile.Marconi create mode 100644 Build/Makefile.systype diff --git a/.gitignore b/.gitignore index 797747a..9cce9de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.DS_Store +sync.sh *.o phase_correction.c w-stacking.c diff --git a/Build/Makefile.Macosx b/Build/Makefile.Macosx new file mode 100644 index 0000000..cdb0e13 --- /dev/null +++ b/Build/Makefile.Macosx @@ -0,0 +1,26 @@ +CC = icc +CXX = g++-11 + +MPICC = mpicc +MPIC++ = mpiCC + +FFTW_INCL= -I/usr/local/include +FFTW_LIB= -L/usr/local/lib/ + +GSL_INCL = +GSL_LIBS = + +MPI_LIB = +MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include +HDF5_INCL = +HDF5_LIB = + +OMP= -fopenmp + +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 +NVLIB = -L/home/taffoni -lcudart -lcuda + +CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) + +OPTIMIZE = $(OMP) -O3 diff --git a/Build/Makefile.Magellanus b/Build/Makefile.Magellanus new file mode 100644 index 0000000..36c9169 --- /dev/null +++ b/Build/Makefile.Magellanus @@ -0,0 +1,34 @@ +CC = gcc-10 +CXX = g++-10 + +MPICC = mpicc +MPIC++ = mpiCC + +GSL_INCL = -I/home/taffoni/sw/include +GSL_LIBS = -L/home/taffoni/sw/lib + +FFTW_INCL= -I/home/taffoni/sw/include +FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3 + +#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi +MPI_LIB = +#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include +MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include +HDF5_INCL = +HDF5_LIB = + +OMP = -mp=multicore,gpu -Mprof -cuda -lcudart +#OMP = -fopenmp +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11 +NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda + + +CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) + +OPTIMIZE = $(OMP) -O3 + +# OMP GPU SPECIFIC FLAGS +#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math +#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35 +#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none diff --git a/Build/Makefile.Marconi b/Build/Makefile.Marconi new file mode 100644 index 0000000..4fec93a --- /dev/null +++ b/Build/Makefile.Marconi @@ -0,0 +1,20 @@ +CC = gcc +CXX = g++ + +MPICC = mpicc +MPIC++ = mpiCC + + +FFTW_INCL= -I/home/taffoni/sw/include +FFTW_LIB= -L/home/taffoni/sw/lib + + +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 +NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda + +OMP= -fopenmp + +CFLAGS += -I. 
$(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3 -mtune=native
diff --git a/Build/Makefile.systype b/Build/Makefile.systype
new file mode 100644
index 0000000..0492232
--- /dev/null
+++ b/Build/Makefile.systype
@@ -0,0 +1,24 @@
+CC = gcc-10
+CXX = g++-10
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+OPTIMIZE =
+
+
+GSL_INCL =
+GSL_LIB =
+
+FFTW_INCL=
+FFTW_LIB=
+
+NVCC =
+NVFLAGS =
+NVLIB =
+
+CFLAGS +=
+
+MPICHLIB =
+HDF5INCL =
+HDF5LIB =
diff --git a/Makefile b/Makefile
index 99d1389..1e2c653 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,31 @@
 # comment/uncomment the various options depending how you want to build the program
+# Set default values for compiler options if no systype options are given or found
+CC = mpiCC
+CXX = mpiCC
+OPTIMIZE = -std=c++11 -Wall -g -O2
+MPICHLIB = -lmpich
+SWITCHES =
+
+ifdef SYSTYPE
+SYSTYPE := $(SYSTYPE)
+include Build/Makefile.$(SYSTYPE)
+else
+include Build/Makefile.systype
+endif
+
+
+LIBS = $(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda
+
 # create MPI code
 OPT += -DUSE_MPI
+OPT += -DACCOMP
 # use FFTW (it can be switched on ONLY if MPI is active)
-OPT += -DUSE_FFTW
+ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
+	OPT += -DUSE_FFTW
+	LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
+endif
+
+#OPT += -DNVIDIA
 # perform one-side communication (suggested) instead of reduce (only if MPI is active)
 OPT += -DONE_SIDE
 # write the full 3D cube of gridded visibilities and its FFT transform
@@ -10,25 +33,8 @@ OPT += -DONE_SIDE
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
-#OPT += -DPHASE_ON
-
-CC = gcc
-CXX = g++
-ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
-   CC = mpicc
-   CXX = mpiCC
-endif
-
-OMP = -fopenmp
-#OMP =
-
-CFLAGS += -O3 -mcpu=native
-CFLAGS += -I.
-LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
+#OPT += -DPHASE_ON
-
-NVCC = nvcc
-NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
-NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
 
 DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
 COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
@@ -39,27 +45,40 @@ w-stacking.c: w-stacking.cu
 phase_correction.c: phase_correction.cu
 	cp phase_correction.cu phase_correction.c
 
+ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
+%.o: %.c $(DEPS)
+	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+else
 %.o: %.c $(DEPS)
-	$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
+	$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+endif
 
 serial: $(COBJ)
-	$(CC) $(OMP) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS)
+
+serial_omp: phase_correction.c
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c $(CFLAGS) $(LIBS)
+
+simple_mpi: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
+
+mpi_omp: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
 
 serial_cuda:
-	$(NVCC) $(OPT) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
+	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
+	$(CC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
+	$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
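
The SYSTYPE block introduced above follows the Gadget-style build convention: invoking, for example, `make SYSTYPE=Marconi mpi_cuda` pulls in Build/Makefile.Marconi, while a build without SYSTYPE falls back to Build/Makefile.systype. The -D switches collected in OPT reach the sources as ordinary preprocessor guards; a minimal sketch of how the code consumes them (illustrative only, not a file from this patch):

    /* sketch: the OPT += -D... switches in the Makefile above select
       code paths through plain preprocessor guards */
    #include <stdio.h>

    int main(void)
    {
    #ifdef USE_MPI
        printf("-DUSE_MPI: distributed gridding path compiled in\n");
    #endif
    #ifdef ACCOMP
        printf("-DACCOMP: OpenMP target-offload kernels compiled in\n");
    #endif
    #ifdef ONE_SIDE
        printf("-DONE_SIDE: MPI_Accumulate used instead of MPI_Reduce\n");
    #endif
        return 0;
    }
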
-mpi: $(COBJ) - $(CC) $(OMP) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS) +mpi: $(COBJ) + $(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS) mpi_cuda: - $(NVCC) $(NVFLAGS) $(OPT) -c w-stacking.cu phase_correction.cu $(NVLIB) - $(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c - $(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(LIBS) -lm + $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) + $(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS) + $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) clean: rm *.o rm w-stacking.c rm phase_correction.c - diff --git a/phase_correction.cu b/phase_correction.cu index 9a43924..52eb4e0 100644 --- a/phase_correction.cu +++ b/phase_correction.cu @@ -8,7 +8,7 @@ #ifdef __CUDACC__ -__global__ void phase_g(int xaxis, +__global__ void phase_g(int xaxis, int yaxis, int num_w_planes, double * gridss, @@ -31,7 +31,7 @@ __global__ void phase_g(int xaxis, if(gid < arraysize) { long gid_aux = nbucket*gid; - for(int iaux=0; iauxindex = data; newNode->next = *headRef; - *headRef = newNode; + *headRef = newNode; } // Main Code -int main(int argc, char * argv[]) +int main(int argc, char * argv[]) { int rank; int size; - FILE * pFile; + FILE * pFile; FILE * pFile1; FILE * pFilereal; FILE * pFileimg; @@ -50,7 +50,7 @@ int main(int argc, char * argv[]) // //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/"; //char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"; - char datapath[900]; + char datapath[900]; char datapath_multi[NFILES][900]; char ufile[30] = "ucoord.bin"; @@ -117,6 +117,11 @@ int main(int argc, char * argv[]) struct timespec begin, finish, begin0, begink, finishk; double elapsed; long nsectors; + /* GT get nymber of threads exit if not given */ + if(argc == 1) { + fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]); + exit(1); + } clock_gettime(CLOCK_MONOTONIC, &begin0); start0 = clock(); @@ -144,7 +149,7 @@ int main(int argc, char * argv[]) if (nsectors < 0) nsectors = size; local_grid_size_y = grid_size_y/nsectors; //nsectors = size; - + // LOCAL grid size xaxis = local_grid_size_x; yaxis = local_grid_size_y; @@ -156,14 +161,15 @@ int main(int argc, char * argv[]) int ndatasets = 1; //strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/"); //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"); - strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); + //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); - + strcpy(datapath_multi[0],"/data/taffoni/LAVORO/IMAGING/data/newgauss2noconj_t201806301100_SBL180.binMS/"); strcpy(datapath,datapath_multi[0]); // Read metadata strcpy(filename,datapath); strcat(filename,metafile); - pFile = fopen (filename,"r"); + // GT CHECK IF FILE EXISTS OR EXIT + if (pFile = fopen (filename,"r")) { fscanf(pFile,"%ld",&Nmeasures); fscanf(pFile,"%ld",&Nvis); fscanf(pFile,"%ld",&freq_per_chan); @@ -176,7 +182,16 @@ int main(int argc, char * argv[]) 
fscanf(pFile,"%lf",&uvmax);
 	fscanf(pFile,"%lf",&wmin);
 	fscanf(pFile,"%lf",&wmax);
-	fclose(pFile);
+	fclose(pFile);
+	} else {
+		printf("Input file does not exist: %s\n", filename);
+		#ifdef USE_MPI
+		MPI_Finalize();
+		#endif
+		exit(256);
+	}
+
+
 
 	// WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -199,7 +214,7 @@ int main(int argc, char * argv[])
 	// Set temporary local size of points
 	long nm_pe = (long)(Nmeasures/size);
 	long remaining = Nmeasures%size;
-	
+
 	long startrow = rank*nm_pe;
 	if (rank == size-1)nm_pe = nm_pe+remaining;
@@ -268,7 +283,7 @@ int main(int argc, char * argv[])
 	// Create histograms and linked lists
 	clock_gettime(CLOCK_MONOTONIC, &begin);
 	start = clock();
-	
+
 	//CLAAA
 	// Initialize linked list
 	struct sectorlist ** sectorhead;
@@ -438,7 +453,7 @@ int main(int argc, char * argv[])
 	// define local destination sector
 	//isector = (isector_count+rank)%size;
 	isector = isector_count;
-	// allocate sector arrays 
+	// allocate sector arrays
 	long Nsec = histo_send[isector];
 	uus = (double*) malloc(Nsec*sizeof(double));
 	vvs = (double*) malloc(Nsec*sizeof(double));
@@ -501,7 +516,7 @@ int main(int argc, char * argv[])
 	vvmin = MIN(vvmin,vvs[ipart]);
 	vvmax = MAX(vvmax,vvs[ipart]);
-	
+
 	if(ipart%10 == 0)fprintf (pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
 	}
@@ -556,9 +571,9 @@ int main(int argc, char * argv[])
 	#ifdef ONE_SIDE
 	printf("One Side communication active\n");
 	MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
-	MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); 
+	MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
 	MPI_Win_unlock(target_rank,slabwin);
-	//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin); 
+	//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
 	#else
 	MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
 	#endif //ONE_SIDE
@@ -633,7 +648,7 @@ int main(int argc, char * argv[])
 	process_time1 = (finish.tv_sec - begin.tv_sec);
 	process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
 	clock_gettime(CLOCK_MONOTONIC, &begin);
-	
+
 	#ifdef WRITE_DATA
 	// Write results
@@ -712,7 +727,7 @@ int main(int argc, char * argv[])
 	#ifdef USE_FFTW
 	// FFT transform the data (using distributed FFTW)
-	
+
 	if(rank == 0)printf("PERFORMING FFT\n");
 	clock_gettime(CLOCK_MONOTONIC, &begin);
 	start = clock();
@@ -727,13 +742,13 @@ int main(int argc, char * argv[])
 	// and perform the FFT per w plane
 	alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start);
 	fftwgrid = fftw_alloc_complex(alloc_local);
-	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); 
+	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
 	long fftwindex = 0;
 	long fftwindex2D = 0;
 	for (int iw=0; iw Date: Thu, 25 Nov 2021 11:43:54 +0100
Subject: [PATCH 2/4] including testing

---
 Build/Makefile.Magellanus    | 18 +++++++-----------
 Build/Makefile.MagellanusNVC | 26 ++++++++++++++++++++++++++
 phase_correction.cu          |  2 +-
 w-stacking-fftw.c            |  8 ++++----
 4 files changed, 38 insertions(+), 16 deletions(-)
 create mode 100644 Build/Makefile.MagellanusNVC

diff --git a/Build/Makefile.Magellanus b/Build/Makefile.Magellanus
index 36c9169..b71b35d 100644
--- a/Build/Makefile.Magellanus
+++ b/Build/Makefile.Magellanus
@@ 
-7,21 +7,18 @@ MPIC++ = mpiCC GSL_INCL = -I/home/taffoni/sw/include GSL_LIBS = -L/home/taffoni/sw/lib -FFTW_INCL= -I/home/taffoni/sw/include -FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3 +FFTW_INCL= +FFTW_LIB= -lfftw3_mpi -lfftw3 -#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi MPI_LIB = -#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include -MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include +MPI_INCL= HDF5_INCL = HDF5_LIB = -OMP = -mp=multicore,gpu -Mprof -cuda -lcudart -#OMP = -fopenmp +OMP = -fopenmp -lcuda -lcudart NVCC = nvcc NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11 -NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda +NVLIB = -lcudart -lcuda CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) @@ -29,6 +26,5 @@ CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) OPTIMIZE = $(OMP) -O3 # OMP GPU SPECIFIC FLAGS -#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math -#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35 -#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none +OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math +OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35 diff --git a/Build/Makefile.MagellanusNVC b/Build/Makefile.MagellanusNVC new file mode 100644 index 0000000..34c9f2e --- /dev/null +++ b/Build/Makefile.MagellanusNVC @@ -0,0 +1,26 @@ +CC = nvc +CXX = nvc++ + +MPICC = mpicc +MPIC++ = mpiCC + +GSL_INCL = -I/home/taffoni/sw/include +GSL_LIBS = -L/home/taffoni/sw/lib + +FFTW_INCL= -I/home/taffoni/sw/include +FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3 + +MPI_LIB = +MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include +HDF5_INCL = +HDF5_LIB = + +OMP = -mp=multicore,gpu -Mprof -cuda -lcudart +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11 +NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda + + +CFLAGS += -I. 
$(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3
diff --git a/phase_correction.cu b/phase_correction.cu
index 52eb4e0..e1db329 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -108,7 +108,7 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 	double * image_imag_g;
 	double * gridss_g;
 
-	mmm=cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double)); 
+	mmm=cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double));
 	//printf("CUDA ERROR 1 %s\n",cudaGetErrorString(mmm));
 	mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
 	//printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm));
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 3bdf2c1..bdeeed2 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -574,10 +574,10 @@ int main(int argc, char * argv[])
 	MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
 	MPI_Win_unlock(target_rank,slabwin);
 	//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
-	#else
-	MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
-	#endif //ONE_SIDE
-	#endif //USE_MPI
+	#else
+	MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
+	#endif //ONE_SIDE
+	#endif //USE_MPI
 
 	clock_gettime(CLOCK_MONOTONIC, &finishk);
 	endk = clock();
-- 
GitLab


From 6c9080ff3b04c003eb728824cf45172469db05bb Mon Sep 17 00:00:00 2001
From: Giuliano Taffoni
Date: Tue, 30 Nov 2021 17:53:26 +0100
Subject: [PATCH 3/4] aligned with latest version

---
 w-stacking-fftw.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index bdeeed2..c56b3cb 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -141,8 +141,10 @@ int main(int argc, char * argv[])
 	rank = 0;
 	size = 1;
 #endif
+
 	if(rank == 0)printf("Running with %d threads\n",num_threads);
+
 	// set the local size of the image
 	local_grid_size_x = grid_size_x;
 	nsectors = NUM_OF_SECTORS;
@@ -278,6 +280,7 @@ int main(int argc, char * argv[])
 	setup_time1 = (finish.tv_sec - begin.tv_sec);
 	setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
 
+
 	if(rank == 0)printf("GRIDDING DATA\n");
 
 	// Create histograms and linked lists
@@ -335,7 +338,7 @@ int main(int argc, char * argv[])
 	for (int iii=0; iiiindex != -1)
 	{
 	long ilocal = current->index;
@@ -503,7 +507,8 @@ int main(int argc, char * argv[])
 	compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
 	compose_time1 += (finishk.tv_sec - begink.tv_sec);
 	compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
-	#ifndef USE_MPI
+
+	#ifndef USE_MPI
 	double uumin = 1e20;
 	double vvmin = 1e20;
 	double uumax = -1e20;
 	double vvmax = -1e20;
-- 
GitLab


From fd67a5ca7fe8150a292a35298d3ff4bfd3f44128 Mon Sep 17 00:00:00 2001
From: Giuliano Taffoni
Date: Mon, 31 Jan 2022 16:25:54 +0100
Subject: [PATCH 4/4] first commit of this branch

---
 .atom-beautify.w-stacking-fftw.c | 991 +++++++++++++++++++++++++++++++
 phase_correction.cu              |   4 +-
 w-stacking-fftw.c                |  25 +-
 w-stacking.cu                    |   5 +-
 w-stacking.h                     |   2 +
 5 files changed, 1015 insertions(+), 12 deletions(-)
 create mode 100644 .atom-beautify.w-stacking-fftw.c

diff --git a/.atom-beautify.w-stacking-fftw.c b/.atom-beautify.w-stacking-fftw.c
new file mode 100644
index 0000000..a57894a
--- /dev/null
+++ b/.atom-beautify.w-stacking-fftw.c
@@ -0,0 +1,991 @@
[991 added lines omitted: .atom-beautify.w-stacking-fftw.c is a stray Atom-Beautify editor artifact; its body is byte-for-byte identical (blob a57894a) to the w-stacking-fftw.c produced by this patch, whose diff follows.]
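
The ONE_SIDE hunk in PATCH 2 above only re-indents the preprocessor guards; for reference, this is the one-sided reduction they wrap. A minimal self-contained sketch, assuming the RMA window was created once over each rank's grid slab (as w-stacking-fftw.c does) and narrowing the counts to int as the MPI signatures require:

    /* sketch of the ONE_SIDE path: add the local gridss buffer into the
     * slab owned by target_rank through an RMA window; MPI_SUM makes the
     * concurrent per-element updates accumulate atomically */
    #include <mpi.h>

    void accumulate_slab(double *gridss, long size_of_grid,
                         int target_rank, MPI_Win slabwin)
    {
        MPI_Win_lock(MPI_LOCK_SHARED, target_rank, 0, slabwin);
        MPI_Accumulate(gridss, (int)size_of_grid, MPI_DOUBLE,
                       target_rank, 0, (int)size_of_grid, MPI_DOUBLE,
                       MPI_SUM, slabwin);
        MPI_Win_unlock(target_rank, slabwin);
    }

Compared with the MPI_Reduce fallback, the shared lock lets every sector be pushed to its destination rank as soon as it is gridded, without a matching collective call on the receiving side.
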
+#include "w-stacking.h" +#define PI 3.14159265359 +#define NUM_OF_SECTORS -1 +#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) +#define NOVERBOSE +#define NFILES 100 + +// Linked List set-up +struct sectorlist { + long index; + struct sectorlist * next; +}; + +void Push(struct sectorlist** headRef, long data) { + struct sectorlist* newNode = malloc(sizeof(struct sectorlist)); + newNode->index = data; + newNode->next = *headRef; + *headRef = newNode; +} + +// Main Code +int main(int argc, char * argv[]) +{ + int rank; + int size; + + FILE * pFile; + FILE * pFile1; + FILE * pFilereal; + FILE * pFileimg; + char filename[1000]; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/old/data/gauss2_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/newgauss2noconj_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/gauss1_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/gauss4_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_t201806301100_SBH255i-test.binMS/"; + // + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"; + char datapath[900]; + char datapath_multi[NFILES][900]; + + char ufile[30] = "ucoord.bin"; + char vfile[30] = "vcoord.bin"; + char wfile[30] = "wcoord.bin"; + char weightsfile[30] = "weights.bin"; + char visrealfile[30] = "visibilities_real.bin"; + char visimgfile[30] = "visibilities_img.bin"; + char metafile[30] = "meta.txt"; + char outfile[30] = "grid.txt"; + char outfile1[30] = "coords.txt"; + char outfile2[30] = "grid_real.bin"; + char outfile3[30] = "grid_img.bin"; + char fftfile[30] = "fft.txt"; + char fftfile2[30] = "fft_real.bin"; + char fftfile3[30] = "fft_img.bin"; + char logfile[30] = "run.log"; + char extension[30] = ".txt"; + char srank[4]; + char timingfile[30] = "timings.dat"; + + double * uu; + double * vv; + double * ww; + float * weights; + float * visreal; + float * visimg; + + long Nmeasures,Nmeasures0; + long Nvis,Nvis0; + long Nweights,Nweights0; + long freq_per_chan,freq_per_chan0; + long polarisations,polarisations0; + long Ntimes,Ntimes0; + double dt,dt0; + double thours,thours0; + long baselines,baselines0; + double uvmin,uvmin0; + double uvmax,uvmax0; + double wmin,wmin0; + double wmax,wmax0; + double resolution; + + // MESH SIZE + int grid_size_x = 4096; + int grid_size_y = 4096; + int local_grid_size_x;// = 16; + int local_grid_size_y;// = 16; + int xaxis; + int yaxis; + int num_w_planes = 8; + // DAV: the corresponding KernelLen is calculated within the wstack function. 
It can be anyway hardcoded for optimization + int w_support = 7; + int num_threads;// = 4; + double dx = 1.0/(double)grid_size_x; + double dw = 1.0/(double)num_w_planes; + double w_supporth = (double)((w_support-1)/2)*dx; + + clock_t start, end, start0, startk, endk; + double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; + double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; + double writetime, writetime1; + + struct timespec begin, finish, begin0, begink, finishk; + double elapsed; + long nsectors; + /* GT get nymber of threads exit if not given */ + if(argc == 1) { + fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]); + exit(1); + } + + clock_gettime(CLOCK_MONOTONIC, &begin0); + start0 = clock(); + // Set the number of OpenMP threads + num_threads = atoi(argv[1]); + + // Intialize MPI environment +#ifdef USE_MPI + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + if(rank == 0)printf("Running with %d MPI tasks\n",size); + printf("Number of available GPUs %d\n", omp_get_num_devices()); + printf("Number of Current GPUs %d\n", omp_get_default_device()); + int node_rank; + MPI_Comm NUMA_NODE_Comm; + MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &NUMA_NODE_Comm); + MPI_Comm_rank( NUMA_NODE_Comm, &node_rank); + printf( "Task %d has rank %d in its numa node\n", rank, node_rank ); + + #ifdef USE_FFTW + fftw_mpi_init(); + #endif +#else + rank = 0; + size = 1; +#endif + + if(rank == 0)printf("Running with %d threads\n",num_threads); + + + // set the local size of the image + local_grid_size_x = grid_size_x; + nsectors = NUM_OF_SECTORS; + if (nsectors < 0) nsectors = size; + local_grid_size_y = grid_size_y/nsectors; + //nsectors = size; + + // LOCAL grid size + xaxis = local_grid_size_x; + yaxis = local_grid_size_y; + + clock_gettime(CLOCK_MONOTONIC, &begin); + start = clock(); + + // INPUT FILES (only the first ndatasets entries are used) + int ndatasets = 1; + //strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/"); + //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"); + strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); + //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); + //strcpy(datapath_multi[0],"/data/taffoni/LAVORO/IMAGING/data/newgauss2noconj_t201806301100_SBL180.binMS/"); + strcpy(datapath,datapath_multi[0]); + // Read metadata + strcpy(filename,datapath); + strcat(filename,metafile); + // GT CHECK IF FILE EXISTS OR EXIT + if (pFile = fopen (filename,"r")) { + fscanf(pFile,"%ld",&Nmeasures); + fscanf(pFile,"%ld",&Nvis); + fscanf(pFile,"%ld",&freq_per_chan); + fscanf(pFile,"%ld",&polarisations); + fscanf(pFile,"%ld",&Ntimes); + fscanf(pFile,"%lf",&dt); + fscanf(pFile,"%lf",&thours); + fscanf(pFile,"%ld",&baselines); + fscanf(pFile,"%lf",&uvmin); + fscanf(pFile,"%lf",&uvmax); + fscanf(pFile,"%lf",&wmin); + fscanf(pFile,"%lf",&wmax); + fclose(pFile); + } else { + printf("Input file does not exists: %s\n", filename); + #ifdef USE_MPI + MPI_Finalize(); + #endif + exit(256); + } + + + + + // WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ int nsub = 1000; + //int nsub = 10; + printf("Subtracting last %d measurements\n",nsub); + Nmeasures = Nmeasures-nsub; + Nvis = Nmeasures*freq_per_chan*polarisations; + + // calculate the coordinates of the center + double uvshift = uvmin/(uvmax-uvmin); + //printf("UVSHIFT %f %f %f %f %f\n",uvmin, uvmax, wmin, wmax, uvshift); + + if (rank == 0) + { + printf("N. measurements %ld\n",Nmeasures); + printf("N. visibilities %ld\n",Nvis); + } + + // Set temporary local size of points + long nm_pe = (long)(Nmeasures/size); + long remaining = Nmeasures%size; + + long startrow = rank*nm_pe; + if (rank == size-1)nm_pe = nm_pe+remaining; + + long Nmeasures_tot = Nmeasures; + Nmeasures = nm_pe; + long Nvis_tot = Nvis; + Nvis = Nmeasures*freq_per_chan*polarisations; + Nweights = Nmeasures*polarisations; + + #ifdef VERBOSE + printf("N. measurements on %d %ld\n",rank,Nmeasures); + printf("N. visibilities on %d %ld\n",rank,Nvis); + #endif + + // DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used + // all the sizes are rescaled by the number of MPI tasks + // Allocate arrays + uu = (double*) calloc(Nmeasures,sizeof(double)); + vv = (double*) calloc(Nmeasures,sizeof(double)); + ww = (double*) calloc(Nmeasures,sizeof(double)); + weights = (float*) calloc(Nweights,sizeof(float)); + visreal = (float*) calloc(Nvis,sizeof(float)); + visimg = (float*) calloc(Nvis,sizeof(float)); + + if(rank == 0)printf("READING DATA\n"); + // Read data + strcpy(filename,datapath); + strcat(filename,ufile); + //printf("Reading %s\n",filename); + + pFile = fopen (filename,"rb"); + fseek (pFile,startrow*sizeof(double),SEEK_SET); + fread(uu,Nmeasures*sizeof(double),1,pFile); + fclose(pFile); + + strcpy(filename,datapath); + strcat(filename,vfile); + //printf("Reading %s\n",filename); + + pFile = fopen (filename,"rb"); + fseek (pFile,startrow*sizeof(double),SEEK_SET); + fread(vv,Nmeasures*sizeof(double),1,pFile); + fclose(pFile); + + strcpy(filename,datapath); + strcat(filename,wfile); + //printf("Reading %s\n",filename); + + pFile = fopen (filename,"rb"); + fseek (pFile,startrow*sizeof(double),SEEK_SET); + fread(ww,Nmeasures*sizeof(double),1,pFile); + fclose(pFile); + + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + + clock_gettime(CLOCK_MONOTONIC, &finish); + end = clock(); + setup_time = ((double) (end - start)) / CLOCKS_PER_SEC; + setup_time1 = (finish.tv_sec - begin.tv_sec); + setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; + + + if(rank == 0)printf("GRIDDING DATA\n"); + + // Create histograms and linked lists + clock_gettime(CLOCK_MONOTONIC, &begin); + start = clock(); + + //CLAAA + // Initialize linked list + struct sectorlist ** sectorhead; + sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist)); + for (int isec=0; isec<=nsectors; isec++) + { + sectorhead[isec] = malloc(sizeof(struct sectorlist)); + sectorhead[isec]->index = -1; + sectorhead[isec]->next = NULL; + } + + + long * histo_send = (long*) calloc(nsectors+1,sizeof(long)); + int * boundary = (int*) calloc(Nmeasures,sizeof(int)); + double uuh,vvh; + for (long iphi = 0; iphi < Nmeasures; iphi++) + { + boundary[iphi] = -1; + uuh = uu[iphi]; + vvh = vv[iphi]; + int binphi = (int)(vvh*nsectors); + // check if the point influence also neighboring slabs + double updist = (double)((binphi+1)*yaxis)*dx - vvh; + double downdist = vvh - (double)(binphi*yaxis)*dx; + // + histo_send[binphi]++; + Push(§orhead[binphi],iphi); + if(updist < w_supporth && updist 
>= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(§orhead[binphi+1],iphi);}; + if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(§orhead[binphi-1],iphi);}; + } + +#ifdef PIPPO + struct sectorlist * current; + long iiii = 0; + for (int j=0; jindex != -1) + { + printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],current->index); + current = current->next; + iiii++; + } + } +#endif + + #ifdef VERBOSE + for (int iii=0; iiiindex != -1) + { + long ilocal = current->index; + //double vvh = vv[ilocal]; + //int binphi = (int)(vvh*nsectors); + //if (binphi == isector || boundary[ilocal] == isector) + //{ + uus[icount] = uu[ilocal]; + vvs[icount] = vv[ilocal]-isector*shift; + wws[icount] = ww[ilocal]; + for (long ipol=0; ipol1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis); + inu++; + } + icount++; + //} + current = current->next; + } + + clock_gettime(CLOCK_MONOTONIC, &finishk); + endk = clock(); + compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; + compose_time1 += (finishk.tv_sec - begink.tv_sec); + compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; + + #ifndef USE_MPI + double uumin = 1e20; + double vvmin = 1e20; + double uumax = -1e20; + double vvmax = -1e20; + + for (long ipart=0; ipart %f\n",gridss[iii]); + + #ifndef USE_MPI + long stride = isector*2*xaxis*yaxis*num_w_planes; + for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)gridtot[stride+iii] = gridss[iii]; + #endif + + // Write grid in the corresponding remote slab + #ifdef USE_MPI + int target_rank = (int)isector; + //int target_rank = (int)(size-isector-1); + #ifdef ONE_SIDE + printf("One Side communication active\n"); + MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin); + MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); + MPI_Win_unlock(target_rank,slabwin); + //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin); + #else + MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); + #endif //ONE_SIDE + #endif //USE_MPI + + clock_gettime(CLOCK_MONOTONIC, &finishk); + endk = clock(); + reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; + reduce_time1 += (finishk.tv_sec - begink.tv_sec); + reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; + // Go to next sector + for (long inull=0; inull<2*num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0; + + // Deallocate all sector arrays + free(uus); + free(vvs); + free(wws); + free(weightss); + free(visreals); + free(visimgs); + // End of loop over sectors + } + // End of loop over input files + } + + // Finalize MPI communication + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + #endif + + // Swap left and right parts APPARENTLY NOT NECESSARY + /* + for (long kswap=0; kswap 1) + { + for (int iw=0; iw n0) + // and perform the FFT per w plane + alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start); + fftwgrid = fftw_alloc_complex(alloc_local); + plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); + + long fftwindex = 0; + long fftwindex2D = 0; + for (int iw=0; iw 1) + { + for (int iw=0; iw 1) + { + printf("PSetup time: %f sec\n",setup_time1); + printf("PProcess time: %f sec\n",process_time1); + printf("PKernel time = %f, 
diff --git a/phase_correction.cu b/phase_correction.cu
index e1db329..0135763 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -87,7 +87,7 @@ __global__ void phase_g(int xaxis,
 #endif
 
 void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot,
-		double resolution, double wmin, double wmax, int num_threads)
+		double resolution, double wmin, double wmax, int num_threads, int rank)
 {
 	double dw = (wmax-wmin)/(double)num_w_planes;
 	double wterm = wmin+0.5*dw;
@@ -151,7 +151,7 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 #ifdef ACCOMP
 	#pragma omp target teams distribute parallel for collapse(3) \
 	map(tofrom: image_real[0:xaxis*yaxis], image_imag[0:xaxis*yaxis]) \
-	map (to: gridss[0:2*num_w_planes*xaxis*yaxis]) private(wterm) 
+	map (to: gridss[0:2*num_w_planes*xaxis*yaxis]) private(wterm)
 #else
 	#pragma omp parallel for collapse(3) private(wterm)
 #endif
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index c56b3cb..a57894a 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -95,10 +95,10 @@ int main(int argc, char * argv[])
 	double resolution;
 
 	// MESH SIZE
-	int grid_size_x = 2048;
-	int grid_size_y = 2048;
-	int local_grid_size_x;// = 8;
-	int local_grid_size_y;// = 8;
+	int grid_size_x = 4096;
+	int grid_size_y = 4096;
+	int local_grid_size_x;// = 16;
+	int local_grid_size_y;// = 16;
 	int xaxis;
 	int yaxis;
 	int num_w_planes = 8;
@@ -134,6 +134,14 @@ int main(int argc, char * argv[])
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	if(rank == 0)printf("Running with %d MPI tasks\n",size);
+	printf("Number of available GPUs %d\n", omp_get_num_devices());
+	printf("Number of Current GPUs %d\n", omp_get_default_device());
+	int node_rank;
+	MPI_Comm NUMA_NODE_Comm;
+	MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &NUMA_NODE_Comm);
+	MPI_Comm_rank( NUMA_NODE_Comm, &node_rank);
+	printf( "Task %d has rank %d in its numa node\n", rank, node_rank );
+
 	#ifdef USE_FFTW
 	fftw_mpi_init();
 	#endif
@@ -163,9 +171,9 @@ int main(int argc, char * argv[])
 	int ndatasets = 1;
 	//strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/");
 	//strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/");
-	//strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
+	strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
//strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); - strcpy(datapath_multi[0],"/data/taffoni/LAVORO/IMAGING/data/newgauss2noconj_t201806301100_SBL180.binMS/"); + //strcpy(datapath_multi[0],"/data/taffoni/LAVORO/IMAGING/data/newgauss2noconj_t201806301100_SBL180.binMS/"); strcpy(datapath,datapath_multi[0]); // Read metadata strcpy(filename,datapath); @@ -550,7 +558,8 @@ int main(int argc, char * argv[]) xaxis, yaxis, gridss, - num_threads); + num_threads, + node_rank); clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; @@ -881,7 +890,7 @@ int main(int argc, char * argv[]) double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); - phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y,resolution,wmin,wmax,num_threads); + phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y,resolution,wmin,wmax,num_threads,node_rank); end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); diff --git a/w-stacking.cu b/w-stacking.cu index bc696b4..bc70a15 100644 --- a/w-stacking.cu +++ b/w-stacking.cu @@ -136,7 +136,8 @@ void wstack( int grid_size_x, int grid_size_y, double* grid, - int num_threads) + int num_threads, + int rank) { long i; long index; @@ -232,7 +233,7 @@ map(to:num_points, KernelLen, std, std22, norm, num_w_planes, \ uu[0:num_points], vv[0:num_points], ww[0:num_points], \ vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim], \ grid_size_x, grid_size_y, freq_per_chan, polarizations, dx,dw, w_support, num_threads) \ - map(tofrom: grid[0:gpu_grid_dim]) + map(tofrom: grid[0:gpu_grid_dim]) device(rank) #else #pragma omp parallel for private(visindex) #endif diff --git a/w-stacking.h b/w-stacking.h index 3d3a4c6..862eff6 100644 --- a/w-stacking.h +++ b/w-stacking.h @@ -25,6 +25,7 @@ void wstack( int, int, double*, + int, int); #ifdef __CUDACC__ @@ -47,5 +48,6 @@ void phase_correction( double, double, double, + int, int); #endif -- GitLab
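
PATCH 4 threads a rank argument down to wstack and phase_correction so the ACCOMP target regions can carry a device() clause, and it derives that rank from an MPI_COMM_TYPE_SHARED split of MPI_COMM_WORLD. A self-contained sketch of the intended pattern, one GPU per MPI task selected by intra-node rank, is shown below; the modulo binding policy is an assumption for illustration, not code taken from the patch:

    /* sketch: bind each MPI task to an OpenMP target device chosen by
     * the task's rank inside its node, mirroring the NUMA_NODE_Comm
     * split and the device(rank) clause added by this patch series */
    #include <mpi.h>
    #include <omp.h>
    #include <stdio.h>

    int main(int argc, char *argv[])
    {
        int rank, node_rank, ndev, dev;
        MPI_Comm node_comm;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank,
                            MPI_INFO_NULL, &node_comm);
        MPI_Comm_rank(node_comm, &node_rank);

        /* pick a device for this task; fall back to the default device
         * when no accelerator is visible */
        ndev = omp_get_num_devices();
        dev = (ndev > 0) ? node_rank % ndev : omp_get_default_device();

        double sum = 0.0;
        #pragma omp target teams distribute parallel for reduction(+:sum) device(dev)
        for (long i = 0; i < 1000000; i++)
            sum += (double)i;

        printf("task %d (node rank %d) used device %d, sum = %f\n",
               rank, node_rank, dev, sum);
        MPI_Finalize();
        return 0;
    }
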
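The FFT stage that PATCH 1 wires into the build uses FFTW's MPI interface: fftw_mpi_local_size_2d gives each rank a slab of rows, and one backward 2-D transform is executed per w-plane. A minimal sketch of that call sequence, with illustrative grid sizes matching the 4096x4096 mesh set above:

    /* sketch of the distributed 2-D FFT used per w-plane, following the
     * fftw_mpi calls visible in the gridder; fill-in of the grid data
     * is elided */
    #include <fftw3-mpi.h>
    #include <mpi.h>

    int main(int argc, char *argv[])
    {
        const ptrdiff_t NY = 4096, NX = 4096;
        ptrdiff_t alloc_local, local_n0, local_0_start;
        fftw_complex *fftwgrid;
        fftw_plan plan;

        MPI_Init(&argc, &argv);
        fftw_mpi_init();

        /* each rank owns local_n0 rows starting at row local_0_start */
        alloc_local = fftw_mpi_local_size_2d(NY, NX, MPI_COMM_WORLD,
                                             &local_n0, &local_0_start);
        fftwgrid = fftw_alloc_complex(alloc_local);
        plan = fftw_mpi_plan_dft_2d(NY, NX, fftwgrid, fftwgrid,
                                    MPI_COMM_WORLD, FFTW_BACKWARD,
                                    FFTW_ESTIMATE);

        /* ... copy one w-plane of gridded visibilities into fftwgrid ... */
        fftw_execute(plan);

        fftw_destroy_plan(plan);
        fftw_free(fftwgrid);
        MPI_Finalize();
        return 0;
    }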