diff --git a/.gitignore b/.gitignore index 797747aa16a2fb9000f5268a91f10e289e07f077..9cce9de16ae69e4796589cd888488c2cc1e6d877 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.DS_Store +sync.sh *.o phase_correction.c w-stacking.c diff --git a/Build/Makefile.Macosx b/Build/Makefile.Macosx new file mode 100644 index 0000000000000000000000000000000000000000..cdb0e1306db96f407e23419fd4c988928ebbe68e --- /dev/null +++ b/Build/Makefile.Macosx @@ -0,0 +1,26 @@ +CC = icc +CXX = g++-11 + +MPICC = mpicc +MPIC++ = mpiCC + +FFTW_INCL= -I/usr/local/include +FFTW_LIB= -L/usr/local/lib/ + +GSL_INCL = +GSL_LIBS = + +MPI_LIB = +MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include +HDF5_INCL = +HDF5_LIB = + +OMP= -fopenmp + +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 +NVLIB = -L/home/taffoni -lcudart -lcuda + +CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) + +OPTIMIZE = $(OMP) -O3 diff --git a/Build/Makefile.Magellanus b/Build/Makefile.Magellanus new file mode 100644 index 0000000000000000000000000000000000000000..c2858994653726db34f7dcd7ae0451ebc7553bf7 --- /dev/null +++ b/Build/Makefile.Magellanus @@ -0,0 +1,34 @@ +CC = nvc +CXX = nvc++ + +MPICC = mpicc +MPIC++ = mpiCC + +GSL_INCL = -I/home/taffoni/sw/include +GSL_LIBS = -L/home/taffoni/sw/lib + +FFTW_INCL= -I/home/taffoni/sw/include +FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3 + +#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi +MPI_LIB = +#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include +MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include +HDF5_INCL = +HDF5_LIB = + +OMP = -mp=multicore,gpu -Mprof -cuda +#OMP = -fopenmp +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11 +NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda + + +CFLAGS += -I. 
$(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) + +OPTIMIZE = $(OMP) -O3 + +# OMP GPU SPECIFIC FLAGS +#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math +#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35 +#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none diff --git a/Build/Makefile.Marconi b/Build/Makefile.Marconi new file mode 100644 index 0000000000000000000000000000000000000000..4fec93afd7851aad0ebd3e85f449c6c234a7376d --- /dev/null +++ b/Build/Makefile.Marconi @@ -0,0 +1,20 @@ +CC = gcc +CXX = g++ + +MPICC = mpicc +MPIC++ = mpiCC + + +FFTW_INCL= -I/home/taffoni/sw/include +FFTW_LIB= -L/home/taffoni/sw/lib + + +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 +NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda + +OMP= -fopenmp + +CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) + +OPTIMIZE = $(OMP) -O3 -mtune=native diff --git a/Build/Makefile.systype b/Build/Makefile.systype new file mode 100644 index 0000000000000000000000000000000000000000..04922322304924d1c8a5397ede089516d9a1d256 --- /dev/null +++ b/Build/Makefile.systype @@ -0,0 +1,24 @@ +CC = gcc-10 +CXX = g++-10 + +MPICC = mpicc +MPIC++ = mpiCC + +OPTIMIZE = + + +GSL_INCL = +GSL_LIB = + +FFTW_INCL= +FFTW_LIB= + +NVCC = +NVFLAGS = +NVLIB = + +CFLAGS += + +MPICHLIB = +HDF5INCL = +HDF5LIB = diff --git a/Makefile b/Makefile index 99d13891512a600974a7a33c5eb43683eaf22e3f..a5635850c6aa354a28b5ceb1bc4fd90836102dfa 100644 --- a/Makefile +++ b/Makefile @@ -1,34 +1,40 @@ # comment/uncomment the various options depending hoe you want to build the program +# Set default values for compiler options if no systype options are given or found +CC = mpiCC +CXX = mpiCC +OPTIMIZE = -std=c++11 -Wall -g -O2 +MPICHLIB = -lmpich +SWITCHES = + +ifdef SYSTYPE +SYSTYPE := $(SYSTYPE) +include Build/Makefile.$(SYSTYPE) +else +include Build/Makefile.systype +endif + + +LIBS = 
$(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda + # create MPI code OPT += -DUSE_MPI +OPT += -DACCOMP # use FFTW (it can be switched on ONLY if MPI is active) -OPT += -DUSE_FFTW +ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) + OPT += -DUSE_FFTW + LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm -lcudart -lcuda +endif + +OPT += -DNVIDIA # perform one-side communication (suggested) instead of reduce (only if MPI is active) OPT += -DONE_SIDE # write the full 3D cube of gridded visibilities and its FFT transform -#OPT += -DWRITE_DATA +OPT += -DWRITE_DATA # write the final image OPT += -DWRITE_IMAGE # perform w-stacking phase correction -#OPT += -DPHASE_ON - -CC = gcc -CXX = g++ -ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) - CC = mpicc - CXX = mpiCC -endif - -OMP = -fopenmp -#OMP = +# OPT += -DPHASE_ON -CFLAGS += -O3 -mcpu=native -CFLAGS += -I. -LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm - -NVCC = nvcc -NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 -NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o @@ -39,8 +45,13 @@ w-stacking.c: w-stacking.cu phase_correction.c: phase_correction.cu cp phase_correction.cu phase_correction.c +ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) %.o: %.c $(DEPS) - $(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT) + $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) +else +%.o: %.c $(DEPS) + $(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) +endif serial: $(COBJ) $(CC) $(OMP) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm @@ -50,16 +61,15 @@ serial_cuda: $(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c $(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm -mpi: $(COBJ) - $(CC) $(OMP) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS) +mpi: $(COBJ) + $(MPICC) $(OMP) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS) mpi_cuda: $(NVCC) $(NVFLAGS) $(OPT) -c w-stacking.cu 
phase_correction.cu $(NVLIB) - $(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c - $(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(LIBS) -lm + $(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c + $(MPIC++) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(LIBS) -lm clean: rm *.o rm w-stacking.c rm phase_correction.c - diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c index 0eda87e9b2dcef70714081129aff0fbeb468fae0..50983d12694a069af5fc6ca800a7fb840d33e37e 100644 --- a/w-stacking-fftw.c +++ b/w-stacking-fftw.c @@ -27,16 +27,16 @@ void Push(struct sectorlist** headRef, long data) { struct sectorlist* newNode = malloc(sizeof(struct sectorlist)); newNode->index = data; newNode->next = *headRef; - *headRef = newNode; + *headRef = newNode; } // Main Code -int main(int argc, char * argv[]) +int main(int argc, char * argv[]) { int rank; int size; - FILE * pFile; + FILE * pFile; FILE * pFile1; FILE * pFilereal; FILE * pFileimg; @@ -50,7 +50,7 @@ int main(int argc, char * argv[]) // //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/"; //char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"; - char datapath[900]; + char datapath[900]; char datapath_multi[NFILES][900]; char ufile[30] = "ucoord.bin"; @@ -117,7 +117,11 @@ int main(int argc, char * argv[]) struct timespec begin, finish, begin0, begink, finishk; double elapsed; long nsectors; - + /* GT get nymber of threads exit if not given */ + if(argc == 1) { + fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]); + exit(1); + } clock_gettime(CLOCK_MONOTONIC, &begin0); start0 = clock(); // Set the number of OpenMP threads @@ -144,7 +148,7 @@ int main(int argc, char * argv[]) if (nsectors < 0) nsectors = size; local_grid_size_y = grid_size_y/nsectors; //nsectors = size; - + // LOCAL grid size xaxis = 
local_grid_size_x; yaxis = local_grid_size_y; @@ -156,14 +160,15 @@ int main(int argc, char * argv[]) int ndatasets = 1; //strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/"); //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"); - strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); + strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); strcpy(datapath,datapath_multi[0]); // Read metadata strcpy(filename,datapath); strcat(filename,metafile); - pFile = fopen (filename,"r"); + // GT: open the metadata file; if it cannot be opened, report its path and exit + if ((pFile = fopen (filename,"r")) != NULL) { fscanf(pFile,"%ld",&Nmeasures); fscanf(pFile,"%ld",&Nvis); fscanf(pFile,"%ld",&freq_per_chan); @@ -176,7 +181,14 @@ int main(int argc, char * argv[]) fscanf(pFile,"%lf",&uvmax); fscanf(pFile,"%lf",&wmin); fscanf(pFile,"%lf",&wmax); - fclose(pFile); + fclose(pFile); + } else { + printf("Input file does not exists: %s\n", filename); + #ifdef USE_MPI + MPI_Finalize(); + #endif + exit(1); /* was 256: only the low 8 bits of the status reach the parent, so 256 read as success */ + } // WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
@@ -199,7 +211,7 @@ int main(int argc, char * argv[]) // Set temporary local size of points long nm_pe = (long)(Nmeasures/size); long remaining = Nmeasures%size; - + long startrow = rank*nm_pe; if (rank == size-1)nm_pe = nm_pe+remaining; @@ -268,7 +280,7 @@ int main(int argc, char * argv[]) // Create histograms and linked lists clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); - + //CLAAA // Initialize linked list struct sectorlist ** sectorhead; @@ -438,7 +450,7 @@ int main(int argc, char * argv[]) // define local destination sector //isector = (isector_count+rank)%size; isector = isector_count; - // allocate sector arrays + // allocate sector arrays long Nsec = histo_send[isector]; uus = (double*) malloc(Nsec*sizeof(double)); vvs = (double*) malloc(Nsec*sizeof(double)); @@ -501,7 +513,7 @@ int main(int argc, char * argv[]) vvmin = MIN(vvmin,vvs[ipart]); vvmax = MAX(vvmax,vvs[ipart]); - + if(ipart%10 == 0)fprintf (pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]); } @@ -556,9 +568,9 @@ int main(int argc, char * argv[]) #ifdef ONE_SIDE printf("One Side communication active\n"); MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin); - MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); + MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); MPI_Win_unlock(target_rank,slabwin); - //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin); + //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin); #else MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); #endif //ONE_SIDE @@ -633,7 +645,7 @@ int main(int argc, char * argv[]) process_time1 = (finish.tv_sec - begin.tv_sec); process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); - + #ifdef WRITE_DATA // Write results @@ -712,7 +724,7 @@ int main(int 
argc, char * argv[]) #ifdef USE_FFTW // FFT transform the data (using distributed FFTW) - + if(rank == 0)printf("PERFORMING FFT\n"); clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); @@ -727,13 +739,13 @@ int main(int argc, char * argv[]) // and perform the FFT per w plane alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start); fftwgrid = fftw_alloc_complex(alloc_local); - plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); + plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); long fftwindex = 0; long fftwindex2D = 0; for (int iw=0; iw