Loading Build/Makefile.Magellanus +16 −14 Original line number Diff line number Diff line Loading @@ -4,29 +4,31 @@ CXX = g++-10 MPICC = mpicc MPIC++ = mpiCC OPTIMIZE = -O3 #-ffast-math -fopt-info-all-omp -fcf-protection=none -fno-stack-protector -foffload=nvptx-none GSL_INCL = -I/home/taffoni/sw/include #-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include GSL_LIBS = -L/home/taffoni/sw/lib #-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi GSL_INCL = -I/home/taffoni/sw/include GSL_LIBS = -L/home/taffoni/sw/lib FFTW_INCL= -I/home/taffoni/sw/include FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3 #-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi MPI_LIB = MPI_INCL= #-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include HDF5_INCL = HDF5_LIB = OMP = -fopenmp OMP = -mp=multicore,gpu -Mprof -cuda #OMP = -fopenmp NVCC = nvcc NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11 NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda CFLAGS += $(OPTIMIZE) CFLAGS += -I. CFLAGS += -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include CFLAGS += $(FFTW_INCL) $(GSL_INCL) CFLAGS += $(FFTW_LIB) -lm CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) OPTIMIZE = $(OMP) -O3 # OMP GPU SPECIFIC FLAGS #OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math #OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35 #-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none Build/Makefile.Marconi +5 −5 Original line number Diff line number Diff line Loading @@ -5,16 +5,16 @@ MPICC = mpicc MPIC++ = mpiCC CFLAGS += -O3 -mcpu=native CFLAGS += -I. FFTW_INCL= -I/home/taffoni/sw/include FFTW_LIB= -L/home/taffoni/sw/lib LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm NVCC = nvcc NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda OMP= -fopenmp CFLAGS += -O3 -mtune=native CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) OPTIMIZE = $(OMP) -O3 -mtune=native Makefile +23 −13 Original line number Diff line number Diff line Loading @@ -14,21 +14,22 @@ include Build/Makefile.systype endif LIBS = $(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda # create MPI code OPT += -DUSE_MPI #OPT += -DACCOMP # use FFTW (it can be switched on ONLY if MPI is active) ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) OPT += -DUSE_FFTW LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm endif #OPT += -DNVIDIA # perform one-side communication (suggested) instead of reduce (only if MPI is active) OPT += -DONE_SIDE # write the full 3D cube of gridded visibilities and its FFT transform #OPT += -DWRITE_DATA OPT += -DWRITE_DATA # write the final image OPT += -DWRITE_IMAGE # perform w-stacking phase correction Loading @@ -46,27 +47,36 @@ phase_correction.c: phase_correction.cu ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) %.o: %.c $(DEPS) $(MPICC) -c -o $@ $< $(CFLAGS) $(OPT) $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) else %.o: %.c $(DEPS) $(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT) $(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) endif serial: $(COBJ) $(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm $(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS) serial_omp: phase_correction.c $(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c $(CFLAGS) $(LIBS) simple_mpi: phase_correction.c $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS) mpi_omp: phase_correction.c $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS) serial_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) $(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c $(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm $(CC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS) $(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm mpi: $(COBJ) $(MPICC) -o w-stackingCfftw $^ $(CFLAGS) $(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS) mpi_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) $(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c $(MPIC++) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS) $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) clean: rm *.o Loading phase_correction.cu +16 −8 Original line number Diff line number Diff line Loading @@ -18,7 +18,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in for (int iw=0; iw<num_w_planes; iw++) { double wterm = (double)iw/dnum_w_planes; #ifdef ACCOMP #pragma omp target teams distribute parallel for \ map(tofrom: image_real[0:xaxis*yaxis], image_imag[0:xaxis*yaxis]) \ map (to: gridss[0:2*num_w_planes*xaxis*yaxis]) #endif for (int iv=0; iv<yaxis; iv++) for (int iu=0; iu<xaxis; iu++) { Loading Loading @@ -53,10 +57,14 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in s = pimag; //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); #pragma omp atomic image_real[img_index] += p*r-q*s; #pragma omp atomic image_imag[img_index] += p*s+q*r; #else #pragma omp atomic image_real[img_index] += gridss[index]; #pragma omp atomic image_imag[img_index] += gridss[index+1]; #endif Loading w-stacking-fftw.c +212 −163 Original line number Diff line number Diff line Loading @@ -7,9 +7,15 @@ #include <fftw3-mpi.h> #endif #endif #include <omp.h> #include <math.h> #include <time.h> #include <unistd.h> #ifdef ACCOMP #include "w-stacking_omp.h" #else #include "w-stacking.h" #endif #define PI 3.14159265359 #define NUM_OF_SECTORS -1 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) Loading Loading @@ -100,6 +106,8 @@ int main(int argc, char * argv[]) int xaxis; int yaxis; int num_w_planes = 1; // DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization int w_support = 7; int num_threads;// = 4; Loading @@ -115,11 +123,23 @@ int main(int argc, char * argv[]) struct timespec begin, finish, begin0, begink, finishk; double elapsed; long nsectors; /* GT get nymber of threads exit if not given */ if(argc == 1) { fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]); exit(1); } // Set the number of OpenMP threads num_threads = atoi(argv[1]); if ( num_threads == 0 ) { fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]); fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]); exit(1); } clock_gettime(CLOCK_MONOTONIC, &begin0); start0 = clock(); // Set the number of OpenMP threads num_threads = atoi(argv[1]); // Intialize MPI environment #ifdef USE_MPI Loading @@ -134,8 +154,22 @@ int main(int argc, char * argv[]) rank = 0; size = 1; #endif if(rank == 0)printf("Running with %d threads\n",num_threads); #ifdef ACCOMP if(rank == 0){ if (0 == omp_get_num_devices()) { printf("No accelerator found ... exit\n"); exit(255); } printf("Number of available GPUs %d\n", omp_get_num_devices()); #ifdef NVIDIA prtAccelInfo(); #endif } #endif // set the local size of the image local_grid_size_x = grid_size_x; nsectors = NUM_OF_SECTORS; Loading Loading @@ -211,6 +245,7 @@ int main(int argc, char * argv[]) printf("N. visibilities on %d %ld\n",rank,Nvis); #endif // DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used // all the sizes are rescaled by the number of MPI tasks // Allocate arrays Loading Loading @@ -260,13 +295,13 @@ int main(int argc, char * argv[]) setup_time1 = (finish.tv_sec - begin.tv_sec); setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; if(rank == 0)printf("GRIDDING DATA\n"); // Create histograms and linked lists clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); //CLAAA // Initialize linked list struct sectorlist ** sectorhead; sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist)); Loading Loading @@ -335,20 +370,18 @@ int main(int argc, char * argv[]) #ifndef USE_MPI double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double)); #endif double shift = (double)(dx*yaxis); // Open the MPI Memory Window for the slab #ifdef USE_MPI MPI_Win slabwin; MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); MPI_Win_fence(0,slabwin); #endif #ifndef USE_MPI pFile1 = fopen (outfile1,"w"); #endif // loop over files // kernel_time = 0.0; Loading @@ -357,9 +390,9 @@ int main(int argc, char * argv[]) reduce_time1 = 0.0; compose_time = 0.0; compose_time1 = 0.0; for (int ifiles=0; ifiles<ndatasets; ifiles++) { strcpy(filename,datapath_multi[ifiles]); printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets); strcat(filename,weightsfile); Loading @@ -368,20 +401,21 @@ int main(int argc, char * argv[]) fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET); fread(weights,(Nweights)*sizeof(float),1,pFile); fclose(pFile); strcpy(filename,datapath); strcat(filename,visrealfile); //printf("Reading %s\n",filename); #ifdef VERBOSE printf("Reading %s\n",filename); #endif pFile = fopen (filename,"rb"); fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET); fread(visreal,Nvis*sizeof(float),1,pFile); fclose(pFile); strcpy(filename,datapath); strcat(filename,visimgfile); //printf("Reading %s\n",filename); #ifdef VERBOSE printf("Reading %s\n",filename); #endif pFile = fopen (filename,"rb"); fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET); fread(visimg,Nvis*sizeof(float),1,pFile); Loading @@ -390,7 +424,6 @@ int main(int argc, char * argv[]) #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif // Declare temporary arrays for the masking double * uus; double * vvs; Loading @@ -398,8 +431,8 @@ int main(int argc, char * argv[]) float * visreals; float * visimgs; float * weightss; long isector; for (long isector_count=0; isector_count<nsectors; isector_count++) { clock_gettime(CLOCK_MONOTONIC, &begink); Loading @@ -425,13 +458,15 @@ int main(int argc, char * argv[]) //CLAAAA struct sectorlist * current; current = sectorhead[isector]; while (current->index != -1) { long ilocal = current->index; //double vvh = vv[ilocal]; //int binphi = (int)(vvh*nsectors); //if (binphi == isector || boundary[ilocal] == isector) //{ //if (binphi == isector || boundary[ilocal] == isector) { uus[icount] = uu[ilocal]; vvs[icount] = vv[ilocal]-isector*shift; wws[icount] = ww[ilocal]; Loading @@ -457,6 +492,7 @@ int main(int argc, char * argv[]) compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; compose_time1 += (finishk.tv_sec - begink.tv_sec); compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; #ifndef USE_MPI double uumin = 1e20; double vvmin = 1e20; Loading @@ -483,6 +519,7 @@ int main(int argc, char * argv[]) #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); wstack(num_w_planes, Nsec, freq_per_chan, Loading @@ -500,6 +537,18 @@ int main(int argc, char * argv[]) yaxis, gridss, num_threads); /* int z =0 ; #pragma omp target map(to:test_i_gpu) map(from:z) { int x; // only accessible from accelerator x = 2; z = x + test_i_gpu; }*/ clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; Loading Loading
Build/Makefile.Magellanus +16 −14 Original line number Diff line number Diff line Loading @@ -4,29 +4,31 @@ CXX = g++-10 MPICC = mpicc MPIC++ = mpiCC OPTIMIZE = -O3 #-ffast-math -fopt-info-all-omp -fcf-protection=none -fno-stack-protector -foffload=nvptx-none GSL_INCL = -I/home/taffoni/sw/include #-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include GSL_LIBS = -L/home/taffoni/sw/lib #-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi GSL_INCL = -I/home/taffoni/sw/include GSL_LIBS = -L/home/taffoni/sw/lib FFTW_INCL= -I/home/taffoni/sw/include FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3 #-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi MPI_LIB = MPI_INCL= #-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include HDF5_INCL = HDF5_LIB = OMP = -fopenmp OMP = -mp=multicore,gpu -Mprof -cuda #OMP = -fopenmp NVCC = nvcc NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11 NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda CFLAGS += $(OPTIMIZE) CFLAGS += -I. CFLAGS += -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include CFLAGS += $(FFTW_INCL) $(GSL_INCL) CFLAGS += $(FFTW_LIB) -lm CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) OPTIMIZE = $(OMP) -O3 # OMP GPU SPECIFIC FLAGS #OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math #OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35 #-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
Build/Makefile.Marconi +5 −5 Original line number Diff line number Diff line Loading @@ -5,16 +5,16 @@ MPICC = mpicc MPIC++ = mpiCC CFLAGS += -O3 -mcpu=native CFLAGS += -I. FFTW_INCL= -I/home/taffoni/sw/include FFTW_LIB= -L/home/taffoni/sw/lib LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm NVCC = nvcc NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda OMP= -fopenmp CFLAGS += -O3 -mtune=native CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) OPTIMIZE = $(OMP) -O3 -mtune=native
Makefile +23 −13 Original line number Diff line number Diff line Loading @@ -14,21 +14,22 @@ include Build/Makefile.systype endif LIBS = $(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda # create MPI code OPT += -DUSE_MPI #OPT += -DACCOMP # use FFTW (it can be switched on ONLY if MPI is active) ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) OPT += -DUSE_FFTW LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm endif #OPT += -DNVIDIA # perform one-side communication (suggested) instead of reduce (only if MPI is active) OPT += -DONE_SIDE # write the full 3D cube of gridded visibilities and its FFT transform #OPT += -DWRITE_DATA OPT += -DWRITE_DATA # write the final image OPT += -DWRITE_IMAGE # perform w-stacking phase correction Loading @@ -46,27 +47,36 @@ phase_correction.c: phase_correction.cu ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) %.o: %.c $(DEPS) $(MPICC) -c -o $@ $< $(CFLAGS) $(OPT) $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) else %.o: %.c $(DEPS) $(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT) $(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) endif serial: $(COBJ) $(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm $(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS) serial_omp: phase_correction.c $(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c $(CFLAGS) $(LIBS) simple_mpi: phase_correction.c $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS) mpi_omp: phase_correction.c $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS) serial_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) $(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c $(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm $(CC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS) $(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm mpi: $(COBJ) $(MPICC) -o w-stackingCfftw $^ $(CFLAGS) $(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS) mpi_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) $(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c $(MPIC++) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS) $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) clean: rm *.o Loading
phase_correction.cu +16 −8 Original line number Diff line number Diff line Loading @@ -18,7 +18,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in for (int iw=0; iw<num_w_planes; iw++) { double wterm = (double)iw/dnum_w_planes; #ifdef ACCOMP #pragma omp target teams distribute parallel for \ map(tofrom: image_real[0:xaxis*yaxis], image_imag[0:xaxis*yaxis]) \ map (to: gridss[0:2*num_w_planes*xaxis*yaxis]) #endif for (int iv=0; iv<yaxis; iv++) for (int iu=0; iu<xaxis; iu++) { Loading Loading @@ -53,10 +57,14 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in s = pimag; //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); #pragma omp atomic image_real[img_index] += p*r-q*s; #pragma omp atomic image_imag[img_index] += p*s+q*r; #else #pragma omp atomic image_real[img_index] += gridss[index]; #pragma omp atomic image_imag[img_index] += gridss[index+1]; #endif Loading
w-stacking-fftw.c +212 −163 Original line number Diff line number Diff line Loading @@ -7,9 +7,15 @@ #include <fftw3-mpi.h> #endif #endif #include <omp.h> #include <math.h> #include <time.h> #include <unistd.h> #ifdef ACCOMP #include "w-stacking_omp.h" #else #include "w-stacking.h" #endif #define PI 3.14159265359 #define NUM_OF_SECTORS -1 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) Loading Loading @@ -100,6 +106,8 @@ int main(int argc, char * argv[]) int xaxis; int yaxis; int num_w_planes = 1; // DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization int w_support = 7; int num_threads;// = 4; Loading @@ -115,11 +123,23 @@ int main(int argc, char * argv[]) struct timespec begin, finish, begin0, begink, finishk; double elapsed; long nsectors; /* GT get nymber of threads exit if not given */ if(argc == 1) { fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]); exit(1); } // Set the number of OpenMP threads num_threads = atoi(argv[1]); if ( num_threads == 0 ) { fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]); fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]); exit(1); } clock_gettime(CLOCK_MONOTONIC, &begin0); start0 = clock(); // Set the number of OpenMP threads num_threads = atoi(argv[1]); // Intialize MPI environment #ifdef USE_MPI Loading @@ -134,8 +154,22 @@ int main(int argc, char * argv[]) rank = 0; size = 1; #endif if(rank == 0)printf("Running with %d threads\n",num_threads); #ifdef ACCOMP if(rank == 0){ if (0 == omp_get_num_devices()) { printf("No accelerator found ... exit\n"); exit(255); } printf("Number of available GPUs %d\n", omp_get_num_devices()); #ifdef NVIDIA prtAccelInfo(); #endif } #endif // set the local size of the image local_grid_size_x = grid_size_x; nsectors = NUM_OF_SECTORS; Loading Loading @@ -211,6 +245,7 @@ int main(int argc, char * argv[]) printf("N. visibilities on %d %ld\n",rank,Nvis); #endif // DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used // all the sizes are rescaled by the number of MPI tasks // Allocate arrays Loading Loading @@ -260,13 +295,13 @@ int main(int argc, char * argv[]) setup_time1 = (finish.tv_sec - begin.tv_sec); setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; if(rank == 0)printf("GRIDDING DATA\n"); // Create histograms and linked lists clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); //CLAAA // Initialize linked list struct sectorlist ** sectorhead; sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist)); Loading Loading @@ -335,20 +370,18 @@ int main(int argc, char * argv[]) #ifndef USE_MPI double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double)); #endif double shift = (double)(dx*yaxis); // Open the MPI Memory Window for the slab #ifdef USE_MPI MPI_Win slabwin; MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); MPI_Win_fence(0,slabwin); #endif #ifndef USE_MPI pFile1 = fopen (outfile1,"w"); #endif // loop over files // kernel_time = 0.0; Loading @@ -357,9 +390,9 @@ int main(int argc, char * argv[]) reduce_time1 = 0.0; compose_time = 0.0; compose_time1 = 0.0; for (int ifiles=0; ifiles<ndatasets; ifiles++) { strcpy(filename,datapath_multi[ifiles]); printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets); strcat(filename,weightsfile); Loading @@ -368,20 +401,21 @@ int main(int argc, char * argv[]) fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET); fread(weights,(Nweights)*sizeof(float),1,pFile); fclose(pFile); strcpy(filename,datapath); strcat(filename,visrealfile); //printf("Reading %s\n",filename); #ifdef VERBOSE printf("Reading %s\n",filename); #endif pFile = fopen (filename,"rb"); fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET); fread(visreal,Nvis*sizeof(float),1,pFile); fclose(pFile); strcpy(filename,datapath); strcat(filename,visimgfile); //printf("Reading %s\n",filename); #ifdef VERBOSE printf("Reading %s\n",filename); #endif pFile = fopen (filename,"rb"); fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET); fread(visimg,Nvis*sizeof(float),1,pFile); Loading @@ -390,7 +424,6 @@ int main(int argc, char * argv[]) #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif // Declare temporary arrays for the masking double * uus; double * vvs; Loading @@ -398,8 +431,8 @@ int main(int argc, char * argv[]) float * visreals; float * visimgs; float * weightss; long isector; for (long isector_count=0; isector_count<nsectors; isector_count++) { clock_gettime(CLOCK_MONOTONIC, &begink); Loading @@ -425,13 +458,15 @@ int main(int argc, char * argv[]) //CLAAAA struct sectorlist * current; current = sectorhead[isector]; while (current->index != -1) { long ilocal = current->index; //double vvh = vv[ilocal]; //int binphi = (int)(vvh*nsectors); //if (binphi == isector || boundary[ilocal] == isector) //{ //if (binphi == isector || boundary[ilocal] == isector) { uus[icount] = uu[ilocal]; vvs[icount] = vv[ilocal]-isector*shift; wws[icount] = ww[ilocal]; Loading @@ -457,6 +492,7 @@ int main(int argc, char * argv[]) compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; compose_time1 += (finishk.tv_sec - begink.tv_sec); compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; #ifndef USE_MPI double uumin = 1e20; double vvmin = 1e20; Loading @@ -483,6 +519,7 @@ int main(int argc, char * argv[]) #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); wstack(num_w_planes, Nsec, freq_per_chan, Loading @@ -500,6 +537,18 @@ int main(int argc, char * argv[]) yaxis, gridss, num_threads); /* int z =0 ; #pragma omp target map(to:test_i_gpu) map(from:z) { int x; // only accessible from accelerator x = 2; z = x + test_i_gpu; }*/ clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; Loading