Loading Makefile +13 −5 Original line number Diff line number Diff line # comment/uncomment the various options depending hoe you want to build the program # create MPI code OPT += -DUSE_MPI # use FFTW (it can be switched on ONLY if MPI is active) OPT += -DUSE_FFTW # perform one-side communication (suggested) instead of reduce (only if MPI is active) OPT += -DONE_SIDE OPT += -DNOWRITE_DATA # write the full 3D cube of gridded visibilities and its FFT transform #OPT += -DWRITE_DATA # write the final image OPT += -DWRITE_IMAGE # perform w-stacking phase correction # OPT += PHASE_ON CC = gcc CXX = g++ Loading @@ -20,13 +28,13 @@ NVCC = nvcc NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda DEPS = w-stacking.h DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o w-stacking.c: w-stacking.c: w-stacking.cu cp w-stacking.cu w-stacking.c phase_correction.c: phase_correction.c: phase_correction.cu cp phase_correction.cu phase_correction.c %.o: %.c $(DEPS) Loading phase_correction.cu +42 −5 Original line number Diff line number Diff line Loading @@ -5,24 +5,61 @@ #include <math.h> #include <stdlib.h> #include <stdio.h> #define NWORKERS -1 //100 #define NTHREADS 32 #ifdef __CUDACC__ #endif void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes) void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot) { double dnum_w_planes = (double)num_w_planes; double dxaxistot = (double)xaxistot; double dyaxistot = (double)yaxistot; for (int iw=0; iw<num_w_planes; iw++) { for (int iv=0; iw<yaxis; iv++) double wterm = (double)iw/dnum_w_planes; for (int iv=0; iv<yaxis; iv++) for (int iu=0; iu<xaxis; iu++) { long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw); long img_index = iu+iv*xaxis; #ifdef PHASE_ON double xcoord = (double)(iu-xaxistot/2); if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); double ycoord = (double)(iv-yaxistot/2); if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); xcoord = xcoord/dxaxistot; ycoord = ycoord/dyaxistot; double efact; double preal, pimag; double radius2 = (xcoord*xcoord+ycoord*ycoord); if(xcoord <= 1.0) { preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); } else { preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1)); pimag = 0.0; } double p,q,r,s; p = gridss[index]; q = gridss[index+1]; r = preal; s = pimag; //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); image_real[img_index] += p*r-q*s; image_imag[img_index] += p*s+q*r; #else image_real[img_index] += gridss[index]; image_imag[img_index] += gridss[index]; image_imag[img_index] += gridss[index+1]; #endif } } Loading w-stacking-fftw.c +19 −8 Original line number Diff line number Diff line Loading @@ -92,14 +92,14 @@ int main(int argc, char * argv[]) double wmin; double wmax; // MESH SIZE int grid_size_x = 2048; int grid_size_y = 2048; int local_grid_size_x;// = 100; int local_grid_size_y;// = 100; int xaxis; int yaxis; int num_w_planes = 10; int num_w_planes = 1; // DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization int w_support = 7; int num_threads;// = 4; Loading @@ -108,8 +108,8 @@ int main(int argc, char * argv[]) double w_supporth = (double)((w_support-1)/2)*dx; clock_t start, end, start0, startk, endk; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; double writetime, writetime1; struct timespec begin, finish, begin0, begink, finishk; Loading Loading @@ -150,10 +150,12 @@ int main(int argc, char * argv[]) clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); // CLAAAAAA // INPUT FILES (only the first ndatasets entries are used) int ndatasets = 1; strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/"); //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"); //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); strcpy(datapath,datapath_multi[0]); // Read metadata Loading Loading @@ -822,11 +824,18 @@ int main(int argc, char * argv[]) fftw_free(fftwgrid); // Phase correction clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); if(rank == 0)printf("PHASE CORRECTION\n"); double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes); phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y); end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); phase_time = ((double) (end - start)) / CLOCKS_PER_SEC; phase_time1 = (finish.tv_sec - begin.tv_sec); phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; #ifdef WRITE_IMAGE if(rank == 0) Loading Loading @@ -884,6 +893,7 @@ int main(int argc, char * argv[]) printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time); #ifdef USE_FFTW printf("FFTW time: %f sec\n",fftw_time); printf("Phase time: %f sec\n",phase_time); #endif printf("TOT time: %f sec\n",tot_time); if(num_threads > 1) Loading @@ -893,6 +903,7 @@ int main(int argc, char * argv[]) printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1); #ifdef USE_FFTW printf("PFFTW time: %f sec\n",fftw_time1); printf("PPhase time: %f sec\n",phase_time1); #endif printf("PTOT time: %f sec\n",tot_time1); } Loading w-stacking.cu +0 −2 Original line number Diff line number Diff line Loading @@ -5,8 +5,6 @@ #include <math.h> #include <stdlib.h> #include <stdio.h> #define NWORKERS -1 //100 #define NTHREADS 32 #ifdef __CUDACC__ double __device__ Loading w-stacking.h +4 −0 Original line number Diff line number Diff line #ifndef W_PROJECT_H_ #define W_PROJECT_H_ #define NWORKERS -1 //100 #define NTHREADS 32 #define PI 3.14159265359 #define REAL_TYPE double #ifdef __CUDACC__ Loading Loading @@ -39,6 +41,8 @@ void phase_correction( double*, int, int, int, int, int); #endif Loading
Makefile +13 −5 Original line number Diff line number Diff line # comment/uncomment the various options depending hoe you want to build the program # create MPI code OPT += -DUSE_MPI # use FFTW (it can be switched on ONLY if MPI is active) OPT += -DUSE_FFTW # perform one-side communication (suggested) instead of reduce (only if MPI is active) OPT += -DONE_SIDE OPT += -DNOWRITE_DATA # write the full 3D cube of gridded visibilities and its FFT transform #OPT += -DWRITE_DATA # write the final image OPT += -DWRITE_IMAGE # perform w-stacking phase correction # OPT += PHASE_ON CC = gcc CXX = g++ Loading @@ -20,13 +28,13 @@ NVCC = nvcc NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda DEPS = w-stacking.h DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o w-stacking.c: w-stacking.c: w-stacking.cu cp w-stacking.cu w-stacking.c phase_correction.c: phase_correction.c: phase_correction.cu cp phase_correction.cu phase_correction.c %.o: %.c $(DEPS) Loading
phase_correction.cu +42 −5 Original line number Diff line number Diff line Loading @@ -5,24 +5,61 @@ #include <math.h> #include <stdlib.h> #include <stdio.h> #define NWORKERS -1 //100 #define NTHREADS 32 #ifdef __CUDACC__ #endif void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes) void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot) { double dnum_w_planes = (double)num_w_planes; double dxaxistot = (double)xaxistot; double dyaxistot = (double)yaxistot; for (int iw=0; iw<num_w_planes; iw++) { for (int iv=0; iw<yaxis; iv++) double wterm = (double)iw/dnum_w_planes; for (int iv=0; iv<yaxis; iv++) for (int iu=0; iu<xaxis; iu++) { long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw); long img_index = iu+iv*xaxis; #ifdef PHASE_ON double xcoord = (double)(iu-xaxistot/2); if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); double ycoord = (double)(iv-yaxistot/2); if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); xcoord = xcoord/dxaxistot; ycoord = ycoord/dyaxistot; double efact; double preal, pimag; double radius2 = (xcoord*xcoord+ycoord*ycoord); if(xcoord <= 1.0) { preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); } else { preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1)); pimag = 0.0; } double p,q,r,s; p = gridss[index]; q = gridss[index+1]; r = preal; s = pimag; //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); image_real[img_index] += p*r-q*s; image_imag[img_index] += p*s+q*r; #else image_real[img_index] += gridss[index]; image_imag[img_index] += gridss[index]; image_imag[img_index] += gridss[index+1]; #endif } } Loading
w-stacking-fftw.c +19 −8 Original line number Diff line number Diff line Loading @@ -92,14 +92,14 @@ int main(int argc, char * argv[]) double wmin; double wmax; // MESH SIZE int grid_size_x = 2048; int grid_size_y = 2048; int local_grid_size_x;// = 100; int local_grid_size_y;// = 100; int xaxis; int yaxis; int num_w_planes = 10; int num_w_planes = 1; // DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization int w_support = 7; int num_threads;// = 4; Loading @@ -108,8 +108,8 @@ int main(int argc, char * argv[]) double w_supporth = (double)((w_support-1)/2)*dx; clock_t start, end, start0, startk, endk; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1; double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; double writetime, writetime1; struct timespec begin, finish, begin0, begink, finishk; Loading Loading @@ -150,10 +150,12 @@ int main(int argc, char * argv[]) clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); // CLAAAAAA // INPUT FILES (only the first ndatasets entries are used) int ndatasets = 1; strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/"); //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"); //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); strcpy(datapath,datapath_multi[0]); // Read metadata Loading Loading @@ -822,11 +824,18 @@ int main(int argc, char * argv[]) fftw_free(fftwgrid); // Phase correction clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); if(rank == 0)printf("PHASE CORRECTION\n"); double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes); phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y); end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); phase_time = ((double) (end - start)) / CLOCKS_PER_SEC; phase_time1 = (finish.tv_sec - begin.tv_sec); phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; #ifdef WRITE_IMAGE if(rank == 0) Loading Loading @@ -884,6 +893,7 @@ int main(int argc, char * argv[]) printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time); #ifdef USE_FFTW printf("FFTW time: %f sec\n",fftw_time); printf("Phase time: %f sec\n",phase_time); #endif printf("TOT time: %f sec\n",tot_time); if(num_threads > 1) Loading @@ -893,6 +903,7 @@ int main(int argc, char * argv[]) printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1); #ifdef USE_FFTW printf("PFFTW time: %f sec\n",fftw_time1); printf("PPhase time: %f sec\n",phase_time1); #endif printf("PTOT time: %f sec\n",tot_time1); } Loading
w-stacking.cu +0 −2 Original line number Diff line number Diff line Loading @@ -5,8 +5,6 @@ #include <math.h> #include <stdlib.h> #include <stdio.h> #define NWORKERS -1 //100 #define NTHREADS 32 #ifdef __CUDACC__ double __device__ Loading
w-stacking.h +4 −0 Original line number Diff line number Diff line #ifndef W_PROJECT_H_ #define W_PROJECT_H_ #define NWORKERS -1 //100 #define NTHREADS 32 #define PI 3.14159265359 #define REAL_TYPE double #ifdef __CUDACC__ Loading Loading @@ -39,6 +41,8 @@ void phase_correction( double*, int, int, int, int, int); #endif