Loading Makefile +29 −13 Original line number Diff line number Diff line Loading @@ -24,6 +24,9 @@ FFTW_MPI_INC = FFTW_MPI_LIB = CFLAGS += -I./ FLAGS=$(OPTIMIZE) FFTWLIBS = # ======================================================== Loading Loading @@ -52,7 +55,7 @@ OPT += -DPHASE_ON #OPT += -DNVIDIA #use cuda for GPUs #OPT += -DCUDACC OPT += -DCUDACC # use GPU acceleration via OMP #OPT += -DACCOMP Loading @@ -73,12 +76,23 @@ OPT += -DPHASE_ON DEPS = w-stacking.h main.c allvars.h ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif ifneq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif DEPS_ACC_CUDA = w-stacking.h w-stacking.cu DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu OBJ_ACC_CUDA = phase_correction.o w-stacking.o DEPS_ACC_OMP = w-stacking_omp.h DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o OBJ_NCCL_REDUCE = gridding_nccl.o Loading Loading @@ -114,11 +128,12 @@ endif ifeq (CUDACC,$(findstring CUDACC,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-cuda LINKER=$(NVCC) FLAGS=$(NVFLAGS) $(CFLAGS) LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LIBS=$(NVLIB) compile_cuda: $(OBJ_ACC_CUDA) $(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB) $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS) OBJ += $(OBJ_ACC_CUDA) endif ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) Loading @@ -126,8 +141,9 @@ EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(NVC) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) compile_accomp: $(OBJ_ACC_OMP) $(NVC) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(NVC) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_ACC_OMP) endif ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT))) Loading @@ -136,7 +152,7 @@ LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(OBJ_NCCL_REDUCE) $(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) endif ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) Loading @@ -144,7 +160,7 @@ EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(OBJ_RCCL_REDCUE) compile_accreduce: $(OBJ_RCCL_REDUCE) $(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3) endif Loading @@ -152,9 +168,9 @@ endif ################################################################################### w-stacking: $(OBJ) $(DEPS) Makefile $(LINKER) $(FLAGS) $(OPTIMIZE) $(OPT) $(OBJ) -lmpi -o $(EXEC)$(EXEC_EXT) -lmpi $(FFTWLIBS) $(LIBS) $(LINKER) $(FLAGS) $(OPT) $(OBJ) -o $(EXEC)$(EXEC_EXT) -lmpi $(FFTWLIBS) $(LIBS) $(OBJ): $(DEPS) Makefile #$(OBJ): $(DEPS) Makefile %.o: %.c $(DEPS) $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) Loading gridding.c +0 −2 Original line number Diff line number Diff line Loading @@ -209,9 +209,7 @@ void write_gridded_data() fclose(file.pFileimg); } #ifdef USE_MPI MPI_Win_fence(0,slabwin); #endif #endif //WRITE_DATA Loading gridding_cpu.c +5 −3 Original line number Diff line number Diff line #include "allvars.h" #include "proto.h" Loading Loading @@ -225,7 +224,7 @@ void gridding_data() { int ret = reduce_ring(target_rank); grid = (double*)Me.fwin.ptr; //Let grid point to the right memory location [GL] //grid = (double*)Me.fwin.ptr; //Let grid point to the right memory location [GL] if ( ret != 0 ) { Loading @@ -241,6 +240,9 @@ void gridding_data() timing_wt.reduce += CPU_TIME_wt - start; } else *grid += *gridss; // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); Loading w-stacking.cu +86 −92 Original line number Diff line number Diff line Loading @@ -34,9 +34,9 @@ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) __global__ void convolve_g( int num_w_planes, long num_points, long freq_per_chan, long polarizations, uint num_points, uint freq_per_chan, uint polarizations, double* uu, double* vv, double* ww, Loading @@ -53,11 +53,11 @@ __global__ void convolve_g( { //printf("DENTRO AL KERNEL\n"); long gid = blockIdx.x*blockDim.x + threadIdx.x; uint gid = blockIdx.x*blockDim.x + threadIdx.x; if(gid < num_points) { long i = gid; long visindex = i*freq_per_chan*polarizations; uint i = gid; uint visindex = i*freq_per_chan*polarizations; double norm = std22/PI; int j, k; Loading @@ -79,10 +79,10 @@ __global__ void convolve_g( //printf("[A] %g, %g, %g, %g, %g, %d, %d, %d\n", uu[i], vv[i], ww[i], dx, dw, grid_w, grid_u, grid_v); // check the boundaries long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax); /* Loading @@ -98,36 +98,29 @@ __global__ void convolve_g( for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); //printf("--> %ld %d %d %d\n",iKer,j,k,grid_w); if ( iKer >= 2*num_w_planes*grid_size_x*grid_size_y) { printf("=== ALERT [B] iKer out of bound: %ld, %d, %d, %d, %d, %d\n", iKer, j, k, grid_w, grid_size_x, grid_size_y); printf("grid_w = %d, i=%ld, ww_i = %g, dw = %g\n", grid_w, i, ww_i, dw); printf("kmin,kamx: %ld, %ld : %d, %d %d : %g %g\n", kmin, kmax, grid_v, KernelLen, grid_size_y, vv[i], dx ); printf("jmin,jamx: %ld, %ld : %d, %d %d : %g %g\n", jmin, jmax, grid_u, KernelLen, grid_size_x, uu[i], dx ); } double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; long ifine = visindex; uint ifine = visindex; /* if ( ifine >= num_points*freq_per_chan*polarizations ) printf("=== ALERT [C] ifine out of bound: %ld\n", ifine); */ for (long ifreq=0; ifreq<freq_per_chan; ifreq++) for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) { long iweight = visindex/freq_per_chan; uint iweight = visindex/freq_per_chan; /* if ( iweight >= num_points*polarizations ) printf("=== ALERT [D] ifine out of bound: %ld\n", iweight); */ for (long ipol=0; ipol<polarizations; ipol++) for (uint ipol=0; ipol<polarizations; ipol++) { double vistest = (double)vis_real[ifine]; if (!isnan(vistest)) Loading @@ -151,9 +144,9 @@ __global__ void convolve_g( #endif void wstack( int num_w_planes, long num_points, long freq_per_chan, long polarizations, uint num_points, uint freq_per_chan, uint polarizations, double* uu, double* vv, double* ww, Loading @@ -169,9 +162,9 @@ void wstack( int num_threads, int rank) { long i; long index; long visindex; uint i; uint index; uint visindex; // initialize the convolution kernel // gaussian: Loading @@ -180,6 +173,7 @@ void wstack( double std22 = 1.0/(2.0*std*std); double norm = std22/PI; printf("Task %d, here!", rank); // Loop over visibilities. // Switch between CUDA and GPU versions #ifdef __CUDACC__ Loading @@ -191,9 +185,9 @@ void wstack( cudaGetDevice( &device_cuda ); int Nth = NTHREADS; long Nbl = (long)(num_points/Nth) + 1; uint Nbl = (uint)(num_points/Nth) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; long Nvis = num_points*freq_per_chan*polarizations; uint Nvis = num_points*freq_per_chan*polarizations; printf("Running on GPU %d with %d threads and %d blocks\n",device_cuda,Nth,Nbl); // Create GPU arrays and offload them Loading Loading @@ -268,7 +262,7 @@ void wstack( #endif #ifdef ACCOMP long Nvis = num_points*freq_per_chan*polarizations; uint Nvis = num_points*freq_per_chan*polarizations; // #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) // #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #else Loading Loading @@ -297,10 +291,10 @@ void wstack( int grid_v = (int)pos_v; // check the boundaries long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); Loading @@ -313,19 +307,19 @@ void wstack( for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; long ifine = visindex; uint ifine = visindex; // DAV: the following two loops are performend by each thread separately: no problems of race conditions for (long ifreq=0; ifreq<freq_per_chan; ifreq++) for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) { long iweight = visindex/freq_per_chan; for (long ipol=0; ipol<polarizations; ipol++) uint iweight = visindex/freq_per_chan; for (uint ipol=0; ipol<polarizations; ipol++) { if (!isnan(vis_real[ifine])) { Loading w-stacking.h +6 −6 Original line number Diff line number Diff line Loading @@ -13,9 +13,9 @@ extern "C" extern "C" { void wstack( int, long, long, long, unsigned int, unsigned int, unsigned int, double*, double*, double*, Loading @@ -34,9 +34,9 @@ void wstack( #else void wstack( int, long, long, long, unsigned int, unsigned int, unsigned int, double*, double*, double*, Loading Loading
Makefile +29 −13 Original line number Diff line number Diff line Loading @@ -24,6 +24,9 @@ FFTW_MPI_INC = FFTW_MPI_LIB = CFLAGS += -I./ FLAGS=$(OPTIMIZE) FFTWLIBS = # ======================================================== Loading Loading @@ -52,7 +55,7 @@ OPT += -DPHASE_ON #OPT += -DNVIDIA #use cuda for GPUs #OPT += -DCUDACC OPT += -DCUDACC # use GPU acceleration via OMP #OPT += -DACCOMP Loading @@ -73,12 +76,23 @@ OPT += -DPHASE_ON DEPS = w-stacking.h main.c allvars.h ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif ifneq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif DEPS_ACC_CUDA = w-stacking.h w-stacking.cu DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu OBJ_ACC_CUDA = phase_correction.o w-stacking.o DEPS_ACC_OMP = w-stacking_omp.h DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o OBJ_NCCL_REDUCE = gridding_nccl.o Loading Loading @@ -114,11 +128,12 @@ endif ifeq (CUDACC,$(findstring CUDACC,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-cuda LINKER=$(NVCC) FLAGS=$(NVFLAGS) $(CFLAGS) LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LIBS=$(NVLIB) compile_cuda: $(OBJ_ACC_CUDA) $(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB) $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS) OBJ += $(OBJ_ACC_CUDA) endif ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) Loading @@ -126,8 +141,9 @@ EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(NVC) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) compile_accomp: $(OBJ_ACC_OMP) $(NVC) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(NVC) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_ACC_OMP) endif ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT))) Loading @@ -136,7 +152,7 @@ LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(OBJ_NCCL_REDUCE) $(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) endif ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) Loading @@ -144,7 +160,7 @@ EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(OBJ_RCCL_REDCUE) compile_accreduce: $(OBJ_RCCL_REDUCE) $(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3) endif Loading @@ -152,9 +168,9 @@ endif ################################################################################### w-stacking: $(OBJ) $(DEPS) Makefile $(LINKER) $(FLAGS) $(OPTIMIZE) $(OPT) $(OBJ) -lmpi -o $(EXEC)$(EXEC_EXT) -lmpi $(FFTWLIBS) $(LIBS) $(LINKER) $(FLAGS) $(OPT) $(OBJ) -o $(EXEC)$(EXEC_EXT) -lmpi $(FFTWLIBS) $(LIBS) $(OBJ): $(DEPS) Makefile #$(OBJ): $(DEPS) Makefile %.o: %.c $(DEPS) $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) Loading
gridding.c +0 −2 Original line number Diff line number Diff line Loading @@ -209,9 +209,7 @@ void write_gridded_data() fclose(file.pFileimg); } #ifdef USE_MPI MPI_Win_fence(0,slabwin); #endif #endif //WRITE_DATA Loading
gridding_cpu.c +5 −3 Original line number Diff line number Diff line #include "allvars.h" #include "proto.h" Loading Loading @@ -225,7 +224,7 @@ void gridding_data() { int ret = reduce_ring(target_rank); grid = (double*)Me.fwin.ptr; //Let grid point to the right memory location [GL] //grid = (double*)Me.fwin.ptr; //Let grid point to the right memory location [GL] if ( ret != 0 ) { Loading @@ -241,6 +240,9 @@ void gridding_data() timing_wt.reduce += CPU_TIME_wt - start; } else *grid += *gridss; // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); Loading
w-stacking.cu +86 −92 Original line number Diff line number Diff line Loading @@ -34,9 +34,9 @@ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) __global__ void convolve_g( int num_w_planes, long num_points, long freq_per_chan, long polarizations, uint num_points, uint freq_per_chan, uint polarizations, double* uu, double* vv, double* ww, Loading @@ -53,11 +53,11 @@ __global__ void convolve_g( { //printf("DENTRO AL KERNEL\n"); long gid = blockIdx.x*blockDim.x + threadIdx.x; uint gid = blockIdx.x*blockDim.x + threadIdx.x; if(gid < num_points) { long i = gid; long visindex = i*freq_per_chan*polarizations; uint i = gid; uint visindex = i*freq_per_chan*polarizations; double norm = std22/PI; int j, k; Loading @@ -79,10 +79,10 @@ __global__ void convolve_g( //printf("[A] %g, %g, %g, %g, %g, %d, %d, %d\n", uu[i], vv[i], ww[i], dx, dw, grid_w, grid_u, grid_v); // check the boundaries long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax); /* Loading @@ -98,36 +98,29 @@ __global__ void convolve_g( for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); //printf("--> %ld %d %d %d\n",iKer,j,k,grid_w); if ( iKer >= 2*num_w_planes*grid_size_x*grid_size_y) { printf("=== ALERT [B] iKer out of bound: %ld, %d, %d, %d, %d, %d\n", iKer, j, k, grid_w, grid_size_x, grid_size_y); printf("grid_w = %d, i=%ld, ww_i = %g, dw = %g\n", grid_w, i, ww_i, dw); printf("kmin,kamx: %ld, %ld : %d, %d %d : %g %g\n", kmin, kmax, grid_v, KernelLen, grid_size_y, vv[i], dx ); printf("jmin,jamx: %ld, %ld : %d, %d %d : %g %g\n", jmin, jmax, grid_u, KernelLen, grid_size_x, uu[i], dx ); } double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; long ifine = visindex; uint ifine = visindex; /* if ( ifine >= num_points*freq_per_chan*polarizations ) printf("=== ALERT [C] ifine out of bound: %ld\n", ifine); */ for (long ifreq=0; ifreq<freq_per_chan; ifreq++) for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) { long iweight = visindex/freq_per_chan; uint iweight = visindex/freq_per_chan; /* if ( iweight >= num_points*polarizations ) printf("=== ALERT [D] ifine out of bound: %ld\n", iweight); */ for (long ipol=0; ipol<polarizations; ipol++) for (uint ipol=0; ipol<polarizations; ipol++) { double vistest = (double)vis_real[ifine]; if (!isnan(vistest)) Loading @@ -151,9 +144,9 @@ __global__ void convolve_g( #endif void wstack( int num_w_planes, long num_points, long freq_per_chan, long polarizations, uint num_points, uint freq_per_chan, uint polarizations, double* uu, double* vv, double* ww, Loading @@ -169,9 +162,9 @@ void wstack( int num_threads, int rank) { long i; long index; long visindex; uint i; uint index; uint visindex; // initialize the convolution kernel // gaussian: Loading @@ -180,6 +173,7 @@ void wstack( double std22 = 1.0/(2.0*std*std); double norm = std22/PI; printf("Task %d, here!", rank); // Loop over visibilities. // Switch between CUDA and GPU versions #ifdef __CUDACC__ Loading @@ -191,9 +185,9 @@ void wstack( cudaGetDevice( &device_cuda ); int Nth = NTHREADS; long Nbl = (long)(num_points/Nth) + 1; uint Nbl = (uint)(num_points/Nth) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; long Nvis = num_points*freq_per_chan*polarizations; uint Nvis = num_points*freq_per_chan*polarizations; printf("Running on GPU %d with %d threads and %d blocks\n",device_cuda,Nth,Nbl); // Create GPU arrays and offload them Loading Loading @@ -268,7 +262,7 @@ void wstack( #endif #ifdef ACCOMP long Nvis = num_points*freq_per_chan*polarizations; uint Nvis = num_points*freq_per_chan*polarizations; // #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) // #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #else Loading Loading @@ -297,10 +291,10 @@ void wstack( int grid_v = (int)pos_v; // check the boundaries long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); Loading @@ -313,19 +307,19 @@ void wstack( for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; long ifine = visindex; uint ifine = visindex; // DAV: the following two loops are performend by each thread separately: no problems of race conditions for (long ifreq=0; ifreq<freq_per_chan; ifreq++) for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) { long iweight = visindex/freq_per_chan; for (long ipol=0; ipol<polarizations; ipol++) uint iweight = visindex/freq_per_chan; for (uint ipol=0; ipol<polarizations; ipol++) { if (!isnan(vis_real[ifine])) { Loading
w-stacking.h +6 −6 Original line number Diff line number Diff line Loading @@ -13,9 +13,9 @@ extern "C" extern "C" { void wstack( int, long, long, long, unsigned int, unsigned int, unsigned int, double*, double*, double*, Loading @@ -34,9 +34,9 @@ void wstack( #else void wstack( int, long, long, long, unsigned int, unsigned int, unsigned int, double*, double*, double*, Loading