Loading Makefile +48 −13 Original line number Diff line number Diff line Loading @@ -59,9 +59,9 @@ OPT += -DPHASE_ON #OPT += -DNORMALIZE_UVW # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS OPT += -DGAUSS #OPT += -DKAISERBESSEL Loading @@ -77,7 +77,7 @@ OPT += -DGAUSS_HI_PRECISION #OPT += -DCUDACC # use GPU acceleration via OMP OPT += -DACCOMP #OPT += -DACCOMP # perform stacking on GPUs #OPT += -DGPU_STACKING Loading Loading @@ -125,7 +125,7 @@ DEPS = w-stacking.h main.c allvars.h # ----- define which files will be compiled by MPICC # # these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform_3d.o result.o numa.o reduce.o w-stacking.o phase_correction.o # these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o Loading @@ -142,13 +142,22 @@ DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c OBJ_ACC_OMP = phase_correction.o w-stacking.o # ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is # offloaded on GPU ifeq (CUDACC,$(findstring CUDACC,$(OPT))) DEPS_NCCL_REDUCE = gridding_nccl.cu OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cu OBJ_RCCL_REDUCE = gridding_rccl.o else DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cpp OBJ_RCCL_REDUCE = gridding_rccl.o endif # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT # Loading Loading @@ -196,6 +205,12 @@ phase_correction.c: phase_correction.cu cuda_fft.cpp: cuda_fft.cu cp cuda_fft.cu cuda_fft.cpp gridding_nccl.cpp: gridding_nccl.cu cp gridding_nccl.cu gridding_nccl.cpp gridding_rccl.cpp: gridding_rccl.cu cp gridding_rccl.cu gridding_rccl.cpp else w-stacking.c: w-stacking.cu rm -f w-stacking.c Loading @@ -206,6 +221,12 @@ phase_correction.c: phase_correction.cu cuda_fft.cpp: cuda_fft.cu rm -f cuda_fft.cpp touch cuda_fft.cpp gridding_nccl.cpp: gridding_nccl.cu rm -f gridding_nccl.cpp touch gridding_nccl.cpp gridding_rccl.cpp: gridding_rccl.cu rm -f gridding_rccl.cpp touch gridding_rccl.cpp endif Loading @@ -221,7 +242,7 @@ LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LIBS=$(NVLIB) $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c w-stacking.cu phase_correction.cu $(LIBS) OBJ += $(OBJ_ACC_CUDA) endif Loading Loading @@ -256,6 +277,17 @@ endif ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT))) ifeq (CUDACC,$(findstring CUDACC,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LIBS=$(NVLIB_3) $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) $(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_NCCL_REDUCE) else EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) Loading @@ -264,6 +296,7 @@ $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_NCCL_REDUCE) endif endif ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce Loading Loading @@ -319,9 +352,11 @@ clean: rm -f w-stacking.c rm -f phase_correction.c rm -f cuda_fft.cpp cleanall: rm -f $(EXEC)$(EXT) rm -f *.o rm -f w-stacking.c rm -f phase_correction.c rm -f gridding_nccl.cpp rm -f gridding_rccl.cpp #cleanall: # rm -f $(EXEC)$(EXT) # rm -f *.o # rm -f w-stacking.c # rm -f phase_correction.c allvars_nccl.h +0 −2 Original line number Diff line number Diff line Loading @@ -23,8 +23,6 @@ #include <omp.h> #endif #if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw #include <fftw3-mpi.h> #endif Loading gridding_nccl.cpp→gridding_nccl.cu +33 −15 Original line number Diff line number Diff line #include "allvars_nccl.h" #include "proto.h" #include <cuda.h> Loading Loading @@ -87,7 +86,8 @@ void gridding_data(){ ncclUniqueId id; ncclComm_t comm; cudaStream_t stream_reduce; cudaError_t nnn; cudaStream_t stream_reduce, stream_stacking; if (rank == 0) ncclGetUniqueId(&id); MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); Loading @@ -97,6 +97,7 @@ void gridding_data(){ cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); cudaStreamCreate(&stream_reduce); cudaStreamCreate(&stream_stacking); ncclCommInitRank(&comm, size, id, rank); Loading Loading @@ -190,16 +191,37 @@ void gridding_data(){ printf("Processing sector %ld\n",isector); #endif double *stacking_target_array; if ( size > 1 ) stacking_target_array = gridss; else stacking_target_array = grid; start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] #ifdef CUDACC wstack(param.num_w_planes, Nsec, metaData.freq_per_chan, metaData.polarisations, uus, vvs, wws, visreals, visimgs, weightss, dx, dw, param.w_support, xaxis, yaxis, gridss_gpu, param.num_threads, rank, stream_stacking); #else wstack(param.num_w_planes, Nsec, metaData.freq_per_chan, Loading @@ -218,14 +240,13 @@ void gridding_data(){ stacking_target_array, param.num_threads, rank); #endif //Allocate memory on devices non-blocking for the host /////////////////////////////////////////////////////// timing_wt.kernel += CPU_TIME_wt - start; cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce); #ifdef VERBOSE printf("Processed sector %ld\n",isector); #endif Loading @@ -239,8 +260,6 @@ void gridding_data(){ // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); cudaStreamSynchronize(stream_reduce); start = CPU_TIME_wt; ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); Loading @@ -248,9 +267,9 @@ void gridding_data(){ timing_wt.reduce += CPU_TIME_wt - start; // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);} } free(memory); Loading @@ -258,16 +277,15 @@ void gridding_data(){ //Copy data back from device to host (to be deleted in next steps) cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); MPI_Barrier(MPI_COMM_WORLD); cudaStreamSynchronize(stream_reduce); cudaFree(gridss_gpu); cudaFree(grid_gpu); cudaFree(gridss_gpu); cudaStreamDestroy(stream_reduce); cudaStreamDestroy(stream_stacking); ncclCommDestroy(comm); Loading w-stacking.cu +131 −85 Original line number Diff line number Diff line Loading @@ -116,6 +116,8 @@ __global__ void convolve_g( double* grid, double std22) { //printf("DENTRO AL KERNEL\n"); uint gid = blockIdx.x*blockDim.x + threadIdx.x; Loading @@ -131,6 +133,7 @@ __global__ void convolve_g( double pos_u = uu[i] / dx; double pos_v = vv[i] / dx; double ww_i = ww[i] / dw; int grid_w = (int)ww_i; int grid_u = (int)pos_u; int grid_v = (int)pos_v; Loading @@ -140,22 +143,32 @@ __global__ void convolve_g( uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax); // Convolve this point onto the grid. for (k = kmin; k <= kmax; k++) { double v_dist = (double)k+0.5 - pos_v; double v_dist = (double)k - pos_v; int increaseprecision = 5; for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); //printf("--> %ld %d %d %d\n",iKer,j,k,grid_w); int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); #ifdef GAUSS_HI_PRECISION double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); #endif #ifdef GAUSS double conv_weight = convkernel[jKer]*convkernel[kKer]; #endif #ifdef KAISERBESSEL double conv_weight = convkernel[jKer]*convkernel[kKer]; #endif // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; Loading Loading @@ -203,7 +216,14 @@ void wstack( int grid_size_y, double* grid, int num_threads, int rank) #ifdef CUDACC int rank, cudaStream_t stream_stacking #else int rank #endif ) { uint i; //uint index; Loading Loading @@ -249,13 +269,10 @@ void wstack( } } printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices); #ifdef NVIDIA prtAccelInfo(); #endif printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); // Create GPU arrays and offload them double * uu_g; double * vv_g; Loading @@ -263,32 +280,48 @@ void wstack( float * vis_real_g; float * vis_img_g; float * weight_g; double * grid_g; //double * grid_g; double * convkernel_g; //Create the event inside stream stacking //cudaEvent_t event_kernel; //for (int i=0; i<100000; i++)grid[i]=23.0; cudaError_t mmm; //mmm=cudaEventCreate(&event_kernel); mmm=cudaMalloc(&uu_g,num_points*sizeof(double)); mmm=cudaMalloc(&vv_g,num_points*sizeof(double)); mmm=cudaMalloc(&ww_g,num_points*sizeof(double)); mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float)); mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float)); mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float)); mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); //mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); /* #if !defined(GAUSS_HI_PRECISION) mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double)); #endif */ if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);} mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); //mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);} mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice); mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice); mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice); mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice); mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice); mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpy ERROR %d !!!\n", mmm);} mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); mmm=cudaMemcpyAsync(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); mmm=cudaMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); mmm=cudaMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); mmm=cudaMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); /* #if !defined(GAUSS_HI_PRECISION) mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); #endif */ if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!\n", mmm);} // Call main GPU Kernel convolve_g <<<Nbl,Nth>>> ( convolve_g <<<Nbl,Nth,0,stream_stacking>>> ( num_w_planes, num_points, freq_per_chan, Loading @@ -304,20 +337,33 @@ void wstack( KernelLen, grid_size_x, grid_size_y, grid_g, std22); grid, std22 ); mmm=cudaStreamSynchronize(stream_stacking); //Record the event //mmm=cudaEventRecord(event_kernel,stream_stacking); //Wait until the kernel ends //mmm=cudaStreamWaitEvent(stream_stacking,event_kernel); mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost); //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); printf("CUDA ERROR %s\n",cudaGetErrorString(mmm)); mmm=cudaFree(uu_g); mmm=cudaFree(vv_g); mmm=cudaFree(ww_g); mmm=cudaFree(vis_real_g); mmm=cudaFree(vis_img_g); mmm=cudaFree(weight_g); mmm=cudaFree(grid_g); //mmm=cudaFree(grid_g); /* #if !defined(GAUSS_HI_PRECISION) mmm=cudaFree(convkernel_g); #endif */ // Switch between CUDA and GPU versions # else Loading w-stacking.h +9 −2 Original line number Diff line number Diff line Loading @@ -32,7 +32,8 @@ void wstack( int, double*, int, int); int, cudaStream_t); } #else Loading @@ -58,6 +59,7 @@ void wstack( #endif #ifdef __CUDACC__ extern "C" #endif Loading @@ -81,6 +83,11 @@ void phase_correction( int, int); double gauss_kernel_norm( double norm, double std22, double u_dist, double v_dist); #ifdef ACCOMP #pragma omp declare target (gauss_kernel_norm) Loading Loading
Makefile +48 −13 Original line number Diff line number Diff line Loading @@ -59,9 +59,9 @@ OPT += -DPHASE_ON #OPT += -DNORMALIZE_UVW # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS OPT += -DGAUSS #OPT += -DKAISERBESSEL Loading @@ -77,7 +77,7 @@ OPT += -DGAUSS_HI_PRECISION #OPT += -DCUDACC # use GPU acceleration via OMP OPT += -DACCOMP #OPT += -DACCOMP # perform stacking on GPUs #OPT += -DGPU_STACKING Loading Loading @@ -125,7 +125,7 @@ DEPS = w-stacking.h main.c allvars.h # ----- define which files will be compiled by MPICC # # these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform_3d.o result.o numa.o reduce.o w-stacking.o phase_correction.o # these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o Loading @@ -142,13 +142,22 @@ DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c OBJ_ACC_OMP = phase_correction.o w-stacking.o # ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is # offloaded on GPU ifeq (CUDACC,$(findstring CUDACC,$(OPT))) DEPS_NCCL_REDUCE = gridding_nccl.cu OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cu OBJ_RCCL_REDUCE = gridding_rccl.o else DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cpp OBJ_RCCL_REDUCE = gridding_rccl.o endif # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT # Loading Loading @@ -196,6 +205,12 @@ phase_correction.c: phase_correction.cu cuda_fft.cpp: cuda_fft.cu cp cuda_fft.cu cuda_fft.cpp gridding_nccl.cpp: gridding_nccl.cu cp gridding_nccl.cu gridding_nccl.cpp gridding_rccl.cpp: gridding_rccl.cu cp gridding_rccl.cu gridding_rccl.cpp else w-stacking.c: w-stacking.cu rm -f w-stacking.c Loading @@ -206,6 +221,12 @@ phase_correction.c: phase_correction.cu cuda_fft.cpp: cuda_fft.cu rm -f cuda_fft.cpp touch cuda_fft.cpp gridding_nccl.cpp: gridding_nccl.cu rm -f gridding_nccl.cpp touch gridding_nccl.cpp gridding_rccl.cpp: gridding_rccl.cu rm -f gridding_rccl.cpp touch gridding_rccl.cpp endif Loading @@ -221,7 +242,7 @@ LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LIBS=$(NVLIB) $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c w-stacking.cu phase_correction.cu $(LIBS) OBJ += $(OBJ_ACC_CUDA) endif Loading Loading @@ -256,6 +277,17 @@ endif ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT))) ifeq (CUDACC,$(findstring CUDACC,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LIBS=$(NVLIB_3) $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) $(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_NCCL_REDUCE) else EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) Loading @@ -264,6 +296,7 @@ $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_NCCL_REDUCE) endif endif ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce Loading Loading @@ -319,9 +352,11 @@ clean: rm -f w-stacking.c rm -f phase_correction.c rm -f cuda_fft.cpp cleanall: rm -f $(EXEC)$(EXT) rm -f *.o rm -f w-stacking.c rm -f phase_correction.c rm -f gridding_nccl.cpp rm -f gridding_rccl.cpp #cleanall: # rm -f $(EXEC)$(EXT) # rm -f *.o # rm -f w-stacking.c # rm -f phase_correction.c
allvars_nccl.h +0 −2 Original line number Diff line number Diff line Loading @@ -23,8 +23,6 @@ #include <omp.h> #endif #if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw #include <fftw3-mpi.h> #endif Loading
gridding_nccl.cpp→gridding_nccl.cu +33 −15 Original line number Diff line number Diff line #include "allvars_nccl.h" #include "proto.h" #include <cuda.h> Loading Loading @@ -87,7 +86,8 @@ void gridding_data(){ ncclUniqueId id; ncclComm_t comm; cudaStream_t stream_reduce; cudaError_t nnn; cudaStream_t stream_reduce, stream_stacking; if (rank == 0) ncclGetUniqueId(&id); MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); Loading @@ -97,6 +97,7 @@ void gridding_data(){ cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); cudaStreamCreate(&stream_reduce); cudaStreamCreate(&stream_stacking); ncclCommInitRank(&comm, size, id, rank); Loading Loading @@ -190,16 +191,37 @@ void gridding_data(){ printf("Processing sector %ld\n",isector); #endif double *stacking_target_array; if ( size > 1 ) stacking_target_array = gridss; else stacking_target_array = grid; start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] #ifdef CUDACC wstack(param.num_w_planes, Nsec, metaData.freq_per_chan, metaData.polarisations, uus, vvs, wws, visreals, visimgs, weightss, dx, dw, param.w_support, xaxis, yaxis, gridss_gpu, param.num_threads, rank, stream_stacking); #else wstack(param.num_w_planes, Nsec, metaData.freq_per_chan, Loading @@ -218,14 +240,13 @@ void gridding_data(){ stacking_target_array, param.num_threads, rank); #endif //Allocate memory on devices non-blocking for the host /////////////////////////////////////////////////////// timing_wt.kernel += CPU_TIME_wt - start; cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce); #ifdef VERBOSE printf("Processed sector %ld\n",isector); #endif Loading @@ -239,8 +260,6 @@ void gridding_data(){ // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); cudaStreamSynchronize(stream_reduce); start = CPU_TIME_wt; ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); Loading @@ -248,9 +267,9 @@ void gridding_data(){ timing_wt.reduce += CPU_TIME_wt - start; // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);} } free(memory); Loading @@ -258,16 +277,15 @@ void gridding_data(){ //Copy data back from device to host (to be deleted in next steps) cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); MPI_Barrier(MPI_COMM_WORLD); cudaStreamSynchronize(stream_reduce); cudaFree(gridss_gpu); cudaFree(grid_gpu); cudaFree(gridss_gpu); cudaStreamDestroy(stream_reduce); cudaStreamDestroy(stream_stacking); ncclCommDestroy(comm); Loading
w-stacking.cu +131 −85 Original line number Diff line number Diff line Loading @@ -116,6 +116,8 @@ __global__ void convolve_g( double* grid, double std22) { //printf("DENTRO AL KERNEL\n"); uint gid = blockIdx.x*blockDim.x + threadIdx.x; Loading @@ -131,6 +133,7 @@ __global__ void convolve_g( double pos_u = uu[i] / dx; double pos_v = vv[i] / dx; double ww_i = ww[i] / dw; int grid_w = (int)ww_i; int grid_u = (int)pos_u; int grid_v = (int)pos_v; Loading @@ -140,22 +143,32 @@ __global__ void convolve_g( uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax); // Convolve this point onto the grid. for (k = kmin; k <= kmax; k++) { double v_dist = (double)k+0.5 - pos_v; double v_dist = (double)k - pos_v; int increaseprecision = 5; for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); //printf("--> %ld %d %d %d\n",iKer,j,k,grid_w); int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); #ifdef GAUSS_HI_PRECISION double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); #endif #ifdef GAUSS double conv_weight = convkernel[jKer]*convkernel[kKer]; #endif #ifdef KAISERBESSEL double conv_weight = convkernel[jKer]*convkernel[kKer]; #endif // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; Loading Loading @@ -203,7 +216,14 @@ void wstack( int grid_size_y, double* grid, int num_threads, int rank) #ifdef CUDACC int rank, cudaStream_t stream_stacking #else int rank #endif ) { uint i; //uint index; Loading Loading @@ -249,13 +269,10 @@ void wstack( } } printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices); #ifdef NVIDIA prtAccelInfo(); #endif printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); // Create GPU arrays and offload them double * uu_g; double * vv_g; Loading @@ -263,32 +280,48 @@ void wstack( float * vis_real_g; float * vis_img_g; float * weight_g; double * grid_g; //double * grid_g; double * convkernel_g; //Create the event inside stream stacking //cudaEvent_t event_kernel; //for (int i=0; i<100000; i++)grid[i]=23.0; cudaError_t mmm; //mmm=cudaEventCreate(&event_kernel); mmm=cudaMalloc(&uu_g,num_points*sizeof(double)); mmm=cudaMalloc(&vv_g,num_points*sizeof(double)); mmm=cudaMalloc(&ww_g,num_points*sizeof(double)); mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float)); mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float)); mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float)); mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); //mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); /* #if !defined(GAUSS_HI_PRECISION) mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double)); #endif */ if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);} mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); //mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);} mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice); mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice); mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice); mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice); mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice); mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpy ERROR %d !!!\n", mmm);} mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); mmm=cudaMemcpyAsync(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); mmm=cudaMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); mmm=cudaMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); mmm=cudaMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice, stream_stacking); /* #if !defined(GAUSS_HI_PRECISION) mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), cudaMemcpyHostToDevice, stream_stacking); #endif */ if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!\n", mmm);} // Call main GPU Kernel convolve_g <<<Nbl,Nth>>> ( convolve_g <<<Nbl,Nth,0,stream_stacking>>> ( num_w_planes, num_points, freq_per_chan, Loading @@ -304,20 +337,33 @@ void wstack( KernelLen, grid_size_x, grid_size_y, grid_g, std22); grid, std22 ); mmm=cudaStreamSynchronize(stream_stacking); //Record the event //mmm=cudaEventRecord(event_kernel,stream_stacking); //Wait until the kernel ends //mmm=cudaStreamWaitEvent(stream_stacking,event_kernel); mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost); //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); printf("CUDA ERROR %s\n",cudaGetErrorString(mmm)); mmm=cudaFree(uu_g); mmm=cudaFree(vv_g); mmm=cudaFree(ww_g); mmm=cudaFree(vis_real_g); mmm=cudaFree(vis_img_g); mmm=cudaFree(weight_g); mmm=cudaFree(grid_g); //mmm=cudaFree(grid_g); /* #if !defined(GAUSS_HI_PRECISION) mmm=cudaFree(convkernel_g); #endif */ // Switch between CUDA and GPU versions # else Loading
w-stacking.h +9 −2 Original line number Diff line number Diff line Loading @@ -32,7 +32,8 @@ void wstack( int, double*, int, int); int, cudaStream_t); } #else Loading @@ -58,6 +59,7 @@ void wstack( #endif #ifdef __CUDACC__ extern "C" #endif Loading @@ -81,6 +83,11 @@ void phase_correction( int, int); double gauss_kernel_norm( double norm, double std22, double u_dist, double v_dist); #ifdef ACCOMP #pragma omp declare target (gauss_kernel_norm) Loading