Commit 9e323f26 authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

Stacking/Reduce on GPU with reduced communication

parent a42e9056
Loading
Loading
Loading
Loading
+48 −13
Original line number Diff line number Diff line
@@ -59,9 +59,9 @@ OPT += -DPHASE_ON
#OPT += -DNORMALIZE_UVW

# Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
OPT += -DGAUSS_HI_PRECISION
#OPT += -DGAUSS_HI_PRECISION

#OPT += -DGAUSS
OPT += -DGAUSS

#OPT += -DKAISERBESSEL

@@ -77,7 +77,7 @@ OPT += -DGAUSS_HI_PRECISION
#OPT += -DCUDACC

# use GPU acceleration via OMP 
OPT += -DACCOMP
#OPT += -DACCOMP

# perform stacking on GPUs
#OPT += -DGPU_STACKING
@@ -125,7 +125,7 @@ DEPS = w-stacking.h main.c allvars.h
# ----- define which files will be compiled by MPICC
#
# these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided
CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform_3d.o result.o numa.o reduce.o w-stacking.o phase_correction.o

# these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on
CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
@@ -142,13 +142,22 @@ DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c
OBJ_ACC_OMP = phase_correction.o w-stacking.o



# ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is
#       offloaded on GPU
ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
DEPS_NCCL_REDUCE = gridding_nccl.cu
OBJ_NCCL_REDUCE  = gridding_nccl.o

DEPS_RCCL_REDUCE = gridding_rccl.cu
OBJ_RCCL_REDUCE  = gridding_rccl.o
else
DEPS_NCCL_REDUCE = gridding_nccl.cpp
OBJ_NCCL_REDUCE  = gridding_nccl.o

DEPS_RCCL_REDUCE = gridding_rccl.cpp
OBJ_RCCL_REDUCE  = gridding_rccl.o
endif

# ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
#
@@ -196,6 +205,12 @@ phase_correction.c: phase_correction.cu

cuda_fft.cpp: cuda_fft.cu
	cp cuda_fft.cu cuda_fft.cpp

gridding_nccl.cpp: gridding_nccl.cu
	cp gridding_nccl.cu gridding_nccl.cpp

gridding_rccl.cpp: gridding_rccl.cu
	cp gridding_rccl.cu gridding_rccl.cpp
else
w-stacking.c: w-stacking.cu
	rm -f w-stacking.c
@@ -206,6 +221,12 @@ phase_correction.c: phase_correction.cu
cuda_fft.cpp: cuda_fft.cu
	rm -f cuda_fft.cpp
	touch cuda_fft.cpp
gridding_nccl.cpp: gridding_nccl.cu
	rm -f gridding_nccl.cpp
	touch gridding_nccl.cpp
gridding_rccl.cpp: gridding_rccl.cu
	rm -f gridding_rccl.cpp
	touch gridding_rccl.cpp
endif


@@ -221,7 +242,7 @@ LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE) 
LIBS=$(NVLIB)
$(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
	$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS)
	$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c w-stacking.cu phase_correction.cu $(LIBS)
OBJ += $(OBJ_ACC_CUDA)
endif

@@ -256,6 +277,17 @@ endif


ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT)))

ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE)
LIBS=$(NVLIB_3)
$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
	$(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_NCCL_REDUCE)
else

EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(NVC++)
FLAGS=$(NVFLAGS) $(CFLAGS)
@@ -264,6 +296,7 @@ $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
	$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_NCCL_REDUCE)
endif
endif

ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-reduce
@@ -319,9 +352,11 @@ clean:
	rm -f w-stacking.c
	rm -f phase_correction.c
	rm -f cuda_fft.cpp

cleanall:
	rm -f $(EXEC)$(EXT)
	rm -f *.o
	rm -f w-stacking.c
	rm -f phase_correction.c
	rm -f gridding_nccl.cpp
	rm -f gridding_rccl.cpp

#cleanall:
#	rm -f $(EXEC)$(EXT)
#	rm -f *.o
#	rm -f w-stacking.c
#	rm -f phase_correction.c
+0 −2
Original line number Diff line number Diff line
@@ -23,8 +23,6 @@
#include <omp.h>
#endif



#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
#include <fftw3-mpi.h>
#endif
+33 −15
Original line number Diff line number Diff line

#include "allvars_nccl.h"
#include "proto.h"
#include <cuda.h>
@@ -87,7 +86,8 @@ void gridding_data(){
  
  ncclUniqueId id;
  ncclComm_t comm;
  cudaStream_t stream_reduce;
  cudaError_t nnn;
  cudaStream_t stream_reduce, stream_stacking;

  if (rank == 0) ncclGetUniqueId(&id);
  MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
@@ -97,6 +97,7 @@ void gridding_data(){
  cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  cudaStreamCreate(&stream_reduce);
  cudaStreamCreate(&stream_stacking);
  
  ncclCommInitRank(&comm, size, id, rank);

@@ -190,16 +191,37 @@ void gridding_data(){
      printf("Processing sector %ld\n",isector);
     #endif
  
   
      double *stacking_target_array;
      if ( size > 1 )
	stacking_target_array = gridss;
      else
	stacking_target_array = grid;

      
      start = CPU_TIME_wt;
	    
     //We have to call different GPUs per MPI task!!! [GL]
     #ifdef CUDACC
      wstack(param.num_w_planes,
	     Nsec,
	     metaData.freq_per_chan,
	     metaData.polarisations,
	     uus,
	     vvs,
	     wws,
	     visreals,
	     visimgs,
	     weightss,
	     dx,
	     dw,
	     param.w_support,
	     xaxis,
	     yaxis,
	     gridss_gpu,
	     param.num_threads,
	     rank,
	     stream_stacking);
      #else
      wstack(param.num_w_planes,
	     Nsec,
	     metaData.freq_per_chan,
@@ -218,14 +240,13 @@ void gridding_data(){
	     stacking_target_array,
	     param.num_threads,
	     rank);
      #endif
      //Allocate memory on devices non-blocking for the host                                                                                   
      ///////////////////////////////////////////////////////


      timing_wt.kernel += CPU_TIME_wt - start;
      
      cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce);
      
     #ifdef VERBOSE
      printf("Processed sector %ld\n",isector);
     #endif
@@ -239,8 +260,6 @@ void gridding_data(){
	  // int target_rank = (int)isector;    it implied that size >= nsectors
	  int target_rank = (int)(isector % size);

	  cudaStreamSynchronize(stream_reduce);

	  start = CPU_TIME_wt;

	  ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
@@ -248,9 +267,9 @@ void gridding_data(){
      
	  timing_wt.reduce += CPU_TIME_wt - start;

		  
	  // Go to next sector
	  memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );	  
	  nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
	  if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);}
	}

      free(memory);
@@ -258,16 +277,15 @@ void gridding_data(){

  //Copy data back from device to host (to be deleted in next steps)
  
  
  cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
  
  MPI_Barrier(MPI_COMM_WORLD);

  cudaStreamSynchronize(stream_reduce);
  cudaFree(gridss_gpu);
  cudaFree(grid_gpu);
  
  cudaFree(gridss_gpu);
  cudaStreamDestroy(stream_reduce);
  cudaStreamDestroy(stream_stacking);
  
  ncclCommDestroy(comm);

+131 −85
Original line number Diff line number Diff line
@@ -116,6 +116,8 @@ __global__ void convolve_g(
			   double* grid,
			   double std22)
			   


{
  //printf("DENTRO AL KERNEL\n");
  uint gid = blockIdx.x*blockDim.x + threadIdx.x;
@@ -131,6 +133,7 @@ __global__ void convolve_g(
      double pos_u = uu[i] / dx;
      double pos_v = vv[i] / dx;
      double ww_i  = ww[i] / dw;

      int grid_w = (int)ww_i;
      int grid_u = (int)pos_u;
      int grid_v = (int)pos_v;
@@ -140,22 +143,32 @@ __global__ void convolve_g(
      uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
      uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
      uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
        //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);


      // Convolve this point onto the grid.
      for (k = kmin; k <= kmax; k++)
        {

            double v_dist = (double)k+0.5 - pos_v;
	  double v_dist = (double)k - pos_v;
	  int increaseprecision = 5;
	  
	  for (j = jmin; j <= jmax; j++)
            {
	      double u_dist = (double)j+0.5 - pos_u;
	      uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
		//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
	      int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
	      int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
	      
	     #ifdef GAUSS_HI_PRECISION
	      double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
	     #endif
	     #ifdef GAUSS
	      double conv_weight = convkernel[jKer]*convkernel[kKer];
	     #endif
	     #ifdef KAISERBESSEL
	      double conv_weight = convkernel[jKer]*convkernel[kKer];
	     #endif

	      // Loops over frequencies and polarizations
	      double add_term_real = 0.0;
	      double add_term_img = 0.0;
@@ -203,7 +216,14 @@ void wstack(
     int grid_size_y,
     double* grid,
     int num_threads,
     int rank)
    #ifdef CUDACC
     int rank,
     cudaStream_t stream_stacking
    #else
     int rank
    #endif
	    )
	    
{
    uint i;
    //uint index;
@@ -249,13 +269,10 @@ void wstack(
      }
    }

      printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices);
     #ifdef NVIDIA
      prtAccelInfo();
     #endif

    printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);

    // Create GPU arrays and offload them
    double * uu_g;
    double * vv_g;
@@ -263,32 +280,48 @@ void wstack(
    float * vis_real_g;
    float * vis_img_g;
    float * weight_g;
    double * grid_g;
    //double * grid_g;
    double * convkernel_g;
    
    //Create the event inside stream stacking
    //cudaEvent_t event_kernel;
    
    //for (int i=0; i<100000; i++)grid[i]=23.0;
    cudaError_t mmm;
    //mmm=cudaEventCreate(&event_kernel);
    mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
    mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
    mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
    mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
    mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
    mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
    mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
    //mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
    /*
   #if !defined(GAUSS_HI_PRECISION)
    mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double));
   #endif
    */
    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);}
    mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
    //mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);}


    mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
    mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
    mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
    mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
    mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
    mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpy ERROR %d !!!\n", mmm);}
    mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
    mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
    mmm=cudaMemcpyAsync(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
    mmm=cudaMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
    mmm=cudaMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
    mmm=cudaMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);

    /*
   #if !defined(GAUSS_HI_PRECISION)
    mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
   #endif
    */
    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!\n", mmm);}
    
    // Call main GPU Kernel
    convolve_g <<<Nbl,Nth>>> (
    convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
	       num_w_planes,
               num_points,
               freq_per_chan,
@@ -304,20 +337,33 @@ void wstack(
               KernelLen,
               grid_size_x,
               grid_size_y,
               grid_g,
	       std22);
               grid,
	       std22
						);

    mmm=cudaStreamSynchronize(stream_stacking);
    //Record the event
    //mmm=cudaEventRecord(event_kernel,stream_stacking);

    //Wait until the kernel ends
    //mmm=cudaStreamWaitEvent(stream_stacking,event_kernel);
    
    mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
    
    printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));

    mmm=cudaFree(uu_g);
    mmm=cudaFree(vv_g);
    mmm=cudaFree(ww_g);
    mmm=cudaFree(vis_real_g);
    mmm=cudaFree(vis_img_g);
    mmm=cudaFree(weight_g);
    mmm=cudaFree(grid_g);

    //mmm=cudaFree(grid_g);
    /*
   #if !defined(GAUSS_HI_PRECISION)
    mmm=cudaFree(convkernel_g);
   #endif
    */
// Switch between CUDA and GPU versions
# else

+9 −2
Original line number Diff line number Diff line
@@ -32,7 +32,8 @@ void wstack(
     int,
     double*,
     int,
     int);
     int,
     cudaStream_t);
}

#else 
@@ -58,6 +59,7 @@ void wstack(
#endif



#ifdef __CUDACC__
extern "C"
#endif
@@ -81,6 +83,11 @@ void phase_correction(
     int,
     int);

double gauss_kernel_norm(
  double norm,
  double std22,
  double u_dist,
  double v_dist);

#ifdef ACCOMP
#pragma omp declare target (gauss_kernel_norm)