Commit 42b64919 authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

NCCL Reduce working

parent 59862868
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -4,8 +4,8 @@ CXX = g++
MPICC    =  mpicc
MPIC++   =  mpiCC

OPTIMIZE = -ggdb3 -O4 -fopenmp -march=native -mavx -mavx2 
OMP_GPU = -ggdb3 -mp=multicore,gpu -gpu=cuda12.0 -gpu=cc86
OPTIMIZE = -O4 -fopenmp -march=native -mavx -mavx2 
OMP_GPU = -mp=multicore,gpu -gpu=cuda12.0 -gpu=cc86

CUDA_INC = -I/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/include
CUDA_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/lib64
+3 −9
Original line number Diff line number Diff line
@@ -55,10 +55,10 @@ OPT += -DPHASE_ON
#OPT += -DNVIDIA

#use cuda for GPUs
OPT += -DCUDACC
#OPT += -DCUDACC

# use GPU acceleration via OMP
#OPT += -DACCOMP
OPT += -DACCOMP

# use NVIDIA GPU to perform the reduce
#OPT += -DNCCL_REDUCE
@@ -79,13 +79,7 @@ DEPS = w-stacking.h main.c allvars.h

ifneq (ACCOMP,$(findstring ACCOMP, $(OPT)))		 
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
else
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
endif

ifneq (CUDACC,$(findstring CUDACC, $(OPT)))		 
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
else
else ifneq (CUDACC,$(findstring CUDACC, $(OPT)))
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
endif

+2 −1
Original line number Diff line number Diff line
@@ -122,6 +122,7 @@ void gridding_data()
      #warning "the counter of this loop should not be int"
      for( int iphi = histo_send[isector]-1; iphi >=0 ; iphi--)
        {
	  
	  uint ilocal = sectorarray[isector][iphi];

	  uus[icount] = data.uu[ilocal];
@@ -171,7 +172,7 @@ void gridding_data()

      timing_wt.compose += CPU_TIME_wt - start;
      
      printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
      //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);

      // Make convolution on the grid

+42 −42
Original line number Diff line number Diff line

#include "allvars.h"
#include "allvars_nccl.h"
#include "proto.h"
#include <cuda.h>
#include <cuda_runtime.h>
@@ -14,7 +14,7 @@

#if defined( NCCL_REDUCE )


/*
#define NCCLCHECK(cmd) do {                         
ncclResult_t r = cmd;                             
if (r!= ncclSuccess) {                            
@@ -23,7 +23,7 @@ if (r!= ncclSuccess) {
  exit(EXIT_FAILURE);                             
 }                                                 
} while(0)
    
*/  

static uint64_t getHostHash(const char* string) {
  // Based on DJB2a, result = result * 33 ^ char                                                                                                 
@@ -66,6 +66,33 @@ void gridding_data(){
  if ( rank == 0 )
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);

  // find the largest value in histo_send[]
  //                                                                  
  uint Nsec = histo_send[0];
  for (uint isector = 1; isector < nsectors; isector++)
    Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec );

  uint      Nweightss  = Nsec*metaData.polarisations;
  uint      Nvissec    = Nweightss*metaData.freq_per_chan;

  // allocate sector arrays
  // note: we use the largest allocation among all sectors          

  double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
                                            (Nvissec*2+Nweightss)*sizeof(float_t) );

  if ( memory == NULL )
    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);

  double_t *uus        = (double*) memory;
  double_t *vvs        = (double*) uus+Nsec;
  double_t *wws        = (double*) vvs+Nsec;
  float_t  *weightss   = (float_t*)(wws+Nsec);
  float_t  *visreals   = weightss + Nweightss;
  float_t  *visimgs    = visreals + Nvissec;



    //Initialize nccl

  double * grid_gpu, *gridss_gpu;
@@ -96,31 +123,6 @@ void gridding_data(){
  
  ncclCommInitRank(&comm, size, id, rank);

  // find the largest value in histo_send[]
  //                                                                  
  uint Nsec = histo_send[0];
  for (uint isector = 1; isector < nsectors; isector++)
    Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec );

  uint      Nweightss  = Nsec*metaData.polarisations;
  uint      Nvissec    = Nweightss*metaData.freq_per_chan;

  // allocate sector arrays
  // note: we use the largest allocation among all sectors          

  double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
                                            (Nvissec*2+Nweightss)*sizeof(float_t) );

  if ( memory == NULL )
    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);

  double_t *uus        = (double*) memory;
  double_t *vvs        = (double*) uus+Nsec;
  double_t *wws        = (double*) vvs+Nsec;
  float_t  *weightss   = (float_t*)(wws+Nsec);
  float_t  *visreals   = weightss + Nweightss;
  float_t  *visimgs    = visreals + Nvissec;
  
  for (uint isector = 0; isector < nsectors; isector++)
    {

@@ -135,10 +137,9 @@ void gridding_data(){
      #warning "the counter of this loop should not be int"
      for(int iphi = histo_send[isector]-1; iphi>=0; iphi--)
        {

	  uint ilocal = sectorarray[isector][iphi];
	  //double vvh = data.vv[ilocal];
	  //int binphi = (int)(vvh*nsectors);
	  //if (binphi == isector || boundary[ilocal] == isector) {

	  uus[icount] = data.uu[ilocal];
	  vvs[icount] = data.vv[ilocal]-isector*shift;
	  wws[icount] = data.ww[ilocal];
@@ -151,7 +152,7 @@ void gridding_data(){
	    {
	      visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
	      visimgs[inu]  = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
	      //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis);

	      inu++;
	    }
	  icount++;
@@ -186,7 +187,7 @@ void gridding_data(){

      timing_wt.compose += CPU_TIME_wt - start;

      printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
      //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
      
      // Make convolution on the grid

@@ -218,11 +219,11 @@ void gridding_data(){
      //Allocate memory on devices non-blocking for the host                                                                                   
      ///////////////////////////////////////////////////////

      cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce);


      timing_wt.kernel += CPU_TIME_wt - start;
	    
      cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce);
      
     #ifdef VERBOSE
      printf("Processed sector %ld\n",isector);
     #endif
@@ -240,7 +241,7 @@ void gridding_data(){

	  cudaStreamSynchronize(stream_reduce);
    
	  NCCLCHECK(ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce));
	  ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
	  cudaStreamSynchronize(stream_reduce);
      
	  timing_wt.reduce += CPU_TIME_wt - start;
@@ -250,17 +251,16 @@ void gridding_data(){
	  // Go to next sector
	memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );

	free(memory);

    }

  //Copy data back from device to host (to be deleted in next steps)

  free(memory);
  cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
  
  MPI_Barrier(MPI_COMM_WORLD);


  cudaStreamSynchronize(stream_reduce);
  cudaFree(gridss_gpu);
  cudaFree(grid_gpu);
  
+9 −1
Original line number Diff line number Diff line
@@ -11,9 +11,17 @@ void readMetaData(char fileLocal[1000]);
void metaData_calculation();
void allocate_memory();
void readData();

#ifdef __cplusplus
extern "C" {
  void shutdown_wstacking( int, char *, char *, int);
}


#else
void shutdown_wstacking( int, char *, char *, int);
#endif

#ifdef __cplusplus
extern "C" {
  void gridding          (void);