Commit 591ecc58 authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

Updates for AMD

parent a4a3a115
Loading
Loading
Loading
Loading
+16 −0
Original line number Diff line number Diff line
@@ -25,3 +25,19 @@ CFLAGS +=
MPICHLIB =


##########################################################
#AMD GPUs (DEFAULT = LUMI)

CLANG   = clang
CLANG++ = clang++
 
OPTIMIZE_AMD = -O3 -Ofast -fopenmp -march=native -mavx -mavx2 -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa --offload-arch=gfx90a

# ROCm installation root. Defaults to the LUMI system install; override on
# the command line for other versions, e.g.  make ROCM_PATH=/opt/rocm
ROCM_PATH ?= /opt/rocm-5.2.3

RCCL_INCL= -I$(ROCM_PATH)/rccl/include
RCCL_LIB= -L$(ROCM_PATH)/rccl/lib

HIP_INCL= -I$(ROCM_PATH)/hip/include
HIP_LIB= -L$(ROCM_PATH)/hip/lib

# Combined preprocessor/link flags for the AMD (HIP runtime + RCCL) build
AMDLIB = $(HIP_INCL) $(HIP_LIB) $(RCCL_INCL) $(RCCL_LIB) -D__HIP_PLATFORM_AMD__ -lamdhip64 -lrccl
###########################################################
+26 −11
Original line number Diff line number Diff line
@@ -77,9 +77,9 @@ OPT += -DACCOMP

DEPS = w-stacking.h  main.c allvars.h

# Use the CPU w-stacking/phase-correction objects only when neither ACCOMP
# (OpenMP offload) nor CUDACC (CUDA) is requested in OPT; the GPU variants
# are appended further below.
# NOTE: GNU Make has no '&&' between conditionals. Concatenating the two
# findstring results yields the empty string only when BOTH are absent.
ifeq (,$(findstring ACCOMP,$(OPT))$(findstring CUDACC,$(OPT)))
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
else
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
endif

@@ -89,8 +89,11 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o
DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c
OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o


# Sources/objects for the NCCL (NVIDIA) GPU reduction path
DEPS_NCCL_REDUCE = gridding_nccl.cpp
OBJ_NCCL_REDUCE = gridding_nccl.o

# Sources/objects for the RCCL (AMD) GPU reduction path
DEPS_RCCL_REDUCE = gridding_rccl.cpp
OBJ_RCCL_REDUCE = gridding_rccl.o

ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT)))
@@ -130,7 +133,8 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
OBJ += $(OBJ_ACC_CUDA)
endif

#NVIDIA GPUs
# GNU Make has no '&&' between conditionals. Concatenating the findstring
# results (with a '-' separator) matches "ACCOMP-" only when ACCOMP is
# present in OPT AND __HIP_PLATFORM_AMD__ is absent, which is the intended
# NVIDIA-only condition. A single conditional keeps the one 'endif' below.
ifeq (ACCOMP-,$(findstring ACCOMP,$(OPT))-$(findstring __HIP_PLATFORM_AMD__,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-omp
LINKER=$(NVC)
FLAGS=$(NVFLAGS) $(CFLAGS)
@@ -140,22 +144,33 @@ $(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
OBJ += $(OBJ_ACC_OMP)
endif

#AMD GPUs
# Selected when BOTH ACCOMP and __HIP_PLATFORM_AMD__ appear in OPT.
# GNU Make has no '&&' between conditionals, so the two tests are nested.
ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))
ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-omp
LINKER=$(MPICC)
FLAGS=$(OPTIMIZE_AMD) $(CFLAGS)
LIBS=$(AMDLIB)
# NOTE(review): unlike the NVIDIA branch, OBJ is not extended with
# $(OBJ_ACC_OMP) here — confirm whether that is intentional.
$(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
	$(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS)
endif
endif


# NVIDIA reduction path: compile the NCCL-based gridding reduction with the
# NVIDIA toolchain when NCCL_REDUCE appears in OPT.
ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(NVC++)
FLAGS=$(NVFLAGS) $(CFLAGS)
LIBS=$(NVLIB) $(NVLIB_3)
compile_accreduce: $(OBJ_NCCL_REDUCE)
$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
	$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)	
endif

# AMD reduction path: compile the RCCL-based gridding reduction with the
# MPI C++ wrapper and the AMD/HIP flags when RCCL_REDUCE appears in OPT.
# (The stale NVIDIA assignments and the orphaned recipe line under the
# phony target were leftovers from the NCCL block and are removed: a recipe
# belongs to the pattern rule, not to 'compile_accreduce'.)
ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE_AMD) $(CFLAGS)
LIBS=$(AMDLIB)
compile_accreduce: $(OBJ_RCCL_REDUCE)
$(OBJ_RCCL_REDUCE): $(DEPS_RCCL_REDUCE)
	$(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS)
endif


+138 −246
Original line number Diff line number Diff line


#if defined( REDUCE_RCCL )


#include <stdio.h>
#include "allvars.h"
#include "allvars_nccl.h"
#include "proto.h"
#include <hip/hip_runtime.h>
#include <rccl/rccl.h>

int verbose_level = 0;
/* 
 * Implements the gridding of data via GPU
 * by using NCCL library
 *
 */


#if defined( RCCL_REDUCE )

/*
#define NCCLCHECK(cmd) do {                         
ncclResult_t r = cmd;                             
if (r!= ncclSuccess) {                            
  printf("Failed, NCCL error %s:%d '%s'\n",       
	 __FILE__,__LINE__,ncclGetErrorString(r));   
  exit(EXIT_FAILURE);                             
 }                                                 
} while(0)
*/  

static uint64_t getHostHash(const char* string) {
  // Based on DJB2a, result = result * 33 ^ char                                                                                                 
@@ -32,116 +44,54 @@ static void getHostName(char* hostname, int maxlen) {
}


void gridding(){

    if(rank == 0)printf("GRIDDING DATA\n");

    // Create histograms and linked lists
    
    clock_gettime(CLOCK_MONOTONIC, &begin);
    start = clock();

    // Initialize linked list
    initialize_array();

    //Sector and Gridding data
    gridding_data();
void gridding_data(){

    #ifdef USE_MPI
        MPI_Barrier(MPI_COMM_WORLD);
    #endif
  double shift = (double)(dx*yaxis);

    end = clock();
    clock_gettime(CLOCK_MONOTONIC, &finish);
    timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
    timing.process_time1 = (finish.tv_sec - begin.tv_sec);
    timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    clock_gettime(CLOCK_MONOTONIC, &begin);
  timing_wt.kernel     = 0.0;
  timing_wt.reduce     = 0.0;
  timing_wt.reduce_mpi = 0.0;
  timing_wt.reduce_sh  = 0.0;
  timing_wt.compose    = 0.0;
  
}
  // calculate the resolution in radians
  resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));

void initialize_array(){
  // calculate the resolution in arcsec 
  double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
  if ( rank == 0 )
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);

    histo_send = (long*) calloc(nsectors+1,sizeof(long));
    int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int));
    double uuh,vvh;
    for (long iphi = 0; iphi < metaData.Nmeasures; iphi++)
    {
     	   boundary[iphi] = -1;
           vvh = data.vv[iphi];  //less or equal to 0.6
           int binphi = (int)(vvh*nsectors); //has values expect 0 and nsectors-1. So we use updist and downdist condition
           // check if the point influence also neighboring slabs
           double updist = (double)((binphi+1)*yaxis)*dx - vvh;
           double downdist = vvh - (double)(binphi*yaxis)*dx;
  // find the largest value in histo_send[]
  //                                                                  
           histo_send[binphi]++;
           if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1;};
           if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1;};
    }
  uint Nsec = histo_send[0];
  for (uint isector = 1; isector < nsectors; isector++)
    Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec );

    sectorarray = (long**)malloc ((nsectors+1) * sizeof(long*));
    for(int sec=0; sec<(nsectors+1); sec++)
    {
      	   sectorarray[sec] = (long*)malloc(histo_send[sec]*sizeof(long));
    }
  uint      Nweightss  = Nsec*metaData.polarisations;
  uint      Nvissec    = Nweightss*metaData.freq_per_chan;

    long *counter = (long*) calloc(nsectors+1,sizeof(long));
    for (long iphi = 0; iphi < metaData.Nmeasures; iphi++)
    {
           vvh = data.vv[iphi];
           int binphi = (int)(vvh*nsectors);
           double updist = (double)((binphi+1)*yaxis)*dx - vvh;
           double downdist = vvh - (double)(binphi*yaxis)*dx;
           sectorarray[binphi][counter[binphi]] = iphi;
           counter[binphi]++;
           if(updist < w_supporth && updist >= 0.0) { sectorarray[binphi+1][counter[binphi+1]] = iphi; counter[binphi+1]++;};
           if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) { sectorarray[binphi-1][counter[binphi-1]] = iphi; counter[binphi-1]++;};
    }
     
    
   #ifdef PIPPO
        long iiii = 0;
        for (int j=0; j<nsectors; j++)
        {
                iiii = 0;
                for(long iphi = histo_send[j]-1; iphi>=0; iphi--)
                {
                      printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],sectorarray[j][iphi]);
                      iiii++;
                }
        }
   #endif

    #ifdef VERBOSE
        for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
    #endif
}

void gridding_data(){
  // allocate sector arrays
  // note: we use the largest allocation among all sectors          

  double shift = (double)(dx*yaxis);
  double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
                                            (Nvissec*2+Nweightss)*sizeof(float_t) );

 #ifndef USE_MPI
  file.pFile1 = fopen (out.outfile1,"w");
 #endif
  if ( memory == NULL )
    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);

  timing.kernel_time = 0.0;
  timing.kernel_time1 = 0.0;
  timing.reduce_time = 0.0;
  timing.reduce_time1 = 0.0;
  timing.compose_time = 0.0;
  timing.compose_time1 = 0.0; 
  double_t *uus        = (double*) memory;
  double_t *vvs        = (double*) uus+Nsec;
  double_t *wws        = (double*) vvs+Nsec;
  float_t  *weightss   = (float_t*)(wws+Nsec);
  float_t  *visreals   = weightss + Nweightss;
  float_t  *visimgs    = visreals + Nvissec;

  // calculate the resolution in radians
  resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));

  // calculate the resolution in arcsec 
  double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
  if ( rank == 0 )
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);

    //Initialize nccl
 #ifdef NCCL_REDUCE

  double * grid_gpu, *gridss_gpu;
  int local_rank = 0;
@@ -170,87 +120,80 @@ void gridding_data(){
  hipStreamCreate(&stream_reduce);
  
  ncclCommInitRank(&comm, size, id, rank);
 #endif

  for (long isector = 0; isector < nsectors; isector++)
  for (uint isector = 0; isector < nsectors; isector++)
    {
      clock_gettime(CLOCK_MONOTONIC, &begink);
      startk = clock();
      // define local destination sector
      //isector = (isector_count+rank)%size;  // this line must be wrong! [LT]

      // allocate sector arrays 
      long    Nsec       = histo_send[isector];
      double *uus        = (double*) malloc(Nsec*sizeof(double));
      double *vvs        = (double*) malloc(Nsec*sizeof(double));
      double *wws        = (double*) malloc(Nsec*sizeof(double));
      long    Nweightss  = Nsec*metaData.polarisations;
      long    Nvissec    = Nweightss*metaData.freq_per_chan;
      float *weightss    = (float*) malloc(Nweightss*sizeof(float));
      float *visreals    = (float*) malloc(Nvissec*sizeof(float));
      float *visimgs     = (float*) malloc(Nvissec*sizeof(float));
      double start = CPU_TIME_wt;
      
      // select data for this sector
      long icount = 0;
      long ip = 0;
      long inu = 0;
      uint icount = 0;
      uint ip = 0;
      uint inu = 0;

      for(long iphi = histo_send[isector]-1; iphi>=0; iphi--)
      #warning "this loop should be threaded"
      #warning "the counter of this loop should not be int"
      for(int iphi = histo_send[isector]-1; iphi>=0; iphi--)
        {
	  long ilocal = sectorarray[isector][iphi];
	  //double vvh = data.vv[ilocal];
	  //int binphi = (int)(vvh*nsectors);
	  //if (binphi == isector || boundary[ilocal] == isector) {

	  uint ilocal = sectorarray[isector][iphi];

	  uus[icount] = data.uu[ilocal];
	  vvs[icount] = data.vv[ilocal]-isector*shift;
	  wws[icount] = data.ww[ilocal];
	  for (long ipol=0; ipol<metaData.polarisations; ipol++)
	  for (uint ipol=0; ipol<metaData.polarisations; ipol++)
	    {
	      weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
	      ip++;
	    }
	  for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
	  for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
	    {
	      visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
	      visimgs[inu]  = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
	      //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis);

	      inu++;
	    }
	  icount++;
	}

      clock_gettime(CLOCK_MONOTONIC, &finishk);
      endk = clock();
      timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
      timing.compose_time1 += (finishk.tv_sec - begink.tv_sec);
      timing.compose_time1 += (finishk.tv_sec - begink.tv_sec);
      timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
      
     #ifndef USE_MPI
      double uumin = 1e20;
      double vvmin = 1e20;
      double uumax = -1e20;
      double vvmax = -1e20;

      for (long ipart=0; ipart<Nsec; ipart++)
      #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads)
      {
	  uumin = MIN(uumin,uus[ipart]);
	  uumax = MAX(uumax,uus[ipart]);
	  vvmin = MIN(vvmin,vvs[ipart]);
	  vvmax = MAX(vvmax,vvs[ipart]);
        double my_uumin = 1e20;
        double my_vvmin = 1e20;
        double my_uumax = -1e20;
        double my_vvmax = -1e20;

	  if(ipart%10 == 0)fprintf (file.pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
       #pragma omp for
        for (uint ipart=0; ipart<Nsec; ipart++)
          {
            my_uumin = MIN(my_uumin, uus[ipart]);
            my_uumax = MAX(my_uumax, uus[ipart]);
            my_vvmin = MIN(my_vvmin, vvs[ipart]);
            my_vvmax = MAX(my_vvmax, vvs[ipart]);
          }

      printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
     #endif
        uumin = MIN( uumin, my_uumin );
        uumax = MAX( uumax, my_uumax );
        vvmin = MIN( vvmin, my_vvmin );
        vvmax = MAX( vvmax, my_vvmax );
      }

      timing_wt.compose += CPU_TIME_wt - start;

      //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
      
      // Make convolution on the grid

     #ifdef VERBOSE
      printf("Processing sector %ld\n",isector);
     #endif
      clock_gettime(CLOCK_MONOTONIC, &begink);
      startk = clock();

      start = CPU_TIME_wt;

     //We have to call different GPUs per MPI task!!! [GL]
      wstack(param.num_w_planes,
@@ -274,99 +217,47 @@ void gridding_data(){
      //Allocate memory on devices non-blocking for the host                                                                                   
      ///////////////////////////////////////////////////////

     #ifdef NCCL_REDUCE

      timing_wt.kernel += CPU_TIME_wt - start;
	    
      hipMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyHostToDevice, stream_reduce);
     #endif
      /* int z =0 ;
       * #pragma omp target map(to:test_i_gpu) map(from:z)
       * {
       *   int x; // only accessible from accelerator
       *     x = 2;
       *       z = x + test_i_gpu;
       *       }*/

      clock_gettime(CLOCK_MONOTONIC, &finishk);
      endk = clock();
      timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
      timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec);
      timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
      
     #ifdef VERBOSE
      printf("Processed sector %ld\n",isector);
     #endif
      
      
      clock_gettime(CLOCK_MONOTONIC, &begink);
      startk = clock();
      start = CPU_TIME_wt;

      //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
    
     #ifndef USE_MPI
      long stride = isector*2*xaxis*yaxis*num_w_planes;
      for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)
	gridtot[stride+iii] = gridss[iii];
     #endif
      if( size > 1 )
	{
     
	  // Write grid in the corresponding remote slab
     #ifdef USE_MPI
     
	  // int target_rank = (int)isector;    it implied that size >= nsectors
	  int target_rank = (int)(isector % size);
     #ifdef NCCL_REDUCE

	  hipStreamSynchronize(stream_reduce);
    
      ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble,
		 ncclSum, target_rank, comm, stream_reduce);
	  ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
	  hipStreamSynchronize(stream_reduce);
      //hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce);
      //hipStreamSynchronize(stream_reduce);
     #endif
      
     #ifdef REDUCE
      MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
     #endif //REDUCE
	  timing_wt.reduce += CPU_TIME_wt - start;

       //Let's use now the new implementation (ring in shmem and Ired inter-nodes)
     #endif //USE_MPI
	}
	  
       clock_gettime(CLOCK_MONOTONIC, &finishk);
       endk = clock();
       timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
       timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec);
       timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
	  // Go to next sector
       for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;

       // Deallocate all sector arrays
       free(uus);
       free(vvs);
       free(wws);
       free(weightss);
       free(visreals);
       free(visimgs);
      // End of loop over sector    
	memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );

    }

  //Copy data back from device to host (to be deleted in next steps)

 #ifdef NCCL_REDUCE
  free(memory);
  hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce);
 #endif
    #ifndef USE_MPI
        fclose(file.pFile1);
    #endif

  
    #ifdef USE_MPI
  MPI_Barrier(MPI_COMM_WORLD);
    #endif

    end = clock();
    clock_gettime(CLOCK_MONOTONIC, &finish);
    timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
    timing.process_time1 = (finish.tv_sec - begin.tv_sec);
    timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    clock_gettime(CLOCK_MONOTONIC, &begin);

   #ifdef NCCL_REDUCE
  hipStreamSynchronize(stream_reduce);
  hipFree(gridss_gpu);
  hipFree(grid_gpu);
@@ -374,8 +265,9 @@ void gridding_data(){
  hipStreamDestroy(stream_reduce);
  
  ncclCommDestroy(comm);
   #endif
}

  return;
  
}

#endif