Commit 55c56cef authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

CPU OK

parent 59bb195b
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -2,11 +2,11 @@ ndatasets 1
Datapath1		/data/LOFAR_MERGE/hpc_imaging/data/newgauss2noconj_t201806301100_SBL180.binMS/
#Datapath2              /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/
#Datapath3              /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/
num_threads            1
num_threads            4
w_support              7
reduce_method 	       0
grid_size_x            1024
grid_size_y            1024
reduce_method 	       1
grid_size_x            4096
grid_size_y            4096
num_w_planes           8
ufile		       ucoord.bin
vfile    	       vcoord.bin
+0 −1
Original line number Diff line number Diff line
@@ -82,7 +82,6 @@ void initialize_array()

  histo_send = (uint*) calloc(nsectors+1, sizeof(uint));

  
  for (uint iphi = 0; iphi < metaData.Nmeasures; iphi++)
    {
      double vvh = data.vv[iphi];              //less or equal to 0.6
+2 −3
Original line number Diff line number Diff line


#include "allvars.h"
#include "proto.h"

@@ -10,7 +9,7 @@
 *
 */

#if !defined( NCCL_REDUCE )
#if !defined( NCCL_REDUCE ) && !defined( RCCL_REDUCE )

int reduce_ring (int);

+115 −115
Original line number Diff line number Diff line
@@ -12,9 +12,17 @@
 */


#if defined( NCCL_REDUCE_NVIDIA )
#if defined( NCCL_REDUCE )


#define NCCLCHECK(cmd) do {                         
ncclResult_t r = cmd;                             
if (r!= ncclSuccess) {                            
  printf("Failed, NCCL error %s:%d '%s'\n",       
	 __FILE__,__LINE__,ncclGetErrorString(r));   
  exit(EXIT_FAILURE);                             
 }                                                 
} while(0)
    

static uint64_t getHostHash(const char* string) {
@@ -44,12 +52,11 @@ void gridding_data(){

  double shift = (double)(dx*yaxis);

  timing.kernel_time = 0.0;
  timing.kernel_time1 = 0.0;
  timing.reduce_time = 0.0;
  timing.reduce_time1 = 0.0;
  timing.compose_time = 0.0;
  timing.compose_time1 = 0.0; 
  timing_wt.kernel     = 0.0;
  timing_wt.reduce     = 0.0;
  timing_wt.reduce_mpi = 0.0;
  timing_wt.reduce_sh  = 0.0;
  timing_wt.compose    = 0.0;
  
  // calculate the resolution in radians
  resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));
@@ -60,7 +67,6 @@ void gridding_data(){
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);

  //Initialize nccl
 #ifdef NCCL_REDUCE

  double * grid_gpu, *gridss_gpu;
  int local_rank = 0;
@@ -89,46 +95,59 @@ void gridding_data(){
  cudaStreamCreate(&stream_reduce);
  
  ncclCommInitRank(&comm, size, id, rank);
 #endif

  for (long isector = 0; isector < nsectors; isector++)
    {
      clock_gettime(CLOCK_MONOTONIC, &begink);
      startk = clock();
      // define local destination sector
      //isector = (isector_count+rank)%size;  // this line must be wrong! [LT]
  // find the largest value in histo_send[]
  //                                                                  
  uint Nsec = histo_send[0];
  for (uint isector = 1; isector < nsectors; isector++)
    Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec );

  uint      Nweightss  = Nsec*metaData.polarisations;
  uint      Nvissec    = Nweightss*metaData.freq_per_chan;

  // allocate sector arrays
      long    Nsec       = histo_send[isector];
      double *uus        = (double*) malloc(Nsec*sizeof(double));
      double *vvs        = (double*) malloc(Nsec*sizeof(double));
      double *wws        = (double*) malloc(Nsec*sizeof(double));
      long    Nweightss  = Nsec*metaData.polarisations;
      long    Nvissec    = Nweightss*metaData.freq_per_chan;
      float *weightss    = (float*) malloc(Nweightss*sizeof(float));
      float *visreals    = (float*) malloc(Nvissec*sizeof(float));
      float *visimgs     = (float*) malloc(Nvissec*sizeof(float));
  // note: we use the largest allocation among all sectors          

  double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
                                            (Nvissec*2+Nweightss)*sizeof(float_t) );

  if ( memory == NULL )
    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);

  double_t *uus        = (double*) memory;
  double_t *vvs        = (double*) uus+Nsec;
  double_t *wws        = (double*) vvs+Nsec;
  float_t  *weightss   = (float_t*)(wws+Nsec);
  float_t  *visreals   = weightss + Nweightss;
  float_t  *visimgs    = visreals + Nvissec;
  
  for (uint isector = 0; isector < nsectors; isector++)
    {

      double start = CPU_TIME_wt;
      
      // select data for this sector
      long icount = 0;
      long ip = 0;
      long inu = 0;
      uint icount = 0;
      uint ip = 0;
      uint inu = 0;

      for(long iphi = histo_send[isector]-1; iphi>=0; iphi--)
      #warning "this loop should be threaded"
      #warning "the counter of this loop should not be int"
      for(int iphi = histo_send[isector]-1; iphi>=0; iphi--)
        {
	  long ilocal = sectorarray[isector][iphi];
	  uint ilocal = sectorarray[isector][iphi];
	  //double vvh = data.vv[ilocal];
	  //int binphi = (int)(vvh*nsectors);
	  //if (binphi == isector || boundary[ilocal] == isector) {
	  uus[icount] = data.uu[ilocal];
	  vvs[icount] = data.vv[ilocal]-isector*shift;
	  wws[icount] = data.ww[ilocal];
	  for (long ipol=0; ipol<metaData.polarisations; ipol++)
	  for (uint ipol=0; ipol<metaData.polarisations; ipol++)
	    {
	      weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
	      ip++;
	    }
	  for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
	  for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
	    {
	      visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
	      visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
@@ -138,21 +157,44 @@ void gridding_data(){
	  icount++;
	}

      clock_gettime(CLOCK_MONOTONIC, &finishk);
      endk = clock();
      timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
      timing.compose_time1 += (finishk.tv_sec - begink.tv_sec);
      timing.compose_time1 += (finishk.tv_sec - begink.tv_sec);
      timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
      double uumin = 1e20;
      double vvmin = 1e20;
      double uumax = -1e20;
      double vvmax = -1e20;

      #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads)
      {
        double my_uumin = 1e20;
        double my_vvmin = 1e20;
        double my_uumax = -1e20;
        double my_vvmax = -1e20;

       #pragma omp for
        for (uint ipart=0; ipart<Nsec; ipart++)
          {
            my_uumin = MIN(my_uumin, uus[ipart]);
            my_uumax = MAX(my_uumax, uus[ipart]);
            my_vvmin = MIN(my_vvmin, vvs[ipart]);
            my_vvmax = MAX(my_vvmax, vvs[ipart]);
          }

        uumin = MIN( uumin, my_uumin );
        uumax = MAX( uumax, my_uumax );
        vvmin = MIN( vvmin, my_vvmin );
        vvmax = MAX( vvmax, my_vvmax );
      }

      timing_wt.compose += CPU_TIME_wt - start;

      printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
      
      // Make convolution on the grid

     #ifdef VERBOSE
      printf("Processing sector %ld\n",isector);
     #endif
      clock_gettime(CLOCK_MONOTONIC, &begink);
      startk = clock();

      start = CPU_TIME_wt;
      
     //We have to call different GPUs per MPI task!!! [GL]
      wstack(param.num_w_planes,
@@ -176,75 +218,40 @@ void gridding_data(){
      //Allocate memory on devices non-blocking for the host                                                                                   
      ///////////////////////////////////////////////////////

     #ifdef NCCL_REDUCE
      cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce);
     #endif
      /* int z =0 ;
       * #pragma omp target map(to:test_i_gpu) map(from:z)
       * {
       *   int x; // only accessible from accelerator
       *     x = 2;
       *       z = x + test_i_gpu;
       *       }*/

      clock_gettime(CLOCK_MONOTONIC, &finishk);
      endk = clock();
      timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
      timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec);
      timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;


      timing_wt.kernel += CPU_TIME_wt - start;
      
     #ifdef VERBOSE
      printf("Processed sector %ld\n",isector);
     #endif
      
      
      clock_gettime(CLOCK_MONOTONIC, &begink);
      startk = clock();

      //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
      start = CPU_TIME_wt;

     #ifndef USE_MPI
      long stride = isector*2*xaxis*yaxis*num_w_planes;
      for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)
	gridtot[stride+iii] = gridss[iii];
     #endif
      if( size > 1 )
	{
     
	  // Write grid in the corresponding remote slab
     #ifdef USE_MPI
     
	  // int target_rank = (int)isector;    it implied that size >= nsectors
	  int target_rank = (int)(isector % size);
     #ifdef NCCL_REDUCE

	  cudaStreamSynchronize(stream_reduce);
    
      ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble,
		 ncclSum, target_rank, comm, stream_reduce);
	  NCCLCHECK(ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce));
	  cudaStreamSynchronize(stream_reduce);
      //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
      //cudaStreamSynchronize(stream_reduce);
     #endif
      
     #ifdef REDUCE
      MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
     #endif //REDUCE
	  timing_wt.reduce += CPU_TIME_wt - start;

       //Let's use now the new implementation (ring in shmem and Ired inter-nodes)
     #endif //USE_MPI
	}
	  
       clock_gettime(CLOCK_MONOTONIC, &finishk);
       endk = clock();
       timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
       timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec);
       timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
	  // Go to next sector
       for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;

       // Deallocate all sector arrays
       free(uus);
       free(vvs);
       free(wws);
       free(weightss);
       free(visreals);
       free(visimgs);
      // End of loop over sector    
	memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );

	free(memory);

    }

  //Copy data back from device to host (to be deleted in next steps)
@@ -254,15 +261,6 @@ void gridding_data(){
  MPI_Barrier(MPI_COMM_WORLD);


  end = clock();
  clock_gettime(CLOCK_MONOTONIC, &finish);
  timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
  timing.process_time1 = (finish.tv_sec - begin.tv_sec);
  timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
  clock_gettime(CLOCK_MONOTONIC, &begin);



  cudaFree(gridss_gpu);
  cudaFree(grid_gpu);
  
@@ -270,6 +268,8 @@ void gridding_data(){
  
  ncclCommDestroy(comm);

  return;
  
}

#endif
+1 −1
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ void readMetaData(char fileLocal[1000]);
void metaData_calculation();
void allocate_memory();
void readData();
void shutdown( int, char *, char *, int);
void shutdown_wstacking( int, char *, char *, int);


#ifdef __cplusplus
+2 −2

File changed.

Contains only whitespace changes.

Loading