Commit ea94cf82 authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

Memory bug fixing

parent 011d30c6
Loading
Loading
Loading
Loading
+21 −30
Original line number Diff line number Diff line
@@ -84,17 +84,15 @@ void gridding_data()

  // find the largest value in histo_send[]
  //
  uint Nsec = histo_send[0];
  for (uint isector = 1; isector < nsectors; isector++)
    Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec );
  
  for (uint isector = 0; isector < nsectors; isector++)
    {

      double start = CPU_TIME_wt;

      uint Nsec            = histo_send[isector];
      uint Nweightss       = Nsec*metaData.polarisations;
      uint Nvissec         = Nweightss*metaData.freq_per_chan;
  
  // allocate sector arrays
  // note: we use the largest allocation among all sectors
  //
  unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t);
      double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
						(Nvissec*2+Nweightss)*sizeof(float_t) );

@@ -109,12 +107,6 @@ void gridding_data()
      float_t  *visimgs    = (float_t*)visreals + Nvissec;
  
      
  for (uint isector = 0; isector < nsectors; isector++)
    {

      double start = CPU_TIME_wt;

      memset( memory, 0, mem_size );
      
      // select data for this sector
      uint icount = 0;
@@ -183,14 +175,14 @@ void gridding_data()
      printf("Processing sector %ld\n",isector);
     #endif

      start = CPU_TIME_wt;

      double *stacking_target_array;
      if ( size > 1 )
	stacking_target_array = gridss;
      else
	stacking_target_array = grid;

      start = CPU_TIME_wt;
	    
     //We have to call different GPUs per MPI task!!! [GL]
      wstack(param.num_w_planes,
	     Nsec,
@@ -259,9 +251,8 @@ void gridding_data()
	  memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );	  
	}	

    }

      free(memory);
    }

  if ( size > 1 )
    {
+24 −31
Original line number Diff line number Diff line
@@ -68,30 +68,6 @@ void gridding_data(){

  // find the largest value in histo_send[]
  //                                                                  
  uint Nsec = histo_send[0];
  for (uint isector = 1; isector < nsectors; isector++)
    Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec );

  uint      Nweightss  = Nsec*metaData.polarisations;
  uint      Nvissec    = Nweightss*metaData.freq_per_chan;

  // allocate sector arrays
  // note: we use the largest allocation among all sectors
  //
  unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t);
  double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
                                            (Nvissec*2+Nweightss)*sizeof(float_t) );

  if ( memory == NULL )
    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);

  double_t *uus        = (double*) memory;
  double_t *vvs        = (double*) uus+Nsec;
  double_t *wws        = (double*) vvs+Nsec;
  float_t  *weightss   = (float_t*)(wws+Nsec);
  float_t  *visreals   = weightss + Nweightss;
  float_t  *visimgs    = visreals + Nvissec;

 
  
    //Initialize nccl
@@ -129,7 +105,23 @@ void gridding_data(){

      double start = CPU_TIME_wt;

      memset( memory, 0, mem_size );
      uint Nsec            = histo_send[isector];
      uint Nweightss       = Nsec*metaData.polarisations;
      uint Nvissec         = Nweightss*metaData.freq_per_chan;
      double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
						(Nvissec*2+Nweightss)*sizeof(float_t) );

      if ( memory == NULL )
	shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
  
      double_t *uus        = (double_t*) memory;
      double_t *vvs        = (double_t*) uus+Nsec;
      double_t *wws        = (double_t*) vvs+Nsec;
      float_t  *weightss   = (float_t*)((double_t*)wws+Nsec);
      float_t  *visreals   = (float_t*)weightss + Nweightss;
      float_t  *visimgs    = (float_t*)visreals + Nvissec;
  
      
      
      // select data for this sector
      uint icount = 0;
@@ -246,10 +238,10 @@ void gridding_data(){
	  // int target_rank = (int)isector;    it implied that size >= nsectors
	  int target_rank = (int)(isector % size);

	  start = CPU_TIME_wt;
	  
	  cudaStreamSynchronize(stream_reduce);

	  start = CPU_TIME_wt;

	  ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
	  cudaStreamSynchronize(stream_reduce);
      
@@ -260,11 +252,12 @@ void gridding_data(){
	  memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );	  
	}	

      free(memory);
    }

  //Copy data back from device to host (to be deleted in next steps)

  free(memory);
  
  cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
  
  MPI_Barrier(MPI_COMM_WORLD);
+22 −31
Original line number Diff line number Diff line
@@ -64,32 +64,6 @@ void gridding_data(){
  if ( rank == 0 )
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);

  // find the largest value in histo_send[]
  //                                                                  
  uint Nsec = histo_send[0];
  for (uint isector = 1; isector < nsectors; isector++)
    Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec );

  uint      Nweightss  = Nsec*metaData.polarisations;
  uint      Nvissec    = Nweightss*metaData.freq_per_chan;

  // allocate sector arrays
  // note: we use the largest allocation among all sectors
  //
  unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t);
  double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
                                            (Nvissec*2+Nweightss)*sizeof(float_t) );

  if ( memory == NULL )
    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);

  double_ty *uus        = (double*) memory;
  double_ty *vvs        = (double*) uus+Nsec;
  double_ty *wws        = (double*) vvs+Nsec;
  float_ty  *weightss   = (float_ty*)(wws+Nsec);
  float_ty  *visreals   = weightss + Nweightss;
  float_ty  *visimgs    = visreals + Nvissec;

 

    //Initialize nccl
@@ -127,7 +101,23 @@ void gridding_data(){

      double start = CPU_TIME_wt;

      memset( memory, 0, mem_size );
      uint Nsec            = histo_send[isector];
      uint Nweightss       = Nsec*metaData.polarisations;
      uint Nvissec         = Nweightss*metaData.freq_per_chan;
      double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
						(Nvissec*2+Nweightss)*sizeof(float_t) );

      if ( memory == NULL )
	shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
  
      double_t *uus        = (double_t*) memory;
      double_t *vvs        = (double_t*) uus+Nsec;
      double_t *wws        = (double_t*) vvs+Nsec;
      float_t  *weightss   = (float_t*)((double_t*)wws+Nsec);
      float_t  *visreals   = (float_t*)weightss + Nweightss;
      float_t  *visimgs    = (float_t*)visreals + Nvissec;
  
      
      
      // select data for this sector
      uint icount = 0;
@@ -244,10 +234,11 @@ void gridding_data(){
	  // int target_rank = (int)isector;    it implied that size >= nsectors
	  int target_rank = (int)(isector % size);
	  
	  start = CPU_TIME_wt;
	  	  
	  hipStreamSynchronize(stream_reduce);
    
          start = CPU_TIME_wt;

	  ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
	  hipStreamSynchronize(stream_reduce);
      
@@ -258,11 +249,11 @@ void gridding_data(){
	  memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );	  
	}	

      free(memory);
    }

  //Copy data back from device to host (to be deleted in next steps)

  free(memory);
  hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce);
  
  MPI_Barrier(MPI_COMM_WORLD);