Commit 1d708577 authored by Luca Tornatore's avatar Luca Tornatore
Browse files

finalized one-sided communication in shared-memory

parent a22b9970
Loading
Loading
Loading
Loading
+4 −1
Original line number Original line Diff line number Diff line
@@ -7,7 +7,9 @@ struct ip in;
struct op out, outparam;
struct op out, outparam;


struct meta metaData;
struct meta metaData;
struct time timing;
timing_t wt_timing = {0};
timing_t pr_timing   = {0};

struct parameter param;
struct parameter param;
struct fileData data;
struct fileData data;


@@ -16,6 +18,7 @@ char datapath[900];
int xaxis, yaxis;
int xaxis, yaxis;
int global_rank;
int global_rank;
int size;
int size;
int verbose_level = 0;
long nsectors;
long nsectors;
long startrow;
long startrow;
double resolution, dx, dw, w_supporth;
double resolution, dx, dw, w_supporth;
+76 −9
Original line number Original line Diff line number Diff line
@@ -92,13 +92,24 @@ extern struct meta
} metaData;
} metaData;




extern struct time
typedef struct {
{
  double setup;      // time spent in initialization, init()
   	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
  double init;       // time spent in initializing arrays
	double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1;
  double process;    // time spent in gridding;
	double writetime, writetime1;
  double mpi;        // time spent in mpi communications

  double fftw;       //
} timing;
  double kernel;     //
  double mmove;      //
  double reduce;     //
  double reduce_mpi; //
  double reduce_sh ; //
  double compose;    //
  double phase;      //
  double write;      //
  double total; } timing_t;

extern timing_t wt_timing;      // wall-clock timings
extern timing_t pr_timing;      // process CPU timing


extern struct parameter
extern struct parameter
{
{
@@ -127,12 +138,11 @@ extern char datapath[900];
extern int xaxis, yaxis;
extern int xaxis, yaxis;
extern int global_rank;
extern int global_rank;
extern int size;
extern int size;
extern int verbose_level;
extern long nsectors;
extern long nsectors;
extern long startrow;
extern long startrow;
extern double resolution, dx, dw, w_supporth;
extern double resolution, dx, dw, w_supporth;


extern clock_t start, end, start0, startk, endk;
extern struct timespec begin, finish, begin0, begink, finishk;
extern long * histo_send, size_of_grid;
extern long * histo_send, size_of_grid;
extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w;
extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w;


@@ -142,3 +152,60 @@ extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w;
#endif
#endif


extern long **sectorarray;
extern long **sectorarray;


#if defined(DEBUG)
#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) &&	\
				      ( ((t) ==-1 ) || ((T)==(t)) ) ) {	\
    printf(__VA_ARGS__); fflush(stdout); }

#else
#define dprintf(...)
#endif


#define CPU_TIME_wt ({ struct timespec myts; (clock_gettime( CLOCK_REALTIME, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);})
#define CPU_TIME_pr ({ struct timespec myts; (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);})
#define CPU_TIME_th ({ struct timespec myts; (clock_gettime( CLOCK_THREAD_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);})


#if defined(_OPENMP)
#define TAKE_TIME_START( T ) {			\
    wt_timing.T = CPU_TIME_wt;			\
    pr_timing.T = CPU_TIME_pr; }

#define TAKE_TIME_STOP( T ) {			\
    pr_timing.T = CPU_TIME_pr - pr_timing.T;	\
    wt_timing.T = CPU_TIME_wt - wt_timing.T; }

#define TAKE_TIME( Twt, Tpr ) { Twt = CPU_TIME_wt; Tpr = CPU_TIME_pr; }
#define ADD_TIME( T, Twt, Tpr ) {		\
    pr_timing.T += CPU_TIME_pr - Tpr;		\
    wt_timing.T += CPU_TIME_wt - Twt;		\
    Tpr = CPU_TIME_pr; Twt = CPU_TIME_wt; }

#else

#define TAKE_TIME_START( T ) wt_timing.T = CPU_TIME_wt

#define TAKE_TIME_STOP( T )  wt_timing.T = CPU_TIME_wt - wt_timing.T

#define TAKE_TIME( Twt, ... ) Twt = CPU_TIME_wt;
#define ADD_TIME( T, Twt, ... ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt;}

#endif

#define TAKE_TIMEwt_START( T) wt_timing.T = CPU_TIME_wt
#define TAKE_TIMEwt_STOP( T) wt_timing.T = CPU_TIME_wt - wt_timing.T
#define TAKE_TIMEwt( Twt ) Twt = CPU_TIME_wt;
#define ADD_TIMEwt( T, Twt ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt; }


#if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER)
#define PRAGMA_IVDEP _Pragma("GCC ivdep")
#else
#define PRAGMA_IVDEP _Pragma("ivdep")
#endif

#define STRINGIFY(a) #a
#define UNROLL(N) _Pragma(STRINGIFY(unroll(N)))
+5 −4
Original line number Original line Diff line number Diff line
ndatasets              1
ndatasets              1
Datapath1              /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_121MHz.pre-cal.binMS/ 
Datapath1              ../input/
Datapath2              /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/
Datapath2              ../input/
Datapath3              /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/
Datapath3              ../input/
num_threads            2
num_threads            2
w_support              7
w_support              7
grid_size_x            2048
grid_size_x            2048
@@ -24,3 +24,4 @@ fftfile3 fft_img.bin
logfile                run.log
logfile                run.log
extension              .txt
extension              .txt
timingfile             timings.dat 
timingfile             timings.dat 
verbose_level	       1
+209 −203
Original line number Original line Diff line number Diff line
@@ -2,13 +2,15 @@
#include "allvars.h"
#include "allvars.h"
#include "proto.h"
#include "proto.h"


void fftw_data(){
void fftw_data()
{
  
  
 #ifdef USE_FFTW
 #ifdef USE_FFTW
  // FFT transform the data (using distributed FFTW)
  // FFT transform the data (using distributed FFTW)
  if(global_rank == 0)printf("PERFORMING FFT\n");
  if(global_rank == 0)printf("PERFORMING FFT\n");
        clock_gettime(CLOCK_MONOTONIC, &begin);

        start = clock();
  TAKE_TIME_START(fftw);

  fftw_plan plan;
  fftw_plan plan;
  fftw_complex *fftwgrid;
  fftw_complex *fftwgrid;
  ptrdiff_t alloc_local, local_n0, local_0_start;
  ptrdiff_t alloc_local, local_n0, local_0_start;
@@ -64,21 +66,23 @@ void fftw_data(){
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Barrier(MPI_COMM_WORLD);
 #endif
 #endif
  
  
        end = clock();
  TAKE_TIME_STOP(fftw);
        clock_gettime(CLOCK_MONOTONIC, &finish);
        timing.fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC;
        timing.fftw_time1 = (finish.tv_sec - begin.tv_sec);
        timing.fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
        clock_gettime(CLOCK_MONOTONIC, &begin);


 #endif
 #endif
}
}


void write_fftw_data(){
void write_fftw_data(){


  // Write results
  
 #ifdef USE_FFTW
 #ifdef USE_FFTW
  
  double twt, tpr;
    
 #ifdef WRITE_DATA
 #ifdef WRITE_DATA
        // Write results

  TAKE_TIME(twt, tpr);
  
 #ifdef USE_MPI
 #ifdef USE_MPI
  MPI_Win writewin;
  MPI_Win writewin;
  MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin);
  MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin);
@@ -150,27 +154,28 @@ void write_fftw_data(){
  MPI_Win_free(&writewin);
  MPI_Win_free(&writewin);
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Barrier(MPI_COMM_WORLD);
 #endif
 #endif

  ADD_TIME(write, twt, tpr);
 #endif //WRITE_DATA
 #endif //WRITE_DATA
  
  
  
  
  // Phase correction  
  // Phase correction  


     clock_gettime(CLOCK_MONOTONIC, &begin);
  
     start = clock();

  TAKE_TIME_START(phase)
  if(global_rank == 0)printf("PHASE CORRECTION\n");
  if(global_rank == 0)printf("PHASE CORRECTION\n");
  double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));
  double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));
  double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));
  double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));
  
  
  phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads);
  phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads);


     end = clock();
  TAKE_TIME_STOP(phase);
     clock_gettime(CLOCK_MONOTONIC, &finish);
     timing.phase_time = ((double) (end - start)) / CLOCKS_PER_SEC;
     timing.phase_time1 = (finish.tv_sec - begin.tv_sec);
     timing.phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
  
  
 #ifdef WRITE_IMAGE
 #ifdef WRITE_IMAGE


  TAKE_TIME(twt, tpr);
  
  if(global_rank == 0)
  if(global_rank == 0)
    {
    {
      file.pFilereal = fopen (out.fftfile2,"wb");
      file.pFilereal = fopen (out.fftfile2,"wb");
@@ -208,6 +213,7 @@ void write_fftw_data(){
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Barrier(MPI_COMM_WORLD);
 #endif
 #endif


  ADD_TIME(write, twt, tpr);
 #endif //WRITE_IMAGE
 #endif //WRITE_IMAGE
  
  
 #endif  //FFTW
 #endif  //FFTW
+404 −355
Original line number Original line Diff line number Diff line
@@ -4,35 +4,35 @@
#include "proto.h"
#include "proto.h"




void gridding(){
void gridding()
{
  
  
  if(global_rank == 0)printf("GRIDDING DATA\n");
  if(global_rank == 0)printf("GRIDDING DATA\n");
  
  
  // Create histograms and linked lists
  // Create histograms and linked lists


    clock_gettime(CLOCK_MONOTONIC, &begin);
  TAKE_TIME_START(init);
    start = clock();
  
  
  // Initialize linked list
  // Initialize linked list
  initialize_array();
  initialize_array();


  TAKE_TIME_STOP(init);
  TAKE_TIME_START(process);

  // Sector and Gridding data
  // Sector and Gridding data
  gridding_data();
  gridding_data();


  TAKE_TIME_STOP(process);
  
 #ifdef USE_MPI
 #ifdef USE_MPI
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Barrier(MPI_COMM_WORLD);
 #endif
 #endif


    end = clock();
  return;
    clock_gettime(CLOCK_MONOTONIC, &finish);
    timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
    timing.process_time1 = (finish.tv_sec - begin.tv_sec);
    timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    clock_gettime(CLOCK_MONOTONIC, &begin);

}
}


void initialize_array(){
void initialize_array()
{


  histo_send     = (long*) calloc(nsectors+1,sizeof(long));
  histo_send     = (long*) calloc(nsectors+1,sizeof(long));
  int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int));
  int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int));
@@ -79,13 +79,16 @@ void initialize_array(){
 #ifdef VERBOSE
 #ifdef VERBOSE
  for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]);
  for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]);
 #endif
 #endif

  free( boundary);
  return;
}
}


void gridding_data(){
void gridding_data()
{


  double shift = (double)(dx*yaxis);
  double shift = (double)(dx*yaxis);
  
  
    // Open the MPI Memory Window for the slab
 #ifdef USE_MPI
 #ifdef USE_MPI
  MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
  MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
  MPI_Win_fence(0,slabwin);
  MPI_Win_fence(0,slabwin);
@@ -95,24 +98,20 @@ void gridding_data(){
  file.pFile1 = fopen (out.outfile1,"w");
  file.pFile1 = fopen (out.outfile1,"w");
 #endif
 #endif
  
  
    timing.kernel_time = 0.0;
    timing.kernel_time1 = 0.0;
    timing.reduce_time = 0.0;
    timing.reduce_time1 = 0.0;
    timing.compose_time = 0.0;
    timing.compose_time1 = 0.0; 

  // calculate the resolution in radians
  // calculate the resolution in radians
  resolution = 1.0/MAX(abs(metaData.uvmin),abs(metaData.uvmax));
  resolution = 1.0/MAX(abs(metaData.uvmin),abs(metaData.uvmax));
  
  
  // calculate the resolution in arcsec 
  // calculate the resolution in arcsec 
  double resolution_asec = (3600.0*180.0)/MAX(abs(metaData.uvmin),abs(metaData.uvmax))/PI;
  double resolution_asec = (3600.0*180.0)/MAX(abs(metaData.uvmin),abs(metaData.uvmax))/PI;
  if( global_rank == 0 )
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);


  for (long isector = 0; isector < nsectors; isector++)
  for (long isector = 0; isector < nsectors; isector++)
    {
    {
        clock_gettime(CLOCK_MONOTONIC, &begink);
      double twt, tpr;
        startk = clock();
      
      TAKE_TIME(twt, tpr);

      // define local destination sector
      // define local destination sector
      //isector = (isector_count+rank)%size;  // this line must be wrong! [LT]
      //isector = (isector_count+rank)%size;  // this line must be wrong! [LT]
      
      
@@ -132,6 +131,7 @@ void gridding_data(){
      long ip = 0;
      long ip = 0;
      long inu = 0;
      long inu = 0;


     #warning shall we omp-ize this ?
      for(long iphi = histo_send[isector]-1; iphi>=0; iphi--)
      for(long iphi = histo_send[isector]-1; iphi>=0; iphi--)
        {
        {
	  long ilocal = sectorarray[isector][iphi];
	  long ilocal = sectorarray[isector][iphi];
@@ -141,33 +141,32 @@ void gridding_data(){
	  uus[icount] = data.uu[ilocal];
	  uus[icount] = data.uu[ilocal];
	  vvs[icount] = data.vv[ilocal]-isector*shift;
	  vvs[icount] = data.vv[ilocal]-isector*shift;
	  wws[icount] = data.ww[ilocal];	  
	  wws[icount] = data.ww[ilocal];	  
              for (long ipol=0; ipol<metaData.polarisations; ipol++)
	  UNROLL(4)
          PRAGMA_IVDEP
	  for (long ipol=0; ipol<metaData.polarisations; ipol++, ip++)
	    {
	    {
	      weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
	      weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
		  ip++;
	    }
	    }
              for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
	  
	  PRAGMA_IVDEP
	  UNROLL(4)
	    for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++, inu++)
	      {
	      {
		visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
		visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
		visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
		visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
	      //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis);
	      //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis);
                     inu++;
	    }
	    }
	  icount++;
	  icount++;
	}
	}


         clock_gettime(CLOCK_MONOTONIC, &finishk);
      ADD_TIME(compose, twt, tpr);
         endk = clock();
         timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
         timing.compose_time1 += (finishk.tv_sec - begink.tv_sec);
         timing.compose_time1 += (finishk.tv_sec - begink.tv_sec);
         timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;


     #ifndef USE_MPI
     #ifndef USE_MPI
      double vvmin = 1e20;
      double vvmin = 1e20;
      double uumax = -1e20;
      double uumax = -1e20;
      double vvmax = -1e20;
      double vvmax = -1e20;


     #warning shall we omp-ize this ?
      for (long ipart=0; ipart<Nsec; ipart++)
      for (long ipart=0; ipart<Nsec; ipart++)
	{
	{
	  uumin = MIN(uumin,uus[ipart]);
	  uumin = MIN(uumin,uus[ipart]);
@@ -186,8 +185,7 @@ void gridding_data(){
     #ifdef VERBOSE
     #ifdef VERBOSE
      printf("Processing sector %ld\n",isector);
      printf("Processing sector %ld\n",isector);
     #endif
     #endif
       clock_gettime(CLOCK_MONOTONIC, &begink);
      TAKE_TIME(twt, tpr);
       startk = clock();


      wstack(param.num_w_planes,
      wstack(param.num_w_planes,
	     Nsec,
	     Nsec,
@@ -207,6 +205,8 @@ void gridding_data(){
	     gridss,
	     gridss,
	     param.num_threads);
	     param.num_threads);


      ADD_TIME(kernel, twt, tpr);
      
      /* int z =0 ;
      /* int z =0 ;
       * #pragma omp target map(to:test_i_gpu) map(from:z)
       * #pragma omp target map(to:test_i_gpu) map(from:z)
       * {
       * {
@@ -215,42 +215,49 @@ void gridding_data(){
       *       z = x + test_i_gpu;
       *       z = x + test_i_gpu;
       *       }*/
       *       }*/


       clock_gettime(CLOCK_MONOTONIC, &finishk);
       endk = clock();
       timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
       timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec);
       timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
     #ifdef VERBOSE
     #ifdef VERBOSE
      printf("Processed sector %ld\n",isector);
      printf("Processed sector %ld\n",isector);
     #endif
     #endif
       clock_gettime(CLOCK_MONOTONIC, &begink);
      /* ----------------
       startk = clock();
       * REDUCE
       * ---------------- */


       //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
      double twt_r, tpr_r;
      TAKE_TIME(twt_r, tpr_r);


       #ifndef USE_MPI
                                                     // ..................
       long stride = isector*2*xaxis*yaxis*num_w_planes;
     #ifndef USE_MPI                                 // REDUCE WITH NO MPI                
       for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)
      
      #pragma omp parallel
      {
	long stride = isector * size_of_grid;
       #pragma omp for
	for (long iii=0; iii< size_fo_grid; iii++)
	  gridtot[stride+iii] = gridss[iii];
	  gridtot[stride+iii] = gridss[iii];
       #endif
      }

                                                     // ..................
                                                     // REDUCE WITH MPI
     #else


      // Write grid in the corresponding remote slab      
      // Write grid in the corresponding remote slab      
      #ifdef USE_MPI
      // int target_rank = (int)isector;    it implied that size >= nsectors
      // int target_rank = (int)isector;    it implied that size >= nsectors
      int target_rank = (int)(isector % size);
      int target_rank = (int)(isector % size);
       
       
     #ifdef ONE_SIDE
     #ifdef ONE_SIDE


       //   MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
       // MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
       // MPI_Win_unlock(target_rank,slabwin);
       
      // for every task, gridss coincides with the 
      // for every task, gridss coincides with the 
      // that can be avoided if shared window coincides with gridss
      // that can be avoided if shared window coincides with gridss

      TAKE_TIME(twt, tpr);
      memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double));
      memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double));
      ADD_TIME(mmove, twt, tpr);
 
 
      dprintf(1, global_rank, 0, "reducing sector %ld..\n", isector);


      TAKE_TIME( twt, tpr);
      reduce( isector, target_rank );  // here the reduce is performed within every host 
      reduce( isector, target_rank );  // here the reduce is performed within every host 
      ADD_TIME(reduce_sh, twt, tpr);


      if ( Me.Nhosts > 1 )
      if ( Me.Nhosts > 1 )
	{
	{
@@ -272,6 +279,7 @@ void gridding_data(){
	   
	   
	  if( Sector_Comm != MPI_COMM_NULL )
	  if( Sector_Comm != MPI_COMM_NULL )
	    {
	    {
	      double _twt_;
	      int sector_size;
	      int sector_size;
	      int sector_rank = 0;
	      int sector_rank = 0;
	      int sector_target;
	      int sector_target;
@@ -281,7 +289,10 @@ void gridding_data(){
	      if ( global_rank == target_rank)
	      if ( global_rank == target_rank)
		{
		{
		  MPI_Send( &sector_rank, 1, MPI_INT, 0, 0, Sector_Comm);
		  MPI_Send( &sector_rank, 1, MPI_INT, 0, 0, Sector_Comm);
		   memcpy(grid, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), size_of_grid * sizeof(double));    
		  TAKE_TIMEwt( _twt_ );
		  memcpy(gridss, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double),
			 size_of_grid * sizeof(double));
		  ADD_TIMEwt( mmove, _twt_);
		}
		}
	       
	       
	      if( sector_rank == 0 )
	      if( sector_rank == 0 )
@@ -290,29 +301,35 @@ void gridding_data(){
		  MPI_Recv( &sector_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status);
		  MPI_Recv( &sector_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status);
		}
		}


	      TAKE_TIMEwt(_twt_);
	      MPI_Bcast( &sector_target, 1, MPI_INT, 0, Sector_Comm );
	      MPI_Bcast( &sector_target, 1, MPI_INT, 0, Sector_Comm );
	       
	       
	       MPI_Reduce(grid, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm);
	      MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm);
	      
	      
	      MPI_Comm_free( &Sector_Comm );
	      MPI_Comm_free( &Sector_Comm );
	      ADD_TIMEwt(mpi, _twt_);
	    }
	    }
	}
	}
      ADD_TIME(reduce_mpi, twt, tpr);


       
       
     #else   // relates to #ifdef ONE_SIDE
     #else   // relates to #ifdef ONE_SIDE
      
      
       
      {
	double _twt_;
	TAKE_TIMEwt(_twt_);
	MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
	MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
	ADD_TIMEwt(mpi, _twt_);
      }
      
     #endif  //  closes #ifdef ONE_SIDE
     #endif  //  closes #ifdef ONE_SIDE
     #endif  //  closes USE_MPI
     #endif  //  closes USE_MPI


       clock_gettime(CLOCK_MONOTONIC, &finishk);
      ADD_TIME(reduce, twt_r, tpr_r);
       endk = clock();

       timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
      
       timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec);
      // wipe before getting to the next sector
       timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
      memset((void*)gridss, 0, size_of_grid * sizeof(double));
       // Go to next sector
       for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;


      // Deallocate all sector arrays
      // Deallocate all sector arrays
      free(uus);
      free(uus);
@@ -324,25 +341,18 @@ void gridding_data(){
      // End of loop over sector    
      // End of loop over sector    
    }
    }


    // Finalize MPI communication

    #ifdef USE_MPI
  free( histo_send );
       MPI_Win_fence(0,slabwin);
    #endif  


 #ifndef USE_MPI
 #ifndef USE_MPI
  fclose(file.pFile1);
  fclose(file.pFile1);
 #endif
 #endif


 #ifdef USE_MPI
 #ifdef USE_MPI
  MPI_Win_fence(0,slabwin);
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Barrier(MPI_COMM_WORLD);
 #endif
 #endif
  
  
    end = clock();
    clock_gettime(CLOCK_MONOTONIC, &finish);
    timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
    timing.process_time1 = (finish.tv_sec - begin.tv_sec);
    timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    clock_gettime(CLOCK_MONOTONIC, &begin);
}
}


void write_grided_data()
void write_grided_data()
@@ -355,6 +365,7 @@ void write_grided_data()
        printf("WRITING GRIDDED DATA\n");
        printf("WRITING GRIDDED DATA\n");
        file.pFilereal = fopen (out.outfile2,"wb");
        file.pFilereal = fopen (out.outfile2,"wb");
        file.pFileimg = fopen (out.outfile3,"wb");
        file.pFileimg = fopen (out.outfile3,"wb");
	
       #ifdef USE_MPI
       #ifdef USE_MPI
	for (int isector=0; isector<nsectors; isector++)
	for (int isector=0; isector<nsectors; isector++)
	  {
	  {
@@ -377,6 +388,7 @@ void write_grided_data()
			fseek(file.pFilereal, global_index, SEEK_SET);
			fseek(file.pFilereal, global_index, SEEK_SET);
			fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal);
			fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal);
		      }
		      }
		
		for (int iw=0; iw<param.num_w_planes; iw++)
		for (int iw=0; iw<param.num_w_planes; iw++)
		  for (int iv=0; iv<yaxis; iv++)
		  for (int iv=0; iv<yaxis; iv++)
		    for (int iu=0; iu<xaxis; iu++)
		    for (int iu=0; iu<xaxis; iu++)
@@ -424,7 +436,6 @@ void write_grided_data()
    #endif
    #endif
     
     
    #endif //WRITE_DATA      
    #endif //WRITE_DATA      

}
}




@@ -434,6 +445,7 @@ void reduce( int sector, int target_rank )
 {   
 {   
   
   
   int local_rank = Me.Rank[myHOST];
   int local_rank = Me.Rank[myHOST];
   int target_rank_on_myhost = -1;
   
   
   if( Me.Ranks_to_host[ target_rank ] == Me.myhost )
   if( Me.Ranks_to_host[ target_rank ] == Me.myhost )
     // exchange rank 0 with target rank
     // exchange rank 0 with target rank
@@ -442,16 +454,32 @@ void reduce( int sector, int target_rank )
     // every target rank
     // every target rank
     {
     {


       int r = 0;
       target_rank_on_myhost = 0;
       while( Me.Ranks_to_myhost[r] != target_rank )
       while( Me.Ranks_to_myhost[target_rank_on_myhost] != target_rank )
	 r++;
	 target_rank_on_myhost++;


       if( r > 0 )
       dprintf(2, Me.Rank[myHOST], 0,
	       "[SEC %d] swapping Host master with target rank %d (%d)\n",
	       sector, target_rank, target_rank_on_myhost);
       
       
       if( target_rank_on_myhost > 0 )
	 // the target is not the task that already has rank 0
	 // on my host
	 {
	 {
	   
	   if( local_rank == 0 )
	   if( local_rank == 0 )
	     local_rank = r;
	     local_rank = target_rank_on_myhost;
	   if( local_rank == r )
	   else if( local_rank == target_rank_on_myhost )
	     local_rank = 0;
	     local_rank = 0;

	   win_t temp = Me.swins[target_rank_on_myhost];
	   Me.swins[target_rank_on_myhost] = Me.swins[0];
	   Me.swins[0] = temp;

	   temp = Me.scwins[target_rank_on_myhost];
	   Me.scwins[target_rank_on_myhost] = Me.scwins[0];
	   Me.scwins[0] = temp;
	 }
	 }
     }
     }
   
   
@@ -470,6 +498,10 @@ void reduce( int sector, int target_rank )
       if( local_rank % threshold == 0)
       if( local_rank % threshold == 0)
         {
         {
	   int source = local_rank + (1<<l);
	   int source = local_rank + (1<<l);
	   dprintf(2, 0, 0,
		   "[SEC %d] task %d (%d) getting data from task %d at level %d\n", 
		   sector, local_rank, Me.Rank[myHOST], source, l );
	   
	   while( *(int*)(Me.scwins[source].ptr) < l )
	   while( *(int*)(Me.scwins[source].ptr) < l )
	     // sleep 5 usec if the source target is not ready
	     // sleep 5 usec if the source target is not ready
	     NSLEEP( 5000 );
	     NSLEEP( 5000 );
@@ -481,8 +513,25 @@ void reduce( int sector, int target_rank )
	   *(int*)(Me.win_ctrl.ptr) = l;
	   *(int*)(Me.win_ctrl.ptr) = l;
         }
         }
       else
       else
	 {
	   dprintf(2, 0, 0,
		   "[SEC %d] task %d (%d) signaling that level %d is done\n",
		   sector, local_rank, Me.Rank[myHOST], l );
	   
	   *(int*)(Me.win_ctrl.ptr) = l;
	   *(int*)(Me.win_ctrl.ptr) = l;
	 }
	 }
     }

   if ( target_rank_on_myhost > 0 )
     {
       win_t temp = Me.swins[target_rank_on_myhost];
       Me.swins[target_rank_on_myhost] = Me.swins[0];
       Me.swins[0] = temp;

       temp = Me.scwins[target_rank_on_myhost];
       Me.scwins[target_rank_on_myhost] = Me.scwins[0];
       Me.scwins[0] = temp;
     }
   
   
   return;
   return;
 }
 }
Loading