#include <stdio.h>
#include <math.h>
#include "allvars.h"
#include "proto.h"


/*
 * Driver of the gridding phase: builds the per-sector index lists
 * (timed under "init"), then grids the data (timed under "process").
 * With USE_MPI all tasks synchronise on a barrier before returning.
 */
void gridding()
{
  if( global_rank == 0 )
    {
      printf("GRIDDING DATA\n");
    }

  // Phase 1: histograms and per-sector lists
  TAKE_TIME_START(init);
  initialize_array();
  TAKE_TIME_STOP(init);

  // Phase 2: sector-by-sector gridding
  TAKE_TIME_START(process);
  gridding_data();
  TAKE_TIME_STOP(process);

 #ifdef USE_MPI
  MPI_Barrier(MPI_COMM_WORLD);
 #endif
}

/*
 * Build the per-sector lists of measures.
 *
 * Every measure iphi is assigned to the sector binphi = (int)(vv*nsectors).
 * If it lies closer than w_supporth to the upper (resp. lower) edge of its
 * sector, it is additionally assigned to sector binphi+1 (resp. binphi-1,
 * only when binphi > 0), so that the neighbouring sector sees it too.
 *
 * Outputs (globals):
 *   histo_send[s]   number of measures assigned to sector s (nsectors+1 entries)
 *   sectorarray[s]  indices of those measures, histo_send[s] entries
 *
 * Two passes over the data are made: the first one counts, the second one
 * fills the lists. Both must apply exactly the same assignment rule,
 * otherwise the lists would overflow their allocation.
 *
 * NOTE(review): allocation results are not checked.
 */
void initialize_array()
{
  double vvh;

  histo_send = (long*) calloc(nsectors+1,sizeof(long));

  // Pass 1: count how many measures fall in (or next to) every sector
  for (long iphi = 0; iphi < metaData.Nmeasures; iphi++)
    {
      vvh = data.vv[iphi];
      int binphi = (int)(vvh*nsectors);
      // distances from the upper and lower edge of the sector
      double updist   = (double)((binphi+1)*yaxis)*dx - vvh;
      double downdist = vvh - (double)(binphi*yaxis)*dx;

      histo_send[binphi]++;
      if(updist < w_supporth && updist >= 0.0)
        histo_send[binphi+1]++;
      if(downdist < w_supporth && binphi > 0 && downdist >= 0.0)
        histo_send[binphi-1]++;
    }

  // One index list per sector, sized by the counts above
  sectorarray = (long**)malloc ((nsectors+1) * sizeof(long*));
  for(int sec=0; sec<(nsectors+1); sec++)
    {
      sectorarray[sec] = (long*)malloc(histo_send[sec]*sizeof(long));
    }

  // Pass 2: fill the lists; counter[s] is the next free slot of sector s
  long *counter = (long*) calloc(nsectors+1,sizeof(long));
  for (long iphi = 0; iphi < metaData.Nmeasures; iphi++)
    {
      vvh = data.vv[iphi];
      int binphi = (int)(vvh*nsectors);
      double updist   = (double)((binphi+1)*yaxis)*dx - vvh;
      double downdist = vvh - (double)(binphi*yaxis)*dx;

      sectorarray[binphi][counter[binphi]] = iphi;
      counter[binphi]++;
      if(updist < w_supporth && updist >= 0.0)
        {
          sectorarray[binphi+1][counter[binphi+1]] = iphi;
          counter[binphi+1]++;
        }
      if(downdist < w_supporth && binphi > 0 && downdist >= 0.0)
        {
          sectorarray[binphi-1][counter[binphi-1]] = iphi;
          counter[binphi-1]++;
        }
    }

 #ifdef VERBOSE
  for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]);
 #endif

  // the fill cursors are only needed while building the lists
  free(counter);
  return;
}

/*
 * Grid the visibilities one sector at a time and reduce every sector's grid.
 *
 * For each sector, the measures listed in sectorarray[isector] are copied
 * into contiguous temporary arrays (the v coordinate is shifted by
 * isector*dx*yaxis), wstack() is run on them and the resulting gridss is
 *   - copied into gridtot at offset isector*size_of_grid (no USE_MPI), or
 *   - summed over the tasks towards rank isector % size (USE_MPI).
 *     With ONE_SIDE the sum is first done inside each host by reduce() and,
 *     when there is more than one host, completed with an MPI_Reduce on a
 *     communicator made of the target rank plus the host masters of the
 *     other hosts.
 * gridss is zeroed before the next sector is processed.
 *
 * histo_send is released at the end.
 * NOTE(review): sectorarray and its rows are not released in this function;
 * confirm they are freed elsewhere.
 */
void gridding_data()
{

  double shift = (double)(dx*yaxis);

 #ifdef USE_MPI
  MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
  MPI_Win_fence(0,slabwin);
 #endif

 #ifndef USE_MPI
  file.pFile1 = fopen (out.outfile1,"w");
 #endif

  // largest |uv| extent; fabs() (not the integer abs()) so that a
  // floating point value is not truncated
  double uvabs_max = MAX(fabs(metaData.uvmin), fabs(metaData.uvmax));

  // calculate the resolution in radians
  resolution = 1.0/uvabs_max;

  // calculate the resolution in arcsec
  double resolution_asec = (3600.0*180.0)/uvabs_max/PI;
  if( global_rank == 0 )
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);

  for (long isector = 0; isector < nsectors; isector++)
    {
      double twt, tpr;

      TAKE_TIME(twt, tpr);

      // allocate sector arrays
      long    Nsec       = histo_send[isector];
      double *uus        = (double*) malloc(Nsec*sizeof(double));
      double *vvs        = (double*) malloc(Nsec*sizeof(double));
      double *wws        = (double*) malloc(Nsec*sizeof(double));
      long    Nweightss  = Nsec*metaData.polarisations;
      long    Nvissec    = Nweightss*metaData.freq_per_chan;
      float *weightss    = (float*) malloc(Nweightss*sizeof(float));
      float *visreals    = (float*) malloc(Nvissec*sizeof(float));
      float *visimgs     = (float*) malloc(Nvissec*sizeof(float));

      // select data for this sector
      // icount, ip, inu are the write cursors in the per-measure,
      // per-weight and per-visibility temporary arrays
      long icount = 0;
      long ip = 0;
      long inu = 0;

     #warning shall we omp-ize this ?
      // the sector list is walked from its last entry to its first
      for(long iphi = histo_send[isector]-1; iphi>=0; iphi--)
        {
          long ilocal = sectorarray[isector][iphi];

          uus[icount] = data.uu[ilocal];
          vvs[icount] = data.vv[ilocal]-isector*shift;
          wws[icount] = data.ww[ilocal];
          UNROLL(4)
          PRAGMA_IVDEP
          for (long ipol=0; ipol<metaData.polarisations; ipol++, ip++)
            {
              weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
            }

          PRAGMA_IVDEP
          UNROLL(4)
          for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++, inu++)
            {
              visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
              visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
            }
          icount++;
        }

      ADD_TIME(compose, twt, tpr);

     #ifndef USE_MPI
      double uumin = 1e20;
      double vvmin = 1e20;
      double uumax = -1e20;
      double vvmax = -1e20;

     #warning shall we omp-ize this ?
      for (long ipart=0; ipart<Nsec; ipart++)
        {
          uumin = MIN(uumin,uus[ipart]);
          uumax = MAX(uumax,uus[ipart]);
          vvmin = MIN(vvmin,vvs[ipart]);
          vvmax = MAX(vvmax,vvs[ipart]);

          // one measure out of ten is dumped, with the un-shifted v
          // NOTE(review): this writes to file.pFile while this function
          // opens and closes file.pFile1; confirm file.pFile is open here
          if(ipart%10 == 0)fprintf (file.pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
        }

      printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
     #endif

      // Make convolution on the grid

     #ifdef VERBOSE
      printf("Processing sector %ld\n",isector);
     #endif
      TAKE_TIME(twt, tpr);

      wstack(param.num_w_planes,
             Nsec,
             metaData.freq_per_chan,
             metaData.polarisations,
             uus,
             vvs,
             wws,
             visreals,
             visimgs,
             weightss,
             dx,
             dw,
             param.w_support,
             xaxis,
             yaxis,
             gridss,
             param.num_threads);

      ADD_TIME(kernel, twt, tpr);

     #ifdef VERBOSE
      printf("Processed sector %ld\n",isector);
     #endif
      /* ----------------
       * REDUCE
       * ---------------- */

      double twt_r, tpr_r;
      TAKE_TIME(twt_r, tpr_r);

                                                     // ..................
     #ifndef USE_MPI                                 // REDUCE WITH NO MPI

      #pragma omp parallel
      {
        long stride = isector * size_of_grid;
       #pragma omp for
        for (long iii=0; iii< size_of_grid; iii++)
          gridtot[stride+iii] = gridss[iii];
      }

                                                     // ..................
                                                     // REDUCE WITH MPI
     #else

      // Rank that receives the reduced grid of this sector.
      // The modulo keeps it below size also when nsectors > size
      // (size is presumably the number of MPI tasks).
      int target_rank = (int)(isector % size);

     #ifdef ONE_SIDE

      // copy gridss into this task's window, in the slot of this sector;
      // that can be avoided if shared window coincides with gridss.
      // The offset is expressed in doubles, as reduce() does.
      TAKE_TIME(twt, tpr);
      memcpy((double*)Me.win.ptr + isector*size_of_grid, gridss, size_of_grid*sizeof(double));
      ADD_TIME(mmove, twt, tpr);

      dprintf(1, global_rank, 0, "reducing sector %ld..\n", isector);

      TAKE_TIME( twt, tpr);
      reduce( isector, target_rank );  // here the reduce is performed within every host
      ADD_TIME(reduce_sh, twt, tpr);

      if ( Me.Nhosts > 1 )
        {
          // here the reduce is performed among hosts
          MPI_Barrier(MPI_COMM_WORLD);

          // The per-sector communicator contains the target rank and the
          // host masters of the hosts that do not hold the target rank
          int Im_in_the_new_communicator = MPI_UNDEFINED;
          if(global_rank == target_rank)
            Im_in_the_new_communicator = 1;
          else
            if( Me.Rank[HOSTS] == 0 )
              {
                if( Me.Ranks_to_host[ target_rank ] != Me.myhost )
                  Im_in_the_new_communicator = 1;
              }

          MPI_Comm Sector_Comm;
          MPI_Comm_split( COMM[WORLD], Im_in_the_new_communicator, global_rank, &Sector_Comm);

          if( Sector_Comm != MPI_COMM_NULL )
            {
              double _twt_;
              int sector_size;
              int sector_rank = 0;
              int sector_target = 0;
              int Im_target = ( global_rank == target_rank );

              MPI_Comm_size( Sector_Comm, &sector_size);
              MPI_Comm_rank( Sector_Comm, &sector_rank);

              // Rank 0 of Sector_Comm must learn the rank the target has in
              // Sector_Comm, to broadcast it as the root of the reduce.
              // When the target IS rank 0 no message is needed: a blocking
              // send to oneself completes only if the MPI library buffers it.
              if ( Im_target )
                {
                  if ( sector_rank == 0 )
                    sector_target = sector_rank;
                  else
                    MPI_Send( &sector_rank, 1, MPI_INT, 0, 0, Sector_Comm);

                  // bring the host-level result of this sector back in gridss
                  TAKE_TIMEwt( _twt_ );
                  memcpy(gridss, (double*)Me.swins[Me.Rank[myHOST]].ptr + isector*size_of_grid,
                         size_of_grid * sizeof(double));
                  ADD_TIMEwt( mmove, _twt_);
                }
              else if( sector_rank == 0 )
                {
                  MPI_Status status;
                  MPI_Recv( &sector_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status);
                }

              TAKE_TIMEwt(_twt_);
              MPI_Bcast( &sector_target, 1, MPI_INT, 0, Sector_Comm );

              MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm);

              MPI_Comm_free( &Sector_Comm );
              ADD_TIMEwt(mpi, _twt_);
            }
        }
      ADD_TIME(reduce_mpi, twt, tpr);


     #else   // relates to #ifdef ONE_SIDE

      {
        double _twt_;
        TAKE_TIMEwt(_twt_);
        MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
        ADD_TIMEwt(mpi, _twt_);
      }

     #endif  //  closes #ifdef ONE_SIDE
     #endif  //  closes USE_MPI

      ADD_TIME(reduce, twt_r, tpr_r);


      // wipe before getting to the next sector
      memset((void*)gridss, 0, size_of_grid * sizeof(double));

      // Deallocate all sector arrays
      free(uus);
      free(vvs);
      free(wws);
      free(weightss);
      free(visreals);
      free(visimgs);
      // End of loop over sector
    }


  free( histo_send );

 #ifndef USE_MPI
  fclose(file.pFile1);
 #endif

 #ifdef USE_MPI
  MPI_Win_fence(0,slabwin);
  MPI_Barrier(MPI_COMM_WORLD);
 #endif

}

/*
 * Write the gridded data to disk (only when compiled with WRITE_DATA).
 *
 * Task 0 writes the real parts to out.outfile2 and the imaginary parts to
 * out.outfile3, as raw doubles. gridss / gridtot hold interleaved
 * (real, imaginary) pairs.
 *
 *  - USE_MPI: for every sector the grid is fetched with MPI_Get from the
 *    slabwin window of rank isector, split into gridss_real / gridss_img and
 *    written, one w-plane at a time, at the position that the sector
 *    occupies in the global grid. At the end all tasks meet on a fence.
 *  - otherwise: gridtot is walked in (iw, iv, iu) order and written
 *    sequentially.
 *
 * If an output file cannot be opened nothing is written, but the closing
 * fence is still reached, so that the other tasks are not left waiting.
 */
void write_grided_data()
{

   #ifdef WRITE_DATA
     // Write results
     if (global_rank == 0)
     {
        printf("WRITING GRIDDED DATA\n");
        file.pFilereal = fopen (out.outfile2,"wb");
        file.pFileimg = fopen (out.outfile3,"wb");

        if (file.pFilereal == NULL || file.pFileimg == NULL)
          {
            fprintf(stderr, "write_grided_data: unable to open the output files, gridded data not written\n");
            if (file.pFilereal != NULL)
              fclose(file.pFilereal);
            if (file.pFileimg != NULL)
              fclose(file.pFileimg);
          }
        else
          {
           #ifdef USE_MPI
            // sizes in long, so that offsets do not overflow int on large grids
            long plane_local  = (long)xaxis * (long)yaxis;                            // cells of a sector per w-plane
            long plane_global = (long)param.grid_size_x * (long)param.grid_size_y;   // cells of the whole grid per w-plane

            for (int isector=0; isector<nsectors; isector++)
              {
                // NOTE(review): the window of rank isector is read, while
                // gridding_data() reduces sector isector onto rank
                // isector % size; the two agree only if nsectors <= size
                MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin);
                MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin);
                MPI_Win_unlock(isector,slabwin);

                // de-interleave real and imaginary parts
                for (long i=0; i<size_of_grid/2; i++)
                  {
                    gridss_real[i] = gridss[2*i];
                    gridss_img[i] = gridss[2*i+1];
                  }

                // Within a w-plane the xaxis*yaxis cells of a sector are
                // contiguous both in memory and in the file, so a whole
                // plane is written with one seek and one write. This puts
                // the same bytes at the same offsets as writing the cells
                // one by one.
                for (int iw=0; iw<param.num_w_planes; iw++)
                  {
                    long global_index = (plane_local*isector + iw*plane_global) * (long)sizeof(double);
                    long index = iw*plane_local;
                    fseek(file.pFilereal, global_index, SEEK_SET);
                    fwrite(&gridss_real[index], sizeof(double), plane_local, file.pFilereal);
                    fseek(file.pFileimg, global_index, SEEK_SET);
                    fwrite(&gridss_img[index], sizeof(double), plane_local, file.pFileimg);
                  }
              }
           #else
            for (int iw=0; iw<param.num_w_planes; iw++)
              for (int iv=0; iv<param.grid_size_y; iv++)
                for (int iu=0; iu<param.grid_size_x; iu++)
                  {
                    // position of the (real, imaginary) pair of this cell
                    long index = 2*((long)iu + (long)iv*param.grid_size_x + (long)iw*param.grid_size_x*param.grid_size_y);
                    fwrite(&gridtot[index], 1, sizeof(double), file.pFilereal);
                    fwrite(&gridtot[index+1], 1, sizeof(double), file.pFileimg);
                  }
           #endif
            fclose(file.pFilereal);
            fclose(file.pFileimg);
          }
     }

    #ifdef USE_MPI
     MPI_Win_fence(0,slabwin);
    #endif

    #endif //WRITE_DATA
}


// Sleep for T nanoseconds through nanosleep(); T is stored in tv_nsec with
// tv_sec = 0, so it must be smaller than one second.
// The do { } while (0) wrapper makes "NSLEEP(x);" a single statement, which
// is then safe also as the body of an un-braced if / else.
#define NSLEEP( T ) do { struct timespec tsleep={0, (T)}; nanosleep(&tsleep, NULL); } while (0)

/*
 * Sum, among the tasks of this host, the grid of one sector.
 *
 * sector       index of the sector; its data sit at offset
 *              sector*size_of_grid (in doubles) in every task's window
 * target_rank  rank that must end up holding the result
 *
 * The sum is a binary tree that ends on local rank 0: at level l every task
 * whose local rank is a multiple of 2^(l+1) adds to its own data those of
 * the task 2^l positions above. If target_rank lives on this host, it is
 * logically exchanged with local rank 0 (both the local_rank value and the
 * entries of Me.swins / Me.scwins), so that the result lands in the target's
 * window; the entries are swapped back before returning.
 *
 * Tasks synchronise through an integer control word: every task publishes in
 * its own word (Me.win_ctrl.ptr) the last level it has completed, and a task
 * waits until the word of its source (Me.scwins[source].ptr) has reached the
 * current level before reading the source's data.
 *
 * NOTE(review): the control words are accessed with plain, non-atomic loads
 * and stores, and a word is reset to -1 only when its owner enters this
 * function; confirm that callers keep the tasks in step between sectors.
 */
void reduce( int sector, int target_rank )
 {

   int local_rank = Me.Rank[myHOST];
   int target_rank_on_myhost = -1;

   if( Me.Ranks_to_host[ target_rank ] == Me.myhost )
     // exchange rank 0 with target rank
     // in this way the following log2 algorithm,
     // which reduces to rank 0, will work for
     // every target rank
     {

       target_rank_on_myhost = 0;
       while( Me.Ranks_to_myhost[target_rank_on_myhost] != target_rank )
         target_rank_on_myhost++;

       dprintf(2, Me.Rank[myHOST], 0,
               "[SEC %d] swapping Host master with target rank %d (%d)\n",
               sector, target_rank, target_rank_on_myhost);


       if( target_rank_on_myhost > 0 )
         // the target is not the task that already has rank 0
         // on my host
         {

           if( local_rank == 0 )
             local_rank = target_rank_on_myhost;
           else if( local_rank == target_rank_on_myhost )
             local_rank = 0;

           win_t temp = Me.swins[target_rank_on_myhost];
           Me.swins[target_rank_on_myhost] = Me.swins[0];
           Me.swins[0] = temp;

           temp = Me.scwins[target_rank_on_myhost];
           Me.scwins[target_rank_on_myhost] = Me.scwins[0];
           Me.scwins[0] = temp;
         }
     }


   // number of tree levels: smallest max_level >= 1 with 2^max_level >= Ntasks
   int max_level = 0;
   while( (1<< (++max_level) ) < Me.Ntasks[myHOST] );

   // no level completed yet
   *(int*)(Me.win_ctrl.ptr) = -1;

   double *my_base = ((double*)Me.win.ptr)+sector*size_of_grid;

   for(int l = 0; l < max_level; l++)
     {
       int threshold = 1 << (1+l);

       if( local_rank % threshold == 0)
         {
           int source = local_rank + (1<<l);

           // When the number of tasks is not a power of two some nodes of
           // the tree have no partner: there is nothing to add, and the
           // windows must not be indexed past the last task.
           if( source < Me.Ntasks[myHOST] )
             {
               dprintf(2, 0, 0,
                       "[SEC %d] task %d (%d) getting data from task %d at level %d\n",
                       sector, local_rank, Me.Rank[myHOST], source, l );

               while( *(int*)(Me.scwins[source].ptr) < l )
                 {
                   // sleep 5 usec if the source target is not ready
                   NSLEEP( 5000 );
                 }

               double *source_base = ((double*)Me.swins[source].ptr)+sector*size_of_grid;
               for(long j = 0; j < size_of_grid; j++)
                 *(my_base + j) += *(source_base + j);
             }

           // level l is done for this task, with or without a partner
           *(int*)(Me.win_ctrl.ptr) = l;
         }
       else
         {
           dprintf(2, 0, 0,
                   "[SEC %d] task %d (%d) signaling that level %d is done\n",
                   sector, local_rank, Me.Rank[myHOST], l );

           *(int*)(Me.win_ctrl.ptr) = l;
         }
     }

   // undo the exchange between local rank 0 and the target
   if ( target_rank_on_myhost > 0 )
     {
       win_t temp = Me.swins[target_rank_on_myhost];
       Me.swins[target_rank_on_myhost] = Me.swins[0];
       Me.swins[0] = temp;

       temp = Me.scwins[target_rank_on_myhost];
       Me.scwins[target_rank_on_myhost] = Me.scwins[0];
       Me.scwins[0] = temp;
     }

   return;
 }



