#include <stdio.h>
//#include "allvars.h"
#include "proto.h"
#include "numa_vars.h"


// Block partition of the grid; Nblocks/Bstart/Bsize are filled in by
// gridding_data() when RING is defined.
blocks_t blocks;
// Verbosity level; in this file it is referenced only by a commented-out
// numa_expose() call in gridding_data().
int verbose_level = 0;

// Ring reduction, defined outside this file. gridding_data() calls it with the
// target rank of the current sector when RING is defined; the returned status
// is currently stored but not checked.
int reduce_ring (int);

/*
 * Top-level driver of the gridding phase.
 *
 * Builds the per-sector index lists (initialize_array), grids every sector
 * (gridding_data), synchronises the ranks when USE_MPI is defined, and stores
 * the elapsed time of the whole phase in timing.process_time (CPU time from
 * clock()) and timing.process_time1 (wall time from CLOCK_MONOTONIC).
 * On return the global `begin` holds a fresh timestamp.
 */
void gridding(){

    if(rank == 0)printf("GRIDDING DATA\n");

    // Start both clocks. The global `begin` is still stamped because
    // gridding_data() reads it for its own end-of-function summary.
    clock_gettime(CLOCK_MONOTONIC, &begin);
    // gridding_data() re-stamps the global `begin` right before it returns, so
    // the wall-clock interval below must be measured from a private copy;
    // otherwise it would only cover the trailing barrier.
    const struct timespec wall_begin = begin;
    start = clock();

    // Create histograms and per-sector index lists
    initialize_array();

    // Sector and grid the data
    gridding_data();

    #ifdef USE_MPI
        MPI_Barrier(MPI_COMM_WORLD);
    #endif

    end = clock();
    clock_gettime(CLOCK_MONOTONIC, &finish);
    timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
    timing.process_time1 = (finish.tv_sec - wall_begin.tv_sec);
    timing.process_time1 += (finish.tv_nsec - wall_begin.tv_nsec) / 1000000000.0;
    // Restart the wall clock for whatever is timed next
    clock_gettime(CLOCK_MONOTONIC, &begin);

}

/*
 * Build the per-sector lists of measurement indices used by gridding_data().
 *
 * Two passes over the metaData.Nmeasures entries of data.vv:
 *   1. count how many measurements belong to each bin (histo_send, which has
 *      nsectors+1 entries);
 *   2. allocate sectorarray[sec] with histo_send[sec] slots and fill it with
 *      the measurement indices.
 * A measurement is assigned to bin (int)(vv*nsectors) and, when it lies closer
 * than w_supporth to the upper or lower edge of that bin, also to the
 * neighbouring bin on that side.
 *
 * histo_send and sectorarray are (re)allocated here and are not released by
 * this function.
 * NOTE(review): the indexing assumes 0 <= (int)(vv*nsectors) and
 * binphi+1 <= nsectors for every measurement -- confirm the range of data.vv.
 */
void initialize_array(){

    histo_send = (long*) calloc(nsectors+1,sizeof(long));

    // Pass 1: histogram of measurements per sector
    for (long iphi = 0; iphi < metaData.Nmeasures; iphi++)
    {
           double vvh = data.vv[iphi];
           int binphi = (int)(vvh*nsectors);
           // check whether the point also influences the neighbouring slabs
           double updist = (double)((binphi+1)*yaxis)*dx - vvh;
           double downdist = vvh - (double)(binphi*yaxis)*dx;
           //
           histo_send[binphi]++;
           if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++;};
           if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++;};
    }

    // One index list per bin, sized from the histogram
    sectorarray = (long**)malloc ((nsectors+1) * sizeof(long*));
    for(int sec=0; sec<(nsectors+1); sec++)
    {
      	   sectorarray[sec] = (long*)malloc(histo_send[sec]*sizeof(long));
    }

    // Pass 2: same classification as pass 1, now storing the indices.
    // counter[sec] is the next free slot of sectorarray[sec].
    long *counter = (long*) calloc(nsectors+1,sizeof(long));
    for (long iphi = 0; iphi < metaData.Nmeasures; iphi++)
    {
           double vvh = data.vv[iphi];
           int binphi = (int)(vvh*nsectors);
           double updist = (double)((binphi+1)*yaxis)*dx - vvh;
           double downdist = vvh - (double)(binphi*yaxis)*dx;
           sectorarray[binphi][counter[binphi]] = iphi;
           counter[binphi]++;
           if(updist < w_supporth && updist >= 0.0) { sectorarray[binphi+1][counter[binphi+1]] = iphi; counter[binphi+1]++;};
           if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) { sectorarray[binphi-1][counter[binphi-1]] = iphi; counter[binphi-1]++;};
    }
    // The fill cursors are only needed while building the lists
    free(counter);


   #ifdef PIPPO
        // Debug dump: every list entry, walked from the last slot to the first
        long iiii = 0;
        for (int j=0; j<nsectors; j++)
        {
                iiii = 0;
                for(long iphi = histo_send[j]-1; iphi>=0; iphi--)
                {
                      printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],sectorarray[j][iphi]);
                      iiii++;
                }
        }
   #endif

    #ifdef VERBOSE
        for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
    #endif
}

/*
 * Grid the visibilities one sector at a time and combine the per-sector grids.
 *
 * For every sector in [0, nsectors):
 *   1. compose: copy the measurements listed in sectorarray[isector]
 *      (histo_send[isector] of them) into temporary arrays, with the v
 *      coordinate shifted by isector*shift;
 *   2. kernel: call wstack() to grid them onto gridss;
 *   3. reduce: without USE_MPI copy gridss into its slice of gridtot; with
 *      USE_MPI combine gridss onto rank (isector % size) through the path
 *      selected at compile time (ONE_SIDE, REDUCE or RING);
 *   4. zero gridss and free the temporary arrays.
 *
 * The three steps are timed separately into timing.compose_time*,
 * timing.kernel_time* and timing.reduce_time* (plain name: CPU time from
 * clock(); "1" suffix: wall time from CLOCK_MONOTONIC). Before returning the
 * function also updates timing.process_time* from the globals start/begin and
 * re-stamps `begin`.
 */
void gridding_data(){

  double shift = (double)(dx*yaxis);

    // Open the MPI Memory Window for the slab
 #ifdef ONE_SIDE
  MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
  MPI_Win_fence(0,slabwin);
 #endif

 #ifdef RING
  // NOTE(review): the 1.1 factor presumably matches the size these windows
  // were allocated with elsewhere -- confirm.
  memset( (char*)Me.win.ptr, 0, size_of_grid*sizeof(double)*1.1);
  gridss = (double*)Me.win.ptr; //gridss must point to the right location [GL]

  memset( (char*)Me.fwin.ptr, 0, size_of_grid*sizeof(double)*1.1); //clear the memory for the results [GL]

  if( Me.Rank[myHOST] == 0 )
    {
      for( int tt = 1; tt < Me.Ntasks[myHOST]; tt++ )
        memset( (char*)Me.swins[tt].ptr, 0, size_of_grid*sizeof(double)*1.1);
    }

  //numa_expose(&Me, verbose_level); //Verbose level must be defined in order to know more details about the NUMA architecture [GL]

  MPI_Barrier(MPI_COMM_WORLD);

  if( Me.Rank[HOSTS] >= 0 )
    requests = (MPI_Request *)calloc( Me.Ntasks[WORLD], sizeof(MPI_Request) );

  // Reset the control words used by the ring reduction
  if( Me.Rank[myHOST] == 0 ) {
    *((int*)win_ctrl_hostmaster_ptr+CTRL_BARRIER_END) = 0;
    *((int*)win_ctrl_hostmaster_ptr+CTRL_BARRIER_START) = 0;
  }

  *((int*)Me.win_ctrl.ptr + CTRL_FINAL_STATUS) = FINAL_FREE;
  *((int*)Me.win_ctrl.ptr + CTRL_FINAL_CONTRIB) = 0;
  *((int*)Me.win_ctrl.ptr + CTRL_SHMEM_STATUS) = -1;
  MPI_Barrier(*(Me.COMM[myHOST]));

  // Split the grid into one contiguous block per task of this host:
  // every block gets size_b elements and the first `rem` blocks one more.
  blocks.Nblocks = Me.Ntasks[myHOST];
  blocks.Bstart  = (int_t*)calloc( blocks.Nblocks, sizeof(int_t));
  blocks.Bsize   = (int_t*)calloc( blocks.Nblocks, sizeof(int_t));
  int_t size_b  = size_of_grid / blocks.Nblocks;
  int_t rem   = size_of_grid % blocks.Nblocks;

  blocks.Bsize[0]  = size_b + (rem > 0);
  blocks.Bstart[0] = 0;
  for(int b = 1; b < blocks.Nblocks; b++ ) {
    blocks.Bstart[b] = blocks.Bstart[b-1]+blocks.Bsize[b-1];
    blocks.Bsize[b] = size_b + (b < rem);}
 #endif


 #ifndef USE_MPI
  file.pFile1 = fopen (out.outfile1,"w");
 #endif

  timing.kernel_time = 0.0;
  timing.kernel_time1 = 0.0;
  timing.reduce_time = 0.0;
  timing.reduce_time1 = 0.0;
  timing.compose_time = 0.0;
  timing.compose_time1 = 0.0;

  // calculate the resolution in radians
  resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));

  // calculate the resolution in arcsec
  double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
  if ( rank == 0 )
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);


  for (long isector = 0; isector < nsectors; isector++)
    {
      clock_gettime(CLOCK_MONOTONIC, &begink);
      startk = clock();
      // define local destination sector
      //isector = (isector_count+rank)%size;  // this line must be wrong! [LT]

      // allocate sector arrays
      long    Nsec       = histo_send[isector];
      double *uus        = (double*) malloc(Nsec*sizeof(double));
      double *vvs        = (double*) malloc(Nsec*sizeof(double));
      double *wws        = (double*) malloc(Nsec*sizeof(double));
      long    Nweightss  = Nsec*metaData.polarisations;
      long    Nvissec    = Nweightss*metaData.freq_per_chan;
      float *weightss    = (float*) malloc(Nweightss*sizeof(float));
      float *visreals    = (float*) malloc(Nvissec*sizeof(float));
      float *visimgs     = (float*) malloc(Nvissec*sizeof(float));

      // select data for this sector
      long icount = 0;   // next slot in uus/vvs/wws
      long ip = 0;       // next slot in weightss
      long inu = 0;      // next slot in visreals/visimgs

      // The sector list is walked from its last entry to its first
      for(long iphi = histo_send[isector]-1; iphi>=0; iphi--)
        {
          long ilocal = sectorarray[isector][iphi];
          //double vvh = data.vv[ilocal];
          //int binphi = (int)(vvh*nsectors);
          //if (binphi == isector || boundary[ilocal] == isector) {
          uus[icount] = data.uu[ilocal];
          vvs[icount] = data.vv[ilocal]-isector*shift;
          wws[icount] = data.ww[ilocal];
          for (long ipol=0; ipol<metaData.polarisations; ipol++)
            {
              weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
              ip++;
            }
          for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
            {
              visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
              visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
              //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis);
              inu++;
            }
          icount++;
        }

      clock_gettime(CLOCK_MONOTONIC, &finishk);
      endk = clock();
      timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
      // Wall time = whole seconds + nanosecond remainder; the seconds term
      // must be added exactly once.
      timing.compose_time1 += (finishk.tv_sec - begink.tv_sec);
      timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;

     #ifndef USE_MPI
      // Extent of this sector's (u,v) coordinates, plus a dump of every 10th point
      double uumin = 1e20;
      double vvmin = 1e20;
      double uumax = -1e20;
      double vvmax = -1e20;

      for (long ipart=0; ipart<Nsec; ipart++)
        {
          uumin = MIN(uumin,uus[ipart]);
          uumax = MAX(uumax,uus[ipart]);
          vvmin = MIN(vvmin,vvs[ipart]);
          vvmax = MAX(vvmax,vvs[ipart]);

          // NOTE(review): this writes to file.pFile, while the stream opened
          // and closed in this function is file.pFile1 -- confirm which one
          // is intended.
          if(ipart%10 == 0)fprintf (file.pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
        }

      printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
     #endif

      // Make convolution on the grid

     #ifdef VERBOSE
      printf("Processing sector %ld\n",isector);
     #endif
      clock_gettime(CLOCK_MONOTONIC, &begink);
      startk = clock();

     //We have to call different GPUs per MPI task!!! [GL]
      wstack(param.num_w_planes,
             Nsec,
             metaData.freq_per_chan,
             metaData.polarisations,
             uus,
             vvs,
             wws,
             visreals,
             visimgs,
             weightss,
             dx,
             dw,
             param.w_support,
             xaxis,
             yaxis,
             gridss,
             param.num_threads,
             rank);

      /* int z =0 ;
       * #pragma omp target map(to:test_i_gpu) map(from:z)
       * {
       *   int x; // only accessible from accelerator
       *     x = 2;
       *       z = x + test_i_gpu;
       *       }*/

      clock_gettime(CLOCK_MONOTONIC, &finishk);
      endk = clock();
      timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
      timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec);
      timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
     #ifdef VERBOSE
      printf("Processed sector %ld\n",isector);
     #endif
      clock_gettime(CLOCK_MONOTONIC, &begink);
      startk = clock();

      //for (long iii=0; iii<2*xaxis*yaxis*param.num_w_planes; iii++)printf("--> %f\n",gridss[iii]);

     #ifndef USE_MPI
      // Serial build: copy the sector grid into its slice of the global grid.
      // Uses param.num_w_planes like the wstack() call and the reset loop below.
      long stride = isector*2*xaxis*yaxis*param.num_w_planes;
      for (long iii=0; iii<2*xaxis*yaxis*param.num_w_planes; iii++)
        gridtot[stride+iii] = gridss[iii];
     #endif

      // Write grid in the corresponding remote slab
     #ifdef USE_MPI
      // int target_rank = (int)isector;    it implied that size >= nsectors
      int target_rank = (int)(isector % size);

     #ifdef ONE_SIDE
      // printf("One Side communication active\n");
      MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
      MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
      MPI_Win_unlock(target_rank,slabwin);
      //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
     #endif //ONE_SIDE
     #ifdef REDUCE
      MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
     #endif //REDUCE

       //Let's use now the new implementation (ring in shmem and Ired inter-nodes)
     #ifdef RING
      int ret = reduce_ring(target_rank); //Calls the reduce [GL]
      grid = (double*)Me.fwin.ptr; //Let grid point to the right memory location [GL]
      /*
      if ( ret != 0 ){
        printf( "Problems with the ring reduce\n" ); //Check! Deactivated by default [GL]
        exit(685);}
      */
      #endif //RING
      #endif //USE_MPI

       clock_gettime(CLOCK_MONOTONIC, &finishk);
       endk = clock();
       timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
       timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec);
       timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
       // Go to next sector: clear the working grid
       for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;

       // Deallocate all sector arrays
       free(uus);
       free(vvs);
       free(wws);
       free(weightss);
       free(visreals);
       free(visimgs);
      // End of loop over sector
    }

    // Finalize MPI communication
    #ifdef ONE_SIDE
       MPI_Win_fence(0,slabwin);
    #endif

    #ifndef USE_MPI
        fclose(file.pFile1);
    #endif

       #ifdef RING //When more nodes are used, let's wait for the external Ireduce to be done! [GL]
        if( (Me.Rank[HOSTS] >= 0) && (Me.Nhosts > 1 )) {
          MPI_Waitall( Me.Ntasks[WORLD], requests, MPI_STATUSES_IGNORE);
          free(requests);}

        MPI_Barrier(MPI_COMM_WORLD);

       #endif

    #ifdef USE_MPI
        MPI_Barrier(MPI_COMM_WORLD);
    #endif

    // Elapsed time since the global start/begin stamps
    end = clock();
    clock_gettime(CLOCK_MONOTONIC, &finish);
    timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
    timing.process_time1 = (finish.tv_sec - begin.tv_sec);
    timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    clock_gettime(CLOCK_MONOTONIC, &begin);
}

/*
 * Write the gridded data to disk (compiled in only when WRITE_DATA is defined).
 *
 * Rank 0 opens out.outfile2 (real part) and out.outfile3 (imaginary part).
 *  - With USE_MPI it fetches each sector's grid into gridss with MPI_Get,
 *    de-interleaves it into gridss_real / gridss_img and writes every value at
 *    its position in the global grid.
 *  - Without USE_MPI it streams gridtot to the two files element by element.
 * If either file cannot be opened, an error is printed and nothing is
 * written. With USE_MPI every rank then calls MPI_Win_fence on slabwin.
 *
 * NOTE(review): the sector index is used directly as the MPI target rank here,
 * whereas gridding_data() combines sector i on rank (i % size); the two agree
 * only while nsectors <= size -- confirm.
 * NOTE(review): slabwin is fenced whenever USE_MPI is defined, but in this
 * file it is only created under ONE_SIDE -- confirm for the other modes.
 */
void write_grided_data()
{

   #ifdef WRITE_DATA
     // Write results
     if (rank == 0)
     {
        printf("WRITING GRIDDED DATA\n");
        file.pFilereal = fopen (out.outfile2,"wb");
        file.pFileimg = fopen (out.outfile3,"wb");
        if (file.pFilereal == NULL || file.pFileimg == NULL)
        {
           // Never hand a NULL stream to fseek/fwrite: report the failure,
           // release whichever file did open and skip the write. Control still
           // reaches the fence below so the other ranks are not left waiting.
           if (file.pFilereal == NULL) fprintf(stderr, "ERROR: cannot open %s for writing\n", out.outfile2);
           if (file.pFileimg == NULL) fprintf(stderr, "ERROR: cannot open %s for writing\n", out.outfile3);
           if (file.pFilereal != NULL) fclose(file.pFilereal);
           if (file.pFileimg != NULL) fclose(file.pFileimg);
        }
        else
        {
        #ifdef USE_MPI
           for (int isector=0; isector<nsectors; isector++)
           {
            #ifdef RING //Let the MPI_Get copy from the right location (Results must be checked!) [GL]
             MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,Me.win.win);
            #else
              MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin);
              MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin);
              MPI_Win_unlock(isector,slabwin);
            #endif
              // gridss interleaves the two parts: even slots real, odd slots imaginary
              for (long i=0; i<size_of_grid/2; i++)
              {
                      gridss_real[i] = gridss[2*i];
                      gridss_img[i] = gridss[2*i+1];
              }
              // Offsets are promoted to long before multiplying so that they
              // cannot overflow if the axis sizes are plain int.
              if (param.num_w_planes > 1)
              {
                      for (int iw=0; iw<param.num_w_planes; iw++)
                        for (int iv=0; iv<yaxis; iv++)
                          for (int iu=0; iu<xaxis; iu++)
                          {
                               // byte offset of (iu, iv, iw) of this sector inside the global grid file
                               long global_index = ((long)iu + ((long)iv+(long)isector*yaxis)*xaxis + (long)iw*param.grid_size_x*param.grid_size_y)*(long)sizeof(double);
                               long index = (long)iu + (long)iv*xaxis + (long)iw*xaxis*yaxis;
                               fseek(file.pFilereal, global_index, SEEK_SET);
                               fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal);
                          }
                      for (int iw=0; iw<param.num_w_planes; iw++)
                        for (int iv=0; iv<yaxis; iv++)
                          for (int iu=0; iu<xaxis; iu++)
                          {
                               long global_index = ((long)iu + ((long)iv+(long)isector*yaxis)*xaxis + (long)iw*param.grid_size_x*param.grid_size_y)*(long)sizeof(double);
                               long index = (long)iu + (long)iv*xaxis + (long)iw*xaxis*yaxis;
                               fseek(file.pFileimg, global_index, SEEK_SET);
                               fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg);
                               //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]);
                               //fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm);
                          }

              }
              else
              {
                      // Single w-plane: the sector slab is written with one fwrite per file
                      for (int iw=0; iw<param.num_w_planes; iw++)
                      {
                          long global_index = ((long)xaxis*isector*yaxis + (long)iw*param.grid_size_x*param.grid_size_y)*(long)sizeof(double);
                          long index = (long)iw*xaxis*yaxis;
                          fseek(file.pFilereal, global_index, SEEK_SET);
                          fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), file.pFilereal);
                          fseek(file.pFileimg, global_index, SEEK_SET);
                          fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), file.pFileimg);
                     }
              }
          }
       #else
          for (int iw=0; iw<param.num_w_planes; iw++)
             for (int iv=0; iv<param.grid_size_y; iv++)
               for (int iu=0; iu<param.grid_size_x; iu++)
                {
                      // gridtot interleaves real (index) and imaginary (index+1) parts
                      long index = 2*((long)iu + (long)iv*param.grid_size_x + (long)iw*param.grid_size_x*param.grid_size_y);
                      fwrite(&gridtot[index], 1, sizeof(double), file.pFilereal);
                      fwrite(&gridtot[index+1], 1, sizeof(double), file.pFileimg);
                      //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
                      //fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
                 }
        #endif
        fclose(file.pFilereal);
        fclose(file.pFileimg);
        }
     }

     #ifdef USE_MPI
        MPI_Win_fence(0,slabwin);
     #endif

   #endif //WRITE_DATA

}
