From 7a25e80b716d909c8aac80662d4ce0d029242367 Mon Sep 17 00:00:00 2001 From: Giuliano Taffoni Date: Tue, 23 Nov 2021 20:50:44 +0100 Subject: [PATCH 1/4] OpenMP GPU implementation --- .gitignore | 2 + Build/Makefile.Macosx | 26 +++++++++++++ Build/Makefile.Magellanus | 34 +++++++++++++++++ Build/Makefile.Marconi | 20 ++++++++++ Build/Makefile.systype | 24 ++++++++++++ Makefile | 79 ++++++++++++++++++++++++--------------- phase_correction.cu | 16 +++++--- w-stacking-fftw.c | 67 ++++++++++++++++++++------------- w-stacking.cu | 32 ++++++++++++---- w-stacking.h | 2 +- 10 files changed, 233 insertions(+), 69 deletions(-) create mode 100644 Build/Makefile.Macosx create mode 100644 Build/Makefile.Magellanus create mode 100644 Build/Makefile.Marconi create mode 100644 Build/Makefile.systype diff --git a/.gitignore b/.gitignore index 797747a..9cce9de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.DS_Store +sync.sh *.o phase_correction.c w-stacking.c diff --git a/Build/Makefile.Macosx b/Build/Makefile.Macosx new file mode 100644 index 0000000..cdb0e13 --- /dev/null +++ b/Build/Makefile.Macosx @@ -0,0 +1,26 @@ +CC = icc +CXX = g++-11 + +MPICC = mpicc +MPIC++ = mpiCC + +FFTW_INCL= -I/usr/local/include +FFTW_LIB= -L/usr/local/lib/ + +GSL_INCL = +GSL_LIBS = + +MPI_LIB = +MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include +HDF5_INCL = +HDF5_LIB = + +OMP= -fopenmp + +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 +NVLIB = -L/home/taffoni -lcudart -lcuda + +CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) + +OPTIMIZE = $(OMP) -O3 diff --git a/Build/Makefile.Magellanus b/Build/Makefile.Magellanus new file mode 100644 index 0000000..36c9169 --- /dev/null +++ b/Build/Makefile.Magellanus @@ -0,0 +1,34 @@ +CC = gcc-10 +CXX = g++-10 + +MPICC = mpicc +MPIC++ = mpiCC + +GSL_INCL = -I/home/taffoni/sw/include +GSL_LIBS = -L/home/taffoni/sw/lib + +FFTW_INCL= -I/home/taffoni/sw/include +FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3 + +#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi +MPI_LIB = +#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include +MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include +HDF5_INCL = +HDF5_LIB = + +OMP = -mp=multicore,gpu -Mprof -cuda -lcudart +#OMP = -fopenmp +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11 +NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda + + +CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) + +OPTIMIZE = $(OMP) -O3 + +# OMP GPU SPECIFIC FLAGS +#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math +#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35 +#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none diff --git a/Build/Makefile.Marconi b/Build/Makefile.Marconi new file mode 100644 index 0000000..4fec93a --- /dev/null +++ b/Build/Makefile.Marconi @@ -0,0 +1,20 @@ +CC = gcc +CXX = g++ + +MPICC = mpicc +MPIC++ = mpiCC + + +FFTW_INCL= -I/home/taffoni/sw/include +FFTW_LIB= -L/home/taffoni/sw/lib + + +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11 +NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda + +OMP= -fopenmp + +CFLAGS += -I. 
$(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3 -mtune=native
diff --git a/Build/Makefile.systype b/Build/Makefile.systype
new file mode 100644
index 0000000..0492232
--- /dev/null
+++ b/Build/Makefile.systype
@@ -0,0 +1,24 @@
+CC = gcc-10
+CXX = g++-10
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+OPTIMIZE =
+
+
+GSL_INCL =
+GSL_LIB =
+
+FFTW_INCL=
+FFTW_LIB=
+
+NVCC =
+NVFLAGS =
+NVLIB =
+
+CFLAGS +=
+
+MPICHLIB =
+HDF5INCL =
+HDF5LIB =
diff --git a/Makefile b/Makefile
index 99d1389..1e2c653 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,31 @@
 # comment/uncomment the various options depending how you want to build the program
+# Set default values for compiler options if no systype options are given or found
+CC = mpiCC
+CXX = mpiCC
+OPTIMIZE = -std=c++11 -Wall -g -O2
+MPICHLIB = -lmpich
+SWITCHES =
+
+ifdef SYSTYPE
+SYSTYPE := $(SYSTYPE)
+include Build/Makefile.$(SYSTYPE)
+else
+include Build/Makefile.systype
+endif
+
+
+LIBS = $(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda
+
 # create MPI code
 OPT += -DUSE_MPI
+OPT += -DACCOMP
 # use FFTW (it can be switched on ONLY if MPI is active)
-OPT += -DUSE_FFTW
+ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
+	OPT += -DUSE_FFTW
+	LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
+endif
+
+#OPT += -DNVIDIA
 # perform one-side communication (suggested) instead of reduce (only if MPI is active)
 OPT += -DONE_SIDE
 # write the full 3D cube of gridded visibilities and its FFT transform
@@ -10,25 +33,8 @@ OPT += -DONE_SIDE
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
-#OPT += -DPHASE_ON
-
-CC = gcc
-CXX = g++
-ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
-   CC = mpicc
-   CXX = mpiCC
-endif
-
-OMP = -fopenmp
-#OMP =
-
-CFLAGS += -O3 -mcpu=native
-CFLAGS += -I.
-LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
+#OPT += -DPHASE_ON
-
-NVCC = nvcc
-NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
-NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
 
 DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
 COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
@@ -39,27 +45,40 @@ w-stacking.c: w-stacking.cu
 phase_correction.c: phase_correction.cu
 	cp phase_correction.cu phase_correction.c
 
+ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
+%.o: %.c $(DEPS)
+	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+else
 %.o: %.c $(DEPS)
-	$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
+	$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+endif
 
 serial: $(COBJ)
-	$(CC) $(OMP) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS)
+
+serial_omp: phase_correction.c
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c $(CFLAGS) $(LIBS)
+
+simple_mpi: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
+
+mpi_omp: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
 
 serial_cuda:
-	$(NVCC) $(OPT) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
+	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
+	$(CC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
+	$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
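
The SYSTYPE block introduced above follows the Gadget-style build convention: invoking, for example, `make SYSTYPE=Marconi mpi_cuda` pulls in Build/Makefile.Marconi, while a build without SYSTYPE falls back to Build/Makefile.systype. The -D switches collected in OPT reach the sources as ordinary preprocessor guards; a minimal sketch of how the code consumes them (illustrative only, not a file from this patch):

    /* sketch: the OPT += -D... switches in the Makefile above select
       code paths through plain preprocessor guards */
    #include <stdio.h>

    int main(void)
    {
    #ifdef USE_MPI
        printf("-DUSE_MPI: distributed gridding path compiled in\n");
    #endif
    #ifdef ACCOMP
        printf("-DACCOMP: OpenMP target-offload kernels compiled in\n");
    #endif
    #ifdef ONE_SIDE
        printf("-DONE_SIDE: MPI_Accumulate used instead of MPI_Reduce\n");
    #endif
        return 0;
    }
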
-mpi: $(COBJ) - $(CC) $(OMP) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS) +mpi: $(COBJ) + $(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS) mpi_cuda: - $(NVCC) $(NVFLAGS) $(OPT) -c w-stacking.cu phase_correction.cu $(NVLIB) - $(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c - $(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(LIBS) -lm + $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) + $(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS) + $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) clean: rm *.o rm w-stacking.c rm phase_correction.c - diff --git a/phase_correction.cu b/phase_correction.cu index 9a43924..52eb4e0 100644 --- a/phase_correction.cu +++ b/phase_correction.cu @@ -8,7 +8,7 @@ #ifdef __CUDACC__ -__global__ void phase_g(int xaxis, +__global__ void phase_g(int xaxis, int yaxis, int num_w_planes, double * gridss, @@ -31,7 +31,7 @@ __global__ void phase_g(int xaxis, if(gid < arraysize) { long gid_aux = nbucket*gid; - for(int iaux=0; iauxindex = data; newNode->next = *headRef; - *headRef = newNode; + *headRef = newNode; } // Main Code -int main(int argc, char * argv[]) +int main(int argc, char * argv[]) { int rank; int size; - FILE * pFile; + FILE * pFile; FILE * pFile1; FILE * pFilereal; FILE * pFileimg; @@ -50,7 +50,7 @@ int main(int argc, char * argv[]) // //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/"; //char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"; - char datapath[900]; + char datapath[900]; char datapath_multi[NFILES][900]; char ufile[30] = "ucoord.bin"; @@ -117,6 +117,11 @@ int main(int argc, char * argv[]) struct timespec begin, finish, begin0, begink, finishk; double elapsed; long nsectors; + /* GT get nymber of threads exit if not given */ + if(argc == 1) { + fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]); + exit(1); + } clock_gettime(CLOCK_MONOTONIC, &begin0); start0 = clock(); @@ -144,7 +149,7 @@ int main(int argc, char * argv[]) if (nsectors < 0) nsectors = size; local_grid_size_y = grid_size_y/nsectors; //nsectors = size; - + // LOCAL grid size xaxis = local_grid_size_x; yaxis = local_grid_size_y; @@ -156,14 +161,15 @@ int main(int argc, char * argv[]) int ndatasets = 1; //strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/"); //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"); - strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); + //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); - + strcpy(datapath_multi[0],"/data/taffoni/LAVORO/IMAGING/data/newgauss2noconj_t201806301100_SBL180.binMS/"); strcpy(datapath,datapath_multi[0]); // Read metadata strcpy(filename,datapath); strcat(filename,metafile); - pFile = fopen (filename,"r"); + // GT CHECK IF FILE EXISTS OR EXIT + if (pFile = fopen (filename,"r")) { fscanf(pFile,"%ld",&Nmeasures); fscanf(pFile,"%ld",&Nvis); fscanf(pFile,"%ld",&freq_per_chan); @@ -176,7 +182,16 @@ int main(int argc, char * argv[]) 
fscanf(pFile,"%lf",&uvmax);
 	fscanf(pFile,"%lf",&wmin);
 	fscanf(pFile,"%lf",&wmax);
-	fclose(pFile);
+	fclose(pFile);
+	} else {
+		printf("Input file does not exist: %s\n", filename);
+		#ifdef USE_MPI
+		MPI_Finalize();
+		#endif
+		exit(256);
+	}
+
+
 
 	// WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -199,7 +214,7 @@ int main(int argc, char * argv[])
 	// Set temporary local size of points
 	long nm_pe = (long)(Nmeasures/size);
 	long remaining = Nmeasures%size;
-	
+
 	long startrow = rank*nm_pe;
 	if (rank == size-1)nm_pe = nm_pe+remaining;
@@ -268,7 +283,7 @@ int main(int argc, char * argv[])
 	// Create histograms and linked lists
 	clock_gettime(CLOCK_MONOTONIC, &begin);
 	start = clock();
-	
+
 	//CLAAA
 	// Initialize linked list
 	struct sectorlist ** sectorhead;
@@ -438,7 +453,7 @@ int main(int argc, char * argv[])
 	// define local destination sector
 	//isector = (isector_count+rank)%size;
 	isector = isector_count;
-	// allocate sector arrays 
+	// allocate sector arrays
 	long Nsec = histo_send[isector];
 	uus = (double*) malloc(Nsec*sizeof(double));
 	vvs = (double*) malloc(Nsec*sizeof(double));
@@ -501,7 +516,7 @@ int main(int argc, char * argv[])
 	vvmin = MIN(vvmin,vvs[ipart]);
 	vvmax = MAX(vvmax,vvs[ipart]);
-	
+
 	if(ipart%10 == 0)fprintf (pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
 	}
@@ -556,9 +571,9 @@ int main(int argc, char * argv[])
 	#ifdef ONE_SIDE
 	printf("One Side communication active\n");
 	MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
-	MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); 
+	MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
 	MPI_Win_unlock(target_rank,slabwin);
-	//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin); 
+	//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
 	#else
 	MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
 	#endif //ONE_SIDE
@@ -633,7 +648,7 @@ int main(int argc, char * argv[])
 	process_time1 = (finish.tv_sec - begin.tv_sec);
 	process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
 	clock_gettime(CLOCK_MONOTONIC, &begin);
-	
+
 	#ifdef WRITE_DATA
 	// Write results
@@ -712,7 +727,7 @@ int main(int argc, char * argv[])
 	#ifdef USE_FFTW
 	// FFT transform the data (using distributed FFTW)
-	
+
 	if(rank == 0)printf("PERFORMING FFT\n");
 	clock_gettime(CLOCK_MONOTONIC, &begin);
 	start = clock();
@@ -727,13 +742,13 @@ int main(int argc, char * argv[])
 	// and perform the FFT per w plane
 	alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start);
 	fftwgrid = fftw_alloc_complex(alloc_local);
-	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); 
+	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
 	long fftwindex = 0;
 	long fftwindex2D = 0;
 	for (int iw=0; iw Date: Thu, 25 Nov 2021 11:43:54 +0100
Subject: [PATCH 2/4] including testing

---
 Build/Makefile.Magellanus    | 18 +++++++-----------
 Build/Makefile.MagellanusNVC | 26 ++++++++++++++++++++++++++
 phase_correction.cu          |  2 +-
 w-stacking-fftw.c            |  8 ++++----
 4 files changed, 38 insertions(+), 16 deletions(-)
 create mode 100644 Build/Makefile.MagellanusNVC

diff --git a/Build/Makefile.Magellanus b/Build/Makefile.Magellanus
index 36c9169..b71b35d 100644
--- a/Build/Makefile.Magellanus
+++ b/Build/Makefile.Magellanus
@@ 
-7,21 +7,18 @@ MPIC++ = mpiCC GSL_INCL = -I/home/taffoni/sw/include GSL_LIBS = -L/home/taffoni/sw/lib -FFTW_INCL= -I/home/taffoni/sw/include -FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3 +FFTW_INCL= +FFTW_LIB= -lfftw3_mpi -lfftw3 -#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi MPI_LIB = -#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include -MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include +MPI_INCL= HDF5_INCL = HDF5_LIB = -OMP = -mp=multicore,gpu -Mprof -cuda -lcudart -#OMP = -fopenmp +OMP = -fopenmp -lcuda -lcudart NVCC = nvcc NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11 -NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda +NVLIB = -lcudart -lcuda CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) @@ -29,6 +26,5 @@ CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL) OPTIMIZE = $(OMP) -O3 # OMP GPU SPECIFIC FLAGS -#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math -#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35 -#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none +OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math +OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35 diff --git a/Build/Makefile.MagellanusNVC b/Build/Makefile.MagellanusNVC new file mode 100644 index 0000000..34c9f2e --- /dev/null +++ b/Build/Makefile.MagellanusNVC @@ -0,0 +1,26 @@ +CC = nvc +CXX = nvc++ + +MPICC = mpicc +MPIC++ = mpiCC + +GSL_INCL = -I/home/taffoni/sw/include +GSL_LIBS = -L/home/taffoni/sw/lib + +FFTW_INCL= -I/home/taffoni/sw/include +FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3 + +MPI_LIB = +MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include +HDF5_INCL = +HDF5_LIB = + +OMP = -mp=multicore,gpu -Mprof -cuda -lcudart +NVCC = nvcc +NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11 +NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda + + +CFLAGS += -I. 
$(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3
diff --git a/phase_correction.cu b/phase_correction.cu
index 52eb4e0..e1db329 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -108,7 +108,7 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 	double * image_imag_g;
 	double * gridss_g;
 
-	mmm=cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double)); 
+	mmm=cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double));
 	//printf("CUDA ERROR 1 %s\n",cudaGetErrorString(mmm));
 	mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
 	//printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm));
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 3bdf2c1..bdeeed2 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -574,10 +574,10 @@ int main(int argc, char * argv[])
 	MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
 	MPI_Win_unlock(target_rank,slabwin);
 	//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
-	#else
-	MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
-	#endif //ONE_SIDE
-	#endif //USE_MPI
+	#else
+	MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
+	#endif //ONE_SIDE
+	#endif //USE_MPI
 
 	clock_gettime(CLOCK_MONOTONIC, &finishk);
 	endk = clock();
-- 
GitLab


From 6c9080ff3b04c003eb728824cf45172469db05bb Mon Sep 17 00:00:00 2001
From: Giuliano Taffoni
Date: Tue, 30 Nov 2021 17:53:26 +0100
Subject: [PATCH 3/4] aligned with latest version

---
 w-stacking-fftw.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index bdeeed2..c56b3cb 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -141,8 +141,10 @@ int main(int argc, char * argv[])
 	rank = 0;
 	size = 1;
 #endif
+
 	if(rank == 0)printf("Running with %d threads\n",num_threads);
+
 	// set the local size of the image
 	local_grid_size_x = grid_size_x;
 	nsectors = NUM_OF_SECTORS;
@@ -278,6 +280,7 @@ int main(int argc, char * argv[])
 	setup_time1 = (finish.tv_sec - begin.tv_sec);
 	setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
 
+
 	if(rank == 0)printf("GRIDDING DATA\n");
 
 	// Create histograms and linked lists
@@ -335,7 +338,7 @@ int main(int argc, char * argv[])
 	for (int iii=0; iiiindex != -1)
 	{
 	long ilocal = current->index;
@@ -503,7 +507,8 @@ int main(int argc, char * argv[])
 	compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
 	compose_time1 += (finishk.tv_sec - begink.tv_sec);
 	compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
-	#ifndef USE_MPI
+
+	#ifndef USE_MPI
 	double uumin = 1e20;
 	double vvmin = 1e20;
 	double uumax = -1e20;
 	double vvmax = -1e20;
-- 
GitLab


From fd67a5ca7fe8150a292a35298d3ff4bfd3f44128 Mon Sep 17 00:00:00 2001
From: Giuliano Taffoni
Date: Mon, 31 Jan 2022 16:25:54 +0100
Subject: [PATCH 4/4] first commit of this branch

---
 .atom-beautify.w-stacking-fftw.c | 991 +++++++++++++++++++++++++++++++
 phase_correction.cu              |   4 +-
 w-stacking-fftw.c                |  25 +-
 w-stacking.cu                    |   5 +-
 w-stacking.h                     |   2 +
 5 files changed, 1015 insertions(+), 12 deletions(-)
 create mode 100644 .atom-beautify.w-stacking-fftw.c

diff --git a/.atom-beautify.w-stacking-fftw.c b/.atom-beautify.w-stacking-fftw.c
new file mode 100644
index 0000000..a57894a
--- /dev/null
+++ b/.atom-beautify.w-stacking-fftw.c
@@ -0,0 +1,991 @@
[991 added lines omitted: .atom-beautify.w-stacking-fftw.c is a stray Atom-Beautify editor artifact; its body is byte-for-byte identical (blob a57894a) to the w-stacking-fftw.c produced by this patch, whose diff follows.]
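
The ONE_SIDE hunk in PATCH 2 above only re-indents the preprocessor guards; for reference, this is the one-sided reduction they wrap. A minimal self-contained sketch, assuming the RMA window was created once over each rank's grid slab (as w-stacking-fftw.c does) and narrowing the counts to int as the MPI signatures require:

    /* sketch of the ONE_SIDE path: add the local gridss buffer into the
     * slab owned by target_rank through an RMA window; MPI_SUM makes the
     * concurrent per-element updates accumulate atomically */
    #include <mpi.h>

    void accumulate_slab(double *gridss, long size_of_grid,
                         int target_rank, MPI_Win slabwin)
    {
        MPI_Win_lock(MPI_LOCK_SHARED, target_rank, 0, slabwin);
        MPI_Accumulate(gridss, (int)size_of_grid, MPI_DOUBLE,
                       target_rank, 0, (int)size_of_grid, MPI_DOUBLE,
                       MPI_SUM, slabwin);
        MPI_Win_unlock(target_rank, slabwin);
    }

Compared with the MPI_Reduce fallback, the shared lock lets every sector be pushed to its destination rank as soon as it is gridded, without a matching collective call on the receiving side.
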
+#include "w-stacking.h" +#define PI 3.14159265359 +#define NUM_OF_SECTORS -1 +#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) +#define NOVERBOSE +#define NFILES 100 + +// Linked List set-up +struct sectorlist { + long index; + struct sectorlist * next; +}; + +void Push(struct sectorlist** headRef, long data) { + struct sectorlist* newNode = malloc(sizeof(struct sectorlist)); + newNode->index = data; + newNode->next = *headRef; + *headRef = newNode; +} + +// Main Code +int main(int argc, char * argv[]) +{ + int rank; + int size; + + FILE * pFile; + FILE * pFile1; + FILE * pFilereal; + FILE * pFileimg; + char filename[1000]; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/old/data/gauss2_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/newgauss2noconj_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/gauss1_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/gauss4_t201806301100_SBL180.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_t201806301100_SBH255i-test.binMS/"; + // + //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/"; + //char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"; + char datapath[900]; + char datapath_multi[NFILES][900]; + + char ufile[30] = "ucoord.bin"; + char vfile[30] = "vcoord.bin"; + char wfile[30] = "wcoord.bin"; + char weightsfile[30] = "weights.bin"; + char visrealfile[30] = "visibilities_real.bin"; + char visimgfile[30] = "visibilities_img.bin"; + char metafile[30] = "meta.txt"; + char outfile[30] = "grid.txt"; + char outfile1[30] = "coords.txt"; + char outfile2[30] = "grid_real.bin"; + char outfile3[30] = "grid_img.bin"; + char fftfile[30] = "fft.txt"; + char fftfile2[30] = "fft_real.bin"; + char fftfile3[30] = "fft_img.bin"; + char logfile[30] = "run.log"; + char extension[30] = ".txt"; + char srank[4]; + char timingfile[30] = "timings.dat"; + + double * uu; + double * vv; + double * ww; + float * weights; + float * visreal; + float * visimg; + + long Nmeasures,Nmeasures0; + long Nvis,Nvis0; + long Nweights,Nweights0; + long freq_per_chan,freq_per_chan0; + long polarisations,polarisations0; + long Ntimes,Ntimes0; + double dt,dt0; + double thours,thours0; + long baselines,baselines0; + double uvmin,uvmin0; + double uvmax,uvmax0; + double wmin,wmin0; + double wmax,wmax0; + double resolution; + + // MESH SIZE + int grid_size_x = 4096; + int grid_size_y = 4096; + int local_grid_size_x;// = 16; + int local_grid_size_y;// = 16; + int xaxis; + int yaxis; + int num_w_planes = 8; + // DAV: the corresponding KernelLen is calculated within the wstack function. 
It can be anyway hardcoded for optimization + int w_support = 7; + int num_threads;// = 4; + double dx = 1.0/(double)grid_size_x; + double dw = 1.0/(double)num_w_planes; + double w_supporth = (double)((w_support-1)/2)*dx; + + clock_t start, end, start0, startk, endk; + double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; + double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; + double writetime, writetime1; + + struct timespec begin, finish, begin0, begink, finishk; + double elapsed; + long nsectors; + /* GT get nymber of threads exit if not given */ + if(argc == 1) { + fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]); + exit(1); + } + + clock_gettime(CLOCK_MONOTONIC, &begin0); + start0 = clock(); + // Set the number of OpenMP threads + num_threads = atoi(argv[1]); + + // Intialize MPI environment +#ifdef USE_MPI + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + if(rank == 0)printf("Running with %d MPI tasks\n",size); + printf("Number of available GPUs %d\n", omp_get_num_devices()); + printf("Number of Current GPUs %d\n", omp_get_default_device()); + int node_rank; + MPI_Comm NUMA_NODE_Comm; + MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &NUMA_NODE_Comm); + MPI_Comm_rank( NUMA_NODE_Comm, &node_rank); + printf( "Task %d has rank %d in its numa node\n", rank, node_rank ); + + #ifdef USE_FFTW + fftw_mpi_init(); + #endif +#else + rank = 0; + size = 1; +#endif + + if(rank == 0)printf("Running with %d threads\n",num_threads); + + + // set the local size of the image + local_grid_size_x = grid_size_x; + nsectors = NUM_OF_SECTORS; + if (nsectors < 0) nsectors = size; + local_grid_size_y = grid_size_y/nsectors; + //nsectors = size; + + // LOCAL grid size + xaxis = local_grid_size_x; + yaxis = local_grid_size_y; + + clock_gettime(CLOCK_MONOTONIC, &begin); + start = clock(); + + // INPUT FILES (only the first ndatasets entries are used) + int ndatasets = 1; + //strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/"); + //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/"); + strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/"); + //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); + //strcpy(datapath_multi[0],"/data/taffoni/LAVORO/IMAGING/data/newgauss2noconj_t201806301100_SBL180.binMS/"); + strcpy(datapath,datapath_multi[0]); + // Read metadata + strcpy(filename,datapath); + strcat(filename,metafile); + // GT CHECK IF FILE EXISTS OR EXIT + if (pFile = fopen (filename,"r")) { + fscanf(pFile,"%ld",&Nmeasures); + fscanf(pFile,"%ld",&Nvis); + fscanf(pFile,"%ld",&freq_per_chan); + fscanf(pFile,"%ld",&polarisations); + fscanf(pFile,"%ld",&Ntimes); + fscanf(pFile,"%lf",&dt); + fscanf(pFile,"%lf",&thours); + fscanf(pFile,"%ld",&baselines); + fscanf(pFile,"%lf",&uvmin); + fscanf(pFile,"%lf",&uvmax); + fscanf(pFile,"%lf",&wmin); + fscanf(pFile,"%lf",&wmax); + fclose(pFile); + } else { + printf("Input file does not exists: %s\n", filename); + #ifdef USE_MPI + MPI_Finalize(); + #endif + exit(256); + } + + + + + // WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ int nsub = 1000; + //int nsub = 10; + printf("Subtracting last %d measurements\n",nsub); + Nmeasures = Nmeasures-nsub; + Nvis = Nmeasures*freq_per_chan*polarisations; + + // calculate the coordinates of the center + double uvshift = uvmin/(uvmax-uvmin); + //printf("UVSHIFT %f %f %f %f %f\n",uvmin, uvmax, wmin, wmax, uvshift); + + if (rank == 0) + { + printf("N. measurements %ld\n",Nmeasures); + printf("N. visibilities %ld\n",Nvis); + } + + // Set temporary local size of points + long nm_pe = (long)(Nmeasures/size); + long remaining = Nmeasures%size; + + long startrow = rank*nm_pe; + if (rank == size-1)nm_pe = nm_pe+remaining; + + long Nmeasures_tot = Nmeasures; + Nmeasures = nm_pe; + long Nvis_tot = Nvis; + Nvis = Nmeasures*freq_per_chan*polarisations; + Nweights = Nmeasures*polarisations; + + #ifdef VERBOSE + printf("N. measurements on %d %ld\n",rank,Nmeasures); + printf("N. visibilities on %d %ld\n",rank,Nvis); + #endif + + // DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used + // all the sizes are rescaled by the number of MPI tasks + // Allocate arrays + uu = (double*) calloc(Nmeasures,sizeof(double)); + vv = (double*) calloc(Nmeasures,sizeof(double)); + ww = (double*) calloc(Nmeasures,sizeof(double)); + weights = (float*) calloc(Nweights,sizeof(float)); + visreal = (float*) calloc(Nvis,sizeof(float)); + visimg = (float*) calloc(Nvis,sizeof(float)); + + if(rank == 0)printf("READING DATA\n"); + // Read data + strcpy(filename,datapath); + strcat(filename,ufile); + //printf("Reading %s\n",filename); + + pFile = fopen (filename,"rb"); + fseek (pFile,startrow*sizeof(double),SEEK_SET); + fread(uu,Nmeasures*sizeof(double),1,pFile); + fclose(pFile); + + strcpy(filename,datapath); + strcat(filename,vfile); + //printf("Reading %s\n",filename); + + pFile = fopen (filename,"rb"); + fseek (pFile,startrow*sizeof(double),SEEK_SET); + fread(vv,Nmeasures*sizeof(double),1,pFile); + fclose(pFile); + + strcpy(filename,datapath); + strcat(filename,wfile); + //printf("Reading %s\n",filename); + + pFile = fopen (filename,"rb"); + fseek (pFile,startrow*sizeof(double),SEEK_SET); + fread(ww,Nmeasures*sizeof(double),1,pFile); + fclose(pFile); + + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + + clock_gettime(CLOCK_MONOTONIC, &finish); + end = clock(); + setup_time = ((double) (end - start)) / CLOCKS_PER_SEC; + setup_time1 = (finish.tv_sec - begin.tv_sec); + setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; + + + if(rank == 0)printf("GRIDDING DATA\n"); + + // Create histograms and linked lists + clock_gettime(CLOCK_MONOTONIC, &begin); + start = clock(); + + //CLAAA + // Initialize linked list + struct sectorlist ** sectorhead; + sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist)); + for (int isec=0; isec<=nsectors; isec++) + { + sectorhead[isec] = malloc(sizeof(struct sectorlist)); + sectorhead[isec]->index = -1; + sectorhead[isec]->next = NULL; + } + + + long * histo_send = (long*) calloc(nsectors+1,sizeof(long)); + int * boundary = (int*) calloc(Nmeasures,sizeof(int)); + double uuh,vvh; + for (long iphi = 0; iphi < Nmeasures; iphi++) + { + boundary[iphi] = -1; + uuh = uu[iphi]; + vvh = vv[iphi]; + int binphi = (int)(vvh*nsectors); + // check if the point influence also neighboring slabs + double updist = (double)((binphi+1)*yaxis)*dx - vvh; + double downdist = vvh - (double)(binphi*yaxis)*dx; + // + histo_send[binphi]++; + Push(§orhead[binphi],iphi); + if(updist < w_supporth && updist 
>= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(§orhead[binphi+1],iphi);}; + if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(§orhead[binphi-1],iphi);}; + } + +#ifdef PIPPO + struct sectorlist * current; + long iiii = 0; + for (int j=0; jindex != -1) + { + printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],current->index); + current = current->next; + iiii++; + } + } +#endif + + #ifdef VERBOSE + for (int iii=0; iiiindex != -1) + { + long ilocal = current->index; + //double vvh = vv[ilocal]; + //int binphi = (int)(vvh*nsectors); + //if (binphi == isector || boundary[ilocal] == isector) + //{ + uus[icount] = uu[ilocal]; + vvs[icount] = vv[ilocal]-isector*shift; + wws[icount] = ww[ilocal]; + for (long ipol=0; ipol1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis); + inu++; + } + icount++; + //} + current = current->next; + } + + clock_gettime(CLOCK_MONOTONIC, &finishk); + endk = clock(); + compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; + compose_time1 += (finishk.tv_sec - begink.tv_sec); + compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; + + #ifndef USE_MPI + double uumin = 1e20; + double vvmin = 1e20; + double uumax = -1e20; + double vvmax = -1e20; + + for (long ipart=0; ipart %f\n",gridss[iii]); + + #ifndef USE_MPI + long stride = isector*2*xaxis*yaxis*num_w_planes; + for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)gridtot[stride+iii] = gridss[iii]; + #endif + + // Write grid in the corresponding remote slab + #ifdef USE_MPI + int target_rank = (int)isector; + //int target_rank = (int)(size-isector-1); + #ifdef ONE_SIDE + printf("One Side communication active\n"); + MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin); + MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); + MPI_Win_unlock(target_rank,slabwin); + //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin); + #else + MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); + #endif //ONE_SIDE + #endif //USE_MPI + + clock_gettime(CLOCK_MONOTONIC, &finishk); + endk = clock(); + reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; + reduce_time1 += (finishk.tv_sec - begink.tv_sec); + reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; + // Go to next sector + for (long inull=0; inull<2*num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0; + + // Deallocate all sector arrays + free(uus); + free(vvs); + free(wws); + free(weightss); + free(visreals); + free(visimgs); + // End of loop over sectors + } + // End of loop over input files + } + + // Finalize MPI communication + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + #endif + + // Swap left and right parts APPARENTLY NOT NECESSARY + /* + for (long kswap=0; kswap 1) + { + for (int iw=0; iw n0) + // and perform the FFT per w plane + alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start); + fftwgrid = fftw_alloc_complex(alloc_local); + plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); + + long fftwindex = 0; + long fftwindex2D = 0; + for (int iw=0; iw 1) + { + for (int iw=0; iw 1) + { + printf("PSetup time: %f sec\n",setup_time1); + printf("PProcess time: %f sec\n",process_time1); + printf("PKernel time = %f, 
diff --git a/phase_correction.cu b/phase_correction.cu
index e1db329..0135763 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -87,7 +87,7 @@ __global__ void phase_g(int xaxis,
 #endif
 
 void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot,
-		double resolution, double wmin, double wmax, int num_threads)
+		double resolution, double wmin, double wmax, int num_threads, int rank)
 {
 	double dw = (wmax-wmin)/(double)num_w_planes;
 	double wterm = wmin+0.5*dw;
@@ -151,7 +151,7 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 #ifdef ACCOMP
 	#pragma omp target teams distribute parallel for collapse(3) \
 	map(tofrom: image_real[0:xaxis*yaxis], image_imag[0:xaxis*yaxis]) \
-	map (to: gridss[0:2*num_w_planes*xaxis*yaxis]) private(wterm) 
+	map (to: gridss[0:2*num_w_planes*xaxis*yaxis]) private(wterm)
 #else
 	#pragma omp parallel for collapse(3) private(wterm)
 #endif
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index c56b3cb..a57894a 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -95,10 +95,10 @@ int main(int argc, char * argv[])
 	double resolution;
 
 	// MESH SIZE
-	int grid_size_x = 2048;
-	int grid_size_y = 2048;
-	int local_grid_size_x;// = 8;
-	int local_grid_size_y;// = 8;
+	int grid_size_x = 4096;
+	int grid_size_y = 4096;
+	int local_grid_size_x;// = 16;
+	int local_grid_size_y;// = 16;
 	int xaxis;
 	int yaxis;
 	int num_w_planes = 8;
@@ -134,6 +134,14 @@ int main(int argc, char * argv[])
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 	if(rank == 0)printf("Running with %d MPI tasks\n",size);
+	printf("Number of available GPUs %d\n", omp_get_num_devices());
+	printf("Number of Current GPUs %d\n", omp_get_default_device());
+	int node_rank;
+	MPI_Comm NUMA_NODE_Comm;
+	MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &NUMA_NODE_Comm);
+	MPI_Comm_rank( NUMA_NODE_Comm, &node_rank);
+	printf( "Task %d has rank %d in its numa node\n", rank, node_rank );
+
 	#ifdef USE_FFTW
 	fftw_mpi_init();
 	#endif
@@ -163,9 +171,9 @@ int main(int argc, char * argv[])
 	int ndatasets = 1;
 	//strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/");
 	//strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/");
-	//strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
+	strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
//strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/"); - strcpy(datapath_multi[0],"/data/taffoni/LAVORO/IMAGING/data/newgauss2noconj_t201806301100_SBL180.binMS/"); + //strcpy(datapath_multi[0],"/data/taffoni/LAVORO/IMAGING/data/newgauss2noconj_t201806301100_SBL180.binMS/"); strcpy(datapath,datapath_multi[0]); // Read metadata strcpy(filename,datapath); @@ -550,7 +558,8 @@ int main(int argc, char * argv[]) xaxis, yaxis, gridss, - num_threads); + num_threads, + node_rank); clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; @@ -881,7 +890,7 @@ int main(int argc, char * argv[]) double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); - phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y,resolution,wmin,wmax,num_threads); + phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y,resolution,wmin,wmax,num_threads,node_rank); end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); diff --git a/w-stacking.cu b/w-stacking.cu index bc696b4..bc70a15 100644 --- a/w-stacking.cu +++ b/w-stacking.cu @@ -136,7 +136,8 @@ void wstack( int grid_size_x, int grid_size_y, double* grid, - int num_threads) + int num_threads, + int rank) { long i; long index; @@ -232,7 +233,7 @@ map(to:num_points, KernelLen, std, std22, norm, num_w_planes, \ uu[0:num_points], vv[0:num_points], ww[0:num_points], \ vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim], \ grid_size_x, grid_size_y, freq_per_chan, polarizations, dx,dw, w_support, num_threads) \ - map(tofrom: grid[0:gpu_grid_dim]) + map(tofrom: grid[0:gpu_grid_dim]) device(rank) #else #pragma omp parallel for private(visindex) #endif diff --git a/w-stacking.h b/w-stacking.h index 3d3a4c6..862eff6 100644 --- a/w-stacking.h +++ b/w-stacking.h @@ -25,6 +25,7 @@ void wstack( int, int, double*, + int, int); #ifdef __CUDACC__ @@ -47,5 +48,6 @@ void phase_correction( double, double, double, + int, int); #endif -- GitLab
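
PATCH 4 threads a rank argument down to wstack and phase_correction so the ACCOMP target regions can carry a device() clause, and it derives that rank from an MPI_COMM_TYPE_SHARED split of MPI_COMM_WORLD. A self-contained sketch of the intended pattern, one GPU per MPI task selected by intra-node rank, is shown below; the modulo binding policy is an assumption for illustration, not code taken from the patch:

    /* sketch: bind each MPI task to an OpenMP target device chosen by
     * the task's rank inside its node, mirroring the NUMA_NODE_Comm
     * split and the device(rank) clause added by this patch series */
    #include <mpi.h>
    #include <omp.h>
    #include <stdio.h>

    int main(int argc, char *argv[])
    {
        int rank, node_rank, ndev, dev;
        MPI_Comm node_comm;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank,
                            MPI_INFO_NULL, &node_comm);
        MPI_Comm_rank(node_comm, &node_rank);

        /* pick a device for this task; fall back to the default device
         * when no accelerator is visible */
        ndev = omp_get_num_devices();
        dev = (ndev > 0) ? node_rank % ndev : omp_get_default_device();

        double sum = 0.0;
        #pragma omp target teams distribute parallel for reduction(+:sum) device(dev)
        for (long i = 0; i < 1000000; i++)
            sum += (double)i;

        printf("task %d (node rank %d) used device %d, sum = %f\n",
               rank, node_rank, dev, sum);
        MPI_Finalize();
        return 0;
    }
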
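The FFT stage that PATCH 1 wires into the build uses FFTW's MPI interface: fftw_mpi_local_size_2d gives each rank a slab of rows, and one backward 2-D transform is executed per w-plane. A minimal sketch of that call sequence, with illustrative grid sizes matching the 4096x4096 mesh set above:

    /* sketch of the distributed 2-D FFT used per w-plane, following the
     * fftw_mpi calls visible in the gridder; fill-in of the grid data
     * is elided */
    #include <fftw3-mpi.h>
    #include <mpi.h>

    int main(int argc, char *argv[])
    {
        const ptrdiff_t NY = 4096, NX = 4096;
        ptrdiff_t alloc_local, local_n0, local_0_start;
        fftw_complex *fftwgrid;
        fftw_plan plan;

        MPI_Init(&argc, &argv);
        fftw_mpi_init();

        /* each rank owns local_n0 rows starting at row local_0_start */
        alloc_local = fftw_mpi_local_size_2d(NY, NX, MPI_COMM_WORLD,
                                             &local_n0, &local_0_start);
        fftwgrid = fftw_alloc_complex(alloc_local);
        plan = fftw_mpi_plan_dft_2d(NY, NX, fftwgrid, fftwgrid,
                                    MPI_COMM_WORLD, FFTW_BACKWARD,
                                    FFTW_ESTIMATE);

        /* ... copy one w-plane of gridded visibilities into fftwgrid ... */
        fftw_execute(plan);

        fftw_destroy_plan(plan);
        fftw_free(fftwgrid);
        MPI_Finalize();
        return 0;
    }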