diff --git a/.gitignore b/.gitignore
index 797747aa16a2fb9000f5268a91f10e289e07f077..9cce9de16ae69e4796589cd888488c2cc1e6d877 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+.DS_Store
+sync.sh
 *.o
 phase_correction.c
 w-stacking.c
diff --git a/Build/Makefile.Macosx b/Build/Makefile.Macosx
new file mode 100644
index 0000000000000000000000000000000000000000..cdb0e1306db96f407e23419fd4c988928ebbe68e
--- /dev/null
+++ b/Build/Makefile.Macosx
@@ -0,0 +1,26 @@
+CC = icc
+CXX = g++-11
+# MPI compiler wrappers
+MPICC = mpicc
+MPIC++ = mpiCC
+# FFTW header and library search paths
+FFTW_INCL = -I/usr/local/include
+FFTW_LIB = -L/usr/local/lib/
+
+GSL_INCL =
+GSL_LIBS =
+# NOTE(review): MPI_INCL is a Linux-style path in the macOS settings - confirm it exists here
+MPI_LIB =
+MPI_INCL = -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
+HDF5_INCL =
+HDF5_LIB =
+
+OMP = -fopenmp
+# CUDA compiler, its flags and the CUDA libraries to link
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+NVLIB = -L/home/taffoni -lcudart -lcuda
+
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3
diff --git a/Build/Makefile.Magellanus b/Build/Makefile.Magellanus
new file mode 100644
index 0000000000000000000000000000000000000000..c2858994653726db34f7dcd7ae0451ebc7553bf7
--- /dev/null
+++ b/Build/Makefile.Magellanus
@@ -0,0 +1,34 @@
+CC       =  nvc
+CXX      =  nvc++
+# Keep the blank before '=' below: "MPIC++=" would be parsed as an append (+=) to MPIC+
+MPICC    =  mpicc
+MPIC++   =  mpiCC
+
+GSL_INCL =  -I/home/taffoni/sw/include
+GSL_LIBS =  -L/home/taffoni/sw/lib
+
+FFTW_INCL=  -I/home/taffoni/sw/include
+FFTW_LIB=  -L/home/taffoni/sw/lib   -lfftw3_mpi -lfftw3
+
+#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
+MPI_LIB =
+#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
+MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
+HDF5_INCL =
+HDF5_LIB  =
+
+OMP = -mp=multicore,gpu -Mprof -cuda
+#OMP = -fopenmp
+NVCC = nvcc
+# No bare -Xcompiler here: it would swallow -std=c++11 as its argument instead of giving it to nvcc
+NVFLAGS = -arch=sm_70 -std=c++11
+NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda
+
+CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE =  $(OMP) -O3
+
+# OMP GPU SPECIFIC FLAGS
+#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math
+#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35
+#-ffast-math  -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
diff --git a/Build/Makefile.Marconi b/Build/Makefile.Marconi
new file mode 100644
index 0000000000000000000000000000000000000000..4fec93afd7851aad0ebd3e85f449c6c234a7376d
--- /dev/null
+++ b/Build/Makefile.Marconi
@@ -0,0 +1,20 @@
+CC = gcc
+CXX = g++
+# MPI compiler wrappers
+MPICC = mpicc
+MPIC++ = mpiCC
+
+# FFTW header and library search paths
+FFTW_INCL = -I/home/taffoni/sw/include
+FFTW_LIB = -L/home/taffoni/sw/lib
+
+# CUDA compiler, its flags and the CUDA libraries to link
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
+
+OMP = -fopenmp
+
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3 -mtune=native
diff --git a/Build/Makefile.systype b/Build/Makefile.systype
new file mode 100644
index 0000000000000000000000000000000000000000..04922322304924d1c8a5397ede089516d9a1d256
--- /dev/null
+++ b/Build/Makefile.systype
@@ -0,0 +1,24 @@
+CC       =  gcc-10
+CXX      =  g++-10
+# Keep the blank before '=' below: "MPIC++=" would be parsed as an append (+=) to MPIC+
+MPICC    =  mpicc
+MPIC++   =  mpiCC
+# OMP is also expanded directly by the top-level link rules, so define it even when empty
+OMP      =
+OPTIMIZE =  $(OMP)
+
+GSL_INCL =
+GSL_LIBS =
+
+FFTW_INCL=
+FFTW_LIB=
+
+NVCC =
+NVFLAGS =
+NVLIB =
+
+MPI_INCL =
+MPICHLIB =
+HDF5_INCL =
+HDF5_LIB  =
+CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
diff --git a/Makefile b/Makefile
index 99d13891512a600974a7a33c5eb43683eaf22e3f..a5635850c6aa354a28b5ceb1bc4fd90836102dfa 100644
--- a/Makefile
+++ b/Makefile
@@ -1,34 +1,40 @@
 # comment/uncomment the various options depending hoe you want to build the program
+# Set default values for compiler options if no systype options are given or found
+CC        = mpiCC
+CXX       = mpiCC
+OPTIMIZE  = -std=c++11 -Wall -g -O2
+MPICHLIB  = -lmpich
+SWITCHES =
+
+ifdef SYSTYPE
+SYSTYPE := $(SYSTYPE)
+include Build/Makefile.$(SYSTYPE)
+else
+include Build/Makefile.systype
+endif
+
+
+LIBS = $(FFTW_LIB) -lfftw3 -lm -lcudart  -lcuda
+
 # create MPI code
 OPT += -DUSE_MPI
+OPT += -DACCOMP
 # use FFTW (it can be switched on ONLY if MPI is active)
-OPT += -DUSE_FFTW
+ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
+   OPT += -DUSE_FFTW
+   LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm -lcudart -lcuda
+endif # USE_MPI
+
+OPT += -DNVIDIA
 # perform one-side communication (suggested) instead of reduce (only if MPI is active)
 OPT += -DONE_SIDE
 # write the full 3D cube of gridded visibilities and its FFT transform
-#OPT += -DWRITE_DATA
+OPT += -DWRITE_DATA
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
-#OPT += -DPHASE_ON
-
-CC = gcc
-CXX = g++
-ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
-  CC = mpicc
-  CXX = mpiCC 
-endif
-
-OMP = -fopenmp 
-#OMP = 
+#OPT += -DPHASE_ON
 
-CFLAGS += -O3 -mcpu=native
-CFLAGS += -I.
-LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
-
-NVCC = nvcc
-NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
-NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
 
 DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
 COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
@@ -39,8 +45,13 @@ w-stacking.c: w-stacking.cu
 phase_correction.c: phase_correction.cu
 	cp phase_correction.cu phase_correction.c
 
+ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
 %.o: %.c $(DEPS)
-	$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
+	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+else
+%.o: %.c $(DEPS)
+	$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+endif
 
 serial: $(COBJ)
 	$(CC) $(OMP) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
@@ -50,16 +61,15 @@ serial_cuda:
 	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
 	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
 
-mpi: $(COBJ) 
-	$(CC) $(OMP) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)
+mpi: $(COBJ)
+	$(MPICC) $(OMP) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)
 
 mpi_cuda:
 	$(NVCC) $(NVFLAGS) $(OPT) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(LIBS) -lm
+	$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
+	$(MPIC++) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(LIBS) -lm
 
 clean:
 	rm *.o
 	rm w-stacking.c
 	rm phase_correction.c
-
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 0eda87e9b2dcef70714081129aff0fbeb468fae0..50983d12694a069af5fc6ca800a7fb840d33e37e 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -27,16 +27,16 @@ void Push(struct sectorlist** headRef, long data) {
      struct sectorlist* newNode = malloc(sizeof(struct sectorlist));
      newNode->index = data;
      newNode->next = *headRef;
-     *headRef = newNode; 
+     *headRef = newNode;
 }
 
 // Main Code
-int main(int argc, char * argv[]) 
+int main(int argc, char * argv[])
 {
 	int rank;
 	int size;
 
-	FILE * pFile; 
+	FILE * pFile;
 	FILE * pFile1;
 	FILE * pFilereal;
 	FILE * pFileimg;
@@ -50,7 +50,7 @@ int main(int argc, char * argv[])
 	//
 	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/";
 	//char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/";
-	char datapath[900]; 
+	char datapath[900];
 	char datapath_multi[NFILES][900];
 
 	char ufile[30] = "ucoord.bin";
@@ -117,7 +117,11 @@ int main(int argc, char * argv[])
 	struct timespec begin, finish, begin0, begink, finishk;
 	double elapsed;
 	long nsectors;
-
+  /* GT: get number of OMP threads from the command line; exit if not given */
+    if(argc == 1) {
+      fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+      exit(1);
+    }
 	clock_gettime(CLOCK_MONOTONIC, &begin0);
 	start0 = clock();
         // Set the number of OpenMP threads
@@ -144,7 +148,7 @@ int main(int argc, char * argv[])
 	if (nsectors < 0) nsectors = size;
 	local_grid_size_y = grid_size_y/nsectors;
 	//nsectors = size;
-	
+
 	// LOCAL grid size
 	xaxis = local_grid_size_x;
 	yaxis = local_grid_size_y;
@@ -156,14 +160,15 @@ int main(int argc, char * argv[])
 	int ndatasets = 1;
         //strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/");
         //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/");
-        strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
+  strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
         //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/");
 
 	strcpy(datapath,datapath_multi[0]);
 	// Read metadata
 	strcpy(filename,datapath);
 	strcat(filename,metafile);
-	pFile = fopen (filename,"r");
+  // GT: read the metadata only if the file could be opened; otherwise exit (else branch)
+	if ((pFile = fopen (filename,"r")) != NULL) {
         fscanf(pFile,"%ld",&Nmeasures);
         fscanf(pFile,"%ld",&Nvis);
         fscanf(pFile,"%ld",&freq_per_chan);
@@ -176,7 +181,14 @@ int main(int argc, char * argv[])
         fscanf(pFile,"%lf",&uvmax);
         fscanf(pFile,"%lf",&wmin);
         fscanf(pFile,"%lf",&wmax);
-	fclose(pFile);
+	       fclose(pFile);
+       } else {
+         fprintf(stderr, "Input file does not exist: %s\n", filename);
+         #ifdef USE_MPI
+         MPI_Finalize();
+         #endif
+         exit(1); /* not 256: only the low 8 bits of the status are reported, so 256 reads as success */
+       }
 
 
 	// WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -199,7 +211,7 @@ int main(int argc, char * argv[])
         // Set temporary local size of points
 	long nm_pe = (long)(Nmeasures/size);
 	long remaining = Nmeasures%size;
-        
+
         long startrow = rank*nm_pe;
         if (rank == size-1)nm_pe = nm_pe+remaining;
 
@@ -268,7 +280,7 @@ int main(int argc, char * argv[])
 	// Create histograms and linked lists
         clock_gettime(CLOCK_MONOTONIC, &begin);
         start = clock();
- 
+
         //CLAAA
 	// Initialize linked list
 	struct sectorlist ** sectorhead;
@@ -438,7 +450,7 @@ int main(int argc, char * argv[])
           // define local destination sector
           //isector = (isector_count+rank)%size;
           isector = isector_count;
-	  // allocate sector arrays     
+	  // allocate sector arrays
           long Nsec = histo_send[isector];
 	  uus = (double*) malloc(Nsec*sizeof(double));
 	  vvs = (double*) malloc(Nsec*sizeof(double));
@@ -501,7 +513,7 @@ int main(int argc, char * argv[])
 	       vvmin = MIN(vvmin,vvs[ipart]);
 	       vvmax = MAX(vvmax,vvs[ipart]);
 
-		  
+
                if(ipart%10 == 0)fprintf (pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
           }
 
@@ -556,9 +568,9 @@ int main(int argc, char * argv[])
 	  #ifdef ONE_SIDE
 	  printf("One Side communication active\n");
 	  MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
-	  MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); 
+	  MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
 	  MPI_Win_unlock(target_rank,slabwin);
-	  //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin); 	   
+	  //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
           #else
           MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
           #endif //ONE_SIDE
@@ -633,7 +645,7 @@ int main(int argc, char * argv[])
         process_time1 = (finish.tv_sec - begin.tv_sec);
         process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
         clock_gettime(CLOCK_MONOTONIC, &begin);
-         
+
 
 #ifdef WRITE_DATA
 	// Write results
@@ -712,7 +724,7 @@ int main(int argc, char * argv[])
 
 #ifdef USE_FFTW
 	// FFT transform the data (using distributed FFTW)
-	
+
 	if(rank == 0)printf("PERFORMING FFT\n");
         clock_gettime(CLOCK_MONOTONIC, &begin);
         start = clock();
@@ -727,13 +739,13 @@ int main(int argc, char * argv[])
 	// and perform the FFT per w plane
 	alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start);
 	fftwgrid = fftw_alloc_complex(alloc_local);
-	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); 
+	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
 
 	long fftwindex = 0;
 	long fftwindex2D = 0;
 	for (int iw=0; iw<num_w_planes; iw++)
         {
-            //printf("FFTing plan %d\n",iw);	    
+            //printf("FFTing plan %d\n",iw);
             // select the w-plane to transform
             for (int iv=0; iv<yaxis; iv++)
             {
@@ -814,7 +826,7 @@ int main(int argc, char * argv[])
                 for (int iw=0; iw<num_w_planes; iw++)
                 for (int iv=0; iv<yaxis; iv++)
                 for (int iu=0; iu<xaxis; iu++)
-                {         
+                {
                           long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
                           long index = iu + iv*xaxis + iw*xaxis*yaxis;
                           fseek(pFileimg, global_index, SEEK_SET);
@@ -825,7 +837,7 @@ int main(int argc, char * argv[])
                           fwrite(gridss_img, size_of_grid/2, sizeof(double), pFileimg);
               }
 
-	      
+
           }
           #else
           /*
@@ -851,15 +863,15 @@ int main(int argc, char * argv[])
 	MPI_Barrier(MPI_COMM_WORLD);
         #endif
 #endif //WRITE_DATA
- 
+
 	fftw_free(fftwgrid);
 
 	// Phase correction
         clock_gettime(CLOCK_MONOTONIC, &begin);
         start = clock();
 	if(rank == 0)printf("PHASE CORRECTION\n");
-        double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));	
-        double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));	
+        double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));
+        double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));
 
         phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y,resolution,wmin,wmax,num_threads);
 
@@ -949,9 +961,9 @@ int main(int argc, char * argv[])
 	   fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,tot_time);
 	 } else {
 	   fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,tot_time1);
-	 }  
+	 }
 	 fclose(pFile);
-	} 
+	}
 
 	// Close MPI environment
 	#ifdef USE_MPI