Commit 668da3ac authored by Giuliano Taffoni's avatar Giuliano Taffoni
Browse files

New implementation compatible with gcc and GPU info

parent fa112e57
Loading
Loading
Loading
Loading
+16 −14
Original line number Diff line number Diff line
@@ -4,29 +4,31 @@ CXX = g++-10
MPICC    =  mpicc
MPIC++   =  mpiCC

OPTIMIZE =  -O3
#-ffast-math  -fopt-info-all-omp -fcf-protection=none -fno-stack-protector -foffload=nvptx-none


GSL_INCL =  -I/home/taffoni/sw/include #-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
GSL_LIBS =  -L/home/taffoni/sw/lib #-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
GSL_INCL =  -I/home/taffoni/sw/include
GSL_LIBS =  -L/home/taffoni/sw/lib

FFTW_INCL=  -I/home/taffoni/sw/include
FFTW_LIB=  -L/home/taffoni/sw/lib   -lfftw3_mpi -lfftw3

#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
MPI_LIB =
MPI_INCL=
#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
HDF5_INCL =
HDF5_LIB  =

OMP = -fopenmp

OMP = -mp=multicore,gpu -Mprof -cuda
#OMP = -fopenmp
NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11
NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda

CFLAGS += $(OPTIMIZE)
CFLAGS += -I.
CFLAGS += -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
CFLAGS += $(FFTW_INCL) $(GSL_INCL)
CFLAGS += $(FFTW_LIB) -lm

CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)

OPTIMIZE =  $(OMP) -O3

# OMP GPU SPECIFIC FLAGS
#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math
#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35
#-ffast-math  -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
+5 −5
Original line number Diff line number Diff line
@@ -5,16 +5,16 @@ MPICC = mpicc
MPIC++   =  mpiCC


CFLAGS += -O3 -mcpu=native
CFLAGS += -I.
FFTW_INCL=  -I/home/taffoni/sw/include
FFTW_LIB=  -L/home/taffoni/sw/lib

LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm

NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda

OMP= -fopenmp

CFLAGS += -O3 -mtune=native
CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)

OPTIMIZE = $(OMP) -O3 -mtune=native
+23 −13
Original line number Diff line number Diff line
@@ -14,21 +14,22 @@ include Build/Makefile.systype
endif




LIBS = $(FFTW_LIB) -lfftw3 -lm -lcudart  -lcuda

# create MPI code
OPT += -DUSE_MPI

#OPT += -DACCOMP
# use FFTW (it can be switched on ONLY if MPI is active)
ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
   OPT += -DUSE_FFTW
	 LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
endif

#OPT += -DNVIDIA
# perform one-side communication (suggested) instead of reduce (only if MPI is active)
OPT += -DONE_SIDE
# write the full 3D cube of gridded visibilities and its FFT transform
#OPT += -DWRITE_DATA
OPT += -DWRITE_DATA
# write the final image
OPT += -DWRITE_IMAGE
# perform w-stacking phase correction
@@ -46,27 +47,36 @@ phase_correction.c: phase_correction.cu

ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
%.o: %.c $(DEPS)
	$(MPICC)  -c -o $@ $< $(CFLAGS) $(OPT)
	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
else
%.o: %.c $(DEPS)
	$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
	$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
endif

serial: $(COBJ)
	$(CC)  -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial  $^ $(LIBS)

serial_omp: phase_correction.c
	$(CC)  $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c  w-stacking_omp.c    $(CFLAGS) $(LIBS)

simple_mpi: phase_correction.c
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c  $(CFLAGS) $(LIBS)

mpi_omp: phase_correction.c
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c  $(CFLAGS) $(LIBS)

serial_cuda:
	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
	$(CC)  $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
	$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm

mpi: $(COBJ)
	$(MPICC) -o w-stackingCfftw   $^ $(CFLAGS)
	$(MPICC) $(OPTIMIZE) -o w-stackingCfftw   $^  $(CFLAGS) $(LIBS)

mpi_cuda:
	$(NVCC)   $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
	$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
	$(MPIC++)  $(OPT)   -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS)
	$(MPICC)  $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
	$(MPIC++) $(OPTIMIZE) $(OPT)   -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS)

clean:
	rm *.o
+16 −8
Original line number Diff line number Diff line
@@ -18,7 +18,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
	for (int iw=0; iw<num_w_planes; iw++)
	{
      double wterm = (double)iw/dnum_w_planes;

#ifdef ACCOMP
			#pragma omp target teams distribute parallel for \
			map(tofrom: image_real[0:xaxis*yaxis], image_imag[0:xaxis*yaxis])	\
			map (to: gridss[0:2*num_w_planes*xaxis*yaxis])
#endif
	    for (int iv=0; iv<yaxis; iv++)
            for (int iu=0; iu<xaxis; iu++)
            {
@@ -53,10 +57,14 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
		    s = pimag;

		    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
				#pragma omp atomic
		    image_real[img_index] += p*r-q*s;
				#pragma omp atomic
		    image_imag[img_index] += p*s+q*r;
#else
#pragma omp atomic
		    image_real[img_index] += gridss[index];
#pragma omp atomic
		    image_imag[img_index] += gridss[index+1];
#endif

+212 −163
Original line number Diff line number Diff line
@@ -7,9 +7,15 @@
#include <fftw3-mpi.h>
#endif
#endif
#include <omp.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#ifdef ACCOMP
#include "w-stacking_omp.h"
#else
#include "w-stacking.h"
#endif
#define PI 3.14159265359
#define NUM_OF_SECTORS -1
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
@@ -100,6 +106,8 @@ int main(int argc, char * argv[])
	int xaxis;
	int yaxis;
	int num_w_planes = 1;


	// DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization
	int w_support = 7;
	int num_threads;// = 4;
@@ -115,11 +123,23 @@ int main(int argc, char * argv[])
	struct timespec begin, finish, begin0, begink, finishk;
	double elapsed;
	long nsectors;
  /* GT get nymber of threads exit if not given */
  if(argc == 1) {
    fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
    exit(1);
  }
  // Set the number of OpenMP threads
  num_threads = atoi(argv[1]);

  if ( num_threads == 0 )
  {
    fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]);
    fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
    exit(1);
  }

	clock_gettime(CLOCK_MONOTONIC, &begin0);
	start0 = clock();
        // Set the number of OpenMP threads
        num_threads = atoi(argv[1]);

	// Intialize MPI environment
#ifdef USE_MPI
@@ -134,8 +154,22 @@ int main(int argc, char * argv[])
	rank = 0;
	size = 1;
#endif

  if(rank == 0)printf("Running with %d threads\n",num_threads);

#ifdef ACCOMP
if(rank == 0){
  if (0 == omp_get_num_devices()) {
      printf("No accelerator found ... exit\n");
      exit(255);
   }
   printf("Number of available GPUs %d\n", omp_get_num_devices());
   #ifdef NVIDIA
      prtAccelInfo();
   #endif
 }
#endif

	// set the local size of the image
	local_grid_size_x = grid_size_x;
	nsectors = NUM_OF_SECTORS;
@@ -211,6 +245,7 @@ int main(int argc, char * argv[])
	printf("N. visibilities on %d %ld\n",rank,Nvis);
#endif


	// DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used
	// all the sizes are rescaled by the number of MPI tasks
	// Allocate arrays
@@ -260,13 +295,13 @@ int main(int argc, char * argv[])
	setup_time1 = (finish.tv_sec - begin.tv_sec);
	setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;


  if(rank == 0)printf("GRIDDING DATA\n");

	// Create histograms and linked lists
  clock_gettime(CLOCK_MONOTONIC, &begin);
  start = clock();

        //CLAAA
	// Initialize linked list
	struct sectorlist ** sectorhead;
	sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist));
@@ -335,20 +370,18 @@ int main(int argc, char * argv[])
#ifndef USE_MPI
  double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
#endif

  double shift = (double)(dx*yaxis);

  // Open the MPI Memory Window for the slab
#ifdef USE_MPI
  MPI_Win slabwin;
  MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
  MPI_Win_fence(0,slabwin);
#endif

#ifndef USE_MPI
  pFile1 = fopen (outfile1,"w");
#endif


	// loop over files
	//
	kernel_time = 0.0;
@@ -357,9 +390,9 @@ int main(int argc, char * argv[])
	reduce_time1 = 0.0;
	compose_time = 0.0;
	compose_time1 = 0.0;

	for (int ifiles=0; ifiles<ndatasets; ifiles++)
	{

	strcpy(filename,datapath_multi[ifiles]);
  printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
  strcat(filename,weightsfile);
@@ -368,20 +401,21 @@ int main(int argc, char * argv[])
  fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET);
  fread(weights,(Nweights)*sizeof(float),1,pFile);
  fclose(pFile);

  strcpy(filename,datapath);
  strcat(filename,visrealfile);
        //printf("Reading %s\n",filename);
#ifdef VERBOSE
  printf("Reading %s\n",filename);
#endif

  pFile = fopen (filename,"rb");
  fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
  fread(visreal,Nvis*sizeof(float),1,pFile);
  fclose(pFile);

  strcpy(filename,datapath);
  strcat(filename,visimgfile);
        //printf("Reading %s\n",filename);

#ifdef VERBOSE
  printf("Reading %s\n",filename);
#endif
  pFile = fopen (filename,"rb");
  fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
  fread(visimg,Nvis*sizeof(float),1,pFile);
@@ -390,7 +424,6 @@ int main(int argc, char * argv[])
#ifdef USE_MPI
  MPI_Barrier(MPI_COMM_WORLD);
#endif

  // Declare temporary arrays for the masking
  double * uus;
  double * vvs;
@@ -398,8 +431,8 @@ int main(int argc, char * argv[])
  float * visreals;
  float * visimgs;
  float * weightss;

	long isector;

  for (long isector_count=0; isector_count<nsectors; isector_count++)
      {
        clock_gettime(CLOCK_MONOTONIC, &begink);
@@ -425,13 +458,15 @@ int main(int argc, char * argv[])
//CLAAAA
	       struct sectorlist * current;
	       current = sectorhead[isector];



	       while (current->index != -1)
          {
             long ilocal = current->index;
      	     //double vvh = vv[ilocal];
             //int binphi = (int)(vvh*nsectors);
	     //if (binphi == isector || boundary[ilocal] == isector)
	     //{
	           //if (binphi == isector || boundary[ilocal] == isector) {
	           uus[icount] = uu[ilocal];
	           vvs[icount] = vv[ilocal]-isector*shift;
	           wws[icount] = ww[ilocal];
@@ -457,6 +492,7 @@ int main(int argc, char * argv[])
	  compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
	  compose_time1 += (finishk.tv_sec - begink.tv_sec);
	  compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;

    #ifndef USE_MPI
	  double uumin = 1e20;
	  double vvmin = 1e20;
@@ -483,6 +519,7 @@ int main(int argc, char * argv[])
	  #endif
          clock_gettime(CLOCK_MONOTONIC, &begink);
          startk = clock();

          wstack(num_w_planes,
               Nsec,
               freq_per_chan,
@@ -500,6 +537,18 @@ int main(int argc, char * argv[])
	             yaxis,
               gridss,
               num_threads);


/* int z =0 ;
#pragma omp target map(to:test_i_gpu) map(from:z)
{
  int x; // only accessible from accelerator
  x = 2;
  z = x + test_i_gpu;
}*/



	  clock_gettime(CLOCK_MONOTONIC, &finishk);
	  endk = clock();
	  kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
Loading