Commit 46034dba authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

GPU working OMP

parent 55c56cef
Loading
Loading
Loading
Loading
+29 −13
Original line number Diff line number Diff line
@@ -24,6 +24,9 @@ FFTW_MPI_INC =
FFTW_MPI_LIB = 

CFLAGS += -I./

FLAGS=$(OPTIMIZE)

FFTWLIBS =

# ========================================================
@@ -52,7 +55,7 @@ OPT += -DPHASE_ON
#OPT += -DNVIDIA

#use cuda for GPUs
#OPT += -DCUDACC
OPT += -DCUDACC

# use GPU acceleration via OMP
#OPT += -DACCOMP
@@ -73,12 +76,23 @@ OPT += -DPHASE_ON


DEPS = w-stacking.h  main.c allvars.h

ifneq (ACCOMP,$(findstring ACCOMP, $(OPT)))		 
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
else
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
endif

ifneq (CUDACC,$(findstring CUDACC, $(OPT)))		 
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
else
OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
endif

DEPS_ACC_CUDA = w-stacking.h w-stacking.cu
DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu
OBJ_ACC_CUDA = phase_correction.o w-stacking.o

DEPS_ACC_OMP = w-stacking_omp.h 
DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c
OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o

OBJ_NCCL_REDUCE = gridding_nccl.o
@@ -114,11 +128,12 @@ endif

ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-cuda
LINKER=$(NVCC)
FLAGS=$(NVFLAGS) $(CFLAGS)
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE)
LIBS=$(NVLIB)
compile_cuda: $(OBJ_ACC_CUDA)
	$(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB)
$(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
	$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS)
OBJ += $(OBJ_ACC_CUDA)
endif

ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))
@@ -126,8 +141,9 @@ EXEC_EXT := $(EXEC_EXT)_acc-omp
LINKER=$(NVC)
FLAGS=$(NVFLAGS) $(CFLAGS)
LIBS=$(NVLIB)
compile_accomp: $(OBJ_ACC_OMP)
	$(NVC) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB)
$(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
	$(NVC) $(FLAGS) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_ACC_OMP)
endif

ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT)))
@@ -136,7 +152,7 @@ LINKER=$(NVC++)
FLAGS=$(NVFLAGS) $(CFLAGS)
LIBS=$(NVLIB) $(NVLIB_3)
compile_accreduce: $(OBJ_NCCL_REDUCE)
	$(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3)
	$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
endif

ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
@@ -144,7 +160,7 @@ EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(NVC++)
FLAGS=$(NVFLAGS) $(CFLAGS)
LIBS=$(NVLIB) $(NVLIB_3)
compile_accreduce: $(OBJ_RCCL_REDCUE)
compile_accreduce: $(OBJ_RCCL_REDUCE)
	$(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3)
endif

@@ -152,9 +168,9 @@ endif
###################################################################################

w-stacking: $(OBJ) $(DEPS) Makefile
	$(LINKER) $(FLAGS) $(OPTIMIZE) $(OPT) $(OBJ) -lmpi -o $(EXEC)$(EXEC_EXT) -lmpi $(FFTWLIBS) $(LIBS)
	$(LINKER) $(FLAGS) $(OPT) $(OBJ) -o $(EXEC)$(EXEC_EXT) -lmpi $(FFTWLIBS) $(LIBS)

$(OBJ): $(DEPS) Makefile
#$(OBJ): $(DEPS) Makefile

%.o: %.c $(DEPS)
	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+0 −2
Original line number Diff line number Diff line
@@ -209,9 +209,7 @@ void write_gridded_data()
      fclose(file.pFileimg);
    }
  
 #ifdef USE_MPI
  MPI_Win_fence(0,slabwin);
 #endif
  
 #endif //WRITE_DATA 

+5 −3
Original line number Diff line number Diff line

#include "allvars.h"
#include "proto.h"

@@ -225,7 +224,7 @@ void gridding_data()
	    {
	      
	      int ret = reduce_ring(target_rank);
	      grid    = (double*)Me.fwin.ptr; //Let grid point to the right memory location [GL]
	      //grid    = (double*)Me.fwin.ptr; //Let grid point to the right memory location [GL]
	      
	      if ( ret != 0 )
		{
@@ -241,6 +240,9 @@ void gridding_data()
	  timing_wt.reduce += CPU_TIME_wt - start;
	}

      else
	*grid += *gridss;
	
      // Go to next sector
      memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );

+86 −92
Original line number Diff line number Diff line
@@ -34,9 +34,9 @@ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)

__global__ void convolve_g(
     int num_w_planes,
     long num_points,
     long freq_per_chan,
     long polarizations,
     uint num_points,
     uint freq_per_chan,
     uint polarizations,
     double* uu,
     double* vv,
     double* ww,
@@ -53,11 +53,11 @@ __global__ void convolve_g(

{
  //printf("DENTRO AL KERNEL\n");
        long gid = blockIdx.x*blockDim.x + threadIdx.x;
  uint gid = blockIdx.x*blockDim.x + threadIdx.x;
  if(gid < num_points)
    {
	  long i = gid;
        long visindex = i*freq_per_chan*polarizations;
      uint i = gid;
      uint visindex = i*freq_per_chan*polarizations;
      double norm = std22/PI;

      int j, k;
@@ -79,10 +79,10 @@ __global__ void convolve_g(

      //printf("[A] %g, %g, %g, %g, %g, %d, %d, %d\n", uu[i], vv[i], ww[i], dx, dw, grid_w, grid_u, grid_v);
      // check the boundaries
        long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
        long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
        long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
        long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
      uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
      uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
      uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
      uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
      //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);

      /*
@@ -98,36 +98,29 @@ __global__ void convolve_g(
	  for (j = jmin; j <= jmax; j++)
            {
	      double u_dist = (double)j+0.5 - pos_u;
                long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
	      uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
	      //printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);

		
		if ( iKer >= 2*num_w_planes*grid_size_x*grid_size_y) {
		  printf("=== ALERT [B] iKer out of bound: %ld, %d, %d, %d, %d, %d\n", iKer, j, k, grid_w, grid_size_x, grid_size_y);
		  printf("grid_w = %d, i=%ld, ww_i = %g, dw = %g\n", grid_w, i, ww_i, dw);
		  printf("kmin,kamx: %ld, %ld : %d, %d %d : %g %g\n", kmin, kmax, grid_v, KernelLen, grid_size_y, vv[i], dx );
		  printf("jmin,jamx: %ld, %ld : %d, %d %d : %g %g\n", jmin, jmax, grid_u, KernelLen, grid_size_x, uu[i], dx );
		}
		
	      double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
	      // Loops over frequencies and polarizations
	      double add_term_real = 0.0;
	      double add_term_img = 0.0;
		     long ifine = visindex;
	      uint ifine = visindex;
	      /*
		if ( ifine >= num_points*freq_per_chan*polarizations )
		printf("=== ALERT [C] ifine out of bound: %ld\n", ifine);
	      */
		     
                for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
	      for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
		{
		   long iweight = visindex/freq_per_chan;
		  uint iweight = visindex/freq_per_chan;

		  /*
		    if ( iweight >= num_points*polarizations )
		    printf("=== ALERT [D] ifine out of bound: %ld\n", iweight);
		  */
                   for (long ipol=0; ipol<polarizations; ipol++)
		  for (uint ipol=0; ipol<polarizations; ipol++)
		    {
                      double vistest = (double)vis_real[ifine];
                      if (!isnan(vistest))
@@ -151,9 +144,9 @@ __global__ void convolve_g(
#endif
void wstack(
     int num_w_planes,
     long num_points,
     long freq_per_chan,
     long polarizations,
     uint num_points,
     uint freq_per_chan,
     uint polarizations,
     double* uu,
     double* vv,
     double* ww,
@@ -169,9 +162,9 @@ void wstack(
     int num_threads,
     int rank)
{
    long i;
    long index;
    long visindex;
    uint i;
    uint index;
    uint visindex;

    // initialize the convolution kernel
    // gaussian:
@@ -180,6 +173,7 @@ void wstack(
    double std22 = 1.0/(2.0*std*std);
    double norm = std22/PI;

    printf("Task %d, here!", rank);
    // Loop over visibilities.
// Switch between CUDA and GPU versions
#ifdef __CUDACC__
@@ -191,9 +185,9 @@ void wstack(

    cudaGetDevice( &device_cuda );
    int Nth = NTHREADS;
    long Nbl = (long)(num_points/Nth) + 1;
    uint Nbl = (uint)(num_points/Nth) + 1;
    if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
    long Nvis = num_points*freq_per_chan*polarizations;
    uint Nvis = num_points*freq_per_chan*polarizations;
    printf("Running on GPU %d with %d threads and %d blocks\n",device_cuda,Nth,Nbl);

    // Create GPU arrays and offload them
@@ -268,7 +262,7 @@ void wstack(
#endif

#ifdef ACCOMP
    long Nvis = num_points*freq_per_chan*polarizations;
    uint Nvis = num_points*freq_per_chan*polarizations;
  //  #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
  //  #pragma omp target teams distribute parallel for  map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
@@ -297,10 +291,10 @@ void wstack(
        int grid_v = (int)pos_v;

	// check the boundaries
	long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
	long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
	long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
	long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
	uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
	uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
	uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
	uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
        //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);


@@ -313,19 +307,19 @@ void wstack(
            for (j = jmin; j <= jmax; j++)
            {
                double u_dist = (double)j+0.5 - pos_u;
		long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
		uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);


		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
		// Loops over frequencies and polarizations
		double add_term_real = 0.0;
		double add_term_img = 0.0;
		long ifine = visindex;
		uint ifine = visindex;
		// DAV: the following two loops are performend by each thread separately: no problems of race conditions
		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
		for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
		{
		   long iweight = visindex/freq_per_chan;
	           for (long ipol=0; ipol<polarizations; ipol++)
		   uint iweight = visindex/freq_per_chan;
	           for (uint ipol=0; ipol<polarizations; ipol++)
	           {
                      if (!isnan(vis_real[ifine]))
                      {
+6 −6
Original line number Diff line number Diff line
@@ -13,9 +13,9 @@ extern "C"
extern "C" {
void wstack(
     int,
     long,
     long,
     long,
     unsigned int,
     unsigned int,
     unsigned int,
     double*,
     double*,
     double*,
@@ -34,9 +34,9 @@ void wstack(
#else
void wstack(
     int,
     long,
     long,
     long,
     unsigned int,
     unsigned int,
     unsigned int,
     double*,
     double*,
     double*,
Loading