Commit 9bf1af25 authored by Claudio Gheller's avatar Claudio Gheller
Browse files

first commit

parent c103410e
Loading
Loading
Loading
Loading

Makefile

0 → 100644
+47 −0
Original line number Diff line number Diff line
OPT += -DUSE_MPI
OPT += -DUSE_FFTW
OPT += -DONE_SIDE
OPT += -DWRITE_DATA

CC = gcc
CXX = g++
MPICC = mpicc
MPICXX =mpiCC 

CFLAGS += -O3 -mcpu=native
CFLAGS += -I.
LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm

NVCC = nvcc
NVFLAGS = -arch=sm_70 -c w-stacking.cu -Xcompiler -mno-float128 -std=c++11
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda

DEPS = w-stacking.h
COBJ = w-stacking.o w-stacking-fftw.o

w-stacking.c:
	cp w-stacking.cu w-stacking.c

%.o: %.c $(DEPS)
	$(CC) -c -o $@ $< $(CFLAGS) $(OPT)

serial: $(COBJ)
	$(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm

serial_cuda:
	$(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB)
	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o $(NVLIB) -lm

mpi: $(COBJ)
	$(MPICC) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)

mpi_cuda:
	$(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB)
	$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
	$(MPICXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o $(NVLIB) $(LIBS) -lm

clean:
	rm *.o
	rm w-stacking.c

peano.c

0 → 100644
+130 −0
Original line number Diff line number Diff line
#include "peano.h"

//typedef unsigned long long peanokey;

/*  The following rewrite of the original function
 *  peano_hilbert_key_old() has been written by MARTIN REINECKE.
 *  It is about a factor 2.3 - 2.5 faster than Volker's old routine!
 */
const unsigned char rottable3[48][8] = {
  {36, 28, 25, 27, 10, 10, 25, 27},
  {29, 11, 24, 24, 37, 11, 26, 26},
  {8, 8, 25, 27, 30, 38, 25, 27},
  {9, 39, 24, 24, 9, 31, 26, 26},
  {40, 24, 44, 32, 40, 6, 44, 6},
  {25, 7, 33, 7, 41, 41, 45, 45},
  {4, 42, 4, 46, 26, 42, 34, 46},
  {43, 43, 47, 47, 5, 27, 5, 35},
  {33, 35, 36, 28, 33, 35, 2, 2},
  {32, 32, 29, 3, 34, 34, 37, 3},
  {33, 35, 0, 0, 33, 35, 30, 38},
  {32, 32, 1, 39, 34, 34, 1, 31},
  {24, 42, 32, 46, 14, 42, 14, 46},
  {43, 43, 47, 47, 25, 15, 33, 15},
  {40, 12, 44, 12, 40, 26, 44, 34},
  {13, 27, 13, 35, 41, 41, 45, 45},
  {28, 41, 28, 22, 38, 43, 38, 22},
  {42, 40, 23, 23, 29, 39, 29, 39},
  {41, 36, 20, 36, 43, 30, 20, 30},
  {37, 31, 37, 31, 42, 40, 21, 21},
  {28, 18, 28, 45, 38, 18, 38, 47},
  {19, 19, 46, 44, 29, 39, 29, 39},
  {16, 36, 45, 36, 16, 30, 47, 30},
  {37, 31, 37, 31, 17, 17, 46, 44},
  {12, 4, 1, 3, 34, 34, 1, 3},
  {5, 35, 0, 0, 13, 35, 2, 2},
  {32, 32, 1, 3, 6, 14, 1, 3},
  {33, 15, 0, 0, 33, 7, 2, 2},
  {16, 0, 20, 8, 16, 30, 20, 30},
  {1, 31, 9, 31, 17, 17, 21, 21},
  {28, 18, 28, 22, 2, 18, 10, 22},
  {19, 19, 23, 23, 29, 3, 29, 11},
  {9, 11, 12, 4, 9, 11, 26, 26},
  {8, 8, 5, 27, 10, 10, 13, 27},
  {9, 11, 24, 24, 9, 11, 6, 14},
  {8, 8, 25, 15, 10, 10, 25, 7},
  {0, 18, 8, 22, 38, 18, 38, 22},
  {19, 19, 23, 23, 1, 39, 9, 39},
  {16, 36, 20, 36, 16, 2, 20, 10},
  {37, 3, 37, 11, 17, 17, 21, 21},
  {4, 17, 4, 46, 14, 19, 14, 46},
  {18, 16, 47, 47, 5, 15, 5, 15},
  {17, 12, 44, 12, 19, 6, 44, 6},
  {13, 7, 13, 7, 18, 16, 45, 45},
  {4, 42, 4, 21, 14, 42, 14, 23},
  {43, 43, 22, 20, 5, 15, 5, 15},
  {40, 12, 21, 12, 40, 6, 23, 6},
  {13, 7, 13, 7, 41, 41, 22, 20}
};

const unsigned char subpix3[48][8] = {
  {0, 7, 1, 6, 3, 4, 2, 5},
  {7, 4, 6, 5, 0, 3, 1, 2},
  {4, 3, 5, 2, 7, 0, 6, 1},
  {3, 0, 2, 1, 4, 7, 5, 6},
  {1, 0, 6, 7, 2, 3, 5, 4},
  {0, 3, 7, 4, 1, 2, 6, 5},
  {3, 2, 4, 5, 0, 1, 7, 6},
  {2, 1, 5, 6, 3, 0, 4, 7},
  {6, 1, 7, 0, 5, 2, 4, 3},
  {1, 2, 0, 3, 6, 5, 7, 4},
  {2, 5, 3, 4, 1, 6, 0, 7},
  {5, 6, 4, 7, 2, 1, 3, 0},
  {7, 6, 0, 1, 4, 5, 3, 2},
  {6, 5, 1, 2, 7, 4, 0, 3},
  {5, 4, 2, 3, 6, 7, 1, 0},
  {4, 7, 3, 0, 5, 6, 2, 1},
  {6, 7, 5, 4, 1, 0, 2, 3},
  {7, 0, 4, 3, 6, 1, 5, 2},
  {0, 1, 3, 2, 7, 6, 4, 5},
  {1, 6, 2, 5, 0, 7, 3, 4},
  {2, 3, 1, 0, 5, 4, 6, 7},
  {3, 4, 0, 7, 2, 5, 1, 6},
  {4, 5, 7, 6, 3, 2, 0, 1},
  {5, 2, 6, 1, 4, 3, 7, 0},
  {7, 0, 6, 1, 4, 3, 5, 2},
  {0, 3, 1, 2, 7, 4, 6, 5},
  {3, 4, 2, 5, 0, 7, 1, 6},
  {4, 7, 5, 6, 3, 0, 2, 1},
  {6, 7, 1, 0, 5, 4, 2, 3},
  {7, 4, 0, 3, 6, 5, 1, 2},
  {4, 5, 3, 2, 7, 6, 0, 1},
  {5, 6, 2, 1, 4, 7, 3, 0},
  {1, 6, 0, 7, 2, 5, 3, 4},
  {6, 5, 7, 4, 1, 2, 0, 3},
  {5, 2, 4, 3, 6, 1, 7, 0},
  {2, 1, 3, 0, 5, 6, 4, 7},
  {0, 1, 7, 6, 3, 2, 4, 5},
  {1, 2, 6, 5, 0, 3, 7, 4},
  {2, 3, 5, 4, 1, 0, 6, 7},
  {3, 0, 4, 7, 2, 1, 5, 6},
  {1, 0, 2, 3, 6, 7, 5, 4},
  {0, 7, 3, 4, 1, 6, 2, 5},
  {7, 6, 4, 5, 0, 1, 3, 2},
  {6, 1, 5, 2, 7, 0, 4, 3},
  {5, 4, 6, 7, 2, 3, 1, 0},
  {4, 3, 7, 0, 5, 2, 6, 1},
  {3, 2, 0, 1, 4, 5, 7, 6},
  {2, 5, 1, 6, 3, 4, 0, 7}
};

/*! This function computes a Peano-Hilbert key for an integer triplet (x,y,z),
  *  with x,y,z in the range between 0 and 2^bits-1.
  */
peanokey peano_hilbert_key(int x, int y, int z, int bits)
{
  int mask;
  unsigned char rotation = 0;
  peanokey key = 0;

  for(mask = 1 << (bits - 1); mask > 0; mask >>= 1)
    {
      unsigned char pix = ((x & mask) ? 4 : 0) | ((y & mask) ? 2 : 0) | ((z & mask) ? 1 : 0);

      key <<= 3;
      key |= subpix3[rotation][pix];
      rotation = rottable3[rotation][pix];
    }

  return key;
}

peano.h

0 → 100644
+3 −0
Original line number Diff line number Diff line
typedef unsigned long long peanokey;

peanokey peano_hilbert_key(int, int, int, int);

w-stacking-fftw.c

0 → 100644
+860 −0

File added.

Preview size limit exceeded, changes collapsed.

w-stacking.cu

0 → 100644
+306 −0
Original line number Diff line number Diff line
#ifdef _OPENMP
#include <omp.h>
#endif
#include "w-stacking.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define NWORKERS -1    //100
#define NTHREADS 32

#ifdef __CUDACC__
double __device__
#else
double
#endif
gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
     double conv_weight;
     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
     return conv_weight;
}

#ifdef __CUDACC__
//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
//{
//     double conv_weight;
//     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
//     return conv_weight;
//}

__global__ void convolve_g(
     int num_w_planes,
     long num_points,
     long freq_per_chan,
     long polarizations,
     double* uu,
     double* vv,
     double* ww,
     float* vis_real,
     float* vis_img,
     float* weight,
     double dx,
     double dw,
     int KernelLen,
     int grid_size_x,
     int grid_size_y,
     double* grid,
     double std22)

{
	//printf("DENTRO AL KERNEL\n");
        long gid = blockIdx.x*blockDim.x + threadIdx.x;
	if(gid < num_points)
	{
	long i = gid;
        long visindex = i*freq_per_chan*polarizations;
	double norm = std22/PI;

        int j, k;

        /* Convert UV coordinates to grid coordinates. */
        double pos_u = uu[i] / dx;
        double pos_v = vv[i] / dx;
        double ww_i  = ww[i] / dw;
        int grid_w = (int)ww_i;
        int grid_u = (int)pos_u;
        int grid_v = (int)pos_v;

        // check the boundaries
        long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
        long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
        long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
        long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
        //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);


        // Convolve this point onto the grid.
        for (k = kmin; k <= kmax; k++)
        {

            double v_dist = (double)k+0.5 - pos_v;

            for (j = jmin; j <= jmax; j++)
            {
                double u_dist = (double)j+0.5 - pos_u;
                long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
		//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);

                double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
                // Loops over frequencies and polarizations
                double add_term_real = 0.0;
                double add_term_img = 0.0;
                long ifine = visindex;
                for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
		{	
		   long iweight = visindex/freq_per_chan;
                   for (long ipol=0; ipol<polarizations; ipol++)
                   {
                      double vistest = (double)vis_real[ifine];
                      if (!isnan(vistest))
                      {
                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
                         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
		      }
                      ifine++;
		      iweight++;
                   }
		}
		atomicAdd(&(grid[iKer]),add_term_real);
		atomicAdd(&(grid[iKer+1]),add_term_img);
            }
        }
	}
}
#endif

void wstack(
     int num_w_planes,
     long num_points,
     long freq_per_chan,
     long polarizations,
     double* uu,
     double* vv,
     double* ww,
     float* vis_real,
     float* vis_img,
     float* weight,
     double dx,
     double dw,
     int w_support,
     int grid_size_x,
     int grid_size_y,
     double* grid,
     int num_threads)
{
    long i;
    long index;
    long visindex;

    // initialize the convolution kernel
    // gaussian:
    int KernelLen = (w_support-1)/2;
    double std = 1.0;
    double std22 = 1.0/(2.0*std*std);
    double norm = std22/PI;

    // Loop over visibilities.
// Switch between CUDA and GPU versions
#ifdef __CUDACC__
    // Define the CUDA set up
    int Nth = NTHREADS;
    int Nbl = num_points/Nth + 1;
    if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
    long Nvis = num_points*freq_per_chan*polarizations;
    printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);

    // Create GPU arrays and offload them
    double * uu_g;
    double * vv_g;
    double * ww_g;
    float * vis_real_g;
    float * vis_img_g;
    float * weight_g;
    double * grid_g;

    //for (int i=0; i<100000; i++)grid[i]=23.0;
    cudaError_t mmm;
    mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
    mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
    mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
    mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
    mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
    mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
    mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
    mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));

    mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
    mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
    mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
    mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
    mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
    mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);

    // Call main GPU Kernel
    convolve_g <<<Nbl,Nth>>> (
	       num_w_planes,
               num_points,
               freq_per_chan,
               polarizations,
               uu_g,
               vv_g,
               ww_g,
               vis_real_g,
               vis_img_g,
               weight_g,
               dx,
               dw,
               KernelLen,
               grid_size_x,
               grid_size_y,
               grid_g,
	       std22);

    mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
    printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
    mmm=cudaFree(uu_g);
    mmm=cudaFree(vv_g);
    mmm=cudaFree(ww_g);
    mmm=cudaFree(vis_real_g);
    mmm=cudaFree(vis_img_g);
    mmm=cudaFree(weight_g);
    mmm=cudaFree(grid_g);

// Switch between CUDA and GPU versions
# else

#ifdef _OPENMP
    omp_set_num_threads(num_threads);
#endif
    #pragma omp parallel for private(visindex) 
    for (i = 0; i < num_points; i++)
    {
#ifdef _OPENMP
	//int tid;
	//tid = omp_get_thread_num();
	//printf("%d\n",tid);
#endif

        visindex = i*freq_per_chan*polarizations;

        double sum = 0.0;
        int j, k;
	//if (i%1000 == 0)printf("%ld\n",i);

        /* Convert UV coordinates to grid coordinates. */
        double pos_u = uu[i] / dx;
        double pos_v = vv[i] / dx;
        double ww_i  = ww[i] / dw;
        int grid_w = (int)ww_i;
        int grid_u = (int)pos_u;
        int grid_v = (int)pos_v;

	// check the boundaries
	long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
	long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
	long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
	long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
        //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);


        // Convolve this point onto the grid.
        for (k = kmin; k <= kmax; k++)
        {

            double v_dist = (double)k+0.5 - pos_v;

            for (j = jmin; j <= jmax; j++)
            {
                double u_dist = (double)j+0.5 - pos_u;
		long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);

		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
		// Loops over frequencies and polarizations
		double add_term_real = 0.0;
		double add_term_img = 0.0;
		long ifine = visindex;
		// DAV: the following two loops are performend by each thread separately: no problems of race conditions
		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
		{	
		   long iweight = visindex/freq_per_chan;
	           for (long ipol=0; ipol<polarizations; ipol++)
	           {
                      if (!isnan(vis_real[ifine]))
                      {
		         //printf("%f %ld\n",weight[iweight],iweight);
                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
		         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
			 //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
		      }
		      ifine++;
		      iweight++;
		   }
	        }
		// DAV: this is the critical call in terms of correctness of the results and of performance
		#pragma omp atomic
		grid[iKer] += add_term_real;
		#pragma omp atomic
		grid[iKer+1] += add_term_img;
            }
        }
	
    }
// End switch between CUDA and CPU versions
#endif
    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}

int test(int nnn)
{
	int mmm;
	
	mmm = nnn+1;
	return mmm;
}	
Loading