first commit (9bf1af25) · Commits · Claudio Gheller / HPC_Imaging

Makefile

0 → 100644

+47 −0

Original line number	Diff line number	Diff line
		OPT += -DUSE_MPI
		OPT += -DUSE_FFTW
		OPT += -DONE_SIDE
		OPT += -DWRITE_DATA

		CC = gcc
		CXX = g++
		MPICC = mpicc
		MPICXX =mpiCC

		CFLAGS += -O3 -mcpu=native
		CFLAGS += -I.
		LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm

		NVCC = nvcc
		NVFLAGS = -arch=sm_70 -c w-stacking.cu -Xcompiler -mno-float128 -std=c++11
		NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda

		DEPS = w-stacking.h
		COBJ = w-stacking.o w-stacking-fftw.o

		w-stacking.c:
		cp w-stacking.cu w-stacking.c

		%.o: %.c $(DEPS)
		$(CC) -c -o $@ $< $(CFLAGS) $(OPT)

		serial: $(COBJ)
		$(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm

		serial_cuda:
		$(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB)
		$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
		$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o $(NVLIB) -lm

		mpi: $(COBJ)
		$(MPICC) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)

		mpi_cuda:
		$(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB)
		$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
		$(MPICXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o $(NVLIB) $(LIBS) -lm

		clean:
		rm *.o
		rm w-stacking.c

peano.c

0 → 100644

+130 −0

Original line number	Diff line number	Diff line
		#include "peano.h"

		//typedef unsigned long long peanokey;

		/* The following rewrite of the original function
		* peano_hilbert_key_old() has been written by MARTIN REINECKE.
		* It is about a factor 2.3 - 2.5 faster than Volker's old routine!
		*/
		const unsigned char rottable3[48][8] = {
		{36, 28, 25, 27, 10, 10, 25, 27},
		{29, 11, 24, 24, 37, 11, 26, 26},
		{8, 8, 25, 27, 30, 38, 25, 27},
		{9, 39, 24, 24, 9, 31, 26, 26},
		{40, 24, 44, 32, 40, 6, 44, 6},
		{25, 7, 33, 7, 41, 41, 45, 45},
		{4, 42, 4, 46, 26, 42, 34, 46},
		{43, 43, 47, 47, 5, 27, 5, 35},
		{33, 35, 36, 28, 33, 35, 2, 2},
		{32, 32, 29, 3, 34, 34, 37, 3},
		{33, 35, 0, 0, 33, 35, 30, 38},
		{32, 32, 1, 39, 34, 34, 1, 31},
		{24, 42, 32, 46, 14, 42, 14, 46},
		{43, 43, 47, 47, 25, 15, 33, 15},
		{40, 12, 44, 12, 40, 26, 44, 34},
		{13, 27, 13, 35, 41, 41, 45, 45},
		{28, 41, 28, 22, 38, 43, 38, 22},
		{42, 40, 23, 23, 29, 39, 29, 39},
		{41, 36, 20, 36, 43, 30, 20, 30},
		{37, 31, 37, 31, 42, 40, 21, 21},
		{28, 18, 28, 45, 38, 18, 38, 47},
		{19, 19, 46, 44, 29, 39, 29, 39},
		{16, 36, 45, 36, 16, 30, 47, 30},
		{37, 31, 37, 31, 17, 17, 46, 44},
		{12, 4, 1, 3, 34, 34, 1, 3},
		{5, 35, 0, 0, 13, 35, 2, 2},
		{32, 32, 1, 3, 6, 14, 1, 3},
		{33, 15, 0, 0, 33, 7, 2, 2},
		{16, 0, 20, 8, 16, 30, 20, 30},
		{1, 31, 9, 31, 17, 17, 21, 21},
		{28, 18, 28, 22, 2, 18, 10, 22},
		{19, 19, 23, 23, 29, 3, 29, 11},
		{9, 11, 12, 4, 9, 11, 26, 26},
		{8, 8, 5, 27, 10, 10, 13, 27},
		{9, 11, 24, 24, 9, 11, 6, 14},
		{8, 8, 25, 15, 10, 10, 25, 7},
		{0, 18, 8, 22, 38, 18, 38, 22},
		{19, 19, 23, 23, 1, 39, 9, 39},
		{16, 36, 20, 36, 16, 2, 20, 10},
		{37, 3, 37, 11, 17, 17, 21, 21},
		{4, 17, 4, 46, 14, 19, 14, 46},
		{18, 16, 47, 47, 5, 15, 5, 15},
		{17, 12, 44, 12, 19, 6, 44, 6},
		{13, 7, 13, 7, 18, 16, 45, 45},
		{4, 42, 4, 21, 14, 42, 14, 23},
		{43, 43, 22, 20, 5, 15, 5, 15},
		{40, 12, 21, 12, 40, 6, 23, 6},
		{13, 7, 13, 7, 41, 41, 22, 20}
		};

		const unsigned char subpix3[48][8] = {
		{0, 7, 1, 6, 3, 4, 2, 5},
		{7, 4, 6, 5, 0, 3, 1, 2},
		{4, 3, 5, 2, 7, 0, 6, 1},
		{3, 0, 2, 1, 4, 7, 5, 6},
		{1, 0, 6, 7, 2, 3, 5, 4},
		{0, 3, 7, 4, 1, 2, 6, 5},
		{3, 2, 4, 5, 0, 1, 7, 6},
		{2, 1, 5, 6, 3, 0, 4, 7},
		{6, 1, 7, 0, 5, 2, 4, 3},
		{1, 2, 0, 3, 6, 5, 7, 4},
		{2, 5, 3, 4, 1, 6, 0, 7},
		{5, 6, 4, 7, 2, 1, 3, 0},
		{7, 6, 0, 1, 4, 5, 3, 2},
		{6, 5, 1, 2, 7, 4, 0, 3},
		{5, 4, 2, 3, 6, 7, 1, 0},
		{4, 7, 3, 0, 5, 6, 2, 1},
		{6, 7, 5, 4, 1, 0, 2, 3},
		{7, 0, 4, 3, 6, 1, 5, 2},
		{0, 1, 3, 2, 7, 6, 4, 5},
		{1, 6, 2, 5, 0, 7, 3, 4},
		{2, 3, 1, 0, 5, 4, 6, 7},
		{3, 4, 0, 7, 2, 5, 1, 6},
		{4, 5, 7, 6, 3, 2, 0, 1},
		{5, 2, 6, 1, 4, 3, 7, 0},
		{7, 0, 6, 1, 4, 3, 5, 2},
		{0, 3, 1, 2, 7, 4, 6, 5},
		{3, 4, 2, 5, 0, 7, 1, 6},
		{4, 7, 5, 6, 3, 0, 2, 1},
		{6, 7, 1, 0, 5, 4, 2, 3},
		{7, 4, 0, 3, 6, 5, 1, 2},
		{4, 5, 3, 2, 7, 6, 0, 1},
		{5, 6, 2, 1, 4, 7, 3, 0},
		{1, 6, 0, 7, 2, 5, 3, 4},
		{6, 5, 7, 4, 1, 2, 0, 3},
		{5, 2, 4, 3, 6, 1, 7, 0},
		{2, 1, 3, 0, 5, 6, 4, 7},
		{0, 1, 7, 6, 3, 2, 4, 5},
		{1, 2, 6, 5, 0, 3, 7, 4},
		{2, 3, 5, 4, 1, 0, 6, 7},
		{3, 0, 4, 7, 2, 1, 5, 6},
		{1, 0, 2, 3, 6, 7, 5, 4},
		{0, 7, 3, 4, 1, 6, 2, 5},
		{7, 6, 4, 5, 0, 1, 3, 2},
		{6, 1, 5, 2, 7, 0, 4, 3},
		{5, 4, 6, 7, 2, 3, 1, 0},
		{4, 3, 7, 0, 5, 2, 6, 1},
		{3, 2, 0, 1, 4, 5, 7, 6},
		{2, 5, 1, 6, 3, 4, 0, 7}
		};

		/*! This function computes a Peano-Hilbert key for an integer triplet (x,y,z),
		* with x,y,z in the range between 0 and 2^bits-1.
		*/
		peanokey peano_hilbert_key(int x, int y, int z, int bits)
		{
		int mask;
		unsigned char rotation = 0;
		peanokey key = 0;

		for(mask = 1 << (bits - 1); mask > 0; mask >>= 1)
		{
		unsigned char pix = ((x & mask) ? 4 : 0) \| ((y & mask) ? 2 : 0) \| ((z & mask) ? 1 : 0);

		key <<= 3;
		key \|= subpix3[rotation][pix];
		rotation = rottable3[rotation][pix];
		}

		return key;
		}

peano.h

0 → 100644

+3 −0

Original line number	Diff line number	Diff line
		typedef unsigned long long peanokey;

		peanokey peano_hilbert_key(int, int, int, int);

w-stacking-fftw.c

0 → 100644

+860 −0

File added.

Preview size limit exceeded, changes collapsed.

w-stacking.cu

0 → 100644

+306 −0

Original line number	Diff line number	Diff line
		#ifdef _OPENMP
		#include <omp.h>
		#endif
		#include "w-stacking.h"
		#include <math.h>
		#include <stdlib.h>
		#include <stdio.h>
		#define NWORKERS -1 //100
		#define NTHREADS 32

		#ifdef __CUDACC__
		double __device__
		#else
		double
		#endif
		gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
		{
		double conv_weight;
		conv_weight = norm * exp(-((u_distu_dist)+(v_distv_dist))*std22);
		return conv_weight;
		}

		#ifdef __CUDACC__
		//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
		//{
		// double conv_weight;
		// conv_weight = norm * exp(-((u_distu_dist)+(v_distv_dist))*std22);
		// return conv_weight;
		//}

		__global__ void convolve_g(
		int num_w_planes,
		long num_points,
		long freq_per_chan,
		long polarizations,
		double* uu,
		double* vv,
		double* ww,
		float* vis_real,
		float* vis_img,
		float* weight,
		double dx,
		double dw,
		int KernelLen,
		int grid_size_x,
		int grid_size_y,
		double* grid,
		double std22)

		{
		//printf("DENTRO AL KERNEL\n");
		long gid = blockIdx.x*blockDim.x + threadIdx.x;
		if(gid < num_points)
		{
		long i = gid;
		long visindex = ifreq_per_chanpolarizations;
		double norm = std22/PI;

		int j, k;

		/* Convert UV coordinates to grid coordinates. */
		double pos_u = uu[i] / dx;
		double pos_v = vv[i] / dx;
		double ww_i = ww[i] / dw;
		int grid_w = (int)ww_i;
		int grid_u = (int)pos_u;
		int grid_v = (int)pos_v;

		// check the boundaries
		long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
		long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
		long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
		long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
		//printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);


		// Convolve this point onto the grid.
		for (k = kmin; k <= kmax; k++)
		{

		double v_dist = (double)k+0.5 - pos_v;

		for (j = jmin; j <= jmax; j++)
		{
		double u_dist = (double)j+0.5 - pos_u;
		long iKer = 2 * (j + kgrid_size_x + grid_wgrid_size_x*grid_size_y);
		//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);

		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
		// Loops over frequencies and polarizations
		double add_term_real = 0.0;
		double add_term_img = 0.0;
		long ifine = visindex;
		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
		{
		long iweight = visindex/freq_per_chan;
		for (long ipol=0; ipol<polarizations; ipol++)
		{
		double vistest = (double)vis_real[ifine];
		if (!isnan(vistest))
		{
		add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
		add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
		}
		ifine++;
		iweight++;
		}
		}
		atomicAdd(&(grid[iKer]),add_term_real);
		atomicAdd(&(grid[iKer+1]),add_term_img);
		}
		}
		}
		}
		#endif

		void wstack(
		int num_w_planes,
		long num_points,
		long freq_per_chan,
		long polarizations,
		double* uu,
		double* vv,
		double* ww,
		float* vis_real,
		float* vis_img,
		float* weight,
		double dx,
		double dw,
		int w_support,
		int grid_size_x,
		int grid_size_y,
		double* grid,
		int num_threads)
		{
		long i;
		long index;
		long visindex;

		// initialize the convolution kernel
		// gaussian:
		int KernelLen = (w_support-1)/2;
		double std = 1.0;
		double std22 = 1.0/(2.0stdstd);
		double norm = std22/PI;

		// Loop over visibilities.
		// Switch between CUDA and GPU versions
		#ifdef __CUDACC__
		// Define the CUDA set up
		int Nth = NTHREADS;
		int Nbl = num_points/Nth + 1;
		if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
		long Nvis = num_pointsfreq_per_chanpolarizations;
		printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);

		// Create GPU arrays and offload them
		double * uu_g;
		double * vv_g;
		double * ww_g;
		float * vis_real_g;
		float * vis_img_g;
		float * weight_g;
		double * grid_g;

		//for (int i=0; i<100000; i++)grid[i]=23.0;
		cudaError_t mmm;
		mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
		mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
		mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
		mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
		mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
		mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
		mmm=cudaMalloc(&grid_g,2num_w_planesgrid_size_xgrid_size_ysizeof(double));
		mmm=cudaMemset(grid_g,0.0,2num_w_planesgrid_size_xgrid_size_ysizeof(double));

		mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
		mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
		mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
		mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
		mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
		mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);

		// Call main GPU Kernel
		convolve_g <<<Nbl,Nth>>> (
		num_w_planes,
		num_points,
		freq_per_chan,
		polarizations,
		uu_g,
		vv_g,
		ww_g,
		vis_real_g,
		vis_img_g,
		weight_g,
		dx,
		dw,
		KernelLen,
		grid_size_x,
		grid_size_y,
		grid_g,
		std22);

		mmm = cudaMemcpy(grid, grid_g, 2num_w_planesgrid_size_xgrid_size_ysizeof(double), cudaMemcpyDeviceToHost);
		//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
		printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
		mmm=cudaFree(uu_g);
		mmm=cudaFree(vv_g);
		mmm=cudaFree(ww_g);
		mmm=cudaFree(vis_real_g);
		mmm=cudaFree(vis_img_g);
		mmm=cudaFree(weight_g);
		mmm=cudaFree(grid_g);

		// Switch between CUDA and GPU versions
		# else

		#ifdef _OPENMP
		omp_set_num_threads(num_threads);
		#endif
		#pragma omp parallel for private(visindex)
		for (i = 0; i < num_points; i++)
		{
		#ifdef _OPENMP
		//int tid;
		//tid = omp_get_thread_num();
		//printf("%d\n",tid);
		#endif

		visindex = ifreq_per_chanpolarizations;

		double sum = 0.0;
		int j, k;
		//if (i%1000 == 0)printf("%ld\n",i);

		/* Convert UV coordinates to grid coordinates. */
		double pos_u = uu[i] / dx;
		double pos_v = vv[i] / dx;
		double ww_i = ww[i] / dw;
		int grid_w = (int)ww_i;
		int grid_u = (int)pos_u;
		int grid_v = (int)pos_v;

		// check the boundaries
		long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
		long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
		long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
		long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
		//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);


		// Convolve this point onto the grid.
		for (k = kmin; k <= kmax; k++)
		{

		double v_dist = (double)k+0.5 - pos_v;

		for (j = jmin; j <= jmax; j++)
		{
		double u_dist = (double)j+0.5 - pos_u;
		long iKer = 2 * (j + kgrid_size_x + grid_wgrid_size_x*grid_size_y);

		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
		// Loops over frequencies and polarizations
		double add_term_real = 0.0;
		double add_term_img = 0.0;
		long ifine = visindex;
		// DAV: the following two loops are performend by each thread separately: no problems of race conditions
		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
		{
		long iweight = visindex/freq_per_chan;
		for (long ipol=0; ipol<polarizations; ipol++)
		{
		if (!isnan(vis_real[ifine]))
		{
		//printf("%f %ld\n",weight[iweight],iweight);
		add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
		add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
		//if(vis_img[ifine]>1e10 \|\| vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_pointsfreq_per_chanpolarizations);
		}
		ifine++;
		iweight++;
		}
		}
		// DAV: this is the critical call in terms of correctness of the results and of performance
		#pragma omp atomic
		grid[iKer] += add_term_real;
		#pragma omp atomic
		grid[iKer+1] += add_term_img;
		}
		}

		}
		// End switch between CUDA and CPU versions
		#endif
		//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
		}

		int test(int nnn)
		{
		int mmm;

		mmm = nnn+1;
		return mmm;
		}