first CUDA implementation of phase_correction. Still buggy (9d47e94e) · Commits · Claudio Gheller / HPC_Imaging

Makefile

+3 −3

Original line number	Diff line number	Diff line
		@@ -10,7 +10,7 @@ OPT += -DONE_SIDE
		# write the final image
		OPT += -DWRITE_IMAGE
		# perform w-stacking phase correction
		OPT += -DPHASE_ON
		#OPT += -DPHASE_ON

		CC = gcc
		CXX = g++
		@@ -46,7 +46,7 @@ serial: $(COBJ)
		$(CC) $(OMP) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm

		serial_cuda:
		$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
		$(NVCC) $(OPT) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
		$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
		$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm

		@@ -54,7 +54,7 @@ mpi: $(COBJ)
		$(CC) $(OMP) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)

		mpi_cuda:
		$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
		$(NVCC) $(NVFLAGS) $(OPT) -c w-stacking.cu phase_correction.cu $(NVLIB)
		$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
		$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(LIBS) -lm

phase_correction.cu

+113 −6

Original line number	Diff line number	Diff line
		@@ -7,19 +7,126 @@
		#include <stdio.h>

		#ifdef __CUDACC__

		__global__ void phase_g(int xaxis,
		int yaxis,
		int num_w_planes,
		double * gridss,
		double * image_real,
		double * image_imag,
		double wmin,
		double dw,
		double dwnorm,
		int xaxistot,
		int yaxistot,
		double resolution)
		{
		long gid = blockIdx.x*blockDim.x + threadIdx.x;
		double add_term_real;
		double add_term_img;
		double wterm;
		long arraysize = xaxisyaxisnum_w_planes;

		if(gid < arraysize)
		{
		int iw = (int)(gid/(xaxis*yaxis));
		int iv = (int)((gid%(xaxis*yaxis))/xaxis);
		int iu = (iv%yaxis);
		long index = 2*gid;
		long img_index = iu+iv*xaxis;

		wterm = wmin + iw*dw;

		#ifdef PHASE_ON
		if (num_w_planes > 1)
		{
		double xcoord = (double)(iu-xaxistot/2);
		if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
		xcoord = sin(xcoord*resolution);
		double ycoord = (double)(iv-yaxistot/2);
		if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
		ycoord = sin(ycoord*resolution);

		double preal, pimag;
		double radius2 = (xcoordxcoord+ycoordycoord);

		preal = cos(2.0PIwterm*(sqrt(1-radius2)-1.0));
		pimag = sin(2.0PIwterm*(sqrt(1-radius2)-1.0));

		double p,q,r,s;
		p = gridss[index];
		q = gridss[index+1];
		r = preal;
		s = pimag;

		//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);

		add_term_real = (pr-qs)dwnormsqrt(1-radius2);
		add_term_img = (ps+qr)dwnormsqrt(1-radius2);
		atomicAdd(&(image_real[img_index]),add_term_real);
		atomicAdd(&(image_imag[img_index]),add_term_img);
		} else {
		atomicAdd(&(image_real[img_index]),gridss[index]);
		atomicAdd(&(image_imag[img_index]),gridss[index+1]);
		}
		#else
		atomicAdd(&(image_real[img_index]),gridss[index]);
		atomicAdd(&(image_imag[img_index]),gridss[index+1]);
		#endif // end of PHASE_ON

		}

		}

		#endif

		void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot,
		double resolution, double wmin, double wmax)
		{
		double dnum_w_planes = (double)num_w_planes;
		double dxaxistot = (double)xaxistot;
		double dyaxistot = (double)yaxistot;
		double diagonal;
		double dw = (wmax-wmin)/(double)num_w_planes;
		double wterm = wmin+0.5*dw;
		double dwnorm = dw/(wmax-wmin);

		#ifdef __CUDACC__

		int Nth = NTHREADS;
		long Nbl = (long)((num_w_planesxaxisyaxis)/Nth) + 1;
		if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
		printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);


		cudaError_t mmm;
		double * image_real_g;
		double * image_imag_g;
		double * gridss_g;

		mmm=cudaMalloc(&gridss_g, 2num_w_planesxaxisyaxissizeof(double));
		mmm=cudaMalloc(&image_real_g, xaxisyaxissizeof(double));
		mmm=cudaMalloc(&image_imag_g, xaxisyaxissizeof(double));

		mmm=cudaMemcpy(gridss_g, gridss, 2num_w_planesxaxisyaxissizeof(double), cudaMemcpyHostToDevice);
		mmm=cudaMemset(image_real_g, 0.0, xaxisyaxissizeof(double));
		mmm=cudaMemset(image_imag_g, 0.0, xaxisyaxissizeof(double));

		// call the phase correction kernel
		phase_g <<<Nbl,Nth>>> (xaxis,
		yaxis,
		num_w_planes,
		gridss_g,
		image_real_g,
		image_imag_g,
		wmin,
		dw,
		dwnorm,
		xaxistot,
		yaxistot,
		resolution);

		mmm = cudaMemcpy(image_real, image_real_g, xaxisyaxissizeof(double), cudaMemcpyDeviceToHost);
		mmm = cudaMemcpy(image_imag, image_imag_g, xaxisyaxissizeof(double), cudaMemcpyDeviceToHost);

		#else

		for (int iw=0; iw<num_w_planes; iw++)
		{
		for (int iv=0; iv<yaxis; iv++)
		@@ -38,7 +145,6 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
		if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
		ycoord = sin(ycoord*resolution);

		double efact;
		double preal, pimag;
		double radius2 = (xcoordxcoord+ycoordycoord);

		@@ -61,12 +167,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
		#else
		image_real[img_index] += gridss[index];
		image_imag[img_index] += gridss[index+1];
		#endif
		#endif // end of PHASE_ON

		}
		wterm += dw;
		}

		#endif // end of __CUDACC__


		}

w-stacking-fftw.c

+25 −25

Original line number	Diff line number	Diff line
		@@ -78,24 +78,24 @@ int main(int argc, char * argv[])
		float * visreal;
		float * visimg;

		long Nmeasures;
		long Nvis;
		long Nweights;
		long freq_per_chan;
		long polarisations;
		long Ntimes;
		double dt;
		double thours;
		long baselines;
		double uvmin;
		double uvmax;
		double wmin;
		double wmax;
		long Nmeasures,Nmeasures0;
		long Nvis,Nvis0;
		long Nweights,Nweights0;
		long freq_per_chan,freq_per_chan0;
		long polarisations,polarisations0;
		long Ntimes,Ntimes0;
		double dt,dt0;
		double thours,thours0;
		long baselines,baselines0;
		double uvmin,uvmin0;
		double uvmax,uvmax0;
		double wmin,wmin0;
		double wmax,wmax0;
		double resolution;

		// MESH SIZE
		int grid_size_x = 2048;
		int grid_size_y = 2048;
		int grid_size_x = 256;
		int grid_size_y = 256;
		int local_grid_size_x;// = 8;
		int local_grid_size_y;// = 8;
		int xaxis;
		@@ -372,18 +372,18 @@ int main(int argc, char * argv[])
		strcpy(filename,datapath);
		strcat(filename,metafile);
		pFile = fopen (filename,"r");
		fscanf(pFile,"%ld",&Nmeasures);
		fscanf(pFile,"%ld",&Nvis);
		fscanf(pFile,"%ld",&freq_per_chan);
		fscanf(pFile,"%ld",&polarisations);
		fscanf(pFile,"%ld",&Ntimes);
		fscanf(pFile,"%lf",&dt);
		fscanf(pFile,"%lf",&thours);
		fscanf(pFile,"%ld",&baselines);
		fscanf(pFile,"%ld",&Nmeasures0);
		fscanf(pFile,"%ld",&Nvis0);
		fscanf(pFile,"%ld",&freq_per_chan0);
		fscanf(pFile,"%ld",&polarisations0);
		fscanf(pFile,"%ld",&Ntimes0);
		fscanf(pFile,"%lf",&dt0);
		fscanf(pFile,"%lf",&thours0);
		fscanf(pFile,"%ld",&baselines0);
		fscanf(pFile,"%lf",&uvmin);
		fscanf(pFile,"%lf",&uvmax);
		fscanf(pFile,"%lf",&wmin);
		fscanf(pFile,"%lf",&wmax);
		fscanf(pFile,"%lf",&wmin0);
		fscanf(pFile,"%lf",&wmax0);
		fclose(pFile);

		// calculate the resolution in radians

w-stacking.cu

+1 −1

Original line number	Diff line number	Diff line
		@@ -147,7 +147,7 @@ void wstack(
		#ifdef __CUDACC__
		// Define the CUDA set up
		int Nth = NTHREADS;
		int Nbl = num_points/Nth + 1;
		long Nbl = (long)(num_points/Nth) + 1;
		if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
		long Nvis = num_pointsfreq_per_chanpolarizations;
		printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);