GPU working OMP (46034dba) · Commits · Claudio Gheller / HPC_Imaging

Makefile

+29 −13

Original line number	Diff line number	Diff line
		@@ -24,6 +24,9 @@ FFTW_MPI_INC =
		FFTW_MPI_LIB =

		CFLAGS += -I./

		FLAGS=$(OPTIMIZE)

		FFTWLIBS =

		# ========================================================
		@@ -52,7 +55,7 @@ OPT += -DPHASE_ON
		#OPT += -DNVIDIA

		#use cuda for GPUs
		#OPT += -DCUDACC
		OPT += -DCUDACC

		# use GPU acceleration via OMP
		#OPT += -DACCOMP
		@@ -73,12 +76,23 @@ OPT += -DPHASE_ON


		DEPS = w-stacking.h main.c allvars.h

		ifneq (ACCOMP,$(findstring ACCOMP, $(OPT)))
		OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
		else
		OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
		endif

		ifneq (CUDACC,$(findstring CUDACC, $(OPT)))
		OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
		else
		OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
		endif

		DEPS_ACC_CUDA = w-stacking.h w-stacking.cu
		DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu
		OBJ_ACC_CUDA = phase_correction.o w-stacking.o

		DEPS_ACC_OMP = w-stacking_omp.h
		DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c
		OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o

		OBJ_NCCL_REDUCE = gridding_nccl.o
		@@ -114,11 +128,12 @@ endif

		ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_acc-cuda
		LINKER=$(NVCC)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		LINKER=$(MPIC++)
		FLAGS=$(OPTIMIZE)
		LIBS=$(NVLIB)
		compile_cuda: $(OBJ_ACC_CUDA)
		$(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB)
		$(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
		$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS)
		OBJ += $(OBJ_ACC_CUDA)
		endif

		ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))
		@@ -126,8 +141,9 @@ EXEC_EXT := $(EXEC_EXT)_acc-omp
		LINKER=$(NVC)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		LIBS=$(NVLIB)
		compile_accomp: $(OBJ_ACC_OMP)
		$(NVC) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB)
		$(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
		$(NVC) $(FLAGS) $(OPT) -c $^ $(LIBS)
		OBJ += $(OBJ_ACC_OMP)
		endif

		ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT)))
		@@ -136,7 +152,7 @@ LINKER=$(NVC++)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		LIBS=$(NVLIB) $(NVLIB_3)
		compile_accreduce: $(OBJ_NCCL_REDUCE)
		$(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3)
		$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
		endif

		ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
		@@ -144,7 +160,7 @@ EXEC_EXT := $(EXEC_EXT)_acc-reduce
		LINKER=$(NVC++)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		LIBS=$(NVLIB) $(NVLIB_3)
		compile_accreduce: $(OBJ_RCCL_REDCUE)
		compile_accreduce: $(OBJ_RCCL_REDUCE)
		$(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3)
		endif

		@@ -152,9 +168,9 @@ endif
		###################################################################################

		w-stacking: $(OBJ) $(DEPS) Makefile
		$(LINKER) $(FLAGS) $(OPTIMIZE) $(OPT) $(OBJ) -lmpi -o $(EXEC)$(EXEC_EXT) -lmpi $(FFTWLIBS) $(LIBS)
		$(LINKER) $(FLAGS) $(OPT) $(OBJ) -o $(EXEC)$(EXEC_EXT) -lmpi $(FFTWLIBS) $(LIBS)

		$(OBJ): $(DEPS) Makefile
		#$(OBJ): $(DEPS) Makefile

		%.o: %.c $(DEPS)
		$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)

gridding.c

+0 −2

Original line number	Diff line number	Diff line
		@@ -209,9 +209,7 @@ void write_gridded_data()
		fclose(file.pFileimg);
		}

		#ifdef USE_MPI
		MPI_Win_fence(0,slabwin);
		#endif

		#endif //WRITE_DATA

gridding_cpu.c

+5 −3

Original line number	Diff line number	Diff line

		#include "allvars.h"
		#include "proto.h"

		@@ -225,7 +224,7 @@ void gridding_data()
		{

		int ret = reduce_ring(target_rank);
		grid = (double*)Me.fwin.ptr; //Let grid point to the right memory location [GL]
		//grid = (double*)Me.fwin.ptr; //Let grid point to the right memory location [GL]

		if ( ret != 0 )
		{
		@@ -241,6 +240,9 @@ void gridding_data()
		timing_wt.reduce += CPU_TIME_wt - start;
		}

		else
		grid += gridss;

		// Go to next sector
		memset ( gridss, 0, 2param.num_w_planesxaxisyaxis sizeof(double) );

w-stacking.cu

+86 −92

Original line number	Diff line number	Diff line
		@@ -34,9 +34,9 @@ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)

		__global__ void convolve_g(
		int num_w_planes,
		long num_points,
		long freq_per_chan,
		long polarizations,
		uint num_points,
		uint freq_per_chan,
		uint polarizations,
		double* uu,
		double* vv,
		double* ww,
		@@ -53,11 +53,11 @@ __global__ void convolve_g(

		{
		//printf("DENTRO AL KERNEL\n");
		long gid = blockIdx.x*blockDim.x + threadIdx.x;
		uint gid = blockIdx.x*blockDim.x + threadIdx.x;
		if(gid < num_points)
		{
		long i = gid;
		long visindex = ifreq_per_chanpolarizations;
		uint i = gid;
		uint visindex = ifreq_per_chanpolarizations;
		double norm = std22/PI;

		int j, k;
		@@ -79,10 +79,10 @@ __global__ void convolve_g(

		//printf("[A] %g, %g, %g, %g, %g, %d, %d, %d\n", uu[i], vv[i], ww[i], dx, dw, grid_w, grid_u, grid_v);
		// check the boundaries
		long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
		long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
		long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
		long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
		uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
		uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
		uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
		uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
		//printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);

		/*
		@@ -98,36 +98,29 @@ __global__ void convolve_g(
		for (j = jmin; j <= jmax; j++)
		{
		double u_dist = (double)j+0.5 - pos_u;
		long iKer = 2 * (j + kgrid_size_x + grid_wgrid_size_x*grid_size_y);
		uint iKer = 2 * (j + kgrid_size_x + grid_wgrid_size_x*grid_size_y);
		//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);


		if ( iKer >= 2num_w_planesgrid_size_x*grid_size_y) {
		printf("=== ALERT [B] iKer out of bound: %ld, %d, %d, %d, %d, %d\n", iKer, j, k, grid_w, grid_size_x, grid_size_y);
		printf("grid_w = %d, i=%ld, ww_i = %g, dw = %g\n", grid_w, i, ww_i, dw);
		printf("kmin,kamx: %ld, %ld : %d, %d %d : %g %g\n", kmin, kmax, grid_v, KernelLen, grid_size_y, vv[i], dx );
		printf("jmin,jamx: %ld, %ld : %d, %d %d : %g %g\n", jmin, jmax, grid_u, KernelLen, grid_size_x, uu[i], dx );
		}

		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
		// Loops over frequencies and polarizations
		double add_term_real = 0.0;
		double add_term_img = 0.0;
		long ifine = visindex;
		uint ifine = visindex;
		/*
		if ( ifine >= num_pointsfreq_per_chanpolarizations )
		printf("=== ALERT [C] ifine out of bound: %ld\n", ifine);
		*/

		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
		for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
		{
		long iweight = visindex/freq_per_chan;
		uint iweight = visindex/freq_per_chan;

		/*
		if ( iweight >= num_points*polarizations )
		printf("=== ALERT [D] ifine out of bound: %ld\n", iweight);
		*/
		for (long ipol=0; ipol<polarizations; ipol++)
		for (uint ipol=0; ipol<polarizations; ipol++)
		{
		double vistest = (double)vis_real[ifine];
		if (!isnan(vistest))
		@@ -151,9 +144,9 @@ __global__ void convolve_g(
		#endif
		void wstack(
		int num_w_planes,
		long num_points,
		long freq_per_chan,
		long polarizations,
		uint num_points,
		uint freq_per_chan,
		uint polarizations,
		double* uu,
		double* vv,
		double* ww,
		@@ -169,9 +162,9 @@ void wstack(
		int num_threads,
		int rank)
		{
		long i;
		long index;
		long visindex;
		uint i;
		uint index;
		uint visindex;

		// initialize the convolution kernel
		// gaussian:
		@@ -180,6 +173,7 @@ void wstack(
		double std22 = 1.0/(2.0stdstd);
		double norm = std22/PI;

		printf("Task %d, here!", rank);
		// Loop over visibilities.
		// Switch between CUDA and GPU versions
		#ifdef __CUDACC__
		@@ -191,9 +185,9 @@ void wstack(

		cudaGetDevice( &device_cuda );
		int Nth = NTHREADS;
		long Nbl = (long)(num_points/Nth) + 1;
		uint Nbl = (uint)(num_points/Nth) + 1;
		if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
		long Nvis = num_pointsfreq_per_chanpolarizations;
		uint Nvis = num_pointsfreq_per_chanpolarizations;
		printf("Running on GPU %d with %d threads and %d blocks\n",device_cuda,Nth,Nbl);

		// Create GPU arrays and offload them
		@@ -268,7 +262,7 @@ void wstack(
		#endif

		#ifdef ACCOMP
		long Nvis = num_pointsfreq_per_chanpolarizations;
		uint Nvis = num_pointsfreq_per_chanpolarizations;
		// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
		// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2num_w_planesgrid_size_x*grid_size_y])
		#else
		@@ -297,10 +291,10 @@ void wstack(
		int grid_v = (int)pos_v;

		// check the boundaries
		long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
		long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
		long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
		long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
		uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
		uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
		uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
		uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
		//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);


		@@ -313,19 +307,19 @@ void wstack(
		for (j = jmin; j <= jmax; j++)
		{
		double u_dist = (double)j+0.5 - pos_u;
		long iKer = 2 * (j + kgrid_size_x + grid_wgrid_size_x*grid_size_y);
		uint iKer = 2 * (j + kgrid_size_x + grid_wgrid_size_x*grid_size_y);


		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
		// Loops over frequencies and polarizations
		double add_term_real = 0.0;
		double add_term_img = 0.0;
		long ifine = visindex;
		uint ifine = visindex;
		// DAV: the following two loops are performend by each thread separately: no problems of race conditions
		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
		for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
		{
		long iweight = visindex/freq_per_chan;
		for (long ipol=0; ipol<polarizations; ipol++)
		uint iweight = visindex/freq_per_chan;
		for (uint ipol=0; ipol<polarizations; ipol++)
		{
		if (!isnan(vis_real[ifine]))
		{

w-stacking.h

+6 −6

Original line number	Diff line number	Diff line
		@@ -13,9 +13,9 @@ extern "C"
		extern "C" {
		void wstack(
		int,
		long,
		long,
		long,
		unsigned int,
		unsigned int,
		unsigned int,
		double*,
		double*,
		double*,
		@@ -34,9 +34,9 @@ void wstack(
		#else
		void wstack(
		int,
		long,
		long,
		long,
		unsigned int,
		unsigned int,
		unsigned int,
		double*,
		double*,
		double*,