Stacking/Reduce on GPU with reduced communication (9e323f26) · Commits · Claudio Gheller / HPC_Imaging

Makefile

+48 −13

Original line number	Diff line number	Diff line
		@@ -59,9 +59,9 @@ OPT += -DPHASE_ON
		#OPT += -DNORMALIZE_UVW

		# Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
		OPT += -DGAUSS_HI_PRECISION
		#OPT += -DGAUSS_HI_PRECISION

		#OPT += -DGAUSS
		OPT += -DGAUSS

		#OPT += -DKAISERBESSEL

		@@ -77,7 +77,7 @@ OPT += -DGAUSS_HI_PRECISION
		#OPT += -DCUDACC

		# use GPU acceleration via OMP
		OPT += -DACCOMP
		#OPT += -DACCOMP

		# perform stacking on GPUs
		#OPT += -DGPU_STACKING
		@@ -125,7 +125,7 @@ DEPS = w-stacking.h main.c allvars.h
		# ----- define which files will be compiled by MPICC
		#
		# these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided
		CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
		CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform_3d.o result.o numa.o reduce.o w-stacking.o phase_correction.o

		# these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on
		CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
		@@ -142,13 +142,22 @@ DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c
		OBJ_ACC_OMP = phase_correction.o w-stacking.o



		# ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is
		# offloaded on GPU
		ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
		DEPS_NCCL_REDUCE = gridding_nccl.cu
		OBJ_NCCL_REDUCE = gridding_nccl.o

		DEPS_RCCL_REDUCE = gridding_rccl.cu
		OBJ_RCCL_REDUCE = gridding_rccl.o
		else
		DEPS_NCCL_REDUCE = gridding_nccl.cpp
		OBJ_NCCL_REDUCE = gridding_nccl.o

		DEPS_RCCL_REDUCE = gridding_rccl.cpp
		OBJ_RCCL_REDUCE = gridding_rccl.o
		endif

		# ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
		#
		@@ -196,6 +205,12 @@ phase_correction.c: phase_correction.cu

		cuda_fft.cpp: cuda_fft.cu
		cp cuda_fft.cu cuda_fft.cpp

		gridding_nccl.cpp: gridding_nccl.cu
		cp gridding_nccl.cu gridding_nccl.cpp

		gridding_rccl.cpp: gridding_rccl.cu
		cp gridding_rccl.cu gridding_rccl.cpp
		else
		w-stacking.c: w-stacking.cu
		rm -f w-stacking.c
		@@ -206,6 +221,12 @@ phase_correction.c: phase_correction.cu
		cuda_fft.cpp: cuda_fft.cu
		rm -f cuda_fft.cpp
		touch cuda_fft.cpp
		gridding_nccl.cpp: gridding_nccl.cu
		rm -f gridding_nccl.cpp
		touch gridding_nccl.cpp
		gridding_rccl.cpp: gridding_rccl.cu
		rm -f gridding_rccl.cpp
		touch gridding_rccl.cpp
		endif


		@@ -221,7 +242,7 @@ LINKER=$(MPIC++)
		FLAGS=$(OPTIMIZE)
		LIBS=$(NVLIB)
		$(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
		$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS)
		$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c w-stacking.cu phase_correction.cu $(LIBS)
		OBJ += $(OBJ_ACC_CUDA)
		endif

		@@ -256,6 +277,17 @@ endif


		ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT)))

		ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_acc-reduce
		LINKER=$(MPIC++)
		FLAGS=$(OPTIMIZE)
		LIBS=$(NVLIB_3)
		$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
		$(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS)
		OBJ += $(OBJ_NCCL_REDUCE)
		else

		EXEC_EXT := $(EXEC_EXT)_acc-reduce
		LINKER=$(NVC++)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		@@ -264,6 +296,7 @@ $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
		$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
		OBJ += $(OBJ_NCCL_REDUCE)
		endif
		endif

		ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_acc-reduce
		@@ -319,9 +352,11 @@ clean:
		rm -f w-stacking.c
		rm -f phase_correction.c
		rm -f cuda_fft.cpp

		cleanall:
		rm -f $(EXEC)$(EXT)
		rm -f *.o
		rm -f w-stacking.c
		rm -f phase_correction.c
		rm -f gridding_nccl.cpp
		rm -f gridding_rccl.cpp

		#cleanall:
		# rm -f $(EXEC)$(EXT)
		# rm -f *.o
		# rm -f w-stacking.c
		# rm -f phase_correction.c

allvars_nccl.h

+0 −2

Original line number	Diff line number	Diff line
		@@ -23,8 +23,6 @@
		#include <omp.h>
		#endif



		#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
		#include <fftw3-mpi.h>
		#endif

gridding_nccl.cpp→gridding_nccl.cu

+33 −15

Original line number	Diff line number	Diff line

		#include "allvars_nccl.h"
		#include "proto.h"
		#include <cuda.h>
		@@ -87,7 +86,8 @@ void gridding_data(){

		ncclUniqueId id;
		ncclComm_t comm;
		cudaStream_t stream_reduce;
		cudaError_t nnn;
		cudaStream_t stream_reduce, stream_stacking;

		if (rank == 0) ncclGetUniqueId(&id);
		MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
		@@ -97,6 +97,7 @@ void gridding_data(){
		cudaMalloc(&grid_gpu, 2param.num_w_planesxaxisyaxis sizeof(double));
		cudaMalloc(&gridss_gpu, 2param.num_w_planesxaxisyaxis sizeof(double));
		cudaStreamCreate(&stream_reduce);
		cudaStreamCreate(&stream_stacking);

		ncclCommInitRank(&comm, size, id, rank);

		@@ -190,16 +191,37 @@ void gridding_data(){
		printf("Processing sector %ld\n",isector);
		#endif


		double *stacking_target_array;
		if ( size > 1 )
		stacking_target_array = gridss;
		else
		stacking_target_array = grid;


		start = CPU_TIME_wt;

		//We have to call different GPUs per MPI task!!! [GL]
		#ifdef CUDACC
		wstack(param.num_w_planes,
		Nsec,
		metaData.freq_per_chan,
		metaData.polarisations,
		uus,
		vvs,
		wws,
		visreals,
		visimgs,
		weightss,
		dx,
		dw,
		param.w_support,
		xaxis,
		yaxis,
		gridss_gpu,
		param.num_threads,
		rank,
		stream_stacking);
		#else
		wstack(param.num_w_planes,
		Nsec,
		metaData.freq_per_chan,
		@@ -218,14 +240,13 @@ void gridding_data(){
		stacking_target_array,
		param.num_threads,
		rank);
		#endif
		//Allocate memory on devices non-blocking for the host
		///////////////////////////////////////////////////////


		timing_wt.kernel += CPU_TIME_wt - start;

		cudaMemcpyAsync(gridss_gpu, gridss, 2param.num_w_planesxaxisyaxissizeof(double), cudaMemcpyHostToDevice, stream_reduce);

		#ifdef VERBOSE
		printf("Processed sector %ld\n",isector);
		#endif
		@@ -239,8 +260,6 @@ void gridding_data(){
		// int target_rank = (int)isector; it implied that size >= nsectors
		int target_rank = (int)(isector % size);

		cudaStreamSynchronize(stream_reduce);

		start = CPU_TIME_wt;

		ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
		@@ -248,9 +267,9 @@ void gridding_data(){

		timing_wt.reduce += CPU_TIME_wt - start;


		// Go to next sector
		memset ( gridss, 0, 2param.num_w_planesxaxisyaxis sizeof(double) );
		nnn = cudaMemset( gridss_gpu, 0.0, 2param.num_w_planesxaxisyaxis sizeof(double) );
		if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);}
		}

		free(memory);
		@@ -258,16 +277,15 @@ void gridding_data(){

		//Copy data back from device to host (to be deleted in next steps)


		cudaMemcpyAsync(grid, grid_gpu, 2param.num_w_planesxaxisyaxissizeof(double), cudaMemcpyDeviceToHost, stream_reduce);

		MPI_Barrier(MPI_COMM_WORLD);

		cudaStreamSynchronize(stream_reduce);
		cudaFree(gridss_gpu);
		cudaFree(grid_gpu);

		cudaFree(gridss_gpu);
		cudaStreamDestroy(stream_reduce);
		cudaStreamDestroy(stream_stacking);

		ncclCommDestroy(comm);

w-stacking.cu

+131 −85

Original line number	Diff line number	Diff line
		@@ -116,6 +116,8 @@ __global__ void convolve_g(
		double* grid,
		double std22)



		{
		//printf("DENTRO AL KERNEL\n");
		uint gid = blockIdx.x*blockDim.x + threadIdx.x;
		@@ -131,6 +133,7 @@ __global__ void convolve_g(
		double pos_u = uu[i] / dx;
		double pos_v = vv[i] / dx;
		double ww_i = ww[i] / dw;

		int grid_w = (int)ww_i;
		int grid_u = (int)pos_u;
		int grid_v = (int)pos_v;
		@@ -140,22 +143,32 @@ __global__ void convolve_g(
		uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
		uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
		uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
		//printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);


		// Convolve this point onto the grid.
		for (k = kmin; k <= kmax; k++)
		{

		double v_dist = (double)k+0.5 - pos_v;
		double v_dist = (double)k - pos_v;
		int increaseprecision = 5;

		for (j = jmin; j <= jmax; j++)
		{
		double u_dist = (double)j+0.5 - pos_u;
		uint iKer = 2 * (j + kgrid_size_x + grid_wgrid_size_x*grid_size_y);
		//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
		int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
		int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));

		#ifdef GAUSS_HI_PRECISION
		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
		#endif
		#ifdef GAUSS
		double conv_weight = convkernel[jKer]*convkernel[kKer];
		#endif
		#ifdef KAISERBESSEL
		double conv_weight = convkernel[jKer]*convkernel[kKer];
		#endif

		// Loops over frequencies and polarizations
		double add_term_real = 0.0;
		double add_term_img = 0.0;
		@@ -203,7 +216,14 @@ void wstack(
		int grid_size_y,
		double* grid,
		int num_threads,
		int rank)
		#ifdef CUDACC
		int rank,
		cudaStream_t stream_stacking
		#else
		int rank
		#endif
		)

		{
		uint i;
		//uint index;
		@@ -249,13 +269,10 @@ void wstack(
		}
		}

		printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices);
		#ifdef NVIDIA
		prtAccelInfo();
		#endif

		printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);

		// Create GPU arrays and offload them
		double * uu_g;
		double * vv_g;
		@@ -263,32 +280,48 @@ void wstack(
		float * vis_real_g;
		float * vis_img_g;
		float * weight_g;
		double * grid_g;
		//double * grid_g;
		double * convkernel_g;

		//Create the event inside stream stacking
		//cudaEvent_t event_kernel;

		//for (int i=0; i<100000; i++)grid[i]=23.0;
		cudaError_t mmm;
		//mmm=cudaEventCreate(&event_kernel);
		mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
		mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
		mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
		mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
		mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
		mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
		mmm=cudaMalloc(&grid_g,2num_w_planesgrid_size_xgrid_size_ysizeof(double));
		//mmm=cudaMalloc(&grid_g,2num_w_planesgrid_size_xgrid_size_ysizeof(double));
		/*
		#if !defined(GAUSS_HI_PRECISION)
		mmm=cudaMalloc(&convkernel_g,increaseprecisionw_supportsizeof(double));
		#endif
		*/
		if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);}
		mmm=cudaMemset(grid_g,0.0,2num_w_planesgrid_size_xgrid_size_ysizeof(double));
		//mmm=cudaMemset(grid_g,0.0,2num_w_planesgrid_size_xgrid_size_ysizeof(double));
		if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);}


		mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
		mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
		mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
		mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
		mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
		mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
		if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpy ERROR %d !!!\n", mmm);}
		mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
		mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
		mmm=cudaMemcpyAsync(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
		mmm=cudaMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
		mmm=cudaMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
		mmm=cudaMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);

		/*
		#if !defined(GAUSS_HI_PRECISION)
		mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecisionw_supportsizeof(double), cudaMemcpyHostToDevice, stream_stacking);
		#endif
		*/
		if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!\n", mmm);}

		// Call main GPU Kernel
		convolve_g <<<Nbl,Nth>>> (
		convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
		num_w_planes,
		num_points,
		freq_per_chan,
		@@ -304,20 +337,33 @@ void wstack(
		KernelLen,
		grid_size_x,
		grid_size_y,
		grid_g,
		std22);
		grid,
		std22
		);

		mmm=cudaStreamSynchronize(stream_stacking);
		//Record the event
		//mmm=cudaEventRecord(event_kernel,stream_stacking);

		//Wait until the kernel ends
		//mmm=cudaStreamWaitEvent(stream_stacking,event_kernel);

		mmm = cudaMemcpy(grid, grid_g, 2num_w_planesgrid_size_xgrid_size_ysizeof(double), cudaMemcpyDeviceToHost);
		//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);

		printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));

		mmm=cudaFree(uu_g);
		mmm=cudaFree(vv_g);
		mmm=cudaFree(ww_g);
		mmm=cudaFree(vis_real_g);
		mmm=cudaFree(vis_img_g);
		mmm=cudaFree(weight_g);
		mmm=cudaFree(grid_g);

		//mmm=cudaFree(grid_g);
		/*
		#if !defined(GAUSS_HI_PRECISION)
		mmm=cudaFree(convkernel_g);
		#endif
		*/
		// Switch between CUDA and GPU versions
		# else

w-stacking.h

+9 −2

Original line number	Diff line number	Diff line
		@@ -32,7 +32,8 @@ void wstack(
		int,
		double*,
		int,
		int);
		int,
		cudaStream_t);
		}

		#else
		@@ -58,6 +59,7 @@ void wstack(
		#endif



		#ifdef __CUDACC__
		extern "C"
		#endif
		@@ -81,6 +83,11 @@ void phase_correction(
		int,
		int);

		double gauss_kernel_norm(
		double norm,
		double std22,
		double u_dist,
		double v_dist);

		#ifdef ACCOMP
		#pragma omp declare target (gauss_kernel_norm)