Trying to fix the bug also on GPU reduce (821d1bd6) · Commits · Claudio Gheller / HPC_Imaging

Makefile

+2 −2

Original line number	Diff line number	Diff line
		@@ -58,7 +58,7 @@ OPT += -DPHASE_ON
		#OPT += -DNORMALIZE_UVW

		# Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
		OPT += -DGAUSS_HI_PRECISION
		#OPT += -DGAUSS_HI_PRECISION

		#OPT += -DGAUSS

		@@ -76,7 +76,7 @@ OPT += -DGAUSS_HI_PRECISION
		#OPT += -DCUDACC

		# use GPU acceleration via OMP
		#OPT += -DACCOMP
		OPT += -DACCOMP

		# use NVIDIA GPU to perform the reduce
		#OPT += -DNCCL_REDUCE

+15 −6

Original line number	Diff line number	Diff line
		@@ -77,7 +77,8 @@ void gridding_data(){

		// allocate sector arrays
		// note: we use the largest allocation among all sectors

		//
		unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t);
		double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
		(Nvissec*2+Nweightss)*sizeof(float_t) );

		@@ -128,6 +129,8 @@ void gridding_data(){

		double start = CPU_TIME_wt;

		memset( memory, 0, mem_size );

		// select data for this sector
		uint icount = 0;
		uint ip = 0;
		@@ -197,6 +200,12 @@ void gridding_data(){

		start = CPU_TIME_wt;

		double *stacking_target_array;
		if ( size > 1 )
		stacking_target_array = gridss;
		else
		stacking_target_array = grid;

		//We have to call different GPUs per MPI task!!! [GL]
		wstack(param.num_w_planes,
		Nsec,
		@@ -213,7 +222,7 @@ void gridding_data(){
		param.w_support,
		xaxis,
		yaxis,
		gridss,
		stacking_target_array,
		param.num_threads,
		rank);
		//Allocate memory on devices non-blocking for the host
		@@ -246,10 +255,10 @@ void gridding_data(){

		timing_wt.reduce += CPU_TIME_wt - start;

		}

		// Go to next sector
		memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
		}

		}

+17 −8

Original line number	Diff line number	Diff line
		@@ -75,9 +75,10 @@ void gridding_data(){

		// allocate sector arrays
		// note: we use the largest allocation among all sectors

		double_ty *memory = (double*) malloc ( (Nsec*3)*sizeof(double_ty) +
		(Nvissec*2+Nweightss)*sizeof(float_ty) );
		//
		unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t);
		double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
		(Nvissec*2+Nweightss)*sizeof(float_t) );

		if ( memory == NULL )
		shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
		@@ -126,6 +127,8 @@ void gridding_data(){

		double start = CPU_TIME_wt;

		memset( memory, 0, mem_size );

		// select data for this sector
		uint icount = 0;
		uint ip = 0;
		@@ -195,6 +198,12 @@ void gridding_data(){

		start = CPU_TIME_wt;

		double *stacking_target_array;
		if ( size > 1 )
		stacking_target_array = gridss;
		else
		stacking_target_array = grid;

		//We have to call different GPUs per MPI task!!! [GL]
		wstack(param.num_w_planes,
		Nsec,
		@@ -211,7 +220,7 @@ void gridding_data(){
		param.w_support,
		xaxis,
		yaxis,
		gridss,
		stacking_target_array,
		param.num_threads,
		rank);
		//Allocate memory on devices non-blocking for the host
		@@ -244,10 +253,10 @@ void gridding_data(){

		timing_wt.reduce += CPU_TIME_wt - start;

		}

		// Go to next sector
		memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
		}

		}