Checking GPU scalability (3835bd76) · Commits · Claudio Gheller / HPC_Imaging

gridding_nccl.cpp

+4 −3

Original line number	Diff line number	Diff line
		@@ -190,7 +190,6 @@ void gridding_data(){
		printf("Processing sector %ld\n",isector);
		#endif

		start = CPU_TIME_wt;

		double *stacking_target_array;
		if ( size > 1 )
		@@ -198,6 +197,8 @@ void gridding_data(){
		else
		stacking_target_array = grid;

		start = CPU_TIME_wt;

		//We have to call different GPUs per MPI task!!! [GL]
		wstack(param.num_w_planes,
		Nsec,

phase_correction.cu

+4 −2

Original line number	Diff line number	Diff line
		@@ -234,13 +234,15 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
		}

		#else
		omp_set_default_device(rank % omp_get_num_devices());

		#if !defined(__clang__)

		#pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) device(rank % omp_get_num_devices())
		#pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])

		#else

		#pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) device(rank % omp_get_num_devices())
		#pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
		#endif

		for (int iw=0; iw<num_w_planes; iw++)

w-stacking.cu

+9 −9

Original line number	Diff line number	Diff line
		@@ -5,12 +5,9 @@
		#include <math.h>
		#include <stdlib.h>
		#include <stdio.h>
		#include "errcodes.h"

		#ifdef __CUDACC__
		#include "allvars_nccl.h"
		#else
		#include "allvars.h"
		#endif

		#include "proto.h"
		@@ -209,7 +206,7 @@ void wstack(
		int rank)
		{
		uint i;
		uint index;
		//uint index;
		uint visindex;

		// initialize the convolution kernel
		@@ -220,13 +217,14 @@ void wstack(
		double std22 = 1.0/(2.0*std*std);
		double norm = std22/PI;
		double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel));
		double overSamplingFactor = 1.0;
		int withSinc = 0;
		double alpha = 8.6;

		#ifdef GAUSS
		makeGaussKernel(convkernel,w_support,increaseprecision,std22);
		#endif
		#ifdef KAISERBESSEL
		double overSamplingFactor = 1.0;
		int withSinc = 0;
		double alpha = 8.6;
		makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc);
		#endif

		@@ -328,9 +326,9 @@ void wstack(
		#endif

		#ifdef ACCOMP
		omp_set_default_device(rank % omp_get_num_devices());
		uint Nvis = num_points*freq_per_chan*polarizations;
		// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
		// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
		#pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
		#else
		#pragma omp parallel for private(visindex)
		#endif
		@@ -422,6 +420,8 @@ void wstack(
		}

		}

		#pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan],grid[0:2*num_w_planes*grid_size_x*grid_size_y])
		// End switch between CUDA and CPU versions
		#endif
		//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);