Commit 732d1d44 authored by Emanuele De Rubeis's avatar Emanuele De Rubeis
Browse files

Full AMD GPUs enabling

parent f7773175
Loading
Loading
Loading
Loading
+59 −8
Original line number Diff line number Diff line
@@ -91,18 +91,29 @@ OPT += -DGAUSS_HI_PRECISION
# use GPU to perform FFT
#OPT += -DCUFFTMP

# FULL GPU SUPPORT - Recommended for full NVIDIA GPU code execution
# FULL NVIDIA GPU SUPPORT - Recommended for full NVIDIA GPU code execution
OPT += -DFULL_NVIDIA
ifeq (FULL_NVIDIA,$(findstring FULL_NVIDIA,$(OPT)))
OPT += -DCUDACC -DNCCL_REDUCE -DCUFFTMP
endif



# use HIP for GPUs
#OPT += -DHIPCC

# support for AMD GPUs
#OPT += __HIP_PLATFORM_AMD__
#OPT += -D__HIP_PLATFORM_AMD__

# use AMD GPU to perform the reduce
#OPT += -DRCCL_REDUCE

# FULL AMD GPU SUPPORT - Recommended for full AMD GPU code execution
#OPT += -DFULL_AMD
ifeq (FULL_AMD,$(findstring FULL_AMD,$(OPT)))
OPT += -DHIPCC -DRCCL_REDUCE -D__HIP_PLATFORM_AMD__
endif

# =======================================================

# OUTPUT HANDLING
@@ -167,6 +178,11 @@ CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform
DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu
OBJ_ACC_CUDA = phase_correction.o w-stacking.o

# ----- define which files will be compiled by HIPCC for AMD
#
DEPS_ACC_HIP = w-stacking.hip.hpp w-stacking.hip.cpp phase_correction.hip.cpp
OBJ_ACC_HIP = phase_correction.hip.o w-stacking.hip.o

# ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD
#
DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c
@@ -180,14 +196,12 @@ ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
DEPS_NCCL_REDUCE = gridding_nccl.cu
OBJ_NCCL_REDUCE  = gridding_nccl.o

DEPS_RCCL_REDUCE = gridding_rccl.cu
OBJ_RCCL_REDUCE  = gridding_rccl.o
else ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
DEPS_RCCL_REDUCE = gridding_rccl.hip.cpp allvars_rccl.hip.hpp
OBJ_RCCL_REDUCE  = gridding_rccl.hip.o
else
DEPS_NCCL_REDUCE = gridding_nccl.cpp
OBJ_NCCL_REDUCE  = gridding_nccl.o

DEPS_RCCL_REDUCE = gridding_rccl.cpp
OBJ_RCCL_REDUCE  = gridding_rccl.o
endif

# ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
@@ -211,6 +225,8 @@ ifeq (ACCOMP,$(findstring ACCOMP, $(OPT)))
OBJ = $(CC_OBJ_ACC)
else ifeq (CUDACC,$(findstring CUDACC, $(OPT)))
OBJ = $(CC_OBJ_ACC)
else ifeq (HIPCC,$(findstring HIPCC, $(OPT)))
OBJ = $(CC_OBJ_ACC)
else 
OBJ = $(CC_OBJ_NOACC)
endif
@@ -240,6 +256,22 @@ cuda_fft.cpp: cuda_fft.cu
gridding_nccl.cpp: gridding_nccl.cu
	cp gridding_nccl.cu gridding_nccl.cpp

gridding_rccl.cpp: gridding_rccl.cu
	cp gridding_rccl.cu gridding_rccl.cpp

else ifneq (HIPCC,$(findstring HIPCC,$(OPT)))
w-stacking.c: w-stacking.cu
	cp w-stacking.cu w-stacking.c

phase_correction.c: phase_correction.cu
	cp phase_correction.cu phase_correction.c

cuda_fft.cpp: cuda_fft.cu
	cp cuda_fft.cu cuda_fft.cpp

gridding_nccl.cpp: gridding_nccl.cu
	cp gridding_nccl.cu gridding_nccl.cpp

gridding_rccl.cpp: gridding_rccl.cu
	cp gridding_rccl.cu gridding_rccl.cpp
else
@@ -277,6 +309,15 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
OBJ += $(OBJ_ACC_CUDA)
endif

ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-hip
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE) 
LIBS=$(AMDLIB)
$(OBJ_ACC_HIP): $(DEPS_ACC_HIP)
	$(HIPCC) $(OPT) $(OPT_HIPCC) $(CFLAGS) -c w-stacking.hip.cpp phase_correction.hip.cpp $(LIBS)
OBJ += $(OBJ_ACC_HIP)
endif

ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))

@@ -317,6 +358,16 @@ LIBS=$(NVLIB_3)
$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
	$(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_NCCL_REDUCE)

else ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE) $(CFLAGS)
LIBS=$(AMDLIB_3) 
$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
	$(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS)
OBJ += $(OBJ_NCCL_REDUCE)

else

EXEC_EXT := $(EXEC_EXT)_acc-reduce

allvars_rccl.hip.hpp

0 → 100755
+176 −0
Original line number Diff line number Diff line
/* file to store global variables*/

#if defined(__STDC__)
#  if (__STDC_VERSION__ >= 199901L)
#     define _XOPEN_SOURCE 700
#  endif
#endif

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <unistd.h>


#if !defined( RCCL_REDUCE ) && !defined(__HIPCC__)
#include <stdatomic.h>
#endif

#include <mpi.h>

#if defined (_OPENMP)
#include <omp.h>
#endif

#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
#include <fftw3-mpi.h>
#endif

#if defined(ACCOMP)               
#include "w-stacking_omp.h"
#else
#include "w-stacking.hip.hpp"
#endif 

#if defined(NVIDIA)
#include <cuda_runtime.h>
#endif

#include "fft.h"
#include "numa.h"
#include "timing.h"
#include "errcodes.h"

#define PI 3.14159265359
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
#define NOVERBOSE
#define NFILES 100

#define NAME_LEN 50
#define LONGNAME_LEN 1000


/* Reduce-strategy selectors stored in param.reduce_method. */
#define REDUCE_MPI  0
#define REDUCE_RING 1

/* Debug printf gated on verbosity level and (optionally) a single rank:
   prints when verbose_level >= LEVEL and either t == -1 (all ranks) or
   T == t (this rank only).  Compiles away entirely without -DDEBUG. */
#if defined(DEBUG)
#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) &&	\
				      ( ((t) ==-1 ) || ((T)==(t)) ) ) {	\
    printf(__VA_ARGS__); fflush(stdout); }

#else
#define dprintf(...)
#endif

/* Precision-selectable scalar types used throughout the gridding code.
   NOTE(review): <math.h> may also define double_t/float_t when
   __STDC_VERSION__ >= C99 — confirm these typedefs do not conflict on
   the target toolchain. */
typedef double double_t;
#if defined(DOUBLE_PRECISION)
typedef double float_t;
#else
typedef float float_t;
#endif

/* Short aliases for unsigned integer types. */
typedef unsigned int       uint;
typedef unsigned long long ull;


/* Open file handles for outputs (grid, FFT products); opened/closed by the
   I/O routines elsewhere in the project. */
extern struct io
{
	FILE * pFile;
        FILE * pFile1;
        FILE * pFilereal;
        FILE * pFileimg;
} file;

/* Input file names (fixed-size buffers; NAME_LEN characters each). */
extern struct ip
{
	char ufile[NAME_LEN];
  	char vfile[NAME_LEN];
  	char wfile[NAME_LEN];
  	char weightsfile[NAME_LEN];
  	char visrealfile[NAME_LEN];
  	char visimgfile[NAME_LEN];
  	char metafile[NAME_LEN];
        char paramfile[NAME_LEN];
} in;

/* Output file names; `out` holds the active set, `outparam` the values read
   from the parameter file. */
extern struct op
{
	char outfile[NAME_LEN];
        char outfile1[NAME_LEN];
        char outfile2[NAME_LEN];
        char outfile3[NAME_LEN];
        char fftfile[NAME_LEN];
        char fftfile_writedata1[NAME_LEN];
        char fftfile_writedata2[NAME_LEN];
        char fftfile2[NAME_LEN];
        char fftfile3[NAME_LEN];
        char logfile[NAME_LEN];
        char extension[NAME_LEN];
        char timingfile[NAME_LEN];

} out, outparam;

/* Dataset metadata read from the meta file: sample counts, channel/
   polarisation layout, time span, and u-v-w extents. */
extern struct meta
{

  uint   Nmeasures;
  uint   Nvis;
  uint   Nweights;
  uint   freq_per_chan;
  uint   polarisations;
  uint   Ntimes;
  double dt;
  double thours;
  uint   baselines;
  double uvmin;
  double uvmax;
  double wmin;
  double wmax;
} metaData;


/* Run parameters from the parameter file: threading, input datasets,
   grid geometry, w-planes, convolution support, reduce strategy. */
extern struct parameter
{
  int  num_threads;
  int  ndatasets;
  char datapath_multi[NFILES][LONGNAME_LEN];
  int  grid_size_x;
  int  grid_size_y;
  int  num_w_planes;
  int  w_support;
  int  reduce_method;
} param;

/* In-memory visibility data: u/v/w coordinates, weights, and the real and
   imaginary visibility components. */
extern struct fileData
{
        double * uu;
        double * vv;
        double * ww;
        float * weights;
        float * visreal;
        float * visimg;
}data;


/* Scratch name buffers and the active data path. */
extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN];
extern char datapath[LONGNAME_LEN];
/* Local slab size (xaxis x yaxis), MPI rank and communicator size. */
extern int  xaxis, yaxis;
extern int  rank;
extern int  size;
/* Sector decomposition along v, and this rank's starting row. */
extern uint nsectors;
extern uint startrow;
/* Derived gridding quantities: angular resolution, cell sizes, w-support. */
extern double_t resolution, dx, dw, w_supporth;

/* Per-sector index lists and per-sector counts used to pack data. */
extern uint **sectorarray;
extern uint  *histo_send;
extern int    verbose_level; 


/* Grid storage: host/device pointers for the full grid and per-sector
   stacking buffers (real/imag interleaved). */
extern uint    size_of_grid;
extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu;

/* Communicator and RMA window used by the MPI-based reduce paths. */
extern MPI_Comm MYMPI_COMM_WORLD;
extern MPI_Win  slabwin;

gridding_rccl.hip.cpp

0 → 100755
+290 −0
Original line number Diff line number Diff line
#include "allvars_rccl.hip.hpp"
#include "proto.h"
#include <hip/hip_runtime.h>
#include <rccl.h>

/*
 * Implements the gridding of data on AMD GPUs
 * by using the RCCL collective library
 * (API-compatible with NVIDIA's NCCL).
 */


#if defined( RCCL_REDUCE )

/*
#define NCCLCHECK(cmd) do {                         
ncclResult_t r = cmd;                             
if (r!= ncclSuccess) {                            
  printf("Failed, NCCL error %s:%d '%s'\n",       
	 __FILE__,__LINE__,ncclGetErrorString(r));   
  exit(EXIT_FAILURE);                             
 }                                                 
} while(0)
*/  


/* DJB2a string hash (h = h*33 XOR byte, seed 5381); used to derive a
   per-host identifier from the hostname so ranks on the same node can
   be grouped when assigning GPUs. */
static uint64_t getHostHash(const char* string) {
  uint64_t h = 5381;
  for (const char *p = string; *p != '\0'; ++p)
    h = ((h << 5) + h) ^ (uint64_t)(*p);
  return h;
}


/* Copy this machine's short hostname (up to the first '.') into `hostname`.
 *
 * hostname : caller-supplied buffer of at least `maxlen` bytes (overwritten)
 * maxlen   : buffer capacity in bytes
 *
 * FIX: POSIX leaves it unspecified whether gethostname() NUL-terminates the
 * buffer when the name is truncated, so we terminate it explicitly before
 * scanning for the domain separator. */
static void getHostName(char* hostname, int maxlen) {
  gethostname(hostname, maxlen);
  hostname[maxlen - 1] = '\0';   /* guarantee termination on truncation */
  for (int i = 0; i < maxlen; i++) {
    if (hostname[i] == '.') {
        hostname[i] = '\0';      /* keep only the short (node) name */
        return;
    }
  }
}




/* Grid the visibility data on AMD GPUs, sector by sector, and reduce the
 * per-rank partial grids across MPI tasks with RCCL (AMD's NCCL-compatible
 * collective library).  One HIP device is selected per node-local rank, and
 * two HIP streams are used: one for the w-stacking kernel, one for the
 * reduce.  Operates on the file-scope globals declared in
 * allvars_rccl.hip.hpp (metaData, param, data, histo_send, sectorarray,
 * grid, grid_gpu, gridss_gpu, timing_wt, ...); no parameters, no return.
 */
void gridding_data(){

  // vertical extent of one sector in v (sectors partition the v axis)
  double shift = (double)(dx*yaxis);

  timing_wt.kernel     = 0.0;
  timing_wt.reduce     = 0.0;
  timing_wt.reduce_mpi = 0.0;
  timing_wt.reduce_sh  = 0.0;
  timing_wt.compose    = 0.0;
  
  // calculate the resolution in radians
  resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));

  // calculate the resolution in arcsec 
  double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
  if ( rank == 0 )
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);

  // find the largest value in histo_send[]
  //                                                                  
 
  
    //Initialize nccl

  // Determine this rank's node-local index by hashing hostnames and
  // counting lower-numbered ranks on the same host; used for hipSetDevice.
  //double *gridss_gpu, *grid_gpu;
  int local_rank = 0;

  uint64_t hostHashs[size];
  char hostname[1024];
  getHostName(hostname, 1024);
  hostHashs[rank] = getHostHash(hostname);
  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
  for (int p=0; p<size; p++) {
     if (p == rank) break;
     if (hostHashs[p] == hostHashs[rank]) local_rank++;
  }
  
  ncclUniqueId id;
  ncclComm_t comm;
  hipError_t nnn;
  hipStream_t stream_reduce, stream_stacking;

  // Rank 0 creates the RCCL communicator id and broadcasts it to everyone.
  // NOTE(review): uses MPI_COMM_WORLD here while the rest of the project
  // declares MYMPI_COMM_WORLD — confirm they are the same communicator.
  if (rank == 0) ncclGetUniqueId(&id);
  MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

  hipSetDevice(local_rank);

  // Device buffers: full slab grid and per-sector stacking grid
  // (2*num_w_planes*xaxis*yaxis doubles each: interleaved real/imag).
  // NOTE(review): hip return codes are stored in n/nnn but never checked.
  int n = hipMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  n = hipMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  n = hipStreamCreate(&stream_reduce);
  n = hipStreamCreate(&stream_stacking);
  
  ncclCommInitRank(&comm, size, id, rank);

  // Main loop: pack, stack, and reduce one v-sector at a time.
  for (uint isector = 0; isector < nsectors; isector++)
    {

      double start = CPU_TIME_wt;

      // Sizes for this sector: samples, weights, visibilities.
      uint Nsec            = histo_send[isector];
      uint Nweightss       = Nsec*metaData.polarisations;
      uint Nvissec         = Nweightss*metaData.freq_per_chan;
      // Single allocation carved into u/v/w, weights, real/imag arrays.
      double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
						(Nvissec*2+Nweightss)*sizeof(float_t) );

      if ( memory == NULL )
	shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
  
      double_t *uus        = (double_t*) memory;
      double_t *vvs        = (double_t*) uus+Nsec;
      double_t *wws        = (double_t*) vvs+Nsec;
      float_t  *weightss   = (float_t*)((double_t*)wws+Nsec);
      float_t  *visreals   = (float_t*)weightss + Nweightss;
      float_t  *visimgs    = (float_t*)visreals + Nvissec;
  
      
      
      // select data for this sector
      uint icount = 0;
      uint ip = 0;
      uint inu = 0;

      #warning "this loop should be threaded"
      #warning "the counter of this loop should not be int"
      // Gather this sector's samples via the precomputed index lists;
      // v is shifted into sector-local coordinates.
      for(int iphi = histo_send[isector]-1; iphi>=0; iphi--)
        {

	  uint ilocal = sectorarray[isector][iphi];

	  uus[icount] = data.uu[ilocal];
	  vvs[icount] = data.vv[ilocal]-isector*shift;
	  wws[icount] = data.ww[ilocal];
	  for (uint ipol=0; ipol<metaData.polarisations; ipol++)
	    {
	      weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
	      ip++;
	    }
	  for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
	    {
	      visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
	      visimgs[inu]  = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];

	      inu++;
	    }
	  icount++;
	}

      // Diagnostic extents of the packed u/v coordinates (parallel min/max).
      double uumin = 1e20;
      double vvmin = 1e20;
      double uumax = -1e20;
      double vvmax = -1e20;

      #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads)
      {
        double my_uumin = 1e20;
        double my_vvmin = 1e20;
        double my_uumax = -1e20;
        double my_vvmax = -1e20;

       #pragma omp for
        for (uint ipart=0; ipart<Nsec; ipart++)
          {
            my_uumin = MIN(my_uumin, uus[ipart]);
            my_uumax = MAX(my_uumax, uus[ipart]);
            my_vvmin = MIN(my_vvmin, vvs[ipart]);
            my_vvmax = MAX(my_vvmax, vvs[ipart]);
          }

        uumin = MIN( uumin, my_uumin );
        uumax = MAX( uumax, my_uumax );
        vvmin = MIN( vvmin, my_vvmin );
        vvmax = MAX( vvmax, my_vvmax );
      }

      timing_wt.compose += CPU_TIME_wt - start;

      //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
      
      // Make convolution on the grid

     #ifdef VERBOSE
      // NOTE(review): isector is uint but the format is %ld — confirm.
      printf("Processing sector %ld\n",isector);
     #endif
          
      start = CPU_TIME_wt;
	    
     //We have to call different GPUs per MPI task!!! [GL]
     // HIPCC path stacks directly into the device buffer on stream_stacking;
     // the fallback stacks into the host-side gridss.
     #ifdef HIPCC
      wstack(param.num_w_planes,
	     Nsec,
	     metaData.freq_per_chan,
	     metaData.polarisations,
	     uus,
	     vvs,
	     wws,
	     visreals,
	     visimgs,
	     weightss,
	     dx,
	     dw,
	     param.w_support,
	     xaxis,
	     yaxis,
	     gridss_gpu,
	     param.num_threads,
	     rank,
	     stream_stacking);
      #else
      wstack(param.num_w_planes,
	     Nsec,
	     metaData.freq_per_chan,
	     metaData.polarisations,
	     uus,
	     vvs,
	     wws,
	     visreals,
	     visimgs,
	     weightss,
	     dx,
	     dw,
	     param.w_support,
	     xaxis,
	     yaxis,
	     gridss,
	     param.num_threads,
	     rank);
      #endif
      //Allocate memory on devices non-blocking for the host                                                                                   
      ///////////////////////////////////////////////////////


      timing_wt.kernel += CPU_TIME_wt - start;
      
     #ifdef VERBOSE
      printf("Processed sector %ld\n",isector);
     #endif
      

      if( size > 1 )
	{
     
	  // Write grid in the corresponding remote slab
     
	  // int target_rank = (int)isector;    it implied that size >= nsectors
	  int target_rank = (int)(isector % size);

	  start = CPU_TIME_wt;

	  // Sum all ranks' sector grids onto the owner of this sector.
	  ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
	  n = hipStreamSynchronize(stream_reduce);
      
	  timing_wt.reduce += CPU_TIME_wt - start;

	  // Go to next sector
	  // NOTE(review): hipMemset takes an int fill value; the 0.0 double
	  // literal works only because it converts to 0 — confirm intent.
	  nnn = hipMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
	  if (nnn != hipSuccess) {printf("!!! w-stacking.cu hipMemset ERROR %d !!!\n", nnn);}
	}

      free(memory);
    }


  //Copy data back from device to host (to be deleted in next steps)
  
   n = hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce);
  
  MPI_Barrier(MPI_COMM_WORLD);

  // Wait for the async copy, then release device resources.
  n = hipStreamSynchronize(stream_reduce);
  n = hipFree(grid_gpu);
  n = hipFree(gridss_gpu);
  n = hipStreamDestroy(stream_reduce);
  n = hipStreamDestroy(stream_stacking);
  
  ncclCommDestroy(comm);

  return;
  
}

#endif
+315 −0
Original line number Diff line number Diff line
#ifdef _OPENMP
#include <omp.h>
#endif

#if !defined(ACCOMP)
#include "w-stacking.hip.hpp"
#else
#include "w-stacking_omp.h"
#endif

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "errcodes.h"
#include "proto.h"

// NOTE(review): this kernel is compiled under __HIPCC__ (set by the hipcc
// compiler driver) while the host code launches it under the build flag
// HIPCC — confirm both are defined together in the HIP build.
#ifdef __HIPCC__

/* Device kernel: apply the w-term phase correction to the stacked grid and
 * accumulate the corrected w-planes into the 2-D image via atomic adds.
 * Each thread handles `nbucket` consecutive grid cells; gridss holds
 * interleaved (real, imag) pairs.  With PHASE_ON and num_w_planes > 1 the
 * per-plane complex phase factor is applied; otherwise planes are summed
 * directly. */
__global__ void phase_g(int xaxis, 
		        int yaxis,
			int num_w_planes,
			double * gridss,
			double * image_real,
			double * image_imag,
			double wmin,
			double dw,
			double dwnorm,
			int xaxistot,
			int yaxistot,
			double resolution,
			int nbucket)
{
	long gid = blockIdx.x*blockDim.x + threadIdx.x;
	double add_term_real;
	double add_term_img;
	double wterm;
	// total cells divided among threads in buckets of nbucket
	long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket);

	if(gid < arraysize)
	{
	  long gid_aux = nbucket*gid;
	  for(int iaux=0; iaux<nbucket; iaux++) 
          {
		// decompose the linear cell index into (iu, iv, iw)
		int iw = gid_aux/(xaxis*yaxis);
		int ivaux = gid_aux%(xaxis*yaxis);
		int iv = ivaux/xaxis;
		int iu = ivaux%xaxis;
		long index = 2*gid_aux;          // (real, imag) pair offset
		long img_index = iu+iv*xaxis;    // output pixel

                wterm = wmin + iw*dw;

#ifdef PHASE_ON
                if (num_w_planes > 1)
                {
                    // map grid coordinates to sky angles (FFT-shifted)
                    double xcoord = (double)(iu-xaxistot/2);
                    if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
                    xcoord = sin(xcoord*resolution);
                    double ycoord = (double)(iv-yaxistot/2);
                    if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
                    ycoord = sin(ycoord*resolution);

                    double preal, pimag;
                    double radius2 = (xcoord*xcoord+ycoord*ycoord);

                    // phase factor exp(2*pi*i*w*(sqrt(1-l^2-m^2)-1))
                    // NOTE(review): radius2 can exceed 1, making sqrt(1-radius2)
                    // NaN — confirm the grid extent keeps radius2 <= 1.
                    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
                    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));

                    double p,q,r,s;
                    p = gridss[index];
                    q = gridss[index+1];
                    r = preal;
                    s = pimag;

                    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);

		    // complex multiply (p+iq)*(r+is), scaled and accumulated
		    add_term_real = (p*r-q*s)*dwnorm*sqrt(1-radius2);
		    add_term_img = (p*s+q*r)*dwnorm*sqrt(1-radius2);
		    atomicAdd(&(image_real[img_index]),add_term_real);
		    atomicAdd(&(image_imag[img_index]),add_term_img);
                } else {
		    atomicAdd(&(image_real[img_index]),gridss[index]);
		    atomicAdd(&(image_imag[img_index]),gridss[index+1]);
                }
#else
		atomicAdd(&(image_real[img_index]),gridss[index]);
		atomicAdd(&(image_imag[img_index]),gridss[index+1]);
#endif // end of PHASE_ON
		gid_aux++;
           }
	}

}

#endif

/* Apply the w-term phase correction to the stacked grid `gridss`
 * (interleaved real/imag, num_w_planes planes of xaxis*yaxis cells) and
 * accumulate the corrected planes into image_real/image_imag
 * (xaxis*yaxis doubles each; HIP path zero-fills them, CPU/OMP paths
 * accumulate into the caller-provided contents).
 *
 * Three compile-time paths: HIPCC (GPU kernel), ACCOMP (OpenMP target
 * offload), otherwise host OpenMP loops.
 *
 * FIX: the HIP path allocated image_real_g and image_imag_g but never
 * freed them, leaking 2*xaxis*yaxis doubles of device memory per call;
 * they are now released alongside gridss_g. */
void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot,
		      double resolution, double wmin, double wmax, int num_threads, int rank)
{
        double dw = (wmax-wmin)/(double)num_w_planes;
	double wterm = wmin+0.5*dw;
	double dwnorm = dw/(wmax-wmin);

#ifdef HIPCC

	// WARNING: nbucket MUST be chosen such that xaxis*yaxis*num_w_planes is a multiple of nbucket
	int nbucket = 1;
	int Nth = NTHREADS;
        long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
        if(NWORKERS == 1) {Nbl = 1; Nth = 1;};

	// one GPU per rank, assigned round-robin over the visible devices
	int ndevices;
	int m = hipGetDeviceCount(&ndevices);
	m = hipSetDevice(rank % ndevices);

	// NOTE(review): only rank 0 aborts when no devices are found — the
	// other ranks fall through; confirm whether all ranks should check.
	if ( rank == 0 ) {
	  if (0 == ndevices) {

	    shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
	  }

	}
	  printf("Running rank %d using GPU %d\n", rank, rank % ndevices);
	 #ifdef NVIDIA
	  prtAccelInfo();
	 #endif

	hipError_t mmm;
	double * image_real_g;
	double * image_imag_g;
	double * gridss_g;

	
	//Copy gridss on the device
	mmm=hipMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double));
	mmm=hipMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyHostToDevice);

	mmm=hipMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
	//printf("HIP ERROR 2 %s\n",hipGetErrorString(mmm));
	mmm=hipMalloc(&image_imag_g, xaxis*yaxis*sizeof(double));
	//printf("HIP ERROR 3 %s\n",hipGetErrorString(mmm));

	// zero the device accumulators before the kernel's atomic adds
	//printf("HIP ERROR 4 %s\n",hipGetErrorString(mmm));
	mmm=hipMemset(image_real_g, 0.0, xaxis*yaxis*sizeof(double));
	//printf("HIP ERROR 5 %s\n",hipGetErrorString(mmm));
	mmm=hipMemset(image_imag_g, 0.0, xaxis*yaxis*sizeof(double));
	//printf("HIP ERROR 6 %s\n",hipGetErrorString(mmm));

	// call the phase correction kernel
	phase_g <<<Nbl,Nth>>> (xaxis,
                               yaxis,
			       num_w_planes,
                               gridss_g,
                               image_real_g,
                               image_imag_g,
                               wmin,
                               dw,
                               dwnorm,
                               xaxistot,
                               yaxistot,
                               resolution,
			       nbucket);

	// synchronous copies: hipMemcpy waits for the kernel to finish
	mmm = hipMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost);
	//printf("HIP ERROR 7 %s\n",hipGetErrorString(mmm));
	mmm = hipMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost);
	//printf("HIP ERROR 8 %s\n",hipGetErrorString(mmm));

	mmm= hipFree(gridss_g);
	// FIX: the image buffers were previously leaked
	mmm = hipFree(image_real_g);
	mmm = hipFree(image_imag_g);
#else

#ifndef ACCOMP
	
#ifdef _OPENMP
	omp_set_num_threads(num_threads);
#endif
	
       #pragma omp parallel for collapse(2) private(wterm) 
	for (int iw=0; iw<num_w_planes; iw++)
	{
	    for (int iv=0; iv<yaxis; iv++)
            for (int iu=0; iu<xaxis; iu++)
            {

		unsigned int index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
		unsigned int img_index = iu+iv*xaxis;
		wterm = wmin + iw*dw;
#ifdef PHASE_ON
                if (num_w_planes > 1)
		{
                    // map grid coordinates to sky angles (FFT-shifted)
                    double xcoord = (double)(iu-xaxistot/2);
                    if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
		    xcoord = sin(xcoord*resolution);
                    double ycoord = (double)(iv-yaxistot/2);
                    if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
		    ycoord = sin(ycoord*resolution);

		    double preal, pimag;
		    double radius2 = (xcoord*xcoord+ycoord*ycoord);
		    if(xcoord <= 1.0)
		    {
			    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
			    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
		    } else {
			    preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
			    pimag = 0.0;
		    }

		    // NOTE(review): this unconditional recomputation makes the
		    // branch above dead code (xcoord = sin(...) is always <= 1,
		    // and radius2 > 1 yields NaN here) — confirm which form is
		    // intended before removing either.
		    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
		    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));

		    double p,q,r,s;
		    p = gridss[index];
		    q = gridss[index+1];
		    r = preal;
		    s = pimag;

		    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
		    // complex multiply (p+iq)*(r+is), scaled and accumulated
		   #pragma omp atomic
		    image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
		   #pragma omp atomic
		    image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
	        } else {
		 #pragma omp atomic
		  image_real[img_index] += gridss[index];
		 #pragma omp atomic
		  image_imag[img_index] += gridss[index+1];
		}
#else
	       #pragma omp atomic
  	        image_real[img_index] += gridss[index];
	       #pragma omp atomic
		image_imag[img_index] += gridss[index+1];
#endif // end of PHASE_ON

            }
	}

#else
	omp_set_default_device(rank % omp_get_num_devices());
	
       #if !defined(__clang__)

       #pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])

       #else

       #pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
       #endif
	
	// NOTE(review): image_real/image_imag are accumulated with += but
	// mapped `from:` only, so their host contents are not transferred to
	// the device first — confirm the device copies start zeroed.
	for (int iw=0; iw<num_w_planes; iw++)
	{
	    for (int iv=0; iv<yaxis; iv++)
            for (int iu=0; iu<xaxis; iu++)
            {

		long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
		long img_index = iu+iv*xaxis;
		wterm = wmin + iw*dw;
#ifdef PHASE_ON
                if (num_w_planes > 1)
		{
                    double xcoord = (double)(iu-xaxistot/2);
                    if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
		    xcoord = sin(xcoord*resolution);
                    double ycoord = (double)(iv-yaxistot/2);
                    if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
		    ycoord = sin(ycoord*resolution);

		    double preal, pimag;
		    double radius2 = (xcoord*xcoord+ycoord*ycoord);
		    if(xcoord <= 1.0)
		    {
			    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
			    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
		    } else {
			    preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
			    pimag = 0.0;
		    }

		    // NOTE(review): same dead-branch pattern as the host path.
		    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
		    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));

		    double p,q,r,s;
		    p = gridss[index];
		    q = gridss[index+1];
		    r = preal;
		    s = pimag;

		    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
		    #pragma omp atomic
		    image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
		    #pragma omp atomic
		    image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
	        } else {
		    #pragma omp atomic
		    image_real[img_index] += gridss[index];
		    #pragma omp atomic
		    image_imag[img_index] += gridss[index+1];
		}
#else
		#pragma omp atomic
  	        image_real[img_index] += gridss[index];
		#pragma omp atomic
		image_imag[img_index] += gridss[index+1];
#endif // end of PHASE_ON

            }
	}
	
#endif	
#endif // end of HIPCC


}

w-stacking.hip.cpp

0 → 100755
+518 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading