Commit 732d1d44 authored by Emanuele De Rubeis's avatar Emanuele De Rubeis
Browse files

Full AMD GPUs enabling

parent f7773175
Loading
Loading
Loading
Loading
+59 −8
Original line number Diff line number Diff line
@@ -91,18 +91,29 @@ OPT += -DGAUSS_HI_PRECISION
# use GPU to perform FFT
#OPT += -DCUFFTMP

# FULL GPU SUPPORT - Recommended for full NVIDIA GPU code execution
# FULL NVIDIA GPU SUPPORT - Recommended for full NVIDIA GPU code execution
OPT += -DFULL_NVIDIA
ifeq (FULL_NVIDIA,$(findstring FULL_NVIDIA,$(OPT)))
OPT += -DCUDACC -DNCCL_REDUCE -DCUFFTMP
endif



# use HIP for GPUs
#OPT += -DHIPCC

# support for AMD GPUs
#OPT += __HIP_PLATFORM_AMD__
#OPT += -D__HIP_PLATFORM_AMD__

# use AMD GPU to perform the reduce
#OPT += -DRCCL_REDUCE

# FULL AMD GPU SUPPORT - Recommended for full AMD GPU code execution
#OPT += -DFULL_AMD
ifeq (FULL_AMD,$(findstring FULL_AMD,$(OPT)))
OPT += -DHIPCC -DRCCL_REDUCE -D__HIP_PLATFORM_AMD__
endif

# =======================================================

# OUTPUT HANDLING
@@ -167,6 +178,11 @@ CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform
DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu
OBJ_ACC_CUDA = phase_correction.o w-stacking.o

# ----- define which files will be compiled by HIPCC for AMD
#
DEPS_ACC_HIP = w-stacking.hip.hpp w-stacking.hip.cpp phase_correction.hip.cpp
OBJ_ACC_HIP = phase_correction.hip.o w-stacking.hip.o

# ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD
#
DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c
@@ -180,14 +196,12 @@ ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
DEPS_NCCL_REDUCE = gridding_nccl.cu
OBJ_NCCL_REDUCE  = gridding_nccl.o

DEPS_RCCL_REDUCE = gridding_rccl.cu
OBJ_RCCL_REDUCE  = gridding_rccl.o
else ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
DEPS_RCCL_REDUCE = gridding_rccl.hip.cpp allvars_rccl.hip.hpp
OBJ_RCCL_REDUCE  = gridding_rccl.hip.o
else
DEPS_NCCL_REDUCE = gridding_nccl.cpp
OBJ_NCCL_REDUCE  = gridding_nccl.o

DEPS_RCCL_REDUCE = gridding_rccl.cpp
OBJ_RCCL_REDUCE  = gridding_rccl.o
endif

# ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
@@ -211,6 +225,8 @@ ifeq (ACCOMP,$(findstring ACCOMP, $(OPT)))
OBJ = $(CC_OBJ_ACC)
else ifeq (CUDACC,$(findstring CUDACC, $(OPT)))
OBJ = $(CC_OBJ_ACC)
else ifeq (HIPCC,$(findstring HIPCC, $(OPT)))
OBJ = $(CC_OBJ_ACC)
else 
OBJ = $(CC_OBJ_NOACC)
endif
@@ -240,6 +256,22 @@ cuda_fft.cpp: cuda_fft.cu
gridding_nccl.cpp: gridding_nccl.cu
	cp gridding_nccl.cu gridding_nccl.cpp

gridding_rccl.cpp: gridding_rccl.cu
	cp gridding_rccl.cu gridding_rccl.cpp

else ifneq (HIPCC,$(findstring HIPCC,$(OPT)))
w-stacking.c: w-stacking.cu
	cp w-stacking.cu w-stacking.c

phase_correction.c: phase_correction.cu
	cp phase_correction.cu phase_correction.c

cuda_fft.cpp: cuda_fft.cu
	cp cuda_fft.cu cuda_fft.cpp

gridding_nccl.cpp: gridding_nccl.cu
	cp gridding_nccl.cu gridding_nccl.cpp

gridding_rccl.cpp: gridding_rccl.cu
	cp gridding_rccl.cu gridding_rccl.cpp
else
@@ -277,6 +309,15 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
OBJ += $(OBJ_ACC_CUDA)
endif

ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-hip
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE) 
LIBS=$(AMDLIB)
$(OBJ_ACC_HIP): $(DEPS_ACC_HIP)
	$(HIPCC) $(OPT) $(OPT_HIPCC) $(CFLAGS) -c w-stacking.hip.cpp phase_correction.hip.cpp $(LIBS)
OBJ += $(OBJ_ACC_HIP)
endif

ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))

@@ -317,6 +358,16 @@ LIBS=$(NVLIB_3)
$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
	$(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_NCCL_REDUCE)

else ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE) $(CFLAGS)
LIBS=$(AMDLIB_3) 
$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
	$(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS)
OBJ += $(OBJ_NCCL_REDUCE)

else

EXEC_EXT := $(EXEC_EXT)_acc-reduce

allvars_rccl.hip.hpp

0 → 100755
+176 −0
Original line number Diff line number Diff line
/* file to store global variables*/

#if defined(__STDC__)
#  if (__STDC_VERSION__ >= 199901L)
#     define _XOPEN_SOURCE 700
#  endif
#endif

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <unistd.h>


#if !defined( RCCL_REDUCE ) && !defined(__HIPCC__)
#include <stdatomic.h>
#endif

#include <mpi.h>

#if defined (_OPENMP)
#include <omp.h>
#endif

#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
#include <fftw3-mpi.h>
#endif

#if defined(ACCOMP)               
#include "w-stacking_omp.h"
#else
#include "w-stacking.hip.hpp"
#endif 

#if defined(NVIDIA)
#include <cuda_runtime.h>
#endif

#include "fft.h"
#include "numa.h"
#include "timing.h"
#include "errcodes.h"

#define PI 3.14159265359
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
#define NOVERBOSE
#define NFILES 100

#define NAME_LEN 50
#define LONGNAME_LEN 1000


/* Reduce-strategy selectors stored in param.reduce_method. */
#define REDUCE_MPI  0
#define REDUCE_RING 1

/* Debug printf gated on verbosity level and (optionally) a single rank:
   prints when verbose_level >= LEVEL and either t == -1 (all ranks) or
   T == t (this rank only).  Compiles away entirely without -DDEBUG. */
#if defined(DEBUG)
#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) &&	\
				      ( ((t) ==-1 ) || ((T)==(t)) ) ) {	\
    printf(__VA_ARGS__); fflush(stdout); }

#else
#define dprintf(...)
#endif

/* Precision-selectable scalar types used throughout the gridding code.
   NOTE(review): <math.h> may also define double_t/float_t when
   __STDC_VERSION__ >= C99 — confirm these typedefs do not conflict on
   the target toolchain. */
typedef double double_t;
#if defined(DOUBLE_PRECISION)
typedef double float_t;
#else
typedef float float_t;
#endif

/* Short aliases for unsigned integer types. */
typedef unsigned int       uint;
typedef unsigned long long ull;


/* Open file handles for outputs (grid, FFT products); opened/closed by the
   I/O routines elsewhere in the project. */
extern struct io
{
	FILE * pFile;
        FILE * pFile1;
        FILE * pFilereal;
        FILE * pFileimg;
} file;

/* Input file names (fixed-size buffers; NAME_LEN characters each). */
extern struct ip
{
	char ufile[NAME_LEN];
  	char vfile[NAME_LEN];
  	char wfile[NAME_LEN];
  	char weightsfile[NAME_LEN];
  	char visrealfile[NAME_LEN];
  	char visimgfile[NAME_LEN];
  	char metafile[NAME_LEN];
        char paramfile[NAME_LEN];
} in;

/* Output file names; `out` holds the active set, `outparam` the values read
   from the parameter file. */
extern struct op
{
	char outfile[NAME_LEN];
        char outfile1[NAME_LEN];
        char outfile2[NAME_LEN];
        char outfile3[NAME_LEN];
        char fftfile[NAME_LEN];
        char fftfile_writedata1[NAME_LEN];
        char fftfile_writedata2[NAME_LEN];
        char fftfile2[NAME_LEN];
        char fftfile3[NAME_LEN];
        char logfile[NAME_LEN];
        char extension[NAME_LEN];
        char timingfile[NAME_LEN];

} out, outparam;

/* Dataset metadata read from the meta file: sample counts, channel/
   polarisation layout, time span, and u-v-w extents. */
extern struct meta
{

  uint   Nmeasures;
  uint   Nvis;
  uint   Nweights;
  uint   freq_per_chan;
  uint   polarisations;
  uint   Ntimes;
  double dt;
  double thours;
  uint   baselines;
  double uvmin;
  double uvmax;
  double wmin;
  double wmax;
} metaData;


/* Run parameters from the parameter file: threading, input datasets,
   grid geometry, w-planes, convolution support, reduce strategy. */
extern struct parameter
{
  int  num_threads;
  int  ndatasets;
  char datapath_multi[NFILES][LONGNAME_LEN];
  int  grid_size_x;
  int  grid_size_y;
  int  num_w_planes;
  int  w_support;
  int  reduce_method;
} param;

/* In-memory visibility data: u/v/w coordinates, weights, and the real and
   imaginary visibility components. */
extern struct fileData
{
        double * uu;
        double * vv;
        double * ww;
        float * weights;
        float * visreal;
        float * visimg;
}data;


/* Scratch name buffers and the active data path. */
extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN];
extern char datapath[LONGNAME_LEN];
/* Local slab size (xaxis x yaxis), MPI rank and communicator size. */
extern int  xaxis, yaxis;
extern int  rank;
extern int  size;
/* Sector decomposition along v, and this rank's starting row. */
extern uint nsectors;
extern uint startrow;
/* Derived gridding quantities: angular resolution, cell sizes, w-support. */
extern double_t resolution, dx, dw, w_supporth;

/* Per-sector index lists and per-sector counts used to pack data. */
extern uint **sectorarray;
extern uint  *histo_send;
extern int    verbose_level; 


/* Grid storage: host/device pointers for the full grid and per-sector
   stacking buffers (real/imag interleaved). */
extern uint    size_of_grid;
extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu;

/* Communicator and RMA window used by the MPI-based reduce paths. */
extern MPI_Comm MYMPI_COMM_WORLD;
extern MPI_Win  slabwin;

gridding_rccl.hip.cpp

0 → 100755
+290 −0
Original line number Diff line number Diff line
#include "allvars_rccl.hip.hpp"
#include "proto.h"
#include <hip/hip_runtime.h>
#include <rccl.h>

/*
 * Implements the gridding of data on AMD GPUs
 * by using the RCCL collective library
 * (API-compatible with NVIDIA's NCCL).
 */


#if defined( RCCL_REDUCE )

/*
#define NCCLCHECK(cmd) do {                         
ncclResult_t r = cmd;                             
if (r!= ncclSuccess) {                            
  printf("Failed, NCCL error %s:%d '%s'\n",       
	 __FILE__,__LINE__,ncclGetErrorString(r));   
  exit(EXIT_FAILURE);                             
 }                                                 
} while(0)
*/  


/* DJB2a string hash (h = h*33 XOR byte, seed 5381); used to derive a
   per-host identifier from the hostname so ranks on the same node can
   be grouped when assigning GPUs. */
static uint64_t getHostHash(const char* string) {
  uint64_t h = 5381;
  for (const char *p = string; *p != '\0'; ++p)
    h = ((h << 5) + h) ^ (uint64_t)(*p);
  return h;
}


/* Copy this machine's short hostname (up to the first '.') into `hostname`.
 *
 * hostname : caller-supplied buffer of at least `maxlen` bytes (overwritten)
 * maxlen   : buffer capacity in bytes
 *
 * FIX: POSIX leaves it unspecified whether gethostname() NUL-terminates the
 * buffer when the name is truncated, so we terminate it explicitly before
 * scanning for the domain separator. */
static void getHostName(char* hostname, int maxlen) {
  gethostname(hostname, maxlen);
  hostname[maxlen - 1] = '\0';   /* guarantee termination on truncation */
  for (int i = 0; i < maxlen; i++) {
    if (hostname[i] == '.') {
        hostname[i] = '\0';      /* keep only the short (node) name */
        return;
    }
  }
}




/* Grid the visibility data on AMD GPUs, sector by sector, and reduce the
 * per-rank partial grids across MPI tasks with RCCL (AMD's NCCL-compatible
 * collective library).  One HIP device is selected per node-local rank, and
 * two HIP streams are used: one for the w-stacking kernel, one for the
 * reduce.  Operates on the file-scope globals declared in
 * allvars_rccl.hip.hpp (metaData, param, data, histo_send, sectorarray,
 * grid, grid_gpu, gridss_gpu, timing_wt, ...); no parameters, no return.
 */
void gridding_data(){

  // vertical extent of one sector in v (sectors partition the v axis)
  double shift = (double)(dx*yaxis);

  timing_wt.kernel     = 0.0;
  timing_wt.reduce     = 0.0;
  timing_wt.reduce_mpi = 0.0;
  timing_wt.reduce_sh  = 0.0;
  timing_wt.compose    = 0.0;
  
  // calculate the resolution in radians
  resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));

  // calculate the resolution in arcsec 
  double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
  if ( rank == 0 )
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);

  // find the largest value in histo_send[]
  //                                                                  
 
  
    //Initialize nccl

  // Determine this rank's node-local index by hashing hostnames and
  // counting lower-numbered ranks on the same host; used for hipSetDevice.
  //double *gridss_gpu, *grid_gpu;
  int local_rank = 0;

  uint64_t hostHashs[size];
  char hostname[1024];
  getHostName(hostname, 1024);
  hostHashs[rank] = getHostHash(hostname);
  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
  for (int p=0; p<size; p++) {
     if (p == rank) break;
     if (hostHashs[p] == hostHashs[rank]) local_rank++;
  }
  
  ncclUniqueId id;
  ncclComm_t comm;
  hipError_t nnn;
  hipStream_t stream_reduce, stream_stacking;

  // Rank 0 creates the RCCL communicator id and broadcasts it to everyone.
  // NOTE(review): uses MPI_COMM_WORLD here while the rest of the project
  // declares MYMPI_COMM_WORLD — confirm they are the same communicator.
  if (rank == 0) ncclGetUniqueId(&id);
  MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

  hipSetDevice(local_rank);

  // Device buffers: full slab grid and per-sector stacking grid
  // (2*num_w_planes*xaxis*yaxis doubles each: interleaved real/imag).
  // NOTE(review): hip return codes are stored in n/nnn but never checked.
  int n = hipMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  n = hipMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
  n = hipStreamCreate(&stream_reduce);
  n = hipStreamCreate(&stream_stacking);
  
  ncclCommInitRank(&comm, size, id, rank);

  // Main loop: pack, stack, and reduce one v-sector at a time.
  for (uint isector = 0; isector < nsectors; isector++)
    {

      double start = CPU_TIME_wt;

      // Sizes for this sector: samples, weights, visibilities.
      uint Nsec            = histo_send[isector];
      uint Nweightss       = Nsec*metaData.polarisations;
      uint Nvissec         = Nweightss*metaData.freq_per_chan;
      // Single allocation carved into u/v/w, weights, real/imag arrays.
      double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
						(Nvissec*2+Nweightss)*sizeof(float_t) );

      if ( memory == NULL )
	shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
  
      double_t *uus        = (double_t*) memory;
      double_t *vvs        = (double_t*) uus+Nsec;
      double_t *wws        = (double_t*) vvs+Nsec;
      float_t  *weightss   = (float_t*)((double_t*)wws+Nsec);
      float_t  *visreals   = (float_t*)weightss + Nweightss;
      float_t  *visimgs    = (float_t*)visreals + Nvissec;
  
      
      
      // select data for this sector
      uint icount = 0;
      uint ip = 0;
      uint inu = 0;

      #warning "this loop should be threaded"
      #warning "the counter of this loop should not be int"
      // Gather this sector's samples via the precomputed index lists;
      // v is shifted into sector-local coordinates.
      for(int iphi = histo_send[isector]-1; iphi>=0; iphi--)
        {

	  uint ilocal = sectorarray[isector][iphi];

	  uus[icount] = data.uu[ilocal];
	  vvs[icount] = data.vv[ilocal]-isector*shift;
	  wws[icount] = data.ww[ilocal];
	  for (uint ipol=0; ipol<metaData.polarisations; ipol++)
	    {
	      weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
	      ip++;
	    }
	  for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
	    {
	      visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
	      visimgs[inu]  = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];

	      inu++;
	    }
	  icount++;
	}

      // Diagnostic extents of the packed u/v coordinates (parallel min/max).
      double uumin = 1e20;
      double vvmin = 1e20;
      double uumax = -1e20;
      double vvmax = -1e20;

      #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads)
      {
        double my_uumin = 1e20;
        double my_vvmin = 1e20;
        double my_uumax = -1e20;
        double my_vvmax = -1e20;

       #pragma omp for
        for (uint ipart=0; ipart<Nsec; ipart++)
          {
            my_uumin = MIN(my_uumin, uus[ipart]);
            my_uumax = MAX(my_uumax, uus[ipart]);
            my_vvmin = MIN(my_vvmin, vvs[ipart]);
            my_vvmax = MAX(my_vvmax, vvs[ipart]);
          }

        uumin = MIN( uumin, my_uumin );
        uumax = MAX( uumax, my_uumax );
        vvmin = MIN( vvmin, my_vvmin );
        vvmax = MAX( vvmax, my_vvmax );
      }

      timing_wt.compose += CPU_TIME_wt - start;

      //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
      
      // Make convolution on the grid

     #ifdef VERBOSE
      // NOTE(review): isector is uint but the format is %ld — confirm.
      printf("Processing sector %ld\n",isector);
     #endif
          
      start = CPU_TIME_wt;
	    
     //We have to call different GPUs per MPI task!!! [GL]
     // HIPCC path stacks directly into the device buffer on stream_stacking;
     // the fallback stacks into the host-side gridss.
     #ifdef HIPCC
      wstack(param.num_w_planes,
	     Nsec,
	     metaData.freq_per_chan,
	     metaData.polarisations,
	     uus,
	     vvs,
	     wws,
	     visreals,
	     visimgs,
	     weightss,
	     dx,
	     dw,
	     param.w_support,
	     xaxis,
	     yaxis,
	     gridss_gpu,
	     param.num_threads,
	     rank,
	     stream_stacking);
      #else
      wstack(param.num_w_planes,
	     Nsec,
	     metaData.freq_per_chan,
	     metaData.polarisations,
	     uus,
	     vvs,
	     wws,
	     visreals,
	     visimgs,
	     weightss,
	     dx,
	     dw,
	     param.w_support,
	     xaxis,
	     yaxis,
	     gridss,
	     param.num_threads,
	     rank);
      #endif
      //Allocate memory on devices non-blocking for the host                                                                                   
      ///////////////////////////////////////////////////////


      timing_wt.kernel += CPU_TIME_wt - start;
      
     #ifdef VERBOSE
      printf("Processed sector %ld\n",isector);
     #endif
      

      if( size > 1 )
	{
     
	  // Write grid in the corresponding remote slab
     
	  // int target_rank = (int)isector;    it implied that size >= nsectors
	  int target_rank = (int)(isector % size);

	  start = CPU_TIME_wt;

	  // Sum all ranks' sector grids onto the owner of this sector.
	  ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
	  n = hipStreamSynchronize(stream_reduce);
      
	  timing_wt.reduce += CPU_TIME_wt - start;

	  // Go to next sector
	  // NOTE(review): hipMemset takes an int fill value; the 0.0 double
	  // literal works only because it converts to 0 — confirm intent.
	  nnn = hipMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
	  if (nnn != hipSuccess) {printf("!!! w-stacking.cu hipMemset ERROR %d !!!\n", nnn);}
	}

      free(memory);
    }


  //Copy data back from device to host (to be deleted in next steps)
  
   n = hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce);
  
  MPI_Barrier(MPI_COMM_WORLD);

  // Wait for the async copy, then release device resources.
  n = hipStreamSynchronize(stream_reduce);
  n = hipFree(grid_gpu);
  n = hipFree(gridss_gpu);
  n = hipStreamDestroy(stream_reduce);
  n = hipStreamDestroy(stream_stacking);
  
  ncclCommDestroy(comm);

  return;
  
}

#endif
+315 −0
Original line number Diff line number Diff line
#ifdef _OPENMP
#include <omp.h>
#endif

#if !defined(ACCOMP)
#include "w-stacking.hip.hpp"
#else
#include "w-stacking_omp.h"
#endif

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "errcodes.h"
#include "proto.h"

// NOTE(review): this kernel is compiled under __HIPCC__ (set by the hipcc
// compiler driver) while the host code launches it under the build flag
// HIPCC — confirm both are defined together in the HIP build.
#ifdef __HIPCC__

/* Device kernel: apply the w-term phase correction to the stacked grid and
 * accumulate the corrected w-planes into the 2-D image via atomic adds.
 * Each thread handles `nbucket` consecutive grid cells; gridss holds
 * interleaved (real, imag) pairs.  With PHASE_ON and num_w_planes > 1 the
 * per-plane complex phase factor is applied; otherwise planes are summed
 * directly. */
__global__ void phase_g(int xaxis, 
		        int yaxis,
			int num_w_planes,
			double * gridss,
			double * image_real,
			double * image_imag,
			double wmin,
			double dw,
			double dwnorm,
			int xaxistot,
			int yaxistot,
			double resolution,
			int nbucket)
{
	long gid = blockIdx.x*blockDim.x + threadIdx.x;
	double add_term_real;
	double add_term_img;
	double wterm;
	// total cells divided among threads in buckets of nbucket
	long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket);

	if(gid < arraysize)
	{
	  long gid_aux = nbucket*gid;
	  for(int iaux=0; iaux<nbucket; iaux++) 
          {
		// decompose the linear cell index into (iu, iv, iw)
		int iw = gid_aux/(xaxis*yaxis);
		int ivaux = gid_aux%(xaxis*yaxis);
		int iv = ivaux/xaxis;
		int iu = ivaux%xaxis;
		long index = 2*gid_aux;          // (real, imag) pair offset
		long img_index = iu+iv*xaxis;    // output pixel

                wterm = wmin + iw*dw;

#ifdef PHASE_ON
                if (num_w_planes > 1)
                {
                    // map grid coordinates to sky angles (FFT-shifted)
                    double xcoord = (double)(iu-xaxistot/2);
                    if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
                    xcoord = sin(xcoord*resolution);
                    double ycoord = (double)(iv-yaxistot/2);
                    if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
                    ycoord = sin(ycoord*resolution);

                    double preal, pimag;
                    double radius2 = (xcoord*xcoord+ycoord*ycoord);

                    // phase factor exp(2*pi*i*w*(sqrt(1-l^2-m^2)-1))
                    // NOTE(review): radius2 can exceed 1, making sqrt(1-radius2)
                    // NaN — confirm the grid extent keeps radius2 <= 1.
                    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
                    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));

                    double p,q,r,s;
                    p = gridss[index];
                    q = gridss[index+1];
                    r = preal;
                    s = pimag;

                    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);

		    // complex multiply (p+iq)*(r+is), scaled and accumulated
		    add_term_real = (p*r-q*s)*dwnorm*sqrt(1-radius2);
		    add_term_img = (p*s+q*r)*dwnorm*sqrt(1-radius2);
		    atomicAdd(&(image_real[img_index]),add_term_real);
		    atomicAdd(&(image_imag[img_index]),add_term_img);
                } else {
		    atomicAdd(&(image_real[img_index]),gridss[index]);
		    atomicAdd(&(image_imag[img_index]),gridss[index+1]);
                }
#else
		atomicAdd(&(image_real[img_index]),gridss[index]);
		atomicAdd(&(image_imag[img_index]),gridss[index+1]);
#endif // end of PHASE_ON
		gid_aux++;
           }
	}

}

#endif

/* Apply the w-term phase correction to the stacked grid `gridss`
 * (interleaved real/imag, num_w_planes planes of xaxis*yaxis cells) and
 * accumulate the corrected planes into image_real/image_imag
 * (xaxis*yaxis doubles each; HIP path zero-fills them, CPU/OMP paths
 * accumulate into the caller-provided contents).
 *
 * Three compile-time paths: HIPCC (GPU kernel), ACCOMP (OpenMP target
 * offload), otherwise host OpenMP loops.
 *
 * FIX: the HIP path allocated image_real_g and image_imag_g but never
 * freed them, leaking 2*xaxis*yaxis doubles of device memory per call;
 * they are now released alongside gridss_g. */
void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot,
		      double resolution, double wmin, double wmax, int num_threads, int rank)
{
        double dw = (wmax-wmin)/(double)num_w_planes;
	double wterm = wmin+0.5*dw;
	double dwnorm = dw/(wmax-wmin);

#ifdef HIPCC

	// WARNING: nbucket MUST be chosen such that xaxis*yaxis*num_w_planes is a multiple of nbucket
	int nbucket = 1;
	int Nth = NTHREADS;
        long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
        if(NWORKERS == 1) {Nbl = 1; Nth = 1;};

	// one GPU per rank, assigned round-robin over the visible devices
	int ndevices;
	int m = hipGetDeviceCount(&ndevices);
	m = hipSetDevice(rank % ndevices);

	// NOTE(review): only rank 0 aborts when no devices are found — the
	// other ranks fall through; confirm whether all ranks should check.
	if ( rank == 0 ) {
	  if (0 == ndevices) {

	    shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
	  }

	}
	  printf("Running rank %d using GPU %d\n", rank, rank % ndevices);
	 #ifdef NVIDIA
	  prtAccelInfo();
	 #endif

	hipError_t mmm;
	double * image_real_g;
	double * image_imag_g;
	double * gridss_g;

	
	//Copy gridss on the device
	mmm=hipMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double));
	mmm=hipMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyHostToDevice);

	mmm=hipMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
	//printf("HIP ERROR 2 %s\n",hipGetErrorString(mmm));
	mmm=hipMalloc(&image_imag_g, xaxis*yaxis*sizeof(double));
	//printf("HIP ERROR 3 %s\n",hipGetErrorString(mmm));

	// zero the device accumulators before the kernel's atomic adds
	//printf("HIP ERROR 4 %s\n",hipGetErrorString(mmm));
	mmm=hipMemset(image_real_g, 0.0, xaxis*yaxis*sizeof(double));
	//printf("HIP ERROR 5 %s\n",hipGetErrorString(mmm));
	mmm=hipMemset(image_imag_g, 0.0, xaxis*yaxis*sizeof(double));
	//printf("HIP ERROR 6 %s\n",hipGetErrorString(mmm));

	// call the phase correction kernel
	phase_g <<<Nbl,Nth>>> (xaxis,
                               yaxis,
			       num_w_planes,
                               gridss_g,
                               image_real_g,
                               image_imag_g,
                               wmin,
                               dw,
                               dwnorm,
                               xaxistot,
                               yaxistot,
                               resolution,
			       nbucket);

	// synchronous copies: hipMemcpy waits for the kernel to finish
	mmm = hipMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost);
	//printf("HIP ERROR 7 %s\n",hipGetErrorString(mmm));
	mmm = hipMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost);
	//printf("HIP ERROR 8 %s\n",hipGetErrorString(mmm));

	mmm= hipFree(gridss_g);
	// FIX: the image buffers were previously leaked
	mmm = hipFree(image_real_g);
	mmm = hipFree(image_imag_g);
#else

#ifndef ACCOMP
	
#ifdef _OPENMP
	omp_set_num_threads(num_threads);
#endif
	
       #pragma omp parallel for collapse(2) private(wterm) 
	for (int iw=0; iw<num_w_planes; iw++)
	{
	    for (int iv=0; iv<yaxis; iv++)
            for (int iu=0; iu<xaxis; iu++)
            {

		unsigned int index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
		unsigned int img_index = iu+iv*xaxis;
		wterm = wmin + iw*dw;
#ifdef PHASE_ON
                if (num_w_planes > 1)
		{
                    // map grid coordinates to sky angles (FFT-shifted)
                    double xcoord = (double)(iu-xaxistot/2);
                    if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
		    xcoord = sin(xcoord*resolution);
                    double ycoord = (double)(iv-yaxistot/2);
                    if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
		    ycoord = sin(ycoord*resolution);

		    double preal, pimag;
		    double radius2 = (xcoord*xcoord+ycoord*ycoord);
		    if(xcoord <= 1.0)
		    {
			    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
			    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
		    } else {
			    preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
			    pimag = 0.0;
		    }

		    // NOTE(review): this unconditional recomputation makes the
		    // branch above dead code (xcoord = sin(...) is always <= 1,
		    // and radius2 > 1 yields NaN here) — confirm which form is
		    // intended before removing either.
		    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
		    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));

		    double p,q,r,s;
		    p = gridss[index];
		    q = gridss[index+1];
		    r = preal;
		    s = pimag;

		    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
		    // complex multiply (p+iq)*(r+is), scaled and accumulated
		   #pragma omp atomic
		    image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
		   #pragma omp atomic
		    image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
	        } else {
		 #pragma omp atomic
		  image_real[img_index] += gridss[index];
		 #pragma omp atomic
		  image_imag[img_index] += gridss[index+1];
		}
#else
	       #pragma omp atomic
  	        image_real[img_index] += gridss[index];
	       #pragma omp atomic
		image_imag[img_index] += gridss[index+1];
#endif // end of PHASE_ON

            }
	}

#else
	omp_set_default_device(rank % omp_get_num_devices());
	
       #if !defined(__clang__)

       #pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])

       #else

       #pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
       #endif
	
	// NOTE(review): image_real/image_imag are accumulated with += but
	// mapped `from:` only, so their host contents are not transferred to
	// the device first — confirm the device copies start zeroed.
	for (int iw=0; iw<num_w_planes; iw++)
	{
	    for (int iv=0; iv<yaxis; iv++)
            for (int iu=0; iu<xaxis; iu++)
            {

		long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
		long img_index = iu+iv*xaxis;
		wterm = wmin + iw*dw;
#ifdef PHASE_ON
                if (num_w_planes > 1)
		{
                    double xcoord = (double)(iu-xaxistot/2);
                    if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
		    xcoord = sin(xcoord*resolution);
                    double ycoord = (double)(iv-yaxistot/2);
                    if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
		    ycoord = sin(ycoord*resolution);

		    double preal, pimag;
		    double radius2 = (xcoord*xcoord+ycoord*ycoord);
		    if(xcoord <= 1.0)
		    {
			    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
			    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
		    } else {
			    preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
			    pimag = 0.0;
		    }

		    // NOTE(review): same dead-branch pattern as the host path.
		    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
		    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));

		    double p,q,r,s;
		    p = gridss[index];
		    q = gridss[index+1];
		    r = preal;
		    s = pimag;

		    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
		    #pragma omp atomic
		    image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
		    #pragma omp atomic
		    image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
	        } else {
		    #pragma omp atomic
		    image_real[img_index] += gridss[index];
		    #pragma omp atomic
		    image_imag[img_index] += gridss[index+1];
		}
#else
		#pragma omp atomic
  	        image_real[img_index] += gridss[index];
		#pragma omp atomic
		image_imag[img_index] += gridss[index+1];
#endif // end of PHASE_ON

            }
	}
	
#endif	
#endif // end of HIPCC


}

w-stacking.hip.cpp

0 → 100755
+518 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading