Commit 3c580717 authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

AMD Bug fixing

parent a0d74ca8
Loading
Loading
Loading
Loading
+7 −4
Original line number Diff line number Diff line
@@ -20,8 +20,8 @@ endif

LINKER=$(MPICC)

FFTW_MPI_INC = 
FFTW_MPI_LIB = 
FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.3/x86_rome/include
FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.3/x86_rome/lib

CFLAGS += -I./

@@ -64,7 +64,7 @@ OPT += -DACCOMP
#OPT += -DNCCL_REDUCE

# use AMD GPU to perform the reduce
#OPT += -DRCCL_REDUCE
OPT += -DRCCL_REDUCE

# use GPU to perform FFT
#OPT += -DCUFFTMP
@@ -152,6 +152,7 @@ FLAGS=$(OPTIMIZE_AMD) $(CFLAGS)
LIBS=$(AMDLIB) 
$(OBJ_ACC_OMP): $(DEPS_ACC_OMP)
	$(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) 
OBJ += $(OBJ_ACC_OMP)
endif


@@ -162,6 +163,7 @@ FLAGS=$(NVFLAGS) $(CFLAGS)
LIBS=$(NVLIB) $(NVLIB_3)
$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
	$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_NCCL_REDUCE)
endif

ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
@@ -171,6 +173,7 @@ FLAGS=$(OPTIMIZE_AMD) $(CFLAGS)
LIBS=$(AMDLIB) 
$(OBJ_RCCL_REDUCE): $(DEPS_RCCL_REDUCE)
	$(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS)
OBJ += $(OBJ_RCCL_REDUCE)
endif


allvars_nccl.h

0 → 100644
+176 −0
Original line number Diff line number Diff line
/* file to store global variables*/

#if defined(__STDC__)
#  if (__STDC_VERSION__ >= 199901L)
#     define _XOPEN_SOURCE 700
#  endif
#endif

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <unistd.h>


#if !defined( NCCL_REDUCE )
#include <stdatomic.h>
#endif

#include <mpi.h>

#if defined (_OPENMP)
#include <omp.h>
#endif



#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
#include <fftw3-mpi.h>
#endif

#if defined(ACCOMP)               
#include "w-stacking_omp.h"
#else
#include "w-stacking.h"
#endif 

#if defined(NVIDIA)
#include <cuda_runtime.h>
#endif

#include "fft.h"
#include "numa.h"
#include "timing.h"
#include "errcodes.h"

/* Pi, written with 11 decimal places. */
#define PI 3.14159265359

/*
 * Two-argument minimum / maximum.
 * Each argument may be evaluated twice, so do not pass expressions with
 * side effects (e.g. i++).
 */
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Defined with no value. */
#define NOVERBOSE

/* Size limits used by the declarations in this header. */
#define NFILES       100   /* first dimension of param.datapath_multi */
#define NAME_LEN     50    /* size of the short name buffers */
#define LONGNAME_LEN 1000  /* size of the long name/path buffers */

/* Reduce-method identifiers; presumably the values of param.reduce_method -- confirm. */
#define REDUCE_MPI  0
#define REDUCE_RING 1

/*
 * dprintf(LEVEL, T, t, fmt, ...): debug trace helper.
 *
 * With DEBUG defined, passes everything after the first three arguments to
 * printf() and then flushes stdout, but only when
 *   - verbose_level >= LEVEL, and
 *   - t == -1 or T == t.
 * Without DEBUG the call expands to nothing and none of its arguments are
 * evaluated.
 *
 * The DEBUG body is wrapped in do { } while (0) so the macro behaves as a
 * single statement: it can be used as the unbraced body of an if/else
 * without breaking the else branch. Always call it with a trailing ';'.
 *
 * NOTE: this macro shadows the POSIX dprintf() function from <stdio.h>.
 */
#if defined(DEBUG)
#define dprintf(LEVEL, T, t, ...)                                       \
  do {                                                                  \
    if ( (verbose_level >= (LEVEL)) &&                                  \
         ( ((t) == -1) || ((T) == (t)) ) ) {                            \
      printf(__VA_ARGS__);                                              \
      fflush(stdout);                                                   \
    }                                                                   \
  } while (0)

#else
#define dprintf(...)
#endif

/*
 * Floating-point and integer shorthands.
 *
 * NOTE(review): <math.h>, which this header includes, also declares
 * double_t and float_t (C99). Redefining a typedef to a different type
 * than the library's is a compile error, so confirm that these agree with
 * the C library on every target toolchain (in particular with
 * DOUBLE_PRECISION defined).
 */
typedef double double_t;

#if !defined(DOUBLE_PRECISION)
typedef float  float_t;   /* single precision unless DOUBLE_PRECISION is defined */
#else
typedef double float_t;
#endif

typedef unsigned           uint;
typedef unsigned long long ull;


/*
 * Four stdio stream handles, grouped in the global `file`.
 * Which file each one refers to is not visible here -- see the code that
 * opens them.
 */
struct io
{
  FILE *pFile;
  FILE *pFile1;
  FILE *pFilereal;
  FILE *pFileimg;
};

extern struct io file;

/*
 * Name buffers for the inputs, grouped in the global `in`.
 * Every member is a fixed char array of NAME_LEN bytes.
 * (Presumably file names for u/v/w, weights, visibilities, metadata and
 * the parameter file -- inferred from the member names; confirm.)
 */
struct ip
{
  char ufile[NAME_LEN];
  char vfile[NAME_LEN];
  char wfile[NAME_LEN];
  char weightsfile[NAME_LEN];
  char visrealfile[NAME_LEN];
  char visimgfile[NAME_LEN];
  char metafile[NAME_LEN];
  char paramfile[NAME_LEN];
};

extern struct ip in;

/*
 * Name buffers for the outputs; two globals of this type are declared,
 * `out` and `outparam`.
 * Every member is a fixed char array of NAME_LEN bytes.
 */
struct op
{
  char outfile[NAME_LEN];
  char outfile1[NAME_LEN];
  char outfile2[NAME_LEN];
  char outfile3[NAME_LEN];
  char fftfile[NAME_LEN];
  char fftfile2[NAME_LEN];
  char fftfile3[NAME_LEN];
  char logfile[NAME_LEN];
  char extension[NAME_LEN];
  char timingfile[NAME_LEN];
};

extern struct op out;
extern struct op outparam;

/*
 * Dataset metadata, held in the global `metaData`.
 * NOTE(review): the meaning and units of the members are not established
 * in this header (names suggest counts, time information and u-v / w
 * ranges) -- confirm against the code that fills this struct.
 */
struct meta
{
  uint   Nmeasures;
  uint   Nvis;
  uint   Nweights;
  uint   freq_per_chan;
  uint   polarisations;
  uint   Ntimes;

  double dt;
  double thours;

  uint   baselines;

  double uvmin;
  double uvmax;
  double wmin;
  double wmax;
};

extern struct meta metaData;


/*
 * Run parameters, held in the global `param`.
 * datapath_multi provides NFILES buffers of LONGNAME_LEN bytes each.
 * reduce_method presumably takes REDUCE_MPI or REDUCE_RING -- confirm.
 */
struct parameter
{
  int  num_threads;
  int  ndatasets;
  char datapath_multi[NFILES][LONGNAME_LEN];

  int  grid_size_x;
  int  grid_size_y;
  int  num_w_planes;
  int  w_support;

  int  reduce_method;
};

extern struct parameter param;

/*
 * Pointers to the loaded data arrays, held in the global `data`.
 * uu/vv/ww are always double; weights/visreal/visimg are always float
 * (they do not follow the DOUBLE_PRECISION switch).
 * Allocation and ownership are handled elsewhere.
 */
struct fileData
{
  double *uu;
  double *vv;
  double *ww;

  float  *weights;
  float  *visreal;
  float  *visimg;
};

extern struct fileData data;


/*
 * Global variables shared between translation units.
 * This header only declares them; the definitions live elsewhere.
 */

/* Character buffers. */
extern char filename[LONGNAME_LEN];
extern char buf[NAME_LEN];
extern char num_buf[NAME_LEN];
extern char datapath[LONGNAME_LEN];

extern int  xaxis;
extern int  yaxis;
extern int  rank;   /* presumably the MPI rank of this process -- confirm */
extern int  size;   /* presumably the number of MPI processes -- confirm */

extern uint nsectors;
extern uint startrow;

extern double_t resolution;
extern double_t dx;
extern double_t dw;
extern double_t w_supporth;

extern uint **sectorarray;
extern uint  *histo_send;
extern int    verbose_level;   /* threshold read by the dprintf() debug macro */

extern uint      size_of_grid;
extern double_t *grid_pointers;
extern double_t *grid;
extern double_t *gridss;
extern double_t *gridss_real;
extern double_t *gridss_img;
extern double_t *gridss_w;

/* MPI handles. */
extern MPI_Comm MYMPI_COMM_WORLD;
extern MPI_Win  slabwin;

allvars_rccl.h

0 → 100644
+177 −0
Original line number Diff line number Diff line
/* file to store global variables*/

#if defined(__STDC__)
#  if (__STDC_VERSION__ >= 199901L)
#     define _XOPEN_SOURCE 700
#  endif
#endif

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <unistd.h>


#if !defined( RCCL_REDUCE )
#include <stdatomic.h>
#endif

#include <mpi.h>

#if defined (_OPENMP)
#include <omp.h>
#endif



#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
#include <fftw3-mpi.h>
#endif

#if defined(ACCOMP)               
#include "w-stacking_omp.h"
#else
#include "w-stacking.h"
#endif 

#if defined(NVIDIA)
#include <cuda_runtime.h>
#endif

#include "fft.h"
#include "numa.h"
#include "timing.h"
#include "errcodes.h"

/* Pi, written with 11 decimal places. */
#define PI 3.14159265359

/*
 * Two-argument minimum / maximum.
 * Each argument may be evaluated twice, so do not pass expressions with
 * side effects (e.g. i++).
 */
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Defined with no value. */
#define NOVERBOSE

/* Size limits used by the declarations in this header. */
#define NFILES       100   /* first dimension of param.datapath_multi */
#define NAME_LEN     50    /* size of the short name buffers */
#define LONGNAME_LEN 1000  /* size of the long name/path buffers */

/* Reduce-method identifiers; presumably the values of param.reduce_method -- confirm. */
#define REDUCE_MPI  0
#define REDUCE_RING 1

/*
 * dprintf(LEVEL, T, t, fmt, ...): debug trace helper.
 *
 * With DEBUG defined, passes everything after the first three arguments to
 * printf() and then flushes stdout, but only when
 *   - verbose_level >= LEVEL, and
 *   - t == -1 or T == t.
 * Without DEBUG the call expands to nothing and none of its arguments are
 * evaluated.
 *
 * The DEBUG body is wrapped in do { } while (0) so the macro behaves as a
 * single statement: it can be used as the unbraced body of an if/else
 * without breaking the else branch. Always call it with a trailing ';'.
 *
 * NOTE: this macro shadows the POSIX dprintf() function from <stdio.h>.
 */
#if defined(DEBUG)
#define dprintf(LEVEL, T, t, ...)                                       \
  do {                                                                  \
    if ( (verbose_level >= (LEVEL)) &&                                  \
         ( ((t) == -1) || ((T) == (t)) ) ) {                            \
      printf(__VA_ARGS__);                                              \
      fflush(stdout);                                                   \
    }                                                                   \
  } while (0)

#else
#define dprintf(...)
#endif

/*
 * Floating-point and integer shorthands.
 *
 * The floating-point aliases are spelled double_ty / float_ty on purpose:
 * the names double_t / float_t are already declared by <math.h>, and
 * redefining them produced conflicting-type errors when building with
 * clang.
 */
typedef double double_ty;

#if !defined(DOUBLE_PRECISION)
typedef float  float_ty;   /* single precision unless DOUBLE_PRECISION is defined */
#else
typedef double float_ty;
#endif

typedef unsigned           uint;
typedef unsigned long long ull;


/*
 * Four stdio stream handles, grouped in the global `file`.
 * Which file each one refers to is not visible here -- see the code that
 * opens them.
 */
struct io
{
  FILE *pFile;
  FILE *pFile1;
  FILE *pFilereal;
  FILE *pFileimg;
};

extern struct io file;

/*
 * Name buffers for the inputs, grouped in the global `in`.
 * Every member is a fixed char array of NAME_LEN bytes.
 * (Presumably file names for u/v/w, weights, visibilities, metadata and
 * the parameter file -- inferred from the member names; confirm.)
 */
struct ip
{
  char ufile[NAME_LEN];
  char vfile[NAME_LEN];
  char wfile[NAME_LEN];
  char weightsfile[NAME_LEN];
  char visrealfile[NAME_LEN];
  char visimgfile[NAME_LEN];
  char metafile[NAME_LEN];
  char paramfile[NAME_LEN];
};

extern struct ip in;

/*
 * Name buffers for the outputs; two globals of this type are declared,
 * `out` and `outparam`.
 * Every member is a fixed char array of NAME_LEN bytes.
 */
struct op
{
  char outfile[NAME_LEN];
  char outfile1[NAME_LEN];
  char outfile2[NAME_LEN];
  char outfile3[NAME_LEN];
  char fftfile[NAME_LEN];
  char fftfile2[NAME_LEN];
  char fftfile3[NAME_LEN];
  char logfile[NAME_LEN];
  char extension[NAME_LEN];
  char timingfile[NAME_LEN];
};

extern struct op out;
extern struct op outparam;

/*
 * Dataset metadata, held in the global `metaData`.
 * NOTE(review): the meaning and units of the members are not established
 * in this header (names suggest counts, time information and u-v / w
 * ranges) -- confirm against the code that fills this struct.
 */
struct meta
{
  uint   Nmeasures;
  uint   Nvis;
  uint   Nweights;
  uint   freq_per_chan;
  uint   polarisations;
  uint   Ntimes;

  double dt;
  double thours;

  uint   baselines;

  double uvmin;
  double uvmax;
  double wmin;
  double wmax;
};

extern struct meta metaData;


/*
 * Run parameters, held in the global `param`.
 * datapath_multi provides NFILES buffers of LONGNAME_LEN bytes each.
 * reduce_method presumably takes REDUCE_MPI or REDUCE_RING -- confirm.
 */
struct parameter
{
  int  num_threads;
  int  ndatasets;
  char datapath_multi[NFILES][LONGNAME_LEN];

  int  grid_size_x;
  int  grid_size_y;
  int  num_w_planes;
  int  w_support;

  int  reduce_method;
};

extern struct parameter param;

/*
 * Pointers to the loaded data arrays, held in the global `data`.
 * uu/vv/ww are always double; weights/visreal/visimg are always float
 * (they do not follow the DOUBLE_PRECISION switch).
 * Allocation and ownership are handled elsewhere.
 */
struct fileData
{
  double *uu;
  double *vv;
  double *ww;

  float  *weights;
  float  *visreal;
  float  *visimg;
};

extern struct fileData data;


/*
 * Global variables shared between translation units.
 * This header only declares them; the definitions live elsewhere.
 */

/* Character buffers. */
extern char filename[LONGNAME_LEN];
extern char buf[NAME_LEN];
extern char num_buf[NAME_LEN];
extern char datapath[LONGNAME_LEN];

extern int  xaxis;
extern int  yaxis;
extern int  rank;   /* presumably the MPI rank of this process -- confirm */
extern int  size;   /* presumably the number of MPI processes -- confirm */

extern uint nsectors;
extern uint startrow;

extern double_ty resolution;
extern double_ty dx;
extern double_ty dw;
extern double_ty w_supporth;

extern uint **sectorarray;
extern uint  *histo_send;
extern int    verbose_level;   /* threshold read by the dprintf() debug macro */

extern uint       size_of_grid;
extern double_ty *grid_pointers;
extern double_ty *grid;
extern double_ty *gridss;
extern double_ty *gridss_real;
extern double_ty *gridss_img;
extern double_ty *gridss_w;

/* MPI handles. */
extern MPI_Comm MYMPI_COMM_WORLD;
extern MPI_Win  slabwin;
+9 −9
Original line number Diff line number Diff line
#include "allvars_nccl.h"
#include "allvars_rccl.h"
#include "proto.h"
#include <hip/hip_runtime.h>
#include <rccl/rccl.h>
@@ -76,18 +76,18 @@ void gridding_data(){
  // allocate sector arrays
  // note: we use the largest allocation among all sectors          

  double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
                                            (Nvissec*2+Nweightss)*sizeof(float_t) );
  double_ty *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_ty) +
                                            (Nvissec*2+Nweightss)*sizeof(float_ty) );

  if ( memory == NULL )
    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);

  double_t *uus        = (double*) memory;
  double_t *vvs        = (double*) uus+Nsec;
  double_t *wws        = (double*) vvs+Nsec;
  float_t  *weightss   = (float_t*)(wws+Nsec);
  float_t  *visreals   = weightss + Nweightss;
  float_t  *visimgs    = visreals + Nvissec;
  double_ty *uus        = (double*) memory;
  double_ty *vvs        = (double*) uus+Nsec;
  double_ty *wws        = (double*) vvs+Nsec;
  float_ty  *weightss   = (float_ty*)(wws+Nsec);
  float_ty  *visreals   = weightss + Nweightss;
  float_ty  *visimgs    = visreals + Nvissec;