Loading Makefile +7 −4 Original line number Diff line number Diff line Loading @@ -20,8 +20,8 @@ endif LINKER=$(MPICC) FFTW_MPI_INC = FFTW_MPI_LIB = FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.3/x86_rome/include FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.3/x86_rome/lib CFLAGS += -I./ Loading Loading @@ -64,7 +64,7 @@ OPT += -DACCOMP #OPT += -DNCCL_REDUCE # use AMD GPU to perform the reduce #OPT += -DRCCL_REDUCE OPT += -DRCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP Loading Loading @@ -152,6 +152,7 @@ FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) OBJ += $(OBJ_ACC_OMP) endif Loading @@ -162,6 +163,7 @@ FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_NCCL_REDUCE) endif ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) Loading @@ -171,6 +173,7 @@ FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_RCCL_REDUCE): $(DEPS_RCCL_REDUCE) $(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS) OBJ += $(OBJ_RCCL_REDUCE) endif Loading allvars_nccl.h 0 → 100644 +176 −0 Original line number Diff line number Diff line /* file to store global variables*/ #if defined(__STDC__) # if (__STDC_VERSION__ >= 199901L) # define _XOPEN_SOURCE 700 # endif #endif #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> #include <unistd.h> #if !defined( NCCL_REDUCE ) #include <stdatomic.h> #endif #include <mpi.h> #if defined (_OPENMP) #include <omp.h> #endif #if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw #include <fftw3-mpi.h> #endif #if defined(ACCOMP) #include "w-stacking_omp.h" #else #include "w-stacking.h" #endif #if defined(NVIDIA) #include <cuda_runtime.h> #endif #include "fft.h" #include "numa.h" #include "timing.h" #include "errcodes.h" #define PI 3.14159265359 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) #define NOVERBOSE #define NFILES 100 #define NAME_LEN 50 #define LONGNAME_LEN 1000 #define REDUCE_MPI 0 #define REDUCE_RING 1 #if defined(DEBUG) #define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ printf(__VA_ARGS__); fflush(stdout); } #else #define dprintf(...) #endif typedef double double_t; #if defined(DOUBLE_PRECISION) typedef double float_t; #else typedef float float_t; #endif typedef unsigned int uint; typedef unsigned long long ull; extern struct io { FILE * pFile; FILE * pFile1; FILE * pFilereal; FILE * pFileimg; } file; extern struct ip { char ufile[NAME_LEN]; char vfile[NAME_LEN]; char wfile[NAME_LEN]; char weightsfile[NAME_LEN]; char visrealfile[NAME_LEN]; char visimgfile[NAME_LEN]; char metafile[NAME_LEN]; char paramfile[NAME_LEN]; } in; extern struct op { char outfile[NAME_LEN]; char outfile1[NAME_LEN]; char outfile2[NAME_LEN]; char outfile3[NAME_LEN]; char fftfile[NAME_LEN]; char fftfile2[NAME_LEN]; char fftfile3[NAME_LEN]; char logfile[NAME_LEN]; char extension[NAME_LEN]; char timingfile[NAME_LEN]; } out, outparam; extern struct meta { uint Nmeasures; uint Nvis; uint Nweights; uint freq_per_chan; uint polarisations; uint Ntimes; double dt; double thours; uint baselines; double uvmin; double uvmax; double wmin; double wmax; } metaData; extern struct parameter { int num_threads; int ndatasets; char datapath_multi[NFILES][LONGNAME_LEN]; int grid_size_x; int grid_size_y; int num_w_planes; int w_support; int reduce_method; } param; extern struct fileData { double * uu; double * vv; double * ww; float * weights; float * visreal; float * visimg; }data; extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN]; extern char datapath[LONGNAME_LEN]; extern int xaxis, yaxis; extern int rank; extern int size; extern uint nsectors; extern uint startrow; extern double_t resolution, dx, dw, w_supporth; extern uint **sectorarray; extern uint *histo_send; extern int verbose_level; extern uint size_of_grid; extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin; allvars_rccl.h 0 → 100644 +177 −0 Original line number Diff line number Diff line /* file to store global variables*/ #if defined(__STDC__) # if (__STDC_VERSION__ >= 199901L) # define _XOPEN_SOURCE 700 # endif #endif #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> #include <unistd.h> #if !defined( RCCL_REDUCE ) #include <stdatomic.h> #endif #include <mpi.h> #if defined (_OPENMP) #include <omp.h> #endif #if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw #include <fftw3-mpi.h> #endif #if defined(ACCOMP) #include "w-stacking_omp.h" #else #include "w-stacking.h" #endif #if defined(NVIDIA) #include <cuda_runtime.h> #endif #include "fft.h" #include "numa.h" #include "timing.h" #include "errcodes.h" #define PI 3.14159265359 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) #define NOVERBOSE #define NFILES 100 #define NAME_LEN 50 #define LONGNAME_LEN 1000 #define REDUCE_MPI 0 #define REDUCE_RING 1 #if defined(DEBUG) #define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ printf(__VA_ARGS__); fflush(stdout); } #else #define dprintf(...) #endif //WARNING WHEN USING CLANG!!! CONFLICTING TYPES WITH math.h LIBRARY typedef double double_ty; #if defined(DOUBLE_PRECISION) typedef double float_ty; #else typedef float float_ty; #endif typedef unsigned int uint; typedef unsigned long long ull; extern struct io { FILE * pFile; FILE * pFile1; FILE * pFilereal; FILE * pFileimg; } file; extern struct ip { char ufile[NAME_LEN]; char vfile[NAME_LEN]; char wfile[NAME_LEN]; char weightsfile[NAME_LEN]; char visrealfile[NAME_LEN]; char visimgfile[NAME_LEN]; char metafile[NAME_LEN]; char paramfile[NAME_LEN]; } in; extern struct op { char outfile[NAME_LEN]; char outfile1[NAME_LEN]; char outfile2[NAME_LEN]; char outfile3[NAME_LEN]; char fftfile[NAME_LEN]; char fftfile2[NAME_LEN]; char fftfile3[NAME_LEN]; char logfile[NAME_LEN]; char extension[NAME_LEN]; char timingfile[NAME_LEN]; } out, outparam; extern struct meta { uint Nmeasures; uint Nvis; uint Nweights; uint freq_per_chan; uint polarisations; uint Ntimes; double dt; double thours; uint baselines; double uvmin; double uvmax; double wmin; double wmax; } metaData; extern struct parameter { int num_threads; int ndatasets; char datapath_multi[NFILES][LONGNAME_LEN]; int grid_size_x; int grid_size_y; int num_w_planes; int w_support; int reduce_method; } param; extern struct fileData { double * uu; double * vv; double * ww; float * weights; float * visreal; float * visimg; }data; extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN]; extern char datapath[LONGNAME_LEN]; extern int xaxis, yaxis; extern int rank; extern int size; extern uint nsectors; extern uint startrow; extern double_ty resolution, dx, dw, w_supporth; extern uint **sectorarray; extern uint *histo_send; extern int verbose_level; extern uint size_of_grid; extern double_ty *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin; gridding_rccl.cpp +9 −9 Original line number Diff line number Diff line #include "allvars_nccl.h" #include "allvars_rccl.h" #include "proto.h" #include <hip/hip_runtime.h> #include <rccl/rccl.h> Loading Loading @@ -76,18 +76,18 @@ void gridding_data(){ // allocate sector arrays // note: we use the largest allocation among all sectors double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); double_ty *memory = (double*) malloc ( (Nsec*3)*sizeof(double_ty) + (Nvissec*2+Nweightss)*sizeof(float_ty) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; double_ty *uus = (double*) memory; double_ty *vvs = (double*) uus+Nsec; double_ty *wws = (double*) vvs+Nsec; float_ty *weightss = (float_ty*)(wws+Nsec); float_ty *visreals = weightss + Nweightss; float_ty *visimgs = visreals + Nvissec; Loading Loading
Makefile +7 −4 Original line number Diff line number Diff line Loading @@ -20,8 +20,8 @@ endif LINKER=$(MPICC) FFTW_MPI_INC = FFTW_MPI_LIB = FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.3/x86_rome/include FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.3/x86_rome/lib CFLAGS += -I./ Loading Loading @@ -64,7 +64,7 @@ OPT += -DACCOMP #OPT += -DNCCL_REDUCE # use AMD GPU to perform the reduce #OPT += -DRCCL_REDUCE OPT += -DRCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP Loading Loading @@ -152,6 +152,7 @@ FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) OBJ += $(OBJ_ACC_OMP) endif Loading @@ -162,6 +163,7 @@ FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_NCCL_REDUCE) endif ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) Loading @@ -171,6 +173,7 @@ FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_RCCL_REDUCE): $(DEPS_RCCL_REDUCE) $(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS) OBJ += $(OBJ_RCCL_REDUCE) endif Loading
allvars_nccl.h 0 → 100644 +176 −0 Original line number Diff line number Diff line /* file to store global variables*/ #if defined(__STDC__) # if (__STDC_VERSION__ >= 199901L) # define _XOPEN_SOURCE 700 # endif #endif #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> #include <unistd.h> #if !defined( NCCL_REDUCE ) #include <stdatomic.h> #endif #include <mpi.h> #if defined (_OPENMP) #include <omp.h> #endif #if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw #include <fftw3-mpi.h> #endif #if defined(ACCOMP) #include "w-stacking_omp.h" #else #include "w-stacking.h" #endif #if defined(NVIDIA) #include <cuda_runtime.h> #endif #include "fft.h" #include "numa.h" #include "timing.h" #include "errcodes.h" #define PI 3.14159265359 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) #define NOVERBOSE #define NFILES 100 #define NAME_LEN 50 #define LONGNAME_LEN 1000 #define REDUCE_MPI 0 #define REDUCE_RING 1 #if defined(DEBUG) #define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ printf(__VA_ARGS__); fflush(stdout); } #else #define dprintf(...) #endif typedef double double_t; #if defined(DOUBLE_PRECISION) typedef double float_t; #else typedef float float_t; #endif typedef unsigned int uint; typedef unsigned long long ull; extern struct io { FILE * pFile; FILE * pFile1; FILE * pFilereal; FILE * pFileimg; } file; extern struct ip { char ufile[NAME_LEN]; char vfile[NAME_LEN]; char wfile[NAME_LEN]; char weightsfile[NAME_LEN]; char visrealfile[NAME_LEN]; char visimgfile[NAME_LEN]; char metafile[NAME_LEN]; char paramfile[NAME_LEN]; } in; extern struct op { char outfile[NAME_LEN]; char outfile1[NAME_LEN]; char outfile2[NAME_LEN]; char outfile3[NAME_LEN]; char fftfile[NAME_LEN]; char fftfile2[NAME_LEN]; char fftfile3[NAME_LEN]; char logfile[NAME_LEN]; char extension[NAME_LEN]; char timingfile[NAME_LEN]; } out, outparam; extern struct meta { uint Nmeasures; uint Nvis; uint Nweights; uint freq_per_chan; uint polarisations; uint Ntimes; double dt; double thours; uint baselines; double uvmin; double uvmax; double wmin; double wmax; } metaData; extern struct parameter { int num_threads; int ndatasets; char datapath_multi[NFILES][LONGNAME_LEN]; int grid_size_x; int grid_size_y; int num_w_planes; int w_support; int reduce_method; } param; extern struct fileData { double * uu; double * vv; double * ww; float * weights; float * visreal; float * visimg; }data; extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN]; extern char datapath[LONGNAME_LEN]; extern int xaxis, yaxis; extern int rank; extern int size; extern uint nsectors; extern uint startrow; extern double_t resolution, dx, dw, w_supporth; extern uint **sectorarray; extern uint *histo_send; extern int verbose_level; extern uint size_of_grid; extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin;
allvars_rccl.h 0 → 100644 +177 −0 Original line number Diff line number Diff line /* file to store global variables*/ #if defined(__STDC__) # if (__STDC_VERSION__ >= 199901L) # define _XOPEN_SOURCE 700 # endif #endif #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> #include <unistd.h> #if !defined( RCCL_REDUCE ) #include <stdatomic.h> #endif #include <mpi.h> #if defined (_OPENMP) #include <omp.h> #endif #if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw #include <fftw3-mpi.h> #endif #if defined(ACCOMP) #include "w-stacking_omp.h" #else #include "w-stacking.h" #endif #if defined(NVIDIA) #include <cuda_runtime.h> #endif #include "fft.h" #include "numa.h" #include "timing.h" #include "errcodes.h" #define PI 3.14159265359 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) #define NOVERBOSE #define NFILES 100 #define NAME_LEN 50 #define LONGNAME_LEN 1000 #define REDUCE_MPI 0 #define REDUCE_RING 1 #if defined(DEBUG) #define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ printf(__VA_ARGS__); fflush(stdout); } #else #define dprintf(...) #endif //WARNING WHEN USING CLANG!!! CONFLICTING TYPES WITH math.h LIBRARY typedef double double_ty; #if defined(DOUBLE_PRECISION) typedef double float_ty; #else typedef float float_ty; #endif typedef unsigned int uint; typedef unsigned long long ull; extern struct io { FILE * pFile; FILE * pFile1; FILE * pFilereal; FILE * pFileimg; } file; extern struct ip { char ufile[NAME_LEN]; char vfile[NAME_LEN]; char wfile[NAME_LEN]; char weightsfile[NAME_LEN]; char visrealfile[NAME_LEN]; char visimgfile[NAME_LEN]; char metafile[NAME_LEN]; char paramfile[NAME_LEN]; } in; extern struct op { char outfile[NAME_LEN]; char outfile1[NAME_LEN]; char outfile2[NAME_LEN]; char outfile3[NAME_LEN]; char fftfile[NAME_LEN]; char fftfile2[NAME_LEN]; char fftfile3[NAME_LEN]; char logfile[NAME_LEN]; char extension[NAME_LEN]; char timingfile[NAME_LEN]; } out, outparam; extern struct meta { uint Nmeasures; uint Nvis; uint Nweights; uint freq_per_chan; uint polarisations; uint Ntimes; double dt; double thours; uint baselines; double uvmin; double uvmax; double wmin; double wmax; } metaData; extern struct parameter { int num_threads; int ndatasets; char datapath_multi[NFILES][LONGNAME_LEN]; int grid_size_x; int grid_size_y; int num_w_planes; int w_support; int reduce_method; } param; extern struct fileData { double * uu; double * vv; double * ww; float * weights; float * visreal; float * visimg; }data; extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN]; extern char datapath[LONGNAME_LEN]; extern int xaxis, yaxis; extern int rank; extern int size; extern uint nsectors; extern uint startrow; extern double_ty resolution, dx, dw, w_supporth; extern uint **sectorarray; extern uint *histo_send; extern int verbose_level; extern uint size_of_grid; extern double_ty *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin;
gridding_rccl.cpp +9 −9 Original line number Diff line number Diff line #include "allvars_nccl.h" #include "allvars_rccl.h" #include "proto.h" #include <hip/hip_runtime.h> #include <rccl/rccl.h> Loading Loading @@ -76,18 +76,18 @@ void gridding_data(){ // allocate sector arrays // note: we use the largest allocation among all sectors double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); double_ty *memory = (double*) malloc ( (Nsec*3)*sizeof(double_ty) + (Nvissec*2+Nweightss)*sizeof(float_ty) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; double_ty *uus = (double*) memory; double_ty *vvs = (double*) uus+Nsec; double_ty *wws = (double*) vvs+Nsec; float_ty *weightss = (float_ty*)(wws+Nsec); float_ty *visreals = weightss + Nweightss; float_ty *visimgs = visreals + Nvissec; Loading