Loading Makefile +125 −110 Original line number Diff line number Diff line # comment/uncomment the various options depending hoe you want to build the program # Set default values for compiler options if no systype options are given or found EXEC = w-stacking EXEC_EXT := MPICC = mpicc MPICXX = mpiCC OPTIMIZE = -fopenmp -O3 -march=native Loading @@ -13,52 +17,77 @@ else include Build/Makefile.systype endif LINKER=$(MPICC) FFTW_MPI_INC = -I/home/giacopo/Library_fftw/include FFTW_MPI_LIB = -L/home/giacopo/Library_fftw/lib CFLAGS += $(FFTW_MPI_INC) -I/proto.h LIBS = $(FFTW_MPI_LIB) -lfftw3_mpi -lfftw3 -lm #-lcudart -lcuda CFLAGS += -I./ FFTWLIBS = # ======================================================== # CODE OPTIONS # # create MPI code OPT += -DUSE_MPI OPT += -DACCOMP # use FFTW (it can be switched on ONLY if MPI is active) ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) OPT += -DUSE_FFTW LIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm endif #OPT += -DNVIDIA #use cuda for GPUs #OPT += -D__CUDACC__ # perform one-side communication (suggested) instead of reduce (only if MPI is active) #OPT += -DONE_SIDE OPT += -DNCCL_REDUCE #OPT += -DREDUCE #perform the hybrid MPI-OpenMP FFTW #OPT += -DHYBRID_FFTW #OPT += -DCUFFTMP #OPT += -DRING #perform the debugging in the ring implementation #OPT += -DDEBUG # use omp-ized version of fftw routines OPT += -DHYBRID_FFTW # write the full 3D cube of gridded visibilities and its FFT transform #OPT += -DWRITE_DATA # write the final image OPT += -DWRITE_IMAGE # perform w-stacking phase correction OPT += -DPHASE_ON # ======================================================== # ACCELERATION # #OPT += -DNVIDIA #use cuda for GPUs #OPT += -D__CUDACC__ # use GPU acceleration via OMP #OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE # use AMD GPU to perform the reduce #OPT += -DRCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP #perform the debugging in the ring implementation #OPT += -DDEBUG # ======================================================== DEPS = w-stacking.h main.c phase_correction.cu allvars.h COBJ = allvars.o main.o init.o gridding.o gridding_std.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o DEPS = w-stacking.h main.c w-stacking.cu phase_correction.cu allvars.h init.c gridding.c fourier_transform.c result.c COBJ = w-stacking.o main.o phase_correction.o allvars.o init.o gridding.o fourier_transform.o result.o COBJ_OMP = w-stacking_omp.o main.o phase_correction.o allvars.o init.o gridding.o fourier_transform.o result.o #use the shared-memory reduce implemented in gridding_ring.c COBJ_RING = w-stacking.o main.o phase_correction.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o COBJ_RING_OMP = main.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o COBJ_RING_CUDA = main.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o COBJ_RING_OMP_FFT = main.o allvars.o init.o gridding_ring.o result.o numa.o reduce.o COBJ_NCCL = main.o allvars.o init.o numa.o fourier_transform.o result.o COBJ_NCCL_FFT = main.o allvars.o init.o numa.o result.o DEPS_ACC_CUDA = w-stacking.h w-stacking.cu COBJ_ACC_CUDA = phase_correction.o w-stacking.o DEPS_ACC_OMP = w-stacking_omp.h COBJ_ACC_OMP = phase_correction.o w-stacking_omp.o COBJ_NCCL_REDUCE = gridding_nccl.o COBJ_RCCL_REDUCE = gridding_rccl.o ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT))) CFLAGS += $(FFTW_MPI_INC) FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm endif ifneq (CUDACC,$(findstring CUDACC,$(OPT))) w-stacking.c: w-stacking.cu Loading @@ -66,92 +95,78 @@ w-stacking.c: w-stacking.cu phase_correction.c: phase_correction.cu cp phase_correction.cu phase_correction.c endif ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) %.o: %.c $(DEPS) $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) else %.o: %.c $(DEPS) $(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) w-stacking.c: w-stacking.cu rm -f w-stacking.cun touch w-stacking.c phase_correction.c: phase_correction.cu rm -f phase_correction.c touch phase_correction.c endif serial: $(COBJ) $(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS) serial_omp: phase_correction.c $(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial main.c init.c gridding.c fourier_transform.c result.c w-stacking_omp.c $(CFLAGS) $(LIBS) simple_mpi: phase_correction.c $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c main.c init.c gridding.c fourier_transform.c result.c phase_correction.c $(CFLAGS) $(LIBS) ##################################################################################### mpi_omp: $(COBJ_OMP) $(MPICC) $(OPTIMIZE) $(OPT) -fopenmp -o w-stackingMPI_omp $^ $(CFLAGS) $(LIBS) ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT))) EXEC_EXT := $(EXEC_EXT)_fftw endif serial_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) $(CC) $(OPTIMIZE) $(OPT) -c main.c init.c gridding.c fourier_transform.c result.c $(CFLAGS) $(LIBS) $(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm ifeq (CUDACC,$(findstring CUDACC,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-cuda LINKER=$(NVCC) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) compile_cuda: $(COBJ_ACC_CUDA) $(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB) endif mpi: $(COBJ) $(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS) ##################################################################################### ifeq (RING,$(findstring RING,$(OPT))) mpi_new: $(COBJ_RING) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_ring $^ $(CFLAGS) $(LIBS) ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(NVC) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) compile_accomp: $(COBJ_ACC_OMP) $(NVC) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB) endif ifeq (REDUCE,$(findstring REDUCE,$(OPT))) mpi_new: $(COBJ_RING) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_mpi_reduce $^ $(CFLAGS) $(LIBS) ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(COBJ_NCCL_REDUCE) $(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3) endif ifeq (ONE_SIDE,$(findstring ONE_SIDE,$(OPT))) mpi_new: $(COBJ_RING) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_mpi_oneside $^ $(CFLAGS) $(LIBS) ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(COBJ_RCCL_REDCUE) $(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3) endif ################################################################################### #To use the GPUs for the convolution part mpi_ring_omp: $(COBJ_RING_OMP) $(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB) $(NVC) $(NVFLAGS) $(OPT) -o w-stackingCfftw_ring_omp phase_correction.o w-stacking_omp.o $^ $(CFLAGS) $(NVLIB) -lmpi $(LIBS) mpi_amd_omp: $(COBJ_RING_OMP) $(MPICC) $(OPTIMIZE) $(OPT) -c phase_correction.c w-stacking_omp.v1.c $(CFLAGS) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_amd_omp phase_correction.o w-stacking_omp.v1.o $^ $(CFLAGS) $(LIBS) mpi_omp_fft: $(COBJ_RING_OMP_FFT) $(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB) $(NVC++) $(NVFLAGS) $(OPT) -c fourier_transform_new.cpp $(CFLAGS) $(NVLIB_2) $(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_cufftMp phase_correction.o w-stacking_omp.o fourier_transform_new.o $^ $(CFLAGS) $(NVLIB_2) $(LIBS) #Reduce operation with NCCL (OpenMP+CUDA) nccl_reduce: $(COBJ_NCCL) $(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB) $(NVC++) $(NVFLAGS) $(OPT) -c gridding_nccl.cpp $(CFLAGS) $(NVLIB_3) $(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_nccl phase_correction.o w-stacking_omp.o gridding_nccl.o $^ $(CFLAGS) $(NVLIB_3) $(LIBS) #Reduce operation with RCCL AMD (OpenMP) rccl_reduce: $(COBJ_NCCL) $(MPICC) $(OPTIMIZE) $(OPT) -c phase_correction.c w-stacking_omp.v1.c $(CFLAGS) $(MPIC++) $(OPTIMIZE) $(OPT) -c gridding_nccl.cpp $(CFLAGS) $(ROCLIB) $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingC_nccl phase_correction.o w-stacking_omp.v1.o gridding_nccl.o $^ $(CFLAGS) $(ROCLIB) $(LIBS) #Reduce operation with NCCL (CUDA) nccl_reduce_fft: $(COBJ_NCCL_FFT) $(NVC) $(NVFLAGS) $(OPT) -c phase_correction.cpp w-stacking.cpp $(CFLAGS) $(NVLIB) $(NVC++) $(NVFLAGS) $(OPT) -c gridding_nccl.cpp fourier_transform_new.cpp $(CFLAGS) $(NVLIB_3) $(NVLIB_2) $(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_nccl_cu phase_correction.o w-stacking.o gridding_nccl.o $^ $(CFLAGS) $(NVLIB_2) $(NVLIB_3) $(LIBS) mpi_ring_cuda: $(COBJ_RING_CUDA) $(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB) $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_ring_cuda $^ w-stacking.o phase_correction.o $(CFLAGS) $(LIBS) $(NVLIB) mpi_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) $(MPICC) $(OPTIMIZE) $(OPT) -c main.c init.c fourier_transform.c result.c $(CFLAGS) $(LIBS) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) $(NVLIB) w-stacking: $(COBJ) $(DEPS) Makefile @$(LINKER) $(FLAGS) $(OPT) $(FFTWLIBS) $(LIBS) -lmpi -o $(EXEC)$(EXEC_EXT) $(COBJ): $(DEPS) Makefile %.o: %.c $(DEPS) $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) clean: rm *.o rm w-stacking.c rm phase_correction.c rm -f *.o rm -f w-stacking.c rm -f phase_correction.c cleanall: rm -f $(EXEC)* rm -f *.o rm -f w-stacking.c rm -f phase_correction.c allvars.c +27 −27 Original line number Diff line number Diff line #include "numa_vars.h" MPI_Comm MYMPI_COMM_WORLD; #include "allvars.h" struct io file; struct ip in; Loading @@ -8,35 +10,33 @@ struct ip in; struct op out, outparam; struct meta metaData; struct time timing; struct parameter param; struct fileData data; char filename[1000], buf[30], num_buf[30]; char datapath[900]; char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN]; char datapath[LONGNAME_LEN]; int xaxis, yaxis; int rank; int size; long nsectors; long startrow; uint nsectors; uint startrow; double resolution, dx, dw, w_supporth; uint **sectorarray = NULL; uint *histo_send = NULL; int verbose_level = 0; timing_t timing_wt; double reduce_mpi_time; double reduce_shmem_time; clock_t start, end, start0, startk, endk; struct timespec begin, finish, begin0, begink, finishk; long * histo_send, size_of_grid; double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; int threads_ok, nthreads_fftw; uint size_of_grid; double *grid_pointers = NULL, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w; #ifdef USE_MPI MPI_Comm MYMPI_COMM_WORLD; MPI_Win slabwin; #endif #ifdef USE_MPI MPI_Request * requests = NULL; #endif long **sectorarray; allvars.h +111 −79 Original line number Diff line number Diff line /* file to store global variables*/ #include <stdio.h> #if defined(__STDC__) # if (__STDC_VERSION__ >= 199901L) # define _XOPEN_SOURCE 700 # endif #endif #include <stdlib.h> #include <stdio.h> #include <string.h> #ifdef USE_MPI #include <math.h> #include <unistd.h> #include <stdatomic.h> #include <mpi.h> #ifdef USE_FFTW #ifndef CUFFTMP #include <fftw3-mpi.h> #endif #if defined (_OPENMP) #include <omp.h> #endif #if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw #include <fftw3-mpi.h> #endif #ifdef ACCOMP #if defined(ACCOMP) #include "w-stacking_omp.h" #else #include "w-stacking.h" #endif #ifdef NVIDIA #if defined(NVIDIA) #include <cuda_runtime.h> #endif #include "fft.h" #include "numa.h" #include "timing.h" #include "errcodes.h" #define PI 3.14159265359 #define NUM_OF_SECTORS -1 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) #define NOVERBOSE #define NFILES 100 #include <omp.h> #include <math.h> #include <time.h> #include <unistd.h> #define NAME_LEN 50 #define LONGNAME_LEN 1000 #define REDUCE_MPI 0 #define REDUCE_RING 1 #if defined(DEBUG) #define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ printf(__VA_ARGS__); fflush(stdout); } #else #define dprintf(...) #endif typedef double double_t; #if defined(DOUBLE_PRECISION) typedef double float_t; #else typedef float float_t; #endif typedef unsigned int uint; typedef unsigned long long ull; extern struct io { Loading @@ -40,68 +83,60 @@ extern struct io extern struct ip { char ufile[30]; char vfile[30]; char wfile[30]; char weightsfile[30]; char visrealfile[30]; char visimgfile[30]; char metafile[30]; char paramfile[30]; char ufile[NAME_LEN]; char vfile[NAME_LEN]; char wfile[NAME_LEN]; char weightsfile[NAME_LEN]; char visrealfile[NAME_LEN]; char visimgfile[NAME_LEN]; char metafile[NAME_LEN]; char paramfile[NAME_LEN]; } in; extern struct op { char outfile[30]; char outfile1[30]; char outfile2[30]; char outfile3[30]; char fftfile[30]; char fftfile2[30]; char fftfile3[30]; char logfile[30]; char extension[30]; char timingfile[30]; char outfile[NAME_LEN]; char outfile1[NAME_LEN]; char outfile2[NAME_LEN]; char outfile3[NAME_LEN]; char fftfile[NAME_LEN]; char fftfile2[NAME_LEN]; char fftfile3[NAME_LEN]; char logfile[NAME_LEN]; char extension[NAME_LEN]; char timingfile[NAME_LEN]; } out, outparam; extern struct meta { long Nmeasures; long Nvis; long Nweights; long freq_per_chan; long polarisations; long Ntimes; uint Nmeasures; uint Nvis; uint Nweights; uint freq_per_chan; uint polarisations; uint Ntimes; double dt; double thours; long baselines; uint baselines; double uvmin; double uvmax; double wmin; double wmax; } metaData; extern struct time { double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; double writetime, writetime1; } timing; extern struct parameter { int num_threads; int ndatasets; char datapath_multi[NFILES][900]; char datapath_multi[NFILES][LONGNAME_LEN]; int grid_size_x; int grid_size_y; int num_w_planes; int w_support; int reduce_method; } param; extern struct fileData Loading @@ -115,26 +150,23 @@ extern struct fileData }data; extern char filename[1000], buf[30], num_buf[30]; extern char datapath[900]; extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN]; extern char datapath[LONGNAME_LEN]; extern int xaxis, yaxis; extern int rank; extern int size; extern long nsectors; extern long startrow; extern double resolution, dx, dw, w_supporth; extern uint nsectors; extern uint startrow; extern double_t resolution, dx, dw, w_supporth; extern clock_t start, end, start0, startk, endk; extern struct timespec begin, finish, begin0, begink, finishk; extern long * histo_send, size_of_grid; extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern uint **sectorarray; extern uint *histo_send; extern int verbose_level; #ifdef USE_MPI extern MPI_Win slabwin; #endif extern long **sectorarray; extern uint size_of_grid; extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin; #ifdef HYBRID_FFTW extern int thread_level; #endif data/newgauss2noconj_t201806301100_SBL180.binMS.tardeleted 100644 → 0 −95.9 MiB File deleted. View file errcodes.h 0 → 100644 +6 −0 Original line number Diff line number Diff line #define NO_THREADS_SUPPORT 1 #define ERR_IN_PARAMFILE 2 #define NOT_ENOUGH_MEM_STACKING 3 #define ERR_REDUCE 4 #define NO_ACCELERATORS_FOUND 255 Loading
Makefile +125 −110 Original line number Diff line number Diff line # comment/uncomment the various options depending hoe you want to build the program # Set default values for compiler options if no systype options are given or found EXEC = w-stacking EXEC_EXT := MPICC = mpicc MPICXX = mpiCC OPTIMIZE = -fopenmp -O3 -march=native Loading @@ -13,52 +17,77 @@ else include Build/Makefile.systype endif LINKER=$(MPICC) FFTW_MPI_INC = -I/home/giacopo/Library_fftw/include FFTW_MPI_LIB = -L/home/giacopo/Library_fftw/lib CFLAGS += $(FFTW_MPI_INC) -I/proto.h LIBS = $(FFTW_MPI_LIB) -lfftw3_mpi -lfftw3 -lm #-lcudart -lcuda CFLAGS += -I./ FFTWLIBS = # ======================================================== # CODE OPTIONS # # create MPI code OPT += -DUSE_MPI OPT += -DACCOMP # use FFTW (it can be switched on ONLY if MPI is active) ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) OPT += -DUSE_FFTW LIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm endif #OPT += -DNVIDIA #use cuda for GPUs #OPT += -D__CUDACC__ # perform one-side communication (suggested) instead of reduce (only if MPI is active) #OPT += -DONE_SIDE OPT += -DNCCL_REDUCE #OPT += -DREDUCE #perform the hybrid MPI-OpenMP FFTW #OPT += -DHYBRID_FFTW #OPT += -DCUFFTMP #OPT += -DRING #perform the debugging in the ring implementation #OPT += -DDEBUG # use omp-ized version of fftw routines OPT += -DHYBRID_FFTW # write the full 3D cube of gridded visibilities and its FFT transform #OPT += -DWRITE_DATA # write the final image OPT += -DWRITE_IMAGE # perform w-stacking phase correction OPT += -DPHASE_ON # ======================================================== # ACCELERATION # #OPT += -DNVIDIA #use cuda for GPUs #OPT += -D__CUDACC__ # use GPU acceleration via OMP #OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE # use AMD GPU to perform the reduce #OPT += -DRCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP #perform the debugging in the ring implementation #OPT += -DDEBUG # ======================================================== DEPS = w-stacking.h main.c phase_correction.cu allvars.h COBJ = allvars.o main.o init.o gridding.o gridding_std.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o DEPS = w-stacking.h main.c w-stacking.cu phase_correction.cu allvars.h init.c gridding.c fourier_transform.c result.c COBJ = w-stacking.o main.o phase_correction.o allvars.o init.o gridding.o fourier_transform.o result.o COBJ_OMP = w-stacking_omp.o main.o phase_correction.o allvars.o init.o gridding.o fourier_transform.o result.o #use the shared-memory reduce implemented in gridding_ring.c COBJ_RING = w-stacking.o main.o phase_correction.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o COBJ_RING_OMP = main.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o COBJ_RING_CUDA = main.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o COBJ_RING_OMP_FFT = main.o allvars.o init.o gridding_ring.o result.o numa.o reduce.o COBJ_NCCL = main.o allvars.o init.o numa.o fourier_transform.o result.o COBJ_NCCL_FFT = main.o allvars.o init.o numa.o result.o DEPS_ACC_CUDA = w-stacking.h w-stacking.cu COBJ_ACC_CUDA = phase_correction.o w-stacking.o DEPS_ACC_OMP = w-stacking_omp.h COBJ_ACC_OMP = phase_correction.o w-stacking_omp.o COBJ_NCCL_REDUCE = gridding_nccl.o COBJ_RCCL_REDUCE = gridding_rccl.o ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT))) CFLAGS += $(FFTW_MPI_INC) FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm endif ifneq (CUDACC,$(findstring CUDACC,$(OPT))) w-stacking.c: w-stacking.cu Loading @@ -66,92 +95,78 @@ w-stacking.c: w-stacking.cu phase_correction.c: phase_correction.cu cp phase_correction.cu phase_correction.c endif ifeq (USE_MPI,$(findstring USE_MPI,$(OPT))) %.o: %.c $(DEPS) $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) else %.o: %.c $(DEPS) $(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) w-stacking.c: w-stacking.cu rm -f w-stacking.cun touch w-stacking.c phase_correction.c: phase_correction.cu rm -f phase_correction.c touch phase_correction.c endif serial: $(COBJ) $(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS) serial_omp: phase_correction.c $(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial main.c init.c gridding.c fourier_transform.c result.c w-stacking_omp.c $(CFLAGS) $(LIBS) simple_mpi: phase_correction.c $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c main.c init.c gridding.c fourier_transform.c result.c phase_correction.c $(CFLAGS) $(LIBS) ##################################################################################### mpi_omp: $(COBJ_OMP) $(MPICC) $(OPTIMIZE) $(OPT) -fopenmp -o w-stackingMPI_omp $^ $(CFLAGS) $(LIBS) ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT))) EXEC_EXT := $(EXEC_EXT)_fftw endif serial_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) $(CC) $(OPTIMIZE) $(OPT) -c main.c init.c gridding.c fourier_transform.c result.c $(CFLAGS) $(LIBS) $(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm ifeq (CUDACC,$(findstring CUDACC,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-cuda LINKER=$(NVCC) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) compile_cuda: $(COBJ_ACC_CUDA) $(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB) endif mpi: $(COBJ) $(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS) ##################################################################################### ifeq (RING,$(findstring RING,$(OPT))) mpi_new: $(COBJ_RING) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_ring $^ $(CFLAGS) $(LIBS) ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(NVC) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) compile_accomp: $(COBJ_ACC_OMP) $(NVC) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB) endif ifeq (REDUCE,$(findstring REDUCE,$(OPT))) mpi_new: $(COBJ_RING) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_mpi_reduce $^ $(CFLAGS) $(LIBS) ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(COBJ_NCCL_REDUCE) $(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3) endif ifeq (ONE_SIDE,$(findstring ONE_SIDE,$(OPT))) mpi_new: $(COBJ_RING) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_mpi_oneside $^ $(CFLAGS) $(LIBS) ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(COBJ_RCCL_REDCUE) $(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3) endif ################################################################################### #To use the GPUs for the convolution part mpi_ring_omp: $(COBJ_RING_OMP) $(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB) $(NVC) $(NVFLAGS) $(OPT) -o w-stackingCfftw_ring_omp phase_correction.o w-stacking_omp.o $^ $(CFLAGS) $(NVLIB) -lmpi $(LIBS) mpi_amd_omp: $(COBJ_RING_OMP) $(MPICC) $(OPTIMIZE) $(OPT) -c phase_correction.c w-stacking_omp.v1.c $(CFLAGS) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_amd_omp phase_correction.o w-stacking_omp.v1.o $^ $(CFLAGS) $(LIBS) mpi_omp_fft: $(COBJ_RING_OMP_FFT) $(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB) $(NVC++) $(NVFLAGS) $(OPT) -c fourier_transform_new.cpp $(CFLAGS) $(NVLIB_2) $(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_cufftMp phase_correction.o w-stacking_omp.o fourier_transform_new.o $^ $(CFLAGS) $(NVLIB_2) $(LIBS) #Reduce operation with NCCL (OpenMP+CUDA) nccl_reduce: $(COBJ_NCCL) $(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB) $(NVC++) $(NVFLAGS) $(OPT) -c gridding_nccl.cpp $(CFLAGS) $(NVLIB_3) $(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_nccl phase_correction.o w-stacking_omp.o gridding_nccl.o $^ $(CFLAGS) $(NVLIB_3) $(LIBS) #Reduce operation with RCCL AMD (OpenMP) rccl_reduce: $(COBJ_NCCL) $(MPICC) $(OPTIMIZE) $(OPT) -c phase_correction.c w-stacking_omp.v1.c $(CFLAGS) $(MPIC++) $(OPTIMIZE) $(OPT) -c gridding_nccl.cpp $(CFLAGS) $(ROCLIB) $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingC_nccl phase_correction.o w-stacking_omp.v1.o gridding_nccl.o $^ $(CFLAGS) $(ROCLIB) $(LIBS) #Reduce operation with NCCL (CUDA) nccl_reduce_fft: $(COBJ_NCCL_FFT) $(NVC) $(NVFLAGS) $(OPT) -c phase_correction.cpp w-stacking.cpp $(CFLAGS) $(NVLIB) $(NVC++) $(NVFLAGS) $(OPT) -c gridding_nccl.cpp fourier_transform_new.cpp $(CFLAGS) $(NVLIB_3) $(NVLIB_2) $(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_nccl_cu phase_correction.o w-stacking.o gridding_nccl.o $^ $(CFLAGS) $(NVLIB_2) $(NVLIB_3) $(LIBS) mpi_ring_cuda: $(COBJ_RING_CUDA) $(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB) $(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_ring_cuda $^ w-stacking.o phase_correction.o $(CFLAGS) $(LIBS) $(NVLIB) mpi_cuda: $(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB) $(MPICC) $(OPTIMIZE) $(OPT) -c main.c init.c fourier_transform.c result.c $(CFLAGS) $(LIBS) $(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) $(NVLIB) w-stacking: $(COBJ) $(DEPS) Makefile @$(LINKER) $(FLAGS) $(OPT) $(FFTWLIBS) $(LIBS) -lmpi -o $(EXEC)$(EXEC_EXT) $(COBJ): $(DEPS) Makefile %.o: %.c $(DEPS) $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) clean: rm *.o rm w-stacking.c rm phase_correction.c rm -f *.o rm -f w-stacking.c rm -f phase_correction.c cleanall: rm -f $(EXEC)* rm -f *.o rm -f w-stacking.c rm -f phase_correction.c
allvars.c +27 −27 Original line number Diff line number Diff line #include "numa_vars.h" MPI_Comm MYMPI_COMM_WORLD; #include "allvars.h" struct io file; struct ip in; Loading @@ -8,35 +10,33 @@ struct ip in; struct op out, outparam; struct meta metaData; struct time timing; struct parameter param; struct fileData data; char filename[1000], buf[30], num_buf[30]; char datapath[900]; char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN]; char datapath[LONGNAME_LEN]; int xaxis, yaxis; int rank; int size; long nsectors; long startrow; uint nsectors; uint startrow; double resolution, dx, dw, w_supporth; uint **sectorarray = NULL; uint *histo_send = NULL; int verbose_level = 0; timing_t timing_wt; double reduce_mpi_time; double reduce_shmem_time; clock_t start, end, start0, startk, endk; struct timespec begin, finish, begin0, begink, finishk; long * histo_send, size_of_grid; double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; int threads_ok, nthreads_fftw; uint size_of_grid; double *grid_pointers = NULL, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w; #ifdef USE_MPI MPI_Comm MYMPI_COMM_WORLD; MPI_Win slabwin; #endif #ifdef USE_MPI MPI_Request * requests = NULL; #endif long **sectorarray;
allvars.h +111 −79 Original line number Diff line number Diff line /* file to store global variables*/ #include <stdio.h> #if defined(__STDC__) # if (__STDC_VERSION__ >= 199901L) # define _XOPEN_SOURCE 700 # endif #endif #include <stdlib.h> #include <stdio.h> #include <string.h> #ifdef USE_MPI #include <math.h> #include <unistd.h> #include <stdatomic.h> #include <mpi.h> #ifdef USE_FFTW #ifndef CUFFTMP #include <fftw3-mpi.h> #endif #if defined (_OPENMP) #include <omp.h> #endif #if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw #include <fftw3-mpi.h> #endif #ifdef ACCOMP #if defined(ACCOMP) #include "w-stacking_omp.h" #else #include "w-stacking.h" #endif #ifdef NVIDIA #if defined(NVIDIA) #include <cuda_runtime.h> #endif #include "fft.h" #include "numa.h" #include "timing.h" #include "errcodes.h" #define PI 3.14159265359 #define NUM_OF_SECTORS -1 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) #define NOVERBOSE #define NFILES 100 #include <omp.h> #include <math.h> #include <time.h> #include <unistd.h> #define NAME_LEN 50 #define LONGNAME_LEN 1000 #define REDUCE_MPI 0 #define REDUCE_RING 1 #if defined(DEBUG) #define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ printf(__VA_ARGS__); fflush(stdout); } #else #define dprintf(...) #endif typedef double double_t; #if defined(DOUBLE_PRECISION) typedef double float_t; #else typedef float float_t; #endif typedef unsigned int uint; typedef unsigned long long ull; extern struct io { Loading @@ -40,68 +83,60 @@ extern struct io extern struct ip { char ufile[30]; char vfile[30]; char wfile[30]; char weightsfile[30]; char visrealfile[30]; char visimgfile[30]; char metafile[30]; char paramfile[30]; char ufile[NAME_LEN]; char vfile[NAME_LEN]; char wfile[NAME_LEN]; char weightsfile[NAME_LEN]; char visrealfile[NAME_LEN]; char visimgfile[NAME_LEN]; char metafile[NAME_LEN]; char paramfile[NAME_LEN]; } in; extern struct op { char outfile[30]; char outfile1[30]; char outfile2[30]; char outfile3[30]; char fftfile[30]; char fftfile2[30]; char fftfile3[30]; char logfile[30]; char extension[30]; char timingfile[30]; char outfile[NAME_LEN]; char outfile1[NAME_LEN]; char outfile2[NAME_LEN]; char outfile3[NAME_LEN]; char fftfile[NAME_LEN]; char fftfile2[NAME_LEN]; char fftfile3[NAME_LEN]; char logfile[NAME_LEN]; char extension[NAME_LEN]; char timingfile[NAME_LEN]; } out, outparam; extern struct meta { long Nmeasures; long Nvis; long Nweights; long freq_per_chan; long polarisations; long Ntimes; uint Nmeasures; uint Nvis; uint Nweights; uint freq_per_chan; uint polarisations; uint Ntimes; double dt; double thours; long baselines; uint baselines; double uvmin; double uvmax; double wmin; double wmax; } metaData; extern struct time { double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; double writetime, writetime1; } timing; extern struct parameter { int num_threads; int ndatasets; char datapath_multi[NFILES][900]; char datapath_multi[NFILES][LONGNAME_LEN]; int grid_size_x; int grid_size_y; int num_w_planes; int w_support; int reduce_method; } param; extern struct fileData Loading @@ -115,26 +150,23 @@ extern struct fileData }data; extern char filename[1000], buf[30], num_buf[30]; extern char datapath[900]; extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN]; extern char datapath[LONGNAME_LEN]; extern int xaxis, yaxis; extern int rank; extern int size; extern long nsectors; extern long startrow; extern double resolution, dx, dw, w_supporth; extern uint nsectors; extern uint startrow; extern double_t resolution, dx, dw, w_supporth; extern clock_t start, end, start0, startk, endk; extern struct timespec begin, finish, begin0, begink, finishk; extern long * histo_send, size_of_grid; extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern uint **sectorarray; extern uint *histo_send; extern int verbose_level; #ifdef USE_MPI extern MPI_Win slabwin; #endif extern long **sectorarray; extern uint size_of_grid; extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern MPI_Comm MYMPI_COMM_WORLD; extern MPI_Win slabwin; #ifdef HYBRID_FFTW extern int thread_level; #endif
data/newgauss2noconj_t201806301100_SBL180.binMS.tardeleted 100644 → 0 −95.9 MiB File deleted. View file
errcodes.h 0 → 100644 +6 −0 Original line number Diff line number Diff line #define NO_THREADS_SUPPORT 1 #define ERR_IN_PARAMFILE 2 #define NOT_ENOUGH_MEM_STACKING 3 #define ERR_REDUCE 4 #define NO_ACCELERATORS_FOUND 255