first commit (783e2134) · Commits · Claudio Gheller / HPC_Imaging

Makefile

+125 −110

Original line number	Diff line number	Diff line
		# comment/uncomment the various options depending on how you want to build the program
		# Set default values for compiler options if no systype options are given or found

		EXEC = w-stacking
		EXEC_EXT :=

		MPICC = mpicc
		MPICXX = mpiCC
		OPTIMIZE = -fopenmp -O3 -march=native
		@@ -13,52 +17,77 @@ else
		include Build/Makefile.systype
		endif

		LINKER=$(MPICC)

		FFTW_MPI_INC = -I/home/giacopo/Library_fftw/include
		FFTW_MPI_LIB = -L/home/giacopo/Library_fftw/lib

		CFLAGS += $(FFTW_MPI_INC) -I/proto.h
		LIBS = $(FFTW_MPI_LIB) -lfftw3_mpi -lfftw3 -lm #-lcudart -lcuda
		CFLAGS += -I./
		FFTWLIBS =

		# ========================================================
		# CODE OPTIONS
		#

		# create MPI code
		OPT += -DUSE_MPI
		OPT += -DACCOMP
		# use FFTW (it can be switched on ONLY if MPI is active)
		ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
		OPT += -DUSE_FFTW
		LIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm
		endif

		#OPT += -DNVIDIA
		#use cuda for GPUs
		#OPT += -D__CUDACC__
		# perform one-side communication (suggested) instead of reduce (only if MPI is active)
		#OPT += -DONE_SIDE
		OPT += -DNCCL_REDUCE
		#OPT += -DREDUCE
		#perform the hybrid MPI-OpenMP FFTW
		#OPT += -DHYBRID_FFTW
		#OPT += -DCUFFTMP
		#OPT += -DRING
		#perform the debugging in the ring implementation
		#OPT += -DDEBUG
		# use omp-ized version of fftw routines
		OPT += -DHYBRID_FFTW

		# write the full 3D cube of gridded visibilities and its FFT transform
		#OPT += -DWRITE_DATA

		# write the final image
		OPT += -DWRITE_IMAGE

		# perform w-stacking phase correction
		OPT += -DPHASE_ON

		# ========================================================
		# ACCELERATION
		#

		#OPT += -DNVIDIA

		#use cuda for GPUs
		#OPT += -D__CUDACC__

		# use GPU acceleration via OMP
		#OPT += -DACCOMP

		# use NVIDIA GPU to perform the reduce
		#OPT += -DNCCL_REDUCE

		# use AMD GPU to perform the reduce
		#OPT += -DRCCL_REDUCE

		# use GPU to perform FFT
		#OPT += -DCUFFTMP

		#perform the debugging in the ring implementation
		#OPT += -DDEBUG

		# ========================================================


		DEPS = w-stacking.h main.c phase_correction.cu allvars.h
		COBJ = allvars.o main.o init.o gridding.o gridding_std.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o

		DEPS = w-stacking.h main.c w-stacking.cu phase_correction.cu allvars.h init.c gridding.c fourier_transform.c result.c
		COBJ = w-stacking.o main.o phase_correction.o allvars.o init.o gridding.o fourier_transform.o result.o
		COBJ_OMP = w-stacking_omp.o main.o phase_correction.o allvars.o init.o gridding.o fourier_transform.o result.o
		#use the shared-memory reduce implemented in gridding_ring.c
		COBJ_RING = w-stacking.o main.o phase_correction.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o
		COBJ_RING_OMP = main.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o
		COBJ_RING_CUDA = main.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o
		COBJ_RING_OMP_FFT = main.o allvars.o init.o gridding_ring.o result.o numa.o reduce.o
		COBJ_NCCL = main.o allvars.o init.o numa.o fourier_transform.o result.o
		COBJ_NCCL_FFT = main.o allvars.o init.o numa.o result.o
		DEPS_ACC_CUDA = w-stacking.h w-stacking.cu
		COBJ_ACC_CUDA = phase_correction.o w-stacking.o

		DEPS_ACC_OMP = w-stacking_omp.h
		COBJ_ACC_OMP = phase_correction.o w-stacking_omp.o

		COBJ_NCCL_REDUCE = gridding_nccl.o

		COBJ_RCCL_REDUCE = gridding_rccl.o

		# FFTW support: active when the string USE_FFTW appears anywhere in $(OPT)
		# (findstring returns the match, so the ifeq compares USE_FFTW with itself).
		# Adds the FFTW include path to CFLAGS and sets the FFTW link libraries
		# (OpenMP, MPI and serial FFTW3, plus libm) in FFTWLIBS.
		ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT)))
		CFLAGS += $(FFTW_MPI_INC)
		FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm
		endif

		ifneq (CUDACC,$(findstring CUDACC,$(OPT)))
		w-stacking.c: w-stacking.cu
		@@ -66,92 +95,78 @@ w-stacking.c: w-stacking.cu

		phase_correction.c: phase_correction.cu
		cp phase_correction.cu phase_correction.c
		endif

		ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
		%.o: %.c $(DEPS)
		$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
		else
		%.o: %.c $(DEPS)
		$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
		# Recreate w-stacking.c as an empty file whenever w-stacking.cu is newer.
		# The old file must be removed first: touch alone would keep any stale
		# contents. (The previous recipe removed "w-stacking.cun", a misspelt name,
		# so a pre-existing w-stacking.c was never emptied; the phase_correction.c
		# rule uses the same remove-then-touch pattern.)
		w-stacking.c: w-stacking.cu
		rm -f w-stacking.c
		touch w-stacking.c
		# Recreate phase_correction.c as an empty file whenever phase_correction.cu
		# is newer: the old file is removed and a fresh, empty one is created.
		phase_correction.c: phase_correction.cu
		rm -f phase_correction.c
		touch phase_correction.c
		endif

		serial: $(COBJ)
		$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS)

		serial_omp: phase_correction.c
		$(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial main.c init.c gridding.c fourier_transform.c result.c w-stacking_omp.c $(CFLAGS) $(LIBS)

		simple_mpi: phase_correction.c
		$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c main.c init.c gridding.c fourier_transform.c result.c phase_correction.c $(CFLAGS) $(LIBS)
		#####################################################################################

		mpi_omp: $(COBJ_OMP)
		$(MPICC) $(OPTIMIZE) $(OPT) -fopenmp -o w-stackingMPI_omp $^ $(CFLAGS) $(LIBS)
		ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_fftw
		endif

		serial_cuda:
		$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
		$(CC) $(OPTIMIZE) $(OPT) -c main.c init.c gridding.c fourier_transform.c result.c $(CFLAGS) $(LIBS)
		$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
		# CUDA acceleration: active when the string CUDACC appears in $(OPT)
		# (e.g. through -D__CUDACC__).
		# Appends "_acc-cuda" to the executable suffix, switches the linker to $(NVCC),
		# and replaces FLAGS and LIBS with the NVIDIA flag/library sets.
		# NOTE(review): LIBS is overwritten here, so any value assigned earlier is lost.
		# The compile_cuda target compiles every *.cu file in the directory with $(NVCC).
		ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_acc-cuda
		LINKER=$(NVCC)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		LIBS=$(NVLIB)
		compile_cuda: $(COBJ_ACC_CUDA)
		$(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB)
		endif

		mpi: $(COBJ)
		$(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS)
		#####################################################################################
		ifeq (RING,$(findstring RING,$(OPT)))
		mpi_new: $(COBJ_RING)
		$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_ring $^ $(CFLAGS) $(LIBS)
		# OpenMP-offload acceleration: active when the string ACCOMP appears in $(OPT).
		# Appends "_acc-omp" to the executable suffix, switches the linker to $(NVC),
		# and replaces FLAGS and LIBS with the NVIDIA flag/library sets.
		# The compile_accomp target passes its prerequisites ($(COBJ_ACC_OMP)) to
		# $(NVC) with -c.
		# NOTE(review): the prerequisites are .o names, so $^ hands object files to a
		# compile-only command; confirm this is intended.
		ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_acc-omp
		LINKER=$(NVC)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		LIBS=$(NVLIB)
		compile_accomp: $(COBJ_ACC_OMP)
		$(NVC) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB)
		endif
		ifeq (REDUCE,$(findstring REDUCE,$(OPT)))
		mpi_new: $(COBJ_RING)
		$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_mpi_reduce $^ $(CFLAGS) $(LIBS)

		# NCCL-based reduce: active when the string NCCL_REDUCE appears in $(OPT).
		# Appends "_acc-reduce" to the executable suffix, switches the linker to
		# $(NVC++), and sets LIBS to $(NVLIB) plus $(NVLIB_3).
		# NOTE(review): the RCCL_REDUCE section defines a target with the same name
		# (compile_accreduce) and the same suffix; enabling both options at once
		# would give that target two recipes.
		ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_acc-reduce
		LINKER=$(NVC++)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		LIBS=$(NVLIB) $(NVLIB_3)
		compile_accreduce: $(COBJ_NCCL_REDUCE)
		$(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3)
		endif
		ifeq (ONE_SIDE,$(findstring ONE_SIDE,$(OPT)))
		mpi_new: $(COBJ_RING)
		$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_mpi_oneside $^ $(CFLAGS) $(LIBS)

		# RCCL-based reduce: active when the string RCCL_REDUCE appears in $(OPT).
		# Appends "_acc-reduce" to the executable suffix, switches the linker to
		# $(NVC++), and sets LIBS to $(NVLIB) plus $(NVLIB_3).
		# The prerequisite list is $(COBJ_RCCL_REDUCE) (gridding_rccl.o); it was
		# previously misspelt as COBJ_RCCL_REDCUE, an undefined variable that expands
		# to nothing, which left the target without prerequisites and $^ empty.
		ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_acc-reduce
		LINKER=$(NVC++)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		LIBS=$(NVLIB) $(NVLIB_3)
		compile_accreduce: $(COBJ_RCCL_REDUCE)
		$(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3)
		endif


		###################################################################################
		#To use the GPUs for the convolution part
		mpi_ring_omp: $(COBJ_RING_OMP)
		$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB)
		$(NVC) $(NVFLAGS) $(OPT) -o w-stackingCfftw_ring_omp phase_correction.o w-stacking_omp.o $^ $(CFLAGS) $(NVLIB) -lmpi $(LIBS)

		mpi_amd_omp: $(COBJ_RING_OMP)
		$(MPICC) $(OPTIMIZE) $(OPT) -c phase_correction.c w-stacking_omp.v1.c $(CFLAGS)
		$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_amd_omp phase_correction.o w-stacking_omp.v1.o $^ $(CFLAGS) $(LIBS)

		mpi_omp_fft: $(COBJ_RING_OMP_FFT)
		$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB)
		$(NVC++) $(NVFLAGS) $(OPT) -c fourier_transform_new.cpp $(CFLAGS) $(NVLIB_2)
		$(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_cufftMp phase_correction.o w-stacking_omp.o fourier_transform_new.o $^ $(CFLAGS) $(NVLIB_2) $(LIBS)

		#Reduce operation with NCCL (OpenMP+CUDA)
		nccl_reduce: $(COBJ_NCCL)
		$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB)
		$(NVC++) $(NVFLAGS) $(OPT) -c gridding_nccl.cpp $(CFLAGS) $(NVLIB_3)
		$(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_nccl phase_correction.o w-stacking_omp.o gridding_nccl.o $^ $(CFLAGS) $(NVLIB_3) $(LIBS)

		#Reduce operation with RCCL AMD (OpenMP)
		rccl_reduce: $(COBJ_NCCL)
		$(MPICC) $(OPTIMIZE) $(OPT) -c phase_correction.c w-stacking_omp.v1.c $(CFLAGS)
		$(MPIC++) $(OPTIMIZE) $(OPT) -c gridding_nccl.cpp $(CFLAGS) $(ROCLIB)
		$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingC_nccl phase_correction.o w-stacking_omp.v1.o gridding_nccl.o $^ $(CFLAGS) $(ROCLIB) $(LIBS)


		#Reduce operation with NCCL (CUDA)
		nccl_reduce_fft: $(COBJ_NCCL_FFT)
		$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.cpp w-stacking.cpp $(CFLAGS) $(NVLIB)
		$(NVC++) $(NVFLAGS) $(OPT) -c gridding_nccl.cpp fourier_transform_new.cpp $(CFLAGS) $(NVLIB_3) $(NVLIB_2)
		$(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_nccl_cu phase_correction.o w-stacking.o gridding_nccl.o $^ $(CFLAGS) $(NVLIB_2) $(NVLIB_3) $(LIBS)

		mpi_ring_cuda: $(COBJ_RING_CUDA)
		$(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB)
		$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_ring_cuda $^ w-stacking.o phase_correction.o $(CFLAGS) $(LIBS) $(NVLIB)

		mpi_cuda:
		$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
		$(MPICC) $(OPTIMIZE) $(OPT) -c main.c init.c fourier_transform.c result.c $(CFLAGS) $(LIBS)
		$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) $(NVLIB)
		# Final link step: produces $(EXEC)$(EXEC_EXT) using $(LINKER), which the
		# acceleration sections above may have redirected to an NVIDIA compiler.
		# The command is silenced with a leading @.
		# NOTE(review): no object files ($^ / $(COBJ)) appear on the link line as
		# shown here; confirm against the real Makefile that the objects are passed
		# to the linker.
		w-stacking: $(COBJ) $(DEPS) Makefile
		@$(LINKER) $(FLAGS) $(OPT) $(FFTWLIBS) $(LIBS) -lmpi -o $(EXEC)$(EXEC_EXT)


		$(COBJ): $(DEPS) Makefile

		%.o: %.c $(DEPS)
		$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)


		clean:
		rm *.o
		rm w-stacking.c
		rm phase_correction.c
		rm -f *.o
		rm -f w-stacking.c
		rm -f phase_correction.c

		# Remove the built executables, the object files and the generated C files.
		# Only $(EXEC) itself and names carrying one of the EXEC_EXT suffixes built
		# in this Makefile (_fftw..., _acc-...) are deleted. A bare "$(EXEC)*" glob
		# must not be used: with EXEC = w-stacking it also matches source files such
		# as w-stacking.cu, w-stacking.h and w-stacking_omp.*.
		.PHONY: cleanall
		cleanall:
		rm -f $(EXEC) $(EXEC)_fftw* $(EXEC)_acc-*
		rm -f *.o
		rm -f w-stacking.c
		rm -f phase_correction.c

allvars.c

+27 −27

Original line number	Diff line number	Diff line
		#include "numa_vars.h"

		MPI_Comm MYMPI_COMM_WORLD;

		#include "allvars.h"


		struct io file;

		struct ip in;
		@@ -8,35 +10,33 @@ struct ip in;
		struct op out, outparam;

		struct meta metaData;
		struct time timing;
		struct parameter param;
		struct fileData data;

		char filename[1000], buf[30], num_buf[30];
		char datapath[900];
		char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN];
		char datapath[LONGNAME_LEN];
		int xaxis, yaxis;
		int rank;
		int size;
		long nsectors;
		long startrow;
		uint nsectors;
		uint startrow;
		double resolution, dx, dw, w_supporth;

		uint **sectorarray = NULL;
		uint *histo_send = NULL;
		int verbose_level = 0;

		timing_t timing_wt;
		double reduce_mpi_time;
		double reduce_shmem_time;

		clock_t start, end, start0, startk, endk;
		struct timespec begin, finish, begin0, begink, finishk;

		long * histo_send, size_of_grid;
		double * grid, gridss, gridss_real, gridss_img, gridss_w;
		int threads_ok, nthreads_fftw;
		uint size_of_grid;
		double grid_pointers = NULL, grid, gridss, gridss_real, gridss_img, gridss_w;

		#ifdef USE_MPI
		MPI_Comm MYMPI_COMM_WORLD;
		MPI_Win slabwin;
		#endif

		#ifdef USE_MPI
		MPI_Request * requests = NULL;
		#endif

		long **sectorarray;

allvars.h

+111 −79

Original line number	Diff line number	Diff line
		/* file to store global variables*/

		#include <stdio.h>
		#if defined(__STDC__)
		# if (__STDC_VERSION__ >= 199901L)
		# define _XOPEN_SOURCE 700
		# endif
		#endif

		#include <stdlib.h>
		#include <stdio.h>
		#include <string.h>
		#ifdef USE_MPI
		#include <math.h>
		#include <unistd.h>
		#include <stdatomic.h>
		#include <mpi.h>
		#ifdef USE_FFTW
		#ifndef CUFFTMP
		#include <fftw3-mpi.h>
		#endif

		#if defined (_OPENMP)
		#include <omp.h>
		#endif



		#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
		#include <fftw3-mpi.h>
		#endif
		#ifdef ACCOMP

		#if defined(ACCOMP)
		#include "w-stacking_omp.h"
		#else
		#include "w-stacking.h"
		#endif
		#ifdef NVIDIA

		#if defined(NVIDIA)
		#include <cuda_runtime.h>
		#endif

		#include "fft.h"
		#include "numa.h"
		#include "timing.h"
		#include "errcodes.h"

		#define PI 3.14159265359
		#define NUM_OF_SECTORS -1
		#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
		#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
		#define NOVERBOSE
		#define NFILES 100
		#include <omp.h>
		#include <math.h>
		#include <time.h>
		#include <unistd.h>

		#define NAME_LEN 50
		#define LONGNAME_LEN 1000


		#define REDUCE_MPI 0
		#define REDUCE_RING 1

		/*
		 * dprintf(LEVEL, T, t, ...): conditional debug print.
		 *
		 * With DEBUG defined, prints the printf-style arguments and flushes stdout
		 * when verbose_level >= LEVEL and either t == -1 or T == t.
		 * Without DEBUG the macro expands to nothing, so its arguments are not
		 * evaluated.
		 *
		 * The condition uses the logical OR operator "||" (it previously appeared
		 * as the escaped sequence "\|\|", which is not valid C).
		 *
		 * NOTE(review): the DEBUG form expands to a bare "if (...) { ... }", so
		 * using it directly before an "else" can bind that else to the macro's if.
		 */
		#if defined(DEBUG)
		#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) &&	\
		( ((t) ==-1 ) || ((T)==(t)) ) ) {				\
		printf(__VA_ARGS__); fflush(stdout); }

		#else
		#define dprintf(...)
		#endif

		/*
		 * Project-wide numeric type aliases.
		 * double_t is always double; float_t is double when DOUBLE_PRECISION is
		 * defined and float otherwise.
		 * NOTE(review): C99 <math.h> also declares float_t and double_t; confirm
		 * these typedefs do not conflict with the system definitions, in
		 * particular when DOUBLE_PRECISION is defined.
		 */
		typedef double double_t;
		#if defined(DOUBLE_PRECISION)
		typedef double float_t;
		#else
		typedef float float_t;
		#endif

		/* Short names for unsigned integer types. */
		typedef unsigned int uint;
		typedef unsigned long long ull;


		extern struct io
		{
		@@ -40,68 +83,60 @@ extern struct io

		extern struct ip
		{
		char ufile[30];
		char vfile[30];
		char wfile[30];
		char weightsfile[30];
		char visrealfile[30];
		char visimgfile[30];
		char metafile[30];
		char paramfile[30];
		char ufile[NAME_LEN];
		char vfile[NAME_LEN];
		char wfile[NAME_LEN];
		char weightsfile[NAME_LEN];
		char visrealfile[NAME_LEN];
		char visimgfile[NAME_LEN];
		char metafile[NAME_LEN];
		char paramfile[NAME_LEN];
		} in;

		extern struct op
		{
		char outfile[30];
		char outfile1[30];
		char outfile2[30];
		char outfile3[30];
		char fftfile[30];
		char fftfile2[30];
		char fftfile3[30];
		char logfile[30];
		char extension[30];
		char timingfile[30];
		char outfile[NAME_LEN];
		char outfile1[NAME_LEN];
		char outfile2[NAME_LEN];
		char outfile3[NAME_LEN];
		char fftfile[NAME_LEN];
		char fftfile2[NAME_LEN];
		char fftfile3[NAME_LEN];
		char logfile[NAME_LEN];
		char extension[NAME_LEN];
		char timingfile[NAME_LEN];

		} out, outparam;

		extern struct meta
		{

		long Nmeasures;
		long Nvis;
		long Nweights;
		long freq_per_chan;
		long polarisations;
		long Ntimes;
		uint Nmeasures;
		uint Nvis;
		uint Nweights;
		uint freq_per_chan;
		uint polarisations;
		uint Ntimes;
		double dt;
		double thours;
		long baselines;
		uint baselines;
		double uvmin;
		double uvmax;
		double wmin;
		double wmax;

		} metaData;


		extern struct time
		{
		double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
		double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1;
		double writetime, writetime1;

		} timing;

		extern struct parameter
		{
		int num_threads;
		int ndatasets;
		char datapath_multi[NFILES][900];
		char datapath_multi[NFILES][LONGNAME_LEN];
		int grid_size_x;
		int grid_size_y;
		int num_w_planes;
		int w_support;
		int reduce_method;
		} param;

		extern struct fileData
		@@ -115,26 +150,23 @@ extern struct fileData
		}data;


		extern char filename[1000], buf[30], num_buf[30];
		extern char datapath[900];
		extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN];
		extern char datapath[LONGNAME_LEN];
		extern int xaxis, yaxis;
		extern int rank;
		extern int size;
		extern long nsectors;
		extern long startrow;
		extern double resolution, dx, dw, w_supporth;
		extern uint nsectors;
		extern uint startrow;
		extern double_t resolution, dx, dw, w_supporth;

		extern clock_t start, end, start0, startk, endk;
		extern struct timespec begin, finish, begin0, begink, finishk;
		extern long * histo_send, size_of_grid;
		extern double * grid, gridss, gridss_real, gridss_img, gridss_w;
		extern uint **sectorarray;
		extern uint *histo_send;
		extern int verbose_level;

		#ifdef USE_MPI
		extern MPI_Win slabwin;
		#endif

		extern long **sectorarray;
		extern uint size_of_grid;
		extern double_t grid_pointers, grid, gridss, gridss_real, gridss_img, gridss_w;

		extern MPI_Comm MYMPI_COMM_WORLD;
		extern MPI_Win slabwin;

		#ifdef HYBRID_FFTW
		extern int thread_level;
		#endif

data/newgauss2noconj_t201806301100_SBL180.binMS.tar

deleted100644 → 0

−95.9 MiB

File deleted.

View file

errcodes.h

0 → 100644

+6 −0

Original line number	Diff line number	Diff line

		/*
		 * Numeric error codes, one per failure condition named by the macro.
		 * NOTE(review): presumably used as process exit / abort codes; how they
		 * are consumed is not visible in this header - confirm against the callers.
		 */
		#define NO_THREADS_SUPPORT 1
		#define ERR_IN_PARAMFILE 2
		#define NOT_ENOUGH_MEM_STACKING 3
		#define ERR_REDUCE 4
		#define NO_ACCELERATORS_FOUND 255