Commit 783e2134 authored by Luca Tornatore's avatar Luca Tornatore
Browse files

first commit

parent 79d0aeaf
Loading
Loading
Loading
Loading
+125 −110
Original line number Diff line number Diff line
# comment/uncomment the various options depending on how you want to build the program
# Set default values for compiler options if no systype options are given or found

EXEC = w-stacking
EXEC_EXT :=

MPICC     = mpicc
MPICXX    = mpiCC
OPTIMIZE  = -fopenmp -O3 -march=native 
@@ -13,52 +17,77 @@ else
include Build/Makefile.systype
endif

LINKER=$(MPICC)

FFTW_MPI_INC = -I/home/giacopo/Library_fftw/include
FFTW_MPI_LIB = -L/home/giacopo/Library_fftw/lib

CFLAGS += $(FFTW_MPI_INC) -I/proto.h 
LIBS = $(FFTW_MPI_LIB) -lfftw3_mpi -lfftw3 -lm #-lcudart  -lcuda
CFLAGS += -I./
FFTWLIBS =

# ========================================================
# CODE OPTIONS
#

# create MPI code
OPT += -DUSE_MPI
OPT += -DACCOMP
# use FFTW (it can be switched on ONLY if MPI is active)
# FFTW support is enabled only when OPT contains USE_MPI; in that case LIBS
# is replaced with the FFTW link line (OpenMP + MPI + serial FFTW, libm).
# The assignment is kept at column 0: a tab-indented assignment is read by
# make as a recipe line whenever it happens to follow a rule.
ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
OPT += -DUSE_FFTW
LIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm
endif

#OPT += -DNVIDIA
#use cuda for GPUs
#OPT += -D__CUDACC__
# perform one-side communication (suggested) instead of reduce (only if MPI is active)
#OPT += -DONE_SIDE
OPT += -DNCCL_REDUCE
#OPT += -DREDUCE
#perform the hybrid MPI-OpenMP FFTW
#OPT += -DHYBRID_FFTW
#OPT  += -DCUFFTMP
#OPT += -DRING
#perform the debugging in the ring implementation
#OPT += -DDEBUG
# use omp-ized version of fftw routines
OPT += -DHYBRID_FFTW

# write the full 3D cube of gridded visibilities and its FFT transform
#OPT += -DWRITE_DATA

# write the final image
OPT += -DWRITE_IMAGE

# perform w-stacking phase correction
OPT += -DPHASE_ON

# ========================================================
# ACCELERATION
#

#OPT += -DNVIDIA

#use cuda for GPUs
#OPT += -D__CUDACC__

# use GPU acceleration via OMP
#OPT += -DACCOMP

# use NVIDIA GPU to perform the reduce
#OPT += -DNCCL_REDUCE

# use AMD GPU to perform the reduce
#OPT += -DRCCL_REDUCE

# use GPU to perform FFT
#OPT += -DCUFFTMP

#perform the debugging in the ring implementation
#OPT += -DDEBUG

# ========================================================


DEPS = w-stacking.h  main.c phase_correction.cu allvars.h
COBJ = allvars.o main.o init.o gridding.o gridding_std.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o

DEPS = w-stacking.h main.c w-stacking.cu phase_correction.cu allvars.h init.c gridding.c fourier_transform.c result.c
COBJ = w-stacking.o main.o phase_correction.o allvars.o init.o gridding.o fourier_transform.o result.o
COBJ_OMP = w-stacking_omp.o main.o phase_correction.o allvars.o init.o gridding.o fourier_transform.o result.o
#use the shared-memory reduce implemented in gridding_ring.c
COBJ_RING = w-stacking.o main.o phase_correction.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o 
COBJ_RING_OMP = main.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o
COBJ_RING_CUDA = main.o allvars.o init.o numa.o reduce.o gridding_ring.o fourier_transform.o result.o
COBJ_RING_OMP_FFT = main.o allvars.o init.o gridding_ring.o result.o numa.o reduce.o
COBJ_NCCL = main.o allvars.o init.o numa.o fourier_transform.o result.o
COBJ_NCCL_FFT = main.o allvars.o init.o numa.o result.o
DEPS_ACC_CUDA = w-stacking.h w-stacking.cu
COBJ_ACC_CUDA = phase_correction.o w-stacking.o

DEPS_ACC_OMP = w-stacking_omp.h
COBJ_ACC_OMP = phase_correction.o w-stacking_omp.o

COBJ_NCCL_REDUCE = gridding_nccl.o

COBJ_RCCL_REDUCE = gridding_rccl.o

ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT)))
CFLAGS += $(FFTW_MPI_INC)
FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm
endif

ifneq (CUDACC,$(findstring CUDACC,$(OPT)))
w-stacking.c: w-stacking.cu
@@ -66,92 +95,78 @@ w-stacking.c: w-stacking.cu

phase_correction.c: phase_correction.cu
	cp phase_correction.cu phase_correction.c
endif

ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
%.o: %.c $(DEPS)
	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
else
%.o: %.c $(DEPS)
	$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
# Stub out w-stacking.c: remove any previously generated copy, then leave an
# empty file with that name. The removal used to target the misspelled name
# "w-stacking.cun", so a stale w-stacking.c was never deleted and touch only
# refreshed its timestamp.
w-stacking.c: w-stacking.cu
	rm -f w-stacking.c
	touch w-stacking.c
# Stub out phase_correction.c: remove any existing copy, then create an
# empty file with that name (re-run whenever phase_correction.cu is newer).
phase_correction.c: phase_correction.cu
	rm -f phase_correction.c
	touch phase_correction.c
endif

serial: $(COBJ)
	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial  $^ $(LIBS)

serial_omp: phase_correction.c
	$(CC)  $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial main.c init.c gridding.c fourier_transform.c result.c w-stacking_omp.c    $(CFLAGS) $(LIBS)

simple_mpi: phase_correction.c
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c main.c init.c gridding.c fourier_transform.c result.c phase_correction.c  $(CFLAGS) $(LIBS)
#####################################################################################

mpi_omp: $(COBJ_OMP)
	$(MPICC) $(OPTIMIZE) $(OPT) -fopenmp -o w-stackingMPI_omp $^ $(CFLAGS) $(LIBS)
ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_fftw
endif

serial_cuda:
	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
	$(CC)  $(OPTIMIZE) $(OPT) -c main.c init.c gridding.c fourier_transform.c result.c $(CFLAGS) $(LIBS)
	$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-cuda
LINKER=$(NVCC)
FLAGS=$(NVFLAGS) $(CFLAGS)
LIBS=$(NVLIB)
compile_cuda: $(COBJ_ACC_CUDA)
	$(NVCC) $(OPT) $(NVFLAGS) -c *.cu $(NVLIB)
endif

mpi: $(COBJ)
	$(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^  $(CFLAGS) $(LIBS)
#####################################################################################
ifeq (RING,$(findstring RING,$(OPT)))
mpi_new: $(COBJ_RING)
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_ring $^  $(CFLAGS) $(LIBS)
ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-omp
LINKER=$(NVC)
FLAGS=$(NVFLAGS) $(CFLAGS)
LIBS=$(NVLIB)
compile_accomp: $(COBJ_ACC_OMP)
	$(NVC) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB)
endif
ifeq (REDUCE,$(findstring REDUCE,$(OPT)))
mpi_new: $(COBJ_RING)
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_mpi_reduce $^  $(CFLAGS) $(LIBS)

ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(NVC++)
FLAGS=$(NVFLAGS) $(CFLAGS)
LIBS=$(NVLIB) $(NVLIB_3)
compile_accreduce: $(COBJ_NCCL_REDUCE)
	$(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3)
endif
ifeq (ONE_SIDE,$(findstring ONE_SIDE,$(OPT)))
mpi_new: $(COBJ_RING)
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_mpi_oneside $^  $(CFLAGS) $(LIBS)

# RCCL reduce (AMD GPU): selected when OPT contains RCCL_REDUCE.
# Tags the executable name, switches the link step to $(NVC++) and adds
# $(NVLIB_3) to the link libraries.
ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(NVC++)
FLAGS=$(NVFLAGS) $(CFLAGS)
LIBS=$(NVLIB) $(NVLIB_3)
# The prerequisite list was spelled COBJ_RCCL_REDCUE, an undefined variable
# that expands to nothing and leaves $^ empty; COBJ_RCCL_REDUCE is the
# variable actually defined for this variant.
compile_accreduce: $(COBJ_RCCL_REDUCE)
	$(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3)
endif


###################################################################################
#To use the GPUs for the convolution part
mpi_ring_omp: $(COBJ_RING_OMP)
	$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB)
	$(NVC) $(NVFLAGS) $(OPT) -o w-stackingCfftw_ring_omp phase_correction.o w-stacking_omp.o $^ $(CFLAGS) $(NVLIB) -lmpi $(LIBS)

mpi_amd_omp: $(COBJ_RING_OMP)
	$(MPICC) $(OPTIMIZE) $(OPT) -c phase_correction.c w-stacking_omp.v1.c $(CFLAGS) 
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_amd_omp phase_correction.o w-stacking_omp.v1.o $^ $(CFLAGS) $(LIBS)

mpi_omp_fft: $(COBJ_RING_OMP_FFT)
	$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB)
	$(NVC++)   $(NVFLAGS) $(OPT)  -c fourier_transform_new.cpp $(CFLAGS) $(NVLIB_2) 
	$(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_cufftMp phase_correction.o w-stacking_omp.o fourier_transform_new.o $^ $(CFLAGS) $(NVLIB_2) $(LIBS)

#Reduce operation with NCCL (OpenMP+CUDA)
nccl_reduce: $(COBJ_NCCL)
	$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB)
	$(NVC++) $(NVFLAGS) $(OPT) -c gridding_nccl.cpp $(CFLAGS) $(NVLIB_3)
	$(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_nccl phase_correction.o w-stacking_omp.o gridding_nccl.o $^ $(CFLAGS) $(NVLIB_3) $(LIBS)

#Reduce operation with RCCL AMD (OpenMP)
rccl_reduce: $(COBJ_NCCL)
	$(MPICC) $(OPTIMIZE) $(OPT) -c phase_correction.c w-stacking_omp.v1.c $(CFLAGS) 
	$(MPIC++) $(OPTIMIZE) $(OPT) -c gridding_nccl.cpp $(CFLAGS) $(ROCLIB)
	$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingC_nccl phase_correction.o w-stacking_omp.v1.o gridding_nccl.o $^ $(CFLAGS) $(ROCLIB) $(LIBS)


#Reduce operation with NCCL (CUDA)
nccl_reduce_fft: $(COBJ_NCCL_FFT)
	$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.cpp w-stacking.cpp $(CFLAGS) $(NVLIB)
	$(NVC++) $(NVFLAGS) $(OPT) -c gridding_nccl.cpp fourier_transform_new.cpp $(CFLAGS) $(NVLIB_3) $(NVLIB_2)
	$(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_nccl_cu phase_correction.o w-stacking.o gridding_nccl.o $^ $(CFLAGS) $(NVLIB_2) $(NVLIB_3) $(LIBS)

mpi_ring_cuda: $(COBJ_RING_CUDA)
	$(NVCC)   $(OPT) $(NVFLAGS) -c *.cu $(NVLIB)
	$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_ring_cuda $^ w-stacking.o phase_correction.o $(CFLAGS) $(LIBS) $(NVLIB)

mpi_cuda:
	$(NVCC)   $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
	$(MPICC)  $(OPTIMIZE) $(OPT) -c main.c init.c fourier_transform.c result.c $(CFLAGS) $(LIBS)
	$(MPICC) $(OPTIMIZE) $(OPT)   -o w-stackingfftw w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) $(NVLIB)
# Link the final executable $(EXEC)$(EXEC_EXT).
# The link command previously named no object files at all. Pass the .o
# prerequisites (filtered out of $^, which also lists sources, headers and
# the Makefile) ahead of the libraries so that their symbols get resolved.
w-stacking: $(COBJ) $(DEPS) Makefile
	@$(LINKER) $(FLAGS) $(OPT) $(filter %.o,$^) $(FFTWLIBS) $(LIBS) -lmpi -o $(EXEC)$(EXEC_EXT)


$(COBJ): $(DEPS) Makefile

%.o: %.c $(DEPS)
	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)


# Remove object files and the C sources generated from the .cu files.
# Every removal uses -f so that a missing file does not abort the target;
# the unguarded "rm" duplicates failed on an already-clean tree.
.PHONY: clean
clean:
	rm -f *.o
	rm -f w-stacking.c
	rm -f phase_correction.c

# Like clean, but also removes the built executables, i.e. the files whose
# name starts with $(EXEC) whatever EXEC_EXT suffix they carry.
# Source files share that prefix (w-stacking.h, w-stacking.cu,
# w-stacking_omp.*), so a bare "$(EXEC)*" glob would delete them too:
# anything with a source extension is filtered out of the match.
.PHONY: cleanall
cleanall:
	rm -f $(filter-out %.c %.h %.cu %.cpp,$(wildcard $(EXEC)*))
	rm -f *.o
	rm -f w-stacking.c
	rm -f phase_correction.c
+27 −27
Original line number Diff line number Diff line
#include "numa_vars.h"

MPI_Comm MYMPI_COMM_WORLD; 

#include "allvars.h"


struct io file;

struct ip in;
@@ -8,35 +10,33 @@ struct ip in;
struct op out, outparam;

struct meta      metaData;
struct time timing;
struct parameter param;
struct fileData  data;

char filename[1000], buf[30], num_buf[30];
char datapath[900];
char   filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN];
char   datapath[LONGNAME_LEN];
int    xaxis, yaxis;
int    rank;
int    size;
long nsectors;
long startrow;
uint   nsectors;
uint   startrow;
double resolution, dx, dw, w_supporth;

uint **sectorarray = NULL;
uint  *histo_send  = NULL;
int    verbose_level = 0; 

timing_t timing_wt;
double   reduce_mpi_time;
double   reduce_shmem_time;

clock_t start, end, start0, startk, endk;
struct timespec begin, finish, begin0, begink, finishk;

long * histo_send, size_of_grid;
double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w;
int threads_ok, nthreads_fftw;
uint     size_of_grid;
double   *grid_pointers = NULL, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w;

#ifdef USE_MPI
MPI_Comm      MYMPI_COMM_WORLD; 
MPI_Win       slabwin;
#endif

#ifdef USE_MPI
MPI_Request * requests = NULL;
#endif

long **sectorarray;

+111 −79
Original line number Diff line number Diff line
/* file to store global variables*/

#include <stdio.h>
#if defined(__STDC__)
#  if (__STDC_VERSION__ >= 199901L)
#     define _XOPEN_SOURCE 700
#  endif
#endif

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#ifdef USE_MPI
#include <math.h>
#include <unistd.h>
#include <stdatomic.h>
#include <mpi.h>
#ifdef USE_FFTW
#ifndef CUFFTMP
#include <fftw3-mpi.h>
#endif

#if defined (_OPENMP)
#include <omp.h>
#endif



#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
#include <fftw3-mpi.h>
#endif
#ifdef ACCOMP

#if defined(ACCOMP)               
#include "w-stacking_omp.h"
#else
#include "w-stacking.h"
#endif 
#ifdef NVIDIA

#if defined(NVIDIA)
#include <cuda_runtime.h>
#endif

#include "fft.h"
#include "numa.h"
#include "timing.h"
#include "errcodes.h"

#define PI 3.14159265359
#define NUM_OF_SECTORS -1
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
#define NOVERBOSE
#define NFILES 100
#include <omp.h>
#include <math.h>
#include <time.h>
#include <unistd.h>

#define NAME_LEN 50
#define LONGNAME_LEN 1000


#define REDUCE_MPI  0
#define REDUCE_RING 1

#if defined(DEBUG)
/* Debug print, active only when DEBUG is defined.
 * Calls printf with the trailing arguments and flushes stdout when
 * verbose_level >= LEVEL and either t == -1 or T == t.
 * NOTE(review): expands to a bare `if (...) { ... }` statement, so take care
 * when using it directly in front of an `else`. */
#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) &&	\
				      ( ((t) ==-1 ) || ((T)==(t)) ) ) {	\
    printf(__VA_ARGS__); fflush(stdout); }

#else
/* Without DEBUG every dprintf call expands to nothing. */
#define dprintf(...)
#endif

typedef double double_t;
#if defined(DOUBLE_PRECISION)
typedef double float_t;
#else
typedef float float_t;
#endif

typedef unsigned int       uint;
typedef unsigned long long ull;


extern struct io
{
@@ -40,68 +83,60 @@ extern struct io

extern struct ip
{
	char ufile[30];
  	char vfile[30];
  	char wfile[30];
  	char weightsfile[30];
  	char visrealfile[30];
  	char visimgfile[30];
  	char metafile[30];
        char paramfile[30];
	char ufile[NAME_LEN];
  	char vfile[NAME_LEN];
  	char wfile[NAME_LEN];
  	char weightsfile[NAME_LEN];
  	char visrealfile[NAME_LEN];
  	char visimgfile[NAME_LEN];
  	char metafile[NAME_LEN];
        char paramfile[NAME_LEN];
} in;

extern struct op
{
	char outfile[30];
        char outfile1[30];
        char outfile2[30];
        char outfile3[30];
        char fftfile[30];
        char fftfile2[30];
        char fftfile3[30];
        char logfile[30];
        char extension[30];
        char timingfile[30];
	char outfile[NAME_LEN];
        char outfile1[NAME_LEN];
        char outfile2[NAME_LEN];
        char outfile3[NAME_LEN];
        char fftfile[NAME_LEN];
        char fftfile2[NAME_LEN];
        char fftfile3[NAME_LEN];
        char logfile[NAME_LEN];
        char extension[NAME_LEN];
        char timingfile[NAME_LEN];

} out, outparam;

extern struct meta
{

	long Nmeasures;
        long Nvis;
        long Nweights;
        long freq_per_chan;
        long polarisations;
        long Ntimes;
  uint   Nmeasures;
  uint   Nvis;
  uint   Nweights;
  uint   freq_per_chan;
  uint   polarisations;
  uint   Ntimes;
  double dt;
  double thours;
        long baselines;
  uint   baselines;
  double uvmin;
  double uvmax;
  double wmin;
  double wmax;

} metaData;


extern struct time
{
   	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
	double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1;
	double writetime, writetime1;

} timing;

extern struct parameter
{
  int  num_threads;
  int  ndatasets;
        char datapath_multi[NFILES][900];
  char datapath_multi[NFILES][LONGNAME_LEN];
  int  grid_size_x;
  int  grid_size_y;
  int  num_w_planes;
  int  w_support;
  int  reduce_method;
} param;

extern struct fileData
@@ -115,26 +150,23 @@ extern struct fileData
}data;


extern char filename[1000], buf[30], num_buf[30];
extern char datapath[900];
extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN];
extern char datapath[LONGNAME_LEN];
extern int  xaxis, yaxis;
extern int  rank;
extern int  size;
extern long nsectors;
extern long startrow;
extern double resolution, dx, dw, w_supporth;
extern uint nsectors;
extern uint startrow;
extern double_t resolution, dx, dw, w_supporth;

extern clock_t start, end, start0, startk, endk;
extern struct timespec begin, finish, begin0, begink, finishk;
extern long * histo_send, size_of_grid;
extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w;
extern uint **sectorarray;
extern uint  *histo_send;
extern int    verbose_level; 

#ifdef USE_MPI
    extern  MPI_Win slabwin;
#endif

extern long **sectorarray;
extern uint    size_of_grid;
extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w;

extern MPI_Comm MYMPI_COMM_WORLD;
extern MPI_Win  slabwin;
#ifdef HYBRID_FFTW
extern int thread_level;
#endif
−95.9 MiB

File deleted.

errcodes.h

0 → 100644
+6 −0
Original line number Diff line number Diff line

/* Numeric error codes.
 * NOTE(review): presumably used as exit / abort codes by the callers, which
 * are not visible in this file - confirm before relying on it. */
#define NO_THREADS_SUPPORT        1
#define ERR_IN_PARAMFILE          2
#define NOT_ENOUGH_MEM_STACKING   3
#define ERR_REDUCE                4
#define NO_ACCELERATORS_FOUND     255
Loading