# comment/uncomment the various options depending on how you want to build the program
# Default compiler and flag settings. They are assigned before the
# Build/Makefile.* include below, so that included file can override them.
#   MPICC / MPICXX : MPI wrapper compilers for C and C++
#   OPTIMIZE       : optimisation flags passed to the compile and link recipes
#   MPICHLIB, SWITCHES : empty by default
# NOTE(review): MPICXX, MPICHLIB and SWITCHES are not referenced anywhere else
# in this file (the C++ recipes use $(MPIC++)); presumably they are consumed by
# the included systype file — verify.
MPICC        = mpicc
MPICXX       = mpiCC
OPTIMIZE  = -fopenmp -O3 -march=native 
MPICHLIB  = 
SWITCHES =

# Pull in the system-specific settings.
# When SYSTYPE is set (e.g. `make SYSTYPE=<name>`), Build/Makefile.<name> is
# included; otherwise the generic Build/Makefile.systype is used.
ifdef SYSTYPE
# Re-assign with := so SYSTYPE becomes a simply-expanded variable.
SYSTYPE := $(SYSTYPE)
include Build/Makefile.$(SYSTYPE)
else
include Build/Makefile.systype
endif

# Location of the FFTW installation (headers and libraries).
# NOTE(review): these are hard-coded to one user's home directory and, being
# assigned after the include above, replace any FFTW_MPI_INC / FFTW_MPI_LIB
# value the included systype file may have set — confirm this is intended.
FFTW_MPI_INC = -I/home/giacopo/Library_fftw/include
FFTW_MPI_LIB = -L/home/giacopo/Library_fftw/lib

# Add the FFTW include path to the compiler flags.
# NOTE(review): -I expects a directory, but /proto.h looks like a header file
# name at the filesystem root — presumably a leftover; verify.
CFLAGS += $(FFTW_MPI_INC) -I/proto.h 
# Default link libraries; this value is replaced further down when USE_MPI is
# found in OPT.
LIBS = $(FFTW_MPI_LIB) -lfftw3_mpi -lfftw3 -lm #-lcudart  -lcuda

# Build the MPI version of the code.
OPT += -DUSE_MPI
OPT += -DACCOMP
# Use FFTW (it can be switched on ONLY if MPI is active).
# When USE_MPI is present in OPT, enable USE_FFTW and replace LIBS with the
# set that also links the OpenMP-enabled FFTW library.
# The assignments are indented with spaces, never a tab: a tab-led line is
# treated as a recipe line whenever a rule precedes it.
ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
   OPT += -DUSE_FFTW
   LIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm
endif

# Optional feature switches: uncomment a line to add that define to OPT.
#OPT += -DNVIDIA
# use CUDA for GPUs
#OPT += -D__CUDACC__
# perform one-sided communication (suggested) instead of reduce (only if MPI is active)
#OPT += -DONE_SIDE
# perform the reduce operation with NCCL
OPT += -DNCCL_REDUCE
#OPT += -DREDUCE
# perform the hybrid MPI-OpenMP FFTW
#OPT += -DHYBRID_FFTW
#OPT  += -DCUFFTMP
#OPT += -DRING
# enable debugging in the ring implementation
#OPT += -DDEBUG
# write the full 3D cube of gridded visibilities and its FFT transform
#OPT += -DWRITE_DATA
# write the final image
OPT += -DWRITE_IMAGE
# perform the w-stacking phase correction
OPT += -DPHASE_ON


# Files every object depends on (see the %.o pattern rule): touching any of
# them causes all objects to be rebuilt.
DEPS = w-stacking.h allvars.h \
       main.c w-stacking.cu phase_correction.cu \
       init.c gridding.c fourier_transform.c result.c

# Object sets, one per build flavour.
COBJ = w-stacking.o main.o phase_correction.o allvars.o \
       init.o gridding.o fourier_transform.o result.o
COBJ_OMP = w-stacking_omp.o main.o phase_correction.o allvars.o \
           init.o gridding.o fourier_transform.o result.o
# Sets that use the shared-memory reduce implemented in gridding_ring.c
COBJ_RING = w-stacking.o main.o phase_correction.o allvars.o \
            init.o numa.o reduce.o gridding_ring.o \
            fourier_transform.o result.o
COBJ_RING_OMP = main.o allvars.o init.o numa.o reduce.o \
                gridding_ring.o fourier_transform.o result.o
COBJ_RING_CUDA = main.o allvars.o init.o numa.o reduce.o \
                 gridding_ring.o fourier_transform.o result.o
COBJ_RING_OMP_FFT = main.o allvars.o init.o gridding_ring.o \
                    result.o numa.o reduce.o
# Sets used by the NCCL/RCCL targets
COBJ_NCCL = main.o allvars.o init.o numa.o \
            fourier_transform.o result.o
COBJ_NCCL_FFT = main.o allvars.o init.o numa.o result.o

# When OPT does not carry a CUDACC define, the two .cu sources are built as
# plain C: each one is copied to a .c file of the same base name.
ifneq (CUDACC,$(findstring CUDACC,$(OPT)))
w-stacking.c phase_correction.c: %.c: %.cu
	cp $< $@
endif

# Compile one C source into its object file. The MPI wrapper compiler is used
# when OPT contains USE_MPI, the plain C compiler otherwise. Each object also
# depends on every file listed in DEPS.
%.o: %.c $(DEPS)
	$(if $(findstring USE_MPI,$(OPT)),$(MPICC),$(CC)) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)

# Link the COBJ objects with the plain C compiler into w-stackingCfftw_serial.
serial: $(COBJ)
	$(CC) $(OPTIMIZE) $(OPT) \
		-o w-stackingCfftw_serial \
		$^ $(LIBS)

# Build w-stackingOMP_serial straight from the C sources with the plain C
# compiler. phase_correction.c is a prerequisite so that it is generated first;
# it is also passed to the compiler (as simple_mpi does) so that the code it
# contains ends up in the executable.
serial_omp: phase_correction.c
	$(CC)  $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial main.c init.c gridding.c fourier_transform.c result.c w-stacking_omp.c phase_correction.c $(CFLAGS) $(LIBS)

# Build w-stackingMPI_simple straight from the C sources with the MPI wrapper
# compiler; $< is the prerequisite phase_correction.c.
simple_mpi: phase_correction.c
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple \
		w-stacking_omp.c main.c init.c gridding.c fourier_transform.c result.c $< \
		$(CFLAGS) $(LIBS)

# Link the COBJ_OMP objects with the MPI wrapper compiler and -fopenmp into
# w-stackingMPI_omp.
mpi_omp: $(COBJ_OMP)
	$(MPICC) $(OPTIMIZE) $(OPT) -fopenmp \
		-o w-stackingMPI_omp \
		$^ $(CFLAGS) $(LIBS)

# Serial CUDA build, in three steps:
#   1. compile the two .cu sources with $(NVCC)
#   2. compile the host C sources with $(CC)
#   3. link with $(CXX) into w-stackingfftw_serial
# NVCC, NVFLAGS and NVLIB are not defined in this file; presumably they come
# from the included Build/Makefile.* — verify.
# NOTE(review): the link line references w-stacking-fftw.o, which no rule or
# recipe line in this file produces, and it does not list the objects compiled
# in step 2 — this target looks stale; confirm before relying on it.
serial_cuda:
	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
	$(CC)  $(OPTIMIZE) $(OPT) -c main.c init.c gridding.c fourier_transform.c result.c $(CFLAGS) $(LIBS)
	$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm

# Link the COBJ objects with the MPI wrapper compiler into w-stackingCfftw.
mpi: $(COBJ)
	$(MPICC) $(OPTIMIZE) \
		-o w-stackingCfftw \
		$^  $(CFLAGS) $(LIBS)
#####################################################################################
# mpi_new links the COBJ_RING objects; the executable name depends on which
# communication scheme is enabled in OPT (-DONE_SIDE, -DREDUCE or -DRING).
#
# The define is matched as a whole word with $(filter ...): a substring search
# for REDUCE would also match -DNCCL_REDUCE and select the wrong variant.
# The branches form a single if/else chain so the target is defined exactly
# once; when several schemes are enabled, ONE_SIDE takes precedence over
# REDUCE, which takes precedence over RING.
ifneq (,$(filter -DONE_SIDE,$(OPT)))
mpi_new: $(COBJ_RING)
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_mpi_oneside $^  $(CFLAGS) $(LIBS)
else ifneq (,$(filter -DREDUCE,$(OPT)))
mpi_new: $(COBJ_RING)
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_mpi_reduce $^  $(CFLAGS) $(LIBS)
else ifneq (,$(filter -DRING,$(OPT)))
mpi_new: $(COBJ_RING)
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_ring $^  $(CFLAGS) $(LIBS)
else
# None of the three schemes is enabled: fail with an explanation instead of
# make's generic "No rule to make target" message.
mpi_new:
	@echo "mpi_new: enable one of -DRING, -DREDUCE or -DONE_SIDE in OPT" >&2; exit 1
endif
###################################################################################
# To use the GPUs for the convolution part.
# Compiles phase_correction.c and w-stacking_omp.c with $(NVC), then links them
# with the COBJ_RING_OMP objects into w-stackingCfftw_ring_omp.
# phase_correction.c is a generated file (copied from the .cu source), so it is
# listed as an order-only prerequisite: it gets created before the recipe runs
# but is kept out of $^ and therefore off the link line.
mpi_ring_omp: $(COBJ_RING_OMP) | phase_correction.c
	$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB)
	$(NVC) $(NVFLAGS) $(OPT) -o w-stackingCfftw_ring_omp phase_correction.o w-stacking_omp.o $^ $(CFLAGS) $(NVLIB) -lmpi $(LIBS)

# Compiles phase_correction.c and w-stacking_omp.v1.c with the MPI wrapper
# compiler, then links them with the COBJ_RING_OMP objects into
# w-stackingCfftw_amd_omp.
# phase_correction.c is a generated file (copied from the .cu source), so it is
# listed as an order-only prerequisite: it gets created before the recipe runs
# but is kept out of $^ and therefore off the link line.
mpi_amd_omp: $(COBJ_RING_OMP) | phase_correction.c
	$(MPICC) $(OPTIMIZE) $(OPT) -c phase_correction.c w-stacking_omp.v1.c $(CFLAGS) 
	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_amd_omp phase_correction.o w-stacking_omp.v1.o $^ $(CFLAGS) $(LIBS)

# Compiles phase_correction.c and w-stacking_omp.c with $(NVC) and
# fourier_transform_new.cpp with $(NVC++), then links them with the
# COBJ_RING_OMP_FFT objects into w-stackingC_cufftMp.
# phase_correction.c is a generated file (copied from the .cu source), so it is
# listed as an order-only prerequisite: it gets created before the recipe runs
# but is kept out of $^ and therefore off the link line.
mpi_omp_fft: $(COBJ_RING_OMP_FFT) | phase_correction.c
	$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB)
	$(NVC++)   $(NVFLAGS) $(OPT)  -c fourier_transform_new.cpp $(CFLAGS) $(NVLIB_2) 
	$(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_cufftMp phase_correction.o w-stacking_omp.o fourier_transform_new.o $^ $(CFLAGS) $(NVLIB_2) $(LIBS)

# Reduce operation with NCCL (OpenMP+CUDA).
# Compiles phase_correction.c and w-stacking_omp.c with $(NVC) and
# gridding_nccl.cpp with $(NVC++), then links them with the COBJ_NCCL objects
# into w-stackingC_nccl.
# phase_correction.c is a generated file (copied from the .cu source), so it is
# listed as an order-only prerequisite: it gets created before the recipe runs
# but is kept out of $^ and therefore off the link line.
nccl_reduce: $(COBJ_NCCL) | phase_correction.c
	$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.c w-stacking_omp.c $(CFLAGS) $(NVLIB)
	$(NVC++) $(NVFLAGS) $(OPT) -c gridding_nccl.cpp $(CFLAGS) $(NVLIB_3)
	$(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_nccl phase_correction.o w-stacking_omp.o gridding_nccl.o $^ $(CFLAGS) $(NVLIB_3) $(LIBS)

# Reduce operation with RCCL AMD (OpenMP).
# Compiles phase_correction.c and w-stacking_omp.v1.c with $(MPICC) and
# gridding_nccl.cpp with $(MPIC++), then links them with the COBJ_NCCL objects.
# The executable is named w-stackingC_nccl, the same name nccl_reduce produces.
# phase_correction.c is a generated file (copied from the .cu source), so it is
# listed as an order-only prerequisite: it gets created before the recipe runs
# but is kept out of $^ and therefore off the link line.
# NOTE(review): MPIC++ is not defined in this file (only MPICXX is);
# presumably it is set by the included Build/Makefile.* — verify.
rccl_reduce: $(COBJ_NCCL) | phase_correction.c
	$(MPICC) $(OPTIMIZE) $(OPT) -c phase_correction.c w-stacking_omp.v1.c $(CFLAGS) 
	$(MPIC++) $(OPTIMIZE) $(OPT) -c gridding_nccl.cpp $(CFLAGS) $(ROCLIB)
	$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingC_nccl phase_correction.o w-stacking_omp.v1.o gridding_nccl.o $^ $(CFLAGS) $(ROCLIB) $(LIBS)


# Reduce operation with NCCL (CUDA).
# Compiles phase_correction.cpp and w-stacking.cpp with $(NVC), and
# gridding_nccl.cpp and fourier_transform_new.cpp with $(NVC++), then links
# everything with the COBJ_NCCL_FFT objects into w-stackingC_nccl_cu.
# fourier_transform_new.o is listed on the link line: COBJ_NCCL_FFT contains no
# Fourier-transform object, so without it the object compiled in the second
# step would never reach the executable.
nccl_reduce_fft: $(COBJ_NCCL_FFT)
	$(NVC) $(NVFLAGS) $(OPT) -c phase_correction.cpp w-stacking.cpp $(CFLAGS) $(NVLIB)
	$(NVC++) $(NVFLAGS) $(OPT) -c gridding_nccl.cpp fourier_transform_new.cpp $(CFLAGS) $(NVLIB_3) $(NVLIB_2)
	$(NVC++) $(NVFLAGS) $(OPT) -o w-stackingC_nccl_cu phase_correction.o w-stacking.o gridding_nccl.o fourier_transform_new.o $^ $(CFLAGS) $(NVLIB_2) $(NVLIB_3) $(LIBS)

# Compile every .cu file in the directory with $(NVCC), then link the resulting
# w-stacking.o and phase_correction.o with the COBJ_RING_CUDA objects into
# w-stackingCfftw_ring_cuda.
mpi_ring_cuda: $(COBJ_RING_CUDA)
	$(NVCC)   $(OPT) $(NVFLAGS) -c *.cu $(NVLIB)
	$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_ring_cuda \
		$^ w-stacking.o phase_correction.o \
		$(CFLAGS) $(LIBS) $(NVLIB)

# MPI + CUDA build, in three steps:
#   1. compile the two .cu sources with $(NVCC)
#   2. compile the host C sources with $(MPICC)
#   3. link with $(MPICC) into w-stackingfftw
# NOTE(review): the link line lists only w-stacking.o and phase_correction.o;
# the objects compiled in step 2 (main.o, init.o, fourier_transform.o,
# result.o) are not on it — this target looks stale; confirm before relying
# on it.
mpi_cuda:
	$(NVCC)   $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
	$(MPICC)  $(OPTIMIZE) $(OPT) -c main.c init.c fourier_transform.c result.c $(CFLAGS) $(LIBS)
	$(MPICC) $(OPTIMIZE) $(OPT)   -o w-stackingfftw w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS) $(NVLIB)

# Remove the object files and the two C sources generated from the .cu files.
# clean is declared phony so that a file named "clean" cannot mask it.
# rm -f is used so that an already-missing file does not make the recipe fail
# and skip the remaining removals.
.PHONY: clean
clean:
	rm -f *.o
	rm -f w-stacking.c
	rm -f phase_correction.c
