Loading Makefile +25 −5 Original line number Diff line number Diff line Loading @@ -20,8 +20,8 @@ endif LINKER=$(MPICC) FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.3/x86_rome/include FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.3/x86_rome/lib FFTW_MPI_INC = FFTW_MPI_LIB = CFLAGS += -I./ Loading @@ -48,6 +48,24 @@ OPT += -DWRITE_IMAGE # perform w-stacking phase correction OPT += -DPHASE_ON # Support CFITSIO !!! Remember to add the path to the CFITSIO library to LD_LIBRARY_PATH #OPT += -DFITSIO # Perform true parallel images writing #OPT += -DPARALLELIO # Normalize uvw in case it is not done in the binMS OPT += -DNORMALIZE_UVW # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL #OPT += -DGAUSS_HI_PRECISION OPT += -DGAUSS #OPT += -DKAISERBESSEL #OPT += -DVERBOSE # ======================================================== # ACCELERATION # Loading @@ -58,13 +76,13 @@ OPT += -DPHASE_ON #OPT += -DCUDACC # use GPU acceleration via OMP OPT += -DACCOMP #OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE # use AMD GPU to perform the reduce OPT += -DRCCL_REDUCE #OPT += -DRCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP Loading @@ -74,6 +92,9 @@ OPT += -DRCCL_REDUCE # ======================================================== ifeq (FITSIO,$(findstring FITSIO,$(OPT))) LIBS += -L$(FITSIO_LIB) -lcfitsio endif DEPS = w-stacking.h main.c allvars.h Loading @@ -89,7 +110,6 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o Loading allvars.h +8 −0 Original line number Diff line number Diff line Loading @@ -14,6 +14,10 @@ #include <stdatomic.h> #include <mpi.h> #ifdef FITSIO #include "fitsio.h" #endif #if defined (_OPENMP) #include <omp.h> #endif Loading @@ -30,6 +34,10 @@ #include "w-stacking.h" #endif #if defined(CUDACC) #include <cuda.h> #endif #if defined(NVIDIA) 
#include <cuda_runtime.h> #endif Loading cuda_fft.cu 0 → 100644 +148 −0 Original line number Diff line number Diff line #include <stdlib.h> #include <stdio.h> #include <cufftMp.h> #include <mpi.h> #include <cuda_runtime.h> #include <complex.h> #include "cuComplex.h" #include "w-stacking.h" #include <time.h> #if defined(CUFFTMP) && !defined(USE_FFTW) void cuda_fft( int num_w_planes, int grid_size_x, int grid_size_y, int xaxis, int yaxis, double * grid, double * gridss, MPI_Comm comm) { #ifdef __CUDACC__ cudaError_t mmm; cufftResult_t status; cufftDoubleComplex *fftwgrid; fftwgrid = (cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*yaxis*grid_size_x); // Plan creation cufftHandle plan; status = cufftCreate(&plan); if (status != CUFFT_SUCCESS) {printf("!!! cufftCreate ERROR %d !!!\n", status);} cudaStream_t stream{}; cudaStreamCreate(&stream); status = cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm); if (status != CUFFT_SUCCESS) {printf("!!! cufftMpAttachComm ERROR %d !!!\n", status);} status = cufftSetStream(plan, stream); if (status != CUFFT_SUCCESS) {printf("!!! cufftSetStream ERROR %d !!!\n", status);} size_t workspace; status = cufftMakePlan2d(plan, grid_size_x, grid_size_y, CUFFT_Z2Z, &workspace); if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePlan2d ERROR %d !!!\n", status);} cudaDeviceSynchronize(); long fftwindex = 0; long fftwindex2D = 0; double norm = 1.0/(double)(grid_size_x*grid_size_y); // Grid composition for (int iw=0; iw<num_w_planes; iw++) { printf("select the %d w-plane to transform\n", iw); for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } cudaLibXtDesc *fftwgrid_g; cudaLibXtDesc *fftwgrid_g2; status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! 
cufftXtMalloc ERROR %d !!!\n", status);} status = cufftXtMalloc(plan, &fftwgrid_g2, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc 2 ERROR %d !!!\n", status);} cudaDeviceSynchronize(); mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize ERROR %d !!!\n", mmm);} status = cufftXtMemcpy(plan, fftwgrid_g, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy htd fftwgrid_g ERROR %d !!!\n", status);} cudaDeviceSynchronize(); status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExecDescriptor ERROR %d !!!\n", status);} mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); status = cufftXtMemcpy(plan, fftwgrid_g2, fftwgrid_g, CUFFT_COPY_DEVICE_TO_DEVICE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);} mmm = cudaStreamSynchronize(stream); if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_g2, CUFFT_COPY_DEVICE_TO_HOST); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dth fftwgrid ERROR %d !!!\n", status);} cufftXtFree(fftwgrid_g); cufftXtFree(fftwgrid_g2); for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } } } status = cufftDestroy(plan); if (status != CUFFT_SUCCESS) {printf("!!! 
cufftDestroy fftwgrid ERROR %d !!!\n", status);} cudaStreamDestroy(stream); cudaDeviceSynchronize(); #endif // __CUDACC__ } #endif fourier_transform.c +26 −1 Original line number Diff line number Diff line Loading @@ -4,7 +4,7 @@ void fftw_data(){ #ifdef USE_FFTW #if defined(USE_FFTW) && !defined(CUFFTMP) // FFT transform the data (using distributed FFTW) if(rank == 0)printf("PERFORMING FFT\n"); Loading Loading @@ -83,6 +83,31 @@ void fftw_data(){ #endif #if defined(USE_FFTW) && defined(CUFFTMP) // FFT transform the data using cuFFT if(rank==0)printf("PERFORMING CUDA FFT\n"); MPI_Barrier(MPI_COMM_WORLD); double start = CPU_TIME_wt; cuda_fft( num_w_planes, grid_size_x, grid_size_y, xaxis, yaxis, grid, gridss, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); timing_wt.cufftmp += CPU_TIME_wt - start; #endif } void write_fftw_data(){ Loading gridding.c +54 −2 Original line number Diff line number Diff line Loading @@ -16,10 +16,62 @@ void gridding() if(rank == 0) printf("GRIDDING DATA\n"); // Create histograms and linked lists double start = CPU_TIME_wt; #ifdef NORMALIZE_UVW if (rank==0) printf("NORMALIZING DATA\n"); double minu = 1e20; double minv = 1e20; double minw = 1e20; double maxu = -1e20; double maxv = -1e20; double maxw = -1e20; for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) { minu = MIN(minu,data.uu[inorm]); minv = MIN(minv,data.vv[inorm]); minw = MIN(minw,data.ww[inorm]); maxu = MAX(maxu,data.uu[inorm]); maxv = MAX(maxv,data.vv[inorm]); maxw = MAX(maxw,data.ww[inorm]); } double minu_all; double minv_all; double minw_all; double maxu_all; double maxv_all; double maxw_all; MPI_Allreduce(&minu,&minu_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minv,&minv_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minw,&minw_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&maxu,&maxu_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxv,&maxv_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxw,&maxw_all,1, 
MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); double offset = 0.001; double ming = MAX(abs(minu_all),abs(minv_all)); double maxg = MAX(abs(maxu_all),abs(maxv_all)); maxg = MAX(maxg,ming); minw = minw_all; maxw = maxw_all; maxg = maxg+offset*maxg; for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) { data.uu[inorm] = (data.uu[inorm]+maxg)/(2.0*maxg); data.vv[inorm] = (data.vv[inorm]+maxg)/(2.0*maxg); data.ww[inorm] = (data.ww[inorm]-minw)/(maxw-minw); } #endif // Create histograms and linked lists // Initialize linked list initialize_array(); Loading Loading
Makefile +25 −5 Original line number Diff line number Diff line Loading @@ -20,8 +20,8 @@ endif LINKER=$(MPICC) FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.3/x86_rome/include FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.3/x86_rome/lib FFTW_MPI_INC = FFTW_MPI_LIB = CFLAGS += -I./ Loading @@ -48,6 +48,24 @@ OPT += -DWRITE_IMAGE # perform w-stacking phase correction OPT += -DPHASE_ON # Support CFITSIO !!! Remember to add the path to the CFITSIO library to LD_LIBRARY_PATH #OPT += -DFITSIO # Perform true parallel images writing #OPT += -DPARALLELIO # Normalize uvw in case it is not done in the binMS OPT += -DNORMALIZE_UVW # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL #OPT += -DGAUSS_HI_PRECISION OPT += -DGAUSS #OPT += -DKAISERBESSEL #OPT += -DVERBOSE # ======================================================== # ACCELERATION # Loading @@ -58,13 +76,13 @@ OPT += -DPHASE_ON #OPT += -DCUDACC # use GPU acceleration via OMP OPT += -DACCOMP #OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE # use AMD GPU to perform the reduce OPT += -DRCCL_REDUCE #OPT += -DRCCL_REDUCE # use GPU to perform FFT #OPT += -DCUFFTMP Loading @@ -74,6 +92,9 @@ OPT += -DRCCL_REDUCE # ======================================================== ifeq (FITSIO,$(findstring FITSIO,$(OPT))) LIBS += -L$(FITSIO_LIB) -lcfitsio endif DEPS = w-stacking.h main.c allvars.h Loading @@ -89,7 +110,6 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o Loading
allvars.h +8 −0 Original line number Diff line number Diff line Loading @@ -14,6 +14,10 @@ #include <stdatomic.h> #include <mpi.h> #ifdef FITSIO #include "fitsio.h" #endif #if defined (_OPENMP) #include <omp.h> #endif Loading @@ -30,6 +34,10 @@ #include "w-stacking.h" #endif #if defined(CUDACC) #include <cuda.h> #endif #if defined(NVIDIA) #include <cuda_runtime.h> #endif Loading
// ===== cuda_fft.cu (new file) =====
#include <stdlib.h>
#include <stdio.h>
#include <cufftMp.h>
#include <mpi.h>
#include <cuda_runtime.h>
#include <complex.h>
#include "cuComplex.h"
#include "w-stacking.h"
#include <time.h>

/* Guard fix: the caller in fourier_transform.c invokes cuda_fft() under
 * (USE_FFTW && CUFFTMP); the original guard (CUFFTMP && !USE_FFTW) left the
 * symbol undefined in exactly that configuration, producing a link error.
 * Compile whenever CUFFTMP is requested. */
#if defined(CUFFTMP)

/* Distributed inverse 2D FFT of each w-plane of the gridded visibilities,
 * performed with cuFFTMp across the MPI communicator `comm`.
 *
 *   num_w_planes   number of w-stacking planes to transform
 *   grid_size_x/y  global grid dimensions (used for the cuFFT plan and
 *                  the 1/(Nx*Ny) normalization)
 *   xaxis/yaxis    per-rank slab dimensions — TODO confirm slab layout
 *                  matches cuFFTMp's expected INPLACE data decomposition
 *   grid           input:  interleaved re/im doubles, plane-major
 *   gridss         output: normalized inverse FFT, same layout as grid
 *   comm           MPI communicator shared by all participating ranks
 *
 * Errors from cuFFT/CUDA are reported on stdout; the routine does not
 * abort (matches the original best-effort error style). */
void cuda_fft(
        int num_w_planes,
        int grid_size_x,
        int grid_size_y,
        int xaxis,
        int yaxis,
        double *grid,
        double *gridss,
        MPI_Comm comm)
{
#ifdef __CUDACC__
    cudaError_t mmm;
    cufftResult_t status;

    /* Host staging buffer for ONE w-plane slab: xaxis*yaxis complex values.
     * (The original allocated 2*num_w_planes*yaxis*grid_size_x elements,
     * far more than is ever indexed: fftwindex2D stays in [0, xaxis*yaxis)
     * and the buffer is reused plane by plane.) */
    cufftDoubleComplex *fftwgrid =
        (cufftDoubleComplex *)malloc(sizeof(cufftDoubleComplex) *
                                     (size_t)xaxis * (size_t)yaxis);
    if (fftwgrid == NULL)
    {
        printf("!!! cuda_fft host allocation ERROR !!!\n");
        return;
    }

    // Plan creation: one distributed Z2Z plan, reused for every w-plane.
    cufftHandle plan;
    status = cufftCreate(&plan);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftCreate ERROR %d !!!\n", status);}

    cudaStream_t stream{};
    cudaStreamCreate(&stream);

    status = cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftMpAttachComm ERROR %d !!!\n", status);}

    status = cufftSetStream(plan, stream);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftSetStream ERROR %d !!!\n", status);}

    size_t workspace;
    status = cufftMakePlan2d(plan, grid_size_x, grid_size_y, CUFFT_Z2Z, &workspace);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePlan2d ERROR %d !!!\n", status);}
    cudaDeviceSynchronize();

    /* Device descriptors are plan-shaped and identical for every plane, so
     * allocate them ONCE here (the original re-allocated and freed them on
     * every loop iteration). */
    cudaLibXtDesc *fftwgrid_g;
    cudaLibXtDesc *fftwgrid_g2;
    status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);}
    status = cufftXtMalloc(plan, &fftwgrid_g2, CUFFT_XT_FORMAT_INPLACE);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc 2 ERROR %d !!!\n", status);}

    long fftwindex = 0;
    long fftwindex2D = 0;
    /* Convert to double BEFORE multiplying: grid_size_x*grid_size_y as an
     * int product can overflow for large grids. */
    double norm = 1.0 / ((double)grid_size_x * (double)grid_size_y);

    // Transform one w-plane at a time, reusing plan, stream and descriptors.
    for (int iw = 0; iw < num_w_planes; iw++)
    {
        printf("select the %d w-plane to transform\n", iw);

        // Pack the interleaved re/im slab of plane iw into complex layout.
        for (int iv = 0; iv < yaxis; iv++)
            for (int iu = 0; iu < xaxis; iu++)
            {
                fftwindex2D = iu + iv * xaxis;
                fftwindex = 2 * (fftwindex2D + iw * xaxis * yaxis);
                fftwgrid[fftwindex2D].x = grid[fftwindex];
                fftwgrid[fftwindex2D].y = grid[fftwindex + 1];
            }

        cudaDeviceSynchronize();
        mmm = cudaStreamSynchronize(stream);
        if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize ERROR %d !!!\n", mmm);}

        status = cufftXtMemcpy(plan, fftwgrid_g, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE);
        if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy htd fftwgrid_g ERROR %d !!!\n", status);}
        cudaDeviceSynchronize();

        // In-place distributed inverse transform of this plane.
        status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);
        if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExecDescriptor ERROR %d !!!\n", status);}
        mmm = cudaStreamSynchronize(stream);
        if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);}
        cudaDeviceSynchronize();

        /* NOTE(review): this device-to-device hop through fftwgrid_g2 was in
         * the original; it does not obviously change data format — confirm
         * whether copying fftwgrid_g straight back to host is sufficient. */
        status = cufftXtMemcpy(plan, fftwgrid_g2, fftwgrid_g, CUFFT_COPY_DEVICE_TO_DEVICE);
        if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);}
        mmm = cudaStreamSynchronize(stream);
        if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);}
        cudaDeviceSynchronize();

        status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_g2, CUFFT_COPY_DEVICE_TO_HOST);
        if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dth fftwgrid ERROR %d !!!\n", status);}

        // Unpack the normalized result back into the interleaved layout.
        for (int iv = 0; iv < yaxis; iv++)
            for (int iu = 0; iu < xaxis; iu++)
            {
                fftwindex2D = iu + iv * xaxis;
                fftwindex = 2 * (fftwindex2D + iw * xaxis * yaxis);
                gridss[fftwindex] = norm * fftwgrid[fftwindex2D].x;
                gridss[fftwindex + 1] = norm * fftwgrid[fftwindex2D].y;
            }
    }

    // Release device descriptors (once, after all planes are done).
    cufftXtFree(fftwgrid_g);
    cufftXtFree(fftwgrid_g2);

    status = cufftDestroy(plan);
    if (status != CUFFT_SUCCESS) {printf("!!! cufftDestroy fftwgrid ERROR %d !!!\n", status);}
    cudaStreamDestroy(stream);
    cudaDeviceSynchronize();

    free(fftwgrid);  /* fix: the original leaked this host staging buffer */
#endif // __CUDACC__
}

#endif
fourier_transform.c +26 −1 Original line number Diff line number Diff line Loading @@ -4,7 +4,7 @@ void fftw_data(){ #ifdef USE_FFTW #if defined(USE_FFTW) && !defined(CUFFTMP) // FFT transform the data (using distributed FFTW) if(rank == 0)printf("PERFORMING FFT\n"); Loading Loading @@ -83,6 +83,31 @@ void fftw_data(){ #endif #if defined(USE_FFTW) && defined(CUFFTMP) // FFT transform the data using cuFFT if(rank==0)printf("PERFORMING CUDA FFT\n"); MPI_Barrier(MPI_COMM_WORLD); double start = CPU_TIME_wt; cuda_fft( num_w_planes, grid_size_x, grid_size_y, xaxis, yaxis, grid, gridss, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); timing_wt.cufftmp += CPU_TIME_wt - start; #endif } void write_fftw_data(){ Loading
gridding.c +54 −2 Original line number Diff line number Diff line Loading @@ -16,10 +16,62 @@ void gridding() if(rank == 0) printf("GRIDDING DATA\n"); // Create histograms and linked lists double start = CPU_TIME_wt; #ifdef NORMALIZE_UVW if (rank==0) printf("NORMALIZING DATA\n"); double minu = 1e20; double minv = 1e20; double minw = 1e20; double maxu = -1e20; double maxv = -1e20; double maxw = -1e20; for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) { minu = MIN(minu,data.uu[inorm]); minv = MIN(minv,data.vv[inorm]); minw = MIN(minw,data.ww[inorm]); maxu = MAX(maxu,data.uu[inorm]); maxv = MAX(maxv,data.vv[inorm]); maxw = MAX(maxw,data.ww[inorm]); } double minu_all; double minv_all; double minw_all; double maxu_all; double maxv_all; double maxw_all; MPI_Allreduce(&minu,&minu_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minv,&minv_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&minw,&minw_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&maxu,&maxu_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxv,&maxv_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&maxw,&maxw_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); double offset = 0.001; double ming = MAX(abs(minu_all),abs(minv_all)); double maxg = MAX(abs(maxu_all),abs(maxv_all)); maxg = MAX(maxg,ming); minw = minw_all; maxw = maxw_all; maxg = maxg+offset*maxg; for (uint inorm=0; inorm<metaData.Nmeasures; inorm++) { data.uu[inorm] = (data.uu[inorm]+maxg)/(2.0*maxg); data.vv[inorm] = (data.vv[inorm]+maxg)/(2.0*maxg); data.ww[inorm] = (data.ww[inorm]-minw)/(maxw-minw); } #endif // Create histograms and linked lists // Initialize linked list initialize_array(); Loading