Commit a52fc9f3 authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

Merging with Emanuele

parent 45a9e6b7
Loading
Loading
Loading
Loading
+25 −5
Original line number Diff line number Diff line
@@ -20,8 +20,8 @@ endif

LINKER=$(MPICC)

FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.3/x86_rome/include
FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.3/x86_rome/lib
FFTW_MPI_INC = 
FFTW_MPI_LIB = 

CFLAGS += -I./

@@ -48,6 +48,24 @@ OPT += -DWRITE_IMAGE
# perform w-stacking phase correction
OPT += -DPHASE_ON

# Support CFITSIO !!! Remember to add the path to the CFITSIO library to LD_LIBRARY_PATH
#OPT += -DFITSIO

# Perform true parallel images writing
#OPT += -DPARALLELIO

# Normalize uvw in case it is not done in the binMS
OPT += -DNORMALIZE_UVW

# Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
#OPT += -DGAUSS_HI_PRECISION

OPT += -DGAUSS

#OPT += -DKAISERBESSEL

#OPT += -DVERBOSE

# ========================================================
# ACCELERATION
#
@@ -58,13 +76,13 @@ OPT += -DPHASE_ON
#OPT += -DCUDACC

# use GPU acceleration via OMP 
OPT += -DACCOMP
#OPT += -DACCOMP

# use NVIDIA GPU to perform the reduce
#OPT += -DNCCL_REDUCE

# use AMD GPU to perform the reduce
OPT += -DRCCL_REDUCE
#OPT += -DRCCL_REDUCE

# use GPU to perform FFT
#OPT += -DCUFFTMP
@@ -74,6 +92,9 @@ OPT += -DRCCL_REDUCE

# ========================================================

ifeq (FITSIO,$(findstring FITSIO,$(OPT)))
        LIBS += -L$(FITSIO_LIB) -lcfitsio
endif	

DEPS = w-stacking.h  main.c allvars.h

@@ -89,7 +110,6 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o
DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c
OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o


DEPS_NCCL_REDUCE = gridding_nccl.cpp
OBJ_NCCL_REDUCE = gridding_nccl.o

+8 −0
Original line number Diff line number Diff line
@@ -14,6 +14,10 @@
#include <stdatomic.h>
#include <mpi.h>

#ifdef FITSIO
#include "fitsio.h"
#endif

#if defined (_OPENMP)
#include <omp.h>
#endif
@@ -30,6 +34,10 @@
#include "w-stacking.h"
#endif 

#if defined(CUDACC)
#include <cuda.h>
#endif

#if defined(NVIDIA)
#include <cuda_runtime.h>
#endif

cuda_fft.cu

0 → 100644
+148 −0
Original line number Diff line number Diff line
#include <stdlib.h>
#include <stdio.h>
#include <cufftMp.h>
#include <mpi.h>
#include <cuda_runtime.h>
#include <complex.h>
#include "cuComplex.h"
#include "w-stacking.h"
#include <time.h>

#if defined(CUFFTMP) && !defined(USE_FFTW)

/*
 * cuda_fft — distributed 2D inverse FFT of every w-plane of the gridded
 * data, using cuFFT-Mp over the ranks of `comm`.
 *
 * For each of the num_w_planes planes, the local slab (xaxis x yaxis,
 * stored in `grid` as interleaved re/im doubles) is staged into a
 * cufftDoubleComplex host buffer, uploaded to the plan's device
 * descriptor, transformed in place, downloaded, and written into
 * `gridss` — scaled by 1/(grid_size_x*grid_size_y) — with the same
 * interleaved layout:
 *   element (iu,iv) of plane iw lives at 2*(iu + iv*xaxis + iw*xaxis*yaxis).
 *
 * Errors from CUDA/cuFFT calls are reported on stdout and execution
 * continues, matching the error-reporting style of the original code.
 */
void cuda_fft(
	int num_w_planes,
	int grid_size_x,
	int grid_size_y,
	int xaxis,
	int yaxis,
	double * grid,
	double * gridss,
	MPI_Comm comm)
{
#ifdef __CUDACC__

	cudaError_t mmm;
	cufftResult_t status;

	/* Host staging buffer.
	 * NOTE(review): the pack/unpack loops below touch only xaxis*yaxis
	 * elements, but cufftXtMemcpy copies the plan-defined local portion,
	 * which may be larger; the original (generous) size is kept.
	 * TODO: confirm the minimal safe size against the cuFFT-Mp descriptor. */
	cufftDoubleComplex *fftwgrid;
	fftwgrid = (cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*yaxis*grid_size_x);
	if (fftwgrid == NULL)
	{
		printf("!!! malloc fftwgrid ERROR !!!\n");
		return;
	}

	// Plan creation: one distributed Z2Z plan, reused for every w-plane.
	cufftHandle plan;
	status = cufftCreate(&plan);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftCreate ERROR %d !!!\n", status);}

	cudaStream_t stream{};
	mmm = cudaStreamCreate(&stream);
	if (mmm != cudaSuccess) {printf("!!! cudaStreamCreate ERROR %d !!!\n", mmm);}

	status = cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftMpAttachComm ERROR %d !!!\n", status);}

	status = cufftSetStream(plan, stream);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftSetStream ERROR %d !!!\n", status);}

	size_t workspace;
	status = cufftMakePlan2d(plan, grid_size_x, grid_size_y, CUFFT_Z2Z, &workspace);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePlan2d ERROR %d !!!\n", status);}
	cudaDeviceSynchronize();

	/* Device descriptors allocated ONCE and reused for every w-plane
	 * (the original re-allocated and freed them on each loop iteration,
	 * paying the device-allocation cost num_w_planes times). */
	cudaLibXtDesc *fftwgrid_g;
	cudaLibXtDesc *fftwgrid_g2;

	status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);}

	status = cufftXtMalloc(plan, &fftwgrid_g2, CUFFT_XT_FORMAT_INPLACE);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc 2 ERROR %d !!!\n", status);}
	cudaDeviceSynchronize();

	long fftwindex = 0;
	long fftwindex2D = 0;
	// cuFFT transforms are unnormalized: apply 1/N on the way out.
	double norm = 1.0/(double)(grid_size_x*grid_size_y);

	for (int iw=0; iw<num_w_planes; iw++)
	{
		printf("select the %d w-plane to transform\n", iw);

		// Pack the iw-th slab of `grid` (interleaved doubles) into complex form.
		for (int iv=0; iv<yaxis; iv++)
		{
			for (int iu=0; iu<xaxis; iu++)
			{
				fftwindex2D = iu + iv*xaxis;
				fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
				fftwgrid[fftwindex2D].x = grid[fftwindex];
				fftwgrid[fftwindex2D].y = grid[fftwindex+1];
			}
		}

		mmm = cudaStreamSynchronize(stream);
		if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize ERROR %d !!!\n", mmm);}

		status = cufftXtMemcpy(plan, fftwgrid_g, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy htd fftwgrid_g ERROR %d !!!\n", status);}

		cudaDeviceSynchronize();

		// In-place distributed inverse transform on the device descriptor.
		status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExecDescriptor ERROR %d !!!\n", status);}

		mmm = cudaStreamSynchronize(stream);
		if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);}

		cudaDeviceSynchronize();

		/* Device-to-device copy into a second descriptor before download.
		 * NOTE(review): with cuFFT-Mp, cufftXtMemcpy between descriptors can
		 * reorder data between permuted and natural distributions — this copy
		 * is presumably why the result is not downloaded straight from
		 * fftwgrid_g; confirm before simplifying it away. */
		status = cufftXtMemcpy(plan, fftwgrid_g2, fftwgrid_g, CUFFT_COPY_DEVICE_TO_DEVICE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);}

		mmm = cudaStreamSynchronize(stream);
		if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 3 ERROR %d !!!\n", mmm);}

		cudaDeviceSynchronize();

		status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_g2, CUFFT_COPY_DEVICE_TO_HOST);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dth fftwgrid ERROR %d !!!\n", status);}

		// Unpack the transformed slab into `gridss`, applying the 1/N scale.
		for (int iv=0; iv<yaxis; iv++)
		{
			for (int iu=0; iu<xaxis; iu++)
			{
				fftwindex2D = iu + iv*xaxis;
				fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
				gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x;
				gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y;
			}
		}
	}

	// Cleanup (the original code leaked the host buffer fftwgrid).
	cufftXtFree(fftwgrid_g);
	cufftXtFree(fftwgrid_g2);

	status = cufftDestroy(plan);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftDestroy fftwgrid ERROR %d !!!\n", status);}

	cudaStreamDestroy(stream);
	cudaDeviceSynchronize();

	free(fftwgrid);

#endif // __CUDACC__
}
#endif 
+26 −1
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@

void fftw_data(){

 #ifdef USE_FFTW
 #if defined(USE_FFTW) && !defined(CUFFTMP)
  // FFT transform the data (using distributed FFTW)
  if(rank == 0)printf("PERFORMING FFT\n");

@@ -83,6 +83,31 @@ void fftw_data(){
	
 #endif

 #if defined(USE_FFTW) && defined(CUFFTMP)
  // FFT transform the data using cuFFT                                                                                                    
  if(rank==0)printf("PERFORMING CUDA FFT\n");

  MPI_Barrier(MPI_COMM_WORLD);

  double start = CPU_TIME_wt;

  cuda_fft(
	   num_w_planes,
	   grid_size_x,
	   grid_size_y,
	   xaxis,
	   yaxis,
	   grid,
	   gridss,
	   MPI_COMM_WORLD);

  MPI_Barrier(MPI_COMM_WORLD);
 
  timing_wt.cufftmp += CPU_TIME_wt - start;
  
 #endif

   
}

void write_fftw_data(){
+54 −2
Original line number Diff line number Diff line
@@ -16,10 +16,62 @@ void gridding()
  if(rank == 0)
    printf("GRIDDING DATA\n");

  // Create histograms and linked lists
  
  double start = CPU_TIME_wt;
  
 #ifdef NORMALIZE_UVW

  if (rank==0)
    printf("NORMALIZING DATA\n");

  double minu = 1e20;
  double minv = 1e20;
  double minw = 1e20;
  double maxu = -1e20;
  double maxv = -1e20;
  double maxw = -1e20;

  for (uint inorm=0; inorm<metaData.Nmeasures; inorm++)
    {
      minu = MIN(minu,data.uu[inorm]);
      minv = MIN(minv,data.vv[inorm]);
      minw = MIN(minw,data.ww[inorm]);
      maxu = MAX(maxu,data.uu[inorm]);
      maxv = MAX(maxv,data.vv[inorm]);
      maxw = MAX(maxw,data.ww[inorm]);
    }

  double minu_all;
  double minv_all;
  double minw_all;
  double maxu_all;
  double maxv_all;
  double maxw_all;

  MPI_Allreduce(&minu,&minu_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&minv,&minv_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&minw,&minw_all,1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&maxu,&maxu_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(&maxv,&maxv_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(&maxw,&maxw_all,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

  double offset = 0.001;
  double ming = MAX(abs(minu_all),abs(minv_all));
  double maxg = MAX(abs(maxu_all),abs(maxv_all));
  maxg = MAX(maxg,ming);
  minw = minw_all;
  maxw = maxw_all;
  maxg = maxg+offset*maxg;
  for (uint inorm=0; inorm<metaData.Nmeasures; inorm++)
    {
      data.uu[inorm] = (data.uu[inorm]+maxg)/(2.0*maxg);
      data.vv[inorm] = (data.vv[inorm]+maxg)/(2.0*maxg);
      data.ww[inorm] = (data.ww[inorm]-minw)/(maxw-minw);
    }

 #endif
  
  // Create histograms and linked lists
  
  // Initialize linked list
  initialize_array();

Loading