Commit 4c4ffcb2 authored by Emanuele De Rubeis's avatar Emanuele De Rubeis
Browse files

cuFFTMp implementation

parent 8b4601ee
Loading
Loading
Loading
Loading
+104 −134
Original line number Diff line number Diff line
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <cufft.h>
#include <cufftw.h>
#include "w-stacking.h"
#include "/cineca/prod/opt/compilers/hpc-sdk/2023/binary/Linux_ppc64le/23.1/math_libs/11.8/include/cufftmp/cufftMp.h"
#include <mpi.h>
#include <cuda_runtime.h>
#include <complex.h>
#include "cuComplex.h"

/*
#ifdef __CUDACC__
double __device__
#else
double
#endif
*/
#include "w-stacking.h"
#include <time.h>


#ifdef __CUDACC__
__global__ void write_grid(
void cuda_fft(
	int num_w_planes,
	int grid_size_x,
	int grid_size_y,
	int xaxis,
	int yaxis,
	cufftDoubleComplex *fftwgrid,
	double *grid)
	double * grid,
	double * gridss,
	MPI_Comm comm)
{
	long fftwindex2D = blockIdx.x;
	long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis);

        fftwgrid[fftwindex2D].x = grid[fftwindex];
        fftwgrid[fftwindex2D].y = grid[fftwindex+1];
}
#endif
#ifdef __CUDACC__

        cudaError_t mmm;
        cufftResult_t status;

/*
#ifdef __CUDACC__
__global__ void write_gridss(
        int num_w_planes,
        int xaxis,
        int yaxis,
        cufftDoubleComplex *fftwgrid,
        double *gridss,
	int grid_size_x,
	int grid_size_y)
{
        long fftwindex2D = blockIdx.x;
        long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis);
	cufftDoubleComplex *fftwgrid;
	fftwgrid = (cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*yaxis*grid_size_x);

	double norm = 1.0/(double)(grid_size_x*grid_size_y);

        gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x;
        gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y;
}
#endif
*/

	// Plan creation

	cufftHandle plan;
        status = cufftCreate(&plan);
        if (status != CUFFT_SUCCESS) {printf("!!! cufftCreate ERROR %d !!!\n", status);}

	cudaStream_t stream{};
	cudaStreamCreate(&stream);


	status = cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftMpAttachComm ERROR %d !!!\n", status);}

void cuda_fft(
	int num_w_planes,
	int grid_size_x,
	int grid_size_y,
	int xaxis,
	int yaxis,
	double * grid,
	double * gridss)
{
#ifdef __CUDACC__
	status = cufftSetStream(plan, stream);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftSetStream ERROR %d !!!\n", status);}

	cufftDoubleComplex *fftwgrid;
	cufftDoubleComplex *fftwgrid_g;
	cudaError_t mmm;
	cufftResult_t status;
	size_t workspace;
	status = cufftMakePlan2d(plan, grid_size_x, grid_size_y, CUFFT_Z2Z, &workspace);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePlan2d ERROR %d !!!\n", status);}
	cudaDeviceSynchronize();

	double * grid_g;
	//double * gridss_g;

	// Descriptor creation

	mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis);
	mmm=cudaMalloc(&grid_g, sizeof(double)*2*num_w_planes*xaxis*yaxis);
	//mmm=cudaMalloc(&gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis);
	mmm=cudaMemcpy(grid_g, grid, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice);
	cudaDeviceSynchronize();
	cudaLibXtDesc *fftwgrid_g;
	cudaLibXtDesc *fftwgrid_g2;


	clock_t start, end, startcu, endcu;
	double cufft_exec_time;
        struct timespec begin, finish, begin0, begink, finishk, begincu, finishcu;
	status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);}
	cudaDeviceSynchronize();

	status = cufftXtMalloc(plan, &fftwgrid_g2, CUFFT_XT_FORMAT_INPLACE);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc 2 ERROR %d !!!\n", status);}
	cudaDeviceSynchronize();


	// Define the CUDA set up
	//int Nth = NTHREADS;
	long Nbl = (long)(xaxis+yaxis*xaxis);
	//if(NWORKERS == 1) {Nbl = 1; Nth = 1;};

	fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis);

        cufftHandle plan;
	cufftCreate(&plan);
        cufftPlan1d(&plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1);
	printf("plan creation\n");
        double norm = 1.0/(double)(grid_size_x*grid_size_y);
	long fftwindex = 0;
	long fftwindex2D = 0;
	double norm = 1.0/(double)(grid_size_x*grid_size_y);



	write_grid<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, grid_g);
	cudaDeviceSynchronize();

/*
	// Grid composition

	for (int iw=0; iw<num_w_planes; iw++)
        {
@@ -128,44 +90,42 @@ void cuda_fft(
                        	fftwgrid[fftwindex2D].y = grid[fftwindex+1];
                        }
                }
*/
	    //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137].x);


	    //mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice);
	    //if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);}
	    //printf("memcpy fftwgrid to fftwgrid_g\n");
		cudaDeviceSynchronize();

                mmm = cudaStreamSynchronize(stream);
                if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize ERROR %d !!!\n", mmm);}

		status = cufftXtMemcpy(plan, fftwgrid_g, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy htd fftwgrid_g ERROR %d !!!\n", status);}

            printf("do the transform for each w-plane\n");
		cudaDeviceSynchronize();


	    clock_gettime(CLOCK_MONOTONIC, &begincu);
	    startcu = clock();
		status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExecDescriptor ERROR %d !!!\n", status);}

            status=cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);

	    endcu = clock();
	    clock_gettime(CLOCK_MONOTONIC, &finishcu);
	    cufft_exec_time = ((double) (endcu-startcu)) / CLOCKS_PER_SEC;
	    printf("Time for cuFFT plan execution is %f sec\n", cufft_exec_time);
	    if (status != CUFFT_SUCCESS) {printf("cufftexec ERROR %d\n",status);}
                mmm = cudaStreamSynchronize(stream);
                if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);}

		cudaDeviceSynchronize();


	    //write_gridss<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, gridss_g, grid_size_x, grid_size_y);
	    //cudaDeviceSynchronize();
                status = cufftXtMemcpy(plan, fftwgrid_g2, fftwgrid_g, CUFFT_COPY_DEVICE_TO_DEVICE);
                if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);}

	    //mmm=cudaMemcpy(gridss, gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost);

                mmm = cudaStreamSynchronize(stream);
                if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);}

            mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost);
	    if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);}
                cudaDeviceSynchronize();

                status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_g2, CUFFT_COPY_DEVICE_TO_HOST);
                if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dth fftwgrid ERROR %d !!!\n", status);}


	for (int iw=0; iw<num_w_planes; iw++)
	{
            // save the transformed w-plane
		for (int iv=0; iv<yaxis; iv++)
                {
                        for (int iu=0; iu<xaxis; iu++)
@@ -176,13 +136,23 @@ void cuda_fft(
                        	gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y;
			}
        	}

	}

        cufftDestroy(plan);

	mmm = cudaFree(fftwgrid_g);
	mmm = cudaFree(grid_g);
	//mmm = cudaFree(gridss_g);
#endif
	status = cufftXtFree(fftwgrid_g);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftXtFree ERROR %d !!!\n", status);}


        status = cufftXtFree(fftwgrid_g2);
        if (status != CUFFT_SUCCESS) {printf("!!! cufftXtFree ERROR %d !!!\n", status);}


	status = cufftDestroy(plan);
	if (status != CUFFT_SUCCESS) {printf("!!! cufftDestroy fftwgrid ERROR %d !!!\n", status);}


	cudaStreamDestroy(stream);
	cudaDeviceSynchronize();

#endif // __CUDACC__
}
+53 −42
Original line number Diff line number Diff line
@@ -8,6 +8,9 @@
#include <mpi.h>
#ifdef USE_FFTW
#include <fftw3-mpi.h>
/*#ifdef CUDA_FFT

#endif*/
#endif
#endif
#include <omp.h>
@@ -53,6 +56,7 @@ int main(int argc, char * argv[])
	FILE * pFile1;
	FILE * pFilereal;
	FILE * pFileimg;

	// Global filename to be composed
	char filename[1000];

@@ -80,6 +84,7 @@ int main(int argc, char * argv[])
        char srank[4];
        char timingfile[FILENAMELENGTH] = "timings.dat";


	// Visibilities related variables
	double * uu;
	double * vv;
@@ -129,6 +134,8 @@ int main(int argc, char * argv[])


	// Initialize FITS image parameters

#ifdef FITSIO
	fitsfile *fptreal;
	fitsfile *fptrimg;
	int status;
@@ -137,15 +144,15 @@ int main(int argc, char * argv[])

	long naxis = 2;
	long naxes[2] = { grid_size_x, grid_size_y };

#endif


	// Internal profiling parameters
	clock_t start, end, start0, startk, endk, startis, endis;
	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time;
	clock_t start, end, start0, startk, endk, startis, endis, startcu, endcu;
	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time, timecufftmp;
	double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, cufft_time1, phase_time1, image_time1;
	double writetime, writetime1;
	struct timespec begin, finish, begin0, begink, finishk, beginis, finishis;
	struct timespec begin, finish, begin0, begink, finishk, beginis, finishis, begincu, finishcu;
	double elapsed;

        // Number of sectors in which the mesh is divided along the v direction
@@ -178,8 +185,16 @@ int main(int argc, char * argv[])
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	if(rank == 0)printf("Running with %d MPI tasks\n",size);

        MPI_Barrier(MPI_COMM_WORLD);

  #ifdef USE_FFTW
  #ifndef CUDA_FFT
  	#ifdef CUDA_FFT
	int ndevices;
	cudaGetDeviceCount(&ndevices);
	cudaSetDevice(rank % ndevices);
	printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices);
	#else
	// Initialize FFTW environment
	fftw_mpi_init();
  	#endif
@@ -789,22 +804,19 @@ if(rank == 0){

#ifdef USE_FFTW


#ifdef CUDA_FFT

	// FFT transform the data using cuFFT
	if(rank==0)printf("PERFORMING CUDA FFT\n");
	clock_gettime(CLOCK_MONOTONIC, &begin);
	start = clock();
/*
        for (int isector=0; isector<size; isector++)
        {

        #ifdef USE_MPI
        MPI_Barrier(MPI_COMM_WORLD);
        #endif
            if(isector == rank)
            {
*/

	clock_gettime(CLOCK_MONOTONIC, &begincu);
	startcu = clock();


	cuda_fft(
	num_w_planes,
	grid_size_x,
@@ -812,22 +824,16 @@ if(rank == 0){
	xaxis,
	yaxis,
	grid,
	gridss);

//}
//}





	end = clock();
	clock_gettime(CLOCK_MONOTONIC, &finish);
	cufft_time = ((double) (end - start)) / CLOCKS_PER_SEC;
        cufft_time1 = (finish.tv_sec - begin.tv_sec);
        cufft_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
	gridss,
	MPI_COMM_WORLD);

	#ifdef USE_MPI
        MPI_Barrier(MPI_COMM_WORLD);
        #endif

	endcu = clock();
	clock_gettime(CLOCK_MONOTONIC, &finishcu);
	timecufftmp = ((double) (endcu-startcu)) / CLOCKS_PER_SEC;


#else
@@ -883,7 +889,7 @@ if(rank == 0){
	    endis = clock();
	    clock_gettime(CLOCK_MONOTONIC, &finishis);
	    mpifftw_time = ((double) (endis-startis)) / CLOCKS_PER_SEC;
	    printf("Time for FFTW MPI is %f sec\n", mpifftw_time);
	    printf("Time for FFTW MPI plan execution is %f sec\n", mpifftw_time);



@@ -1044,9 +1050,14 @@ if(rank == 0){
	#ifdef USE_MPI
	MPI_Barrier(MPI_COMM_WORLD);
        #endif

        if(rank == 0)printf("WRITING IMAGE\n");

	#ifdef FITSIO
        long * fpixel = (long *) malloc(sizeof(long)*naxis);
        long * lpixel = (long *) malloc(sizeof(long)*naxis);
	#endif


        #ifdef USE_MPI
        MPI_Barrier(MPI_COMM_WORLD);
@@ -1159,7 +1170,7 @@ if(rank == 0){
          printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time);
#ifdef USE_FFTW
#ifdef CUDA_FFT
          printf("cuFFT time:     %f sec\n",cufft_time);
          printf("cuFFTMp complete execution time:     %f sec\n",timecufftmp);
          printf("Phase time:    %f sec\n",phase_time);
#else
	  printf("FFTW time:	%f sec\n", fftw_time);
@@ -1177,7 +1188,7 @@ if(rank == 0){
            printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1);
#ifdef USE_FFTW
#ifdef CUDA_FFT
	    printf("PcuFFT time:    %f sec\n", cufft_time1);
//	    printf("PcuFFT time:    %f sec\n", cufft_time1);
	    printf("PPhase time:    %f sec\n", phase_time1);
#else
            printf("PFFTW time:    %f sec\n",fftw_time1);
+3 −4
Original line number Diff line number Diff line
@@ -6,9 +6,7 @@
#define PI 3.14159265359
#define REAL_TYPE double

//#include <cufft.h>
//#include <cufftw.h>

#include <mpi.h>



@@ -70,6 +68,7 @@ void cuda_fft(
	int,
	int,
	double*,
	double*);
	double*,
	MPI_Comm);

#endif