cuFFTMp implementation (4c4ffcb2) · Commits · Claudio Gheller / HPC_Imaging

cuda_fft.cu

+104 −134

Original line number	Diff line number	Diff line
		#include <math.h>
		#include <stdlib.h>
		#include <stdio.h>
		#include <cufft.h>
		#include <cufftw.h>
		#include "w-stacking.h"
		#include "/cineca/prod/opt/compilers/hpc-sdk/2023/binary/Linux_ppc64le/23.1/math_libs/11.8/include/cufftmp/cufftMp.h"
		#include <mpi.h>
		#include <cuda_runtime.h>
		#include <complex.h>
		#include "cuComplex.h"

		/*
		#ifdef __CUDACC__
		double __device__
		#else
		double
		#endif
		*/
		#include "w-stacking.h"
		#include <time.h>


		#ifdef __CUDACC__
		__global__ void write_grid(
		void cuda_fft(
		int num_w_planes,
		int grid_size_x,
		int grid_size_y,
		int xaxis,
		int yaxis,
		cufftDoubleComplex *fftwgrid,
		double *grid)
		double * grid,
		double * gridss,
		MPI_Comm comm)
		{
		long fftwindex2D = blockIdx.x;
		long fftwindex = 2(blockIdx.x+threadIdx.xxaxis*yaxis);

		fftwgrid[fftwindex2D].x = grid[fftwindex];
		fftwgrid[fftwindex2D].y = grid[fftwindex+1];
		}
		#endif
		#ifdef __CUDACC__

		cudaError_t mmm;
		cufftResult_t status;

		/*
		#ifdef __CUDACC__
		__global__ void write_gridss(
		int num_w_planes,
		int xaxis,
		int yaxis,
		cufftDoubleComplex *fftwgrid,
		double *gridss,
		int grid_size_x,
		int grid_size_y)
		{
		long fftwindex2D = blockIdx.x;
		long fftwindex = 2(blockIdx.x+threadIdx.xxaxis*yaxis);
		cufftDoubleComplex *fftwgrid;
		fftwgrid = (cufftDoubleComplex) malloc(sizeof(cufftDoubleComplex)2num_w_planesyaxis*grid_size_x);

		double norm = 1.0/(double)(grid_size_x*grid_size_y);

		gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x;
		gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y;
		}
		#endif
		*/

		// Plan creation

		cufftHandle plan;
		status = cufftCreate(&plan);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftCreate ERROR %d !!!\n", status);}

		cudaStream_t stream{};
		cudaStreamCreate(&stream);


		status = cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftMpAttachComm ERROR %d !!!\n", status);}

		void cuda_fft(
		int num_w_planes,
		int grid_size_x,
		int grid_size_y,
		int xaxis,
		int yaxis,
		double * grid,
		double * gridss)
		{
		#ifdef __CUDACC__
		status = cufftSetStream(plan, stream);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftSetStream ERROR %d !!!\n", status);}

		cufftDoubleComplex *fftwgrid;
		cufftDoubleComplex *fftwgrid_g;
		cudaError_t mmm;
		cufftResult_t status;
		size_t workspace;
		status = cufftMakePlan2d(plan, grid_size_x, grid_size_y, CUFFT_Z2Z, &workspace);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePlan2d ERROR %d !!!\n", status);}
		cudaDeviceSynchronize();

		double * grid_g;
		//double * gridss_g;

		// Descriptor creation

		mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)2num_w_planesxaxisyaxis);
		mmm=cudaMalloc(&grid_g, sizeof(double)2num_w_planesxaxisyaxis);
		//mmm=cudaMalloc(&gridss_g, sizeof(double)2num_w_planesxaxisyaxis);
		mmm=cudaMemcpy(grid_g, grid, sizeof(double)2num_w_planesxaxisyaxis, cudaMemcpyHostToDevice);
		cudaDeviceSynchronize();
		cudaLibXtDesc *fftwgrid_g;
		cudaLibXtDesc *fftwgrid_g2;


		clock_t start, end, startcu, endcu;
		double cufft_exec_time;
		struct timespec begin, finish, begin0, begink, finishk, begincu, finishcu;
		status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);}
		cudaDeviceSynchronize();

		status = cufftXtMalloc(plan, &fftwgrid_g2, CUFFT_XT_FORMAT_INPLACE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc 2 ERROR %d !!!\n", status);}
		cudaDeviceSynchronize();


		// Define the CUDA set up
		//int Nth = NTHREADS;
		long Nbl = (long)(xaxis+yaxis*xaxis);
		//if(NWORKERS == 1) {Nbl = 1; Nth = 1;};

		fftwgrid =(cufftDoubleComplex) malloc(sizeof(cufftDoubleComplex)2num_w_planesxaxis*yaxis);

		cufftHandle plan;
		cufftCreate(&plan);
		cufftPlan1d(&plan, num_w_planesxaxisyaxis, CUFFT_Z2Z, 1);
		printf("plan creation\n");
		double norm = 1.0/(double)(grid_size_x*grid_size_y);
		long fftwindex = 0;
		long fftwindex2D = 0;
		double norm = 1.0/(double)(grid_size_x*grid_size_y);



		write_grid<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, grid_g);
		cudaDeviceSynchronize();

		/*
		// Grid composition

		for (int iw=0; iw<num_w_planes; iw++)
		{
		@@ -128,44 +90,42 @@ void cuda_fft(
		fftwgrid[fftwindex2D].y = grid[fftwindex+1];
		}
		}
		*/
		//printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137].x);


		//mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)num_w_planesxaxis*yaxis, cudaMemcpyHostToDevice);
		//if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);}
		//printf("memcpy fftwgrid to fftwgrid_g\n");
		cudaDeviceSynchronize();

		mmm = cudaStreamSynchronize(stream);
		if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize ERROR %d !!!\n", mmm);}

		status = cufftXtMemcpy(plan, fftwgrid_g, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy htd fftwgrid_g ERROR %d !!!\n", status);}

		printf("do the transform for each w-plane\n");
		cudaDeviceSynchronize();


		clock_gettime(CLOCK_MONOTONIC, &begincu);
		startcu = clock();
		status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExecDescriptor ERROR %d !!!\n", status);}

		status=cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);

		endcu = clock();
		clock_gettime(CLOCK_MONOTONIC, &finishcu);
		cufft_exec_time = ((double) (endcu-startcu)) / CLOCKS_PER_SEC;
		printf("Time for cuFFT plan execution is %f sec\n", cufft_exec_time);
		if (status != CUFFT_SUCCESS) {printf("cufftexec ERROR %d\n",status);}
		mmm = cudaStreamSynchronize(stream);
		if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);}

		cudaDeviceSynchronize();


		//write_gridss<<<Nbl,num_w_planes>>>(num_w_planes,xaxis,yaxis, fftwgrid_g, gridss_g, grid_size_x, grid_size_y);
		//cudaDeviceSynchronize();
		status = cufftXtMemcpy(plan, fftwgrid_g2, fftwgrid_g, CUFFT_COPY_DEVICE_TO_DEVICE);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);}

		//mmm=cudaMemcpy(gridss, gridss_g, sizeof(double)2num_w_planesxaxisyaxis, cudaMemcpyDeviceToHost);

		mmm = cudaStreamSynchronize(stream);
		if (mmm != cudaSuccess) {printf("!!! cudaStreamSynchronize 2 ERROR %d !!!\n", mmm);}

		mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)2num_w_planesxaxisyaxis, cudaMemcpyDeviceToHost);
		if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);}
		cudaDeviceSynchronize();

		status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_g2, CUFFT_COPY_DEVICE_TO_HOST);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dth fftwgrid ERROR %d !!!\n", status);}


		for (int iw=0; iw<num_w_planes; iw++)
		{
		// save the transformed w-plane
		for (int iv=0; iv<yaxis; iv++)
		{
		for (int iu=0; iu<xaxis; iu++)
		@@ -176,13 +136,23 @@ void cuda_fft(
		gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y;
		}
		}

		}

		cufftDestroy(plan);

		mmm = cudaFree(fftwgrid_g);
		mmm = cudaFree(grid_g);
		//mmm = cudaFree(gridss_g);
		#endif
		status = cufftXtFree(fftwgrid_g);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtFree ERROR %d !!!\n", status);}


		status = cufftXtFree(fftwgrid_g2);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftXtFree ERROR %d !!!\n", status);}


		status = cufftDestroy(plan);
		if (status != CUFFT_SUCCESS) {printf("!!! cufftDestroy fftwgrid ERROR %d !!!\n", status);}


		cudaStreamDestroy(stream);
		cudaDeviceSynchronize();

		#endif // __CUDACC__
		}

w-stacking-fftw.c

+53 −42

Original line number	Diff line number	Diff line
		@@ -8,6 +8,9 @@
		#include <mpi.h>
		#ifdef USE_FFTW
		#include <fftw3-mpi.h>
		/*#ifdef CUDA_FFT

		#endif*/
		#endif
		#endif
		#include <omp.h>
		@@ -53,6 +56,7 @@ int main(int argc, char * argv[])
		FILE * pFile1;
		FILE * pFilereal;
		FILE * pFileimg;

		// Global filename to be composed
		char filename[1000];

		@@ -80,6 +84,7 @@ int main(int argc, char * argv[])
		char srank[4];
		char timingfile[FILENAMELENGTH] = "timings.dat";


		// Visibilities related variables
		double * uu;
		double * vv;
		@@ -129,6 +134,8 @@ int main(int argc, char * argv[])


		// Initialize FITS image parameters

		#ifdef FITSIO
		fitsfile *fptreal;
		fitsfile *fptrimg;
		int status;
		@@ -137,15 +144,15 @@ int main(int argc, char * argv[])

		long naxis = 2;
		long naxes[2] = { grid_size_x, grid_size_y };

		#endif


		// Internal profiling parameters
		clock_t start, end, start0, startk, endk, startis, endis;
		double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time;
		clock_t start, end, start0, startk, endk, startis, endis, startcu, endcu;
		double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, cufft_time, phase_time, image_time, mpifftw_time, timecufftmp;
		double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, cufft_time1, phase_time1, image_time1;
		double writetime, writetime1;
		struct timespec begin, finish, begin0, begink, finishk, beginis, finishis;
		struct timespec begin, finish, begin0, begink, finishk, beginis, finishis, begincu, finishcu;
		double elapsed;

		// Number of sectors in which the mesh is divided along the v direction
		@@ -178,8 +185,16 @@ int main(int argc, char * argv[])
		MPI_Comm_rank(MPI_COMM_WORLD, &rank);
		MPI_Comm_size(MPI_COMM_WORLD, &size);
		if(rank == 0)printf("Running with %d MPI tasks\n",size);

		MPI_Barrier(MPI_COMM_WORLD);

		#ifdef USE_FFTW
		#ifndef CUDA_FFT
		#ifdef CUDA_FFT
		int ndevices;
		cudaGetDeviceCount(&ndevices);
		cudaSetDevice(rank % ndevices);
		printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices);
		#else
		// Initialize FFTW environment
		fftw_mpi_init();
		#endif
		@@ -789,22 +804,19 @@ if(rank == 0){

		#ifdef USE_FFTW


		#ifdef CUDA_FFT

		// FFT transform the data using cuFFT
		if(rank==0)printf("PERFORMING CUDA FFT\n");
		clock_gettime(CLOCK_MONOTONIC, &begin);
		start = clock();
		/*
		for (int isector=0; isector<size; isector++)
		{

		#ifdef USE_MPI
		MPI_Barrier(MPI_COMM_WORLD);
		#endif
		if(isector == rank)
		{
		*/

		clock_gettime(CLOCK_MONOTONIC, &begincu);
		startcu = clock();


		cuda_fft(
		num_w_planes,
		grid_size_x,
		@@ -812,22 +824,16 @@ if(rank == 0){
		xaxis,
		yaxis,
		grid,
		gridss);

		//}
		//}





		end = clock();
		clock_gettime(CLOCK_MONOTONIC, &finish);
		cufft_time = ((double) (end - start)) / CLOCKS_PER_SEC;
		cufft_time1 = (finish.tv_sec - begin.tv_sec);
		cufft_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
		gridss,
		MPI_COMM_WORLD);

		#ifdef USE_MPI
		MPI_Barrier(MPI_COMM_WORLD);
		#endif

		endcu = clock();
		clock_gettime(CLOCK_MONOTONIC, &finishcu);
		timecufftmp = ((double) (endcu-startcu)) / CLOCKS_PER_SEC;


		#else
		@@ -883,7 +889,7 @@ if(rank == 0){
		endis = clock();
		clock_gettime(CLOCK_MONOTONIC, &finishis);
		mpifftw_time = ((double) (endis-startis)) / CLOCKS_PER_SEC;
		printf("Time for FFTW MPI is %f sec\n", mpifftw_time);
		printf("Time for FFTW MPI plan execution is %f sec\n", mpifftw_time);



		@@ -1044,9 +1050,14 @@ if(rank == 0){
		#ifdef USE_MPI
		MPI_Barrier(MPI_COMM_WORLD);
		#endif

		if(rank == 0)printf("WRITING IMAGE\n");

		#ifdef FITSIO
		long * fpixel = (long ) malloc(sizeof(long)naxis);
		long * lpixel = (long ) malloc(sizeof(long)naxis);
		#endif


		#ifdef USE_MPI
		MPI_Barrier(MPI_COMM_WORLD);
		@@ -1159,7 +1170,7 @@ if(rank == 0){
		printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time);
		#ifdef USE_FFTW
		#ifdef CUDA_FFT
		printf("cuFFT time: %f sec\n",cufft_time);
		printf("cuFFTMp complete execution time: %f sec\n",timecufftmp);
		printf("Phase time: %f sec\n",phase_time);
		#else
		printf("FFTW time: %f sec\n", fftw_time);
		@@ -1177,7 +1188,7 @@ if(rank == 0){
		printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1);
		#ifdef USE_FFTW
		#ifdef CUDA_FFT
		printf("PcuFFT time: %f sec\n", cufft_time1);
		// printf("PcuFFT time: %f sec\n", cufft_time1);
		printf("PPhase time: %f sec\n", phase_time1);
		#else
		printf("PFFTW time: %f sec\n",fftw_time1);

w-stacking.h

+3 −4

Original line number	Diff line number	Diff line
		@@ -6,9 +6,7 @@
		#define PI 3.14159265359
		#define REAL_TYPE double

		//#include <cufft.h>
		//#include <cufftw.h>

		#include <mpi.h>



		@@ -70,6 +68,7 @@ void cuda_fft(
		int,
		int,
		double*,
		double*);
		double*,
		MPI_Comm);

		#endif