Loading singlenode_cuda_fft.cu 0 → 100644 +318 −0 Original line number Diff line number Diff line #include <math.h> #include <stdlib.h> #include <stdio.h> #ifndef MULTI_GPU #include <cufft.h> #include <cufftw.h> #else #include <cufftXt.h> #endif #include "w-stacking.h" #include <cuda_runtime.h> #include <complex.h> #include "cuComplex.h" /*************************************************************************/ // CUDA KERNELS FOR fftwgrid /* #ifdef __CUDACC__ __global__ void write_grid( int num_w_planes, int xaxis, int yaxis, cufftDoubleComplex * fftwgrid, double * grid) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } #endif */ /* #ifdef __CUDACC__ __global__ void write_gridss( int num_w_planes, int xaxis, int yaxis, cufftDoubleComplex * fftwgrid, double * gridss, int grid_size_x, int grid_size_y) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); double norm = 1.0/(double)(grid_size_x*grid_size_y); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } #endif */ /*************************************************************************/ void cuda_fft( int num_w_planes, int grid_size_x, int grid_size_y, int xaxis, int yaxis, double * grid, double * gridss) { #ifdef __CUDACC__ cudaError_t mmm; cufftResult_t status; clock_t startcu, endcu; double cufft_exec_time; struct timespec begincu, finishcu; // Define CUDA setup long Nbl = (long)(xaxis+yaxis*xaxis); int Nth = num_w_planes; if(NWORKERS == 1) {Nbl = 1; Nth = 1;} #ifndef MULTI_GPU printf("Performing single GPU cuFFT\n"); cufftDoubleComplex *fftwgrid; cufftDoubleComplex *fftwgrid_g; //double * grid_g; //double * gridss_g; fftwgrid = (cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); mmm = cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); //mmm = cudaMalloc(&grid_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); //mmm = cudaMalloc(&gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); //mmm = cudaMemcpy(grid_g, grid, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy grid ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); cufftHandle plan; cufftCreate(&plan); cufftPlan1d(&plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1); double norm = 1.0/(double)(grid_size_x*grid_size_y); long fftwindex = 0; long fftwindex2D = 0; for (int iw=0; iw<num_w_planes; iw++) { printf("select the %d w-plane to transform\n", iw); for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } } mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);} cudaDeviceSynchronize(); /* // CUDA kernel grid composition write_grid<<<Nbl, Nth>>>(num_w_planes, xaxis, yaxis, fftwgrid_g, grid_g); cudaDeviceSynchronize(); */ // cuFFT plan execution clock_gettime(CLOCK_MONOTONIC, &begincu); startcu = clock(); status = cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); cudaDeviceSynchronize(); endcu = clock(); clock_gettime(CLOCK_MONOTONIC, &finishcu); cufft_exec_time = ((double) (endcu - startcu)) / CLOCKS_PER_SEC; printf("Time for cuFFT plan execution is %f sec\n", cufft_exec_time); if (status != CUFFT_SUCCESS) {printf("!!! cufftExec ERROR %d !!!\n", status);} // CUDA kernel grid decomposition /* write_gridss<<<Nbl, Nth>>>(num_w_planes, xaxis, yaxis, fftwgrid_g, gridss_g, grid_size_x, grid_size_y); cudaDeviceSynchronize(); mmm = cudaMemcpy(gridss, gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy gridss ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); */ mmm = cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy fftwgrid ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); for (int iw=0; iw<num_w_planes; iw++) { for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } } } cufftDestroy(plan); mmm = cudaFree(fftwgrid_g); //mmm = cudaFree(grid_g); //mmm = cudaFree(gridss_g); cudaDeviceSynchronize(); #else printf("Performing multiple GPU cuFFT\n"); cufftDoubleComplex *fftwgrid; fftwgrid = (cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); double norm = 1.0/(double)(grid_size_x*grid_size_y); long fftwindex2D = 0; long fftwindex = 0; for (int iw=0; iw<num_w_planes; iw++) { for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } } // Specify how many GPUs to be used and their Ids const int nDevices = 4; int deviceIds[nDevices] = {0, 1, 2, 3}; size_t workspace_sizes[nDevices]; cudaLibXtDesc *fftwgrid_multig; cufftHandle plan; cufftCreate(&plan); #if CUFFT_VERSION >= 10400 cudaStream_t stream{}; cudaStreamCreate(&stream); cufftSetStream(plan, stream); #endif status = cufftXtSetGPUs(plan, nDevices, deviceIds); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtSetGPUs ERROR %d !!!\n", status);} status = cufftMakePlan1d(plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1, workspace_sizes); if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePaln1d ERROR %d !!!\n", status);} cudaDeviceSynchronize(); status = cufftXtMalloc(plan, &fftwgrid_multig, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! cuufftXtMalloc ERROR %d !!!\n", status);} cudaDeviceSynchronize(); status = cufftXtMemcpy(plan, fftwgrid_multig, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy fftwgrid_multig ERROR %d !!!\n", status);} cudaDeviceSynchronize(); clock_gettime(CLOCK_MONOTONIC, &begincu); startcu = clock(); status = cufftXtExecDescriptorZ2Z(plan, fftwgrid_multig, fftwgrid_multig, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExec ERROR %d !!!\n", status);} cudaDeviceSynchronize(); endcu = clock(); clock_gettime(CLOCK_MONOTONIC, &finishcu); cufft_exec_time = ((double) (endcu - startcu)) / CLOCKS_PER_SEC; printf("Time for multiGPU cuFFT plan execution is %f sec\n", cufft_exec_time); status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_multig, CUFFT_COPY_DEVICE_TO_HOST); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy fftwgrid ERROR %d !!!\n", status);} cudaDeviceSynchronize(); for (int iw=0; iw<num_w_planes; iw++) { for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } } } #if CUFFT_VERSION >= 10400 CUDA_RT_CALL(cudaStreamDestroy(stream)); #endif cufftXtFree(fftwgrid_multig); cufftDestroy(plan); #endif // MULTI_GPU #endif // __CUDACC__ } Loading
singlenode_cuda_fft.cu 0 → 100644 +318 −0 Original line number Diff line number Diff line #include <math.h> #include <stdlib.h> #include <stdio.h> #ifndef MULTI_GPU #include <cufft.h> #include <cufftw.h> #else #include <cufftXt.h> #endif #include "w-stacking.h" #include <cuda_runtime.h> #include <complex.h> #include "cuComplex.h" /*************************************************************************/ // CUDA KERNELS FOR fftwgrid /* #ifdef __CUDACC__ __global__ void write_grid( int num_w_planes, int xaxis, int yaxis, cufftDoubleComplex * fftwgrid, double * grid) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } #endif */ /* #ifdef __CUDACC__ __global__ void write_gridss( int num_w_planes, int xaxis, int yaxis, cufftDoubleComplex * fftwgrid, double * gridss, int grid_size_x, int grid_size_y) { long fftwindex2D = blockIdx.x; long fftwindex = 2*(blockIdx.x+threadIdx.x*xaxis*yaxis); double norm = 1.0/(double)(grid_size_x*grid_size_y); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } #endif */ /*************************************************************************/ void cuda_fft( int num_w_planes, int grid_size_x, int grid_size_y, int xaxis, int yaxis, double * grid, double * gridss) { #ifdef __CUDACC__ cudaError_t mmm; cufftResult_t status; clock_t startcu, endcu; double cufft_exec_time; struct timespec begincu, finishcu; // Define CUDA setup long Nbl = (long)(xaxis+yaxis*xaxis); int Nth = num_w_planes; if(NWORKERS == 1) {Nbl = 1; Nth = 1;} #ifndef MULTI_GPU printf("Performing single GPU cuFFT\n"); cufftDoubleComplex *fftwgrid; cufftDoubleComplex *fftwgrid_g; //double * grid_g; //double * gridss_g; fftwgrid = (cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); mmm = cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); //mmm = cudaMalloc(&grid_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); //mmm = cudaMalloc(&gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis); //mmm = cudaMemcpy(grid_g, grid, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy grid ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); cufftHandle plan; cufftCreate(&plan); cufftPlan1d(&plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1); double norm = 1.0/(double)(grid_size_x*grid_size_y); long fftwindex = 0; long fftwindex2D = 0; for (int iw=0; iw<num_w_planes; iw++) { printf("select the %d w-plane to transform\n", iw); for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } } mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("cudaMemcpy ERROR %d\n",mmm);} cudaDeviceSynchronize(); /* // CUDA kernel grid composition write_grid<<<Nbl, Nth>>>(num_w_planes, xaxis, yaxis, fftwgrid_g, grid_g); cudaDeviceSynchronize(); */ // cuFFT plan execution clock_gettime(CLOCK_MONOTONIC, &begincu); startcu = clock(); status = cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); cudaDeviceSynchronize(); endcu = clock(); clock_gettime(CLOCK_MONOTONIC, &finishcu); cufft_exec_time = ((double) (endcu - startcu)) / CLOCKS_PER_SEC; printf("Time for cuFFT plan execution is %f sec\n", cufft_exec_time); if (status != CUFFT_SUCCESS) {printf("!!! cufftExec ERROR %d !!!\n", status);} // CUDA kernel grid decomposition /* write_gridss<<<Nbl, Nth>>>(num_w_planes, xaxis, yaxis, fftwgrid_g, gridss_g, grid_size_x, grid_size_y); cudaDeviceSynchronize(); mmm = cudaMemcpy(gridss, gridss_g, sizeof(double)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy gridss ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); */ mmm = cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("!!! cudaMemcpy fftwgrid ERROR %d !!!\n", mmm);} cudaDeviceSynchronize(); for (int iw=0; iw<num_w_planes; iw++) { for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } } } cufftDestroy(plan); mmm = cudaFree(fftwgrid_g); //mmm = cudaFree(grid_g); //mmm = cudaFree(gridss_g); cudaDeviceSynchronize(); #else printf("Performing multiple GPU cuFFT\n"); cufftDoubleComplex *fftwgrid; fftwgrid = (cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*num_w_planes*xaxis*yaxis); double norm = 1.0/(double)(grid_size_x*grid_size_y); long fftwindex2D = 0; long fftwindex = 0; for (int iw=0; iw<num_w_planes; iw++) { for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } } // Specify how many GPUs to be used and their Ids const int nDevices = 4; int deviceIds[nDevices] = {0, 1, 2, 3}; size_t workspace_sizes[nDevices]; cudaLibXtDesc *fftwgrid_multig; cufftHandle plan; cufftCreate(&plan); #if CUFFT_VERSION >= 10400 cudaStream_t stream{}; cudaStreamCreate(&stream); cufftSetStream(plan, stream); #endif status = cufftXtSetGPUs(plan, nDevices, deviceIds); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtSetGPUs ERROR %d !!!\n", status);} status = cufftMakePlan1d(plan, num_w_planes*xaxis*yaxis, CUFFT_Z2Z, 1, workspace_sizes); if (status != CUFFT_SUCCESS) {printf("!!! cufftMakePaln1d ERROR %d !!!\n", status);} cudaDeviceSynchronize(); status = cufftXtMalloc(plan, &fftwgrid_multig, CUFFT_XT_FORMAT_INPLACE); if (status != CUFFT_SUCCESS) {printf("!!! cuufftXtMalloc ERROR %d !!!\n", status);} cudaDeviceSynchronize(); status = cufftXtMemcpy(plan, fftwgrid_multig, fftwgrid, CUFFT_COPY_HOST_TO_DEVICE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy fftwgrid_multig ERROR %d !!!\n", status);} cudaDeviceSynchronize(); clock_gettime(CLOCK_MONOTONIC, &begincu); startcu = clock(); status = cufftXtExecDescriptorZ2Z(plan, fftwgrid_multig, fftwgrid_multig, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtExec ERROR %d !!!\n", status);} cudaDeviceSynchronize(); endcu = clock(); clock_gettime(CLOCK_MONOTONIC, &finishcu); cufft_exec_time = ((double) (endcu - startcu)) / CLOCKS_PER_SEC; printf("Time for multiGPU cuFFT plan execution is %f sec\n", cufft_exec_time); status = cufftXtMemcpy(plan, fftwgrid, fftwgrid_multig, CUFFT_COPY_DEVICE_TO_HOST); if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy fftwgrid ERROR %d !!!\n", status);} cudaDeviceSynchronize(); for (int iw=0; iw<num_w_planes; iw++) { for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } } } #if CUFFT_VERSION >= 10400 CUDA_RT_CALL(cudaStreamDestroy(stream)); #endif cufftXtFree(fftwgrid_multig); cufftDestroy(plan); #endif // MULTI_GPU #endif // __CUDACC__ }