Loading cuda_fft.cu 0 → 100644 +122 −0 Original line number Original line Diff line number Diff line #include <math.h> #include <stdlib.h> #include <stdio.h> #include <cufft.h> #include <cufftw.h> #include "w-stacking.h" #include <cuda_runtime.h> #include <complex.h> #include "cuComplex.h" /* #ifdef __CUDACC__ double __device__ #else double #endif */ //__global__ void write_grid void cuda_fft( int num_w_planes, int grid_size_x, int grid_size_y, int xaxis, int yaxis, double * grid, double * gridss) { #ifdef __CUDACC__ cufftDoubleComplex *fftwgrid; cufftDoubleComplex *fftwgrid_g; cudaError_t mmm; cufftResult_t status; // Define the CUDA set up int Nth = NTHREADS; long Nbl = (long)(num_w_planes*xaxis*yaxis/Nth) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; //long Nvis = num_points*freq_per_chan*polarizations; ///printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y); mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y); if (mmm != cudaSuccess) {printf("cudaMalloc error %d\n",mmm);} //mmm=cudaMemset(fftwgrid_g, 0.0, 2*grid_size_x*grid_size_y*sizeof(cufftDoubleComplex)); printf("malloc fftwgrid_g\n"); cufftHandle plan; cufftCreate(&plan); cufftPlan1d(&plan, grid_size_x*grid_size_y, CUFFT_Z2Z, 1); printf("plan creation\n"); double norm = 1.0/(double)(grid_size_x*grid_size_y); long fftwindex = 0; long fftwindex2D = 0; for (int iw=0; iw<num_w_planes; iw++) { printf("select the w-plane to transform\n"); for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137].x); mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);} printf("memcpy fftwgrid to fftwgrid_g\n"); printf("do the transform for each w-plane\n"); status=cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) {printf("cufftexec error %d\n",status);} cudaDeviceSynchronize(); mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);} printf("memcpy fftwgrid_g to fftwgrid\n"); //printf("fftwgrid[1260137].x post cuFFT %f\n", fftwgrid[1260137].x); // save the transformed w-plane for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } } } cufftDestroy(plan); mmm = cudaFree(fftwgrid_g); #endif } Loading
cuda_fft.cu 0 → 100644 +122 −0 Original line number Original line Diff line number Diff line #include <math.h> #include <stdlib.h> #include <stdio.h> #include <cufft.h> #include <cufftw.h> #include "w-stacking.h" #include <cuda_runtime.h> #include <complex.h> #include "cuComplex.h" /* #ifdef __CUDACC__ double __device__ #else double #endif */ //__global__ void write_grid void cuda_fft( int num_w_planes, int grid_size_x, int grid_size_y, int xaxis, int yaxis, double * grid, double * gridss) { #ifdef __CUDACC__ cufftDoubleComplex *fftwgrid; cufftDoubleComplex *fftwgrid_g; cudaError_t mmm; cufftResult_t status; // Define the CUDA set up int Nth = NTHREADS; long Nbl = (long)(num_w_planes*xaxis*yaxis/Nth) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; //long Nvis = num_points*freq_per_chan*polarizations; ///printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); fftwgrid =(cufftDoubleComplex*) malloc(sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y); mmm=cudaMalloc(&fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y); if (mmm != cudaSuccess) {printf("cudaMalloc error %d\n",mmm);} //mmm=cudaMemset(fftwgrid_g, 0.0, 2*grid_size_x*grid_size_y*sizeof(cufftDoubleComplex)); printf("malloc fftwgrid_g\n"); cufftHandle plan; cufftCreate(&plan); cufftPlan1d(&plan, grid_size_x*grid_size_y, CUFFT_Z2Z, 1); printf("plan creation\n"); double norm = 1.0/(double)(grid_size_x*grid_size_y); long fftwindex = 0; long fftwindex2D = 0; for (int iw=0; iw<num_w_planes; iw++) { printf("select the w-plane to transform\n"); for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); fftwgrid[fftwindex2D].x = grid[fftwindex]; fftwgrid[fftwindex2D].y = grid[fftwindex+1]; } } //printf("fftwgrid[1260137].x pre cuFFT %f\n", fftwgrid[1260137].x); mmm=cudaMemcpy(fftwgrid_g, fftwgrid, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);} printf("memcpy fftwgrid to fftwgrid_g\n"); printf("do the transform for each w-plane\n"); status=cufftExecZ2Z(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) {printf("cufftexec error %d\n",status);} cudaDeviceSynchronize(); mmm=cudaMemcpy(fftwgrid, fftwgrid_g, sizeof(cufftDoubleComplex)*2*grid_size_x*grid_size_y, cudaMemcpyDeviceToHost); if (mmm != cudaSuccess) {printf("cudaMemcpy error %d\n",mmm);} printf("memcpy fftwgrid_g to fftwgrid\n"); //printf("fftwgrid[1260137].x post cuFFT %f\n", fftwgrid[1260137].x); // save the transformed w-plane for (int iv=0; iv<yaxis; iv++) { for (int iu=0; iu<xaxis; iu++) { fftwindex2D = iu + iv*xaxis; fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); gridss[fftwindex] = norm*fftwgrid[fftwindex2D].x; gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D].y; } } } cufftDestroy(plan); mmm = cudaFree(fftwgrid_g); #endif }