Loading fourier_transform.c +71 −1 Original line number Diff line number Diff line Loading @@ -220,8 +220,31 @@ void write_fftw_data(){ #ifdef WRITE_IMAGE double start_image = CPU_TIME_wt; if(rank == 0) { #ifdef FITSIO printf("REMOVING RESIDUAL FITS FILE\n"); remove(testfitsreal); remove(testfitsimag); printf("FITS CREATION\n"); status = 0; fits_create_file(&fptrimg, testfitsimag, &status); fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status); fits_close_file(fptrimg, &status); status = 0; fits_create_file(&fptreal, testfitsreal, &status); fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status); fits_close_file(fptreal, &status); #endif file.pFilereal = fopen (out.fftfile2,"wb"); file.pFileimg = fopen (out.fftfile3,"wb"); fclose(file.pFilereal); Loading @@ -231,6 +254,31 @@ void write_fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); if(rank == 0)printf("WRITING IMAGE\n"); #ifdef FITSIO uint * fpixel = (uint *) malloc(sizeof(uint)*naxis); uint * lpixel = (uint *) malloc(sizeof(uint)*naxis); #endif #ifdef FITSIO fpixel[0] = 1; fpixel[1] = rank*yaxis+1; lpixel[0] = xaxis; lpixel[1] = (rank+1)*yaxis; status = 0; fits_open_image(&fptreal, testfitsreal, READWRITE, &status); fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status); fits_close_file(fptreal, &status); status = 0; fits_open_image(&fptrimg, testfitsimag, READWRITE, &status); fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status); fits_close_file(fptrimg, &status); #endif //FITSIO for (int isector=0; isector<size; isector++) { Loading @@ -238,7 +286,28 @@ void write_fftw_data(){ if(isector == rank) { printf("%d writing\n",isector); #ifdef FITSIO fpixel[0] = 1; fpixel[1] = isector*yaxis+1; lpixel[0] = xaxis; lpixel[1] = (isector+1)*yaxis; status = 0; fits_open_image(&fptreal, testfitsreal, READWRITE, &status); fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status); fits_close_file(fptreal, &status); status = 0; fits_open_image(&fptrimg, testfitsimag, 
READWRITE, &status); fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status); fits_close_file(fptrimg, &status); #endif //FITSIO file.pFilereal = fopen (out.fftfile2,"ab"); file.pFileimg = fopen (out.fftfile3,"ab"); Loading @@ -256,6 +325,7 @@ void write_fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); timing_wt.write += CPU_TIME_wt - start_image; #endif //WRITE_IMAGE Loading main.c +12 −0 Original line number Diff line number Diff line Loading @@ -109,6 +109,18 @@ int main(int argc, char * argv[]) } #endif #ifdef FITSIO fitsfile *fptreal; fitsfile *fptrimg; int status; char testfitsreal[NAME_LEN] = "parallel_real.fits"; char testfitsimag[NAME_LEN] = "parallel_img.fits"; uint naxis = 2; uint naxes[2] = { grid_size_x, grid_size_y }; #endif /* Reading Parameter file */ read_parameter_file(in.paramfile); Loading result.c +14 −3 Original line number Diff line number Diff line Loading @@ -26,14 +26,25 @@ void write_result( void ) printf("%40s time : %g sec\n", "Phase", timing_wt.phase); #endif //USE_FFTW #if defined(WRITE_IMAGE) printf("%40s time : %g sec\n", "Image writing", timing_wt.write); #endif printf("%40s time : %g sec\n", "TOT", timing_wt.total); file.pFile = fopen (out.timingfile, "w"); #if defined(USE_FFTW) #if !defined(CUFFTMP) fprintf(file.pFile, "%g %g %g %g %g %g %g\n", timing_wt.setup, timing_wt.kernel, timing_wt.compose, timing_wt.reduce, timing_wt.fftw, timing_wt.phase, timing_wt.total); timing_wt.reduce, timing_wt.fftw, timing_wt.phase, timing_wt.write, timing_wt.total); #else fprintf(file.pFile, "%g %g %g %g %g %g %g\n", timing_wt.setup, timing_wt.kernel, timing_wt.compose, timing_wt.reduce, timing_wt.cufftmp, timing_wt.phase, timing_wt.write, timing_wt.total); #endif fclose(file.pFile); } Loading w-stacking_omp.v2.cdeleted 100644 → 0 +0 −267 Original line number Diff line number Diff line #include <omp.h> #include "w-stacking_omp.h" #include <math.h> #include <stdlib.h> #include <stdio.h> #ifdef NVIDIA #include <cuda_runtime.h> #endif 
#ifdef ACCOMP #pragma omp declare target #endif double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) { double conv_weight; conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); return conv_weight; } #ifdef ACCOMP #pragma omp end declare target #endif //The function has been slightly modified in order to include the ranks of the tasks calling the GPUs [GL] void wstack( int num_w_planes, unsigned int num_points, unsigned int freq_per_chan, unsigned int polarizations, double* uu, double* vv, double* ww, float* vis_real, float* vis_img, float* weight, double dx, double dw, int w_support, int grid_size_x, int grid_size_y, double* grid, int num_threads, int rank) { //unsigned int index; unsigned int visindex; // initialize the convolution kernel // gaussian: int KernelLen = (w_support-1)/2; double std = 1.0; double std22 = 1.0/(2.0*std*std); double norm = std22/PI; // Loop over visibilities. #ifdef _OPENMP omp_set_num_threads(num_threads); #endif #ifdef ACCOMP unsigned int Nvis = num_points*freq_per_chan*polarizations; unsigned int gpu_weight_dim = Nvis/freq_per_chan; unsigned int gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y; #pragma omp target teams distribute parallel for private(visindex) \ map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points],vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim]) \ map(tofrom: grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices()) //Works also if Ntasks > NGPUs [GL] #endif for (unsigned int i = 0; i < num_points; i++) { #ifdef _OPENMP //int tid; //tid = omp_get_thread_num(); //printf("%d\n",tid); #endif visindex = i*freq_per_chan*polarizations; double sum = 0.0; int j, k; //if (i%1000 == 0)printf("%ld\n",i); /* Convert UV coordinates to grid coordinates. 
*/ double pos_u = uu[i] / dx; double pos_v = vv[i] / dx; double ww_i = ww[i] / dw; //Renormalization of ww_i ww_i = ww_i/(1 + num_w_planes); int grid_w = (int)ww_i; int grid_u = (int)pos_u; int grid_v = (int)pos_v; // check the boundaries unsigned int jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; unsigned int jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; unsigned int kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; unsigned int kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); unsigned int buffer_size = 2*(kmax-kmin+1)*(jmax-jmin+1); double grid_buffer[ buffer_size]; // Convolve this point onto the grid. for (k = kmin; k <= kmax; k++) { double v_dist = (double)k+0.5 - pos_v; unsigned int buffer_base = k * buffer_size; for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; unsigned int ifine = visindex; // DAV: the following two loops are performend by each thread separately: no problems of race conditions for (unsigned int ifreq=0; ifreq<freq_per_chan; ifreq++) { unsigned int iweight = visindex/freq_per_chan; for (unsigned int ipol=0; ipol<polarizations; ipol++) { if (!isnan(vis_real[ifine])) { //printf("%f %ld\n",weight[iweight],iweight); add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations); } ifine++; iweight++; } } // DAV: this is the critical call in terms of correctness of the results and of performance grid_buffer[buffer_base + j*2] += add_term_real; 
grid_buffer[buffer_base + j*2+1] += add_term_img; } for (k = kmin; k <= kmax; k++) { unsigned int buffer_base = k * buffer_size; for (j = jmin; j <= jmax; j++) { unsigned int iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); #pragma omp atomic grid[iKer] += grid_buffer[buffer_base + j*2]; #pragma omp atomic grid[iKer+1] += grid_buffer[buffer_base + j*2+1];} } } } #ifdef ACCOMP #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim],grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices()) #endif //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); } #ifdef NVIDIA #define CUDAErrorCheck(funcall) \ do { \ cudaError_t ierr = funcall; \ if (cudaSuccess != ierr) { \ fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \ __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \ exit(ierr); \ } \ } while (0) static inline int _corePerSM(int major, int minor) /** * @brief Give the number of CUDA cores per streaming multiprocessor (SM). * * The number of CUDA cores per SM is determined by the compute capability. * * @param major Major revision number of the compute capability. * @param minor Minor revision number of the compute capability. * * @return The number of CUDA cores per SM. 
*/ { if (1 == major) { if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8; } if (2 == major) { if (0 == minor) return 32; if (1 == minor) return 48; } if (3 == major) { if (0 == minor || 5 == minor || 7 == minor) return 192; } if (5 == major) { if (0 == minor || 2 == minor) return 128; } if (6 == major) { if (0 == minor) return 64; if (1 == minor || 2 == minor) return 128; } if (7 == major) { if (0 == minor || 2 == minor || 5 == minor) return 64; } return -1; } void getGPUInfo(int iaccel) { int corePerSM; struct cudaDeviceProp dev; CUDAErrorCheck(cudaSetDevice(iaccel)); CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel)); corePerSM = _corePerSM(dev.major, dev.minor); printf("\n"); printf("============================================================\n"); printf("CUDA Device name : \"%s\"\n", dev.name); printf("------------------------------------------------------------\n"); printf("Comp. Capability : %d.%d\n", dev.major, dev.minor); printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f); printf("number of SMs : %d\n", dev.multiProcessorCount); printf("cores / SM : %d\n", corePerSM); printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount); printf("------------------------------------------------------------\n"); printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f); printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f); printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor); printf("------------------------------------------------------------\n"); printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor); printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock); printf("max dim. of block : (%d, %d, %d)\n", dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]); printf("max dim. 
of grid : (%d, %d, %d)\n", dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]); printf("warp size : %d\n", dev.warpSize); printf("============================================================\n"); int z = 0, x = 2; #pragma omp target map(to:x) map(tofrom:z) { z=x+100; } } #endif int test(int nnn) { int mmm; mmm = nnn+1; return mmm; } w-stacking_omp.v3.cdeleted 100644 → 0 +0 −281 Original line number Diff line number Diff line #include <omp.h> #include "w-stacking_omp.h" #include <math.h> #include <stdlib.h> #include <stdio.h> #ifdef NVIDIA #include <cuda_runtime.h> #endif #ifdef ACCOMP #pragma omp declare target #endif double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) { double conv_weight; conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); return conv_weight; } #ifdef ACCOMP #pragma omp end declare target #endif //The function has been slightly modified in order to include the ranks of the tasks calling the GPUs [GL] void wstack( int num_w_planes, unsigned int num_points, unsigned int freq_per_chan, unsigned int polarizations, double* uu, double* vv, double* ww, float* vis_real, float* vis_img, float* weight, double dx, double dw, int w_support, int grid_size_x, int grid_size_y, double* grid, int num_threads, int rank) { //unsigned int index; unsigned int visindex; // initialize the convolution kernel // gaussian: int KernelLen = (w_support-1)/2; double std = 1.0; double std22 = 1.0/(2.0*std*std); double norm = std22/PI; // Loop over visibilities. 
#ifdef _OPENMP omp_set_num_threads(num_threads); #endif #ifdef ACCOMP unsigned int Nvis = num_points*freq_per_chan*polarizations; unsigned int gpu_weight_dim = Nvis/freq_per_chan; unsigned int gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y; double *grid_buffer = (double*)malloc(gpu_grid_dim*sizeof(double)); /* //Checking for GPU memory unsigned int unsigned int int data_mem = num_points * (sizeof(uu) + sizeof(vv) + sizeof(ww)) + Nvis * (sizeof(vis_real) + sizeof(vis_img) ) + gpu_weight_dim * sizeof(weight); unsigned int unsigned int int grid_mem = gpu_grid_dim * sizeof(grid); printf("Rank %d, Original data: %lld, Grid array: %lld, Total: %lld\n", rank, data_mem, grid_mem, data_mem + grid_mem);fflush(stdout); */ #pragma omp target teams distribute parallel for private(visindex) \ map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim]) \ map(to:grid_buffer[0:gpu_grid_dim]) \ map(tofrom: grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices()) //Works also if Ntasks > NGPUs [GL] #endif for (unsigned int i = 0; i < num_points; i++) { #ifdef _OPENMP //int tid; //tid = omp_get_thread_num(); //printf("%d\n",tid); #endif visindex = i*freq_per_chan*polarizations; double sum = 0.0; int j, k; //if (i%1000 == 0)printf("%ld\n",i); /* Convert UV coordinates to grid coordinates. */ double pos_u = uu[i] / dx; double pos_v = vv[i] / dx; double ww_i = ww[i] / dw; //Renormalization of ww_i ww_i = ww_i/(1+num_w_planes); int grid_w = (int)ww_i; int grid_u = (int)pos_u; int grid_v = (int)pos_v; // check the boundaries unsigned int jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; unsigned int jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; unsigned int kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; unsigned int kmax = (grid_v < grid_size_y - KernelLen) ? 
grid_v + KernelLen : grid_size_y - 1; //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); unsigned int buffer_size = 2*(kmax-kmin+1)*(jmax-jmin+1); //double grid_buffer[ buffer_size]; // Convolve this point onto the grid. for (k = kmin; k <= kmax; k++) { double v_dist = (double)k+0.5 - pos_v; unsigned int buffer_base = k * buffer_size; for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; unsigned int ifine = visindex; // DAV: the following two loops are performend by each thread separately: no problems of race conditions for (unsigned int ifreq=0; ifreq<freq_per_chan; ifreq++) { unsigned int iweight = visindex/freq_per_chan; for (unsigned int ipol=0; ipol<polarizations; ipol++) { if (!isnan(vis_real[ifine])) { //printf("%f %ld\n",weight[iweight],iweight); add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations); } ifine++; iweight++; } } // DAV: this is the critical call in terms of correctness of the results and of performance grid_buffer[buffer_base + j*2] += add_term_real; grid_buffer[buffer_base + j*2+1] += add_term_img; } } for (k = kmin; k <= kmax; k++) { unsigned int buffer_base = k * buffer_size; for (j = jmin; j <= jmax; j++) { unsigned int iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); #pragma omp atomic grid[iKer] += grid_buffer[buffer_base + j*2]; #pragma omp atomic grid[iKer+1] += grid_buffer[buffer_base + j*2+1];} } } #ifdef ACCOMP #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], 
weight[0:gpu_weight_dim],grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices()) #endif /* for (int i=0; i<100; i++)printf("%f\n",grid[i]); */ } #ifdef NVIDIA #define CUDAErrorCheck(funcall) \ do { \ cudaError_t ierr = funcall; \ if (cudaSuccess != ierr) { \ fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \ __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \ exit(ierr); \ } \ } while (0) static inline int _corePerSM(int major, int minor) /** * @brief Give the number of CUDA cores per streaming multiprocessor (SM). * * The number of CUDA cores per SM is determined by the compute capability. * * @param major Major revision number of the compute capability. * @param minor Minor revision number of the compute capability. * * @return The number of CUDA cores per SM. */ { if (1 == major) { if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8; } if (2 == major) { if (0 == minor) return 32; if (1 == minor) return 48; } if (3 == major) { if (0 == minor || 5 == minor || 7 == minor) return 192; } if (5 == major) { if (0 == minor || 2 == minor) return 128; } if (6 == major) { if (0 == minor) return 64; if (1 == minor || 2 == minor) return 128; } if (7 == major) { if (0 == minor || 2 == minor || 5 == minor) return 64; } return -1; } void getGPUInfo(int iaccel) { int corePerSM; struct cudaDeviceProp dev; CUDAErrorCheck(cudaSetDevice(iaccel)); CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel)); corePerSM = _corePerSM(dev.major, dev.minor); printf("\n"); printf("============================================================\n"); printf("CUDA Device name : \"%s\"\n", dev.name); printf("------------------------------------------------------------\n"); printf("Comp. 
Capability : %d.%d\n", dev.major, dev.minor); printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f); printf("number of SMs : %d\n", dev.multiProcessorCount); printf("cores / SM : %d\n", corePerSM); printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount); printf("------------------------------------------------------------\n"); printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f); printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f); printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor); printf("------------------------------------------------------------\n"); printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor); printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock); printf("max dim. of block : (%d, %d, %d)\n", dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]); printf("max dim. of grid : (%d, %d, %d)\n", dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]); printf("warp size : %d\n", dev.warpSize); printf("============================================================\n"); int z = 0, x = 2; #pragma omp target map(to:x) map(tofrom:z) { z=x+100; } } #endif int test(int nnn) { int mmm; mmm = nnn+1; return mmm; } Loading
fourier_transform.c +71 −1 Original line number Diff line number Diff line Loading @@ -220,8 +220,31 @@ void write_fftw_data(){ #ifdef WRITE_IMAGE double start_image = CPU_TIME_wt; if(rank == 0) { #ifdef FITSIO printf("REMOVING RESIDUAL FITS FILE\n"); remove(testfitsreal); remove(testfitsimag); printf("FITS CREATION\n"); status = 0; fits_create_file(&fptrimg, testfitsimag, &status); fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status); fits_close_file(fptrimg, &status); status = 0; fits_create_file(&fptreal, testfitsreal, &status); fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status); fits_close_file(fptreal, &status); #endif file.pFilereal = fopen (out.fftfile2,"wb"); file.pFileimg = fopen (out.fftfile3,"wb"); fclose(file.pFilereal); Loading @@ -231,6 +254,31 @@ void write_fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); if(rank == 0)printf("WRITING IMAGE\n"); #ifdef FITSIO uint * fpixel = (uint *) malloc(sizeof(uint)*naxis); uint * lpixel = (uint *) malloc(sizeof(uint)*naxis); #endif #ifdef FITSIO fpixel[0] = 1; fpixel[1] = rank*yaxis+1; lpixel[0] = xaxis; lpixel[1] = (rank+1)*yaxis; status = 0; fits_open_image(&fptreal, testfitsreal, READWRITE, &status); fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status); fits_close_file(fptreal, &status); status = 0; fits_open_image(&fptrimg, testfitsimag, READWRITE, &status); fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status); fits_close_file(fptrimg, &status); #endif //FITSIO for (int isector=0; isector<size; isector++) { Loading @@ -238,7 +286,28 @@ void write_fftw_data(){ if(isector == rank) { printf("%d writing\n",isector); #ifdef FITSIO fpixel[0] = 1; fpixel[1] = isector*yaxis+1; lpixel[0] = xaxis; lpixel[1] = (isector+1)*yaxis; status = 0; fits_open_image(&fptreal, testfitsreal, READWRITE, &status); fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status); fits_close_file(fptreal, &status); status = 0; fits_open_image(&fptrimg, testfitsimag, READWRITE, 
&status); fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status); fits_close_file(fptrimg, &status); #endif //FITSIO file.pFilereal = fopen (out.fftfile2,"ab"); file.pFileimg = fopen (out.fftfile3,"ab"); Loading @@ -256,6 +325,7 @@ void write_fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); timing_wt.write += CPU_TIME_wt - start_image; #endif //WRITE_IMAGE Loading
main.c +12 −0 Original line number Diff line number Diff line Loading @@ -109,6 +109,18 @@ int main(int argc, char * argv[]) } #endif #ifdef FITSIO fitsfile *fptreal; fitsfile *fptrimg; int status; char testfitsreal[NAME_LEN] = "parallel_real.fits"; char testfitsimag[NAME_LEN] = "parallel_img.fits"; uint naxis = 2; uint naxes[2] = { grid_size_x, grid_size_y }; #endif /* Reading Parameter file */ read_parameter_file(in.paramfile); Loading
result.c +14 −3 Original line number Diff line number Diff line Loading @@ -26,14 +26,25 @@ void write_result( void ) printf("%40s time : %g sec\n", "Phase", timing_wt.phase); #endif //USE_FFTW #if defined(WRITE_IMAGE) printf("%40s time : %g sec\n", "Image writing", timing_wt.write); #endif printf("%40s time : %g sec\n", "TOT", timing_wt.total); file.pFile = fopen (out.timingfile, "w"); #if defined(USE_FFTW) #if !defined(CUFFTMP) fprintf(file.pFile, "%g %g %g %g %g %g %g\n", timing_wt.setup, timing_wt.kernel, timing_wt.compose, timing_wt.reduce, timing_wt.fftw, timing_wt.phase, timing_wt.total); timing_wt.reduce, timing_wt.fftw, timing_wt.phase, timing_wt.write, timing_wt.total); #else fprintf(file.pFile, "%g %g %g %g %g %g %g\n", timing_wt.setup, timing_wt.kernel, timing_wt.compose, timing_wt.reduce, timing_wt.cufftmp, timing_wt.phase, timing_wt.write, timing_wt.total); #endif fclose(file.pFile); } Loading
w-stacking_omp.v2.cdeleted 100644 → 0 +0 −267 Original line number Diff line number Diff line #include <omp.h> #include "w-stacking_omp.h" #include <math.h> #include <stdlib.h> #include <stdio.h> #ifdef NVIDIA #include <cuda_runtime.h> #endif #ifdef ACCOMP #pragma omp declare target #endif double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) { double conv_weight; conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); return conv_weight; } #ifdef ACCOMP #pragma omp end declare target #endif //The function has been slightly modified in order to include the ranks of the tasks calling the GPUs [GL] void wstack( int num_w_planes, unsigned int num_points, unsigned int freq_per_chan, unsigned int polarizations, double* uu, double* vv, double* ww, float* vis_real, float* vis_img, float* weight, double dx, double dw, int w_support, int grid_size_x, int grid_size_y, double* grid, int num_threads, int rank) { //unsigned int index; unsigned int visindex; // initialize the convolution kernel // gaussian: int KernelLen = (w_support-1)/2; double std = 1.0; double std22 = 1.0/(2.0*std*std); double norm = std22/PI; // Loop over visibilities. 
#ifdef _OPENMP omp_set_num_threads(num_threads); #endif #ifdef ACCOMP unsigned int Nvis = num_points*freq_per_chan*polarizations; unsigned int gpu_weight_dim = Nvis/freq_per_chan; unsigned int gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y; #pragma omp target teams distribute parallel for private(visindex) \ map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points],vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim]) \ map(tofrom: grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices()) //Works also if Ntasks > NGPUs [GL] #endif for (unsigned int i = 0; i < num_points; i++) { #ifdef _OPENMP //int tid; //tid = omp_get_thread_num(); //printf("%d\n",tid); #endif visindex = i*freq_per_chan*polarizations; double sum = 0.0; int j, k; //if (i%1000 == 0)printf("%ld\n",i); /* Convert UV coordinates to grid coordinates. */ double pos_u = uu[i] / dx; double pos_v = vv[i] / dx; double ww_i = ww[i] / dw; //Renormalization of ww_i ww_i = ww_i/(1 + num_w_planes); int grid_w = (int)ww_i; int grid_u = (int)pos_u; int grid_v = (int)pos_v; // check the boundaries unsigned int jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; unsigned int jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; unsigned int kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; unsigned int kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); unsigned int buffer_size = 2*(kmax-kmin+1)*(jmax-jmin+1); double grid_buffer[ buffer_size]; // Convolve this point onto the grid. 
for (k = kmin; k <= kmax; k++) { double v_dist = (double)k+0.5 - pos_v; unsigned int buffer_base = k * buffer_size; for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; unsigned int ifine = visindex; // DAV: the following two loops are performend by each thread separately: no problems of race conditions for (unsigned int ifreq=0; ifreq<freq_per_chan; ifreq++) { unsigned int iweight = visindex/freq_per_chan; for (unsigned int ipol=0; ipol<polarizations; ipol++) { if (!isnan(vis_real[ifine])) { //printf("%f %ld\n",weight[iweight],iweight); add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations); } ifine++; iweight++; } } // DAV: this is the critical call in terms of correctness of the results and of performance grid_buffer[buffer_base + j*2] += add_term_real; grid_buffer[buffer_base + j*2+1] += add_term_img; } for (k = kmin; k <= kmax; k++) { unsigned int buffer_base = k * buffer_size; for (j = jmin; j <= jmax; j++) { unsigned int iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); #pragma omp atomic grid[iKer] += grid_buffer[buffer_base + j*2]; #pragma omp atomic grid[iKer+1] += grid_buffer[buffer_base + j*2+1];} } } } #ifdef ACCOMP #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim],grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices()) #endif //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); } #ifdef NVIDIA #define CUDAErrorCheck(funcall) \ do { \ cudaError_t ierr = funcall; \ if (cudaSuccess != ierr) { \ 
fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \ __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \ exit(ierr); \ } \ } while (0) static inline int _corePerSM(int major, int minor) /** * @brief Give the number of CUDA cores per streaming multiprocessor (SM). * * The number of CUDA cores per SM is determined by the compute capability. * * @param major Major revision number of the compute capability. * @param minor Minor revision number of the compute capability. * * @return The number of CUDA cores per SM. */ { if (1 == major) { if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8; } if (2 == major) { if (0 == minor) return 32; if (1 == minor) return 48; } if (3 == major) { if (0 == minor || 5 == minor || 7 == minor) return 192; } if (5 == major) { if (0 == minor || 2 == minor) return 128; } if (6 == major) { if (0 == minor) return 64; if (1 == minor || 2 == minor) return 128; } if (7 == major) { if (0 == minor || 2 == minor || 5 == minor) return 64; } return -1; } void getGPUInfo(int iaccel) { int corePerSM; struct cudaDeviceProp dev; CUDAErrorCheck(cudaSetDevice(iaccel)); CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel)); corePerSM = _corePerSM(dev.major, dev.minor); printf("\n"); printf("============================================================\n"); printf("CUDA Device name : \"%s\"\n", dev.name); printf("------------------------------------------------------------\n"); printf("Comp. Capability : %d.%d\n", dev.major, dev.minor); printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f); printf("number of SMs : %d\n", dev.multiProcessorCount); printf("cores / SM : %d\n", corePerSM); printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount); printf("------------------------------------------------------------\n"); printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f); printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f); printf("32-bit reg. 
/ SM : %d\n", dev.regsPerMultiprocessor); printf("------------------------------------------------------------\n"); printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor); printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock); printf("max dim. of block : (%d, %d, %d)\n", dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]); printf("max dim. of grid : (%d, %d, %d)\n", dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]); printf("warp size : %d\n", dev.warpSize); printf("============================================================\n"); int z = 0, x = 2; #pragma omp target map(to:x) map(tofrom:z) { z=x+100; } } #endif int test(int nnn) { int mmm; mmm = nnn+1; return mmm; }
w-stacking_omp.v3.cdeleted 100644 → 0 +0 −281 Original line number Diff line number Diff line
#include <omp.h>
#include "w-stacking_omp.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef NVIDIA
#include <cuda_runtime.h>
#endif

/* gauss_kernel_norm is compiled for the device as well when ACCOMP
 * (OpenMP offload) builds are enabled. */
#ifdef ACCOMP
#pragma omp declare target
#endif
/* Gaussian convolution-kernel weight: norm * exp(-(du^2 + dv^2) * std22). */
double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
  double conv_weight;
  conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
  return conv_weight;
}
#ifdef ACCOMP
#pragma omp end declare target
#endif

//The function has been slightly modified in order to include the ranks of the tasks calling the GPUs [GL]
/*
 * wstack: convolve each visibility onto the 3D (u, v, w-plane) grid using a
 * Gaussian kernel of half-width KernelLen, accumulating interleaved
 * real/imag pairs into `grid`.  With ACCOMP defined the point loop is
 * offloaded via OpenMP target, picking device (rank % num_devices) so more
 * MPI tasks than GPUs still works.
 */
void wstack(
     int num_w_planes,
     unsigned int num_points,
     unsigned int freq_per_chan,
     unsigned int polarizations,
     double* uu,
     double* vv,
     double* ww,
     float* vis_real,
     float* vis_img,
     float* weight,
     double dx,
     double dw,
     int w_support,
     int grid_size_x,
     int grid_size_y,
     double* grid,
     int num_threads,
     int rank)
{
  //unsigned int index;
  unsigned int visindex;

  // initialize the convolution kernel
  // gaussian:
  int KernelLen = (w_support-1)/2;
  double std = 1.0;
  double std22 = 1.0/(2.0*std*std);
  double norm = std22/PI;

  // Loop over visibilities.
#ifdef _OPENMP
  omp_set_num_threads(num_threads);
#endif

#ifdef ACCOMP
  /* Flattened sizes of the device mappings below. */
  unsigned int Nvis = num_points*freq_per_chan*polarizations;
  unsigned int gpu_weight_dim = Nvis/freq_per_chan;
  unsigned int gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y;
  /* Host-side scratch sized like the full grid; mapped to the device below.
   * NOTE(review): this malloc is never freed in the visible code — possible
   * leak per call; confirm against the rest of the file. */
  double *grid_buffer = (double*)malloc(gpu_grid_dim*sizeof(double));
  /*
  //Checking for GPU memory
  unsigned int unsigned int int data_mem = num_points * (sizeof(uu) + sizeof(vv) + sizeof(ww)) + Nvis * (sizeof(vis_real) + sizeof(vis_img) ) + gpu_weight_dim * sizeof(weight);
  unsigned int unsigned int int grid_mem = gpu_grid_dim * sizeof(grid);
  printf("Rank %d, Original data: %lld, Grid array: %lld, Total: %lld\n", rank, data_mem, grid_mem, data_mem + grid_mem);fflush(stdout);
  */
#pragma omp target teams distribute parallel for private(visindex) \
  map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim]) \
  map(to:grid_buffer[0:gpu_grid_dim]) \
  map(tofrom: grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices()) //Works also if Ntasks > NGPUs [GL]
#endif
  for (unsigned int i = 0; i < num_points; i++)
    {
#ifdef _OPENMP
      //int tid;
      //tid = omp_get_thread_num();
      //printf("%d\n",tid);
#endif

      /* First fine channel of this visibility in the flattened vis arrays. */
      visindex = i*freq_per_chan*polarizations;
      double sum = 0.0;  /* NOTE(review): never used below — dead local? */
      int j, k;
      //if (i%1000 == 0)printf("%ld\n",i);

      /* Convert UV coordinates to grid coordinates. */
      double pos_u = uu[i] / dx;
      double pos_v = vv[i] / dx;
      double ww_i = ww[i] / dw;
      //Renormalization of ww_i
      ww_i = ww_i/(1+num_w_planes);
      int grid_w = (int)ww_i;
      int grid_u = (int)pos_u;
      int grid_v = (int)pos_v;

      // check the boundaries: clamp the kernel footprint to the grid edges
      unsigned int jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
      unsigned int jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
      unsigned int kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
      unsigned int kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
      //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);

      /* Interleaved (re,im) slots covered by this point's footprint. */
      unsigned int buffer_size = 2*(kmax-kmin+1)*(jmax-jmin+1);
      //double grid_buffer[ buffer_size];
      /* NOTE(review): without ACCOMP, grid_buffer is never declared (the VLA
       * above is commented out), so this path cannot compile — confirm the
       * intended build always defines ACCOMP. */

      // Convolve this point onto the grid.
      for (k = kmin; k <= kmax; k++)
        {
          double v_dist = (double)k+0.5 - pos_v;
          /* NOTE(review): buffer_base mixes the absolute row index k with the
           * per-point footprint size buffer_size; combined with the absolute
           * j*2 offset below this looks inconsistent with either buffer
           * layout — verify indexing/bounds before reuse. */
          unsigned int buffer_base = k * buffer_size;
          for (j = jmin; j <= jmax; j++)
            {
              double u_dist = (double)j+0.5 - pos_u;
              double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
              // Loops over frequencies and polarizations
              double add_term_real = 0.0;
              double add_term_img = 0.0;
              unsigned int ifine = visindex;
              // DAV: the following two loops are performend by each thread separately: no problems of race conditions
              for (unsigned int ifreq=0; ifreq<freq_per_chan; ifreq++)
                {
                  unsigned int iweight = visindex/freq_per_chan;
                  for (unsigned int ipol=0; ipol<polarizations; ipol++)
                    {
                      /* Skip flagged samples stored as NaN. */
                      if (!isnan(vis_real[ifine]))
                        {
                          //printf("%f %ld\n",weight[iweight],iweight);
                          add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
                          add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
                          //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
                        }
                      ifine++;
                      iweight++;
                    }
                }
              // DAV: this is the critical call in terms of correctness of the results and of performance
              grid_buffer[buffer_base + j*2] += add_term_real;
              grid_buffer[buffer_base + j*2+1] += add_term_img;
            }
        }
      /* Flush this point's accumulated footprint into the shared grid;
       * atomics guard against concurrent points hitting the same cell. */
      for (k = kmin; k <= kmax; k++)
        {
          unsigned int buffer_base = k * buffer_size;
          for (j = jmin; j <= jmax; j++)
            {
              unsigned int iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
#pragma omp atomic
              grid[iKer] += grid_buffer[buffer_base + j*2];
#pragma omp atomic
              grid[iKer+1] += grid_buffer[buffer_base + j*2+1];
            }
        }
    }
#ifdef ACCOMP
  /* Drop the device copies created by the target region above. */
#pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim],grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices())
#endif
  /* for (int i=0; i<100; i++)printf("%f\n",grid[i]); */
}

#ifdef NVIDIA
/* Abort with file/line context if a CUDA runtime call returns an error. */
#define CUDAErrorCheck(funcall) \
do { \
  cudaError_t ierr = funcall; \
  if (cudaSuccess != ierr) { \
    fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
            __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \
    exit(ierr); \
  } \
} while (0)

static inline int _corePerSM(int major, int minor)
/**
 * @brief Give the number of CUDA cores per streaming multiprocessor (SM).
 *
 * The number of CUDA cores per SM is determined by the compute capability.
 *
 * @param major Major revision number of the compute capability.
 * @param minor Minor revision number of the compute capability.
 *
 * @return The number of CUDA cores per SM.
 */
{
  if (1 == major) {
    if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8;
  }
  if (2 == major) {
    if (0 == minor) return 32;
    if (1 == minor) return 48;
  }
  if (3 == major) {
    if (0 == minor || 5 == minor || 7 == minor) return 192;
  }
  if (5 == major) {
    if (0 == minor || 2 == minor) return 128;
  }
  if (6 == major) {
    if (0 == minor) return 64;
    if (1 == minor || 2 == minor) return 128;
  }
  if (7 == major) {
    if (0 == minor || 2 == minor || 5 == minor) return 64;
  }
  /* NOTE(review): capabilities 8.x/9.x fall through to -1 here. */
  return -1;
}

/* Print a summary of CUDA device iaccel (capability, SMs/cores, memory,
 * launch limits), then run a tiny OpenMP target region as an offload
 * smoke test. */
void getGPUInfo(int iaccel)
{
  int corePerSM;
  struct cudaDeviceProp dev;
  CUDAErrorCheck(cudaSetDevice(iaccel));
  CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel));
  corePerSM = _corePerSM(dev.major, dev.minor);
  printf("\n");
  printf("============================================================\n");
  printf("CUDA Device name : \"%s\"\n", dev.name);
  printf("------------------------------------------------------------\n");
  printf("Comp. Capability : %d.%d\n", dev.major, dev.minor);
  printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f);
  printf("number of SMs : %d\n", dev.multiProcessorCount);
  printf("cores / SM : %d\n", corePerSM);
  printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount);
  printf("------------------------------------------------------------\n");
  printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f);
  printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f);
  printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor);
  printf("------------------------------------------------------------\n");
  printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor);
  printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock);
  printf("max dim. of block : (%d, %d, %d)\n", dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]);
  printf("max dim. of grid : (%d, %d, %d)\n", dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]);
  printf("warp size : %d\n", dev.warpSize);
  printf("============================================================\n");
  /* One-shot offload test: copies x in, brings z back. */
  int z = 0, x = 2;
#pragma omp target map(to:x) map(tofrom:z)
  {
    z=x+100;
  }
}
#endif

/* Trivial link/self-test helper: returns nnn + 1. */
int test(int nnn)
{
  int mmm;
  mmm = nnn+1;
  return mmm;
}