Loading gridding_nccl.cpp +4 −3 Original line number Diff line number Diff line Loading @@ -190,7 +190,6 @@ void gridding_data(){ printf("Processing sector %ld\n",isector); #endif start = CPU_TIME_wt; double *stacking_target_array; if ( size > 1 ) Loading @@ -198,6 +197,8 @@ void gridding_data(){ else stacking_target_array = grid; start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Nsec, Loading phase_correction.cu +4 −2 Original line number Diff line number Diff line Loading @@ -234,13 +234,15 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in } #else omp_set_default_device(rank % omp_get_num_devices()); #if !defined(__clang__) #pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) device(rank % omp_get_num_devices()) #pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) #else #pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) device(rank % omp_get_num_devices()) #pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) #endif for (int iw=0; iw<num_w_planes; iw++) Loading w-stacking.cu +9 −9 Original line number Diff line number Diff line Loading @@ -5,12 +5,9 @@ #include <math.h> #include <stdlib.h> #include <stdio.h> #include "errcodes.h" #ifdef __CUDACC__ #include "allvars_nccl.h" #else #include "allvars.h" #endif #include "proto.h" Loading Loading @@ -209,7 +206,7 @@ void wstack( int rank) { uint i; uint index; //uint index; uint visindex; // initialize the convolution kernel Loading @@ -220,13 +217,14 @@ void wstack( double std22 = 1.0/(2.0*std*std); double norm = std22/PI; double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel)); double overSamplingFactor = 1.0; int withSinc = 0; double alpha = 8.6; #ifdef GAUSS makeGaussKernel(convkernel,w_support,increaseprecision,std22); #endif #ifdef KAISERBESSEL double overSamplingFactor = 1.0; int withSinc = 0; double alpha = 8.6; makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc); #endif Loading Loading @@ -328,9 +326,9 @@ void wstack( #endif #ifdef ACCOMP omp_set_default_device(rank % omp_get_num_devices()); uint Nvis = num_points*freq_per_chan*polarizations; // #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) // #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #else #pragma omp parallel for private(visindex) #endif Loading Loading @@ -422,6 +420,8 @@ void wstack( } } #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan],grid[0:2*num_w_planes*grid_size_x*grid_size_y]) // End switch between CUDA and CPU versions #endif //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); Loading Loading
gridding_nccl.cpp +4 −3 Original line number Diff line number Diff line Loading @@ -190,7 +190,6 @@ void gridding_data(){ printf("Processing sector %ld\n",isector); #endif start = CPU_TIME_wt; double *stacking_target_array; if ( size > 1 ) Loading @@ -198,6 +197,8 @@ void gridding_data(){ else stacking_target_array = grid; start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Nsec, Loading
phase_correction.cu +4 −2 Original line number Diff line number Diff line Loading @@ -234,13 +234,15 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in } #else omp_set_default_device(rank % omp_get_num_devices()); #if !defined(__clang__) #pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) device(rank % omp_get_num_devices()) #pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) #else #pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) device(rank % omp_get_num_devices()) #pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) #endif for (int iw=0; iw<num_w_planes; iw++) Loading
w-stacking.cu +9 −9 Original line number Diff line number Diff line Loading @@ -5,12 +5,9 @@ #include <math.h> #include <stdlib.h> #include <stdio.h> #include "errcodes.h" #ifdef __CUDACC__ #include "allvars_nccl.h" #else #include "allvars.h" #endif #include "proto.h" Loading Loading @@ -209,7 +206,7 @@ void wstack( int rank) { uint i; uint index; //uint index; uint visindex; // initialize the convolution kernel Loading @@ -220,13 +217,14 @@ void wstack( double std22 = 1.0/(2.0*std*std); double norm = std22/PI; double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel)); double overSamplingFactor = 1.0; int withSinc = 0; double alpha = 8.6; #ifdef GAUSS makeGaussKernel(convkernel,w_support,increaseprecision,std22); #endif #ifdef KAISERBESSEL double overSamplingFactor = 1.0; int withSinc = 0; double alpha = 8.6; makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc); #endif Loading Loading @@ -328,9 +326,9 @@ void wstack( #endif #ifdef ACCOMP omp_set_default_device(rank % omp_get_num_devices()); uint Nvis = num_points*freq_per_chan*polarizations; // #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) // #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #else #pragma omp parallel for private(visindex) #endif Loading Loading @@ -422,6 +420,8 @@ void wstack( } } #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan],grid[0:2*num_w_planes*grid_size_x*grid_size_y]) // End switch between CUDA and CPU versions #endif //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); Loading