Loading Makefile +2 −2 Original line number Diff line number Diff line Loading @@ -58,7 +58,7 @@ OPT += -DPHASE_ON #OPT += -DNORMALIZE_UVW # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS Loading @@ -76,7 +76,7 @@ OPT += -DGAUSS_HI_PRECISION #OPT += -DCUDACC # use GPU acceleration via OMP #OPT += -DACCOMP OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE Loading gridding_nccl.cpp +15 −6 Original line number Diff line number Diff line Loading @@ -77,7 +77,8 @@ void gridding_data(){ // allocate sector arrays // note: we use the largest allocation among all sectors // unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); Loading Loading @@ -128,6 +129,8 @@ void gridding_data(){ double start = CPU_TIME_wt; memset( memory, 0, mem_size ); // select data for this sector uint icount = 0; uint ip = 0; Loading Loading @@ -197,6 +200,12 @@ void gridding_data(){ start = CPU_TIME_wt; double *stacking_target_array; if ( size > 1 ) stacking_target_array = gridss; else stacking_target_array = grid; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Nsec, Loading @@ -213,7 +222,7 @@ void gridding_data(){ param.w_support, xaxis, yaxis, gridss, stacking_target_array, param.num_threads, rank); //Allocate memory on devices non-blocking for the host Loading Loading @@ -246,10 +255,10 @@ void gridding_data(){ timing_wt.reduce += CPU_TIME_wt - start; } // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } } Loading gridding_rccl.cpp +17 −8 Original line number Diff line number Diff line Loading @@ -75,9 +75,10 @@ void gridding_data(){ // allocate sector arrays // note: we use the largest allocation among all sectors double_ty *memory = (double*) malloc ( (Nsec*3)*sizeof(double_ty) + (Nvissec*2+Nweightss)*sizeof(float_ty) ); // unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); Loading Loading @@ -126,6 +127,8 @@ void gridding_data(){ double start = CPU_TIME_wt; memset( memory, 0, mem_size ); // select data for this sector uint icount = 0; uint ip = 0; Loading Loading @@ -195,6 +198,12 @@ void gridding_data(){ start = CPU_TIME_wt; double *stacking_target_array; if ( size > 1 ) stacking_target_array = gridss; else stacking_target_array = grid; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Nsec, Loading @@ -211,7 +220,7 @@ void gridding_data(){ param.w_support, xaxis, yaxis, gridss, stacking_target_array, param.num_threads, rank); //Allocate memory on devices non-blocking for the host Loading Loading @@ -244,10 +253,10 @@ void gridding_data(){ timing_wt.reduce += CPU_TIME_wt - start; } // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } } Loading Loading
Makefile +2 −2 Original line number Diff line number Diff line Loading @@ -58,7 +58,7 @@ OPT += -DPHASE_ON #OPT += -DNORMALIZE_UVW # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS_HI_PRECISION #OPT += -DGAUSS Loading @@ -76,7 +76,7 @@ OPT += -DGAUSS_HI_PRECISION #OPT += -DCUDACC # use GPU acceleration via OMP #OPT += -DACCOMP OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE Loading
gridding_nccl.cpp +15 −6 Original line number Diff line number Diff line Loading @@ -77,7 +77,8 @@ void gridding_data(){ // allocate sector arrays // note: we use the largest allocation among all sectors // unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); Loading Loading @@ -128,6 +129,8 @@ void gridding_data(){ double start = CPU_TIME_wt; memset( memory, 0, mem_size ); // select data for this sector uint icount = 0; uint ip = 0; Loading Loading @@ -197,6 +200,12 @@ void gridding_data(){ start = CPU_TIME_wt; double *stacking_target_array; if ( size > 1 ) stacking_target_array = gridss; else stacking_target_array = grid; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Nsec, Loading @@ -213,7 +222,7 @@ void gridding_data(){ param.w_support, xaxis, yaxis, gridss, stacking_target_array, param.num_threads, rank); //Allocate memory on devices non-blocking for the host Loading Loading @@ -246,10 +255,10 @@ void gridding_data(){ timing_wt.reduce += CPU_TIME_wt - start; } // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } } Loading
gridding_rccl.cpp +17 −8 Original line number Diff line number Diff line Loading @@ -75,9 +75,10 @@ void gridding_data(){ // allocate sector arrays // note: we use the largest allocation among all sectors double_ty *memory = (double*) malloc ( (Nsec*3)*sizeof(double_ty) + (Nvissec*2+Nweightss)*sizeof(float_ty) ); // unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); Loading Loading @@ -126,6 +127,8 @@ void gridding_data(){ double start = CPU_TIME_wt; memset( memory, 0, mem_size ); // select data for this sector uint icount = 0; uint ip = 0; Loading Loading @@ -195,6 +198,12 @@ void gridding_data(){ start = CPU_TIME_wt; double *stacking_target_array; if ( size > 1 ) stacking_target_array = gridss; else stacking_target_array = grid; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Nsec, Loading @@ -211,7 +220,7 @@ void gridding_data(){ param.w_support, xaxis, yaxis, gridss, stacking_target_array, param.num_threads, rank); //Allocate memory on devices non-blocking for the host Loading Loading @@ -244,10 +253,10 @@ void gridding_data(){ timing_wt.reduce += CPU_TIME_wt - start; } // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } } Loading