Loading gridding_cpu.c +21 −30 Original line number Diff line number Diff line Loading @@ -84,17 +84,15 @@ void gridding_data() // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); for (uint isector = 0; isector < nsectors; isector++) { double start = CPU_TIME_wt; uint Nsec = histo_send[isector]; uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays // note: we use the largest allocation among all sectors // unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); Loading @@ -109,12 +107,6 @@ void gridding_data() float_t *visimgs = (float_t*)visreals + Nvissec; for (uint isector = 0; isector < nsectors; isector++) { double start = CPU_TIME_wt; memset( memory, 0, mem_size ); // select data for this sector uint icount = 0; Loading Loading @@ -183,14 +175,14 @@ void gridding_data() printf("Processing sector %ld\n",isector); #endif start = CPU_TIME_wt; double *stacking_target_array; if ( size > 1 ) stacking_target_array = gridss; else stacking_target_array = grid; start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Nsec, Loading Loading @@ -259,9 +251,8 @@ void gridding_data() memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } } free(memory); } if ( size > 1 ) { Loading gridding_nccl.cpp +24 −31 Original line number Diff line number Diff line Loading @@ -68,30 +68,6 @@ void gridding_data(){ // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays // note: we use the largest allocation among all sectors // unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; //Initialize nccl Loading Loading @@ -129,7 +105,23 @@ void gridding_data(){ double start = CPU_TIME_wt; memset( memory, 0, mem_size ); uint Nsec = histo_send[isector]; uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double_t*) memory; double_t *vvs = (double_t*) uus+Nsec; double_t *wws = (double_t*) vvs+Nsec; float_t *weightss = (float_t*)((double_t*)wws+Nsec); float_t *visreals = (float_t*)weightss + Nweightss; float_t *visimgs = (float_t*)visreals + Nvissec; // select data for this sector uint icount = 0; Loading Loading @@ -246,10 +238,10 @@ void gridding_data(){ // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); start = CPU_TIME_wt; cudaStreamSynchronize(stream_reduce); start = CPU_TIME_wt; ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); cudaStreamSynchronize(stream_reduce); Loading @@ -260,11 +252,12 @@ void gridding_data(){ memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } free(memory); } //Copy data back from device to host (to be deleted in next steps) free(memory); cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); MPI_Barrier(MPI_COMM_WORLD); Loading gridding_rccl.cpp +22 −31 Original line number Diff line number Diff line Loading @@ -64,32 +64,6 @@ void gridding_data(){ if ( rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays // note: we use the largest allocation among all sectors // unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_ty *uus = (double*) memory; double_ty *vvs = (double*) uus+Nsec; double_ty *wws = (double*) vvs+Nsec; float_ty *weightss = (float_ty*)(wws+Nsec); float_ty *visreals = weightss + Nweightss; float_ty *visimgs = visreals + Nvissec; //Initialize nccl Loading Loading @@ -127,7 +101,23 @@ void gridding_data(){ double start = CPU_TIME_wt; memset( memory, 0, mem_size ); uint Nsec = histo_send[isector]; uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double_t*) memory; double_t *vvs = (double_t*) uus+Nsec; double_t *wws = (double_t*) vvs+Nsec; float_t *weightss = (float_t*)((double_t*)wws+Nsec); float_t *visreals = (float_t*)weightss + Nweightss; float_t *visimgs = (float_t*)visreals + Nvissec; // select data for this sector uint icount = 0; Loading Loading @@ -244,10 +234,11 @@ void gridding_data(){ // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); start = CPU_TIME_wt; hipStreamSynchronize(stream_reduce); start = CPU_TIME_wt; ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); hipStreamSynchronize(stream_reduce); Loading @@ -258,11 +249,11 @@ void gridding_data(){ memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } free(memory); } //Copy data back from device to host (to be deleted in next steps) free(memory); hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce); MPI_Barrier(MPI_COMM_WORLD); Loading Loading
gridding_cpu.c +21 −30 Original line number Diff line number Diff line Loading @@ -84,17 +84,15 @@ void gridding_data() // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); for (uint isector = 0; isector < nsectors; isector++) { double start = CPU_TIME_wt; uint Nsec = histo_send[isector]; uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays // note: we use the largest allocation among all sectors // unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); Loading @@ -109,12 +107,6 @@ void gridding_data() float_t *visimgs = (float_t*)visreals + Nvissec; for (uint isector = 0; isector < nsectors; isector++) { double start = CPU_TIME_wt; memset( memory, 0, mem_size ); // select data for this sector uint icount = 0; Loading Loading @@ -183,14 +175,14 @@ void gridding_data() printf("Processing sector %ld\n",isector); #endif start = CPU_TIME_wt; double *stacking_target_array; if ( size > 1 ) stacking_target_array = gridss; else stacking_target_array = grid; start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Nsec, Loading Loading @@ -259,9 +251,8 @@ void gridding_data() memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } } free(memory); } if ( size > 1 ) { Loading
gridding_nccl.cpp +24 −31 Original line number Diff line number Diff line Loading @@ -68,30 +68,6 @@ void gridding_data(){ // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays // note: we use the largest allocation among all sectors // unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; //Initialize nccl Loading Loading @@ -129,7 +105,23 @@ void gridding_data(){ double start = CPU_TIME_wt; memset( memory, 0, mem_size ); uint Nsec = histo_send[isector]; uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double_t*) memory; double_t *vvs = (double_t*) uus+Nsec; double_t *wws = (double_t*) vvs+Nsec; float_t *weightss = (float_t*)((double_t*)wws+Nsec); float_t *visreals = (float_t*)weightss + Nweightss; float_t *visimgs = (float_t*)visreals + Nvissec; // select data for this sector uint icount = 0; Loading Loading @@ -246,10 +238,10 @@ void gridding_data(){ // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); start = CPU_TIME_wt; cudaStreamSynchronize(stream_reduce); start = CPU_TIME_wt; ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); cudaStreamSynchronize(stream_reduce); Loading @@ -260,11 +252,12 @@ void gridding_data(){ memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } free(memory); } //Copy data back from device to host (to be deleted in next steps) free(memory); cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); MPI_Barrier(MPI_COMM_WORLD); Loading
gridding_rccl.cpp +22 −31 Original line number Diff line number Diff line Loading @@ -64,32 +64,6 @@ void gridding_data(){ if ( rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays // note: we use the largest allocation among all sectors // unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_ty *uus = (double*) memory; double_ty *vvs = (double*) uus+Nsec; double_ty *wws = (double*) vvs+Nsec; float_ty *weightss = (float_ty*)(wws+Nsec); float_ty *visreals = weightss + Nweightss; float_ty *visimgs = visreals + Nvissec; //Initialize nccl Loading Loading @@ -127,7 +101,23 @@ void gridding_data(){ double start = CPU_TIME_wt; memset( memory, 0, mem_size ); uint Nsec = histo_send[isector]; uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double_t*) memory; double_t *vvs = (double_t*) uus+Nsec; double_t *wws = (double_t*) vvs+Nsec; float_t *weightss = (float_t*)((double_t*)wws+Nsec); float_t *visreals = (float_t*)weightss + Nweightss; float_t *visimgs = (float_t*)visreals + Nvissec; // select data for this sector uint icount = 0; Loading Loading @@ -244,10 +234,11 @@ void gridding_data(){ // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); start = CPU_TIME_wt; hipStreamSynchronize(stream_reduce); start = CPU_TIME_wt; ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); hipStreamSynchronize(stream_reduce); Loading @@ -258,11 +249,11 @@ void gridding_data(){ memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } free(memory); } //Copy data back from device to host (to be deleted in next steps) free(memory); hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce); MPI_Barrier(MPI_COMM_WORLD); Loading