Loading data/paramfile.txt +4 −4 Original line number Diff line number Diff line Loading @@ -2,11 +2,11 @@ ndatasets 1 Datapath1 /data/LOFAR_MERGE/hpc_imaging/data/newgauss2noconj_t201806301100_SBL180.binMS/ #Datapath2 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/ #Datapath3 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/ num_threads 1 num_threads 4 w_support 7 reduce_method 0 grid_size_x 1024 grid_size_y 1024 reduce_method 1 grid_size_x 4096 grid_size_y 4096 num_w_planes 8 ufile ucoord.bin vfile vcoord.bin Loading gridding.c +0 −1 Original line number Diff line number Diff line Loading @@ -82,7 +82,6 @@ void initialize_array() histo_send = (uint*) calloc(nsectors+1, sizeof(uint)); for (uint iphi = 0; iphi < metaData.Nmeasures; iphi++) { double vvh = data.vv[iphi]; //less or equal to 0.6 Loading gridding_cpu.c +2 −3 Original line number Diff line number Diff line #include "allvars.h" #include "proto.h" Loading @@ -10,7 +9,7 @@ * */ #if !defined( NCCL_REDUCE ) #if !defined( NCCL_REDUCE ) && !defined( RCCL_REDUCE ) int reduce_ring (int); Loading gridding_nccl.cpp +115 −115 Original line number Diff line number Diff line Loading @@ -12,9 +12,17 @@ */ #if defined( NCCL_REDUCE_NVIDIA ) #if defined( NCCL_REDUCE ) #define NCCLCHECK(cmd) do { ncclResult_t r = cmd; if (r!= ncclSuccess) { printf("Failed, NCCL error %s:%d '%s'\n", __FILE__,__LINE__,ncclGetErrorString(r)); exit(EXIT_FAILURE); } } while(0) static uint64_t getHostHash(const char* string) { Loading Loading @@ -44,12 +52,11 @@ void gridding_data(){ double shift = (double)(dx*yaxis); timing.kernel_time = 0.0; timing.kernel_time1 = 0.0; timing.reduce_time = 0.0; timing.reduce_time1 = 0.0; timing.compose_time = 0.0; timing.compose_time1 = 0.0; timing_wt.kernel = 0.0; timing_wt.reduce = 0.0; timing_wt.reduce_mpi = 0.0; timing_wt.reduce_sh = 0.0; timing_wt.compose = 0.0; // calculate the resolution in radians resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax)); Loading @@ -60,7 +67,6 @@ void gridding_data(){ printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); //Initialize nccl #ifdef NCCL_REDUCE double * grid_gpu, *gridss_gpu; int local_rank = 0; Loading Loading @@ -89,46 +95,59 @@ void gridding_data(){ cudaStreamCreate(&stream_reduce); ncclCommInitRank(&comm, size, id, rank); #endif for (long isector = 0; isector < nsectors; isector++) { clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); // define local destination sector //isector = (isector_count+rank)%size; // this line must be wrong! [LT] // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays long Nsec = histo_send[isector]; double *uus = (double*) malloc(Nsec*sizeof(double)); double *vvs = (double*) malloc(Nsec*sizeof(double)); double *wws = (double*) malloc(Nsec*sizeof(double)); long Nweightss = Nsec*metaData.polarisations; long Nvissec = Nweightss*metaData.freq_per_chan; float *weightss = (float*) malloc(Nweightss*sizeof(float)); float *visreals = (float*) malloc(Nvissec*sizeof(float)); float *visimgs = (float*) malloc(Nvissec*sizeof(float)); // note: we use the largest allocation among all sectors double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; for (uint isector = 0; isector < nsectors; isector++) { double start = CPU_TIME_wt; // select data for this sector long icount = 0; long ip = 0; long inu = 0; uint icount = 0; uint ip = 0; uint inu = 0; for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) #warning "this loop should be threaded" #warning "the counter of this loop should not be int" for(int iphi = histo_send[isector]-1; iphi>=0; iphi--) { long ilocal = sectorarray[isector][iphi]; uint ilocal = sectorarray[isector][iphi]; //double vvh = data.vv[ilocal]; //int binphi = (int)(vvh*nsectors); //if (binphi == isector || boundary[ilocal] == isector) { uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; for (long ipol=0; ipol<metaData.polarisations; ipol++) for (uint ipol=0; ipol<metaData.polarisations; ipol++) { weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; ip++; } for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; Loading @@ -138,21 +157,44 @@ void gridding_data(){ icount++; } clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; double uumin = 1e20; double vvmin = 1e20; double uumax = -1e20; double vvmax = -1e20; #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads) { double my_uumin = 1e20; double my_vvmin = 1e20; double my_uumax = -1e20; double my_vvmax = -1e20; #pragma omp for for (uint ipart=0; ipart<Nsec; ipart++) { my_uumin = MIN(my_uumin, uus[ipart]); my_uumax = MAX(my_uumax, uus[ipart]); my_vvmin = MIN(my_vvmin, vvs[ipart]); my_vvmax = MAX(my_vvmax, vvs[ipart]); } uumin = MIN( uumin, my_uumin ); uumax = MAX( uumax, my_uumax ); vvmin = MIN( vvmin, my_vvmin ); vvmax = MAX( vvmax, my_vvmax ); } timing_wt.compose += CPU_TIME_wt - start; printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); // Make convolution on the grid #ifdef VERBOSE printf("Processing sector %ld\n",isector); #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Loading @@ -176,75 +218,40 @@ void gridding_data(){ //Allocate memory on devices non-blocking for the host /////////////////////////////////////////////////////// #ifdef NCCL_REDUCE cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce); #endif /* int z =0 ; * #pragma omp target map(to:test_i_gpu) map(from:z) * { * int x; // only accessible from accelerator * x = 2; * z = x + test_i_gpu; * }*/ clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec); timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; timing_wt.kernel += CPU_TIME_wt - start; #ifdef VERBOSE printf("Processed sector %ld\n",isector); #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]); start = CPU_TIME_wt; #ifndef USE_MPI long stride = isector*2*xaxis*yaxis*num_w_planes; for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++) gridtot[stride+iii] = gridss[iii]; #endif if( size > 1 ) { // Write grid in the corresponding remote slab #ifdef USE_MPI // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); #ifdef NCCL_REDUCE cudaStreamSynchronize(stream_reduce); ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); NCCLCHECK(ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce)); cudaStreamSynchronize(stream_reduce); //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); //cudaStreamSynchronize(stream_reduce); #endif #ifdef REDUCE MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); #endif //REDUCE timing_wt.reduce += CPU_TIME_wt - start; //Let's use now the new implementation (ring in shmem and Ired inter-nodes) #endif //USE_MPI } clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec); timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; // Go to next sector for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0; // Deallocate all sector arrays free(uus); free(vvs); free(wws); free(weightss); free(visreals); free(visimgs); // End of loop over sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); free(memory); } //Copy data back from device to host (to be deleted in next steps) Loading @@ -254,15 +261,6 @@ void gridding_data(){ MPI_Barrier(MPI_COMM_WORLD); end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.process_time1 = (finish.tv_sec - begin.tv_sec); timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); cudaFree(gridss_gpu); cudaFree(grid_gpu); Loading @@ -270,6 +268,8 @@ void gridding_data(){ ncclCommDestroy(comm); return; } #endif proto.h +1 −1 Original line number Diff line number Diff line Loading @@ -11,7 +11,7 @@ void readMetaData(char fileLocal[1000]); void metaData_calculation(); void allocate_memory(); void readData(); void shutdown( int, char *, char *, int); void shutdown_wstacking( int, char *, char *, int); #ifdef __cplusplus Loading allvars.c +2 −2 File changed.Contains only whitespace changes. Show changes Loading
data/paramfile.txt +4 −4 Original line number Diff line number Diff line Loading @@ -2,11 +2,11 @@ ndatasets 1 Datapath1 /data/LOFAR_MERGE/hpc_imaging/data/newgauss2noconj_t201806301100_SBL180.binMS/ #Datapath2 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/ #Datapath3 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/ num_threads 1 num_threads 4 w_support 7 reduce_method 0 grid_size_x 1024 grid_size_y 1024 reduce_method 1 grid_size_x 4096 grid_size_y 4096 num_w_planes 8 ufile ucoord.bin vfile vcoord.bin Loading
gridding.c +0 −1 Original line number Diff line number Diff line Loading @@ -82,7 +82,6 @@ void initialize_array() histo_send = (uint*) calloc(nsectors+1, sizeof(uint)); for (uint iphi = 0; iphi < metaData.Nmeasures; iphi++) { double vvh = data.vv[iphi]; //less or equal to 0.6 Loading
gridding_cpu.c +2 −3 Original line number Diff line number Diff line #include "allvars.h" #include "proto.h" Loading @@ -10,7 +9,7 @@ * */ #if !defined( NCCL_REDUCE ) #if !defined( NCCL_REDUCE ) && !defined( RCCL_REDUCE ) int reduce_ring (int); Loading
gridding_nccl.cpp +115 −115 Original line number Diff line number Diff line Loading @@ -12,9 +12,17 @@ */ #if defined( NCCL_REDUCE_NVIDIA ) #if defined( NCCL_REDUCE ) #define NCCLCHECK(cmd) do { ncclResult_t r = cmd; if (r!= ncclSuccess) { printf("Failed, NCCL error %s:%d '%s'\n", __FILE__,__LINE__,ncclGetErrorString(r)); exit(EXIT_FAILURE); } } while(0) static uint64_t getHostHash(const char* string) { Loading Loading @@ -44,12 +52,11 @@ void gridding_data(){ double shift = (double)(dx*yaxis); timing.kernel_time = 0.0; timing.kernel_time1 = 0.0; timing.reduce_time = 0.0; timing.reduce_time1 = 0.0; timing.compose_time = 0.0; timing.compose_time1 = 0.0; timing_wt.kernel = 0.0; timing_wt.reduce = 0.0; timing_wt.reduce_mpi = 0.0; timing_wt.reduce_sh = 0.0; timing_wt.compose = 0.0; // calculate the resolution in radians resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax)); Loading @@ -60,7 +67,6 @@ void gridding_data(){ printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); //Initialize nccl #ifdef NCCL_REDUCE double * grid_gpu, *gridss_gpu; int local_rank = 0; Loading Loading @@ -89,46 +95,59 @@ void gridding_data(){ cudaStreamCreate(&stream_reduce); ncclCommInitRank(&comm, size, id, rank); #endif for (long isector = 0; isector < nsectors; isector++) { clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); // define local destination sector //isector = (isector_count+rank)%size; // this line must be wrong! [LT] // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays long Nsec = histo_send[isector]; double *uus = (double*) malloc(Nsec*sizeof(double)); double *vvs = (double*) malloc(Nsec*sizeof(double)); double *wws = (double*) malloc(Nsec*sizeof(double)); long Nweightss = Nsec*metaData.polarisations; long Nvissec = Nweightss*metaData.freq_per_chan; float *weightss = (float*) malloc(Nweightss*sizeof(float)); float *visreals = (float*) malloc(Nvissec*sizeof(float)); float *visimgs = (float*) malloc(Nvissec*sizeof(float)); // note: we use the largest allocation among all sectors double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; for (uint isector = 0; isector < nsectors; isector++) { double start = CPU_TIME_wt; // select data for this sector long icount = 0; long ip = 0; long inu = 0; uint icount = 0; uint ip = 0; uint inu = 0; for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) #warning "this loop should be threaded" #warning "the counter of this loop should not be int" for(int iphi = histo_send[isector]-1; iphi>=0; iphi--) { long ilocal = sectorarray[isector][iphi]; uint ilocal = sectorarray[isector][iphi]; //double vvh = data.vv[ilocal]; //int binphi = (int)(vvh*nsectors); //if (binphi == isector || boundary[ilocal] == isector) { uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; for (long ipol=0; ipol<metaData.polarisations; ipol++) for (uint ipol=0; ipol<metaData.polarisations; ipol++) { weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; ip++; } for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; Loading @@ -138,21 +157,44 @@ void gridding_data(){ icount++; } clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; double uumin = 1e20; double vvmin = 1e20; double uumax = -1e20; double vvmax = -1e20; #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads) { double my_uumin = 1e20; double my_vvmin = 1e20; double my_uumax = -1e20; double my_vvmax = -1e20; #pragma omp for for (uint ipart=0; ipart<Nsec; ipart++) { my_uumin = MIN(my_uumin, uus[ipart]); my_uumax = MAX(my_uumax, uus[ipart]); my_vvmin = MIN(my_vvmin, vvs[ipart]); my_vvmax = MAX(my_vvmax, vvs[ipart]); } uumin = MIN( uumin, my_uumin ); uumax = MAX( uumax, my_uumax ); vvmin = MIN( vvmin, my_vvmin ); vvmax = MAX( vvmax, my_vvmax ); } timing_wt.compose += CPU_TIME_wt - start; printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); // Make convolution on the grid #ifdef VERBOSE printf("Processing sector %ld\n",isector); #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Loading @@ -176,75 +218,40 @@ void gridding_data(){ //Allocate memory on devices non-blocking for the host /////////////////////////////////////////////////////// #ifdef NCCL_REDUCE cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce); #endif /* int z =0 ; * #pragma omp target map(to:test_i_gpu) map(from:z) * { * int x; // only accessible from accelerator * x = 2; * z = x + test_i_gpu; * }*/ clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec); timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; timing_wt.kernel += CPU_TIME_wt - start; #ifdef VERBOSE printf("Processed sector %ld\n",isector); #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]); start = CPU_TIME_wt; #ifndef USE_MPI long stride = isector*2*xaxis*yaxis*num_w_planes; for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++) gridtot[stride+iii] = gridss[iii]; #endif if( size > 1 ) { // Write grid in the corresponding remote slab #ifdef USE_MPI // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); #ifdef NCCL_REDUCE cudaStreamSynchronize(stream_reduce); ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); NCCLCHECK(ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce)); cudaStreamSynchronize(stream_reduce); //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); //cudaStreamSynchronize(stream_reduce); #endif #ifdef REDUCE MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); #endif //REDUCE timing_wt.reduce += CPU_TIME_wt - start; //Let's use now the new implementation (ring in shmem and Ired inter-nodes) #endif //USE_MPI } clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec); timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; // Go to next sector for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0; // Deallocate all sector arrays free(uus); free(vvs); free(wws); free(weightss); free(visreals); free(visimgs); // End of loop over sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); free(memory); } //Copy data back from device to host (to be deleted in next steps) Loading @@ -254,15 +261,6 @@ void gridding_data(){ MPI_Barrier(MPI_COMM_WORLD); end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.process_time1 = (finish.tv_sec - begin.tv_sec); timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); cudaFree(gridss_gpu); cudaFree(grid_gpu); Loading @@ -270,6 +268,8 @@ void gridding_data(){ ncclCommDestroy(comm); return; } #endif
proto.h +1 −1 Original line number Diff line number Diff line Loading @@ -11,7 +11,7 @@ void readMetaData(char fileLocal[1000]); void metaData_calculation(); void allocate_memory(); void readData(); void shutdown( int, char *, char *, int); void shutdown_wstacking( int, char *, char *, int); #ifdef __cplusplus Loading