Loading Build/Makefile.local +16 −0 Original line number Diff line number Diff line Loading @@ -25,3 +25,19 @@ CFLAGS += MPICHLIB = ########################################################## #AMD GPUs (DEFAULT = LUMI) CLANG = clang CLANG++ = clang++ OPTIMIZE_AMD = -O3 -Ofast -fopenmp -march=native -mavx -mavx2 -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa --offload-arch=gfx90a RCCL_INCL= -I/opt/rocm-5.2.3/rccl/include RCCL_LIB= -L/opt/rocm-5.2.3/rccl/lib HIP_INCL= -I/opt/rocm-5.2.3/hip/include HIP_LIB= -L/opt/rocm-5.2.3/hip/lib AMDLIB = $(HIP_INCL) $(HIP_LIB) $(RCCL_INCL) $(RCCL_LIB) -D__HIP_PLATFORM_AMD__ -lamdhip64 -lrccl ########################################################### Makefile +26 −11 Original line number Diff line number Diff line Loading @@ -77,9 +77,9 @@ OPT += -DACCOMP DEPS = w-stacking.h main.c allvars.h ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) && ifneq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else ifneq (CUDACC,$(findstring CUDACC, $(OPT))) else OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif Loading @@ -89,8 +89,11 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cpp OBJ_RCCL_REDUCE = gridding_rccl.o ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT))) Loading Loading @@ -130,7 +133,8 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) OBJ += $(OBJ_ACC_CUDA) endif ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) #NVIDIA GPUs ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifneq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(NVC) FLAGS=$(NVFLAGS) $(CFLAGS) Loading @@ -140,22 +144,33 @@ $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) OBJ += $(OBJ_ACC_OMP) endif #AMD GPUs ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(MPICC) FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) endif ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(OBJ_NCCL_REDUCE) $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) endif ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(OBJ_RCCL_REDUCE) $(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3) LINKER=$(MPIC++) FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_RCCL_REDUCE): $(DEPS_RCCL_REDUCE) $(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS) endif Loading gridding_rccl.cpp +138 −246 Original line number Diff line number Diff line #if defined( REDUCE_RCCL ) #include <stdio.h> #include "allvars.h" #include "allvars_nccl.h" #include "proto.h" #include <hip/hip_runtime.h> #include <rccl/rccl.h> int verbose_level = 0; /* * Implements the gridding of data via GPU * by using NCCL library * */ #if defined( RCCL_REDUCE ) /* #define NCCLCHECK(cmd) do { ncclResult_t r = cmd; if (r!= ncclSuccess) { printf("Failed, NCCL error %s:%d '%s'\n", __FILE__,__LINE__,ncclGetErrorString(r)); exit(EXIT_FAILURE); } } while(0) */ static uint64_t getHostHash(const char* string) { // Based on DJB2a, result = result * 33 ^ char Loading @@ -32,116 +44,54 @@ static void getHostName(char* hostname, int maxlen) { } void gridding(){ if(rank == 0)printf("GRIDDING DATA\n"); // Create histograms and linked lists clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); // Initialize linked list initialize_array(); //Sector and Gridding data gridding_data(); void gridding_data(){ #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif double shift = (double)(dx*yaxis); end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.process_time1 = (finish.tv_sec - begin.tv_sec); timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); timing_wt.kernel = 0.0; timing_wt.reduce = 0.0; timing_wt.reduce_mpi = 0.0; timing_wt.reduce_sh = 0.0; timing_wt.compose = 0.0; } // calculate the resolution in radians resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax)); void initialize_array(){ // calculate the resolution in arcsec double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI; if ( rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); histo_send = (long*) calloc(nsectors+1,sizeof(long)); int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int)); double uuh,vvh; for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) { boundary[iphi] = -1; vvh = data.vv[iphi]; //less or equal to 0.6 int binphi = (int)(vvh*nsectors); //has values expect 0 and nsectors-1. So we use updist and downdist condition // check if the point influence also neighboring slabs double updist = (double)((binphi+1)*yaxis)*dx - vvh; double downdist = vvh - (double)(binphi*yaxis)*dx; // find the largest value in histo_send[] // histo_send[binphi]++; if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1;}; if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1;}; } uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); sectorarray = (long**)malloc ((nsectors+1) * sizeof(long*)); for(int sec=0; sec<(nsectors+1); sec++) { sectorarray[sec] = (long*)malloc(histo_send[sec]*sizeof(long)); } uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; long *counter = (long*) calloc(nsectors+1,sizeof(long)); for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) { vvh = data.vv[iphi]; int binphi = (int)(vvh*nsectors); double updist = (double)((binphi+1)*yaxis)*dx - vvh; double downdist = vvh - (double)(binphi*yaxis)*dx; sectorarray[binphi][counter[binphi]] = iphi; counter[binphi]++; if(updist < w_supporth && updist >= 0.0) { sectorarray[binphi+1][counter[binphi+1]] = iphi; counter[binphi+1]++;}; if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) { sectorarray[binphi-1][counter[binphi-1]] = iphi; counter[binphi-1]++;}; } #ifdef PIPPO long iiii = 0; for (int j=0; j<nsectors; j++) { iiii = 0; for(long iphi = histo_send[j]-1; iphi>=0; iphi--) { printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],sectorarray[j][iphi]); iiii++; } } #endif #ifdef VERBOSE for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]); #endif } void gridding_data(){ // allocate sector arrays // note: we use the largest allocation among all sectors double shift = (double)(dx*yaxis); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); #ifndef USE_MPI file.pFile1 = fopen (out.outfile1,"w"); #endif if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); timing.kernel_time = 0.0; timing.kernel_time1 = 0.0; timing.reduce_time = 0.0; timing.reduce_time1 = 0.0; timing.compose_time = 0.0; timing.compose_time1 = 0.0; double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; // calculate the resolution in radians resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax)); // calculate the resolution in arcsec double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI; if ( rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); //Initialize nccl #ifdef NCCL_REDUCE double * grid_gpu, *gridss_gpu; int local_rank = 0; Loading Loading @@ -170,87 +120,80 @@ void gridding_data(){ hipStreamCreate(&stream_reduce); ncclCommInitRank(&comm, size, id, rank); #endif for (long isector = 0; isector < nsectors; isector++) for (uint isector = 0; isector < nsectors; isector++) { clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); // define local destination sector //isector = (isector_count+rank)%size; // this line must be wrong! [LT] // allocate sector arrays long Nsec = histo_send[isector]; double *uus = (double*) malloc(Nsec*sizeof(double)); double *vvs = (double*) malloc(Nsec*sizeof(double)); double *wws = (double*) malloc(Nsec*sizeof(double)); long Nweightss = Nsec*metaData.polarisations; long Nvissec = Nweightss*metaData.freq_per_chan; float *weightss = (float*) malloc(Nweightss*sizeof(float)); float *visreals = (float*) malloc(Nvissec*sizeof(float)); float *visimgs = (float*) malloc(Nvissec*sizeof(float)); double start = CPU_TIME_wt; // select data for this sector long icount = 0; long ip = 0; long inu = 0; uint icount = 0; uint ip = 0; uint inu = 0; for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) #warning "this loop should be threaded" #warning "the counter of this loop should not be int" for(int iphi = histo_send[isector]-1; iphi>=0; iphi--) { long ilocal = sectorarray[isector][iphi]; //double vvh = data.vv[ilocal]; //int binphi = (int)(vvh*nsectors); //if (binphi == isector || boundary[ilocal] == isector) { uint ilocal = sectorarray[isector][iphi]; uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; for (long ipol=0; ipol<metaData.polarisations; ipol++) for (uint ipol=0; ipol<metaData.polarisations; ipol++) { weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; ip++; } for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); inu++; } icount++; } clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; #ifndef USE_MPI double uumin = 1e20; double vvmin = 1e20; double uumax = -1e20; double vvmax = -1e20; for (long ipart=0; ipart<Nsec; ipart++) #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads) { uumin = MIN(uumin,uus[ipart]); uumax = MAX(uumax,uus[ipart]); vvmin = MIN(vvmin,vvs[ipart]); vvmax = MAX(vvmax,vvs[ipart]); double my_uumin = 1e20; double my_vvmin = 1e20; double my_uumax = -1e20; double my_vvmax = -1e20; if(ipart%10 == 0)fprintf (file.pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]); #pragma omp for for (uint ipart=0; ipart<Nsec; ipart++) { my_uumin = MIN(my_uumin, uus[ipart]); my_uumax = MAX(my_uumax, uus[ipart]); my_vvmin = MIN(my_vvmin, vvs[ipart]); my_vvmax = MAX(my_vvmax, vvs[ipart]); } printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); #endif uumin = MIN( uumin, my_uumin ); uumax = MAX( uumax, my_uumax ); vvmin = MIN( vvmin, my_vvmin ); vvmax = MAX( vvmax, my_vvmax ); } timing_wt.compose += CPU_TIME_wt - start; //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); // Make convolution on the grid #ifdef VERBOSE printf("Processing sector %ld\n",isector); #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Loading @@ -274,99 +217,47 @@ void gridding_data(){ //Allocate memory on devices non-blocking for the host /////////////////////////////////////////////////////// #ifdef NCCL_REDUCE timing_wt.kernel += CPU_TIME_wt - start; hipMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyHostToDevice, stream_reduce); #endif /* int z =0 ; * #pragma omp target map(to:test_i_gpu) map(from:z) * { * int x; // only accessible from accelerator * x = 2; * z = x + test_i_gpu; * }*/ clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec); timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; #ifdef VERBOSE printf("Processed sector %ld\n",isector); #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); start = CPU_TIME_wt; //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]); #ifndef USE_MPI long stride = isector*2*xaxis*yaxis*num_w_planes; for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++) gridtot[stride+iii] = gridss[iii]; #endif if( size > 1 ) { // Write grid in the corresponding remote slab #ifdef USE_MPI // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); #ifdef NCCL_REDUCE hipStreamSynchronize(stream_reduce); ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); hipStreamSynchronize(stream_reduce); //hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce); //hipStreamSynchronize(stream_reduce); #endif #ifdef REDUCE MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); #endif //REDUCE timing_wt.reduce += CPU_TIME_wt - start; //Let's use now the new implementation (ring in shmem and Ired inter-nodes) #endif //USE_MPI } clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec); timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; // Go to next sector for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0; // Deallocate all sector arrays free(uus); free(vvs); free(wws); free(weightss); free(visreals); free(visimgs); // End of loop over sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } //Copy data back from device to host (to be deleted in next steps) #ifdef NCCL_REDUCE free(memory); hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce); #endif #ifndef USE_MPI fclose(file.pFile1); #endif #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.process_time1 = (finish.tv_sec - begin.tv_sec); timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); #ifdef NCCL_REDUCE hipStreamSynchronize(stream_reduce); hipFree(gridss_gpu); hipFree(grid_gpu); Loading @@ -374,8 +265,9 @@ void gridding_data(){ hipStreamDestroy(stream_reduce); ncclCommDestroy(comm); #endif } return; } #endif Loading
Build/Makefile.local +16 −0 Original line number Diff line number Diff line Loading @@ -25,3 +25,19 @@ CFLAGS += MPICHLIB = ########################################################## #AMD GPUs (DEFAULT = LUMI) CLANG = clang CLANG++ = clang++ OPTIMIZE_AMD = -O3 -Ofast -fopenmp -march=native -mavx -mavx2 -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa --offload-arch=gfx90a RCCL_INCL= -I/opt/rocm-5.2.3/rccl/include RCCL_LIB= -L/opt/rocm-5.2.3/rccl/lib HIP_INCL= -I/opt/rocm-5.2.3/hip/include HIP_LIB= -L/opt/rocm-5.2.3/hip/lib AMDLIB = $(HIP_INCL) $(HIP_LIB) $(RCCL_INCL) $(RCCL_LIB) -D__HIP_PLATFORM_AMD__ -lamdhip64 -lrccl ###########################################################
Makefile +26 −11 Original line number Diff line number Diff line Loading @@ -77,9 +77,9 @@ OPT += -DACCOMP DEPS = w-stacking.h main.c allvars.h ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) && ifneq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else ifneq (CUDACC,$(findstring CUDACC, $(OPT))) else OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif Loading @@ -89,8 +89,11 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o DEPS_RCCL_REDUCE = gridding_rccl.cpp OBJ_RCCL_REDUCE = gridding_rccl.o ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT))) Loading Loading @@ -130,7 +133,8 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) OBJ += $(OBJ_ACC_CUDA) endif ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) #NVIDIA GPUs ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifneq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(NVC) FLAGS=$(NVFLAGS) $(CFLAGS) Loading @@ -140,22 +144,33 @@ $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) OBJ += $(OBJ_ACC_OMP) endif #AMD GPUs ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) && ifeq (__HIP_PLATFORM_AMD__,$(findstring __HIP_PLATFORM_AMD__,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-omp LINKER=$(MPICC) FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_ACC_OMP): $(DEPS_ACC_OMP) $(MPICC) $(FLAGS) $(OPT) -c $^ $(CFLAGS) endif ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(OBJ_NCCL_REDUCE) $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) endif ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-reduce LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_3) compile_accreduce: $(OBJ_RCCL_REDUCE) $(NVC++) $(NVFLAGS) $(OPT) -c $^ $(CFLAGS) $(NVLIB_3) LINKER=$(MPIC++) FLAGS=$(OPTIMIZE_AMD) $(CFLAGS) LIBS=$(AMDLIB) $(OBJ_RCCL_REDUCE): $(DEPS_RCCL_REDUCE) $(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS) endif Loading
gridding_rccl.cpp +138 −246 Original line number Diff line number Diff line #if defined( REDUCE_RCCL ) #include <stdio.h> #include "allvars.h" #include "allvars_nccl.h" #include "proto.h" #include <hip/hip_runtime.h> #include <rccl/rccl.h> int verbose_level = 0; /* * Implements the gridding of data via GPU * by using NCCL library * */ #if defined( RCCL_REDUCE ) /* #define NCCLCHECK(cmd) do { ncclResult_t r = cmd; if (r!= ncclSuccess) { printf("Failed, NCCL error %s:%d '%s'\n", __FILE__,__LINE__,ncclGetErrorString(r)); exit(EXIT_FAILURE); } } while(0) */ static uint64_t getHostHash(const char* string) { // Based on DJB2a, result = result * 33 ^ char Loading @@ -32,116 +44,54 @@ static void getHostName(char* hostname, int maxlen) { } void gridding(){ if(rank == 0)printf("GRIDDING DATA\n"); // Create histograms and linked lists clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); // Initialize linked list initialize_array(); //Sector and Gridding data gridding_data(); void gridding_data(){ #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif double shift = (double)(dx*yaxis); end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.process_time1 = (finish.tv_sec - begin.tv_sec); timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); timing_wt.kernel = 0.0; timing_wt.reduce = 0.0; timing_wt.reduce_mpi = 0.0; timing_wt.reduce_sh = 0.0; timing_wt.compose = 0.0; } // calculate the resolution in radians resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax)); void initialize_array(){ // calculate the resolution in arcsec double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI; if ( rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); histo_send = (long*) calloc(nsectors+1,sizeof(long)); int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int)); double uuh,vvh; for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) { boundary[iphi] = -1; vvh = data.vv[iphi]; //less or equal to 0.6 int binphi = (int)(vvh*nsectors); //has values expect 0 and nsectors-1. So we use updist and downdist condition // check if the point influence also neighboring slabs double updist = (double)((binphi+1)*yaxis)*dx - vvh; double downdist = vvh - (double)(binphi*yaxis)*dx; // find the largest value in histo_send[] // histo_send[binphi]++; if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1;}; if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1;}; } uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); sectorarray = (long**)malloc ((nsectors+1) * sizeof(long*)); for(int sec=0; sec<(nsectors+1); sec++) { sectorarray[sec] = (long*)malloc(histo_send[sec]*sizeof(long)); } uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; long *counter = (long*) calloc(nsectors+1,sizeof(long)); for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) { vvh = data.vv[iphi]; int binphi = (int)(vvh*nsectors); double updist = (double)((binphi+1)*yaxis)*dx - vvh; double downdist = vvh - (double)(binphi*yaxis)*dx; sectorarray[binphi][counter[binphi]] = iphi; counter[binphi]++; if(updist < w_supporth && updist >= 0.0) { sectorarray[binphi+1][counter[binphi+1]] = iphi; counter[binphi+1]++;}; if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) { sectorarray[binphi-1][counter[binphi-1]] = iphi; counter[binphi-1]++;}; } #ifdef PIPPO long iiii = 0; for (int j=0; j<nsectors; j++) { iiii = 0; for(long iphi = histo_send[j]-1; iphi>=0; iphi--) { printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],sectorarray[j][iphi]); iiii++; } } #endif #ifdef VERBOSE for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]); #endif } void gridding_data(){ // allocate sector arrays // note: we use the largest allocation among all sectors double shift = (double)(dx*yaxis); double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); #ifndef USE_MPI file.pFile1 = fopen (out.outfile1,"w"); #endif if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); timing.kernel_time = 0.0; timing.kernel_time1 = 0.0; timing.reduce_time = 0.0; timing.reduce_time1 = 0.0; timing.compose_time = 0.0; timing.compose_time1 = 0.0; double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; // calculate the resolution in radians resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax)); // calculate the resolution in arcsec double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI; if ( rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); //Initialize nccl #ifdef NCCL_REDUCE double * grid_gpu, *gridss_gpu; int local_rank = 0; Loading Loading @@ -170,87 +120,80 @@ void gridding_data(){ hipStreamCreate(&stream_reduce); ncclCommInitRank(&comm, size, id, rank); #endif for (long isector = 0; isector < nsectors; isector++) for (uint isector = 0; isector < nsectors; isector++) { clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); // define local destination sector //isector = (isector_count+rank)%size; // this line must be wrong! [LT] // allocate sector arrays long Nsec = histo_send[isector]; double *uus = (double*) malloc(Nsec*sizeof(double)); double *vvs = (double*) malloc(Nsec*sizeof(double)); double *wws = (double*) malloc(Nsec*sizeof(double)); long Nweightss = Nsec*metaData.polarisations; long Nvissec = Nweightss*metaData.freq_per_chan; float *weightss = (float*) malloc(Nweightss*sizeof(float)); float *visreals = (float*) malloc(Nvissec*sizeof(float)); float *visimgs = (float*) malloc(Nvissec*sizeof(float)); double start = CPU_TIME_wt; // select data for this sector long icount = 0; long ip = 0; long inu = 0; uint icount = 0; uint ip = 0; uint inu = 0; for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) #warning "this loop should be threaded" #warning "the counter of this loop should not be int" for(int iphi = histo_send[isector]-1; iphi>=0; iphi--) { long ilocal = sectorarray[isector][iphi]; //double vvh = data.vv[ilocal]; //int binphi = (int)(vvh*nsectors); //if (binphi == isector || boundary[ilocal] == isector) { uint ilocal = sectorarray[isector][iphi]; uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; for (long ipol=0; ipol<metaData.polarisations; ipol++) for (uint ipol=0; ipol<metaData.polarisations; ipol++) { weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; ip++; } for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); inu++; } icount++; } clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; #ifndef USE_MPI double uumin = 1e20; double vvmin = 1e20; double uumax = -1e20; double vvmax = -1e20; for (long ipart=0; ipart<Nsec; ipart++) #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads) { uumin = MIN(uumin,uus[ipart]); uumax = MAX(uumax,uus[ipart]); vvmin = MIN(vvmin,vvs[ipart]); vvmax = MAX(vvmax,vvs[ipart]); double my_uumin = 1e20; double my_vvmin = 1e20; double my_uumax = -1e20; double my_vvmax = -1e20; if(ipart%10 == 0)fprintf (file.pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]); #pragma omp for for (uint ipart=0; ipart<Nsec; ipart++) { my_uumin = MIN(my_uumin, uus[ipart]); my_uumax = MAX(my_uumax, uus[ipart]); my_vvmin = MIN(my_vvmin, vvs[ipart]); my_vvmax = MAX(my_vvmax, vvs[ipart]); } printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); #endif uumin = MIN( uumin, my_uumin ); uumax = MAX( uumax, my_uumax ); vvmin = MIN( vvmin, my_vvmin ); vvmax = MAX( vvmax, my_vvmax ); } timing_wt.compose += CPU_TIME_wt - start; //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); // Make convolution on the grid #ifdef VERBOSE printf("Processing sector %ld\n",isector); #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); start = CPU_TIME_wt; //We have to call different GPUs per MPI task!!! [GL] wstack(param.num_w_planes, Loading @@ -274,99 +217,47 @@ void gridding_data(){ //Allocate memory on devices non-blocking for the host /////////////////////////////////////////////////////// #ifdef NCCL_REDUCE timing_wt.kernel += CPU_TIME_wt - start; hipMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyHostToDevice, stream_reduce); #endif /* int z =0 ; * #pragma omp target map(to:test_i_gpu) map(from:z) * { * int x; // only accessible from accelerator * x = 2; * z = x + test_i_gpu; * }*/ clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec); timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; #ifdef VERBOSE printf("Processed sector %ld\n",isector); #endif clock_gettime(CLOCK_MONOTONIC, &begink); startk = clock(); start = CPU_TIME_wt; //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]); #ifndef USE_MPI long stride = isector*2*xaxis*yaxis*num_w_planes; for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++) gridtot[stride+iii] = gridss[iii]; #endif if( size > 1 ) { // Write grid in the corresponding remote slab #ifdef USE_MPI // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); #ifdef NCCL_REDUCE hipStreamSynchronize(stream_reduce); ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); hipStreamSynchronize(stream_reduce); //hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce); //hipStreamSynchronize(stream_reduce); #endif #ifdef REDUCE MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); #endif //REDUCE timing_wt.reduce += CPU_TIME_wt - start; //Let's use now the new implementation (ring in shmem and Ired inter-nodes) #endif //USE_MPI } clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec); timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; // Go to next sector for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0; // Deallocate all sector arrays free(uus); free(vvs); free(wws); free(weightss); free(visreals); free(visimgs); // End of loop over sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); } //Copy data back from device to host (to be deleted in next steps) #ifdef NCCL_REDUCE free(memory); hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce); #endif #ifndef USE_MPI fclose(file.pFile1); #endif #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.process_time1 = (finish.tv_sec - begin.tv_sec); timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); #ifdef NCCL_REDUCE hipStreamSynchronize(stream_reduce); hipFree(gridss_gpu); hipFree(grid_gpu); Loading @@ -374,8 +265,9 @@ void gridding_data(){ hipStreamDestroy(stream_reduce); ncclCommDestroy(comm); #endif } return; } #endif