Loading Build/Makefile.local +2 −2 Original line number Diff line number Diff line Loading @@ -4,8 +4,8 @@ CXX = g++ MPICC = mpicc MPIC++ = mpiCC OPTIMIZE = -ggdb3 -O4 -fopenmp -march=native -mavx -mavx2 OMP_GPU = -ggdb3 -mp=multicore,gpu -gpu=cuda12.0 -gpu=cc86 OPTIMIZE = -O4 -fopenmp -march=native -mavx -mavx2 OMP_GPU = -mp=multicore,gpu -gpu=cuda12.0 -gpu=cc86 CUDA_INC = -I/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/include CUDA_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/lib64 Loading Makefile +3 −9 Original line number Diff line number Diff line Loading @@ -55,10 +55,10 @@ OPT += -DPHASE_ON #OPT += -DNVIDIA #use cuda for GPUs OPT += -DCUDACC #OPT += -DCUDACC # use GPU acceleration via OMP #OPT += -DACCOMP OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE Loading @@ -79,13 +79,7 @@ DEPS = w-stacking.h main.c allvars.h ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif ifneq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else else ifneq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif Loading gridding_cpu.c +2 −1 Original line number Diff line number Diff line Loading @@ -122,6 +122,7 @@ void gridding_data() #warning "the counter of this loop should not be int" for( int iphi = histo_send[isector]-1; iphi >=0 ; iphi--) { uint ilocal = sectorarray[isector][iphi]; uus[icount] = data.uu[ilocal]; Loading Loading @@ -171,7 +172,7 @@ void gridding_data() timing_wt.compose += CPU_TIME_wt - start; printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); // Make convolution on the grid Loading gridding_nccl.cpp +42 −42 Original line number Diff line number Diff line #include "allvars.h" #include "allvars_nccl.h" #include "proto.h" #include <cuda.h> #include <cuda_runtime.h> Loading @@ -14,7 +14,7 @@ #if defined( NCCL_REDUCE ) /* #define NCCLCHECK(cmd) do { ncclResult_t r = cmd; if (r!= ncclSuccess) { Loading @@ -23,7 +23,7 @@ if (r!= ncclSuccess) { exit(EXIT_FAILURE); } } while(0) */ static uint64_t getHostHash(const char* string) { // Based on DJB2a, result = result * 33 ^ char Loading Loading @@ -66,6 +66,33 @@ void gridding_data(){ if ( rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays // note: we use the largest allocation among all sectors double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; //Initialize nccl double * grid_gpu, *gridss_gpu; Loading Loading @@ -96,31 +123,6 @@ void gridding_data(){ ncclCommInitRank(&comm, size, id, rank); // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays // note: we use the largest allocation among all sectors double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; for (uint isector = 0; isector < nsectors; isector++) { Loading @@ -135,10 +137,9 @@ void gridding_data(){ #warning "the counter of this loop should not be int" for(int iphi = histo_send[isector]-1; iphi>=0; iphi--) { uint ilocal = sectorarray[isector][iphi]; //double vvh = data.vv[ilocal]; //int binphi = (int)(vvh*nsectors); //if (binphi == isector || boundary[ilocal] == isector) { uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; Loading @@ -151,7 +152,7 @@ void gridding_data(){ { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); inu++; } icount++; Loading Loading @@ -186,7 +187,7 @@ void gridding_data(){ timing_wt.compose += CPU_TIME_wt - start; printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); // Make convolution on the grid Loading Loading @@ -218,11 +219,11 @@ void gridding_data(){ //Allocate memory on devices non-blocking for the host /////////////////////////////////////////////////////// cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce); timing_wt.kernel += CPU_TIME_wt - start; cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce); #ifdef VERBOSE printf("Processed sector %ld\n",isector); #endif Loading @@ -240,7 +241,7 @@ void gridding_data(){ cudaStreamSynchronize(stream_reduce); NCCLCHECK(ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce)); ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); cudaStreamSynchronize(stream_reduce); timing_wt.reduce += CPU_TIME_wt - start; Loading @@ -250,17 +251,16 @@ void gridding_data(){ // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); free(memory); } //Copy data back from device to host (to be deleted in next steps) free(memory); cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); MPI_Barrier(MPI_COMM_WORLD); cudaStreamSynchronize(stream_reduce); cudaFree(gridss_gpu); cudaFree(grid_gpu); Loading proto.h +9 −1 Original line number Diff line number Diff line Loading @@ -11,9 +11,17 @@ void readMetaData(char fileLocal[1000]); void metaData_calculation(); void allocate_memory(); void readData(); #ifdef __cplusplus extern "C" { void shutdown_wstacking( int, char *, char *, int); } #else void shutdown_wstacking( int, char *, char *, int); #endif #ifdef __cplusplus extern "C" { void gridding (void); Loading Loading
Build/Makefile.local +2 −2 Original line number Diff line number Diff line Loading @@ -4,8 +4,8 @@ CXX = g++ MPICC = mpicc MPIC++ = mpiCC OPTIMIZE = -ggdb3 -O4 -fopenmp -march=native -mavx -mavx2 OMP_GPU = -ggdb3 -mp=multicore,gpu -gpu=cuda12.0 -gpu=cc86 OPTIMIZE = -O4 -fopenmp -march=native -mavx -mavx2 OMP_GPU = -mp=multicore,gpu -gpu=cuda12.0 -gpu=cc86 CUDA_INC = -I/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/include CUDA_LIB = -L/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/lib64 Loading
Makefile +3 −9 Original line number Diff line number Diff line Loading @@ -55,10 +55,10 @@ OPT += -DPHASE_ON #OPT += -DNVIDIA #use cuda for GPUs OPT += -DCUDACC #OPT += -DCUDACC # use GPU acceleration via OMP #OPT += -DACCOMP OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE Loading @@ -79,13 +79,7 @@ DEPS = w-stacking.h main.c allvars.h ifneq (ACCOMP,$(findstring ACCOMP, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif ifneq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o else else ifneq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o endif Loading
gridding_cpu.c +2 −1 Original line number Diff line number Diff line Loading @@ -122,6 +122,7 @@ void gridding_data() #warning "the counter of this loop should not be int" for( int iphi = histo_send[isector]-1; iphi >=0 ; iphi--) { uint ilocal = sectorarray[isector][iphi]; uus[icount] = data.uu[ilocal]; Loading Loading @@ -171,7 +172,7 @@ void gridding_data() timing_wt.compose += CPU_TIME_wt - start; printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); // Make convolution on the grid Loading
gridding_nccl.cpp +42 −42 Original line number Diff line number Diff line #include "allvars.h" #include "allvars_nccl.h" #include "proto.h" #include <cuda.h> #include <cuda_runtime.h> Loading @@ -14,7 +14,7 @@ #if defined( NCCL_REDUCE ) /* #define NCCLCHECK(cmd) do { ncclResult_t r = cmd; if (r!= ncclSuccess) { Loading @@ -23,7 +23,7 @@ if (r!= ncclSuccess) { exit(EXIT_FAILURE); } } while(0) */ static uint64_t getHostHash(const char* string) { // Based on DJB2a, result = result * 33 ^ char Loading Loading @@ -66,6 +66,33 @@ void gridding_data(){ if ( rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays // note: we use the largest allocation among all sectors double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; //Initialize nccl double * grid_gpu, *gridss_gpu; Loading Loading @@ -96,31 +123,6 @@ void gridding_data(){ ncclCommInitRank(&comm, size, id, rank); // find the largest value in histo_send[] // uint Nsec = histo_send[0]; for (uint isector = 1; isector < nsectors; isector++) Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec ); uint Nweightss = Nsec*metaData.polarisations; uint Nvissec = Nweightss*metaData.freq_per_chan; // allocate sector arrays // note: we use the largest allocation among all sectors double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t) ); if ( memory == NULL ) shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); double_t *uus = (double*) memory; double_t *vvs = (double*) uus+Nsec; double_t *wws = (double*) vvs+Nsec; float_t *weightss = (float_t*)(wws+Nsec); float_t *visreals = weightss + Nweightss; float_t *visimgs = visreals + Nvissec; for (uint isector = 0; isector < nsectors; isector++) { Loading @@ -135,10 +137,9 @@ void gridding_data(){ #warning "the counter of this loop should not be int" for(int iphi = histo_send[isector]-1; iphi>=0; iphi--) { uint ilocal = sectorarray[isector][iphi]; //double vvh = data.vv[ilocal]; //int binphi = (int)(vvh*nsectors); //if (binphi == isector || boundary[ilocal] == isector) { uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; Loading @@ -151,7 +152,7 @@ void gridding_data(){ { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); inu++; } icount++; Loading Loading @@ -186,7 +187,7 @@ void gridding_data(){ timing_wt.compose += CPU_TIME_wt - start; printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); // Make convolution on the grid Loading Loading @@ -218,11 +219,11 @@ void gridding_data(){ //Allocate memory on devices non-blocking for the host /////////////////////////////////////////////////////// cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce); timing_wt.kernel += CPU_TIME_wt - start; cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce); #ifdef VERBOSE printf("Processed sector %ld\n",isector); #endif Loading @@ -240,7 +241,7 @@ void gridding_data(){ cudaStreamSynchronize(stream_reduce); NCCLCHECK(ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce)); ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); cudaStreamSynchronize(stream_reduce); timing_wt.reduce += CPU_TIME_wt - start; Loading @@ -250,17 +251,16 @@ void gridding_data(){ // Go to next sector memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); free(memory); } //Copy data back from device to host (to be deleted in next steps) free(memory); cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce); MPI_Barrier(MPI_COMM_WORLD); cudaStreamSynchronize(stream_reduce); cudaFree(gridss_gpu); cudaFree(grid_gpu); Loading
proto.h +9 −1 Original line number Diff line number Diff line Loading @@ -11,9 +11,17 @@ void readMetaData(char fileLocal[1000]); void metaData_calculation(); void allocate_memory(); void readData(); #ifdef __cplusplus extern "C" { void shutdown_wstacking( int, char *, char *, int); } #else void shutdown_wstacking( int, char *, char *, int); #endif #ifdef __cplusplus extern "C" { void gridding (void); Loading