Build/Makefile.leo (+9 −9)

@@ -9,8 +9,8 @@
 OPT_PURE_MPI = -O4 -march=native -mavx -mavx2
 OMP_GPU = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80
-CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/include
-CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/targets/x86_64-linux/lib/stubs
+CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include
+CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs
 FFTW_INCL=
 FFTW_LIB=

@@ -19,18 +19,18 @@
 ##########################################################
 #NVIDIA CUFFTMP
-CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/math_libs/11.8/lib64
-CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/math_libs/11.8/include/cufftmp
+CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64
+CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp
 ##########################################################
-NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nvshmem/include
-NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nvshmem/lib
+NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/
+NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/
 ##########################################################
 #NVIDIA NCCL REDUCE
-NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/include
-NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/lib
+NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include
+NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib
 ##########################################################
 NVC = nvc

@@ -44,7 +44,7 @@
 NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(NCCL_INC) $(NCCL_LIB) -lcudart -lnccl
-NVCC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/11.8/bin/nvcc
+NVCC = nvcc
 OPT_NVCC = -std=c++17 --generate-code arch=compute_80,code=sm_80
 CFLAGS +=

Makefile (+7 −7)

@@ -35,10 +35,10 @@
 FFTWLIBS =
 OPT += -DUSE_FFTW
 # use omp-ized version of fftw routines
-OPT += -DHYBRID_FFTW
+#OPT += -DHYBRID_FFTW
 # switch on the OpenMP parallelization
-OPT += -DUSE_OMP
+#OPT += -DUSE_OMP
 # write the full 3D cube of gridded visibilities and its FFT transform
 #OPT += -DWRITE_DATA

@@ -53,7 +53,7 @@
 #OPT += -DFITSIO
 # Perform true parallel images writing
-#OPT += -DPARALLELIO
+OPT += -DPARALLELIO
 # Normalize uvw in case it is not done in the binMS
 #OPT += -DNORMALIZE_UVW

@@ -74,13 +74,13 @@
 #OPT += -DNVIDIA
 #use cuda for GPUs
-#OPT += -DCUDACC
+OPT += -DCUDACC
 # use GPU acceleration via OMP
-OPT += -DACCOMP
+#OPT += -DACCOMP
 # perform stacking on GPUs
-#OPT += -DGPU_STACKING
+OPT += -DGPU_STACKING
 # use NVIDIA GPU to perform the reduce
 #OPT += -DNCCL_REDUCE

@@ -89,7 +89,7 @@
 #OPT += -DRCCL_REDUCE
 # use GPU to perform FFT
-#OPT += -DCUFFTMP
+OPT += -DCUFFTMP
 #support for AMD GPUs
 #OPT += __HIP_PLATFORM_AMD__
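With this Makefile configuration the GPU path switches from OpenMP offload (-DACCOMP) to native CUDA (-DCUDACC, -DGPU_STACKING), with the distributed FFT handled by cuFFTMp (-DCUFFTMP), so the executable has to be rebuilt from scratch against the NVHPC 23.5 toolchain. A minimal rebuild sketch, assuming the module names and the w-stacking target used in scripts/bench.go_cufftmp below:

#!/bin/bash
# Clean rebuild on Leonardo against NVHPC 23.5
# (module names as loaded in scripts/bench.go_cufftmp).
module load openmpi/ fftw/ nvhpc/23.5 cuda/
make -j1 clean          # drop objects built against the old nvhpc 23.1 paths
make -j1 w-stacking     # produces w-stacking_fftw_acc-omp_acc-fft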
README.md (+7 −0)

@@ -50,5 +50,12 @@
 In the case in which the code has been compiled without either -fopenmp or -D_OPENMP,
 the code is forced to use the standard MPI_Reduce implementation,
 since our reduce works only with OpenMP.
+
+To use cufftMp with nvhpc 23.5, add the following paths to the environment variable `LD_LIBRARY_PATH`:
+###########################################
+export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/:$LD_LIBRARY_PATH"
+export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64/:$LD_LIBRARY_PATH"
+###########################################
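Since both libraries are resolved only at run time, a quick sanity check after exporting the paths is to point ldd at the binary (a sketch; the executable name is the one built by scripts/bench.go_cufftmp):

# Every cufft/nvshmem dependency should resolve into the 23.5 tree,
# with no entry reported as "not found".
ldd w-stacking_fftw_acc-omp_acc-fft | grep -E 'cufft|nvshmem'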
scripts/bench.go_cufftmp (new file, +111 −0)

#!/bin/bash
#SBATCH -A IscrC_RICK
#SBATCH -p boost_usr_prod
##SBATCH --qos boost_qos_bprod
#SBATCH -J RICK
### number of nodes
#SBATCH -N 1
### number of MPI tasks per node
#SBATCH --ntasks-per-node=4
#SBATCH -n 4
### number of openmp threads
#SBATCH --cpus-per-task=8
### number of allocated GPUs per node
#SBATCH --gpus-per-node=4
#SBATCH --mem=450G
#SBATCH -o test.out
#SBATCH -e test.err
#SBATCH -t 03:00:00

module load openmpi/
module load fftw/
module load nvhpc/23.5
module load cuda/

export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/:$LD_LIBRARY_PATH"
export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64/:$LD_LIBRARY_PATH"

export OMPI_CC=gcc
export OMPI_CXX=g++
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
export OMP_PLACES=cores

# rebuild the executable from scratch
cd ../
make -j1 clean
rm -f w-stacking_fftw_acc-omp_acc-fft
make -j1 w-stacking

export typestring=omp_gpu_cufftmp
export exe=w-stacking_fftw_acc-omp_acc-fft

OUT_SHM=result_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}
OUT_SHM_RES=/leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/scripts/Tests/times_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}_large
rm -f ${OUT_SHM} ${OUT_SHM_RES}

export logdir=mpi_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}
echo "Creating $logdir"
rm -fr $logdir
mkdir $logdir

for itest in {1..3}
do
    export logfile=test_${itest}_${logdir}.log
    # record the exact command at the top of the log, then run it
    echo "time mpirun -np ${SLURM_NTASKS} --bind-to core --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:pe=${SLURM_CPUS_PER_TASK} -x OMP_NUM_THREADS --mca btl self,vader /leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/${exe} data/paramfile.txt" > $logfile
    time mpirun -np ${SLURM_NTASKS} --bind-to core --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:pe=${SLURM_CPUS_PER_TASK} -x OMP_NUM_THREADS --mca btl self,vader /leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/${exe} data/paramfile.txt >> $logfile
    mv $logfile $logdir
    mv timings.dat ${logdir}/timings_${itest}.dat
    # append this run's timings to the cumulative file
    cat ${logdir}/timings_${itest}.dat >> ${logdir}/timings_all.dat

    # extract the individual timers from the run log
    Reduce_time=$( grep -w 'Reduce time :' $logdir/$logfile | gawk '{print $4}' )
    FFTW_time=$( grep -w 'cufftMP time :' $logdir/$logfile | gawk '{print $4}' )
    Composition_time=$( grep -w 'Array Composition time :' $logdir/$logfile | gawk '{print $5}' )
    Writing_time=$( grep -w ' Image writing time :' $logdir/$logfile | gawk '{print $5}' )
    Total_time=$( grep -w 'TOT time :' $logdir/$logfile | gawk '{print $4}' )
    # Not relevant for the paper
    Setup_time=$( grep -w 'Setup time:' $logdir/$logfile | gawk '{print $3}' )
    Kernel_time=$( grep -w 'Kernel time :' $logdir/$logfile | gawk '{print $4}' )
    Phase_time=$( grep -w 'Phase time :' $logdir/$logfile | gawk '{print $4}' )
    ##########################
    echo $itest $Reduce_time $FFTW_time $Composition_time $Writing_time $Total_time $Setup_time $Kernel_time $Phase_time >> ${OUT_SHM}
done

echo -e "\n\n" >> ${OUT_SHM}

# averages and standard deviations over the 3 runs
avg_red=$( awk '{sum+=$2} END { print sum/3 }' ${OUT_SHM} )
avg_fftw=$( awk '{sum+=$3} END { print sum/3 }' ${OUT_SHM} )
avg_comp=$( awk '{sum+=$4} END { print sum/3 }' ${OUT_SHM} )
avg_write=$( awk '{sum+=$5} END { print sum/3 }' ${OUT_SHM} )
avg_tot=$( awk '{sum+=$6} END { print sum/3 }' ${OUT_SHM} )

std_red=$( awk '{if($2!=""){count++;sum+=$2};y+=$2^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_fftw=$( awk '{if($3!=""){count++;sum+=$3};y+=$3^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_comp=$( awk '{if($4!=""){count++;sum+=$4};y+=$4^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_write=$( awk '{if($5!=""){count++;sum+=$5};y+=$5^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_tot=$( awk '{if($6!=""){count++;sum+=$6};y+=$6^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )

# Not relevant for the paper
avg_setup=$( awk '{sum+=$7} END { print sum/3 }' ${OUT_SHM} )
avg_ker=$( awk '{sum+=$8} END { print sum/3 }' ${OUT_SHM} )
avg_phase=$( awk '{sum+=$9} END { print sum/3 }' ${OUT_SHM} )
std_setup=$( awk '{if($7!=""){count++;sum+=$7};y+=$7^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_ker=$( awk '{if($8!=""){count++;sum+=$8};y+=$8^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_phase=$( awk '{if($9!=""){count++;sum+=$9};y+=$9^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )

##########################
echo "Averages and standard deviations over 3 shots" >> ${OUT_SHM_RES}
echo -e "\n" >> ${OUT_SHM_RES}
echo "${SLURM_NTASKS} MPI tasks; ${SLURM_CPUS_PER_TASK} OpenMP threads per task; ${SLURM_GPUS_PER_NODE} GPUs per node;" >> ${OUT_SHM_RES}
echo -e "\n\n" >> ${OUT_SHM_RES}
echo $avg_red $std_red $avg_fftw $std_fftw $avg_comp $std_comp $avg_write $std_write $avg_tot $std_tot >> ${OUT_SHM_RES}
echo -e "\n\n" >> ${OUT_SHM_RES}
echo $avg_setup $std_setup $avg_ker $std_ker $avg_phase $std_phase >> ${OUT_SHM_RES}

rm -f ${OUT_SHM}
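Each std_* line above computes the population standard deviation over the three runs, sqrt(E[x^2] - E[x]^2), with a fallback to 0 when the column is empty. A quick check of the formula with three made-up timings (hypothetical values, not real measurements):

# mean = 11, deviations -1/+1/0, so std = sqrt(2/3) ~ 0.816497
printf '1 10.0\n2 12.0\n3 11.0\n' | awk '{sum+=$2; y+=$2^2} END{print sqrt(y/3-(sum/3)^2)}'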
scripts/create_binMS.py (new file, +218 −0)

USE_MPI = 0

import numpy as np
import casacore.tables as pt
import time
import sys
import os

# output binMS directory, named after the input MS
#outpath = '/data/gridding/data/shortgauss_t201806301100_SBH255.binMS/'
print(sys.argv[1])
outpath = "/data/gridding/data/Lofarbig/"+sys.argv[1]+".binMS/"
os.mkdir(outpath)

ufile = 'ucoord.bin'
vfile = 'vcoord.bin'
wfile = 'wcoord.bin'
weights = 'weights.bin'
visrealfile = 'visibilities_real.bin'
visimgfile = 'visibilities_img.bin'
metafile = 'meta.txt'
offset = 0.0

if USE_MPI == 1:
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print(rank,size)
else:
    comm = 0
    rank = 0
    size = 1
    num_threads = 1

# input MS
readtime0 = time.time()
#msfile = "/data/Lofar-data/results/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.ms"
msfile = "/data/Lofar-Luca/results/"+sys.argv[1]+".ms/"
ms = pt.table(msfile, readonly=True, ack=False)

if rank == 0:
    print("Reading ", msfile)
    print("Writing ", outpath)
    # load data and metadata
    with pt.table(msfile + '::SPECTRAL_WINDOW', ack=False) as freqtab:
        freq = freqtab.getcol('REF_FREQUENCY')[0] / 1000000.0
        freqpersample = np.mean(freqtab.getcol('RESOLUTION'))
    timepersample = ms.getcell('INTERVAL',0)
    print("Frequencies (MHz)   : ",freq)
    print("Time interval (sec) : ",timepersample)
    with pt.taql("SELECT ANTENNA1,ANTENNA2,sqrt(sumsqr(UVW)),GCOUNT() FROM $ms GROUPBY ANTENNA1,ANTENNA2") as BL:
        ants1, ants2 = BL.getcol('ANTENNA1'), BL.getcol('ANTENNA2')
        Ntime = BL.getcol('Col_4')[0]  # number of timesteps
        Nbaselines = len(ants1)
    print("Number of timesteps : ",Ntime)
    print("Total obs time (hrs): ",timepersample*Ntime/3600)
    print("Number of baselines : ",Nbaselines)
    #sp = pt.table(msfile+'::LOFAR_ANTENNA_FIELD', readonly=True, ack=False, memorytable=True).getcol('POSITION')
    ant1, ant2 = ms.getcol('ANTENNA1'), ms.getcol('ANTENNA2')
    number_of_measures = Ntime * Nbaselines
    #nm_pe_aux = int(number_of_measures / size)
    #remaining_aux = number_of_measures % size
    nm_pe = int(number_of_measures / size)
    remaining = number_of_measures % size
    print(nm_pe,remaining)
else:
    nm_pe = None
    remaining = None

if USE_MPI == 1:
    nm_pe = comm.bcast(nm_pe, root=0)
    remaining = comm.bcast(remaining, root=0)

# set the data domain for each MPI rank; the last rank absorbs the remainder
startrow = rank*nm_pe
if rank == size-1:
    nm_pe = nm_pe+remaining
print(rank,nm_pe,remaining)
nrow = nm_pe

# read data
uvw = ms.getcol('UVW',startrow,nrow)
vis = ms.getcol('DATA',startrow,nrow)
weight = ms.getcol('WEIGHT_SPECTRUM',startrow,nrow)
print("Freqs per channel       : ",vis.shape[1])
print("Polarizations           : ",vis.shape[2])
print("Number of observations  : ",uvw.shape[0])
print("Data size (MB)          : ",uvw.shape[0]*vis.shape[1]*vis.shape[2]*2*4/1024.0/1024.0)

# set parameters
num_points = uvw.shape[0]
num_w_planes = 1
grid_size = 100   # number of cells of the grid

# serialize arrays
vis_ser_real = vis.real.flatten()
vis_ser_img = vis.imag.flatten()
print("data types: uvw = ",uvw.dtype," vis = ",vis_ser_real.dtype)
#vis_ser = np.zeros(2*vis_ser_real.size)
#for i in range(vis_ser_real.size):
#    vis_ser[2*i]=vis_ser_real[i]
#    vis_ser[2*i+1]=vis_ser_img[i]
uu_ser = uvw[:,0].flatten()
vv_ser = uvw[:,1].flatten()
ww_ser = uvw[:,2].flatten()
weight_ser = weight.flatten()

grid = np.zeros(2*num_w_planes*grid_size*grid_size)      # complex!
gridtot = np.zeros(2*num_w_planes*grid_size*grid_size)   # complex!
peanokeys = np.empty(vis_ser_real.size,dtype=np.uint64)
gsize = grid.size

hist, bin_edges = np.histogram(ww_ser,num_w_planes)
print(hist)
print(vis_ser_real.dtype)

# normalize uv
minu = np.amin(uu_ser)
maxu = np.amax(abs(uu_ser))
minv = np.amin(vv_ser)
maxv = np.amax(abs(vv_ser))
minw = np.amin(ww_ser)
maxw = np.amax(ww_ser)

if USE_MPI == 1:
    maxu_all = np.array(0,dtype=np.float64)
    maxv_all = np.array(0,dtype=np.float64)
    maxw_all = np.array(0,dtype=np.float64)
    minu_all = np.array(0,dtype=np.float64)
    minv_all = np.array(0,dtype=np.float64)
    minw_all = np.array(0,dtype=np.float64)
    comm.Allreduce(maxu, maxu_all, op=MPI.MAX)
    comm.Allreduce(maxv, maxv_all, op=MPI.MAX)
    comm.Allreduce(maxw, maxw_all, op=MPI.MAX)
    comm.Allreduce(minu, minu_all, op=MPI.MIN)
    comm.Allreduce(minv, minv_all, op=MPI.MIN)
    comm.Allreduce(minw, minw_all, op=MPI.MIN)
    ming = min(minu_all,minv_all)
    maxg = max(maxu_all,maxv_all)
    minw = minw_all
    maxw = maxw_all
    ming = ming-offset*ming
    maxg = maxg+offset*maxg
else:
    ming = min(minu,minv)
    maxg = max(maxu,maxv)
    ming = ming-offset*ming
    maxg = maxg+offset*maxg

print(maxu,maxv,maxg)
#uu_ser = (uu_ser-ming)/(maxg-ming)
#vv_ser = (vv_ser-ming)/(maxg-ming)
uu_ser = (uu_ser+maxg)/(2*maxg)      # map u,v from [-maxg,maxg] to [0,1]
vv_ser = (vv_ser+maxg)/(2*maxg)
ww_ser = (ww_ser-minw)/(maxw-minw)   # map w from [minw,maxw] to [0,1]
#print(uu_ser.shape, vv_ser.dtype, ww_ser.dtype, vis_ser_real.shape, vis_ser_img.dtype, weight_ser.dtype, grid.dtype)
print(np.amin(uu_ser),np.amax(uu_ser))
print(np.amin(vv_ser),np.amax(vv_ser))
print(np.amin(ww_ser),np.amax(ww_ser))

# set normalized uvw - mesh conversion factors
dx = 1.0/grid_size
dw = 1.0/num_w_planes

readtime1 = time.time()

if rank == 0:
    outfile = outpath+ufile
    uu_ser.tofile(outfile,sep='')
    outfile = outpath+vfile
    vv_ser.tofile(outfile,sep='')
    outfile = outpath+wfile
    ww_ser.tofile(outfile,sep='')
    outfile = outpath+weights
    weight_ser.tofile(outfile,sep='')
    outfile = outpath+visrealfile
    vis_ser_real.tofile(outfile,sep='')
    outfile = outpath+visimgfile
    vis_ser_img.tofile(outfile,sep='')

    # plain-text metadata describing the binary files
    outfile = outpath+metafile
    f = open(outfile, 'w')
    f.writelines(str(uu_ser.size)+"\n")
    f.writelines(str(vis_ser_real.size)+"\n")
    f.writelines(str(vis.shape[1])+"\n")
    f.writelines(str(vis.shape[2])+"\n")
    f.writelines(str(Ntime)+"\n")
    f.writelines(str(timepersample)+"\n")
    f.writelines(str(timepersample*Ntime/3600)+"\n")
    f.writelines(str(Nbaselines)+"\n")
    f.writelines(str(ming)+"\n")
    f.writelines(str(maxg)+"\n")
    f.writelines(str(minw)+"\n")
    f.writelines(str(maxw)+"\n")
    f.close()
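The script converts a casacore MeasurementSet into the flat binMS layout: one raw binary file per column, with normalized u,v,w coordinates, plus a plain-text meta.txt. A hedged usage sketch, reusing the dataset name from the commented-out example path in the script and the input/output roots hard-coded there:

# The script expects the MS under /data/Lofar-Luca/results/<name>.ms/ and
# creates /data/gridding/data/Lofarbig/<name>.binMS/ (os.mkdir fails if it exists).
python create_binMS.py L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal
ls /data/gridding/data/Lofarbig/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/
# expected contents: ucoord.bin vcoord.bin wcoord.bin weights.bin
#                    visibilities_real.bin visibilities_img.bin meta.txt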