scripts/bench.go_gpu  (+91 −55)

 #!/bin/bash
-#SBATCH -A clusters
-#SBATCH -p pleiadi-gpu
-#SBATCH -J LOFAR
+#SBATCH -A IscrC_RICK
+#SBATCH -p boost_usr_prod
+##SBATCH --qos boost_qos_bprod
+#SBATCH -J RICK
 ### number of nodes
-#SBATCH -N 4
+#SBATCH -N 1
 ### number of hyperthreading threads
 #SBATCH --ntasks-per-core=1
 ### number of MPI tasks per node
-#SBATCH --ntasks-per-node=36
-#SBATCH -n 144
+#SBATCH --ntasks-per-node=4
+#SBATCH -n 4
 ### number of openmp threads
-#SBATCH --cpus-per-task=1
+#SBATCH --cpus-per-task=8
 ### number of allocated GPUs per node
-##SBATCH --gres=gpu:2
-#SBATCH --mem=110G
+#SBATCH --gpus-per-node=4
+#SBATCH --mem=450G
 #SBATCH -o test.out
 #SBATCH -e test.err
-#SBATCH -t 08:00:00
+#SBATCH -t 03:00:00

+module purge
+module load openmpi/
+module load fftw/
+module load nvhpc/

 echo $SLURM_NODELIST

-export MODULE_VERSION=5.0.1
-source /opt/cluster/spack/share/spack/setup-env.sh
-module load default-gcc-11.2.0
-export PATH=/opt/cluster/spack/nvidia/gcc/11.2.0/bin:$PATH
-export LD_LIBRARY_PATH=/opt/cluster/spack/nvidia/gcc/11.2.0/lib64:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/lib:$LD_LIBRARY_PATH

 export OMPI_CC=gcc
 export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

-cd /u/glacopo/LOFAR/hpc_imaging-end-of-re_structuring/
-make SYSTYPE=Amonra_gpu -j1 clean
-rm -f w-stackingCfftw_ring
-make SYSTYPE=Amonra_gpu -j1 mpi_ring_omp
+cd ../
+make -j1 clean
+rm -f w-stacking_fftw_acc-omp_acc-reduce
+make -j1 w-stacking

-export use_cuda=no
-
-if [ "$use_cuda" = "no" ]
-then
-    export typestring=omp_cpu
-    export exe=w-stackingCfftw_ring_omp
-fi

 OUT_SHM=result_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}
 OUT_SHM_RES=/leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/scripts/Tests/times_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}

-if [ "$use_cuda" = "yes" ]
-then
-    export typestring=cuda
-    export exe=w-stackingfftw
-fi

 rm -f ${OUT_SHM} ${OUT_SHM_RES}

-Tmin=2
-Tmax=32
+export typestring=omp_gpu
+export exe=w-stacking_fftw_acc-omp_acc-reduce

-for (( t=$Tmin; t<=$Tmax; t*=2 ))
-do
-    N=$(( 4*${t} ))
-    echo -e "\tRunning $t tasks per node\n"
-    export logdir=mpi_${N}_${typestring}_${SLURM_CPUS_PER_TASK}
+export logdir=mpi_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}

 echo "Creating $logdir"
 rm -fr $logdir
 mkdir $logdir

-for itest in {1..3}
+for itest in {1..10}
 do
     export logfile=test_${itest}_${logdir}.log

-    echo "time mpirun -np ${N} --map-by ppr:${t}:node /u/glacopo/LOFAR/hpc_imaging-end-of-re_structuring/${exe} /u/glacopo/LOFAR/hpc_imaging-end-of-re_structuring/data/paramfile.txt" > $logfile
-    time mpirun -np ${N} --map-by ppr:${t}:node --bind-to core --mca btl self,vader /u/glacopo/LOFAR/hpc_imaging-end-of-re_structuring/${exe} /u/glacopo/LOFAR/hpc_imaging-end-of-re_structuring/data/paramfile.txt >> $logfile
+    echo "time mpirun -np ${SLURM_NTASKS} --map-by ppr:4:node /leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/${exe} /leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/data/paramfile.txt" > $logfile
+    time mpirun -np ${SLURM_NTASKS} --map-by ppr:4:node --bind-to core --mca btl self,vader /leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/${exe} data/paramfile.txt >> $logfile

     mv $logfile $logdir
     mv timings.dat ${logdir}/timings_${itest}.dat
     cat ${logdir}/timings_all.dat ${logdir}/timings_${itest}.dat >> ${logdir}/timings_all.dat

     Reduce_time=$( grep -w 'Reduce time :' $logdir/$logfile | gawk '{print $4}' )
     FFTW_time=$( grep -w 'FFTW time :' $logdir/$logfile | gawk '{print $4}' )
     Composition_time=$( grep -w 'Array Composition time :' $logdir/$logfile | gawk '{print $5}' )
     Writing_time=$( grep -w ' Image writing time :' $logdir/$logfile | gawk '{print $5}' )
     Total_time=$( grep -w 'TOT time :' $logdir/$logfile | gawk '{print $4}' )

     #Not relevant for the paper
     Setup_time=$( grep -w 'Setup time:' $logdir/$logfile | gawk '{print $3}' )
     Kernel_time=$( grep -w 'Kernel time :' $logdir/$logfile | gawk '{print $4}' )
     Phase_time=$( grep -w 'Phase time :' $logdir/$logfile | gawk '{print $4}' )
     ##########################

     echo $itest $Reduce_time $FFTW_time $Composition_time $Writing_time $Total_time $Setup_time $Kernel_time $Phase_time >> ${OUT_SHM}
 done
-done

 echo -e "\n\n" >> ${OUT_SHM}

 avg_red=$( awk '{sum+=$2} END { print sum/10 }' ${OUT_SHM} )
 avg_fftw=$( awk '{sum+=$3} END { print sum/10 }' ${OUT_SHM} )
 avg_comp=$( awk '{sum+=$4} END { print sum/10 }' ${OUT_SHM} )
 avg_write=$( awk '{sum+=$5} END { print sum/10 }' ${OUT_SHM} )
 avg_tot=$( awk '{sum+=$6} END { print sum/10 }' ${OUT_SHM} )
 std_red=$( awk '{if($2!=""){count++;sum+=$2};y+=$2^2} END{sq=sqrt(y/10-(sum/10)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
 std_fftw=$( awk '{if($3!=""){count++;sum+=$3};y+=$3^2} END{sq=sqrt(y/10-(sum/10)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
 std_comp=$( awk '{if($4!=""){count++;sum+=$4};y+=$4^2} END{sq=sqrt(y/10-(sum/10)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
 std_write=$( awk '{if($5!=""){count++;sum+=$5};y+=$5^2} END{sq=sqrt(y/10-(sum/10)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
 std_tot=$( awk '{if($6!=""){count++;sum+=$6};y+=$6^2} END{sq=sqrt(y/10-(sum/10)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )

 #Not relevant for the paper
 avg_setup=$( awk '{sum+=$7} END { print sum/10 }' ${OUT_SHM} )
 avg_ker=$( awk '{sum+=$8} END { print sum/10 }' ${OUT_SHM} )
 avg_phase=$( awk '{sum+=$9} END { print sum/10 }' ${OUT_SHM} )
 std_setup=$( awk '{if($7!=""){count++;sum+=$7};y+=$7^2} END{sq=sqrt(y/10-(sum/10)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
 std_ker=$( awk '{if($8!=""){count++;sum+=$8};y+=$8^2} END{sq=sqrt(y/10-(sum/10)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
 std_phase=$( awk '{if($9!=""){count++;sum+=$9};y+=$9^2} END{sq=sqrt(y/10-(sum/10)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
 ##########################

 echo "Averages and standard deviations over 10 shots" >> ${OUT_SHM_RES}
 echo -e "\n" ${OUT_SHM_RES}
 echo "${SLURM_NTASKS} MPI tasks; ${SLURM_CPUS_PER_TASK} OpenMP threads per task; ${SLURM_GPUS_PER_NODE} GPUs per node;" >> ${OUT_SHM_RES}
 echo -e "\n\n" ${OUT_SHM_RES}
 echo $avg_red $std_red $avg_fftw $std_fftw $avg_comp $std_comp $avg_write $std_write $avg_tot $std_tot >> ${OUT_SHM_RES}
 echo -e "\n\n" ${OUT_SHM_RES}
 echo $avg_setup $std_setup $avg_ker $std_ker $avg_phase $std_phase >> ${OUT_SHM_RES}

 rm -f ${OUT_SHM}
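A note on the per-run timing aggregation inside the loop: the line "cat ${logdir}/timings_all.dat ${logdir}/timings_${itest}.dat >> ${logdir}/timings_all.dat" reads timings_all.dat while also appending to it; GNU cat typically skips that input with an "input file is output file" warning, and other implementations may duplicate the aggregate each iteration. A minimal sketch of the same bookkeeping, assuming each run writes timings.dat into the working directory, that appends only the new per-run file:

# Move the per-run timings into the log directory, then append only the
# new file to the running aggregate (timings_all.dat is created on first use).
mv timings.dat ${logdir}/timings_${itest}.dat
cat ${logdir}/timings_${itest}.dat >> ${logdir}/timings_all.dat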
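The averaging block at the end computes, for each column of ${OUT_SHM}, the mean and the population standard deviation over the repetitions, with the divisor 10 hard-coded into every awk one-liner. A minimal sketch, assuming the same whitespace-separated layout of ${OUT_SHM} (total time in column 6), that derives the sample count from the data so the statistics stay correct if the number of repetitions changes:

# Mean and population standard deviation of column 6 (TOT time),
# counting only non-empty entries instead of assuming 10 runs.
read avg_tot std_tot <<< "$( awk '$6 != "" { n++; sum += $6; sumsq += $6^2 }
    END { if (n) { m = sum/n; v = sumsq/n - m^2; print m, (v > 0 ? sqrt(v) : 0) } }' ${OUT_SHM} )"

The same pattern applies to the other columns by changing the field number.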