#SBATCH --cpus-per-task 32
#SBATCH --gres=gpu:1
#SBATCH --partition=boost_usr_prod

# Trace every command (-x), echo script lines as read (-v), exit on error (-e).
set -xev #exit on error

export DEBUG=1
# NOTE(review): presumably raises the CUDA stack size for deep tree recursion
# on the device — confirm against the NVHPC runtime docs.
export NV_ACC_CUDA_STACKSIZE=15000

# Sweep parameters: particles per leaf, OpenMP thread counts, execution mode,
# and particle count.
PPL=(1 8 16 32)
THREADS=(1 2 4 8 16 32)
MODE=(gpu cpu)
N=(50000000) #scaling will use 5e7 particles

#
# test tree build on GPU and CPU
#
OMP_NUM_THREADS=4 CC='gcc' HW_FLAGS='-fopenmp -O3' srun -n 1 python -m hotwheels_core.wrap src/test_target.cpp
OMP_NUM_THREADS=4 CC='nvc++' HW_FLAGS='-mp=gpu -gpu=rdc,managed -gpu=cc80 -Minfo=mp,accel -Minline=10000 -O3' srun -n 1 python -m hotwheels_core.wrap src/test_target.cpp

# Start each job with a fresh report.
rm -f report.csv

# loop over tree build configurations PPL, THREADS, MODE, N
for ppl in "${PPL[@]}"; do
  for n in "${N[@]}"; do
    for mode in "${MODE[@]}"; do
      for threads in "${THREADS[@]}"; do
        if [[ "$mode" == "cpu" ]]; then
          export CC=gcc
          export HW_FLAGS='-O3 -fopenmp'
          export OMP_NUM_THREADS=$threads
        else
          # The GPU path does not sweep host threads: run it exactly once
          # (at the threads==8 iteration) and skip the rest.
          if [[ "$threads" != 8 ]]; then
            continue
          fi
          export CC=nvc++
          export HW_FLAGS='-mp=gpu -gpu=rdc,managed -gpu=cc80 -Minfo=mp,accel -Minline=1000 -O3'
          export OMP_NUM_THREADS=4 # gpu always uses 4 threads
        fi
        # run the tree build for this configuration
        srun -n 1 python src/testgpu.py -ppl "$ppl" -n "$n" -mode "$mode" -threads "$threads" -defrag 1 1>run.out 2>run.err
        # grep the timer out of the run log
        t=$(grep time: run.out | sed 's/time://')
        # store one space-separated record per configuration in the report
        echo "$ppl" "$n" "$mode" "$threads" "$t" >> report.csv
      done
    done
  done
done