Loading Build/Makefile.leo 0 → 100644 +70 −0 Original line number Diff line number Diff line CC = gcc CXX = g++ MPICC = mpicc MPIC++ = mpic++ OPTIMIZE = -O4 -fopenmp -march=native -mavx -mavx2 OMP_GPU = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80 CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/include CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/lib64 FFTW_INCL= FFTW_LIB= ########################################################## #NVIDIA CUFFTMP CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/math_libs/11.8/lib64 CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/math_libs/11.8/include/cufftmp ########################################################## NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nvshmem/include NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nvshmem/lib ########################################################## #NVIDIA NCCL REDUCE NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/include NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/lib ########################################################## NVC = nvc NVC++ = nvc++ NVFLAGS = -O4 -fast -march=native $(OMP_GPU) -mavx -mavx2 NVLIB = $(CUDA_INC) $(CUDA_LIB) 
-lcudart NVLIB_2 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(CUFFTMP_INC) $(CUFFTMP_LIB) $(NVSHMEM_INC) $(NVSHMEM_LIB) -lnvshmem_host -lnvshmem_device -lcudart -lcufftMp NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(NCCL_INC) $(NCCL_LIB) -lcudart -lnccl NVCC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/11.8/bin/nvcc OPT_NVCC = -std=c++17 --generate-code arch=compute_80,code=sm_80 CFLAGS += MPICHLIB = ########################################################## #AMD GPUs (DEFAULT = LUMI) CLANG = clang CLANG++ = clang++ OPTIMIZE_AMD = -O3 -Ofast -fopenmp -march=native -mavx -mavx2 -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a RCCL_INCL= -I/opt/rocm-5.2.3/rccl/include RCCL_LIB= -L/opt/rocm-5.2.3/rccl/lib HIP_INCL= -I/opt/rocm-5.2.3/hip/include HIP_LIB= -L/opt/rocm-5.2.3/hip/lib AMDLIB = $(HIP_INCL) $(HIP_LIB) $(RCCL_INCL) $(RCCL_LIB) -lamdhip64 -lrccl ########################################################### Makefile +4 −4 Original line number Diff line number Diff line Loading @@ -141,7 +141,7 @@ OBJ_RCCL_REDUCE = gridding_rccl.o # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT # DEPS_ACC_CUFFTMP = w-stacking.h cuda_fft.cu DEPS_ACC_CUFFTMP = w-stacking_omp.h cuda_fft.cpp OBJ_ACC_CUFFTMP = cuda_fft.o Loading Loading @@ -251,11 +251,11 @@ endif ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-fft LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_2) $(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c $^ $(LIBS) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_ACC_CUFFTMP) endif Loading fourier_transform.c +7 −4 Original line number Diff line number Diff line #include "allvars.h" #include "proto.h" #if defined(CUFFTMP) #include "w-stacking_omp.h" #endif // ------------------------------------ #if defined(USE_FFTW) && !defined(CUFFTMP) // PERFORM FFT on CPU with FFTW 
Loading Loading @@ -106,13 +108,14 @@ void fftw_data ( void ) double start = CPU_TIME_wt; cuda_fft( num_w_planes, grid_size_x, grid_size_y, param.num_w_planes, param.grid_size_x, param.grid_size_y, xaxis, yaxis, grid, gridss, rank, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); Loading main.c +1 −1 Original line number Diff line number Diff line Loading @@ -77,7 +77,7 @@ int main(int argc, char * argv[]) FFT_INIT; #if defined(CUDACC) || defined(CUFFTMP) #if defined(CUDACC) int ndevices; cudaGetDeviceCount(&ndevices); cudaSetDevice(rank % ndevices); Loading w-stacking.h +2 −0 Original line number Diff line number Diff line Loading @@ -97,6 +97,8 @@ void cuda_fft( int, double*, double*, int, MPI_Comm); #endif Loading
# Build/Makefile.leo
#
# Machine-specific build settings: compilers, optimisation flags and the
# include/library search paths for the NVIDIA HPC SDK components (CUDA,
# cuFFTMp, NVSHMEM, NCCL) and for the AMD ROCm components (HIP, RCCL).
#
# Every toolkit path below is derived from the three LEO_* variables defined
# first, so switching SDK installation or CUDA version is a one-line change
# (or a command-line override, e.g. `make LEO_CUDA_VERSION=...`).
# With the defaults, every variable expands to exactly the same value as the
# fully spelled-out paths did.

# Root of the NVIDIA HPC SDK tree (the directory that holds cuda/, math_libs/
# and comm_libs/). No trailing slash, no trailing blanks.
LEO_NVHPC_SDK ?= /leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1

# CUDA version directory used under cuda/, math_libs/ and comm_libs/, and
# passed to the NVIDIA compilers through -gpu=cuda<version>.
LEO_CUDA_VERSION ?= 11.8

# Root of the ROCm installation used by the AMD section at the bottom.
LEO_ROCM_ROOT ?= /opt/rocm-5.2.3

##########################################################
# Host compilers and MPI wrappers

CC       = gcc
CXX      = g++

MPICC    = mpicc
MPIC++   = mpic++

# Host-side optimisation flags
OPTIMIZE = -O4 -fopenmp -march=native -mavx -mavx2

# Offload options consumed by NVFLAGS below
OMP_GPU  = -mp=multicore,gpu -gpu=cuda$(LEO_CUDA_VERSION) -gpu=cc80

CUDA_INC = -I$(LEO_NVHPC_SDK)/cuda/$(LEO_CUDA_VERSION)/include
CUDA_LIB = -L$(LEO_NVHPC_SDK)/cuda/$(LEO_CUDA_VERSION)/lib64

# Intentionally empty in this configuration
FFTW_INCL=
FFTW_LIB=

##########################################################
#NVIDIA CUFFTMP

CUFFTMP_LIB = -L$(LEO_NVHPC_SDK)/math_libs/$(LEO_CUDA_VERSION)/lib64
CUFFTMP_INC = -I$(LEO_NVHPC_SDK)/math_libs/$(LEO_CUDA_VERSION)/include/cufftmp
##########################################################

NVSHMEM_INC = -I$(LEO_NVHPC_SDK)/comm_libs/$(LEO_CUDA_VERSION)/nvshmem/include
NVSHMEM_LIB = -L$(LEO_NVHPC_SDK)/comm_libs/$(LEO_CUDA_VERSION)/nvshmem/lib

##########################################################
#NVIDIA NCCL REDUCE

NCCL_INC = -I$(LEO_NVHPC_SDK)/comm_libs/$(LEO_CUDA_VERSION)/nccl/include
NCCL_LIB = -L$(LEO_NVHPC_SDK)/comm_libs/$(LEO_CUDA_VERSION)/nccl/lib
##########################################################

# NVIDIA HPC compilers and their flags
NVC     = nvc
NVC++   = nvc++
NVFLAGS = -O4 -fast -march=native $(OMP_GPU) -mavx -mavx2

# Link/include sets:
#   NVLIB    CUDA runtime only
#   NVLIB_2  CUDA runtime + NVSHMEM + cuFFTMp
#   NVLIB_3  CUDA runtime + NCCL
# NOTE(review): MPI_INC and MPI_LIB are not defined anywhere in this file, so
# they expand to nothing unless the including Makefile, the command line or
# the environment provides them -- confirm that this is intended.
NVLIB   = $(CUDA_INC) $(CUDA_LIB) -lcudart
NVLIB_2 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(CUFFTMP_INC) $(CUFFTMP_LIB) $(NVSHMEM_INC) $(NVSHMEM_LIB) -lnvshmem_host -lnvshmem_device -lcudart -lcufftMp
NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(NCCL_INC) $(NCCL_LIB) -lcudart -lnccl

# NOTE(review): this nvcc path lives under /opt/nvidia/hpc_sdk, not under the
# SDK root used for every other path in this file -- verify that it exists on
# the target machine before relying on it.
NVCC     = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/11.8/bin/nvcc
OPT_NVCC = -std=c++17 --generate-code arch=compute_80,code=sm_80

# Nothing is appended here; the line is kept as the hook for extra C flags.
CFLAGS +=

MPICHLIB =

##########################################################
#AMD GPUs (DEFAULT = LUMI)

CLANG   = clang
CLANG++ = clang++

OPTIMIZE_AMD = -O3 -Ofast -fopenmp -march=native -mavx -mavx2 -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a

RCCL_INCL= -I$(LEO_ROCM_ROOT)/rccl/include
RCCL_LIB= -L$(LEO_ROCM_ROOT)/rccl/lib

HIP_INCL= -I$(LEO_ROCM_ROOT)/hip/include
HIP_LIB= -L$(LEO_ROCM_ROOT)/hip/lib

AMDLIB = $(HIP_INCL) $(HIP_LIB) $(RCCL_INCL) $(RCCL_LIB) -lamdhip64 -lrccl

###########################################################
Makefile +4 −4 Original line number Diff line number Diff line Loading @@ -141,7 +141,7 @@ OBJ_RCCL_REDUCE = gridding_rccl.o # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT # DEPS_ACC_CUFFTMP = w-stacking.h cuda_fft.cu DEPS_ACC_CUFFTMP = w-stacking_omp.h cuda_fft.cpp OBJ_ACC_CUFFTMP = cuda_fft.o Loading Loading @@ -251,11 +251,11 @@ endif ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-fft LINKER=$(MPIC++) FLAGS=$(OPTIMIZE) LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) LIBS=$(NVLIB) $(NVLIB_2) $(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c $^ $(LIBS) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_ACC_CUFFTMP) endif Loading
fourier_transform.c +7 −4 Original line number Diff line number Diff line #include "allvars.h" #include "proto.h" #if defined(CUFFTMP) #include "w-stacking_omp.h" #endif // ------------------------------------ #if defined(USE_FFTW) && !defined(CUFFTMP) // PERFORM FFT on CPU with FFTW Loading Loading @@ -106,13 +108,14 @@ void fftw_data ( void ) double start = CPU_TIME_wt; cuda_fft( num_w_planes, grid_size_x, grid_size_y, param.num_w_planes, param.grid_size_x, param.grid_size_y, xaxis, yaxis, grid, gridss, rank, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); Loading
main.c +1 −1 Original line number Diff line number Diff line Loading @@ -77,7 +77,7 @@ int main(int argc, char * argv[]) FFT_INIT; #if defined(CUDACC) || defined(CUFFTMP) #if defined(CUDACC) int ndevices; cudaGetDeviceCount(&ndevices); cudaSetDevice(rank % ndevices); Loading
w-stacking.h +2 −0 Original line number Diff line number Diff line Loading @@ -97,6 +97,8 @@ void cuda_fft( int, double*, double*, int, MPI_Comm); #endif