Bug fixing for cufftMP (01c8720a) · Commits · Claudio Gheller / HPC_Imaging

Build/Makefile.leo

0 → 100644

+70 −0

Original line number	Diff line number	Diff line
		# Host (CPU) compilers for C and C++.
		CC  = gcc
		CXX = g++

		# MPI compiler wrappers for C and C++.
		MPICC  = mpicc
		MPIC++ = mpic++

		# CPU build flags: optimisation level and OpenMP first, then the
		# architecture / vector-extension switches.
		OPTIMIZE  = -O4 -fopenmp
		OPTIMIZE += -march=native -mavx -mavx2

		# OpenMP offload switches: execution targets first, then the CUDA
		# version and compute-capability selectors.
		OMP_GPU  = -mp=multicore,gpu
		OMP_GPU += -gpu=cuda11.8 -gpu=cc80

		# Root of the NVIDIA HPC SDK tree that every CUDA, math-library and
		# communication-library path below is derived from. To build against a
		# different installation, override it on the command line, e.g.
		#   make LEO_NVHPC_SDK=/path/to/Linux_x86_64/23.1
		LEO_NVHPC_SDK = /leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1
		# CUDA version sub-directory shared by the cuda/, math_libs/ and
		# comm_libs/ components of the SDK.
		LEO_CUDA_VER = 11.8

		# CUDA toolkit headers and libraries.
		CUDA_INC = -I$(LEO_NVHPC_SDK)/cuda/$(LEO_CUDA_VER)/include
		CUDA_LIB = -L$(LEO_NVHPC_SDK)/cuda/$(LEO_CUDA_VER)/lib64

		# FFTW search paths are intentionally left empty in this configuration.
		FFTW_INCL=
		FFTW_LIB=


		##########################################################
		#NVIDIA CUFFTMP

		CUFFTMP_LIB = -L$(LEO_NVHPC_SDK)/math_libs/$(LEO_CUDA_VER)/lib64
		CUFFTMP_INC = -I$(LEO_NVHPC_SDK)/math_libs/$(LEO_CUDA_VER)/include/cufftmp
		##########################################################

		# NVSHMEM headers and libraries (linked together with cufftMp in NVLIB_2).
		NVSHMEM_INC = -I$(LEO_NVHPC_SDK)/comm_libs/$(LEO_CUDA_VER)/nvshmem/include
		NVSHMEM_LIB = -L$(LEO_NVHPC_SDK)/comm_libs/$(LEO_CUDA_VER)/nvshmem/lib

		##########################################################
		#NVIDIA NCCL REDUCE

		NCCL_INC = -I$(LEO_NVHPC_SDK)/comm_libs/$(LEO_CUDA_VER)/nccl/include
		NCCL_LIB = -L$(LEO_NVHPC_SDK)/comm_libs/$(LEO_CUDA_VER)/nccl/lib
		##########################################################

		# NVIDIA HPC SDK compiler drivers for C and C++.
		NVC   = nvc
		NVC++ = nvc++

		# Flags for nvc/nvc++. $(OMP_GPU) sits between the architecture switch
		# and the AVX switches, so the expanded flag order is unchanged.
		NVFLAGS  = -O4 -fast -march=native
		NVFLAGS += $(OMP_GPU)
		NVFLAGS += -mavx -mavx2

		# Link line 1: CUDA runtime only.
		NVLIB  = $(CUDA_INC) $(CUDA_LIB)
		NVLIB += -lcudart


		# Link line 2: CUDA + MPI + cufftMp + NVSHMEM.
		# NOTE(review): MPI_INC and MPI_LIB are not assigned anywhere in the
		# visible part of this file; presumably they are set elsewhere (or expand
		# to nothing) - confirm.
		NVLIB_2  = $(CUDA_INC) $(CUDA_LIB)
		NVLIB_2 += $(MPI_INC) $(MPI_LIB)
		NVLIB_2 += $(CUFFTMP_INC) $(CUFFTMP_LIB)
		NVLIB_2 += $(NVSHMEM_INC) $(NVSHMEM_LIB)
		NVLIB_2 += -lnvshmem_host -lnvshmem_device -lcudart -lcufftMp

		# Link line 3: CUDA + MPI + NCCL.
		NVLIB_3  = $(CUDA_INC) $(CUDA_LIB)
		NVLIB_3 += $(MPI_INC) $(MPI_LIB)
		NVLIB_3 += $(NCCL_INC) $(NCCL_LIB)
		NVLIB_3 += -lcudart -lnccl


		# CUDA compiler and its options (C++17, code generated for sm_80).
		# NOTE(review): this nvcc path is outside the /leonardo/... SDK tree used
		# for the include and library paths - confirm it exists on the target.
		NVCC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/11.8/bin/nvcc
		OPT_NVCC  = -std=c++17
		OPT_NVCC += --generate-code arch=compute_80,code=sm_80

		# Placeholder: nothing is appended to CFLAGS by this configuration.
		CFLAGS +=

		# No extra MPICH library is set for this configuration.
		MPICHLIB =



		##########################################################
		#AMD GPUs (DEFAULT = LUMI)

		# Clang compiler drivers for C and C++.
		CLANG   = clang
		CLANG++ = clang++

		# Host flags first, then the OpenMP offload target; the final line keeps
		# -Xopenmp-target=... immediately followed by the -march it applies to.
		OPTIMIZE_AMD  = -O3 -Ofast -fopenmp
		OPTIMIZE_AMD += -march=native -mavx -mavx2
		OPTIMIZE_AMD += -fopenmp-targets=amdgcn-amd-amdhsa
		OPTIMIZE_AMD += -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a

		# RCCL headers and libraries.
		RCCL_INCL= -I/opt/rocm-5.2.3/rccl/include
		RCCL_LIB= -L/opt/rocm-5.2.3/rccl/lib

		# HIP headers and libraries.
		HIP_INCL= -I/opt/rocm-5.2.3/hip/include
		HIP_LIB= -L/opt/rocm-5.2.3/hip/lib

		# Combined AMD link line: HIP paths, RCCL paths, then the two libraries.
		AMDLIB  = $(HIP_INCL) $(HIP_LIB)
		AMDLIB += $(RCCL_INCL) $(RCCL_LIB)
		AMDLIB += -lamdhip64 -lrccl
		###########################################################

Makefile

+4 −4

Original line number	Diff line number	Diff line
		@@ -141,7 +141,7 @@ OBJ_RCCL_REDUCE = gridding_rccl.o

		# ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
		#
		DEPS_ACC_CUFFTMP = w-stacking.h cuda_fft.cu
		DEPS_ACC_CUFFTMP = w-stacking_omp.h cuda_fft.cpp
		OBJ_ACC_CUFFTMP = cuda_fft.o


		@@ -251,11 +251,11 @@ endif

		ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT)))
		EXEC_EXT := $(EXEC_EXT)_acc-fft
		LINKER=$(MPIC++)
		FLAGS=$(OPTIMIZE)
		LINKER=$(NVC++)
		FLAGS=$(NVFLAGS) $(CFLAGS)
		LIBS=$(NVLIB) $(NVLIB_2)
		$(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP)
		$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c $^ $(LIBS)
		$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
		OBJ += $(OBJ_ACC_CUFFTMP)
		endif

fourier_transform.c

+7 −4

Original line number	Diff line number	Diff line

		#include "allvars.h"
		#include "proto.h"

		#if defined(CUFFTMP)
		#include "w-stacking_omp.h"
		#endif

		// ------------------------------------
		#if defined(USE_FFTW) && !defined(CUFFTMP) // PERFORM FFT on CPU with FFTW
		@@ -106,13 +108,14 @@ void fftw_data ( void )
		double start = CPU_TIME_wt;

		cuda_fft(
		num_w_planes,
		grid_size_x,
		grid_size_y,
		param.num_w_planes,
		param.grid_size_x,
		param.grid_size_y,
		xaxis,
		yaxis,
		grid,
		gridss,
		rank,
		MPI_COMM_WORLD);

		MPI_Barrier(MPI_COMM_WORLD);

main.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -77,7 +77,7 @@ int main(int argc, char * argv[])

		FFT_INIT;

		#if defined(CUDACC) || defined(CUFFTMP)
		#if defined(CUDACC)
		int ndevices;
		cudaGetDeviceCount(&ndevices);
		cudaSetDevice(rank % ndevices);

w-stacking.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -97,6 +97,8 @@ void cuda_fft(
		int,
		double*,
		double*,
		int,
		MPI_Comm);


		#endif