Commit 01c8720a authored by Giovanni Lacopo
Browse files

Bug fixing for cufftMP

parent d8e2a6a0
Loading
Loading
Loading
Loading

Build/Makefile.leo

0 → 100644
+70 −0
Original line number Diff line number Diff line
# Host toolchain: serial C/C++ compilers and their MPI wrappers.
# Keep the blank before '=' on the "++" names: written as "MPIC++=", make would
# read the "+=" operator and append to a variable named "MPIC+" instead.
CC       = gcc
CXX      = g++

MPICC    = mpicc
MPIC++   = mpic++

# CPU build flags: OpenMP threading plus AVX/AVX2 code tuned for the build host.
# No trailing blanks after the last flag: make keeps them as part of the value.
OPTIMIZE = -O4 -fopenmp -march=native -mavx -mavx2
# OpenMP offload switches (multicore + GPU, CUDA 11.8, cc80 target);
# this file feeds them to the NVIDIA compilers through NVFLAGS.
OMP_GPU  = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80

##########################################################
# NVIDIA HPC SDK installation (Leonardo, spack-provided nvhpc 23.1)
#
# Every CUDA, math-library and communication-library path below sits under the
# same SDK tree, so the tree root and the CUDA toolkit version are named once
# and the individual -I/-L flags are derived from them. Both use '?=' so they
# can be overridden from the environment or the make command line; with the
# defaults, every derived flag expands to exactly the path it had before.
NVHPC_SDK_ROOT ?= /leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1
NVHPC_CUDA_VER ?= 11.8

# CUDA toolkit headers and libraries.
CUDA_INC = -I$(NVHPC_SDK_ROOT)/cuda/$(NVHPC_CUDA_VER)/include
CUDA_LIB = -L$(NVHPC_SDK_ROOT)/cuda/$(NVHPC_CUDA_VER)/lib64

# FFTW include/library flags are left empty in this configuration.
FFTW_INCL=
FFTW_LIB=


##########################################################
#NVIDIA CUFFTMP

CUFFTMP_LIB = -L$(NVHPC_SDK_ROOT)/math_libs/$(NVHPC_CUDA_VER)/lib64
CUFFTMP_INC = -I$(NVHPC_SDK_ROOT)/math_libs/$(NVHPC_CUDA_VER)/include/cufftmp
##########################################################

# NVSHMEM headers and libraries (linked together with cuFFTMp in NVLIB_2).
NVSHMEM_INC = -I$(NVHPC_SDK_ROOT)/comm_libs/$(NVHPC_CUDA_VER)/nvshmem/include
NVSHMEM_LIB = -L$(NVHPC_SDK_ROOT)/comm_libs/$(NVHPC_CUDA_VER)/nvshmem/lib

##########################################################
#NVIDIA NCCL REDUCE

NCCL_INC = -I$(NVHPC_SDK_ROOT)/comm_libs/$(NVHPC_CUDA_VER)/nccl/include
NCCL_LIB = -L$(NVHPC_SDK_ROOT)/comm_libs/$(NVHPC_CUDA_VER)/nccl/lib
##########################################################

# NVIDIA HPC SDK compilers. Keep the blank before '=' on NVC++: "NVC++=" would
# be parsed as "NVC+ +=". Values carry no trailing blanks, since make would
# otherwise store them as part of the variable.
NVC     = nvc
NVC++   = nvc++
# Compile flags for nvc/nvc++; $(OMP_GPU) contributes the offload switches.
NVFLAGS = -O4 -fast -march=native $(OMP_GPU) -mavx -mavx2
# Include/library flags for plain CUDA runtime use.
NVLIB   = $(CUDA_INC) $(CUDA_LIB) -lcudart


# Include/library flags for the cuFFTMp path (cuFFTMp + NVSHMEM + CUDA runtime).
# NOTE(review): MPI_INC and MPI_LIB are not assigned in the visible part of this
# file; unless another makefile or the environment provides them they expand to
# nothing — confirm.
NVLIB_2 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) \
          $(CUFFTMP_INC) $(CUFFTMP_LIB) $(NVSHMEM_INC) $(NVSHMEM_LIB) \
          -lnvshmem_host -lnvshmem_device -lcudart -lcufftMp

# Include/library flags for the NCCL path (NCCL + CUDA runtime).
NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) \
          $(NCCL_INC) $(NCCL_LIB) -lcudart -lnccl


# CUDA compiler driver and its flags (C++17, code generated for sm_80).
# NOTE(review): this is an absolute /opt/nvidia/hpc_sdk path, whereas the library
# paths in this file point under /leonardo/prod/spack — confirm it exists on the
# target system.
NVCC     = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/11.8/bin/nvcc
OPT_NVCC = -std=c++17 --generate-code arch=compute_80,code=sm_80

# Appends nothing; kept so CFLAGS is defined even when nothing else sets it.
CFLAGS +=

# No extra MPICH library flags in this configuration.
MPICHLIB =



##########################################################
#AMD GPUs (DEFAULT = LUMI)

# Keep the blank before '=' on CLANG++: "CLANG++=" would be parsed as "CLANG+ +=".
CLANG   = clang
CLANG++ = clang++

# Flags for OpenMP offload builds with clang. The last two words belong together:
# "-march=gfx90a" is the argument handed to "-Xopenmp-target=amdgcn-amd-amdhsa",
# so their order and adjacency must be preserved.
OPTIMIZE_AMD = -O3 -Ofast -fopenmp -march=native -mavx -mavx2 \
               -fopenmp-targets=amdgcn-amd-amdhsa \
               -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a

# Root of the ROCm installation. The RCCL and HIP paths below are all derived
# from it, so switching ROCm release is a one-line (or one-variable) change.
# '?=' lets an already-set ROCM_PATH (environment or command line) take
# precedence; the default reproduces the previously hard-coded 5.2.3 paths.
ROCM_PATH ?= /opt/rocm-5.2.3

RCCL_INCL = -I$(ROCM_PATH)/rccl/include
RCCL_LIB  = -L$(ROCM_PATH)/rccl/lib

HIP_INCL  = -I$(ROCM_PATH)/hip/include
HIP_LIB   = -L$(ROCM_PATH)/hip/lib

# Include/library flags for AMD builds (HIP runtime + RCCL).
AMDLIB = $(HIP_INCL) $(HIP_LIB) $(RCCL_INCL) $(RCCL_LIB) -lamdhip64 -lrccl
###########################################################
+4 −4
Original line number Diff line number Diff line
@@ -141,7 +141,7 @@ OBJ_RCCL_REDUCE = gridding_rccl.o

# ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
#
# NOTE(review): DEPS_ACC_CUFFTMP is assigned twice in a row; make keeps only the
# later value (w-stacking_omp.h cuda_fft.cpp). The first assignment looks like the
# pre-change line of a diff shown next to its replacement — confirm and drop it.
DEPS_ACC_CUFFTMP = w-stacking.h cuda_fft.cu
DEPS_ACC_CUFFTMP = w-stacking_omp.h cuda_fft.cpp
# Object file listed for the cufftMP FFT implementation.
OBJ_ACC_CUFFTMP  = cuda_fft.o


@@ -251,11 +251,11 @@ endif

# cufftMP FFT build: this branch is taken when OPT contains the substring CUFFTMP.
# It extends the executable suffix, selects linker/flags/libraries, declares how
# the cufftMP object is compiled, and adds that object to OBJ.
# NOTE(review): LINKER and FLAGS are each assigned twice and the rule carries two
# recipe lines; this looks like the removed and added lines of a diff shown
# together — confirm which set is meant to remain. As written, the later
# LINKER/FLAGS assignments are the ones in effect and both recipe lines belong
# to the rule.
ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-fft
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE)
LINKER=$(NVC++)
FLAGS=$(NVFLAGS) $(CFLAGS)
LIBS=$(NVLIB) $(NVLIB_2)
# NOTE(review): $^ expands to every prerequisite, so any header listed in
# DEPS_ACC_CUFFTMP is passed to the compiler together with the source file —
# confirm this is intended ($(filter-out %.h,$^) would pass only the source).
$(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP)
	$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c $^ $(LIBS)
	$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_ACC_CUFFTMP)
endif

+7 −4
Original line number Diff line number Diff line

#include "allvars.h"
#include "proto.h"

#if defined(CUFFTMP)
#include "w-stacking_omp.h"
#endif

                                                // ------------------------------------
#if defined(USE_FFTW) && !defined(CUFFTMP)      //  PERFORM FFT on CPU with FFTW
@@ -106,13 +108,14 @@ void fftw_data ( void )
  double start = CPU_TIME_wt;

  cuda_fft(
	   num_w_planes,
	   grid_size_x,
	   grid_size_y,
	   param.num_w_planes,
	   param.grid_size_x,
	   param.grid_size_y,
	   xaxis,
	   yaxis,
	   grid,
	   gridss,
	   rank,
	   MPI_COMM_WORLD);

  MPI_Barrier(MPI_COMM_WORLD);
+1 −1
Original line number Diff line number Diff line
@@ -77,7 +77,7 @@ int main(int argc, char * argv[])

  FFT_INIT;    

 #if defined(CUDACC) || defined(CUFFTMP)
 #if defined(CUDACC) 
  int ndevices;
  cudaGetDeviceCount(&ndevices);
  cudaSetDevice(rank % ndevices);
+2 −0
Original line number Diff line number Diff line
@@ -97,6 +97,8 @@ void cuda_fft(
	int,
	double*,
	double*,
	int,
	MPI_Comm);


#endif
Loading