Commit 011d30c6 authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

CUDA bug fixing

parent 1cde6e96
Loading
Loading
Loading
Loading
+33 −9
Original line number Diff line number Diff line
@@ -37,7 +37,7 @@ FFTWLIBS =
OPT += -DUSE_FFTW

# use the OpenMP-enabled (multithreaded) version of the FFTW routines
OPT += -DHYBRID_FFTW
#OPT += -DHYBRID_FFTW

# write the full 3D cube of gridded visibilities and its FFT transform
#OPT += -DWRITE_DATA
@@ -73,10 +73,10 @@ OPT += -DPHASE_ON
#OPT += -DNVIDIA

# use CUDA for GPUs
#OPT += -DCUDACC
OPT += -DCUDACC

# use GPU acceleration via OMP 
OPT += -DACCOMP
#OPT += -DACCOMP

# use NVIDIA GPU to perform the reduce
#OPT += -DNCCL_REDUCE
@@ -85,7 +85,7 @@ OPT += -DACCOMP
#OPT += -DRCCL_REDUCE

# use GPU to perform FFT
#OPT += -DCUFFTMP
OPT += -DCUFFTMP

#support for AMD GPUs
#OPT += -D__HIP_PLATFORM_AMD__
@@ -141,8 +141,13 @@ OBJ_RCCL_REDUCE = gridding_rccl.o

# ----- define which files are compiled (by NVCC or NVC++, depending on CUDACC) for the NVIDIA cuFFTMp implementation of the FFT
#
DEPS_ACC_CUFFTMP = w-stacking_omp.h cuda_fft.cpp
# Select the source file the cuFFTMp FFT object depends on.
# If the string CUDACC occurs anywhere in OPT, the dependency is the CUDA
# source cuda_fft.cu; otherwise it is cuda_fft.cpp.
# Both branches name the same object file, cuda_fft.o.
# Note: findstring is a substring match, so any OPT word that merely
# contains "CUDACC" also selects the first branch.
ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
DEPS_ACC_CUFFTMP = cuda_fft.cu 
OBJ_ACC_CUFFTMP  = cuda_fft.o
else
DEPS_ACC_CUFFTMP = cuda_fft.cpp 
OBJ_ACC_CUFFTMP  = cuda_fft.o
endif


# -----------------------------------------------------
@@ -173,13 +178,19 @@ w-stacking.c: w-stacking.cu

# Produce phase_correction.c as a plain copy of phase_correction.cu.
# NOTE(review): these rules sit inside a conditional whose test is not
# visible in this hunk -- confirm which configuration selects them.
phase_correction.c: phase_correction.cu
	cp phase_correction.cu phase_correction.c

# Produce cuda_fft.cpp as a plain copy of cuda_fft.cu.
cuda_fft.cpp: cuda_fft.cu
	cp cuda_fft.cu cuda_fft.cpp
else
# Alternative branch: instead of copying the .cu sources, each generated
# file is removed and recreated as an empty file with touch.
# NOTE(review): presumably the empty files are placeholders so the real
# .cu sources can be compiled separately -- confirm against the rest of
# the Makefile.
# NOTE(review): "w-stacking.cun" matches no other file name visible here;
# it looks like a typo or a stale line. rm -f ignores missing files, so it
# is harmless as written -- confirm and drop if unintended.
w-stacking.c: w-stacking.cu
	rm -f w-stacking.cun
	rm -f w-stacking.c
	touch w-stacking.c
# Same placeholder treatment for phase_correction.c.
phase_correction.c: phase_correction.cu
	rm -f phase_correction.c
	touch phase_correction.c
# Same placeholder treatment for cuda_fft.cpp.
cuda_fft.cpp: cuda_fft.cu
	rm -f cuda_fft.cpp
	touch cuda_fft.cpp
endif


@@ -250,15 +261,28 @@ OBJ += $(OBJ_RCCL_REDUCE)
endif

# cuFFTMp FFT build section: active only when OPT contains the string CUFFTMP.
# Either branch appends _acc-fft to EXEC_EXT, sets LINKER/FLAGS/LIBS, defines
# how $(OBJ_ACC_CUFFTMP) is built and appends that object to OBJ.
ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT)))

# CUDACC present in OPT: the FFT object is compiled with $(NVCC) using
# $(OPT_NVCC), and LINKER is set to $(MPIC++).
ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-fft
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE)
LIBS=$(NVLIB_2)
# $^ expands to every prerequisite listed in DEPS_ACC_CUFFTMP, so all of
# them are handed to the compiler. $(LIBS) is passed even though -c only
# compiles.
$(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP)
	$(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_ACC_CUFFTMP)

# CUDACC absent from OPT: the FFT object is compiled with $(NVC++), and
# LINKER is set to $(NVC++) as well.
else

EXEC_EXT := $(EXEC_EXT)_acc-fft
LINKER=$(NVC++)
FLAGS=$(NVFLAGS) $(CFLAGS)
# NOTE(review): LIBS is assigned twice in a row; with "=" the later
# assignment wins, so the effective value is $(NVLIB_2) only. This view is a
# rendered diff without +/- markers, so the first line is presumably the
# pre-commit version -- confirm only one remains in the real Makefile.
LIBS=$(NVLIB) $(NVLIB_2)
LIBS=$(NVLIB_2)
$(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP)
	$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_ACC_CUFFTMP)
endif

endif

###################################################################################

@@ -270,11 +294,11 @@ w-stacking: $(OBJ) $(DEPS) Makefile
# Generic pattern rule: build any object from the C source of the same stem
# with $(MPICC). Every object also depends on $(DEPS), so a change to any file
# listed there rebuilds all objects produced by this rule.
# $< is the first prerequisite (the .c file) and $@ is the target object.
%.o: %.c $(DEPS)
	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)


# Remove all object files plus the generated sources w-stacking.c,
# phase_correction.c and cuda_fft.cpp (the files created by the cp/touch
# rules of this Makefile).
# NOTE(review): no .PHONY declaration for this target is visible in this
# hunk -- confirm one exists, otherwise a file named "clean" would disable it.
clean:
	rm -f *.o
	rm -f w-stacking.c
	rm -f phase_correction.c
	rm -f cuda_fft.cpp

cleanall:
	rm -f $(EXEC)$(EXT)
+0 −3
Original line number Diff line number Diff line
@@ -34,9 +34,6 @@
#include "w-stacking.h"
#endif 

#if defined(CUDACC)
#include <cuda.h>
#endif

#if defined(NVIDIA)
#include <cuda_runtime.h>
+1 −1
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@
#include <unistd.h>


#if !defined( NCCL_REDUCE )
#if !defined( NCCL_REDUCE ) && !defined(__CUDACC__)
#include <stdatomic.h>
#endif

+0 −20
Original line number Diff line number Diff line
@@ -3,7 +3,6 @@
#include "allvars.h"
#include "proto.h"


void shutdown_wstacking( int errcode, char *message, char *fname, int linenum )
{
  if ( ( rank == 0 ) &&
@@ -77,25 +76,6 @@ int main(int argc, char * argv[])

  FFT_INIT;    

 #if defined(CUDACC) 
  int ndevices;
  cudaGetDeviceCount(&ndevices);
  cudaSetDevice(rank % ndevices);

  if ( rank == 0 ) {
    if (0 == ndevices) {
      
      shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
    }
  

  printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices);
 #ifdef NVIDIA
  prtAccelInfo();
 #endif
  }
 #endif
  
 #ifdef ACCOMP
  if ( rank == 0 ) {
    if (0 == omp_get_num_devices()) {
+18 −1
Original line number Diff line number Diff line
@@ -11,6 +11,8 @@
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "errcodes.h"
#include "proto.h"

#ifdef __CUDACC__

@@ -107,6 +109,21 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
        long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
        if(NWORKERS == 1) {Nbl = 1; Nth = 1;};

	int ndevices;
	cudaGetDeviceCount(&ndevices);
	cudaSetDevice(rank % ndevices);

	if ( rank == 0 ) {
	  if (0 == ndevices) {

	    shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
	  }

	}
	  printf("Running rank %d using GPU %d\n", rank, rank % ndevices);
	 #ifdef NVIDIA
	  prtAccelInfo();
	 #endif

	cudaError_t mmm;
	double * image_real_g;
Loading