Commit 726c5ecb authored by David Goz's avatar David Goz
Browse files

hybrid cuda-omp example added

parent 31d1aa2e
Loading
Loading
Loading
Loading
+159 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Passing OpenMP data to cuBlas.
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 02.09.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
//         hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./hybrid_cuda_omp
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define N     512
#define SIZE  ((N) * (N))
#define ALPHA 1.0
#define BETA  0.0

typedef double MyData;

// Initialize the host matrices: A = 1.0, B = 2.0, C = 0.0.
// The three N x N matrices are stored contiguously, so a single
// flat sweep over all SIZE elements is equivalent to the 2D nest.
void InitHost(MyData *const restrict A,
	      MyData *const restrict B,
	      MyData *const restrict C)
{
  for (int idx=0 ; idx<SIZE ; idx++)
    {
      A[idx] = 1.0;
      B[idx] = 2.0;
      C[idx] = 0.0;
    }
}

// Initialize the device copies of A, B, and C (A = 1.0, B = 2.0,
// C = 0.0). Assumes the arrays are already mapped to the default
// device (see the "target enter data" in main).
void InitDev(MyData *const restrict A,
	     MyData *const restrict B,
	     MyData *const restrict C)
{
  // The matrices are contiguous: one flat parallel sweep over all
  // SIZE elements writes exactly the same values as the original
  // collapsed 2D loop nest.
 #pragma omp target teams loop
  for (int idx=0 ; idx<SIZE ; idx++)
    {
      A[idx] = 1.0;
      B[idx] = 2.0;
      C[idx] = 0.0;
    }

  return;
}

// Reference (naive) matrix-matrix multiply on the host:
//   C = alpha * A * B + beta * C
// for N x N row-major matrices. O(N^3); used only to validate the
// cuBLAS result.
void HostDgemm(MyData *const restrict A,
	       MyData *const restrict B,
	       MyData *const restrict C,
	       const MyData           alpha,
	       const MyData           beta)
{
  for (int row=0 ; row<N ; row++)
    {
      // Hoist the row bases out of the inner loops.
      const MyData *const Arow = A + (row * N);
      MyData *const       Crow = C + (row * N);

      for (int col=0 ; col<N ; col++)
	{
	  MyData acc = 0.0;

	  for (int k=0 ; k<N ; k++)
	    acc += Arow[k] * B[(k * N) + col];

	  Crow[col] = (alpha * acc) + (beta * Crow[col]);
	}
    }

  return;
}

// Compare the host reference against the device result element-wise;
// any pair differing by more than FLT_EPSILON marks the run as wrong.
void check(MyData *const restrict host_array,
	   MyData *const restrict dev_array)
{
  int wrong = 0;

  for (size_t idx=0 ; idx<SIZE ; idx++)
    if (fabs(host_array[idx] - dev_array[idx]) > FLT_EPSILON)
      wrong = 1;

  if (wrong)
    printf("\n\t Result wrong");
  else
    printf("\n\t Result OK");

  return;
}

int main()
{
  // Host allocation: one contiguous buffer carved into four N x N
  // matrices — A, B, C feed the device path, CC holds the host
  // reference result.
  MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
  assert(buffer != NULL);
  MyData *const restrict A  = buffer;
  MyData *const restrict B  = A + SIZE;
  MyData *const restrict C  = B + SIZE;
  MyData *const restrict CC = C + SIZE;

  // Spawning 2 host threads: one computes the reference Dgemm on the
  // host while the other drives the GPU through OpenMP + cuBLAS.
  // The two single regions do not touch the same storage concurrently:
  // the host path writes A, B, CC on the host; the device path writes
  // only the device copies of A, B, C.
  #pragma omp parallel num_threads(2)
  {
    // Evaluate the Dgemm on the host; nowait lets the other thread
    // proceed with the device work in parallel.
    #pragma omp single nowait
    {
      InitHost(A, B, CC);
      HostDgemm(A, B, CC, ALPHA, BETA);
    } // omp single

    #pragma omp single nowait
    {
      // Initialize cuBLAS library.
      // NOTE(review): cublasCreate/cublasDgemm return codes are not
      // checked; a failure would surface only as a wrong result.
      cublasHandle_t handle;
      cublasCreate(&handle);
      
      // Allocate A, B, C on the device ("alloc" maps storage without
      // a host-to-device copy; the device kernel initializes it).
      #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE])

      // Init device with blocking omp target directive
      InitDev(A, B, C);

      // Define a target data region where A, B, and C pointers
      // refer to device's address space, so they can be handed
      // directly to cuBLAS.
      #pragma omp target data use_device_addr(A, B, C)
      {
	MyData const alpha = ALPHA;
	MyData const beta  = BETA;
	
	// cuBLAS is column-major; with the constant-valued matrices
	// produced by InitDev (A = 1, B = 2) the row-/column-major
	// distinction does not change the product.
	cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
		    &alpha, A, N, B, N, &beta, C, N);

	// CUDA synchronization point: cublasDgemm launches
	// asynchronously, so wait before the data region ends.
	cudaDeviceSynchronize();
      }

      // Fetch C from the device and deallocate the device buffers
      #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE])
      
      cublasDestroy(handle);
    } // omp single
  } // synchronization point: implicit barrier of the parallel region

  // Compare host reference (CC) against the cuBLAS result (C).
  check(CC, C);
  
  free(buffer);
  
  return 0;
}
+110 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Splitting the asynchronous vector addition task graph across four devices
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 28.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v asynchronous.c -o asynchronous_omp
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./asynchronous_omp
////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define NDEBUG

// Verify the vector fetched back from device 3: every element of C
// must equal 98 (i.e. 99 + (i + (-i) - 1), see the task graph in main).
void check(const MyData *const C,
	   const size_t        size)
{
  int wrong = 0;

  for (size_t idx=0 ; idx<size ; idx++)
    if (C[idx] != 98)
      wrong = 1;

  if (wrong)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");

  return;
}

int main()
{
  // The task graph below hard-codes one device per task.
  const int NumDev = omp_get_num_devices();

  if (NumDev != 4)
    {
      printf("\n\t The program runs using 4 GPUs... aborting...\n");
      exit(EXIT_FAILURE);
    }

  // One host buffer carved into the three operand vectors.
  const int size = 1000000;
  MyData *buffer = (MyData *)malloc(3 * size * sizeof(MyData));
  assert(buffer != NULL);
  MyData *const restrict A = buffer;
  MyData *const restrict B = A + size;
  MyData *const restrict C = B + size;

  // init A on GPU 0 (asynchronous; completion published via depend(out))
  #pragma omp target nowait                 \
                     map(tofrom: A[0:size]) \
                     depend(out: A[0:size]) \
                     device(0)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      A[i] = (MyData)i;
  } // device 0
  
  // init B on GPU 1
  #pragma omp target nowait                 \
                     map(tofrom: B[0:size]) \
                     depend(out: B[0:size]) \
                     device(1)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      B[i] = (MyData)-i;
  } // device 1

  // init C on GPU 2
  #pragma omp target nowait                 \
                     map(tofrom: C[0:size]) \
                     depend(out: C[0:size]) \
                     device(2)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      C[i] = 99;
  } // device 2

  // perform the calculation on GPU 3 once A, B, C are all ready;
  // expected result: C[i] = 99 + (i + (-i) - 1) = 98
  #pragma omp target nowait                                                    \
                     map(to: A[0:size], B[0:size]) map(tofrom: C[0:size])      \
                     depend(in: A[0:size], B[0:size]) depend(inout: C[0:size]) \
                     device(3)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      C[i] += (A[i] + B[i] - 1);
  } // device 3

  // host task: runs only after the GPU-3 task has copied C back
  #pragma omp task depend(in: C[0:size])
  check(C, size);

  // BUG FIX: synchronize with every outstanding task before releasing
  // the buffer. Without this taskwait, free(buffer) could execute while
  // the nowait target tasks were still mapping data back into "buffer"
  // (or while the checking task was still reading C) — a use-after-free.
  #pragma omp taskwait

  free(buffer);
  
  return 0;
}
+95 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Task dependencies:
// example of synchronization by defining explicit dependencies between the tasks.
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 28.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v dependencies.c -o dependencies_omp
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./dependencies_omp
////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define NDEBUG

// Verify the result copied back from the device: every element of C
// must be zero (the device kernel computes A[i] + B[i] = i + (-i)).
void check(const MyData *const C,
	   const size_t        size)
{
  int wrong = 0;

  for (size_t idx=0 ; idx<size ; idx++)
    if (C[idx] != 0)
      wrong = 1;

  if (wrong)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");

  return;
}

int main()
{
  const int size = 1000000;

  // Host copy of the result vector.
  MyData *C = (MyData *)malloc(size * sizeof(MyData));
  assert(C != NULL);

  // Allocate A and B directly on the default device — they exist only
  // in device memory and are addressed via is_device_ptr below.
  const int dev_gpu = omp_get_default_device();
  MyData *gpu_buffer = (MyData *)omp_target_alloc(2 * size * sizeof(MyData), dev_gpu);
  assert(gpu_buffer != NULL);
  MyData *A = gpu_buffer;
  MyData *B = A + size;

  // init C with random numbers; the device later overwrites every
  // element with A[i] + B[i] = i + (-i) = 0
  for (int i=0 ; i<size ; i++)
    C[i] = rand() % size;
  
  #pragma omp target enter data map(to: C[0:size])
  
  // init A (asynchronous; completion published through depend(out))
  #pragma omp target nowait depend(out: A[0:size]) is_device_ptr(A)
  {
    #pragma omp loop
    for (int i=0 ; i<size; i++)
      A[i] = i;
  }

  // init B
  #pragma omp target nowait depend(out: B[0:size]) is_device_ptr(B)
  {
    #pragma omp loop
    for (int i=0 ; i<size; i++)
      B[i] = -i;
  }
  
  // vector add, ordered after both initializations
 #pragma omp target nowait depend(in: A[0:size], B[0:size]) depend(out: C[0:size]) is_device_ptr(A, B)
  {
    #pragma omp loop
    for (int i=0 ; i<size; i++)
      C[i] = A[i] + B[i];
  }
  
  // blocking update: waits on the add task, then copies C to the host
 #pragma omp target update from (C[0:size]) depend(in: C[0:size])

  check(C, size);

  // BUG FIX: release the device mapping of C while the host pointer is
  // still valid. The original code called free(C) first and issued the
  // "exit data" afterwards, i.e. on a dangling pointer.
  #pragma omp target exit data map(release: C[0:size])

  omp_target_free(gpu_buffer, dev_gpu);
  free(C);
  
  return 0;
}
+50 −36
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
// 
// Offload to multiple devices within the same node:
// - using one OMP host-thread per device synchronously;
// - using one OMP host-thread asynchronously.
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 27.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v multiple_devices.c -o multiple_devices_omp
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./multiple_devices_omp
////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
@@ -15,30 +34,10 @@ typedef int MyData;
#error "N_PER_DEV < BLOCKSIZE"
#endif

#define NDEBUG

void check(const MyData *const restrict vector_cpu,
	   const MyData *const restrict vector_gpu,
	   const size_t                 size)
{
  int flag = 0;
  for (size_t i=0 ; i<size ; i++)
{
#if !defined(NDEBUG)
    printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
           i, vector_cpu[i], i, vector_gpu[i]);
#endif

    flag = ((vector_cpu[i] != vector_gpu[i]) ? 1 : flag);
}

  if (flag)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");
#define TRUE  1
#define FALSE 0

  return;
}
#define NDEBUG

void VectorAdd(const MyData *const restrict A,
	       const MyData *const restrict B,
@@ -46,7 +45,8 @@ void VectorAdd(const MyData *const restrict A,
	       const int                    offset,
	       const int                    size,
               const int                    dev,
               const int                    nblocks)
               const int                    nblocks,
	       const int                    async)
{
#pragma omp target							\
  teams num_teams(nblocks) thread_limit(BLOCKSIZE)			\
@@ -84,6 +84,8 @@ int main()
  // get the number of the available devices
  const int NumDev = omp_get_num_devices();

  const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);
  
  // global vector size
  const int size = (NumDev * N_PER_DEV);
  assert(size > 0);
@@ -103,6 +105,7 @@ int main()
      C_CPU[i] = A[i] + B[i];
    }

  // each device is managed by a single OMP thread
  #pragma omp parallel num_threads(NumDev)
  {
    // check
@@ -119,11 +122,22 @@ int main()
    
    const int tid    = omp_get_thread_num();
    const int offset = (tid * N_PER_DEV);
    const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);
    
    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks, FALSE);
  } // omp parallel

  check(C_CPU, C_GPU, size);
  memset(C_GPU, 0, (size * sizeof(MyData)));
  
  // one OMP thread manages asynchronously all the devices
  for (int dev=0 ; dev<NumDev ; dev++)
    {
      const int offset = (dev * N_PER_DEV);
      VectorAdd(A, B, C_GPU, offset, N_PER_DEV, dev, nblocks, TRUE);
    }

  // host-devices synchronization
  #pragma omp taskwait
  check(C_CPU, C_GPU, size);
  
  free(buffer);
+83 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
// 
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 28.08.2024
// code tested using nvhpc
//
// The code does not compile with nvc
// The code compiles with clang

////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;
#define N_PER_DEV   1000000

#define NDEBUG

int main()
{
  // get the number of the available devices
  const int NumDev = omp_get_num_devices();

  // global vector size: one chunk of N_PER_DEV elements per device
  const int size = (NumDev * N_PER_DEV);
  assert(size > 0);

  MyData *buffer = (MyData *)malloc(2 * size * sizeof(MyData));
  assert(buffer != NULL);
  MyData *const restrict A = buffer;
  MyData *const restrict B = A + size;
  MyData sum_cpu = (MyData)0;

  // BUG FIX: this loop used to be
  //   #pragma omp parallel for simd reduction(+: sum_cpu)
  // while calling rand(). rand() is not required to be thread-safe, so
  // concurrent calls are a data race (undefined behavior) and make the
  // reference data and sum nondeterministic. The initialization is
  // therefore kept serial; the reference sum stays exact.
  for (int i=0 ; i<size ; i++)
    {
      A[i] = rand() % N_PER_DEV;
      B[i] = rand() % N_PER_DEV;
      sum_cpu += A[i] + B[i];
    }

  MyData sum_gpu = (MyData)0;

  // One host thread per device; the task reduction combines the
  // per-device partial sums (produced via in_reduction) into sum_gpu.
#pragma omp parallel num_threads(NumDev) reduction(task, +:sum_gpu)
  {
   #pragma omp single
   {
     if (NumDev != omp_get_num_threads())
       exit(EXIT_FAILURE);
     else
     {
       printf("\n\t Using %d GPUs \n", NumDev);
       fflush(stdout);
      }
    } // implicit barrier

    // Each thread offloads its own chunk [offset, offset + N_PER_DEV)
    // to its own device.
    const int tid    = omp_get_thread_num();
    const int offset = (tid * N_PER_DEV);
    
   #pragma omp target   					 \
	       map(to: A[offset:N_PER_DEV], B[offset:N_PER_DEV]) \
               device(tid)                                       \
               in_reduction(+: sum_gpu)
    #pragma omp loop reduction(+: sum_gpu)
    for (int i=offset ; i<(offset + N_PER_DEV) ; i++)
      sum_gpu += (A[i] + B[i]);
  } // omp parallel

  if (sum_cpu == sum_gpu)
    printf("\n\t Result OK \n");
  else
    printf("\n\t Result wrong \n");
  
  free(buffer);
  
  return 0;
}