Commit 31d1aa2e authored by David Goz

omp/miscellaneous/multiple_device example

parent aa704d3b
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;
#define N_PER_DEV   1000000
#define BLOCKSIZE   256

#if (BLOCKSIZE < 32) || (BLOCKSIZE > 1024)
#error "32 <= BLOCKSIZE <= 1024"
#endif

#if (N_PER_DEV < BLOCKSIZE)
#error "N_PER_DEV < BLOCKSIZE"
#endif

#define NDEBUG

/*
 * Validate the device-computed vector against the host reference.
 *
 * vector_cpu : reference result computed on the host
 * vector_gpu : result computed on the device(s)
 * size       : number of elements in each vector
 *
 * Prints "Result OK" when every element matches, "Result wrong"
 * otherwise. With NDEBUG undefined it also dumps each pair.
 */
void check(const MyData *const restrict vector_cpu,
	   const MyData *const restrict vector_gpu,
	   const size_t                 size)
{
  int mismatch = 0;

  for (size_t idx = 0; idx < size; idx++)
    {
#if !defined(NDEBUG)
      printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
             idx, vector_cpu[idx], idx, vector_gpu[idx]);
#endif

      if (vector_cpu[idx] != vector_gpu[idx])
        mismatch = 1;
    }

  if (mismatch)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");

  return;
}

/*
 * Element-wise C = A + B over one device's chunk [offset, offset+size),
 * offloaded to device 'dev' with 'nblocks' teams of BLOCKSIZE threads.
 *
 * A, B    : input vectors (only the chunk is mapped to the device)
 * C       : output vector (chunk mapped back from the device)
 * offset  : first index of this device's chunk in the global vectors
 * size    : number of elements in the chunk
 * dev     : OpenMP device number to offload to
 * nblocks : number of teams, ceil(size / BLOCKSIZE) at the call site
 */
void VectorAdd(const MyData *const restrict A,
	       const MyData *const restrict B,
	             MyData *const restrict C,
	       const int                    offset,
	       const int                    size,
               const int                    dev,
               const int                    nblocks)
{
 #pragma omp target                                                  \
   teams num_teams(nblocks) thread_limit(BLOCKSIZE)                  \
   map(to: A[offset:size], B[offset:size]) map(from: C[offset:size]) \
   device(dev)
  {
    const int team  = omp_get_team_num();
    const int team_start_index = (team * BLOCKSIZE) + offset;

    /* BUG FIX: clamp the team's upper bound to the end of the mapped
       chunk. When size is not a multiple of BLOCKSIZE the last team
       would otherwise run up to nblocks * BLOCKSIZE elements and index
       past the mapped array sections (out-of-bounds access). */
    const int chunk_end = offset + size;
    const int team_end_index =
      (((team_start_index + BLOCKSIZE) < chunk_end) ?
        (team_start_index + BLOCKSIZE) : chunk_end);

    #pragma omp parallel num_threads(BLOCKSIZE)
    {
      const int localID = omp_get_thread_num();
      const int block   = omp_get_num_threads();

      /* block-stride loop: thread localID handles every block-th element
	 of this team's sub-range */
      int globalID = team_start_index + localID;

      for (int index = globalID; index < team_end_index; index += block)
	C[index] = A[index] + B[index];

#if !defined(NDEBUG)

      if ((localID == 0) && (team == 0))
	printf("\n\t Device: %d - Teams: %d [requested: %d]- Thread per team: %d [requested: %d]",
	       dev, omp_get_num_teams(), nblocks, block, BLOCKSIZE);
#endif
    } // omp parallel
  } // omp target

  return;
}

/*
 * Driver: split a global vector addition evenly across all available
 * OpenMP offload devices (one host thread per device), then verify the
 * device result against a host-computed reference.
 */
int main()
{
  // get the number of the available devices
  const int NumDev = omp_get_num_devices();

  // global vector size; aborts when no offload device is available
  const int size = (NumDev * N_PER_DEV);
  assert(size > 0);

  /* single allocation sliced into A, B, C_CPU, C_GPU (4 vectors);
     no cast on malloc and sizeof *buffer per C idiom */
  MyData *buffer = malloc((size_t)4 * size * sizeof(*buffer));
  assert(buffer != NULL);
  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + size;
  MyData *const restrict C_CPU = B + size;
  MyData *const restrict C_GPU = C_CPU + size;

  /* BUG FIX: initialize serially. rand() keeps hidden global state and
     is not required to be thread-safe, so calling it from a
     'parallel for simd' region is a data race (undefined behavior). */
  for (int i=0 ; i<size ; i++)
    {
      A[i] = rand() % N_PER_DEV;
      B[i] = rand() % N_PER_DEV;
      C_CPU[i] = A[i] + B[i]; // host reference result
    }

  /* one host thread drives each device */
  #pragma omp parallel num_threads(NumDev)
  {
    // check that the runtime actually granted one thread per device
    #pragma omp single
    {
      if (NumDev != omp_get_num_threads())
	exit(EXIT_FAILURE);
      else
	{
	  printf("\n\t Using %d GPUs \n", NumDev);
	  fflush(stdout);
	}
    } // implicit barrier

    const int tid    = omp_get_thread_num();
    const int offset = (tid * N_PER_DEV);                          // this device's chunk start
    const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE); // teams per chunk (ceil)

    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
  } // omp parallel

  check(C_CPU, C_GPU, size);

  free(buffer);

  return 0;
}