Commit ee264684 authored by David Goz's avatar David Goz
Browse files

miscellaneous update

parent 726c5ecb
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -257,9 +257,9 @@ int main()
  const dim3 block   = {(BLOCK * BLOCK), 1, 1};
  
  /////////////////////////// GPU naive block algorithm ////////////////////////////////////////
  time = 0.0;
  GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
  cudaDeviceSynchronize();
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
@@ -274,9 +274,9 @@ int main()
  /////////////////////////////////////////////////////////////////////////////////////

  /////////////////////////// GPU block shared memory algorithm ///////////////////////
  time = 0.0;
  GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
  cudaDeviceSynchronize();
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
+282 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Using CUDA data in OpenMP
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 03.09.2024
// code tested using nvhpc
//
// - Compile the code:
//      - using nvc
//          $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
//                 hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
//      - using clang
//          $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda 
//                 hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
//
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./hybrid_cuda_omp
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

#define N       2048
#define SIZE    ((N) * (N))
#define ALPHA   1.0
#define BETA    0.0
#define HOST    0
#define DEV     1
#define LOOP    10
#define INIT    0
#define KERNEL  1
#define DATA    2

typedef double MyData;

static double _time[2][3];
static int thr[2];

double process_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

double thread_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

void InitHost(MyData *const restrict A,
	      MyData *const restrict B,
	      MyData *const restrict C,
	      int    *const restrict thr)
{
  double start;

  #pragma omp parallel 
  {
    #pragma omp barrier
    #pragma omp master
    {
      *thr = omp_get_num_threads();
      start = thread_time();
    }
   #pragma omp barrier

    #pragma omp for collapse(2)
    for (int i=0 ; i<N ; i++)
      for (int j=0 ; j<N ; j++)
	{
	  A[(i * N) + j] = 1.0;
	  B[(i * N) + j] = 2.0;
	  C[(i * N) + j] = 0.0;
	}

    #pragma omp master
    {
      _time[HOST][INIT] += (thread_time() - start); 
    }
  } // omp parallel

  return;
}

void InitDev(MyData *const restrict A,
	     MyData *const restrict B,
	     MyData *const restrict C)
{
  const double start = thread_time();

 #pragma omp target teams loop collapse(2)
  for (int i=0 ; i<N ; i++)
    for (int j=0 ; j<N ; j++)
      {
	A[(i * N) + j] = 1.0;
	B[(i * N) + j] = 2.0;
	C[(i * N) + j] = 0.0;
      }

  _time[DEV][INIT] += (thread_time() - start);

  return;
}

void HostDgemm(MyData *const restrict A,
	       MyData *const restrict B,
	       MyData *const restrict C,
	       const MyData           alpha,
	       const MyData           beta,
	       int    *const restrict thr)
{
  // C = alpha * A * B + beta * C;

  double start;

  // naive calculation
 #pragma omp parallel
  {
    #pragma omp barrier
    #pragma omp master
    {
      *thr = omp_get_num_threads();
      start = thread_time();
    }
    #pragma omp barrier

    #pragma omp for collapse(2)
    for (int i=0 ; i<N ; i++)
      for (int j=0 ; j<N ; j++)
	{
	  MyData sum = 0.0;
	  for (int k=0 ; k<N ; k++)
	    sum += A[(i * N) + k] * B[(k * N) + j];
  
	  C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
	}

    #pragma omp master
    {
      _time[HOST][KERNEL] += (thread_time() - start);
    }
  } // omp parallel

  return;
}

void check(MyData *const restrict host_array,
	   MyData *const restrict dev_array)
{
  int flag = 0;
  for (size_t i=0 ; i<SIZE ; i++)
    flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);

  if (!flag)
    printf("\n\t Result OK \n");
  else
    printf("\n\t Result wrong \n");
  
  return;
}

int main()
{
  // Host allocation
  MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
  assert(buffer != NULL);
  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + SIZE;
  MyData *const restrict C_CPU = B + SIZE;
  MyData *const restrict C_GPU = C_CPU + SIZE;

  // Spawning 2 host threads
  #pragma omp parallel num_threads(2)
  {
    // Evaluate the Dgemm on the host
    #pragma omp single nowait
    {
      // allowing nested parallelism
      omp_set_max_active_levels(2);
      
      for (int loop=0 ; loop<LOOP ; loop++)
	{
	  InitHost(A, B, C_CPU, &thr[0]);
	  HostDgemm(A, B, C_CPU, ALPHA, BETA, &thr[1]);
	}
    } // omp single

    #pragma omp single nowait
    {
      // Initialize cuBLAS library
      cublasHandle_t handle;
      cublasCreate(&handle);

      // Allocate A, B, C on the device using CUDA API
      MyData *d_buffer = NULL;
      cudaMalloc((void **)&d_buffer, (3 * SIZE * sizeof(MyData)));
      assert(d_buffer != NULL);
      MyData *const restrict d_A = d_buffer;
      MyData *const restrict d_B = d_A + SIZE;
      MyData *const restrict d_C = d_B + SIZE;

      // get the default device
      const int dev = omp_get_default_device();

      // Associate external device pointers
      omp_target_associate_ptr(A,     d_A, (SIZE * sizeof(MyData)), 0, dev);
      omp_target_associate_ptr(B,     d_B, (SIZE * sizeof(MyData)), 0, dev);
      omp_target_associate_ptr(C_GPU, d_C, (SIZE * sizeof(MyData)), 0, dev);
      
      for (int loop=0 ; loop<LOOP ; loop++)
	{
	  // Init device with blocking omp target directive
	  InitDev(A, B, C_GPU);

	  // Apply DGEMM using device pointers directly
	  const MyData alpha = ALPHA;
	  const MyData beta  = BETA;

	  double start = thread_time();
	  cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
		      &alpha, 
		      d_A, N, 
		      d_B, N, 
		      &beta, 
		      d_C, N);

	  // CUDA synchronization point
	  cudaDeviceSynchronize();
	  _time[DEV][KERNEL] += (thread_time() - start);

	  // Fetch data from the device and deallocate
	  start = thread_time();
	  cudaMemcpy(C_GPU, d_C, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
	  _time[DEV][DATA] += (thread_time() - start);
	} // LOOP

      // release pointer association
      omp_target_disassociate_ptr(A, dev);
      omp_target_disassociate_ptr(B, dev);
      omp_target_disassociate_ptr(C_GPU, dev);
      
      // deallocate device's memory
      cudaFree(d_buffer);

      cublasDestroy(handle);
    } // omp single
  } // synchronization point

  check(C_CPU, C_GPU);

  free(buffer);

  printf("\n\t Matrix size: %d x %d\n", N, N);

  printf("\n\t Host execution time:");
  printf("\n\t\t Init      : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
  printf("\n\t\t Dgemm     : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);

  printf("\n\t Device execution time:");
  printf("\n\t\t Init      : %lg [s]", _time[DEV][INIT]/LOOP);
  printf("\n\t\t Dgemm     : %lg [s]", _time[DEV][KERNEL]/LOOP);
  printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
  
  return 0;
}
+268 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Passing OpenMP data to cuBlas.
// Passing OpenMP data to foreign runtime (cuBLAS library).
//
// Author: David Goz
// mail  : david.goz@inaf.it
@@ -8,11 +8,16 @@
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
//         hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
//      - using nvc
//          $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
//                 hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas
//      - using clang
//          $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda 
//                 hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas
//
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./hybrid_cuda_omp
//   $ ./hybrid_omp_cuda
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <cuda_runtime.h>
@@ -22,19 +27,61 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

#define N     512
#define N       2048
#define SIZE    ((N) * (N))
#define ALPHA   1.0
#define BETA    0.0
#define HOST    0
#define DEV     1
#define LOOP    10
#define INIT    0
#define KERNEL  1
#define DATA    2

typedef double MyData;

static double _time[2][3];
static int thr[2];

double process_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

double thread_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

void InitHost(MyData *const restrict A,
	      MyData *const restrict B,
	      MyData *const restrict C)
	      MyData *const restrict C,
	      int    *const restrict thr)
{
  double start;

  #pragma omp parallel 
  {
    #pragma omp barrier
    #pragma omp master
    {
  //#pragma omp parallel for collapse(2)
      *thr = omp_get_num_threads();
      start = thread_time();
    }
   #pragma omp barrier

    #pragma omp for collapse(2)
    for (int i=0 ; i<N ; i++)
      for (int j=0 ; j<N ; j++)
	{
@@ -42,12 +89,22 @@ void InitHost(MyData *const restrict A,
	  B[(i * N) + j] = 2.0;
	  C[(i * N) + j] = 0.0;
	}

    #pragma omp master
    {
      _time[HOST][INIT] += (thread_time() - start); 
    }
  } // omp parallel

  return;
}

void InitDev(MyData *const restrict A,
	     MyData *const restrict B,
	     MyData *const restrict C)
{
  const double start = thread_time();

 #pragma omp target teams loop collapse(2)
  for (int i=0 ; i<N ; i++)
    for (int j=0 ; j<N ; j++)
@@ -57,6 +114,8 @@ void InitDev(MyData *const restrict A,
	C[(i * N) + j] = 0.0;
      }

  _time[DEV][INIT] += (thread_time() - start);

  return;
}

@@ -64,12 +123,25 @@ void HostDgemm(MyData *const restrict A,
	       MyData *const restrict B,
	       MyData *const restrict C,
	       const MyData           alpha,
	       const MyData           beta)
	       const MyData           beta,
	       int    *const restrict thr)
{
  // C = alpha * A * B + beta * C;

  double start;

  // naive calculation
  //  #pragma omp parallel for collapse(2)
 #pragma omp parallel
  {
    #pragma omp barrier
    #pragma omp master
    {
      *thr = omp_get_num_threads();
      start = thread_time();
    }
    #pragma omp barrier

    #pragma omp for collapse(2)
    for (int i=0 ; i<N ; i++)
      for (int j=0 ; j<N ; j++)
	{
@@ -80,6 +152,12 @@ void HostDgemm(MyData *const restrict A,
	  C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
	}

    #pragma omp master
    {
      _time[HOST][KERNEL] += (thread_time() - start);
    }
  } // omp parallel

  return;
}

@@ -91,9 +169,9 @@ void check(MyData *const restrict host_array,
    flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);

  if (!flag)
    printf("\n\t Result OK");
    printf("\n\t Result OK \n");
  else
    printf("\n\t Result wrong");
    printf("\n\t Result wrong \n");
  
  return;
}
@@ -114,8 +192,14 @@ int main()
    // Evaluate the Dgemm on the host
    #pragma omp single nowait
    {
      InitHost(A, B, CC);
      HostDgemm(A, B, CC, ALPHA, BETA);
      // allowing nested parallelism
      omp_set_max_active_levels(2);
      
      for (int loop=0 ; loop<LOOP ; loop++)
	{
	  InitHost(A, B, CC, &thr[0]);
	  HostDgemm(A, B, CC, ALPHA, BETA, &thr[1]);
	}
    } // omp single

    #pragma omp single nowait
@@ -127,25 +211,39 @@ int main()
      // Allocate A, B, C on the device
      #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE])

      for (int loop=0 ; loop<LOOP ; loop++)
	{
	  // Init device with blocking omp target directive
	  InitDev(A, B, C);

	  // Define a target data region where A, B, and C pointers
	  // refer to device's address space
      #pragma omp target data use_device_addr(A, B, C)
          #pragma omp target data use_device_ptr(A, B, C)
	  {
	MyData const alpha = ALPHA;
	MyData const beta  = BETA;
	    const MyData alpha = ALPHA;
	    const MyData beta  = BETA;

	    const double start = thread_time();
	    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
		    &alpha, A, N, B, N, &beta, C, N);
			&alpha, 
			A, N, 
			B, N, 
			&beta, 
			C, N);

	    // CUDA synchronization point
	    cudaDeviceSynchronize();
	    _time[DEV][KERNEL] += (thread_time() - start);
	  }

	  // Fetch data from the device and deallocate
      #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE])
	  const double start = thread_time();
          #pragma omp target update from (C[0:SIZE])
	  _time[DEV][DATA] += (thread_time() - start);
	} // LOOP
      
      // deallocate device's memory
      #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C[0:SIZE])

      cublasDestroy(handle);
    } // omp single
@@ -155,5 +253,16 @@ int main()

  free(buffer);

  printf("\n\t Matrix size: %d x %d\n", N, N);

  printf("\n\t Host execution time:");
  printf("\n\t\t Init      : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
  printf("\n\t\t Dgemm     : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);

  printf("\n\t Device execution time:");
  printf("\n\t\t Init      : %lg [s]", _time[DEV][INIT]/LOOP);
  printf("\n\t\t Dgemm     : %lg [s]", _time[DEV][KERNEL]/LOOP);
  printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
  
  return 0;
}
+281 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Passing OpenMP data to foreign runtime (cuBLAS library).
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 03.09.2024
// code tested using nvhpc
//
// - Compile the code:
//      - using nvc
//          $ nvc++ -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
//                  hybrid_omp_cuda.cu -o hybrid_omp_cuda -lm && ./hybrid_omp_cuda
//
//      - using clang
//          $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda 
//                 hybrid_omp_cuda.c -o hybrid_omp_cuda -lm
//
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./hybrid_omp_cuda
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <cuda.h>
#include <math.h>
#include <assert.h>
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

#define N         1024
#define SIZE      ((N) * (N))
#define HOST      0
#define DEV       1
#define LOOP      10
#define INIT      0
#define KERNEL    1
#define DATA      2
#define BLOCKSIZE 1024

typedef double MyData;

static double _time[2][3];
static int thr[2];

double process_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

double thread_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

void InitHost(MyData *const restrict A,
	      MyData *const restrict B,
	      MyData *const restrict C,
	      int    *const restrict thr)
{
  double start;

  #pragma omp parallel 
  {
    #pragma omp barrier
    #pragma omp master
    {
      *thr = omp_get_num_threads();
      start = thread_time();
    }
   #pragma omp barrier

    #pragma omp for collapse(2)
    for (int i=0 ; i<N ; i++)
      for (int j=0 ; j<N ; j++)
	{
	  A[(i * N) + j] = 1.0;
	  B[(i * N) + j] = (1.0 / M_PI);
	  C[(i * N) + j] = 0.0;
	}

    #pragma omp master
    {
      _time[HOST][INIT] += (thread_time() - start); 
    }
  } // omp parallel

  return;
}

void InitDev(MyData *const restrict A,
	     MyData *const restrict B,
	     MyData *const restrict C)
{
  const double start = thread_time();

 #pragma omp target teams loop collapse(2) is_device_ptr(A, B, C)
  for (int i=0 ; i<N ; i++)
    for (int j=0 ; j<N ; j++)
      {
	A[(i * N) + j] = 1.0;
	B[(i * N) + j] = (1.0 / M_PI);
	C[(i * N) + j] = 0.0;
      }

  _time[DEV][INIT] += (thread_time() - start);

  return;
}

void HostMM(MyData *const restrict A,
	    MyData *const restrict B,
	    MyData *const restrict C,
	    int    *const restrict thr)
{
  // C = alpha * A * B + beta * C;

  double start;

  // naive calculation
  #pragma omp parallel
  {
    #pragma omp barrier
    #pragma omp master
    {
      *thr = omp_get_num_threads();
      start = thread_time();
    }
    #pragma omp barrier

    #pragma omp for collapse(2)
    for (int i=0 ; i<N ; i++)
      for (int j=0 ; j<N ; j++)
	{
	  MyData sum = 0.0;
	  for (int k=0 ; k<N ; k++)
	    sum += A[(i * N) + k] * B[(k * N) + j];
  
	  C[(i * N) + j] = sum;
	}

    #pragma omp master
    {
      _time[HOST][KERNEL] += (thread_time() - start);
    }
  } // omp parallel

  return;
}

__global__ void DevMM(MyData *const restrict A,
		      MyData *const restrict B,
		      MyData *const restrict C,
		      const int              n)
{
  const int size = (n * n);
  const int globalID = threadIdx.x + (blockIdx.x * blockDim.x);

  if (globalID >= size)
    return;

  const int i = (globalID / N);
  const int j = (globalID % N);

  MyData sum = 0.0;
  for (int k=0 ; k<N ; k++)
    sum += (A[(i * N) + k] * B[(k * N) + j]);

  C[(i * N) + j] = sum;
  
  return;
}

void check(MyData *const restrict host_array,
	   MyData *const restrict dev_array)
{
  int flag = 0;
  for (size_t i=0 ; i<SIZE ; i++)
    flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);

  if (!flag)
    printf("\n\t Result OK \n");
  else
    printf("\n\t Result wrong \n");
  
  return;
}

int main()
{
  // Host allocation
  MyData *h_buffer = (MyData *)malloc(2 * SIZE * sizeof(MyData));
  assert(h_buffer != NULL);
  MyData *const restrict C_HOST = h_buffer;
  MyData *const restrict C_DEV  = C_HOST + SIZE;
  
  // Spawning 2 host threads
  #pragma omp parallel num_threads(2)
  {
    // Evaluate the Dgemm on the host
    #pragma omp single nowait
    {
      // allowing nested parallelism
      omp_set_max_active_levels(2);

      MyData *tmp = (MyData *)malloc(2 * SIZE * sizeof(MyData));
      MyData *const restrict A = tmp;
      MyData *const restrict B = A + SIZE;
      
      for (int loop=0 ; loop<LOOP ; loop++)
	{
	  InitHost(A, B, C_HOST, &thr[0]);
	  HostMM(A, B, C_HOST, &thr[1]);
	}

      free(tmp);
    } // omp single

    #pragma omp single nowait
    {
      // Device allocation
      const int dev = omp_get_default_device();
      const int host = omp_get_initial_device();
      MyData *d_buffer = (MyData *)omp_target_alloc((3 * SIZE * sizeof(MyData)), dev);
      assert(d_buffer != NULL);
      MyData *const restrict d_A = d_buffer;
      MyData *const restrict d_B = d_A + SIZE;
      MyData *const restrict d_C = d_B + SIZE;

      const dim3 nblock = {((SIZE + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1};
      const dim3 block  = {BLOCKSIZE, 1, 1};
      
      for (int loop=0 ; loop<LOOP ; loop++)
	{
	  // Init device with blocking omp target directive
	  InitDev(d_A, d_B, d_C);

	  double start = thread_time();
	  DevMM<<< nblock, block >>>(d_A, d_B, d_C, N);
	  // CUDA synchronization point
	  cudaDeviceSynchronize();
	  _time[DEV][KERNEL] += (thread_time() - start);

	  // Fetch data from the device and deallocate
	  start = thread_time();
	  omp_target_memcpy(C_DEV, d_C, (SIZE * sizeof(MyData)), 0, 0, host, dev);
	  _time[DEV][DATA] += (thread_time() - start);
	} // LOOP
      
      // deallocate device's memory
      omp_target_free(d_buffer, dev);
    } // omp single
  } // synchronization point

  check(C_HOST, C_DEV);

  free(h_buffer);

  printf("\n\t Matrix size: %d x %d\n", N, N);

  printf("\n\t Host execution time:");
  printf("\n\t\t Init      : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
  printf("\n\t\t Dgemm     : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);

  printf("\n\t Device execution time:");
  printf("\n\t\t Init      : %lg [s]", _time[DEV][INIT]/LOOP);
  printf("\n\t\t Dgemm     : %lg [s]", _time[DEV][KERNEL]/LOOP);
  printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
  
  return 0;
}
+76 −0
Original line number Diff line number Diff line
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 03.09.2024
// code tested using nvhpc
//
// - Compile the code:
//   - nvc complains about the '#pragma omp target has_device_addr'
//   - clang works
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

static int answer = 42;
#define SIZE 3

int main()
{
  int *A = (int *)malloc(SIZE * sizeof(int));
  assert(A != NULL);
  memset(A, 0, (SIZE * sizeof(int)));
  
  printf("\n\t Host address of 'answer' [value: %d] is: %p", answer, &answer);
  printf("\n\t Host address of 'A' is: %p", A);

  // Map 'answer' to the device
  #pragma omp target data map(alloc: answer, A[0:SIZE])
  {
    #pragma omp target update to(answer, A[0:SIZE])
    
    // Update 'answer' on the device
    #pragma omp target map(answer)
    {
      answer += 1;
      for (int i=0 ; i<SIZE ; i++)
	A[i] += (i + 1);
    }

    #pragma omp target update from(answer, A[0:SIZE])
    
    printf("\n\t Address of 'answer' [value: %d] in target data region is: %p",
  	   answer, &answer);
    printf("\n\t Address of 'A' in target data region is: %p", A);
    printf("\n\t Value of A:");
    for (int i=0 ; i<SIZE ; i++)
      printf("\n\t\t A[%d] = %d", i, A[i]);
    printf("\n");

    #pragma omp target data use_device_addr(answer) use_device_ptr(A)
    {
      #pragma omp target has_device_addr(answer)
      {
	answer += 1;
	for (int i=0 ; i<SIZE ; i++)
	  A[i] += (i + 1);
      }
      
      printf("\n\t Device address of 'answer' is: %p", &answer);
      printf("\n\t Device address of 'A' is: %p", A);
    }

    #pragma omp target update from(answer, A[0:SIZE])
    printf("\n\t Last value of 'answer' is: %d", answer);
    printf("\n\t Last value if 'A' is:");
    for (int i=0 ; i<SIZE ; i++)
      printf("\n\t\t A[%d] = %d", i, A[i]);
    printf("\n\n");
  } // omp target data

  free(A);
  
  return 0;
}
Loading