Commit 7e6f37d5 authored by David Goz's avatar David Goz 😴
Browse files

omp block matrix

parent 86b043e3
Loading
Loading
Loading
Loading
+185 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////
// - Naive matrix multiplication algorithm
//   for (size_t i=0 ; i<N ; i++)
//      for (size_t j=0 ; j<N ; j++)
//         for (size_t k=0 ; k<_N ; k++)
//            C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 31.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
// - Run the code:
//   $ ./classwork_omp
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <omp.h>
#include <string.h>

#define N                     512
#define SIZE                  (N * N) // matrix size
typedef double MyData;                // do not change

#define LOOP 100
#define NDEBUG

double wall_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

void CPU_mat_mult(const MyData *const restrict A,
		  const MyData *const restrict B,
	                MyData *const restrict C,
		  const size_t                 size)
{
  for (size_t i=0 ; i<size ; i++)
    for (size_t j=0 ; j<size ; j++)
      for (size_t k=0 ; k<size ; k++)
	C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);

  return;
}

void GPU_mat_mult(const MyData *const restrict A,
		  const MyData *const restrict B,
	                MyData *const restrict C,
		  const size_t                 size)
{
  #pragma omp target
  {
    #pragma omp teams distribute num_teams(size)
    for (size_t i=0 ; i<size ; i++)
      {
        #pragma omp parallel for num_threads(size)
	for (size_t j=0 ; j<size ; j++)
	  {
	    MyData value = (MyData)0;
	    for (size_t k=0 ; k<size ; k++)
	      value += (A[(i * size) + k] * B[(k * size) + j]);

	    C[(i * size) + j] = value;
	  } // omp thread
      } // omp teams
   } // omp target

  return;
}

void GPU_mat_mult_no_loops(const MyData *const restrict A,
			   const MyData *const restrict B,
			         MyData *const restrict C,
			   const size_t                 size)
{
 #pragma omp target
  {
   #pragma omp teams num_teams(size)
    {
      const size_t team_size = (size * omp_get_team_num());
      
     #pragma omp parallel firstprivate(team_size) num_threads(size)
      {
         const size_t tid = omp_get_thread_num();
	 MyData value = (MyData)0;
	 for (size_t k=0 ; k<size ; k++)
	   value += (A[team_size + k] * B[(k * size) + tid]);

	 C[team_size + tid] = value;
      } // omp threads      
    } // omp teams
  } // omp target

  return;
}

void check(const MyData *const __restrict__ cpu_matrix,
	   const MyData *const __restrict__ gpu_matrix)
{
  int flag;
  for (size_t i=0 ; i<SIZE ; i++)
    flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);

  if (!flag)
    printf("\n\t Result OK");
  else
    printf("\n\t Result wrong");
  
  return;
}

int main()
{
  double time;
  MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData));
  assert(buffer != NULL);

  // host reference matrix A
  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + SIZE;
  MyData *const restrict C_CPU = B + SIZE;
  MyData *const restrict C_GPU = C_CPU + SIZE;
  for (size_t i=0 ; i<SIZE ; i++)
    {
      A[i] = drand48();
      B[i] = drand48();
    }

  ////////////////////////// CPU naive algorithm //////////////////////////////////////////
  CPU_mat_mult(A, B, C_CPU, N);
  /////////////////////////////////////////////////////////////////////////////////////////

  // copy/alloc data to the GPU
  #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE])

  /////////////////////////// GPU naive algorithm ////////////////////////////////////////
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult(A, B, C_GPU, N);
      time += (wall_time() - start);
    }
  
  #pragma omp target update from(C_GPU[0:SIZE])
  check(C_CPU, C_GPU);
  printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
  ////////////////////////////////////////////////////////////////////////////////

  /////////////////////////// GPU naive no loops algorithm ////////////////////////////
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult_no_loops(A, B, C_GPU, N);
      time += (wall_time() - start);
    }
  
  #pragma omp target update from(C_GPU[0:SIZE])
  check(C_CPU, C_GPU);
  printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP));
  ////////////////////////////////////////////////////////////////////////////////

  // free CPU memory
  free(buffer);
  // free GPU memory
  #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE])
  
  printf("\n");

  return EXIT_SUCCESS;
}
+208 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////
// - Block matrix multiplication algorithm
//
//   const size_t Nblocks = (N / Bsize);
//
//   // loop over blocks of matrix C
//   for (size_t ib=0 ; ib<Nblocks ; ib++)
//   {
//      for (size_t jb=0 ; jb<Nblocks ; jb++)
//      {
//        
//         // loop over blocks of rows of A       
//         for (size_t kb=0 ; kb<Nblocks ; kb++)
//
//
//
//         for (size_t i=0 ; i<N ; i++)
//            for (size_t j=0 ; j<N ; j++)
//               for (size_t k=0 ; k<_N ; k++)
//                  C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
//      } // jb
//   } // ib
// - Exploit shared-memory.
////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 31.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
// - Run the code:
//   $ ./classwork_omp
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <omp.h>
#include <string.h>

#define N                     512
#define SIZE                  (N * N) // matrix size
typedef double MyData;                // do not change
#define BLOCKSIZE             32      // number of threads per block

// sanity check
#if BLOCKSIZE > 1024
#error BLOCKSIZE cannot be larger than 1024
#endif

#define LOOP 100
#define NDEBUG

double wall_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

void CPU_mat_mult(const MyData *const restrict A,
		  const MyData *const restrict B,
	                MyData *const restrict C,
		  const size_t                 size)
{
  for (size_t i=0 ; i<size ; i++)
    for (size_t j=0 ; j<size ; j++)
      for (size_t k=0 ; k<size ; k++)
	C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);

  return;
}

void GPU_mat_mult(const MyData *const restrict A,
		  const MyData *const restrict B,
	                MyData *const restrict C,
		  const size_t                 size)
{
  #pragma omp target
  {
    #pragma omp teams distribute num_teams(size)
    for (size_t i=0 ; i<size ; i++)
      {
        #pragma omp parallel for num_threads(size)
	for (size_t j=0 ; j<size ; j++)
	  {
	    MyData value = (MyData)0;
	    for (size_t k=0 ; k<size ; k++)
	      value += (A[(i * size) + k] * B[(k * size) + j]);

	    C[(i * size) + j] = value;
	  } // omp thread
      } // omp teams
   } // omp target

  return;
}

void GPU_mat_mult_no_loops(const MyData *const restrict A,
			   const MyData *const restrict B,
			         MyData *const restrict C,
			   const size_t                 size)
{
 #pragma omp target
  {
   #pragma omp teams num_teams(size)
    {
      const size_t team_size = (size * omp_get_team_num());
      
     #pragma omp parallel firstprivate(team_size) num_threads(size)
      {
         const size_t tid = omp_get_thread_num();
	 MyData value = (MyData)0;
	 for (size_t k=0 ; k<size ; k++)
	   value += (A[team_size + k] * B[(k * size) + tid]);

	 C[team_size + tid] = value;
      } // omp threads      
    } // omp teams
  } // omp target

  return;
}

void check(const MyData *const __restrict__ cpu_matrix,
	   const MyData *const __restrict__ gpu_matrix)
{
  int flag;
  for (size_t i=0 ; i<SIZE ; i++)
    flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);

  if (!flag)
    printf("\n\t Result OK");
  else
    printf("\n\t Result wrong");
  
  return;
}

int main()
{
  double time;
  MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData));
  assert(buffer != NULL);

  // host reference matrix A
  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + SIZE;
  MyData *const restrict C_CPU = B + SIZE;
  MyData *const restrict C_GPU = C_CPU + SIZE;
  for (size_t i=0 ; i<SIZE ; i++)
    {
      A[i] = drand48();
      B[i] = drand48();
    }

  ////////////////////////// CPU naive algorithm //////////////////////////////////////////
  CPU_mat_mult(A, B, C_CPU, N);
  /////////////////////////////////////////////////////////////////////////////////////////

  // copy/alloc data to the GPU
  #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE])

  /////////////////////////// GPU naive algorithm ////////////////////////////////////////
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult(A, B, C_GPU, N);
      time += (wall_time() - start);
    }
  
  #pragma omp target update from(C_GPU[0:SIZE])
  check(C_CPU, C_GPU);
  printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
  ////////////////////////////////////////////////////////////////////////////////

  /////////////////////////// GPU naive no loops algorithm ////////////////////////////
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult_no_loops(A, B, C_GPU, N);
      time += (wall_time() - start);
    }
  
  #pragma omp target update from(C_GPU[0:SIZE])
  check(C_CPU, C_GPU);
  printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP));
  ////////////////////////////////////////////////////////////////////////////////

  // free CPU memory
  free(buffer);
  // free GPU memory
  #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE])
  
  printf("\n");

  return EXIT_SUCCESS;
}
+34 −25
Original line number Diff line number Diff line
@@ -17,7 +17,8 @@
#include <assert.h>
#include <omp.h>

#define N  8
#define SIZE   8
#define SIZE_2 (SIZE / 2)
typedef double MyData;

typedef struct my_span
@@ -29,26 +30,20 @@ typedef struct my_span
void allocate(      span  *my_struct,
	      const size_t size)
{
  span tmp;
  /* allocate the buffer on the host memory */
  tmp.ptr = (MyData *)malloc(size * sizeof(MyData));
  assert(tmp.ptr != NULL);
  tmp.N = size;

  /* declare how the object 'span' has to be mapped on the device */
  #pragma omp declare mapper(span tmp) map(from: tmp, tmp.ptr[0: tmp.N])

  my_struct->ptr = tmp.ptr;
  my_struct->N   = tmp.N;
  my_struct->ptr = (MyData *)calloc(size, sizeof(MyData));
  assert(my_struct->ptr != NULL);
  my_struct->N = size;
  
  return;
}

void print(const span *const A)
void print(const span *const A,
	   const char *const string)
{
  printf("\n");
  for (size_t i=0 ; i<A->N ; i++)
    printf("\n\t array[%i] = %lg", i, A->ptr[i]);
  for (int i=0 ; i<A->N ; i++)
    printf("\n\t %s[%d] = %lg", string, i, A->ptr[i]);
  printf("\n\n");
  
  return;
@@ -56,27 +51,41 @@ void print(const span *const A)

int main()
{
  span A, B;
  span A, B, C;

  allocate(&A, N);
  allocate(&B, N);
  allocate(&A, SIZE);
  allocate(&B, SIZE);
  allocate(&C, SIZE);

  /* declare how the object 'span' has to be mapped on the device */
  #pragma omp declare mapper(all  : span S) map(to: S) map(from: S.ptr[0    : S.N])
  #pragma omp declare mapper(left : span S) map(to: S) map(from: S.ptr[0    : S.N/2])
  #pragma omp declare mapper(right: span S) map(to: S) map(from: S.ptr[S.N/2: S.N/2])

  /* init on the GPU */
  #pragma omp target
  {
    for (size_t i=0 ; i<N ; i++)
#pragma omp target map(mapper(all): A) map(mapper(left): B) map(mapper(right): C)
  {
    #pragma omp loop
    for (size_t i=0 ; i<SIZE ; i++)
      A.ptr[i] = (MyData)(i);
	B.ptr[i] = (MyData)(2 * i);
      }

    #pragma omp loop
    for (size_t ii=0 ; ii<SIZE_2 ; ii++)
      B.ptr[ii] = (MyData)(ii);

    #pragma omp loop
    for (size_t iii=SIZE_2 ; iii<SIZE ; iii++)
      C.ptr[iii] = (MyData)(iii);
  }

  print(&A);
  print(&B);
  print(&A, "A");
  print(&B, "B");
  print(&C, "C");
  
  /* free the host's memory */
  free(A.ptr);
  free(B.ptr);
  free(C.ptr);

  return 0;
}
+17 KiB

File added.

No diff preview for this file type.