Commit aa704d3b authored by David Goz's avatar David Goz
Browse files

memory management and unified memory

parent 4921ce23
Loading
Loading
Loading
Loading
+420 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////
// - Block matrix multiplication algorithm
//
//   const size_t Nblocks = (N / Bsize);
//
//   // loop over blocks of matrix C
//   for (size_t ib=0 ; ib<Nblocks ; ib++)
//   {
//      for (size_t jb=0 ; jb<Nblocks ; jb++)
//      {
//        
//         // loop over blocks of rows of A       
//         for (size_t kb=0 ; kb<Nblocks ; kb++)
//         {
//
//           // Matrix multiplication within a block
//           for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
//             for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
//               for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
//                  C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
//         } // kb
//      } // jb
//   } // ib
//
// - Exploit GPU shared-memory.
// - Use explicit target memory allocator and memory management OpenMP APIs
////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 26.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v memory_management.c -o memory_management_omp
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=MANDATORY
//   $ ./memory_management_omp
//
//  N.B. team size in the range [32, 1024)
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <omp.h>
#include <string.h>
#include <math.h>
#include <float.h>

#define N                   1024
#define SIZE                (N * N) // matrix size
typedef double MyData;                // do not change
#define _BLOCK_             16
#define BLOCKSIZE           ((_BLOCK_) * (_BLOCK_)) /* 32 <= BLOCKSIZE < 1024 */

// sanity check
#if _BLOCK_*_BLOCK_ > 1024
#error _BLOCK_*_BLOCK_ cannot larger than 1024
#endif

#if _BLOCK_ > N
#error _BLOCK_ > N
#endif

#define LOOP 100
#define NDEBUG

double wall_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

void CPU_mat_mult_block(const MyData *const restrict A,
			const MyData *const restrict B,
	                      MyData *const restrict C,
			const size_t                 size)
{
  const size_t Nblocks = (size / _BLOCK_);

  // loop over blocks of matrix C
  for (size_t ib=0 ; ib<Nblocks ; ib++)
    {
      for (size_t jb=0 ; jb<Nblocks ; jb++)
	{       
	  // loop over blocks of rows of A       
	  for (size_t kb=0 ; kb<Nblocks ; kb++)
	    {
	      // Matrix multiplication within a block
	      for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
		for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
		  for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
		    C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
	    } // kb
	} // jb
    } // ib
  
  return;
}

void GPU_mat_mult_block(const MyData *const restrict A,
			const MyData *const restrict B,
	                      MyData *const restrict C,
			const size_t                 size)
{
  const size_t Nblocks = (size / _BLOCK_);

  #pragma omp target                                                    \
              teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
              thread_limit(BLOCKSIZE)					\
              is_device_ptr(A, B, C)
  for (size_t ib=0 ; ib<Nblocks ; ib++)
    {
      for (size_t jb=0 ; jb<Nblocks ; jb++)
	{
	  for (size_t kb=0 ; kb<Nblocks ; kb++)
	    {
              #pragma omp parallel for collapse(2) num_threads(BLOCKSIZE)
	      for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
		{
		  for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
		    {
		      MyData value = ((kb == 0) ? (MyData)0 : C[(i * size) + j]);
		      for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
			value += A[(i * size) + k] * B[(k * size) + j];

		      C[(i * size) + j] = value;
		    } // j
		} // i
	    } // kb
	} // jb
    } // ib

  return;
}

void GPU_mat_mult_block_shared(const MyData *const restrict A,
			       const MyData *const restrict B,
			             MyData *const restrict C,
			       const size_t                 size)
{
  const size_t Nblocks = (size / _BLOCK_);
  // Team local copies of tiles
  MyData Ablock[BLOCKSIZE];
  MyData Bblock[BLOCKSIZE];  
  MyData Cblock[BLOCKSIZE];
  
  #pragma omp target                                                     \
               teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
               private(Ablock, Bblock, Cblock)				 \
               thread_limit(BLOCKSIZE)                                   \
               is_device_ptr(A, B, C)
  for (size_t ib=0 ; ib<Nblocks ; ib++)
    {
      for (size_t jb=0 ; jb<Nblocks ; jb++)
	{ 
	  for (size_t kb=0 ; kb<Nblocks ; kb++)
	    {
              #pragma omp parallel num_threads(BLOCKSIZE)
	      {
		if (kb == 0) // Cblock initialization
		  {
                    #pragma omp for collapse(2) nowait
		    for (size_t i=0 ; i<_BLOCK_ ; i++)
		      for (size_t j=0 ; j<_BLOCK_ ; j++)
			Cblock[(i * _BLOCK_) + j] = (MyData)0;
		  }
		
		// copy block of A into pteam memory Ablock
                #pragma omp for collapse(2) nowait // implicit barrier is removed 	
		for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
		  for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
		    Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] = A[(i * N) + k];

		// copy block of B into pteam memory Bblock
                #pragma omp for collapse(2) // implicit barrier to ensure that shared Ablock and Bblock can be accessed by all threads within the block
		for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
		  for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
		    Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)] = B[(k * N) + j];

		// matrix multiply within the block
                #pragma omp for collapse(2) nowait
		for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
		  for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
		    {
		      MyData value = (MyData)0;
		      for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
			value += Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] *
			         Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)];

		      Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)] += value;

		      if (kb == (Nblocks - 1))
			C[(i * N) + j] = Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)];
		    }
	      } // omp parallel
	    } // kb
	} // jb
    } // ib

  return;
}

void GPU_mat_mult_block_shared_no_loops(const MyData *const restrict A,
					const MyData *const restrict B,
					      MyData *const restrict C,
					const size_t                 size)
{
  const size_t Nblocks = (size / _BLOCK_);
  // Team local copies of tiles
  MyData Ablock[BLOCKSIZE];
  MyData Bblock[BLOCKSIZE];  
  MyData Cblock[BLOCKSIZE];
  
  #pragma omp target                             \
              teams num_teams(Nblocks * Nblocks) \
              private(Ablock, Bblock, Cblock)    \
              thread_limit(BLOCKSIZE)	         \
              is_device_ptr(A, B, C)
  {
    const size_t ib = omp_get_team_num() / Nblocks;
    const size_t jb = omp_get_team_num() % Nblocks;

    #pragma omp parallel num_threads(BLOCKSIZE)
    {
#if !defined(NDEBUG)
	  
      /* ID within the team (CUDA block) */
      const int localID  = omp_get_thread_num();
      /* Team ID (ID CUDA block) */
      const int team     = omp_get_team_num();
      /* Team size (CUDA block size) */
      const int nthr     = omp_get_num_threads();
      /* global thread index */
      const int globalID = (localID + (team * nthr));

      printf("\n\t globalID: %d - localID: %d - nthr: %d - team: %d\n",
	     globalID, localID, nthr, team);

      #pragma omp barrier
      
#endif /* NDEBUG */
      
      // local matrix's indexes mapped to each OMP GPU-thread within its own team
      const size_t i_local = omp_get_thread_num() / _BLOCK_;
      const size_t j_local = omp_get_thread_num() % _BLOCK_;

      // global matrix's indexes mapped to each OMP GPU-thread
      const size_t i_global = i_local + (ib * _BLOCK_);
      const size_t j_global = j_local + (jb * _BLOCK_);
      
      // Cblock initialization
      Cblock[(i_local * _BLOCK_) + j_local] = (MyData)0;

      // loop over blocks
      for (size_t block=0 ; block<Nblocks ; block++)
	{
	  const size_t j_A = j_local + (block * _BLOCK_);
	  const size_t i_B = i_local + (block * _BLOCK_);

	  // waits until all threads in the thread block have reached this point and shared memory accesses
	  #pragma omp barrier

	  Ablock[(i_local * _BLOCK_) + j_local] = A[(i_global * size) + j_A];
	  Bblock[(i_local * _BLOCK_) + j_local] = B[(i_B * size)      + j_global];

	  // waits until all threads in the thread block have reached this point and shared memory accesses
	  #pragma omp barrier

	  // perform the matrix multiplication within the block
	  MyData value = (MyData)0;
	  for (size_t k=0 ; k<_BLOCK_ ; k++)
	    value += Ablock[(i_local * _BLOCK_) + k] * Bblock[(k * _BLOCK_) + j_local];

	  // store the partial result in shared memory
	  Cblock[(i_local * _BLOCK_) + j_local] += value;
	} // block loop

      // store the result into global memory
      C[(i_global * size) + j_global] = Cblock[(i_local * _BLOCK_) + j_local];
    } // omp parallel
  } // target teams
	  
  return;
}

void check(const MyData *const __restrict__ cpu_matrix,
	   const MyData *const __restrict__ gpu_matrix)
{
  int flag = 0;
  for (size_t i=0 ; i<SIZE ; i++)
    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);

  if (!flag)
    printf("\n\t Result OK");
  else
    printf("\n\t Result wrong");

#if !defined(NDEBUG)

  for (size_t i=0 ; i<N ; i++)
    {
      printf("\n");
      for (size_t j=0 ; j<N ; j++)
	{
	  const size_t index = ((i * N) + j);
	  printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
		 index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
	}
    }
  printf("\n");

#endif // NDEBUG
  
  return;
}

int main()
{
  double time;
  MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
  assert(buffer_cpu != NULL);

  // host reference matrix A
  MyData *const restrict A          = buffer_cpu;
  MyData *const restrict B          = A + SIZE;
  MyData *const restrict C_CPU      = B + SIZE;
  MyData *const restrict C_GPU_HOST = C_CPU + SIZE;
  for (size_t i=0 ; i<SIZE ; i++)
    {
      A[i] = drand48();
      B[i] = drand48();
    }

  ////////////////////////// CPU naive algorithm ///////////////////////////////////////////////////
  CPU_mat_mult_block(A, B, C_CPU, N);
  //////////////////////////////////////////////////////////////////////////////////////////////////

  // get the host id
  const int dev_host = omp_get_initial_device();
  
  // get the default device
  const int dev_gpu = omp_get_default_device();
  assert(dev_host != dev_gpu);
  
  // alloc data to the GPU
  MyData *const buffer_gpu = (MyData *)omp_target_alloc(3 * SIZE * sizeof(MyData), dev_gpu);
  assert(buffer_gpu != NULL);
  MyData *const restrict A_GPU = buffer_gpu;
  MyData *const restrict B_GPU = A_GPU + SIZE;
  MyData *const restrict C_GPU = B_GPU + SIZE;

  // copy data to GPU
  omp_target_memcpy(buffer_gpu, buffer_cpu, (2 * SIZE * sizeof(MyData)), 0, 0, dev_gpu, dev_host);
  
  /////////////////////////// GPU naive block algorithm ////////////////////////////////////////////
  time = 0.0;
  GPU_mat_mult_block(A_GPU, B_GPU, C_GPU, N); // warm-up
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult_block(A_GPU, B_GPU, C_GPU, N);
      time += (wall_time() - start);
    }
  // copy data device-to-host
  omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu);
  
  check(C_CPU, C_GPU_HOST);
  memset(C_GPU_HOST, 0, (SIZE * sizeof(MyData)));
  printf("\n\t GPU block time %lg [s]\n", (time / LOOP));
  //////////////////////////////////////////////////////////////////////////////////////////////////

  /////////////////////////// GPU naive block shared memory algorithm //////////////////////////////
  time = 0.0;
  GPU_mat_mult_block_shared(A_GPU, B_GPU, C_GPU, N); // warm-up
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult_block_shared(A_GPU, B_GPU, C_GPU, N);
      time += (wall_time() - start);
    }
  
  omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu);
  check(C_CPU, C_GPU_HOST);
  memset(C_GPU_HOST, 0, (SIZE * sizeof(MyData)));
  printf("\n\t GPU block pteam memory time %lg [s]\n", (time / LOOP));
  //////////////////////////////////////////////////////////////////////////////////////////////////

  /////////////////////////// GPU naive block shared memory no loops algorithm /////////////////////
  time = 0.0;
  GPU_mat_mult_block_shared_no_loops(A_GPU, B_GPU, C_GPU, N); // warm-up
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult_block_shared_no_loops(A_GPU, B_GPU, C_GPU, N);
      time += (wall_time() - start);
    }
  
  omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu);
  check(C_CPU, C_GPU_HOST);
  printf("\n\t GPU block no loops pteam memory time %lg [s]\n", (time / LOOP));
  //////////////////////////////////////////////////////////////////////////////////////////////////

  // free CPU memory
  free(buffer_cpu);
  // free GPU memory
  omp_target_free(buffer_gpu, dev_gpu);
  
  printf("\n");

  return EXIT_SUCCESS;
}
+389 −0

File added.

Preview size limit exceeded, changes collapsed.

+1 −1

File changed.

Contains only whitespace changes.