Commit 4921ce23 authored by David Goz's avatar David Goz
Browse files

cuda/omp matrix multiply examples

parent 7e6f37d5
Loading
Loading
Loading
Loading
+176 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////
// - Naive matrix multiplication algorithm
//   for (size_t i=0 ; i<N ; i++)
//      for (size_t j=0 ; j<N ; j++)
//         for (size_t k=0 ; k<_N ; k++)
//            C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 20.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ mat_mult.cu -o mat_mult
// - Run the code:
//   $ ./mat_mult
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <cuda.h>
#include <string.h>
#include <math.h>
#include <float.h>

#define N                     1024
#define SIZE                  (N * N) // matrix size
typedef double MyData;                // do not change
#define BLOCKSIZE             1024

// sanity check
#if BLOCKSIZE > 1024
#error BLOCKSIZE must be <= 1024
#endif

#define LOOP 100
#define NDEBUG

double wall_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

void CPU_mat_mult(const MyData *const restrict A,
		  const MyData *const restrict B,
	                MyData *const restrict C,
		  const size_t                 size)
{
  for (size_t i=0 ; i<size ; i++)
    for (size_t j=0 ; j<size ; j++)
      for (size_t k=0 ; k<size ; k++)
	C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);

  return;
}

__global__ void GPU_mat_mult(const MyData *const restrict A,
			     const MyData *const restrict B,
			           MyData *const restrict C,
			     const size_t                 size)
{
  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
  if (globalID >= SIZE)
    return;
  
  const size_t i = globalID / size;
  const size_t j = globalID % size;

  MyData value = (MyData)0;
  for (size_t k=0 ; k<size ; k++)
    value += A[(i * N) + k] * B[(k * N) + j];

  C[(i * N) + j] = value;
  
  return;
}

void check(const MyData *const __restrict__ cpu_matrix,
	   const MyData *const __restrict__ gpu_matrix)
{
  int flag = 0;
  for (size_t i=0 ; i<SIZE ; i++)
    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);

  if (!flag)
    printf("\n\t Result OK");
  else
    printf("\n\t Result wrong");

#if !defined(NDEBUG)

  for (size_t i=0 ; i<N ; i++)
    {
      printf("\n");
      for (size_t j=0 ; j<N ; j++)
	{
	  const size_t index = ((i * N) + j);
	  printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
		 index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
	}
    }
  printf("\n");

#endif // NDEBUG  
  
  return;
}

int main()
{
  double time;
  MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
  assert(buffer_cpu != NULL);

  // host reference matrix A
  MyData *const restrict A_CPU = buffer_cpu;
  MyData *const restrict B_CPU = A_CPU + SIZE;
  MyData *const restrict C_CPU = B_CPU + SIZE;
  MyData *const restrict C_GPU_host = C_CPU + SIZE;
  for (size_t i=0 ; i<SIZE ; i++)
    {
      A_CPU[i] = drand48();
      B_CPU[i] = drand48();
    }

  ////////////////////////// CPU naive algorithm //////////////////////////////////////////
  CPU_mat_mult(A_CPU, B_CPU, C_CPU, N);
  /////////////////////////////////////////////////////////////////////////////////////////

  // copy/alloc data to the GPU
  MyData *buffer_gpu = NULL;
  cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData)));
  MyData *const restrict A_GPU = buffer_gpu;
  MyData *const restrict B_GPU = A_GPU + SIZE;
  MyData *const restrict C_GPU = B_GPU + SIZE;
  cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);

  const dim3 block   = {BLOCKSIZE, 1, 1};
  const dim3 nblocks = {(SIZE + BLOCKSIZE  -1)/BLOCKSIZE, 1, 1};
  
  /////////////////////////// GPU naive algorithm ////////////////////////////////////////
  time = 0.0;
  GPU_mat_mult<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
  cudaDeviceSynchronize();
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
      cudaDeviceSynchronize();
      time += (wall_time() - start);
    }
  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
  
  check(C_CPU, C_GPU_host);
  printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
  /////////////////////////////////////////////////////////////////////////////////////

  // free CPU memory
  free(buffer_cpu);
  // free GPU memory
  cudaFree(buffer_gpu);
  
  printf("\n");

  return EXIT_SUCCESS;
}
+301 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////
// - Block matrix multiplication algorithm
//
//   const size_t Nblocks = (N / Bsize);
//
//   // loop over blocks of matrix C
//   for (size_t ib=0 ; ib<Nblocks ; ib++)
//   {
//      for (size_t jb=0 ; jb<Nblocks ; jb++)
//      {
//        
//         // loop over blocks of rows of A       
//         for (size_t kb=0 ; kb<Nblocks ; kb++)
//         {
//
//           // Matrix multiplication within a block
//           for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
//             for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
//               for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
//                  C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
//         } // kb
//      } // jb
//   } // ib
//
// - Exploit GPU shared-memory.
////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 22.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ mat_mult_block.cu -o mat_mult_block
// - Run the code:
//   $ ./mat_mult_block
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <cuda.h>
#include <string.h>
#include <math.h>
#include <float.h>

#define N                    1024
#define SIZE                 (N * N) // matrix size
typedef double MyData;               // do not change
#define BLOCK                16

// sanity check
#if BLOCK * BLOCK > 1024
#error BLOCKSIZE must be <= 1024
#endif

#if BLOCK * BLOCK > SIZE
#error BLOCKSIZE must be <= SIZE
#endif

#define LOOP 100
#define NDEBUG

double wall_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

void CPU_mat_mult(const MyData *const restrict A,
		  const MyData *const restrict B,
	                MyData *const restrict C,
		  const size_t                 size)
{
  for (size_t i=0 ; i<size ; i++)
    for (size_t j=0 ; j<size ; j++)
      for (size_t k=0 ; k<size ; k++)
	C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);

  return;
}

void CPU_mat_mult_block(const MyData *const restrict A,
			const MyData *const restrict B,
	                      MyData *const restrict C,
			const size_t                 size)
{
  const size_t Nblocks = (size / BLOCK);

  // loop over blocks of matrix C
  for (size_t ib=0 ; ib<Nblocks ; ib++)
    {
      for (size_t jb=0 ; jb<Nblocks ; jb++)
	{       
	  // loop over blocks of rows of A       
	  for (size_t kb=0 ; kb<Nblocks ; kb++)
	    {
	      // Matrix multiplication within a block
	      for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++)
		for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++)
		  for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++)
		    C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
	    } // kb
	} // jb
    } // ib
  
  return;
}

__global__ void GPU_mat_mult_block(const MyData *const restrict A,
				   const MyData *const restrict B,
			                 MyData *const restrict C,
				   const size_t                 size)
{
  const size_t size2 = (size * size);
  
  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
  if (globalID >= size2)
    return;
  
  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over A and B
  const size_t ib       = blockIdx.x / Nblocks;   // indexes of starting matrix's block
  const size_t jb       = blockIdx.x % Nblocks;
  const size_t i_local  = threadIdx.x / BLOCK;    // local matrix's indexes mapped to each CUDA thread
  const size_t j_local  = threadIdx.x % BLOCK;    // within its own block
  const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices
  
  C[(i_global * size) + j_global] = (MyData)0;

  // loop over blocks
  for (size_t block=0 ; block<Nblocks ; block++)
    {
      const size_t j_A = (block * BLOCK);
      const size_t i_B = (block * BLOCK);

      // perform the matrix multiplication within the block
      MyData value = (MyData)0;
      for (size_t k=0 ; k<BLOCK ; k++)
	value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global];

      C[(i_global * size) + j_global] += value;
    }

  return;
}

__global__ void GPU_mat_mult_block_shared(const MyData *const restrict A,
					  const MyData *const restrict B,
					        MyData *const restrict C,
					  const size_t                 size)
{
  __shared__ MyData Ablock[BLOCK * BLOCK];
  __shared__ MyData Bblock[BLOCK * BLOCK];
  __shared__ MyData Cblock[BLOCK * BLOCK];
  
  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
  const size_t size2 = (size * size);
  if (globalID >= size2)
    return;
  
  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over
  const size_t ib       = blockIdx.x / Nblocks;   // indexes of starting matrix's block
  const size_t jb       = blockIdx.x % Nblocks;
  const size_t i_local  = threadIdx.x / BLOCK;    // local matrix's indexes mapped to each CUDA thread
  const size_t j_local  = threadIdx.x % BLOCK;    // within its own block
  const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices

  // Init Cblock
  Cblock[(i_local * BLOCK) + j_local] = (MyData)0;

  // loop over blocks
  for (size_t block=0 ; block<Nblocks ; block++)
    {
      const size_t j_A = (block * BLOCK);
      const size_t i_B = (block * BLOCK);

      // waits until all threads in the thread block have reached this point and shared memory accesses
      // made by these threads prior to __syncthreads() are visible to all threads in the block.
      __syncthreads();
      
      // copy block of A into shared memory
      Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A];
      // copy block of B into shared memory
      Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global];
      
      // waits until all threads in the thread block have reached this point and shared memory accesses      // made by these threads prior to __syncthreads() are visible to all threads in the block.
      __syncthreads();

      // perform the matrix multiplication within the block
      MyData value = (MyData)0;
      for (size_t k=0 ; k<BLOCK ; k++)
	value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local];

      // store the partial result in shared memory
      Cblock[(i_local * BLOCK) + j_local] += value;
    }

  C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local];
  
  return;
}

void check(const MyData *const restrict cpu_matrix,
	   const MyData *const restrict gpu_matrix)
{
  int flag = 0;
  for (size_t i=0 ; i<SIZE ; i++)
    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);

  if (!flag)
    printf("\n\t Result OK");
  else
    printf("\n\t Result wrong");
  
  return;
}

int main()
{
  double time;
  MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
  assert(buffer_cpu != NULL);

  // host reference matrix A
  MyData *const restrict A_CPU = buffer_cpu;
  MyData *const restrict B_CPU = A_CPU + SIZE;
  MyData *const restrict C_CPU = B_CPU + SIZE;
  MyData *const restrict C_GPU_host = C_CPU + SIZE;
  for (size_t i=0 ; i<SIZE ; i++)
    {
      A_CPU[i] = drand48();
      B_CPU[i] = drand48();
    }

  ////////////////////////// CPU naive algorithm //////////////////////////////////////////
  CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
  /////////////////////////////////////////////////////////////////////////////////////////

  // copy/alloc data to the GPU
  MyData *buffer_gpu = NULL;
  cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData)));
  MyData *const A_GPU = buffer_gpu;
  MyData *const B_GPU = A_GPU + SIZE;
  MyData *const C_GPU = B_GPU + SIZE;
  cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);

  const dim3 nblocks = {(SIZE + (BLOCK * BLOCK)  -1)/(BLOCK * BLOCK), 1, 1};
  const dim3 block   = {(BLOCK * BLOCK), 1, 1};
  
  /////////////////////////// GPU naive block algorithm ////////////////////////////////////////
  time = 0.0;
  GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
  cudaDeviceSynchronize();
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
      cudaDeviceSynchronize();
      time += (wall_time() - start);
    }
  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
  
  check(C_CPU, C_GPU_host);
  printf("\n\t GPU naive block time %lg [s]\n", (time / LOOP));
  /////////////////////////////////////////////////////////////////////////////////////

  /////////////////////////// GPU block shared memory algorithm ///////////////////////
  time = 0.0;
  GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
  cudaDeviceSynchronize();
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
      cudaDeviceSynchronize();
      time += (wall_time() - start);
    }
  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
  
  check(C_CPU, C_GPU_host);
  printf("\n\t GPU block shared memory time %lg [s]\n", (time / LOOP));
  /////////////////////////////////////////////////////////////////////////////////////

  // free CPU memory
  free(buffer_cpu);
  // free GPU memory
  cudaFree(buffer_gpu);
  
  printf("\n");

  return EXIT_SUCCESS;
}

cuda-omp/omp/6/classwork_omp

deleted100755 → 0
−555 KiB

File deleted.

+72 −35
Original line number Diff line number Diff line
@@ -14,9 +14,10 @@
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v mat_mult.c -o mat_mult_omp
// - Run the code:
//   $ ./classwork_omp
//   $ export OMP_TARGET_OFFLOAD=MANDATORY
//   $ ./mat_mult_omp
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
@@ -26,8 +27,10 @@
#include <assert.h>
#include <omp.h>
#include <string.h>
#include <math.h>
#include <float.h>

#define N                     512
#define N                     1024
#define SIZE                  (N * N) // matrix size
typedef double MyData;                // do not change

@@ -61,22 +64,30 @@ void GPU_mat_mult(const MyData *const restrict A,
	                MyData *const restrict C,
		  const size_t                 size)
{
  #pragma omp target
  {
    #pragma omp teams distribute num_teams(size)
  #pragma omp target teams distribute num_teams(size)
  for (size_t i=0 ; i<size ; i++)
    {
        #pragma omp parallel for num_threads(size)
      #pragma omp parallel for firstprivate(i) num_threads(size)
      for (size_t j=0 ; j<size ; j++)
	{
#if !defined(NDEBUG)

	  const int num_teams   = omp_get_num_teams();
	  const int num_threads = omp_get_num_threads();

	  if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0))
	    printf("\n\t Kernel GPU_mat_mult: teams: %d - threads: %d\n",
		   num_teams, num_threads);

#endif // NDEBUG
	  
	  MyData value = (MyData)0;
	  for (size_t k=0 ; k<size ; k++)
	    value += (A[(i * size) + k] * B[(k * size) + j]);

	  C[(i * size) + j] = value;
	  } // omp thread
      } // omp teams
   } // omp target
	} // omp threads
    } // omp target teams

  return;
}
@@ -86,14 +97,23 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
			         MyData *const restrict C,
			   const size_t                 size)
{
 #pragma omp target
  {
   #pragma omp teams num_teams(size)
 #pragma omp target teams num_teams(size)
  {
    const size_t team_size = (size * omp_get_team_num());
      
   #pragma omp parallel firstprivate(team_size) num_threads(size)
    {
#if !defined(NDEBUG)

      const int num_teams   = omp_get_num_teams();
      const int num_threads = omp_get_num_threads();
      
      if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0))
	printf("\n\t Kernel GPU_mat_mult_no_loops: teams: %d - threads: %d\n",
	       num_teams, num_threads);

#endif // NDEBUG
      
      const size_t tid = omp_get_thread_num();
      MyData value = (MyData)0;
      for (size_t k=0 ; k<size ; k++)
@@ -101,8 +121,7 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,

      C[team_size + tid] = value;
    } // omp threads      
    } // omp teams
  } // omp target
  } // omp target teams

  return;
}
@@ -110,15 +129,31 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
void check(const MyData *const __restrict__ cpu_matrix,
	   const MyData *const __restrict__ gpu_matrix)
{
  int flag;
  int flag = 0;
  for (size_t i=0 ; i<SIZE ; i++)
    flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);

  if (!flag)
    printf("\n\t Result OK");
  else
    printf("\n\t Result wrong");

#if !defined(NDEBUG)

  for (size_t i=0 ; i<N ; i++)
    {
      printf("\n");
      for (size_t j=0 ; j<N ; j++)
	{
	  const size_t index = ((i * N) + j);
	  printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
		 index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
	}
    }
  printf("\n");

#endif // NDEBUG  
  
  return;
}

@@ -148,6 +183,7 @@ int main()

  /////////////////////////// GPU naive algorithm ////////////////////////////////////////
  time = 0.0;
  GPU_mat_mult(A, B, C_GPU, N); // warm-up
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
@@ -162,6 +198,7 @@ int main()

  /////////////////////////// GPU naive no loops algorithm ////////////////////////////
  time = 0.0;
  GPU_mat_mult_no_loops(A, B, C_GPU, N); // warm-up
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
+259 −73

File changed.

Preview size limit exceeded, changes collapsed.