Commit 86b043e3 authored by David Goz's avatar David Goz
Browse files

cuda-ompgpu more examples

parent b317d509
Loading
Loading
Loading
Loading
+25 −8
Original line number Diff line number Diff line
@@ -17,9 +17,10 @@
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvcc classwork_1.cu -o classwork_1_cuda
//   $ nvc++ classwork_1.cu -o classwork_1_cuda
// - Run the code:
//   $ ./classwork_1_cuda
// - Check the result:
@@ -33,10 +34,23 @@
#define N        100
#define NThreads 1024

__global__ void GPUkernel(const int size)
// Serial-style kernel: intended to be launched with a single thread
// (<<<1, 1>>>) so that one thread walks the whole loop by itself.
// NOTE(review): if launched with more threads, every thread repeats
// the entire loop — presumably intentional for this teaching example.
__global__ void GPUkernelSerial(const int size)
{
  // flat global thread index: offset of the block plus offset within it
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  // C printf is supported on CUDA
  // C++ cout class is not supported in CUDA
  for (int i=0 ; i<size ; i++)
    printf("Hello from CUDA thread: %d - result %d\n", myID, (i * i));

  return;
}

__global__ void GPUkernelParallel(const int size)
{
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  /* guard if the number of available threads is larger than size */
  if (myID >= size)
    return;

@@ -49,12 +63,15 @@ __global__ void GPUkernel(const int size)

int main()
{
  printf("\n\t The host issues the kernel on the GPU \n");
  
  printf("\n\t The host issues the kernel on the GPU in serial \n");  
  // kernel launch
  GPUkernel<<<1, NThreads>>>(N);
  GPUkernelSerial<<<1, 1>>>(N);
  // GPU synchronization
  cudaDeviceSynchronize();

  printf("\n\t cudaDeviceSynchronize \n");
  printf("\n\t The host issues the kernel on the GPU in parallel \n");
  // kernel launch
  GPUkernelParallel<<<1, NThreads>>>(N);
  // GPU synchronization
  cudaDeviceSynchronize();
  
+25 −19
Original line number Diff line number Diff line
@@ -18,10 +18,11 @@
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 17.11.2022
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvcc classwork_2.cu -o classwork_2
//   $ nvc++ classwork_2.cu -o classwork_2
// - Run the code:
//   $ ./classwork_2
// - Check the result:
@@ -29,9 +30,10 @@

//////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <assert.h>

#define N        100
#define NThreads 1024
@@ -49,31 +51,35 @@ __global__ void GPUkernel( int *A,

int main()
{
  // allocate array that allows direct access of both host and device
  //  CUDA is responsible 
  int *A;
  const size_t size = (N * sizeof(int));
  cudaError error = cudaMallocManaged(&A, size);
  if (!error)
    std::cout << "Memory allocated for the host/device" << std::endl;
  else
    {
      std::cout << "Cannot allocate memory for the host/device. CUDA error : " << error << " ... aborting" << std::endl;
      exit(EXIT_FAILURE);
    }
  /* host array */
  int *h_A = (int *)malloc(N * sizeof(*h_A));
  assert(h_A != NULL);

  /* device array */
  int *d_A = NULL;
  cudaMalloc((void **)&d_A, (N * sizeof(*d_A)));
  assert(d_A != NULL);
  
  // kernel launch
  GPUkernel<<<1, NThreads>>>(A, N);
  GPUkernel<<<1, NThreads>>>(d_A, N);
  
  // device synchronization
  cudaDeviceSynchronize();

  // fetch device memory
  cudaMemcpy(h_A, d_A, (sizeof(*h_A) * N), cudaMemcpyDeviceToHost);

  // free GPU memory
  cudaFree(d_A);
  
  // check the result
  printf("\n");
  for (size_t i=0 ; i<N ; i++)
    std::cout << "A[" << i << "] - Result: " << A[i] << std::endl;
    printf("\t A[%d] = %d", i, h_A[i]);
  printf("\n\n");

  // free the memory
  cudaFree(A);
  // free host memory
  free(h_A);
  
  return 0;
}
+107 −0
Original line number Diff line number Diff line
//////////////////////////////////////////////////////////////////////////////////////////////////
// Assignment : write a CUDA code corresponding to the
// following sequential C code
//
// #include <stdio.h>
// #define ROW 4
// #define COL 8
// int main()
// {
//   int Matrix[ROW * COL];
//
//   for (int row=0 ; row<ROW ; row++)
//      for (int col=0 ; col<COL ; col++)
//         Matrix[(row * COL) + col] = ((row * COL) + col);
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ classwork.cu -o classwork
// - Run the code:
//   $ ./classwork
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda.h>

#define BLOCKSIZE 32
#define ROW 1089
#define COL 1111

typedef int MyData;

/* Fill kernel: the thread whose flat global ID is tid writes that ID  */
/* into Matrix[tid]. Threads in the rounded-up grid tail do nothing.   */
__global__ void GPUMatrix(MyData *Matrix,
			  const int size)
{
  /* flat 1D global thread identifier */
  const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;

  /* grid tail guard: the launch rounds the grid up to a multiple of */
  /* the block size, so the last block may extend past `size`        */
  if (tid >= size)
    return;

  Matrix[tid] = tid;
}

/* Driver: fill a ROW x COL matrix on the GPU with each element's own */
/* flat index, copy it back, and verify on the host.                  */
int main()
{
  /* host matrix . 1D mapping matrix[i][j] ---> matrix[(i * COL) + j] */
  MyData *h_matrix = (MyData *)malloc(ROW * COL * sizeof(*h_matrix));
  assert(h_matrix != NULL);

  /* device matrix — check the CUDA status code: asserting on the     */
  /* pointer alone is silently skipped in release (NDEBUG) builds     */
  MyData *d_matrix = NULL;
  cudaError_t err = cudaMalloc((void **)&d_matrix, ROW * COL * sizeof(*d_matrix));
  if (err != cudaSuccess)
    {
      std::cout << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
      free(h_matrix);
      return EXIT_FAILURE;
    }

  /* 1D grid: enough blocks to cover ROW*COL elements (ceiling division) */
  const dim3 grid = {(((ROW * COL) + BLOCKSIZE - 1) / BLOCKSIZE), // number of blocks along X
		     1,                                           // number of blocks along Y
		     1};                                          // number of blocks along Z
  const dim3 block = {BLOCKSIZE, // number of threads per block along X
		      1,         // number of threads per block along Y
		      1};        // number of threads per block along Z

  // kernel launch
  GPUMatrix<<<grid, block>>>(d_matrix, (ROW * COL));

  /* a kernel launch returns no status — query the runtime explicitly */
  err = cudaGetLastError();
  if (err != cudaSuccess)
    std::cout << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;

  // device synchronization
  cudaDeviceSynchronize();

  /* fetch matrix from the device */
  err = cudaMemcpy(h_matrix, d_matrix, (ROW * COL * sizeof(*d_matrix)), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
    std::cout << "cudaMemcpy failed: " << cudaGetErrorString(err) << std::endl;

  /* free device memory */
  cudaFree(d_matrix);

  /* check the result: element (row, col) must hold its own flat index */
  int flag = 0;
  for (size_t row=0 ; row<ROW ; row++)
    {
      const size_t i = (row * COL);

      for (size_t col=0 ; col<COL ; col++)
	flag = ((h_matrix[i + col] != (MyData)(i + col)) ? -1 : flag);
    } /* row loop */

  // free host memory
  free(h_matrix);

  if (flag)
    printf("\n\t Result wrong! \n\n");
  else
    printf("\n\t Result OK! \n\n");

  return 0;
}
+145 −0
Original line number Diff line number Diff line
//////////////////////////////////////////////////////////////////////////////////////////////////
// Assignment : write a CUDA code that does:
//             - each thread generates a pair of points as (x, y) coordinates randomly distributed;
//             - each thread computes the euclidean distance d = sqrt((x1 - x2)^2 + (y1 - y2)^2);
//             - the kernel prints the maximum distance;
//             - use one CUDA block and shared memory within the block
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ classwork.cu -o classwork -lm
// - Run the code:
//   $ ./classwork
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>

#define NUMPOINTS  1024
#define BLOCKSIZE  NUMPOINTS

#if NUMPOINTS != BLOCKSIZE
#error NUMPOINTS must be equal to BLOCKSIZE
#endif

#if BLOCKSIZE > 1024
#error BLOCKSIZE cannot be larger than 1024
#endif

#define SQUARE(A)      ((A) * (A))

typedef double MyData;

/* One pair of 2D points: point 0 = (x[0], y[0]), point 1 = (x[1], y[1]). */
/* NOTE(review): array-of-structures layout — each thread loads one whole */
/* struct; comments elsewhere calling this fully coalesced should be      */
/* taken with a grain of salt.                                            */
typedef struct PairPoints
{
  MyData x[2]; /* x coordinates of the two points */
  MyData y[2]; /* y coordinates of the two points */
} PairPoints;

/* Single-block kernel: each thread computes the euclidean distance of  */
/* its pair of points, stores it in shared memory, and thread 0 scans   */
/* the block for the maximum and prints it.                             */
/* Expected launch: one block of BLOCKSIZE threads, size <= BLOCKSIZE.  */
__global__ void GPUDistance(const PairPoints *const points,
			    const int               size)
{
  /* global thread's ID */
  const int globalID = (threadIdx.x + (blockIdx.x * blockDim.x));
  /* local (i.e. within the block) thread's ID */
  const int localID  = threadIdx.x;

  /* A thread participates only if it maps onto a valid pair and lives  */
  /* in the first block (globalID == localID). Do NOT return early:     */
  /* __syncthreads() below must be reached by EVERY thread of the       */
  /* block, otherwise the behaviour is undefined (possible hang).       */
  const bool active = ((globalID < size) && (globalID == localID));

  // shared-block memory statically allocated (slots of inactive
  // threads stay uninitialized and must never be read)
  __shared__ MyData distance[BLOCKSIZE];

  if (active)
    {
      const PairPoints myPoints = points[localID];
      const MyData pair_distance_X2 = SQUARE(myPoints.x[0] - myPoints.x[1]);
      const MyData pair_distance_Y2 = SQUARE(myPoints.y[0] - myPoints.y[1]);

      /* store this pair's euclidean distance in shared memory */
      distance[localID] = sqrt(pair_distance_X2 + pair_distance_Y2);
    }

  // block level synchronization barrier — reached unconditionally
  __syncthreads();

  /* the master thread within the block gets the max */
  if (localID == 0)
    {
      /* only the first min(size, blockDim.x) slots were written */
      const int valid = ((size < (int)blockDim.x) ? size : (int)blockDim.x);

      MyData max_dis = -1.0;
      for (int i=0 ; i<valid ; i++)
	max_dis = ((max_dis < distance[i]) ? distance[i] : max_dis);

      printf("\t GPU maximum distance: %lg\n", max_dis);
    }

  return;
}

/* Sequential reference: scan every pair of points and print the     */
/* largest euclidean distance, mirroring what the GPU kernel prints. */
void CPUMaxDistance(const PairPoints *const points,
		    const        int        size)
{
  MyData best = -1.0;

  for (int i=0 ; i<size ; i++)
    {
      const MyData dx2  = SQUARE(points[i].x[0] - points[i].x[1]);
      const MyData dy2  = SQUARE(points[i].y[0] - points[i].y[1]);
      const MyData dist = sqrt(dx2 + dy2);

      if (best < dist)
	best = dist;
    }

  printf("\n\t CPU maximum distance: %lg\n", best);

  return;
}

/* Driver: generate NUMPOINTS random point pairs, ship them to the     */
/* device, launch the max-distance kernel, and overlap the CPU         */
/* reference computation with the kernel's execution.                  */
int main()
{
  /* host allocation */
  PairPoints *h_points = (PairPoints *)malloc(NUMPOINTS * sizeof(*h_points));
  assert(h_points != NULL);

  /* device allocation — check the runtime status, not the pointer */
  PairPoints *d_points = NULL;
  cudaError_t err = cudaMalloc((void **)&d_points, (NUMPOINTS * sizeof(*d_points)));
  if (err != cudaSuccess)
    {
      printf("\n\t cudaMalloc failed: %s \n", cudaGetErrorString(err));
      free(h_points);
      return EXIT_FAILURE;
    }

  /* initialization: coordinates uniformly distributed in [0, 1) */
  /* N.B. time() requires <time.h>, which must be included       */
  srand48(time(NULL));
  for(size_t i=0 ; i<NUMPOINTS ; i++)
    {
      h_points[i].x[0] = drand48();
      h_points[i].x[1] = drand48();
      h_points[i].y[0] = drand48();
      h_points[i].y[1] = drand48();
    }

  /* copy data to the device's memory */
  err = cudaMemcpy(d_points, h_points, (NUMPOINTS * sizeof(*d_points)), cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
    printf("\n\t cudaMemcpy failed: %s \n", cudaGetErrorString(err));

  const dim3 grid  = {1, 1, 1};
  const dim3 block = {BLOCKSIZE, 1, 1};

  // launch the kernel (asynchronous with respect to the host)
  GPUDistance<<< grid, block >>>(d_points, NUMPOINTS);

  /* a kernel launch returns no status — query the runtime explicitly */
  err = cudaGetLastError();
  if (err != cudaSuccess)
    printf("\n\t kernel launch failed: %s \n", cudaGetErrorString(err));

  // check the result on the host while the kernel is executing on the GPU
  CPUMaxDistance(h_points, NUMPOINTS);

  // device synchronization (also flushes the kernel's printf output)
  cudaDeviceSynchronize();

  // free host and device memory
  free(h_points);
  cudaFree(d_points);

  return 0;
}
+178 −0
Original line number Diff line number Diff line
//////////////////////////////////////////////////////////////////////////////////////////////////
// Assignment : write a CUDA code corresponding to the
// following sequential C code
//
// #include <stdio.h>
// int main()
// {
//   /* Matrix multiplication: C = A X B */
//
//   int A[N][N], B[N][N], C[N][N];
//
//   for (int ii=0 ; ii<N ; ii++)
//      for (int jj=0 ; jj<N ; jj++)
//         for (int kk=0 ; kk<N ; kk++)
//            C[ii][jj] += A[ii][kk] * B[kk][jj];
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 08.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ classwork.cu -o classwork
// - Run the code:
//   $ ./classwork <N>
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>

#define BLOCKSIZE 32
#define SQUARE(SIZE) ((SIZE) * (SIZE))

typedef long int MyData;

/* Initialize the two input matrices stored back-to-back in AB:      */
/*   A[ii][jj] = ii - jj   (first  N*N elements)                     */
/*   B[ii][jj] = 1         (second N*N elements)                     */
/* With B all ones, row ii of C = A x B holds N copies of the sum of */
/* row ii of A — the property exploited by check().                  */
void matrix_init(      MyData *const AB,
		 const MyData        N)
{
  /* `__restrict__` instead of plain `restrict`: the latter is a C     */
  /* keyword, not C++; the underscored form is the portable extension  */
  /* accepted by nvcc, nvc++, gcc and clang                            */
  MyData *const __restrict__ A = AB;
  MyData *const __restrict__ B = AB + SQUARE(N);

  for (MyData ii=0 ; ii<N ; ii++)
    for (MyData jj=0 ; jj<N ; jj++)
      {
	A[(ii * N) + jj] = (ii - jj);
	B[(ii * N) + jj] = 1;
      }

  return;
}

/* Naive matrix multiply C = A x B on a 2D grid of 2D blocks.          */
/* threadIdx.x is mapped to the COLUMN index so that consecutive       */
/* threads of a warp touch consecutive addresses of B and C            */
/* (coalesced global-memory accesses); threadIdx.y selects the row.    */
/* The computed C is identical to the row-on-x mapping — only the      */
/* thread-to-element assignment changes.                               */
__global__ void GPU_MM(const MyData *const __restrict__ A,
		       const MyData *const __restrict__ B,
		             MyData *const __restrict__ C,
		       const MyData                     N)
{
  const MyData col = threadIdx.x + (blockIdx.x * blockDim.x);
  const MyData row = threadIdx.y + (blockIdx.y * blockDim.y);

  /* check out of boundaries (the grid is rounded up to BLOCKSIZE) */
  if ((row >= N) || (col >= N))
    return;

  /* each thread performs the calculation of one element */
  /* of the matrix, i.e. C[row][col]                     */

  MyData sum = 0;
  for (MyData kk=0 ; kk<N ; kk++)
    sum += (A[(row * N) + kk] * B[(kk * N) + col]);

  C[(row * N) + col] = sum;

  return;
}

/* Verify the GPU product. B was initialized to all ones, so every   */
/* element of row ii in C must equal the sum of row ii of A.         */
void check(const MyData *const __restrict__ GPU_MM,
	   const MyData *const __restrict__ A,
	   const MyData                     N)
{
  bool ok = true;

  for (MyData ii=0 ; ii<N ; ii++)
    {
      /* expected value shared by the whole row: sum of row ii of A */
      MyData expected = 0;
      for (MyData kk=0 ; kk<N ; kk++)
	expected += A[(ii * N) + kk];

      for (MyData jj=0 ; jj<N ; jj++)
	if (GPU_MM[(ii * N) + jj] != expected)
	  {
	    ok = false;
	    break;
	  }
    }

  if (ok)
    printf("\n\t Result OK \n\n");
  else
    printf("\n\t Result WRONG \n\n");

  return;
}

/* Driver: parse N, build A and B on the host, multiply on the GPU,   */
/* copy C back and verify it against the analytic row sums.           */
int main(int arg, char *argv[])
{
  int N;

  /* command-line parsing: one positional argument, the matrix order N */
  if (arg < 2)
    {
      printf("\n\t Usage: $ ./classwork <number_of_matrix_elements> \n");
      exit(EXIT_FAILURE);
    }
  else
    {
      N = atoi(argv[1]);
      if (N <= 0)
	{
	  printf("\n\t Number of matrix elements must be greater than zero \n");
	  exit(EXIT_FAILURE);
	}
      else
	{
	  printf("\n\t Matrix size    : %d x %d", N, N);
	  printf("\n\t CUDA block size: %d x %d", BLOCKSIZE, BLOCKSIZE);
	}
    }

  /* host memory allocation: A, B, C stored back-to-back */
  MyData *h_buffer = (MyData *)malloc(3 * SQUARE(N) * sizeof(MyData));
  assert(h_buffer != NULL);
  /* set-up host pointers */
  MyData *const __restrict__ h_A = h_buffer;
  MyData *const __restrict__ h_B = h_A + SQUARE(N);
  MyData *const __restrict__ h_C = h_B + SQUARE(N);

  /* device memory allocation — check the CUDA status code: asserting */
  /* on the pointer misses failures in release (NDEBUG) builds        */
  MyData *d_buffer = NULL;
  cudaError_t err = cudaMalloc((void **)&d_buffer, (3 * SQUARE(N) * sizeof(MyData)));
  if (err != cudaSuccess)
    {
      printf("\n\t cudaMalloc failed: %s \n", cudaGetErrorString(err));
      free(h_buffer);
      exit(EXIT_FAILURE);
    }
  /* set-up device pointers */
  MyData *const __restrict__ d_A = d_buffer;
  MyData *const __restrict__ d_B = d_A + SQUARE(N);
  MyData *const __restrict__ d_C = d_B + SQUARE(N);

  // matrix initialization
  matrix_init(h_buffer, N);

  /* copy host data (A and B, contiguous) to the device */
  cudaMemcpy(d_A, h_A, (2 * SQUARE(N) * sizeof(MyData)), cudaMemcpyHostToDevice);

  /* kernel launch on the GPU.                                         */
  /* `unsigned int` (not size_t) for nblocks avoids a narrowing        */
  /* conversion inside the dim3 brace-initializer                      */
  const unsigned int nblocks = ((N + BLOCKSIZE - 1) / BLOCKSIZE);
  const dim3 grid      = {nblocks, nblocks, 1};
  const dim3 block     = {BLOCKSIZE, BLOCKSIZE, 1};
  GPU_MM<<< grid, block >>>(d_A, d_B, d_C, N);

  /* a kernel launch returns no status — query the runtime explicitly */
  err = cudaGetLastError();
  if (err != cudaSuccess)
    printf("\n\t kernel launch failed: %s \n", cudaGetErrorString(err));

  /* host-device data synchronization         */
  /* N.B. cudaDeviceSynchronize() is not      */
  /* necessary since cudaMemcpy() is blocking */
  cudaMemcpy(h_C, d_C, (SQUARE(N) * sizeof(MyData)), cudaMemcpyDeviceToHost);

  // free the GPU memory
  cudaFree(d_buffer);

  // check the result
  check(h_C, h_A, N);

  free(h_buffer);

  return 0;
}
Loading