Commit 86b043e3 authored by David Goz's avatar David Goz
Browse files

cuda-ompgpu more examples

parent b317d509
Loading
Loading
Loading
Loading
+25 −8
Original line number Diff line number Diff line
@@ -17,9 +17,10 @@
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvcc classwork_1.cu -o classwork_1_cuda
//   $ nvc++ classwork_1.cu -o classwork_1_cuda
// - Run the code:
//   $ ./classwork_1_cuda
// - Check the result:
@@ -33,10 +34,23 @@
#define N        100
#define NThreads 1024

__global__ void GPUkernel(const int size)
// Serial-style kernel: intended to be launched with a single thread
// (<<<1, 1>>>) so that one thread walks the whole loop by itself.
// NOTE(review): if launched with more threads, every thread repeats
// the entire loop — presumably intentional for this teaching example.
__global__ void GPUkernelSerial(const int size)
{
  // flat global thread index: offset of the block plus offset within it
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  // C printf is supported on CUDA
  // C++ cout class is not supported in CUDA
  for (int i=0 ; i<size ; i++)
    printf("Hello from CUDA thread: %d - result %d\n", myID, (i * i));

  return;
}

__global__ void GPUkernelParallel(const int size)
{
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  /* guard if the number of available threads is larger than size */
  if (myID >= size)
    return;

@@ -49,12 +63,15 @@ __global__ void GPUkernel(const int size)

int main()
{
  printf("\n\t The host issues the kernel on the GPU \n");
  
  printf("\n\t The host issues the kernel on the GPU in serial \n");  
  // kernel launch
  GPUkernel<<<1, NThreads>>>(N);
  GPUkernelSerial<<<1, 1>>>(N);
  // GPU synchronization
  cudaDeviceSynchronize();

  printf("\n\t cudaDeviceSynchronize \n");
  printf("\n\t The host issues the kernel on the GPU in parallel \n");
  // kernel launch
  GPUkernelParallel<<<1, NThreads>>>(N);
  // GPU synchronization
  cudaDeviceSynchronize();
  
+25 −19
Original line number Diff line number Diff line
@@ -18,10 +18,11 @@
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 17.11.2022
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvcc classwork_2.cu -o classwork_2
//   $ nvc++ classwork_2.cu -o classwork_2
// - Run the code:
//   $ ./classwork_2
// - Check the result:
@@ -29,9 +30,10 @@

//////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <assert.h>

#define N        100
#define NThreads 1024
@@ -49,31 +51,35 @@ __global__ void GPUkernel( int *A,

int main()
{
  // allocate array that allows direct access of both host and device
  //  CUDA is responsible 
  int *A;
  const size_t size = (N * sizeof(int));
  cudaError error = cudaMallocManaged(&A, size);
  if (!error)
    std::cout << "Memory allocated for the host/device" << std::endl;
  else
    {
      std::cout << "Cannot allocate memory for the host/device. CUDA error : " << error << " ... aborting" << std::endl;
      exit(EXIT_FAILURE);
    }
  /* host array */
  int *h_A = (int *)malloc(N * sizeof(*h_A));
  assert(h_A != NULL);

  /* device array */
  int *d_A = NULL;
  cudaMalloc((void **)&d_A, (N * sizeof(*d_A)));
  assert(d_A != NULL);
  
  // kernel launch
  GPUkernel<<<1, NThreads>>>(A, N);
  GPUkernel<<<1, NThreads>>>(d_A, N);
  
  // device synchronization
  cudaDeviceSynchronize();

  // fetch device memory
  cudaMemcpy(h_A, d_A, (sizeof(*h_A) * N), cudaMemcpyDeviceToHost);

  // free GPU memory
  cudaFree(d_A);
  
  // check the result
  printf("\n");
  for (size_t i=0 ; i<N ; i++)
    std::cout << "A[" << i << "] - Result: " << A[i] << std::endl;
    printf("\t A[%d] = %d", i, h_A[i]);
  printf("\n\n");

  // free the memory
  cudaFree(A);
  // free host memory
  free(h_A);
  
  return 0;
}
+107 −0
Original line number Diff line number Diff line
//////////////////////////////////////////////////////////////////////////////////////////////////
// Assignment : write a CUDA code corresponding to the
// following sequential C code
//
// #include <stdio.h>
// #define ROW 4
// #define COL 8
// int main()
// {
//   int Matrix[ROW * COL];
//
//   for (int row=0 ; row<ROW ; row++)
//      for (int col=0 ; col<COL ; col++)
//         Matrix[(row * COL) + col] = ((row * COL) + col);
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ classwork.cu -o classwork
// - Run the code:
//   $ ./classwork
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda.h>

#define BLOCKSIZE 32
#define ROW 1089
#define COL 1111

typedef int MyData;

/* Fill kernel: the thread whose flat global ID is tid writes that ID  */
/* into Matrix[tid]. Threads in the rounded-up grid tail do nothing.   */
__global__ void GPUMatrix(MyData *Matrix,
			  const int size)
{
  /* flat 1D global thread identifier */
  const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;

  /* grid tail guard: the launch rounds the grid up to a multiple of */
  /* the block size, so the last block may extend past `size`        */
  if (tid >= size)
    return;

  Matrix[tid] = tid;
}

/* Driver: fill a ROW x COL matrix on the GPU with each element's own */
/* flat index, copy it back, and verify on the host.                  */
int main()
{
  /* host matrix . 1D mapping matrix[i][j] ---> matrix[(i * COL) + j] */
  MyData *h_matrix = (MyData *)malloc(ROW * COL * sizeof(*h_matrix));
  assert(h_matrix != NULL);

  /* device matrix — check the CUDA status code: asserting on the     */
  /* pointer alone is silently skipped in release (NDEBUG) builds     */
  MyData *d_matrix = NULL;
  cudaError_t err = cudaMalloc((void **)&d_matrix, ROW * COL * sizeof(*d_matrix));
  if (err != cudaSuccess)
    {
      std::cout << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
      free(h_matrix);
      return EXIT_FAILURE;
    }

  /* 1D grid: enough blocks to cover ROW*COL elements (ceiling division) */
  const dim3 grid = {(((ROW * COL) + BLOCKSIZE - 1) / BLOCKSIZE), // number of blocks along X
		     1,                                           // number of blocks along Y
		     1};                                          // number of blocks along Z
  const dim3 block = {BLOCKSIZE, // number of threads per block along X
		      1,         // number of threads per block along Y
		      1};        // number of threads per block along Z

  // kernel launch
  GPUMatrix<<<grid, block>>>(d_matrix, (ROW * COL));

  /* a kernel launch returns no status — query the runtime explicitly */
  err = cudaGetLastError();
  if (err != cudaSuccess)
    std::cout << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;

  // device synchronization
  cudaDeviceSynchronize();

  /* fetch matrix from the device */
  err = cudaMemcpy(h_matrix, d_matrix, (ROW * COL * sizeof(*d_matrix)), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
    std::cout << "cudaMemcpy failed: " << cudaGetErrorString(err) << std::endl;

  /* free device memory */
  cudaFree(d_matrix);

  /* check the result: element (row, col) must hold its own flat index */
  int flag = 0;
  for (size_t row=0 ; row<ROW ; row++)
    {
      const size_t i = (row * COL);

      for (size_t col=0 ; col<COL ; col++)
	flag = ((h_matrix[i + col] != (MyData)(i + col)) ? -1 : flag);
    } /* row loop */

  // free host memory
  free(h_matrix);

  if (flag)
    printf("\n\t Result wrong! \n\n");
  else
    printf("\n\t Result OK! \n\n");

  return 0;
}
+145 −0
Original line number Diff line number Diff line
//////////////////////////////////////////////////////////////////////////////////////////////////
// Assignment : write a CUDA code that does:
//             - each thread generates a pair of points as (x, y) coordinates randomly distributed;
//             - each thread computes the euclidean distance d = sqrt((x1 - x2)^2 + (y1 - y2)^2);
//             - the kernel prints the maximum distance;
//             - use one CUDA block and shared memory within the block
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ classwork.cu -o classwork -lm
// - Run the code:
//   $ ./classwork
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>

#define NUMPOINTS  1024
#define BLOCKSIZE  NUMPOINTS

#if NUMPOINTS != BLOCKSIZE
#error NUMPOINTS must be equal to BLOCKSIZE
#endif

#if BLOCKSIZE > 1024
#error BLOCKSIZE cannot be larger than 1024
#endif

#define SQUARE(A)      ((A) * (A))

typedef double MyData;

/* One pair of 2D points: point 0 = (x[0], y[0]), point 1 = (x[1], y[1]). */
/* NOTE(review): array-of-structures layout — each thread loads one whole */
/* struct; comments elsewhere calling this fully coalesced should be      */
/* taken with a grain of salt.                                            */
typedef struct PairPoints
{
  MyData x[2]; /* x coordinates of the two points */
  MyData y[2]; /* y coordinates of the two points */
} PairPoints;

/* Single-block kernel: each thread computes the euclidean distance of  */
/* its pair of points, stores it in shared memory, and thread 0 scans   */
/* the block for the maximum and prints it.                             */
/* Expected launch: one block of BLOCKSIZE threads, size <= BLOCKSIZE.  */
__global__ void GPUDistance(const PairPoints *const points,
			    const int               size)
{
  /* global thread's ID */
  const int globalID = (threadIdx.x + (blockIdx.x * blockDim.x));
  /* local (i.e. within the block) thread's ID */
  const int localID  = threadIdx.x;

  /* A thread participates only if it maps onto a valid pair and lives  */
  /* in the first block (globalID == localID). Do NOT return early:     */
  /* __syncthreads() below must be reached by EVERY thread of the       */
  /* block, otherwise the behaviour is undefined (possible hang).       */
  const bool active = ((globalID < size) && (globalID == localID));

  // shared-block memory statically allocated (slots of inactive
  // threads stay uninitialized and must never be read)
  __shared__ MyData distance[BLOCKSIZE];

  if (active)
    {
      const PairPoints myPoints = points[localID];
      const MyData pair_distance_X2 = SQUARE(myPoints.x[0] - myPoints.x[1]);
      const MyData pair_distance_Y2 = SQUARE(myPoints.y[0] - myPoints.y[1]);

      /* store this pair's euclidean distance in shared memory */
      distance[localID] = sqrt(pair_distance_X2 + pair_distance_Y2);
    }

  // block level synchronization barrier — reached unconditionally
  __syncthreads();

  /* the master thread within the block gets the max */
  if (localID == 0)
    {
      /* only the first min(size, blockDim.x) slots were written */
      const int valid = ((size < (int)blockDim.x) ? size : (int)blockDim.x);

      MyData max_dis = -1.0;
      for (int i=0 ; i<valid ; i++)
	max_dis = ((max_dis < distance[i]) ? distance[i] : max_dis);

      printf("\t GPU maximum distance: %lg\n", max_dis);
    }

  return;
}

/* Sequential reference: scan every pair of points and print the     */
/* largest euclidean distance, mirroring what the GPU kernel prints. */
void CPUMaxDistance(const PairPoints *const points,
		    const        int        size)
{
  MyData best = -1.0;

  for (int i=0 ; i<size ; i++)
    {
      const MyData dx2  = SQUARE(points[i].x[0] - points[i].x[1]);
      const MyData dy2  = SQUARE(points[i].y[0] - points[i].y[1]);
      const MyData dist = sqrt(dx2 + dy2);

      if (best < dist)
	best = dist;
    }

  printf("\n\t CPU maximum distance: %lg\n", best);

  return;
}

/* Driver: generate NUMPOINTS random point pairs, ship them to the     */
/* device, launch the max-distance kernel, and overlap the CPU         */
/* reference computation with the kernel's execution.                  */
int main()
{
  /* host allocation */
  PairPoints *h_points = (PairPoints *)malloc(NUMPOINTS * sizeof(*h_points));
  assert(h_points != NULL);

  /* device allocation — check the runtime status, not the pointer */
  PairPoints *d_points = NULL;
  cudaError_t err = cudaMalloc((void **)&d_points, (NUMPOINTS * sizeof(*d_points)));
  if (err != cudaSuccess)
    {
      printf("\n\t cudaMalloc failed: %s \n", cudaGetErrorString(err));
      free(h_points);
      return EXIT_FAILURE;
    }

  /* initialization: coordinates uniformly distributed in [0, 1) */
  /* N.B. time() requires <time.h>, which must be included       */
  srand48(time(NULL));
  for(size_t i=0 ; i<NUMPOINTS ; i++)
    {
      h_points[i].x[0] = drand48();
      h_points[i].x[1] = drand48();
      h_points[i].y[0] = drand48();
      h_points[i].y[1] = drand48();
    }

  /* copy data to the device's memory */
  err = cudaMemcpy(d_points, h_points, (NUMPOINTS * sizeof(*d_points)), cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
    printf("\n\t cudaMemcpy failed: %s \n", cudaGetErrorString(err));

  const dim3 grid  = {1, 1, 1};
  const dim3 block = {BLOCKSIZE, 1, 1};

  // launch the kernel (asynchronous with respect to the host)
  GPUDistance<<< grid, block >>>(d_points, NUMPOINTS);

  /* a kernel launch returns no status — query the runtime explicitly */
  err = cudaGetLastError();
  if (err != cudaSuccess)
    printf("\n\t kernel launch failed: %s \n", cudaGetErrorString(err));

  // check the result on the host while the kernel is executing on the GPU
  CPUMaxDistance(h_points, NUMPOINTS);

  // device synchronization (also flushes the kernel's printf output)
  cudaDeviceSynchronize();

  // free host and device memory
  free(h_points);
  cudaFree(d_points);

  return 0;
}
+178 −0
Original line number Diff line number Diff line
//////////////////////////////////////////////////////////////////////////////////////////////////
// Assignment : write a CUDA code corresponding to the
// following sequential C code
//
// #include <stdio.h>
// int main()
// {
//   /* Matrix multiplication: C = A X B */
//
//   int A[N][N], B[N][N], C[N][N];
//
//   for (int ii=0 ; ii<N ; ii++)
//      for (int jj=0 ; jj<N ; jj++)
//         for (int kk=0 ; kk<N ; kk++)
//            C[ii][jj] += A[ii][kk] * B[kk][jj];
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 08.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ classwork.cu -o classwork
// - Run the code:
//   $ ./classwork <N>
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>

#define BLOCKSIZE 32
#define SQUARE(SIZE) ((SIZE) * (SIZE))

typedef long int MyData;

/* Initialize the two input matrices stored back-to-back in AB:      */
/*   A[ii][jj] = ii - jj   (first  N*N elements)                     */
/*   B[ii][jj] = 1         (second N*N elements)                     */
/* With B all ones, row ii of C = A x B holds N copies of the sum of */
/* row ii of A — the property exploited by check().                  */
void matrix_init(      MyData *const AB,
		 const MyData        N)
{
  /* `__restrict__` instead of plain `restrict`: the latter is a C     */
  /* keyword, not C++; the underscored form is the portable extension  */
  /* accepted by nvcc, nvc++, gcc and clang                            */
  MyData *const __restrict__ A = AB;
  MyData *const __restrict__ B = AB + SQUARE(N);

  for (MyData ii=0 ; ii<N ; ii++)
    for (MyData jj=0 ; jj<N ; jj++)
      {
	A[(ii * N) + jj] = (ii - jj);
	B[(ii * N) + jj] = 1;
      }

  return;
}

/* Naive matrix multiply C = A x B on a 2D grid of 2D blocks.          */
/* threadIdx.x is mapped to the COLUMN index so that consecutive       */
/* threads of a warp touch consecutive addresses of B and C            */
/* (coalesced global-memory accesses); threadIdx.y selects the row.    */
/* The computed C is identical to the row-on-x mapping — only the      */
/* thread-to-element assignment changes.                               */
__global__ void GPU_MM(const MyData *const __restrict__ A,
		       const MyData *const __restrict__ B,
		             MyData *const __restrict__ C,
		       const MyData                     N)
{
  const MyData col = threadIdx.x + (blockIdx.x * blockDim.x);
  const MyData row = threadIdx.y + (blockIdx.y * blockDim.y);

  /* check out of boundaries (the grid is rounded up to BLOCKSIZE) */
  if ((row >= N) || (col >= N))
    return;

  /* each thread performs the calculation of one element */
  /* of the matrix, i.e. C[row][col]                     */

  MyData sum = 0;
  for (MyData kk=0 ; kk<N ; kk++)
    sum += (A[(row * N) + kk] * B[(kk * N) + col]);

  C[(row * N) + col] = sum;

  return;
}

/* Verify the GPU product. B was initialized to all ones, so every   */
/* element of row ii in C must equal the sum of row ii of A.         */
void check(const MyData *const __restrict__ GPU_MM,
	   const MyData *const __restrict__ A,
	   const MyData                     N)
{
  bool ok = true;

  for (MyData ii=0 ; ii<N ; ii++)
    {
      /* expected value shared by the whole row: sum of row ii of A */
      MyData expected = 0;
      for (MyData kk=0 ; kk<N ; kk++)
	expected += A[(ii * N) + kk];

      for (MyData jj=0 ; jj<N ; jj++)
	if (GPU_MM[(ii * N) + jj] != expected)
	  {
	    ok = false;
	    break;
	  }
    }

  if (ok)
    printf("\n\t Result OK \n\n");
  else
    printf("\n\t Result WRONG \n\n");

  return;
}

/* Driver: parse N, build A and B on the host, multiply on the GPU,   */
/* copy C back and verify it against the analytic row sums.           */
int main(int arg, char *argv[])
{
  int N;

  /* command-line parsing: one positional argument, the matrix order N */
  if (arg < 2)
    {
      printf("\n\t Usage: $ ./classwork <number_of_matrix_elements> \n");
      exit(EXIT_FAILURE);
    }
  else
    {
      N = atoi(argv[1]);
      if (N <= 0)
	{
	  printf("\n\t Number of matrix elements must be greater than zero \n");
	  exit(EXIT_FAILURE);
	}
      else
	{
	  printf("\n\t Matrix size    : %d x %d", N, N);
	  printf("\n\t CUDA block size: %d x %d", BLOCKSIZE, BLOCKSIZE);
	}
    }

  /* host memory allocation: A, B, C stored back-to-back */
  MyData *h_buffer = (MyData *)malloc(3 * SQUARE(N) * sizeof(MyData));
  assert(h_buffer != NULL);
  /* set-up host pointers */
  MyData *const __restrict__ h_A = h_buffer;
  MyData *const __restrict__ h_B = h_A + SQUARE(N);
  MyData *const __restrict__ h_C = h_B + SQUARE(N);

  /* device memory allocation — check the CUDA status code: asserting */
  /* on the pointer misses failures in release (NDEBUG) builds        */
  MyData *d_buffer = NULL;
  cudaError_t err = cudaMalloc((void **)&d_buffer, (3 * SQUARE(N) * sizeof(MyData)));
  if (err != cudaSuccess)
    {
      printf("\n\t cudaMalloc failed: %s \n", cudaGetErrorString(err));
      free(h_buffer);
      exit(EXIT_FAILURE);
    }
  /* set-up device pointers */
  MyData *const __restrict__ d_A = d_buffer;
  MyData *const __restrict__ d_B = d_A + SQUARE(N);
  MyData *const __restrict__ d_C = d_B + SQUARE(N);

  // matrix initialization
  matrix_init(h_buffer, N);

  /* copy host data (A and B, contiguous) to the device */
  cudaMemcpy(d_A, h_A, (2 * SQUARE(N) * sizeof(MyData)), cudaMemcpyHostToDevice);

  /* kernel launch on the GPU.                                         */
  /* `unsigned int` (not size_t) for nblocks avoids a narrowing        */
  /* conversion inside the dim3 brace-initializer                      */
  const unsigned int nblocks = ((N + BLOCKSIZE - 1) / BLOCKSIZE);
  const dim3 grid      = {nblocks, nblocks, 1};
  const dim3 block     = {BLOCKSIZE, BLOCKSIZE, 1};
  GPU_MM<<< grid, block >>>(d_A, d_B, d_C, N);

  /* a kernel launch returns no status — query the runtime explicitly */
  err = cudaGetLastError();
  if (err != cudaSuccess)
    printf("\n\t kernel launch failed: %s \n", cudaGetErrorString(err));

  /* host-device data synchronization         */
  /* N.B. cudaDeviceSynchronize() is not      */
  /* necessary since cudaMemcpy() is blocking */
  cudaMemcpy(h_C, d_C, (SQUARE(N) * sizeof(MyData)), cudaMemcpyDeviceToHost);

  // free the GPU memory
  cudaFree(d_buffer);

  // check the result
  check(h_C, h_A, N);

  free(h_buffer);

  return 0;
}
Loading