Commit 726c5ecb authored by David Goz's avatar David Goz
Browse files

hybrid cuda-omp example added

parent 31d1aa2e
Loading
Loading
Loading
Loading
+159 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Passing OpenMP data to cuBlas.
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 02.09.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
//         hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./hybrid_cuda_omp
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define N     512
#define SIZE  ((N) * (N))
#define ALPHA 1.0
#define BETA  0.0

typedef double MyData;

// Initialize the host matrices: A = 1.0, B = 2.0, C = 0.0.
// The three N x N matrices are stored contiguously, so a single
// flat sweep over all SIZE elements is equivalent to the 2D nest.
void InitHost(MyData *const restrict A,
	      MyData *const restrict B,
	      MyData *const restrict C)
{
  for (int idx=0 ; idx<SIZE ; idx++)
    {
      A[idx] = 1.0;
      B[idx] = 2.0;
      C[idx] = 0.0;
    }
}

// Initialize the device copies of A, B, and C (A = 1.0, B = 2.0,
// C = 0.0). Assumes the arrays are already mapped to the default
// device (see the "target enter data" in main).
void InitDev(MyData *const restrict A,
	     MyData *const restrict B,
	     MyData *const restrict C)
{
  // The matrices are contiguous: one flat parallel sweep over all
  // SIZE elements writes exactly the same values as the original
  // collapsed 2D loop nest.
 #pragma omp target teams loop
  for (int idx=0 ; idx<SIZE ; idx++)
    {
      A[idx] = 1.0;
      B[idx] = 2.0;
      C[idx] = 0.0;
    }

  return;
}

// Reference (naive) matrix-matrix multiply on the host:
//   C = alpha * A * B + beta * C
// for N x N row-major matrices. O(N^3); used only to validate the
// cuBLAS result.
void HostDgemm(MyData *const restrict A,
	       MyData *const restrict B,
	       MyData *const restrict C,
	       const MyData           alpha,
	       const MyData           beta)
{
  for (int row=0 ; row<N ; row++)
    {
      // Hoist the row bases out of the inner loops.
      const MyData *const Arow = A + (row * N);
      MyData *const       Crow = C + (row * N);

      for (int col=0 ; col<N ; col++)
	{
	  MyData acc = 0.0;

	  for (int k=0 ; k<N ; k++)
	    acc += Arow[k] * B[(k * N) + col];

	  Crow[col] = (alpha * acc) + (beta * Crow[col]);
	}
    }

  return;
}

// Compare the host reference against the device result element-wise;
// any pair differing by more than FLT_EPSILON marks the run as wrong.
void check(MyData *const restrict host_array,
	   MyData *const restrict dev_array)
{
  int wrong = 0;

  for (size_t idx=0 ; idx<SIZE ; idx++)
    if (fabs(host_array[idx] - dev_array[idx]) > FLT_EPSILON)
      wrong = 1;

  if (wrong)
    printf("\n\t Result wrong");
  else
    printf("\n\t Result OK");

  return;
}

int main()
{
  // Host allocation: one contiguous buffer carved into four N x N
  // matrices — A, B, C feed the device path, CC holds the host
  // reference result.
  MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
  assert(buffer != NULL);
  MyData *const restrict A  = buffer;
  MyData *const restrict B  = A + SIZE;
  MyData *const restrict C  = B + SIZE;
  MyData *const restrict CC = C + SIZE;

  // Spawning 2 host threads: one computes the reference Dgemm on the
  // host while the other drives the GPU through OpenMP + cuBLAS.
  // The two single regions do not touch the same storage concurrently:
  // the host path writes A, B, CC on the host; the device path writes
  // only the device copies of A, B, C.
  #pragma omp parallel num_threads(2)
  {
    // Evaluate the Dgemm on the host; nowait lets the other thread
    // proceed with the device work in parallel.
    #pragma omp single nowait
    {
      InitHost(A, B, CC);
      HostDgemm(A, B, CC, ALPHA, BETA);
    } // omp single

    #pragma omp single nowait
    {
      // Initialize cuBLAS library.
      // NOTE(review): cublasCreate/cublasDgemm return codes are not
      // checked; a failure would surface only as a wrong result.
      cublasHandle_t handle;
      cublasCreate(&handle);
      
      // Allocate A, B, C on the device ("alloc" maps storage without
      // a host-to-device copy; the device kernel initializes it).
      #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE])

      // Init device with blocking omp target directive
      InitDev(A, B, C);

      // Define a target data region where A, B, and C pointers
      // refer to device's address space, so they can be handed
      // directly to cuBLAS.
      #pragma omp target data use_device_addr(A, B, C)
      {
	MyData const alpha = ALPHA;
	MyData const beta  = BETA;
	
	// cuBLAS is column-major; with the constant-valued matrices
	// produced by InitDev (A = 1, B = 2) the row-/column-major
	// distinction does not change the product.
	cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
		    &alpha, A, N, B, N, &beta, C, N);

	// CUDA synchronization point: cublasDgemm launches
	// asynchronously, so wait before the data region ends.
	cudaDeviceSynchronize();
      }

      // Fetch C from the device and deallocate the device buffers
      #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE])
      
      cublasDestroy(handle);
    } // omp single
  } // synchronization point: implicit barrier of the parallel region

  // Compare host reference (CC) against the cuBLAS result (C).
  check(CC, C);
  
  free(buffer);
  
  return 0;
}
+110 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Splitting the asynchronous vector addition task graph across four devices
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 28.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v asynchronous.c -o asynchronous_omp
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./asynchronous_omp
////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define NDEBUG

// Verify the vector fetched back from device 3: every element of C
// must equal 98 (i.e. 99 + (i + (-i) - 1), see the task graph in main).
void check(const MyData *const C,
	   const size_t        size)
{
  int wrong = 0;

  for (size_t idx=0 ; idx<size ; idx++)
    if (C[idx] != 98)
      wrong = 1;

  if (wrong)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");

  return;
}

int main()
{
  // The task graph below hard-codes one device per task.
  const int NumDev = omp_get_num_devices();

  if (NumDev != 4)
    {
      printf("\n\t The program runs using 4 GPUs... aborting...\n");
      exit(EXIT_FAILURE);
    }

  // One host buffer carved into the three operand vectors.
  const int size = 1000000;
  MyData *buffer = (MyData *)malloc(3 * size * sizeof(MyData));
  assert(buffer != NULL);
  MyData *const restrict A = buffer;
  MyData *const restrict B = A + size;
  MyData *const restrict C = B + size;

  // init A on GPU 0 (asynchronous; completion published via depend(out))
  #pragma omp target nowait                 \
                     map(tofrom: A[0:size]) \
                     depend(out: A[0:size]) \
                     device(0)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      A[i] = (MyData)i;
  } // device 0
  
  // init B on GPU 1
  #pragma omp target nowait                 \
                     map(tofrom: B[0:size]) \
                     depend(out: B[0:size]) \
                     device(1)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      B[i] = (MyData)-i;
  } // device 1

  // init C on GPU 2
  #pragma omp target nowait                 \
                     map(tofrom: C[0:size]) \
                     depend(out: C[0:size]) \
                     device(2)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      C[i] = 99;
  } // device 2

  // perform the calculation on GPU 3 once A, B, C are all ready;
  // expected result: C[i] = 99 + (i + (-i) - 1) = 98
  #pragma omp target nowait                                                    \
                     map(to: A[0:size], B[0:size]) map(tofrom: C[0:size])      \
                     depend(in: A[0:size], B[0:size]) depend(inout: C[0:size]) \
                     device(3)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      C[i] += (A[i] + B[i] - 1);
  } // device 3

  // host task: runs only after the GPU-3 task has copied C back
  #pragma omp task depend(in: C[0:size])
  check(C, size);

  // BUG FIX: synchronize with every outstanding task before releasing
  // the buffer. Without this taskwait, free(buffer) could execute while
  // the nowait target tasks were still mapping data back into "buffer"
  // (or while the checking task was still reading C) — a use-after-free.
  #pragma omp taskwait

  free(buffer);
  
  return 0;
}
+95 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Task dependencies:
// example of synchronization by defining explicit dependencies between the tasks.
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 28.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v dependencies.c -o dependencies_omp
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./dependencies_omp
////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define NDEBUG

// Verify the result copied back from the device: every element of C
// must be zero (the device kernel computes A[i] + B[i] = i + (-i)).
void check(const MyData *const C,
	   const size_t        size)
{
  int wrong = 0;

  for (size_t idx=0 ; idx<size ; idx++)
    if (C[idx] != 0)
      wrong = 1;

  if (wrong)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");

  return;
}

int main()
{
  const int size = 1000000;

  // Host copy of the result vector.
  MyData *C = (MyData *)malloc(size * sizeof(MyData));
  assert(C != NULL);

  // Allocate A and B directly on the default device — they exist only
  // in device memory and are addressed via is_device_ptr below.
  const int dev_gpu = omp_get_default_device();
  MyData *gpu_buffer = (MyData *)omp_target_alloc(2 * size * sizeof(MyData), dev_gpu);
  assert(gpu_buffer != NULL);
  MyData *A = gpu_buffer;
  MyData *B = A + size;

  // init C with random numbers; the device later overwrites every
  // element with A[i] + B[i] = i + (-i) = 0
  for (int i=0 ; i<size ; i++)
    C[i] = rand() % size;
  
  #pragma omp target enter data map(to: C[0:size])
  
  // init A (asynchronous; completion published through depend(out))
  #pragma omp target nowait depend(out: A[0:size]) is_device_ptr(A)
  {
    #pragma omp loop
    for (int i=0 ; i<size; i++)
      A[i] = i;
  }

  // init B
  #pragma omp target nowait depend(out: B[0:size]) is_device_ptr(B)
  {
    #pragma omp loop
    for (int i=0 ; i<size; i++)
      B[i] = -i;
  }
  
  // vector add, ordered after both initializations
 #pragma omp target nowait depend(in: A[0:size], B[0:size]) depend(out: C[0:size]) is_device_ptr(A, B)
  {
    #pragma omp loop
    for (int i=0 ; i<size; i++)
      C[i] = A[i] + B[i];
  }
  
  // blocking update: waits on the add task, then copies C to the host
 #pragma omp target update from (C[0:size]) depend(in: C[0:size])

  check(C, size);

  // BUG FIX: release the device mapping of C while the host pointer is
  // still valid. The original code called free(C) first and issued the
  // "exit data" afterwards, i.e. on a dangling pointer.
  #pragma omp target exit data map(release: C[0:size])

  omp_target_free(gpu_buffer, dev_gpu);
  free(C);
  
  return 0;
}
+50 −36
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
// 
// Offload to multiple devices within the same node:
// - using one OMP host-thread per device synchronously;
// - using one OMP host-thread asynchronously.
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 27.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v multiple_devices.c -o multiple_devices_omp
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./multiple_devices_omp
////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
@@ -15,30 +34,10 @@ typedef int MyData;
#error "N_PER_DEV < BLOCKSIZE"
#endif

#define NDEBUG

void check(const MyData *const restrict vector_cpu,
	   const MyData *const restrict vector_gpu,
	   const size_t                 size)
{
  int flag = 0;
  for (size_t i=0 ; i<size ; i++)
{
#if !defined(NDEBUG)
    printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
           i, vector_cpu[i], i, vector_gpu[i]);
#endif

    flag = ((vector_cpu[i] != vector_gpu[i]) ? 1 : flag);
}

  if (flag)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");
#define TRUE  1
#define FALSE 0

  return;
}
#define NDEBUG

void VectorAdd(const MyData *const restrict A,
	       const MyData *const restrict B,
@@ -46,7 +45,8 @@ void VectorAdd(const MyData *const restrict A,
	       const int                    offset,
	       const int                    size,
               const int                    dev,
               const int                    nblocks)
               const int                    nblocks,
	       const int                    async)
{
#pragma omp target							\
  teams num_teams(nblocks) thread_limit(BLOCKSIZE)			\
@@ -84,6 +84,8 @@ int main()
  // get the number of the available devices
  const int NumDev = omp_get_num_devices();

  const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);
  
  // global vector size
  const int size = (NumDev * N_PER_DEV);
  assert(size > 0);
@@ -103,6 +105,7 @@ int main()
      C_CPU[i] = A[i] + B[i];
    }

  // each device is managed by a single OMP thread
  #pragma omp parallel num_threads(NumDev)
  {
    // check
@@ -119,11 +122,22 @@ int main()
    
    const int tid    = omp_get_thread_num();
    const int offset = (tid * N_PER_DEV);
    const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);
    
    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks, FALSE);
  } // omp parallel

  check(C_CPU, C_GPU, size);
  memset(C_GPU, 0, (size * sizeof(MyData)));
  
  // one OMP thread manages asynchronously all the devices
  for (int dev=0 ; dev<NumDev ; dev++)
    {
      const int offset = (dev * N_PER_DEV);
      VectorAdd(A, B, C_GPU, offset, N_PER_DEV, dev, nblocks, TRUE);
    }

  // host-devices synchronization
  #pragma omp taskwait
  check(C_CPU, C_GPU, size);
  
  free(buffer);
+83 −0
Original line number Diff line number Diff line
////////////////////////////////////////////////////////////////////////////////////////////////////
// 
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 28.08.2024
// code tested using nvhpc
//
// The code does not compile with nvc
// The code compiles with clang

////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;
#define N_PER_DEV   1000000

#define NDEBUG

int main()
{
  // get the number of the available devices
  const int NumDev = omp_get_num_devices();

  // global vector size: one chunk of N_PER_DEV elements per device
  const int size = (NumDev * N_PER_DEV);
  assert(size > 0);

  MyData *buffer = (MyData *)malloc(2 * size * sizeof(MyData));
  assert(buffer != NULL);
  MyData *const restrict A = buffer;
  MyData *const restrict B = A + size;
  MyData sum_cpu = (MyData)0;

  // BUG FIX: this loop used to be
  //   #pragma omp parallel for simd reduction(+: sum_cpu)
  // while calling rand(). rand() is not required to be thread-safe, so
  // concurrent calls are a data race (undefined behavior) and make the
  // reference data and sum nondeterministic. The initialization is
  // therefore kept serial; the reference sum stays exact.
  for (int i=0 ; i<size ; i++)
    {
      A[i] = rand() % N_PER_DEV;
      B[i] = rand() % N_PER_DEV;
      sum_cpu += A[i] + B[i];
    }

  MyData sum_gpu = (MyData)0;

  // One host thread per device; the task reduction combines the
  // per-device partial sums (produced via in_reduction) into sum_gpu.
#pragma omp parallel num_threads(NumDev) reduction(task, +:sum_gpu)
  {
   #pragma omp single
   {
     if (NumDev != omp_get_num_threads())
       exit(EXIT_FAILURE);
     else
     {
       printf("\n\t Using %d GPUs \n", NumDev);
       fflush(stdout);
      }
    } // implicit barrier

    // Each thread offloads its own chunk [offset, offset + N_PER_DEV)
    // to its own device.
    const int tid    = omp_get_thread_num();
    const int offset = (tid * N_PER_DEV);
    
   #pragma omp target   					 \
	       map(to: A[offset:N_PER_DEV], B[offset:N_PER_DEV]) \
               device(tid)                                       \
               in_reduction(+: sum_gpu)
    #pragma omp loop reduction(+: sum_gpu)
    for (int i=offset ; i<(offset + N_PER_DEV) ; i++)
      sum_gpu += (A[i] + B[i]);
  } // omp parallel

  if (sum_cpu == sum_gpu)
    printf("\n\t Result OK \n");
  else
    printf("\n\t Result wrong \n");
  
  free(buffer);
  
  return 0;
}