miscellaneous update (ee264684) · Commits · HPC_SCHOOL / HPC_SCHOOL_2024

cuda-omp/cuda/7/mat_mult_block.cu

+2 −2

Original line number	Diff line number	Diff line
		@@ -257,9 +257,9 @@ int main()
		const dim3 block = {(BLOCK * BLOCK), 1, 1};

		/////////////////////////// GPU naive block algorithm ////////////////////////////////////////
		time = 0.0;
		GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
		cudaDeviceSynchronize();
		time = 0.0;
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		@@ -274,9 +274,9 @@ int main()
		/////////////////////////////////////////////////////////////////////////////////////

		/////////////////////////// GPU block shared memory algorithm ///////////////////////
		time = 0.0;
		GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
		cudaDeviceSynchronize();
		time = 0.0;
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();

cuda-omp/hybrid/hybrid_cublas_omp.c

0 → 100644

+282 −0

Original line number	Diff line number	Diff line
		////////////////////////////////////////////////////////////////////////////////////////////////////
		//
		// Using CUDA data in OpenMP
		//
		// Author: David Goz
		// mail : david.goz@inaf.it
		// date : 03.09.2024
		// code tested using nvhpc
		//
		// - Compile the code:
		// - using nvc
		// $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
		// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
		// - using clang
		// $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
		// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
		//
		// - Run the code:
		// $ export OMP_TARGET_OFFLOAD=mandatory
		// $ ./hybrid_cuda_omp
		////////////////////////////////////////////////////////////////////////////////////////////////////

		#include <cuda_runtime.h>
		#include <cublas_v2.h>
		#include <assert.h>
		#include <float.h>
		#include <math.h>
		#include <stdio.h>
		#include <stdlib.h>
		#include <time.h>
		#include <omp.h>

		#define N 2048
		#define SIZE ((N) * (N))
		#define ALPHA 1.0
		#define BETA 0.0
		#define HOST 0
		#define DEV 1
		#define LOOP 10
		#define INIT 0
		#define KERNEL 1
		#define DATA 2

		typedef double MyData;

		static double _time[2][3];
		static int thr[2];

		double process_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		double thread_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		void InitHost(MyData *const restrict A,
		MyData *const restrict B,
		MyData *const restrict C,
		int *const restrict thr)
		{
		double start;

		#pragma omp parallel
		{
		#pragma omp barrier
		#pragma omp master
		{
		*thr = omp_get_num_threads();
		start = thread_time();
		}
		#pragma omp barrier

		#pragma omp for collapse(2)
		for (int i=0 ; i<N ; i++)
		for (int j=0 ; j<N ; j++)
		{
		A[(i * N) + j] = 1.0;
		B[(i * N) + j] = 2.0;
		C[(i * N) + j] = 0.0;
		}

		#pragma omp master
		{
		_time[HOST][INIT] += (thread_time() - start);
		}
		} // omp parallel

		return;
		}

		void InitDev(MyData *const restrict A,
		MyData *const restrict B,
		MyData *const restrict C)
		{
		const double start = thread_time();

		#pragma omp target teams loop collapse(2)
		for (int i=0 ; i<N ; i++)
		for (int j=0 ; j<N ; j++)
		{
		A[(i * N) + j] = 1.0;
		B[(i * N) + j] = 2.0;
		C[(i * N) + j] = 0.0;
		}

		_time[DEV][INIT] += (thread_time() - start);

		return;
		}

		void HostDgemm(MyData *const restrict A,
		MyData *const restrict B,
		MyData *const restrict C,
		const MyData alpha,
		const MyData beta,
		int *const restrict thr)
		{
		// C = alpha * A * B + beta * C;

		double start;

		// naive calculation
		#pragma omp parallel
		{
		#pragma omp barrier
		#pragma omp master
		{
		*thr = omp_get_num_threads();
		start = thread_time();
		}
		#pragma omp barrier

		#pragma omp for collapse(2)
		for (int i=0 ; i<N ; i++)
		for (int j=0 ; j<N ; j++)
		{
		MyData sum = 0.0;
		for (int k=0 ; k<N ; k++)
		sum += A[(i * N) + k] * B[(k * N) + j];

		C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
		}

		#pragma omp master
		{
		_time[HOST][KERNEL] += (thread_time() - start);
		}
		} // omp parallel

		return;
		}

		void check(MyData *const restrict host_array,
		MyData *const restrict dev_array)
		{
		int flag = 0;
		for (size_t i=0 ; i<SIZE ; i++)
		flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);

		if (!flag)
		printf("\n\t Result OK \n");
		else
		printf("\n\t Result wrong \n");

		return;
		}

		int main()
		{
		// Host allocation
		MyData buffer = (MyData )malloc(4 * SIZE * sizeof(MyData));
		assert(buffer != NULL);
		MyData *const restrict A = buffer;
		MyData *const restrict B = A + SIZE;
		MyData *const restrict C_CPU = B + SIZE;
		MyData *const restrict C_GPU = C_CPU + SIZE;

		// Spawning 2 host threads
		#pragma omp parallel num_threads(2)
		{
		// Evaluate the Dgemm on the host
		#pragma omp single nowait
		{
		// allowing nested parallelism
		omp_set_max_active_levels(2);

		for (int loop=0 ; loop<LOOP ; loop++)
		{
		InitHost(A, B, C_CPU, &thr[0]);
		HostDgemm(A, B, C_CPU, ALPHA, BETA, &thr[1]);
		}
		} // omp single

		#pragma omp single nowait
		{
		// Initialize cuBLAS library
		cublasHandle_t handle;
		cublasCreate(&handle);

		// Allocate A, B, C on the device using CUDA API
		MyData *d_buffer = NULL;
		cudaMalloc((void *)&d_buffer, (3 SIZE * sizeof(MyData)));
		assert(d_buffer != NULL);
		MyData *const restrict d_A = d_buffer;
		MyData *const restrict d_B = d_A + SIZE;
		MyData *const restrict d_C = d_B + SIZE;

		// get the default device
		const int dev = omp_get_default_device();

		// Associate external device pointers
		omp_target_associate_ptr(A, d_A, (SIZE * sizeof(MyData)), 0, dev);
		omp_target_associate_ptr(B, d_B, (SIZE * sizeof(MyData)), 0, dev);
		omp_target_associate_ptr(C_GPU, d_C, (SIZE * sizeof(MyData)), 0, dev);

		for (int loop=0 ; loop<LOOP ; loop++)
		{
		// Init device with blocking omp target directive
		InitDev(A, B, C_GPU);

		// Apply DGEMM using device pointers directly
		const MyData alpha = ALPHA;
		const MyData beta = BETA;

		double start = thread_time();
		cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
		&alpha,
		d_A, N,
		d_B, N,
		&beta,
		d_C, N);

		// CUDA synchronization point
		cudaDeviceSynchronize();
		_time[DEV][KERNEL] += (thread_time() - start);

		// Fetch data from the device and deallocate
		start = thread_time();
		cudaMemcpy(C_GPU, d_C, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
		_time[DEV][DATA] += (thread_time() - start);
		} // LOOP

		// release pointer association
		omp_target_disassociate_ptr(A, dev);
		omp_target_disassociate_ptr(B, dev);
		omp_target_disassociate_ptr(C_GPU, dev);

		// deallocate device's memory
		cudaFree(d_buffer);

		cublasDestroy(handle);
		} // omp single
		} // synchronization point

		check(C_CPU, C_GPU);

		free(buffer);

		printf("\n\t Matrix size: %d x %d\n", N, N);

		printf("\n\t Host execution time:");
		printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
		printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);

		printf("\n\t Device execution time:");
		printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP);
		printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP);
		printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);

		return 0;
		}

cuda-omp/hybrid/hybrid_cuda_omp.c→cuda-omp/hybrid/hybrid_omp_cublas.c

+268 −0

Original line number	Diff line number	Diff line
		////////////////////////////////////////////////////////////////////////////////////////////////////
		//
		// Passing OpenMP data to cuBlas.
		// Passing OpenMP data to foreign runtime (cuBLAS library).
		//
		// Author: David Goz
		// mail : david.goz@inaf.it
		@@ -8,11 +8,16 @@
		// code tested using nvhpc
		//
		// - Compile the code:
		// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
		// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
		// - using nvc
		// $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
		// hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas
		// - using clang
		// $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
		// hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas
		//
		// - Run the code:
		// $ export OMP_TARGET_OFFLOAD=mandatory
		// $ ./hybrid_cuda_omp
		// $ ./hybrid_omp_cuda
		////////////////////////////////////////////////////////////////////////////////////////////////////

		#include <cuda_runtime.h>
		@@ -22,19 +27,61 @@
		#include <math.h>
		#include <stdio.h>
		#include <stdlib.h>
		#include <time.h>
		#include <omp.h>

		#define N 512
		#define N 2048
		#define SIZE ((N) * (N))
		#define ALPHA 1.0
		#define BETA 0.0
		#define HOST 0
		#define DEV 1
		#define LOOP 10
		#define INIT 0
		#define KERNEL 1
		#define DATA 2

		typedef double MyData;

		static double _time[2][3];
		static int thr[2];

		double process_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		double thread_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		void InitHost(MyData *const restrict A,
		MyData *const restrict B,
		MyData *const restrict C)
		MyData *const restrict C,
		int *const restrict thr)
		{
		double start;

		#pragma omp parallel
		{
		#pragma omp barrier
		#pragma omp master
		{
		//#pragma omp parallel for collapse(2)
		*thr = omp_get_num_threads();
		start = thread_time();
		}
		#pragma omp barrier

		#pragma omp for collapse(2)
		for (int i=0 ; i<N ; i++)
		for (int j=0 ; j<N ; j++)
		{
		@@ -42,12 +89,22 @@ void InitHost(MyData *const restrict A,
		B[(i * N) + j] = 2.0;
		C[(i * N) + j] = 0.0;
		}

		#pragma omp master
		{
		_time[HOST][INIT] += (thread_time() - start);
		}
		} // omp parallel

		return;
		}

		void InitDev(MyData *const restrict A,
		MyData *const restrict B,
		MyData *const restrict C)
		{
		const double start = thread_time();

		#pragma omp target teams loop collapse(2)
		for (int i=0 ; i<N ; i++)
		for (int j=0 ; j<N ; j++)
		@@ -57,6 +114,8 @@ void InitDev(MyData *const restrict A,
		C[(i * N) + j] = 0.0;
		}

		_time[DEV][INIT] += (thread_time() - start);

		return;
		}

		@@ -64,12 +123,25 @@ void HostDgemm(MyData *const restrict A,
		MyData *const restrict B,
		MyData *const restrict C,
		const MyData alpha,
		const MyData beta)
		const MyData beta,
		int *const restrict thr)
		{
		// C = alpha * A * B + beta * C;

		double start;

		// naive calculation
		// #pragma omp parallel for collapse(2)
		#pragma omp parallel
		{
		#pragma omp barrier
		#pragma omp master
		{
		*thr = omp_get_num_threads();
		start = thread_time();
		}
		#pragma omp barrier

		#pragma omp for collapse(2)
		for (int i=0 ; i<N ; i++)
		for (int j=0 ; j<N ; j++)
		{
		@@ -80,6 +152,12 @@ void HostDgemm(MyData *const restrict A,
		C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
		}

		#pragma omp master
		{
		_time[HOST][KERNEL] += (thread_time() - start);
		}
		} // omp parallel

		return;
		}

		@@ -91,9 +169,9 @@ void check(MyData *const restrict host_array,
		flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);

		if (!flag)
		printf("\n\t Result OK");
		printf("\n\t Result OK \n");
		else
		printf("\n\t Result wrong");
		printf("\n\t Result wrong \n");

		return;
		}
		@@ -114,8 +192,14 @@ int main()
		// Evaluate the Dgemm on the host
		#pragma omp single nowait
		{
		InitHost(A, B, CC);
		HostDgemm(A, B, CC, ALPHA, BETA);
		// allowing nested parallelism
		omp_set_max_active_levels(2);

		for (int loop=0 ; loop<LOOP ; loop++)
		{
		InitHost(A, B, CC, &thr[0]);
		HostDgemm(A, B, CC, ALPHA, BETA, &thr[1]);
		}
		} // omp single

		#pragma omp single nowait
		@@ -127,25 +211,39 @@ int main()
		// Allocate A, B, C on the device
		#pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE])

		for (int loop=0 ; loop<LOOP ; loop++)
		{
		// Init device with blocking omp target directive
		InitDev(A, B, C);

		// Define a target data region where A, B, and C pointers
		// refer to device's address space
		#pragma omp target data use_device_addr(A, B, C)
		#pragma omp target data use_device_ptr(A, B, C)
		{
		MyData const alpha = ALPHA;
		MyData const beta = BETA;
		const MyData alpha = ALPHA;
		const MyData beta = BETA;

		const double start = thread_time();
		cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
		&alpha, A, N, B, N, &beta, C, N);
		&alpha,
		A, N,
		B, N,
		&beta,
		C, N);

		// CUDA synchronization point
		cudaDeviceSynchronize();
		_time[DEV][KERNEL] += (thread_time() - start);
		}

		// Fetch data from the device and deallocate
		#pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE])
		const double start = thread_time();
		#pragma omp target update from (C[0:SIZE])
		_time[DEV][DATA] += (thread_time() - start);
		} // LOOP

		// deallocate device's memory
		#pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C[0:SIZE])

		cublasDestroy(handle);
		} // omp single
		@@ -155,5 +253,16 @@ int main()

		free(buffer);

		printf("\n\t Matrix size: %d x %d\n", N, N);

		printf("\n\t Host execution time:");
		printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
		printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);

		printf("\n\t Device execution time:");
		printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP);
		printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP);
		printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);

		return 0;
		}

cuda-omp/hybrid/hybrid_omp_cuda.cu

0 → 100644

+281 −0

Original line number	Diff line number	Diff line
		////////////////////////////////////////////////////////////////////////////////////////////////////
		//
		// Passing OpenMP data to foreign runtime (cuBLAS library).
		//
		// Author: David Goz
		// mail : david.goz@inaf.it
		// date : 03.09.2024
		// code tested using nvhpc
		//
		// - Compile the code:
		// - using nvc
		// $ nvc++ -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
		// hybrid_omp_cuda.cu -o hybrid_omp_cuda -lm && ./hybrid_omp_cuda
		//
		// - using clang
		// $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
		// hybrid_omp_cuda.c -o hybrid_omp_cuda -lm
		//
		// - Run the code:
		// $ export OMP_TARGET_OFFLOAD=mandatory
		// $ ./hybrid_omp_cuda
		////////////////////////////////////////////////////////////////////////////////////////////////////

		#include <cuda.h>
		#include <math.h>
		#include <assert.h>
		#include <float.h>
		#include <stdio.h>
		#include <stdlib.h>
		#include <time.h>
		#include <omp.h>

		#define N 1024
		#define SIZE ((N) * (N))
		#define HOST 0
		#define DEV 1
		#define LOOP 10
		#define INIT 0
		#define KERNEL 1
		#define DATA 2
		#define BLOCKSIZE 1024

		typedef double MyData;

		static double _time[2][3];
		static int thr[2];

		double process_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		double thread_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		void InitHost(MyData *const restrict A,
		MyData *const restrict B,
		MyData *const restrict C,
		int *const restrict thr)
		{
		double start;

		#pragma omp parallel
		{
		#pragma omp barrier
		#pragma omp master
		{
		*thr = omp_get_num_threads();
		start = thread_time();
		}
		#pragma omp barrier

		#pragma omp for collapse(2)
		for (int i=0 ; i<N ; i++)
		for (int j=0 ; j<N ; j++)
		{
		A[(i * N) + j] = 1.0;
		B[(i * N) + j] = (1.0 / M_PI);
		C[(i * N) + j] = 0.0;
		}

		#pragma omp master
		{
		_time[HOST][INIT] += (thread_time() - start);
		}
		} // omp parallel

		return;
		}

		void InitDev(MyData *const restrict A,
		MyData *const restrict B,
		MyData *const restrict C)
		{
		const double start = thread_time();

		#pragma omp target teams loop collapse(2) is_device_ptr(A, B, C)
		for (int i=0 ; i<N ; i++)
		for (int j=0 ; j<N ; j++)
		{
		A[(i * N) + j] = 1.0;
		B[(i * N) + j] = (1.0 / M_PI);
		C[(i * N) + j] = 0.0;
		}

		_time[DEV][INIT] += (thread_time() - start);

		return;
		}

		void HostMM(MyData *const restrict A,
		MyData *const restrict B,
		MyData *const restrict C,
		int *const restrict thr)
		{
		// C = alpha * A * B + beta * C;

		double start;

		// naive calculation
		#pragma omp parallel
		{
		#pragma omp barrier
		#pragma omp master
		{
		*thr = omp_get_num_threads();
		start = thread_time();
		}
		#pragma omp barrier

		#pragma omp for collapse(2)
		for (int i=0 ; i<N ; i++)
		for (int j=0 ; j<N ; j++)
		{
		MyData sum = 0.0;
		for (int k=0 ; k<N ; k++)
		sum += A[(i * N) + k] * B[(k * N) + j];

		C[(i * N) + j] = sum;
		}

		#pragma omp master
		{
		_time[HOST][KERNEL] += (thread_time() - start);
		}
		} // omp parallel

		return;
		}

		__global__ void DevMM(MyData *const restrict A,
		MyData *const restrict B,
		MyData *const restrict C,
		const int n)
		{
		const int size = (n * n);
		const int globalID = threadIdx.x + (blockIdx.x * blockDim.x);

		if (globalID >= size)
		return;

		const int i = (globalID / N);
		const int j = (globalID % N);

		MyData sum = 0.0;
		for (int k=0 ; k<N ; k++)
		sum += (A[(i * N) + k] * B[(k * N) + j]);

		C[(i * N) + j] = sum;

		return;
		}

		void check(MyData *const restrict host_array,
		MyData *const restrict dev_array)
		{
		int flag = 0;
		for (size_t i=0 ; i<SIZE ; i++)
		flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);

		if (!flag)
		printf("\n\t Result OK \n");
		else
		printf("\n\t Result wrong \n");

		return;
		}

		int main()
		{
		// Host allocation
		MyData h_buffer = (MyData )malloc(2 * SIZE * sizeof(MyData));
		assert(h_buffer != NULL);
		MyData *const restrict C_HOST = h_buffer;
		MyData *const restrict C_DEV = C_HOST + SIZE;

		// Spawning 2 host threads
		#pragma omp parallel num_threads(2)
		{
		// Evaluate the Dgemm on the host
		#pragma omp single nowait
		{
		// allowing nested parallelism
		omp_set_max_active_levels(2);

		MyData tmp = (MyData )malloc(2 * SIZE * sizeof(MyData));
		MyData *const restrict A = tmp;
		MyData *const restrict B = A + SIZE;

		for (int loop=0 ; loop<LOOP ; loop++)
		{
		InitHost(A, B, C_HOST, &thr[0]);
		HostMM(A, B, C_HOST, &thr[1]);
		}

		free(tmp);
		} // omp single

		#pragma omp single nowait
		{
		// Device allocation
		const int dev = omp_get_default_device();
		const int host = omp_get_initial_device();
		MyData d_buffer = (MyData )omp_target_alloc((3 * SIZE * sizeof(MyData)), dev);
		assert(d_buffer != NULL);
		MyData *const restrict d_A = d_buffer;
		MyData *const restrict d_B = d_A + SIZE;
		MyData *const restrict d_C = d_B + SIZE;

		const dim3 nblock = {((SIZE + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1};
		const dim3 block = {BLOCKSIZE, 1, 1};

		for (int loop=0 ; loop<LOOP ; loop++)
		{
		// Init device with blocking omp target directive
		InitDev(d_A, d_B, d_C);

		double start = thread_time();
		DevMM<<< nblock, block >>>(d_A, d_B, d_C, N);
		// CUDA synchronization point
		cudaDeviceSynchronize();
		_time[DEV][KERNEL] += (thread_time() - start);

		// Fetch data from the device and deallocate
		start = thread_time();
		omp_target_memcpy(C_DEV, d_C, (SIZE * sizeof(MyData)), 0, 0, host, dev);
		_time[DEV][DATA] += (thread_time() - start);
		} // LOOP

		// deallocate device's memory
		omp_target_free(d_buffer, dev);
		} // omp single
		} // synchronization point

		check(C_HOST, C_DEV);

		free(h_buffer);

		printf("\n\t Matrix size: %d x %d\n", N, N);

		printf("\n\t Host execution time:");
		printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
		printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);

		printf("\n\t Device execution time:");
		printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP);
		printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP);
		printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);

		return 0;
		}

cuda-omp/omp/miscellaneous/host_device_address_space.c

0 → 100644

+76 −0

Original line number	Diff line number	Diff line
		//////////////////////////////////////////////////////////////////////////////////////////////////
		// Author: David Goz
		// mail : david.goz@inaf.it
		// date : 03.09.2024
		// code tested using nvhpc
		//
		// - Compile the code:
		// - nvc complains about the '#pragma omp target has_device_addr'
		// - clang works
		//////////////////////////////////////////////////////////////////////////////////////////////////

		#include <stdio.h>
		#include <stdlib.h>
		#include <string.h>
		#include <assert.h>

		static int answer = 42;
		#define SIZE 3

		int main()
		{
		int A = (int )malloc(SIZE * sizeof(int));
		assert(A != NULL);
		memset(A, 0, (SIZE * sizeof(int)));

		printf("\n\t Host address of 'answer' [value: %d] is: %p", answer, &answer);
		printf("\n\t Host address of 'A' is: %p", A);

		// Map 'answer' to the device
		#pragma omp target data map(alloc: answer, A[0:SIZE])
		{
		#pragma omp target update to(answer, A[0:SIZE])

		// Update 'answer' on the device
		#pragma omp target map(answer)
		{
		answer += 1;
		for (int i=0 ; i<SIZE ; i++)
		A[i] += (i + 1);
		}

		#pragma omp target update from(answer, A[0:SIZE])

		printf("\n\t Address of 'answer' [value: %d] in target data region is: %p",
		answer, &answer);
		printf("\n\t Address of 'A' in target data region is: %p", A);
		printf("\n\t Value of A:");
		for (int i=0 ; i<SIZE ; i++)
		printf("\n\t\t A[%d] = %d", i, A[i]);
		printf("\n");

		#pragma omp target data use_device_addr(answer) use_device_ptr(A)
		{
		#pragma omp target has_device_addr(answer)
		{
		answer += 1;
		for (int i=0 ; i<SIZE ; i++)
		A[i] += (i + 1);
		}

		printf("\n\t Device address of 'answer' is: %p", &answer);
		printf("\n\t Device address of 'A' is: %p", A);
		}

		#pragma omp target update from(answer, A[0:SIZE])
		printf("\n\t Last value of 'answer' is: %d", answer);
		printf("\n\t Last value if 'A' is:");
		for (int i=0 ; i<SIZE ; i++)
		printf("\n\t\t A[%d] = %d", i, A[i]);
		printf("\n\n");
		} // omp target data

		free(A);

		return 0;
		}