#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define N_PER_DEV 1000000
#define BLOCKSIZE 256

#if (BLOCKSIZE < 32) || (BLOCKSIZE > 1024)
#error "32 <= BLOCKSIZE <= 1024"
#endif

#if (N_PER_DEV < BLOCKSIZE)
#error "N_PER_DEV < BLOCKSIZE"
#endif

/* Defined here (after <assert.h>) so the assert() calls below stay active
   while the verbose debug printfs guarded by !defined(NDEBUG) are compiled
   out. NOTE(review): NDEBUG is the standard assert-disabling macro — if the
   include order ever changes, every assert vanishes silently. A
   project-specific name (e.g. VERBOSE_DEBUG) would be safer. */
#define NDEBUG

/* Compare the CPU reference vector against the GPU result element-wise
   and print a one-line verdict.
   vector_cpu / vector_gpu: arrays of 'size' elements; must not alias. */
void check(const MyData *const restrict vector_cpu,
           const MyData *const restrict vector_gpu,
           const size_t size)
{
  int flag = 0;
  for (size_t i = 0; i < size; i++)
    {
#if !defined(NDEBUG)
      printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
             i, vector_cpu[i], i, vector_gpu[i]);
#endif
      flag = ((vector_cpu[i] != vector_gpu[i]) ? 1 : flag);
    }

  if (flag)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");

  return;
}

/* Offload C[offset : offset+size) = A + B to device 'dev', launching
   'nblocks' teams of up to BLOCKSIZE threads; each team covers one
   BLOCKSIZE-wide slice of the segment. */
void VectorAdd(const MyData *const restrict A,
               const MyData *const restrict B,
               MyData       *const restrict C,
               const int offset,
               const int size,
               const int dev,
               const int nblocks)
{
#pragma omp target                                                  \
  teams num_teams(nblocks) thread_limit(BLOCKSIZE)                  \
  map(to: A[offset:size], B[offset:size]) map(from: C[offset:size]) \
  device(dev)
  {
    const int team             = omp_get_team_num();
    const int team_start_index = (team * BLOCKSIZE) + offset;
    /* BUG FIX: nblocks = ceil(size / BLOCKSIZE), so when size is not a
       multiple of BLOCKSIZE the last team's nominal end
       (team_start_index + BLOCKSIZE) runs past the mapped segment —
       an out-of-bounds write. Clamp to the segment end. */
    const int segment_end      = offset + size;
    const int nominal_end      = team_start_index + BLOCKSIZE;
    const int team_end_index   = (nominal_end < segment_end) ? nominal_end
                                                             : segment_end;

#pragma omp parallel num_threads(BLOCKSIZE)
    {
      const int localID  = omp_get_thread_num();
      const int block    = omp_get_num_threads(); /* actual team width */
      const int globalID = team_start_index + localID;

      /* Stride by the team's actual thread count in case fewer than
         BLOCKSIZE threads were granted. */
      for (int index = globalID; index < team_end_index; index += block)
        C[index] = A[index] + B[index];

#if !defined(NDEBUG)
      if ((localID == 0) && (team == 0))
        printf("\n\t Device: %d - Teams: %d [requested: %d] - Thread per team: %d [requested: %d]",
               dev, omp_get_num_teams(), nblocks, block, BLOCKSIZE);
#endif
    } // omp parallel
  } // omp target

  return;
}

int main()
{
  /* One host thread will drive each available offload device. */
  const int NumDev = omp_get_num_devices();

  /* Global vector size: N_PER_DEV elements per device. */
  const int size = (NumDev * N_PER_DEV);
  assert(size > 0);

  /* Single allocation partitioned into A, B, C_CPU, C_GPU. */
  MyData *buffer = malloc(4 * (size_t)size * sizeof *buffer);
  assert(buffer != NULL);

  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + size;
  MyData *const restrict C_CPU = B + size;
  MyData *const restrict C_GPU = C_CPU + size;

  /* BUG FIX: rand() keeps hidden global state and is not required to be
     thread-safe — calling it from '#pragma omp parallel for simd' is a
     data race (and non-deterministic). Initialize serially; the host
     reference result C_CPU is computed in the same pass. */
  for (int i = 0; i < size; i++)
    {
      A[i]     = rand() % N_PER_DEV;
      B[i]     = rand() % N_PER_DEV;
      C_CPU[i] = A[i] + B[i];
    }

#pragma omp parallel num_threads(NumDev)
  {
    /* Verify we actually obtained one host thread per device. */
#pragma omp single
    {
      if (NumDev != omp_get_num_threads())
        exit(EXIT_FAILURE);
      else
        {
          printf("\n\t Using %d GPUs \n", NumDev);
          fflush(stdout);
        }
    } // implicit barrier

    const int tid     = omp_get_thread_num();
    const int offset  = (tid * N_PER_DEV);       /* this device's segment */
    const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);

    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
  } // omp parallel

  check(C_CPU, C_GPU, size);

  free(buffer);

  return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define N_PER_DEV 1000000
#define BLOCKSIZE 256

#if (BLOCKSIZE < 32) || (BLOCKSIZE > 1024)
#error "32 <= BLOCKSIZE <= 1024"
#endif

#if (N_PER_DEV < BLOCKSIZE)
#error "N_PER_DEV < BLOCKSIZE"
#endif

/* Defined here (after <assert.h>) so the assert() calls below stay active
   while the verbose debug printfs guarded by !defined(NDEBUG) are compiled
   out. NOTE(review): NDEBUG is the standard assert-disabling macro — if the
   include order ever changes, every assert vanishes silently. A
   project-specific name (e.g. VERBOSE_DEBUG) would be safer. */
#define NDEBUG

/* Compare the CPU reference vector against the GPU result element-wise
   and print a one-line verdict.
   vector_cpu / vector_gpu: arrays of 'size' elements; must not alias. */
void check(const MyData *const restrict vector_cpu,
           const MyData *const restrict vector_gpu,
           const size_t size)
{
  int flag = 0;
  for (size_t i = 0; i < size; i++)
    {
#if !defined(NDEBUG)
      printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
             i, vector_cpu[i], i, vector_gpu[i]);
#endif
      flag = ((vector_cpu[i] != vector_gpu[i]) ? 1 : flag);
    }

  if (flag)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");

  return;
}

/* Offload C[offset : offset+size) = A + B to device 'dev', launching
   'nblocks' teams of up to BLOCKSIZE threads; each team covers one
   BLOCKSIZE-wide slice of the segment. */
void VectorAdd(const MyData *const restrict A,
               const MyData *const restrict B,
               MyData       *const restrict C,
               const int offset,
               const int size,
               const int dev,
               const int nblocks)
{
#pragma omp target                                                  \
  teams num_teams(nblocks) thread_limit(BLOCKSIZE)                  \
  map(to: A[offset:size], B[offset:size]) map(from: C[offset:size]) \
  device(dev)
  {
    const int team             = omp_get_team_num();
    const int team_start_index = (team * BLOCKSIZE) + offset;
    /* BUG FIX: nblocks = ceil(size / BLOCKSIZE), so when size is not a
       multiple of BLOCKSIZE the last team's nominal end
       (team_start_index + BLOCKSIZE) runs past the mapped segment —
       an out-of-bounds write. Clamp to the segment end. */
    const int segment_end      = offset + size;
    const int nominal_end      = team_start_index + BLOCKSIZE;
    const int team_end_index   = (nominal_end < segment_end) ? nominal_end
                                                             : segment_end;

#pragma omp parallel num_threads(BLOCKSIZE)
    {
      const int localID  = omp_get_thread_num();
      const int block    = omp_get_num_threads(); /* actual team width */
      const int globalID = team_start_index + localID;

      /* Stride by the team's actual thread count in case fewer than
         BLOCKSIZE threads were granted. */
      for (int index = globalID; index < team_end_index; index += block)
        C[index] = A[index] + B[index];

#if !defined(NDEBUG)
      if ((localID == 0) && (team == 0))
        printf("\n\t Device: %d - Teams: %d [requested: %d] - Thread per team: %d [requested: %d]",
               dev, omp_get_num_teams(), nblocks, block, BLOCKSIZE);
#endif
    } // omp parallel
  } // omp target

  return;
}

int main()
{
  /* One host thread will drive each available offload device. */
  const int NumDev = omp_get_num_devices();

  /* Global vector size: N_PER_DEV elements per device. */
  const int size = (NumDev * N_PER_DEV);
  assert(size > 0);

  /* Single allocation partitioned into A, B, C_CPU, C_GPU. */
  MyData *buffer = malloc(4 * (size_t)size * sizeof *buffer);
  assert(buffer != NULL);

  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + size;
  MyData *const restrict C_CPU = B + size;
  MyData *const restrict C_GPU = C_CPU + size;

  /* BUG FIX: rand() keeps hidden global state and is not required to be
     thread-safe — calling it from '#pragma omp parallel for simd' is a
     data race (and non-deterministic). Initialize serially; the host
     reference result C_CPU is computed in the same pass. */
  for (int i = 0; i < size; i++)
    {
      A[i]     = rand() % N_PER_DEV;
      B[i]     = rand() % N_PER_DEV;
      C_CPU[i] = A[i] + B[i];
    }

#pragma omp parallel num_threads(NumDev)
  {
    /* Verify we actually obtained one host thread per device. */
#pragma omp single
    {
      if (NumDev != omp_get_num_threads())
        exit(EXIT_FAILURE);
      else
        {
          printf("\n\t Using %d GPUs \n", NumDev);
          fflush(stdout);
        }
    } // implicit barrier

    const int tid     = omp_get_thread_num();
    const int offset  = (tid * N_PER_DEV);       /* this device's segment */
    const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);

    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
  } // omp parallel

  check(C_CPU, C_GPU, size);

  free(buffer);

  return 0;
}