memory management and unified memory (aa704d3b) · Commits · HPC_SCHOOL / HPC_SCHOOL_2024

cuda-omp/omp/miscellaneous/memory_management.c

0 → 100644

+420 −0

Original line number	Diff line number	Diff line
		////////////////////////////////////////////////////////////////////////////////////////////////
		// - Block matrix multiplication algorithm
		//
		// const size_t Nblocks = (N / Bsize);
		//
		// // loop over blocks of matrix C
		// for (size_t ib=0 ; ib<Nblocks ; ib++)
		// {
		// for (size_t jb=0 ; jb<Nblocks ; jb++)
		// {
		//
		// // loop over blocks of rows of A
		// for (size_t kb=0 ; kb<Nblocks ; kb++)
		// {
		//
		// // Matrix multiplication within a block
		// for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
		// for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
		// for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
		// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
		//
		// } // kb
		// } // jb
		// } // ib
		//
		// - Exploit GPU shared-memory.
		// - Use explicit target memory allocator and memory management OpenMP APIs
		////////////////////////////////////////////////////////////////////////////////////////////////

		//////////////////////////////////////////////////////////////////////////////////////////////////
		// Author: David Goz
		// mail : david.goz@inaf.it
		// date : 26.08.2024
		// code tested using nvhpc
		//
		// - Compile the code:
		// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v memory_management.c -o memory_management_omp
		// - Run the code:
		// $ export OMP_TARGET_OFFLOAD=MANDATORY
		// $ ./memory_management_omp
		//
		// N.B. team size in the range [32, 1024)
		//////////////////////////////////////////////////////////////////////////////////////////////////

		#include <stdio.h>
		#include <stdlib.h>
		#include <unistd.h>
		#include <time.h>
		#include <assert.h>
		#include <omp.h>
		#include <string.h>
		#include <math.h>
		#include <float.h>

		#define N 1024
		#define SIZE (N * N) // matrix size
		typedef double MyData; // do not change
		#define _BLOCK_ 16
		#define BLOCKSIZE ((_BLOCK_) * (_BLOCK_)) /* 32 <= BLOCKSIZE < 1024 */

		// sanity check
		#if _BLOCK_*_BLOCK_ > 1024
		#error _BLOCK_*_BLOCK_ cannot larger than 1024
		#endif

		#if _BLOCK_ > N
		#error _BLOCK_ > N
		#endif

		#define LOOP 100
		#define NDEBUG

		double wall_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		void CPU_mat_mult_block(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		const size_t Nblocks = (size / _BLOCK_);

		// loop over blocks of matrix C
		for (size_t ib=0 ; ib<Nblocks ; ib++)
		{
		for (size_t jb=0 ; jb<Nblocks ; jb++)
		{
		// loop over blocks of rows of A
		for (size_t kb=0 ; kb<Nblocks ; kb++)
		{
		// Matrix multiplication within a block
		for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
		for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
		for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
		C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
		} // kb
		} // jb
		} // ib

		return;
		}

		void GPU_mat_mult_block(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		const size_t Nblocks = (size / _BLOCK_);

		#pragma omp target \
		teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
		thread_limit(BLOCKSIZE) \
		is_device_ptr(A, B, C)
		for (size_t ib=0 ; ib<Nblocks ; ib++)
		{
		for (size_t jb=0 ; jb<Nblocks ; jb++)
		{
		for (size_t kb=0 ; kb<Nblocks ; kb++)
		{
		#pragma omp parallel for collapse(2) num_threads(BLOCKSIZE)
		for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
		{
		for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
		{
		MyData value = ((kb == 0) ? (MyData)0 : C[(i * size) + j]);
		for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
		value += A[(i * size) + k] * B[(k * size) + j];

		C[(i * size) + j] = value;
		} // j
		} // i
		} // kb
		} // jb
		} // ib

		return;
		}

		void GPU_mat_mult_block_shared(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		const size_t Nblocks = (size / _BLOCK_);
		// Team local copies of tiles
		MyData Ablock[BLOCKSIZE];
		MyData Bblock[BLOCKSIZE];
		MyData Cblock[BLOCKSIZE];

		#pragma omp target \
		teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
		private(Ablock, Bblock, Cblock) \
		thread_limit(BLOCKSIZE) \
		is_device_ptr(A, B, C)
		for (size_t ib=0 ; ib<Nblocks ; ib++)
		{
		for (size_t jb=0 ; jb<Nblocks ; jb++)
		{
		for (size_t kb=0 ; kb<Nblocks ; kb++)
		{
		#pragma omp parallel num_threads(BLOCKSIZE)
		{
		if (kb == 0) // Cblock initialization
		{
		#pragma omp for collapse(2) nowait
		for (size_t i=0 ; i<_BLOCK_ ; i++)
		for (size_t j=0 ; j<_BLOCK_ ; j++)
		Cblock[(i * _BLOCK_) + j] = (MyData)0;
		}

		// copy block of A into pteam memory Ablock
		#pragma omp for collapse(2) nowait // implicit barrier is removed
		for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
		for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
		Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] = A[(i * N) + k];

		// copy block of B into pteam memory Bblock
		#pragma omp for collapse(2) // implicit barrier to ensure that shared Ablock and Bblock can be accessed by all threads within the block
		for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
		for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
		Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)] = B[(k * N) + j];

		// matrix multiply within the block
		#pragma omp for collapse(2) nowait
		for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
		for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
		{
		MyData value = (MyData)0;
		for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
		value += Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] *
		Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)];

		Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)] += value;

		if (kb == (Nblocks - 1))
		C[(i * N) + j] = Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)];
		}
		} // omp parallel
		} // kb
		} // jb
		} // ib

		return;
		}

		void GPU_mat_mult_block_shared_no_loops(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		const size_t Nblocks = (size / _BLOCK_);
		// Team local copies of tiles
		MyData Ablock[BLOCKSIZE];
		MyData Bblock[BLOCKSIZE];
		MyData Cblock[BLOCKSIZE];

		#pragma omp target \
		teams num_teams(Nblocks * Nblocks) \
		private(Ablock, Bblock, Cblock) \
		thread_limit(BLOCKSIZE) \
		is_device_ptr(A, B, C)
		{
		const size_t ib = omp_get_team_num() / Nblocks;
		const size_t jb = omp_get_team_num() % Nblocks;

		#pragma omp parallel num_threads(BLOCKSIZE)
		{
		#if !defined(NDEBUG)

		/* ID within the team (CUDA block) */
		const int localID = omp_get_thread_num();
		/* Team ID (ID CUDA block) */
		const int team = omp_get_team_num();
		/* Team size (CUDA block size) */
		const int nthr = omp_get_num_threads();
		/* global thread index */
		const int globalID = (localID + (team * nthr));

		printf("\n\t globalID: %d - localID: %d - nthr: %d - team: %d\n",
		globalID, localID, nthr, team);

		#pragma omp barrier

		#endif /* NDEBUG */

		// local matrix's indexes mapped to each OMP GPU-thread within its own team
		const size_t i_local = omp_get_thread_num() / _BLOCK_;
		const size_t j_local = omp_get_thread_num() % _BLOCK_;

		// global matrix's indexes mapped to each OMP GPU-thread
		const size_t i_global = i_local + (ib * _BLOCK_);
		const size_t j_global = j_local + (jb * _BLOCK_);

		// Cblock initialization
		Cblock[(i_local * _BLOCK_) + j_local] = (MyData)0;

		// loop over blocks
		for (size_t block=0 ; block<Nblocks ; block++)
		{
		const size_t j_A = j_local + (block * _BLOCK_);
		const size_t i_B = i_local + (block * _BLOCK_);

		// waits until all threads in the thread block have reached this point and shared memory accesses
		#pragma omp barrier

		Ablock[(i_local * _BLOCK_) + j_local] = A[(i_global * size) + j_A];
		Bblock[(i_local * _BLOCK_) + j_local] = B[(i_B * size) + j_global];

		// waits until all threads in the thread block have reached this point and shared memory accesses
		#pragma omp barrier

		// perform the matrix multiplication within the block
		MyData value = (MyData)0;
		for (size_t k=0 ; k<_BLOCK_ ; k++)
		value += Ablock[(i_local * _BLOCK_) + k] * Bblock[(k * _BLOCK_) + j_local];

		// store the partial result in shared memory
		Cblock[(i_local * _BLOCK_) + j_local] += value;
		} // block loop

		// store the result into global memory
		C[(i_global * size) + j_global] = Cblock[(i_local * _BLOCK_) + j_local];
		} // omp parallel
		} // target teams

		return;
		}

		void check(const MyData *const __restrict__ cpu_matrix,
		const MyData *const __restrict__ gpu_matrix)
		{
		int flag = 0;
		for (size_t i=0 ; i<SIZE ; i++)
		flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);

		if (!flag)
		printf("\n\t Result OK");
		else
		printf("\n\t Result wrong");

		#if !defined(NDEBUG)

		for (size_t i=0 ; i<N ; i++)
		{
		printf("\n");
		for (size_t j=0 ; j<N ; j++)
		{
		const size_t index = ((i * N) + j);
		printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
		index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
		}
		}
		printf("\n");

		#endif // NDEBUG

		return;
		}

		int main()
		{
		double time;
		MyData buffer_cpu = (MyData )calloc(4 * SIZE, sizeof(MyData));
		assert(buffer_cpu != NULL);

		// host reference matrix A
		MyData *const restrict A = buffer_cpu;
		MyData *const restrict B = A + SIZE;
		MyData *const restrict C_CPU = B + SIZE;
		MyData *const restrict C_GPU_HOST = C_CPU + SIZE;
		for (size_t i=0 ; i<SIZE ; i++)
		{
		A[i] = drand48();
		B[i] = drand48();
		}

		////////////////////////// CPU naive algorithm ///////////////////////////////////////////////////
		CPU_mat_mult_block(A, B, C_CPU, N);
		//////////////////////////////////////////////////////////////////////////////////////////////////

		// get the host id
		const int dev_host = omp_get_initial_device();

		// get the default device
		const int dev_gpu = omp_get_default_device();
		assert(dev_host != dev_gpu);

		// alloc data to the GPU
		MyData const buffer_gpu = (MyData )omp_target_alloc(3 * SIZE * sizeof(MyData), dev_gpu);
		assert(buffer_gpu != NULL);
		MyData *const restrict A_GPU = buffer_gpu;
		MyData *const restrict B_GPU = A_GPU + SIZE;
		MyData *const restrict C_GPU = B_GPU + SIZE;

		// copy data to GPU
		omp_target_memcpy(buffer_gpu, buffer_cpu, (2 * SIZE * sizeof(MyData)), 0, 0, dev_gpu, dev_host);

		/////////////////////////// GPU naive block algorithm ////////////////////////////////////////////
		time = 0.0;
		GPU_mat_mult_block(A_GPU, B_GPU, C_GPU, N); // warm-up
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult_block(A_GPU, B_GPU, C_GPU, N);
		time += (wall_time() - start);
		}
		// copy data device-to-host
		omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu);

		check(C_CPU, C_GPU_HOST);
		memset(C_GPU_HOST, 0, (SIZE * sizeof(MyData)));
		printf("\n\t GPU block time %lg [s]\n", (time / LOOP));
		//////////////////////////////////////////////////////////////////////////////////////////////////

		/////////////////////////// GPU naive block shared memory algorithm //////////////////////////////
		time = 0.0;
		GPU_mat_mult_block_shared(A_GPU, B_GPU, C_GPU, N); // warm-up
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult_block_shared(A_GPU, B_GPU, C_GPU, N);
		time += (wall_time() - start);
		}

		omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu);
		check(C_CPU, C_GPU_HOST);
		memset(C_GPU_HOST, 0, (SIZE * sizeof(MyData)));
		printf("\n\t GPU block pteam memory time %lg [s]\n", (time / LOOP));
		//////////////////////////////////////////////////////////////////////////////////////////////////

		/////////////////////////// GPU naive block shared memory no loops algorithm /////////////////////
		time = 0.0;
		GPU_mat_mult_block_shared_no_loops(A_GPU, B_GPU, C_GPU, N); // warm-up
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult_block_shared_no_loops(A_GPU, B_GPU, C_GPU, N);
		time += (wall_time() - start);
		}

		omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu);
		check(C_CPU, C_GPU_HOST);
		printf("\n\t GPU block no loops pteam memory time %lg [s]\n", (time / LOOP));
		//////////////////////////////////////////////////////////////////////////////////////////////////

		// free CPU memory
		free(buffer_cpu);
		// free GPU memory
		omp_target_free(buffer_gpu, dev_gpu);

		printf("\n");

		return EXIT_SUCCESS;
		}

cuda-omp/omp/miscellaneous/unified_shared_memory.c

0 → 100644

+389 −0

File added.

Preview size limit exceeded, changes collapsed.

cuda-omp/omp/7/mat_mult.c

+1 −1

File changed.

Contains only whitespace changes.