cuda/omp matrix multiply examples (4921ce23) · Commits · HPC_SCHOOL / HPC_SCHOOL_2024

cuda-omp/cuda/7/mat_mult.cu

0 → 100644

+176 −0

Original line number	Diff line number	Diff line
		////////////////////////////////////////////////////////////////////////////////////////////////
		// - Naive matrix multiplication algorithm
		// for (size_t i=0 ; i<N ; i++)
		// for (size_t j=0 ; j<N ; j++)
		// for (size_t k=0 ; k<_N ; k++)
		// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
		//
		////////////////////////////////////////////////////////////////////////////////////////////////

		//////////////////////////////////////////////////////////////////////////////////////////////////
		// Author: David Goz
		// mail : david.goz@inaf.it
		// date : 20.08.2024
		// code tested using nvhpc
		//
		// - Compile the code:
		// $ nvc++ mat_mult.cu -o mat_mult
		// - Run the code:
		// $ ./mat_mult
		//////////////////////////////////////////////////////////////////////////////////////////////////

		#include <stdio.h>
		#include <stdlib.h>
		#include <unistd.h>
		#include <time.h>
		#include <assert.h>
		#include <cuda.h>
		#include <string.h>
		#include <math.h>
		#include <float.h>

		#define N 1024
		#define SIZE (N * N) // matrix size
		typedef double MyData; // do not change
		#define BLOCKSIZE 1024

		// sanity check
		#if BLOCKSIZE > 1024
		#error BLOCKSIZE must be <= 1024
		#endif

		#define LOOP 100
		#define NDEBUG

		double wall_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		void CPU_mat_mult(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		for (size_t i=0 ; i<size ; i++)
		for (size_t j=0 ; j<size ; j++)
		for (size_t k=0 ; k<size ; k++)
		C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);

		return;
		}

		__global__ void GPU_mat_mult(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
		if (globalID >= SIZE)
		return;

		const size_t i = globalID / size;
		const size_t j = globalID % size;

		MyData value = (MyData)0;
		for (size_t k=0 ; k<size ; k++)
		value += A[(i * N) + k] * B[(k * N) + j];

		C[(i * N) + j] = value;

		return;
		}

		void check(const MyData *const __restrict__ cpu_matrix,
		const MyData *const __restrict__ gpu_matrix)
		{
		int flag = 0;
		for (size_t i=0 ; i<SIZE ; i++)
		flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);

		if (!flag)
		printf("\n\t Result OK");
		else
		printf("\n\t Result wrong");

		#if !defined(NDEBUG)

		for (size_t i=0 ; i<N ; i++)
		{
		printf("\n");
		for (size_t j=0 ; j<N ; j++)
		{
		const size_t index = ((i * N) + j);
		printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
		index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
		}
		}
		printf("\n");

		#endif // NDEBUG

		return;
		}

		int main()
		{
		double time;
		MyData buffer_cpu = (MyData )calloc(4 * SIZE, sizeof(MyData));
		assert(buffer_cpu != NULL);

		// host reference matrix A
		MyData *const restrict A_CPU = buffer_cpu;
		MyData *const restrict B_CPU = A_CPU + SIZE;
		MyData *const restrict C_CPU = B_CPU + SIZE;
		MyData *const restrict C_GPU_host = C_CPU + SIZE;
		for (size_t i=0 ; i<SIZE ; i++)
		{
		A_CPU[i] = drand48();
		B_CPU[i] = drand48();
		}

		////////////////////////// CPU naive algorithm //////////////////////////////////////////
		CPU_mat_mult(A_CPU, B_CPU, C_CPU, N);
		/////////////////////////////////////////////////////////////////////////////////////////

		// copy/alloc data to the GPU
		MyData *buffer_gpu = NULL;
		cudaMalloc((void *)&buffer_gpu, (3 SIZE * sizeof(MyData)));
		MyData *const restrict A_GPU = buffer_gpu;
		MyData *const restrict B_GPU = A_GPU + SIZE;
		MyData *const restrict C_GPU = B_GPU + SIZE;
		cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);

		const dim3 block = {BLOCKSIZE, 1, 1};
		const dim3 nblocks = {(SIZE + BLOCKSIZE -1)/BLOCKSIZE, 1, 1};

		/////////////////////////// GPU naive algorithm ////////////////////////////////////////
		time = 0.0;
		GPU_mat_mult<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
		cudaDeviceSynchronize();
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
		cudaDeviceSynchronize();
		time += (wall_time() - start);
		}
		cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);

		check(C_CPU, C_GPU_host);
		printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
		/////////////////////////////////////////////////////////////////////////////////////

		// free CPU memory
		free(buffer_cpu);
		// free GPU memory
		cudaFree(buffer_gpu);

		printf("\n");

		return EXIT_SUCCESS;
		}

cuda-omp/cuda/7/mat_mult_block.cu

0 → 100644

+301 −0

Original line number	Diff line number	Diff line
		////////////////////////////////////////////////////////////////////////////////////////////////
		// - Block matrix multiplication algorithm
		//
		// const size_t Nblocks = (N / Bsize);
		//
		// // loop over blocks of matrix C
		// for (size_t ib=0 ; ib<Nblocks ; ib++)
		// {
		// for (size_t jb=0 ; jb<Nblocks ; jb++)
		// {
		//
		// // loop over blocks of rows of A
		// for (size_t kb=0 ; kb<Nblocks ; kb++)
		// {
		//
		// // Matrix multiplication within a block
		// for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
		// for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
		// for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
		// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
		//
		// } // kb
		// } // jb
		// } // ib
		//
		// - Exploit GPU shared-memory.
		////////////////////////////////////////////////////////////////////////////////////////////////

		//////////////////////////////////////////////////////////////////////////////////////////////////
		// Author: David Goz
		// mail : david.goz@inaf.it
		// date : 22.08.2024
		// code tested using nvhpc
		//
		// - Compile the code:
		// $ nvc++ mat_mult_block.cu -o mat_mult_block
		// - Run the code:
		// $ ./mat_mult_block
		//////////////////////////////////////////////////////////////////////////////////////////////////

		#include <stdio.h>
		#include <stdlib.h>
		#include <unistd.h>
		#include <time.h>
		#include <assert.h>
		#include <cuda.h>
		#include <string.h>
		#include <math.h>
		#include <float.h>

		#define N 1024
		#define SIZE (N * N) // matrix size
		typedef double MyData; // do not change
		#define BLOCK 16

		// sanity check
		#if BLOCK * BLOCK > 1024
		#error BLOCKSIZE must be <= 1024
		#endif

		#if BLOCK * BLOCK > SIZE
		#error BLOCKSIZE must be <= SIZE
		#endif

		#define LOOP 100
		#define NDEBUG

		double wall_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		void CPU_mat_mult(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		for (size_t i=0 ; i<size ; i++)
		for (size_t j=0 ; j<size ; j++)
		for (size_t k=0 ; k<size ; k++)
		C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);

		return;
		}

		void CPU_mat_mult_block(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		const size_t Nblocks = (size / BLOCK);

		// loop over blocks of matrix C
		for (size_t ib=0 ; ib<Nblocks ; ib++)
		{
		for (size_t jb=0 ; jb<Nblocks ; jb++)
		{
		// loop over blocks of rows of A
		for (size_t kb=0 ; kb<Nblocks ; kb++)
		{
		// Matrix multiplication within a block
		for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++)
		for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++)
		for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++)
		C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
		} // kb
		} // jb
		} // ib

		return;
		}

		__global__ void GPU_mat_mult_block(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		const size_t size2 = (size * size);

		const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
		if (globalID >= size2)
		return;

		const size_t Nblocks = size / BLOCK; // number of blocks to loop over A and B
		const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block
		const size_t jb = blockIdx.x % Nblocks;
		const size_t i_local = threadIdx.x / BLOCK; // local matrix's indexes mapped to each CUDA thread
		const size_t j_local = threadIdx.x % BLOCK; // within its own block
		const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
		const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices

		C[(i_global * size) + j_global] = (MyData)0;

		// loop over blocks
		for (size_t block=0 ; block<Nblocks ; block++)
		{
		const size_t j_A = (block * BLOCK);
		const size_t i_B = (block * BLOCK);

		// perform the matrix multiplication within the block
		MyData value = (MyData)0;
		for (size_t k=0 ; k<BLOCK ; k++)
		value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global];

		C[(i_global * size) + j_global] += value;
		}

		return;
		}

		__global__ void GPU_mat_mult_block_shared(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		__shared__ MyData Ablock[BLOCK * BLOCK];
		__shared__ MyData Bblock[BLOCK * BLOCK];
		__shared__ MyData Cblock[BLOCK * BLOCK];

		const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
		const size_t size2 = (size * size);
		if (globalID >= size2)
		return;

		const size_t Nblocks = size / BLOCK; // number of blocks to loop over
		const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block
		const size_t jb = blockIdx.x % Nblocks;
		const size_t i_local = threadIdx.x / BLOCK; // local matrix's indexes mapped to each CUDA thread
		const size_t j_local = threadIdx.x % BLOCK; // within its own block
		const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
		const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices

		// Init Cblock
		Cblock[(i_local * BLOCK) + j_local] = (MyData)0;

		// loop over blocks
		for (size_t block=0 ; block<Nblocks ; block++)
		{
		const size_t j_A = (block * BLOCK);
		const size_t i_B = (block * BLOCK);

		// waits until all threads in the thread block have reached this point and shared memory accesses
		// made by these threads prior to __syncthreads() are visible to all threads in the block.
		__syncthreads();

		// copy block of A into shared memory
		Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A];
		// copy block of B into shared memory
		Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global];

		// waits until all threads in the thread block have reached this point and shared memory accesses // made by these threads prior to __syncthreads() are visible to all threads in the block.
		__syncthreads();

		// perform the matrix multiplication within the block
		MyData value = (MyData)0;
		for (size_t k=0 ; k<BLOCK ; k++)
		value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local];

		// store the partial result in shared memory
		Cblock[(i_local * BLOCK) + j_local] += value;
		}

		C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local];

		return;
		}

		void check(const MyData *const restrict cpu_matrix,
		const MyData *const restrict gpu_matrix)
		{
		int flag = 0;
		for (size_t i=0 ; i<SIZE ; i++)
		flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);

		if (!flag)
		printf("\n\t Result OK");
		else
		printf("\n\t Result wrong");

		return;
		}

		int main()
		{
		double time;
		MyData buffer_cpu = (MyData )calloc(4 * SIZE, sizeof(MyData));
		assert(buffer_cpu != NULL);

		// host reference matrix A
		MyData *const restrict A_CPU = buffer_cpu;
		MyData *const restrict B_CPU = A_CPU + SIZE;
		MyData *const restrict C_CPU = B_CPU + SIZE;
		MyData *const restrict C_GPU_host = C_CPU + SIZE;
		for (size_t i=0 ; i<SIZE ; i++)
		{
		A_CPU[i] = drand48();
		B_CPU[i] = drand48();
		}

		////////////////////////// CPU naive algorithm //////////////////////////////////////////
		CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
		/////////////////////////////////////////////////////////////////////////////////////////

		// copy/alloc data to the GPU
		MyData *buffer_gpu = NULL;
		cudaMalloc((void *)&buffer_gpu, (3 SIZE * sizeof(MyData)));
		MyData *const A_GPU = buffer_gpu;
		MyData *const B_GPU = A_GPU + SIZE;
		MyData *const C_GPU = B_GPU + SIZE;
		cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);

		const dim3 nblocks = {(SIZE + (BLOCK * BLOCK) -1)/(BLOCK * BLOCK), 1, 1};
		const dim3 block = {(BLOCK * BLOCK), 1, 1};

		/////////////////////////// GPU naive block algorithm ////////////////////////////////////////
		time = 0.0;
		GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
		cudaDeviceSynchronize();
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
		cudaDeviceSynchronize();
		time += (wall_time() - start);
		}
		cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);

		check(C_CPU, C_GPU_host);
		printf("\n\t GPU naive block time %lg [s]\n", (time / LOOP));
		/////////////////////////////////////////////////////////////////////////////////////

		/////////////////////////// GPU block shared memory algorithm ///////////////////////
		time = 0.0;
		GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
		cudaDeviceSynchronize();
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
		cudaDeviceSynchronize();
		time += (wall_time() - start);
		}
		cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);

		check(C_CPU, C_GPU_host);
		printf("\n\t GPU block shared memory time %lg [s]\n", (time / LOOP));
		/////////////////////////////////////////////////////////////////////////////////////

		// free CPU memory
		free(buffer_cpu);
		// free GPU memory
		cudaFree(buffer_gpu);

		printf("\n");

		return EXIT_SUCCESS;
		}

cuda-omp/omp/6/classwork_omp

deleted100755 → 0

−555 KiB

File deleted.

View file

cuda-omp/omp/7/mat_mult.c

+72 −35

Original line number	Diff line number	Diff line
		@@ -14,9 +14,10 @@
		// code tested using nvhpc
		//
		// - Compile the code:
		// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
		// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v mat_mult.c -o mat_mult_omp
		// - Run the code:
		// $ ./classwork_omp
		// $ export OMP_TARGET_OFFLOAD=MANDATORY
		// $ ./mat_mult_omp
		//////////////////////////////////////////////////////////////////////////////////////////////////

		#include <stdio.h>
		@@ -26,8 +27,10 @@
		#include <assert.h>
		#include <omp.h>
		#include <string.h>
		#include <math.h>
		#include <float.h>

		#define N 512
		#define N 1024
		#define SIZE (N * N) // matrix size
		typedef double MyData; // do not change

		@@ -61,22 +64,30 @@ void GPU_mat_mult(const MyData *const restrict A,
		MyData *const restrict C,
		const size_t size)
		{
		#pragma omp target
		{
		#pragma omp teams distribute num_teams(size)
		#pragma omp target teams distribute num_teams(size)
		for (size_t i=0 ; i<size ; i++)
		{
		#pragma omp parallel for num_threads(size)
		#pragma omp parallel for firstprivate(i) num_threads(size)
		for (size_t j=0 ; j<size ; j++)
		{
		#if !defined(NDEBUG)

		const int num_teams = omp_get_num_teams();
		const int num_threads = omp_get_num_threads();

		if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0))
		printf("\n\t Kernel GPU_mat_mult: teams: %d - threads: %d\n",
		num_teams, num_threads);

		#endif // NDEBUG

		MyData value = (MyData)0;
		for (size_t k=0 ; k<size ; k++)
		value += (A[(i * size) + k] * B[(k * size) + j]);

		C[(i * size) + j] = value;
		} // omp thread
		} // omp teams
		} // omp target
		} // omp threads
		} // omp target teams

		return;
		}
		@@ -86,14 +97,23 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
		MyData *const restrict C,
		const size_t size)
		{
		#pragma omp target
		{
		#pragma omp teams num_teams(size)
		#pragma omp target teams num_teams(size)
		{
		const size_t team_size = (size * omp_get_team_num());

		#pragma omp parallel firstprivate(team_size) num_threads(size)
		{
		#if !defined(NDEBUG)

		const int num_teams = omp_get_num_teams();
		const int num_threads = omp_get_num_threads();

		if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0))
		printf("\n\t Kernel GPU_mat_mult_no_loops: teams: %d - threads: %d\n",
		num_teams, num_threads);

		#endif // NDEBUG

		const size_t tid = omp_get_thread_num();
		MyData value = (MyData)0;
		for (size_t k=0 ; k<size ; k++)
		@@ -101,8 +121,7 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,

		C[team_size + tid] = value;
		} // omp threads
		} // omp teams
		} // omp target
		} // omp target teams

		return;
		}
		@@ -110,15 +129,31 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
		void check(const MyData *const __restrict__ cpu_matrix,
		const MyData *const __restrict__ gpu_matrix)
		{
		int flag;
		int flag = 0;
		for (size_t i=0 ; i<SIZE ; i++)
		flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
		flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);

		if (!flag)
		printf("\n\t Result OK");
		else
		printf("\n\t Result wrong");

		#if !defined(NDEBUG)

		for (size_t i=0 ; i<N ; i++)
		{
		printf("\n");
		for (size_t j=0 ; j<N ; j++)
		{
		const size_t index = ((i * N) + j);
		printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
		index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
		}
		}
		printf("\n");

		#endif // NDEBUG

		return;
		}

		@@ -148,6 +183,7 @@ int main()

		/////////////////////////// GPU naive algorithm ////////////////////////////////////////
		time = 0.0;
		GPU_mat_mult(A, B, C_GPU, N); // warm-up
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		@@ -162,6 +198,7 @@ int main()

		/////////////////////////// GPU naive no loops algorithm ////////////////////////////
		time = 0.0;
		GPU_mat_mult_no_loops(A, B, C_GPU, N); // warm-up
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();

cuda-omp/omp/7/mat_mult_block.c

+259 −73

File changed.

Preview size limit exceeded, changes collapsed.