omp block matrix (7e6f37d5) · Commits · HPC_SCHOOL / HPC_SCHOOL_2024

cuda-omp/omp/7/mat_mult.c

0 → 100644

+185 −0

Original line number	Diff line number	Diff line
		////////////////////////////////////////////////////////////////////////////////////////////////
		// - Naive matrix multiplication algorithm
		// for (size_t i=0 ; i<N ; i++)
		// for (size_t j=0 ; j<N ; j++)
		// for (size_t k=0 ; k<_N ; k++)
		// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
		//
		////////////////////////////////////////////////////////////////////////////////////////////////

		//////////////////////////////////////////////////////////////////////////////////////////////////
		// Author: David Goz
		// mail : david.goz@inaf.it
		// date : 31.07.2024
		// code tested using nvhpc
		//
		// - Compile the code:
		// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
		// - Run the code:
		// $ ./classwork_omp
		//////////////////////////////////////////////////////////////////////////////////////////////////

		#include <stdio.h>
		#include <stdlib.h>
		#include <unistd.h>
		#include <time.h>
		#include <assert.h>
		#include <omp.h>
		#include <string.h>

		#define N 512
		#define SIZE (N * N) // matrix size
		typedef double MyData; // do not change

		#define LOOP 100
		#define NDEBUG

		double wall_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		void CPU_mat_mult(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		for (size_t i=0 ; i<size ; i++)
		for (size_t j=0 ; j<size ; j++)
		for (size_t k=0 ; k<size ; k++)
		C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);

		return;
		}

		void GPU_mat_mult(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		#pragma omp target
		{
		#pragma omp teams distribute num_teams(size)
		for (size_t i=0 ; i<size ; i++)
		{
		#pragma omp parallel for num_threads(size)
		for (size_t j=0 ; j<size ; j++)
		{
		MyData value = (MyData)0;
		for (size_t k=0 ; k<size ; k++)
		value += (A[(i * size) + k] * B[(k * size) + j]);

		C[(i * size) + j] = value;
		} // omp thread
		} // omp teams
		} // omp target

		return;
		}

		void GPU_mat_mult_no_loops(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		#pragma omp target
		{
		#pragma omp teams num_teams(size)
		{
		const size_t team_size = (size * omp_get_team_num());

		#pragma omp parallel firstprivate(team_size) num_threads(size)
		{
		const size_t tid = omp_get_thread_num();
		MyData value = (MyData)0;
		for (size_t k=0 ; k<size ; k++)
		value += (A[team_size + k] * B[(k * size) + tid]);

		C[team_size + tid] = value;
		} // omp threads
		} // omp teams
		} // omp target

		return;
		}

		void check(const MyData *const __restrict__ cpu_matrix,
		const MyData *const __restrict__ gpu_matrix)
		{
		int flag;
		for (size_t i=0 ; i<SIZE ; i++)
		flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);

		if (!flag)
		printf("\n\t Result OK");
		else
		printf("\n\t Result wrong");

		return;
		}

		int main()
		{
		double time;
		MyData buffer = (MyData )calloc(4 * SIZE, sizeof(MyData));
		assert(buffer != NULL);

		// host reference matrix A
		MyData *const restrict A = buffer;
		MyData *const restrict B = A + SIZE;
		MyData *const restrict C_CPU = B + SIZE;
		MyData *const restrict C_GPU = C_CPU + SIZE;
		for (size_t i=0 ; i<SIZE ; i++)
		{
		A[i] = drand48();
		B[i] = drand48();
		}

		////////////////////////// CPU naive algorithm //////////////////////////////////////////
		CPU_mat_mult(A, B, C_CPU, N);
		/////////////////////////////////////////////////////////////////////////////////////////

		// copy/alloc data to the GPU
		#pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE])

		/////////////////////////// GPU naive algorithm ////////////////////////////////////////
		time = 0.0;
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult(A, B, C_GPU, N);
		time += (wall_time() - start);
		}

		#pragma omp target update from(C_GPU[0:SIZE])
		check(C_CPU, C_GPU);
		printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
		////////////////////////////////////////////////////////////////////////////////

		/////////////////////////// GPU naive no loops algorithm ////////////////////////////
		time = 0.0;
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult_no_loops(A, B, C_GPU, N);
		time += (wall_time() - start);
		}

		#pragma omp target update from(C_GPU[0:SIZE])
		check(C_CPU, C_GPU);
		printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP));
		////////////////////////////////////////////////////////////////////////////////

		// free CPU memory
		free(buffer);
		// free GPU memory
		#pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE])

		printf("\n");

		return EXIT_SUCCESS;
		}

cuda-omp/omp/7/mat_mult_block.c

0 → 100644

+208 −0

Original line number	Diff line number	Diff line
		////////////////////////////////////////////////////////////////////////////////////////////////
		// - Block matrix multiplication algorithm
		//
		// const size_t Nblocks = (N / Bsize);
		//
		// // loop over blocks of matrix C
		// for (size_t ib=0 ; ib<Nblocks ; ib++)
		// {
		// for (size_t jb=0 ; jb<Nblocks ; jb++)
		// {
		//
		// // loop over blocks of rows of A
		// for (size_t kb=0 ; kb<Nblocks ; kb++)
		//
		//
		//
		// for (size_t i=0 ; i<N ; i++)
		// for (size_t j=0 ; j<N ; j++)
		// for (size_t k=0 ; k<_N ; k++)
		// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
		//
		// } // jb
		// } // ib
		// - Exploit shared-memory.
		////////////////////////////////////////////////////////////////////////////////////////////////

		//////////////////////////////////////////////////////////////////////////////////////////////////
		// Author: David Goz
		// mail : david.goz@inaf.it
		// date : 31.07.2024
		// code tested using nvhpc
		//
		// - Compile the code:
		// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
		// - Run the code:
		// $ ./classwork_omp
		//////////////////////////////////////////////////////////////////////////////////////////////////

		#include <stdio.h>
		#include <stdlib.h>
		#include <unistd.h>
		#include <time.h>
		#include <assert.h>
		#include <omp.h>
		#include <string.h>

		#define N 512
		#define SIZE (N * N) // matrix size
		typedef double MyData; // do not change
		#define BLOCKSIZE 32 // number of threads per block

		// sanity check
		#if BLOCKSIZE > 1024
		#error BLOCKSIZE cannot be larger than 1024
		#endif

		#define LOOP 100
		#define NDEBUG

		double wall_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		void CPU_mat_mult(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		for (size_t i=0 ; i<size ; i++)
		for (size_t j=0 ; j<size ; j++)
		for (size_t k=0 ; k<size ; k++)
		C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);

		return;
		}

		void GPU_mat_mult(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		#pragma omp target
		{
		#pragma omp teams distribute num_teams(size)
		for (size_t i=0 ; i<size ; i++)
		{
		#pragma omp parallel for num_threads(size)
		for (size_t j=0 ; j<size ; j++)
		{
		MyData value = (MyData)0;
		for (size_t k=0 ; k<size ; k++)
		value += (A[(i * size) + k] * B[(k * size) + j]);

		C[(i * size) + j] = value;
		} // omp thread
		} // omp teams
		} // omp target

		return;
		}

		void GPU_mat_mult_no_loops(const MyData *const restrict A,
		const MyData *const restrict B,
		MyData *const restrict C,
		const size_t size)
		{
		#pragma omp target
		{
		#pragma omp teams num_teams(size)
		{
		const size_t team_size = (size * omp_get_team_num());

		#pragma omp parallel firstprivate(team_size) num_threads(size)
		{
		const size_t tid = omp_get_thread_num();
		MyData value = (MyData)0;
		for (size_t k=0 ; k<size ; k++)
		value += (A[team_size + k] * B[(k * size) + tid]);

		C[team_size + tid] = value;
		} // omp threads
		} // omp teams
		} // omp target

		return;
		}

		void check(const MyData *const __restrict__ cpu_matrix,
		const MyData *const __restrict__ gpu_matrix)
		{
		int flag;
		for (size_t i=0 ; i<SIZE ; i++)
		flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);

		if (!flag)
		printf("\n\t Result OK");
		else
		printf("\n\t Result wrong");

		return;
		}

		int main()
		{
		double time;
		MyData buffer = (MyData )calloc(4 * SIZE, sizeof(MyData));
		assert(buffer != NULL);

		// host reference matrix A
		MyData *const restrict A = buffer;
		MyData *const restrict B = A + SIZE;
		MyData *const restrict C_CPU = B + SIZE;
		MyData *const restrict C_GPU = C_CPU + SIZE;
		for (size_t i=0 ; i<SIZE ; i++)
		{
		A[i] = drand48();
		B[i] = drand48();
		}

		////////////////////////// CPU naive algorithm //////////////////////////////////////////
		CPU_mat_mult(A, B, C_CPU, N);
		/////////////////////////////////////////////////////////////////////////////////////////

		// copy/alloc data to the GPU
		#pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE])

		/////////////////////////// GPU naive algorithm ////////////////////////////////////////
		time = 0.0;
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult(A, B, C_GPU, N);
		time += (wall_time() - start);
		}

		#pragma omp target update from(C_GPU[0:SIZE])
		check(C_CPU, C_GPU);
		printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
		////////////////////////////////////////////////////////////////////////////////

		/////////////////////////// GPU naive no loops algorithm ////////////////////////////
		time = 0.0;
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult_no_loops(A, B, C_GPU, N);
		time += (wall_time() - start);
		}

		#pragma omp target update from(C_GPU[0:SIZE])
		check(C_CPU, C_GPU);
		printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP));
		////////////////////////////////////////////////////////////////////////////////

		// free CPU memory
		free(buffer);
		// free GPU memory
		#pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE])

		printf("\n");

		return EXIT_SUCCESS;
		}

cuda-omp/omp/miscellaneous/structure.c

+34 −25

Original line number	Diff line number	Diff line
		@@ -17,7 +17,8 @@
		#include <assert.h>
		#include <omp.h>

		#define N 8
		#define SIZE 8
		#define SIZE_2 (SIZE / 2)
		typedef double MyData;

		typedef struct my_span
		@@ -29,26 +30,20 @@ typedef struct my_span
		void allocate( span *my_struct,
		const size_t size)
		{
		span tmp;
		/* allocate the buffer on the host memory */
		tmp.ptr = (MyData )malloc(size sizeof(MyData));
		assert(tmp.ptr != NULL);
		tmp.N = size;

		/* declare how the object 'span' has to be mapped on the device */
		#pragma omp declare mapper(span tmp) map(from: tmp, tmp.ptr[0: tmp.N])

		my_struct->ptr = tmp.ptr;
		my_struct->N = tmp.N;
		my_struct->ptr = (MyData *)calloc(size, sizeof(MyData));
		assert(my_struct->ptr != NULL);
		my_struct->N = size;

		return;
		}

		void print(const span *const A)
		void print(const span *const A,
		const char *const string)
		{
		printf("\n");
		for (size_t i=0 ; i<A->N ; i++)
		printf("\n\t array[%i] = %lg", i, A->ptr[i]);
		for (int i=0 ; i<A->N ; i++)
		printf("\n\t %s[%d] = %lg", string, i, A->ptr[i]);
		printf("\n\n");

		return;
		@@ -56,27 +51,41 @@ void print(const span *const A)

		int main()
		{
		span A, B;
		span A, B, C;

		allocate(&A, N);
		allocate(&B, N);
		allocate(&A, SIZE);
		allocate(&B, SIZE);
		allocate(&C, SIZE);

		/* declare how the object 'span' has to be mapped on the device */
		#pragma omp declare mapper(all : span S) map(to: S) map(from: S.ptr[0 : S.N])
		#pragma omp declare mapper(left : span S) map(to: S) map(from: S.ptr[0 : S.N/2])
		#pragma omp declare mapper(right: span S) map(to: S) map(from: S.ptr[S.N/2: S.N/2])

		/* init on the GPU */
		#pragma omp target
		{
		for (size_t i=0 ; i<N ; i++)
		#pragma omp target map(mapper(all): A) map(mapper(left): B) map(mapper(right): C)
		{
		#pragma omp loop
		for (size_t i=0 ; i<SIZE ; i++)
		A.ptr[i] = (MyData)(i);
		B.ptr[i] = (MyData)(2 * i);
		}

		#pragma omp loop
		for (size_t ii=0 ; ii<SIZE_2 ; ii++)
		B.ptr[ii] = (MyData)(ii);

		#pragma omp loop
		for (size_t iii=SIZE_2 ; iii<SIZE ; iii++)
		C.ptr[iii] = (MyData)(iii);
		}

		print(&A);
		print(&B);
		print(&A, "A");
		print(&B, "B");
		print(&C, "C");

		/* free the host's memory */
		free(A.ptr);
		free(B.ptr);
		free(C.ptr);

		return 0;
		}

jacobi/serial/not_opt/jacobi_serial_not_opt_len14

0 → 100755

+17 KiB

File added.

No diff preview for this file type.

View file