//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/cuda/1/classwork_1.cu
//
// Assignment: write a CUDA code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   for (int i=0 ; i<N ; i++)
//     printf("%d\n", (i * i));
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
//
// - Compile the code:
//   $ nvcc classwork_1.cu -o classwork_1_cuda
// - Run the code:
//   $ ./classwork_1_cuda
// - Check the result:
//   $ ./classwork_1_cuda | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <cuda.h>

#define N        100
#define NThreads 1024

// Kernel: each thread prints its global index and its square.
// Launch layout: 1D grid of 1D blocks; threads past 'size' exit immediately.
__global__ void GPUkernel(const int size)
{
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  // Guard: the grid may contain more threads than work items.
  if (myID >= size)
    return;

  // C printf is supported in CUDA device code (serialized - debugging only);
  // the C++ cout class is not supported in device code.
  printf("Hello from CUDA thread: %d - result %d\n", myID, (myID * myID));

  return;
}

int main()
{
  printf("\n\t The host issues the kernel on the GPU \n");

  // Kernel launch: a single block of NThreads threads suffices since N <= NThreads.
  GPUkernel<<<1, NThreads>>>(N);

  // Kernel launches return no status; catch launch-configuration errors here.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));

  printf("\n\t cudaDeviceSynchronize \n");

  // Block until the kernel finishes; surfaces asynchronous execution errors.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess)
    fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(err));

  return 0;
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/cuda/1/classwork_2.cu
//
// Assignment: write a CUDA code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   int A[N];
//
//   for (int i=0 ; i<N ; i++)
//     A[i] = (i * i);
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 17.11.2022
//
// - Compile the code:
//   $ nvcc classwork_2.cu -o classwork_2
// - Run the code:
//   $ ./classwork_2
// - Check the result:
//   $ ./classwork_2 | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <stdlib.h>
#include <cuda.h>

#define N        100
#define NThreads 1024

// Kernel: each thread stores the square of its global index into A.
__global__ void GPUkernel(int *A, const int size)
{
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  // Guard: the grid may contain more threads than array elements.
  if (myID < size)
    A[myID] = (myID * myID);

  return;
}

int main()
{
  // Allocate unified (managed) memory directly accessible from both
  // host and device; the CUDA runtime handles the migration.
  int *A = NULL;
  const size_t size = (N * sizeof(int));

  cudaError_t error = cudaMallocManaged(&A, size);
  if (error == cudaSuccess)
    std::cout << "Memory allocated for the host/device" << std::endl;
  else
    {
      std::cout << "Cannot allocate memory for the host/device. CUDA error : "
                << cudaGetErrorString(error) << " ... aborting" << std::endl;
      exit(EXIT_FAILURE);
    }

  // Kernel launch: one block of NThreads threads suffices since N <= NThreads.
  GPUkernel<<<1, NThreads>>>(A, N);

  // Kernel launches return no status; catch launch-configuration errors here.
  error = cudaGetLastError();
  if (error != cudaSuccess)
    std::cout << "Kernel launch failed: " << cudaGetErrorString(error) << std::endl;

  // The host must not read the managed array before the kernel has finished.
  error = cudaDeviceSynchronize();
  if (error != cudaSuccess)
    std::cout << "cudaDeviceSynchronize failed: " << cudaGetErrorString(error) << std::endl;

  // Check the result on the host.
  for (size_t i = 0; i < N; i++)
    std::cout << "A[" << i << "] - Result: " << A[i] << std::endl;

  // Free the managed memory.
  cudaFree(A);

  return 0;
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/omp/1/classwork_1.c
//
// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with
// compute capability '>= cc70'
//
// Assignment: write an OMP-GPU code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   for (int i=0 ; i<N ; i++)
//     printf("%d\n", (i * i));
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
//
// - Compile the code with an OpenMP compiler that supports NVIDIA offload, e.g.:
//   $ nvc -mp=gpu classwork_1.c -o classwork_1_omp
//   $ clang -fopenmp -fopenmp-targets=nvptx64 classwork_1.c -o classwork_1_omp
// - Run the code:
//   $ ./classwork_1_omp
// - Check the result:
//   $ ./classwork_1_omp | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <omp.h>

#define N        100
#define NThreads 1024

// Offload the whole loop to the device and run it serially there.
void GPUkernelSerial(const int size)
{
#pragma omp target
  {
    // omp_is_initial_device() is true when the region runs on the host,
    // i.e. when offload fell back to the CPU.
    if (!omp_is_initial_device())
      printf("\n\t GPU is executing GPUkernelSerial\n" );
    else
      printf("\n\t CPU is executing GPUkernelSerial\n" );

    for (int i = 0; i < size; i++)
      printf("Hello from OMP-GPU thread: %d - result %d\n", i, (i * i));
  }

  return;
}

int main()
{
  printf("\n\t The host issues the kernel on the GPU \n");

  /* kernel launch using one GPU thread */
  GPUkernelSerial(N);

  return 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/cuda/1/classwork_1.cu
//
// Assignment: write a CUDA code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   for (int i=0 ; i<N ; i++)
//     printf("%d\n", (i * i));
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
//
// - Compile the code:
//   $ nvcc classwork_1.cu -o classwork_1_cuda
// - Run the code:
//   $ ./classwork_1_cuda
// - Check the result:
//   $ ./classwork_1_cuda | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <cuda.h>

#define N        100
#define NThreads 1024

// Kernel: each thread prints its global index and its square.
// Launch layout: 1D grid of 1D blocks; threads past 'size' exit immediately.
__global__ void GPUkernel(const int size)
{
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  // Guard: the grid may contain more threads than work items.
  if (myID >= size)
    return;

  // C printf is supported in CUDA device code (serialized - debugging only);
  // the C++ cout class is not supported in device code.
  printf("Hello from CUDA thread: %d - result %d\n", myID, (myID * myID));

  return;
}

int main()
{
  printf("\n\t The host issues the kernel on the GPU \n");

  // Kernel launch: a single block of NThreads threads suffices since N <= NThreads.
  GPUkernel<<<1, NThreads>>>(N);

  // Kernel launches return no status; catch launch-configuration errors here.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));

  printf("\n\t cudaDeviceSynchronize \n");

  // Block until the kernel finishes; surfaces asynchronous execution errors.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess)
    fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(err));

  return 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/cuda/1/classwork_2.cu
//
// Assignment: write a CUDA code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   int A[N];
//
//   for (int i=0 ; i<N ; i++)
//     A[i] = (i * i);
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 17.11.2022
//
// - Compile the code:
//   $ nvcc classwork_2.cu -o classwork_2
// - Run the code:
//   $ ./classwork_2
// - Check the result:
//   $ ./classwork_2 | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <stdlib.h>
#include <cuda.h>

#define N        100
#define NThreads 1024

// Kernel: each thread stores the square of its global index into A.
__global__ void GPUkernel(int *A, const int size)
{
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  // Guard: the grid may contain more threads than array elements.
  if (myID < size)
    A[myID] = (myID * myID);

  return;
}

int main()
{
  // Allocate unified (managed) memory directly accessible from both
  // host and device; the CUDA runtime handles the migration.
  int *A = NULL;
  const size_t size = (N * sizeof(int));

  cudaError_t error = cudaMallocManaged(&A, size);
  if (error == cudaSuccess)
    std::cout << "Memory allocated for the host/device" << std::endl;
  else
    {
      std::cout << "Cannot allocate memory for the host/device. CUDA error : "
                << cudaGetErrorString(error) << " ... aborting" << std::endl;
      exit(EXIT_FAILURE);
    }

  // Kernel launch: one block of NThreads threads suffices since N <= NThreads.
  GPUkernel<<<1, NThreads>>>(A, N);

  // Kernel launches return no status; catch launch-configuration errors here.
  error = cudaGetLastError();
  if (error != cudaSuccess)
    std::cout << "Kernel launch failed: " << cudaGetErrorString(error) << std::endl;

  // The host must not read the managed array before the kernel has finished.
  error = cudaDeviceSynchronize();
  if (error != cudaSuccess)
    std::cout << "cudaDeviceSynchronize failed: " << cudaGetErrorString(error) << std::endl;

  // Check the result on the host.
  for (size_t i = 0; i < N; i++)
    std::cout << "A[" << i << "] - Result: " << A[i] << std::endl;

  // Free the managed memory.
  cudaFree(A);

  return 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/omp/1/classwork_1.c
//
// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with
// compute capability '>= cc70'
//
// Assignment: write an OMP-GPU code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   for (int i=0 ; i<N ; i++)
//     printf("%d\n", (i * i));
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
//
// - Compile the code with an OpenMP compiler that supports NVIDIA offload
//   (plain nvcc does not process OpenMP target regions), e.g.:
//   $ nvc -mp=gpu classwork_1.c -o classwork_1_omp
//   $ clang -fopenmp -fopenmp-targets=nvptx64 classwork_1.c -o classwork_1_omp
// - Run the code:
//   $ ./classwork_1_omp
// - Check the result:
//   $ ./classwork_1_omp | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <omp.h>

#define N        100
#define NThreads 1024

// Offload the whole loop to the device and run it serially there.
void GPUkernelSerial(const int size)
{
#pragma omp target
  {
    // omp_is_initial_device() is true when the region runs on the host,
    // i.e. when offload fell back to the CPU.
    if (!omp_is_initial_device())
      printf("\n\t GPU is executing GPUkernelSerial\n" );
    else
      printf("\n\t CPU is executing GPUkernelSerial\n" );

    for (int i = 0; i < size; i++)
      printf("Hello from OMP-GPU thread: %d - result %d\n", i, (i * i));
  }

  return;
}

int main()
{
  printf("\n\t The host issues the kernel on the GPU \n");

  /* kernel launch using one GPU thread */
  GPUkernelSerial(N);

  return 0;
}