cuda-omp energy add (17762d0f) · Commits · HPC_SCHOOL / HPC_SCHOOL_2024

cuda-omp/cuda/energy/Makefile

0 → 100644

+30 −0

Original line number	Diff line number	Diff line
		ENERGY ?= YES

		ifeq ($(ENERGY), YES)
		OPTIONS += -D_ENERGY_PMT_

		# enable RAPL
		OPTIONS += -D_ENERGY_RAPL_
		# enable NVIDIA
		OPTIONS += -D_ENERGY_NVIDIA_
		# enable AMD
		OPTIONS += # -D_ENERGY_AMD_
		endif

		NVCPP = nvc++
		OPT = -O3
		INC = /home/darkenergy/software/pmt/include
		LIB = /home/darkenergy/software/pmt/lib -lpmt -lm

		all: mat_mult
		.PHONY: clean test

		mat_mult: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile
		$(NVCPP) $(OPT) $(OPTIONS) -I$(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ -L$(LIB)
		ldd ./mat_mult

		test: mat_mult
		./mat_mult

		clean:
		rm -rf .o mat_mult ~ energy/~ energy/.o

cuda-omp/cuda/energy/energy/energy_pmt.h

0 → 100644

+30 −0

Original line number	Diff line number	Diff line
		#pragma once

		#include "energy_pmt_methods.h"

		#if defined(_ENERGY_RAPL_) \|\| defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		#define PMT_CREATE(devID, numGPUs) Create_PMT((devID), (numGPUs))
		#else
		#define PMT_CREATE(numGPUs)
		#endif // defined(_ENERGY_RAPL_) \|\| defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)

		#if defined(_ENERGY_RAPL_)
		#define PMT_CPU_START(string) Start_PMT_CPU((string))
		#define PMT_CPU_STOP(string) Stop_PMT_CPU((string))
		#define PMT_CPU_SHOW(string) Show_PMT_CPU((string))
		#else
		#define PMT_CPU_START(string)
		#define PMT_CPU_STOP(string)
		#define PMT_CPU_SHOW(string)
		#endif // _ENERGY_RAPL_

		#if defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		#define PMT_GPU_START(string, devID) Start_PMT_GPU((string), (devID))
		#define PMT_GPU_STOP(string, devID) Stop_PMT_GPU((string), (devID))
		#define PMT_GPU_SHOW(string) Show_PMT_GPU((string))
		#else
		#define PMT_GPU_START(string, dev)
		#define PMT_GPU_STOP(string, dev)
		#define PMT_GPU_SHOW(string)
		#endif // defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)

cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp

0 → 100644

+269 −0

Original line number	Diff line number	Diff line
		#if defined(_ENERGY_PMT_)

		#include <pmt.h>
		#include <assert.h>
		#include <iostream>
		#include <vector>
		#include <string>
		#include <map>

		struct EnergyState
		{
		pmt::State start;
		pmt::State stop;
		double joules;
		double watts;
		double seconds;
		unsigned int count;
		};

		static bool PMT_ERROR{false};

		void PMT_err(void)
		{
		std::cout << "\n\t PMT Error \n" << std::endl;

		return;
		}

		#if defined(_ENERGY_RAPL_)
		static std::unique_ptr<pmt::PMT> sensor_cpu;
		static std::map<std::string, EnergyState> state_cpu;
		#endif // _ENERGY_RAPL_

		#if defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		static std::map<int, std::unique_ptr<pmt::PMT>> sensor_gpu;
		static std::map<int, std::map<std::string, EnergyState>> state_gpu;
		#endif // defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)

		#if !defined(__NVCC__) && !defined(__NVCOMPILER)
		extern "C"
		{
		#include "energy_pmt_methods.h"
		}
		#endif // !defined(__NVCC__) && !defined(__NVCOMPILER)

		#if defined(_ENERGY_RAPL_) \|\| defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		void Create_PMT(int *devID,
		const unsigned int numGPUs)
		{
		#if defined(_ENERGY_RAPL_)

		sensor_cpu = pmt::Create("rapl");

		#endif // _ENERGY_RAPL_

		if ((numGPUs > 0) && (devID != nullptr))
		{
		for (unsigned int dev=0 ; dev<numGPUs ; dev++)
		{
		#if defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		sensor_gpu.insert({devID[dev],

		#if defined(_ENERGY_NVIDIA_)
		pmt::nvml::NVML::Create(dev)});
		#elif defined(_ENERGY_AMD_)
		pmt::rocm::AMD::Create(dev)});
		#endif

		#endif // defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		} // numGPUs
		}

		return;
		}
		#endif // defined(_ENERGY_RAPL_) \|\| defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)

		#if defined(_ENERGY_RAPL_)
		void Start_PMT_CPU(const char *label)
		{
		if (PMT_ERROR)
		{
		PMT_err();
		return;
		}

		// check if the label already exists
		if (state_cpu.count(std::string{label}))
		{
		state_cpu[std::string{label}].start = sensor_cpu->Read();
		}
		else
		{
		// insert the key and initialize the counters
		state_cpu.insert({std::string{label},
		{sensor_cpu->Read(),
		0,
		0.0, 0.0, 0.0, 0
		}});
		}

		return;
		}

		void Stop_PMT_CPU(const char *label)
		{
		if (PMT_ERROR)
		{
		PMT_err();
		return;
		}

		// check if the label already exists
		// if not error
		if (!state_cpu.count(std::string{label}))
		{
		PMT_ERROR = true;
		PMT_err();
		return;
		}
		else
		{
		// read the counter
		state_cpu[std::string{label}].stop = sensor_cpu->Read();

		// update quantities
		state_cpu[std::string{label}].seconds +=
		sensor_cpu->seconds(state_cpu[std::string{label}].start,
		state_cpu[std::string{label}].stop);

		state_cpu[std::string{label}].joules +=
		sensor_cpu->joules(state_cpu[std::string{label}].start,
		state_cpu[std::string{label}].stop);

		state_cpu[std::string{label}].watts +=
		sensor_cpu->watts(state_cpu[std::string{label}].start,
		state_cpu[std::string{label}].stop);

		state_cpu[std::string{label}].count++;
		}

		return;
		}

		void Show_PMT_CPU(const char *label)
		{
		if (PMT_ERROR \|\| !state_cpu.count(std::string{label}))
		{
		PMT_err();
		return;
		}
		else
		{
		std::cout << "\n\t CPU Kernel:" << std::string{label} << ":" << std::endl;
		std::cout << "\t\t" << state_cpu[std::string{label}].seconds << " [S]" << std::endl;
		std::cout << "\t\t" << state_cpu[std::string{label}].joules << " [J]" << std::endl;
		std::cout << "\t\t" << state_cpu[std::string{label}].watts / state_cpu[std::string{label}].count << " [W]" << "\n" << std::endl;
		}

		return;
		}
		#endif // _ENERGY_RAPL_

		#if defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		void Start_PMT_GPU(const char *label,
		const int devID)
		{
		if (PMT_ERROR \|\| !sensor_gpu.count(devID))
		{
		PMT_err();
		return;
		}

		// check if the devID already exists
		if (!state_gpu.count(devID))
		{
		// insert devID
		state_gpu.insert({devID, {}});
		}
		// check if the label already exists
		if (state_gpu[devID].count(std::string{label}))
		{
		// read the sensor
		state_gpu[devID][std::string{label}].start = sensor_gpu[devID]->Read();
		}
		else
		{
		// insert the label and initialize the counters
		state_gpu[devID].insert({std::string{label},
		{
		sensor_gpu[devID]->Read(),
		0,
		0.0, 0.0, 0.0, 0
		}});
		}

		return;
		}

		void Stop_PMT_GPU(const char *label,
		const int devID)
		{
		// check if the devID already exists
		// if not error
		if (!state_gpu.count(devID) \|\| PMT_ERROR \|\| !sensor_gpu.count(devID))
		{
		PMT_ERROR = true;
		PMT_err();
		return;
		}
		else
		{
		// check if the label already exists
		// if not error
		if (!state_gpu[devID].count(std::string{label}))
		{
		PMT_ERROR = true;
		PMT_err();
		return;
		}
		else
		{
		// read the counter
		state_gpu[devID][std::string{label}].stop = sensor_gpu[devID]->Read();

		// update quantities
		state_gpu[devID][std::string{label}].seconds +=
		sensor_gpu[devID]->seconds(state_gpu[devID][std::string{label}].start,
		state_gpu[devID][std::string{label}].stop);

		state_gpu[devID][std::string{label}].joules +=
		sensor_gpu[devID]->joules(state_gpu[devID][std::string{label}].start,
		state_gpu[devID][std::string{label}].stop);

		state_gpu[devID][std::string{label}].watts +=
		sensor_gpu[devID]->watts(state_gpu[devID][std::string{label}].start,
		state_gpu[devID][std::string{label}].stop);

		state_gpu[devID][std::string{label}].count++;
		}
		}

		return;
		}

		void Show_PMT_GPU(const char *label)
		{
		if (PMT_ERROR)
		{
		PMT_err();
		return;
		}
		else
		{
		// show quantities for all devices
		for (const auto& [key, value]: state_gpu)
		{
		std::cout << "\n\t GPU [" << key << "] kernel:" << std::string{label} << ":" << std::endl;
		std::cout << "\t\t" << value.at(std::string{label}).seconds << " [s]" << std::endl;
		std::cout << "\t\t" << value.at(std::string{label}).joules << " [J]" << std::endl;
		std::cout << "\t\t" << value.at(std::string{label}).watts / value.at(std::string{label}).count << " [W]" << "\n" << std::endl;
		}
		}

		return;
		}

		#endif // defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)

		#endif // _ENERGY_PMT_

cuda-omp/cuda/energy/energy/energy_pmt_methods.h

0 → 100644

+19 −0

Original line number	Diff line number	Diff line
		#pragma once

		#if defined(_ENERGY_RAPL_) \|\| defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		void Create_PMT(int *devID, const unsigned int numGPUs);
		#endif // defined(_ENERGY_RAPL_) \|\| defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)

		#if defined(_ENERGY_RAPL_)
		void Start_PMT_CPU(const char *string);
		void Stop_PMT_CPU(const char *string);
		void Show_PMT_CPU(const char *string);
		#endif // _ENERGY_RAPL_

		#if defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		void Start_PMT_GPU(const char *string, const int);
		void Stop_PMT_GPU(const char *string, const int);
		void Show_PMT_GPU(const char *string);
		#endif // defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)

cuda-omp/cuda/energy/mat_mult_block.cu

0 → 100644

+326 −0

Original line number	Diff line number	Diff line
		////////////////////////////////////////////////////////////////////////////////////////////////
		// - Block matrix multiplication algorithm
		//
		// const size_t Nblocks = (N / Bsize);
		//
		// // loop over blocks of matrix C
		// for (size_t ib=0 ; ib<Nblocks ; ib++)
		// {
		// for (size_t jb=0 ; jb<Nblocks ; jb++)
		// {
		//
		// // loop over blocks of rows of A
		// for (size_t kb=0 ; kb<Nblocks ; kb++)
		// {
		//
		// // Matrix multiplication within a block
		// for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
		// for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
		// for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
		// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
		//
		// } // kb
		// } // jb
		// } // ib
		//
		// - Exploit GPU shared-memory.
		////////////////////////////////////////////////////////////////////////////////////////////////

		//////////////////////////////////////////////////////////////////////////////////////////////////
		// Author: David Goz
		// mail : david.goz@inaf.it
		// date : 22.08.2024
		// code tested using nvhpc
		//
		// - Compile the code:
		// $ nvc++ mat_mult_block.cu -o mat_mult_block
		// - Run the code:
		// $ ./mat_mult_block
		//////////////////////////////////////////////////////////////////////////////////////////////////

		#include <stdio.h>
		#include <stdlib.h>
		#include <unistd.h>
		#include <time.h>
		#include <assert.h>
		#include <cuda.h>
		#include <string.h>
		#include <math.h>
		#include <float.h>

		#include "energy/energy_pmt.h"

		#define N 1024
		#define SIZE (N * N) // matrix size
		typedef double MyData; // do not change
		#define BLOCK 16

		// sanity check
		#if BLOCK * BLOCK > 1024
		#error BLOCKSIZE must be <= 1024
		#endif

		#if BLOCK * BLOCK > SIZE
		#error BLOCKSIZE must be <= SIZE
		#endif

		#define LOOP 100
		#define NDEBUG

		double wall_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_REALTIME, &ts);
		const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

		return ret;
		}

		void CPU_mat_mult(const MyData *const __restrict__ A,
		const MyData *const __restrict__ B,
		MyData *const __restrict__ C,
		const size_t size)
		{
		for (size_t i=0 ; i<size ; i++)
		for (size_t j=0 ; j<size ; j++)
		for (size_t k=0 ; k<size ; k++)
		C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);

		return;
		}

		void CPU_mat_mult_block(const MyData *const __restrict__ A,
		const MyData *const __restrict__ B,
		MyData *const __restrict__ C,
		const size_t size)
		{
		const size_t Nblocks = (size / BLOCK);

		// loop over blocks of matrix C
		for (size_t ib=0 ; ib<Nblocks ; ib++)
		{
		for (size_t jb=0 ; jb<Nblocks ; jb++)
		{
		// loop over blocks of rows of A
		for (size_t kb=0 ; kb<Nblocks ; kb++)
		{
		// Matrix multiplication within a block
		for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++)
		for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++)
		for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++)
		C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
		} // kb
		} // jb
		} // ib

		return;
		}

		__global__ void GPU_mat_mult_block(const MyData *const __restrict__ A,
		const MyData *const __restrict__ B,
		MyData *const __restrict__ C,
		const size_t size)
		{
		const size_t size2 = (size * size);

		const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
		if (globalID >= size2)
		return;

		const size_t Nblocks = size / BLOCK; // number of blocks to loop over A and B
		const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block
		const size_t jb = blockIdx.x % Nblocks;
		const size_t i_local = threadIdx.x / BLOCK; // local matrix's indexes mapped to each CUDA thread
		const size_t j_local = threadIdx.x % BLOCK; // within its own block
		const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
		const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices

		C[(i_global * size) + j_global] = (MyData)0;

		// loop over blocks
		for (size_t block=0 ; block<Nblocks ; block++)
		{
		const size_t j_A = (block * BLOCK);
		const size_t i_B = (block * BLOCK);

		// perform the matrix multiplication within the block
		MyData value = (MyData)0;
		for (size_t k=0 ; k<BLOCK ; k++)
		value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global];

		C[(i_global * size) + j_global] += value;
		}

		return;
		}

		__global__ void GPU_mat_mult_block_shared(const MyData *const __restrict__ A,
		const MyData *const __restrict__ B,
		MyData *const __restrict__ C,
		const size_t size)
		{
		__shared__ MyData Ablock[BLOCK * BLOCK];
		__shared__ MyData Bblock[BLOCK * BLOCK];
		__shared__ MyData Cblock[BLOCK * BLOCK];

		const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
		const size_t size2 = (size * size);
		if (globalID >= size2)
		return;

		const size_t Nblocks = size / BLOCK; // number of blocks to loop over
		const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block
		const size_t jb = blockIdx.x % Nblocks;
		const size_t i_local = threadIdx.x / BLOCK; // local matrix's indexes mapped to each CUDA thread
		const size_t j_local = threadIdx.x % BLOCK; // within its own block
		const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
		const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices

		// Init Cblock
		Cblock[(i_local * BLOCK) + j_local] = (MyData)0;

		// loop over blocks
		for (size_t block=0 ; block<Nblocks ; block++)
		{
		const size_t j_A = (block * BLOCK);
		const size_t i_B = (block * BLOCK);

		// waits until all threads in the thread block have reached this point and shared memory accesses
		// made by these threads prior to __syncthreads() are visible to all threads in the block.
		__syncthreads();

		// copy block of A into shared memory
		Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A];
		// copy block of B into shared memory
		Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global];

		// waits until all threads in the thread block have reached this point and shared memory accesses // made by these threads prior to __syncthreads() are visible to all threads in the block.
		__syncthreads();

		// perform the matrix multiplication within the block
		MyData value = (MyData)0;
		for (size_t k=0 ; k<BLOCK ; k++)
		value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local];

		// store the partial result in shared memory
		Cblock[(i_local * BLOCK) + j_local] += value;
		}

		C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local];

		return;
		}

		void check(const MyData *const __restrict__ cpu_matrix,
		const MyData *const __restrict__ gpu_matrix)
		{
		int flag = 0;
		for (size_t i=0 ; i<SIZE ; i++)
		flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);

		if (!flag)
		printf("\n\t Result OK");
		else
		printf("\n\t Result wrong");

		return;
		}

		int main()
		{
		double time;
		MyData buffer_cpu = (MyData )calloc(4 * SIZE, sizeof(MyData));
		assert(buffer_cpu != NULL);

		// host reference matrix A
		MyData *const __restrict__ A_CPU = buffer_cpu;
		MyData *const __restrict__ B_CPU = A_CPU + SIZE;
		MyData *const __restrict__ C_CPU = B_CPU + SIZE;
		MyData *const __restrict__ C_GPU_host = C_CPU + SIZE;
		for (size_t i=0 ; i<SIZE ; i++)
		{
		A_CPU[i] = drand48();
		B_CPU[i] = drand48();
		}

		// Get the number of compute-capable devices
		int NumGPUs{-1};
		cudaGetDeviceCount(&NumGPUs);
		if (NumGPUs <= 0)
		{
		printf("\n\t Nvidia-GPU is not available \n\n");
		return EXIT_FAILURE;
		}

		int devID = 0;
		// Init PMT
		PMT_CREATE(&devID, 1);

		////////////////////////// CPU naive algorithm //////////////////////////////////////////
		PMT_CPU_START("CPU_mat_mult_block");
		CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
		PMT_CPU_STOP("CPU_mat_mult_block");
		/////////////////////////////////////////////////////////////////////////////////////////

		// copy/alloc data to the GPU
		MyData *buffer_gpu = NULL;
		cudaMalloc((void *)&buffer_gpu, (3 SIZE * sizeof(MyData)));
		MyData *const A_GPU = buffer_gpu;
		MyData *const B_GPU = A_GPU + SIZE;
		MyData *const C_GPU = B_GPU + SIZE;
		cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);

		const dim3 nblocks = {(SIZE + (BLOCK * BLOCK) -1)/(BLOCK * BLOCK), 1, 1};
		const dim3 block = {(BLOCK * BLOCK), 1, 1};

		/////////////////////////// GPU naive block algorithm ////////////////////////////////////////
		GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
		cudaDeviceSynchronize();
		time = 0.0;
		PMT_GPU_START("GPU_mat_mult_block", 0);
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
		cudaDeviceSynchronize();
		time += (wall_time() - start);
		}
		PMT_GPU_STOP("GPU_mat_mult_block", 0);
		cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);

		check(C_CPU, C_GPU_host);
		printf("\n\t GPU naive block time %lg [s]\n", (time / LOOP));
		/////////////////////////////////////////////////////////////////////////////////////

		/////////////////////////// GPU block shared memory algorithm ///////////////////////
		GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
		cudaDeviceSynchronize();
		time = 0.0;
		PMT_GPU_START("GPU_mat_mult_block_shared", 0);
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		const double start = wall_time();
		GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
		cudaDeviceSynchronize();
		time += (wall_time() - start);
		}
		PMT_GPU_STOP("GPU_mat_mult_block_shared", 0);
		cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);

		check(C_CPU, C_GPU_host);
		printf("\n\t GPU block shared memory time %lg [s]\n", (time / LOOP));
		/////////////////////////////////////////////////////////////////////////////////////

		// free CPU memory
		free(buffer_cpu);
		// free GPU memory
		cudaFree(buffer_gpu);

		printf("\n");

		PMT_CPU_SHOW("CPU_mat_mult_block");
		PMT_GPU_SHOW("GPU_mat_mult_block");
		PMT_GPU_SHOW("GPU_mat_mult_block_shared");

		return EXIT_SUCCESS;
		}