energy PMT (185a9cf7) · Commits · HPC_SCHOOL / HPC_SCHOOL_2024

cuda-omp/cuda/energy/Makefile

+20 −6

Original line number	Diff line number	Diff line
		@@ -9,22 +9,36 @@ OPTIONS += -D_ENERGY_RAPL_
		OPTIONS += -D_ENERGY_NVIDIA_
		# enable AMD
		OPTIONS += # -D_ENERGY_AMD_

		PMT = /leonardo/home/userexternal/dgoz0000/lib/pmt/local
		# PMT = /home/darkenergy/software/pmt
		INC = -I$(PMT)/include
		LIB = -L$(PMT)/lib64 -lpmt -lm

		endif

		NVCPP = nvc++
		TOOLCHAIN ?=

		NVCPP = nvc++ -std=c++17 # --gcc-toolchain=$(TOOLCHAIN)
		OPT = -O3
		INC = /home/darkenergy/software/pmt/include
		LIB = /home/darkenergy/software/pmt/lib -lpmt -lm
		DEBUG = -O0 -g

		all: mat_mult
		.PHONY: clean test
		.PHONY: clean test debug

		mat_mult: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile
		$(NVCPP) $(OPT) $(OPTIONS) -I$(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ -L$(LIB)
		$(NVCPP) $(OPT) $(OPTIONS) $(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ $(LIB)
		ldd ./mat_mult

		mat_mult_debug: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile
		$(NVCPP) $(DEBUG) $(OPTIONS) $(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ $(LIB)
		ldd ./mat_mult_debug

		test: mat_mult
		./mat_mult

		debug: mat_mult_debug
		cuda-gdb ./$<

		clean:
		rm -rf .o mat_mult ~ energy/~ energy/.o
		rm -rf .o mat_mult mat_mult_debug ~ energy/~ energy/.o

cuda-omp/cuda/energy/energy/energy_pmt.h

+3 −3

Original line number	Diff line number	Diff line
		@@ -5,7 +5,7 @@
		#if defined(_ENERGY_RAPL_) \|\| defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		#define PMT_CREATE(devID, numGPUs) Create_PMT((devID), (numGPUs))
		#else
		#define PMT_CREATE(numGPUs)
		#define PMT_CREATE(devID, numGPUs)
		#endif // defined(_ENERGY_RAPL_) \|\| defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)

		#if defined(_ENERGY_RAPL_)
		@@ -23,8 +23,8 @@
		#define PMT_GPU_STOP(string, devID) Stop_PMT_GPU((string), (devID))
		#define PMT_GPU_SHOW(string) Show_PMT_GPU((string))
		#else
		#define PMT_GPU_START(string, dev)
		#define PMT_GPU_STOP(string, dev)
		#define PMT_GPU_START(string, devID)
		#define PMT_GPU_STOP(string, devID)
		#define PMT_GPU_SHOW(string)
		#endif // defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)

cuda-omp/cuda/energy/energy/energy_pmt_methods.cpp

+74 −54

Original line number	Diff line number	Diff line
		@@ -49,7 +49,7 @@ void Create_PMT(int *devID,
		{
		#if defined(_ENERGY_RAPL_)

		sensor_cpu = pmt::Create("rapl");
		sensor_cpu = pmt::rapl::Rapl::Create();

		#endif // _ENERGY_RAPL_

		@@ -61,9 +61,9 @@ void Create_PMT(int *devID,
		sensor_gpu.insert({devID[dev],

		#if defined(_ENERGY_NVIDIA_)
		pmt::nvml::NVML::Create(dev)});
		pmt::nvml::NVML::Create(devID[dev])});
		#elif defined(_ENERGY_AMD_)
		pmt::rocm::AMD::Create(dev)});
		pmt::rocm::AMD::Create(devID[dev])});
		#endif

		#endif // defined(_ENERGY_NVIDIA_) \|\| defined(_ENERGY_AMD_)
		@@ -83,19 +83,25 @@ void Start_PMT_CPU(const char *label)
		return;
		}

		const std::string tag{std::string{label}};

		// check if the label already exists
		if (state_cpu.count(std::string{label}))
		if (state_cpu.count(tag))
		{
		state_cpu[std::string{label}].start = sensor_cpu->Read();
		state_cpu[tag].start = sensor_cpu->Read();
		}
		else
		{
		// create new EnergyState
		const EnergyState newState{sensor_cpu->Read(),
		static_cast<pmt::State>(0),
		static_cast<double>(0),
		static_cast<double>(0),
		static_cast<double>(0),
		static_cast<unsigned int>(0)};

		// insert the key and initialize the counters
		state_cpu.insert({std::string{label},
		{sensor_cpu->Read(),
		0,
		0.0, 0.0, 0.0, 0
		}});
		state_cpu.insert(std::pair<std::string, EnergyState>(tag, newState));
		}

		return;
		@@ -109,9 +115,11 @@ void Stop_PMT_CPU(const char *label)
		return;
		}

		const std::string tag{std::string{label}};

		// check if the label already exists
		// if not error
		if (!state_cpu.count(std::string{label}))
		if (!state_cpu.count(tag))
		{
		PMT_ERROR = true;
		PMT_err();
		@@ -119,23 +127,20 @@ void Stop_PMT_CPU(const char *label)
		}
		else
		{
		// get the energy state
		EnergyState &State = state_cpu[tag];

		// read the counter
		state_cpu[std::string{label}].stop = sensor_cpu->Read();
		State.stop = sensor_cpu->Read();

		// update quantities
		state_cpu[std::string{label}].seconds +=
		sensor_cpu->seconds(state_cpu[std::string{label}].start,
		state_cpu[std::string{label}].stop);
		State.seconds += sensor_cpu->seconds(State.start, State.stop);

		state_cpu[std::string{label}].joules +=
		sensor_cpu->joules(state_cpu[std::string{label}].start,
		state_cpu[std::string{label}].stop);
		State.joules += sensor_cpu->joules(State.start, State.stop);

		state_cpu[std::string{label}].watts +=
		sensor_cpu->watts(state_cpu[std::string{label}].start,
		state_cpu[std::string{label}].stop);
		State.watts += sensor_cpu->watts(State.start, State.stop);

		state_cpu[std::string{label}].count++;
		State.count++;
		}

		return;
		@@ -143,17 +148,19 @@ void Stop_PMT_CPU(const char *label)

		void Show_PMT_CPU(const char *label)
		{
		if (PMT_ERROR \|\| !state_cpu.count(std::string{label}))
		const std::string tag{std::string{label}};

		if (PMT_ERROR \|\| !state_cpu.count(tag))
		{
		PMT_err();
		return;
		}
		else
		{
		std::cout << "\n\t CPU Kernel:" << std::string{label} << ":" << std::endl;
		std::cout << "\t\t" << state_cpu[std::string{label}].seconds << " [S]" << std::endl;
		std::cout << "\t\t" << state_cpu[std::string{label}].joules << " [J]" << std::endl;
		std::cout << "\t\t" << state_cpu[std::string{label}].watts / state_cpu[std::string{label}].count << " [W]" << "\n" << std::endl;
		std::cout << "\n\t CPU Kernel:" << tag << ":" << std::endl;
		std::cout << "\t\t" << state_cpu[tag].seconds << " [S]" << std::endl;
		std::cout << "\t\t" << state_cpu[tag].joules << " [J]" << std::endl;
		std::cout << "\t\t" << state_cpu[tag].watts / state_cpu[tag].count << " [W]" << "\n" << std::endl;
		}

		return;
		@@ -174,23 +181,28 @@ void Start_PMT_GPU(const char *label,
		if (!state_gpu.count(devID))
		{
		// insert devID
		state_gpu.insert({devID, {}});
		state_gpu.insert(std::pair<int, std::map<std::string, EnergyState>>(devID, {}));
		}

		const std::string tag{std::string{label}};

		// check if the label already exists
		if (state_gpu[devID].count(std::string{label}))
		if (state_gpu[devID].count(tag))
		{
		// read the sensor
		state_gpu[devID][std::string{label}].start = sensor_gpu[devID]->Read();
		state_gpu[devID][tag].start = sensor_gpu[devID]->Read();
		}
		else
		{
		// insert the label and initialize the counters
		state_gpu[devID].insert({std::string{label},
		{
		sensor_gpu[devID]->Read(),
		0,
		0.0, 0.0, 0.0, 0
		}});
		const EnergyState newState{sensor_gpu[devID]->Read(),
		static_cast<pmt::State>(0),
		static_cast<double>(0),
		static_cast<double>(0),
		static_cast<double>(0),
		static_cast<unsigned int>(0)};

		state_gpu[devID].insert(std::pair<std::string, EnergyState>(tag, newState));
		}

		return;
		@@ -209,9 +221,11 @@ void Stop_PMT_GPU(const char *label,
		}
		else
		{
		const std::string tag{std::string{label}};

		// check if the label already exists
		// if not error
		if (!state_gpu[devID].count(std::string{label}))
		if (!state_gpu[devID].count(tag))
		{
		PMT_ERROR = true;
		PMT_err();
		@@ -219,23 +233,25 @@ void Stop_PMT_GPU(const char *label,
		}
		else
		{
		EnergyState &State = state_gpu[devID][tag];

		// read the counter
		state_gpu[devID][std::string{label}].stop = sensor_gpu[devID]->Read();
		State.stop = sensor_gpu[devID]->Read();

		// update quantities
		state_gpu[devID][std::string{label}].seconds +=
		sensor_gpu[devID]->seconds(state_gpu[devID][std::string{label}].start,
		state_gpu[devID][std::string{label}].stop);
		State.seconds +=
		sensor_gpu[devID]->seconds(State.start,
		State.stop);

		state_gpu[devID][std::string{label}].joules +=
		sensor_gpu[devID]->joules(state_gpu[devID][std::string{label}].start,
		state_gpu[devID][std::string{label}].stop);
		State.joules +=
		sensor_gpu[devID]->joules(State.start,
		State.stop);

		state_gpu[devID][std::string{label}].watts +=
		sensor_gpu[devID]->watts(state_gpu[devID][std::string{label}].start,
		state_gpu[devID][std::string{label}].stop);
		State.watts +=
		sensor_gpu[devID]->watts(State.start,
		State.stop);

		state_gpu[devID][std::string{label}].count++;
		State.count++;
		}
		}

		@@ -251,13 +267,17 @@ void Show_PMT_GPU(const char *label)
		}
		else
		{
		// show quantities for all devices
		const std::string tag{std::string{label}};

		for (const auto &[key, value]: state_gpu)
		{
		std::cout << "\n\t GPU [" << key << "] kernel:" << std::string{label} << ":" << std::endl;
		std::cout << "\t\t" << value.at(std::string{label}).seconds << " [s]" << std::endl;
		std::cout << "\t\t" << value.at(std::string{label}).joules << " [J]" << std::endl;
		std::cout << "\t\t" << value.at(std::string{label}).watts / value.at(std::string{label}).count << " [W]" << "\n" << std::endl;
		if (value.count(tag))
		{
		std::cout << "\n\t GPU [" << key << "] kernel:" << tag << ":" << std::endl;
		std::cout << "\t\t" << value.at(tag).seconds << " [s]" << std::endl;
		std::cout << "\t\t" << value.at(tag).joules << " [J]" << std::endl;
		std::cout << "\t\t" << value.at(tag).watts / value.at(tag).count << " [W]" << "\n" << std::endl;
		}
		}
		}

cuda-omp/cuda/energy/mat_mult_block.cu

+52 −46

Original line number	Diff line number	Diff line
		@@ -39,6 +39,7 @@
		//////////////////////////////////////////////////////////////////////////////////////////////////

		#include <stdio.h>
		#include <string.h>
		#include <stdlib.h>
		#include <unistd.h>
		#include <time.h>
		@@ -51,20 +52,21 @@
		#include "energy/energy_pmt.h"

		#define N 1024
		#define SIZE (N * N) // matrix size
		#define SIZE ((N) * (N)) // matrix size
		typedef double MyData; // do not change
		#define BLOCK 16
		#define BLOCK_ 16
		#define BLOCKSIZE ((BLOCK_) * (BLOCK_))

		// sanity check
		#if BLOCK * BLOCK > 1024
		#if BLOCKSIZE > 1024
		#error BLOCKSIZE must be <= 1024
		#endif

		#if BLOCK * BLOCK > SIZE
		#if BLOCKSIZE > SIZE
		#error BLOCKSIZE must be <= SIZE
		#endif

		#define LOOP 100
		#define LOOP 3
		#define NDEBUG

		double wall_time()
		@@ -94,7 +96,7 @@ void CPU_mat_mult_block(const MyData *const __restrict__ A,
		MyData *const __restrict__ C,
		const size_t size)
		{
		const size_t Nblocks = (size / BLOCK);
		const size_t Nblocks = (size / BLOCK_);

		// loop over blocks of matrix C
		for (size_t ib=0 ; ib<Nblocks ; ib++)
		@@ -105,9 +107,9 @@ void CPU_mat_mult_block(const MyData *const __restrict__ A,
		for (size_t kb=0 ; kb<Nblocks ; kb++)
		{
		// Matrix multiplication within a block
		for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++)
		for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++)
		for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++)
		for (size_t i=(ib * BLOCK_) ; i<((ib + 1) * BLOCK_) ; i++)
		for (size_t j=(jb * BLOCK_) ; j<((jb + 1) * BLOCK_) ; j++)
		for (size_t k=(kb * BLOCK_) ; k<((kb + 1) * BLOCK_) ; k++)
		C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
		} // kb
		} // jb
		@@ -127,25 +129,25 @@ __global__ void GPU_mat_mult_block(const MyData *const __restrict__ A,
		if (globalID >= size2)
		return;

		const size_t Nblocks = size / BLOCK; // number of blocks to loop over A and B
		const size_t Nblocks = size / BLOCK_; // number of blocks to loop over A and B
		const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block
		const size_t jb = blockIdx.x % Nblocks;
		const size_t i_local = threadIdx.x / BLOCK; // local matrix's indexes mapped to each CUDA thread
		const size_t j_local = threadIdx.x % BLOCK; // within its own block
		const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
		const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices
		const size_t i_local = threadIdx.x / BLOCK_; // local matrix's indexes mapped to each CUDA thread
		const size_t j_local = threadIdx.x % BLOCK_; // within its own block
		const size_t i_global = i_local + (ib * BLOCK_); // global matrix's indexes mapped to each CUDA thread
		const size_t j_global = j_local + (jb * BLOCK_); // N.B. uncoalescent memory accesses to A and B matrices

		C[(i_global * size) + j_global] = (MyData)0;

		// loop over blocks
		for (size_t block=0 ; block<Nblocks ; block++)
		{
		const size_t j_A = (block * BLOCK);
		const size_t i_B = (block * BLOCK);
		const size_t j_A = (block * BLOCK_);
		const size_t i_B = (block * BLOCK_);

		// perform the matrix multiplication within the block
		MyData value = (MyData)0;
		for (size_t k=0 ; k<BLOCK ; k++)
		for (size_t k=0 ; k<BLOCK_ ; k++)
		value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global];

		C[(i_global * size) + j_global] += value;
		@@ -159,54 +161,54 @@ __global__ void GPU_mat_mult_block_shared(const MyData *const __restrict__ A,
		MyData *const __restrict__ C,
		const size_t size)
		{
		__shared__ MyData Ablock[BLOCK * BLOCK];
		__shared__ MyData Bblock[BLOCK * BLOCK];
		__shared__ MyData Cblock[BLOCK * BLOCK];
		__shared__ MyData Ablock[BLOCKSIZE];
		__shared__ MyData Bblock[BLOCKSIZE];
		__shared__ MyData Cblock[BLOCKSIZE];

		const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
		const size_t size2 = (size * size);
		if (globalID >= size2)
		return;

		const size_t Nblocks = size / BLOCK; // number of blocks to loop over
		const size_t Nblocks = size / BLOCK_; // number of blocks to loop over
		const size_t ib = blockIdx.x / Nblocks; // indexes of starting matrix's block
		const size_t jb = blockIdx.x % Nblocks;
		const size_t i_local = threadIdx.x / BLOCK; // local matrix's indexes mapped to each CUDA thread
		const size_t j_local = threadIdx.x % BLOCK; // within its own block
		const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
		const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices
		const size_t i_local = threadIdx.x / BLOCK_; // local matrix's indexes mapped to each CUDA thread
		const size_t j_local = threadIdx.x % BLOCK_; // within its own block
		const size_t i_global = i_local + (ib * BLOCK_); // global matrix's indexes mapped to each CUDA thread
		const size_t j_global = j_local + (jb * BLOCK_); // N.B. uncoalescent memory accesses to A and B matrices

		// Init Cblock
		Cblock[(i_local * BLOCK) + j_local] = (MyData)0;
		Cblock[(i_local * BLOCK_) + j_local] = (MyData)0;

		// loop over blocks
		for (size_t block=0 ; block<Nblocks ; block++)
		{
		const size_t j_A = (block * BLOCK);
		const size_t i_B = (block * BLOCK);
		const size_t j_A = (block * BLOCK_);
		const size_t i_B = (block * BLOCK_);

		// waits until all threads in the thread block have reached this point and shared memory accesses
		// made by these threads prior to __syncthreads() are visible to all threads in the block.
		__syncthreads();

		// copy block of A into shared memory
		Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A];
		Ablock[(i_local * BLOCK_) + j_local] = A[(i_global * size) + j_local + j_A];
		// copy block of B into shared memory
		Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global];
		Bblock[(i_local * BLOCK_) + j_local] = B[((i_local + i_B) * size) + j_global];

		// waits until all threads in the thread block have reached this point and shared memory accesses // made by these threads prior to __syncthreads() are visible to all threads in the block.
		__syncthreads();

		// perform the matrix multiplication within the block
		MyData value = (MyData)0;
		for (size_t k=0 ; k<BLOCK ; k++)
		value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local];
		for (size_t k=0 ; k<BLOCK_ ; k++)
		value += Ablock[(i_local * BLOCK_) + k] * Bblock[(k * BLOCK_) + j_local];

		// store the partial result in shared memory
		Cblock[(i_local * BLOCK) + j_local] += value;
		Cblock[(i_local * BLOCK_) + j_local] += value;
		}

		C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local];
		C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK_) + j_local];

		return;
		}
		@@ -257,9 +259,13 @@ int main()
		PMT_CREATE(&devID, 1);

		////////////////////////// CPU naive algorithm //////////////////////////////////////////
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		PMT_CPU_START("CPU_mat_mult_block");
		memset(C_CPU, 0, SIZE * sizeof(MyData));
		CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
		PMT_CPU_STOP("CPU_mat_mult_block");
		}
		/////////////////////////////////////////////////////////////////////////////////////////

		// copy/alloc data to the GPU
		@@ -270,22 +276,22 @@ int main()
		MyData *const C_GPU = B_GPU + SIZE;
		cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);

		const dim3 nblocks = {(SIZE + (BLOCK * BLOCK) -1)/(BLOCK * BLOCK), 1, 1};
		const dim3 block = {(BLOCK * BLOCK), 1, 1};
		const unsigned int nblocks = ((SIZE + (BLOCKSIZE) - 1) / BLOCKSIZE);
		const unsigned int block = BLOCKSIZE;

		/////////////////////////// GPU naive block algorithm ////////////////////////////////////////
		GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
		cudaDeviceSynchronize();
		time = 0.0;
		PMT_GPU_START("GPU_mat_mult_block", 0);
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		PMT_GPU_START("GPU_mat_mult_block", 0);
		const double start = wall_time();
		GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
		cudaDeviceSynchronize();
		time += (wall_time() - start);
		}
		PMT_GPU_STOP("GPU_mat_mult_block", 0);
		}
		cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);

		check(C_CPU, C_GPU_host);
		@@ -296,15 +302,15 @@ int main()
		GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
		cudaDeviceSynchronize();
		time = 0.0;
		PMT_GPU_START("GPU_mat_mult_block_shared", 0);
		for (unsigned short int loop=0 ; loop<LOOP ; loop++)
		{
		PMT_GPU_START("GPU_mat_mult_block_shared", 0);
		const double start = wall_time();
		GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
		cudaDeviceSynchronize();
		time += (wall_time() - start);
		}
		PMT_GPU_STOP("GPU_mat_mult_block_shared", 0);
		}
		cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);

		check(C_CPU, C_GPU_host);

cuda-omp/cuda/energy/mat_mult_debug

0 → 100755

+436 KiB

File added.

No diff preview for this file type.

View file