Commit 185a9cf7 authored by David Goz's avatar David Goz
Browse files

energy PMT

parent 17762d0f
Loading
Loading
Loading
Loading
+20 −6
Original line number Diff line number Diff line
@@ -9,22 +9,36 @@ OPTIONS += -D_ENERGY_RAPL_
OPTIONS += -D_ENERGY_NVIDIA_
# enable AMD
OPTIONS += # -D_ENERGY_AMD_

PMT = /leonardo/home/userexternal/dgoz0000/lib/pmt/local
# PMT   = /home/darkenergy/software/pmt
INC = -I$(PMT)/include
LIB = -L$(PMT)/lib64 -lpmt -lm

endif

NVCPP = nvc++
TOOLCHAIN ?= 

NVCPP = nvc++ -std=c++17 # --gcc-toolchain=$(TOOLCHAIN)
OPT   = -O3
INC   = /home/darkenergy/software/pmt/include
LIB   = /home/darkenergy/software/pmt/lib -lpmt -lm
DEBUG = -O0 -g

all: mat_mult
.PHONY: clean test
.PHONY: clean test debug

mat_mult: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile
	$(NVCPP) $(OPT) $(OPTIONS) -I$(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ -L$(LIB)
	$(NVCPP) $(OPT) $(OPTIONS) $(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ $(LIB)
	ldd ./mat_mult

mat_mult_debug: mat_mult_block.cu energy/energy_pmt_methods.cpp energy/energy_pmt.h energy/energy_pmt_methods.h Makefile
	$(NVCPP) $(DEBUG) $(OPTIONS) $(INC) mat_mult_block.cu energy/energy_pmt_methods.cpp -o $@ $(LIB)
	ldd ./mat_mult_debug

test: mat_mult
	./mat_mult

debug: mat_mult_debug
	cuda-gdb ./$<

clean:
	rm -rf *.o mat_mult *~ energy/*~ energy/*.o
	rm -rf *.o mat_mult mat_mult_debug *~ energy/*~ energy/*.o
+3 −3
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@
#if defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
   #define PMT_CREATE(devID, numGPUs) Create_PMT((devID), (numGPUs))
#else
   #define PMT_CREATE(numGPUs)
   #define PMT_CREATE(devID, numGPUs)
#endif // defined(_ENERGY_RAPL_) || defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)

#if defined(_ENERGY_RAPL_)
@@ -23,8 +23,8 @@
   #define PMT_GPU_STOP(string, devID)  Stop_PMT_GPU((string), (devID))
   #define PMT_GPU_SHOW(string)  Show_PMT_GPU((string))
#else
   #define PMT_GPU_START(string, dev)
   #define PMT_GPU_STOP(string, dev)
   #define PMT_GPU_START(string, devID)
   #define PMT_GPU_STOP(string, devID)
   #define PMT_GPU_SHOW(string)
#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
+74 −54
Original line number Diff line number Diff line
@@ -49,7 +49,7 @@ void Create_PMT(int *devID,
{
#if defined(_ENERGY_RAPL_)

  sensor_cpu = pmt::Create("rapl");
  sensor_cpu = pmt::rapl::Rapl::Create();

#endif // _ENERGY_RAPL_
  
@@ -61,9 +61,9 @@ void Create_PMT(int *devID,
	  sensor_gpu.insert({devID[dev],

#if defined(_ENERGY_NVIDIA_)
	                     pmt::nvml::NVML::Create(dev)});
	                     pmt::nvml::NVML::Create(devID[dev])});
#elif defined(_ENERGY_AMD_)
	                     pmt::rocm::AMD::Create(dev)});
	                     pmt::rocm::AMD::Create(devID[dev])});
#endif

#endif // defined(_ENERGY_NVIDIA_) || defined(_ENERGY_AMD_)
@@ -83,19 +83,25 @@ void Start_PMT_CPU(const char *label)
      return;
    }

  const std::string tag{std::string{label}};

  // check if the label already exists
  if (state_cpu.count(std::string{label}))
  if (state_cpu.count(tag))
    {
      state_cpu[std::string{label}].start = sensor_cpu->Read();
      state_cpu[tag].start = sensor_cpu->Read();
    }
  else
    {
      // create new EnergyState
      const EnergyState newState{sensor_cpu->Read(),
                                 static_cast<pmt::State>(0),
                                 static_cast<double>(0),
                                 static_cast<double>(0),
                                 static_cast<double>(0),
                                 static_cast<unsigned int>(0)};

      // insert the key and initialize the counters
      state_cpu.insert({std::string{label},
			{sensor_cpu->Read(),
			 0,
			 0.0, 0.0, 0.0, 0
			}});
      state_cpu.insert(std::pair<std::string, EnergyState>(tag, newState));
    }

  return;
@@ -109,9 +115,11 @@ void Stop_PMT_CPU(const char *label)
      return;
    }

  const std::string tag{std::string{label}};
  
  // check if the label already exists
  // if not error
  if (!state_cpu.count(std::string{label}))
  if (!state_cpu.count(tag))
    {
      PMT_ERROR = true;
      PMT_err();
@@ -119,23 +127,20 @@ void Stop_PMT_CPU(const char *label)
    }
  else
    {
      // get the energy state
      EnergyState &State = state_cpu[tag];
      
      // read the counter
      state_cpu[std::string{label}].stop = sensor_cpu->Read();
      State.stop = sensor_cpu->Read();

      // update quantities
      state_cpu[std::string{label}].seconds +=
	sensor_cpu->seconds(state_cpu[std::string{label}].start,
			    state_cpu[std::string{label}].stop);
      State.seconds += sensor_cpu->seconds(State.start, State.stop);
      
      state_cpu[std::string{label}].joules +=
	sensor_cpu->joules(state_cpu[std::string{label}].start,
			   state_cpu[std::string{label}].stop);
      State.joules  += sensor_cpu->joules(State.start, State.stop);

      state_cpu[std::string{label}].watts +=
	sensor_cpu->watts(state_cpu[std::string{label}].start,
			  state_cpu[std::string{label}].stop);
      State.watts   += sensor_cpu->watts(State.start, State.stop);

      state_cpu[std::string{label}].count++;
      State.count++;
    }
  
  return;
@@ -143,17 +148,19 @@ void Stop_PMT_CPU(const char *label)

void Show_PMT_CPU(const char *label)
{
  if (PMT_ERROR || !state_cpu.count(std::string{label}))
  const std::string tag{std::string{label}};
  
  if (PMT_ERROR || !state_cpu.count(tag))
    {
      PMT_err();
      return;
    }
  else
    {
      std::cout << "\n\t CPU Kernel:" << std::string{label} << ":" << std::endl;
      std::cout << "\t\t" << state_cpu[std::string{label}].seconds << " [S]" << std::endl;
      std::cout << "\t\t" << state_cpu[std::string{label}].joules  << " [J]" << std::endl;
      std::cout << "\t\t" << state_cpu[std::string{label}].watts / state_cpu[std::string{label}].count  << " [W]" << "\n" << std::endl;
      std::cout << "\n\t CPU Kernel:" << tag << ":" << std::endl;
      std::cout << "\t\t" << state_cpu[tag].seconds << " [S]" << std::endl;
      std::cout << "\t\t" << state_cpu[tag].joules  << " [J]" << std::endl;
      std::cout << "\t\t" << state_cpu[tag].watts / state_cpu[tag].count  << " [W]" << "\n" << std::endl;
    }
  
  return;
@@ -174,23 +181,28 @@ void Start_PMT_GPU(const char *label,
  if (!state_gpu.count(devID))
    {
      // insert devID
      state_gpu.insert({devID, {}});
      state_gpu.insert(std::pair<int, std::map<std::string, EnergyState>>(devID, {}));
    }

  const std::string tag{std::string{label}};
  
  // check if the label already exists
  if (state_gpu[devID].count(std::string{label}))
  if (state_gpu[devID].count(tag))
    {
      // read the sensor
      state_gpu[devID][std::string{label}].start = sensor_gpu[devID]->Read();
      state_gpu[devID][tag].start = sensor_gpu[devID]->Read();
    }
  else
    {
      // insert the label and initialize the counters
      state_gpu[devID].insert({std::string{label},
			       {
				 sensor_gpu[devID]->Read(),
				 0,
				 0.0, 0.0, 0.0, 0
			       }});
      const EnergyState newState{sensor_gpu[devID]->Read(),
                                 static_cast<pmt::State>(0),
                                 static_cast<double>(0),
                                 static_cast<double>(0),
                                 static_cast<double>(0),
                                 static_cast<unsigned int>(0)};

      state_gpu[devID].insert(std::pair<std::string, EnergyState>(tag, newState));      
    }

  return;
@@ -209,9 +221,11 @@ void Stop_PMT_GPU(const char *label,
    }
  else
    {
      const std::string tag{std::string{label}};
      
      // check if the label already exists
      // if not error
      if (!state_gpu[devID].count(std::string{label}))
      if (!state_gpu[devID].count(tag))
	{
	  PMT_ERROR = true;
	  PMT_err();
@@ -219,23 +233,25 @@ void Stop_PMT_GPU(const char *label,
	}
      else
	{
	  EnergyState &State = state_gpu[devID][tag];
	  
	  // read the counter
	  state_gpu[devID][std::string{label}].stop = sensor_gpu[devID]->Read();
	  State.stop = sensor_gpu[devID]->Read();

	  // update quantities
	  state_gpu[devID][std::string{label}].seconds +=
	    sensor_gpu[devID]->seconds(state_gpu[devID][std::string{label}].start,
				       state_gpu[devID][std::string{label}].stop);
	  State.seconds +=
	    sensor_gpu[devID]->seconds(State.start,
				       State.stop);
      
	  state_gpu[devID][std::string{label}].joules +=
	    sensor_gpu[devID]->joules(state_gpu[devID][std::string{label}].start,
				      state_gpu[devID][std::string{label}].stop);
	  State.joules +=
	    sensor_gpu[devID]->joules(State.start,
				      State.stop);

	  state_gpu[devID][std::string{label}].watts +=
	    sensor_gpu[devID]->watts(state_gpu[devID][std::string{label}].start,
				     state_gpu[devID][std::string{label}].stop);
	  State.watts +=
	    sensor_gpu[devID]->watts(State.start,
				     State.stop);
      
	  state_gpu[devID][std::string{label}].count++;
	  State.count++;
	}
    }
  
@@ -251,13 +267,17 @@ void Show_PMT_GPU(const char *label)
    }
  else
    {
      // show quantities for all devices
      const std::string tag{std::string{label}};

      for (const auto &[key, value]: state_gpu)
	{
	  std::cout << "\n\t GPU [" << key << "] kernel:" << std::string{label} << ":" << std::endl;
	  std::cout << "\t\t" << value.at(std::string{label}).seconds << " [s]" << std::endl;
	  std::cout << "\t\t" << value.at(std::string{label}).joules  << " [J]" << std::endl;
	  std::cout << "\t\t" << value.at(std::string{label}).watts / value.at(std::string{label}).count  << " [W]" << "\n" << std::endl;
	  if (value.count(tag))
	    {
	      std::cout << "\n\t GPU [" << key << "] kernel:" << tag << ":" << std::endl;
	      std::cout << "\t\t" << value.at(tag).seconds << " [s]" << std::endl;
	      std::cout << "\t\t" << value.at(tag).joules  << " [J]" << std::endl;
	      std::cout << "\t\t" << value.at(tag).watts / value.at(tag).count  << " [W]" << "\n" << std::endl;
	    }
	}
    }
  
+52 −46
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
@@ -51,20 +52,21 @@
#include "energy/energy_pmt.h"

#define N                    1024
#define SIZE                 (N * N) // matrix size
#define SIZE                 ((N) * (N)) // matrix size
typedef double MyData;               // do not change
#define BLOCK                16
#define BLOCK_               16
#define BLOCKSIZE            ((BLOCK_) * (BLOCK_))

// sanity check
#if BLOCK * BLOCK > 1024
#if BLOCKSIZE > 1024
#error BLOCKSIZE must be <= 1024
#endif

#if BLOCK * BLOCK > SIZE
#if BLOCKSIZE > SIZE
#error BLOCKSIZE must be <= SIZE
#endif

#define LOOP 100
#define LOOP 3
#define NDEBUG

double wall_time()
@@ -94,7 +96,7 @@ void CPU_mat_mult_block(const MyData *const __restrict__ A,
	                      MyData *const __restrict__ C,
			const size_t                 size)
{
  const size_t Nblocks = (size / BLOCK);
  const size_t Nblocks = (size / BLOCK_);

  // loop over blocks of matrix C
  for (size_t ib=0 ; ib<Nblocks ; ib++)
@@ -105,9 +107,9 @@ void CPU_mat_mult_block(const MyData *const __restrict__ A,
	  for (size_t kb=0 ; kb<Nblocks ; kb++)
	    {
	      // Matrix multiplication within a block
	      for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++)
		for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++)
		  for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++)
	      for (size_t i=(ib * BLOCK_) ; i<((ib + 1) * BLOCK_) ; i++)
		for (size_t j=(jb * BLOCK_) ; j<((jb + 1) * BLOCK_) ; j++)
		  for (size_t k=(kb * BLOCK_) ; k<((kb + 1) * BLOCK_) ; k++)
		    C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
	    } // kb
	} // jb
@@ -127,25 +129,25 @@ __global__ void GPU_mat_mult_block(const MyData *const __restrict__ A,
  if (globalID >= size2)
    return;
  
  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over A and B
  const size_t Nblocks  = size / BLOCK_;           // number of blocks to loop over A and B
  const size_t ib       = blockIdx.x / Nblocks;    // indexes of starting matrix's block
  const size_t jb       = blockIdx.x % Nblocks;
  const size_t i_local  = threadIdx.x / BLOCK;    // local matrix's indexes mapped to each CUDA thread
  const size_t j_local  = threadIdx.x % BLOCK;    // within its own block
  const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices
  const size_t i_local  = threadIdx.x / BLOCK_;    // local matrix's indexes mapped to each CUDA thread
  const size_t j_local  = threadIdx.x % BLOCK_;    // within its own block
  const size_t i_global = i_local + (ib * BLOCK_); // global matrix's indexes mapped to each CUDA thread
  const size_t j_global = j_local + (jb * BLOCK_); // N.B. uncoalescent memory accesses to A and B matrices
  
  C[(i_global * size) + j_global] = (MyData)0;

  // loop over blocks
  for (size_t block=0 ; block<Nblocks ; block++)
    {
      const size_t j_A = (block * BLOCK);
      const size_t i_B = (block * BLOCK);
      const size_t j_A = (block * BLOCK_);
      const size_t i_B = (block * BLOCK_);

      // perform the matrix multiplication within the block
      MyData value = (MyData)0;
      for (size_t k=0 ; k<BLOCK ; k++)
      for (size_t k=0 ; k<BLOCK_ ; k++)
	value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global];

      C[(i_global * size) + j_global] += value;
@@ -159,54 +161,54 @@ __global__ void GPU_mat_mult_block_shared(const MyData *const __restrict__ A,
					        MyData *const __restrict__ C,
					  const size_t                 size)
{
  __shared__ MyData Ablock[BLOCK * BLOCK];
  __shared__ MyData Bblock[BLOCK * BLOCK];
  __shared__ MyData Cblock[BLOCK * BLOCK];
  __shared__ MyData Ablock[BLOCKSIZE];
  __shared__ MyData Bblock[BLOCKSIZE];
  __shared__ MyData Cblock[BLOCKSIZE];
  
  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
  const size_t size2 = (size * size);
  if (globalID >= size2)
    return;
  
  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over
  const size_t Nblocks  = size / BLOCK_;           // number of blocks to loop over
  const size_t ib       = blockIdx.x / Nblocks;    // indexes of starting matrix's block
  const size_t jb       = blockIdx.x % Nblocks;
  const size_t i_local  = threadIdx.x / BLOCK;    // local matrix's indexes mapped to each CUDA thread
  const size_t j_local  = threadIdx.x % BLOCK;    // within its own block
  const size_t i_global = i_local + (ib * BLOCK); // global matrix's indexes mapped to each CUDA thread
  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalescent memory accesses to A and B matrices
  const size_t i_local  = threadIdx.x / BLOCK_;    // local matrix's indexes mapped to each CUDA thread
  const size_t j_local  = threadIdx.x % BLOCK_;    // within its own block
  const size_t i_global = i_local + (ib * BLOCK_); // global matrix's indexes mapped to each CUDA thread
  const size_t j_global = j_local + (jb * BLOCK_); // N.B. uncoalescent memory accesses to A and B matrices

  // Init Cblock
  Cblock[(i_local * BLOCK) + j_local] = (MyData)0;
  Cblock[(i_local * BLOCK_) + j_local] = (MyData)0;

  // loop over blocks
  for (size_t block=0 ; block<Nblocks ; block++)
    {
      const size_t j_A = (block * BLOCK);
      const size_t i_B = (block * BLOCK);
      const size_t j_A = (block * BLOCK_);
      const size_t i_B = (block * BLOCK_);

      // waits until all threads in the thread block have reached this point and shared memory accesses
      // made by these threads prior to __syncthreads() are visible to all threads in the block.
      __syncthreads();
      
      // copy block of A into shared memory
      Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A];
      Ablock[(i_local * BLOCK_) + j_local] = A[(i_global * size) + j_local + j_A];
      // copy block of B into shared memory
      Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global];
      Bblock[(i_local * BLOCK_) + j_local] = B[((i_local + i_B) * size) + j_global];
      
      // waits until all threads in the thread block have reached this point and shared memory accesses      // made by these threads prior to __syncthreads() are visible to all threads in the block.
      __syncthreads();

      // perform the matrix multiplication within the block
      MyData value = (MyData)0;
      for (size_t k=0 ; k<BLOCK ; k++)
	value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local];
      for (size_t k=0 ; k<BLOCK_ ; k++)
	value += Ablock[(i_local * BLOCK_) + k] * Bblock[(k * BLOCK_) + j_local];

      // store the partial result in shared memory
      Cblock[(i_local * BLOCK) + j_local] += value;
      Cblock[(i_local * BLOCK_) + j_local] += value;
    }

  C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local];
  C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK_) + j_local];
  
  return;
}
@@ -257,9 +259,13 @@ int main()
  PMT_CREATE(&devID, 1);
  
  ////////////////////////// CPU naive algorithm //////////////////////////////////////////
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      PMT_CPU_START("CPU_mat_mult_block");
      memset(C_CPU, 0, SIZE * sizeof(MyData));
      CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
      PMT_CPU_STOP("CPU_mat_mult_block");
    }
  /////////////////////////////////////////////////////////////////////////////////////////
  
  // copy/alloc data to the GPU
@@ -270,22 +276,22 @@ int main()
  MyData *const C_GPU = B_GPU + SIZE;
  cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);

  const dim3 nblocks = {(SIZE + (BLOCK * BLOCK)  -1)/(BLOCK * BLOCK), 1, 1};
  const dim3 block   = {(BLOCK * BLOCK), 1, 1};
  const unsigned int nblocks = ((SIZE + (BLOCKSIZE) - 1) / BLOCKSIZE);
  const unsigned int block   = BLOCKSIZE;
  
  /////////////////////////// GPU naive block algorithm ////////////////////////////////////////
  GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
  cudaDeviceSynchronize();
  time = 0.0;
  PMT_GPU_START("GPU_mat_mult_block", 0);
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      PMT_GPU_START("GPU_mat_mult_block", 0);
      const double start = wall_time();
      GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
      cudaDeviceSynchronize();
      time += (wall_time() - start);
    }
      PMT_GPU_STOP("GPU_mat_mult_block", 0);
    }
  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
  
  check(C_CPU, C_GPU_host);
@@ -296,15 +302,15 @@ int main()
  GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
  cudaDeviceSynchronize();
  time = 0.0;
  PMT_GPU_START("GPU_mat_mult_block_shared", 0);
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      PMT_GPU_START("GPU_mat_mult_block_shared", 0);
      const double start = wall_time();
      GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
      cudaDeviceSynchronize();
      time += (wall_time() - start);
    }
      PMT_GPU_STOP("GPU_mat_mult_block_shared", 0);
    }
  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
  
  check(C_CPU, C_GPU_host);
+436 KiB

File added.

No diff preview for this file type.

Loading