Commit ec502a4a authored by David Goz's avatar David Goz 😴
Browse files

Initial commit

parents
Loading
Loading
Loading
Loading

FER/Makefile

0 → 100644
+42 −0
Original line number Diff line number Diff line
CFLAGS_      = -O3
MCC_FLAGS_   = --ompss --variable=disable_final_clause_transformation:1

PROGRAM_ = fer

# MCC               ?= fpgacc
# CROSS_COMPILE_ZED ?= arm-linux-gnueabihf-
# MCC_               = $(CROSS_COMPILE_ZED)$(MCC)

MCC               ?= fpgacc
CROSS_COMPILE_ZCU ?= aarch64-linux-gnu-
MCC_               = $(CROSS_COMPILE_ZCU)$(MCC)

# BOARD      = zedboard
# FPGA_CLOCK = 100

BOARD      = zcu102
FPGA_CLOCK = 200

.PHONY: HLS bitstream energy clean

HLS: $(PROGRAM_).c Makefile
	mkdir -p ${PWD}/$(BOARD)_HLS
	$(MCC_) $(CFLAGS_) $(MCC_FLAGS_) --bitstream-generation \
	--Wf,"--verbose,--name=$(PROGRAM_)_HLS,--dir=${PWD}/$(BOARD)_HLS,--board=$(BOARD),--hwruntime=som,--clock=$(FPGA_CLOCK),--to_step=HLS" \
         $< -o $(PROGRAM_)_HLS

bitstream: $(PROGRAM_).c Makefile
	mkdir -p ${PWD}/$(BOARD)_fpga
	$(MCC_) $(CFLAGS_) $(MCC_FLAGS_) --bitstream-generation \
	--Wf,"--verbose,--name=$(PROGRAM_)_fpga,--dir=${PWD}/$(BOARD)_fpga,--board=$(BOARD),--hwruntime=som,--clock=$(FPGA_CLOCK)" \
	$< -o $(PROGRAM_)_fpga

energy: $(PROGRAM_).c Makefile
	$(MCC_) $(CFLAGS_) $(MCC_FLAGS_) -D_ENERGY_ \
	--Wf,"--verbose" \
        $< -o $(PROGRAM_)_energy

clean:
	rm -f *.o *~
	rm -f $(PROGRAM_)_HLS $(PROGRAM_)_fpga $(PROGRAM_)_energy
	rm -rf ${PWD}/$(BOARD)_*

FER/README

0 → 100644
+13 −0
Original line number Diff line number Diff line
- Set the number of FLOP editing the FLOP_ELEM macro in "parameters.h"

- $ make HLS
  The FER is compile to the step HLS

- $ make bitstream
  The bitstream is generated

- $ make energy
  Only the host code of the FER is compiled. The resulting executable calls 100
  times the accelerator, allowing in the meantime power measurements.
  A valid bitstream should be already available.
 No newline at end of file

FER/fer.c

0 → 100644
+371 −0
Original line number Diff line number Diff line
#include "header.h"

void sf_kernel(const MyData *const __restrict__ input,
	             MyData *const __restrict__ output)
{
  for (u_int32_t i=0 ; i<DIM ; i++)
    {
#if   (FLOP_ELEM != 0)

      const MyData alpha = 0.5;

      /* load element from DRAM to FPGA register */
      const MyData elem = input[i];

      MyData beta = 0.8;

#endif
      
#if   (FLOP_ELEM == 1)                     /* 1 FLOP   */
      SUM(beta, elem, alpha);
#endif

#if   (FLOP_ELEM == 2)                     /* 2 FLOPs  */
      FMA(beta, elem, alpha);
#endif

#if   (FLOP_ELEM == 4)                     /* 4 FLOPs  */
      REP2(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 8)                     /* 8 FLOPs  */
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 16)                    /* 16 FLOPs */
      REP8(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 24)                    /* 24 FLOPs */
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 30)                    /* 30 FLOPs */
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
#if   (FLOP_ELEM == 32)                    /* 32 FLOPs */
      REP16(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 64)                    /* 64 FLOPs */
      REP32(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 72)                    /* 72 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 80)                    /* 80 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP8(FMA(beta, elem, alpha));
#endif
      
#if   (FLOP_ELEM == 88)                    /* 88 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 358)                   /* 358 FLOPs */
      REP128(FMA(beta, elem, alpha));
      REP32(FMA(beta, elem, alpha));
      REP16(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
#if   (FLOP_ELEM == 974)                   /* 974 FLOPs */
      REP256(FMA(beta, elem, alpha));
      REP128(FMA(beta, elem, alpha));
      REP64(FMA(beta, elem, alpha));
      REP32(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
      /* store result */
#if   (FLOP_ELEM != 0)
      
      output[i] = beta;

#else

      output[i] = input[i];

#endif
    }

  return;
}

void check_result(const MyData *const sf_vector,
                  const MyData *const hw_vector,
                  const u_int32_t     size)
{
  u_int8_t flag = TRUE;
  for (u_int32_t el=0 ; el<size ; el++)
    {
      const MyData sf_val = sf_vector[el];
      const MyData maxv = sf_val * (1.0 + (sf_val < 0.0 ? -threshold : threshold));
      const MyData minv = sf_val * (1.0 - (sf_val < 0.0 ? -threshold : threshold));
      const MyData hw_val = hw_vector[el];

      if ((hw_val > maxv) || (hw_val < minv))
        {
          flag = FALSE;
          break;
        }
    }

# pragma omp taskwait

  if (!flag)
    {
      printf("\n\t TEST failed \n\n");

      FILE *fp = NULL;
      if (!(fp = fopen("error.txt", "w")))
	{
	  printf("\n\t Cannot open file ... aborting ...\n");
	  exit(EXIT_FAILURE);
	}

      fprintf(fp, "%s", "# 1] sf_out   2] hw_out \n#\n");

      for (u_int32_t i=0 ; i<size ; i++)
	fprintf(fp, "%lg   %lg \n", sf_vector[i], hw_vector[i]);
      
      fclose(fp);
    }
  else if (flag)
    printf("\n\t TEST passed \n\n");

  return;
}

/* no_local_mem_copies is used in order to directly access the host */
/* DRAM from the FPGA, otherwise before starting the execution of   */
/* the kernel the 'input' and 'output' arrays are copied into BRAMs */
/* of the FPGA in chunks of size BSIZE.                             */
/* The '#pragma omp task' must have the clause 'in' and 'out'       */
/* specifying the size of the arguments in order to allow the OmpSs */
/* wrapper to correctly allocate the size in kernel space.          */
#pragma omp target no_localmem_copies num_instances(1) device(fpga)
#pragma omp task in([BSIZE]input) out([BSIZE]output)
void hw_kernel(const MyData *const __restrict__ input,
	             MyData *const __restrict__ output)
{
 main_loop:
  for (u_int32_t i=0 ; i<DIM ; i++)
    {
#    pragma HLS pipeline II=1

#if   (FLOP_ELEM != 0)
      
      const MyData alpha = 0.5;

      /* load element from DRAM to FPGA register */
      const MyData elem = input[i];

      MyData beta = 0.8;

#endif      

#if   (FLOP_ELEM == 1)                     /* 1 FLOP   */
      SUM(beta, elem, alpha);
#endif

#if   (FLOP_ELEM == 2)                     /* 2 FLOPs  */
      FMA(beta, elem, alpha);
#endif

#if   (FLOP_ELEM == 4)                     /* 4 FLOPs  */
      REP2(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 8)                     /* 8 FLOPs  */
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 16)                    /* 16 FLOPs */
      REP8(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 24)                    /* 24 FLOPs */
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 30)                    /* 30 FLOPs */
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
#if   (FLOP_ELEM == 32)                    /* 32 FLOPs */
      REP16(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 64)                    /* 64 FLOPs */
      REP32(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 72)                    /* 72 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 80)                    /* 80 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP8(FMA(beta, elem, alpha));
#endif
      
#if   (FLOP_ELEM == 88)                    /* 88 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 358)                   /* 358 FLOPs */
      REP128(FMA(beta, elem, alpha));
      REP32(FMA(beta, elem, alpha));
      REP16(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
#if   (FLOP_ELEM == 974)                   /* 974 FLOPs */
      REP256(FMA(beta, elem, alpha));
      REP128(FMA(beta, elem, alpha));
      REP64(FMA(beta, elem, alpha));
      REP32(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
      /* store result */
#if   (FLOP_ELEM != 0)
      
      output[i] = beta;

#else

      output[i] = input[i];

#endif
    }

  return;
}

int main()
{
  /* arrays allocation */
  MyData *input = NULL;
  int ret_input = posix_memalign((void **)&input, MEMORY_ALIGNMENT, SIZE);
  
  MyData *sf_out = NULL;
  int ret_sf_out = posix_memalign((void **)&sf_out, MEMORY_ALIGNMENT, SIZE);

  MyData *hw_out = NULL;
  int ret_hw_out = posix_memalign((void **)&hw_out, MEMORY_ALIGNMENT, SIZE);

  if (ret_input || ret_sf_out || ret_hw_out ||
      !input    || !sf_out    || !hw_out)
    {
      printf("\n\t Cannot allocate arrays ... aborting ... \n\n");
      return -1;
    }  
  
  /* input initialization */
  for (u_int32_t i=0 ; i<DIM ; i++)
    input[i] = 0.1 + (1.0 / ((MyData)(i + 1)));

#if !defined(_ENERGY_)
  
  /* run software implementation */
  double sf_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL_CPU ; loop++)
    {      
      double sf_start = wall_time();
      
      sf_kernel(input, sf_out);

#     pragma omp taskwait

      sf_time += (wall_time() - sf_start);
    }

  const double sf_time_iteration = sf_time / (double)REPEAT_KERNEL_CPU;
  
#endif /* !defined(_ENERGY_) */
  
#if defined(_ENERGY_)

  printf("\n\t Start HW energy measurements now! \n\n");

#endif  
  
  /* run hardware implementation */
  double hw_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL_FPGA ; loop++)
    {
      double hw_start = wall_time();
      
      hw_kernel(input, hw_out);

#     pragma omp taskwait

      hw_time += (wall_time() - hw_start);
    }

  const double hw_time_iteration = hw_time / (double)REPEAT_KERNEL_FPGA;
  
#if !defined(_ENERGY_)

  /* check result */
  check_result(sf_out, hw_out, DIM);
  
  /* summary */
  printf("\t ====================== RESULTS ================================================== \n");
  printf("\t Benchmark: %s (%s) \n", "FMA", "OmpSs");
  printf("\t Data type                                 : %s  \n", TYPE);
  printf("\t Data type [byte]                          : %ld \n", sizeof(MyData));
  printf("\t FLOPs per element is set to               : %d  \n", FLOP_ELEM);
  printf("\t Computational Intensity is [FLOPs/byte]   : %ld \n", (FLOP_ELEM / (2 * sizeof(MyData))));
  printf("\t CPU execution time per iteration [secs]   : %lg \n", sf_time_iteration);
  printf("\t FPGA execution time per iteration [secs]  : %lg \n", hw_time_iteration);
  printf("\t CPU  - GFLOPs/sec                         : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_CPU)) / sf_time) / 1.e9);
  printf("\t FPGA - GFLOPs/sec                         : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_FPGA)) / hw_time) / 1.e9);
  printf("\t Bi-directional bandwidth [GByte/s]:       : %lg \n", (double)(2 * SIZE) / hw_time_iteration / 1.e9);
  printf("\t ================================================================================= \n\n");

#else

  printf("\t ====================== ENERGY ================================================== \n");
  printf("\t Data type                                 : %s  \n", TYPE);
  printf("\t Data type [byte]                          : %ld \n", sizeof(MyData));
  printf("\t FLOPs per element is set to               : %d  \n", FLOP_ELEM);
  printf("\t Computational Intensity is [FLOPs/byte]   : %ld \n", (FLOP_ELEM / (2 * sizeof(MyData))));
  printf("\t FPGA execution time per iteration [secs]  : %lg \n", hw_time_iteration);
  printf("\t FPGA - GFLOPs/sec                         : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_FPGA)) / hw_time) / 1.e9);
  printf("\t Bi-directional bandwidth [GByte/s]:       : %lg \n", (double)(2 * SIZE) / hw_time_iteration / 1.e9);
  printf("\t ================================================================================= \n\n");  
  
#endif
  
  free(input);
  free(sf_out);
  free(hw_out);
  
  return 0;
}

FER/header.h

0 → 100644
+53 −0
Original line number Diff line number Diff line
#pragma once

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>

#include "parameters.h"

#define TRUE  1
#define FALSE 0

#define REPEAT_KERNEL_CPU    10

#if defined(_ENERGY_)
#define REPEAT_KERNEL_FPGA 100
#else
#define REPEAT_KERNEL_FPGA REPEAT_KERNEL_CPU
#endif

#define MEMORY_ALIGNMENT 4096
#define DIM              (2*1024*1024)
#define SIZE             (DIM * sizeof(MyData))
const unsigned int BSIZE = DIM;

struct get_time
{
  double start, end;
};

struct get_time exe;

double wall_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC,&ts);
  const double ret = (double) (ts.tv_sec) + (double)ts.tv_nsec * 1.0e-9;
  return ret;
}

/* MACROS for Ops */
#define REP2(S)        S ;        S
#define REP4(S)   REP2(S);   REP2(S)
#define REP8(S)   REP4(S);   REP4(S)
#define REP16(S)  REP8(S);   REP8(S)
#define REP32(S)  REP16(S);  REP16(S)
#define REP64(S)  REP32(S);  REP32(S)
#define REP128(S) REP64(S);  REP64(S)
#define REP256(S) REP128(S); REP128(S)
#define REP512(S) REP256(S); REP256(S)

#define SUM(a,b,c)  ((a) = (b) + (c))
#define FMA(a,b,c)  ((a) = ((a) * (b)) + (c))

FER/kernel.txt

0 → 100644
+54 −0
Original line number Diff line number Diff line
/********** PARAMETERS *****************/
/* Data type */
typedef float MyData;

/* Number of floating-point operations */
#define FLOP_ELEM    2
/***************************************/

/******** MACROS for Ops ***************/
#define REP2(S)        S ;        S
#define REP4(S)   REP2(S);   REP2(S)
#define REP8(S)   REP4(S);   REP4(S)
#define REP16(S)  REP8(S);   REP8(S)
#define REP32(S)  REP16(S);  REP16(S)
#define REP64(S)  REP32(S);  REP32(S)

#define SUM(a,b,c)  ((a) = (b) + (c))
#define FMA(a,b,c)  ((a) = ((a) * (b)) + (c))
/*********************************************/


#pragma omp target no_localmem_copies device(fpga)
#pragma omp task in([BSIZE]input) out([BSIZE]output)
void hw_kernel(const MyData *const __restrict__ input,
	             MyData *const __restrict__ output)
{
 main_loop:
  for (u_int32_t i=0 ; i<DIM ; i++)
    {
#    pragma HLS pipeline II=1

      const MyData alpha = 0.5;

      /* load element from DRAM to FPGA register */
      const MyData elem = input[i];

      MyData beta = 0.8;

#if   (FLOP_ELEM == 16)              /* 16 FLOPs */
      REP8(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 32)              /* 32 FLOPs */
      REP16(FMA(beta, elem, alpha));
#endif

      ....	
      
      /* store result */
      output[i] = beta;
    }

  return;
}