Initial commit (ec502a4a) · Commits · David Goz / OATs_Technical Report-OmpSs_at_FPGA

FER/Makefile

0 → 100644

+42 −0

Original line number	Diff line number	Diff line
		CFLAGS_ = -O3
		MCC_FLAGS_ = --ompss --variable=disable_final_clause_transformation:1

		PROGRAM_ = fer

		# MCC ?= fpgacc
		# CROSS_COMPILE_ZED ?= arm-linux-gnueabihf-
		# MCC_ = $(CROSS_COMPILE_ZED)$(MCC)

		MCC ?= fpgacc
		CROSS_COMPILE_ZCU ?= aarch64-linux-gnu-
		MCC_ = $(CROSS_COMPILE_ZCU)$(MCC)

		# BOARD = zedboard
		# FPGA_CLOCK = 100

		BOARD = zcu102
		FPGA_CLOCK = 200

		.PHONY: HLS bitstream energy clean

		HLS: $(PROGRAM_).c Makefile
		mkdir -p ${PWD}/$(BOARD)_HLS
		$(MCC_) $(CFLAGS_) $(MCC_FLAGS_) --bitstream-generation \
		--Wf,"--verbose,--name=$(PROGRAM_)_HLS,--dir=${PWD}/$(BOARD)_HLS,--board=$(BOARD),--hwruntime=som,--clock=$(FPGA_CLOCK),--to_step=HLS" \
		$< -o $(PROGRAM_)_HLS

		bitstream: $(PROGRAM_).c Makefile
		mkdir -p ${PWD}/$(BOARD)_fpga
		$(MCC_) $(CFLAGS_) $(MCC_FLAGS_) --bitstream-generation \
		--Wf,"--verbose,--name=$(PROGRAM_)_fpga,--dir=${PWD}/$(BOARD)_fpga,--board=$(BOARD),--hwruntime=som,--clock=$(FPGA_CLOCK)" \
		$< -o $(PROGRAM_)_fpga

		energy: $(PROGRAM_).c Makefile
		$(MCC_) $(CFLAGS_) $(MCC_FLAGS_) -D_ENERGY_ \
		--Wf,"--verbose" \
		$< -o $(PROGRAM_)_energy

		clean:
		rm -f .o ~
		rm -f $(PROGRAM_)_HLS $(PROGRAM_)_fpga $(PROGRAM_)_energy
		rm -rf ${PWD}/$(BOARD)_*

FER/README

0 → 100644

+13 −0

Original line number	Diff line number	Diff line
		- Set the number of FLOP editing the FLOP_ELEM macro in "parameters.h"

		- $ make HLS
		The FER is compile to the step HLS

		- $ make bitstream
		The bitstream is generated

		- $ make energy
		Only the host code of the FER is compiled. The resulting executable calls 100
		times the accelerator, allowing in the meantime power measurements.
		A valid bitstream should be already available.
		No newline at end of file

FER/fer.c

0 → 100644

+371 −0

Original line number	Diff line number	Diff line
		#include "header.h"

		void sf_kernel(const MyData *const __restrict__ input,
		MyData *const __restrict__ output)
		{
		for (u_int32_t i=0 ; i<DIM ; i++)
		{
		#if (FLOP_ELEM != 0)

		const MyData alpha = 0.5;

		/* load element from DRAM to FPGA register */
		const MyData elem = input[i];

		MyData beta = 0.8;

		#endif

		#if (FLOP_ELEM == 1) /* 1 FLOP */
		SUM(beta, elem, alpha);
		#endif

		#if (FLOP_ELEM == 2) /* 2 FLOPs */
		FMA(beta, elem, alpha);
		#endif

		#if (FLOP_ELEM == 4) /* 4 FLOPs */
		REP2(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 8) /* 8 FLOPs */
		REP4(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 16) /* 16 FLOPs */
		REP8(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 24) /* 24 FLOPs */
		REP8(FMA(beta, elem, alpha));
		REP4(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 30) /* 30 FLOPs */
		REP8(FMA(beta, elem, alpha));
		REP4(FMA(beta, elem, alpha));
		REP2(FMA(beta, elem, alpha));
		FMA(beta, elem, alpha);
		#endif

		#if (FLOP_ELEM == 32) /* 32 FLOPs */
		REP16(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 64) /* 64 FLOPs */
		REP32(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 72) /* 72 FLOPs */
		REP32(FMA(beta, elem, alpha));
		REP4(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 80) /* 80 FLOPs */
		REP32(FMA(beta, elem, alpha));
		REP8(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 88) /* 88 FLOPs */
		REP32(FMA(beta, elem, alpha));
		REP8(FMA(beta, elem, alpha));
		REP4(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 358) /* 358 FLOPs */
		REP128(FMA(beta, elem, alpha));
		REP32(FMA(beta, elem, alpha));
		REP16(FMA(beta, elem, alpha));
		REP2(FMA(beta, elem, alpha));
		FMA(beta, elem, alpha);
		#endif

		#if (FLOP_ELEM == 974) /* 974 FLOPs */
		REP256(FMA(beta, elem, alpha));
		REP128(FMA(beta, elem, alpha));
		REP64(FMA(beta, elem, alpha));
		REP32(FMA(beta, elem, alpha));
		REP4(FMA(beta, elem, alpha));
		REP2(FMA(beta, elem, alpha));
		FMA(beta, elem, alpha);
		#endif

		/* store result */
		#if (FLOP_ELEM != 0)

		output[i] = beta;

		#else

		output[i] = input[i];

		#endif
		}

		return;
		}

		void check_result(const MyData *const sf_vector,
		const MyData *const hw_vector,
		const u_int32_t size)
		{
		u_int8_t flag = TRUE;
		for (u_int32_t el=0 ; el<size ; el++)
		{
		const MyData sf_val = sf_vector[el];
		const MyData maxv = sf_val * (1.0 + (sf_val < 0.0 ? -threshold : threshold));
		const MyData minv = sf_val * (1.0 - (sf_val < 0.0 ? -threshold : threshold));
		const MyData hw_val = hw_vector[el];

		if ((hw_val > maxv) \|\| (hw_val < minv))
		{
		flag = FALSE;
		break;
		}
		}

		# pragma omp taskwait

		if (!flag)
		{
		printf("\n\t TEST failed \n\n");

		FILE *fp = NULL;
		if (!(fp = fopen("error.txt", "w")))
		{
		printf("\n\t Cannot open file ... aborting ...\n");
		exit(EXIT_FAILURE);
		}

		fprintf(fp, "%s", "# 1] sf_out 2] hw_out \n#\n");

		for (u_int32_t i=0 ; i<size ; i++)
		fprintf(fp, "%lg %lg \n", sf_vector[i], hw_vector[i]);

		fclose(fp);
		}
		else if (flag)
		printf("\n\t TEST passed \n\n");

		return;
		}

		/* no_local_mem_copies is used in order to directly access the host */
		/* DRAM from the FPGA, otherwise before starting the execution of */
		/* the kernel the 'input' and 'output' arrays are copied into BRAMs */
		/* of the FPGA in chunks of size BSIZE. */
		/* The '#pragma omp task' must have the clause 'in' and 'out' */
		/* specifying the size of the arguments in order to allow the OmpSs */
		/* wrapper to correctly allocate the size in kernel space. */
		#pragma omp target no_localmem_copies num_instances(1) device(fpga)
		#pragma omp task in([BSIZE]input) out([BSIZE]output)
		void hw_kernel(const MyData *const __restrict__ input,
		MyData *const __restrict__ output)
		{
		main_loop:
		for (u_int32_t i=0 ; i<DIM ; i++)
		{
		# pragma HLS pipeline II=1

		#if (FLOP_ELEM != 0)

		const MyData alpha = 0.5;

		/* load element from DRAM to FPGA register */
		const MyData elem = input[i];

		MyData beta = 0.8;

		#endif

		#if (FLOP_ELEM == 1) /* 1 FLOP */
		SUM(beta, elem, alpha);
		#endif

		#if (FLOP_ELEM == 2) /* 2 FLOPs */
		FMA(beta, elem, alpha);
		#endif

		#if (FLOP_ELEM == 4) /* 4 FLOPs */
		REP2(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 8) /* 8 FLOPs */
		REP4(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 16) /* 16 FLOPs */
		REP8(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 24) /* 24 FLOPs */
		REP8(FMA(beta, elem, alpha));
		REP4(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 30) /* 30 FLOPs */
		REP8(FMA(beta, elem, alpha));
		REP4(FMA(beta, elem, alpha));
		REP2(FMA(beta, elem, alpha));
		FMA(beta, elem, alpha);
		#endif

		#if (FLOP_ELEM == 32) /* 32 FLOPs */
		REP16(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 64) /* 64 FLOPs */
		REP32(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 72) /* 72 FLOPs */
		REP32(FMA(beta, elem, alpha));
		REP4(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 80) /* 80 FLOPs */
		REP32(FMA(beta, elem, alpha));
		REP8(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 88) /* 88 FLOPs */
		REP32(FMA(beta, elem, alpha));
		REP8(FMA(beta, elem, alpha));
		REP4(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 358) /* 358 FLOPs */
		REP128(FMA(beta, elem, alpha));
		REP32(FMA(beta, elem, alpha));
		REP16(FMA(beta, elem, alpha));
		REP2(FMA(beta, elem, alpha));
		FMA(beta, elem, alpha);
		#endif

		#if (FLOP_ELEM == 974) /* 974 FLOPs */
		REP256(FMA(beta, elem, alpha));
		REP128(FMA(beta, elem, alpha));
		REP64(FMA(beta, elem, alpha));
		REP32(FMA(beta, elem, alpha));
		REP4(FMA(beta, elem, alpha));
		REP2(FMA(beta, elem, alpha));
		FMA(beta, elem, alpha);
		#endif

		/* store result */
		#if (FLOP_ELEM != 0)

		output[i] = beta;

		#else

		output[i] = input[i];

		#endif
		}

		return;
		}

		int main()
		{
		/* arrays allocation */
		MyData *input = NULL;
		int ret_input = posix_memalign((void **)&input, MEMORY_ALIGNMENT, SIZE);

		MyData *sf_out = NULL;
		int ret_sf_out = posix_memalign((void **)&sf_out, MEMORY_ALIGNMENT, SIZE);

		MyData *hw_out = NULL;
		int ret_hw_out = posix_memalign((void **)&hw_out, MEMORY_ALIGNMENT, SIZE);

		if (ret_input \|\| ret_sf_out \|\| ret_hw_out \|\|
		!input \|\| !sf_out \|\| !hw_out)
		{
		printf("\n\t Cannot allocate arrays ... aborting ... \n\n");
		return -1;
		}

		/* input initialization */
		for (u_int32_t i=0 ; i<DIM ; i++)
		input[i] = 0.1 + (1.0 / ((MyData)(i + 1)));

		#if !defined(_ENERGY_)

		/* run software implementation */
		double sf_time = 0.0;
		for (u_int8_t loop=0 ; loop<REPEAT_KERNEL_CPU ; loop++)
		{
		double sf_start = wall_time();

		sf_kernel(input, sf_out);

		# pragma omp taskwait

		sf_time += (wall_time() - sf_start);
		}

		const double sf_time_iteration = sf_time / (double)REPEAT_KERNEL_CPU;

		#endif /* !defined(_ENERGY_) */

		#if defined(_ENERGY_)

		printf("\n\t Start HW energy measurements now! \n\n");

		#endif

		/* run hardware implementation */
		double hw_time = 0.0;
		for (u_int8_t loop=0 ; loop<REPEAT_KERNEL_FPGA ; loop++)
		{
		double hw_start = wall_time();

		hw_kernel(input, hw_out);

		# pragma omp taskwait

		hw_time += (wall_time() - hw_start);
		}

		const double hw_time_iteration = hw_time / (double)REPEAT_KERNEL_FPGA;

		#if !defined(_ENERGY_)

		/* check result */
		check_result(sf_out, hw_out, DIM);

		/* summary */
		printf("\t ====================== RESULTS ================================================== \n");
		printf("\t Benchmark: %s (%s) \n", "FMA", "OmpSs");
		printf("\t Data type : %s \n", TYPE);
		printf("\t Data type [byte] : %ld \n", sizeof(MyData));
		printf("\t FLOPs per element is set to : %d \n", FLOP_ELEM);
		printf("\t Computational Intensity is [FLOPs/byte] : %ld \n", (FLOP_ELEM / (2 * sizeof(MyData))));
		printf("\t CPU execution time per iteration [secs] : %lg \n", sf_time_iteration);
		printf("\t FPGA execution time per iteration [secs] : %lg \n", hw_time_iteration);
		printf("\t CPU - GFLOPs/sec : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_CPU)) / sf_time) / 1.e9);
		printf("\t FPGA - GFLOPs/sec : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_FPGA)) / hw_time) / 1.e9);
		printf("\t Bi-directional bandwidth [GByte/s]: : %lg \n", (double)(2 * SIZE) / hw_time_iteration / 1.e9);
		printf("\t ================================================================================= \n\n");

		#else

		printf("\t ====================== ENERGY ================================================== \n");
		printf("\t Data type : %s \n", TYPE);
		printf("\t Data type [byte] : %ld \n", sizeof(MyData));
		printf("\t FLOPs per element is set to : %d \n", FLOP_ELEM);
		printf("\t Computational Intensity is [FLOPs/byte] : %ld \n", (FLOP_ELEM / (2 * sizeof(MyData))));
		printf("\t FPGA execution time per iteration [secs] : %lg \n", hw_time_iteration);
		printf("\t FPGA - GFLOPs/sec : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_FPGA)) / hw_time) / 1.e9);
		printf("\t Bi-directional bandwidth [GByte/s]: : %lg \n", (double)(2 * SIZE) / hw_time_iteration / 1.e9);
		printf("\t ================================================================================= \n\n");

		#endif

		free(input);
		free(sf_out);
		free(hw_out);

		return 0;
		}

FER/header.h

0 → 100644

+53 −0

Original line number	Diff line number	Diff line
		#pragma once

		#include <stdio.h>
		#include <stdlib.h>
		#include <sys/time.h>
		#include <time.h>

		#include "parameters.h"

		#define TRUE 1
		#define FALSE 0

		#define REPEAT_KERNEL_CPU 10

		#if defined(_ENERGY_)
		#define REPEAT_KERNEL_FPGA 100
		#else
		#define REPEAT_KERNEL_FPGA REPEAT_KERNEL_CPU
		#endif

		#define MEMORY_ALIGNMENT 4096
		#define DIM (210241024)
		#define SIZE (DIM * sizeof(MyData))
		const unsigned int BSIZE = DIM;

		struct get_time
		{
		double start, end;
		};

		struct get_time exe;

		double wall_time()
		{
		struct timespec ts;
		clock_gettime(CLOCK_MONOTONIC,&ts);
		const double ret = (double) (ts.tv_sec) + (double)ts.tv_nsec * 1.0e-9;
		return ret;
		}

		/* MACROS for Ops */
		#define REP2(S) S ; S
		#define REP4(S) REP2(S); REP2(S)
		#define REP8(S) REP4(S); REP4(S)
		#define REP16(S) REP8(S); REP8(S)
		#define REP32(S) REP16(S); REP16(S)
		#define REP64(S) REP32(S); REP32(S)
		#define REP128(S) REP64(S); REP64(S)
		#define REP256(S) REP128(S); REP128(S)
		#define REP512(S) REP256(S); REP256(S)

		#define SUM(a,b,c) ((a) = (b) + (c))
		#define FMA(a,b,c) ((a) = ((a) * (b)) + (c))

FER/kernel.txt

0 → 100644

+54 −0

Original line number	Diff line number	Diff line
		/******** PARAMETERS ***************/
		/* Data type */
		typedef float MyData;

		/* Number of floating-point operations */
		#define FLOP_ELEM 2
		/***************************************/

		/****** MACROS for Ops *************/
		#define REP2(S) S ; S
		#define REP4(S) REP2(S); REP2(S)
		#define REP8(S) REP4(S); REP4(S)
		#define REP16(S) REP8(S); REP8(S)
		#define REP32(S) REP16(S); REP16(S)
		#define REP64(S) REP32(S); REP32(S)

		#define SUM(a,b,c) ((a) = (b) + (c))
		#define FMA(a,b,c) ((a) = ((a) * (b)) + (c))
		/*********************************************/


		#pragma omp target no_localmem_copies device(fpga)
		#pragma omp task in([BSIZE]input) out([BSIZE]output)
		void hw_kernel(const MyData *const __restrict__ input,
		MyData *const __restrict__ output)
		{
		main_loop:
		for (u_int32_t i=0 ; i<DIM ; i++)
		{
		# pragma HLS pipeline II=1

		const MyData alpha = 0.5;

		/* load element from DRAM to FPGA register */
		const MyData elem = input[i];

		MyData beta = 0.8;

		#if (FLOP_ELEM == 16) /* 16 FLOPs */
		REP8(FMA(beta, elem, alpha));
		#endif

		#if (FLOP_ELEM == 32) /* 32 FLOPs */
		REP16(FMA(beta, elem, alpha));
		#endif

		....

		/* store result */
		output[i] = beta;
		}

		return;
		}