Loading FER/Makefile 0 → 100644 +42 −0 Original line number Diff line number Diff line CFLAGS_ = -O3 MCC_FLAGS_ = --ompss --variable=disable_final_clause_transformation:1 PROGRAM_ = fer # MCC ?= fpgacc # CROSS_COMPILE_ZED ?= arm-linux-gnueabihf- # MCC_ = $(CROSS_COMPILE_ZED)$(MCC) MCC ?= fpgacc CROSS_COMPILE_ZCU ?= aarch64-linux-gnu- MCC_ = $(CROSS_COMPILE_ZCU)$(MCC) # BOARD = zedboard # FPGA_CLOCK = 100 BOARD = zcu102 FPGA_CLOCK = 200 .PHONY: HLS bitstream energy clean HLS: $(PROGRAM_).c Makefile mkdir -p ${PWD}/$(BOARD)_HLS $(MCC_) $(CFLAGS_) $(MCC_FLAGS_) --bitstream-generation \ --Wf,"--verbose,--name=$(PROGRAM_)_HLS,--dir=${PWD}/$(BOARD)_HLS,--board=$(BOARD),--hwruntime=som,--clock=$(FPGA_CLOCK),--to_step=HLS" \ $< -o $(PROGRAM_)_HLS bitstream: $(PROGRAM_).c Makefile mkdir -p ${PWD}/$(BOARD)_fpga $(MCC_) $(CFLAGS_) $(MCC_FLAGS_) --bitstream-generation \ --Wf,"--verbose,--name=$(PROGRAM_)_fpga,--dir=${PWD}/$(BOARD)_fpga,--board=$(BOARD),--hwruntime=som,--clock=$(FPGA_CLOCK)" \ $< -o $(PROGRAM_)_fpga energy: $(PROGRAM_).c Makefile $(MCC_) $(CFLAGS_) $(MCC_FLAGS_) -D_ENERGY_ \ --Wf,"--verbose" \ $< -o $(PROGRAM_)_energy clean: rm -f *.o *~ rm -f $(PROGRAM_)_HLS $(PROGRAM_)_fpga $(PROGRAM_)_energy rm -rf ${PWD}/$(BOARD)_* FER/README 0 → 100644 +13 −0 Original line number Diff line number Diff line - Set the number of FLOP editing the FLOP_ELEM macro in "parameters.h" - $ make HLS The FER is compile to the step HLS - $ make bitstream The bitstream is generated - $ make energy Only the host code of the FER is compiled. The resulting executable calls 100 times the accelerator, allowing in the meantime power measurements. A valid bitstream should be already available. No newline at end of file FER/fer.c 0 → 100644 +371 −0 Original line number Diff line number Diff line #include "header.h" void sf_kernel(const MyData *const __restrict__ input, MyData *const __restrict__ output) { for (u_int32_t i=0 ; i<DIM ; i++) { #if (FLOP_ELEM != 0) const MyData alpha = 0.5; /* load element from DRAM to FPGA register */ const MyData elem = input[i]; MyData beta = 0.8; #endif #if (FLOP_ELEM == 1) /* 1 FLOP */ SUM(beta, elem, alpha); #endif #if (FLOP_ELEM == 2) /* 2 FLOPs */ FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 4) /* 4 FLOPs */ REP2(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 8) /* 8 FLOPs */ REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 16) /* 16 FLOPs */ REP8(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 24) /* 24 FLOPs */ REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 30) /* 30 FLOPs */ REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 32) /* 32 FLOPs */ REP16(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 64) /* 64 FLOPs */ REP32(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 72) /* 72 FLOPs */ REP32(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 80) /* 80 FLOPs */ REP32(FMA(beta, elem, alpha)); REP8(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 88) /* 88 FLOPs */ REP32(FMA(beta, elem, alpha)); REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 358) /* 358 FLOPs */ REP128(FMA(beta, elem, alpha)); REP32(FMA(beta, elem, alpha)); REP16(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 974) /* 974 FLOPs */ REP256(FMA(beta, elem, alpha)); REP128(FMA(beta, elem, alpha)); REP64(FMA(beta, elem, alpha)); REP32(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif /* store result */ #if (FLOP_ELEM != 0) output[i] = beta; #else output[i] = input[i]; #endif } return; } void check_result(const MyData *const sf_vector, const MyData *const hw_vector, const u_int32_t size) { u_int8_t flag = TRUE; for (u_int32_t el=0 ; el<size ; el++) { const MyData sf_val = sf_vector[el]; const MyData maxv = sf_val * (1.0 + (sf_val < 0.0 ? -threshold : threshold)); const MyData minv = sf_val * (1.0 - (sf_val < 0.0 ? -threshold : threshold)); const MyData hw_val = hw_vector[el]; if ((hw_val > maxv) || (hw_val < minv)) { flag = FALSE; break; } } # pragma omp taskwait if (!flag) { printf("\n\t TEST failed \n\n"); FILE *fp = NULL; if (!(fp = fopen("error.txt", "w"))) { printf("\n\t Cannot open file ... aborting ...\n"); exit(EXIT_FAILURE); } fprintf(fp, "%s", "# 1] sf_out 2] hw_out \n#\n"); for (u_int32_t i=0 ; i<size ; i++) fprintf(fp, "%lg %lg \n", sf_vector[i], hw_vector[i]); fclose(fp); } else if (flag) printf("\n\t TEST passed \n\n"); return; } /* no_local_mem_copies is used in order to directly access the host */ /* DRAM from the FPGA, otherwise before starting the execution of */ /* the kernel the 'input' and 'output' arrays are copied into BRAMs */ /* of the FPGA in chunks of size BSIZE. */ /* The '#pragma omp task' must have the clause 'in' and 'out' */ /* specifying the size of the arguments in order to allow the OmpSs */ /* wrapper to correctly allocate the size in kernel space. */ #pragma omp target no_localmem_copies num_instances(1) device(fpga) #pragma omp task in([BSIZE]input) out([BSIZE]output) void hw_kernel(const MyData *const __restrict__ input, MyData *const __restrict__ output) { main_loop: for (u_int32_t i=0 ; i<DIM ; i++) { # pragma HLS pipeline II=1 #if (FLOP_ELEM != 0) const MyData alpha = 0.5; /* load element from DRAM to FPGA register */ const MyData elem = input[i]; MyData beta = 0.8; #endif #if (FLOP_ELEM == 1) /* 1 FLOP */ SUM(beta, elem, alpha); #endif #if (FLOP_ELEM == 2) /* 2 FLOPs */ FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 4) /* 4 FLOPs */ REP2(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 8) /* 8 FLOPs */ REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 16) /* 16 FLOPs */ REP8(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 24) /* 24 FLOPs */ REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 30) /* 30 FLOPs */ REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 32) /* 32 FLOPs */ REP16(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 64) /* 64 FLOPs */ REP32(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 72) /* 72 FLOPs */ REP32(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 80) /* 80 FLOPs */ REP32(FMA(beta, elem, alpha)); REP8(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 88) /* 88 FLOPs */ REP32(FMA(beta, elem, alpha)); REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 358) /* 358 FLOPs */ REP128(FMA(beta, elem, alpha)); REP32(FMA(beta, elem, alpha)); REP16(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 974) /* 974 FLOPs */ REP256(FMA(beta, elem, alpha)); REP128(FMA(beta, elem, alpha)); REP64(FMA(beta, elem, alpha)); REP32(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif /* store result */ #if (FLOP_ELEM != 0) output[i] = beta; #else output[i] = input[i]; #endif } return; } int main() { /* arrays allocation */ MyData *input = NULL; int ret_input = posix_memalign((void **)&input, MEMORY_ALIGNMENT, SIZE); MyData *sf_out = NULL; int ret_sf_out = posix_memalign((void **)&sf_out, MEMORY_ALIGNMENT, SIZE); MyData *hw_out = NULL; int ret_hw_out = posix_memalign((void **)&hw_out, MEMORY_ALIGNMENT, SIZE); if (ret_input || ret_sf_out || ret_hw_out || !input || !sf_out || !hw_out) { printf("\n\t Cannot allocate arrays ... aborting ... \n\n"); return -1; } /* input initialization */ for (u_int32_t i=0 ; i<DIM ; i++) input[i] = 0.1 + (1.0 / ((MyData)(i + 1))); #if !defined(_ENERGY_) /* run software implementation */ double sf_time = 0.0; for (u_int8_t loop=0 ; loop<REPEAT_KERNEL_CPU ; loop++) { double sf_start = wall_time(); sf_kernel(input, sf_out); # pragma omp taskwait sf_time += (wall_time() - sf_start); } const double sf_time_iteration = sf_time / (double)REPEAT_KERNEL_CPU; #endif /* !defined(_ENERGY_) */ #if defined(_ENERGY_) printf("\n\t Start HW energy measurements now! \n\n"); #endif /* run hardware implementation */ double hw_time = 0.0; for (u_int8_t loop=0 ; loop<REPEAT_KERNEL_FPGA ; loop++) { double hw_start = wall_time(); hw_kernel(input, hw_out); # pragma omp taskwait hw_time += (wall_time() - hw_start); } const double hw_time_iteration = hw_time / (double)REPEAT_KERNEL_FPGA; #if !defined(_ENERGY_) /* check result */ check_result(sf_out, hw_out, DIM); /* summary */ printf("\t ====================== RESULTS ================================================== \n"); printf("\t Benchmark: %s (%s) \n", "FMA", "OmpSs"); printf("\t Data type : %s \n", TYPE); printf("\t Data type [byte] : %ld \n", sizeof(MyData)); printf("\t FLOPs per element is set to : %d \n", FLOP_ELEM); printf("\t Computational Intensity is [FLOPs/byte] : %ld \n", (FLOP_ELEM / (2 * sizeof(MyData)))); printf("\t CPU execution time per iteration [secs] : %lg \n", sf_time_iteration); printf("\t FPGA execution time per iteration [secs] : %lg \n", hw_time_iteration); printf("\t CPU - GFLOPs/sec : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_CPU)) / sf_time) / 1.e9); printf("\t FPGA - GFLOPs/sec : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_FPGA)) / hw_time) / 1.e9); printf("\t Bi-directional bandwidth [GByte/s]: : %lg \n", (double)(2 * SIZE) / hw_time_iteration / 1.e9); printf("\t ================================================================================= \n\n"); #else printf("\t ====================== ENERGY ================================================== \n"); printf("\t Data type : %s \n", TYPE); printf("\t Data type [byte] : %ld \n", sizeof(MyData)); printf("\t FLOPs per element is set to : %d \n", FLOP_ELEM); printf("\t Computational Intensity is [FLOPs/byte] : %ld \n", (FLOP_ELEM / (2 * sizeof(MyData)))); printf("\t FPGA execution time per iteration [secs] : %lg \n", hw_time_iteration); printf("\t FPGA - GFLOPs/sec : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_FPGA)) / hw_time) / 1.e9); printf("\t Bi-directional bandwidth [GByte/s]: : %lg \n", (double)(2 * SIZE) / hw_time_iteration / 1.e9); printf("\t ================================================================================= \n\n"); #endif free(input); free(sf_out); free(hw_out); return 0; } FER/header.h 0 → 100644 +53 −0 Original line number Diff line number Diff line #pragma once #include <stdio.h> #include <stdlib.h> #include <sys/time.h> #include <time.h> #include "parameters.h" #define TRUE 1 #define FALSE 0 #define REPEAT_KERNEL_CPU 10 #if defined(_ENERGY_) #define REPEAT_KERNEL_FPGA 100 #else #define REPEAT_KERNEL_FPGA REPEAT_KERNEL_CPU #endif #define MEMORY_ALIGNMENT 4096 #define DIM (2*1024*1024) #define SIZE (DIM * sizeof(MyData)) const unsigned int BSIZE = DIM; struct get_time { double start, end; }; struct get_time exe; double wall_time() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC,&ts); const double ret = (double) (ts.tv_sec) + (double)ts.tv_nsec * 1.0e-9; return ret; } /* MACROS for Ops */ #define REP2(S) S ; S #define REP4(S) REP2(S); REP2(S) #define REP8(S) REP4(S); REP4(S) #define REP16(S) REP8(S); REP8(S) #define REP32(S) REP16(S); REP16(S) #define REP64(S) REP32(S); REP32(S) #define REP128(S) REP64(S); REP64(S) #define REP256(S) REP128(S); REP128(S) #define REP512(S) REP256(S); REP256(S) #define SUM(a,b,c) ((a) = (b) + (c)) #define FMA(a,b,c) ((a) = ((a) * (b)) + (c)) FER/kernel.txt 0 → 100644 +54 −0 Original line number Diff line number Diff line /********** PARAMETERS *****************/ /* Data type */ typedef float MyData; /* Number of floating-point operations */ #define FLOP_ELEM 2 /***************************************/ /******** MACROS for Ops ***************/ #define REP2(S) S ; S #define REP4(S) REP2(S); REP2(S) #define REP8(S) REP4(S); REP4(S) #define REP16(S) REP8(S); REP8(S) #define REP32(S) REP16(S); REP16(S) #define REP64(S) REP32(S); REP32(S) #define SUM(a,b,c) ((a) = (b) + (c)) #define FMA(a,b,c) ((a) = ((a) * (b)) + (c)) /*********************************************/ #pragma omp target no_localmem_copies device(fpga) #pragma omp task in([BSIZE]input) out([BSIZE]output) void hw_kernel(const MyData *const __restrict__ input, MyData *const __restrict__ output) { main_loop: for (u_int32_t i=0 ; i<DIM ; i++) { # pragma HLS pipeline II=1 const MyData alpha = 0.5; /* load element from DRAM to FPGA register */ const MyData elem = input[i]; MyData beta = 0.8; #if (FLOP_ELEM == 16) /* 16 FLOPs */ REP8(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 32) /* 32 FLOPs */ REP16(FMA(beta, elem, alpha)); #endif .... /* store result */ output[i] = beta; } return; } Loading
FER/Makefile 0 → 100644 +42 −0 Original line number Diff line number Diff line CFLAGS_ = -O3 MCC_FLAGS_ = --ompss --variable=disable_final_clause_transformation:1 PROGRAM_ = fer # MCC ?= fpgacc # CROSS_COMPILE_ZED ?= arm-linux-gnueabihf- # MCC_ = $(CROSS_COMPILE_ZED)$(MCC) MCC ?= fpgacc CROSS_COMPILE_ZCU ?= aarch64-linux-gnu- MCC_ = $(CROSS_COMPILE_ZCU)$(MCC) # BOARD = zedboard # FPGA_CLOCK = 100 BOARD = zcu102 FPGA_CLOCK = 200 .PHONY: HLS bitstream energy clean HLS: $(PROGRAM_).c Makefile mkdir -p ${PWD}/$(BOARD)_HLS $(MCC_) $(CFLAGS_) $(MCC_FLAGS_) --bitstream-generation \ --Wf,"--verbose,--name=$(PROGRAM_)_HLS,--dir=${PWD}/$(BOARD)_HLS,--board=$(BOARD),--hwruntime=som,--clock=$(FPGA_CLOCK),--to_step=HLS" \ $< -o $(PROGRAM_)_HLS bitstream: $(PROGRAM_).c Makefile mkdir -p ${PWD}/$(BOARD)_fpga $(MCC_) $(CFLAGS_) $(MCC_FLAGS_) --bitstream-generation \ --Wf,"--verbose,--name=$(PROGRAM_)_fpga,--dir=${PWD}/$(BOARD)_fpga,--board=$(BOARD),--hwruntime=som,--clock=$(FPGA_CLOCK)" \ $< -o $(PROGRAM_)_fpga energy: $(PROGRAM_).c Makefile $(MCC_) $(CFLAGS_) $(MCC_FLAGS_) -D_ENERGY_ \ --Wf,"--verbose" \ $< -o $(PROGRAM_)_energy clean: rm -f *.o *~ rm -f $(PROGRAM_)_HLS $(PROGRAM_)_fpga $(PROGRAM_)_energy rm -rf ${PWD}/$(BOARD)_*
FER/README 0 → 100644 +13 −0 Original line number Diff line number Diff line - Set the number of FLOP editing the FLOP_ELEM macro in "parameters.h" - $ make HLS The FER is compile to the step HLS - $ make bitstream The bitstream is generated - $ make energy Only the host code of the FER is compiled. The resulting executable calls 100 times the accelerator, allowing in the meantime power measurements. A valid bitstream should be already available. No newline at end of file
FER/fer.c 0 → 100644 +371 −0 Original line number Diff line number Diff line #include "header.h" void sf_kernel(const MyData *const __restrict__ input, MyData *const __restrict__ output) { for (u_int32_t i=0 ; i<DIM ; i++) { #if (FLOP_ELEM != 0) const MyData alpha = 0.5; /* load element from DRAM to FPGA register */ const MyData elem = input[i]; MyData beta = 0.8; #endif #if (FLOP_ELEM == 1) /* 1 FLOP */ SUM(beta, elem, alpha); #endif #if (FLOP_ELEM == 2) /* 2 FLOPs */ FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 4) /* 4 FLOPs */ REP2(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 8) /* 8 FLOPs */ REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 16) /* 16 FLOPs */ REP8(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 24) /* 24 FLOPs */ REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 30) /* 30 FLOPs */ REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 32) /* 32 FLOPs */ REP16(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 64) /* 64 FLOPs */ REP32(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 72) /* 72 FLOPs */ REP32(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 80) /* 80 FLOPs */ REP32(FMA(beta, elem, alpha)); REP8(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 88) /* 88 FLOPs */ REP32(FMA(beta, elem, alpha)); REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 358) /* 358 FLOPs */ REP128(FMA(beta, elem, alpha)); REP32(FMA(beta, elem, alpha)); REP16(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 974) /* 974 FLOPs */ REP256(FMA(beta, elem, alpha)); REP128(FMA(beta, elem, alpha)); REP64(FMA(beta, elem, alpha)); REP32(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif /* store result */ #if (FLOP_ELEM != 0) output[i] = beta; #else output[i] = input[i]; #endif } return; } void check_result(const MyData *const sf_vector, const MyData *const hw_vector, const u_int32_t size) { u_int8_t flag = TRUE; for (u_int32_t el=0 ; el<size ; el++) { const MyData sf_val = sf_vector[el]; const MyData maxv = sf_val * (1.0 + (sf_val < 0.0 ? -threshold : threshold)); const MyData minv = sf_val * (1.0 - (sf_val < 0.0 ? -threshold : threshold)); const MyData hw_val = hw_vector[el]; if ((hw_val > maxv) || (hw_val < minv)) { flag = FALSE; break; } } # pragma omp taskwait if (!flag) { printf("\n\t TEST failed \n\n"); FILE *fp = NULL; if (!(fp = fopen("error.txt", "w"))) { printf("\n\t Cannot open file ... aborting ...\n"); exit(EXIT_FAILURE); } fprintf(fp, "%s", "# 1] sf_out 2] hw_out \n#\n"); for (u_int32_t i=0 ; i<size ; i++) fprintf(fp, "%lg %lg \n", sf_vector[i], hw_vector[i]); fclose(fp); } else if (flag) printf("\n\t TEST passed \n\n"); return; } /* no_local_mem_copies is used in order to directly access the host */ /* DRAM from the FPGA, otherwise before starting the execution of */ /* the kernel the 'input' and 'output' arrays are copied into BRAMs */ /* of the FPGA in chunks of size BSIZE. */ /* The '#pragma omp task' must have the clause 'in' and 'out' */ /* specifying the size of the arguments in order to allow the OmpSs */ /* wrapper to correctly allocate the size in kernel space. */ #pragma omp target no_localmem_copies num_instances(1) device(fpga) #pragma omp task in([BSIZE]input) out([BSIZE]output) void hw_kernel(const MyData *const __restrict__ input, MyData *const __restrict__ output) { main_loop: for (u_int32_t i=0 ; i<DIM ; i++) { # pragma HLS pipeline II=1 #if (FLOP_ELEM != 0) const MyData alpha = 0.5; /* load element from DRAM to FPGA register */ const MyData elem = input[i]; MyData beta = 0.8; #endif #if (FLOP_ELEM == 1) /* 1 FLOP */ SUM(beta, elem, alpha); #endif #if (FLOP_ELEM == 2) /* 2 FLOPs */ FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 4) /* 4 FLOPs */ REP2(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 8) /* 8 FLOPs */ REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 16) /* 16 FLOPs */ REP8(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 24) /* 24 FLOPs */ REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 30) /* 30 FLOPs */ REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 32) /* 32 FLOPs */ REP16(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 64) /* 64 FLOPs */ REP32(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 72) /* 72 FLOPs */ REP32(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 80) /* 80 FLOPs */ REP32(FMA(beta, elem, alpha)); REP8(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 88) /* 88 FLOPs */ REP32(FMA(beta, elem, alpha)); REP8(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 358) /* 358 FLOPs */ REP128(FMA(beta, elem, alpha)); REP32(FMA(beta, elem, alpha)); REP16(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif #if (FLOP_ELEM == 974) /* 974 FLOPs */ REP256(FMA(beta, elem, alpha)); REP128(FMA(beta, elem, alpha)); REP64(FMA(beta, elem, alpha)); REP32(FMA(beta, elem, alpha)); REP4(FMA(beta, elem, alpha)); REP2(FMA(beta, elem, alpha)); FMA(beta, elem, alpha); #endif /* store result */ #if (FLOP_ELEM != 0) output[i] = beta; #else output[i] = input[i]; #endif } return; } int main() { /* arrays allocation */ MyData *input = NULL; int ret_input = posix_memalign((void **)&input, MEMORY_ALIGNMENT, SIZE); MyData *sf_out = NULL; int ret_sf_out = posix_memalign((void **)&sf_out, MEMORY_ALIGNMENT, SIZE); MyData *hw_out = NULL; int ret_hw_out = posix_memalign((void **)&hw_out, MEMORY_ALIGNMENT, SIZE); if (ret_input || ret_sf_out || ret_hw_out || !input || !sf_out || !hw_out) { printf("\n\t Cannot allocate arrays ... aborting ... \n\n"); return -1; } /* input initialization */ for (u_int32_t i=0 ; i<DIM ; i++) input[i] = 0.1 + (1.0 / ((MyData)(i + 1))); #if !defined(_ENERGY_) /* run software implementation */ double sf_time = 0.0; for (u_int8_t loop=0 ; loop<REPEAT_KERNEL_CPU ; loop++) { double sf_start = wall_time(); sf_kernel(input, sf_out); # pragma omp taskwait sf_time += (wall_time() - sf_start); } const double sf_time_iteration = sf_time / (double)REPEAT_KERNEL_CPU; #endif /* !defined(_ENERGY_) */ #if defined(_ENERGY_) printf("\n\t Start HW energy measurements now! \n\n"); #endif /* run hardware implementation */ double hw_time = 0.0; for (u_int8_t loop=0 ; loop<REPEAT_KERNEL_FPGA ; loop++) { double hw_start = wall_time(); hw_kernel(input, hw_out); # pragma omp taskwait hw_time += (wall_time() - hw_start); } const double hw_time_iteration = hw_time / (double)REPEAT_KERNEL_FPGA; #if !defined(_ENERGY_) /* check result */ check_result(sf_out, hw_out, DIM); /* summary */ printf("\t ====================== RESULTS ================================================== \n"); printf("\t Benchmark: %s (%s) \n", "FMA", "OmpSs"); printf("\t Data type : %s \n", TYPE); printf("\t Data type [byte] : %ld \n", sizeof(MyData)); printf("\t FLOPs per element is set to : %d \n", FLOP_ELEM); printf("\t Computational Intensity is [FLOPs/byte] : %ld \n", (FLOP_ELEM / (2 * sizeof(MyData)))); printf("\t CPU execution time per iteration [secs] : %lg \n", sf_time_iteration); printf("\t FPGA execution time per iteration [secs] : %lg \n", hw_time_iteration); printf("\t CPU - GFLOPs/sec : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_CPU)) / sf_time) / 1.e9); printf("\t FPGA - GFLOPs/sec : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_FPGA)) / hw_time) / 1.e9); printf("\t Bi-directional bandwidth [GByte/s]: : %lg \n", (double)(2 * SIZE) / hw_time_iteration / 1.e9); printf("\t ================================================================================= \n\n"); #else printf("\t ====================== ENERGY ================================================== \n"); printf("\t Data type : %s \n", TYPE); printf("\t Data type [byte] : %ld \n", sizeof(MyData)); printf("\t FLOPs per element is set to : %d \n", FLOP_ELEM); printf("\t Computational Intensity is [FLOPs/byte] : %ld \n", (FLOP_ELEM / (2 * sizeof(MyData)))); printf("\t FPGA execution time per iteration [secs] : %lg \n", hw_time_iteration); printf("\t FPGA - GFLOPs/sec : %lg \n", (((double)(DIM * FLOP_ELEM * REPEAT_KERNEL_FPGA)) / hw_time) / 1.e9); printf("\t Bi-directional bandwidth [GByte/s]: : %lg \n", (double)(2 * SIZE) / hw_time_iteration / 1.e9); printf("\t ================================================================================= \n\n"); #endif free(input); free(sf_out); free(hw_out); return 0; }
FER/header.h 0 → 100644 +53 −0 Original line number Diff line number Diff line #pragma once #include <stdio.h> #include <stdlib.h> #include <sys/time.h> #include <time.h> #include "parameters.h" #define TRUE 1 #define FALSE 0 #define REPEAT_KERNEL_CPU 10 #if defined(_ENERGY_) #define REPEAT_KERNEL_FPGA 100 #else #define REPEAT_KERNEL_FPGA REPEAT_KERNEL_CPU #endif #define MEMORY_ALIGNMENT 4096 #define DIM (2*1024*1024) #define SIZE (DIM * sizeof(MyData)) const unsigned int BSIZE = DIM; struct get_time { double start, end; }; struct get_time exe; double wall_time() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC,&ts); const double ret = (double) (ts.tv_sec) + (double)ts.tv_nsec * 1.0e-9; return ret; } /* MACROS for Ops */ #define REP2(S) S ; S #define REP4(S) REP2(S); REP2(S) #define REP8(S) REP4(S); REP4(S) #define REP16(S) REP8(S); REP8(S) #define REP32(S) REP16(S); REP16(S) #define REP64(S) REP32(S); REP32(S) #define REP128(S) REP64(S); REP64(S) #define REP256(S) REP128(S); REP128(S) #define REP512(S) REP256(S); REP256(S) #define SUM(a,b,c) ((a) = (b) + (c)) #define FMA(a,b,c) ((a) = ((a) * (b)) + (c))
FER/kernel.txt 0 → 100644 +54 −0 Original line number Diff line number Diff line /********** PARAMETERS *****************/ /* Data type */ typedef float MyData; /* Number of floating-point operations */ #define FLOP_ELEM 2 /***************************************/ /******** MACROS for Ops ***************/ #define REP2(S) S ; S #define REP4(S) REP2(S); REP2(S) #define REP8(S) REP4(S); REP4(S) #define REP16(S) REP8(S); REP8(S) #define REP32(S) REP16(S); REP16(S) #define REP64(S) REP32(S); REP32(S) #define SUM(a,b,c) ((a) = (b) + (c)) #define FMA(a,b,c) ((a) = ((a) * (b)) + (c)) /*********************************************/ #pragma omp target no_localmem_copies device(fpga) #pragma omp task in([BSIZE]input) out([BSIZE]output) void hw_kernel(const MyData *const __restrict__ input, MyData *const __restrict__ output) { main_loop: for (u_int32_t i=0 ; i<DIM ; i++) { # pragma HLS pipeline II=1 const MyData alpha = 0.5; /* load element from DRAM to FPGA register */ const MyData elem = input[i]; MyData beta = 0.8; #if (FLOP_ELEM == 16) /* 16 FLOPs */ REP8(FMA(beta, elem, alpha)); #endif #if (FLOP_ELEM == 32) /* 32 FLOPs */ REP16(FMA(beta, elem, alpha)); #endif .... /* store result */ output[i] = beta; } return; }