/********** PARAMETERS *****************/
/* Data type of the elements processed by the kernel */
typedef float MyData;

/*
 * Number of floating-point operations applied to each element.
 * hw_kernel selects its arithmetic chain with `#if (FLOP_ELEM == ...)`,
 * so this has to remain a preprocessor macro (an enum or const would be
 * invisible to #if).  The #ifndef guard keeps 2 as the default while
 * letting a build override the value (e.g. -DFLOP_ELEM=16) without
 * editing this file.
 */
#ifndef FLOP_ELEM
#define FLOP_ELEM    2
#endif
/***************************************/

/******** MACROS for Ops ***************/
/*
 * REPn(S): expand S exactly n times, back to back (n = 2, 4, ..., 64).
 *
 * Each level is built from two copies of the level below it.  The expansion
 * is wrapped in do { ... } while (0) so that a call such as `REP8(x);`
 * behaves as ONE statement: it is safe as the body of an unbraced
 * if/else/for/while, where a bare `S ; S` expansion would leave every copy
 * after the first outside the controlling statement.
 *
 * Constraints on S:
 *   - S is expanded n times, so any side effect in S happens n times;
 *   - S must not rely on `break`/`continue` reaching an enclosing loop,
 *     because they would bind to the do/while wrapper instead;
 *   - the call site supplies the trailing semicolon.
 */
#define REP2(S)   do { S; S; } while (0)
#define REP4(S)   do { REP2(S);  REP2(S);  } while (0)
#define REP8(S)   do { REP4(S);  REP4(S);  } while (0)
#define REP16(S)  do { REP8(S);  REP8(S);  } while (0)
#define REP32(S)  do { REP16(S); REP16(S); } while (0)
#define REP64(S)  do { REP32(S); REP32(S); } while (0)

/* SUM(a,b,c): a = b + c (one addition); evaluates to the assigned value. */
#define SUM(a,b,c)  ((a) = (b) + (c))

/*
 * FMA(a,b,c): a = a * b + c, written as a separate multiply and add, i.e.
 * two floating-point operations per use (hence REP8(FMA(...)) is labelled
 * "16 FLOPs" in hw_kernel).  `a` appears twice in the expansion, so it must
 * be a plain lvalue without side effects.
 */
#define FMA(a,b,c)  ((a) = ((a) * (b)) + (c))
/*********************************************/


/*
 * hw_kernel: for every index i in [0, DIM) read input[i], run it through the
 * arithmetic chain selected at compile time by FLOP_ELEM, and store the
 * result in output[i].
 *
 * input   read-only source array (declared `in([BSIZE]input)` to the task).
 * output  destination array (declared `out([BSIZE]output)` to the task).
 *
 * NOTE(review): the task clauses size both arrays with BSIZE while the loop
 * runs to DIM; neither macro is defined in this part of the file -- confirm
 * that DIM <= BSIZE (or that they are the same value).
 * NOTE(review): u_int32_t is the BSD/glibc <sys/types.h> spelling; the
 * standard <stdint.h> name is uint32_t -- confirm the required header is
 * included for the toolchain in use.
 */
#pragma omp target no_localmem_copies device(fpga)
#pragma omp task in([BSIZE]input) out([BSIZE]output)
void hw_kernel(const MyData *const __restrict__ input,
	             MyData *const __restrict__ output)
{
 main_loop:
  for (u_int32_t i=0 ; i<DIM ; i++)
    {
      /* ask the HLS tool to pipeline this loop with initiation interval 1 */
#    pragma HLS pipeline II=1

      /* constant addend used by every FMA step */
      const MyData alpha = 0.5;

      /* load element from DRAM to FPGA register */
      const MyData elem = input[i];

      /* accumulator; re-initialised for every element, so one iteration
         never depends on the result of another */
      MyData beta = 0.8;

      /* Exactly one of the FLOP_ELEM cases is expected to be compiled in.
         Each FMA step is beta = beta * elem + alpha (one mul + one add). */
#if   (FLOP_ELEM == 16)              /* 16 FLOPs */
      REP8(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 32)              /* 32 FLOPs */
      REP16(FMA(beta, elem, alpha));
#endif

      /* NOTE(review): the "...." line below is not valid C; it looks like a
         placeholder for elided code (presumably the remaining FLOP_ELEM
         cases) and must be replaced before this file can compile.
         With the value FLOP_ELEM == 2 defined near the top of this file,
         neither visible #if branch is active; unless the elided section
         handles that value, beta is stored unchanged. */
      ....	
      
      /* store result */
      output[i] = beta;
    }

  return;
}
