#include "header.h"

/*
 * sf_kernel - software (CPU) version of the benchmark kernel.
 *
 * For every index i in [0, DIM) the function reads input[i], applies a
 * compile-time selected number of SUM/FMA macro invocations to a local
 * accumulator (beta, initialised to 0.8, with alpha fixed to 0.5) and
 * stores the accumulator into output[i].
 *
 * The amount of work per element is chosen by the FLOP_ELEM macro:
 *   - FLOP_ELEM == 0 : no arithmetic, output[i] is a plain copy of input[i];
 *   - FLOP_ELEM == 1 : a single SUM;
 *   - other listed values : FLOP_ELEM / 2 invocations of FMA, built out of
 *     the REPn() repetition macros.
 * If FLOP_ELEM is non-zero but matches none of the listed values, no
 * arithmetic block is compiled in and every output element is 0.8.
 *
 * SUM, FMA and REPn are defined outside this file (presumably in header.h);
 * NOTE(review): it is assumed that SUM/FMA update their first argument in
 * place and that REPn(x) expands x n times - confirm against the header.
 *
 * input  : read-only array, at least DIM elements are accessed.
 * output : write-only array, at least DIM elements are written.
 * The two pointers are declared __restrict__, i.e. they must not alias.
 */
void sf_kernel(const MyData *const __restrict__ input,
	             MyData *const __restrict__ output)
{
  for (u_int32_t i=0 ; i<DIM ; i++)
    {
#if   (FLOP_ELEM != 0)

      /* constant operand passed to every SUM/FMA invocation */
      const MyData alpha = 0.5;

      /* load the current element once; it is reused by every SUM/FMA below */
      const MyData elem = input[i];

      /* accumulator: the value finally stored in output[i] */
      MyData beta = 0.8;

#endif
      
      /* Exactly one of the following blocks is compiled in, depending */
      /* on FLOP_ELEM; the FMA count is always FLOP_ELEM / 2.          */

#if   (FLOP_ELEM == 1)                     /* 1 FLOP   */
      SUM(beta, elem, alpha);
#endif

#if   (FLOP_ELEM == 2)                     /* 2 FLOPs  */
      FMA(beta, elem, alpha);
#endif

#if   (FLOP_ELEM == 4)                     /* 4 FLOPs  */
      REP2(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 8)                     /* 8 FLOPs  */
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 16)                    /* 16 FLOPs */
      REP8(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 24)                    /* 24 FLOPs */
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 30)                    /* 30 FLOPs */
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
#if   (FLOP_ELEM == 32)                    /* 32 FLOPs */
      REP16(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 64)                    /* 64 FLOPs */
      REP32(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 72)                    /* 72 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 80)                    /* 80 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP8(FMA(beta, elem, alpha));
#endif
      
#if   (FLOP_ELEM == 88)                    /* 88 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 358)                   /* 358 FLOPs */
      REP128(FMA(beta, elem, alpha));
      REP32(FMA(beta, elem, alpha));
      REP16(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
#if   (FLOP_ELEM == 974)                   /* 974 FLOPs */
      REP256(FMA(beta, elem, alpha));
      REP128(FMA(beta, elem, alpha));
      REP64(FMA(beta, elem, alpha));
      REP32(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
      /* store result */
#if   (FLOP_ELEM != 0)
      
      output[i] = beta;

#else

      /* FLOP_ELEM == 0: pure memory copy, no arithmetic at all */
      output[i] = input[i];

#endif
    }

  return;
}

/*
 * check_result - compare the hardware output against the software reference.
 *
 * Every hw_vector[el] must lie inside the relative tolerance band
 *     [ sf * (1 - t) , sf * (1 + t) ]     (band mirrored for negative sf)
 * where sf = sf_vector[el] and t = threshold (defined outside this file).
 * A reference value of exactly zero therefore requires an exact match.
 *
 * The scan stops at the first element outside its band.  A NaN on either
 * side is treated as a mismatch: the test is written as "not inside the
 * band", so it cannot be silently satisfied by unordered comparisons.
 *
 * On success "TEST passed" is printed.  On failure "TEST failed" is printed
 * and both complete vectors are dumped side by side into "error.txt" in the
 * current directory; if that file cannot be opened the process exits with
 * EXIT_FAILURE.
 *
 * sf_vector : reference (software) results, 'size' elements.
 * hw_vector : results under test (hardware), 'size' elements.
 * size      : number of elements to compare; 0 reports "TEST passed".
 */
void check_result(const MyData *const sf_vector,
                  const MyData *const hw_vector,
                  const u_int32_t     size)
{
  int mismatch = 0;

  for (u_int32_t el=0 ; el<size ; el++)
    {
      const MyData sf_val = sf_vector[el];
      const MyData hw_val = hw_vector[el];

      /* flip the sign of the tolerance for negative references so that */
      /* maxv is always the upper bound and minv always the lower bound */
      const MyData maxv = sf_val * (1.0 + (sf_val < 0.0 ? -threshold : threshold));
      const MyData minv = sf_val * (1.0 - (sf_val < 0.0 ? -threshold : threshold));

      /* negated in-range test: also rejects NaN, for which every */
      /* ordered comparison evaluates to false                     */
      if (!((hw_val <= maxv) && (hw_val >= minv)))
        {
          mismatch = 1;
          break;
        }
    }

  /* NOTE(review): kept from the original code; the caller already waits */
  /* for the tasks before calling this function - confirm it is needed.  */
# pragma omp taskwait

  if (!mismatch)
    {
      printf("\n\t TEST passed \n\n");
      return;
    }

  printf("\n\t TEST failed \n\n");

  FILE *fp = fopen("error.txt", "w");
  if (fp == NULL)
    {
      printf("\n\t Cannot open file ... aborting ...\n");
      exit(EXIT_FAILURE);
    }

  fprintf(fp, "%s", "# 1] sf_out   2] hw_out \n#\n");

  /* explicit conversion: "%lg" expects a double whatever MyData is */
  for (u_int32_t i=0 ; i<size ; i++)
    fprintf(fp, "%lg   %lg \n", (double)sf_vector[i], (double)hw_vector[i]);

  /* fclose flushes the stream: a failure here means the dump is incomplete */
  if (fclose(fp) != 0)
    printf("\n\t Cannot close file error.txt ... dump may be incomplete \n");

  return;
}

/* hw_kernel - hardware (FPGA) version of the benchmark kernel.          */
/*                                                                        */
/* The body performs the same per-element work as sf_kernel: for every   */
/* index i in [0, DIM) it reads input[i], applies the number of SUM/FMA  */
/* macro invocations selected at compile time by FLOP_ELEM to a local    */
/* accumulator (beta = 0.8, alpha = 0.5) and writes it to output[i].     */
/* With FLOP_ELEM == 0 the kernel is a plain copy of input to output.    */
/*                                                                        */
/* The 'no_localmem_copies' clause is used in order to directly access   */
/* the host DRAM from the FPGA; otherwise, before starting the execution */
/* of the kernel, the 'input' and 'output' arrays are copied into BRAMs  */
/* of the FPGA in chunks of size BSIZE.                                  */
/* The '#pragma omp task' must have the clauses 'in' and 'out'           */
/* specifying the size of the arguments in order to allow the OmpSs      */
/* wrapper to correctly allocate the size in kernel space.               */
/*                                                                        */
/* NOTE(review): the task clauses declare BSIZE elements while the loop  */
/* iterates over DIM elements - confirm in header.h that the two agree.  */
#pragma omp target no_localmem_copies num_instances(1) device(fpga)
#pragma omp task in([BSIZE]input) out([BSIZE]output)
void hw_kernel(const MyData *const __restrict__ input,
	             MyData *const __restrict__ output)
{
  /* the label names the loop (presumably for the HLS reports - confirm) */
 main_loop:
  for (u_int32_t i=0 ; i<DIM ; i++)
    {
      /* ask the HLS tool to pipeline the loop with initiation interval 1 */
#    pragma HLS pipeline II=1

#if   (FLOP_ELEM != 0)
      
      /* constant operand passed to every SUM/FMA invocation */
      const MyData alpha = 0.5;

      /* load element from DRAM to FPGA register */
      const MyData elem = input[i];

      /* accumulator: the value finally stored in output[i] */
      MyData beta = 0.8;

#endif      

      /* Exactly one of the following blocks is compiled in, depending */
      /* on FLOP_ELEM; the FMA count is always FLOP_ELEM / 2.          */

#if   (FLOP_ELEM == 1)                     /* 1 FLOP   */
      SUM(beta, elem, alpha);
#endif

#if   (FLOP_ELEM == 2)                     /* 2 FLOPs  */
      FMA(beta, elem, alpha);
#endif

#if   (FLOP_ELEM == 4)                     /* 4 FLOPs  */
      REP2(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 8)                     /* 8 FLOPs  */
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 16)                    /* 16 FLOPs */
      REP8(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 24)                    /* 24 FLOPs */
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 30)                    /* 30 FLOPs */
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
#if   (FLOP_ELEM == 32)                    /* 32 FLOPs */
      REP16(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 64)                    /* 64 FLOPs */
      REP32(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 72)                    /* 72 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 80)                    /* 80 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP8(FMA(beta, elem, alpha));
#endif
      
#if   (FLOP_ELEM == 88)                    /* 88 FLOPs */
      REP32(FMA(beta, elem, alpha));
      REP8(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
#endif

#if   (FLOP_ELEM == 358)                   /* 358 FLOPs */
      REP128(FMA(beta, elem, alpha));
      REP32(FMA(beta, elem, alpha));
      REP16(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
#if   (FLOP_ELEM == 974)                   /* 974 FLOPs */
      REP256(FMA(beta, elem, alpha));
      REP128(FMA(beta, elem, alpha));
      REP64(FMA(beta, elem, alpha));
      REP32(FMA(beta, elem, alpha));
      REP4(FMA(beta, elem, alpha));
      REP2(FMA(beta, elem, alpha));
      FMA(beta, elem, alpha);
#endif
      
      /* store result */
#if   (FLOP_ELEM != 0)
      
      output[i] = beta;

#else

      /* FLOP_ELEM == 0: pure memory copy, no arithmetic at all */
      output[i] = input[i];

#endif
    }

  return;
}

/*
 * main - benchmark driver.
 *
 * 1. Allocates three SIZE-byte buffers aligned to MEMORY_ALIGNMENT
 *    (input, software output, hardware output).
 * 2. Fills input[i] with 0.1 + 1/(i+1) for i in [0, DIM).
 * 3. Unless _ENERGY_ is defined, times REPEAT_KERNEL_CPU runs of sf_kernel.
 * 4. Times REPEAT_KERNEL_FPGA runs of hw_kernel, waiting for the task to
 *    complete inside the timed region.
 * 5. Unless _ENERGY_ is defined, validates the hardware output against the
 *    software one and prints the full summary; with _ENERGY_ only the
 *    hardware figures are printed.
 *
 * Returns 0 on success, -1 if any buffer cannot be allocated.
 */
int main(void)
{
  /* arrays allocation */
  MyData *input = NULL;
  const int ret_input = posix_memalign((void **)&input, MEMORY_ALIGNMENT, SIZE);

  MyData *sf_out = NULL;
  const int ret_sf_out = posix_memalign((void **)&sf_out, MEMORY_ALIGNMENT, SIZE);

  MyData *hw_out = NULL;
  const int ret_hw_out = posix_memalign((void **)&hw_out, MEMORY_ALIGNMENT, SIZE);

  if (ret_input || ret_sf_out || ret_hw_out ||
      !input    || !sf_out    || !hw_out)
    {
      printf("\n\t Cannot allocate arrays ... aborting ... \n\n");

      /* release whatever was obtained; free(NULL) is a no-op */
      free(input);
      free(sf_out);
      free(hw_out);

      return -1;
    }

  /* input initialization */
  for (u_int32_t i=0 ; i<DIM ; i++)
    input[i] = 0.1 + (1.0 / ((MyData)(i + 1)));

#if !defined(_ENERGY_)

  /* run software implementation */
  double sf_time = 0.0;
  /* 32-bit counter: an 8-bit one would never terminate for counts >= 256 */
  for (u_int32_t loop=0 ; loop<REPEAT_KERNEL_CPU ; loop++)
    {
      const double sf_start = wall_time();

      sf_kernel(input, sf_out);

#     pragma omp taskwait

      sf_time += (wall_time() - sf_start);
    }

  const double sf_time_iteration = sf_time / (double)REPEAT_KERNEL_CPU;

  /* each factor is converted to double before multiplying so that the */
  /* product cannot overflow an integer type for large DIM / FLOP_ELEM */
  const double sf_flops = (double)(DIM) * (double)(FLOP_ELEM) * (double)(REPEAT_KERNEL_CPU);

#endif /* !defined(_ENERGY_) */

#if defined(_ENERGY_)

  printf("\n\t Start HW energy measurements now! \n\n");

#endif

  /* run hardware implementation */
  double hw_time = 0.0;
  for (u_int32_t loop=0 ; loop<REPEAT_KERNEL_FPGA ; loop++)
    {
      const double hw_start = wall_time();

      hw_kernel(input, hw_out);

      /* the timed region must include the completion of the task */
#     pragma omp taskwait

      hw_time += (wall_time() - hw_start);
    }

  const double hw_time_iteration = hw_time / (double)REPEAT_KERNEL_FPGA;

  /* figures shared by both report flavours, all computed in double */
  const double hw_flops  = (double)(DIM) * (double)(FLOP_ELEM) * (double)(REPEAT_KERNEL_FPGA);
  /* one load plus one store per element -> 2 * sizeof(MyData) bytes moved */
  const double intensity = (double)(FLOP_ELEM) / (2.0 * (double)sizeof(MyData));
  const double bandwidth = (2.0 * (double)(SIZE)) / hw_time_iteration / 1.e9;

#if !defined(_ENERGY_)

  /* check result */
  check_result(sf_out, hw_out, DIM);

  /* summary */
  printf("\t ====================== RESULTS ================================================== \n");
  printf("\t Benchmark: %s (%s) \n", "FMA", "OmpSs");
  printf("\t Data type                                 : %s  \n", TYPE);
  printf("\t Data type [byte]                          : %zu \n", sizeof(MyData));
  printf("\t FLOPs per element is set to               : %d  \n", FLOP_ELEM);
  printf("\t Computational Intensity is [FLOPs/byte]   : %lg \n", intensity);
  printf("\t CPU execution time per iteration [secs]   : %lg \n", sf_time_iteration);
  printf("\t FPGA execution time per iteration [secs]  : %lg \n", hw_time_iteration);
  printf("\t CPU  - GFLOPs/sec                         : %lg \n", (sf_flops / sf_time) / 1.e9);
  printf("\t FPGA - GFLOPs/sec                         : %lg \n", (hw_flops / hw_time) / 1.e9);
  printf("\t Bi-directional bandwidth [GByte/s]:       : %lg \n", bandwidth);
  printf("\t ================================================================================= \n\n");

#else

  printf("\t ====================== ENERGY ================================================== \n");
  printf("\t Data type                                 : %s  \n", TYPE);
  printf("\t Data type [byte]                          : %zu \n", sizeof(MyData));
  printf("\t FLOPs per element is set to               : %d  \n", FLOP_ELEM);
  printf("\t Computational Intensity is [FLOPs/byte]   : %lg \n", intensity);
  printf("\t FPGA execution time per iteration [secs]  : %lg \n", hw_time_iteration);
  printf("\t FPGA - GFLOPs/sec                         : %lg \n", (hw_flops / hw_time) / 1.e9);
  printf("\t Bi-directional bandwidth [GByte/s]:       : %lg \n", bandwidth);
  printf("\t ================================================================================= \n\n");

#endif

  free(input);
  free(sf_out);
  free(hw_out);

  return 0;
}
