#include "header.h"

/*
 * Software reference implementation: inclusive prefix sum.
 *
 * After the call, out[i] holds in[0] + in[1] + ... + in[i] for every
 * i in [0, size).
 *
 * in   : input array of `size` elements (read only)
 * out  : output array of `size` elements (must not alias `in`)
 * size : number of elements; 0 is accepted and leaves `out` untouched
 */
void sf_accumulation(const MyData  *const restrict in,
		           MyData  *const restrict out,
		     const u_int32_t               size)
{
  /* An empty array has no first element: bail out before touching
     in[0] / out[0], which would be an out-of-bounds access. */
  if (size == 0)
    {
      return;
    }

  out[0] = in[0];

  /* each output is the previous partial sum plus the current input */
  for (u_int32_t i=1 ; i<size ; i++)
    {
      out[i] = (out[i - 1] + in[i]);
    }

  return;
}

/*
 * Accelerator kernel: inclusive prefix sum over one chunk of BLOCK elements.
 *
 * Declared as an OmpSs task for the `fpga` device, with `in` as task input
 * and `out` as task output.
 *
 * in  : first element of the chunk to accumulate (BLOCK elements are read)
 * out : first element of the chunk to fill       (BLOCK elements are written)
 * el  : offset of this chunk inside the whole array; el == 0 marks the first
 *       chunk and restarts the running total
 *
 * The running total is carried from one call to the next through the
 * function-local static `sum`, which is set to the last output of each call.
 *
 * NOTE(review): the task clauses size the arrays with BSIZE while the body
 * iterates over BLOCK elements -- confirm both macros have the same value.
 * NOTE(review): because of the static carry, results are only correct when
 * the calls run one at a time in ascending chunk order; the in/out clauses
 * alone do not appear to express that ordering -- confirm against the runtime
 * configuration.
 */
#pragma omp target device(fpga)
#pragma omp task in([BSIZE]in) out([BSIZE]out)
void accumulation(const MyData  *const restrict in,
	                MyData  *const restrict out,
		  const u_int32_t               el)

{
  /* HLS directive: do not inline this function into its caller */
#pragma HLS inline off

  /* HLS directives: completely partition both arrays */
# pragma HLS array_partition variable=out complete
# pragma HLS array_partition variable=in  complete
  
  /* partial sum carried over from the previous call (see header comment) */
  static MyData sum;

  /* first chunk starts from scratch, later chunks continue from the carry */
  out[0] = ((el == 0) ? in[0] : (sum + in[0]));
  
 loop_acc:
  for (u_int16_t i=1 ; i<BLOCK ; i++)
    {
      /* HLS directive: unroll this loop */
#     pragma HLS unroll

      out[i] = (out[i - 1] + in[i]);
    }

  /* remember the total reached so far for the next chunk */
  sum = out[BLOCK - 1];
  
  return;
}

/*
 * Hardware driver for the prefix sum.
 *
 * Walks the arrays in steps of BLOCK elements and hands every chunk to
 * accumulation(), together with the chunk's starting offset. The kernel is
 * therefore invoked once per BLOCK-sized chunk (size/BLOCK times when size
 * is a multiple of BLOCK).
 *
 * in   : input array of `size` elements
 * out  : output array of `size` elements
 * size : total number of elements
 */
void hw_accumulation(const MyData  *const restrict in,
		           MyData  *const restrict out,
		     const u_int32_t               size)
{
  u_int32_t offset = 0;

  while (offset < size)
    {
      /* FPGA: process the chunk that starts at `offset` */
      accumulation(&in[offset], &out[offset], offset);

      offset += BLOCK;
    }
}

/*
 * Compares the software and hardware results element by element and prints
 * "TEST passed" when all `size` elements are equal, "TEST failed" otherwise.
 *
 * sf   : results of the software implementation
 * hw   : results of the hardware implementation
 * size : number of elements to compare
 */
void check_result(const MyData  *const restrict sf,
		  const MyData  *const restrict hw,
		  const u_int32_t               size)
{
  /* Synchronise BEFORE reading the results: waiting after the comparison
     (as was done previously) cannot protect the reads of `hw` below. */
# pragma omp taskwait

  u_int16_t flag = 0;
  for (u_int32_t i=0 ; i<size ; i++)
    {
      if (sf[i] != hw[i])
	{
	  /* first mismatch is enough to fail the test */
	  flag = 1;
	  break;
	}
    }

  if (flag)
    printf("\n\t TEST failed \n\n");
  else
    printf("\n\t TEST passed \n\n");

  return;
}

int main(int argc, char **argv)
{
  if (argc < 2)
    {
      printf("\n\t Usage: <executable> <array size> \n\n");
      return -1;
    }

  const u_int32_t size = atoi(argv[1]);

  if (size < 1)
    {
      printf("\n\t <array size> must be greater than 1 \n\n");
      return -1;
    }

  if (size % BLOCK)
    {
      printf("\n\t <array size> must be a multiple of block size.");
      printf("\n\t BLOCK is set to %d \n\n", BLOCK);
      return -1;
    }

  /* arrays allocation */
  int ret;  
  MyData *buffer = NULL;
  ret = posix_memalign((void **)&buffer, MEMORY_ALIGNMENT, (3 * size * sizeof(MyData)));  
  if (ret || !buffer)
    {
      printf("\n\t Cannot allocate arrays ... aborting ... \n\n");
      return -1;
    }

  MyData *in     = buffer;
  MyData *sf_acc = in + size;
  MyData *hw_acc = sf_acc + size;
  
  /* fill array */
  for (u_int32_t i=0 ; i<size ; i++)
    in[i] = (MyData)(i - (size / 2));  

  /* run sf implementation */
  double sf_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
#     pragma omp taskwait
      
      exe.start = wall_time();
      
      sf_accumulation(in, sf_acc, size);

#     pragma omp taskwait
      sf_time += (wall_time() - exe.start);
    }
  
  /* run hw implementation */
  double hw_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
#     pragma omp taskwait

      exe.start = wall_time();
      
      hw_accumulation(in, hw_acc, size);

#     pragma omp taskwait
      hw_time += (wall_time() - exe.start);
    }
  
  /* check result */
  check_result(sf_acc, hw_acc, size);
  
  /* free memory */
  free(buffer);

  /* summary */
  printf("\t ================== RESULTS ================== \n");
  printf("\t Benchmark: %s (%s) \n", "vadd", "OmpSs");
  printf("\t Data type: %s \n", MyType);
  printf("\t Software execution time (secs): %lg \n", (sf_time / REPEAT_KERNEL));
  printf("\t Hardware execution time (secs): %lg \n", (hw_time / REPEAT_KERNEL));
  printf("\t ============================================= \n\n");
  
  return 0;
}
