#include "header.h"

/*
 * Host-side (software) vector addition.
 *
 * Writes out[k] = in1[k] + in2[k] for every k in [0, size).
 *
 *   in1, in2  input arrays, each holding at least `size` elements
 *   out       output array with room for at least `size` elements;
 *             declared restrict, so it must not overlap the inputs
 *   size      number of elements to process (0 leaves `out` untouched)
 */
void sf_vadd(const MyData *const restrict in1,
             const MyData *const restrict in2,
             MyData *const restrict out,
             const u_int32_t size)
{
  u_int32_t idx = 0;

  while (idx < size)
    {
      out[idx] = in1[idx] + in2[idx];
      idx++;
    }
}

/*
 * Accelerator kernel: element-wise addition of one block of data.
 *
 * Writes out[i] = in1[i] + in2[i] for i in [0, BLOCK).
 *
 * The two OmpSs directives below annotate this function as a task
 * targeted at the "fpga" device, with in1/in2 declared as inputs and
 * out as output, each as a [BSIZE]-sized region.
 *
 * NOTE(review): the task clauses size the regions with BSIZE while the
 * loop bound is BLOCK; both are defined outside this file -- confirm
 * that they describe the same number of elements.
 *
 *   in1, in2  input blocks (read for indices 0 .. BLOCK-1)
 *   out       output block (written for indices 0 .. BLOCK-1)
 */
#pragma omp target device(fpga)
#pragma omp task in([BSIZE]in1, [BSIZE]in2) out([BSIZE]out)
void acc_vadd(const MyData *const restrict in1,
	      const MyData *const restrict in2,
	            MyData *const restrict out)
{
  /* ask the HLS tool to keep this function as a separate unit */
# pragma HLS inline off
  
  /* labelled loop; the counter is 16 bits wide, which assumes
     BLOCK <= 65535 -- TODO confirm against the BLOCK definition */
 loop_vector_add:
  for (u_int16_t i=0 ; i<BLOCK ; i++)
    {
      out[i] = (in1[i] + in2[i]);
    }
  
  return;
}

/*
 * Accelerated vector addition: splits the arrays into consecutive
 * chunks of BLOCK elements and hands each chunk to acc_vadd().
 *
 *   in1, in2  input arrays of `size` elements
 *   out       output array of `size` elements
 *   size      total number of elements
 *
 * acc_vadd() always processes BLOCK elements starting at the pointers it
 * receives, and this function performs no check of its own, so `size`
 * has to be a multiple of BLOCK (main() rejects other values) to keep
 * the last chunk inside the arrays.
 *
 * This function does not wait for the issued calls; callers place the
 * taskwait themselves.
 */
void hw_vadd(const MyData *const restrict in1,
             const MyData *const restrict in2,
             MyData *const restrict out,
             const u_int32_t size)
{
  u_int32_t offset = 0;

  /* one accelerator call per BLOCK-sized chunk */
  while (offset < size)
    {
      acc_vadd(in1 + offset, in2 + offset, out + offset);
      offset += BLOCK;
    }
}

/*
 * Compares the software result against the hardware result and prints
 * "TEST passed" when all `size` elements are equal, "TEST failed"
 * otherwise.
 *
 *   sf    array produced by the software implementation
 *   hw    array produced by the hardware implementation
 *   size  number of elements to compare
 */
void check_arrays(const MyData    *const restrict sf,
		  const MyData    *const restrict hw,
		  const u_int32_t                 size)
{
  u_int16_t flag = TRUE;

  /* Synchronise with any outstanding task BEFORE reading the buffers;
     waiting only after the comparison could inspect data that is still
     being produced. */
# pragma omp taskwait

  for (u_int32_t i=0 ; i<size ; i++)
    {
      if (sf[i] != hw[i])
	{
	  flag = FALSE;
	  break; /* one mismatch is enough to fail the test */
	}
    }

  if (flag)
    printf("\n\t TEST passed \n\n");
  else
    printf("\n\t TEST failed \n\n");

  return;
}

int main(int argc, char **argv)
{
  if (argc < 2)
    {
      printf("\n\t Usage: <executable> <array size> \n\n");
      return -1;
    }

  const u_int32_t size = atoi(argv[1]);

  if (size < 1)
    {
      printf("\n\t <array size> must be greater than 1 \n\n");
      return -1;
    }

  if (size % BLOCK)
    {
      printf("\n\t <array size> must be a multiple of block size.");
      printf("\n\t BLOCK is set to %d \n\n", BLOCK);
      return -1;
    }

  /* arrays allocation */
  MyData *buffer = NULL;
  int ret;
  ret = posix_memalign((void **)&buffer, MEMORY_ALIGNMENT, (4 * size * sizeof(MyData)));
  
  if (ret || !buffer)
    {
      printf("\n\t Cannot allocate arrays ... aborting ... \n\n");
      return -1;
    }

  MyData *in1    = buffer;
  MyData *in2    = in1 + size;
  MyData *sf_out = in2 + size;
  MyData *hw_out = sf_out + size;
  
  /* fill arrays */
  for (u_int32_t i=0 ; i<size ; i++)
    {
      in1[i] = (MyData)i;
      in2[i] = (MyData)(size - i);
    }

  /* run sf implementation */
  double sf_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
#     pragma omp taskwait
      exe.start = wall_time();
      
      sf_vadd(in1, in2, sf_out, size);

#     pragma omp taskwait
      sf_time += (wall_time() - exe.start);
    }

  /* run hw implementation */
  double hw_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
#     pragma omp taskwait
      exe.start = wall_time();
      
      hw_vadd(in1, in2, hw_out, size);

#     pragma omp taskwait
      hw_time += (wall_time() - exe.start);
    }
  
  /* check results */
  check_arrays(sf_out, hw_out, size);
  
  /* free memory */
  free(buffer);

  /* summary */
  printf("\t ================== RESULTS ================== \n");
  printf("\t Benchmark: %s (%s) \n", "vadd", "OmpSs");
  printf("\t Data type: %s \n", MyType);
  printf("\t Software execution time (secs): %lg \n", (sf_time / REPEAT_KERNEL));
  printf("\t Hardware execution time (secs): %lg \n", (hw_time / REPEAT_KERNEL));
  printf("\t ============================================= \n\n");
  
  return 0;
}
