#include "header.h"

/*
 * Software (host) reference histogram.
 *
 * Walks the first `size` elements of `in` and, for each one, increments the
 * bin of `histog` selected by that element's value.
 *
 * The bins are only incremented here, never cleared: the caller must zero
 * `histog` beforehand. No range check is performed on the values read from
 * `in`, so every value must be a valid index into `histog`.
 */
void sf_histogram(const MyData  *const restrict in,
		        MyData  *const restrict histog,
		  const u_int32_t               size)
{
  u_int32_t idx = 0;

  while (idx < size)
    {
      const MyData bin = in[idx];

      histog[bin] += 1;
      idx += 1;
    }
}

/*
 * FPGA histogram kernel, declared as an OmpSs task targeting device(fpga).
 *
 * Each call processes BLOCK input values and accumulates them into a static
 * on-chip array, local_histog, whose contents persist from one call to the
 * next. This lets a caller stream a long input through the kernel one block
 * at a time and fetch the result only once.
 *
 *   in        BLOCK values to accumulate; each value is used directly as an
 *             index into local_histog, so it must lie in [0, BLOCK). No range
 *             check is performed.
 *   histog    destination for the BLOCK accumulated bins; written only when
 *             copy_out is non-zero.
 *   copy_out  non-zero: copy the accumulated bins to histog after processing
 *             this block.
 *   reset     non-zero: clear the accumulated bins before processing this
 *             block.
 *
 * NOTE(review): the task clauses size both regions with BSIZE while the loops
 * below use BLOCK; BSIZE is defined outside this file -- presumably it equals
 * BLOCK, confirm.
 * NOTE(review): histog is declared as an `out` region of every task instance
 * even though it is only written when copy_out is set -- confirm this is the
 * intended dependence.
 * NOTE(review): all three loops use a u_int16_t counter, so they terminate
 * only if BLOCK <= 65535 -- confirm against the definition of BLOCK.
 */
#pragma omp target device(fpga)
#pragma omp task in([BSIZE]in) out([BSIZE]histog)
void histogram(const MyData  *const restrict in,
	             MyData  *const restrict histog,
	       const u_int8_t                copy_out,
	       const u_int8_t                reset)

{
#pragma HLS inline off

  /* NOTE(review): this array_partition pragma names local_histog before the
     variable is declared on the following lines; HLS examples normally place
     the pragma after the declaration -- confirm the tool accepts this order. */
#pragma HLS array_partition variable=local_histog complete
  
  /* Accumulator bins; static, so the counts survive across task invocations
     until a call with reset != 0 clears them. */
  static u_int32_t local_histog[BLOCK];

  /* Optionally start a new histogram by zeroing every bin. */
  if (reset)
    {
    loop_reset:
      for (u_int16_t i=0 ; i<BLOCK ; i++)
	{
#        pragma HLS pipeline II=1

	  local_histog[i] = 0;
	}
    }
  
  /* Accumulate this block: one read-modify-write of a bin per input value.
     NOTE(review): with II=1, two consecutive equal values would update the
     same bin in back-to-back iterations; the host driver in this file avoids
     generating equal neighbours, presumably for this reason -- confirm. */
 loop_histogram:
  for (u_int16_t i=0 ; i<BLOCK ; i++)
    {
#    pragma HLS pipeline II=1

      local_histog[in[i]]++;
    }

  /* copy local_histog to histog */
  if (copy_out)
    {
    loop_copy:
      for (u_int16_t i=0 ; i<BLOCK ; i++)
	{
#        pragma HLS pipeline II=1
	  
	  /* u_int32_t bin count is converted to MyData on assignment */
	  histog[i] = local_histog[i];
	}
    }

  return;
}

/*
 * Hardware histogram driver.
 *
 * Splits `in` into consecutive chunks of BLOCK elements and hands each chunk
 * to the histogram() accelerator task. The first chunk is submitted with the
 * reset flag set so the accelerator starts from empty bins; the copy-out flag
 * is the integer quotient (offset + BLOCK) / size, which becomes 1 on the
 * chunk that ends exactly at `size`, so the accumulated bins are written to
 * `histog` on the final call.
 *
 * `size` is expected to be a non-zero multiple of BLOCK (the accelerator is
 * called size/BLOCK times and always consumes a full chunk). This function
 * does not wait for the submitted tasks.
 */
void hw_histogram(const MyData  *const restrict in,
		        MyData  *const restrict histog,
		  const u_int32_t               size)
{
  u_int32_t offset = 0;

  while (offset < size)
    {
      /* only the very first chunk clears the accelerator's bins */
      const u_int8_t first_chunk = (u_int8_t)(offset == 0);

      /* 0 for every chunk except the one that reaches the end of the input */
      const u_int8_t last_chunk = (u_int8_t)((offset + BLOCK) / size);

      /* FPGA */
      histogram(&in[offset], histog, last_chunk, first_chunk);

      offset += BLOCK;
    }
}

/*
 * Compare the software and hardware histograms and report the outcome.
 *
 *   sf    histogram produced by the software implementation
 *   hw    histogram produced by the hardware (task-based) implementation
 *   size  number of bins to compare
 *
 * Prints "TEST passed" when the first `size` bins of both arrays are equal,
 * "TEST failed" as soon as one differing bin is found.
 */
void check_result(const MyData  *const restrict sf,
		  const MyData  *const restrict hw,
		  const u_int32_t               size)
{
  /* Synchronise with any outstanding tasks BEFORE reading `hw`: waiting only
     after the comparison would let the loop below inspect results that a
     still-running task has not finished producing. */
# pragma omp taskwait

  u_int16_t mismatch = 0;
  for (u_int32_t i=0 ; i<size ; i++)
    {
      if (sf[i] != hw[i])
	{
	  /* one differing bin is enough to fail the test */
	  mismatch = 1;
	  break;
	}
    }

  if (mismatch)
    printf("\n\t TEST failed \n\n");
  else
    printf("\n\t TEST passed \n\n");

  return;
}

int main(int argc, char **argv)
{
  if (argc < 2)
    {
      printf("\n\t Usage: <executable> <array size> \n\n");
      return -1;
    }

  const u_int32_t size = atoi(argv[1]);

  if (size < 1)
    {
      printf("\n\t <array size> must be greater than 1 \n\n");
      return -1;
    }

  if (size % BLOCK)
    {
      printf("\n\t <array size> must be a multiple of block size.");
      printf("\n\t BLOCK is set to %d \n\n", BLOCK);
      return -1;
    }

  /* arrays allocation */
  int ret;  
  MyData *buffer = NULL;
  ret = posix_memalign((void **)&buffer, MEMORY_ALIGNMENT,
		       ((size + (2 * BLOCK)) * sizeof(MyData)));
  if (ret || !buffer)
    {
      printf("\n\t Cannot allocate arrays ... aborting ... \n\n");
      return -1;
    }

  MyData *in   = buffer;
  MyData *sf_h = in + size;
  MyData *hw_h = sf_h + BLOCK;
  
  /* fill array and init the histogram */
  srand(time(NULL)); /* random seed */
  in[0] = (MyData)(rand() % BLOCK);
  for (u_int32_t i=1 ; i<size ; i++)
    {
      MyData tmp;
      
      do
	{
	  tmp = (MyData)(rand() % BLOCK);
	} while (tmp == in[i-1]);

      in[i] = tmp;
    }
    
  /* run sf implementation */
  double sf_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
      /* reset the histogram */
      memset(sf_h, 0, (sizeof(MyData) * BLOCK));

#     pragma omp taskwait
      
      exe.start = wall_time();
      
      sf_histogram(in, sf_h, size);

#     pragma omp taskwait
      sf_time += (wall_time() - exe.start);
    }
  
  /* run hw implementation */
  double hw_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
#     pragma omp taskwait

      exe.start = wall_time();
      
      hw_histogram(in, hw_h, size);

#     pragma omp taskwait
      hw_time += (wall_time() - exe.start);
    }
  
  /* check result */
  check_result(sf_h, hw_h, BLOCK);
  
  /* free memory */
  free(buffer);

  /* summary */
  printf("\t ================== RESULTS ================== \n");
  printf("\t Benchmark: %s (%s) \n", "vadd", "OmpSs");
  printf("\t Data type: %s \n", MyType);
  printf("\t Software execution time (secs): %lg \n", (sf_time / REPEAT_KERNEL));
  printf("\t Hardware execution time (secs): %lg \n", (hw_time / REPEAT_KERNEL));
  printf("\t ============================================= \n\n");
  
  return 0;
}
