#include "header.h"

/*
 * Allocate storage for `size` elements of MyData and hand it back
 * through `*buffer`.
 *
 * If the allocation fails a message is printed and the program
 * terminates with EXIT_FAILURE, so on return `*buffer` is whatever
 * non-NULL pointer malloc produced.  The caller owns the memory and
 * is responsible for releasing it with free().
 */
void alloc_buffer(MyData        **buffer,
                  const u_int32_t size)
{
  MyData *const storage = (MyData *)malloc(size * sizeof(MyData));

  *buffer = storage;

  if (storage == NULL)
    {
      printf("\n\t Cannot allocate buffer ... aborting ...\n\n");
      exit(EXIT_FAILURE);
    }
}

/*
 * Compare the software reference vector with the accelerator output,
 * element by element, and print "TEST passed" or "TEST failed".
 *
 *   sf_vector  reference result, `size` elements
 *   hw_vector  result under test, `size` elements
 *   size       number of elements to compare
 *
 * The comparison is an exact `!=` on MyData values.
 */
void check_result(const MyData *const restrict sf_vector,
		  const MyData *const restrict hw_vector,
		  const u_int32_t              size)
{
  u_int32_t flag = TRUE;

  /* Wait for outstanding tasks BEFORE the vectors are read: a task
     that is still running could otherwise be updating the data while
     it is being compared. */
# pragma omp taskwait

  for (u_int32_t el=0 ; el<size ; el++)
    {
      if (sf_vector[el] != hw_vector[el])
	{
	  /* one mismatch is enough to decide the verdict */
	  flag = FALSE;
	  break;
	}
    }

  if (!flag)
    printf("\n\t TEST failed \n\n");
  else
    printf("\n\t TEST passed \n\n");

  return;
}

/*
 * Software reference implementation of out = matrix * vector.
 *
 *   matrix  size x size values, element (r, c) stored at [r * size + c]
 *   vector  `size` input values
 *   out     `size` output values, each one overwritten
 *   size    matrix dimension
 *
 * Every output element is accumulated left to right along its row.
 */
void sf_matrix_vector_mul(const MyData *const restrict matrix,
			  const MyData *const restrict vector,
			        MyData *const restrict out,
			  const u_int32_t              size)
{
  for (u_int32_t row=0 ; row<size ; row++)
    {
      /* index of the first element of this row */
      const u_int32_t row_start = (row * size);
      MyData acc = (MyData)0.0;

      for (u_int32_t col=0 ; col<size ; col++)
	acc += (matrix[row_start + col] * vector[col]);

      out[row] = acc;
    }
}

/*
 * Fill `buffer` with a ramp centred on zero:
 * buffer[i] = i - (count / 2), for i in [0, count).
 *
 *   buffer  destination, at least `size` elements
 *   size    number of elements to write
 *
 * The offset is computed in a signed type wider than 32 bits.  Mixing
 * the unsigned loop index with the signed `size` in one expression
 * would make the subtraction unsigned, so the first half of the ramp
 * would wrap to values near 2^32 instead of being negative whenever
 * MyData is a floating-point or 64-bit type.
 *
 * `size` is reinterpreted as unsigned for the iteration count, so a
 * caller that passes a u_int32_t element count (as main() does with
 * size * size) still gets every element written.
 */
void fill_buffer(      MyData *buffer,
                 const int32_t size)
{
  const u_int32_t count = (u_int32_t)size;
  const long long half  = (long long)(count / 2);

  for (u_int32_t i=0 ; i<count ; i++)
    buffer[i] = (MyData)((long long)i - half);

  return;
}

/*
 * Task kernel: multiplies the first BLOCK elements of `matrix` with the
 * first BLOCK elements of `vector`, sums the products and ADDS the sum
 * to out[0].
 *
 * Because the result is accumulated (+=), out[0] must hold a meaningful
 * value (e.g. zero) before the first call that targets it.
 *
 * The pragmas below select the task's target device (smp when _SMP_ is
 * defined, fpga otherwise) and declare `matrix` and `vector` as inputs
 * of BSIZE elements each and `out` as a one-element inout.
 *
 * NOTE(review): the dependency clauses are sized with BSIZE while the
 * loop bound is BLOCK; both are defined outside this file -- confirm
 * that they describe the same number of elements.
 */
#if defined(_SMP_)
#pragma omp target device(smp)
#else
#pragma omp target device(fpga)
#endif
#pragma omp task in ([BSIZE]matrix, [BSIZE]vector) inout([1]out)
void acc_matrix_vector_mul(const MyData *const restrict matrix,
			   const MyData *const restrict vector,
			         MyData *const restrict out)
{
  /* HLS directive; presumably keeps this function from being inlined
     into its caller by the HLS tool -- confirm against the tool docs. */
# pragma HLS inline off
  
  /* Partial sum kept in a local so that out[0] is read and written
     only once, after the loop. */
  MyData local_out = (MyData)0.0;
  
  /* NOTE(review): the 16-bit index assumes BLOCK fits in u_int16_t;
     a larger BLOCK would make the loop never terminate -- confirm. */
 loop_matrix_vector_product:
  for (u_int16_t i=0 ; i<BLOCK ; i++)
    {
      /* HLS directive; presumably requests a pipelined loop with an
	 initiation interval of 1 -- confirm against the tool docs. */
#     pragma HLS pipeline II=1
      
      local_out += (matrix[i] * vector[i]);
    }

  /* accumulate into, rather than overwrite, the caller's value */
  out[0] += local_out;
  
  return;
}

/*
 * Task-based matrix-vector product: walks the size x size matrix in
 * chunks of BLOCK consecutive elements and hands each chunk to
 * acc_matrix_vector_mul(), which adds its partial dot product to the
 * output element of the row the chunk starts in.
 *
 *   matrix  size x size values, element (r, c) stored at [r * size + c]
 *   vector  `size` input values
 *   out     `size` output values; partial sums are ADDED to them, so
 *           the caller must clear `out` beforehand
 *   size    matrix dimension
 *
 * A taskwait follows every task submission, so the chunks are
 * processed one after another.
 */
void hw_matrix_vector_mul(const MyData *const restrict matrix,
			  const MyData *const restrict vector,
			        MyData *const restrict out,
			  const u_int32_t              size)
{
  const u_int32_t n_elements = (size * size);
  u_int32_t       offset     = 0;

  while (offset < n_elements)
    {
      /* where this chunk starts inside the matrix */
      const u_int32_t row = (offset / size);
      const u_int32_t col = (offset % size);

      acc_matrix_vector_mul(matrix + offset, vector + col, out + row);

#    pragma omp taskwait

      offset += BLOCK;
    }
}

/*
 * Benchmark driver.
 *
 * Usage: <executable> <array size>
 *
 * Builds a size x size matrix and a vector, runs the software and the
 * task-based matrix-vector products REPEAT_KERNEL times each, compares
 * the two outputs and prints the average execution times.
 *
 * <array size> must be a positive integer, a multiple of BLOCK, and
 * small enough that (size * size) + (3 * size) fits in u_int32_t, the
 * type used for every element count and index in this file.
 *
 * Returns 0 on success and -1 when the command line is rejected.
 */
int main(int argc, char **argv)
{
  if (argc < 2)
    {
      printf("\n\t Usage: <executable> <array size> \n\n");
      return -1;
    }

  /* strtol (unlike atoi) lets us detect non-numeric input and trailing
     garbage; negative and zero sizes are rejected explicitly. */
  char *end = NULL;
  const long requested = strtol(argv[1], &end, 10);
  const unsigned long long wanted =
    (requested > 0) ? (unsigned long long)requested : 0ULL;

  /* Largest element count representable in u_int32_t.  The test
     size * (size + 3) <= max is written as a division so that it
     cannot overflow itself. */
  const unsigned long long max_elements = (u_int32_t)-1;

  if ((end == argv[1]) || (*end != '\0') || (wanted == 0ULL) ||
      (wanted > (max_elements / (wanted + 3ULL))))
    {
      printf("\n\t <array size> must be a positive integer such that ");
      printf("\n\t (size * size) + (3 * size) fits in 32 bits \n\n");
      return -1;
    }

  const u_int32_t size = (u_int32_t)wanted;
  
  if (size % BLOCK)
    {
      printf("\n\t <array size> must be a multiple of the BLOCK size ");
      printf("\n\t BLOCK is set to %d \n\n", BLOCK);
      return -1;
    }

  /* one allocation holds the matrix, the vector and both outputs */
  MyData *buffer = NULL;
  alloc_buffer(&buffer, ((size * size) + (3 * size)));

  /* matrix */
  MyData *matrix = buffer;
  /* vector */
  MyData *vector = matrix + (size * size);
  /* sf output */
  MyData *sf_out = vector + size;
  /* hw out */
  MyData *hw_out = sf_out + size;

  /* fill matrix and vector */
  fill_buffer(matrix, (size * size));
  fill_buffer(vector, size);
  
  /* software matrix-vector multiplication */
  double sf_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
#     pragma omp taskwait
      exe.start = wall_time();
      
      sf_matrix_vector_mul(matrix, vector, sf_out, size);

#     pragma omp taskwait
      sf_time += (wall_time() - exe.start);
    }
  
  /* hardware matrix vector multiplication */
  double hw_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
      /* the kernel accumulates into hw_out, so clear it before every
	 repetition (outside the timed region) */
      memset(hw_out, 0, (size * sizeof(MyData)));
      
#     pragma omp taskwait
      exe.start = wall_time();
      
      hw_matrix_vector_mul(matrix, vector, hw_out, size);

#     pragma omp taskwait
      hw_time += (wall_time() - exe.start);
    }

  /* check result */
  check_result(sf_out, hw_out, size);
  
  /* free buffer: matrix, vector, sf_out and hw_out all point into it */
  free(buffer);

  /* summary */
  printf("\t ================== RESULTS ================== \n");
  printf("\t Benchmark: %s (%s) \n", "matrix_vector_mult", "OmpSs");
  printf("\t Data type: %s \n", TYPE);
#if defined(_SMP_)
  printf("\t Hardware execution on SMP \n");
#else
  printf("\t Hardware execution on FPGA \n");
#endif  
  printf("\t Software execution time (secs): %lg \n", (sf_time / REPEAT_KERNEL));
  printf("\t Hardware execution time (secs): %lg \n", (hw_time / REPEAT_KERNEL));
  printf("\t ============================================= \n\n");
  
  return 0;
}
