#include "header.h"

/*
 * Reserve heap storage for `size` elements of MyData.
 *
 *   buffer  out-parameter: receives the address of the new storage
 *   size    number of MyData elements to allocate
 *
 * The function only returns on success: if malloc yields NULL a message is
 * printed and the process terminates with EXIT_FAILURE.
 */
void alloc_buffer(MyData        **buffer,
                  const u_int32_t size)
{
  const size_t n_bytes = size * sizeof(MyData);

  *buffer = (MyData *)malloc(n_bytes);
  if (*buffer != NULL)
    return;

  printf("\n\t Cannot allocate buffer ... aborting ...\n\n");
  exit(EXIT_FAILURE);
}

/*
 * Compare two result vectors element by element and report the outcome on
 * stdout ("TEST passed" when all elements are equal, "TEST failed" otherwise).
 *
 *   sf_vector  first vector (the caller passes the software result)
 *   hw_vector  second vector (the caller passes the hardware result)
 *   size       number of elements in each vector
 *
 * When DEBUG is defined, both vectors are additionally dumped to the file
 * "debug.txt"; the program exits with EXIT_FAILURE if it cannot be opened.
 */
void check_result(const MyData *const restrict sf_vector,
		  const MyData *const restrict hw_vector,
		  const u_int32_t              size)
{
  /* synchronize with any outstanding task BEFORE the vectors are read */
# pragma omp taskwait

  u_int32_t flag = TRUE;
  for (u_int32_t el=0 ; el<size ; el++)
    if (sf_vector[el] != hw_vector[el])
      {
	/* one mismatch is enough to fail the test: stop scanning */
	flag = FALSE;
	break;
      }

  if (flag)
    printf("\n\t TEST passed \n\n");
  else
    printf("\n\t TEST failed \n\n");

#if defined(DEBUG)

  FILE *fp = fopen("debug.txt", "w");
  if (!fp)
    {
      printf("\n\t Cannot open debug file... aborting...\n");
      exit(EXIT_FAILURE);
    }

  fprintf(fp, "# i   sf   hw\n");
  /* The index is unsigned, hence %u.
     NOTE(review): the two %d conversions assume MyData promotes to int;
     confirm against the MyData definition in header.h. */
  for (u_int32_t i=0 ; i<size ; i++)
    fprintf(fp, "%u   %d   %d\n", i, sf_vector[i], hw_vector[i]);

  fclose(fp);

#endif
}

/*
 * Plain (host) matrix-vector product: out = matrix * vector.
 *
 *   matrix  size x size matrix stored row after row (element (i,j) is read
 *           at offset i * size + j)
 *   vector  input vector of `size` elements
 *   out     output vector of `size` elements, fully overwritten
 *   size    number of rows/columns
 */
void sf_matrix_vector_mul(const MyData *const restrict matrix,
			  const MyData *const restrict vector,
			        MyData *const restrict out,
			  const u_int32_t              size)
{
  for (u_int32_t i=0 ; i<size ; i++)
    {
      /* Widen before multiplying: in 32-bit arithmetic the row offset
	 i * size wraps around as soon as size exceeds 65535. */
      const MyData *const row = matrix + ((size_t)i * size);

      MyData sum = (MyData)0.0;
      for (u_int32_t j=0 ; j<size ; j++)
	sum += (row[j] * vector[j]);

      out[i] = sum;
    }
}

/*
 * Fill `buffer` with the ramp  buffer[i] = i - count/2,  i.e. values centred
 * around zero.
 *
 *   buffer  destination array, must hold at least `size` elements
 *   size    number of elements to write; as in the loop comparison of the
 *           original code it is interpreted as an unsigned count
 *
 * The difference is evaluated in signed 64-bit arithmetic.  Mixing the
 * unsigned index with the signed size would otherwise promote the whole
 * expression to unsigned, so the first half of the ramp would wrap to values
 * close to 2^32 before being converted to MyData (only harmless when MyData
 * happens to be a 32-bit integer).
 */
void fill_buffer(      MyData *buffer,
                 const int32_t size)
{
  const u_int32_t count = (u_int32_t)size;
  const long long half  = (long long)(count / 2);

  for (u_int32_t i=0 ; i<count ; i++)
    buffer[i] = (MyData)((long long)i - half);
}

/*
 * Block kernel: for each of the BLOCK rows of `matrix`, compute the dot
 * product with `vector` and ADD it to the corresponding element of `out`.
 *
 *   matrix  BLOCK x BLOCK block, element (i,j) read at offset i * BLOCK + j
 *   vector  BLOCK input elements
 *   out     BLOCK output elements, accumulated into (not overwritten)
 *
 * The function is declared as an OmpSs task with `in` dependences on matrix
 * and vector and an `inout` dependence on out; the target device is `smp`
 * when _SMP_ is defined and `fpga` otherwise.
 *
 * NOTE(review): the dependence clauses are sized with BSIZE while the loops
 * below iterate up to BLOCK -- confirm in header.h that the two agree.
 */
#if defined(_SMP_)
#pragma omp target device(smp)
#else
#pragma omp target device(fpga)
#endif
#pragma omp task in ([BSIZE*BSIZE]matrix, [BSIZE]vector) inout([BSIZE]out)
void acc_matrix_vector_block_mul(const MyData *const restrict matrix,
				 const MyData *const restrict vector,
			               MyData *const restrict out)
{
# pragma HLS inline off

  /* loop over the rows of the matrix block */
 loop_data:
  for (u_int16_t i=0 ; i<BLOCK ; i++)
    {
      /* partial dot product of row i with the vector block */
      MyData local_out = (MyData)0.0;

    loop_dot_product:
      for (u_int16_t j=0 ; j<BLOCK ; j++)
	{
	  local_out += (matrix[(i * BLOCK) + j] * vector[j]);
	}
      /* accumulate (+=) rather than assign: out keeps whatever it held on entry */
      out[i] += local_out;
    }
  
  return;
}

/*
 * Copy one BLOCK x BLOCK tile out of a larger matrix into a contiguous
 * buffer.
 *
 *   matrix       source matrix, `matrix_size` elements per row
 *   block        destination, BLOCK * BLOCK elements, tile row i stored at
 *                offset i * BLOCK
 *   matrix_size  number of elements per row of `matrix`
 *   row, col     coordinates in `matrix` of the tile's first element
 *
 * The caller must guarantee that the tile lies entirely inside the matrix.
 */
void copy_block(const MyData *const restrict matrix,
                      MyData *const restrict block,
                const u_int32_t              matrix_size,
                const u_int32_t              row,
                const u_int32_t              col)
{
  for (u_int32_t i=0 ; i<BLOCK ; i++)
    {
      /* Offsets are computed in size_t: (row + i) * matrix_size does not
         fit in 32 bits once matrix_size exceeds 65535. */
      const MyData *const src = matrix + ((size_t)(row + i) * matrix_size) + col;
            MyData *const dst = block  + ((size_t)i * BLOCK);

      for (u_int32_t j=0 ; j<BLOCK ; j++)
        dst[j] = src[j];
    }
}

/*
 * Blocked matrix-vector product: out = matrix * vector, computed tile by
 * tile through acc_matrix_vector_block_mul().
 *
 *   matrix  size x size input matrix
 *   vector  input vector of `size` elements
 *   out     output vector of `size` elements (cleared first, then accumulated)
 *   size    number of rows/columns; tiles are visited in steps of BLOCK
 */
void hw_matrix_vector_mul(const MyData *const restrict matrix,
			  const MyData *const restrict vector,
			        MyData *const restrict out,
			  const u_int32_t              size)
{
  /* the block kernel accumulates into out, so start from all-zero bytes */
  memset(out, 0, (size * sizeof(MyData)));

  u_int32_t row0 = 0;
  while (row0 < size)
    {
      u_int32_t col0 = 0;
      while (col0 < size)
	{
	  /* contiguous local copy of the current tile */
	  MyData tile[BLOCK2];
	  copy_block(matrix, tile, size, row0, col0);

	  /* multiply the tile by the matching slice of the vector */
	  acc_matrix_vector_block_mul(tile, &vector[col0], &out[row0]);

	  /* wait here: the tile buffer is reused by the next iteration */
#         pragma omp taskwait

	  col0 += BLOCK;
	}

      row0 += BLOCK;
    }
}

/*
 * Benchmark driver.
 *
 * Usage: <executable> <array size>
 *
 * Allocates one buffer holding a size x size matrix, an input vector and two
 * output vectors, runs the software and the blocked ("hardware") product
 * REPEAT_KERNEL times each, compares the two results and prints the average
 * wall-clock time of both variants.
 *
 * Returns 0 on success and -1 on a usage error (missing, malformed or
 * out-of-range size, or a size that is not a multiple of BLOCK).
 */
int main(int argc, char **argv)
{
  if (argc < 2)
    {
      printf("\n\t Usage: <executable> <array size> \n\n");
      return -1;
    }

  /* Parse the size strictly: it must start with a digit (no sign, no
     blanks) and the whole argument must be consumed. */
  char *end = NULL;
  unsigned long requested = 0UL;
  if ((argv[1][0] >= '0') && (argv[1][0] <= '9'))
    requested = strtoul(argv[1], &end, 10);

  /* alloc_buffer() takes a 32-bit element count, so the total of
     size*size + 3*size elements must fit in 32 bits; otherwise the request
     would silently wrap and the buffer would be too small. */
  const unsigned long long max_elements = 0xFFFFFFFFULL;
  if ((requested == 0UL)       ||
      (end == NULL)            ||
      (*end != '\0')           ||
      (requested > 0xFFFFUL)   ||
      (((unsigned long long)requested * (requested + 3ULL)) > max_elements))
    {
      printf("\n\t <array size> must be a positive integer with size*(size+3) below 2^32 \n\n");
      return -1;
    }

  const u_int32_t size = (u_int32_t)requested;
  
  if (size % BLOCK)
    {
      printf("\n\t <array size> must be a multiple of the BLOCK size ");
      printf("\n\t BLOCK is set to %d \n\n", BLOCK);
      return -1;
    }

  /* allocate a single buffer and carve the four arrays out of it */
  MyData *buffer = NULL;
  alloc_buffer(&buffer, ((size * size) + (3 * size)));

  /* matrix: size * size elements */
  MyData *matrix = buffer;
  /* input vector: size elements */
  MyData *vector = matrix + (size * size);
  /* software output: size elements */
  MyData *sf_out = vector + size;
  /* hardware output: size elements */
  MyData *hw_out = sf_out + size;

  /* fill matrix and vector */
  fill_buffer(matrix, (size * size));
  fill_buffer(vector, size);
  
  /* software matrix-vector multiplication
     (int counter: a narrower one would never reach a REPEAT_KERNEL > 255) */
  double sf_time = 0.0;
  for (int loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
#     pragma omp taskwait
      exe.start = wall_time();
      
      sf_matrix_vector_mul(matrix, vector, sf_out, size);

#     pragma omp taskwait
      sf_time += (wall_time() - exe.start);
    }
  
  /* hardware matrix-vector multiplication */
  double hw_time = 0.0;
  for (int loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
#     pragma omp taskwait
      exe.start = wall_time();
      
      hw_matrix_vector_mul(matrix, vector, hw_out, size);

#     pragma omp taskwait
      hw_time += (wall_time() - exe.start);
    }

  /* check result */
  check_result(sf_out, hw_out, size);
  
  /* free buffer (matrix, vector, sf_out and hw_out all point into it) */
  free(buffer);

  /* summary: times are averaged over the REPEAT_KERNEL runs */
  printf("\t ================== RESULTS ================== \n");
  printf("\t Benchmark: %s (%s) \n", "matrix_vector_mult", "OmpSs");
  printf("\t Data type: %s \n", TYPE);
#if defined(_SMP_)
  printf("\t Hardware execution on SMP \n");
#else
  printf("\t Hardware execution on FPGA \n");
#endif  
  printf("\t Software execution time (secs): %lg \n", (sf_time / REPEAT_KERNEL));
  printf("\t Hardware execution time (secs): %lg \n", (hw_time / REPEAT_KERNEL));
  printf("\t ============================================= \n\n");
  
  return 0;
}
