#include "header.h"

/* number of times main() runs each kernel (software and hardware);
   the accumulated wall time is divided by this value in the summary */
#define REPEAT_KERNEL 10

/* flags accepted by fill_buffer():
   RANDOM -> pseudo-random integer values in [-20, 20]
   ZERO   -> all bytes set to zero */
#define RANDOM   1
#define ZERO     0

/*
 * Reserve heap storage for `size` elements of type MyData and return
 * the pointer through `buffer`.
 *
 * buffer : address of the pointer that receives the allocation
 * size   : number of MyData elements requested
 *
 * If malloc() fails a message is printed on stdout and the program
 * terminates with EXIT_FAILURE; the function never returns in that case.
 */
void alloc_buffer(MyData      **buffer,
                  const int32_t size)
{
  MyData *const memory = (MyData *)malloc(size * sizeof(MyData));

  /* the caller's pointer is always updated, even on failure */
  *buffer = memory;

  if (memory == NULL)
    {
      printf("\n\t Cannot allocate buffer ... aborting ...\n\n");
      exit(EXIT_FAILURE);
    }
}

/*
 * Compare the first `size` elements of two buffers and report the
 * outcome on stdout.
 *
 * sf_matrix : reference result (software kernel)
 * hw_matrix : result under test (hardware kernel)
 * size      : number of elements compared, starting from index 0
 *
 * Every element is inspected (no early exit); a single mismatch is
 * enough to print "TEST failed".
 */
void check_result(const MyData *const restrict sf_matrix,
                  const MyData *const restrict hw_matrix,
                  const u_int32_t              size)
{
  u_int32_t all_equal = TRUE;
  u_int32_t idx       = 0;

  while (idx < size)
    {
      if (sf_matrix[idx] != hw_matrix[idx])
        {
          all_equal = FALSE;
        }
      idx++;
    }

  /* wait for any outstanding task before reporting */
# pragma omp taskwait

  if (all_equal)
    {
      printf("\n\t TEST passed \n\n");
    }
  else
    {
      printf("\n\t TEST failed \n\n");
    }
}

/*
 * Software reference kernel: C = A x B for square, row-major
 * matrices of dimension `size` x `size`.
 *
 * A, B : input matrices (size * size elements each)
 * C    : output matrix, every element is overwritten
 * size : number of rows (and columns) of the matrices
 *
 * The inner product is accumulated in ascending k order.
 */
void sf_matrix_matrix_mul(const MyData *const restrict A,
                          const MyData *const restrict B,
                                MyData *const restrict C,
                          const int32_t                size)
{
  for (int32_t row=0 ; row<size ; row++)
    {
      /* offset of the first element of the current row in A and C */
      const int32_t row_offset = (row * size);

      for (int32_t col=0 ; col<size ; col++)
        {
          MyData dot = (MyData)0.0;

          /* dot product of row `row` of A with column `col` of B */
          for (int32_t k=0 ; k<size ; k++)
            dot += (A[row_offset + k] * B[(k * size) + col]);

          C[row_offset + col] = dot;
        }
    }
}

/*
 * Extract a BLOCK x BLOCK tile from a row-major matrix into a
 * contiguous row-major buffer.
 *
 * matrix      : source matrix, `matrix_size` elements per row
 * block       : destination tile (BLOCK * BLOCK elements)
 * matrix_size : row stride of `matrix`
 * row, col    : coordinates in `matrix` of the tile's top-left element
 */
void copy_block(const MyData *const restrict matrix,
                      MyData *const restrict block,
                const u_int32_t              matrix_size,
                const u_int32_t              row,
                const u_int32_t              col)
{
  for (u_int32_t r=0 ; r<BLOCK ; r++)
    {
      /* index of the first element of this tile row, in source and destination */
      const u_int32_t src_base = (((row + r) * matrix_size) + col);
      const u_int32_t dst_base = (r * BLOCK);

      for (u_int32_t c=0 ; c<BLOCK ; c++)
        block[dst_base + c] = matrix[src_base + c];
    }
}

/*
 * Set to zero a BLOCK x BLOCK tile inside a row-major matrix.
 *
 * block       : despite its name, this pointer is indexed with a stride
 *               of `matrix_size`, i.e. it addresses the whole matrix
 *               that contains the tile
 * matrix_size : row stride of the matrix
 * bi, bj      : row and column of the tile's top-left element
 */
void init_block(      MyData *const restrict block,
                const int32_t                matrix_size,
                const int32_t                bi,
                const int32_t                bj)
{
  for (int32_t r=0 ; r<BLOCK ; r++)
    {
      /* index of the first element of the current tile row */
      const int32_t row_start = (((bi + r) * matrix_size) + bj);

      for (int32_t c=0 ; c<BLOCK ; c++)
        block[row_start + c] = (MyData)0.0;
    }
}

/*
 * Accumulate a contiguous BLOCK x BLOCK tile into the corresponding
 * tile of a row-major matrix (matrix += block, element by element).
 *
 * block       : source tile (BLOCK * BLOCK elements, row-major)
 * matrix      : destination matrix, `matrix_size` elements per row
 * matrix_size : row stride of `matrix`
 * bi, bj      : row and column in `matrix` of the tile's top-left element
 *
 * The destination is added to, not overwritten.
 */
void store_block(const MyData *const restrict block,
                       MyData *const restrict matrix,
                 const int32_t                matrix_size,
                 const int32_t                bi,
                 const int32_t                bj)
{
  for (int32_t r=0 ; r<BLOCK ; r++)
    {
      /* index of the first element of this tile row, in destination and source */
      const int32_t dst_base = (((bi + r) * matrix_size) + bj);
      const int32_t src_base = (r * BLOCK);

      for (int32_t c=0 ; c<BLOCK ; c++)
        matrix[dst_base + c] += block[src_base + c];
    }
}

/*
 * Accelerated kernel: C = A x B for BLOCK x BLOCK row-major tiles.
 *
 * A, B : input tiles, declared as task inputs of B2SIZE elements
 * C    : output tile, declared as task output of B2SIZE elements;
 *        every element is overwritten (no accumulation)
 *
 * The function is an OmpSs task: a call only creates the task, so the
 * caller must synchronize (e.g. taskwait) before reading C.
 * The target device is selected at compile time:
 *   _SMP_      defined -> smp
 *   _SMP_FPGA_ defined -> smp and fpga
 *   otherwise          -> fpga
 * Both macros are tested with defined(), so that an empty definition
 * of _SMP_FPGA_ does not break the preprocessor expression.
 *
 * NOTE: the loop counters are 8-bit unsigned, hence BLOCK must be
 * smaller than 256 for the loops to terminate.
 */
#if defined(_SMP_)
#pragma omp target device(smp)
#elif defined(_SMP_FPGA_)
#pragma omp target device(smp, fpga)
#else
#pragma omp target device(fpga)
#endif
#pragma omp task in([B2SIZE]A, [B2SIZE]B) out([B2SIZE]C)
void acc_block_block_mul(const MyData *const restrict A,
                         const MyData *const restrict B,
                               MyData *const restrict C)
{
# pragma HLS inline off
  
  /* one row of A should be read in one clock cycle */
# pragma HLS array_partition variable=A cyclic factor=4
  /* one column of B should be read in one clock cycle */
# pragma HLS array_partition variable=B block  factor=4

  /* alternative partitioning/pipelining directives, kept for reference */
/* # pragma HLS array_partition variable=A cyclic factor=4 */
/* # pragma HLS array_partition variable=B complete */
/* # pragma HLS array_partition variable=C cyclic factor=4 */

/* # pragma HLS pipeline II=1 */
  
 loop_row:
  for (u_int8_t row=0 ; row<BLOCK ; row++)
    {
/* #     pragma HLS pipeline II=1 */
      
    loop_col:
      for (u_int8_t col=0 ; col<BLOCK ; col++)
        {
          /* the pipeline directive is applied at the column loop level */
#        pragma HLS pipeline II=1
          
          MyData sum = (MyData)0.0;
          
          /* dot product of row `row` of A with column `col` of B */
        loop_product:
          for (u_int8_t k=0 ; k<BLOCK ; k++)
            {
              sum += (A[(row * BLOCK) + k] * B[(k * BLOCK) + col]);
            }
          C[(row * BLOCK) + col] = sum;
        } /* loop col */
    } /* loop row */

  return;
}

/*
 * Blocked matrix-matrix multiplication C = A x B that delegates each
 * BLOCK x BLOCK tile product to acc_block_block_mul().
 *
 * A, B : input matrices, row-major, `size` x `size`
 * C    : output matrix, row-major, `size` x `size`
 * size : matrix dimension; the loops advance in steps of BLOCK
 *
 * For every tile of C the tile is first zeroed, then the partial
 * products over the k dimension are computed one at a time and
 * accumulated into C.
 */
void hw_matrix_matrix_block_mul(const MyData *const restrict A,
                                const MyData *const restrict B,
                                      MyData *const restrict C,
                                const u_int32_t              size)
{
  for (u_int32_t row0=0 ; row0<size ; row0+=BLOCK)
    for (u_int32_t col0=0 ; col0<size ; col0+=BLOCK)
      {
        /* the destination tile starts from zero: store_block() accumulates */
        init_block(C, size, row0, col0);

        for (u_int32_t k0=0 ; k0<size ; k0+=BLOCK)
          {
            /* contiguous local copies of the tiles handed to the kernel */
            MyData tileA[BLOCK2];
            MyData tileB[BLOCK2];
            MyData tileC[BLOCK2];

            /* gather tile (row0, k0) of A and tile (k0, col0) of B */
            copy_block(A, tileA, size, row0, k0);
            copy_block(B, tileB, size, k0, col0);

            /* tileC = tileA x tileB, executed as a task */
            acc_block_block_mul(tileA, tileB, tileC);

            /* tileC must be complete before it is read below */
#           pragma omp taskwait

            /* C(row0, col0) += tileC */
            store_block(tileC, C, size, row0, col0);
          } /* k0 */
      } /* col0 */
}

/*
 * Initialize a `size` x `size` matrix stored contiguously in `buffer`.
 *
 * buffer : destination, must hold at least size * size elements
 * size   : matrix dimension (rows == columns)
 * flag   : RANDOM -> each element gets (rand() % 41) - 20, i.e. an
 *                    integer value in [-20, 20]
 *          ZERO   -> all size * size elements are cleared with memset()
 *
 * Any other flag prints an error on stdout and terminates the program
 * with EXIT_FAILURE.
 */
void fill_buffer(      MyData   *buffer,
                 const u_int32_t size,
                 const u_int8_t  flag)
{
  /* total number of elements; computed in size_t so that the product
     cannot wrap around the 32-bit range of `size` */
  const size_t elements = ((size_t)size * (size_t)size);

  switch (flag)
    {
    case RANDOM:
      /* linear walk: same element order as a row-by-row traversal */
      for (size_t el=0 ; el<elements ; el++)
        buffer[el] = (MyData)((rand() % (41)) - 20);
      break;

    case ZERO:
      /* clear the whole matrix, consistently with the RANDOM case */
      memset(buffer, 0, (elements * sizeof(MyData)));
      break;

    default:
      printf("\n\t Error! fill_buffer() flag not available... aborting...\n\n");
      fflush(stdout);
      exit(EXIT_FAILURE);
      break;
    }

  return;
}

/*
 * Benchmark driver.
 *
 * Usage: <executable> <matrix size>
 *
 * The matrix size must be a positive integer, a multiple of BLOCK, and
 * small enough for the four size x size matrices to be allocated with a
 * single alloc_buffer() call. Both the software and the hardware kernels
 * are executed REPEAT_KERNEL times, the two results are compared over
 * the whole matrix and the average execution times are printed.
 *
 * Returns 0 on completion, -1 on invalid command line.
 */
int main(int argc, char **argv)
{
  if (argc < 2)
    {
      printf("\n\t Usage: <executable> <matrix size> \n\n");
      return -1;
    }

  /* largest size for which the (4 * size * size) elements requested to
     alloc_buffer() still fit its int32_t argument:
     4 * 23170 * 23170 = 2147395600 <= 2147483647 */
  const long max_size = 23170;

  /* strtol() instead of atoi(): trailing garbage, empty input, zero,
     negative and out-of-range values are all rejected
     (on overflow strtol() returns LONG_MAX, which exceeds max_size) */
  char *end = NULL;
  const long requested = strtol(argv[1], &end, 10);
  if ((end == argv[1]) || (*end != '\0') ||
      (requested <= 0) || (requested > max_size))
    {
      printf("\n\t <matrix size> must be an integer in the range [1, %ld] \n\n", max_size);
      return -1;
    }

  const u_int32_t size = (u_int32_t)requested;

  if (size % BLOCK)
    {
      printf("\n\t <array size> must be a multiple of the BLOCK size ");
      printf("\n\t The actual BLOCK is %d \n\n", (int)BLOCK);
      return -1;
    }
 
  /* number of elements of each matrix */
  const u_int32_t s2ize = (size * size);
  
  /* allocate a single buffer hosting the four matrices */
  MyData *buffer = NULL;
  alloc_buffer(&buffer, (4 * s2ize));

  /* matrix A */
  MyData *A    = buffer;
  /* matrix B */
  MyData *B    = A + s2ize;
  /* sf matrix C = A x B */
  MyData *sf_C = B + s2ize;
  /* hw matrix C = A x B */
  MyData *hw_C = sf_C + s2ize;

  /* fill matrices A and B (sf_C and hw_C are written by the kernels) */
  fill_buffer(A, size, RANDOM);
  fill_buffer(B, size, RANDOM);
  
  /* software matrix-matrix multiplication */
  double sf_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
#     pragma omp taskwait
      exe.start = wall_time();
      
      sf_matrix_matrix_mul(A, B, sf_C, size);

#     pragma omp taskwait
      sf_time += (wall_time() - exe.start);
    }

  /* hardware matrix-matrix multiplication */
  double hw_time = 0.0;
  for (u_int8_t loop=0 ; loop<REPEAT_KERNEL ; loop++)
    {
#     pragma omp taskwait
      exe.start = wall_time();
      
      hw_matrix_matrix_block_mul(A, B, hw_C, size);

#     pragma omp taskwait
      hw_time += (wall_time() - exe.start);
    }

  /* check result over all the size * size elements, not just the first row */
  check_result(sf_C, hw_C, s2ize);
  
  /* free buffer */
  free(buffer);

  /* summary: average time per kernel execution */
  printf("\t ================== RESULTS ================== \n");
  printf("\t Benchmark: %s (%s) \n", "matrix_matrix_block_mult", "OmpSs");
  printf("\t Software execution time (secs): %lg \n", (sf_time / REPEAT_KERNEL));
  printf("\t Hardware execution time (secs): %lg \n", (hw_time / REPEAT_KERNEL));
  printf("\t ============================================= \n\n");
  
  return 0;
}
