/* Edge length of the square tile processed by one block multiplication. */
#define BLOCK  4
/* Number of elements held by one BLOCK x BLOCK tile. */
#define BLOCK2 (BLOCK * BLOCK)
/* Element type of the matrices and of the tile buffers. */
typedef int32_t MyData;
/* Tile element count as an object; used as the array-section length in the
   task pragma of acc_block_block_mul(). */
const unsigned int B2SIZE = BLOCK2;

/*
 * Multiply two BLOCK x BLOCK tiles and write the product to a third:
 * C = A * B.
 *
 * Each tile is a flat array addressed as [row * BLOCK + col].  Every element
 * of C is overwritten; nothing is added to the previous contents of C.
 *
 * The function is annotated as a task (OmpSs-style pragmas): A and B are
 * inputs and C is the output, each B2SIZE elements long.  The target device
 * is selected at compile time:
 *   _SMP_       defined -> smp
 *   _SMP_FPGA_  defined -> smp and fpga
 *   neither             -> fpga
 * Both macros are tested with defined(), so they behave the same way whether
 * they are defined empty or with a value.
 *
 * A, B : input tiles, read only.
 * C    : output tile.
 *
 * NOTE(review): products and sums are computed in MyData without any
 * overflow check.
 */
#if defined(_SMP_)
#pragma omp target device(smp)
#elif defined(_SMP_FPGA_)
#pragma omp target device(smp, fpga)
#else
#pragma omp target device(fpga)
#endif
#pragma omp task in([B2SIZE]A, [B2SIZE]B) out([B2SIZE]C)
void acc_block_block_mul(const MyData *const restrict A,
			 const MyData *const restrict B,
			       MyData *const restrict C)
{
# pragma HLS inline off

  /* The loop labels are kept unchanged: tool directives kept outside this
     file may refer to them by name (not verifiable from here). */
 loop_row:
  for (u_int8_t row=0 ; row<BLOCK ; row++)
    {
    loop_col:
      for (u_int8_t col=0 ; col<BLOCK ; col++)
	{
	  /* dot product of row `row` of A with column `col` of B */
	  MyData sum = 0;

	loop_product:
	  for (u_int8_t k=0 ; k<BLOCK ; k++)
	    {
	      sum += (A[(row * BLOCK) + k] * B[(k * BLOCK) + col]);
	    }
	  C[(row * BLOCK) + col] = sum;
	} /* loop col */
    } /* loop row */
}

/*
 * Tiled matrix-product driver.
 *
 * Walks the index space [0, size) in steps of BLOCK along three axes
 * (tile row, tile column, inner dimension).  For every (tile row, tile
 * column) pair it first calls init_block() on C, then for every inner step:
 *   1. copy_block() fills a local tile from A and one from B,
 *   2. acc_block_block_mul() computes their product into a local tile,
 *   3. a taskwait blocks until that task has finished,
 *   4. store_block() hands the local result tile to C.
 *
 * A, B : source matrices (read through copy_block()).
 * C    : destination matrix (written through init_block()/store_block()).
 * size : upper bound of all three loops; presumably the edge length of the
 *        square matrices -- TODO confirm against the helpers.
 *
 * NOTE(review): acc_block_block_mul() overwrites its output tile, so the
 * sum over the inner dimension is only formed if store_block() adds into C
 * rather than overwriting it -- confirm against store_block().
 * NOTE(review): the loops do not handle a size that is not a multiple of
 * BLOCK themselves; any edge handling would have to live in the helpers.
 */
void hw_matrix_matrix_block_mul(const MyData *const restrict A,
			        const MyData *const restrict B,
			              MyData *const restrict C,
			        const u_int32_t           size)
{
  for (u_int32_t tile_row = 0; tile_row < size; tile_row += BLOCK)
    {
      for (u_int32_t tile_col = 0; tile_col < size; tile_col += BLOCK)
        {
          /* prepare the destination tile of C before any partial result
             is stored into it */
          init_block(C, size, tile_row, tile_col);

          for (u_int32_t tile_k = 0; tile_k < size; tile_k += BLOCK)
            {
              /* local working tiles: operands from A and B, and the product */
              MyData tileA[BLOCK2];
              MyData tileB[BLOCK2];
              MyData tileC[BLOCK2];

              /* fetch the operand tiles into the local buffers */
              copy_block(A, tileA, size, tile_row, tile_k);
              copy_block(B, tileB, size, tile_k, tile_col);

              /* tileC = tileA * tileB */
              acc_block_block_mul(tileA, tileB, tileC);

              /* tileC must be complete before it is read below */
#pragma omp taskwait

              /* hand the product tile over to C */
              store_block(tileC, C, size, tile_row, tile_col);
            } /* tile_k */
        } /* tile_col */
    } /* tile_row */
}
