/* Edge length, in elements, of the square tile handled by one kernel call;
   the loops in this file iterate BLOCK times per tile dimension. */
#define BLOCK 32
/* Same value as BLOCK, exposed as a const object: this is the name used in
   the array-section sizes of the `#pragma omp task` clauses in this file. */
const unsigned int BSIZE = BLOCK;

/*
 * acc_m_v_block - accumulate one tile-times-vector product into out.
 *
 * For every i in [0, BLOCK):
 *     out[i] += sum over j in [0, BLOCK) of matrix[i*BLOCK + j] * vector[j]
 *
 * matrix : BLOCK*BLOCK elements, indexed row-major (row i starts at i*BLOCK).
 * vector : BLOCK elements.
 * out    : BLOCK elements; the result is ADDED to the existing contents, so
 *          the caller is responsible for the initial values.
 *
 * The pragmas immediately below declare this function as a task: the target
 * device is `smp` when _SMP_ is defined and `fpga` otherwise, matrix and
 * vector are task inputs of BSIZE*BSIZE and BSIZE elements, and out is an
 * inout of BSIZE elements.
 */
#if defined(_SMP_)
#pragma omp target device(smp)
#else
#pragma omp target device(fpga)
#endif
#pragma omp task in ([BSIZE*BSIZE]matrix, [BSIZE]vector) inout([BSIZE]out)
void acc_m_v_block(const MyData *const restrict matrix,
		   const MyData *const restrict vector,
		         MyData *const restrict out)
{
  /* HLS directive: do not inline this function into its callers */
# pragma HLS inline off

  /* HLS directives: request cyclic partitioning (factor 16) of both inputs */
# pragma HLS array_partition variable=matrix cyclic factor=16
# pragma HLS array_partition variable=vector cyclic factor=16
  
  /* loop over rows of blockMatrix */
 loop_data:
  for (u_int16_t i=0 ; i<BLOCK ; i++)
    {
      /* HLS directive: request a pipelined row loop with initiation interval 1.
         NOTE(review): pipelining this loop is expected to make the tool fully
         unroll loop_dot_product below - confirm against the synthesis report. */
#     pragma HLS pipeline II=1
      
      /* per-row accumulator, kept local so out[i] is touched once per row */
      MyData local_out = (MyData)0.0;

      /* dot product of row i of the tile with the vector */
    loop_dot_product:
      for (u_int16_t j=0 ; j<BLOCK ; j++)
	{
	  local_out += (matrix[(i * BLOCK) + j] * vector[j]);
	}
      /* store result */
      out[i] += local_out;
    }
  
  return;
}

/*
 * copy_block - copy one BLOCK x BLOCK tile out of a row-major matrix.
 *
 * matrix      : source matrix; consecutive rows are matrix_size elements apart.
 * block       : destination, BLOCK*BLOCK elements, filled row-major.
 * matrix_size : row stride of matrix, in elements.
 * row, col    : coordinates of the tile's top-left element inside matrix.
 *
 * Rows row..row+BLOCK-1 and columns col..col+BLOCK-1 are read without any
 * bounds check, so the caller must make sure the whole tile lies inside
 * matrix.
 */
void copy_block(const MyData *const restrict matrix,
                      MyData *const restrict block,
                const u_int32_t              matrix_size,
                const u_int32_t              row,
                const u_int32_t              col)
{
  for (u_int32_t i=0 ; i<BLOCK ; i++)
    {
      /* Row offset is computed in size_t: the product of two 32-bit values
         wraps in u_int32_t as soon as matrix_size exceeds ~65536. */
      const size_t src_row = (((size_t)row + i) * (size_t)matrix_size);

      const MyData *const src = &matrix[src_row + (size_t)col];
            MyData *const dst = &block[(size_t)i * BLOCK];

      for (u_int32_t j=0 ; j<BLOCK ; j++)
        {
          dst[j] = src[j];
        }
    }

  return;
}

/*
 * hw_matrix_vector_mul - out = matrix * vector, computed tile by tile.
 *
 * matrix : size*size elements, row-major.
 * vector : size elements.
 * out    : size elements; overwritten with the product.
 * size   : matrix dimension. It does not have to be a multiple of BLOCK:
 *          partial tiles on the right/bottom edge are zero-padded in local
 *          buffers, so no element outside matrix, vector or out is accessed.
 *
 * Each tile is copied into a local buffer, handed to acc_m_v_block, and the
 * call is followed by a taskwait before the local buffers go out of scope.
 */
void hw_matrix_vector_mul(const MyData *const restrict matrix,
                          const MyData *const restrict vector,
                                MyData *const restrict out,
                          const u_int32_t              size)
{
  /* init to zero out */
  memset(out, 0, (sizeof(MyData) * size));

  for (u_int32_t bi=0 ; bi<size ; bi+=BLOCK)
    {
      /* rows of this band that really exist in the matrix */
      const u_int32_t rows = ((size - bi) < BLOCK) ? (size - bi) : BLOCK;

      for (u_int32_t bj=0 ; bj<size ; bj+=BLOCK)
        {
          /* columns of this tile that really exist in the matrix */
          const u_int32_t cols = ((size - bj) < BLOCK) ? (size - bj) : BLOCK;

          /* local block of matrix; sized from BLOCK so it always matches
             what the kernel's task clause declares as input */
          MyData blockMatrix[BLOCK * BLOCK];

          if ((rows == BLOCK) && (cols == BLOCK))
            {
              /* copy block of matrix from global to local memory */
              copy_block(matrix, blockMatrix, size, bi, bj);

              /* perform matrix_vector_block multiplication on hardware */
              acc_m_v_block(blockMatrix, &vector[bj], &out[bi]);

#             pragma omp taskwait
            }
          else
            {
              /* Edge tile: the kernel always touches BLOCK elements, so
                 stage vector and out in padded local buffers as well.
                 Zero-filling with memset relies on the same all-bits-zero
                 convention already used for out above. */
              MyData blockVector[BLOCK];
              MyData blockOut[BLOCK];

              memset(blockMatrix, 0, sizeof(blockMatrix));
              memset(blockVector, 0, sizeof(blockVector));
              memset(blockOut,    0, sizeof(blockOut));

              for (u_int32_t r=0 ; r<rows ; r++)
                {
                  /* offset computed in size_t to avoid 32-bit wrap-around */
                  const MyData *const src =
                    &matrix[(((size_t)bi + r) * (size_t)size) + (size_t)bj];

                  for (u_int32_t c=0 ; c<cols ; c++)
                    {
                      blockMatrix[(r * BLOCK) + c] = src[c];
                    }

                  /* carry the partial sums accumulated so far */
                  blockOut[r] = out[bi + r];
                }

              for (u_int32_t c=0 ; c<cols ; c++)
                {
                  blockVector[c] = vector[bj + c];
                }

              /* padded rows/columns are zero and contribute nothing */
              acc_m_v_block(blockMatrix, blockVector, blockOut);

#             pragma omp taskwait

              /* write back only the rows that exist in out */
              for (u_int32_t r=0 ; r<rows ; r++)
                {
                  out[bi + r] = blockOut[r];
                }
            }
        } /* bj */
    } /* bi */

  return;
}

/******************** VIVADO  HLS  REPORT ********************/
/* Target Board: ZedBoard                                    */
/*************************************************************/
/* DSP48E   96    used |    220 available-43.64% utilization */
/* BRAM_18K 42    used |    280 available-15.00% utilization */
/* LUT      11011 used |  53200 available-20.70% utilization */
/* FF       9034  used | 106400 available- 8.49% utilization */
/*************************************************************/
