/* Number of elements consumed from each input by one call of
   acc_matrix_vector_mul(); hw_matrix_vector_mul() also uses it as the
   stride when cutting the matrix into chunks. */
#define BLOCK 32
/* Same value as BLOCK, held in a const object. It is the length used in the
   array-section dependence clauses of the acc_matrix_vector_mul() task. */
const unsigned int BSIZE = BLOCK;

/*
 * Task kernel: dot product of one BLOCK-element chunk, accumulated into *out.
 *
 * matrix - first of BLOCK consecutive matrix elements (read only)
 * vector - first of BLOCK consecutive vector elements (read only)
 * out    - single element that receives "+= sum(matrix[k] * vector[k])";
 *          it is read and then written, so the caller must initialise it
 *
 * The task runs on the "smp" device when _SMP_ is defined and on the "fpga"
 * device otherwise. Its declared dependences are BSIZE input elements from
 * matrix and from vector, plus one in/out element at out.
 */
#ifdef _SMP_
#pragma omp target device(smp)
#else
#pragma omp target device(fpga)
#endif
#pragma omp task in ([BSIZE]matrix, [BSIZE]vector) inout([1]out)
void acc_matrix_vector_mul(const MyData *const restrict matrix,
			   const MyData *const restrict vector,
			         MyData *const restrict out)
{
# pragma HLS inline off

  /* Sum into a local first so that out is touched exactly once, after the
     loop has finished. */
  MyData partial_sum = (MyData)0.0;

  /* The label is kept: it names the loop (presumably for the HLS tool and
     its reports - TODO confirm nothing else refers to it). */
 loop_matrix_vector_product:
  for (u_int16_t k=0 ; k<BLOCK ; k++) {
    partial_sum += matrix[k] * vector[k];
  }

  *out += partial_sum;
}

/*
 * Accumulate the product of a size x size row-major matrix and a vector of
 * length size into out:  out[r] += sum_c matrix[r*size + c] * vector[c].
 *
 * matrix - size*size elements, row-major (read only)
 * vector - size elements (read only)
 * out    - size elements; accumulated into, never cleared here, so the
 *          caller must initialise it
 * size   - matrix dimension; need not be a multiple of BLOCK
 *
 * Every whole BLOCK-element chunk of a row is handed to the
 * acc_matrix_vector_mul() task. When size is not a multiple of BLOCK, the
 * remaining (size % BLOCK) elements at the end of each row are summed
 * directly in this function: the task always reads exactly BLOCK elements,
 * so giving it a partial chunk would read past the end of the row and of
 * the vector.
 */
void hw_matrix_vector_mul(const MyData *const restrict matrix,
			  const MyData *const restrict vector,
			        MyData *const restrict out,
			  const u_int32_t              size)
{
  /* Number of leading columns of a row covered by whole BLOCK-sized chunks. */
  const u_int32_t full_cols = size - (size % BLOCK);

  /* Rows are reached by advancing a pointer instead of computing a flat
     size*size index, which would wrap around in 32 bits for large sizes. */
  const MyData *row = matrix;

  for (u_int32_t out_index=0 ; out_index<size ; out_index++, row+=size)
    {
      for (u_int32_t vec_index=0 ; vec_index<full_cols ; vec_index+=BLOCK)
	{
	  acc_matrix_vector_mul(&row[vec_index], &vector[vec_index], &out[out_index]);

	  /* NOTE(review): a taskwait after every single task keeps the
	     original synchronisation exactly, but it also means tasks never
	     overlap - confirm whether that is intended. */
#        pragma omp taskwait
	}

      /* Row remainder, if any. Every task that updates out[out_index] has
	 already been waited for above, so it can be updated here. */
      if (full_cols < size)
	{
	  MyData tail_sum = (MyData)0.0;

	  for (u_int32_t vec_index=full_cols ; vec_index<size ; vec_index++)
	    {
	      tail_sum += (row[vec_index] * vector[vec_index]);
	    }

	  out[out_index] += tail_sum;
	}
    }

  return;
}
