// Waits until all threads in the thread block have reached this point and shared memory accesses
// made by these threads prior to __syncthreads() are visible to all threads in the block.
__syncthreads();
// perform the matrix multiplication within the block