diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index d266edd..7f3c7da 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -202,8 +202,9 @@ class TensorCoreDecoupled( // Address generation // def addressGen(base: UInt, set: UInt, index: UInt): UInt = { - // note that both A and B are K-major to facilitate bank conflict-free SMEM - // accesses, so that below code applies to both. + // A is assumed to be block-wise M-major, and B block-wise N-major, to + // facilitate bank conflict-free SMEM accesses. With these layouts, the + // same code below works for both A and B. // // a "block" is the 4*8 byte-sized contiguous memory that can be read in // one SMEM request. The A and B matrix is assumed to be stored in @@ -211,8 +212,7 @@ class TensorCoreDecoupled( val blockRow = set val blockCol = index val blockIndex = (blockRow << indexBits) + blockCol - val blockSize = numLanes * laneWidth - require(blockSize == memWidth) + val blockSize = numLanes * (laneWidth / 8/*bits*/) val blockSizeBits = log2Ceil(blockSize) val byteOffset = blockIndex << blockSizeBits base + byteOffset