diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index b899ce9..cd3bfa4 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -272,7 +272,13 @@ class TensorCoreDecoupled( // // Backend of the decoupled access/execute pipeline. // - val respQueueDepth = 2 // FIXME: parameterize + val respQueueDepth = 4 // FIXME: parameterize + require(respQueueDepth >= 4, + "respQueueDepth must be at least 4. This is because the B operand buffer " ++ + "is shallower than A's, so the B response queue has to be deep enough to " ++ + "hold younger requests until A operand buffer becomes valid and the first DPU " ++ + "fire can happen. FIXME: make operand buffer report per-subtile valid so " ++ + "the first compute can happen earlier.") val respQueueA = Queue(respATagged, respQueueDepth) val respQueueB = Queue(respBTagged, respQueueDepth) @@ -547,6 +553,7 @@ class FillBuffer[T <: Data]( val data = Reg(Vec(entries, gen)) val ptr = Counter(entries + 1) + dontTouch(ptr.value) val full = (ptr.value === entries.U) io.enq.ready := !full when (io.enq.fire) {