From efaf599fbe679f0e5e7ef671522408f34984057e Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 15 Oct 2024 17:08:14 -0700
Subject: [PATCH] tensor: Assert alignment of A and B response queues

---
 .../radiance/core/TensorCoreDecoupled.scala   | 41 +++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 43dc1ca..4f5ecb3 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -97,15 +97,16 @@ class TensorCoreDecoupled(
   // steps: i-j iteration
   val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc)
   val stepBits = log2Ceil(numSteps)
-  val set = RegInit(0.U(setBits.W))
-  val step = RegInit(0.U(stepBits.W))
+  // set and step being currently accessed in the acc/ex frontend
+  val setAccess = RegInit(0.U(setBits.W))
+  val stepAccess = RegInit(0.U(stepBits.W))
 
   when(io.initiate.fire) {
     val wid = io.initiate.bits.wid
     busy := true.B
     warpReg := wid
-    set := 0.U
-    step := 0.U
+    setAccess := 0.U
+    stepAccess := 0.U
     when(io.writeback.fire) {
       assert(
         io.writeback.bits.wid =/= wid,
@@ -129,8 +130,8 @@ class TensorCoreDecoupled(
   // use concatenation of set/step as the memory request source.  This will get
   // translated to the actual TL sourcewidth in sourceGen.
   val tag = Wire(new TensorMemTag)
-  tag.set := set
-  tag.step := step
+  tag.set := setAccess
+  tag.step := stepAccess
 
   val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
   val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
@@ -176,16 +177,32 @@ class TensorCoreDecoupled(
   // -------------
   // Backend of the decoupled access/execute pipeline.
   //
+  // set and step being currently executed in the acc/ex backend
+  val setExecute = RegInit(0.U(setBits.W))
+  val stepExecute = RegInit(0.U(stepBits.W))
+
   val respQueueDepth = 4 // FIXME: parameterize
   val respQueueA = Queue(respATagged, respQueueDepth)
   val respQueueB = Queue(respBTagged, respQueueDepth)
-  respQueueA.ready := io.writeback.ready // FIXME
-  respQueueB.ready := io.writeback.ready // FIXME
 
   require(respQueueA.bits.data.widthOption.get ==
           io.writeback.bits.data.widthOption.get,
     "response data width does not match the writeback data width")
 
+  val bothQueueValid = (respQueueA.valid && respQueueB.valid)
+  // assume in-order response and that A/B responses are always aligned; this
+  // might be too strong an assumption depending on the backing memory
+  when (bothQueueValid) {
+    assert((respQueueA.bits.tag.set === respQueueB.bits.tag.set) &&
+           (respQueueA.bits.tag.step === respQueueB.bits.tag.step),
+           "A and B response queue pointing to different set/steps. " ++
+           "This might indicate memory response coming back out-of-order.")
+  }
+  // synchronized dequeue
+  val deqResp = bothQueueValid && io.writeback.ready
+  respQueueA.ready := deqResp
+  respQueueB.ready := deqResp
+
   // FIXME: debug dummy: pipe A directly to writeback
   io.writeback.valid := respQueueA.valid
   val groupedRespA = respQueueA.bits.data
@@ -201,12 +218,12 @@ class TensorCoreDecoupled(
   // set/step sequencing logic
   val lastSet = ((1 << setBits) - 1)
   val lastStep = ((1 << stepBits) - 1)
-  val setDone = (set === lastSet.U)
-  val stepDone = (step === lastStep.U)
+  val setDone = (setAccess === lastSet.U)
+  val stepDone = (stepAccess === lastStep.U)
   when (nextStep) {
-    step := (step + 1.U) & lastStep.U
+    stepAccess := (stepAccess + 1.U) & lastStep.U
     when (stepDone) {
-      set := (set + 1.U) & lastSet.U
+      setAccess := (setAccess + 1.U) & lastSet.U
     }
   }