diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 897edb2..f7c6c63 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -269,13 +269,13 @@ class TensorCoreDecoupled(
 
   require(respQueueA.bits.data.widthOption.get ==
           io.writeback.bits.data.widthOption.get,
-    "response data width does not match the writeback data width")
+          "response data width does not match the writeback data width")
 
-  val substepExecute = RegInit(0.U(1.W))
+  val substepDeqA = RegInit(0.U(1.W))
   when (respQueueA.fire) {
-    substepExecute := substepExecute + 1.U
+    substepDeqA := substepDeqA + 1.U
   }
-  dontTouch(substepExecute)
+  dontTouch(substepDeqA)
 
   // Do pipelining for the A operand so that we obtain the full 4x4 A tile
   // ready for compute.  The pipeline is two-stage:
@@ -292,7 +292,7 @@ class TensorCoreDecoupled(
   val halfAQueue = Module(new Queue(
     chiselTypeOf(respQueueA.bits), entries = 1, pipe = true
   ))
-  halfAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 0.U)
+  halfAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 0.U)
   halfAQueue.io.enq.bits := respQueueA.bits
 
   // substep == 0 data goes to the LSB
@@ -305,9 +305,9 @@ class TensorCoreDecoupled(
     new TensorMemRespWithTag(dataWidth * 2), entries = 1, pipe = true
   ))
   // hold first half A data for the first substep
-  halfAQueue.io.deq.ready := respQueueA.valid && (substepExecute === 1.U) &&
+  halfAQueue.io.deq.ready := respQueueA.valid && (substepDeqA === 1.U) &&
                              fullAQueue.io.enq.ready
-  fullAQueue.io.enq.valid := respQueueA.valid && (substepExecute === 1.U) &&
+  fullAQueue.io.enq.valid := respQueueA.valid && (substepDeqA === 1.U) &&
                              halfAQueue.io.deq.valid
   fullAQueue.io.enq.bits.data := fullAEnqData
   fullAQueue.io.enq.bits.tag := fullAEnqTag
@@ -332,8 +332,8 @@ class TensorCoreDecoupled(
   // respQueueA output arbitrates to either halfAQueue or fullAQueue depending
   // on the substep
   respQueueA.ready := MuxCase(false.B,
-                              Seq((substepExecute === 0.U) -> halfAQueue.io.enq.ready,
-                                  (substepExecute === 1.U) -> fullAQueue.io.enq.ready))
+                              Seq((substepDeqA === 0.U) -> halfAQueue.io.enq.ready,
+                                  (substepDeqA === 1.U) -> fullAQueue.io.enq.ready))
   // Hold B tile at respQueueB for multiple steps for reuse, only dequeue when
   // we fully iterated a column (M-dimension).
   val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U