Merge branch 'main' of https://github.com/ucb-bar/radiance into main

2024-10-23 15:09:48 -07:00
parent 0a54018650 2a8c488d28
commit 9b8d16d184
8 changed files with 838 additions and 18 deletions
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -0,0 +1,716 @@
+// See LICENSE.SiFive for license details.
+// See LICENSE.Berkeley for license details.
+
+package radiance.core
+
+import chisel3._
+import chisel3.util._
+import chisel3.experimental.requireIsChiselType
+import org.chipsalliance.cde.config.Parameters
+import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
+import freechips.rocketchip.tilelink._
+import freechips.rocketchip.diplomacy.{IdRange, AddressSet}
+import freechips.rocketchip.unittest.{UnitTest, UnitTestModule}
+import radiance.memory.SourceGenerator
+
+case class TensorTilingParams(
+  // Dimension of the SMEM tile
+  m: Int = 16,
+  n: Int = 16,
+  k: Int = 16,
+  // Dimension of the compute tile.  This is determined by the number of MAC
+  // units
+  mc: Int = 4,
+  nc: Int = 4,
+  kc: Int = 4
+)
+
+class TensorCoreDecoupled(
+    val numWarps: Int,
+    val numLanes: Int,
+    val numSourceIds: Int,
+    val tilingParams: TensorTilingParams,
+    val numFPRegs: Int = 32
+) extends Module {
+  val numWarpBits = log2Ceil(numWarps)
+  val wordSize = 4 // TODO FP16
+  val wordSizeInBits = wordSize * 8 // TODO FP16
+  val sourceWidth = log2Ceil(numSourceIds)
+  val dataWidth = numLanes * wordSizeInBits // TODO FP16
+  val numFPRegBits = log2Ceil(numFPRegs)
+
+  val io = IO(new Bundle {
+    val initiate = Flipped(Decoupled(new Bundle {
+      val wid = UInt(numWarpBits.W)
+    }))
+    val writeback = Decoupled(new Bundle {
+      val last = Bool()
+      val wid = UInt(numWarpBits.W)
+      val rd = UInt(numFPRegBits.W)
+      val data = Vec(numLanes, UInt((wordSizeInBits).W))
+    })
+    val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
+    val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
+    val reqA = Decoupled(new TensorMemReq(sourceWidth))
+    val reqB = Decoupled(new TensorMemReq(sourceWidth))
+  })
+  dontTouch(io)
+
+  class TensorMemReq(
+    sourceWidth: Int
+  ) extends Bundle {
+    val source = UInt(sourceWidth.W)
+    val address = UInt(32.W)
+  }
+  class TensorMemResp(
+    sourceWidth: Int,
+    dataWidth: Int
+  ) extends Bundle {
+    val source = UInt(sourceWidth.W)
+    val data = UInt(dataWidth.W)
+  }
+  class TensorMemTag extends Bundle {
+    val warp = UInt(numWarpBits.W)
+    val set = UInt(setBits.W)
+    val index = UInt(indexBits.W)
+  }
+  // mem response after translation from TL source to set/step tag
+  class TensorMemRespWithTag(
+    dataWidth: Int
+  ) extends Bundle {
+    val tag = new TensorMemTag
+    val data = UInt(dataWidth.W)
+  }
+
+  // ===========================================================================
+  // Access stage
+  // ===========================================================================
+  //
+  // Frontend of the decoupled access/execute pipeline.
+
+  // sets: k iteration
+  val numSets = (tilingParams.k / tilingParams.kc)
+  val setBits = log2Ceil(numSets)
+  // steps: i-j iteration
+  val numSteps = (tilingParams.m * tilingParams.n) / (tilingParams.mc * tilingParams.nc)
+  val stepBits = log2Ceil(numSteps)
+  val lastSet = ((1 << setBits) - 1)
+  val lastStep = ((1 << stepBits) - 1)
+  def setDone(set: UInt) = (set === lastSet.U)
+  def stepDone(step: UInt) = (step === lastStep.U)
+  // 'index' is the index of a memory request among the sequence of requests
+  // needed to read a full M-column of A or N-row of B.  Its range is [0,m/2)
+  // or [0,n/2), where 2 is the stride can be read in a single request size.
+  require(tilingParams.m == tilingParams.n,
+          "currently only supports square SMEM tile")
+  val numIndices = tilingParams.m / 2/*FIXME:hardcoded?*/
+  val indexBits = log2Ceil(numIndices)
+  val lastIndex = (1 << indexBits) - 1
+
+  object AccessorState extends ChiselEnum {
+    val idle = Value(0.U)
+    val access = Value(1.U)
+    // All set/step sequencing is complete and the tensor core is holding the
+    // result data until downstream writeback is ready.
+    // FIXME: is this necessary if writeback is decoupled with queues?
+    val finish = Value(2.U)
+  }
+  val state = RegInit(AccessorState.idle)
+  val allReqsDone = WireInit(false.B)
+  dontTouch(allReqsDone)
+
+  val warpAccess = RegInit(0.U(numWarpBits.W))
+
+  class BlockState extends Bundle {
+    val set = UInt(setBits.W)
+    val index = UInt(indexBits.W)
+  }
+  val stateInit = Wire(new BlockState)
+  stateInit.set := 0.U
+  stateInit.index := 0.U
+  val stateA = RegInit(stateInit)
+  val stateB = RegInit(stateInit)
+  dontTouch(stateA)
+  dontTouch(stateA.index)
+  dontTouch(stateB)
+  dontTouch(stateB.index)
+
+  io.initiate.ready := (state === AccessorState.idle)
+  when (io.initiate.fire) {
+    warpAccess := io.initiate.bits.wid
+    assert(stateA.set === 0.U && stateA.index === 0.U &&
+           stateB.set === 0.U && stateB.index === 0.U,
+           "stateA and stateB not initialized to zero")
+  }
+
+  switch(state) {
+    is(AccessorState.idle) {
+      when(io.initiate.fire) {
+        state := AccessorState.access
+      }
+    }
+    is(AccessorState.access) {
+      when (allReqsDone) {
+        state := AccessorState.finish
+      }
+    }
+    is(AccessorState.finish) {
+      // FIXME: is finish state needed?
+      state := AccessorState.idle
+    }
+  }
+
+  when (io.reqA.fire) {
+    when (stateA.index === lastIndex.U) {
+      stateA.set := stateA.set + 1.U
+    }
+    stateA.index := stateA.index + 1.U
+  }
+  when (io.reqB.fire) {
+    when (stateB.index === lastIndex.U) {
+      stateB.set := stateB.set + 1.U
+    }
+    stateB.index := stateB.index + 1.U
+  }
+
+  // Address generation
+  //
+  def addressGen(base: UInt, set: UInt, index: UInt): UInt = {
+    // note that both A and B are K-major to facilitate bank conflict-free SMEM
+    // accesses, so that below code applies to both.
+    //
+    // a "block" is the 4*8 byte-sized contiguous memory that can be read in
+    // one SMEM request.  The A and B matrix is assumed to be stored in
+    // block-wise "index"-major order (M-major for A, N-major for B)
+    val blockRow = set
+    val blockCol = index
+    val blockIndex = (blockRow << indexBits) + blockCol
+    val blockSize = numLanes * wordSize
+    val blockSizeBits = log2Ceil(blockSize)
+    val byteOffset = blockIndex << blockSizeBits
+    base + byteOffset
+
+    // address generation for byte-wise K-major A and B layout
+    // val elemRow = blockRow << 1
+    // val elemCol =  blockCol << log2Ceil(tilingParams.kc)
+    // val rowStride = tilingParams.k * wordSize
+    // val rowStrideBits = log2Ceil(rowStride)
+    // val wordStrideBits = log2Ceil(wordSize)
+    // val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits)
+    // base + tileOffset
+  }
+
+  // FIXME: bogus base address
+  val addressA = addressGen(0.U, stateA.set, stateA.index)
+  // SMEM 256KB, 8 banks: 0x8000B(32KB) per bank
+  val addressB = addressGen(0x8000.U, stateB.set, stateB.index)
+
+  val lastReqA = (stateA.set === lastSet.U) && (stateA.index === lastIndex.U)
+  val lastReqB = (stateB.set === lastSet.U) && (stateB.index === lastIndex.U)
+  val doneReqA = RegInit(false.B)
+  val doneReqB = RegInit(false.B)
+  when (lastReqA && io.reqA.fire) { doneReqA := true.B }
+  when (lastReqB && io.reqB.fire) { doneReqB := true.B }
+  val genReqA = (state === AccessorState.access) && !doneReqA
+  val genReqB = (state === AccessorState.access) && !doneReqB
+  when (state === AccessorState.finish) {
+    doneReqA := false.B
+    doneReqB := false.B
+    stateA.set := 0.U
+    stateA.index := 0.U
+    stateB.set := 0.U
+    stateB.index := 0.U
+  }
+
+  allReqsDone := doneReqA && doneReqB
+
+  // Request generation
+  //
+  val tagA = Wire(new TensorMemTag)
+  tagA.warp := warpAccess
+  tagA.set := stateA.set
+  tagA.index := stateA.index
+  val tagB = Wire(new TensorMemTag)
+  tagB.warp := warpAccess
+  tagB.set := stateB.set
+  tagB.index := stateB.index
+
+  val respATagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
+  val respBTagged = Wire(Decoupled(new TensorMemRespWithTag(dataWidth)))
+  Seq((io.reqA, (io.respA, respATagged)),
+      (io.reqB, (io.respB, respBTagged))).zipWithIndex.foreach {
+    case ((req, (resp, respTagged)), i) => {
+      val sourceGen = Module(new SourceGenerator(
+        log2Ceil(numSourceIds),
+        metadata = Some(new TensorMemTag)
+      ))
+
+      sourceGen.io.gen := req.fire
+      sourceGen.io.meta := (if (i == 0) tagA else tagB)
+      req.valid := (if (i == 0) genReqA else genReqB)
+      req.bits.address := (if (i == 0) addressA else addressB)
+      req.bits.source := sourceGen.io.id.bits
+
+      sourceGen.io.reclaim.valid := resp.fire
+      sourceGen.io.reclaim.bits := resp.bits.source
+
+      // translate source
+      respTagged.valid := resp.valid
+      respTagged.bits.tag := sourceGen.io.peek
+      respTagged.bits.data := resp.bits.data
+      resp.ready := respTagged.ready
+    }
+  }
+
+  // ===========================================================================
+  // Execute stage
+  // ===========================================================================
+  //
+  // Backend of the decoupled access/execute pipeline.
+  //
+  val respQueueDepth = 4 // FIXME: parameterize
+  require(respQueueDepth >= 4,
+    "respQueueDepth must be at least 4.  This is because the B operand buffer " ++
+    "is shallower than A's, so the B response queue has to be deep enough to " ++
+    "hold younger requests until A operand buffer becomes valid and the first DPU " ++
+    "fire can happen.  FIXME: make operand buffer report per-subtile valid so " ++
+    "the first compute can happen earlier.")
+  val respQueueA = Queue(respATagged, respQueueDepth)
+  val respQueueB = Queue(respBTagged, respQueueDepth)
+
+  require(respQueueA.bits.data.widthOption.get ==
+          io.writeback.bits.data.widthOption.get,
+          "response data width does not match the writeback data width")
+
+  // FIXME: unnecessary
+  val substepDeqA = RegInit(0.U(1.W))
+  when (respQueueA.fire) {
+    substepDeqA := substepDeqA + 1.U
+  }
+  dontTouch(substepDeqA)
+
+  // Stage the operands in a pipeline so that we obtain the full 4x4 tiles
+  // ready for compute.  Also send the set/step tag along the pipe for
+  // alignment check.
+
+  // @cleanup: dedup A and B below
+
+  val fullA = Module(new FillBuffer(
+    chiselTypeOf(respQueueB.bits.data), numIndices
+  ))
+  fullA.io.enq.valid := respQueueA.valid
+  fullA.io.enq.bits := respQueueA.bits.data
+  respQueueA.ready := fullA.io.enq.ready
+  // `pipe` combinationally couples enq-deq ready
+  val fullATag = Module(new Queue(
+    new TensorMemTag, entries = 1, pipe = true
+  ))
+  fullATag.io.enq.valid := respQueueA.valid
+  fullATag.io.enq.bits := respQueueA.bits.tag
+
+  // stage the full A tile once more so that FillBuffer can be filled up in the
+  // background while the tile is being used for compute.  This does come with
+  // capacity overhead.
+  val fullABuf = Module(new Queue(
+    new Bundle {
+      val data = chiselTypeOf(fullA.io.deq.bits)
+      val tag = new TensorMemTag
+    }, entries = 1, pipe = true
+  ))
+  fullABuf.io.enq.valid := fullA.io.deq.valid
+  fullABuf.io.enq.bits.data := fullA.io.deq.bits
+  fullABuf.io.enq.bits.tag := fullATag.io.deq.bits
+  fullA.io.deq.ready := fullABuf.io.enq.ready
+  fullATag.io.deq.ready := fullABuf.io.enq.ready
+
+  // serialize every two B responses into one full 4x4 B tile
+  // FIXME: do the same for A
+  val fullB = Module(new FillBuffer(
+    chiselTypeOf(respQueueB.bits.data), 2/*substeps*/
+  ))
+  fullB.io.enq.valid := respQueueB.valid
+  fullB.io.enq.bits := respQueueB.bits.data
+  respQueueB.ready := fullB.io.enq.ready
+  val fullBTag = Module(new Queue(
+    new TensorMemTag, entries = 1, pipe = true
+  ))
+  fullBTag.io.enq.valid := respQueueB.valid
+  fullBTag.io.enq.bits := respQueueB.bits.tag
+
+  val fullBBuf = Module(new Queue(
+    new Bundle {
+      val data = chiselTypeOf(fullB.io.deq.bits)
+      val tag = new TensorMemTag
+    }, entries = 1, pipe = true
+  ))
+  fullBBuf.io.enq.valid := fullB.io.deq.valid
+  fullBBuf.io.enq.bits.data := fullB.io.deq.bits
+  fullBBuf.io.enq.bits.tag := fullBTag.io.deq.bits
+  fullB.io.deq.ready := fullBBuf.io.enq.ready
+  fullBTag.io.deq.ready := fullBBuf.io.enq.ready
+
+  val dpuReady = Wire(Bool())
+  val operandsValid = fullABuf.io.deq.valid && fullBBuf.io.deq.valid
+  val dpuFire = operandsValid && dpuReady
+
+  val setCompute = RegInit(0.U(setBits.W))
+  val stepCompute = RegInit(0.U(stepBits.W))
+  val substepCompute = RegInit(0.U(1.W))
+  val nextStepCompute = dpuFire && (substepCompute === 1.U)
+  dontTouch(setCompute)
+  dontTouch(stepCompute)
+  dontTouch(substepCompute)
+  when (dpuFire) {
+    substepCompute := substepCompute + 1.U
+  }
+
+  // Operand selection
+  //
+  // select the correct 4x4 tile from A operand buffer
+  val numTilesM = tilingParams.m / tilingParams.mc
+  val numTilesMBits = log2Ceil(numTilesM)
+  def selectOperandA(buf: Vec[UInt]): UInt = {
+    require(buf.length == numIndices)
+    val stepM = stepCompute & ((1 << numTilesMBits) - 1).U
+    Cat(buf((stepM << 1) + 1.U), buf(stepM << 1))
+  }
+  val operandA = selectOperandA(fullABuf.io.deq.bits.data)
+  val operandATag = fullABuf.io.deq.bits.tag
+  // select the correct 2x4 tile from B operand buffer
+  val operandB = fullBBuf.io.deq.bits.data(substepCompute)
+  val operandBTag = fullBBuf.io.deq.bits.tag
+  dontTouch(operandATag)
+  dontTouch(operandBTag)
+
+  // Operand buffer logic
+  //
+  // hold A data until the entire set is done
+  val shouldDequeueAMask = ((1 << stepBits) - 1).U
+  val shouldDequeueA =
+    ((stepCompute & shouldDequeueAMask) === shouldDequeueAMask) &&
+    (substepCompute === 1.U)
+  fullABuf.io.deq.ready := dpuFire && shouldDequeueA
+  // hold B tile at respQueueB for multiple steps for reuse, only dequeue when
+  // we fully iterated a column (M-dimension)
+  val shouldDequeueBMask = ((1 << numTilesMBits) - 1).U
+  val shouldDequeueB =
+    ((stepCompute & shouldDequeueBMask) === shouldDequeueBMask) &&
+    (substepCompute === 1.U)
+  fullBBuf.io.deq.ready := dpuFire && shouldDequeueB
+  dontTouch(respQueueA)
+  dontTouch(respQueueB)
+  dontTouch(shouldDequeueA)
+  dontTouch(shouldDequeueB)
+
+  // Assert that the DPU is computing with operands of the same set/step. Note
+  // that the B resp will only have step values multiple of 4 due to reuse.
+  //
+  // This check assumes that memory responses come back in-order.  Might be too
+  // strong of an assumption depending on the backing memory.
+  def assertAligned = {
+    val stepMask = (1 << numTilesMBits).U
+    when (dpuFire) {
+      assert(operandATag.warp === operandBTag.warp &&
+             operandATag.set  === operandBTag.set,
+             "A and B operands are pointing to different warps and sets. " ++
+             "This might indicate memory response coming back out-of-order.")
+      assert(operandATag.set === setCompute,
+             "Operand arrived from memory is pointing at a different set than the FSM.")
+    }
+  }
+  assertAligned
+
+  // Dot-product unit
+  //
+  // 4x2 four-element DPUs summing up to 32 MACs in total
+  //
+  val ncSubstep = tilingParams.nc / 2
+  require(tilingParams.mc * ncSubstep == numLanes,
+          "substep tile size doesn't match writeback throughput")
+  val dpus = Seq.fill(tilingParams.mc)(Seq.fill(ncSubstep)(
+    Module(new TensorDotProductUnit(half = false))
+  ))
+
+  // reshape operands for easier routing to DPU
+  def reshapeByFourWords(x: UInt): Seq[Seq[UInt]] = {
+    x.asBools.grouped(wordSizeInBits).map(VecInit(_).asUInt).toSeq
+     .grouped(4/*k-dim*/).toSeq
+  }
+  val operandADimensional = reshapeByFourWords(operandA)
+  require(operandADimensional.length == tilingParams.mc &&
+          operandADimensional(0).length == tilingParams.kc,
+          "operand width doesn't agree with tiling parameter")
+  val operandBDimensional = reshapeByFourWords(operandB)
+  require(operandBDimensional.length == ncSubstep &&
+          operandBDimensional(0).length == tilingParams.kc,
+          "operand width doesn't agree with tiling parameter")
+
+  for (m <- 0 until tilingParams.mc) {
+    for (n <- 0 until ncSubstep) {
+      dpus(m)(n).io.in.valid := dpuFire
+      dpus(m)(n).io.in.bits.a := operandADimensional(m)
+      dpus(m)(n).io.in.bits.b := operandBDimensional(n)
+      dpus(m)(n).io.in.bits.c := 0.U // FIXME: bogus accum data
+      // dpu ready couples with writeback backpressure
+      dpus(m)(n).io.stall := !io.writeback.ready
+    }
+  }
+  dpuReady := !dpus(0)(0).io.stall
+  dontTouch(dpuFire)
+  dontTouch(dpuReady)
+
+  val dpuValids = dpus.flatMap(_.map(_.io.out.valid))
+  val dpuValid = dpuValids.reduce(_ && _)
+  def assertDPU = {
+    val dpuStalls = dpus.flatMap(_.map(_.io.stall))
+    assert(dpuStalls.reduce(_ && _) === dpuStalls.reduce(_ || _),
+      "stall signals of DPUs went out of sync")
+    assert(dpuValids.reduce(_ && _) === dpuValids.reduce(_ || _),
+      "valid signals of DPUs went out of sync")
+  }
+  assertDPU
+
+  // flatten DPU output into 1D array in M-major order
+  val flattenedDPUOut = (0 until ncSubstep).flatMap { n =>
+    (0 until tilingParams.mc).map { m =>
+      dpus(m)(n).io.out.bits.data
+    }
+  }
+  io.writeback.bits.data := flattenedDPUOut
+
+  // Writeback logic
+  //
+  // These queues hold metadata needed for writeback in sync with the DPU.
+
+  class TensorComputeTag extends Bundle {
+    val warp = UInt(numWarpBits.W)
+    val set = UInt(setBits.W)
+    val step = UInt(stepBits.W)
+    val substep = UInt(1.W)
+  }
+
+  val queueDepth = 5 // needs to be at least the DPU latency
+  val tagQueue = Module(new Queue(new TensorComputeTag, queueDepth))
+  tagQueue.io.enq.valid := dpuFire
+  tagQueue.io.enq.bits.warp := operandATag.warp
+  tagQueue.io.enq.bits.set := setCompute
+  tagQueue.io.enq.bits.step := stepCompute
+  tagQueue.io.enq.bits.substep := substepCompute
+  tagQueue.io.deq.ready := io.writeback.fire
+  assert(tagQueue.io.enq.ready === true.B,
+         "tag queue full, DPU operation might be throttled")
+  assert(!dpuValid || tagQueue.io.deq.valid,
+         "tag queue and DPU went out of sync")
+
+  // val widQueue = Queue(io.initiate, queueDepth, pipe = (queueDepth == 1))
+
+  // note rd is independent to sets
+  def rdGen(step: UInt, substep: UInt): UInt = {
+    // each step produces 4x4 output tile, written by 8 threads with 2 regs per
+    // thread
+    (step << 1/*2 substeps*/) + substep
+  }
+
+  val warpWriteback = tagQueue.io.deq.bits.warp
+  val setWriteback = tagQueue.io.deq.bits.set
+  val stepWriteback = tagQueue.io.deq.bits.step
+  val substepWriteback = tagQueue.io.deq.bits.substep
+  io.writeback.valid := dpuValid
+  io.writeback.bits.wid := warpWriteback
+  io.writeback.bits.rd := rdGen(stepWriteback, substepWriteback)
+  io.writeback.bits.last := setDone(setWriteback) && stepDone(stepWriteback) &&
+                            (substepWriteback === 1.U)
+
+  // State transition
+  // ----------------
+  //
+  // set/step sequencing logic
+
+  def sequenceSetStep(set: UInt, step: UInt, nextStep: Bool) = {
+    when (nextStep) {
+      step := (step + 1.U) & lastStep.U
+      when (stepDone(step)) {
+        set := (set + 1.U) & lastSet.U
+      }
+    }
+  }
+  sequenceSetStep(setCompute, stepCompute, nextStepCompute)
+}
+
+// A buffer that collects multiple entries of input data and exposes the
+// coalesced data as output.  Effectively acts as a width-widening
+// chisel.util.Pipe.
+class FillBuffer[T <: Data](
+  gen: T,
+  entries: Int
+) extends Module {
+  require(entries > 0, "FillBuffer must have a positive number of entries")
+  requireIsChiselType(gen)
+
+  val io = IO(new Bundle {
+    val enq = Flipped(Decoupled(gen))
+    val deq = Decoupled(Vec(entries, gen))
+  })
+
+  val data = Reg(Vec(entries, gen))
+  val ptr = Counter(entries + 1)
+  dontTouch(ptr.value)
+  val full = (ptr.value === entries.U)
+  io.enq.ready := !full
+  when (io.enq.fire) {
+    data(ptr.value) := io.enq.bits
+    ptr.inc()
+  }
+  io.deq.valid := full
+  (io.deq.bits zip data).foreach { case (io, d) => io := d }
+  when (io.deq.fire) {
+    assert(ptr.value === entries.U, "FillBuffer fired before buffer was full")
+    ptr.reset()
+  }
+}
+
+// synthesizable unit tests
+
+// wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy
+// graph.
+class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule {
+  val numSourceIds = 16
+
+  // node with two edges; one for A and one for B matrix
+  val node = TLClientNode(Seq(
+    TLMasterPortParameters.v2(
+      Seq(TLMasterParameters.v2(
+        name = "TensorCoreDecoupledMatrixANode",
+        sourceId = IdRange(0, numSourceIds)
+      ))
+    ),
+    TLMasterPortParameters.v2(
+      Seq(TLMasterParameters.v2(
+        name = "TensorCoreDecoupledMatrixBNode",
+        sourceId = IdRange(0, numSourceIds)
+      ))
+    )
+  ))
+
+  lazy val module = new TensorCoreDecoupledTLImp(this)
+}
+
+class TensorCoreDecoupledTLImp(outer: TensorCoreDecoupledTL)
+    extends LazyModuleImp(outer) with UnitTestModule {
+  require(outer.node.out.length == 2/*A and B*/)
+
+  val tensor = Module(new TensorCoreDecoupled(
+                      8, 8, outer.numSourceIds , TensorTilingParams()))
+  val wordSize = 4 // @cleanup: hardcoded
+
+  val zip = Seq((outer.node.out(0), tensor.io.reqA),
+                (outer.node.out(1), tensor.io.reqB))
+  zip.foreach { case ((tl, edge), req) =>
+    tl.a.valid := req.valid
+    val (legal, bits) = edge.Get(
+      fromSource = req.bits.source,
+      toAddress = req.bits.address,
+      lgSize = log2Ceil(wordSize).U
+    )
+    tl.a.bits := bits
+    req.ready := tl.a.ready
+    when(tl.a.fire) {
+      assert(legal, "illegal TL req gen")
+    }
+  }
+
+  // TODO: dedup A and B
+  val (tlOutA, _) = outer.node.out(0)
+  val (tlOutB, _) = outer.node.out(1)
+  tensor.io.respA.valid := tlOutA.d.valid
+  tensor.io.respA.bits.data := tlOutA.d.bits.data
+  tensor.io.respA.bits.source := tlOutA.d.bits.source
+  tlOutA.d.ready := tensor.io.respA.ready
+  tensor.io.respB.valid := tlOutB.d.valid
+  tensor.io.respB.bits.data := tlOutB.d.bits.data
+  tensor.io.respB.bits.source := tlOutB.d.bits.source
+  tlOutB.d.ready := tensor.io.respB.ready
+
+  tensor.io.initiate.valid := io.start
+  tensor.io.initiate.bits.wid := 0.U // TODO
+  tensor.io.writeback.ready := true.B
+
+  io.finished := tensor.io.writeback.valid && tensor.io.writeback.bits.last
+  when (io.finished) {
+    // might be too strong
+    assert(tensor.io.writeback.bits.rd === 31.U)
+  }
+}
+
+// a minimal Diplomacy graph with a tensor core and a TLRAM
+class TensorCoreDecoupledTLRAM(implicit p: Parameters) extends LazyModule {
+  val tensor = LazyModule(new TensorCoreDecoupledTL)
+  val xbar = LazyModule(new TLXbar)
+  val ram = LazyModule(new TLRAM(
+    address = AddressSet(0x0000, 0xffffff),
+    beatBytes = 32 // @cleanup: hardcoded
+  ))
+
+  ram.node :=* xbar.node :=* tensor.node
+
+  lazy val module = new Impl
+  class Impl extends LazyModuleImp(this) with UnitTestModule {
+    tensor.module.io.start := io.start
+    io.finished := tensor.module.io.finished
+  }
+}
+
+// two separate TLRAMs for A and B for full throughput
+class TensorCoreDecoupledTwoTLRAM(implicit p: Parameters) extends LazyModule {
+  val tensor = LazyModule(new TensorCoreDecoupledTL)
+  val xbar = LazyModule(new TLXbar)
+  val ramA = LazyModule(new TLRAM(
+    address = AddressSet(0x000, 0xfffbff),
+    beatBytes = 32 // @cleanup: hardcoded
+  ))
+  val ramB = LazyModule(new TLRAM(
+    address = AddressSet(0x400, 0xfffbff),
+    beatBytes = 32 // @cleanup: hardcoded
+  ))
+
+  val stutter = new TLIdentityNode
+  xbar.node :=* tensor.node
+  ramA.node := stutter := xbar.node
+  ramB.node := xbar.node
+
+  val fuzz = false
+
+  lazy val module = new Impl
+  class Impl extends LazyModuleImp(this) with UnitTestModule {
+    tensor.module.io.start := io.start
+    io.finished := tensor.module.io.finished
+
+    val (tlIn, _) = stutter.in(0)
+    val (tlOut, _) = stutter.out(0)
+    require(stutter.in.length == 1)
+    require(stutter.out.length == 1)
+
+    // inject stalls for fuzzing
+    val incr = Wire(Bool())
+    val (count, _) = Counter(incr, 0x1000)
+    def cond(x: UInt) = (x & ((1 << 3) - 1).U) =/= 0.U
+    val stall = if (fuzz) cond(count) else false.B
+
+    tlOut.a <> tlIn.a
+    tlIn.d <> tlOut.d
+    incr := tlIn.a.fire || stall
+    when (stall) {
+      tlIn.a.ready := false.B
+      tlOut.a.valid := false.B
+    }
+  }
+}
+
+// unit test harness
+class TensorCoreDecoupledTest(timeout: Int = 500000)(implicit p: Parameters)
+    extends UnitTest(timeout) {
+  // val dut = Module(LazyModule(new TensorCoreDecoupledTLRAM).module)
+  val dut = Module(LazyModule(new TensorCoreDecoupledTwoTLRAM).module)
+  dut.io.start := io.start
+  io.finished := dut.io.finished
+}
--- a/src/main/scala/radiance/core/TensorDPU.scala
+++ b/src/main/scala/radiance/core/TensorDPU.scala
@@ -27,11 +27,13 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar
      val b = Vec(dotProductDim, Bits((inFLen).W))
      val c = Bits((outFLen).W) // note C has the out length for accumulation
    }))
+    // 'stall' is effectively out.ready, combinationally coupled to in.ready
    val stall = Input(Bool())
    val out = Valid(new Bundle {
      val data = Bits((outFLen).W)
    })
  })
+  dontTouch(io)

  // [IEEE] -> recode() -> unbox() -> [Hardfloat] -> box() -> ieee() -> [IEEE]
  // make sure recoding/uncoding happens only at the edge, not at every
@@ -52,7 +54,7 @@ class TensorDotProductUnit(val half: Boolean) extends Module with tile.HasFPUPar
  io.out.bits.data := ieee(box(dpu.io.out.bits.data, S))
 }

-// Copied from chisel3.util.Pipe.
+// An implementation of chisel3.util.Pipe that supports stalls.
 class StallingPipe[T <: Data](val gen: T, val latency: Int = 1) extends Module {
  /** A non-ambiguous name of this `StallingPipe` for use in generated Verilog
   *  names. Includes the latency cycle count in the name as well as the
--- a/src/main/scala/radiance/memory/Coalescing.scala
+++ b/src/main/scala/radiance/memory/Coalescing.scala
@@ -372,7 +372,8 @@ class SourceGenerator[T <: Data](
      outstanding := outstanding + 1.U
    }
  }.elsewhen(io.reclaim.valid) {
-    assert(outstanding > 0.U)
+    assert(outstanding > 0.U,
+           "Over-reclaim. Did some responses get dropped?")
    outstanding := outstanding - 1.U
  }
  dontTouch(outstanding)
--- a/src/main/scala/radiance/tile/RadianceTile.scala
+++ b/src/main/scala/radiance/tile/RadianceTile.scala
@@ -379,6 +379,12 @@ class RadianceTile private (
    tlMasterXbar.node :=* AddressOrNode(base) :=* dcacheNode
  }

+  // Instantiate a fake TensorCoreDecoupled module to force unique-ification of
+  // module names in the Chisel-generated Verilog.  This should be disabled for
+  // synthesis runs
+  val tensor = LazyModule(new radiance.core.TensorCoreDecoupledTL)
+  tlMasterXbar.node :=* tensor.node
+
  /* below are copied from rocket */

  val tile_master_blocker =
@@ -733,7 +739,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
      }
    }

-    def connectTc {
+    def connectTensor = {
      val tcb0 = new {
        val addr = core.io.tc_a_bits_address(31, 0)
        val tag = core.io.tc_a_bits_tag(3, 0)
@@ -752,16 +758,18 @@ class RadianceTileModuleImp(outer: RadianceTile)
        val adapter = Module(
          new VortexTLAdapter(
            outer.smemSourceWidth,
-            new VortexBundleA(tagWidth = 1, dataWidth = 32 * 8),
-            new VortexBundleD(tagWidth = 1, dataWidth = 32 * 8),
+            new VortexBundleA(tagWidth = 4, dataWidth = 32 * 8),
+            new VortexBundleD(tagWidth = 4, dataWidth = 32 * 8),
            client
          )
        )
+        require(adapter.io.inReq.bits.source.widthOption.get == bundle.tag.widthOption.get)
+        require(adapter.io.inReq.bits.address.widthOption.get == bundle.addr.widthOption.get)
        adapter.io.inReq.bits <> DontCare
        adapter.io.inReq.valid := bundle.aValid
        adapter.io.inReq.bits.address := bundle.addr
        adapter.io.inReq.bits.source := bundle.tag
-        adapter.io.inReq.bits.size := 5.U
+        adapter.io.inReq.bits.size := 5.U // 256 bits
        adapter.io.inReq.bits.opcode := TLMessages.Get
        adapter.io.inReq.bits.mask := x"ffffffff".U
        adapter.io.inResp.ready := bundle.dReady
@@ -774,6 +782,8 @@ class RadianceTileModuleImp(outer: RadianceTile)
      core.io.tc_d_valid := Cat(adapters.last.io.inResp.valid, adapters.head.io.inResp.valid)
      core.io.tc_d_bits_data := Cat(adapters.last.io.inResp.bits.data, adapters.head.io.inResp.bits.data)
      core.io.tc_d_bits_tag := Cat(adapters.last.io.inResp.bits.source, adapters.head.io.inResp.bits.source)
+      require(core.io.tc_d_bits_data.widthOption.get == adapters.head.io.inResp.bits.data.widthOption.get * 2)
+      require(core.io.tc_d_bits_tag.widthOption.get == adapters.head.io.inResp.bits.source.widthOption.get * 2)
    }

    def connectBarrier = {
@@ -790,7 +800,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
      outer.barrierMasterNode.out(0)._1.resp.ready := true.B
    }

-    def connectAccelerator: Unit = {
+    def connectAccelerator = {
      outer.accMasterNode.out.head._1.cmd.bits := core.io.acc_write_out
      outer.accMasterNode.out.head._1.cmd.valid := core.io.acc_write_en
      core.io.acc_read_in := outer.accMasterNode.out.head._1.status
@@ -831,7 +841,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
    connectImem
    connectDmem
    connectSmem
-    connectTc
+    connectTensor
    connectBarrier
    connectAccelerator
  }
@@ -839,6 +849,9 @@ class RadianceTileModuleImp(outer: RadianceTile)
  // TODO: generalize for useVxCache
  if (!outer.radianceParams.useVxCache) {}

+  // connect io.start and io.finish of the fake TensorCoreDecoupled module
+  outer.tensor.module.io.start := false.B
+
  // // RoCC
  // if (outer.roccs.size > 0) {
  //   val (respArb, cmdRouter) = {
--- a/src/main/scala/radiance/tile/VortexCore.scala
+++ b/src/main/scala/radiance/tile/VortexCore.scala
@@ -137,7 +137,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
        "NUM_THREADS" -> tile.numLsuLanes
      )
    )
-    with HasBlackBoxResource {
+    with HasBlackBoxResource with HasBlackBoxPath {
  // addResource("/vsrc/vortex/hw/unit_tests/generic_queue/testbench.v")
  // addResource("/vsrc/vortex/hw/unit_tests/VX_divide_tb.v")
  // addResource("/vsrc/vortex/hw/syn/synopsys/models/memory/cln28hpm/rf2_256x19_wm0/rf2_256x19_wm0_rtl.v")
@@ -242,8 +242,6 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
  // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv")
  // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv")

-  addResource("/vsrc/vortex/hw/rtl/mem/VX_tc_bus_if.sv")
-
  addResource("/vsrc/vortex/hw/rtl/libs/VX_allocator.sv")
  // addResource("/vsrc/vortex/hw/rtl/libs/VX_avs_adapter.sv")
  // addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv")
@@ -407,7 +405,37 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)

  // tensor core
  addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv")
+  addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv")
+  addResource("/vsrc/vortex/hw/rtl/mem/VX_tc_bus_if.sv")
 //  addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh")
+  def addHopperTensorCore = {
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRawFN.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRecFN.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/DotProductPipe.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/FillBuffer_1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/FillBuffer.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/metadataTable_4x5.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/MulFullRawFN.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/occupancyTable_4x1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorCoreDecoupled_Anon_1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorCoreDecoupled_Anon.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue1_TensorMemTag.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue4_TensorMemRespWithTag.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/Queue5_TensorComputeTag.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/ram_4x261.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/ram_5x7.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundAnyRawFNToRecFN_ie8_is26_oe8_os24.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundAnyRawFNToRecFN_ie8_is47_oe8_os24.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/RoundRawFNToRecFN_e8_s24.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/SimpleTimer.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/SourceGenerator.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe_1.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe_2.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/StallingPipe.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorCoreDecoupled.sv")
+    addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/TensorDotProductUnit.sv")
+  }
+  // addHopperTensorCore
  addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv")
  addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv")
  addResource("/vsrc/vortex/hw/rtl/fpu/VX_tensor_dpu.sv")
--- a/src/main/scala/radiance/unittest/Configs.scala
+++ b/src/main/scala/radiance/unittest/Configs.scala
@@ -1,6 +1,6 @@
 // See LICENSE.SiFive for license details.

-package radiance.memory
+package radiance.unittest

 import chisel3._
 import org.chipsalliance.cde.config._
@@ -8,6 +8,8 @@ import freechips.rocketchip.subsystem.{BaseSubsystemConfig}
 import freechips.rocketchip.devices.tilelink._
 import freechips.rocketchip.tilelink._
 import freechips.rocketchip.util._
+import radiance.core.TensorCoreDecoupledTest
+import radiance.memory._
 import radiance.subsystem.WithSimtConfig
 import freechips.rocketchip.unittest._
 //import rocket.VortexFatBankTest
@@ -17,6 +19,16 @@ case object TestDurationMultiplier extends Field[Int]
 class WithTestDuration(x: Int) extends Config((site, here, up) => {
  case TestDurationMultiplier => x
 })
+
+class WithTensorUnitTests extends Config((site, _, _) => {
+  case UnitTests => (q: Parameters) => {
+    implicit val p = q
+    val timeout = 50000 * site(TestDurationMultiplier)
+    Seq(
+      Module(new TensorCoreDecoupledTest(timeout=timeout)),
+    ) }
+})
+
 class WithCoalescingUnitTests extends Config((site, _, _) => {
  case UnitTests => (q: Parameters) => {
    implicit val p = q
@@ -52,12 +64,34 @@ class WithCoalescingUnitSynthesisDummy(nLanes: Int) extends Config((site, _, _)
    ) }
 })

-class CoalescingUnitTestConfig extends Config(new WithCoalescingUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nMemLanes=4) ++ new BaseSubsystemConfig)
+class TensorUnitTestConfig extends Config(
+  new WithTensorUnitTests ++
+  new WithTestDuration(10) ++
+  new BaseSubsystemConfig)
+
+class CoalescingUnitTestConfig extends Config(
+  new WithCoalescingUnitTests ++
+  new WithTestDuration(10) ++
+  new WithSimtConfig(nMemLanes=4) ++
+  new BaseSubsystemConfig)
+
 //class VortexFatBankUnitTestConfig extends Config(new WithVortexFatBankUnitTests ++ new WithTestDuration(10) ++ new WithSimtConfig(nLanes=4) ++ new BaseSubsystemConfig)

 // Dummy configs of various sizes for synthesis
-class CoalescingSynthesisDummyLane4Config extends Config(new WithCoalescingUnitSynthesisDummy(4) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
-class CoalescingSynthesisDummyLane8Config extends Config(new WithCoalescingUnitSynthesisDummy(8) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
-class CoalescingSynthesisDummyLane16Config extends Config(new WithCoalescingUnitSynthesisDummy(16) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
-class CoalescingSynthesisDummyLane32Config extends Config(new WithCoalescingUnitSynthesisDummy(32) ++ new WithTestDuration(10) ++ new BaseSubsystemConfig)
+class CoalescingSynthesisDummyLane4Config extends Config(
+  new WithCoalescingUnitSynthesisDummy(4) ++
+  new WithTestDuration(10) ++
+  new BaseSubsystemConfig)
+class CoalescingSynthesisDummyLane8Config extends Config(
+  new WithCoalescingUnitSynthesisDummy(8) ++
+  new WithTestDuration(10) ++
+  new BaseSubsystemConfig)
+class CoalescingSynthesisDummyLane16Config extends Config(
+  new WithCoalescingUnitSynthesisDummy(16) ++
+  new WithTestDuration(10) ++
+  new BaseSubsystemConfig)
+class CoalescingSynthesisDummyLane32Config extends Config(
+  new WithCoalescingUnitSynthesisDummy(32) ++
+  new WithTestDuration(10) ++
+  new BaseSubsystemConfig)

--- a/src/test/scala/radiance/TensorCoreDecoupledTest.scala
+++ b/src/test/scala/radiance/TensorCoreDecoupledTest.scala
@@ -0,0 +1,26 @@
+package radiance.core
+
+import chisel3._
+import chisel3.util._
+import chiseltest._
+import org.scalatest.flatspec.AnyFlatSpec
+
+class TensorCoreDecoupledTest extends AnyFlatSpec with ChiselScalatestTester {
+  behavior of "TensorCoreDecoupled"
+
+  it should "do the right thing" in {
+    test(new TensorCoreDecoupled(8, 8, tilingParams = TensorTilingParams()))
+      { c =>
+        c.io.initiate.valid.poke(true.B)
+        c.io.initiate.bits.wid.poke(0.U)
+
+        c.io.respA.valid.poke(false.B)
+        c.io.respA.bits.data.poke(0.U)
+        c.io.respB.valid.poke(false.B)
+        c.io.respB.bits.data.poke(0.U)
+
+        c.clock.step()
+        c.io.writeback.valid.expect(true.B)
+      }
+  }
+}