Add Blackwell tensor core baseline plumbing

2026-04-25 10:15:31 +08:00
parent 4a0b1c05cd
commit 136cf70a58
7 changed files with 528 additions and 7 deletions
--- a/radiance.mk
+++ b/radiance.mk
@@ -23,6 +23,9 @@ endif
 ifeq ($(shell echo $(CONFIG) | grep -E "HopperConfig$$"),$(CONFIG))
    EXTRA_SIM_PREPROC_DEFINES += +define+NUM_CORES=4 +define+EXT_T_HOPPER
 endif
+ifeq ($(shell echo $(CONFIG) | grep -E "BlackwellConfig$$"),$(CONFIG))
+    EXTRA_SIM_PREPROC_DEFINES += +define+NUM_CORES=4 +define+EXT_T_BLACKWELL
+endif
 ifeq ($(shell echo $(CONFIG) | grep -E "FlashConfig$$"),$(CONFIG))
    EXTRA_SIM_PREPROC_DEFINES += +define+NUM_CORES=4
 endif
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
--- a/src/main/scala/radiance/core/TensorCoreBlackwell.scala
+++ b/src/main/scala/radiance/core/TensorCoreBlackwell.scala
@@ -0,0 +1,238 @@
+// See LICENSE.SiFive for license details.
+// See LICENSE.Berkeley for license details.
+
+package radiance.core
+
+import chisel3._
+import chisel3.util._
+
+class TensorCoreBlackwell(
+    val numWarps: Int,
+    val numLanes: Int,
+    val half: Boolean,
+    val numSourceIds: Int = 16,
+    val numFPRegs: Int = 32
+) extends Module {
+  val numWarpBits = log2Ceil(numWarps)
+  val sourceWidth = log2Ceil(numSourceIds)
+  val laneWidth = 4 * 8
+  val memWidth = numLanes * laneWidth
+  val numFPRegBits = log2Ceil(numFPRegs)
+  val addressWidth = 32
+  val maskWidth = memWidth / 8
+
+  object Ops {
+    val bwgmma :: bwgmmaWait :: tcgen05Cp :: tcgen05CpWait :: tcgen05Ld :: tcgen05St :: Nil = Enum(6)
+  }
+
+  class TensorMemReq(
+    sourceWidth: Int,
+    dataWidth: Int
+  ) extends Bundle {
+    val rw = Bool()
+    val byteen = UInt((dataWidth / 8).W)
+    val source = UInt(sourceWidth.W)
+    val address = UInt(addressWidth.W)
+    val data = UInt(dataWidth.W)
+  }
+
+  class TensorMemResp(
+    sourceWidth: Int,
+    dataWidth: Int
+  ) extends Bundle {
+    val source = UInt(sourceWidth.W)
+    val data = UInt(dataWidth.W)
+  }
+
+  val io = IO(new Bundle {
+    val initiate = Flipped(Decoupled(new Bundle {
+      val op = UInt(3.W)
+      val wid = UInt(numWarpBits.W)
+      val rd = UInt(numFPRegBits.W)
+      val addressA = UInt(addressWidth.W)
+      val addressB = UInt(addressWidth.W)
+    }))
+    val writeback = Decoupled(new Bundle {
+      val last = Bool()
+      val wid = UInt(numWarpBits.W)
+      val rd = UInt(numFPRegBits.W)
+      val data = Vec(numLanes, UInt(laneWidth.W))
+    })
+    val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, memWidth)))
+    val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, memWidth)))
+    val respC = Input(UInt(memWidth.W))
+    val reqA = Decoupled(new TensorMemReq(sourceWidth, memWidth))
+    val reqB = Decoupled(new TensorMemReq(sourceWidth, memWidth))
+    val reqC = Output(Valid(UInt(numFPRegBits.W)))
+  })
+
+  object State extends ChiselEnum {
+    val idle, bwReq, bwResp, cpRead, cpWrite, ldReq, stReq, waitWb = Value
+  }
+  val state = RegInit(State.idle)
+
+  val opReg = RegInit(0.U(3.W))
+  val widReg = RegInit(0.U(numWarpBits.W))
+  val rdReg = RegInit(0.U(numFPRegBits.W))
+  val addrAReg = RegInit(0.U(addressWidth.W))
+  val addrBReg = RegInit(0.U(addressWidth.W))
+  val aDataReg = Reg(UInt(memWidth.W))
+  val bDataReg = Reg(UInt(memWidth.W))
+  val haveA = RegInit(false.B)
+  val haveB = RegInit(false.B)
+  val sourceCounter = RegInit(0.U(sourceWidth.W))
+
+  private def bumpSource(): Unit = {
+    sourceCounter := sourceCounter + 1.U
+  }
+
+  val reqA = Wire(Decoupled(new TensorMemReq(sourceWidth, memWidth)))
+  val reqB = Wire(Decoupled(new TensorMemReq(sourceWidth, memWidth)))
+  reqA.valid := false.B
+  reqA.bits := 0.U.asTypeOf(reqA.bits)
+  reqB.valid := false.B
+  reqB.bits := 0.U.asTypeOf(reqB.bits)
+  io.reqA <> reqA
+  io.reqB <> reqB
+
+  val wbValid = RegInit(false.B)
+  val wbData = Reg(Vec(numLanes, UInt(laneWidth.W)))
+  io.writeback.valid := wbValid
+  io.writeback.bits.last := true.B
+  io.writeback.bits.wid := widReg
+  io.writeback.bits.rd := rdReg
+  io.writeback.bits.data := wbData
+
+  io.reqC.valid := false.B
+  io.reqC.bits := rdReg
+
+  io.respA.ready := false.B
+  io.respB.ready := false.B
+  io.initiate.ready := state === State.idle && !wbValid
+
+  when(io.writeback.fire) {
+    wbValid := false.B
+  }
+
+  when(io.initiate.fire) {
+    opReg := io.initiate.bits.op
+    widReg := io.initiate.bits.wid
+    rdReg := io.initiate.bits.rd
+    addrAReg := io.initiate.bits.addressA
+    addrBReg := io.initiate.bits.addressB
+    haveA := false.B
+    haveB := false.B
+    switch(io.initiate.bits.op) {
+      is(Ops.bwgmma) { state := State.bwReq }
+      is(Ops.tcgen05Cp) { state := State.cpRead }
+      is(Ops.tcgen05Ld) { state := State.ldReq }
+      is(Ops.tcgen05St) { state := State.stReq }
+      is(Ops.bwgmmaWait) { state := State.idle }
+      is(Ops.tcgen05CpWait) { state := State.idle }
+    }
+  }
+
+  when(state === State.bwReq) {
+    reqA.valid := true.B
+    reqA.bits.rw := false.B
+    reqA.bits.byteen := Fill(maskWidth, 1.U(1.W))
+    reqA.bits.address := addrAReg
+    reqA.bits.source := sourceCounter
+
+    reqB.valid := true.B
+    reqB.bits.rw := false.B
+    reqB.bits.byteen := Fill(maskWidth, 1.U(1.W))
+    reqB.bits.address := addrBReg
+    reqB.bits.source := sourceCounter
+
+    io.reqC.valid := true.B
+    when(reqA.fire && reqB.fire) {
+      bumpSource()
+      state := State.bwResp
+    }
+  }
+
+  when(state === State.bwResp) {
+    io.respA.ready := true.B
+    io.respB.ready := true.B
+    when(io.respA.fire) {
+      aDataReg := io.respA.bits.data
+      haveA := true.B
+    }
+    when(io.respB.fire) {
+      bDataReg := io.respB.bits.data
+      haveB := true.B
+    }
+    when(haveA && haveB) {
+      val cWords = io.respC.asTypeOf(Vec(numLanes, UInt(laneWidth.W)))
+      val aWords = aDataReg.asTypeOf(Vec(numLanes, UInt(laneWidth.W)))
+      val bWords = bDataReg.asTypeOf(Vec(numLanes, UInt(laneWidth.W)))
+      for (i <- 0 until numLanes) {
+        wbData(i) := aWords(i) + bWords(i) + cWords(i)
+      }
+      wbValid := true.B
+      state := State.idle
+    }
+  }
+
+  when(state === State.cpRead) {
+    reqB.valid := true.B
+    reqB.bits.rw := false.B
+    reqB.bits.byteen := Fill(maskWidth, 1.U(1.W))
+    reqB.bits.address := addrBReg
+    reqB.bits.source := sourceCounter
+    when(reqB.fire) {
+      bumpSource()
+      state := State.cpWrite
+    }
+  }
+
+  when(state === State.cpWrite) {
+    io.respB.ready := reqA.ready
+    reqA.valid := io.respB.valid
+    reqA.bits.rw := true.B
+    reqA.bits.byteen := Fill(maskWidth, 1.U(1.W))
+    reqA.bits.address := addrAReg
+    reqA.bits.source := sourceCounter
+    reqA.bits.data := io.respB.bits.data
+    when(reqA.fire) {
+      bumpSource()
+      state := State.idle
+    }
+  }
+
+  when(state === State.ldReq) {
+    reqA.valid := true.B
+    reqA.bits.rw := false.B
+    reqA.bits.byteen := Fill(maskWidth, 1.U(1.W))
+    reqA.bits.address := addrAReg
+    reqA.bits.source := sourceCounter
+    when(reqA.fire) {
+      bumpSource()
+      state := State.waitWb
+    }
+  }
+
+  when(state === State.waitWb && opReg === Ops.tcgen05Ld) {
+    io.respA.ready := !wbValid
+    when(io.respA.fire) {
+      wbData := io.respA.bits.data.asTypeOf(Vec(numLanes, UInt(laneWidth.W)))
+      wbValid := true.B
+      state := State.idle
+    }
+  }
+
+  when(state === State.stReq) {
+    io.reqC.valid := true.B
+    reqA.valid := true.B
+    reqA.bits.rw := true.B
+    reqA.bits.byteen := Fill(maskWidth, 1.U(1.W))
+    reqA.bits.address := addrAReg
+    reqA.bits.source := sourceCounter
+    reqA.bits.data := io.respC
+    when(reqA.fire) {
+      bumpSource()
+      state := State.idle
+    }
+  }
+}
--- a/src/main/scala/radiance/subsystem/Configs.scala
+++ b/src/main/scala/radiance/subsystem/Configs.scala
@@ -50,6 +50,7 @@ class WithRadianceCores(
  crossing: RocketCrossingParams,
  tensorCoreFP16: Boolean,
  tensorCoreDecoupled: Boolean,
+  tensorCoreBlackwell: Boolean,
  useVxCache: Boolean
 ) extends Config((site, _, up) => {
  case TilesLocated(`location`) => {
@@ -59,7 +60,8 @@ class WithRadianceCores(
    val vortex = RadianceTileParams(
      core = VortexCoreParams(
        tensorCoreFP16 = tensorCoreFP16,
-        tensorCoreDecoupled = tensorCoreDecoupled
+        tensorCoreDecoupled = tensorCoreDecoupled,
+        tensorCoreBlackwell = tensorCoreBlackwell
      ),
      btb = None,
      useVxCache = useVxCache,
@@ -96,6 +98,7 @@ class WithRadianceCores(
  // constructor override that omits `crossing`
  def this(n: Int, location: HierarchicalLocation = InSubsystem,
    tensorCoreFP16: Boolean = false, tensorCoreDecoupled: Boolean = false,
+    tensorCoreBlackwell: Boolean = false,
    useVxCache: Boolean = false)
  = this(n, location, RocketCrossingParams(
    master = HierarchicalElementMasterPortParams.locationDefault(location),
@@ -104,9 +107,23 @@ class WithRadianceCores(
      case InSubsystem => CBUS
      case InCluster(clusterId) => CCBUS(clusterId)
    }
-  ), tensorCoreFP16, tensorCoreDecoupled, useVxCache)
+  ), tensorCoreFP16, tensorCoreDecoupled, tensorCoreBlackwell, useVxCache)
 }

+class WithBlackwellTensorCore(location: HierarchicalLocation = InSubsystem) extends Config((site, _, up) => {
+  case TilesLocated(`location`) =>
+    up(TilesLocated(`location`)).map {
+      case r: RadianceTileAttachParams =>
+        r.copy(tileParams = r.tileParams.copy(
+          core = r.tileParams.core.copy(
+            tensorCoreBlackwell = true,
+            tensorCoreDecoupled = false
+          )
+        ))
+      case other => other
+    }
+})
+
 class WithEmulatorCores(
  n: Int,
  useVxCache: Boolean
--- a/src/main/scala/radiance/tile/RadianceTile.scala
+++ b/src/main/scala/radiance/tile/RadianceTile.scala
@@ -101,6 +101,7 @@ case class VortexCoreParams(
  fpu: Option[FPUParams] = None,
  tensorCoreFP16: Boolean = false, // FP16 if true, FP32 if false
  tensorCoreDecoupled: Boolean = false, // hopper-style SMEM operand decoupling
+  tensorCoreBlackwell: Boolean = false, // blackwell-style TMEM + SMEM tensor core
  debugROB: Boolean = false, // if enabled, uses a C++ debug ROB to generate trace-with-wdata
  haveCease: Boolean = true, // non-standard CEASE instruction
  haveSimTimeout: Boolean = true // add plusarg for simulation timeout
@@ -152,6 +153,10 @@ class RadianceTile private (
    p(SIMTCoreKey).isDefined,
    "SIMTCoreKey not defined; make sure to use WithSimtConfig when using RadianceTile"
  )
+  require(
+    !(radianceParams.core.tensorCoreDecoupled && radianceParams.core.tensorCoreBlackwell),
+    "tensorCoreDecoupled and tensorCoreBlackwell are mutually exclusive"
+  )

  // NOTE: when changing these, remember to change +define+NUM_CORES/THREADS/WARPS in
  // radiance.mk as well!
@@ -280,7 +285,9 @@ class RadianceTile private (
  }

  val tcSmemSize = 32
-  val tcSmemNodes = Seq.tabulate(if (radianceParams.core.tensorCoreDecoupled) 2 else 0) { i =>
+  val tensorUsesAsyncMem = radianceParams.core.tensorCoreDecoupled || radianceParams.core.tensorCoreBlackwell
+  val tcSmemNodeCount = if (radianceParams.core.tensorCoreDecoupled) 2 else if (radianceParams.core.tensorCoreBlackwell) 1 else 0
+  val tcSmemNodes = Seq.tabulate(tcSmemNodeCount) { i =>
    TLClientNode(Seq(TLMasterPortParameters.v2(
      masters = Seq(TLMasterParameters.v2(
        name = s"rad_tc_${radianceParams.coreId}_$i",
@@ -294,6 +301,42 @@ class RadianceTile private (
    )))
  }

+  val tmemNodes = Seq.tabulate(if (radianceParams.core.tensorCoreBlackwell) 2 else 0) { i =>
+    TLClientNode(Seq(TLMasterPortParameters.v2(
+      masters = Seq(TLMasterParameters.v2(
+        name = s"rad_tmem_${radianceParams.coreId}_$i",
+        sourceId = IdRange(0, 1 << smemSourceWidth),
+        supports = TLSlaveToMasterTransferSizes(
+          probe = TransferSizes(1, tcSmemSize),
+          get = TransferSizes(1, tcSmemSize),
+          putFull = TransferSizes(1, tcSmemSize),
+          putPartial = TransferSizes(1, tcSmemSize),
+        ),
+        requestFifo = true
+      ))
+    )))
+  }
+
+  val tmemNode = if (radianceParams.core.tensorCoreBlackwell) {
+    Some(LazyModule(new TLRAM(
+      address = AddressSet(0x0, 0x3fff),
+      beatBytes = tcSmemSize
+    )))
+  } else {
+    None
+  }
+  val tmemXbar = if (radianceParams.core.tensorCoreBlackwell) {
+    Some(LazyModule(new TLXbar))
+  } else {
+    None
+  }
+  (tmemNode, tmemXbar) match {
+    case (Some(tmem), Some(xbar)) =>
+      tmem.node :=* xbar.node
+      tmemNodes.foreach { node => xbar.node :=* node }
+    case _ =>
+  }
+
  // combine outgoing per-lane dmemNode into 1 idenity node
  //
  // NOTE: We need TLWidthWidget here because there might be a data width
@@ -743,12 +786,18 @@ class RadianceTileModuleImp(outer: RadianceTile)
        val tcb0 = new {
          val addr = core.io.tc_a_bits_address(31, 0)
          val tag = core.io.tc_a_bits_tag(outer.tensorTagWidth - 1, 0)
+          val write = core.io.tc_a_bits_write(0)
+          val mask = core.io.tc_a_bits_mask(31, 0)
+          val data = core.io.tc_a_bits_data(255, 0)
          val aValid = core.io.tc_a_valid(0)
          val dReady = core.io.tc_d_ready(0)
        }
        val tcb1 = new {
          val addr = core.io.tc_a_bits_address(63, 32)
          val tag = core.io.tc_a_bits_tag(4 + outer.tensorTagWidth - 1, 4)
+          val write = core.io.tc_a_bits_write(1)
+          val mask = core.io.tc_a_bits_mask(63, 32)
+          val data = core.io.tc_a_bits_data(511, 256)
          val aValid = core.io.tc_a_valid(1)
          val dReady = core.io.tc_d_ready(1)
        }
@@ -770,8 +819,9 @@ class RadianceTileModuleImp(outer: RadianceTile)
          adapter.io.inReq.bits.address := bundle.addr
          adapter.io.inReq.bits.source := bundle.tag
          adapter.io.inReq.bits.size := 5.U // 256 bits
-          adapter.io.inReq.bits.opcode := TLMessages.Get
-          adapter.io.inReq.bits.mask := x"ffffffff".U
+          adapter.io.inReq.bits.opcode := Mux(bundle.write.asBool, TLMessages.PutFullData, TLMessages.Get)
+          adapter.io.inReq.bits.mask := bundle.mask
+          adapter.io.inReq.bits.data := bundle.data
          adapter.io.inResp.ready := bundle.dReady

          client._1.a <> adapter.io.outReq
@@ -792,6 +842,71 @@ class RadianceTileModuleImp(outer: RadianceTile)
      }
    }

+    def connectTensorBlackwell = {
+      if (outer.radianceParams.core.tensorCoreBlackwell) {
+        require(outer.tmemNodes.nonEmpty)
+        require(outer.tcSmemNodes.nonEmpty)
+
+        val bundles = Seq(
+          (outer.tmemNodes.head, new {
+            val addr = core.io.tc_a_bits_address(31, 0)
+            val tag = core.io.tc_a_bits_tag(outer.tensorTagWidth - 1, 0)
+            val write = core.io.tc_a_bits_write(0)
+            val mask = core.io.tc_a_bits_mask(31, 0)
+            val data = core.io.tc_a_bits_data(255, 0)
+            val aValid = core.io.tc_a_valid(0)
+            val dReady = core.io.tc_d_ready(0)
+          }),
+          (outer.tcSmemNodes.head, new {
+            val addr = core.io.tc_a_bits_address(63, 32)
+            val tag = core.io.tc_a_bits_tag(4 + outer.tensorTagWidth - 1, 4)
+            val write = core.io.tc_a_bits_write(1)
+            val mask = core.io.tc_a_bits_mask(63, 32)
+            val data = core.io.tc_a_bits_data(511, 256)
+            val aValid = core.io.tc_a_valid(1)
+            val dReady = core.io.tc_d_ready(1)
+          })
+        )
+
+        val adapters = bundles.map { case (node, bundle) =>
+          val client = node.out.head
+          val adapter = Module(
+            new VortexTLAdapter(
+              outer.smemSourceWidth,
+              new VortexBundleA(tagWidth = outer.tensorTagWidth, dataWidth = 32 * 8),
+              new VortexBundleD(tagWidth = outer.tensorTagWidth, dataWidth = 32 * 8),
+              client
+            )
+          )
+          require(adapter.io.inReq.bits.source.widthOption.get == bundle.tag.widthOption.get)
+          require(adapter.io.inReq.bits.address.widthOption.get == bundle.addr.widthOption.get)
+          adapter.io.inReq.bits <> DontCare
+          adapter.io.inReq.valid := bundle.aValid
+          adapter.io.inReq.bits.address := bundle.addr
+          adapter.io.inReq.bits.source := bundle.tag
+          adapter.io.inReq.bits.size := 5.U
+          adapter.io.inReq.bits.opcode := Mux(bundle.write.asBool, TLMessages.PutFullData, TLMessages.Get)
+          adapter.io.inReq.bits.mask := bundle.mask
+          adapter.io.inReq.bits.data := bundle.data
+          adapter.io.inResp.ready := bundle.dReady
+
+          client._1.a <> adapter.io.outReq
+          adapter.io.outResp <> client._1.d
+          adapter
+        }
+
+        core.io.tc_a_ready := Cat(adapters.last.io.inReq.ready, adapters.head.io.inReq.ready)
+        core.io.tc_d_valid := Cat(adapters.last.io.inResp.valid, adapters.head.io.inResp.valid)
+        core.io.tc_d_bits_data := Cat(adapters.last.io.inResp.bits.data, adapters.head.io.inResp.bits.data)
+        core.io.tc_d_bits_tag := Cat(adapters.last.io.inResp.bits.source, adapters.head.io.inResp.bits.source)
+      } else {
+        core.io.tc_a_ready := false.B
+        core.io.tc_d_valid := false.B
+        core.io.tc_d_bits_data := DontCare
+        core.io.tc_d_bits_tag := DontCare
+      }
+    }
+
    def connectBarrier = {
      require(outer.barrierMasterNode.out.length == 1)
      // FIXME: bits not flattened
@@ -847,7 +962,11 @@ class RadianceTileModuleImp(outer: RadianceTile)
    connectImem
    connectDmem
    connectSmem
+    if (outer.radianceParams.core.tensorCoreBlackwell) {
+      connectTensorBlackwell
+    } else {
      connectTensor
+    }
    connectBarrier
    connectAccelerator
  }
@@ -874,6 +993,20 @@ class RadianceTileModuleImp(outer: RadianceTile)
    tensor.io.reqA.ready := false.B
    tensor.io.reqB.ready := false.B
    tensor.io.writeback.ready := false.B
+  } else if (outer.radianceParams.core.tensorCoreBlackwell) {
+    val tensorNumSourceIds = (1 << outer.tensorTagWidth)
+    val tensor = Module(new radiance.core.TensorCoreBlackwell(
+      8, 8, half = true, tensorNumSourceIds))
+    tensor.io.initiate.valid := false.B
+    tensor.io.initiate.bits := DontCare
+    tensor.io.respA.valid := false.B
+    tensor.io.respA.bits := DontCare
+    tensor.io.respB.valid := false.B
+    tensor.io.respB.bits := DontCare
+    tensor.io.respC := DontCare
+    tensor.io.reqA.ready := false.B
+    tensor.io.reqB.ready := false.B
+    tensor.io.writeback.ready := false.B
  } else {
    if (outer.radianceParams.core.tensorCoreFP16) {
      val dpu = Module(new radiance.core.TensorDotProductUnit(4, half = true))
--- a/src/main/scala/radiance/tile/VortexCore.scala
+++ b/src/main/scala/radiance/tile/VortexCore.scala
@@ -91,8 +91,11 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl
  val smem_d_ready = Output(UInt((tile.numLsuLanes * 1).W))

  val tc_a_valid = Output(UInt(2.W))
+  val tc_a_bits_write = Output(UInt(2.W))
  val tc_a_bits_address = Output(UInt((2 * 32).W))
  val tc_a_bits_tag = Output(UInt((2 * 4).W))
+  val tc_a_bits_mask = Output(UInt((2 * 32).W))
+  val tc_a_bits_data = Output(UInt((2 * 32 * 8).W))
  val tc_a_ready = Input(UInt(2.W))
  val tc_d_valid = Input(UInt(2.W))
  val tc_d_bits_data = Input(UInt((2 * 32 * 8).W))
@@ -411,6 +414,8 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
  // hopper-style SMEM operand decoupling
  if (tile.radianceParams.core.tensorCoreDecoupled) {
    addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_hopper_core.sv")
+  } else if (tile.radianceParams.core.tensorCoreBlackwell) {
+    addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_blackwell_core.sv")
  //  addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh")
    def addHopperTensorCore = {
      addPath("/scratch/hansung/chipyard/sims/vcs/generated-src/chipyard.unittest.TestHarness.TensorUnitTestConfig/gen-collateral/AddRawFN.sv")
--- a/src/test/scala/radiance/TensorCoreBlackwellTest.scala
+++ b/src/test/scala/radiance/TensorCoreBlackwellTest.scala
@@ -0,0 +1,125 @@
+package radiance.core
+
+import chisel3._
+import chiseltest._
+import org.scalatest.flatspec.AnyFlatSpec
+
+class TensorCoreBlackwellTest extends AnyFlatSpec with ChiselScalatestTester {
+  behavior of "TensorCoreBlackwell"
+
+  private def idleIO(c: TensorCoreBlackwell): Unit = {
+    c.io.initiate.valid.poke(false.B)
+    c.io.respA.valid.poke(false.B)
+    c.io.respB.valid.poke(false.B)
+    c.io.respA.bits.source.poke(0.U)
+    c.io.respB.bits.source.poke(0.U)
+    c.io.respA.bits.data.poke(0.U)
+    c.io.respB.bits.data.poke(0.U)
+    c.io.respC.poke(0.U)
+    c.io.writeback.ready.poke(false.B)
+  }
+
+  it should "run a minimal BWGMMA path" in {
+    test(new TensorCoreBlackwell(8, 8, numSourceIds = 4, half = true)) { c =>
+      idleIO(c)
+
+      c.io.initiate.valid.poke(true.B)
+      c.io.initiate.bits.op.poke(0.U)
+      c.io.initiate.bits.wid.poke(1.U)
+      c.io.initiate.bits.rd.poke(3.U)
+      c.io.initiate.bits.addressA.poke(0x40.U)
+      c.io.initiate.bits.addressB.poke(0x80.U)
+      c.io.reqA.ready.poke(true.B)
+      c.io.reqB.ready.poke(true.B)
+      c.io.respC.poke("h0000000800000007000000060000000500000004000000030000000200000001".U)
+      c.clock.step()
+
+      c.io.initiate.valid.poke(false.B)
+      c.io.reqA.valid.expect(true.B)
+      c.io.reqB.valid.expect(true.B)
+      c.clock.step()
+
+      c.io.respA.valid.poke(true.B)
+      c.io.respB.valid.poke(true.B)
+      c.io.respA.bits.data.poke("h0000000800000007000000060000000500000004000000030000000200000001".U)
+      c.io.respB.bits.data.poke("h000000100000000f0000000e0000000d0000000c0000000b0000000a00000009".U)
+      c.clock.step()
+
+      c.io.respA.valid.poke(false.B)
+      c.io.respB.valid.poke(false.B)
+      c.clock.step()
+      c.clock.step()
+      c.io.writeback.valid.expect(true.B)
+      c.io.writeback.bits.rd.expect(3.U)
+      c.io.writeback.bits.wid.expect(1.U)
+      c.io.writeback.ready.poke(true.B)
+      c.clock.step()
+    }
+  }
+
+  it should "copy from SMEM to TMEM on TCGEN05_CP" in {
+    test(new TensorCoreBlackwell(8, 8, numSourceIds = 4, half = true)) { c =>
+      idleIO(c)
+
+      c.io.initiate.valid.poke(true.B)
+      c.io.initiate.bits.op.poke(2.U)
+      c.io.initiate.bits.wid.poke(0.U)
+      c.io.initiate.bits.rd.poke(0.U)
+      c.io.initiate.bits.addressA.poke(0x100.U)
+      c.io.initiate.bits.addressB.poke(0x200.U)
+      c.io.reqB.ready.poke(true.B)
+      c.clock.step()
+
+      c.io.initiate.valid.poke(false.B)
+      c.io.reqB.valid.expect(true.B)
+      c.io.respB.valid.poke(true.B)
+      c.io.respB.bits.data.poke("hdeadbeef".U)
+      c.io.reqA.ready.poke(true.B)
+      c.clock.step()
+      c.io.reqA.valid.expect(true.B)
+      c.io.reqA.bits.rw.expect(true.B)
+      c.io.reqA.bits.address.expect(0x100.U)
+    }
+  }
+
+  it should "load and store fragments through TMEM" in {
+    test(new TensorCoreBlackwell(8, 8, numSourceIds = 4, half = true)) { c =>
+      idleIO(c)
+
+      c.io.initiate.valid.poke(true.B)
+      c.io.initiate.bits.op.poke(4.U)
+      c.io.initiate.bits.wid.poke(2.U)
+      c.io.initiate.bits.rd.poke(5.U)
+      c.io.initiate.bits.addressA.poke(0x300.U)
+      c.io.initiate.bits.addressB.poke(0.U)
+      c.io.reqA.ready.poke(true.B)
+      c.clock.step()
+
+      c.io.initiate.valid.poke(false.B)
+      c.clock.step()
+      c.io.respA.valid.poke(true.B)
+      c.io.respA.bits.data.poke("h1234".U)
+      c.clock.step()
+      c.io.respA.valid.poke(false.B)
+      c.clock.step()
+      c.io.writeback.valid.expect(true.B)
+      c.io.writeback.bits.rd.expect(5.U)
+      c.io.writeback.ready.poke(true.B)
+      c.clock.step()
+
+      idleIO(c)
+      c.io.initiate.valid.poke(true.B)
+      c.io.initiate.bits.op.poke(5.U)
+      c.io.initiate.bits.wid.poke(2.U)
+      c.io.initiate.bits.rd.poke(6.U)
+      c.io.initiate.bits.addressA.poke(0x340.U)
+      c.io.initiate.bits.addressB.poke(0.U)
+      c.io.reqA.ready.poke(true.B)
+      c.io.respC.poke("habcd".U)
+      c.clock.step()
+      c.io.reqA.valid.expect(true.B)
+      c.io.reqA.bits.rw.expect(true.B)
+      c.io.reqA.bits.address.expect(0x340.U)
+    }
+  }
+}