From c6df484c0048abee36ce6ea33ad98287041e6a1a Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Sat, 5 Oct 2024 02:48:47 -0700 Subject: [PATCH 1/4] add tensor core read client --- src/main/scala/radiance/tile/RadianceTile.scala | 13 +++++++++++++ .../radiance/tile/VirgoSharedMemComponents.scala | 11 +++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index eefd491..821e154 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -274,6 +274,19 @@ class RadianceTile private ( ) } + val tcSmemSize = 32 + val tcSmemNodes = Seq(TLClientNode(Seq(TLMasterPortParameters.v2( + masters = Seq(TLMasterParameters.v2( + name = s"rad_tc_${radianceParams.coreId}", + sourceId = IdRange(0, 1 << smemSourceWidth), + supports = TLSlaveToMasterTransferSizes( + get = TransferSizes(1, tcSmemSize), + putFull = TransferSizes(1, tcSmemSize), + putPartial = TransferSizes(1, tcSmemSize) + ) + )) + )))) + // combine outgoing per-lane dmemNode into 1 idenity node // // NOTE: We need TLWidthWidget here because there might be a data width diff --git a/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala b/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala index a3fde96..c72fc7f 100644 --- a/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala +++ b/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala @@ -54,6 +54,7 @@ class VirgoSharedMemComponents( smemFanoutXbar.node } } + val tcNodeFanouts = radianceTiles.flatMap(_.tcSmemNodes).map(connectXbarName(_, Some("tc_fanout"))) val clBusClients: Seq[TLNode] = radianceSmemFanout val (uniformRNodes, uniformWNodes, nonuniformRNodes, nonuniformWNodes) = @@ -84,6 +85,12 @@ class VirgoSharedMemComponents( val spadSpWriteNodesSingleBank = distAndDuplicate(gemminis.map(_.spad.spad_writer.node), "ws") val spadSpWriteNodes = 
Seq.fill(smemBanks)(spadSpWriteNodesSingleBank) // executed only once + // tensor core read nodes + val tcDistNodes = Seq.fill(smemBanks)(tcNodeFanouts.map(connectOne(_, () => DistributorNode(smemWidth, wordSize)))) + val tcNodes = tcDistNodes.map { tcBank => + Seq.fill(smemSubbanks)(tcBank.map(connectXbarName(_, Some("tc_dist_fanout")))) + } // (banks, subbanks, tc client) + if (filterAligned) { val numLsuLanes = radianceTiles.head.numLsuLanes val numLaneDupes = Math.max(1, smemSubbanks / numLsuLanes) @@ -186,8 +193,8 @@ class VirgoSharedMemComponents( } - val uniformRNodes: Seq[Seq[Seq[TLNexusNode]]] = spadReadNodes.map { rb => - (rb zip fAligned.head).map { case (rw, fa) => rw ++ fa } + val uniformRNodes: Seq[Seq[Seq[TLNexusNode]]] = (spadReadNodes zip tcNodes).map { case (rb, tcrb) => + (rb lazyZip tcrb lazyZip fAligned.head).map { case (rw, tcrw, fa) => rw ++ tcrw ++ fa } } val uniformWNodes: Seq[Seq[Seq[TLNexusNode]]] = (spadWriteNodes zip spadSpWriteNodes).map { case (wb, wsb) => (wb lazyZip wsb lazyZip fAligned.last).map { From 0989d90dd25c51d8a7dc6b0aaac809868e39024d Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Mon, 7 Oct 2024 02:59:06 -0700 Subject: [PATCH 2/4] connect tc nodes and maybe fix distributor node --- src/main/resources/vsrc/vortex | 2 +- .../radiance/memory/DistributorNode.scala | 39 ++++++----- .../scala/radiance/tile/RadianceTile.scala | 67 +++++++++++++++---- .../tile/VirgoSharedMemComponents.scala | 8 ++- src/main/scala/radiance/tile/VortexCore.scala | 9 +++ 5 files changed, 93 insertions(+), 32 deletions(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index da54162..8bf7f39 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit da54162241da020807274bd4087844d379d8170e +Subproject commit 8bf7f39f04e6d3cbc47559fdd3cacca0febe9baa diff --git a/src/main/scala/radiance/memory/DistributorNode.scala b/src/main/scala/radiance/memory/DistributorNode.scala 
index 8f46af8..29ccebd 100644 --- a/src/main/scala/radiance/memory/DistributorNode.scala +++ b/src/main/scala/radiance/memory/DistributorNode.scala @@ -91,13 +91,15 @@ class DistributorNode(from: Int, to: Int)(implicit p: Parameters) extends LazyMo } def partialData: UInt = VecInit(mn.map(_.d).map(d => Mux(d.fire, d.bits.data, 0.U(d.bits.data.getWidth.W)))).asUInt - def partialValid: UInt = VecInit(mn.map(_.d.fire)).asUInt + def partialValid: UInt = VecInit(mn.map(_.d.valid)).asUInt + def partialFire: UInt = VecInit(mn.map(_.d.fire)).asUInt mn.map(_.d.ready).zip(arrived.asBools).foreach { case (r, a) => r := cn.d.ready && (!partialWait || !a) // if waiting for partial response, ready only if not arrived yet } // TODO: might need coverage test for this + cd := DontCare when (!partialWait) { cn.d.valid := false.B partialWait := false.B @@ -109,31 +111,36 @@ class DistributorNode(from: Int, to: Int)(implicit p: Parameters) extends LazyMo assert(cd.data === partialData, "sanity check") }.elsewhen (partialValid.orR) { // at least 1 valid: enter partial valid state, store partial data into regs - partialWait := true.B - arrived := partialValid + partialWait := cn.d.ready // if something fired, enter partial wait + arrived := partialFire cdReg.data := partialData - when (mn.head.d.valid) { setMetadata(cdReg, mn.head.d.bits) } + when (mn.head.d.fire) { setMetadata(cdReg, mn.head.d.bits) } } }.otherwise { cn.d.valid := false.B partialWait := true.B when ((arrived | partialValid).andR) { // all valids received now - when (mn.head.d.valid) { - setMetadata(cd, mn.head.d.bits) - }.otherwise { - cd := cdReg - } cn.d.valid := true.B - cd.data := cdReg.data | partialData - partialWait := false.B - cdReg := 0.U.asTypeOf(cdReg.cloneType) - arrived := 0.U + when (cn.d.ready) { + assert((arrived | partialFire).andR) + when (mn.head.d.valid) { + setMetadata(cd, mn.head.d.bits) + }.otherwise { + cd := cdReg + } + cd.data := cdReg.data | partialData + partialWait := false.B + cdReg := 
0.U.asTypeOf(cdReg.cloneType) + arrived := 0.U + } }.elsewhen (partialValid.orR) { // update partial data - arrived := arrived | partialValid - cdReg.data := cdReg.data | partialData - when (mn.head.d.valid) { setMetadata(cdReg, mn.head.d.bits) } + when (cn.d.ready) { + arrived := arrived | partialValid + cdReg.data := cdReg.data | partialData + when (mn.head.d.valid) { setMetadata(cdReg, mn.head.d.bits) } + } } } } diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index e6b48f3..8ab5984 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -11,6 +11,7 @@ import freechips.rocketchip.diplomacy._ import org.chipsalliance.diplomacy.lazymodule.LazyModule import freechips.rocketchip.prci.{ClockCrossingType, ClockSinkParameters, RationalCrossing} import freechips.rocketchip.regmapper.RegField +import freechips.rocketchip.resources.BigIntHexContext import freechips.rocketchip.rocket._ import freechips.rocketchip.subsystem.HierarchicalElementCrossingParamsLike import freechips.rocketchip.tile._ @@ -275,17 +276,18 @@ class RadianceTile private ( } val tcSmemSize = 32 - val tcSmemNodes = Seq(TLClientNode(Seq(TLMasterPortParameters.v2( - masters = Seq(TLMasterParameters.v2( - name = s"rad_tc_${radianceParams.coreId}", - sourceId = IdRange(0, 1 << smemSourceWidth), - supports = TLSlaveToMasterTransferSizes( - get = TransferSizes(1, tcSmemSize), - putFull = TransferSizes(1, tcSmemSize), - putPartial = TransferSizes(1, tcSmemSize) - ) - )) - )))) + val tcSmemNodes = Seq.tabulate(2) { i => + TLClientNode(Seq(TLMasterPortParameters.v2( + masters = Seq(TLMasterParameters.v2( + name = s"rad_tc_${radianceParams.coreId}_$i", + sourceId = IdRange(0, 1 << smemSourceWidth), + supports = TLSlaveToMasterTransferSizes( + probe = TransferSizes(1, tcSmemSize), + get = TransferSizes(1, tcSmemSize), + ) + )) + ))) + } // combine outgoing per-lane dmemNode into 1 idenity 
node // @@ -686,7 +688,7 @@ class RadianceTileModuleImp(outer: RadianceTile) outer.smemSourceWidth, new VortexBundleA(tagWidth = outer.smemTagWidth, dataWidth = 32), new VortexBundleD(tagWidth = outer.smemTagWidth, dataWidth = 32), - outer.smemNodes(0).out.head + outer.smemNodes.head.out.head ) ) } @@ -731,6 +733,46 @@ class RadianceTileModuleImp(outer: RadianceTile) } } + def connectTc { + val tcb0 = new { + val addr = core.io.tc_a_bits_address(31, 0) + val aValid = core.io.tc_a_valid(0) + val dReady = core.io.tc_d_ready(0) + } + val tcb1 = new { + val addr = core.io.tc_a_bits_address(63, 32) + val aValid = core.io.tc_a_valid(1) + val dReady = core.io.tc_d_ready(1) + } + val tcBundles = Seq(tcb0, tcb1) + val adapters = (outer.tcSmemNodes zip tcBundles).zipWithIndex.map { case ((node, bundle), i) => + val client = node.out.head + val adapter = Module( + new VortexTLAdapter( + outer.smemSourceWidth, + new VortexBundleA(tagWidth = 1, dataWidth = 32 * 8), + new VortexBundleD(tagWidth = 1, dataWidth = 32 * 8), + client + ) + ) + adapter.io.inReq.bits <> DontCare + adapter.io.inReq.valid := bundle.aValid + adapter.io.inReq.bits.address := bundle.addr + adapter.io.inReq.bits.source := i.U + adapter.io.inReq.bits.size := 5.U + adapter.io.inReq.bits.opcode := TLMessages.Get + adapter.io.inReq.bits.mask := x"ffffffff".U + adapter.io.inResp.ready := bundle.dReady + + client._1.a <> adapter.io.outReq + adapter.io.outResp <> client._1.d + adapter + } + core.io.tc_a_ready := Cat(adapters.last.io.inReq.ready, adapters.head.io.inReq.ready) + core.io.tc_d_valid := Cat(adapters.last.io.inResp.valid, adapters.head.io.inResp.valid) + core.io.tc_d_bits_data := Cat(adapters.last.io.inResp.bits.data, adapters.head.io.inResp.bits.data) + } + def connectBarrier = { require(outer.barrierMasterNode.out.length == 1) // FIXME: bits not flattened @@ -786,6 +828,7 @@ class RadianceTileModuleImp(outer: RadianceTile) connectImem connectDmem connectSmem + connectTc connectBarrier 
connectAccelerator } diff --git a/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala b/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala index c72fc7f..8ea5c6f 100644 --- a/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala +++ b/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala @@ -54,7 +54,9 @@ class VirgoSharedMemComponents( smemFanoutXbar.node } } - val tcNodeFanouts = radianceTiles.flatMap(_.tcSmemNodes).map(connectXbarName(_, Some("tc_fanout"))) + val tcNodeFanouts = radianceTiles.flatMap(_.tcSmemNodes) + .map(connectOne(_, () => TLBuffer(BufferParams(2, false, false), BufferParams(0)))) + .map(connectXbarName(_, Some("tc_fanout"))) val clBusClients: Seq[TLNode] = radianceSmemFanout val (uniformRNodes, uniformWNodes, nonuniformRNodes, nonuniformWNodes) = @@ -69,7 +71,7 @@ class VirgoSharedMemComponents( dist := node } val fanout = Seq.tabulate(spSubbanks) { w => - val buf = TLBuffer(BufferParams(1, false, true), BufferParams(0)) + val buf = TLBuffer(BufferParams(2, false, false), BufferParams(0)) buf := dist connectXbarName(buf, Some(s"spad_g${gemminiIdx}w${w}_fanout_$suffix")) } @@ -88,7 +90,7 @@ class VirgoSharedMemComponents( // tensor core read nodes val tcDistNodes = Seq.fill(smemBanks)(tcNodeFanouts.map(connectOne(_, () => DistributorNode(smemWidth, wordSize)))) val tcNodes = tcDistNodes.map { tcBank => - Seq.fill(smemSubbanks)(tcBank.map(connectXbarName(_, Some("tc_dist_fanout")))) + Seq.fill(smemSubbanks)(tcBank.map(connectOne(_, () => TLBuffer(BufferParams(2, false, false)))).map(connectXbarName(_, Some("tc_dist_fanout")))) } // (banks, subbanks, tc client) if (filterAligned) { diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index d202aaa..1409ddd 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -90,6 +90,13 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl 
val smem_d_bits_data = Input(UInt((tile.numLsuLanes * 32).W)) val smem_d_ready = Output(UInt((tile.numLsuLanes * 1).W)) + val tc_a_valid = Output(UInt(2.W)) + val tc_a_bits_address = Output(UInt((2 * 32).W)) + val tc_a_ready = Input(UInt(2.W)) + val tc_d_valid = Input(UInt(2.W)) + val tc_d_bits_data = Input(UInt((2 * 32 * 8).W)) + val tc_d_ready = Output(UInt(2.W)) + // FIXME: hardcoded val barrierIdBits = tile.barrierMasterNode.out(0)._2.barrierIdBits val coreIdBits = tile.barrierMasterNode.out(0)._2.numCoreBits @@ -233,6 +240,8 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_arb.sv") // addResource("/vsrc/vortex/hw/rtl/mem/VX_gbar_unit.sv") + addResource("/vsrc/vortex/hw/rtl/mem/VX_tc_bus_if.sv") + addResource("/vsrc/vortex/hw/rtl/libs/VX_allocator.sv") // addResource("/vsrc/vortex/hw/rtl/libs/VX_avs_adapter.sv") // addResource("/vsrc/vortex/hw/rtl/libs/VX_axi_adapter.sv") From 9e86007e9097ccac26efeda0ac918422ed5b4f2f Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Wed, 16 Oct 2024 16:20:58 -0700 Subject: [PATCH 3/4] add imp component to rad smem, add core serialized access, update 2p smem --- src/main/scala/radiance/memory/SyncMem.scala | 81 +++++++++ .../radiance/memory/XbarWithExtPolicy.scala | 25 ++- .../scala/radiance/subsystem/Configs.scala | 14 +- .../scala/radiance/tile/RadianceCluster.scala | 4 +- .../radiance/tile/RadianceSharedMem.scala | 30 ++-- .../tile/VirgoSharedMemComponents.scala | 163 +++++++++++------- 6 files changed, 225 insertions(+), 92 deletions(-) create mode 100644 src/main/scala/radiance/memory/SyncMem.scala diff --git a/src/main/scala/radiance/memory/SyncMem.scala b/src/main/scala/radiance/memory/SyncMem.scala new file mode 100644 index 0000000..f90ad7a --- /dev/null +++ b/src/main/scala/radiance/memory/SyncMem.scala @@ -0,0 +1,81 @@ +package radiance.memory +import chisel3._ +import chisel3.util._ +import midas.targetutils.SynthesizePrintf + +// modified from gemmini's two 
port sync mem +class TwoPortSyncMem[T <: Data](n: Int, t: T, maskedUnitWidth: Int = 8) extends Module { + val maskWidth = t.getWidth / maskedUnitWidth + val io = IO(new Bundle { + val waddr = Input(UInt((log2Ceil(n) max 1).W)) + val raddr = Input(UInt((log2Ceil(n) max 1).W)) + val wdata = Input(t) + val rdata = Output(t) + val wen = Input(Bool()) + val ren = Input(Bool()) + val mask = Input(UInt(maskWidth.W)) + }) + + when (io.wen && io.ren && io.raddr === io.waddr) { + SynthesizePrintf(printf("WARNING: read and write collided at address 0x%x\n", io.raddr)) + } + + val maskElem = UInt(maskedUnitWidth.W) + val memT = Vec(maskWidth, maskElem) + val mem = SyncReadMem(n, memT, SyncReadMem.WriteFirst) + + io.rdata := mem.read(io.raddr, io.ren).asTypeOf(t) + + when (io.wen) { + mem.write(io.waddr, io.wdata.asTypeOf(memT), io.mask.asBools) + } +} + +class TwoReadOneWriteSyncMem[T <: Data](n: Int, t: T, maskedUnitWidth: Int = 8) extends Module { + val maskWidth = t.getWidth / maskedUnitWidth + val io = IO(new Bundle { + val waddr = Input(UInt((log2Ceil(n) max 1).W)) + val raddr0 = Input(UInt((log2Ceil(n) max 1).W)) + val raddr1 = Input(UInt((log2Ceil(n) max 1).W)) + val wdata = Input(t) + val rdata0 = Output(t) + val rdata1 = Output(t) + val wen = Input(Bool()) + val ren0 = Input(Bool()) + val ren1 = Input(Bool()) + val mask = Input(UInt(maskWidth.W)) + }) + + when (io.wen && io.ren0 && io.raddr0 === io.waddr) { + SynthesizePrintf(printf("WARNING: read0 and write collided at address 0x%x\n", io.raddr0)) + } + when (io.wen && io.ren1 && io.raddr1 === io.waddr) { + SynthesizePrintf(printf("WARNING: read1 and write collided at address 0x%x\n", io.raddr1)) + } + + val maskElem = UInt(maskedUnitWidth.W) + val memT = Vec(maskWidth, maskElem) + val mem0 = SyncReadMem(n, memT, SyncReadMem.WriteFirst) + val mem1 = SyncReadMem(n, memT, SyncReadMem.WriteFirst) + + io.rdata0 := mem0.read(io.raddr0, io.ren0).asTypeOf(t) + io.rdata1 := mem1.read(io.raddr1, io.ren1).asTypeOf(t) + + when 
(io.wen) { + mem0.write(io.waddr, io.wdata.asTypeOf(memT), io.mask.asBools) + mem1.write(io.waddr, io.wdata.asTypeOf(memT), io.mask.asBools) + } +} + + +object TwoPortSyncMem { + def apply[T <: Data](n: Int, t: T, maskedUnitWidth: Int = 8): TwoPortSyncMem[T] = { + Module(new TwoPortSyncMem[T](n, t, maskedUnitWidth)) + } +} + +object TwoReadOneWriteSyncMem { + def apply[T <: Data](n: Int, t: T, maskedUnitWidth: Int = 8): TwoReadOneWriteSyncMem[T] = { + Module(new TwoReadOneWriteSyncMem[T](n, t, maskedUnitWidth)) + } +} diff --git a/src/main/scala/radiance/memory/XbarWithExtPolicy.scala b/src/main/scala/radiance/memory/XbarWithExtPolicy.scala index 57ba4f3..071c106 100644 --- a/src/main/scala/radiance/memory/XbarWithExtPolicy.scala +++ b/src/main/scala/radiance/memory/XbarWithExtPolicy.scala @@ -20,17 +20,22 @@ object ExtPolicyNodeImp extends SimpleNodeImp[Int, Int, Int, ExtPolicyBundle] { case class ExtPolicyMasterNode(w: Int)(implicit valName: ValName) extends SourceNode(ExtPolicyNodeImp)(Seq(w)) case class ExtPolicySlaveNode()(implicit valName: ValName) extends SinkNode(ExtPolicyNodeImp)(Seq(0)) -class XbarWithExtPolicy(nameSuffix: Option[String] = None) +class XbarWithExtPolicy(nameSuffix: Option[String] = None, useFallback: Boolean = true) (implicit p: Parameters) extends TLXbar(nameSuffix = nameSuffix) { val policySlaveNode = ExtPolicySlaveNode() class ImplChild extends Impl { val policy: TLArbiter.Policy = (width, valids, select) => { val in = policySlaveNode.in.head._1 - val hintHit = (valids & in.hint).orR - val fallback = TLArbiter.lowestIndexFirst(width, valids, !hintHit && select) in.actual := select.asTypeOf(in.actual.cloneType) - Mux(hintHit, in.hint, fallback) + + if (useFallback) { + val hintHit = (valids & in.hint).orR + val fallback = TLArbiter.lowestIndexFirst(width, valids, !hintHit && select) + Mux(hintHit, in.hint, fallback) + } else { + in.hint + } } TLXbar.circuit(policy, node.in, node.out) } @@ -44,4 +49,14 @@ object XbarWithExtPolicy { val 
xbar = LazyModule(new XbarWithExtPolicy(nameSuffix)) xbar } -} \ No newline at end of file +} + +object XbarWithExtPolicyNoFallback { + def apply(nameSuffix: Option[String] = None) + (implicit p: Parameters): (XbarWithExtPolicy, TLIdentityNode) = { + val inIdNode = TLIdentityNode() + val xbar = LazyModule(new XbarWithExtPolicy(nameSuffix, false)) + xbar.node :=* inIdNode + (xbar, inIdNode) + } +} diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala index 8709262..a1d0ce6 100644 --- a/src/main/scala/radiance/subsystem/Configs.scala +++ b/src/main/scala/radiance/subsystem/Configs.scala @@ -15,6 +15,12 @@ import radiance.tile._ import radiance.memory._ import radiance.subsystem.RadianceGemminiDataType.{BF16, FP16, FP32, Int8} +sealed trait RadianceSmemSerialization + +case object FullySerialized extends RadianceSmemSerialization +case object CoreSerialized extends RadianceSmemSerialization +case object NotSerialized extends RadianceSmemSerialization + case class RadianceSharedMemKey(address: BigInt, size: Int, numBanks: Int, @@ -23,7 +29,7 @@ case class RadianceSharedMemKey(address: BigInt, strideByWord: Boolean = true, filterAligned: Boolean = true, disableMonitors: Boolean = true, - serializeUnaligned: Boolean = true) + serializeUnaligned: RadianceSmemSerialization = FullySerialized) case object RadianceSharedMemKey extends Field[Option[RadianceSharedMemKey]](None) case class RadianceFrameBufferKey(baseAddress: BigInt, @@ -56,7 +62,7 @@ class WithRadianceCores( nTLBWays = 1, nTLBBasePageSectors = 1, nTLBSuperpages = 1, - nMSHRs = 0, + nMSHRs = 0, blockBytes = site(CacheBlockBytes))), icache = Some(ICacheParams( rowBits = site(SystemBusKey).beatBits, @@ -194,8 +200,8 @@ class WithRadianceSharedMem(address: BigInt, strideByWord: Boolean = true, filterAligned: Boolean = true, disableMonitors: Boolean = true, - serializeUnaligned: Boolean = true - ) extends Config((site, _, _) => { + serializeUnaligned: 
RadianceSmemSerialization = FullySerialized + ) extends Config((_, _, _) => { case RadianceSharedMemKey => { require(isPow2(size) && size >= 1024) Some(RadianceSharedMemKey( diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index 99258ae..0bb21be 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -38,7 +38,9 @@ class RadianceCluster ( // TODO: this probably needs to be instantiated inside the radiance shared mem module def virgoSharedMemComponentsGen() = new VirgoSharedMemComponents(thisClusterParams, gemminiTiles, radianceTiles) - LazyModule(new RadianceSharedMem(virgoSharedMemComponentsGen, clbus)).suggestName("shared_mem") + def virgoSharedMemComponentsImpGen(outer: VirgoSharedMemComponents) = new VirgoSharedMemComponentsImp(outer) + LazyModule(new RadianceSharedMem( + virgoSharedMemComponentsGen, Some(virgoSharedMemComponentsImpGen(_)), clbus)).suggestName("shared_mem") // direct core-accelerator connections val smemKey = p(RadianceSharedMemKey).get diff --git a/src/main/scala/radiance/tile/RadianceSharedMem.scala b/src/main/scala/radiance/tile/RadianceSharedMem.scala index 5b85914..696b3c2 100644 --- a/src/main/scala/radiance/tile/RadianceSharedMem.scala +++ b/src/main/scala/radiance/tile/RadianceSharedMem.scala @@ -4,14 +4,14 @@ import chisel3._ import chisel3.util._ import org.chipsalliance.diplomacy.lazymodule._ import org.chipsalliance.cde.config.Parameters -import radiance.memory._ import freechips.rocketchip.tilelink._ import freechips.rocketchip.diplomacy.{AddressSet, TransferSizes} +import gemmini.Pipeline import radiance.subsystem.RadianceSharedMemKey -import gemmini._ +import radiance.memory._ import scala.collection.mutable.ArrayBuffer -trait RadianceSmemNodeProvider { +abstract class RadianceSmemNodeProvider { val uniformRNodes: Seq[Seq[Seq[TLNexusNode]]] val uniformWNodes: Seq[Seq[Seq[TLNexusNode]]] val 
nonuniformRNodes: Seq[TLNode] @@ -19,8 +19,11 @@ trait RadianceSmemNodeProvider { val clBusClients: Seq[TLNode] } -class RadianceSharedMem( - provider: () => RadianceSmemNodeProvider, +abstract class RadianceSmemNodeProviderImp[T <: RadianceSmemNodeProvider](val outer: T) {} + +class RadianceSharedMem[T <: RadianceSmemNodeProvider]( + provider: () => T, + val providerImp: Option[(T) => RadianceSmemNodeProviderImp[T]], clbus: TLBusWrapper )(implicit p: Parameters) extends LazyModule { val smemKey = p(RadianceSharedMemKey).get @@ -31,6 +34,7 @@ class RadianceSharedMem( val smemDepth = smemKey.size / smemWidth / smemBanks val smemSubbanks = smemWidth / wordSize val smemSize = smemWidth * smemDepth * smemBanks + val strideByWord = smemKey.strideByWord require(isPow2(smemBanks)) @@ -38,11 +42,7 @@ class RadianceSharedMem( val (uniformRNodes, uniformWNodes, nonuniformRNodes, nonuniformWNodes) = (smNodes.uniformRNodes, smNodes.uniformWNodes, smNodes.nonuniformRNodes, smNodes.nonuniformWNodes) - // TODO: move this to config - val strideByWord = true - val filterAligned = true - val serializeUnaligned = true - implicit val disableMonitors = true // otherwise it generate 1k+ different tl monitors + implicit val disableMonitors = smemKey.disableMonitors // otherwise it generate 1k+ different tl monitors // collection of read and write managers for each sram (sub)bank val smemBankMgrs : Seq[Seq[TLManagerNode]] = if (strideByWord) { @@ -180,9 +180,11 @@ class RadianceSharedMem( lazy val module = new RadianceSharedMemImp(this) } -class RadianceSharedMemImp(outer: RadianceSharedMem) extends LazyModuleImp(outer) { +class RadianceSharedMemImp[T <: RadianceSmemNodeProvider](outer: RadianceSharedMem[T]) extends LazyModuleImp(outer) { - def makeBuffer[T <: Data](mem: TwoPortSyncMem[T], rNode: TLBundle, rEdge: TLEdgeIn, + val smNodesImp = outer.providerImp.map(impFn => impFn(outer.smNodes)) + + def makeBuffer[U <: Data](mem: TwoPortSyncMem[U], rNode: TLBundle, rEdge: TLEdgeIn, wNode: 
TLBundle, wEdge: TLEdgeIn): Unit = { mem.io.ren := rNode.a.fire @@ -240,7 +242,7 @@ class RadianceSharedMemImp(outer: RadianceSharedMem) extends LazyModuleImp(outer // WRITE mem.io.wen := RegNext(wNode.a.fire) mem.io.wdata := RegNext(wNode.a.bits.data) - mem.io.mask := RegNext(VecInit(wNode.a.bits.mask.asBools)) + mem.io.mask := RegNext(wNode.a.bits.mask) val writeResp = Wire(Flipped(wNode.d.cloneType)) writeResp.bits := wEdge.AccessAck(wNode.a.bits) @@ -286,7 +288,6 @@ class RadianceSharedMemImp(outer: RadianceSharedMem) extends LazyModuleImp(outer val mem = TwoPortSyncMem( n = memDepth, t = UInt((wordWidth * 8).W), - mask_len = wordWidth // byte level mask ) // TODO: bring in cluster id // mem.suggestName(s"rad_smem_cl${outer.thisClusterParams.clusterId}_b${bid}_w${wid}") @@ -346,7 +347,6 @@ class RadianceSharedMemImp(outer: RadianceSharedMem) extends LazyModuleImp(outer val mem = TwoPortSyncMem( n = memDepth, t = UInt((memWidth * 8).W), - mask_len = memWidth // byte level mask ) val (rNode, rEdge) = r.in.head diff --git a/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala b/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala index 8ea5c6f..8cc0072 100644 --- a/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala +++ b/src/main/scala/radiance/tile/VirgoSharedMemComponents.scala @@ -9,8 +9,9 @@ import radiance.memory._ import freechips.rocketchip.tilelink._ import freechips.rocketchip.diplomacy.{AddressSet, BufferParams} import freechips.rocketchip.subsystem.BaseClusterParams -import radiance.subsystem.RadianceSharedMemKey +import radiance.subsystem.{CoreSerialized, FullySerialized, NotSerialized, RadianceSharedMemKey} import gemmini._ +import scala.collection.mutable.ArrayBuffer // virgo-specific tilelink nodes // generic smem implementation is in RadianceSharedMem.scala @@ -28,6 +29,9 @@ class VirgoSharedMemComponents( val smemSubbanks = smemWidth / wordSize val smemSize = smemWidth * smemDepth * smemBanks + val numCores = 
radianceTiles.length + val numLanes = radianceTiles.head.numLsuLanes + val gemminis = gemminiTiles.map(_.gemmini) val gemminiConfigs = gemminis.map(_.config) gemminiConfigs.foreach { config => @@ -55,11 +59,25 @@ class VirgoSharedMemComponents( } } val tcNodeFanouts = radianceTiles.flatMap(_.tcSmemNodes) - .map(connectOne(_, () => TLBuffer(BufferParams(2, false, false), BufferParams(0)))) + // .map(connectOne(_, () => TLBuffer(BufferParams(2, false, false), BufferParams(0)))) .map(connectXbarName(_, Some("tc_fanout"))) val clBusClients: Seq[TLNode] = radianceSmemFanout - val (uniformRNodes, uniformWNodes, nonuniformRNodes, nonuniformWNodes) = + // convert to monad (very fancy) + val coreSerialOpt: Option[Unit] = serializeUnaligned match { + case CoreSerialized => Some(()) + case _ => None + } + + // uniform mux select for selecting lanes from a single core in unison + val coreSerialPolicy = coreSerialOpt.map(_ => Seq.fill(2)(Seq.fill(numLanes)(ExtPolicyMasterNode(numCores)))) + val laneSerialXbars = coreSerialOpt.map(_ => Seq.tabulate(2) { rw => + Seq.tabulate(numLanes) { lid => + XbarWithExtPolicyNoFallback(Some(f"lane_${lid}_serial_in_xbar_$rw")) + } + }) + + override val (uniformRNodes, uniformWNodes, nonuniformRNodes, nonuniformWNodes) = if (strideByWord) { def distAndDuplicate(nodes: Seq[TLNode], suffix: String): Seq[Seq[TLNexusNode]] = { @@ -93,57 +111,35 @@ class VirgoSharedMemComponents( Seq.fill(smemSubbanks)(tcBank.map(connectOne(_, () => TLBuffer(BufferParams(2, false, false)))).map(connectXbarName(_, Some("tc_dist_fanout")))) } // (banks, subbanks, tc client) + val unalignedRWNodes: ArrayBuffer[ArrayBuffer[TLNexusNode]] = // mutable for readability + ArrayBuffer.fill(numLanes)(ArrayBuffer.fill(numCores)(null)) + if (filterAligned) { - val numLsuLanes = radianceTiles.head.numLsuLanes - val numLaneDupes = Math.max(1, smemSubbanks / numLsuLanes) - val filterRange = Math.min(smemSubbanks, numLsuLanes) - println(s"num_lsu_lanes ${numLsuLanes} num_lane_dupes 
${numLaneDupes} filter_range ${filterRange}") + val numLaneDupes = Math.max(1, smemSubbanks / numLanes) + val filterRange = Math.min(smemSubbanks, numLanes) - // (subbank, sources, aligned) = rw node - val (fAligned, fUnaligned) = if (numLsuLanes >= smemSubbanks) { - val filterNodes: Seq[Seq[(TLNode, TLNode)]] = Seq.tabulate(numLaneDupes) { did => - Seq.tabulate(filterRange) { wid => - val trueWid = did * filterRange + wid - val address = AddressSet(smemBase + wordSize * trueWid, (smemSize - 1) - (smemSubbanks - 1) * wordSize) + // (subbank, sources) = rw node + val fAligned = if (numLanes >= smemSubbanks) { + val filterNodes: Seq[Seq[TLNode]] = Seq.tabulate(filterRange) { wid => + val address = AddressSet(smemBase + wordSize * wid, (smemSize - 1) - (smemSubbanks - 1) * wordSize) - radianceSmemFanout.grouped(numLsuLanes).toList.zipWithIndex.flatMap { case (lanes, cid) => - lanes.zipWithIndex.flatMap { case (lane, lid) => - if ((lid % filterRange) == wid) { - println(f"c${cid}_l${lid} connected to d${did}w${wid}") - val filterNode = AlignFilterNode(Seq(address))(p, ValName(s"filter_l${lid}_w${trueWid}")) - DisableMonitors { implicit p => filterNode := lane } - // Seq((aligned splitter, unaligned splitter)) - Seq(( - connectOne(filterNode, () => - RWSplitterNode(address, s"aligned_splitter_c${cid}_l${lid}_w${trueWid}")), - connectOne(filterNode, () => - RWSplitterNode(AddressSet.everything, s"unaligned_splitter_c${cid}_l${lid}")) - )) - } else Seq() - } + radianceSmemFanout.grouped(numLanes).toList.zipWithIndex.flatMap { case (lanes, cid) => + lanes.zipWithIndex.flatMap { case (lane, lid) => + if ((lid % filterRange) == wid) { + val filterNode = AlignFilterNode(Seq(address))(p, ValName(s"filter_l${lid}_w${wid}")) + DisableMonitors { implicit p => filterNode := lane } + + unalignedRWNodes(lid)(cid) = connectOne(filterNode, () => + RWSplitterNode(AddressSet.everything, s"unaligned_splitter_c${cid}_l${lid}")) + + Seq(connectOne(filterNode, () => + 
RWSplitterNode(address, s"aligned_splitter_c${cid}_l${lid}_w${wid}"))) + } else Seq() } } - }.flatten - - val fAligned = Seq.fill(2)(filterNodes.map(_.map(_._1).map(connectXbarName(_, Some("rad_aligned"))))) - val fUnaligned = if (serializeUnaligned) { - Seq.fill(2) { - val serializedNode = TLEphemeralNode() - val serializedInXbar = LazyModule(new TLXbar()) - val serializedOutXbar = LazyModule(new TLXbar()) - serializedInXbar.suggestName("unaligned_serialized_in_xbar") - serializedOutXbar.suggestName("unaligned_serialized_out_xbar") - guardMonitors { implicit p => - filterNodes.foreach(_.map(_._2).foreach(serializedInXbar.node := _)) - serializedNode := serializedInXbar.node - serializedOutXbar.node := serializedNode - } - Seq(serializedOutXbar.node) - } - } else { - Seq.fill(2)(filterNodes.flatMap(_.map(_._2).map(connectXbar.apply))) } - (fAligned, fUnaligned) + + Seq.fill(2)(filterNodes.map(_.map(connectXbarName(_, Some("rad_aligned"))))) } else { // aligned: (subbanks, cores) = rw node // (lanes, cores) = filter_node val filterNodes = Seq.tabulate(filterRange) { wid => @@ -151,7 +147,7 @@ class VirgoSharedMemComponents( AddressSet(smemBase + (did * filterRange + wid) * wordSize, (smemSize - 1) - (smemSubbanks - 1) * wordSize) } - radianceSmemFanout.grouped(numLsuLanes).toSeq.zipWithIndex.map { case (lanes, cid) => + radianceSmemFanout.grouped(numLanes).toSeq.zipWithIndex.map { case (lanes, cid) => val lane = lanes(wid) val filterNode = AlignFilterNode(addresses)(p, ValName(s"filter_c${cid}_w${wid}")) guardMonitors { implicit p => @@ -169,29 +165,40 @@ class VirgoSharedMemComponents( } } }.flatten - val fUnalignedRW = filterNodes.zipWithIndex.flatMap { case (cores, lid) => - cores.zipWithIndex.map { case (fn, cid) => - connectOne(fn, () => RWSplitterNode(AddressSet.everything, s"unaligned_split_c${cid}_l${lid}")) + filterNodes.zipWithIndex.foreach { case (cores, lid) => + cores.zipWithIndex.foreach { case (fn, cid) => + unalignedRWNodes(lid)(cid) = connectOne(fn, 
() => + RWSplitterNode(AddressSet.everything, s"unaligned_split_c${cid}_l${lid}")) } } - val fAligned = Seq.fill(2)(fAlignedRW.map(_.map(connectXbarName(_, Some("rad_aligned"))))) + Seq.fill(2)(fAlignedRW.map(_.map(connectXbarName(_, Some("rad_aligned"))))) + } - val fUnaligned = if (serializeUnaligned) { - Seq.fill(2) { - val serializedNode = TLEphemeralNode() - val serializedInXbar = TLXbar(nameSuffix = Some("unaligned_ser_in")) - val serializedOutXbar = TLXbar(nameSuffix = Some("unaligned_ser_out")) - guardMonitors { implicit p => - fUnalignedRW.foreach(serializedInXbar := _) - serializedNode := serializedInXbar - serializedOutXbar := serializedNode - } - Seq(serializedOutXbar) + val fUnaligned: Seq[Seq[TLNode]] = serializeUnaligned match { + case FullySerialized => Seq.fill(2) { + val serializedNode = TLEphemeralNode() + val serializedInXbar = LazyModule(new TLXbar()) + val serializedOutXbar = LazyModule(new TLXbar()) + serializedInXbar.suggestName("unaligned_serialized_in_xbar") + serializedOutXbar.suggestName("unaligned_serialized_out_xbar") + guardMonitors { implicit p => + unalignedRWNodes.flatten.foreach(serializedInXbar.node := _) + serializedNode := serializedInXbar.node + serializedOutXbar.node := serializedNode } - } else { - Seq.fill(2)(fUnalignedRW.map(connectXbar.apply)) + Seq(serializedOutXbar.node) } - (fAligned, fUnaligned) + case CoreSerialized => Seq.tabulate(2) { rw => + // we can either have one core per lane selected (multiple mux selects) + // or strictly lanes from a single selected core (one mux select). 
doing the latter here + unalignedRWNodes.toSeq.zipWithIndex.map { case (coresRW, lid) => + val laneSerialXbar = laneSerialXbars.get(rw)(lid) + laneSerialXbar._1.policySlaveNode := coreSerialPolicy.get(rw)(lid) + coresRW.foreach(laneSerialXbar._2 := _) + connectXbarName(connectOne(laneSerialXbar._1.node, TLEphemeralNode.apply), Some(s"lane_${lid}_serial_out")) + } + } + case NotSerialized => Seq.fill(2)(unalignedRWNodes.toSeq.flatten.map(connectXbar.apply)) } @@ -215,6 +222,8 @@ class VirgoSharedMemComponents( val uniformWNodes: Seq[Seq[Seq[TLNexusNode]]] = (spadWriteNodes zip spadSpWriteNodes).map { case (wb, wsb) => (wb zip wsb).map { case (ww, wsw) => ww ++ wsw } } + // random accesses are not serialized here, require so + require(serializeUnaligned == NotSerialized, "when not filtering, unaligned accesses must not be serialized") // these nodes are random access val nonuniformRNodes: Seq[TLNode] = splitterNodes.map(connectXbarName(_, Some("rad_unaligned_r"))) val nonuniformWNodes: Seq[TLNode] = splitterNodes.map(connectXbarName(_, Some("rad_unaligned_w"))) @@ -242,3 +251,23 @@ class VirgoSharedMemComponents( (Seq.empty, Seq.empty, Seq(unifiedMemReadNode), Seq(unifiedMemWriteNode)) } } + +class VirgoSharedMemComponentsImp[T <: VirgoSharedMemComponents] + (override val outer: T) extends RadianceSmemNodeProviderImp[T](outer) { + + (outer.laneSerialXbars zip outer.coreSerialPolicy).foreach { case (xbarsRW, policiesRW) => + (xbarsRW zip policiesRW).foreach { case (xbars, policies) => + // for each lane, if any core is valid + val coreValids = xbars.map(_._2.in.map(_._1)).transpose.map { core => VecInit(core.map(_.a.valid)).asUInt.orR } + val select = xbars.map(_._2.out.map(_._1)).transpose.map { core => VecInit(core.map(_.a.ready)).asUInt.orR } + val coreSelect = TLArbiter.roundRobin(outer.numCores, VecInit(coreValids).asUInt, VecInit(select).asUInt.orR) + // TODO: roll this into XbarWithExtPolicy + xbars.foreach { lane => + (lane._2.in.map(_._1) lazyZip
lane._2.out.map(_._1) lazyZip coreSelect.asBools).foreach { case (li, lo, cs) => + lo.a.valid := li.a.valid && cs + } + } + policies.foreach { _.out.head._1.hint := coreSelect } + } + } +} From ffdabf9184854979268345fc135971a24f87e6c0 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Thu, 17 Oct 2024 14:49:11 -0700 Subject: [PATCH 4/4] add tag to tc smem interface, bump vortex --- src/main/resources/vsrc/vortex | 2 +- src/main/scala/radiance/tile/RadianceTile.scala | 5 ++++- src/main/scala/radiance/tile/VortexCore.scala | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index 8bf7f39..cde8da1 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit 8bf7f39f04e6d3cbc47559fdd3cacca0febe9baa +Subproject commit cde8da1f3b1354e2f0a5231d9089ed4e95eb3272 diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 8ab5984..36aef41 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -736,11 +736,13 @@ class RadianceTileModuleImp(outer: RadianceTile) def connectTc { val tcb0 = new { val addr = core.io.tc_a_bits_address(31, 0) + val tag = core.io.tc_a_bits_tag(3, 0) val aValid = core.io.tc_a_valid(0) val dReady = core.io.tc_d_ready(0) } val tcb1 = new { val addr = core.io.tc_a_bits_address(63, 32) + val tag = core.io.tc_a_bits_tag(7, 4) val aValid = core.io.tc_a_valid(1) val dReady = core.io.tc_d_ready(1) } @@ -758,7 +760,7 @@ class RadianceTileModuleImp(outer: RadianceTile) adapter.io.inReq.bits <> DontCare adapter.io.inReq.valid := bundle.aValid adapter.io.inReq.bits.address := bundle.addr - adapter.io.inReq.bits.source := i.U + adapter.io.inReq.bits.source := bundle.tag adapter.io.inReq.bits.size := 5.U adapter.io.inReq.bits.opcode := TLMessages.Get adapter.io.inReq.bits.mask := x"ffffffff".U @@ -771,6 +773,7 @@ class 
RadianceTileModuleImp(outer: RadianceTile) core.io.tc_a_ready := Cat(adapters.last.io.inReq.ready, adapters.head.io.inReq.ready) core.io.tc_d_valid := Cat(adapters.last.io.inResp.valid, adapters.head.io.inResp.valid) core.io.tc_d_bits_data := Cat(adapters.last.io.inResp.bits.data, adapters.head.io.inResp.bits.data) + core.io.tc_d_bits_tag := Cat(adapters.last.io.inResp.bits.source, adapters.head.io.inResp.bits.source) } def connectBarrier = { diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index 1409ddd..d45e303 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -92,9 +92,11 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl val tc_a_valid = Output(UInt(2.W)) val tc_a_bits_address = Output(UInt((2 * 32).W)) + val tc_a_bits_tag = Output(UInt((2 * 4).W)) val tc_a_ready = Input(UInt(2.W)) val tc_d_valid = Input(UInt(2.W)) val tc_d_bits_data = Input(UInt((2 * 32 * 8).W)) + val tc_d_bits_tag = Input(UInt((2 * 4).W)) val tc_d_ready = Output(UInt(2.W)) // FIXME: hardcoded