From 19852693b7bd073895e5dfb85b9a615a99b170e6 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Wed, 12 Jun 2024 02:17:40 -0700 Subject: [PATCH] dual core gemmini, unpeg gemmini size and smem width --- radiance.mk | 1 - src/main/resources/vsrc/vortex | 2 +- src/main/scala/radiance/core/TensorDPU.scala | 2 +- .../scala/radiance/subsystem/Configs.scala | 63 +++++----- src/main/scala/radiance/tile/AccNode.scala | 6 +- .../scala/radiance/tile/GemminiTile.scala | 14 +-- .../scala/radiance/tile/RadianceCluster.scala | 118 ++++++++++-------- src/main/scala/radiance/tile/VortexCore.scala | 2 +- 8 files changed, 108 insertions(+), 100 deletions(-) diff --git a/radiance.mk b/radiance.mk index 674bb95..e041c1d 100644 --- a/radiance.mk +++ b/radiance.mk @@ -26,7 +26,6 @@ EXTRA_SIM_PREPROC_DEFINES += \ +define+GBAR_ENABLE \ +define+GBAR_CLUSTER_ENABLE \ +define+NUM_FPU_BLOCKS=2 \ - +define+EXT_T_DISABLE \ +define+FPU_FPNEW \ +define+SMEM_LOG_SIZE=17 diff --git a/src/main/resources/vsrc/vortex b/src/main/resources/vsrc/vortex index a47389f..1833e8a 160000 --- a/src/main/resources/vsrc/vortex +++ b/src/main/resources/vsrc/vortex @@ -1 +1 @@ -Subproject commit a47389fc0e8ec8aa95d024db0cec0f4f37b44e54 +Subproject commit 1833e8a17666e5b8d8660d67a32bbec76af1b990 diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala index 5732fe2..6cc103e 100644 --- a/src/main/scala/radiance/core/TensorDPU.scala +++ b/src/main/scala/radiance/core/TensorDPU.scala @@ -49,7 +49,7 @@ class StallingPipe[T <: Data](val gen: T, val latency: Int = 1) extends Module { * names. Includes the latency cycle count in the name as well as the * parameterized generator's `typeName`, e.g. `Pipe4_UInt4` */ - override def desiredName = s"${simpleClassName(this.getClass)}${latency}_${gen.typeName}" + // override def desiredName = s"${simpleClassName(this.getClass)}${latency}_${gen.typeName}" class StallingPipeIO extends Bundle { val stall = Input(Bool()) diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala index 9ef551c..971f4d6 100644 --- a/src/main/scala/radiance/subsystem/Configs.scala +++ b/src/main/scala/radiance/subsystem/Configs.scala @@ -4,7 +4,6 @@ package radiance.subsystem import chisel3.util._ -import freechips.rocketchip.diplomacy.BigIntHexContext import org.chipsalliance.cde.config._ import freechips.rocketchip.rocket._ import freechips.rocketchip.tile._ @@ -24,10 +23,6 @@ case class RadianceSharedMemKey(address: BigInt, serializeUnaligned: Boolean = true) case object RadianceSharedMemKey extends Field[Option[RadianceSharedMemKey]](None) -case class RadianceGemminiKey(tileSize: Int, - slaveAddress: BigInt) -case object RadianceGemminiKey extends Field[Option[RadianceGemminiKey]](None) - case class RadianceFrameBufferKey(baseAddress: BigInt, width: Int, size: Int, @@ -92,38 +87,40 @@ class WithRadianceGemmini(location: HierarchicalLocation, if (idOffset == 0) { println("******WARNING****** gemmini tile id is 0! radiance tiles in the same cluster needs to be before gemmini") } + val numPrevGemminis = prev.map(_.tileParams).map { + case _: GemminiTileParams => 1 + case _ => 0 + }.sum val smKey = site(RadianceSharedMemKey).get - val gemmini = GemminiTileParams(gemminiConfig = GemminiFPConfigs.FP32DefaultConfig.copy( - has_training_convs = false, - has_max_pool = false, - use_tl_ext_mem = true, - sp_singleported = false, - spad_read_delay = 4, - use_shared_ext_mem = true, - acc_sub_banks = 1, - has_normalizations = false, - meshRows = dim, - meshColumns = dim, - tile_latency = 0, - dma_maxbytes = site(CacheBlockBytes), - dma_buswidth = dim * 32, - tl_ext_mem_base = smKey.address, - sp_banks = smKey.numBanks, - sp_capacity = CapacityInKilobytes(smKey.size >> 10), - acc_capacity = CapacityInKilobytes(accSizeInKB), - )) - List.tabulate(1)(i => GemminiTileAttachParams( - gemmini.copy(tileId = i + idOffset), + val tileParams = GemminiTileParams( + gemminiConfig = GemminiFPConfigs.FP32DefaultConfig.copy( + has_training_convs = false, + has_max_pool = false, + use_tl_ext_mem = true, + sp_singleported = false, + spad_read_delay = 4, + use_shared_ext_mem = true, + acc_sub_banks = 1, + has_normalizations = false, + meshRows = dim, + meshColumns = dim, + tile_latency = 0, + dma_maxbytes = site(CacheBlockBytes), + dma_buswidth = dim * 32, + tl_ext_mem_base = smKey.address, + sp_banks = smKey.numBanks, + sp_capacity = CapacityInKilobytes(smKey.size >> 10), + acc_capacity = CapacityInKilobytes(accSizeInKB), + ), + tileId = idOffset, + tileSize = tileSize, + slaveAddress = smKey.address + smKey.size + 0x3000 + 0x100 * numPrevGemminis + ) + Seq(GemminiTileAttachParams( + tileParams, crossing )) ++ prev } - case RadianceGemminiKey => { - val smKey = site(RadianceSharedMemKey).get - Some(RadianceGemminiKey( - tileSize = tileSize, - slaveAddress = smKey.address + smKey.size + 0x3000 - )) - } }) { def this(location: HierarchicalLocation = InSubsystem, dim: Int, accSizeInKB: Int, tileSize: Int) = this(location, RocketCrossingParams( diff --git a/src/main/scala/radiance/tile/AccNode.scala b/src/main/scala/radiance/tile/AccNode.scala index 4dd0b3e..b6109bb 100644 --- a/src/main/scala/radiance/tile/AccNode.scala +++ b/src/main/scala/radiance/tile/AccNode.scala @@ -2,13 +2,17 @@ package radiance.tile; import chisel3._ import chisel3.experimental.SourceInfo import chisel3.util._ +import freechips.rocketchip.diplomacy.BigIntHexContext import org.chipsalliance.cde.config.Parameters import org.chipsalliance.diplomacy._ import org.chipsalliance.diplomacy.nodes._ -class AccBundle() extends Bundle { +class AccBundle extends Bundle { val cmd = Output(Valid(UInt(32.W))) val status = Input(UInt(1.W)) + + def dest(): UInt = { cmd.bits(7, 5) } + def masked(): UInt = { cmd.bits & x"ffffff1f".U } } case class NullParams() diff --git a/src/main/scala/radiance/tile/GemminiTile.scala b/src/main/scala/radiance/tile/GemminiTile.scala index 6f63693..c0791c2 100644 --- a/src/main/scala/radiance/tile/GemminiTile.scala +++ b/src/main/scala/radiance/tile/GemminiTile.scala @@ -17,7 +17,7 @@ import freechips.rocketchip.tile._ import freechips.rocketchip.tilelink._ import gemmini._ import org.chipsalliance.cde.config.Parameters -import radiance.subsystem.{GPUMemParams, GPUMemory, RadianceGemminiKey} +import radiance.subsystem.{GPUMemParams, GPUMemory} case class GemminiCoreParams( useVM: Boolean = false, @@ -61,7 +61,9 @@ case class GemminiCoreParams( case class GemminiTileParams( tileId: Int = 0, - gemminiConfig: GemminiArrayConfig[Float, Float, Float] + gemminiConfig: GemminiArrayConfig[Float, Float, Float], + tileSize: Int = 4, + slaveAddress: BigInt ) extends InstantiableTileParams[GemminiTile] { def instantiate(crossing: HierarchicalElementCrossingParamsLike, lookup: LookupByHartIdImpl)( implicit p: Parameters @@ -100,7 +102,7 @@ class GemminiTile private ( lookup: LookupByHartIdImpl)(implicit p: Parameters) = this(params, crossing.crossingType, lookup, p) - val cpuDevice: SimpleDevice = new SimpleDevice("gemmini", Nil) + val cpuDevice: SimpleDevice = new SimpleDevice(s"gemmini${tileId}", Nil) val intOutwardNode = None val slaveNode = TLIdentityNode() @@ -127,11 +129,9 @@ class GemminiTile private ( require(!gemmini.config.sp_singleported, "external scratchpad must be dual ported") - val configKey = p(RadianceGemminiKey).get - val regDevice = new SimpleDevice("gemmini-cmd-reg", Seq(s"gemmini-cmd-reg")) val regNode = TLRegisterNode( - address = Seq(AddressSet(configKey.slaveAddress, 0xfff)), + address = Seq(AddressSet(gemminiParams.slaveAddress, 0xff)), device = regDevice, beatBytes = 8, concurrency = 1) @@ -187,7 +187,7 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer) ciscInst := 0.U.asTypeOf(ciscInstT) - val tileSize = outer.configKey.tileSize + val tileSize = outer.gemminiParams.tileSize val (boundsInst, spadQuartile) = if (tileSize == 4) { (ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U), 0x80) } else if (tileSize == 8) { diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala index 1040026..c75f4f3 100644 --- a/src/main/scala/radiance/tile/RadianceCluster.scala +++ b/src/main/scala/radiance/tile/RadianceCluster.scala @@ -46,14 +46,10 @@ class RadianceCluster ( // val numLsuLanes = 4 // FIXME: hardcoded // must toSeq here, otherwise Iterable is lazy and will break diplomacy - val gemminis = leafTiles.values.filter(_.isInstanceOf[GemminiTile]).toSeq.asInstanceOf[Seq[GemminiTile]] - require(gemminis.size == 1, "there should be one and only one gemmini per cluster") - val gemmini = gemminis.head.gemmini - val gemminiTile = gemminis.head + val gemminiTiles = leafTiles.values.filter(_.isInstanceOf[GemminiTile]).toSeq.asInstanceOf[Seq[GemminiTile]] + val gemminis = gemminiTiles.map(_.gemmini) + val gemminiConfigs = gemminis.map(_.config) // val gemminiConfig = thisClusterParams.gemminiConfig.get // TODO: handle None gracefully - val gemminiConfig = gemmini.config - - val max_write_width_bytes = gemminiConfig.dma_buswidth / 8 val radianceTiles = leafTiles.values.filter(_.isInstanceOf[RadianceTile]).toSeq.asInstanceOf[Seq[RadianceTile]] @@ -67,9 +63,6 @@ class RadianceCluster ( // // ************************************** - // TODO: parametrize bank configuration - // TODO: move rw split node to separate file - // TODO: stride by word val unified_mem_read_node = TLIdentityNode() val unified_mem_write_node = TLIdentityNode() @@ -81,15 +74,17 @@ class RadianceCluster ( val smem_depth = smem_key.size / smem_width / smem_banks val smem_subbanks = smem_width / wordSize val smem_size = smem_width * smem_depth * smem_banks - assert(gemminiConfig.sp_banks == smem_banks) - assert(gemminiConfig.sp_width / 8 == smem_width) - assert(gemminiConfig.sp_bank_entries == smem_depth) - VecInit(Seq(0.U, 1.U)).reduceTree(_ +& _) + gemminiConfigs.foreach { config => + assert(smem_banks == config.sp_banks && isPow2(smem_banks / config.sp_banks)) // TODO: should allow >= + assert(smem_width >= (config.sp_width / 8) && isPow2(smem_width / (config.sp_width / 8))) + assert(smem_size == config.sp_capacity.asInstanceOf[CapacityInKilobytes].kilobytes * 1024) + } + val stride_by_word = true val filter_aligned = true val disable_monitors = true // otherwise it generate 1k+ different tl monitors - val serialize_unaligned = false + val serialize_unaligned = true def guard_monitors[T](callback: Parameters => T)(implicit p: Parameters): Unit = { if (disable_monitors) { @@ -186,26 +181,27 @@ class RadianceCluster ( } if (stride_by_word) { - // ask if you need to deal with this, it's not supposed to be readable + def dist_and_duplicate(nodes: Seq[TLNode], suffix: String): Seq[Seq[TLNode]] = { + val word_fanout_nodes = gemminis.zip(nodes).zipWithIndex.map { case ((gemmini, node), gemmini_idx) => + val sp_width_bytes = gemmini.config.sp_width / 8 + val sp_subbanks = sp_width_bytes / wordSize + val dist = DistributorNode(from = sp_width_bytes, to = wordSize) + guard_monitors { implicit p => + dist := TLBuffer(BufferParams(1, false, true), BufferParams(0)) := node + } + val fanout = Seq.fill(sp_subbanks) { + connect_xbar_name(dist, Some(s"spad_g${gemmini_idx}_fanout_$suffix")) + } + Seq.fill(smem_width / sp_width_bytes)(fanout).flatten // smem wider than spad, duplicate masters + } + // (gemmini, word) => (word, gemmini) + word_fanout_nodes.transpose + } - val spad_read_nodes = Seq.fill(smem_banks) { - val r_dist = DistributorNode(from = smem_width, to = wordSize) - guard_monitors { implicit p => r_dist := TLBuffer(BufferParams(1, false, true), BufferParams(0)) := gemmini.spad_read_nodes } - Seq.fill(smem_subbanks) { connect_one(r_dist, TLIdentityNode.apply) } - } - val spad_write_nodes = Seq.fill(smem_banks) { - val w_dist = DistributorNode(from = smem_width, to = wordSize) - guard_monitors { implicit p => w_dist := TLBuffer(BufferParams(1, false, true), BufferParams(0)) := gemmini.spad_write_nodes } - Seq.fill(smem_subbanks) { connect_one(w_dist, TLIdentityNode.apply) } - /* Seq.fill(smem_subbanks) { - val buf = TLBuffer(BufferParams(1, false, true), BufferParams(0)) - buf := w_dist - buf - } */ - } - val ws_dist = DistributorNode(from = smem_width, to = wordSize) - guard_monitors { implicit p => ws_dist := gemmini.spad.spad_writer.node } // this is the dma write node - val spad_sp_write_nodes = Seq.fill(smem_subbanks) { connect_xbar(ws_dist) } + val spad_read_nodes = Seq.fill(smem_banks)(dist_and_duplicate(gemminis.map(_.spad_read_nodes), "r")) + val spad_write_nodes = Seq.fill(smem_banks)(dist_and_duplicate(gemminis.map(_.spad_write_nodes), "w")) + val spad_sp_write_nodes_single_bank = dist_and_duplicate(gemminis.map(_.spad.spad_writer.node), "ws") + val spad_sp_write_nodes = Seq.fill(smem_banks)(spad_sp_write_nodes_single_bank) // executed only once val (uniform_r_nodes, uniform_w_nodes, nonuniform_r_nodes, nonuniform_w_nodes): (Seq[Seq[Seq[TLNode]]], Seq[Seq[Seq[TLNode]]], Seq[TLNode], Seq[TLNode]) = if (filter_aligned) { @@ -252,11 +248,11 @@ class RadianceCluster ( } val uniform_r_nodes: Seq[Seq[Seq[TLNode]]] = spad_read_nodes.map { rb => - (rb zip f_aligned.head).map { case (rw, fa) => Seq(rw) ++ fa } + (rb zip f_aligned.head).map { case (rw, fa) => rw ++ fa } } - val uniform_w_nodes: Seq[Seq[Seq[TLNode]]] = spad_write_nodes.map { wb => - (wb lazyZip spad_sp_write_nodes lazyZip f_aligned.last).map { - case (ww, sw, fa) => Seq(ww, sw) ++ fa + val uniform_w_nodes: Seq[Seq[Seq[TLNode]]] = (spad_write_nodes zip spad_sp_write_nodes).map { case (wb, wsb) => + (wb lazyZip wsb lazyZip f_aligned.last).map { + case (ww, wsw, fa) => ww ++ wsw ++ fa } } @@ -267,11 +263,9 @@ class RadianceCluster ( } else { val splitter_nodes = radiance_smem_fanout.map { connect_one(_, RWSplitterNode.apply) } // these nodes access an entire line simultaneously - val uniform_r_nodes: Seq[Seq[Seq[TLNode]]] = spad_read_nodes.map { rb => - rb.map { rw => Seq(rw) } - } - val uniform_w_nodes: Seq[Seq[Seq[TLNode]]] = spad_write_nodes.map { wb => - (wb zip spad_sp_write_nodes).map { case (ww, sw) => Seq(ww, sw) } + val uniform_r_nodes: Seq[Seq[Seq[TLNode]]] = spad_read_nodes + val uniform_w_nodes: Seq[Seq[Seq[TLNode]]] = (spad_write_nodes zip spad_sp_write_nodes).map { case (wb, wsb) => + (wb zip wsb).map { case (ww, wsw) => ww ++ wsw } } // these nodes are random access val nonuniform_r_nodes: Seq[TLNode] = splitter_nodes.map(connect_xbar_name(_, Some("rad_unaligned_r"))) @@ -302,9 +296,11 @@ class RadianceCluster ( } } } else { - unified_mem_read_node :=* TLWidthWidget(smem_width) :=* gemmini.spad_read_nodes - unified_mem_write_node :=* TLWidthWidget(smem_width) :=* gemmini.spad_write_nodes - unified_mem_write_node := gemmini.spad.spad_writer.node // this is the dma write node + gemminis.foreach { gemmini => + unified_mem_read_node :=* TLWidthWidget(smem_width) :=* gemmini.spad_read_nodes + unified_mem_write_node :=* TLWidthWidget(smem_width) :=* gemmini.spad_write_nodes + unified_mem_write_node := gemmini.spad.spad_writer.node // this is the dma write node + } val splitter_node = RWSplitterNode() unified_mem_read_node := TLWidthWidget(smem_width) := splitter_node @@ -327,14 +323,22 @@ class RadianceCluster ( } } - // connect tile smem nodes to xbar, and xbar to banks - // val smem_xbar = TLXbar() + // ******************************************************* + // ___ _______ _______ __ _________ ___ __ ____ + // / _ \/ __/ _ \/ _/ _ \/ // / __/ _ \/ _ | / / / __/ + // / ___/ _// , _// // ___/ _ / _// , _/ __ |/ /___\ \ + // /_/ /___/_/|_/___/_/ /_//_/___/_/|_/_/ |_/____/___/ + // + // ******************************************************* val radianceAccSlaveNodes = Seq.fill(numCores)(AccSlaveNode()) (radianceAccSlaveNodes zip radianceTiles).foreach { case (a, r) => a := r.accMasterNode } - val gemminiAccMasterNode = AccMasterNode() - gemminiTile.accSlaveNode := gemminiAccMasterNode - gemminiTile.slaveNode :=* TLWidthWidget(4) :=* clbus.outwardNode + val gemminiAccMasterNodes = gemminiTiles.map { tile => + val masterNode = AccMasterNode() + tile.accSlaveNode := masterNode + masterNode + } + gemminiTiles.foreach { _.slaveNode :=* TLWidthWidget(4) :=* clbus.outwardNode } val traceTLNode = TLAdapterNode(clientFn = c => c, managerFn = m => m) // printf and perf counter buffer @@ -385,15 +389,19 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp( } val coreAcc = outer.radianceAccSlaveNodes.head.in.head._1 - val gemminiAcc = outer.gemminiAccMasterNode.out.head._1 - dontTouch(gemminiAcc) + val gemminiAccs = outer.gemminiAccMasterNodes.map(_.out.head._1) // val gemminiTileAcc = outer.gemminiTile.accSlaveNode.in.head._1 // gemminiTileAcc.cmd := gemminiAcc.cmd // gemminiAcc.status := gemminiTileAcc.status - outer.radianceAccSlaveNodes.foreach(_.in.head._1.status := gemminiAcc.status) - gemminiAcc.cmd := coreAcc.cmd + gemminiAccs.zipWithIndex.foreach { case (g, gi) => + g.cmd.bits := coreAcc.masked + g.cmd.valid := coreAcc.cmd.valid && (coreAcc.dest === gi.U) + } + + // this might need some more tweaking (e.g. bitmask instead of or) + outer.radianceAccSlaveNodes.foreach(_.in.head._1.status := VecInit(gemminiAccs.map(_.status)).reduceTree(_ | _)) (outer.traceTLNode.in.map(_._1) zip outer.traceTLNode.out.map(_._1)).foreach { case (i, o) => o.a <> i.a diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala index 08e206d..dab3ec0 100644 --- a/src/main/scala/radiance/tile/VortexCore.scala +++ b/src/main/scala/radiance/tile/VortexCore.scala @@ -389,7 +389,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters) // tensor core addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_core.sv") - addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh") +// addResource("/vsrc/vortex/hw/rtl/core/VX_tensor_ucode.vh") addResource("/vsrc/vortex/hw/rtl/core/VX_uop_sequencer.sv") addResource("/vsrc/vortex/hw/rtl/core/VX_reduce_unit.sv") addResource("/vsrc/vortex/hw/rtl/fpu/VX_tensor_dpu.sv")