diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala
index e19467f..966e1c2 100644
--- a/src/main/scala/radiance/memory/Coalescing.scala
+++ b/src/main/scala/radiance/memory/Coalescing.scala
@@ -6,7 +6,7 @@ import chisel3._
 import chisel3.util._
 import org.chipsalliance.cde.config.{Field, Parameters}
 import freechips.rocketchip.diplomacy._
-import freechips.rocketchip.util.MultiPortQueue
+import freechips.rocketchip.util.{Code, MultiPortQueue, OnePortLanePositionedQueue}
 import freechips.rocketchip.unittest._
 import freechips.rocketchip.tilelink._
 
@@ -133,7 +133,7 @@ object DefaultCoalescerConfig extends CoalescerConfig(
   // when attaching to SoC, 16 source IDs are not enough due to longer latency
   numOldSrcIds = 8,
   numNewSrcIds = 8,
-  respQueueDepth = 2,
+  respQueueDepth = 4,
   sizeEnum = DefaultInFlightTableSizeEnum,
   numCoalReqs = 1,
   numArbiterOutputPorts = 4,
@@ -392,24 +392,12 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig)
   // eltPrototype.bits := DontCare
   // eltPrototype.valid := false.B
 
-  val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen))))
+  val elts = RegInit(0.U.asTypeOf(Vec(config.numLanes, Vec(entries, Valid(gen)))))
   val writePtr = RegInit(
     VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W)))
   )
   val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B)))
 
-  private def resetElts = {
-    elts.foreach { laneQ =>
-      laneQ.foreach { entry =>
-        entry.valid := false.B
-        entry.bits := DontCare
-      }
-    }
-  }
-  when(reset.asBool) {
-    resetElts
-  }
-
   val controlSignals = Wire(Vec(config.numLanes, new Bundle {
     val shift = Bool()
     val full = Bool()
@@ -1046,6 +1034,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
     log2Ceil(config.maxCoalLogSize),
     (1 << config.maxCoalLogSize) * 8
   )
+  require(config.respQueueDepth >= 4, "MultiPortQueue requires depth of at least 4 in FPGAs")
   val respQueues = Seq.tabulate(config.numLanes) { _ =>
     Module(
       new MultiPortQueue(
@@ -1068,7 +1057,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
         // make queue block up in the middle of the simulation.  Ideally there
         // should be a more logical way to set this, or we should handle
         // response queue blocking.
-        config.respQueueDepth
+        config.respQueueDepth,
+        flow = false,
+        // storage = OnePortLanePositionedQueue(Code.fromString("identity"))
       )
     )
   }
diff --git a/src/main/scala/radiance/memory/VortexCache.scala b/src/main/scala/radiance/memory/VortexCache.scala
index 0c95eb4..850958f 100644
--- a/src/main/scala/radiance/memory/VortexCache.scala
+++ b/src/main/scala/radiance/memory/VortexCache.scala
@@ -101,12 +101,7 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
     clients = Seq(
       TLMasterParameters.v1(
         name = "VortexBankPassthrough",
-        sourceId = IdRange(
-          0,
-          1 << (log2Ceil(
-            config.memSideSourceIds
-          ) + 5 /*FIXME: give more sourceId so that passthrough doesn't block; hacky*/ )
-        ),
+        sourceId = IdRange(0, 1 << config.coreTagWidth),
         supportsProbe = TransferSizes(1, config.cacheLineSize),
         supportsGet = TransferSizes(1, config.cacheLineSize),
         supportsPutFull = TransferSizes(1, config.cacheLineSize),
@@ -236,7 +231,7 @@ class VortexBankImp(
   }
 
   class ReadReqInfo(config: VortexL1Config) extends Bundle {
-    val size = UInt(log2Ceil(config.inputSize + 1).W)
+    val size = UInt(log2Ceil(4).W + 1)
     val id = UInt(config.coreTagWidth.W)
   }
 
@@ -273,6 +268,10 @@ class VortexBankImp(
     // vxCache.io.core_req_tag
     readReqInfo.id := tlInFromCoal.a.bits.source
     readReqInfo.size := tlInFromCoal.a.bits.size
+    assert(readReqInfo.id.getWidth == tlInFromCoal.a.bits.source.getWidth,
+      s"id width mismatch; coalescer ${tlInFromCoal.a.bits.source.getWidth}, cache ${readReqInfo.id.getWidth}")
+    assert(readReqInfo.size.getWidth == tlInFromCoal.a.bits.size.getWidth,
+      s"size width mismatch; coalescer ${tlInFromCoal.a.bits.size.getWidth}, cache ${readReqInfo.size.getWidth}")
     // ignore param, size, corrupt
     vxCache.io.core_req_tag := readReqInfo.asTypeOf(vxCache.io.core_req_tag)
 
diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala
index 1a576cb..fc24a3b 100644
--- a/src/main/scala/radiance/subsystem/Configs.scala
+++ b/src/main/scala/radiance/subsystem/Configs.scala
@@ -178,8 +178,8 @@ class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => {
       numBanks = nBanks,
       inputSize = up(SIMTCoreKey).get.nMemLanes * 4,
       cacheLineSize = up(SIMTCoreKey).get.nMemLanes * 4,
-      memSideSourceIds = 64,
-      mshrSize = 64,
+      memSideSourceIds = 16,
+      mshrSize = 16,
       coreTagWidth = log2Ceil(up(SIMTCoreKey).get.nSrcIds.max(up(CoalescerKey) match {
         case Some(key) => key.numNewSrcIds
         case None => 0
diff --git a/src/main/scala/radiance/tile/AccNode.java b/src/main/scala/radiance/tile/AccNode.java
new file mode 100644
index 0000000..99697d6
--- /dev/null
+++ b/src/main/scala/radiance/tile/AccNode.java
@@ -0,0 +1,2 @@
+package radiance.tile;public class AccNode {
+}
diff --git a/src/main/scala/radiance/tile/AccNode.scala b/src/main/scala/radiance/tile/AccNode.scala
new file mode 100644
index 0000000..4dd0b3e
--- /dev/null
+++ b/src/main/scala/radiance/tile/AccNode.scala
@@ -0,0 +1,31 @@
+package radiance.tile
+import chisel3._
+import chisel3.experimental.SourceInfo
+import chisel3.util._
+import org.chipsalliance.cde.config.Parameters
+import org.chipsalliance.diplomacy._
+import org.chipsalliance.diplomacy.nodes._
+
+/** Accelerator port as driven by the master side: a valid-qualified 32-bit command
+  * going out and a 1-bit status coming back.
+  */
+class AccBundle() extends Bundle {
+  val cmd = Output(Valid(UInt(32.W)))
+  val status = Input(UInt(1.W))
+}
+
+/** Empty parameter type; accelerator nodes carry no negotiated parameters. */
+case class NullParams()
+
+/** Diplomacy node implementation whose edges always materialize an [[AccBundle]]. */
+object AcceleratorNodeImp extends SimpleNodeImp[NullParams, NullParams, NullParams, AccBundle] {
+  def bundle(x: NullParams) = new AccBundle()
+  def edge(x: NullParams, y: NullParams, p: Parameters, sourceInfo: SourceInfo): NullParams = NullParams()
+  def render(x: NullParams): RenderedEdge = RenderedEdge("ffffff")
+}
+
+/** Source (master) end of an accelerator link. */
+case class AccMasterNode()(implicit valName: ValName) extends SourceNode(AcceleratorNodeImp)(Seq(NullParams()))
+/** Sink (slave) end of an accelerator link. */
+case class AccSlaveNode()(implicit valName: ValName) extends SinkNode(AcceleratorNodeImp)(Seq(NullParams()))
+
diff --git a/src/main/scala/radiance/tile/GemminiTile.scala b/src/main/scala/radiance/tile/GemminiTile.scala
index 897a314..248ece0 100644
--- a/src/main/scala/radiance/tile/GemminiTile.scala
+++ b/src/main/scala/radiance/tile/GemminiTile.scala
@@ -4,7 +4,9 @@ package radiance.tile
 
 
 import chisel3._
-import freechips.rocketchip.diplomacy.{ClockCrossingType, DisableMonitors, LazyModule, SimpleDevice}
+import chisel3.util._
+import chisel3.experimental.BundleLiterals._
+import freechips.rocketchip.diplomacy.{BigIntHexContext, ClockCrossingType, DisableMonitors, LazyModule, SimpleDevice}
 import freechips.rocketchip.prci.ClockSinkParameters
 import freechips.rocketchip.rocket._
 import freechips.rocketchip.subsystem.{CanAttachTile, HierarchicalElementCrossingParamsLike, RocketCrossingParams}
@@ -102,6 +104,8 @@ class GemminiTile private (
   val masterNode = visibilityNode
   // val statusNode = BundleBridgeSource(() => new GroundTestStatus)
 
+  val accSlaveNode = AccSlaveNode()
+
   tlOtherMastersNode := tlMasterXbar.node
   masterNode :=* tlOtherMastersNode
   DisableMonitors { implicit p => tlSlaveXbar.node :*= slaveNode }
@@ -129,8 +133,6 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
     val gemmini_io = outer.gemmini.module.io
     gemmini_io.ptw <> DontCare
     gemmini_io.mem <> DontCare
-    gemmini_io.cmd <> DontCare
-    gemmini_io.cmd.valid := false.B
     gemmini_io.resp <> DontCare
     gemmini_io.fpu_req.ready := false.B
     gemmini_io.fpu_resp.valid := false.B
@@ -140,6 +142,90 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
 
   tieOffGemminiRocc
 
+  val accSlave = outer.accSlaveNode.in.head._1
+
+  // CISC command sequencer: a valid command on the accelerator port latches its low
+  // byte as the microprogram id; while ciscValid is set, the selected microprogram is
+  // driven onto the Gemmini command port one entry per cycle.
+  val instCounter = Counter(4)
+  val ciscValid = RegInit(false.B)
+  val ciscId = RegInit(0.U(8.W))
+  val ciscInstT = new Bundle {
+    val inst = UInt(32.W)
+    val rs1 = UInt(64.W)
+    val rs2 = UInt(64.W)
+  }
+  val ciscInst = Wire(ciscInstT)
+
+  when (accSlave.cmd.valid) {
+    ciscValid := true.B
+    ciscId := accSlave.cmd.bits(7, 0)
+    instCounter.reset()
+  }
+
+  def microcodeEntry[T <: Data](insts: Seq[T]): T = {
+    when (instCounter.value === (insts.size - 1).U) {
+      ciscValid := false.B
+      instCounter.reset()
+    }.otherwise {
+      instCounter.inc()
+    }
+    VecInit(insts)(instCounter.value)
+  }
+
+  ciscInst := 0.U.asTypeOf(ciscInstT)
+  when (ciscValid) {
+    assert(!accSlave.cmd.valid, "cisc state machine already busy")
+    // NOTE(review): an id with no matching `is` case never clears ciscValid, so the
+    // all-zero instruction keeps being issued; confirm callers only send ids listed here.
+    switch (ciscId) {
+      is (0.U) {
+        ciscInst := microcodeEntry(Seq(
+          ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U), // set I, J, K
+          ciscInstT.Lit(_.inst -> 0x3020b07b.U, _.rs1 -> 0.U, _.rs2 -> 0x180.U), // set A, B address
+          ciscInstT.Lit(_.inst -> 0x1020b07b.U, _.rs1 -> 0.U, _.rs2 -> x"0_000002b8".U) // set skip, acc
+        ))
+      }
+      is (2.U) {
+        ciscInst := microcodeEntry(Seq(
+          ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U),
+          ciscInstT.Lit(_.inst -> 0x3020b07b.U, _.rs1 -> 0x80.U, _.rs2 -> 0x200.U),
+          ciscInstT.Lit(_.inst -> 0x1020b07b.U, _.rs1 -> 0x1.U, _.rs2 -> x"0_000002b8".U)
+        ))
+      }
+      is (1.U) {
+        ciscInst := microcodeEntry(Seq(
+          ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U),
+          ciscInstT.Lit(_.inst -> 0x3020b07b.U, _.rs1 -> 0.U, _.rs2 -> 0x180.U),
+          ciscInstT.Lit(_.inst -> 0x1020b07b.U, _.rs1 -> 0x1.U, _.rs2 -> x"0_000002b8".U)
+        ))
+      }
+      is (9.U) {
+        ciscInst := microcodeEntry(Seq(
+          ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U),
+          ciscInstT.Lit(_.inst -> 0x1020b07b.U, _.rs1 -> 0.U, _.rs2 -> 0x278.U),
+        ))
+      }
+      is (16.U) {
+        ciscInst := microcodeEntry(Seq(
+          ciscInstT.Lit(_.inst -> 0x0020b07b.U, _.rs1 -> x"3f800000_00080101".U, _.rs2 -> 0.U),
+          ciscInstT.Lit(_.inst -> 0x0020b07b.U, _.rs1 -> x"3f800000_00010004".U, _.rs2 -> x"10000_00000000".U),
+          ciscInstT.Lit(_.inst -> 0x0020b07b.U, _.rs1 -> 0x2.U, _.rs2 -> x"3f800000_00000000".U)
+        ))
+      }
+    }
+  }
+
+  val gemminiIO = outer.gemmini.module.io.cmd
+  gemminiIO.bits.status := 0.U.asTypeOf(gemminiIO.bits.status)
+  gemminiIO.bits.inst := ciscInst.inst.asTypeOf(gemminiIO.bits.inst)
+  gemminiIO.bits.rs1 := ciscInst.rs1
+  gemminiIO.bits.rs2 := ciscInst.rs2
+  gemminiIO.valid := ciscValid
+  assert(gemminiIO.ready || !gemminiIO.valid)
+
+  accSlave.status := RegNext(outer.gemmini.module.io.busy).asUInt
+
   outer.traceSourceNode.bundle := DontCare
   outer.traceSourceNode.bundle.insns foreach (_.valid := false.B)
 
diff --git a/src/main/scala/radiance/tile/RadianceCluster.scala b/src/main/scala/radiance/tile/RadianceCluster.scala
index cb50a32..172e9aa 100644
--- a/src/main/scala/radiance/tile/RadianceCluster.scala
+++ b/src/main/scala/radiance/tile/RadianceCluster.scala
@@ -84,6 +84,7 @@ class RadianceCluster (
   val stride_by_word = true
   val filter_aligned = true
   val disable_monitors = true // otherwise it generate 1k+ different tl monitors
+  val serialize_unaligned = true
 
   def guard_monitors[T](callback: Parameters => T)(implicit p: Parameters): Unit = {
     if (disable_monitors) {
@@ -226,23 +227,25 @@ class RadianceCluster (
         }
       }
       val f_aligned = Seq.fill(2)(filter_nodes.map(_.map(_._1).map(connect_xbar_name(_, Some("rad_aligned")))))
-      // val f_unaligned = Seq.fill(2)(filter_nodes.map(_.map(_._2).map(connect_xbar)))
-      val f_unaligned = Seq.fill(2) {
-        val serialized_node = TLEphemeralNode()
-        val serialized_in_xbar = LazyModule(new TLXbar())
-        val serialized_out_xbar = LazyModule(new TLXbar())
-        serialized_in_xbar.suggestName("unaligned_serialized_in_xbar")
-        serialized_out_xbar.suggestName("unaligned_serialized_out_xbar")
-        guard_monitors { implicit p =>
-          filter_nodes.foreach(_.map(_._2).foreach(serialized_in_xbar.node := _))
-          serialized_node := serialized_in_xbar.node
-          serialized_out_xbar.node := serialized_node
+      val f_unaligned = if (serialize_unaligned) {
+        Seq.fill(2) {
+          val serialized_node = TLEphemeralNode()
+          val serialized_in_xbar = LazyModule(new TLXbar())
+          val serialized_out_xbar = LazyModule(new TLXbar())
+          serialized_in_xbar.suggestName("unaligned_serialized_in_xbar")
+          serialized_out_xbar.suggestName("unaligned_serialized_out_xbar")
+          guard_monitors { implicit p =>
+            filter_nodes.foreach(_.map(_._2).foreach(serialized_in_xbar.node := _))
+            serialized_node := serialized_in_xbar.node
+            serialized_out_xbar.node := serialized_node
+          }
+          Seq(serialized_out_xbar.node)
         }
-        Seq(serialized_out_xbar.node)
+      } else {
+        Seq.fill(2)(filter_nodes.flatMap(_.map(_._2).map(connect_xbar)))
       }
-
       val uniform_r_nodes: Seq[Seq[Seq[TLNode]]] = spad_read_nodes.map { rb =>
         (rb zip f_aligned.head).map {
           case (rw, fa) => Seq(rw) ++ fa
         }
       }
@@ -253,7 +256,7 @@ class RadianceCluster (
       }
 
       // all to all xbar
-      val Seq(nonuniform_r_nodes, nonuniform_w_nodes) = f_unaligned // f_unaligned.map(_.flatten)
+      val Seq(nonuniform_r_nodes, nonuniform_w_nodes) = f_unaligned
 
       (uniform_r_nodes, uniform_w_nodes, nonuniform_r_nodes, nonuniform_w_nodes)
     } else {
@@ -321,6 +324,12 @@ class RadianceCluster (
 
   // connect tile smem nodes to xbar, and xbar to banks
   // val smem_xbar = TLXbar()
+
+  val radianceAccSlaveNodes = Seq.fill(numCores)(AccSlaveNode())
+  (radianceAccSlaveNodes zip radianceTiles).foreach { case (a, r) => a := r.accMasterNode }
+  val gemminiAccMasterNode = AccMasterNode()
+  gemminiTile.accSlaveNode := gemminiAccMasterNode
+
   gemminiTile.slaveNode :=* TLWidthWidget(4) :=* clbus.outwardNode
 
   assert(smem_size == 0x4000, "fix me")
@@ -379,6 +388,19 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp(
     b.resp <> synchronizer.io.resp // broadcast
   }
 
+  // NOTE(review): only the first core's cmd drives Gemmini; status is broadcast to every core.
+  val coreAcc = outer.radianceAccSlaveNodes.head.in.head._1
+  val gemminiAcc = outer.gemminiAccMasterNode.out.head._1
+  dontTouch(gemminiAcc)
+  // val gemminiTileAcc = outer.gemminiTile.accSlaveNode.in.head._1
+
+  // gemminiTileAcc.cmd := gemminiAcc.cmd
+  // gemminiAcc.status := gemminiTileAcc.status
+
+  outer.radianceAccSlaveNodes.foreach(_.in.head._1.status := gemminiAcc.status)
+  gemminiAcc.cmd := coreAcc.cmd
+
+  // TODO: remove Pipeline dependency of gemmini
   def makeSmemBanks(): Unit = {
     def make_buffer[T <: Data](mem: TwoPortSyncMem[T],
                                r_node: TLBundle, r_edge: TLEdgeIn,
diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala
index 2cfcc5c..502371e 100644
--- a/src/main/scala/radiance/tile/RadianceTile.scala
+++ b/src/main/scala/radiance/tile/RadianceTile.scala
@@ -6,6 +6,7 @@ package radiance.tile
 import chisel3._
 import chisel3.util._
 import freechips.rocketchip.devices.tilelink._
+import org.chipsalliance.diplomacy._
 import freechips.rocketchip.diplomacy._
 import freechips.rocketchip.prci.ClockSinkParameters
 import freechips.rocketchip.regmapper.RegField
@@ -192,8 +193,9 @@ class RadianceTile private (
   }
 
   val imemTagWidth = UUID_WIDTH + NW_WIDTH
-  val LSUQ_SIZE = 8 * numWarps * (numCoreLanes / numLsuLanes)
-  assert(LSUQ_SIZE == p(SIMTCoreKey).get.nSrcIds)
+  // val LSUQ_SIZE = 4 * numWarps * (numCoreLanes / numLsuLanes)
+  // assert(LSUQ_SIZE == p(SIMTCoreKey).get.nSrcIds)
+  val LSUQ_SIZE = p(SIMTCoreKey).get.nSrcIds
   val LSUQ_TAG_BITS = log2Ceil(LSUQ_SIZE) + 1 /*DCACHE_BATCH_SEL_BITS*/
   val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS // dmem and smem shares the same tag width, DCACHE_NOSM_TAG_WIDTH
 
@@ -314,7 +316,10 @@ class RadianceTile private (
   //   "Vortex L1 configuration currently only works when coalescer is also enabled."
   // )
 
-  val icache = LazyModule(new VortexL1Cache(vortexL1Config.copy(numBanks = 1)))
+  val icache = LazyModule(new VortexL1Cache(vortexL1Config.copy(
+    numBanks = 1,
+    coreTagWidth = imemSourceWidth
+  )))
   val dcache = LazyModule(new VortexL1Cache(vortexL1Config))
   // imemNodes.foreach { icache.coresideNode := TLWidthWidget(4) := _ }
   assert(imemNodes.length == 1) // FIXME
@@ -337,6 +342,8 @@ class RadianceTile private (
   def barrierIdBits = log2Ceil(numBarriers)
   val barrierMasterNode = BarrierMasterNode(barrierIdBits)
 
+  val accMasterNode = AccMasterNode()
+
   val base = p(GPUMemory()) match {
     case Some(GPUMemParams(baseAddr, _)) => baseAddr
     case _ => BigInt(0)
@@ -366,7 +373,7 @@ class RadianceTile private (
     _.node := tlMasterXbar.node
   } getOrElse { tlMasterXbar.node }
   masterNode :=* tlOtherMastersNode
-  DisableMonitors { implicit p => tlSlaveXbar.node :*= slaveNode }
+  org.chipsalliance.diplomacy.DisableMonitors { implicit p => tlSlaveXbar.node :*= slaveNode }
 
   val dtimProperty = Nil // Seq(dmemDevice.asProperty).flatMap(p => Map("sifive,dtim" -> p))
 
@@ -685,6 +692,12 @@ class RadianceTileModuleImp(outer: RadianceTile)
     outer.barrierMasterNode.out(0)._1.resp.ready := true.B
   }
 
+  def connectAccelerator: Unit = {
+    outer.accMasterNode.out.head._1.cmd.bits := core.io.acc_write_out
+    outer.accMasterNode.out.head._1.cmd.valid := core.io.acc_write_en
+    core.io.acc_read_in := outer.accMasterNode.out.head._1.status
+  }
+
   def performanceCounters(reqBundles: Seq[DecoupledIO[VortexBundleA]],
                           respBundles: Seq[DecoupledIO[VortexBundleD]],
                           desc: String) = {
@@ -721,6 +734,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
     connectDmem
     connectSmem
     connectBarrier
+    connectAccelerator
   }
 
   // TODO: generalize for useVxCache
diff --git a/src/main/scala/radiance/tile/VortexCore.scala b/src/main/scala/radiance/tile/VortexCore.scala
index ba74508..101c6c3 100644
--- a/src/main/scala/radiance/tile/VortexCore.scala
+++ b/src/main/scala/radiance/tile/VortexCore.scala
@@ -100,6 +100,11 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl
   val gbar_rsp_valid = Input(Bool())
   val gbar_rsp_id = Input(UInt(barrierIdBits.W))
 
+  // accelerator port: write_out/write_en form the outgoing command, read_in carries status back
+  val acc_read_in = Input(UInt(32.W))
+  val acc_write_out = Output(UInt(32.W))
+  val acc_write_en = Output(Bool())
+
   // val fpu = Flipped(new FPUCoreIO())
   //val rocc = Flipped(new RoCCCoreIO(nTotalRoCCCSRs))
   //val trace = Output(new TraceBundle)