accelerator CISC, FPGA ready

This commit is contained in:
Richard Yan
2024-05-07 13:51:09 -07:00
parent a915451d03
commit c916c2052d
9 changed files with 185 additions and 46 deletions

View File

@@ -6,7 +6,7 @@ import chisel3._
import chisel3.util._
import org.chipsalliance.cde.config.{Field, Parameters}
import freechips.rocketchip.diplomacy._
import freechips.rocketchip.util.MultiPortQueue
import freechips.rocketchip.util.{Code, MultiPortQueue, OnePortLanePositionedQueue}
import freechips.rocketchip.unittest._
import freechips.rocketchip.tilelink._
@@ -133,7 +133,7 @@ object DefaultCoalescerConfig extends CoalescerConfig(
// when attaching to SoC, 16 source IDs are not enough due to longer latency
numOldSrcIds = 8,
numNewSrcIds = 8,
respQueueDepth = 2,
respQueueDepth = 4,
sizeEnum = DefaultInFlightTableSizeEnum,
numCoalReqs = 1,
numArbiterOutputPorts = 4,
@@ -392,24 +392,12 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig)
// eltPrototype.bits := DontCare
// eltPrototype.valid := false.B
val elts = Reg(Vec(config.numLanes, Vec(entries, Valid(gen))))
val elts = RegInit(0.U.asTypeOf(Vec(config.numLanes, Vec(entries, Valid(gen)))))
val writePtr = RegInit(
VecInit(Seq.fill(config.numLanes)(0.asUInt(log2Ceil(entries + 1).W)))
)
val deqDone = RegInit(VecInit(Seq.fill(config.numLanes)(false.B)))
private def resetElts = {
elts.foreach { laneQ =>
laneQ.foreach { entry =>
entry.valid := false.B
entry.bits := DontCare
}
}
}
when(reset.asBool) {
resetElts
}
val controlSignals = Wire(Vec(config.numLanes, new Bundle {
val shift = Bool()
val full = Bool()
@@ -1046,6 +1034,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
log2Ceil(config.maxCoalLogSize),
(1 << config.maxCoalLogSize) * 8
)
// Elaboration-time guard on the response queue depth. The message states a minimum of 4,
// so the check enforces that bound; the previous `> 2` form also accepted a depth of 3.
require(config.respQueueDepth >= 4, "MultiPortQueue requires depth of at least 4 in FPGAs")
val respQueues = Seq.tabulate(config.numLanes) { _ =>
Module(
new MultiPortQueue(
@@ -1068,7 +1057,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig)
// make queue block up in the middle of the simulation. Ideally there
// should be a more logical way to set this, or we should handle
// response queue blocking.
config.respQueueDepth
config.respQueueDepth,
flow = false,
// storage = OnePortLanePositionedQueue(Code.fromString("identity"))
)
)
}

View File

@@ -101,12 +101,7 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
clients = Seq(
TLMasterParameters.v1(
name = "VortexBankPassthrough",
sourceId = IdRange(
0,
1 << (log2Ceil(
config.memSideSourceIds
) + 5 /*FIXME: give more sourceId so that passthrough doesn't block; hacky*/ )
),
sourceId = IdRange(0, 1 << config.coreTagWidth),
supportsProbe = TransferSizes(1, config.cacheLineSize),
supportsGet = TransferSizes(1, config.cacheLineSize),
supportsPutFull = TransferSizes(1, config.cacheLineSize),
@@ -236,7 +231,7 @@ class VortexBankImp(
}
class ReadReqInfo(config: VortexL1Config) extends Bundle {
val size = UInt(log2Ceil(config.inputSize + 1).W)
val size = UInt(log2Ceil(4).W + 1)
val id = UInt(config.coreTagWidth.W)
}
@@ -273,6 +268,14 @@ class VortexBankImp(
// vxCache.io.core_req_tag
readReqInfo.id := tlInFromCoal.a.bits.source
readReqInfo.size := tlInFromCoal.a.bits.size
// Width checks on the fields just copied from the coalescer-side A channel: each
// readReqInfo field must be exactly as wide as the TileLink field assigned to it.
// (The same pair of checks was previously written out twice; one copy is enough.)
assert(readReqInfo.id.getWidth == tlInFromCoal.a.bits.source.getWidth,
  s"id width mismatch; coalescer ${tlInFromCoal.a.bits.source.getWidth}, cache ${readReqInfo.id.getWidth}")
assert(readReqInfo.size.getWidth == tlInFromCoal.a.bits.size.getWidth,
  s"size width mismatch; coalescer ${tlInFromCoal.a.bits.size.getWidth}, cache ${readReqInfo.size.getWidth}")
// ignore param, size, corrupt
vxCache.io.core_req_tag := readReqInfo.asTypeOf(vxCache.io.core_req_tag)

View File

@@ -178,8 +178,8 @@ class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => {
numBanks = nBanks,
inputSize = up(SIMTCoreKey).get.nMemLanes * 4,
cacheLineSize = up(SIMTCoreKey).get.nMemLanes * 4,
memSideSourceIds = 64,
mshrSize = 64,
memSideSourceIds = 16,
mshrSize = 16,
coreTagWidth = log2Ceil(up(SIMTCoreKey).get.nSrcIds.max(up(CoalescerKey) match {
case Some(key) => key.numNewSrcIds
case None => 0

View File

@@ -0,0 +1,2 @@
package radiance.tile;public class AccNode {
}

View File

@@ -0,0 +1,23 @@
package radiance.tile;
import chisel3._
import chisel3.experimental.SourceInfo
import chisel3.util._
import org.chipsalliance.cde.config.Parameters
import org.chipsalliance.diplomacy._
import org.chipsalliance.diplomacy.nodes._
/** Command/status signals carried over an accelerator node connection.
  *
  * Directions are declared from the side that issues commands: `cmd` is driven out and
  * `status` is read back.
  */
class AccBundle() extends Bundle {
// 32-bit command word qualified by a valid bit (Valid has no ready signal, so no back-pressure).
val cmd = Output(Valid(UInt(32.W)))
// Single status bit returned to the command issuer.
val status = Input(UInt(1.W))
}
/** Field-less placeholder used as the downward, upward and edge parameter type of the accelerator nodes. */
case class NullParams()
/** Diplomacy node implementation for the accelerator command/status link.
  *
  * No parameters are negotiated: every parameter and edge type is the empty [[NullParams]],
  * and every connection carries an [[AccBundle]].
  */
object AcceleratorNodeImp extends SimpleNodeImp[NullParams, NullParams, NullParams, AccBundle] {
  /** Creates the bundle for an edge; the edge parameter is ignored. */
  def bundle(x: NullParams): AccBundle = new AccBundle()

  /** Builds the edge parameters, which are always a fresh empty [[NullParams]]. */
  def edge(x: NullParams, y: NullParams, p: Parameters, sourceInfo: SourceInfo): NullParams = NullParams()

  /** Edge colour for the rendered graph: white, written in the "#rrggbb" form (the leading
    * '#' was previously missing).
    */
  def render(x: NullParams): RenderedEdge = RenderedEdge("#ffffff")
}
/** Source (command-issuing) end of an accelerator link: a SourceNode over AcceleratorNodeImp with a single placeholder port. */
case class AccMasterNode()(implicit valName: ValName) extends SourceNode(AcceleratorNodeImp)(Seq(NullParams()))
/** Sink (command-receiving) end of an accelerator link: a SinkNode over AcceleratorNodeImp with a single placeholder port. */
case class AccSlaveNode()(implicit valName: ValName) extends SinkNode(AcceleratorNodeImp)(Seq(NullParams()))

View File

@@ -4,7 +4,9 @@
package radiance.tile
import chisel3._
import freechips.rocketchip.diplomacy.{ClockCrossingType, DisableMonitors, LazyModule, SimpleDevice}
import chisel3.util._
import chisel3.experimental.BundleLiterals._
import freechips.rocketchip.diplomacy.{BigIntHexContext, ClockCrossingType, DisableMonitors, LazyModule, SimpleDevice}
import freechips.rocketchip.prci.ClockSinkParameters
import freechips.rocketchip.rocket._
import freechips.rocketchip.subsystem.{CanAttachTile, HierarchicalElementCrossingParamsLike, RocketCrossingParams}
@@ -102,6 +104,8 @@ class GemminiTile private (
val masterNode = visibilityNode
// val statusNode = BundleBridgeSource(() => new GroundTestStatus)
val accSlaveNode = AccSlaveNode()
tlOtherMastersNode := tlMasterXbar.node
masterNode :=* tlOtherMastersNode
DisableMonitors { implicit p => tlSlaveXbar.node :*= slaveNode }
@@ -129,8 +133,6 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
val gemmini_io = outer.gemmini.module.io
gemmini_io.ptw <> DontCare
gemmini_io.mem <> DontCare
gemmini_io.cmd <> DontCare
gemmini_io.cmd.valid := false.B
gemmini_io.resp <> DontCare
gemmini_io.fpu_req.ready := false.B
gemmini_io.fpu_resp.valid := false.B
@@ -140,6 +142,85 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
tieOffGemminiRocc
// Accelerator command/status port of this tile, as seen from the receiving side.
val accSlave = outer.accSlaveNode.in.head._1

// ---- CISC microcode sequencer ----
// A valid accSlave.cmd latches the low 8 bits of the command word as a sequence id and
// starts the sequencer. While ciscValid is high, one instruction of the selected sequence
// is offered to Gemmini per cycle, indexed by instCounter.
val instCounter = Counter(4) // a sequence can hold at most 4 instructions
val ciscValid = RegInit(false.B)
val ciscId = RegInit(0.U(8.W))
val ciscInstT = new Bundle {
  val inst = UInt(32.W)
  val rs1 = UInt(64.W)
  val rs2 = UInt(64.W)
}
val ciscInst = Wire(ciscInstT)
// High only in cycles where a microcode sequence is actually being issued, i.e. ciscValid
// is set AND ciscId matches one of the `is` branches below (it is raised by microcodeEntry).
val ciscIssue = WireDefault(false.B)

when (accSlave.cmd.valid) {
  ciscValid := true.B
  ciscId := accSlave.cmd.bits(7, 0)
  instCounter.reset()
}

/** Selects the current instruction of `insts` and advances the sequencer.
  *
  * Must be called inside the `is` branch of the sequence it serves: the hardware it
  * generates (counter step, end-of-sequence clear of ciscValid, ciscIssue flag) is active
  * only under the enclosing conditions.
  *
  * @param insts the instructions of one sequence, in issue order
  * @return the entry of `insts` indexed by the current counter value
  */
def microcodeEntry[T <: Data](insts: Seq[T]): T = {
  ciscIssue := true.B
  when (instCounter.value === (insts.size - 1).U) {
    // last instruction of the sequence: go idle
    ciscValid := false.B
    instCounter.reset()
  }.otherwise {
    instCounter.inc()
  }
  VecInit(insts)(instCounter.value)
}

ciscInst := 0.U.asTypeOf(ciscInstT)
when (ciscValid) {
  assert(!accSlave.cmd.valid, "cisc state machine already busy")
  switch (ciscId) {
    is (0.U) {
      ciscInst := microcodeEntry(Seq(
        ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U), // set I, J, K
        ciscInstT.Lit(_.inst -> 0x3020b07b.U, _.rs1 -> 0.U, _.rs2 -> 0x180.U), // set A, B address
        ciscInstT.Lit(_.inst -> 0x1020b07b.U, _.rs1 -> 0.U, _.rs2 -> x"0_000002b8".U) // set skip, acc
      ))
    }
    is (2.U) {
      ciscInst := microcodeEntry(Seq(
        ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U),
        ciscInstT.Lit(_.inst -> 0x3020b07b.U, _.rs1 -> 0x80.U, _.rs2 -> 0x200.U),
        ciscInstT.Lit(_.inst -> 0x1020b07b.U, _.rs1 -> 0x1.U, _.rs2 -> x"0_000002b8".U)
      ))
    }
    is (1.U) {
      ciscInst := microcodeEntry(Seq(
        ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U),
        ciscInstT.Lit(_.inst -> 0x3020b07b.U, _.rs1 -> 0.U, _.rs2 -> 0x180.U),
        ciscInstT.Lit(_.inst -> 0x1020b07b.U, _.rs1 -> 0x1.U, _.rs2 -> x"0_000002b8".U)
      ))
    }
    is (9.U) {
      ciscInst := microcodeEntry(Seq(
        ciscInstT.Lit(_.inst -> 0x1220b07b.U, _.rs1 -> 0.U, _.rs2 -> x"4_00040004".U),
        ciscInstT.Lit(_.inst -> 0x1020b07b.U, _.rs1 -> 0.U, _.rs2 -> 0x278.U)
      ))
    }
    is (16.U) {
      ciscInst := microcodeEntry(Seq(
        ciscInstT.Lit(_.inst -> 0x0020b07b.U, _.rs1 -> x"3f800000_00080101".U, _.rs2 -> 0.U),
        ciscInstT.Lit(_.inst -> 0x0020b07b.U, _.rs1 -> x"3f800000_00010004".U, _.rs2 -> x"10000_00000000".U),
        ciscInstT.Lit(_.inst -> 0x0020b07b.U, _.rs1 -> 0x2.U, _.rs2 -> x"3f800000_00000000".U)
      ))
    }
  }
  // No branch matched ciscId: drop the command. Without this, ciscValid was never cleared
  // for an unknown id and an all-zero command was offered to Gemmini on every cycle.
  when (!ciscIssue) {
    ciscValid := false.B
  }
}

val gemminiIO = outer.gemmini.module.io.cmd
gemminiIO.bits.status := 0.U.asTypeOf(gemminiIO.bits.status)
gemminiIO.bits.inst := ciscInst.inst.asTypeOf(gemminiIO.bits.inst)
gemminiIO.bits.rs1 := ciscInst.rs1
gemminiIO.bits.rs2 := ciscInst.rs2
// Equals ciscValid for every known id; stays low while an unknown id is being dropped.
gemminiIO.valid := ciscIssue
// The sequencer advances every cycle without looking at ready, so Gemmini must accept
// each instruction in the cycle it is offered.
assert(gemminiIO.ready || !gemminiIO.valid)

// Status bit reported back over the accelerator port: Gemmini's busy flag, delayed one cycle.
accSlave.status := RegNext(outer.gemmini.module.io.busy).asUInt
outer.traceSourceNode.bundle := DontCare
outer.traceSourceNode.bundle.insns foreach (_.valid := false.B)

View File

@@ -84,6 +84,7 @@ class RadianceCluster (
val stride_by_word = true
val filter_aligned = true
val disable_monitors = true // otherwise it generate 1k+ different tl monitors
val serialize_unaligned = true
def guard_monitors[T](callback: Parameters => T)(implicit p: Parameters): Unit = {
if (disable_monitors) {
@@ -226,9 +227,9 @@ class RadianceCluster (
}
}
val f_aligned = Seq.fill(2)(filter_nodes.map(_.map(_._1).map(connect_xbar_name(_, Some("rad_aligned")))))
// val f_unaligned = Seq.fill(2)(filter_nodes.map(_.map(_._2).map(connect_xbar)))
val f_unaligned = Seq.fill(2) {
val f_unaligned = if (serialize_unaligned) {
Seq.fill(2) {
val serialized_node = TLEphemeralNode()
val serialized_in_xbar = LazyModule(new TLXbar())
val serialized_out_xbar = LazyModule(new TLXbar())
@@ -241,7 +242,9 @@ class RadianceCluster (
}
Seq(serialized_out_xbar.node)
}
} else {
Seq.fill(2)(filter_nodes.flatMap(_.map(_._2).map(connect_xbar)))
}
val uniform_r_nodes: Seq[Seq[Seq[TLNode]]] = spad_read_nodes.map { rb =>
(rb zip f_aligned.head).map { case (rw, fa) => Seq(rw) ++ fa }
@@ -253,7 +256,7 @@ class RadianceCluster (
}
// all to all xbar
val Seq(nonuniform_r_nodes, nonuniform_w_nodes) = f_unaligned // f_unaligned.map(_.flatten)
val Seq(nonuniform_r_nodes, nonuniform_w_nodes) = f_unaligned
(uniform_r_nodes, uniform_w_nodes, nonuniform_r_nodes, nonuniform_w_nodes)
} else {
@@ -321,6 +324,12 @@ class RadianceCluster (
// connect tile smem nodes to xbar, and xbar to banks
// val smem_xbar = TLXbar()
// One accelerator sink node per core, bound position-by-position to the tiles' accelerator
// master nodes (zip pairs them in order; assumes radianceTiles has numCores entries -- TODO confirm).
val radianceAccSlaveNodes = Seq.fill(numCores)(AccSlaveNode())
(radianceAccSlaveNodes zip radianceTiles).foreach { case (a, r) => a := r.accMasterNode }
// Cluster-side master node that drives the Gemmini tile's accelerator slave node.
val gemminiAccMasterNode = AccMasterNode()
gemminiTile.accSlaveNode := gemminiAccMasterNode
gemminiTile.slaveNode :=* TLWidthWidget(4) :=* clbus.outwardNode
assert(smem_size == 0x4000, "fix me")
@@ -379,6 +388,18 @@ class RadianceClusterModuleImp(outer: RadianceCluster) extends ClusterModuleImp(
b.resp <> synchronizer.io.resp // broadcast
}
// Accelerator glue between the cores and Gemmini.
// Only the FIRST core's accelerator port is used as the command source; the cmd outputs of
// the remaining cores are not read here.
val coreAcc = outer.radianceAccSlaveNodes.head.in.head._1
val gemminiAcc = outer.gemminiAccMasterNode.out.head._1
// Marked dontTouch so the bundle is kept in the generated hardware.
dontTouch(gemminiAcc)
// val gemminiTileAcc = outer.gemminiTile.accSlaveNode.in.head._1
// gemminiTileAcc.cmd := gemminiAcc.cmd
// gemminiAcc.status := gemminiTileAcc.status
// Gemmini's status is broadcast to every core's accelerator port.
outer.radianceAccSlaveNodes.foreach(_.in.head._1.status := gemminiAcc.status)
// Commands from the first core are forwarded unchanged to Gemmini.
gemminiAcc.cmd := coreAcc.cmd
// TODO: remove Pipeline dependency of gemmini
def makeSmemBanks(): Unit = {
def make_buffer[T <: Data](mem: TwoPortSyncMem[T], r_node: TLBundle, r_edge: TLEdgeIn,

View File

@@ -6,6 +6,7 @@ package radiance.tile
import chisel3._
import chisel3.util._
import freechips.rocketchip.devices.tilelink._
import org.chipsalliance.diplomacy._
import freechips.rocketchip.diplomacy._
import freechips.rocketchip.prci.ClockSinkParameters
import freechips.rocketchip.regmapper.RegField
@@ -192,8 +193,9 @@ class RadianceTile private (
}
val imemTagWidth = UUID_WIDTH + NW_WIDTH
val LSUQ_SIZE = 8 * numWarps * (numCoreLanes / numLsuLanes)
assert(LSUQ_SIZE == p(SIMTCoreKey).get.nSrcIds)
// val LSUQ_SIZE = 4 * numWarps * (numCoreLanes / numLsuLanes)
// assert(LSUQ_SIZE == p(SIMTCoreKey).get.nSrcIds)
val LSUQ_SIZE = p(SIMTCoreKey).get.nSrcIds
val LSUQ_TAG_BITS = log2Ceil(LSUQ_SIZE) + 1 /*DCACHE_BATCH_SEL_BITS*/
val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS
// dmem and smem shares the same tag width, DCACHE_NOSM_TAG_WIDTH
@@ -314,7 +316,10 @@ class RadianceTile private (
// "Vortex L1 configuration currently only works when coalescer is also enabled."
// )
val icache = LazyModule(new VortexL1Cache(vortexL1Config.copy(numBanks = 1)))
val icache = LazyModule(new VortexL1Cache(vortexL1Config.copy(
numBanks = 1,
coreTagWidth = imemSourceWidth
)))
val dcache = LazyModule(new VortexL1Cache(vortexL1Config))
// imemNodes.foreach { icache.coresideNode := TLWidthWidget(4) := _ }
assert(imemNodes.length == 1) // FIXME
@@ -337,6 +342,8 @@ class RadianceTile private (
def barrierIdBits = log2Ceil(numBarriers)
val barrierMasterNode = BarrierMasterNode(barrierIdBits)
val accMasterNode = AccMasterNode()
val base = p(GPUMemory()) match {
case Some(GPUMemParams(baseAddr, _)) => baseAddr
case _ => BigInt(0)
@@ -366,7 +373,7 @@ class RadianceTile private (
_.node := tlMasterXbar.node
} getOrElse { tlMasterXbar.node }
masterNode :=* tlOtherMastersNode
DisableMonitors { implicit p => tlSlaveXbar.node :*= slaveNode }
org.chipsalliance.diplomacy.DisableMonitors { implicit p => tlSlaveXbar.node :*= slaveNode }
val dtimProperty =
Nil // Seq(dmemDevice.asProperty).flatMap(p => Map("sifive,dtim" -> p))
@@ -685,6 +692,12 @@ class RadianceTileModuleImp(outer: RadianceTile)
outer.barrierMasterNode.out(0)._1.resp.ready := true.B
}
/** Hooks the core's accelerator signals up to the tile's accelerator master port:
  * the write data and write enable become the command word and its valid bit, and the
  * port's status is fed back into the core's read input.
  */
def connectAccelerator: Unit = {
  val acc = outer.accMasterNode.out.head._1
  acc.cmd.valid := core.io.acc_write_en
  acc.cmd.bits := core.io.acc_write_out
  core.io.acc_read_in := acc.status
}
def performanceCounters(reqBundles: Seq[DecoupledIO[VortexBundleA]],
respBundles: Seq[DecoupledIO[VortexBundleD]],
desc: String) = {
@@ -721,6 +734,7 @@ class RadianceTileModuleImp(outer: RadianceTile)
connectDmem
connectSmem
connectBarrier
connectAccelerator
}
// TODO: generalize for useVxCache

View File

@@ -100,6 +100,10 @@ class VortexBundle(tile: RadianceTile)(implicit p: Parameters) extends CoreBundl
val gbar_rsp_valid = Input(Bool())
val gbar_rsp_id = Input(UInt(barrierIdBits.W))
// Accelerator interface of the core: a 32-bit value read from the accelerator, and a
// 32-bit value written to it together with its write-enable strobe.
// NOTE(review): the tile drives acc_read_in from a 1-bit status signal, so only bit 0
// appears to carry information -- confirm the intended width.
val acc_read_in = Input(UInt(32.W))
val acc_write_out = Output(UInt(32.W))
val acc_write_en = Output(Bool())
// val fpu = Flipped(new FPUCoreIO())
//val rocc = Flipped(new RoCCCoreIO(nTotalRoCCCSRs))
//val trace = Output(new TraceBundle)