wide dram support and enlarge queues

This commit is contained in:
Richard Yan
2024-04-28 01:28:33 -07:00
parent e08bf2c2c9
commit a915451d03
6 changed files with 69 additions and 49 deletions

View File

@@ -14,7 +14,6 @@ EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradp
EXTRA_SIM_PREPROC_DEFINES += \
+define+SIMULATION \
+define+GPR_RESET \
+define+GPR_DUPLICATED \
+define+LSU_DUP_DISABLE \
+define+DBG_TRACE_CORE_PIPELINE_VCS \
+define+PERF_ENABLE \

View File

@@ -12,7 +12,7 @@ case object VortexL1Key extends Field[Option[VortexL1Config]](None /*default*/ )
case class VortexL1Config(
cacheSize: Int, // total cache size in bytes
numBanks: Int,
wordSize: Int, // This is the read/write granularity of the L1 cache
inputSize: Int, // This is the read/write granularity of the L1 cache
cacheLineSize: Int,
coreTagWidth: Int,
writeInfoReqQSize: Int,
@@ -21,7 +21,7 @@ case class VortexL1Config(
uncachedAddrSets: Seq[AddressSet]
) {
def coreTagPlusSizeWidth: Int = {
log2Ceil(wordSize) + coreTagWidth
log2Ceil(inputSize) + coreTagWidth
}
// NOTE: This assertion depends on the fact that the Vortex cache is
// configured to have 1 bank, and that it uses MSHR id as the tag of
@@ -37,7 +37,7 @@ object defaultVortexL1Config
extends VortexL1Config(
cacheSize = 16384,
numBanks = 4,
wordSize = 16,
inputSize = 16,
cacheLineSize = 16,
coreTagWidth = 8,
writeInfoReqQSize = 16,
@@ -80,15 +80,15 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
// Slave node to upstream
val managerParam = Seq(
TLSlavePortParameters.v1(
beatBytes = config.wordSize,
beatBytes = config.inputSize,
managers = Seq(
TLSlaveParameters.v1(
address = config.uncachedAddrSets,
regionType = RegionType.IDEMPOTENT,
executable = false,
supportsGet = TransferSizes(1, config.wordSize),
supportsPutPartial = TransferSizes(1, config.wordSize),
supportsPutFull = TransferSizes(1, config.wordSize),
supportsGet = TransferSizes(1, config.inputSize),
supportsPutPartial = TransferSizes(1, config.inputSize),
supportsPutFull = TransferSizes(1, config.inputSize),
fifoId = Some(0)
)
)
@@ -107,10 +107,10 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
config.memSideSourceIds
) + 5 /*FIXME: give more sourceId so that passthrough doesn't block; hacky*/ )
),
supportsProbe = TransferSizes(1, config.wordSize),
supportsGet = TransferSizes(1, config.wordSize),
supportsPutFull = TransferSizes(1, config.wordSize),
supportsPutPartial = TransferSizes(1, config.wordSize)
supportsProbe = TransferSizes(1, config.cacheLineSize),
supportsGet = TransferSizes(1, config.cacheLineSize),
supportsPutFull = TransferSizes(1, config.cacheLineSize),
supportsPutPartial = TransferSizes(1, config.cacheLineSize)
)
)
)
@@ -141,8 +141,8 @@ class VortexBank(
// suppose we have 4 banks
// base for bank 1: ...000000|01|0000
// mask for bank 1: 111111|00|1111
val base = 0x00000000L | (bankId * config.wordSize)
val mask = 0xffffffffL ^ ((config.numBanks - 1) * config.wordSize)
val base = 0x00000000L | (bankId * config.inputSize)
val mask = 0xffffffffL ^ ((config.numBanks - 1) * config.inputSize)
val excludeSets = config.uncachedAddrSets
var remainingSets: Seq[AddressSet] = Seq(AddressSet(base, mask))
@@ -155,15 +155,15 @@ class VortexBank(
// Slave node to upstream
val managerParam = Seq(
TLSlavePortParameters.v1(
beatBytes = config.wordSize,
beatBytes = config.inputSize,
managers = Seq(
TLSlaveParameters.v1(
address = generateAddressSets(),
regionType = RegionType.IDEMPOTENT, // idk what this does
executable = false,
supportsGet = TransferSizes(1, config.wordSize),
supportsPutPartial = TransferSizes(1, config.wordSize),
supportsPutFull = TransferSizes(1, config.wordSize),
supportsGet = TransferSizes(1, config.inputSize),
supportsPutPartial = TransferSizes(1, config.inputSize),
supportsPutFull = TransferSizes(1, config.inputSize),
fifoId = Some(0)
)
)
@@ -177,10 +177,10 @@ class VortexBank(
TLMasterParameters.v1(
name = s"VortexBank${bankId}",
sourceId = IdRange(0, config.memSideSourceIds),
supportsProbe = TransferSizes(1, config.wordSize),
supportsGet = TransferSizes(1, config.wordSize),
supportsPutFull = TransferSizes(1, config.wordSize),
supportsPutPartial = TransferSizes(1, config.wordSize)
supportsProbe = TransferSizes(1, config.inputSize),
supportsGet = TransferSizes(1, config.inputSize),
supportsPutFull = TransferSizes(1, config.inputSize),
supportsPutPartial = TransferSizes(1, config.inputSize)
)
)
)
@@ -204,7 +204,7 @@ class VortexBankImp(
) extends LazyModuleImp(outer) {
val vxCache = Module(
new VX_cache_top(
WORD_SIZE = config.wordSize,
WORD_SIZE = config.inputSize,
// distribute total size across numBanks
CACHE_SIZE = config.cacheSize / config.numBanks,
CACHE_LINE_SIZE = config.cacheLineSize,
@@ -236,7 +236,7 @@ class VortexBankImp(
}
class ReadReqInfo(config: VortexL1Config) extends Bundle {
val size = UInt(log2Ceil(config.wordSize).W)
val size = UInt(log2Ceil(config.inputSize + 1).W)
val id = UInt(config.coreTagWidth.W)
}
@@ -264,7 +264,7 @@ class VortexBankImp(
// 4 is also hardcoded, it should be log2WordSize
vxCache.io.core_req_addr := tlInFromCoal.a.bits.address(
31,
log2Ceil(config.wordSize)
log2Ceil(config.inputSize)
)
vxCache.io.core_req_byteen := tlInFromCoal.a.bits.mask
vxCache.io.core_req_data := tlInFromCoal.a.bits.data
@@ -362,17 +362,17 @@ class VortexBankImp(
TLMessages.Get
)
tlOutToL2.a.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(4.W))
tlOutToL2.a.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(log2Ceil(config.cacheLineSize).W))
tlOutToL2.a.bits.mask := Mux(
vxCache.io.mem_req_rw,
vxCache.io.mem_req_byteen,
0xffff.U
~(0.U(config.cacheLineSize.W))
)
tlOutToL2.a.bits.data := vxCache.io.mem_req_data
tlOutToL2.a.bits.source := sourceGen.io.id.bits
// ignore param, size, corrupt fields
tlOutToL2.a.bits.param := 0.U
tlOutToL2.a.bits.size := 4.U // FIXME: hardcoded
tlOutToL2.a.bits.size := log2Ceil(config.cacheLineSize).U
tlOutToL2.a.bits.corrupt := false.B
// downstream L2 -> vxCache response
tlOutToL2.d.ready := vxCache.io.mem_rsp_ready

View File

@@ -126,7 +126,7 @@ class WithFuzzerCores(
class WithRadianceCluster(
clusterId: Int,
location: HierarchicalLocation = InSubsystem,
crossing: RocketCrossingParams = RocketCrossingParams() // TODO make this not rocket
crossing: RocketCrossingParams = RocketCrossingParams()
) extends Config((site, here, up) => {
case ClustersLocated(`location`) => up(ClustersLocated(location)) :+ RadianceClusterAttachParams(
RadianceClusterParams(clusterId = clusterId),
@@ -174,7 +174,17 @@ class WithPriorityCoalXbar extends Config((site, _, up) => {
class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => {
case VortexL1Key => {
Some(defaultVortexL1Config.copy(numBanks = nBanks))
Some(defaultVortexL1Config.copy(
numBanks = nBanks,
inputSize = up(SIMTCoreKey).get.nMemLanes * 4,
cacheLineSize = up(SIMTCoreKey).get.nMemLanes * 4,
memSideSourceIds = 64,
mshrSize = 64,
coreTagWidth = log2Ceil(up(SIMTCoreKey).get.nSrcIds.max(up(CoalescerKey) match {
case Some(key) => key.numNewSrcIds
case None => 0
})) + log2Ceil(up(SIMTCoreKey).get.nMemLanes) + 1
))
}
})
@@ -197,8 +207,7 @@ class WithCoalescer(nNewSrcIds: Int = 8, enable : Boolean = true) extends Config
// If instantiating L1 cache, the maximum coalescing size should match the
// cache line size
val maxCoalSizeInBytes = up(VortexL1Key, site) match {
case Some(param) =>
(param.wordSize)
case Some(param) => param.inputSize
case None => sbusWidthInBytes
}

View File

@@ -140,6 +140,9 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
tieOffGemminiRocc
outer.traceSourceNode.bundle := DontCare
outer.traceSourceNode.bundle.insns foreach (_.valid := false.B)
// hacky, but the cluster will AND the cease signals from all tiles, and we
// want the core tiles, not Gemmini, to determine cluster cease
outer.reportCease(Some(true.B))

View File

@@ -8,6 +8,7 @@ import chisel3.util._
import freechips.rocketchip.diplomacy._
import freechips.rocketchip.prci.ClockSinkParameters
import freechips.rocketchip.subsystem._
import freechips.rocketchip.tile.TraceBundle
import freechips.rocketchip.tilelink._
import gemmini._
import org.chipsalliance.cde.config.Parameters
@@ -91,7 +92,7 @@ class RadianceCluster (
callback(p)
}
}
def connect_one[T <: BaseNode with TLNode](from: TLNode, to: () => T): T = {
def connect_one[T <: TLNode](from: TLNode, to: () => T): T = {
val t = to()
guard_monitors { implicit p => t := from }
t
@@ -183,13 +184,18 @@ class RadianceCluster (
val spad_read_nodes = Seq.fill(smem_banks) {
val r_dist = DistributorNode(from = smem_width, to = wordSize)
guard_monitors { implicit p => r_dist := gemmini.spad_read_nodes }
guard_monitors { implicit p => r_dist := TLBuffer(BufferParams(1, false, true), BufferParams(0)) := gemmini.spad_read_nodes }
Seq.fill(smem_subbanks) { connect_one(r_dist, TLIdentityNode.apply) }
}
val spad_write_nodes = Seq.fill(smem_banks) {
val w_dist = DistributorNode(from = smem_width, to = wordSize)
guard_monitors { implicit p => w_dist := gemmini.spad_write_nodes }
guard_monitors { implicit p => w_dist := TLBuffer(BufferParams(1, false, true), BufferParams(0)) := gemmini.spad_write_nodes }
Seq.fill(smem_subbanks) { connect_one(w_dist, TLIdentityNode.apply) }
/* Seq.fill(smem_subbanks) {
val buf = TLBuffer(BufferParams(1, false, true), BufferParams(0))
buf := w_dist
buf
} */
}
val ws_dist = DistributorNode(from = smem_width, to = wordSize)
guard_monitors { implicit p => ws_dist := gemmini.spad.spad_writer.node } // this is the dma write node

View File

@@ -165,11 +165,15 @@ class RadianceTile private (
// to a stall in the backend pipeline and resulting in a deadlock.
val imemSourceWidth = 4 // 1 << imemSourceWidth == IBUF_SIZE
val dmemSourceWidth = p(SIMTCoreKey) match {
// TODO: respect coalescer newSrcIds
val smemSourceWidth = p(SIMTCoreKey) match {
case Some(simtParam) => log2Ceil(simtParam.nSrcIds)
case None => 4
}
val dmemSourceWidth = p(CoalescerKey) match {
case Some(coalParam) => log2Ceil(coalParam.numOldSrcIds)
case None => smemSourceWidth
}
// require(
// dmemSourceWidth >= 4,
// "Setting a small number of sourceIds may cause correctness bug inside " +
@@ -177,8 +181,6 @@ class RadianceTile private (
// "We recommend setting nSrcIds to at least 16."
// )
val smemSourceWidth = 4 // FIXME: hardcoded
// Replicates some of the logic of how Vortex determines the tag width of
// memory requests so that Chisel and Verilog are in agreement on bitwidths.
// See VX_gpu_pkg.sv
@@ -190,7 +192,8 @@ class RadianceTile private (
}
val imemTagWidth = UUID_WIDTH + NW_WIDTH
val LSUQ_SIZE = 2 * numWarps * (numCoreLanes / numLsuLanes)
val LSUQ_SIZE = 8 * numWarps * (numCoreLanes / numLsuLanes)
assert(LSUQ_SIZE == p(SIMTCoreKey).get.nSrcIds)
val LSUQ_TAG_BITS = log2Ceil(LSUQ_SIZE) + 1 /*DCACHE_BATCH_SEL_BITS*/
val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS
// dmem and smem shares the same tag width, DCACHE_NOSM_TAG_WIDTH