Add wide DRAM support and enlarge queues

2024-04-28 01:28:33 -07:00
parent e08bf2c2c9
commit a915451d03
6 changed files with 69 additions and 49 deletions
--- a/radiance.mk
+++ b/radiance.mk
@@ -14,7 +14,6 @@ EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradp
 EXTRA_SIM_PREPROC_DEFINES += \
 	+define+SIMULATION \
 	+define+GPR_RESET \
-	+define+GPR_DUPLICATED \
 	+define+LSU_DUP_DISABLE \
 	+define+DBG_TRACE_CORE_PIPELINE_VCS \
 	+define+PERF_ENABLE \
--- a/src/main/scala/radiance/memory/VortexCache.scala
+++ b/src/main/scala/radiance/memory/VortexCache.scala
@@ -12,7 +12,7 @@ case object VortexL1Key extends Field[Option[VortexL1Config]](None /*default*/ )
 case class VortexL1Config(
                           cacheSize: Int, // total cache size in bytes
                           numBanks: Int,
-    wordSize: Int, // This is the read/write granularity of the L1 cache
+                           inputSize: Int, // This is the read/write granularity of the L1 cache
                           cacheLineSize: Int,
                           coreTagWidth: Int,
                           writeInfoReqQSize: Int,
@@ -21,7 +21,7 @@ case class VortexL1Config(
                           uncachedAddrSets: Seq[AddressSet]
 ) {
  def coreTagPlusSizeWidth: Int = {
-    log2Ceil(wordSize) + coreTagWidth
+    log2Ceil(inputSize) + coreTagWidth
  }
  // NOTE: This assertion depends on the fact that the Vortex cache is
  // configured to have 1 bank, and that it uses MSHR id as the tag of
@@ -37,7 +37,7 @@ object defaultVortexL1Config
    extends VortexL1Config(
      cacheSize = 16384,
      numBanks = 4,
-      wordSize = 16,
+      inputSize = 16,
      cacheLineSize = 16,
      coreTagWidth = 8,
      writeInfoReqQSize = 16,
@@ -80,15 +80,15 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
  // Slave node to upstream
  val managerParam = Seq(
    TLSlavePortParameters.v1(
-      beatBytes = config.wordSize,
+      beatBytes = config.inputSize,
      managers = Seq(
        TLSlaveParameters.v1(
          address = config.uncachedAddrSets,
          regionType = RegionType.IDEMPOTENT,
          executable = false,
-          supportsGet = TransferSizes(1, config.wordSize),
-          supportsPutPartial = TransferSizes(1, config.wordSize),
-          supportsPutFull = TransferSizes(1, config.wordSize),
+          supportsGet = TransferSizes(1, config.inputSize),
+          supportsPutPartial = TransferSizes(1, config.inputSize),
+          supportsPutFull = TransferSizes(1, config.inputSize),
          fifoId = Some(0)
        )
      )
@@ -107,10 +107,10 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
              config.memSideSourceIds
            ) + 5 /*FIXME: give more sourceId so that passthrough doesn't block; hacky*/ )
          ),
-          supportsProbe = TransferSizes(1, config.wordSize),
-          supportsGet = TransferSizes(1, config.wordSize),
-          supportsPutFull = TransferSizes(1, config.wordSize),
-          supportsPutPartial = TransferSizes(1, config.wordSize)
+          supportsProbe = TransferSizes(1, config.cacheLineSize),
+          supportsGet = TransferSizes(1, config.cacheLineSize),
+          supportsPutFull = TransferSizes(1, config.cacheLineSize),
+          supportsPutPartial = TransferSizes(1, config.cacheLineSize)
        )
      )
    )
@@ -141,8 +141,8 @@ class VortexBank(
    // suppose we have 4 banks
    // base for bank 1: ...000000|01|0000
    // mask for bank 1:    111111|00|1111
-    val base = 0x00000000L | (bankId * config.wordSize)
-    val mask = 0xffffffffL ^ ((config.numBanks - 1) * config.wordSize)
+    val base = 0x00000000L | (bankId * config.inputSize)
+    val mask = 0xffffffffL ^ ((config.numBanks - 1) * config.inputSize)

    val excludeSets = config.uncachedAddrSets
    var remainingSets: Seq[AddressSet] = Seq(AddressSet(base, mask))
@@ -155,15 +155,15 @@ class VortexBank(
  // Slave node to upstream
  val managerParam = Seq(
    TLSlavePortParameters.v1(
-      beatBytes = config.wordSize,
+      beatBytes = config.inputSize,
      managers = Seq(
        TLSlaveParameters.v1(
          address = generateAddressSets(),
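          regionType = RegionType.IDEMPOTENT, // per rocket-chip: gets return the most recently put content, but the region should not be cached (TODO confirm this is the intended region type for a cache bank)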
          executable = false,
-          supportsGet = TransferSizes(1, config.wordSize),
-          supportsPutPartial = TransferSizes(1, config.wordSize),
-          supportsPutFull = TransferSizes(1, config.wordSize),
+          supportsGet = TransferSizes(1, config.inputSize),
+          supportsPutPartial = TransferSizes(1, config.inputSize),
+          supportsPutFull = TransferSizes(1, config.inputSize),
          fifoId = Some(0)
        )
      )
@@ -177,10 +177,10 @@ class VortexBank(
        TLMasterParameters.v1(
          name = s"VortexBank${bankId}",
          sourceId = IdRange(0, config.memSideSourceIds),
-          supportsProbe = TransferSizes(1, config.wordSize),
-          supportsGet = TransferSizes(1, config.wordSize),
-          supportsPutFull = TransferSizes(1, config.wordSize),
-          supportsPutPartial = TransferSizes(1, config.wordSize)
+          supportsProbe = TransferSizes(1, config.inputSize),
+          supportsGet = TransferSizes(1, config.inputSize),
+          supportsPutFull = TransferSizes(1, config.inputSize),
+          supportsPutPartial = TransferSizes(1, config.inputSize)
        )
      )
    )
@@ -204,7 +204,7 @@ class VortexBankImp(
 ) extends LazyModuleImp(outer) {
  val vxCache = Module(
    new VX_cache_top(
-      WORD_SIZE = config.wordSize,
+      WORD_SIZE = config.inputSize,
      // distribute total size across numBanks
      CACHE_SIZE = config.cacheSize / config.numBanks,
      CACHE_LINE_SIZE = config.cacheLineSize,
@@ -236,7 +236,7 @@ class VortexBankImp(
  }

  class ReadReqInfo(config: VortexL1Config) extends Bundle {
-    val size = UInt(log2Ceil(config.wordSize).W)
+    val size = UInt(log2Ceil(config.inputSize + 1).W)
    val id = UInt(config.coreTagWidth.W)
  }

@@ -264,7 +264,7 @@ class VortexBankImp(
    // 4 is also hardcoded, it should be log2WordSize
    vxCache.io.core_req_addr := tlInFromCoal.a.bits.address(
      31,
-      log2Ceil(config.wordSize)
+      log2Ceil(config.inputSize)
    )
    vxCache.io.core_req_byteen := tlInFromCoal.a.bits.mask
    vxCache.io.core_req_data := tlInFromCoal.a.bits.data
@@ -362,17 +362,17 @@ class VortexBankImp(
      TLMessages.Get
    )

-    tlOutToL2.a.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(4.W))
+    tlOutToL2.a.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(log2Ceil(config.cacheLineSize).W))
    tlOutToL2.a.bits.mask := Mux(
      vxCache.io.mem_req_rw,
      vxCache.io.mem_req_byteen,
-      0xffff.U
+      ~(0.U(config.cacheLineSize.W))
    )
    tlOutToL2.a.bits.data := vxCache.io.mem_req_data
    tlOutToL2.a.bits.source := sourceGen.io.id.bits
    // ignore param, size, corrupt fields
    tlOutToL2.a.bits.param := 0.U
-    tlOutToL2.a.bits.size := 4.U // FIXME: hardcoded
+    tlOutToL2.a.bits.size := log2Ceil(config.cacheLineSize).U
    tlOutToL2.a.bits.corrupt := false.B
    // downstream L2 -> vxCache response
    tlOutToL2.d.ready := vxCache.io.mem_rsp_ready
--- a/src/main/scala/radiance/subsystem/Configs.scala
+++ b/src/main/scala/radiance/subsystem/Configs.scala
@@ -126,7 +126,7 @@ class WithFuzzerCores(
 class WithRadianceCluster(
  clusterId: Int,
  location: HierarchicalLocation = InSubsystem,
-  crossing: RocketCrossingParams = RocketCrossingParams() // TODO make this not rocket
+  crossing: RocketCrossingParams = RocketCrossingParams()
 ) extends Config((site, here, up) => {
  case ClustersLocated(`location`) => up(ClustersLocated(location)) :+ RadianceClusterAttachParams(
    RadianceClusterParams(clusterId = clusterId),
@@ -174,7 +174,17 @@ class WithPriorityCoalXbar extends Config((site, _, up) => {

 class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => {
  case VortexL1Key => {
-    Some(defaultVortexL1Config.copy(numBanks = nBanks))
+    Some(defaultVortexL1Config.copy(
+      numBanks = nBanks,
+      inputSize = up(SIMTCoreKey).get.nMemLanes * 4,
+      cacheLineSize = up(SIMTCoreKey).get.nMemLanes * 4,
+      memSideSourceIds = 64,
+      mshrSize = 64,
+      coreTagWidth = log2Ceil(up(SIMTCoreKey).get.nSrcIds.max(up(CoalescerKey) match {
+        case Some(key) => key.numNewSrcIds
+        case None => 0
+      })) + log2Ceil(up(SIMTCoreKey).get.nMemLanes) + 1
+    ))
  }
 })

@@ -197,8 +207,7 @@ class WithCoalescer(nNewSrcIds: Int = 8, enable : Boolean = true) extends Config
    // If instantiating L1 cache, the maximum coalescing size should match the
    // cache line size
    val maxCoalSizeInBytes = up(VortexL1Key, site) match {
-      case Some(param) =>
-        (param.wordSize) 
+      case Some(param) => param.inputSize
      case None => sbusWidthInBytes
    }
      
--- a/src/main/scala/radiance/tile/GemminiTile.scala
+++ b/src/main/scala/radiance/tile/GemminiTile.scala
@@ -140,6 +140,9 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)

  tieOffGemminiRocc

+  outer.traceSourceNode.bundle := DontCare
+  outer.traceSourceNode.bundle.insns foreach (_.valid := false.B)
+
  // hacky, but cluster will AND the cease signals from all tiles, and we want
  // the core tiles to determine cluster cease not Gemmini
  outer.reportCease(Some(true.B))
--- a/src/main/scala/radiance/tile/RadianceCluster.scala
+++ b/src/main/scala/radiance/tile/RadianceCluster.scala
@@ -8,6 +8,7 @@ import chisel3.util._
 import freechips.rocketchip.diplomacy._
 import freechips.rocketchip.prci.ClockSinkParameters
 import freechips.rocketchip.subsystem._
+import freechips.rocketchip.tile.TraceBundle
 import freechips.rocketchip.tilelink._
 import gemmini._
 import org.chipsalliance.cde.config.Parameters
@@ -91,7 +92,7 @@ class RadianceCluster (
      callback(p)
    }
  }
-  def connect_one[T <: BaseNode with TLNode](from: TLNode, to: () => T): T = {
+  def connect_one[T <: TLNode](from: TLNode, to: () => T): T = {
    val t = to()
    guard_monitors { implicit p => t := from }
    t
@@ -183,13 +184,18 @@ class RadianceCluster (

    val spad_read_nodes = Seq.fill(smem_banks) {
      val r_dist = DistributorNode(from = smem_width, to = wordSize)
-      guard_monitors { implicit p => r_dist := gemmini.spad_read_nodes }
+      guard_monitors { implicit p => r_dist := TLBuffer(BufferParams(1, false, true), BufferParams(0)) := gemmini.spad_read_nodes }
      Seq.fill(smem_subbanks) { connect_one(r_dist, TLIdentityNode.apply) }
    }
    val spad_write_nodes = Seq.fill(smem_banks) {
      val w_dist = DistributorNode(from = smem_width, to = wordSize)
-      guard_monitors { implicit p => w_dist := gemmini.spad_write_nodes }
+      guard_monitors { implicit p => w_dist := TLBuffer(BufferParams(1, false, true), BufferParams(0)) := gemmini.spad_write_nodes }
      Seq.fill(smem_subbanks) { connect_one(w_dist, TLIdentityNode.apply) }
+      /* Seq.fill(smem_subbanks) {
+        val buf = TLBuffer(BufferParams(1, false, true), BufferParams(0))
+        buf := w_dist
+        buf
+      } */
    }
    val ws_dist = DistributorNode(from = smem_width, to = wordSize)
    guard_monitors { implicit p => ws_dist := gemmini.spad.spad_writer.node } // this is the dma write node
--- a/src/main/scala/radiance/tile/RadianceTile.scala
+++ b/src/main/scala/radiance/tile/RadianceTile.scala
@@ -165,11 +165,15 @@ class RadianceTile private (
  // to a stall in the backend pipeline and resulting in a deadlock.
  val imemSourceWidth = 4 // 1 << imemSourceWidth == IBUF_SIZE

-  val dmemSourceWidth = p(SIMTCoreKey) match {
-    // TODO: respect coalescer newSrcIds
+  val smemSourceWidth = p(SIMTCoreKey) match {
    case Some(simtParam) => log2Ceil(simtParam.nSrcIds)
    case None => 4
  }
+
+  val dmemSourceWidth = p(CoalescerKey) match {
+    case Some(coalParam) => log2Ceil(coalParam.numOldSrcIds)
+    case None => smemSourceWidth
+  }
  // require(
  //   dmemSourceWidth >= 4,
  //   "Setting a small number of sourceIds may cause correctness bug inside " +
@@ -177,8 +181,6 @@ class RadianceTile private (
  //     "We recommend setting nSrcIds to at least 16."
  // )

-  val smemSourceWidth = 4 // FIXME: hardcoded
-
  // Replicates some of the logic of how Vortex determines the tag width of
  // memory requests so that Chisel and Verilog are in agreement on bitwidths.
  // See VX_gpu_pkg.sv
@@ -190,7 +192,8 @@ class RadianceTile private (
  }
  val imemTagWidth = UUID_WIDTH + NW_WIDTH

-  val LSUQ_SIZE = 2 * numWarps * (numCoreLanes / numLsuLanes)
+  val LSUQ_SIZE = 8 * numWarps * (numCoreLanes / numLsuLanes)
+  assert(LSUQ_SIZE == p(SIMTCoreKey).get.nSrcIds)
  val LSUQ_TAG_BITS = log2Ceil(LSUQ_SIZE) + 1 /*DCACHE_BATCH_SEL_BITS*/
  val dmemTagWidth = UUID_WIDTH + LSUQ_TAG_BITS
  // dmem and smem shares the same tag width, DCACHE_NOSM_TAG_WIDTH