Add EmulatorTile

Also move the core-specific config keys from radiance.memory to radiance.core.
This commit is contained in:
Hansung Kim
2024-11-26 15:23:24 -08:00
parent bf0527e2ad
commit 7cc40eedde
9 changed files with 545 additions and 35 deletions

View File

@@ -0,0 +1,31 @@
#ifndef NO_VPI
#include <vpi_user.h>
#include <svdpi.h>
#endif
#include <stdint.h>
// Back-end entry points that the DPI wrappers in this file forward to.
// Only the declarations are visible here; the definitions live elsewhere
// (presumably in a Rust library, going by the `_rs` suffix -- TODO confirm).
extern "C" void emulator_init_rs(int num_lanes);
// Parameter list mirrors emulator_generate() one-to-one; see that function
// for the direction of each argument.
extern "C" void emulator_generate_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid,
long long *vec_a_address,
uint8_t *vec_a_is_store, int *vec_a_size,
long long *vec_a_data, uint8_t *vec_d_ready,
uint8_t *vec_d_valid,
uint8_t *vec_d_is_store, int *vec_d_size,
uint8_t inflight, uint8_t *finished);
// DPI-C entry point: forwards the lane count unchanged to emulator_init_rs().
// NOTE(review): `num_lanes` is a 32-bit `int` here, so the SystemVerilog
// `import "DPI-C"` declaration of this function has to declare it as `int`
// (not `longint`) for both sides to agree on the argument width.
extern "C" void emulator_init(int num_lanes) {
emulator_init_rs(num_lanes);
}
// DPI-C entry point: forwards every argument unchanged, in the same order, to
// emulator_generate_rs().
//
// Directions as declared by the SystemVerilog import of this function:
//   read by this call    : vec_a_ready, vec_d_valid, vec_d_is_store,
//                          vec_d_size, inflight
//   written by this call : vec_a_valid, vec_a_address, vec_a_is_store,
//                          vec_a_size, vec_a_data, vec_d_ready, finished
// The vec_* pointers refer to per-lane arrays; the SystemVerilog side always
// passes arrays of MAX_NUM_LANES elements.
extern "C" void emulator_generate(uint8_t *vec_a_ready, uint8_t *vec_a_valid,
long long *vec_a_address,
uint8_t *vec_a_is_store, int *vec_a_size,
long long *vec_a_data, uint8_t *vec_d_ready,
uint8_t *vec_d_valid, uint8_t *vec_d_is_store,
int *vec_d_size, uint8_t inflight,
uint8_t *finished) {
emulator_generate_rs(vec_a_ready, vec_a_valid, vec_a_address, vec_a_is_store,
vec_a_size, vec_a_data, vec_d_ready, vec_d_valid,
vec_d_is_store, vec_d_size, inflight, finished);
}

View File

@@ -0,0 +1,132 @@
`include "SimDefaults.vh"
// Initializes the emulator back end for `num_lanes` lanes.
// `num_lanes` is declared as a 32-bit `int` so that it matches the C
// definition `void emulator_init(int num_lanes)`. Declaring it `longint`
// (64-bit) makes the two sides of the DPI call disagree on the argument width.
import "DPI-C" function void emulator_init(
  input int num_lanes
);
// Make sure to sync the parameters for:
// (1) import "DPI-C" declaration
// (2) C function declaration
// (3) DPI function calls inside initial/always blocks
//
// Per-cycle emulator step. All vec_* arguments are unpacked arrays with one
// element per lane, always MAX_NUM_LANES long regardless of how many lanes the
// instantiating module actually uses.
// `input` arguments are passed from Verilog to C; `output` arguments are
// written by the C side.
import "DPI-C" function void emulator_generate
(
// request channel: ready comes from the design, the rest is produced by C
input bit vec_a_ready[`MAX_NUM_LANES],
output bit vec_a_valid[`MAX_NUM_LANES],
output longint vec_a_address[`MAX_NUM_LANES],
output bit vec_a_is_store[`MAX_NUM_LANES],
output int vec_a_size[`MAX_NUM_LANES],
output longint vec_a_data[`MAX_NUM_LANES],
// response channel: ready is produced by C, the rest comes from the design
output bit vec_d_ready[`MAX_NUM_LANES],
input bit vec_d_valid[`MAX_NUM_LANES],
input bit vec_d_is_store[`MAX_NUM_LANES],
input int vec_d_size[`MAX_NUM_LANES],
// scalar status flags
input bit inflight,
output bit finished
);
// SimEmulator: bridges flattened per-lane request/response ports to the DPI-C
// function emulator_generate().
// NUM_LANES is the number of lanes wired to ports; the arrays handed to the
// DPI call are always MAX_NUM_LANES entries long.
module SimEmulator #(parameter NUM_LANES = 4) (
input clock,
input reset,
// Request-side ports. Multi-bit fields are flattened across lanes: lane g
// occupies bits [W*g +: W] of the vector, W being the per-lane field width.
// NOTE(review): a_address is sized with SIMMEM_DATA_WIDTH, the same macro as
// a_data; confirm this matches the address width expected by the instantiator.
input [NUM_LANES-1:0] a_ready,
output [NUM_LANES-1:0] a_valid,
output [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] a_address,
output [NUM_LANES-1:0] a_is_store,
output [`SIMMEM_LOGSIZE_WIDTH*NUM_LANES-1:0] a_size,
output [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] a_data,
// Response-side ports, flattened the same way.
output [NUM_LANES-1:0] d_ready,
input [NUM_LANES-1:0] d_valid,
input [NUM_LANES-1:0] d_is_store,
input [`SIMMEM_LOGSIZE_WIDTH*NUM_LANES-1:0] d_size,
// TODO: d_mask
// TODO: d_data
input inflight,
output finished
);
// "in": C->verilog, "out": verilog->C
// need to be in ascending order to match with C indexing
// C array sizes are static, so need to use MAX_NUM_LANES
bit __out_a_ready [0:`MAX_NUM_LANES-1];
bit __in_a_valid [0:`MAX_NUM_LANES-1];
longint __in_a_address [0:`MAX_NUM_LANES-1];
bit __in_a_is_store [0:`MAX_NUM_LANES-1];
int __in_a_size [0:`MAX_NUM_LANES-1];
longint __in_a_data [0:`MAX_NUM_LANES-1];
bit __in_d_ready [0:`MAX_NUM_LANES-1];
bit __out_d_valid [0:`MAX_NUM_LANES-1];
bit __out_d_is_store [0:`MAX_NUM_LANES-1];
int __out_d_size [0:`MAX_NUM_LANES-1];
bit __out_inflight;
bit __in_finished;
// Per-lane wiring between the flattened ports and the DPI arrays. Only the
// first NUM_LANES array entries are connected here.
genvar g;
generate
for (g = 0; g < NUM_LANES; g = g + 1) begin
assign __out_a_ready[g] = a_ready[g];
assign a_valid[g] = __in_a_valid[g];
// The DPI element is wider than the port slice; only the low bits are used.
assign a_address[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH]
= __in_a_address[g][`SIMMEM_DATA_WIDTH-1:0];
assign a_is_store[g] = __in_a_is_store[g];
assign a_size[`SIMMEM_LOGSIZE_WIDTH*g +: `SIMMEM_LOGSIZE_WIDTH]
= __in_a_size[g][`SIMMEM_LOGSIZE_WIDTH-1:0];
assign a_data[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH]
= __in_a_data[g][`SIMMEM_DATA_WIDTH-1:0];
assign d_ready[g] = __in_d_ready[g];
assign __out_d_valid[g] = d_valid[g];
assign __out_d_is_store[g] = d_is_store[g];
assign __out_d_size[g] = d_size[`SIMMEM_LOGSIZE_WIDTH*g +: `SIMMEM_LOGSIZE_WIDTH];
end
assign __out_inflight = inflight;
endgenerate
assign finished = __in_finished;
// One-time back-end setup at the start of simulation.
initial begin
emulator_init(NUM_LANES);
end
// negedge is important here; the DPI logic is essentially functioning as
// a combinational logic, so we want to reflect the signal change from DPI
// at the *current* cycle, not the next.
always @(negedge clock) begin
if (reset) begin
// While in reset, drive idle values on everything the C side would write,
// for the first NUM_LANES lanes.
for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
__in_a_valid[tid] = 1'b0;
__in_a_address[tid] = `SIMMEM_DATA_WIDTH'b0;
__in_a_is_store[tid] = 1'b0;
__in_a_size[tid] = 32'b0;
__in_a_data[tid] = `SIMMEM_DATA_WIDTH'b0;
__in_d_ready[tid] = 1'b0;
end
__in_finished = 1'b0;
end else begin
// Argument order must match the import declaration and the C prototype.
emulator_generate(
__out_a_ready,
__in_a_valid,
__in_a_address,
__in_a_is_store,
__in_a_size,
__in_a_data,
__in_d_ready,
__out_d_valid,
__out_d_is_store,
__out_d_size,
__out_inflight,
__in_finished
);
// Debug trace: one line per lane on every call.
for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
$display("verilog: %04d a_valid[%d]=%d, a_address[%d]=0x%x, d_ready[%d]=%d",
$time, tid, __in_a_valid[tid], tid, __in_a_address[tid], tid, __in_d_ready[tid]);
end
// NOTE(review): this tests the `finished` port, which is driven by a
// continuous assign from __in_finished, rather than __in_finished itself
// (just written by the call above). Confirm whether $finish is meant to
// trigger on this edge or on the following one.
if (finished) begin
$finish;
end
end
end
endmodule

View File

@@ -0,0 +1,243 @@
package radiance.core
import chisel3._
import chisel3.util._
import org.chipsalliance.cde.config.{Field, Parameters}
import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
import freechips.rocketchip.tilelink._
import freechips.rocketchip.diplomacy.{IdRange, AddressSet, BufferParams}
import radiance.memory.{SourceGenerator, TraceLine, TLPrintf}
/** SIMT core configuration.
  *
  * @param nWarps     number of warps in the core
  * @param nCoreLanes number of SIMT threads in the core
  * @param nMemLanes  number of memory lanes in the memory interface to the
  *                   cache; relates to the LSU lanes
  * @param nSrcIds    number of source IDs allocated to each of the `nMemLanes`
  */
case class SIMTCoreParams(
    nWarps: Int = 4,
    nCoreLanes: Int = 4,
    nMemLanes: Int = 4,
    nSrcIds: Int = 8
)
/** Configuration of the memory-trace-driven core.
  *
  * @param tracefilename  name of the trace file; defaults to the placeholder
  *                       string "undefined"
  * @param traceHasSource presumably whether the trace carries source IDs --
  *                       TODO confirm against the trace reader
  */
case class MemtraceCoreParams(
    tracefilename: String = "undefined",
    traceHasSource: Boolean = false
)
/** Config key holding the optional [[SIMTCoreParams]]; unset (`None`) by default. */
case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None)

/** Config key holding the optional [[MemtraceCoreParams]]; unset (`None`) by default. */
case object MemtraceCoreKey extends Field[Option[MemtraceCoreParams]](None)
// #############################################################################
// FIXME: copy-paste from MemFuzzer
// #############################################################################
/** Diplomatic shell of the DPI-driven emulator.
  *
  * Creates one TileLink client node per lane, each owning source IDs
  * `[0, numSrcIds)`, and connects all of them into a single identity node that
  * parents attach to.
  *
  * @param numLanes        number of per-lane client nodes to create
  * @param numSrcIds       size of the source-ID range given to every lane
  * @param wordSizeInBytes word size, passed through to [[EmulatorImp]]
  */
class Emulator(
    numLanes: Int,
    numSrcIds: Int,
    wordSizeInBytes: Int
)(implicit p: Parameters)
    extends LazyModule {
  val laneNodes = Seq.tabulate(numLanes) { lane =>
    val master = TLMasterParameters.v1(
      name = s"Emulator$lane",
      sourceId = IdRange(0, numSrcIds)
      // visibility = Seq(AddressSet(0x0000, 0xffffff))
    )
    TLClientNode(Seq(TLMasterPortParameters.v1(Seq(master))))
  }

  // Single attachment point that fans in every lane node.
  val node = TLIdentityNode()
  for (laneNode <- laneNodes) {
    node := laneNode
  }

  lazy val module = new EmulatorImp(this, numLanes, numSrcIds, wordSizeInBytes)
}
/** Module implementation of [[Emulator]].
  *
  * Instantiates the `SimEmulator` black box, unpacks its flattened per-lane
  * request ports into `TraceLine` requests, and issues each request as a
  * TileLink Get or Put on the lane's client node. D-channel handshakes are fed
  * back to the black box as valid / is_store / size.
  *
  * @param outer           the enclosing [[Emulator]] (provides the lane nodes)
  * @param numLanes        number of lanes
  * @param numSrcIds       number of source IDs available to each lane
  * @param wordSizeInBytes minimum access granularity; sub-word requests are
  *                        widened to this size. Must be a power of two.
  */
class EmulatorImp(
    outer: Emulator,
    numLanes: Int,
    numSrcIds: Int,
    wordSizeInBytes: Int
) extends LazyModuleImp(outer) {
  // Word alignment below uses a log2-derived bit mask, which is only correct
  // for power-of-two word sizes; reject anything else at elaboration time.
  require(
    isPow2(wordSizeInBytes),
    s"wordSizeInBytes must be a power of two, got $wordSizeInBytes"
  )
  private val lgWordSize = log2Ceil(wordSizeInBytes)

  val io = IO(new Bundle {
    val finished = Output(Bool())
  })

  val sim = Module(new SimEmulator(numLanes))
  sim.io.clock := clock
  sim.io.reset := reset.asBool
  io.finished := sim.io.finished

  // connect Verilog <-> Chisel IO
  // Verilog IO flattened across all lanes
  val laneReqs = Wire(Vec(numLanes, Decoupled(new TraceLine)))
  val addrW = laneReqs(0).bits.address.getWidth
  val sizeW = laneReqs(0).bits.size.getWidth
  val dataW = laneReqs(0).bits.data.getWidth
  laneReqs.zipWithIndex.foreach { case (req, i) =>
    req.valid := sim.io.a.valid(i)
    req.bits.source := 0.U // DPI doesn't generate a source id
    req.bits.address := sim.io.a.address(addrW * (i + 1) - 1, addrW * i)
    req.bits.is_store := sim.io.a.is_store(i)
    req.bits.size := sim.io.a.size(sizeW * (i + 1) - 1, sizeW * i)
    req.bits.data := sim.io.a.data(dataW * (i + 1) - 1, dataW * i)
  }
  // Single driver for a.ready. `req.ready` already combines the TileLink
  // A-channel ready with source-ID availability (see below), so a separate
  // connection straight from `tlOut.a.ready` would only be overridden by this
  // one under last-connect semantics.
  sim.io.a.ready := VecInit(laneReqs.map(_.ready)).asUInt

  val laneResps = Wire(Vec(numLanes, Flipped(Decoupled(new TraceLine))))
  laneResps.zipWithIndex.foreach { case (resp, i) =>
    resp.ready := sim.io.d.ready(i)
    // TODO: not handled in DPI
    resp.bits.source := DontCare
    resp.bits.address := DontCare
    resp.bits.data := DontCare
  }
  sim.io.d.valid := VecInit(laneResps.map(_.valid)).asUInt
  sim.io.d.is_store := VecInit(laneResps.map(_.bits.is_store)).asUInt
  sim.io.d.size := VecInit(laneResps.map(_.bits.size)).asUInt

  val sourceGens = Seq.fill(numLanes)(
    Module(
      new SourceGenerator(
        log2Ceil(numSrcIds),
        ignoreInUse = false
      )
    )
  )
  val anyInflight = sourceGens.map(_.io.inflight).reduce(_ || _)
  sim.io.inflight := anyInflight

  // Take requests off of the queue and generate TL requests
  (outer.laneNodes zip (laneReqs zip laneResps)).zipWithIndex.foreach {
    case ((node, (req, resp)), lane) =>
      val (tlOut, edge) = node.out(0)

      // Requests --------------------------------------------------------------
      //
      // Core only makes accesses of granularity larger than a word, so we want
      // the trace driver to act so as well.
      // That means if req.size is smaller than word size, we need to pad data
      // with zeros to generate a word-size request, and set mask accordingly.
      val offsetInWord = req.bits.address % wordSizeInBytes.U
      val subword = req.bits.size < lgWordSize.U

      // `mask` is currently unused
      // val mask = Wire(UInt(wordSizeInBytes.W))
      val wordData = Wire(UInt((wordSizeInBytes * 8 * 2).W))
      // Only referenced by the commented-out mask computation below.
      val sizeInBytes = Wire(UInt((sizeW + 1).W))
      sizeInBytes := (1.U) << req.bits.size
      // mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U)
      wordData := Mux(subword, req.bits.data << (offsetInWord * 8.U), req.bits.data)
      val wordAlignedAddress =
        req.bits.address & ~((1 << lgWordSize) - 1).U(addrW.W)
      // Sub-word accesses are widened to exactly one word; derive that size
      // from wordSizeInBytes instead of assuming a 4-byte word.
      val wordAlignedSize = Mux(subword, lgWordSize.U, req.bits.size)

      val sourceGen = sourceGens(lane)
      sourceGen.io.gen := tlOut.a.fire
      sourceGen.io.reclaim.valid := tlOut.d.fire
      sourceGen.io.reclaim.bits := tlOut.d.bits.source
      sourceGen.io.meta := DontCare

      val (plegal, pbits) = edge.Put(
        fromSource = sourceGen.io.id.bits,
        toAddress = wordAlignedAddress,
        lgSize = wordAlignedSize, // trace line already holds log2(size)
        // data should be aligned to beatBytes
        data =
          (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
      )
      val (glegal, gbits) = edge.Get(
        fromSource = sourceGen.io.id.bits,
        toAddress = wordAlignedAddress,
        lgSize = wordAlignedSize
      )
      val legal = Mux(req.bits.is_store, plegal, glegal)
      val bits = Mux(req.bits.is_store, pbits, gbits)

      // A request is only issued while a free source ID is available.
      tlOut.a.valid := req.valid && sourceGen.io.id.valid
      req.ready := tlOut.a.ready && sourceGen.io.id.valid
      when(tlOut.a.fire) {
        assert(legal, "illegal TL req gen")
      }
      tlOut.a.bits := bits

      // Responses -------------------------------------------------------------
      //
      tlOut.d.ready := resp.ready
      resp.valid := tlOut.d.valid
      // A response without data is reported as a store response.
      resp.bits.is_store := !edge.hasData(tlOut.d.bits)
      resp.bits.size := tlOut.d.bits.size

      // Channels B/C/E are not used by this client.
      tlOut.b.ready := true.B
      tlOut.c.valid := false.B
      tlOut.e.valid := false.B

      // debug
      dontTouch(req)
      when(tlOut.a.valid) {
        printf(s"Lane ${lane}: ");
        TLPrintf(
          "Emulator",
          tlOut.a.bits.source,
          tlOut.a.bits.address,
          tlOut.a.bits.size,
          tlOut.a.bits.mask,
          req.bits.is_store,
          tlOut.a.bits.data,
          req.bits.data
        )
      }
      dontTouch(tlOut.a)
      dontTouch(tlOut.d)
  }

  // when(traceFinished && allReqReclaimed && noValidReqs) {
  // assert(
  // false.B,
  // "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)"
  // )
  // }
}
// Chisel black-box wrapper for the Verilog module of the same name.
// `numLanes` is passed to the Verilog side as the NUM_LANES parameter, and the
// Verilog/C sources are attached as black-box resources at the end of the class.
class SimEmulator(numLanes: Int)
extends BlackBox(Map("NUM_LANES" -> numLanes))
with HasBlackBoxResource {
// Per-lane field widths are taken from TraceLine.
// NOTE(review): the Verilog module sizes its ports from the SIMMEM_* macros;
// confirm those agree with these widths.
val traceLineT = new TraceLine
val addrW = traceLineT.address.getWidth
val sizeW = traceLineT.size.getWidth
val dataW = traceLineT.data.getWidth
val io = IO(new Bundle {
val clock = Input(Clock())
val reset = Input(Bool())
val inflight = Input(Bool())
val finished = Output(Bool())
// Request side: one bit (or one field) per lane, packed into a single UInt.
val a =
new Bundle {
val ready = Input(UInt(numLanes.W))
val valid = Output(UInt(numLanes.W))
// Chisel can't interface with Verilog 2D port, so flatten all lanes into
// single wide 1D array.
val address = Output(UInt((addrW * numLanes).W))
val is_store = Output(UInt(numLanes.W))
val size = Output(UInt((sizeW * numLanes).W))
val data = Output(UInt((dataW * numLanes).W))
}
// Response side, packed the same way.
val d =
new Bundle {
val ready = Output(UInt(numLanes.W))
val valid = Input(UInt(numLanes.W))
val is_store = Input(UInt(numLanes.W))
val size = Input(UInt((sizeW * numLanes).W))
}
})
addResource("/vsrc/SimDefaults.vh")
addResource("/vsrc/SimEmulator.v")
addResource("/csrc/SimEmulator.cc")
}

View File

@@ -4,6 +4,7 @@ import freechips.rocketchip.diplomacy.LazyModule
import freechips.rocketchip.subsystem._
import org.chipsalliance.cde.config.Parameters
import freechips.rocketchip.tilelink._
import radiance.core.{SIMTCoreKey, MemtraceCoreKey}
// TODO: possibly move to somewhere closer to CoalescingUnit
// TODO: separate coalescer config from CanHaveMemtraceCore

View File

@@ -10,25 +10,10 @@ import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
import freechips.rocketchip.util.{Code, MultiPortQueue, OnePortLanePositionedQueue}
import freechips.rocketchip.unittest._
import freechips.rocketchip.tilelink._
import radiance.core.{SIMTCoreParams, SIMTCoreKey}
// TODO: find better place for these
/** SIMT core configuration.
  *
  * @param nWarps     number of warps in the core
  * @param nCoreLanes number of SIMT threads in the core
  * @param nMemLanes  number of memory lanes in the memory interface to the
  *                   cache; relates to the LSU lanes
  * @param nSrcIds    number of source IDs allocated to each of the `nMemLanes`
  */
case class SIMTCoreParams(
    nWarps: Int = 4,
    nCoreLanes: Int = 4,
    nMemLanes: Int = 4,
    nSrcIds: Int = 8
)
/** Configuration of the memory-trace-driven core.
  *
  * @param tracefilename  name of the trace file; defaults to the placeholder
  *                       string "undefined"
  * @param traceHasSource presumably whether the trace carries source IDs --
  *                       TODO confirm against the trace reader
  */
case class MemtraceCoreParams(
    tracefilename: String = "undefined",
    traceHasSource: Boolean = false
)
/** Parameter set selected through [[CoalXbarKey]]; it currently has no fields. */
case class CoalXbarParam()

/** Config key holding the optional [[SIMTCoreParams]]; unset (`None`) by default. */
case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None)

/** Config key holding the optional [[MemtraceCoreParams]]; unset (`None`) by default. */
case object MemtraceCoreKey extends Field[Option[MemtraceCoreParams]](None)

/** Config key holding the optional `CoalescerConfig`; unset (`None`) by default. */
case object CoalescerKey extends Field[Option[CoalescerConfig]](None)

/** Config key holding the optional [[CoalXbarParam]]; unset (`None`) by default. */
case object CoalXbarKey extends Field[Option[CoalXbarParam]](None)

View File

@@ -12,6 +12,7 @@ import freechips.rocketchip.subsystem._
import gemmini._
import gemmini.Arithmetic.FloatArithmetic._
import radiance.tile._
import radiance.core._
import radiance.memory._
import radiance.subsystem.RadianceGemminiDataType.{BF16, FP16, FP32, Int8}
@@ -106,6 +107,44 @@ class WithRadianceCores(
), tensorCoreFP16, tensorCoreDecoupled, useVxCache)
}
/** Config fragment that attaches `n` emulator tiles in front of the tiles
  * already located in the subsystem.
  *
  * Tile IDs are assigned consecutively starting at the current `NumTiles`.
  *
  * @param n          number of emulator tiles to add
  * @param useVxCache forwarded to every `EmulatorTileParams`
  */
class WithEmulatorCores(
    n: Int,
    useVxCache: Boolean
) extends Config((site, _, up) => {
  case TilesLocated(InSubsystem) => {
    val prev = up(TilesLocated(InSubsystem))
    val idOffset = up(NumTiles)
    val emulator = EmulatorTileParams(
      core = VortexCoreParams(),
      useVxCache = useVxCache)
    List.tabulate(n)(i => EmulatorTileAttachParams(
      emulator.copy(tileId = i + idOffset),
      RocketCrossingParams()
    )) ++ prev
  }
  // `n` tiles with IDs idOffset .. idOffset + n - 1 are added above, so both
  // counters must advance by `n`. Advancing by 1 would hand the same tile IDs
  // to the next fragment whenever n > 1.
  case NumTiles => up(NumTiles) + n
  case NumRadianceCores => up(NumRadianceCores) + n
})
/** Config fragment that attaches `n` fuzzer tiles in front of the tiles
  * already located in the subsystem.
  *
  * Tile IDs are assigned consecutively starting at the current `NumTiles`.
  *
  * @param n          number of fuzzer tiles to add
  * @param useVxCache forwarded to every `FuzzerTileParams`
  */
class WithFuzzerCores(
    n: Int,
    useVxCache: Boolean
) extends Config((site, _, up) => {
  case TilesLocated(InSubsystem) => {
    val prev = up(TilesLocated(InSubsystem))
    val idOffset = up(NumTiles)
    val fuzzer = FuzzerTileParams(
      core = VortexCoreParams(),
      useVxCache = useVxCache)
    List.tabulate(n)(i => FuzzerTileAttachParams(
      fuzzer.copy(tileId = i + idOffset),
      RocketCrossingParams()
    )) ++ prev
  }
  // `n` tiles with IDs idOffset .. idOffset + n - 1 are added above, so both
  // counters must advance by `n`. Advancing by 1 would hand the same tile IDs
  // to the next fragment whenever n > 1.
  case NumTiles => up(NumTiles) + n
  case NumRadianceCores => up(NumRadianceCores) + n
})
object RadianceGemminiDataType extends Enumeration {
type Type = Value
val FP32, FP16, BF16, Int8 = Value
@@ -244,25 +283,6 @@ class WithRadianceFrameBuffer(baseAddress: BigInt,
}
})
/** Config fragment that attaches `n` fuzzer tiles in front of the tiles
  * already located in the subsystem.
  *
  * Tile IDs are assigned consecutively starting at the current `NumTiles`.
  *
  * @param n          number of fuzzer tiles to add
  * @param useVxCache forwarded to every `FuzzerTileParams`
  */
class WithFuzzerCores(
    n: Int,
    useVxCache: Boolean
) extends Config((site, _, up) => {
  case TilesLocated(InSubsystem) => {
    val prev = up(TilesLocated(InSubsystem))
    val idOffset = up(NumTiles)
    val fuzzer = FuzzerTileParams(
      core = VortexCoreParams(),
      useVxCache = useVxCache)
    List.tabulate(n)(i => FuzzerTileAttachParams(
      fuzzer.copy(tileId = i + idOffset),
      RocketCrossingParams()
    )) ++ prev
  }
  // `n` tiles with IDs idOffset .. idOffset + n - 1 are added above, so both
  // counters must advance by `n`. Advancing by 1 would hand the same tile IDs
  // to the next fragment whenever n > 1.
  case NumTiles => up(NumTiles) + n
  case NumRadianceCores => up(NumRadianceCores) + n
})
class WithRadianceCluster(
clusterId: Int,
location: HierarchicalLocation = InSubsystem,

View File

@@ -0,0 +1,96 @@
// See LICENSE.SiFive for license details.
// See LICENSE.Berkeley for license details.
package radiance.tile
import chisel3._
import org.chipsalliance.cde.config.Parameters
import org.chipsalliance.diplomacy.lazymodule.LazyModule
import freechips.rocketchip.resources.SimpleDevice
import freechips.rocketchip.prci.ClockCrossingType
import freechips.rocketchip.rocket._
import freechips.rocketchip.tile._
import freechips.rocketchip.tilelink._
import freechips.rocketchip.subsystem.{HierarchicalElementCrossingParamsLike, CanAttachTile}
import freechips.rocketchip.prci.{ClockSinkParameters}
import radiance.core._
import radiance.memory.{CoalescingUnit, CoalescerKey}
// TODO: De-duplicate between this and FuzzerTile
/** Parameters of an [[EmulatorTile]].
  *
  * The tile has no I-cache, D-cache, BTB or blocker control address; those
  * members are all `None`.
  *
  * @param core       core parameters (slated for removal, see TODO)
  * @param useVxCache cache-selection flag carried by the tile parameters
  * @param tileId     ID of this tile; also used as the suffix of `uniqueName`
  */
case class EmulatorTileParams(
    core: VortexCoreParams = VortexCoreParams(), // TODO: remove this
    useVxCache: Boolean = false,
    tileId: Int = 0
) extends InstantiableTileParams[EmulatorTile] {
  val baseName = "radiance_emulator_tile"
  val uniqueName = s"${baseName}_$tileId"

  val clockSinkParams = ClockSinkParameters()
  val blockerCtrlAddr = None
  val icache = None
  val dcache = None
  val btb = None

  /** Builds the tile described by these parameters. */
  def instantiate(
      crossing: HierarchicalElementCrossingParamsLike,
      lookup: LookupByHartIdImpl
  )(implicit p: Parameters): EmulatorTile =
    new EmulatorTile(this, crossing, lookup)
}
/** Pairs an [[EmulatorTileParams]] with its crossing parameters so the tile
  * can be attached through the `CanAttachTile` mechanism.
  */
case class EmulatorTileAttachParams(
    tileParams: EmulatorTileParams,
    crossingParams: HierarchicalElementCrossingParamsLike
) extends CanAttachTile {
  type TileType = EmulatorTile
}
/** Tile that hosts the DPI-driven [[Emulator]] in place of a real core.
  *
  * Lane count and source-ID count are read from `SIMTCoreKey`, which must be
  * defined. When `CoalescerKey` is defined, a `CoalescingUnit` is placed
  * between the emulator and the tile's master node; otherwise the emulator
  * connects directly.
  */
class EmulatorTile private (
    val EmulatorParams: EmulatorTileParams,
    crossing: ClockCrossingType,
    lookup: LookupByHartIdImpl,
    q: Parameters
) extends BaseTile(EmulatorParams, crossing, lookup, q)
    with SinksExternalInterrupts
    with SourcesExternalNotifications {
  def this(
      params: EmulatorTileParams,
      crossing: HierarchicalElementCrossingParamsLike,
      lookup: LookupByHartIdImpl
  )(implicit p: Parameters) =
    this(params, crossing.crossingType, lookup, p)

  val cpuDevice: SimpleDevice = new SimpleDevice("emulator", Nil)
  val intOutwardNode = None
  val slaveNode: TLInwardNode = TLIdentityNode()
  val masterNode = visibilityNode
  // val statusNode = BundleBridgeSource(() => new GroundTestStatus)

  // Fail elaboration with the same exception type and message that
  // `require` would produce, without needing an unreachable dummy value.
  val (numLanes, numSrcIds) = p(SIMTCoreKey)
    .map(param => (param.nMemLanes, param.nSrcIds))
    .getOrElse(
      throw new IllegalArgumentException(
        "requirement failed: emulator requires SIMTCoreKey to be defined"
      )
    )

  // FIXME: parameterize
  val wordSizeInBytes = 4
  val emulator = LazyModule(new Emulator(numLanes, numSrcIds, wordSizeInBytes))

  // Conditionally instantiate memory coalescer
  val coalescerNode = p(CoalescerKey) match {
    case Some(coalParam) => {
      val coal = LazyModule(new CoalescingUnit(coalParam))
      // Width follows the emulator's word size instead of repeating the
      // literal, so the two cannot drift apart.
      coal.cpuNode :=* TLWidthWidget(wordSizeInBytes) :=* emulator.node
      coal.aggregateNode
    }
    case None => emulator.node
  }
  masterNode :=* coalescerNode

  override lazy val module = new EmulatorTileModuleImp(this)
}
// Module implementation of EmulatorTile. Its only job is to pass the
// emulator's `finished` output to the tile's reportCease hook.
class EmulatorTileModuleImp(outer: EmulatorTile) extends BaseTileModuleImp(outer) {
outer.reportCease(Some(outer.emulator.module.io.finished))
}

View File

@@ -13,6 +13,7 @@ import freechips.rocketchip.tile._
import freechips.rocketchip.tilelink._
import freechips.rocketchip.subsystem.{HierarchicalElementCrossingParamsLike, CanAttachTile}
import freechips.rocketchip.prci.{ClockSinkParameters}
import radiance.core.{SIMTCoreKey}
import radiance.memory._
case class FuzzerTileParams(

View File

@@ -19,6 +19,7 @@ import freechips.rocketchip.tilelink._
import freechips.rocketchip.util._
import midas.targetutils.SynthesizePrintf
import org.chipsalliance.cde.config._
import radiance.core._
import radiance.memory._
import radiance.subsystem.{GPUMemParams, GPUMemory, RadianceSimArgs}