Merge remote-tracking branch 'origin/main' into asplos-ae

This commit is contained in:
Hansung Kim
2025-01-28 22:29:19 -08:00
17 changed files with 619 additions and 65 deletions

6
.gitmodules vendored
View File

@@ -1,6 +1,8 @@
[submodule "src/main/resources/vsrc/vortex"]
path = src/main/resources/vsrc/vortex
url = https://github.com/hansungk/vortex.git
[submodule "radpie"]
path = radpie
[submodule "cyclotron-main"]
path = cyclotron-main
url = https://github.com/hansungk/cyclotron-main.git
[submodule "cyclotron"]
url = https://github.com/hansungk/cyclotron.git

1
cyclotron Submodule

Submodule cyclotron added at 073584b083

1
cyclotron-main Submodule

Submodule cyclotron-main added at 06081eb052

View File

@@ -3,15 +3,18 @@
##############################################################
VORTEX_SRC_DIR = $(base_dir)/generators/radiance/src/main/resources/vsrc/vortex
RADPIE_SRC_DIR = $(base_dir)/generators/radiance/radpie
RADPIE_BUILD_DIR = $(RADPIE_SRC_DIR)/target/release
CYCLOTRON_SRC_DIR = $(base_dir)/generators/radiance/cyclotron
CYCLOTRON_BUILD_DIR = $(CYCLOTRON_SRC_DIR)/target/debug
# CYCLOTRON_BUILD_DIR = $(CYCLOTRON_SRC_DIR)/target/release
RADIANCE_CSRC_DIR = $(base_dir)/generators/radiance/src/main/resources/csrc
RADIANCE_VSRC_DIR = $(base_dir)/generators/radiance/src/main/resources/vsrc
##################################################################
# THE FOLLOWING MUST BE += operators
##################################################################
# EXTRA_SIM_REQS += radpie
# EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradpie
EXTRA_SIM_REQS += cyclotron
EXTRA_SIM_LDFLAGS += -L$(CYCLOTRON_BUILD_DIR) -Wl,-rpath,$(CYCLOTRON_BUILD_DIR) -lcyclotron
ifeq ($(shell echo $(CONFIG) | grep -E "SynConfig$$"),$(CONFIG))
EXTRA_SIM_PREPROC_DEFINES += +define+SYNTHESIS +define+NDEBUG +define+DPI_DISABLE
endif
@@ -31,21 +34,25 @@ VCS_NONCC_OPTS += +vcs+initreg+random
# cargo handles building of Rust files all on its own, so make this a PHONY
# target to run cargo unconditionally
.PHONY: radpie
radpie:
cd $(RADPIE_SRC_DIR) && cargo build --release
.PHONY: cyclotron
cyclotron:
cd $(CYCLOTRON_SRC_DIR) && cargo build # --release
EXTRA_SIM_REQS += vortex_vsrc.$(CONFIG)
# below manipulation of VORTEX_VLOG_SOURCES doesn't work if we try to reuse
# below manipulation of RADIANCE_EXTERNAL_SRCS doesn't work if we try to reuse
# $(call lookup_srcs) from common.mk, the variable doesn't expand somehow
ifeq ($(shell which fd 2> /dev/null),)
VORTEX_VLOG_SOURCES := $(shell find -L $(VORTEX_SRC_DIR) -type f -iname "*.sv" -o -iname "*.vh" -o -iname "*.v")
# Collect every external source file (Verilog under RADIANCE_VSRC_DIR, anything
# under RADIANCE_CSRC_DIR) that may need to be mirrored into gen-collateral/.
#
# NOTE(review): the probe below looks for a binary called `fdfd`, while the
# else branch invokes `fdfind`.  As written the probe presumably never
# succeeds, so the `find` branch is always taken -- confirm whether this is an
# intentional way of disabling the fd path or a typo for `fdfind`.
ifeq ($(shell which fdfd 2> /dev/null),)
# RADIANCE_EXTERNAL_SRCS := $(shell find -L $(VORTEX_SRC_DIR) -type f -iname "*.sv" -o -iname "*.vh" -o -iname "*.v")
# The -o alternatives are grouped with \( \) so that `-type f` applies to all
# three extensions; without the grouping it only constrains the "*.sv" match
# because the implicit -a binds tighter than -o.
RADIANCE_EXTERNAL_SRCS := $(shell find -L $(RADIANCE_VSRC_DIR) -type f \( -iname "*.sv" -o -iname "*.vh" -o -iname "*.v" \))
RADIANCE_EXTERNAL_SRCS += $(shell find -L $(RADIANCE_CSRC_DIR) -type f)
else
VORTEX_VLOG_SOURCES := $(shell fd -L -t f -e "sv" -e "vh" -e "v" . $(VORTEX_SRC_DIR))
# RADIANCE_EXTERNAL_SRCS := $(shell fdfind -L -t f -e "sv" -e "vh" -e "v" . $(VORTEX_SRC_DIR))
RADIANCE_EXTERNAL_SRCS := $(shell fdfind -L -t f -e "sv" -e "vh" -e "v" . $(RADIANCE_VSRC_DIR))
RADIANCE_EXTERNAL_SRCS += $(shell fdfind -L -t f . $(RADIANCE_CSRC_DIR))
endif
# VORTEX_COLLATERAL := $(patsubst $(VORTEX_SRC_DIR)%,$(GEN_COLLATERAL_DIR)%,$(VORTEX_VLOG_SOURCES))
# check if expanded
# $(info VORTEX_VLOG_SOURCES: $(VORTEX_VLOG_SOURCES))
# for debug; check if expanded
# $(info RADIANCE_EXTERNAL_SRCS: $(RADIANCE_EXTERNAL_SRCS))
# For every Vortex verilog source file, if there's a matching file in
# gen-collateral/, copy them over. This is a hacky way to ensure the changes
@@ -53,8 +60,8 @@ endif
# necessary when common.mk does not trigger chipyard jar rebuild upon verilog
# source updates, in which case we need to manually ensure the up-to-date-ness
# of gen-collateral/.
vortex_vsrc.$(CONFIG): $(VORTEX_VLOG_SOURCES)
@for file in $(VORTEX_VLOG_SOURCES); do \
vortex_vsrc.$(CONFIG): $(RADIANCE_EXTERNAL_SRCS)
@for file in $(RADIANCE_EXTERNAL_SRCS); do \
filename=$$(basename "$$file"); \
if [ -f $(GEN_COLLATERAL_DIR)/$$filename ]; then \
if ! diff $$file $(GEN_COLLATERAL_DIR)/$$filename &>/dev/null ; then \

1
radpie

Submodule radpie deleted from 493b8e10a5

View File

@@ -0,0 +1,43 @@
#ifndef NO_VPI
#include <vpi_user.h>
#include <svdpi.h>
#endif
#include <stdint.h>
extern "C" void emulator_init_rs(int num_lanes);
extern "C" void emulator_tick_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid,
long long *vec_a_address,
uint8_t *vec_a_is_store, int *vec_a_size,
long long *vec_a_data, uint8_t *vec_d_ready,
uint8_t *vec_d_valid, uint8_t *vec_d_is_store,
int *vec_d_size, long long *vec_d_data,
uint8_t inflight, uint8_t *finished);
// extern "C" void emulator_generate_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid,
// long long *vec_a_address,
// uint8_t *vec_a_is_store, int *vec_a_size,
// long long *vec_a_data,
// uint8_t *vec_d_ready, uint8_t inflight,
// uint8_t *finished);
// C-linkage entry point that forwards num_lanes unchanged to
// emulator_init_rs(), which is only declared (not defined) in this file.
// NOTE(review): presumably emulator_init_rs is provided by the library linked
// via -lcyclotron -- confirm.
extern "C" void emulator_init(int num_lanes) { emulator_init_rs(num_lanes); }
// C-linkage entry point that forwards every argument, in the same order, to
// emulator_tick_rs().  All vec_* arguments are pointers to per-lane arrays;
// `inflight` is passed by value and `finished` is a pointer the callee may
// write through.  This wrapper performs no validation or conversion itself.
// NOTE(review): the array lengths are not visible here; presumably they must
// match the array size used by the caller -- confirm.
extern "C" void emulator_tick(uint8_t *vec_a_ready, uint8_t *vec_a_valid,
long long *vec_a_address, uint8_t *vec_a_is_store,
int *vec_a_size, long long *vec_a_data,
uint8_t *vec_d_ready, uint8_t *vec_d_valid,
uint8_t *vec_d_is_store, int *vec_d_size,
long long *vec_d_data, uint8_t inflight,
uint8_t *finished) {
emulator_tick_rs(vec_a_ready, vec_a_valid, vec_a_address, vec_a_is_store,
vec_a_size, vec_a_data, vec_d_ready, vec_d_valid,
vec_d_is_store, vec_d_size, vec_d_data, inflight, finished);
}
// extern "C" void emulator_generate(uint8_t *vec_a_ready, uint8_t *vec_a_valid,
// long long *vec_a_address,
// uint8_t *vec_a_is_store, int *vec_a_size,
// long long *vec_a_data, uint8_t *vec_d_ready,
// uint8_t inflight, uint8_t *finished) {
// emulator_generate_rs(vec_a_ready, vec_a_valid, vec_a_address, vec_a_is_store,
// vec_a_size, vec_a_data, vec_d_ready, inflight, finished);
// }

View File

@@ -2,7 +2,6 @@
#include <vpi_user.h>
#include <svdpi.h>
#endif
#include <stdio.h>
#include <stdint.h>
extern "C" void memfuzz_init_rs(int num_lanes);

View File

@@ -0,0 +1,131 @@
`include "SimDefaults.vh"
// One-time initialization hook of the emulator.
// `num_lanes` is declared as `int` (32-bit) so that it matches the C-side
// definition `void emulator_init(int num_lanes)`; a `longint` here would map
// to a 64-bit C `long long` and disagree with that prototype.
import "DPI-C" function void emulator_init(
  input int num_lanes
);
// Make sure to sync the parameters for:
// (1) import "DPI-C" declaration
// (2) C function declaration
// (3) DPI function calls inside initial/always blocks
import "DPI-C" function void emulator_tick
(
input bit vec_a_ready[`MAX_NUM_LANES],
output bit vec_a_valid[`MAX_NUM_LANES],
output longint vec_a_address[`MAX_NUM_LANES],
output bit vec_a_is_store[`MAX_NUM_LANES],
output int vec_a_size[`MAX_NUM_LANES],
output longint vec_a_data[`MAX_NUM_LANES],
output bit vec_d_ready[`MAX_NUM_LANES],
input bit vec_d_valid[`MAX_NUM_LANES],
input bit vec_d_is_store[`MAX_NUM_LANES],
input int vec_d_size[`MAX_NUM_LANES],
input longint vec_d_data[`MAX_NUM_LANES],
input bit inflight,
output bit finished
);
// SimEmulator: simulation-only bridge between flattened per-lane request (a_*)
// and response (d_*) ports and the DPI-C functions emulator_init() and
// emulator_tick().
//   - emulator_init(NUM_LANES) is called once from an initial block.
//   - emulator_tick(...) is called on every posedge of `clock` while `reset`
//     is low; while `reset` is high the DPI-driven values are cleared instead.
// Per-lane fields are packed lane-0-first into the wide a_*/d_* vectors.
module SimEmulator #(parameter NUM_LANES = 4) (
input clock,
input reset,
input [NUM_LANES-1:0] a_ready,
output [NUM_LANES-1:0] a_valid,
output [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] a_address,
output [NUM_LANES-1:0] a_is_store,
output [`SIMMEM_LOGSIZE_WIDTH*NUM_LANES-1:0] a_size,
output [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] a_data,
output [NUM_LANES-1:0] d_ready,
input [NUM_LANES-1:0] d_valid,
input [NUM_LANES-1:0] d_is_store,
input [`SIMMEM_LOGSIZE_WIDTH*NUM_LANES-1:0] d_size,
input [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] d_data,
// TODO: d_mask
input inflight,
output finished
);
// "in": C->verilog, "out": verilog->C
// need to be in ascending order to match with C indexing
// C array sizes are static, so need to use MAX_NUM_LANES
bit __out_a_ready [0:`MAX_NUM_LANES-1];
bit __in_a_valid [0:`MAX_NUM_LANES-1];
longint __in_a_address [0:`MAX_NUM_LANES-1];
bit __in_a_is_store [0:`MAX_NUM_LANES-1];
int __in_a_size [0:`MAX_NUM_LANES-1];
longint __in_a_data [0:`MAX_NUM_LANES-1];
bit __in_d_ready [0:`MAX_NUM_LANES-1];
bit __out_d_valid [0:`MAX_NUM_LANES-1];
bit __out_d_is_store [0:`MAX_NUM_LANES-1];
int __out_d_size [0:`MAX_NUM_LANES-1];
longint __out_d_data [0:`MAX_NUM_LANES-1];
bit __out_inflight;
bit __in_finished;
// Per-lane glue between the packed module ports and the unpacked DPI arrays.
// Only lanes [0, NUM_LANES) are wired; array entries at index NUM_LANES and
// above are not driven by this loop.
genvar g;
generate
for (g = 0; g < NUM_LANES; g = g + 1) begin
assign __out_a_ready[g] = a_ready[g];
assign a_valid[g] = __in_a_valid[g];
// the 64-bit/32-bit DPI values are truncated to the port field widths
assign a_address[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH]
= __in_a_address[g][`SIMMEM_DATA_WIDTH-1:0];
assign a_is_store[g] = __in_a_is_store[g];
assign a_size[`SIMMEM_LOGSIZE_WIDTH*g +: `SIMMEM_LOGSIZE_WIDTH]
= __in_a_size[g][`SIMMEM_LOGSIZE_WIDTH-1:0];
assign a_data[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH]
= __in_a_data[g][`SIMMEM_DATA_WIDTH-1:0];
assign d_ready[g] = __in_d_ready[g];
assign __out_d_valid[g] = d_valid[g];
assign __out_d_is_store[g] = d_is_store[g];
assign __out_d_size[g] = d_size[`SIMMEM_LOGSIZE_WIDTH*g +: `SIMMEM_LOGSIZE_WIDTH];
assign __out_d_data[g] = d_data[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH];
end
assign __out_inflight = inflight;
endgenerate
assign finished = __in_finished;
// one-time emulator setup at the start of simulation
initial begin
emulator_init(NUM_LANES);
end
// negedge might make it easier to view waveform since DPI changes are
// instant and make it look like they happen before the clockedge
always @(posedge clock) begin
if (reset) begin
// hold all C->verilog values at zero during reset; the DPI tick function is
// not called in this branch
for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
__in_a_valid[tid] = 1'b0;
__in_a_address[tid] = `SIMMEM_DATA_WIDTH'b0;
__in_a_is_store[tid] = 1'b0;
__in_a_size[tid] = 32'b0;
__in_a_data[tid] = `SIMMEM_DATA_WIDTH'b0;
__in_d_ready[tid] = 1'b0;
end
__in_finished = 1'b0;
end else begin
// argument order must stay in sync with the import "DPI-C" declaration and
// the C function signature
emulator_tick(
__out_a_ready,
__in_a_valid,
__in_a_address,
__in_a_is_store,
__in_a_size,
__in_a_data,
__in_d_ready,
__out_d_valid,
__out_d_is_store,
__out_d_size,
__out_d_data,
__out_inflight,
__in_finished
);
// for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
// $display("verilog: %04d a_valid[%d]=%d, a_address[%d]=0x%x, d_ready[%d]=%d",
// $time, tid, __in_a_valid[tid], tid, __in_a_address[tid], tid, __in_d_ready[tid]);
// end
end
end
endmodule

View File

@@ -47,7 +47,7 @@ module SimMemFuzzer #(parameter NUM_LANES = 4) (
input inflight,
output finished
);
// "in": verilog->C, "out": C->verilog
// "in": C->verilog, "out": verilog->C
// need to be in ascending order to match with C indexing
// C array sizes are static, so need to use MAX_NUM_LANES
bit __out_a_ready [0:`MAX_NUM_LANES-1];

View File

@@ -0,0 +1,246 @@
package radiance.core
import chisel3._
import chisel3.util._
import org.chipsalliance.cde.config.{Field, Parameters}
import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
import freechips.rocketchip.tilelink._
import freechips.rocketchip.diplomacy.{IdRange, AddressSet, BufferParams}
import radiance.memory.{SourceGenerator, TraceLine, TLPrintf}
case class SIMTCoreParams(
nWarps: Int = 4, // # of warps in the core
nCoreLanes: Int = 4, // # of SIMT threads in the core
nMemLanes: Int = 4, // # of memory lanes in the memory interface to the
// cache; relates to the LSU lanes
nSrcIds: Int = 8 // # of source IDs allocated to each of the nMemLanes
)
case class MemtraceCoreParams(
tracefilename: String = "undefined",
traceHasSource: Boolean = false
)
case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None /*default*/ )
case object MemtraceCoreKey
extends Field[Option[MemtraceCoreParams]](None /*default*/ )
// #############################################################################
// FIXME: copy-paste from MemFuzzer
// #############################################################################
/** Diplomatic wrapper around the DPI-backed emulator.
  *
  * Creates one TileLink client node per lane, each advertising the source ID
  * range [0, numSrcIds), and connects all of them into a single
  * `TLIdentityNode` (`node`) that the rest of the design attaches to.
  *
  * @param numLanes        number of per-lane TileLink client nodes to create
  * @param numSrcIds       size of the source ID range of every lane
  * @param wordSizeInBytes forwarded unchanged to [[EmulatorImp]]
  */
class Emulator(
numLanes: Int,
numSrcIds: Int,
wordSizeInBytes: Int,
)(implicit p: Parameters)
extends LazyModule {
// one client node per lane, named "Emulator<i>"
val laneNodes = Seq.tabulate(numLanes) { i =>
val clientParam = Seq(
TLMasterParameters.v1(
name = "Emulator" + i.toString,
sourceId = IdRange(0, numSrcIds)
// visibility = Seq(AddressSet(0x0000, 0xffffff))
)
)
TLClientNode(Seq(TLMasterPortParameters.v1(clientParam)))
}
// single outward-facing node that all lane nodes are connected into
val node = TLIdentityNode()
laneNodes.foreach(node := _)
lazy val module = new EmulatorImp(this, numLanes, numSrcIds, wordSizeInBytes)
}
/** Module implementation of [[Emulator]].
  *
  * Instantiates the `SimEmulator` black box, unpacks its flattened per-lane
  * request ports into `TraceLine` requests, and converts every request into a
  * TileLink Get or Put on the matching lane node of `outer`.  TileLink D
  * channel responses are packed and fed back into the black box.
  *
  * @param outer           enclosing LazyModule owning the per-lane client nodes
  * @param numLanes        number of lanes; sizes the black box and all vectors
  * @param numSrcIds       number of source IDs per lane; sizes each
  *                        SourceGenerator
  * @param wordSizeInBytes word size; requests smaller than a word are widened
  *                        to one word-sized, word-aligned access
  */
class EmulatorImp(
    outer: Emulator,
    numLanes: Int,
    numSrcIds: Int,
    wordSizeInBytes: Int,
) extends LazyModuleImp(outer) {
  val io = IO(new Bundle {
    val finished = Output(Bool())
  })

  val sim = Module(new SimEmulator(numLanes))
  sim.io.clock := clock
  sim.io.reset := reset.asBool
  // sim.io.a.ready is driven exactly once, below, from the per-lane request
  // handshake (laneReqs(i).ready).  An earlier direct connection to
  // tlOut.a.ready was always overridden by that later connection and has been
  // dropped.
  io.finished := sim.io.finished

  // connect Verilog <-> Chisel IO
  // Verilog IO flattened across all lanes
  val laneReqs = Wire(Vec(numLanes, Decoupled(new TraceLine)))
  val addrW = laneReqs(0).bits.address.getWidth
  val sizeW = laneReqs(0).bits.size.getWidth
  val dataW = laneReqs(0).bits.data.getWidth
  laneReqs.zipWithIndex.foreach { case (req, i) =>
    req.valid := sim.io.a.valid(i)
    req.bits.source := 0.U // DPI doesn't generate a source id
    req.bits.address := sim.io.a.address(addrW * (i + 1) - 1, addrW * i)
    req.bits.is_store := sim.io.a.is_store(i)
    req.bits.size := sim.io.a.size(sizeW * (i + 1) - 1, sizeW * i)
    req.bits.data := sim.io.a.data(dataW * (i + 1) - 1, dataW * i)
  }
  sim.io.a.ready := VecInit(laneReqs.map(_.ready)).asUInt

  val laneResps = Wire(Vec(numLanes, Flipped(Decoupled(new TraceLine))))
  laneResps.zipWithIndex.foreach { case (resp, i) =>
    resp.ready := sim.io.d.ready(i)
    // TODO: not handled in DPI
    resp.bits.source := DontCare
    resp.bits.address := DontCare
    resp.bits.data := DontCare
  }
  sim.io.d.valid := VecInit(laneResps.map(_.valid)).asUInt
  sim.io.d.is_store := VecInit(laneResps.map(_.bits.is_store)).asUInt
  sim.io.d.size := VecInit(laneResps.map(_.bits.size)).asUInt
  sim.io.d.data := VecInit(laneResps.map(_.bits.data)).asUInt

  // one source ID allocator per lane
  val sourceGens = Seq.fill(numLanes)(
    Module(
      new SourceGenerator(
        log2Ceil(numSrcIds),
        ignoreInUse = false
      )
    )
  )
  val anyInflight = sourceGens.map(_.io.inflight).reduce(_ || _)
  sim.io.inflight := anyInflight

  // Take requests off of the queue and generate TL requests
  (outer.laneNodes zip (laneReqs zip laneResps)).zipWithIndex.foreach {
    case ((node, (req, resp)), lane) =>
      val (tlOut, edge) = node.out(0)

      // Requests --------------------------------------------------------------
      //
      // Core only makes accesses of granularity larger than a word, so we want
      // the trace driver to act so as well.
      // That means if req.size is smaller than word size, we need to pad data
      // with zeros to generate a word-size request, and set mask accordingly.
      val offsetInWord = req.bits.address % wordSizeInBytes.U
      val subword = req.bits.size < log2Ceil(wordSizeInBytes).U

      // `mask` is currently unused
      // val mask = Wire(UInt(wordSizeInBytes.W))
      val wordData = Wire(UInt((wordSizeInBytes * 8 * 2).W))
      val sizeInBytes = Wire(UInt((sizeW + 1).W))
      sizeInBytes := (1.U) << req.bits.size
      // mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U)
      wordData := Mux(subword, req.bits.data << (offsetInWord * 8.U), req.bits.data)
      val wordAlignedAddress =
        req.bits.address & ~((1 << log2Ceil(wordSizeInBytes)) - 1).U(addrW.W)
      // A sub-word request is widened to exactly one word, so its lgSize is
      // log2(wordSizeInBytes) rather than a hard-coded constant.
      val wordAlignedSize =
        Mux(subword, log2Ceil(wordSizeInBytes).U, req.bits.size)

      val sourceGen = sourceGens(lane)
      sourceGen.io.gen := tlOut.a.fire
      sourceGen.io.reclaim.valid := tlOut.d.fire
      sourceGen.io.reclaim.bits := tlOut.d.bits.source
      sourceGen.io.meta := DontCare

      val (plegal, pbits) = edge.Put(
        fromSource = sourceGen.io.id.bits,
        toAddress = wordAlignedAddress,
        lgSize = wordAlignedSize, // trace line already holds log2(size)
        // data should be aligned to beatBytes
        data =
          (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
      )
      val (glegal, gbits) = edge.Get(
        fromSource = sourceGen.io.id.bits,
        toAddress = wordAlignedAddress,
        lgSize = wordAlignedSize
      )
      val legal = Mux(req.bits.is_store, plegal, glegal)
      val bits = Mux(req.bits.is_store, pbits, gbits)

      // a request is only issued when a source ID is available for it
      tlOut.a.valid := req.valid && sourceGen.io.id.valid
      req.ready := tlOut.a.ready && sourceGen.io.id.valid
      when(tlOut.a.fire) {
        assert(legal, "illegal TL req gen")
      }
      tlOut.a.bits := bits

      // Responses -------------------------------------------------------------
      //
      tlOut.d.ready := resp.ready
      resp.valid := tlOut.d.valid
      // a D response without data is treated as the ack of a store
      resp.bits.is_store := !edge.hasData(tlOut.d.bits)
      resp.bits.size := tlOut.d.bits.size

      // channels B/C/E are not used by this client
      tlOut.b.ready := true.B
      tlOut.c.valid := false.B
      tlOut.e.valid := false.B

      // debug
      dontTouch(req)
      when(tlOut.a.valid) {
        printf(s"Lane ${lane}: ");
        TLPrintf(
          "Emulator",
          tlOut.a.bits.source,
          tlOut.a.bits.address,
          tlOut.a.bits.size,
          tlOut.a.bits.mask,
          req.bits.is_store,
          tlOut.a.bits.data,
          req.bits.data
        )
      }
      dontTouch(tlOut.a)
      dontTouch(tlOut.d)
  }

  // when(traceFinished && allReqReclaimed && noValidReqs) {
  // assert(
  // false.B,
  // "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)"
  // )
  // }
}
/** Chisel black box for the Verilog module `SimEmulator`.
  *
  * `numLanes` is passed to the Verilog side as the `NUM_LANES` parameter.
  * Field widths of the flattened ports are taken from `TraceLine`.  The
  * Verilog header, the Verilog module, and the C++ DPI shim are registered as
  * black-box resources.
  *
  * @param numLanes number of lanes; every per-lane port is `numLanes` fields
  *                 wide
  */
class SimEmulator(numLanes: Int)
extends BlackBox(Map("NUM_LANES" -> numLanes))
with HasBlackBoxResource {
// template instance used only to query field widths
val traceLineT = new TraceLine
val addrW = traceLineT.address.getWidth
val sizeW = traceLineT.size.getWidth
val dataW = traceLineT.data.getWidth
val io = IO(new Bundle {
val clock = Input(Clock())
val reset = Input(Bool())
val inflight = Input(Bool())
val finished = Output(Bool())
// request side: driven by the black box, `ready` comes back from Chisel
val a =
new Bundle {
val ready = Input(UInt(numLanes.W))
val valid = Output(UInt(numLanes.W))
// Chisel can't interface with Verilog 2D port, so flatten all lanes into
// single wide 1D array.
val address = Output(UInt((addrW * numLanes).W))
val is_store = Output(UInt(numLanes.W))
val size = Output(UInt((sizeW * numLanes).W))
val data = Output(UInt((dataW * numLanes).W))
}
// response side: driven by Chisel, `ready` comes from the black box
val d =
new Bundle {
val ready = Output(UInt(numLanes.W))
val valid = Input(UInt(numLanes.W))
val is_store = Input(UInt(numLanes.W))
val size = Input(UInt((sizeW * numLanes).W))
val data = Input(UInt((dataW * numLanes).W))
}
})
addResource("/vsrc/SimDefaults.vh")
addResource("/vsrc/SimEmulator.v")
addResource("/csrc/SimEmulator.cc")
}

View File

@@ -4,6 +4,7 @@ import freechips.rocketchip.diplomacy.LazyModule
import freechips.rocketchip.subsystem._
import org.chipsalliance.cde.config.Parameters
import freechips.rocketchip.tilelink._
import radiance.core.{SIMTCoreKey, MemtraceCoreKey}
// TODO: possibly move to somewhere closer to CoalescingUnit
// TODO: separate coalescer config from CanHaveMemtraceCore

View File

@@ -10,25 +10,10 @@ import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
import freechips.rocketchip.util.{Code, MultiPortQueue, OnePortLanePositionedQueue}
import freechips.rocketchip.unittest._
import freechips.rocketchip.tilelink._
import radiance.core.{SIMTCoreParams, SIMTCoreKey}
// TODO: find better place for these
case class SIMTCoreParams(
nWarps: Int = 4, // # of warps in the core
nCoreLanes: Int = 4, // # of SIMT threads in the core
nMemLanes: Int = 4, // # of memory lanes in the memory interface to the
// cache; relates to the LSU lanes
nSrcIds: Int = 8 // # of source IDs allocated to each of the nMemLanes
)
case class MemtraceCoreParams(
tracefilename: String = "undefined",
traceHasSource: Boolean = false
)
case class CoalXbarParam()
case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None /*default*/ )
case object MemtraceCoreKey
extends Field[Option[MemtraceCoreParams]](None /*default*/ )
case object CoalescerKey
extends Field[Option[CoalescerConfig]](None /*default*/ )
case object CoalXbarKey extends Field[Option[CoalXbarParam]](None /*default*/ )
@@ -2055,7 +2040,7 @@ class MemFuzzer(
val laneNodes = Seq.tabulate(numLanes) { i =>
val clientParam = Seq(
TLMasterParameters.v1(
name = "MemTraceDriver" + i.toString,
name = "MemFuzzer" + i.toString,
sourceId = IdRange(0, numSrcIds)
// visibility = Seq(AddressSet(0x0000, 0xffffff))
)

View File

@@ -12,6 +12,7 @@ import freechips.rocketchip.subsystem._
import gemmini._
import gemmini.Arithmetic.FloatArithmetic._
import radiance.tile._
import radiance.core._
import radiance.memory._
import radiance.subsystem.RadianceGemminiDataType.{BF16, FP16, FP32, Int8}
@@ -106,6 +107,44 @@ class WithRadianceCores(
), tensorCoreFP16, tensorCoreDecoupled, useVxCache)
}
/** Config fragment that attaches `n` emulator tiles to the subsystem.
  *
  * Tile IDs are allocated contiguously starting at the current `NumTiles`
  * value, and the new tiles are prepended to the tiles already located in the
  * subsystem.
  *
  * @param n          number of emulator tiles to attach
  * @param useVxCache forwarded to `EmulatorTileParams.useVxCache`
  */
class WithEmulatorCores(
  n: Int,
  useVxCache: Boolean
) extends Config((site, _, up) => {
  case TilesLocated(InSubsystem) => {
    val prev = up(TilesLocated(InSubsystem))
    val idOffset = up(NumTiles)
    val emulator = EmulatorTileParams(
      core = VortexCoreParams(),
      useVxCache = useVxCache)
    List.tabulate(n)(i => EmulatorTileAttachParams(
      emulator.copy(tileId = i + idOffset),
      RocketCrossingParams()
    )) ++ prev
  }
  // `n` tiles consume tile IDs idOffset .. idOffset + n - 1, so the counters
  // must advance by `n` (not 1) for later fragments to get fresh IDs.
  case NumTiles => up(NumTiles) + n
  case NumRadianceCores => up(NumRadianceCores) + n
})
/** Config fragment that attaches `n` fuzzer tiles to the subsystem.
  *
  * Tile IDs are allocated contiguously starting at the current `NumTiles`
  * value, and the new tiles are prepended to the tiles already located in the
  * subsystem.
  *
  * @param n          number of fuzzer tiles to attach
  * @param useVxCache forwarded to `FuzzerTileParams.useVxCache`
  */
class WithFuzzerCores(
  n: Int,
  useVxCache: Boolean
) extends Config((site, _, up) => {
  case TilesLocated(InSubsystem) => {
    val prev = up(TilesLocated(InSubsystem))
    val idOffset = up(NumTiles)
    val fuzzer = FuzzerTileParams(
      core = VortexCoreParams(),
      useVxCache = useVxCache)
    List.tabulate(n)(i => FuzzerTileAttachParams(
      fuzzer.copy(tileId = i + idOffset),
      RocketCrossingParams()
    )) ++ prev
  }
  // `n` tiles consume tile IDs idOffset .. idOffset + n - 1, so the counters
  // must advance by `n` (not 1) for later fragments to get fresh IDs.
  case NumTiles => up(NumTiles) + n
  case NumRadianceCores => up(NumRadianceCores) + n
})
object RadianceGemminiDataType extends Enumeration {
type Type = Value
val FP32, FP16, BF16, Int8 = Value
@@ -136,7 +175,7 @@ class WithRadianceGemmini(location: HierarchicalLocation, crossing: RocketCrossi
case FP16 => GemminiFPConfigs.FP16DefaultConfig.copy(
acc_scale_args = Some(ScaleArguments(
(t: Float, u: Float) => {t},
1, Float(5, 11), -1, identity = "1.0", c_str = "((x))"
1, Float(8, 24), -1, identity = "1.0", c_str = "((x))"
)),
mvin_scale_args = Some(ScaleArguments(
(t: Float, u: Float) => t * u,
@@ -148,8 +187,8 @@ class WithRadianceGemmini(location: HierarchicalLocation, crossing: RocketCrossi
// from sirius
spatialArrayInputType = Float(5, 11, isRecoded = skipRecoding),
spatialArrayWeightType = Float(5, 11, isRecoded = skipRecoding),
spatialArrayOutputType = Float(5, 11, isRecoded = skipRecoding),
accType = Float(5, 11),
spatialArrayOutputType = Float(8, 24, isRecoded = skipRecoding),
accType = Float(8, 24),
// hardcode_d_to_garbage_addr = true,
acc_read_full_width = false, // set to true to output fp32
@@ -244,25 +283,6 @@ class WithRadianceFrameBuffer(baseAddress: BigInt,
}
})
class WithFuzzerCores(
n: Int,
useVxCache: Boolean
) extends Config((site, _, up) => {
case TilesLocated(InSubsystem) => {
val prev = up(TilesLocated(InSubsystem))
val idOffset = up(NumTiles)
val fuzzer = FuzzerTileParams(
core = VortexCoreParams(),
useVxCache = useVxCache)
List.tabulate(n)(i => FuzzerTileAttachParams(
fuzzer.copy(tileId = i + idOffset),
RocketCrossingParams()
)) ++ prev
}
case NumTiles => up(NumTiles) + 1
case NumRadianceCores => up(NumRadianceCores) + 1
})
class WithRadianceCluster(
clusterId: Int,
location: HierarchicalLocation = InSubsystem,

View File

@@ -0,0 +1,96 @@
// See LICENSE.SiFive for license details.
// See LICENSE.Berkeley for license details.
package radiance.tile
import chisel3._
import org.chipsalliance.cde.config.Parameters
import org.chipsalliance.diplomacy.lazymodule.LazyModule
import freechips.rocketchip.resources.SimpleDevice
import freechips.rocketchip.prci.ClockCrossingType
import freechips.rocketchip.rocket._
import freechips.rocketchip.tile._
import freechips.rocketchip.tilelink._
import freechips.rocketchip.subsystem.{HierarchicalElementCrossingParamsLike, CanAttachTile}
import freechips.rocketchip.prci.{ClockSinkParameters}
import radiance.core._
import radiance.memory.{CoalescingUnit, CoalescerKey}
// TODO: De-duplicate between this and FuzzerTile
/** Parameters of an [[EmulatorTile]].
  *
  * The tile has no caches, BTB, or blocker control address (all `None`).
  *
  * @param core       core parameters (marked for removal by the TODO below)
  * @param useVxCache not read inside this class
  * @param tileId     tile ID; also used as the suffix of `uniqueName`
  */
case class EmulatorTileParams(
core: VortexCoreParams = VortexCoreParams(), // TODO: remove this
useVxCache: Boolean = false,
tileId: Int = 0,
) extends InstantiableTileParams[EmulatorTile] {
// builds the tile from these parameters
def instantiate(crossing: HierarchicalElementCrossingParamsLike, lookup: LookupByHartIdImpl)(
implicit p: Parameters
): EmulatorTile = {
new EmulatorTile(this, crossing, lookup)
}
val clockSinkParams = ClockSinkParameters()
val blockerCtrlAddr = None
val icache = None
val dcache = None
val btb = None
val baseName = "radiance_emulator_tile"
val uniqueName = s"${baseName}_$tileId"
}
case class EmulatorTileAttachParams(
tileParams: EmulatorTileParams,
crossingParams: HierarchicalElementCrossingParamsLike
) extends CanAttachTile { type TileType = EmulatorTile }
/** Tile that hosts an [[Emulator]] as its only memory traffic source.
  *
  * Lane count and per-lane source ID count are read from `SIMTCoreKey`;
  * elaboration fails if that key is not defined.  When `CoalescerKey` is
  * defined, a `CoalescingUnit` is placed between the emulator and the tile's
  * master node; otherwise the emulator node is connected directly.
  */
class EmulatorTile private (
val EmulatorParams: EmulatorTileParams,
crossing: ClockCrossingType,
lookup: LookupByHartIdImpl,
q: Parameters
) extends BaseTile(EmulatorParams, crossing, lookup, q)
with SinksExternalInterrupts
with SourcesExternalNotifications {
// public constructor: extracts the crossing type and forwards to the private
// primary constructor
def this(
params: EmulatorTileParams,
crossing: HierarchicalElementCrossingParamsLike,
lookup: LookupByHartIdImpl
)(implicit p: Parameters) =
this(params, crossing.crossingType, lookup, p)
val cpuDevice: SimpleDevice = new SimpleDevice("emulator", Nil)
val intOutwardNode = None
val slaveNode: TLInwardNode = TLIdentityNode()
val masterNode = visibilityNode
// val statusNode = BundleBridgeSource(() => new GroundTestStatus)
val (numLanes, numSrcIds) = p(SIMTCoreKey) match {
case Some(param) => (param.nMemLanes, param.nSrcIds)
case None => {
require(false, "emulator requires SIMTCoreKey to be defined")
// unreachable after the failing require; only present to satisfy the type
(0, 0)
}
}
// FIXME: parameterize
val wordSizeInBytes = 4
val emulator = LazyModule(new Emulator(numLanes, numSrcIds, wordSizeInBytes))
// Conditionally instantiate memory coalescer
val coalescerNode = p(CoalescerKey) match {
case Some(coalParam) => {
val coal = LazyModule(new CoalescingUnit(coalParam))
// NOTE(review): the literal 4 presumably mirrors wordSizeInBytes -- confirm
coal.cpuNode :=* TLWidthWidget(4) :=* emulator.node
coal.aggregateNode
}
case None => emulator.node
}
masterNode :=* coalescerNode
override lazy val module = new EmulatorTileModuleImp(this)
}
/** Module implementation of [[EmulatorTile]]; reports the emulator's
  * `finished` output through the tile's cease notification.
  */
class EmulatorTileModuleImp(outer: EmulatorTile) extends BaseTileModuleImp(outer) {
outer.reportCease(Some(outer.emulator.module.io.finished))
}

View File

@@ -4,14 +4,16 @@
package radiance.tile
import chisel3._
import org.chipsalliance.cde.config.{Parameters}
import freechips.rocketchip.diplomacy.{SimpleDevice, LazyModule}
import org.chipsalliance.cde.config.Parameters
import org.chipsalliance.diplomacy.lazymodule.LazyModule
import freechips.rocketchip.resources.SimpleDevice
import freechips.rocketchip.prci.ClockCrossingType
import freechips.rocketchip.rocket._
import freechips.rocketchip.tile._
import freechips.rocketchip.tilelink._
import freechips.rocketchip.subsystem.{HierarchicalElementCrossingParamsLike, CanAttachTile}
import freechips.rocketchip.prci.{ClockSinkParameters}
import radiance.core.{SIMTCoreKey}
import radiance.memory._
case class FuzzerTileParams(

View File

@@ -168,6 +168,8 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
val rs2 = UInt(64.W)
}
val ciscInst = Wire(ciscInstT)
val startsLoop = WireInit(false.B)
val runningLoops = RegInit(0.U(4.W))
val accCommandQueue = Module(new Queue(UInt(32.W), 4, false, true))
accCommandQueue.io.enq.bits := accSlave.cmd.bits
@@ -175,10 +177,15 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
accCommandQueue.io.deq.ready := !ciscValid
assert(!accSlave.cmd.valid || accCommandQueue.io.enq.ready, "cisc command queue full")
when (accCommandQueue.io.enq.fire) {
val enqId = accSlave.cmd.bits(6, 0)
startsLoop := VecInit(Seq(0, 1, 2, 9, 10, 12).map { x => enqId === x.U }).asUInt.orR
}
when (accCommandQueue.io.deq.fire) {
ciscValid := true.B
ciscId := accSlave.cmd.bits(7, 0)
ciscArgs := accSlave.cmd.bits(31, 8)
ciscId := accCommandQueue.io.deq.bits(7, 0)
ciscArgs := accCommandQueue.io.deq.bits(31, 8)
instCounter.reset()
}
@@ -228,6 +235,7 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
println(s"gemmini cisc initialized with DIM=${config.DIM}, tileSize=${tileSizeM},${tileSizeN},${tileSizeK}")
println(f"boundsInst=${rectBoundsInst.litValue}%x, hexadecile=${spadHexadecile}")
when (ciscValid) {
switch (ciscId(6, 0)) {
is (0.U) { // compute on given hexadeciles
@@ -241,6 +249,7 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
val accSkipInst = genAccSkipInst(0.U, ((ciscArgs(23, 16) * spadHexadecile.U) << 32).asUInt | 0x238.U)
ciscInst := microcodeEntry(Seq(boundsInst, strideInst, accSkipInst))
}
is (2.U) {} // no actual invocation, fake job placeholder
is (8.U) { // set a, b stride
val inst = Wire(ciscInstT)
inst.inst := 0x1820b07b.U
@@ -279,6 +288,11 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
}
}
val completionCount = PopCount(outer.gemmini.module.completion_io.completed)
val loopStarted = Mux(startsLoop, 1.U, 0.U)
runningLoops := runningLoops + loopStarted - completionCount
assert(runningLoops + loopStarted >= completionCount)
val gemminiIO = outer.gemmini.module.io.cmd
val regValid = Wire(Bool())
@@ -299,6 +313,11 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
// (!outer.gemmini.module.io.busy, outer.gemmini.module.io.busy.asUInt)
(true.B, outer.gemmini.module.io.busy.asUInt)
}
def gemminiRunningLoopsReg(_dReady: Bool): (Bool, UInt) = {
(true.B, runningLoops)
}
outer.regNode.regmap(
0x00 -> Seq(RegField.w(32, gemminiCommandReg(_, _))),
0x10 -> Seq(
@@ -307,7 +326,8 @@ class GemminiTileModuleImp(outer: GemminiTile) extends BaseTileModuleImp(outer)
0x18 -> Seq(
RegField.w(32, gemminiRs2RegLSB),
RegField.w(32, gemminiRs2RegMSB)),
0x20 -> Seq(RegField.r(32, gemminiBusyReg(_)))
0x20 -> Seq(RegField.r(32, gemminiBusyReg(_))),
0x28 -> Seq(RegField.r(32, gemminiRunningLoopsReg(_)))
)
assert(!regValid || gemminiIO.ready)

View File

@@ -19,6 +19,7 @@ import freechips.rocketchip.tilelink._
import freechips.rocketchip.util._
import midas.targetutils.SynthesizePrintf
import org.chipsalliance.cde.config._
import radiance.core._
import radiance.memory._
import radiance.subsystem.{GPUMemParams, GPUMemory, RadianceSimArgs}