From 3b71276c4ae41bfbca41a368f797989416975f2f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 13 Nov 2024 16:01:11 -0800 Subject: [PATCH 01/14] tensor: Do dot-product in fp16, only do accum in fp32 This is to better match Gemmini PEs doing MACs in full fp16, and only doing accumulation in fp32. --- src/main/scala/radiance/core/TensorDPU.scala | 14 ++++++++++---- .../scala/radiance/TensorCoreDecoupledTest.scala | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala index a4e6db0..ce131df 100644 --- a/src/main/scala/radiance/core/TensorDPU.scala +++ b/src/main/scala/radiance/core/TensorDPU.scala @@ -147,7 +147,7 @@ class DotProductPipe(dim: Int, inputType: tile.FType, outputType: tile.FType) ex val mulSigWidth = m.io.rawOut.sigWidth val roundRawFNToRecFN = Module(new hardfloat.RoundAnyRawFNToRecFN( - mulExpWidth, mulSigWidth, outExpWidth, outSigWidth, 0)) + mulExpWidth, mulSigWidth, expWidth, sigWidth, 0)) roundRawFNToRecFN.io.invalidExc := m.io.invalidExc roundRawFNToRecFN.io.infiniteExc := false.B roundRawFNToRecFN.io.in := m.io.rawOut @@ -169,7 +169,7 @@ class DotProductPipe(dim: Int, inputType: tile.FType, outputType: tile.FType) ex // instantiate wires for input values to each reduction pipeline stage val interim = (log2Dim to 0 by -1).map { i => - Wire(Valid(Vec(1 << i, Bits(recOutFLen.W)))) + Wire(Valid(Vec(1 << i, Bits(recInFLen.W)))) } // instantiate wires for pipe registers for C val interimC = (log2Dim to 0 by -1).map( _ => Wire(Valid(Bits(recOutFLen.W))) ) @@ -186,7 +186,7 @@ class DotProductPipe(dim: Int, inputType: tile.FType, outputType: tile.FType) ex require(inputs.bits.length == 2 * outputs.bits.length) val thisDim = inputs.bits.length val adders = Seq.fill(thisDim / 2)( - Module(new hardfloat.AddRecFN(outExpWidth, outSigWidth)) + Module(new hardfloat.AddRecFN(expWidth, sigWidth)) ) val addOuts = adders.zipWithIndex.map { case (a, i) => 
a.io.subOp := 0.U // FIXME dont know what this is @@ -212,9 +212,15 @@ class DotProductPipe(dim: Int, inputType: tile.FType, outputType: tile.FType) ex // add stages end ------------------------------------------------------------ // add final A and B dot-product result to accumulator C + val conv = Module(new hardfloat.RecFNToRecFN(expWidth, sigWidth, outExpWidth, outSigWidth)) + conv.io.in := addStageOut.bits(0) + conv.io.roundingMode := hardfloat.consts.round_near_even + conv.io.detectTininess := hardfloat.consts.tininess_afterRounding + // assert(conv.io.exceptionFlags === 0.U) + val acc = Module(new hardfloat.AddRecFN(outExpWidth, outSigWidth)) acc.io.subOp := 0.U // FIXME - acc.io.a := addStageOut.bits(0) + acc.io.a := conv.io.out acc.io.b := addStageC.bits acc.io.roundingMode := hardfloat.consts.round_near_even acc.io.detectTininess := hardfloat.consts.tininess_afterRounding diff --git a/src/test/scala/radiance/TensorCoreDecoupledTest.scala b/src/test/scala/radiance/TensorCoreDecoupledTest.scala index 7b31eb7..619f699 100644 --- a/src/test/scala/radiance/TensorCoreDecoupledTest.scala +++ b/src/test/scala/radiance/TensorCoreDecoupledTest.scala @@ -9,7 +9,7 @@ class TensorCoreDecoupledTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "TensorCoreDecoupled" it should "do the right thing" in { - test(new TensorCoreDecoupled(8, 8, numSourceIds = 4, tilingParams = TensorTilingParams())) + test(new TensorCoreDecoupled(8, 8, numSourceIds = 4, half = true)) { c => c.io.initiate.valid.poke(true.B) c.io.initiate.bits.wid.poke(0.U) From 9d703708019c46a54025068569c1e81186179f44 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 25 Nov 2024 22:33:29 -0800 Subject: [PATCH 02/14] Fix deprecation warnings in FuzzerTile --- src/main/scala/radiance/tile/FuzzerTile.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/scala/radiance/tile/FuzzerTile.scala b/src/main/scala/radiance/tile/FuzzerTile.scala index 138b0f8..730f04a 
100644 --- a/src/main/scala/radiance/tile/FuzzerTile.scala +++ b/src/main/scala/radiance/tile/FuzzerTile.scala @@ -4,8 +4,9 @@ package radiance.tile import chisel3._ -import org.chipsalliance.cde.config.{Parameters} -import freechips.rocketchip.diplomacy.{SimpleDevice, LazyModule} +import org.chipsalliance.cde.config.Parameters +import org.chipsalliance.diplomacy.lazymodule.LazyModule +import freechips.rocketchip.resources.SimpleDevice import freechips.rocketchip.prci.ClockCrossingType import freechips.rocketchip.rocket._ import freechips.rocketchip.tile._ From 33ff495febe7c3c0fdbfe54c89d31e792cdd9575 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Nov 2024 11:49:26 -0800 Subject: [PATCH 03/14] Fix doc errors and warnings for memfuzzer --- src/main/resources/csrc/SimMemFuzzer.cc | 1 - src/main/resources/vsrc/SimMemFuzzer.v | 2 +- src/main/scala/radiance/memory/Coalescing.scala | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/resources/csrc/SimMemFuzzer.cc b/src/main/resources/csrc/SimMemFuzzer.cc index e0a7799..5572048 100644 --- a/src/main/resources/csrc/SimMemFuzzer.cc +++ b/src/main/resources/csrc/SimMemFuzzer.cc @@ -2,7 +2,6 @@ #include #include #endif -#include #include extern "C" void memfuzz_init_rs(int num_lanes); diff --git a/src/main/resources/vsrc/SimMemFuzzer.v b/src/main/resources/vsrc/SimMemFuzzer.v index 263eb24..8d50121 100644 --- a/src/main/resources/vsrc/SimMemFuzzer.v +++ b/src/main/resources/vsrc/SimMemFuzzer.v @@ -47,7 +47,7 @@ module SimMemFuzzer #(parameter NUM_LANES = 4) ( input inflight, output finished ); - // "in": verilog->C, "out": C->verilog + // "in": C->verilog, "out": verilog->C // need to be in ascending order to match with C indexing // C array sizes are static, so need to use MAX_NUM_LANES bit __out_a_ready [0:`MAX_NUM_LANES-1]; diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala index a21daee..aafe29c 100644 --- 
a/src/main/scala/radiance/memory/Coalescing.scala +++ b/src/main/scala/radiance/memory/Coalescing.scala @@ -2055,7 +2055,7 @@ class MemFuzzer( val laneNodes = Seq.tabulate(numLanes) { i => val clientParam = Seq( TLMasterParameters.v1( - name = "MemTraceDriver" + i.toString, + name = "MemFuzzer" + i.toString, sourceId = IdRange(0, numSrcIds) // visibility = Seq(AddressSet(0x0000, 0xffffff)) ) From bf0527e2ade1e8eb4d1f444431e831e9af68447a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Nov 2024 14:56:09 -0800 Subject: [PATCH 04/14] radiance.mk: Re-enable radpie; sync csrc/ as well --- radiance.mk | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/radiance.mk b/radiance.mk index c0e1e87..1664cca 100644 --- a/radiance.mk +++ b/radiance.mk @@ -5,13 +5,14 @@ VORTEX_SRC_DIR = $(base_dir)/generators/radiance/src/main/resources/vsrc/vortex RADPIE_SRC_DIR = $(base_dir)/generators/radiance/radpie RADPIE_BUILD_DIR = $(RADPIE_SRC_DIR)/target/release +RADIANCE_CSRC_DIR = $(base_dir)/generators/radiance/src/main/resources/csrc ################################################################## # THE FOLLOWING MUST BE += operators ################################################################## -# EXTRA_SIM_REQS += radpie -# EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradpie +EXTRA_SIM_REQS += radpie +EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradpie ifeq ($(shell echo $(CONFIG) | grep -E "SynConfig$$"),$(CONFIG)) EXTRA_SIM_PREPROC_DEFINES += +define+SYNTHESIS +define+NDEBUG +define+DPI_DISABLE endif @@ -36,16 +37,18 @@ radpie: cd $(RADPIE_SRC_DIR) && cargo build --release EXTRA_SIM_REQS += vortex_vsrc.$(CONFIG) -# below manipulation of VORTEX_VLOG_SOURCES doesn't work if we try to reuse +# below manipulation of RADIANCE_EXTERNAL_SRCS doesn't work if we try to reuse # $(call lookup_srcs) from common.mk, the variable doesn't expand somehow -ifeq ($(shell which fd 2> 
/dev/null),) - VORTEX_VLOG_SOURCES := $(shell find -L $(VORTEX_SRC_DIR) -type f -iname "*.sv" -o -iname "*.vh" -o -iname "*.v") +ifeq ($(shell which fdfind 2> /dev/null),) + RADIANCE_EXTERNAL_SRCS := $(shell find -L $(VORTEX_SRC_DIR) -type f -iname "*.sv" -o -iname "*.vh" -o -iname "*.v") + RADIANCE_EXTERNAL_SRCS += $(shell find -L $(RADIANCE_CSRC_DIR) -type f) else - VORTEX_VLOG_SOURCES := $(shell fd -L -t f -e "sv" -e "vh" -e "v" . $(VORTEX_SRC_DIR)) + RADIANCE_EXTERNAL_SRCS := $(shell fdfind -L -t f -e "sv" -e "vh" -e "v" . $(VORTEX_SRC_DIR)) + RADIANCE_EXTERNAL_SRCS += $(shell fdfind -L -t f . $(RADIANCE_CSRC_DIR)) endif -# VORTEX_COLLATERAL := $(patsubst $(VORTEX_SRC_DIR)%,$(GEN_COLLATERAL_DIR)%,$(VORTEX_VLOG_SOURCES)) -# check if expanded -# $(info VORTEX_VLOG_SOURCES: $(VORTEX_VLOG_SOURCES)) + +# for debug; check if expanded +# $(info RADIANCE_EXTERNAL_SRCS: $(RADIANCE_EXTERNAL_SRCS)) # For every Vortex verilog source file, if there's a matching file in # gen-collateral/, copy them over. This is a hacky way to ensure the changes @@ -53,8 +56,8 @@ endif # necessary when common.mk does not trigger chipyard jar rebuild upon verilog # source updates, in which case we need to manually ensure the up-to-date-ness # of gen-collateral/. -vortex_vsrc.$(CONFIG): $(VORTEX_VLOG_SOURCES) - @for file in $(VORTEX_VLOG_SOURCES); do \ +vortex_vsrc.$(CONFIG): $(RADIANCE_EXTERNAL_SRCS) + @for file in $(RADIANCE_EXTERNAL_SRCS); do \ filename=$$(basename "$$file"); \ if [ -f $(GEN_COLLATERAL_DIR)/$$filename ]; then \ if ! diff $$file $(GEN_COLLATERAL_DIR)/$$filename &>/dev/null ; then \ From 7cc40eedde35ed4ca5ef94804806fe66cbfa5fc4 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Nov 2024 15:23:24 -0800 Subject: [PATCH 05/14] Add EmulatorTile also split core-specific config keys from radiance.memory to radiance.core. 
--- src/main/resources/csrc/SimEmulator.cc | 31 +++ src/main/resources/vsrc/SimEmulator.v | 132 ++++++++++ src/main/scala/radiance/core/Emulator.scala | 243 ++++++++++++++++++ .../radiance/memory/CanHaveMemtraceCore.scala | 1 + .../scala/radiance/memory/Coalescing.scala | 17 +- .../scala/radiance/subsystem/Configs.scala | 58 +++-- .../scala/radiance/tile/EmulatorTile.scala | 96 +++++++ src/main/scala/radiance/tile/FuzzerTile.scala | 1 + .../scala/radiance/tile/RadianceTile.scala | 1 + 9 files changed, 545 insertions(+), 35 deletions(-) create mode 100644 src/main/resources/csrc/SimEmulator.cc create mode 100644 src/main/resources/vsrc/SimEmulator.v create mode 100644 src/main/scala/radiance/core/Emulator.scala create mode 100644 src/main/scala/radiance/tile/EmulatorTile.scala diff --git a/src/main/resources/csrc/SimEmulator.cc b/src/main/resources/csrc/SimEmulator.cc new file mode 100644 index 0000000..af454d5 --- /dev/null +++ b/src/main/resources/csrc/SimEmulator.cc @@ -0,0 +1,31 @@ +#ifndef NO_VPI +#include +#include +#endif +#include + +extern "C" void emulator_init_rs(int num_lanes); + +extern "C" void emulator_generate_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid, + long long *vec_a_address, + uint8_t *vec_a_is_store, int *vec_a_size, + long long *vec_a_data, uint8_t *vec_d_ready, + uint8_t *vec_d_valid, + uint8_t *vec_d_is_store, int *vec_d_size, + uint8_t inflight, uint8_t *finished); + +extern "C" void emulator_init(int num_lanes) { + emulator_init_rs(num_lanes); +} + +extern "C" void emulator_generate(uint8_t *vec_a_ready, uint8_t *vec_a_valid, + long long *vec_a_address, + uint8_t *vec_a_is_store, int *vec_a_size, + long long *vec_a_data, uint8_t *vec_d_ready, + uint8_t *vec_d_valid, uint8_t *vec_d_is_store, + int *vec_d_size, uint8_t inflight, + uint8_t *finished) { + emulator_generate_rs(vec_a_ready, vec_a_valid, vec_a_address, vec_a_is_store, + vec_a_size, vec_a_data, vec_d_ready, vec_d_valid, + vec_d_is_store, vec_d_size, inflight, finished); +} 
diff --git a/src/main/resources/vsrc/SimEmulator.v b/src/main/resources/vsrc/SimEmulator.v new file mode 100644 index 0000000..9b60316 --- /dev/null +++ b/src/main/resources/vsrc/SimEmulator.v @@ -0,0 +1,132 @@ +`include "SimDefaults.vh" + +import "DPI-C" function void emulator_init( + input int num_lanes // 32-bit, matches C: void emulator_init(int num_lanes) +); + +// Make sure to sync the parameters for: +// (1) import "DPI-C" declaration +// (2) C function declaration +// (3) DPI function calls inside initial/always blocks +import "DPI-C" function void emulator_generate +( + input bit vec_a_ready[`MAX_NUM_LANES], + output bit vec_a_valid[`MAX_NUM_LANES], + output longint vec_a_address[`MAX_NUM_LANES], + output bit vec_a_is_store[`MAX_NUM_LANES], + output int vec_a_size[`MAX_NUM_LANES], + output longint vec_a_data[`MAX_NUM_LANES], + + output bit vec_d_ready[`MAX_NUM_LANES], + input bit vec_d_valid[`MAX_NUM_LANES], + input bit vec_d_is_store[`MAX_NUM_LANES], + input int vec_d_size[`MAX_NUM_LANES], + + input bit inflight, + output bit finished +); + +module SimEmulator #(parameter NUM_LANES = 4) ( + input clock, + input reset, + + input [NUM_LANES-1:0] a_ready, + output [NUM_LANES-1:0] a_valid, + output [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] a_address, + output [NUM_LANES-1:0] a_is_store, + output [`SIMMEM_LOGSIZE_WIDTH*NUM_LANES-1:0] a_size, + output [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] a_data, + + output [NUM_LANES-1:0] d_ready, + input [NUM_LANES-1:0] d_valid, + input [NUM_LANES-1:0] d_is_store, + input [`SIMMEM_LOGSIZE_WIDTH*NUM_LANES-1:0] d_size, + // TODO: d_mask + // TODO: d_data + + input inflight, + output finished +); + // "in": C->verilog, "out": verilog->C + // need to be in ascending order to match with C indexing + // C array sizes are static, so need to use MAX_NUM_LANES + bit __out_a_ready [0:`MAX_NUM_LANES-1]; + bit __in_a_valid [0:`MAX_NUM_LANES-1]; + longint __in_a_address [0:`MAX_NUM_LANES-1]; + bit __in_a_is_store [0:`MAX_NUM_LANES-1]; + int __in_a_size [0:`MAX_NUM_LANES-1]; + longint __in_a_data 
[0:`MAX_NUM_LANES-1]; + bit __in_d_ready [0:`MAX_NUM_LANES-1]; + bit __out_d_valid [0:`MAX_NUM_LANES-1]; + bit __out_d_is_store [0:`MAX_NUM_LANES-1]; + int __out_d_size [0:`MAX_NUM_LANES-1]; + bit __out_inflight; + bit __in_finished; + + genvar g; + generate + for (g = 0; g < NUM_LANES; g = g + 1) begin + assign __out_a_ready[g] = a_ready[g]; + assign a_valid[g] = __in_a_valid[g]; + assign a_address[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH] + = __in_a_address[g][`SIMMEM_DATA_WIDTH-1:0]; + assign a_is_store[g] = __in_a_is_store[g]; + assign a_size[`SIMMEM_LOGSIZE_WIDTH*g +: `SIMMEM_LOGSIZE_WIDTH] + = __in_a_size[g][`SIMMEM_LOGSIZE_WIDTH-1:0]; + assign a_data[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH] + = __in_a_data[g][`SIMMEM_DATA_WIDTH-1:0]; + assign d_ready[g] = __in_d_ready[g]; + assign __out_d_valid[g] = d_valid[g]; + assign __out_d_is_store[g] = d_is_store[g]; + assign __out_d_size[g] = d_size[`SIMMEM_LOGSIZE_WIDTH*g +: `SIMMEM_LOGSIZE_WIDTH]; + end + assign __out_inflight = inflight; + endgenerate + assign finished = __in_finished; + + initial begin + emulator_init(NUM_LANES); + end + + // negedge is important here; the DPI logic is essentially functioning as + // a combinational logic, so we want to reflect the signal change from DPI + // at the *current* cycle, not the next. 
+ always @(negedge clock) begin + if (reset) begin + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin + __in_a_valid[tid] = 1'b0; + __in_a_address[tid] = `SIMMEM_DATA_WIDTH'b0; + __in_a_is_store[tid] = 1'b0; + __in_a_size[tid] = 32'b0; + __in_a_data[tid] = `SIMMEM_DATA_WIDTH'b0; + __in_d_ready[tid] = 1'b0; + end + __in_finished = 1'b0; + end else begin + emulator_generate( + __out_a_ready, + __in_a_valid, + __in_a_address, + __in_a_is_store, + __in_a_size, + __in_a_data, + + __in_d_ready, + __out_d_valid, + __out_d_is_store, + __out_d_size, + + __out_inflight, + __in_finished + ); + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin + $display("verilog: %04d a_valid[%d]=%d, a_address[%d]=0x%x, d_ready[%d]=%d", + $time, tid, __in_a_valid[tid], tid, __in_a_address[tid], tid, __in_d_ready[tid]); + end + + if (finished) begin + $finish; + end + end + end +endmodule diff --git a/src/main/scala/radiance/core/Emulator.scala b/src/main/scala/radiance/core/Emulator.scala new file mode 100644 index 0000000..1afe2d7 --- /dev/null +++ b/src/main/scala/radiance/core/Emulator.scala @@ -0,0 +1,243 @@ +package radiance.core + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.cde.config.{Field, Parameters} +import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp} +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.diplomacy.{IdRange, AddressSet, BufferParams} +import radiance.memory.{SourceGenerator, TraceLine, TLPrintf} + +case class SIMTCoreParams( + nWarps: Int = 4, // # of warps in the core + nCoreLanes: Int = 4, // # of SIMT threads in the core + nMemLanes: Int = 4, // # of memory lanes in the memory interface to the + // cache; relates to the LSU lanes + nSrcIds: Int = 8 // # of source IDs allocated to each of the nMemLanes +) +case class MemtraceCoreParams( + tracefilename: String = "undefined", + traceHasSource: Boolean = false +) + +case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None 
/*default*/ ) +case object MemtraceCoreKey + extends Field[Option[MemtraceCoreParams]](None /*default*/ ) + +// ############################################################################# +// FIXME: copy-paste from MemFuzzer +// ############################################################################# + +class Emulator( + numLanes: Int, + numSrcIds: Int, + wordSizeInBytes: Int, +)(implicit p: Parameters) + extends LazyModule { + val laneNodes = Seq.tabulate(numLanes) { i => + val clientParam = Seq( + TLMasterParameters.v1( + name = "Emulator" + i.toString, + sourceId = IdRange(0, numSrcIds) + // visibility = Seq(AddressSet(0x0000, 0xffffff)) + ) + ) + TLClientNode(Seq(TLMasterPortParameters.v1(clientParam))) + } + + val node = TLIdentityNode() + laneNodes.foreach(node := _) + + lazy val module = new EmulatorImp(this, numLanes, numSrcIds, wordSizeInBytes) +} + +class EmulatorImp( + outer: Emulator, + numLanes : Int, + numSrcIds: Int, + wordSizeInBytes: Int, +) extends LazyModuleImp(outer) { + val io = IO(new Bundle { + val finished = Output(Bool()) + }) + val sim = Module(new SimEmulator(numLanes)) + sim.io.clock := clock + sim.io.reset := reset.asBool + + sim.io.a.ready := VecInit(outer.laneNodes.map { node => + val (tlOut, _) = node.out(0) + tlOut.a.ready + }).asUInt + + io.finished := sim.io.finished + + // connect Verilog <-> Chisel IO + // Verilog IO flattened across all lanes + val laneReqs = Wire(Vec(numLanes, Decoupled(new TraceLine))) + val addrW = laneReqs(0).bits.address.getWidth + val sizeW = laneReqs(0).bits.size.getWidth + val dataW = laneReqs(0).bits.data.getWidth + laneReqs.zipWithIndex.foreach { case (req, i) => + req.valid := sim.io.a.valid(i) + req.bits.source := 0.U // DPI doesn't generate a source id + req.bits.address := sim.io.a.address(addrW * (i + 1) - 1, addrW * i) + req.bits.is_store := sim.io.a.is_store(i) + req.bits.size := sim.io.a.size(sizeW * (i + 1) - 1, sizeW * i) + req.bits.data := sim.io.a.data(dataW * (i + 1) - 1, 
dataW * i) + } + sim.io.a.ready := VecInit(laneReqs.map(_.ready)).asUInt + + val laneResps = Wire(Vec(numLanes, Flipped(Decoupled(new TraceLine)))) + laneResps.zipWithIndex.foreach { case (resp, i) => + resp.ready := sim.io.d.ready(i) + // TODO: not handled in DPI + resp.bits.source := DontCare + resp.bits.address := DontCare + resp.bits.data := DontCare + } + sim.io.d.valid := VecInit(laneResps.map(_.valid)).asUInt + sim.io.d.is_store := VecInit(laneResps.map(_.bits.is_store)).asUInt + sim.io.d.size := VecInit(laneResps.map(_.bits.size)).asUInt + + val sourceGens = Seq.fill(numLanes)( + Module( + new SourceGenerator( + log2Ceil(numSrcIds), + ignoreInUse = false + ) + ) + ) + val anyInflight = sourceGens.map(_.io.inflight).reduce(_ || _) + sim.io.inflight := anyInflight + + // Take requests off of the queue and generate TL requests + (outer.laneNodes zip (laneReqs zip laneResps)).zipWithIndex.foreach { + case ((node, (req, resp)), lane) => + val (tlOut, edge) = node.out(0) + + // Requests -------------------------------------------------------------- + // + // Core only makes accesses of granularity larger than a word, so we want + // the trace driver to act so as well. + // That means if req.size is smaller than word size, we need to pad data + // with zeros to generate a word-size request, and set mask accordingly. 
+ val offsetInWord = req.bits.address % wordSizeInBytes.U + val subword = req.bits.size < log2Ceil(wordSizeInBytes).U + + // `mask` is currently unused + // val mask = Wire(UInt(wordSizeInBytes.W)) + val wordData = Wire(UInt((wordSizeInBytes * 8 * 2).W)) + val sizeInBytes = Wire(UInt((sizeW + 1).W)) + sizeInBytes := (1.U) << req.bits.size + // mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U) + wordData := Mux(subword, req.bits.data << (offsetInWord * 8.U), req.bits.data) + val wordAlignedAddress = + req.bits.address & ~((1 << log2Ceil(wordSizeInBytes)) - 1).U(addrW.W) + val wordAlignedSize = Mux(subword, log2Ceil(wordSizeInBytes).U, req.bits.size) // sub-word requests are widened to exactly one word + + val sourceGen = sourceGens(lane) + sourceGen.io.gen := tlOut.a.fire + sourceGen.io.reclaim.valid := tlOut.d.fire + sourceGen.io.reclaim.bits := tlOut.d.bits.source + sourceGen.io.meta := DontCare + + val (plegal, pbits) = edge.Put( + fromSource = sourceGen.io.id.bits, + toAddress = wordAlignedAddress, + lgSize = wordAlignedSize, // trace line already holds log2(size) + // data should be aligned to beatBytes + data = + (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt + ) + val (glegal, gbits) = edge.Get( + fromSource = sourceGen.io.id.bits, + toAddress = wordAlignedAddress, + lgSize = wordAlignedSize + ) + val legal = Mux(req.bits.is_store, plegal, glegal) + val bits = Mux(req.bits.is_store, pbits, gbits) + + tlOut.a.valid := req.valid && sourceGen.io.id.valid + req.ready := tlOut.a.ready && sourceGen.io.id.valid + + when(tlOut.a.fire) { + assert(legal, "illegal TL req gen") + } + tlOut.a.bits := bits + + // Responses ------------------------------------------------------------- + // + tlOut.d.ready := resp.ready + resp.valid := tlOut.d.valid + resp.bits.is_store := !edge.hasData(tlOut.d.bits) + resp.bits.size := tlOut.d.bits.size + + tlOut.b.ready := true.B + tlOut.c.valid := false.B + tlOut.e.valid := false.B + + // debug + dontTouch(req) + when(tlOut.a.valid) { + printf(s"Lane 
${lane}: "); + TLPrintf( + "Emulator", + tlOut.a.bits.source, + tlOut.a.bits.address, + tlOut.a.bits.size, + tlOut.a.bits.mask, + req.bits.is_store, + tlOut.a.bits.data, + req.bits.data + ) + } + dontTouch(tlOut.a) + dontTouch(tlOut.d) + } + + // when(traceFinished && allReqReclaimed && noValidReqs) { + // assert( + // false.B, + // "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)" + // ) + // } +} + +class SimEmulator(numLanes: Int) + extends BlackBox(Map("NUM_LANES" -> numLanes)) + with HasBlackBoxResource { + val traceLineT = new TraceLine + val addrW = traceLineT.address.getWidth + val sizeW = traceLineT.size.getWidth + val dataW = traceLineT.data.getWidth + val io = IO(new Bundle { + val clock = Input(Clock()) + val reset = Input(Bool()) + val inflight = Input(Bool()) + val finished = Output(Bool()) + + val a = + new Bundle { + val ready = Input(UInt(numLanes.W)) + val valid = Output(UInt(numLanes.W)) + // Chisel can't interface with Verilog 2D port, so flatten all lanes into + // single wide 1D array. 
+ val address = Output(UInt((addrW * numLanes).W)) + val is_store = Output(UInt(numLanes.W)) + val size = Output(UInt((sizeW * numLanes).W)) + val data = Output(UInt((dataW * numLanes).W)) + } + val d = + new Bundle { + val ready = Output(UInt(numLanes.W)) + val valid = Input(UInt(numLanes.W)) + val is_store = Input(UInt(numLanes.W)) + val size = Input(UInt((sizeW * numLanes).W)) + } + }) + + addResource("/vsrc/SimDefaults.vh") + addResource("/vsrc/SimEmulator.v") + addResource("/csrc/SimEmulator.cc") +} + diff --git a/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala b/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala index 0801071..658db38 100644 --- a/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala +++ b/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala @@ -4,6 +4,7 @@ import freechips.rocketchip.diplomacy.LazyModule import freechips.rocketchip.subsystem._ import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.tilelink._ +import radiance.core.{SIMTCoreKey, MemtraceCoreKey} // TODO: possibly move to somewhere closer to CoalescingUnit // TODO: separate coalescer config from CanHaveMemtraceCore diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala index aafe29c..5f24cc3 100644 --- a/src/main/scala/radiance/memory/Coalescing.scala +++ b/src/main/scala/radiance/memory/Coalescing.scala @@ -10,25 +10,10 @@ import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp} import freechips.rocketchip.util.{Code, MultiPortQueue, OnePortLanePositionedQueue} import freechips.rocketchip.unittest._ import freechips.rocketchip.tilelink._ +import radiance.core.{SIMTCoreParams, SIMTCoreKey} -// TODO: find better place for these - -case class SIMTCoreParams( - nWarps: Int = 4, // # of warps in the core - nCoreLanes: Int = 4, // # of SIMT threads in the core - nMemLanes: Int = 4, // # of memory lanes in the memory interface to the - // cache; relates to the LSU lanes 
- nSrcIds: Int = 8 // # of source IDs allocated to each of the nMemLanes -) -case class MemtraceCoreParams( - tracefilename: String = "undefined", - traceHasSource: Boolean = false -) case class CoalXbarParam() -case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None /*default*/ ) -case object MemtraceCoreKey - extends Field[Option[MemtraceCoreParams]](None /*default*/ ) case object CoalescerKey extends Field[Option[CoalescerConfig]](None /*default*/ ) case object CoalXbarKey extends Field[Option[CoalXbarParam]](None /*default*/ ) diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala index 4a3a940..522fb48 100644 --- a/src/main/scala/radiance/subsystem/Configs.scala +++ b/src/main/scala/radiance/subsystem/Configs.scala @@ -12,6 +12,7 @@ import freechips.rocketchip.subsystem._ import gemmini._ import gemmini.Arithmetic.FloatArithmetic._ import radiance.tile._ +import radiance.core._ import radiance.memory._ import radiance.subsystem.RadianceGemminiDataType.{BF16, FP16, FP32, Int8} @@ -106,6 +107,44 @@ class WithRadianceCores( ), tensorCoreFP16, tensorCoreDecoupled, useVxCache) } +class WithEmulatorCores( + n: Int, + useVxCache: Boolean +) extends Config((site, _, up) => { + case TilesLocated(InSubsystem) => { + val prev = up(TilesLocated(InSubsystem)) + val idOffset = up(NumTiles) + val emulator = EmulatorTileParams( + core = VortexCoreParams(), + useVxCache = useVxCache) + List.tabulate(n)(i => EmulatorTileAttachParams( + emulator.copy(tileId = i + idOffset), + RocketCrossingParams() + )) ++ prev + } + case NumTiles => up(NumTiles) + 1 + case NumRadianceCores => up(NumRadianceCores) + 1 +}) + +class WithFuzzerCores( + n: Int, + useVxCache: Boolean +) extends Config((site, _, up) => { + case TilesLocated(InSubsystem) => { + val prev = up(TilesLocated(InSubsystem)) + val idOffset = up(NumTiles) + val fuzzer = FuzzerTileParams( + core = VortexCoreParams(), + useVxCache = useVxCache) + 
List.tabulate(n)(i => FuzzerTileAttachParams( + fuzzer.copy(tileId = i + idOffset), + RocketCrossingParams() + )) ++ prev + } + case NumTiles => up(NumTiles) + 1 + case NumRadianceCores => up(NumRadianceCores) + 1 +}) + object RadianceGemminiDataType extends Enumeration { type Type = Value val FP32, FP16, BF16, Int8 = Value @@ -244,25 +283,6 @@ class WithRadianceFrameBuffer(baseAddress: BigInt, } }) -class WithFuzzerCores( - n: Int, - useVxCache: Boolean -) extends Config((site, _, up) => { - case TilesLocated(InSubsystem) => { - val prev = up(TilesLocated(InSubsystem)) - val idOffset = up(NumTiles) - val fuzzer = FuzzerTileParams( - core = VortexCoreParams(), - useVxCache = useVxCache) - List.tabulate(n)(i => FuzzerTileAttachParams( - fuzzer.copy(tileId = i + idOffset), - RocketCrossingParams() - )) ++ prev - } - case NumTiles => up(NumTiles) + 1 - case NumRadianceCores => up(NumRadianceCores) + 1 -}) - class WithRadianceCluster( clusterId: Int, location: HierarchicalLocation = InSubsystem, diff --git a/src/main/scala/radiance/tile/EmulatorTile.scala b/src/main/scala/radiance/tile/EmulatorTile.scala new file mode 100644 index 0000000..d6881ca --- /dev/null +++ b/src/main/scala/radiance/tile/EmulatorTile.scala @@ -0,0 +1,96 @@ +// See LICENSE.SiFive for license details. +// See LICENSE.Berkeley for license details. 
+ +package radiance.tile + +import chisel3._ +import org.chipsalliance.cde.config.Parameters +import org.chipsalliance.diplomacy.lazymodule.LazyModule +import freechips.rocketchip.resources.SimpleDevice +import freechips.rocketchip.prci.ClockCrossingType +import freechips.rocketchip.rocket._ +import freechips.rocketchip.tile._ +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.subsystem.{HierarchicalElementCrossingParamsLike, CanAttachTile} +import freechips.rocketchip.prci.{ClockSinkParameters} +import radiance.core._ +import radiance.memory.{CoalescingUnit, CoalescerKey} + +// TODO: De-duplicate between this and FuzzerTile + +case class EmulatorTileParams( + core: VortexCoreParams = VortexCoreParams(), // TODO: remove this + useVxCache: Boolean = false, + tileId: Int = 0, +) extends InstantiableTileParams[EmulatorTile] { + def instantiate(crossing: HierarchicalElementCrossingParamsLike, lookup: LookupByHartIdImpl)( + implicit p: Parameters + ): EmulatorTile = { + new EmulatorTile(this, crossing, lookup) + } + val clockSinkParams = ClockSinkParameters() + val blockerCtrlAddr = None + val icache = None + val dcache = None + val btb = None + val baseName = "radiance_emulator_tile" + val uniqueName = s"${baseName}_$tileId" +} + +case class EmulatorTileAttachParams( + tileParams: EmulatorTileParams, + crossingParams: HierarchicalElementCrossingParamsLike +) extends CanAttachTile { type TileType = EmulatorTile } + +class EmulatorTile private ( + val EmulatorParams: EmulatorTileParams, + crossing: ClockCrossingType, + lookup: LookupByHartIdImpl, + q: Parameters +) extends BaseTile(EmulatorParams, crossing, lookup, q) + with SinksExternalInterrupts + with SourcesExternalNotifications { + def this( + params: EmulatorTileParams, + crossing: HierarchicalElementCrossingParamsLike, + lookup: LookupByHartIdImpl + )(implicit p: Parameters) = + this(params, crossing.crossingType, lookup, p) + + val cpuDevice: SimpleDevice = new SimpleDevice("emulator", Nil) + + 
val intOutwardNode = None + val slaveNode: TLInwardNode = TLIdentityNode() + val masterNode = visibilityNode + // val statusNode = BundleBridgeSource(() => new GroundTestStatus) + + val (numLanes, numSrcIds) = p(SIMTCoreKey) match { + case Some(param) => (param.nMemLanes, param.nSrcIds) + case None => { + require(false, "emulator requires SIMTCoreKey to be defined") + (0, 0) + } + } + // FIXME: parameterize + val wordSizeInBytes = 4 + + val emulator = LazyModule(new Emulator(numLanes, numSrcIds, wordSizeInBytes)) + + // Conditionally instantiate memory coalescer + val coalescerNode = p(CoalescerKey) match { + case Some(coalParam) => { + val coal = LazyModule(new CoalescingUnit(coalParam)) + coal.cpuNode :=* TLWidthWidget(4) :=* emulator.node + coal.aggregateNode + } + case None => emulator.node + } + + masterNode :=* coalescerNode + + override lazy val module = new EmulatorTileModuleImp(this) +} + +class EmulatorTileModuleImp(outer: EmulatorTile) extends BaseTileModuleImp(outer) { + outer.reportCease(Some(outer.emulator.module.io.finished)) +} diff --git a/src/main/scala/radiance/tile/FuzzerTile.scala b/src/main/scala/radiance/tile/FuzzerTile.scala index 730f04a..5e17672 100644 --- a/src/main/scala/radiance/tile/FuzzerTile.scala +++ b/src/main/scala/radiance/tile/FuzzerTile.scala @@ -13,6 +13,7 @@ import freechips.rocketchip.tile._ import freechips.rocketchip.tilelink._ import freechips.rocketchip.subsystem.{HierarchicalElementCrossingParamsLike, CanAttachTile} import freechips.rocketchip.prci.{ClockSinkParameters} +import radiance.core.{SIMTCoreKey} import radiance.memory._ case class FuzzerTileParams( diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 202543a..f4e4165 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -19,6 +19,7 @@ import freechips.rocketchip.tilelink._ import freechips.rocketchip.util._ import 
midas.targetutils.SynthesizePrintf import org.chipsalliance.cde.config._ +import radiance.core._ import radiance.memory._ import radiance.subsystem.{GPUMemParams, GPUMemory, RadianceSimArgs} From 6de4e875d4dafcfe83aa079bc87c555c3a791fb3 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Nov 2024 15:26:39 -0800 Subject: [PATCH 06/14] Bump radpie --- radpie | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/radpie b/radpie index 493b8e1..0f48932 160000 --- a/radpie +++ b/radpie @@ -1 +1 @@ -Subproject commit 493b8e10a5116385946deaaef1a82f6597d7b8a2 +Subproject commit 0f489320882903fdb014f6a6aef361df2a7a931c From e3080bf3ee8d7475a9885bcbbc61efbdf00a2412 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 4 Dec 2024 18:07:55 -0800 Subject: [PATCH 07/14] Update DPI for tick/generate split --- src/main/resources/csrc/SimEmulator.cc | 34 ++++++++-------- src/main/resources/vsrc/SimEmulator.v | 45 ++++++++++++--------- src/main/scala/radiance/core/Emulator.scala | 1 + 3 files changed, 44 insertions(+), 36 deletions(-) diff --git a/src/main/resources/csrc/SimEmulator.cc b/src/main/resources/csrc/SimEmulator.cc index af454d5..a452d6c 100644 --- a/src/main/resources/csrc/SimEmulator.cc +++ b/src/main/resources/csrc/SimEmulator.cc @@ -5,27 +5,27 @@ #include extern "C" void emulator_init_rs(int num_lanes); - +extern "C" void emulator_tick_rs(uint8_t *vec_d_ready, uint8_t *vec_d_valid, + uint8_t *vec_d_is_store, int *vec_d_size); extern "C" void emulator_generate_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid, - long long *vec_a_address, - uint8_t *vec_a_is_store, int *vec_a_size, - long long *vec_a_data, uint8_t *vec_d_ready, - uint8_t *vec_d_valid, - uint8_t *vec_d_is_store, int *vec_d_size, - uint8_t inflight, uint8_t *finished); + long long *vec_a_address, + uint8_t *vec_a_is_store, int *vec_a_size, + long long *vec_a_data, + uint8_t *vec_d_ready, uint8_t inflight, + uint8_t *finished); -extern "C" void emulator_init(int num_lanes) { - 
emulator_init_rs(num_lanes); +extern "C" void emulator_init(int num_lanes) { emulator_init_rs(num_lanes); } + +extern "C" void emulator_tick(uint8_t *vec_d_ready, uint8_t *vec_d_valid, + uint8_t *vec_d_is_store, int *vec_d_size) { + emulator_tick_rs(vec_d_ready, vec_d_valid, vec_d_is_store, vec_d_size); } extern "C" void emulator_generate(uint8_t *vec_a_ready, uint8_t *vec_a_valid, - long long *vec_a_address, - uint8_t *vec_a_is_store, int *vec_a_size, - long long *vec_a_data, uint8_t *vec_d_ready, - uint8_t *vec_d_valid, uint8_t *vec_d_is_store, - int *vec_d_size, uint8_t inflight, - uint8_t *finished) { + long long *vec_a_address, + uint8_t *vec_a_is_store, int *vec_a_size, + long long *vec_a_data, uint8_t *vec_d_ready, + uint8_t inflight, uint8_t *finished) { emulator_generate_rs(vec_a_ready, vec_a_valid, vec_a_address, vec_a_is_store, - vec_a_size, vec_a_data, vec_d_ready, vec_d_valid, - vec_d_is_store, vec_d_size, inflight, finished); + vec_a_size, vec_a_data, vec_d_ready, inflight, finished); } diff --git a/src/main/resources/vsrc/SimEmulator.v b/src/main/resources/vsrc/SimEmulator.v index 9b60316..af5b408 100644 --- a/src/main/resources/vsrc/SimEmulator.v +++ b/src/main/resources/vsrc/SimEmulator.v @@ -8,6 +8,14 @@ import "DPI-C" function void emulator_init( // (1) import "DPI-C" declaration // (2) C function declaration // (3) DPI function calls inside initial/always blocks +import "DPI-C" function void emulator_tick +( + output bit vec_d_ready[`MAX_NUM_LANES], + input bit vec_d_valid[`MAX_NUM_LANES], + input bit vec_d_is_store[`MAX_NUM_LANES], + input int vec_d_size[`MAX_NUM_LANES] +); + import "DPI-C" function void emulator_generate ( input bit vec_a_ready[`MAX_NUM_LANES], @@ -16,11 +24,7 @@ import "DPI-C" function void emulator_generate output bit vec_a_is_store[`MAX_NUM_LANES], output int vec_a_size[`MAX_NUM_LANES], output longint vec_a_data[`MAX_NUM_LANES], - output bit vec_d_ready[`MAX_NUM_LANES], - input bit vec_d_valid[`MAX_NUM_LANES], - input bit 
vec_d_is_store[`MAX_NUM_LANES], - input int vec_d_size[`MAX_NUM_LANES], input bit inflight, output bit finished @@ -88,10 +92,7 @@ module SimEmulator #(parameter NUM_LANES = 4) ( emulator_init(NUM_LANES); end - // negedge is important here; the DPI logic is essentially functioning as - // a combinational logic, so we want to reflect the signal change from DPI - // at the *current* cycle, not the next. - always @(negedge clock) begin + always @(posedge clock) begin if (reset) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin __in_a_valid[tid] = 1'b0; @@ -110,23 +111,29 @@ module SimEmulator #(parameter NUM_LANES = 4) ( __in_a_is_store, __in_a_size, __in_a_data, - __in_d_ready, - __out_d_valid, - __out_d_is_store, - __out_d_size, __out_inflight, __in_finished ); - for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin - $display("verilog: %04d a_valid[%d]=%d, a_address[%d]=0x%x, d_ready[%d]=%d", - $time, tid, __in_a_valid[tid], tid, __in_a_address[tid], tid, __in_d_ready[tid]); - end + // for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin + // $display("verilog: %04d a_valid[%d]=%d, a_address[%d]=0x%x, d_ready[%d]=%d", + // $time, tid, __in_a_valid[tid], tid, __in_a_address[tid], tid, __in_d_ready[tid]); + // end + end + end - if (finished) begin - $finish; - end + // negedge is important here; the DPI logic is essentially functioning as + // a combinational logic, so we want to reflect the signal change from DPI + // at the *current* cycle, not the next. 
+ always @(negedge clock) begin + if (!reset) begin + emulator_tick( + __in_d_ready, + __out_d_valid, + __out_d_is_store, + __out_d_size + ); end end endmodule diff --git a/src/main/scala/radiance/core/Emulator.scala b/src/main/scala/radiance/core/Emulator.scala index 1afe2d7..9a6cd75 100644 --- a/src/main/scala/radiance/core/Emulator.scala +++ b/src/main/scala/radiance/core/Emulator.scala @@ -61,6 +61,7 @@ class EmulatorImp( val finished = Output(Bool()) }) val sim = Module(new SimEmulator(numLanes)) + sim.io.clock := clock sim.io.reset := reset.asBool From 81595a9a9ff5c9d01568f560bc3d9b6080fc2d4a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 4 Dec 2024 18:10:19 -0800 Subject: [PATCH 08/14] Bump cyclotron --- .gitmodules | 2 +- cyclotron | 1 + radpie | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) create mode 160000 cyclotron delete mode 160000 radpie diff --git a/.gitmodules b/.gitmodules index d49652c..468a6f8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,5 +2,5 @@ path = src/main/resources/vsrc/vortex url = git@github.com:hansungk/vortex-private.git [submodule "radpie"] - path = radpie + path = cyclotron url = git@github.com:hansungk/radpie.git diff --git a/cyclotron b/cyclotron new file mode 160000 index 0000000..b6ad5b5 --- /dev/null +++ b/cyclotron @@ -0,0 +1 @@ +Subproject commit b6ad5b54546b4fa4417e58cb24ea40c3c275816a diff --git a/radpie b/radpie deleted file mode 160000 index 0f48932..0000000 --- a/radpie +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0f489320882903fdb014f6a6aef361df2a7a931c From ba67263b408733b7a3ecc8dcd0fbf82fa3204a8a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 4 Dec 2024 18:14:29 -0800 Subject: [PATCH 09/14] Update paths --- cyclotron | 2 +- radiance.mk | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cyclotron b/cyclotron index b6ad5b5..d61fa68 160000 --- a/cyclotron +++ b/cyclotron @@ -1 +1 @@ -Subproject commit b6ad5b54546b4fa4417e58cb24ea40c3c275816a +Subproject 
commit d61fa682ad8d1bfb54b1860aeeae47184dc80ba8 diff --git a/radiance.mk b/radiance.mk index 1664cca..0699dc5 100644 --- a/radiance.mk +++ b/radiance.mk @@ -3,16 +3,17 @@ ############################################################## VORTEX_SRC_DIR = $(base_dir)/generators/radiance/src/main/resources/vsrc/vortex -RADPIE_SRC_DIR = $(base_dir)/generators/radiance/radpie -RADPIE_BUILD_DIR = $(RADPIE_SRC_DIR)/target/release +CYCLOTRON_SRC_DIR = $(base_dir)/generators/radiance/cyclotron +CYCLOTRON_BUILD_DIR = $(CYCLOTRON_SRC_DIR)/target/release RADIANCE_CSRC_DIR = $(base_dir)/generators/radiance/src/main/resources/csrc +RADIANCE_VSRC_DIR = $(base_dir)/generators/radiance/src/main/resources/vsrc ################################################################## # THE FOLLOWING MUST BE += operators ################################################################## -EXTRA_SIM_REQS += radpie -EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradpie +EXTRA_SIM_REQS += cyclotron +EXTRA_SIM_LDFLAGS += -L$(CYCLOTRON_BUILD_DIR) -Wl,-rpath,$(CYCLOTRON_BUILD_DIR) -lcyclotron ifeq ($(shell echo $(CONFIG) | grep -E "SynConfig$$"),$(CONFIG)) EXTRA_SIM_PREPROC_DEFINES += +define+SYNTHESIS +define+NDEBUG +define+DPI_DISABLE endif @@ -32,18 +33,20 @@ VCS_NONCC_OPTS += +vcs+initreg+random # cargo handles building of Rust files all on its own, so make this a PHONY # target to run cargo unconditionally -.PHONY: radpie -radpie: - cd $(RADPIE_SRC_DIR) && cargo build --release +.PHONY: cyclotron +cyclotron: + cd $(CYCLOTRON_SRC_DIR) && cargo build --release EXTRA_SIM_REQS += vortex_vsrc.$(CONFIG) # below manipulation of RADIANCE_EXTERNAL_SRCS doesn't work if we try to reuse # $(call lookup_srcs) from common.mk, the variable doesn't expand somehow ifeq ($(shell which fdfd 2> /dev/null),) - RADIANCE_EXTERNAL_SRCS := $(shell find -L $(VORTEX_SRC_DIR) -type f -iname "*.sv" -o -iname "*.vh" -o -iname "*.v") + # RADIANCE_EXTERNAL_SRCS := $(shell find -L $(VORTEX_SRC_DIR) 
-type f -iname "*.sv" -o -iname "*.vh" -o -iname "*.v") + RADIANCE_EXTERNAL_SRCS := $(shell find -L $(RADIANCE_VSRC_DIR) -type f -iname "*.sv" -o -iname "*.vh" -o -iname "*.v") RADIANCE_EXTERNAL_SRCS += $(shell find -L $(RADIANCE_CSRC_DIR) -type f) else - RADIANCE_EXTERNAL_SRCS := $(shell fdfind -L -t f -e "sv" -e "vh" -e "v" . $(VORTEX_SRC_DIR)) + # RADIANCE_EXTERNAL_SRCS := $(shell fdfind -L -t f -e "sv" -e "vh" -e "v" . $(VORTEX_SRC_DIR)) + RADIANCE_EXTERNAL_SRCS := $(shell fdfind -L -t f -e "sv" -e "vh" -e "v" . $(RADIANCE_VSRC_DIR)) RADIANCE_EXTERNAL_SRCS += $(shell fdfind -L -t f . $(RADIANCE_CSRC_DIR)) endif From 3af0670527001a0ad586c8de6ad34f552586fd8e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 5 Dec 2024 11:51:49 +0900 Subject: [PATCH 10/14] Add cyclotron-main --- .gitmodules | 5 +++++ cyclotron-main | 1 + 2 files changed, 6 insertions(+) create mode 160000 cyclotron-main diff --git a/.gitmodules b/.gitmodules index 468a6f8..29685ca 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,8 @@ [submodule "radpie"] path = cyclotron url = git@github.com:hansungk/radpie.git +[submodule "cyclotron-main"] + path = cyclotron-main + url = https://github.com/hansungk/cyclotron-main.git +[submodule "cyclotron"] + url = https://github.com/hansungk/cyclotron.git diff --git a/cyclotron-main b/cyclotron-main new file mode 160000 index 0000000..25f2e77 --- /dev/null +++ b/cyclotron-main @@ -0,0 +1 @@ +Subproject commit 25f2e7734bfbff5a25fc0f4688fe67adb4116ef0 From a4fa1522abbfe6c3d5ea64e8280e48464e946db1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 4 Jan 2025 23:03:25 -0800 Subject: [PATCH 11/14] Add D data to DPI interface --- src/main/resources/csrc/SimEmulator.cc | 9 ++++++--- src/main/resources/vsrc/SimEmulator.v | 10 +++++++--- src/main/scala/radiance/core/Emulator.scala | 2 ++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/main/resources/csrc/SimEmulator.cc b/src/main/resources/csrc/SimEmulator.cc index a452d6c..365c22f 100644 
--- a/src/main/resources/csrc/SimEmulator.cc +++ b/src/main/resources/csrc/SimEmulator.cc @@ -6,7 +6,8 @@ extern "C" void emulator_init_rs(int num_lanes); extern "C" void emulator_tick_rs(uint8_t *vec_d_ready, uint8_t *vec_d_valid, - uint8_t *vec_d_is_store, int *vec_d_size); + uint8_t *vec_d_is_store, int *vec_d_size, + long long *vec_d_data); extern "C" void emulator_generate_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid, long long *vec_a_address, uint8_t *vec_a_is_store, int *vec_a_size, @@ -17,8 +18,10 @@ extern "C" void emulator_generate_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid, extern "C" void emulator_init(int num_lanes) { emulator_init_rs(num_lanes); } extern "C" void emulator_tick(uint8_t *vec_d_ready, uint8_t *vec_d_valid, - uint8_t *vec_d_is_store, int *vec_d_size) { - emulator_tick_rs(vec_d_ready, vec_d_valid, vec_d_is_store, vec_d_size); + uint8_t *vec_d_is_store, int *vec_d_size, + long long *vec_d_data) { + emulator_tick_rs(vec_d_ready, vec_d_valid, vec_d_is_store, vec_d_size, + vec_d_data); } extern "C" void emulator_generate(uint8_t *vec_a_ready, uint8_t *vec_a_valid, diff --git a/src/main/resources/vsrc/SimEmulator.v b/src/main/resources/vsrc/SimEmulator.v index af5b408..7abb8f7 100644 --- a/src/main/resources/vsrc/SimEmulator.v +++ b/src/main/resources/vsrc/SimEmulator.v @@ -13,7 +13,8 @@ import "DPI-C" function void emulator_tick output bit vec_d_ready[`MAX_NUM_LANES], input bit vec_d_valid[`MAX_NUM_LANES], input bit vec_d_is_store[`MAX_NUM_LANES], - input int vec_d_size[`MAX_NUM_LANES] + input int vec_d_size[`MAX_NUM_LANES], + input longint vec_d_data[`MAX_NUM_LANES] ); import "DPI-C" function void emulator_generate @@ -45,8 +46,8 @@ module SimEmulator #(parameter NUM_LANES = 4) ( input [NUM_LANES-1:0] d_valid, input [NUM_LANES-1:0] d_is_store, input [`SIMMEM_LOGSIZE_WIDTH*NUM_LANES-1:0] d_size, + input [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] d_data, // TODO: d_mask - // TODO: d_data input inflight, output finished @@ -64,6 +65,7 @@ module 
SimEmulator #(parameter NUM_LANES = 4) ( bit __out_d_valid [0:`MAX_NUM_LANES-1]; bit __out_d_is_store [0:`MAX_NUM_LANES-1]; int __out_d_size [0:`MAX_NUM_LANES-1]; + longint __out_d_data [0:`MAX_NUM_LANES-1]; bit __out_inflight; bit __in_finished; @@ -83,6 +85,7 @@ module SimEmulator #(parameter NUM_LANES = 4) ( assign __out_d_valid[g] = d_valid[g]; assign __out_d_is_store[g] = d_is_store[g]; assign __out_d_size[g] = d_size[`SIMMEM_LOGSIZE_WIDTH*g +: `SIMMEM_LOGSIZE_WIDTH]; + assign __out_d_data[g] = d_data[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH]; end assign __out_inflight = inflight; endgenerate @@ -132,7 +135,8 @@ module SimEmulator #(parameter NUM_LANES = 4) ( __in_d_ready, __out_d_valid, __out_d_is_store, - __out_d_size + __out_d_size, + __out_d_data ); end end diff --git a/src/main/scala/radiance/core/Emulator.scala b/src/main/scala/radiance/core/Emulator.scala index 9a6cd75..06a9579 100644 --- a/src/main/scala/radiance/core/Emulator.scala +++ b/src/main/scala/radiance/core/Emulator.scala @@ -99,6 +99,7 @@ class EmulatorImp( sim.io.d.valid := VecInit(laneResps.map(_.valid)).asUInt sim.io.d.is_store := VecInit(laneResps.map(_.bits.is_store)).asUInt sim.io.d.size := VecInit(laneResps.map(_.bits.size)).asUInt + sim.io.d.data := VecInit(laneResps.map(_.bits.data)).asUInt val sourceGens = Seq.fill(numLanes)( Module( @@ -234,6 +235,7 @@ class SimEmulator(numLanes: Int) val valid = Input(UInt(numLanes.W)) val is_store = Input(UInt(numLanes.W)) val size = Input(UInt((sizeW * numLanes).W)) + val data = Input(UInt((dataW * numLanes).W)) } }) From 049394518bdb3cea76134f870ada29fe799cb37a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 4 Jan 2025 23:03:49 -0800 Subject: [PATCH 12/14] Default to debug mode for cyclotron --- radiance.mk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/radiance.mk b/radiance.mk index 0699dc5..7d1abb1 100644 --- a/radiance.mk +++ b/radiance.mk @@ -4,7 +4,8 @@ VORTEX_SRC_DIR = 
$(base_dir)/generators/radiance/src/main/resources/vsrc/vortex CYCLOTRON_SRC_DIR = $(base_dir)/generators/radiance/cyclotron -CYCLOTRON_BUILD_DIR = $(CYCLOTRON_SRC_DIR)/target/release +CYCLOTRON_BUILD_DIR = $(CYCLOTRON_SRC_DIR)/target/debug +# CYCLOTRON_BUILD_DIR = $(CYCLOTRON_SRC_DIR)/target/release RADIANCE_CSRC_DIR = $(base_dir)/generators/radiance/src/main/resources/csrc RADIANCE_VSRC_DIR = $(base_dir)/generators/radiance/src/main/resources/vsrc @@ -35,7 +36,7 @@ VCS_NONCC_OPTS += +vcs+initreg+random # target to run cargo unconditionally .PHONY: cyclotron cyclotron: - cd $(CYCLOTRON_SRC_DIR) && cargo build --release + cd $(CYCLOTRON_SRC_DIR) && cargo build # --release EXTRA_SIM_REQS += vortex_vsrc.$(CONFIG) # below manipulation of RADIANCE_EXTERNAL_SRCS doesn't work if we try to reuse From 17001efbf3d74b8212d188ab2f05c1ddeb31fd7c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 5 Jan 2025 00:04:28 -0800 Subject: [PATCH 13/14] Remove emulator_generate and merge into emulator_tick --- src/main/resources/csrc/SimEmulator.cc | 51 +++++++++++++++----------- src/main/resources/vsrc/SimEmulator.v | 38 +++++++------------ 2 files changed, 43 insertions(+), 46 deletions(-) diff --git a/src/main/resources/csrc/SimEmulator.cc b/src/main/resources/csrc/SimEmulator.cc index 365c22f..379eeb2 100644 --- a/src/main/resources/csrc/SimEmulator.cc +++ b/src/main/resources/csrc/SimEmulator.cc @@ -5,30 +5,39 @@ #include extern "C" void emulator_init_rs(int num_lanes); -extern "C" void emulator_tick_rs(uint8_t *vec_d_ready, uint8_t *vec_d_valid, - uint8_t *vec_d_is_store, int *vec_d_size, - long long *vec_d_data); -extern "C" void emulator_generate_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid, - long long *vec_a_address, - uint8_t *vec_a_is_store, int *vec_a_size, - long long *vec_a_data, - uint8_t *vec_d_ready, uint8_t inflight, - uint8_t *finished); +extern "C" void emulator_tick_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid, + long long *vec_a_address, + uint8_t 
*vec_a_is_store, int *vec_a_size, + long long *vec_a_data, uint8_t *vec_d_ready, + uint8_t *vec_d_valid, uint8_t *vec_d_is_store, + int *vec_d_size, long long *vec_d_data, + uint8_t inflight, uint8_t *finished); +// extern "C" void emulator_generate_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid, +// long long *vec_a_address, +// uint8_t *vec_a_is_store, int *vec_a_size, +// long long *vec_a_data, +// uint8_t *vec_d_ready, uint8_t inflight, +// uint8_t *finished); extern "C" void emulator_init(int num_lanes) { emulator_init_rs(num_lanes); } -extern "C" void emulator_tick(uint8_t *vec_d_ready, uint8_t *vec_d_valid, +extern "C" void emulator_tick(uint8_t *vec_a_ready, uint8_t *vec_a_valid, + long long *vec_a_address, uint8_t *vec_a_is_store, + int *vec_a_size, long long *vec_a_data, + uint8_t *vec_d_ready, uint8_t *vec_d_valid, uint8_t *vec_d_is_store, int *vec_d_size, - long long *vec_d_data) { - emulator_tick_rs(vec_d_ready, vec_d_valid, vec_d_is_store, vec_d_size, - vec_d_data); + long long *vec_d_data, uint8_t inflight, + uint8_t *finished) { + emulator_tick_rs(vec_a_ready, vec_a_valid, vec_a_address, vec_a_is_store, + vec_a_size, vec_a_data, vec_d_ready, vec_d_valid, + vec_d_is_store, vec_d_size, vec_d_data, inflight, finished); } -extern "C" void emulator_generate(uint8_t *vec_a_ready, uint8_t *vec_a_valid, - long long *vec_a_address, - uint8_t *vec_a_is_store, int *vec_a_size, - long long *vec_a_data, uint8_t *vec_d_ready, - uint8_t inflight, uint8_t *finished) { - emulator_generate_rs(vec_a_ready, vec_a_valid, vec_a_address, vec_a_is_store, - vec_a_size, vec_a_data, vec_d_ready, inflight, finished); -} +// extern "C" void emulator_generate(uint8_t *vec_a_ready, uint8_t *vec_a_valid, +// long long *vec_a_address, +// uint8_t *vec_a_is_store, int *vec_a_size, +// long long *vec_a_data, uint8_t *vec_d_ready, +// uint8_t inflight, uint8_t *finished) { +// emulator_generate_rs(vec_a_ready, vec_a_valid, vec_a_address, vec_a_is_store, +// vec_a_size, vec_a_data, 
vec_d_ready, inflight, finished); +// } diff --git a/src/main/resources/vsrc/SimEmulator.v b/src/main/resources/vsrc/SimEmulator.v index 7abb8f7..97e0131 100644 --- a/src/main/resources/vsrc/SimEmulator.v +++ b/src/main/resources/vsrc/SimEmulator.v @@ -9,15 +9,6 @@ import "DPI-C" function void emulator_init( // (2) C function declaration // (3) DPI function calls inside initial/always blocks import "DPI-C" function void emulator_tick -( - output bit vec_d_ready[`MAX_NUM_LANES], - input bit vec_d_valid[`MAX_NUM_LANES], - input bit vec_d_is_store[`MAX_NUM_LANES], - input int vec_d_size[`MAX_NUM_LANES], - input longint vec_d_data[`MAX_NUM_LANES] -); - -import "DPI-C" function void emulator_generate ( input bit vec_a_ready[`MAX_NUM_LANES], output bit vec_a_valid[`MAX_NUM_LANES], @@ -25,7 +16,12 @@ import "DPI-C" function void emulator_generate output bit vec_a_is_store[`MAX_NUM_LANES], output int vec_a_size[`MAX_NUM_LANES], output longint vec_a_data[`MAX_NUM_LANES], + output bit vec_d_ready[`MAX_NUM_LANES], + input bit vec_d_valid[`MAX_NUM_LANES], + input bit vec_d_is_store[`MAX_NUM_LANES], + input int vec_d_size[`MAX_NUM_LANES], + input longint vec_d_data[`MAX_NUM_LANES], input bit inflight, output bit finished @@ -95,6 +91,8 @@ module SimEmulator #(parameter NUM_LANES = 4) ( emulator_init(NUM_LANES); end + // negedge might make it easier to view waveform since DPI changes are + // instant and make it look like they happen before the clockedge always @(posedge clock) begin if (reset) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin @@ -107,14 +105,19 @@ module SimEmulator #(parameter NUM_LANES = 4) ( end __in_finished = 1'b0; end else begin - emulator_generate( + emulator_tick( __out_a_ready, __in_a_valid, __in_a_address, __in_a_is_store, __in_a_size, __in_a_data, + __in_d_ready, + __out_d_valid, + __out_d_is_store, + __out_d_size, + __out_d_data, __out_inflight, __in_finished @@ -125,19 +128,4 @@ module SimEmulator #(parameter NUM_LANES = 4) ( // end 
end end - - // negedge is important here; the DPI logic is essentially functioning as - // a combinational logic, so we want to reflect the signal change from DPI - // at the *current* cycle, not the next. - always @(negedge clock) begin - if (!reset) begin - emulator_tick( - __in_d_ready, - __out_d_valid, - __out_d_is_store, - __out_d_size, - __out_d_data - ); - end - end endmodule From dd2721d26229bec21a3c61359220bad2974bd9be Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 5 Jan 2025 00:12:11 -0800 Subject: [PATCH 14/14] Bump cyclotron --- cyclotron | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cyclotron b/cyclotron index d61fa68..ca6933c 160000 --- a/cyclotron +++ b/cyclotron @@ -1 +1 @@ -Subproject commit d61fa682ad8d1bfb54b1860aeeae47184dc80ba8 +Subproject commit ca6933c4ec0ba1d9d7ec5452d325a8cdd8c2120d