diff --git a/.gitmodules b/.gitmodules index d49652c..29685ca 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,5 +2,10 @@ path = src/main/resources/vsrc/vortex url = git@github.com:hansungk/vortex-private.git [submodule "radpie"] - path = radpie + path = cyclotron url = git@github.com:hansungk/radpie.git +[submodule "cyclotron-main"] + path = cyclotron-main + url = https://github.com/hansungk/cyclotron-main.git +[submodule "cyclotron"] + url = https://github.com/hansungk/cyclotron.git diff --git a/cyclotron b/cyclotron new file mode 160000 index 0000000..ca6933c --- /dev/null +++ b/cyclotron @@ -0,0 +1 @@ +Subproject commit ca6933c4ec0ba1d9d7ec5452d325a8cdd8c2120d diff --git a/cyclotron-main b/cyclotron-main new file mode 160000 index 0000000..25f2e77 --- /dev/null +++ b/cyclotron-main @@ -0,0 +1 @@ +Subproject commit 25f2e7734bfbff5a25fc0f4688fe67adb4116ef0 diff --git a/radiance.mk b/radiance.mk index c0e1e87..7d1abb1 100644 --- a/radiance.mk +++ b/radiance.mk @@ -3,15 +3,18 @@ ############################################################## VORTEX_SRC_DIR = $(base_dir)/generators/radiance/src/main/resources/vsrc/vortex -RADPIE_SRC_DIR = $(base_dir)/generators/radiance/radpie -RADPIE_BUILD_DIR = $(RADPIE_SRC_DIR)/target/release +CYCLOTRON_SRC_DIR = $(base_dir)/generators/radiance/cyclotron +CYCLOTRON_BUILD_DIR = $(CYCLOTRON_SRC_DIR)/target/debug +# CYCLOTRON_BUILD_DIR = $(CYCLOTRON_SRC_DIR)/target/release +RADIANCE_CSRC_DIR = $(base_dir)/generators/radiance/src/main/resources/csrc +RADIANCE_VSRC_DIR = $(base_dir)/generators/radiance/src/main/resources/vsrc ################################################################## # THE FOLLOWING MUST BE += operators ################################################################## -# EXTRA_SIM_REQS += radpie -# EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradpie +EXTRA_SIM_REQS += cyclotron +EXTRA_SIM_LDFLAGS += -L$(CYCLOTRON_BUILD_DIR) -Wl,-rpath,$(CYCLOTRON_BUILD_DIR) -lcyclotron ifeq 
($(shell echo $(CONFIG) | grep -E "SynConfig$$"),$(CONFIG))
   EXTRA_SIM_PREPROC_DEFINES += +define+SYNTHESIS +define+NDEBUG +define+DPI_DISABLE
 endif
@@ -31,21 +34,25 @@ VCS_NONCC_OPTS += +vcs+initreg+random
 
 # cargo handles building of Rust files all on its own, so make this a PHONY
 # target to run cargo unconditionally
-.PHONY: radpie
-radpie:
-	cd $(RADPIE_SRC_DIR) && cargo build --release
+.PHONY: cyclotron
+cyclotron:
+	cd $(CYCLOTRON_SRC_DIR) && cargo build # --release
 
 EXTRA_SIM_REQS += vortex_vsrc.$(CONFIG)
-# below manipulation of VORTEX_VLOG_SOURCES doesn't work if we try to reuse
+# below manipulation of RADIANCE_EXTERNAL_SRCS doesn't work if we try to reuse
 # $(call lookup_srcs) from common.mk, the variable doesn't expand somehow
-ifeq ($(shell which fd 2> /dev/null),)
-  VORTEX_VLOG_SOURCES := $(shell find -L $(VORTEX_SRC_DIR) -type f -iname "*.sv" -o -iname "*.vh" -o -iname "*.v")
+ifeq ($(shell which fdfind 2> /dev/null),)
+  # no fdfind on PATH: fall back to find; the -iname tests are grouped so that -type f applies to all of them
+  RADIANCE_EXTERNAL_SRCS := $(shell find -L $(RADIANCE_VSRC_DIR) -type f \( -iname "*.sv" -o -iname "*.vh" -o -iname "*.v" \))
+  RADIANCE_EXTERNAL_SRCS += $(shell find -L $(RADIANCE_CSRC_DIR) -type f)
 else
-  VORTEX_VLOG_SOURCES := $(shell fd -L -t f -e "sv" -e "vh" -e "v" . $(VORTEX_SRC_DIR))
+  # -H -I: also list hidden and ignored files, so this branch sees the same files as the find branch
+  RADIANCE_EXTERNAL_SRCS := $(shell fdfind -H -I -L -t f -e "sv" -e "vh" -e "v" . $(RADIANCE_VSRC_DIR))
+  RADIANCE_EXTERNAL_SRCS += $(shell fdfind -H -I -L -t f . 
$(RADIANCE_CSRC_DIR)) endif -# VORTEX_COLLATERAL := $(patsubst $(VORTEX_SRC_DIR)%,$(GEN_COLLATERAL_DIR)%,$(VORTEX_VLOG_SOURCES)) -# check if expanded -# $(info VORTEX_VLOG_SOURCES: $(VORTEX_VLOG_SOURCES)) + +# for debug; check if expanded +# $(info RADIANCE_EXTERNAL_SRCS: $(RADIANCE_EXTERNAL_SRCS)) # For every Vortex verilog source file, if there's a matching file in # gen-collateral/, copy them over. This is a hacky way to ensure the changes @@ -53,8 +60,8 @@ endif # necessary when common.mk does not trigger chipyard jar rebuild upon verilog # source updates, in which case we need to manually ensure the up-to-date-ness # of gen-collateral/. -vortex_vsrc.$(CONFIG): $(VORTEX_VLOG_SOURCES) - @for file in $(VORTEX_VLOG_SOURCES); do \ +vortex_vsrc.$(CONFIG): $(RADIANCE_EXTERNAL_SRCS) + @for file in $(RADIANCE_EXTERNAL_SRCS); do \ filename=$$(basename "$$file"); \ if [ -f $(GEN_COLLATERAL_DIR)/$$filename ]; then \ if ! diff $$file $(GEN_COLLATERAL_DIR)/$$filename &>/dev/null ; then \ diff --git a/radpie b/radpie deleted file mode 160000 index 493b8e1..0000000 --- a/radpie +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 493b8e10a5116385946deaaef1a82f6597d7b8a2 diff --git a/src/main/resources/csrc/SimEmulator.cc b/src/main/resources/csrc/SimEmulator.cc new file mode 100644 index 0000000..379eeb2 --- /dev/null +++ b/src/main/resources/csrc/SimEmulator.cc @@ -0,0 +1,43 @@ +#ifndef NO_VPI +#include +#include +#endif +#include + +extern "C" void emulator_init_rs(int num_lanes); +extern "C" void emulator_tick_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid, + long long *vec_a_address, + uint8_t *vec_a_is_store, int *vec_a_size, + long long *vec_a_data, uint8_t *vec_d_ready, + uint8_t *vec_d_valid, uint8_t *vec_d_is_store, + int *vec_d_size, long long *vec_d_data, + uint8_t inflight, uint8_t *finished); +// extern "C" void emulator_generate_rs(uint8_t *vec_a_ready, uint8_t *vec_a_valid, +// long long *vec_a_address, +// uint8_t *vec_a_is_store, int *vec_a_size, +// long long 
*vec_a_data,
+//                                      uint8_t *vec_d_ready, uint8_t inflight,
+//                                      uint8_t *finished);
+
+extern "C" void emulator_init(int num_lanes) { emulator_init_rs(num_lanes); }
+
+extern "C" void emulator_tick(uint8_t *vec_a_ready, uint8_t *vec_a_valid,
+                              long long *vec_a_address, uint8_t *vec_a_is_store,
+                              int *vec_a_size, long long *vec_a_data,
+                              uint8_t *vec_d_ready, uint8_t *vec_d_valid,
+                              uint8_t *vec_d_is_store, int *vec_d_size,
+                              long long *vec_d_data, uint8_t inflight,
+                              uint8_t *finished) {
+  emulator_tick_rs(vec_a_ready, vec_a_valid, vec_a_address, vec_a_is_store,
+                   vec_a_size, vec_a_data, vec_d_ready, vec_d_valid,
+                   vec_d_is_store, vec_d_size, vec_d_data, inflight, finished);
+}
+
+// extern "C" void emulator_generate(uint8_t *vec_a_ready, uint8_t *vec_a_valid,
+//                                   long long *vec_a_address,
+//                                   uint8_t *vec_a_is_store, int *vec_a_size,
+//                                   long long *vec_a_data, uint8_t *vec_d_ready,
+//                                   uint8_t inflight, uint8_t *finished) {
+//   emulator_generate_rs(vec_a_ready, vec_a_valid, vec_a_address, vec_a_is_store,
+//                        vec_a_size, vec_a_data, vec_d_ready, inflight, finished);
+// }
diff --git a/src/main/resources/csrc/SimMemFuzzer.cc b/src/main/resources/csrc/SimMemFuzzer.cc
index e0a7799..5572048 100644
--- a/src/main/resources/csrc/SimMemFuzzer.cc
+++ b/src/main/resources/csrc/SimMemFuzzer.cc
@@ -2,7 +2,6 @@
 #include
 #include
 #endif
-#include
 #include
 
 extern "C" void memfuzz_init_rs(int num_lanes);
diff --git a/src/main/resources/vsrc/SimEmulator.v b/src/main/resources/vsrc/SimEmulator.v
new file mode 100644
index 0000000..97e0131
--- /dev/null
+++ b/src/main/resources/vsrc/SimEmulator.v
@@ -0,0 +1,131 @@
+`include "SimDefaults.vh"
+
+import "DPI-C" function void emulator_init(
+  input int num_lanes
+);
+
+// Make sure to sync the parameters for:
+// (1) import "DPI-C" declaration
+// (2) C function declaration
+// (3) DPI function calls inside initial/always blocks
+import "DPI-C" function void emulator_tick
+(
+  input  bit     vec_a_ready[`MAX_NUM_LANES],
+  output bit 
vec_a_valid[`MAX_NUM_LANES], + output longint vec_a_address[`MAX_NUM_LANES], + output bit vec_a_is_store[`MAX_NUM_LANES], + output int vec_a_size[`MAX_NUM_LANES], + output longint vec_a_data[`MAX_NUM_LANES], + + output bit vec_d_ready[`MAX_NUM_LANES], + input bit vec_d_valid[`MAX_NUM_LANES], + input bit vec_d_is_store[`MAX_NUM_LANES], + input int vec_d_size[`MAX_NUM_LANES], + input longint vec_d_data[`MAX_NUM_LANES], + + input bit inflight, + output bit finished +); + +module SimEmulator #(parameter NUM_LANES = 4) ( + input clock, + input reset, + + input [NUM_LANES-1:0] a_ready, + output [NUM_LANES-1:0] a_valid, + output [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] a_address, + output [NUM_LANES-1:0] a_is_store, + output [`SIMMEM_LOGSIZE_WIDTH*NUM_LANES-1:0] a_size, + output [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] a_data, + + output [NUM_LANES-1:0] d_ready, + input [NUM_LANES-1:0] d_valid, + input [NUM_LANES-1:0] d_is_store, + input [`SIMMEM_LOGSIZE_WIDTH*NUM_LANES-1:0] d_size, + input [`SIMMEM_DATA_WIDTH*NUM_LANES-1:0] d_data, + // TODO: d_mask + + input inflight, + output finished +); + // "in": C->verilog, "out": verilog->C + // need to be in ascending order to match with C indexing + // C array sizes are static, so need to use MAX_NUM_LANES + bit __out_a_ready [0:`MAX_NUM_LANES-1]; + bit __in_a_valid [0:`MAX_NUM_LANES-1]; + longint __in_a_address [0:`MAX_NUM_LANES-1]; + bit __in_a_is_store [0:`MAX_NUM_LANES-1]; + int __in_a_size [0:`MAX_NUM_LANES-1]; + longint __in_a_data [0:`MAX_NUM_LANES-1]; + bit __in_d_ready [0:`MAX_NUM_LANES-1]; + bit __out_d_valid [0:`MAX_NUM_LANES-1]; + bit __out_d_is_store [0:`MAX_NUM_LANES-1]; + int __out_d_size [0:`MAX_NUM_LANES-1]; + longint __out_d_data [0:`MAX_NUM_LANES-1]; + bit __out_inflight; + bit __in_finished; + + genvar g; + generate + for (g = 0; g < NUM_LANES; g = g + 1) begin + assign __out_a_ready[g] = a_ready[g]; + assign a_valid[g] = __in_a_valid[g]; + assign a_address[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH] + = 
__in_a_address[g][`SIMMEM_DATA_WIDTH-1:0]; + assign a_is_store[g] = __in_a_is_store[g]; + assign a_size[`SIMMEM_LOGSIZE_WIDTH*g +: `SIMMEM_LOGSIZE_WIDTH] + = __in_a_size[g][`SIMMEM_LOGSIZE_WIDTH-1:0]; + assign a_data[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH] + = __in_a_data[g][`SIMMEM_DATA_WIDTH-1:0]; + assign d_ready[g] = __in_d_ready[g]; + assign __out_d_valid[g] = d_valid[g]; + assign __out_d_is_store[g] = d_is_store[g]; + assign __out_d_size[g] = d_size[`SIMMEM_LOGSIZE_WIDTH*g +: `SIMMEM_LOGSIZE_WIDTH]; + assign __out_d_data[g] = d_data[`SIMMEM_DATA_WIDTH*g +: `SIMMEM_DATA_WIDTH]; + end + assign __out_inflight = inflight; + endgenerate + assign finished = __in_finished; + + initial begin + emulator_init(NUM_LANES); + end + + // negedge might make it easier to view waveform since DPI changes are + // instant and make it look like they happen before the clockedge + always @(posedge clock) begin + if (reset) begin + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin + __in_a_valid[tid] = 1'b0; + __in_a_address[tid] = `SIMMEM_DATA_WIDTH'b0; + __in_a_is_store[tid] = 1'b0; + __in_a_size[tid] = 32'b0; + __in_a_data[tid] = `SIMMEM_DATA_WIDTH'b0; + __in_d_ready[tid] = 1'b0; + end + __in_finished = 1'b0; + end else begin + emulator_tick( + __out_a_ready, + __in_a_valid, + __in_a_address, + __in_a_is_store, + __in_a_size, + __in_a_data, + + __in_d_ready, + __out_d_valid, + __out_d_is_store, + __out_d_size, + __out_d_data, + + __out_inflight, + __in_finished + ); + // for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin + // $display("verilog: %04d a_valid[%d]=%d, a_address[%d]=0x%x, d_ready[%d]=%d", + // $time, tid, __in_a_valid[tid], tid, __in_a_address[tid], tid, __in_d_ready[tid]); + // end + end + end +endmodule diff --git a/src/main/resources/vsrc/SimMemFuzzer.v b/src/main/resources/vsrc/SimMemFuzzer.v index 263eb24..8d50121 100644 --- a/src/main/resources/vsrc/SimMemFuzzer.v +++ b/src/main/resources/vsrc/SimMemFuzzer.v @@ -47,7 +47,7 @@ module 
SimMemFuzzer #(parameter NUM_LANES = 4) ( input inflight, output finished ); - // "in": verilog->C, "out": C->verilog + // "in": C->verilog, "out": verilog->C // need to be in ascending order to match with C indexing // C array sizes are static, so need to use MAX_NUM_LANES bit __out_a_ready [0:`MAX_NUM_LANES-1]; diff --git a/src/main/scala/radiance/core/Emulator.scala b/src/main/scala/radiance/core/Emulator.scala new file mode 100644 index 0000000..06a9579 --- /dev/null +++ b/src/main/scala/radiance/core/Emulator.scala @@ -0,0 +1,246 @@ +package radiance.core + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.cde.config.{Field, Parameters} +import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp} +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.diplomacy.{IdRange, AddressSet, BufferParams} +import radiance.memory.{SourceGenerator, TraceLine, TLPrintf} + +case class SIMTCoreParams( + nWarps: Int = 4, // # of warps in the core + nCoreLanes: Int = 4, // # of SIMT threads in the core + nMemLanes: Int = 4, // # of memory lanes in the memory interface to the + // cache; relates to the LSU lanes + nSrcIds: Int = 8 // # of source IDs allocated to each of the nMemLanes +) +case class MemtraceCoreParams( + tracefilename: String = "undefined", + traceHasSource: Boolean = false +) + +case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None /*default*/ ) +case object MemtraceCoreKey + extends Field[Option[MemtraceCoreParams]](None /*default*/ ) + +// ############################################################################# +// FIXME: copy-paste from MemFuzzer +// ############################################################################# + +class Emulator( + numLanes: Int, + numSrcIds: Int, + wordSizeInBytes: Int, +)(implicit p: Parameters) + extends LazyModule { + val laneNodes = Seq.tabulate(numLanes) { i => + val clientParam = Seq( + TLMasterParameters.v1( + name = "Emulator" + i.toString, + sourceId = 
IdRange(0, numSrcIds)
+        // visibility = Seq(AddressSet(0x0000, 0xffffff))
+      )
+    )
+    TLClientNode(Seq(TLMasterPortParameters.v1(clientParam)))
+  }
+
+  val node = TLIdentityNode()
+  laneNodes.foreach(node := _)
+
+  lazy val module = new EmulatorImp(this, numLanes, numSrcIds, wordSizeInBytes)
+}
+
+class EmulatorImp(
+    outer: Emulator,
+    numLanes : Int,
+    numSrcIds: Int,
+    wordSizeInBytes: Int,
+) extends LazyModuleImp(outer) {
+  val io = IO(new Bundle {
+    val finished = Output(Bool())
+  })
+  val sim = Module(new SimEmulator(numLanes))
+
+  sim.io.clock := clock
+  sim.io.reset := reset.asBool
+
+  // sim.io.a.ready is driven once, further down, from laneReqs(_).ready
+  // (tlOut.a.ready gated by source-ID availability).  Do not also connect
+  // tlOut.a.ready directly here: under Chisel's last-connect semantics the
+  // later assignment silently overrides it.
+
+  io.finished := sim.io.finished
+
+  // connect Verilog <-> Chisel IO
+  // Verilog IO flattened across all lanes
+  val laneReqs = Wire(Vec(numLanes, Decoupled(new TraceLine)))
+  val addrW = laneReqs(0).bits.address.getWidth
+  val sizeW = laneReqs(0).bits.size.getWidth
+  val dataW = laneReqs(0).bits.data.getWidth
+  laneReqs.zipWithIndex.foreach { case (req, i) =>
+    req.valid := sim.io.a.valid(i)
+    req.bits.source := 0.U // DPI doesn't generate a source id
+    req.bits.address := sim.io.a.address(addrW * (i + 1) - 1, addrW * i)
+    req.bits.is_store := sim.io.a.is_store(i)
+    req.bits.size := sim.io.a.size(sizeW * (i + 1) - 1, sizeW * i)
+    req.bits.data := sim.io.a.data(dataW * (i + 1) - 1, dataW * i)
+  }
+  sim.io.a.ready := VecInit(laneReqs.map(_.ready)).asUInt
+
+  val laneResps = Wire(Vec(numLanes, Flipped(Decoupled(new TraceLine))))
+  laneResps.zipWithIndex.foreach { case (resp, i) =>
+    resp.ready := sim.io.d.ready(i)
+    // TODO: not handled in DPI
+    resp.bits.source := DontCare
+    resp.bits.address := DontCare
+    resp.bits.data := DontCare
+  }
+  sim.io.d.valid := VecInit(laneResps.map(_.valid)).asUInt
+  sim.io.d.is_store := VecInit(laneResps.map(_.bits.is_store)).asUInt
+  sim.io.d.size := VecInit(laneResps.map(_.bits.size)).asUInt
+  sim.io.d.data := 
VecInit(laneResps.map(_.bits.data)).asUInt
+
+  val sourceGens = Seq.fill(numLanes)(
+    Module(
+      new SourceGenerator(
+        log2Ceil(numSrcIds),
+        ignoreInUse = false
+      )
+    )
+  )
+  val anyInflight = sourceGens.map(_.io.inflight).reduce(_ || _)
+  sim.io.inflight := anyInflight
+
+  // Take requests off of the queue and generate TL requests
+  (outer.laneNodes zip (laneReqs zip laneResps)).zipWithIndex.foreach {
+    case ((node, (req, resp)), lane) =>
+      val (tlOut, edge) = node.out(0)
+
+      // Requests --------------------------------------------------------------
+      //
+      // Core only makes accesses of granularity larger than a word, so we want
+      // the trace driver to act so as well.
+      // That means if req.size is smaller than word size, we need to pad data
+      // with zeros to generate a word-size request, and set mask accordingly.
+      val offsetInWord = req.bits.address % wordSizeInBytes.U
+      val subword = req.bits.size < log2Ceil(wordSizeInBytes).U
+
+      // `mask` is currently unused
+      // val mask = Wire(UInt(wordSizeInBytes.W))
+      val wordData = Wire(UInt((wordSizeInBytes * 8 * 2).W))
+      val sizeInBytes = Wire(UInt((sizeW + 1).W))
+      sizeInBytes := (1.U) << req.bits.size
+      // mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U)
+      wordData := Mux(subword, req.bits.data << (offsetInWord * 8.U), req.bits.data)
+      val wordAlignedAddress =
+        req.bits.address & ~((1 << log2Ceil(wordSizeInBytes)) - 1).U(addrW.W)
+      val wordAlignedSize = Mux(subword, log2Ceil(wordSizeInBytes).U, req.bits.size)
+
+      val sourceGen = sourceGens(lane)
+      sourceGen.io.gen := tlOut.a.fire
+      sourceGen.io.reclaim.valid := tlOut.d.fire
+      sourceGen.io.reclaim.bits := tlOut.d.bits.source
+      sourceGen.io.meta := DontCare
+
+      val (plegal, pbits) = edge.Put(
+        fromSource = sourceGen.io.id.bits,
+        toAddress = wordAlignedAddress,
+        lgSize = wordAlignedSize, // trace line already holds log2(size)
+        // data should be aligned to beatBytes
+        data =
+          (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))).asUInt
+      )
+      
val (glegal, gbits) = edge.Get( + fromSource = sourceGen.io.id.bits, + toAddress = wordAlignedAddress, + lgSize = wordAlignedSize + ) + val legal = Mux(req.bits.is_store, plegal, glegal) + val bits = Mux(req.bits.is_store, pbits, gbits) + + tlOut.a.valid := req.valid && sourceGen.io.id.valid + req.ready := tlOut.a.ready && sourceGen.io.id.valid + + when(tlOut.a.fire) { + assert(legal, "illegal TL req gen") + } + tlOut.a.bits := bits + + // Responses ------------------------------------------------------------- + // + tlOut.d.ready := resp.ready + resp.valid := tlOut.d.valid + resp.bits.is_store := !edge.hasData(tlOut.d.bits) + resp.bits.size := tlOut.d.bits.size + + tlOut.b.ready := true.B + tlOut.c.valid := false.B + tlOut.e.valid := false.B + + // debug + dontTouch(req) + when(tlOut.a.valid) { + printf(s"Lane ${lane}: "); + TLPrintf( + "Emulator", + tlOut.a.bits.source, + tlOut.a.bits.address, + tlOut.a.bits.size, + tlOut.a.bits.mask, + req.bits.is_store, + tlOut.a.bits.data, + req.bits.data + ) + } + dontTouch(tlOut.a) + dontTouch(tlOut.d) + } + + // when(traceFinished && allReqReclaimed && noValidReqs) { + // assert( + // false.B, + // "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)" + // ) + // } +} + +class SimEmulator(numLanes: Int) + extends BlackBox(Map("NUM_LANES" -> numLanes)) + with HasBlackBoxResource { + val traceLineT = new TraceLine + val addrW = traceLineT.address.getWidth + val sizeW = traceLineT.size.getWidth + val dataW = traceLineT.data.getWidth + val io = IO(new Bundle { + val clock = Input(Clock()) + val reset = Input(Bool()) + val inflight = Input(Bool()) + val finished = Output(Bool()) + + val a = + new Bundle { + val ready = Input(UInt(numLanes.W)) + val valid = Output(UInt(numLanes.W)) + // Chisel can't interface with Verilog 2D port, so flatten all lanes into + // single wide 1D array. 
+ val address = Output(UInt((addrW * numLanes).W)) + val is_store = Output(UInt(numLanes.W)) + val size = Output(UInt((sizeW * numLanes).W)) + val data = Output(UInt((dataW * numLanes).W)) + } + val d = + new Bundle { + val ready = Output(UInt(numLanes.W)) + val valid = Input(UInt(numLanes.W)) + val is_store = Input(UInt(numLanes.W)) + val size = Input(UInt((sizeW * numLanes).W)) + val data = Input(UInt((dataW * numLanes).W)) + } + }) + + addResource("/vsrc/SimDefaults.vh") + addResource("/vsrc/SimEmulator.v") + addResource("/csrc/SimEmulator.cc") +} + diff --git a/src/main/scala/radiance/core/TensorDPU.scala b/src/main/scala/radiance/core/TensorDPU.scala index a4e6db0..ce131df 100644 --- a/src/main/scala/radiance/core/TensorDPU.scala +++ b/src/main/scala/radiance/core/TensorDPU.scala @@ -147,7 +147,7 @@ class DotProductPipe(dim: Int, inputType: tile.FType, outputType: tile.FType) ex val mulSigWidth = m.io.rawOut.sigWidth val roundRawFNToRecFN = Module(new hardfloat.RoundAnyRawFNToRecFN( - mulExpWidth, mulSigWidth, outExpWidth, outSigWidth, 0)) + mulExpWidth, mulSigWidth, expWidth, sigWidth, 0)) roundRawFNToRecFN.io.invalidExc := m.io.invalidExc roundRawFNToRecFN.io.infiniteExc := false.B roundRawFNToRecFN.io.in := m.io.rawOut @@ -169,7 +169,7 @@ class DotProductPipe(dim: Int, inputType: tile.FType, outputType: tile.FType) ex // instantiate wires for input values to each reduction pipeline stage val interim = (log2Dim to 0 by -1).map { i => - Wire(Valid(Vec(1 << i, Bits(recOutFLen.W)))) + Wire(Valid(Vec(1 << i, Bits(recInFLen.W)))) } // instantiate wires for pipe registers for C val interimC = (log2Dim to 0 by -1).map( _ => Wire(Valid(Bits(recOutFLen.W))) ) @@ -186,7 +186,7 @@ class DotProductPipe(dim: Int, inputType: tile.FType, outputType: tile.FType) ex require(inputs.bits.length == 2 * outputs.bits.length) val thisDim = inputs.bits.length val adders = Seq.fill(thisDim / 2)( - Module(new hardfloat.AddRecFN(outExpWidth, outSigWidth)) + Module(new 
hardfloat.AddRecFN(expWidth, sigWidth)) ) val addOuts = adders.zipWithIndex.map { case (a, i) => a.io.subOp := 0.U // FIXME dont know what this is @@ -212,9 +212,15 @@ class DotProductPipe(dim: Int, inputType: tile.FType, outputType: tile.FType) ex // add stages end ------------------------------------------------------------ // add final A and B dot-product result to accumulator C + val conv = Module(new hardfloat.RecFNToRecFN(expWidth, sigWidth, outExpWidth, outSigWidth)) + conv.io.in := addStageOut.bits(0) + conv.io.roundingMode := hardfloat.consts.round_near_even + conv.io.detectTininess := hardfloat.consts.tininess_afterRounding + // assert(conv.io.exceptionFlags === 0.U) + val acc = Module(new hardfloat.AddRecFN(outExpWidth, outSigWidth)) acc.io.subOp := 0.U // FIXME - acc.io.a := addStageOut.bits(0) + acc.io.a := conv.io.out acc.io.b := addStageC.bits acc.io.roundingMode := hardfloat.consts.round_near_even acc.io.detectTininess := hardfloat.consts.tininess_afterRounding diff --git a/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala b/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala index 0801071..658db38 100644 --- a/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala +++ b/src/main/scala/radiance/memory/CanHaveMemtraceCore.scala @@ -4,6 +4,7 @@ import freechips.rocketchip.diplomacy.LazyModule import freechips.rocketchip.subsystem._ import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.tilelink._ +import radiance.core.{SIMTCoreKey, MemtraceCoreKey} // TODO: possibly move to somewhere closer to CoalescingUnit // TODO: separate coalescer config from CanHaveMemtraceCore diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala index a21daee..5f24cc3 100644 --- a/src/main/scala/radiance/memory/Coalescing.scala +++ b/src/main/scala/radiance/memory/Coalescing.scala @@ -10,25 +10,10 @@ import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp} import 
freechips.rocketchip.util.{Code, MultiPortQueue, OnePortLanePositionedQueue} import freechips.rocketchip.unittest._ import freechips.rocketchip.tilelink._ +import radiance.core.{SIMTCoreParams, SIMTCoreKey} -// TODO: find better place for these - -case class SIMTCoreParams( - nWarps: Int = 4, // # of warps in the core - nCoreLanes: Int = 4, // # of SIMT threads in the core - nMemLanes: Int = 4, // # of memory lanes in the memory interface to the - // cache; relates to the LSU lanes - nSrcIds: Int = 8 // # of source IDs allocated to each of the nMemLanes -) -case class MemtraceCoreParams( - tracefilename: String = "undefined", - traceHasSource: Boolean = false -) case class CoalXbarParam() -case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None /*default*/ ) -case object MemtraceCoreKey - extends Field[Option[MemtraceCoreParams]](None /*default*/ ) case object CoalescerKey extends Field[Option[CoalescerConfig]](None /*default*/ ) case object CoalXbarKey extends Field[Option[CoalXbarParam]](None /*default*/ ) @@ -2055,7 +2040,7 @@ class MemFuzzer( val laneNodes = Seq.tabulate(numLanes) { i => val clientParam = Seq( TLMasterParameters.v1( - name = "MemTraceDriver" + i.toString, + name = "MemFuzzer" + i.toString, sourceId = IdRange(0, numSrcIds) // visibility = Seq(AddressSet(0x0000, 0xffffff)) ) diff --git a/src/main/scala/radiance/subsystem/Configs.scala b/src/main/scala/radiance/subsystem/Configs.scala index 98addce..c2cdb18 100644 --- a/src/main/scala/radiance/subsystem/Configs.scala +++ b/src/main/scala/radiance/subsystem/Configs.scala @@ -12,6 +12,7 @@ import freechips.rocketchip.subsystem._ import gemmini._ import gemmini.Arithmetic.FloatArithmetic._ import radiance.tile._ +import radiance.core._ import radiance.memory._ import radiance.subsystem.RadianceGemminiDataType.{BF16, FP16, FP32, Int8} @@ -106,6 +107,44 @@ class WithRadianceCores( ), tensorCoreFP16, tensorCoreDecoupled, useVxCache) } +class WithEmulatorCores( + n: Int, + useVxCache: Boolean 
+) extends Config((site, _, up) => {
+  case TilesLocated(InSubsystem) => {
+    val prev = up(TilesLocated(InSubsystem))
+    val idOffset = up(NumTiles)
+    val emulator = EmulatorTileParams(
+      core = VortexCoreParams(),
+      useVxCache = useVxCache)
+    List.tabulate(n)(i => EmulatorTileAttachParams(
+      emulator.copy(tileId = i + idOffset),
+      RocketCrossingParams()
+    )) ++ prev
+  }
+  case NumTiles => up(NumTiles) + n
+  case NumRadianceCores => up(NumRadianceCores) + 1
+})
+
+class WithFuzzerCores(
+  n: Int,
+  useVxCache: Boolean
+) extends Config((site, _, up) => {
+  case TilesLocated(InSubsystem) => {
+    val prev = up(TilesLocated(InSubsystem))
+    val idOffset = up(NumTiles)
+    val fuzzer = FuzzerTileParams(
+      core = VortexCoreParams(),
+      useVxCache = useVxCache)
+    List.tabulate(n)(i => FuzzerTileAttachParams(
+      fuzzer.copy(tileId = i + idOffset),
+      RocketCrossingParams()
+    )) ++ prev
+  }
+  case NumTiles => up(NumTiles) + n
+  case NumRadianceCores => up(NumRadianceCores) + 1
+})
+
 object RadianceGemminiDataType extends Enumeration {
   type Type = Value
   val FP32, FP16, BF16, Int8 = Value
@@ -244,25 +283,6 @@ class WithRadianceFrameBuffer(baseAddress: BigInt,
   }
 })
 
-class WithFuzzerCores(
-  n: Int,
-  useVxCache: Boolean
-) extends Config((site, _, up) => {
-  case TilesLocated(InSubsystem) => {
-    val prev = up(TilesLocated(InSubsystem))
-    val idOffset = up(NumTiles)
-    val fuzzer = FuzzerTileParams(
-      core = VortexCoreParams(),
-      useVxCache = useVxCache)
-    List.tabulate(n)(i => FuzzerTileAttachParams(
-      fuzzer.copy(tileId = i + idOffset),
-      RocketCrossingParams()
-    )) ++ prev
-  }
-  case NumTiles => up(NumTiles) + 1
-  case NumRadianceCores => up(NumRadianceCores) + 1
-})
-
 class WithRadianceCluster(
   clusterId: Int,
   location: HierarchicalLocation = InSubsystem,
diff --git a/src/main/scala/radiance/tile/EmulatorTile.scala b/src/main/scala/radiance/tile/EmulatorTile.scala
new file mode 100644
index 0000000..d6881ca
--- /dev/null
+++ b/src/main/scala/radiance/tile/EmulatorTile.scala
@@ -0,0 
+1,96 @@ +// See LICENSE.SiFive for license details. +// See LICENSE.Berkeley for license details. + +package radiance.tile + +import chisel3._ +import org.chipsalliance.cde.config.Parameters +import org.chipsalliance.diplomacy.lazymodule.LazyModule +import freechips.rocketchip.resources.SimpleDevice +import freechips.rocketchip.prci.ClockCrossingType +import freechips.rocketchip.rocket._ +import freechips.rocketchip.tile._ +import freechips.rocketchip.tilelink._ +import freechips.rocketchip.subsystem.{HierarchicalElementCrossingParamsLike, CanAttachTile} +import freechips.rocketchip.prci.{ClockSinkParameters} +import radiance.core._ +import radiance.memory.{CoalescingUnit, CoalescerKey} + +// TODO: De-duplicate between this and FuzzerTile + +case class EmulatorTileParams( + core: VortexCoreParams = VortexCoreParams(), // TODO: remove this + useVxCache: Boolean = false, + tileId: Int = 0, +) extends InstantiableTileParams[EmulatorTile] { + def instantiate(crossing: HierarchicalElementCrossingParamsLike, lookup: LookupByHartIdImpl)( + implicit p: Parameters + ): EmulatorTile = { + new EmulatorTile(this, crossing, lookup) + } + val clockSinkParams = ClockSinkParameters() + val blockerCtrlAddr = None + val icache = None + val dcache = None + val btb = None + val baseName = "radiance_emulator_tile" + val uniqueName = s"${baseName}_$tileId" +} + +case class EmulatorTileAttachParams( + tileParams: EmulatorTileParams, + crossingParams: HierarchicalElementCrossingParamsLike +) extends CanAttachTile { type TileType = EmulatorTile } + +class EmulatorTile private ( + val EmulatorParams: EmulatorTileParams, + crossing: ClockCrossingType, + lookup: LookupByHartIdImpl, + q: Parameters +) extends BaseTile(EmulatorParams, crossing, lookup, q) + with SinksExternalInterrupts + with SourcesExternalNotifications { + def this( + params: EmulatorTileParams, + crossing: HierarchicalElementCrossingParamsLike, + lookup: LookupByHartIdImpl + )(implicit p: Parameters) = + this(params, 
crossing.crossingType, lookup, p)
+
+  val cpuDevice: SimpleDevice = new SimpleDevice("emulator", Nil)
+
+  val intOutwardNode = None
+  val slaveNode: TLInwardNode = TLIdentityNode()
+  val masterNode = visibilityNode
+  // val statusNode = BundleBridgeSource(() => new GroundTestStatus)
+
+  val (numLanes, numSrcIds) = p(SIMTCoreKey) match {
+    case Some(param) => (param.nMemLanes, param.nSrcIds)
+    case None => {
+      require(false, "emulator requires SIMTCoreKey to be defined")
+      (0, 0)
+    }
+  }
+  // FIXME: parameterize
+  val wordSizeInBytes = 4
+
+  val emulator = LazyModule(new Emulator(numLanes, numSrcIds, wordSizeInBytes))
+
+  // Conditionally instantiate memory coalescer
+  val coalescerNode = p(CoalescerKey) match {
+    case Some(coalParam) => {
+      val coal = LazyModule(new CoalescingUnit(coalParam))
+      coal.cpuNode :=* TLWidthWidget(wordSizeInBytes) :=* emulator.node
+      coal.aggregateNode
+    }
+    case None => emulator.node
+  }
+
+  masterNode :=* coalescerNode
+
+  override lazy val module = new EmulatorTileModuleImp(this)
+}
+
+class EmulatorTileModuleImp(outer: EmulatorTile) extends BaseTileModuleImp(outer) {
+  outer.reportCease(Some(outer.emulator.module.io.finished))
+}
diff --git a/src/main/scala/radiance/tile/FuzzerTile.scala b/src/main/scala/radiance/tile/FuzzerTile.scala
index 138b0f8..5e17672 100644
--- a/src/main/scala/radiance/tile/FuzzerTile.scala
+++ b/src/main/scala/radiance/tile/FuzzerTile.scala
@@ -4,14 +4,16 @@
 package radiance.tile
 
 import chisel3._
-import org.chipsalliance.cde.config.{Parameters}
-import freechips.rocketchip.diplomacy.{SimpleDevice, LazyModule}
+import org.chipsalliance.cde.config.Parameters
+import org.chipsalliance.diplomacy.lazymodule.LazyModule
+import freechips.rocketchip.resources.SimpleDevice
 import freechips.rocketchip.prci.ClockCrossingType
 import freechips.rocketchip.rocket._
 import freechips.rocketchip.tile._
 import freechips.rocketchip.tilelink._
 import freechips.rocketchip.subsystem.{HierarchicalElementCrossingParamsLike, CanAttachTile}
 import 
freechips.rocketchip.prci.{ClockSinkParameters} +import radiance.core.{SIMTCoreKey} import radiance.memory._ case class FuzzerTileParams( diff --git a/src/main/scala/radiance/tile/RadianceTile.scala b/src/main/scala/radiance/tile/RadianceTile.scala index 202543a..f4e4165 100644 --- a/src/main/scala/radiance/tile/RadianceTile.scala +++ b/src/main/scala/radiance/tile/RadianceTile.scala @@ -19,6 +19,7 @@ import freechips.rocketchip.tilelink._ import freechips.rocketchip.util._ import midas.targetutils.SynthesizePrintf import org.chipsalliance.cde.config._ +import radiance.core._ import radiance.memory._ import radiance.subsystem.{GPUMemParams, GPUMemory, RadianceSimArgs} diff --git a/src/test/scala/radiance/TensorCoreDecoupledTest.scala b/src/test/scala/radiance/TensorCoreDecoupledTest.scala index 7b31eb7..619f699 100644 --- a/src/test/scala/radiance/TensorCoreDecoupledTest.scala +++ b/src/test/scala/radiance/TensorCoreDecoupledTest.scala @@ -9,7 +9,7 @@ class TensorCoreDecoupledTest extends AnyFlatSpec with ChiselScalatestTester { behavior of "TensorCoreDecoupled" it should "do the right thing" in { - test(new TensorCoreDecoupled(8, 8, numSourceIds = 4, tilingParams = TensorTilingParams())) + test(new TensorCoreDecoupled(8, 8, numSourceIds = 4, half = true)) { c => c.io.initiate.valid.poke(true.B) c.io.initiate.bits.wid.poke(0.U)