diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index 32b9012..20960b6 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -35,7 +35,7 @@ void MemTraceReader::parse() { printf("MemTraceReader: started parsing\n"); while (infile >> line.cycle >> line.loadstore >> line.core_id >> - line.thread_id >> std::hex >> line.address >> line.data >> std::dec >> + line.lane_id >> std::hex >> line.address >> line.data >> std::dec >> line.data_size) { line.valid = true; trace.push_back(line); @@ -49,7 +49,7 @@ void MemTraceReader::parse() { // given SIMD lane (= "thread"). In case no request happened at that point, // return an empty line with .valid = false. MemTraceLine MemTraceReader::read_trace_at(const long cycle, - const int thread_id) { + const int lane_id) { MemTraceLine line; line.valid = false; @@ -67,17 +67,17 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, assert(false && "some trace lines are left unread in the past"); } - if (line.thread_id != thread_id) { + if (line.lane_id != lane_id) { line.valid = false; } if (line.cycle > cycle) { // We haven't reached the cycle mark specified in this line yet, so we don't // read it right now. return MemTraceLine{}; - } else if (line.cycle == cycle && line.thread_id == thread_id) { + } else if (line.cycle == cycle && line.lane_id == lane_id) { printf("fire! cycle=%ld, valid=%d, %s \n", cycle, line.valid, line.loadstore); - // FIXME! Currently thread_id is assumed to be in round-robin order, e.g. + // FIXME! Currently lane_id is assumed to be in round-robin order, e.g. // 0->1->2->3->0->..., both in the trace file and the order the caller calls // this function. If this is not true, we cannot simply monotonically // increment read_pos. @@ -101,7 +101,7 @@ extern "C" void memtrace_init(const char *filename) { // TODO: accept core_id as well extern "C" void memtrace_query(unsigned char trace_read_ready, unsigned long trace_read_cycle, - int trace_read_thread_id, + int trace_read_lane_id, unsigned char *trace_read_valid, unsigned long *trace_read_address, unsigned char *trace_read_is_store, @@ -109,13 +109,13 @@ extern "C" void memtrace_query(unsigned char trace_read_ready, unsigned long *trace_read_data, unsigned char *trace_read_finished) { // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, - // trace_read_thread_id); + // trace_read_lane_id); if (!trace_read_ready) { return; } - auto line = reader->read_trace_at(trace_read_cycle, trace_read_thread_id); + auto line = reader->read_trace_at(trace_read_cycle, trace_read_lane_id); *trace_read_valid = line.valid; *trace_read_address = line.address; *trace_read_is_store = strcmp(line.loadstore, "STORE") == 0 ; diff --git a/src/main/resources/csrc/SimMemTrace.h b/src/main/resources/csrc/SimMemTrace.h index 94ffef8..b046fcc 100644 --- a/src/main/resources/csrc/SimMemTrace.h +++ b/src/main/resources/csrc/SimMemTrace.h @@ -12,7 +12,7 @@ struct MemTraceLine { long cycle = 0; char loadstore[10]; int core_id = 0; - int thread_id = 0; + int lane_id = 0; unsigned long address = 0; unsigned long data = 0; int data_size = 0; @@ -23,7 +23,7 @@ public: MemTraceReader(const std::string &filename); ~MemTraceReader(); void parse(); - MemTraceLine read_trace_at(const long cycle, const int thread_id); + MemTraceLine read_trace_at(const long cycle, const int lane_id); bool finished() const { return read_pos == trace.cend(); } std::ifstream infile; @@ -34,7 +34,7 @@ public: extern "C" void memtrace_init(const char *filename); extern "C" void memtrace_query(unsigned char trace_read_ready, unsigned long trace_read_cycle, - int trace_read_thread_id, + int trace_read_lane_id, unsigned char *trace_read_valid, unsigned long *trace_read_address, unsigned char *trace_read_is_store, diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index cdf2d8b..9a91848 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -1,5 +1,5 @@ `define DATA_WIDTH 64 -`define MAX_NUM_THREADS 32 +`define MAX_NUM_LANES 32 `define MASK_WIDTH 8 import "DPI-C" function void memtrace_init( @@ -23,26 +23,26 @@ import "DPI-C" function void memtrace_query output bit trace_read_finished ); -module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) ( +module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( input clock, input reset, // These have to match the IO port of the Chisel wrapper module. input trace_read_ready, - output [NUM_THREADS-1:0] trace_read_valid, - output [`DATA_WIDTH*NUM_THREADS-1:0] trace_read_address, + output [NUM_LANES-1:0] trace_read_valid, + output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_address, - output [NUM_THREADS-1:0] trace_read_is_store, - output [NUM_THREADS*`MASK_WIDTH-1:0] trace_read_store_mask, - output [`DATA_WIDTH*NUM_THREADS-1:0] trace_read_data, + output [NUM_LANES-1:0] trace_read_is_store, + output [NUM_LANES*`MASK_WIDTH-1:0] trace_read_store_mask, + output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data, output trace_read_finished ); - bit __in_valid[NUM_THREADS-1:0]; - longint __in_address[NUM_THREADS-1:0]; + bit __in_valid[NUM_LANES-1:0]; + longint __in_address[NUM_LANES-1:0]; - bit __in_is_store[NUM_THREADS-1:0]; - int __in_store_mask [NUM_THREADS-1:0]; - longint __in_data[NUM_THREADS-1:0]; + bit __in_is_store[NUM_LANES-1:0]; + int __in_store_mask [NUM_LANES-1:0]; + longint __in_data[NUM_LANES-1:0]; bit __in_finished; string __uartlog; @@ -54,18 +54,18 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) ( assign next_cycle_counter = cycle_counter + 1'b1; // registers that stage outputs of the C parser - reg [NUM_THREADS-1:0] __in_valid_reg; - reg [`DATA_WIDTH-1:0] __in_address_reg [NUM_THREADS-1:0]; + reg [NUM_LANES-1:0] __in_valid_reg; + reg [`DATA_WIDTH-1:0] __in_address_reg [NUM_LANES-1:0]; - reg [NUM_THREADS-1:0] __in_is_store_reg; - reg [`MASK_WIDTH-1:0] __in_store_mask_reg [NUM_THREADS-1:0]; - reg [`DATA_WIDTH-1:0] __in_data_reg [NUM_THREADS-1:0]; + reg [NUM_LANES-1:0] __in_is_store_reg; + reg [`MASK_WIDTH-1:0] __in_store_mask_reg [NUM_LANES-1:0]; + reg [`DATA_WIDTH-1:0] __in_data_reg [NUM_LANES-1:0]; reg __in_finished_reg; genvar g; generate - for (g = 0; g < NUM_THREADS; g = g + 1) begin + for (g = 0; g < NUM_LANES; g = g + 1) begin assign trace_read_valid[g] = __in_valid_reg[g]; assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_reg[g]; @@ -86,7 +86,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) ( // Setting reset value if (reset) begin - for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin __in_valid[tid] = 1'b0; __in_address[tid] = `DATA_WIDTH'b0; @@ -100,7 +100,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) ( cycle_counter <= `DATA_WIDTH'b0; // setting default value for register to avoid latches - for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin __in_valid_reg[tid] <= 1'b0; __in_address_reg[tid] <= `DATA_WIDTH'b0; @@ -114,7 +114,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) ( cycle_counter <= next_cycle_counter; // Getting values from C function into pseudeo register - for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin memtrace_query( trace_read_ready, // Since parsed results are latched to the output on the next @@ -135,7 +135,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) ( end // Connect values from pseudo register into verilog register - for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin __in_valid_reg[tid] <= __in_valid[tid]; __in_address_reg[tid] <= __in_address[tid]; diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 9e9f1e6..43fceaa 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -16,7 +16,7 @@ class CoalRegEntry(val sourceWidth: Int, val addressWidth: Int) extends Bundle { val data = UInt(64.W /* FIXME hardcoded */ ) } -class CoalescingUnit(numThreads: Int = 1)(implicit p: Parameters) +class CoalescingUnit(numLanes: Int = 1)(implicit p: Parameters) extends LazyModule { // val beatBytes = 8 // val seqParam = Seq( @@ -63,7 +63,7 @@ class CoalescingUnit(numThreads: Int = 1)(implicit p: Parameters) val sourceWidth = node.in(0)._1.params.sourceBits val addressWidth = node.in(0)._1.params.addressBits val coalRegEntry = new CoalRegEntry(sourceWidth, addressWidth) - val fifos = Seq.tabulate(numThreads) { _ => + val fifos = Seq.tabulate(numLanes) { _ => Module( new ShiftQueue(coalRegEntry, 4 /* FIXME hardcoded */ ) ) @@ -108,18 +108,33 @@ class CoalescingUnit(numThreads: Int = 1)(implicit p: Parameters) dontTouch(tlOut.d) } - // val (tlIn, edgeIn) = coalescerNode.in(0) - // tlIn.d.bits.data := 0.U + val (tlCoal, edgeCoal) = coalescerNode.out(0) - val (tlCoal, _) = coalescerNode.out(0) - dontTouch(tlCoal.a) + // FIXME: currently generating bogus coalesced requests + tlCoal.a.valid := true.B + tlCoal.a.bits := edgeCoal + .Get( + fromSource = 0.U, + // `toAddress` should be aligned to 2**lgSize + toAddress = 0xabcd00.U, + // 64 bits = 8 bytes = 2**(3) bytes + lgSize = 3.U + ) + ._2 + + val coalRespValid = Wire(Bool()) + coalRespValid := tlCoal.a.valid + val coalRespData = Wire(UInt(tlCoal.params.dataBits.W)) + coalRespData := tlCoal.d.bits.data + dontTouch(coalRespValid) + dontTouch(coalRespData) } } -class MemTraceDriver(numThreads: Int = 1)(implicit p: Parameters) +class MemTraceDriver(numLanes: Int = 1)(implicit p: Parameters) extends LazyModule { // Create N client nodes together - val threadNodes = Seq.tabulate(numThreads) { i => + val laneNodes = Seq.tabulate(numLanes) { i => val clientParam = Seq( TLMasterParameters.v1( name = "MemTraceDriver" + i.toString, @@ -133,11 +148,9 @@ class MemTraceDriver(numThreads: Int = 1)(implicit p: Parameters) // Combine N outgoing client node into 1 idenity node for diplomatic // connection. val node = TLIdentityNode() - threadNodes.foreach { threadNode => - node := threadNode - } + laneNodes.foreach { l => node := l } - lazy val module = new MemTraceDriverImp(this, numThreads) + lazy val module = new MemTraceDriverImp(this, numLanes) } class TraceReq extends Bundle { @@ -148,22 +161,22 @@ class TraceReq extends Bundle { val data = UInt(64.W) } -class MemTraceDriverImp(outer: MemTraceDriver, numThreads: Int) +class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int) extends LazyModuleImp(outer) with UnitTestModule { val sim = Module( - new SimMemTrace(filename = "vecadd.core1.thread4.trace", numThreads) + new SimMemTrace(filename = "vecadd.core1.thread4.trace", numLanes) ) sim.io.clock := clock sim.io.reset := reset.asBool sim.io.trace_read.ready := true.B - // Split output of SimMemTrace, which is flattened across all threads, - // back to each thread's. + // Split output of SimMemTrace, which is flattened across all lanes, + // back to each lane's. // Maybe this part can be improved, since now we are still mannually shifting everything - val threadReqs = Wire(Vec(numThreads, new TraceReq)) - threadReqs.zipWithIndex.foreach { case (req, i) => + val laneReqs = Wire(Vec(numLanes, new TraceReq)) + laneReqs.zipWithIndex.foreach { case (req, i) => req.valid := (sim.io.trace_read.valid >> i) req.address := (sim.io.trace_read.address >> (64 * i)) req.is_store := (sim.io.trace_read.is_store >> i) @@ -178,8 +191,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, numThreads: Int) val sourceIdCounter = Reg(UInt(64.W)) sourceIdCounter := sourceIdCounter + 1.U - // Connect each thread to its respective TL node. - (outer.threadNodes zip threadReqs).foreach { case (node, req) => + // Connect each lane to its respective TL node. + (outer.laneNodes zip laneReqs).foreach { case (node, req) => val (tlOut, edge) = node.out(0) tlOut.a.valid := req.valid @@ -222,9 +235,9 @@ class MemTraceDriverImp(outer: MemTraceDriver, numThreads: Int) dontTouch(clkcount) } -class SimMemTrace(val filename: String, numThreads: Int) +class SimMemTrace(val filename: String, numLanes: Int) extends BlackBox( - Map("FILENAME" -> filename, "NUM_THREADS" -> numThreads) + Map("FILENAME" -> filename, "NUM_LANES" -> numLanes) ) with HasBlackBoxResource { val io = IO(new Bundle { @@ -235,14 +248,14 @@ class SimMemTrace(val filename: String, numThreads: Int) // trace_read_address. val trace_read = new Bundle { val ready = Input(Bool()) - val valid = Output(UInt(numThreads.W)) + val valid = Output(UInt(numLanes.W)) // Chisel can't interface with Verilog 2D port, so flatten all lanes into // single wide 1D array. // TODO: assumes 64-bit address. - val address = Output(UInt((64 * numThreads).W)) - val is_store = Output(UInt(numThreads.W)) - val store_mask = Output(UInt((8 * numThreads).W)) - val data = Output(UInt((64 * numThreads).W)) + val address = Output(UInt((64 * numLanes).W)) + val is_store = Output(UInt(numLanes.W)) + val store_mask = Output(UInt((8 * numLanes).W)) + val data = Output(UInt((64 * numLanes).W)) val finished = Output(Bool()) } }) @@ -253,16 +266,16 @@ class SimMemTrace(val filename: String, numThreads: Int) } class CoalConnectTrace(implicit p: Parameters) extends LazyModule { - // TODO: use parameters for numThreads - val numThreads = 4 - val coal = LazyModule(new CoalescingUnit(numThreads)) - val driver = LazyModule(new MemTraceDriver(numThreads)) + // TODO: use parameters for numLanes + val numLanes = 4 + val coal = LazyModule(new CoalescingUnit(numLanes)) + val driver = LazyModule(new MemTraceDriver(numLanes)) coal.node :=* driver.node // Use TLTestRAM as bogus downstream TL manager nodes // TODO: swap this out with a memtrace logger - val rams = Seq.tabulate(numThreads + 1) { _ => + val rams = Seq.tabulate(numLanes + 1) { _ => LazyModule( // TODO: properly propagate beatBytes? new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8)