Thread -> Lane

"thread" is confusing, unify to lane when denoting a hardware SIMD lane
inside a single warp.
This commit is contained in:
Hansung Kim
2023-03-09 22:09:07 -08:00
parent a495149869
commit 9bfb813e1b
4 changed files with 78 additions and 65 deletions

View File

@@ -35,7 +35,7 @@ void MemTraceReader::parse() {
printf("MemTraceReader: started parsing\n"); printf("MemTraceReader: started parsing\n");
while (infile >> line.cycle >> line.loadstore >> line.core_id >> while (infile >> line.cycle >> line.loadstore >> line.core_id >>
line.thread_id >> std::hex >> line.address >> line.data >> std::dec >> line.lane_id >> std::hex >> line.address >> line.data >> std::dec >>
line.data_size) { line.data_size) {
line.valid = true; line.valid = true;
trace.push_back(line); trace.push_back(line);
@@ -49,7 +49,7 @@ void MemTraceReader::parse() {
// given SIMD lane (= "thread"). In case no request happened at that point, // given SIMD lane (= "thread"). In case no request happened at that point,
// return an empty line with .valid = false. // return an empty line with .valid = false.
MemTraceLine MemTraceReader::read_trace_at(const long cycle, MemTraceLine MemTraceReader::read_trace_at(const long cycle,
const int thread_id) { const int lane_id) {
MemTraceLine line; MemTraceLine line;
line.valid = false; line.valid = false;
@@ -67,17 +67,17 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle,
assert(false && "some trace lines are left unread in the past"); assert(false && "some trace lines are left unread in the past");
} }
if (line.thread_id != thread_id) { if (line.lane_id != lane_id) {
line.valid = false; line.valid = false;
} }
if (line.cycle > cycle) { if (line.cycle > cycle) {
// We haven't reached the cycle mark specified in this line yet, so we don't // We haven't reached the cycle mark specified in this line yet, so we don't
// read it right now. // read it right now.
return MemTraceLine{}; return MemTraceLine{};
} else if (line.cycle == cycle && line.thread_id == thread_id) { } else if (line.cycle == cycle && line.lane_id == lane_id) {
printf("fire! cycle=%ld, valid=%d, %s \n", cycle, line.valid, line.loadstore); printf("fire! cycle=%ld, valid=%d, %s \n", cycle, line.valid, line.loadstore);
// FIXME! Currently thread_id is assumed to be in round-robin order, e.g. // FIXME! Currently lane_id is assumed to be in round-robin order, e.g.
// 0->1->2->3->0->..., both in the trace file and the order the caller calls // 0->1->2->3->0->..., both in the trace file and the order the caller calls
// this function. If this is not true, we cannot simply monotonically // this function. If this is not true, we cannot simply monotonically
// increment read_pos. // increment read_pos.
@@ -101,7 +101,7 @@ extern "C" void memtrace_init(const char *filename) {
// TODO: accept core_id as well // TODO: accept core_id as well
extern "C" void memtrace_query(unsigned char trace_read_ready, extern "C" void memtrace_query(unsigned char trace_read_ready,
unsigned long trace_read_cycle, unsigned long trace_read_cycle,
int trace_read_thread_id, int trace_read_lane_id,
unsigned char *trace_read_valid, unsigned char *trace_read_valid,
unsigned long *trace_read_address, unsigned long *trace_read_address,
unsigned char *trace_read_is_store, unsigned char *trace_read_is_store,
@@ -109,13 +109,13 @@ extern "C" void memtrace_query(unsigned char trace_read_ready,
unsigned long *trace_read_data, unsigned long *trace_read_data,
unsigned char *trace_read_finished) { unsigned char *trace_read_finished) {
// printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle,
// trace_read_thread_id); // trace_read_lane_id);
if (!trace_read_ready) { if (!trace_read_ready) {
return; return;
} }
auto line = reader->read_trace_at(trace_read_cycle, trace_read_thread_id); auto line = reader->read_trace_at(trace_read_cycle, trace_read_lane_id);
*trace_read_valid = line.valid; *trace_read_valid = line.valid;
*trace_read_address = line.address; *trace_read_address = line.address;
*trace_read_is_store = strcmp(line.loadstore, "STORE") == 0 ; *trace_read_is_store = strcmp(line.loadstore, "STORE") == 0 ;

View File

@@ -12,7 +12,7 @@ struct MemTraceLine {
long cycle = 0; long cycle = 0;
char loadstore[10]; char loadstore[10];
int core_id = 0; int core_id = 0;
int thread_id = 0; int lane_id = 0;
unsigned long address = 0; unsigned long address = 0;
unsigned long data = 0; unsigned long data = 0;
int data_size = 0; int data_size = 0;
@@ -23,7 +23,7 @@ public:
MemTraceReader(const std::string &filename); MemTraceReader(const std::string &filename);
~MemTraceReader(); ~MemTraceReader();
void parse(); void parse();
MemTraceLine read_trace_at(const long cycle, const int thread_id); MemTraceLine read_trace_at(const long cycle, const int lane_id);
bool finished() const { return read_pos == trace.cend(); } bool finished() const { return read_pos == trace.cend(); }
std::ifstream infile; std::ifstream infile;
@@ -34,7 +34,7 @@ public:
extern "C" void memtrace_init(const char *filename); extern "C" void memtrace_init(const char *filename);
extern "C" void memtrace_query(unsigned char trace_read_ready, extern "C" void memtrace_query(unsigned char trace_read_ready,
unsigned long trace_read_cycle, unsigned long trace_read_cycle,
int trace_read_thread_id, int trace_read_lane_id,
unsigned char *trace_read_valid, unsigned char *trace_read_valid,
unsigned long *trace_read_address, unsigned long *trace_read_address,
unsigned char *trace_read_is_store, unsigned char *trace_read_is_store,

View File

@@ -1,5 +1,5 @@
`define DATA_WIDTH 64 `define DATA_WIDTH 64
`define MAX_NUM_THREADS 32 `define MAX_NUM_LANES 32
`define MASK_WIDTH 8 `define MASK_WIDTH 8
import "DPI-C" function void memtrace_init( import "DPI-C" function void memtrace_init(
@@ -23,26 +23,26 @@ import "DPI-C" function void memtrace_query
output bit trace_read_finished output bit trace_read_finished
); );
module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) ( module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) (
input clock, input clock,
input reset, input reset,
// These have to match the IO port of the Chisel wrapper module. // These have to match the IO port of the Chisel wrapper module.
input trace_read_ready, input trace_read_ready,
output [NUM_THREADS-1:0] trace_read_valid, output [NUM_LANES-1:0] trace_read_valid,
output [`DATA_WIDTH*NUM_THREADS-1:0] trace_read_address, output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_address,
output [NUM_THREADS-1:0] trace_read_is_store, output [NUM_LANES-1:0] trace_read_is_store,
output [NUM_THREADS*`MASK_WIDTH-1:0] trace_read_store_mask, output [NUM_LANES*`MASK_WIDTH-1:0] trace_read_store_mask,
output [`DATA_WIDTH*NUM_THREADS-1:0] trace_read_data, output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data,
output trace_read_finished output trace_read_finished
); );
bit __in_valid[NUM_THREADS-1:0]; bit __in_valid[NUM_LANES-1:0];
longint __in_address[NUM_THREADS-1:0]; longint __in_address[NUM_LANES-1:0];
bit __in_is_store[NUM_THREADS-1:0]; bit __in_is_store[NUM_LANES-1:0];
int __in_store_mask [NUM_THREADS-1:0]; int __in_store_mask [NUM_LANES-1:0];
longint __in_data[NUM_THREADS-1:0]; longint __in_data[NUM_LANES-1:0];
bit __in_finished; bit __in_finished;
string __uartlog; string __uartlog;
@@ -54,18 +54,18 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) (
assign next_cycle_counter = cycle_counter + 1'b1; assign next_cycle_counter = cycle_counter + 1'b1;
// registers that stage outputs of the C parser // registers that stage outputs of the C parser
reg [NUM_THREADS-1:0] __in_valid_reg; reg [NUM_LANES-1:0] __in_valid_reg;
reg [`DATA_WIDTH-1:0] __in_address_reg [NUM_THREADS-1:0]; reg [`DATA_WIDTH-1:0] __in_address_reg [NUM_LANES-1:0];
reg [NUM_THREADS-1:0] __in_is_store_reg; reg [NUM_LANES-1:0] __in_is_store_reg;
reg [`MASK_WIDTH-1:0] __in_store_mask_reg [NUM_THREADS-1:0]; reg [`MASK_WIDTH-1:0] __in_store_mask_reg [NUM_LANES-1:0];
reg [`DATA_WIDTH-1:0] __in_data_reg [NUM_THREADS-1:0]; reg [`DATA_WIDTH-1:0] __in_data_reg [NUM_LANES-1:0];
reg __in_finished_reg; reg __in_finished_reg;
genvar g; genvar g;
generate generate
for (g = 0; g < NUM_THREADS; g = g + 1) begin for (g = 0; g < NUM_LANES; g = g + 1) begin
assign trace_read_valid[g] = __in_valid_reg[g]; assign trace_read_valid[g] = __in_valid_reg[g];
assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_reg[g]; assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_reg[g];
@@ -86,7 +86,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) (
// Setting reset value // Setting reset value
if (reset) begin if (reset) begin
for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
__in_valid[tid] = 1'b0; __in_valid[tid] = 1'b0;
__in_address[tid] = `DATA_WIDTH'b0; __in_address[tid] = `DATA_WIDTH'b0;
@@ -100,7 +100,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) (
cycle_counter <= `DATA_WIDTH'b0; cycle_counter <= `DATA_WIDTH'b0;
// setting default value for register to avoid latches // setting default value for register to avoid latches
for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
__in_valid_reg[tid] <= 1'b0; __in_valid_reg[tid] <= 1'b0;
__in_address_reg[tid] <= `DATA_WIDTH'b0; __in_address_reg[tid] <= `DATA_WIDTH'b0;
@@ -114,7 +114,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) (
cycle_counter <= next_cycle_counter; cycle_counter <= next_cycle_counter;
// Getting values from C function into pseudo register // Getting values from C function into pseudo register
for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
memtrace_query( memtrace_query(
trace_read_ready, trace_read_ready,
// Since parsed results are latched to the output on the next // Since parsed results are latched to the output on the next
@@ -135,7 +135,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) (
end end
// Connect values from pseudo register into verilog register // Connect values from pseudo register into verilog register
for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
__in_valid_reg[tid] <= __in_valid[tid]; __in_valid_reg[tid] <= __in_valid[tid];
__in_address_reg[tid] <= __in_address[tid]; __in_address_reg[tid] <= __in_address[tid];

View File

@@ -16,7 +16,7 @@ class CoalRegEntry(val sourceWidth: Int, val addressWidth: Int) extends Bundle {
val data = UInt(64.W /* FIXME hardcoded */ ) val data = UInt(64.W /* FIXME hardcoded */ )
} }
class CoalescingUnit(numThreads: Int = 1)(implicit p: Parameters) class CoalescingUnit(numLanes: Int = 1)(implicit p: Parameters)
extends LazyModule { extends LazyModule {
// val beatBytes = 8 // val beatBytes = 8
// val seqParam = Seq( // val seqParam = Seq(
@@ -63,7 +63,7 @@ class CoalescingUnit(numThreads: Int = 1)(implicit p: Parameters)
val sourceWidth = node.in(0)._1.params.sourceBits val sourceWidth = node.in(0)._1.params.sourceBits
val addressWidth = node.in(0)._1.params.addressBits val addressWidth = node.in(0)._1.params.addressBits
val coalRegEntry = new CoalRegEntry(sourceWidth, addressWidth) val coalRegEntry = new CoalRegEntry(sourceWidth, addressWidth)
val fifos = Seq.tabulate(numThreads) { _ => val fifos = Seq.tabulate(numLanes) { _ =>
Module( Module(
new ShiftQueue(coalRegEntry, 4 /* FIXME hardcoded */ ) new ShiftQueue(coalRegEntry, 4 /* FIXME hardcoded */ )
) )
@@ -108,18 +108,33 @@ class CoalescingUnit(numThreads: Int = 1)(implicit p: Parameters)
dontTouch(tlOut.d) dontTouch(tlOut.d)
} }
// val (tlIn, edgeIn) = coalescerNode.in(0) val (tlCoal, edgeCoal) = coalescerNode.out(0)
// tlIn.d.bits.data := 0.U
val (tlCoal, _) = coalescerNode.out(0) // FIXME: currently generating bogus coalesced requests
dontTouch(tlCoal.a) tlCoal.a.valid := true.B
tlCoal.a.bits := edgeCoal
.Get(
fromSource = 0.U,
// `toAddress` should be aligned to 2**lgSize
toAddress = 0xabcd00.U,
// 64 bits = 8 bytes = 2**(3) bytes
lgSize = 3.U
)
._2
val coalRespValid = Wire(Bool())
coalRespValid := tlCoal.a.valid
val coalRespData = Wire(UInt(tlCoal.params.dataBits.W))
coalRespData := tlCoal.d.bits.data
dontTouch(coalRespValid)
dontTouch(coalRespData)
} }
} }
class MemTraceDriver(numThreads: Int = 1)(implicit p: Parameters) class MemTraceDriver(numLanes: Int = 1)(implicit p: Parameters)
extends LazyModule { extends LazyModule {
// Create N client nodes together // Create N client nodes together
val threadNodes = Seq.tabulate(numThreads) { i => val laneNodes = Seq.tabulate(numLanes) { i =>
val clientParam = Seq( val clientParam = Seq(
TLMasterParameters.v1( TLMasterParameters.v1(
name = "MemTraceDriver" + i.toString, name = "MemTraceDriver" + i.toString,
@@ -133,11 +148,9 @@ class MemTraceDriver(numThreads: Int = 1)(implicit p: Parameters)
// Combine N outgoing client node into 1 identity node for diplomatic // Combine N outgoing client node into 1 identity node for diplomatic
// connection. // connection.
val node = TLIdentityNode() val node = TLIdentityNode()
threadNodes.foreach { threadNode => laneNodes.foreach { l => node := l }
node := threadNode
}
lazy val module = new MemTraceDriverImp(this, numThreads) lazy val module = new MemTraceDriverImp(this, numLanes)
} }
class TraceReq extends Bundle { class TraceReq extends Bundle {
@@ -148,22 +161,22 @@ class TraceReq extends Bundle {
val data = UInt(64.W) val data = UInt(64.W)
} }
class MemTraceDriverImp(outer: MemTraceDriver, numThreads: Int) class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int)
extends LazyModuleImp(outer) extends LazyModuleImp(outer)
with UnitTestModule { with UnitTestModule {
val sim = Module( val sim = Module(
new SimMemTrace(filename = "vecadd.core1.thread4.trace", numThreads) new SimMemTrace(filename = "vecadd.core1.thread4.trace", numLanes)
) )
sim.io.clock := clock sim.io.clock := clock
sim.io.reset := reset.asBool sim.io.reset := reset.asBool
sim.io.trace_read.ready := true.B sim.io.trace_read.ready := true.B
// Split output of SimMemTrace, which is flattened across all threads, // Split output of SimMemTrace, which is flattened across all lanes,
// back to each thread's. // back to each lane's.
// Maybe this part can be improved, since now we are still manually shifting everything // Maybe this part can be improved, since now we are still manually shifting everything
val threadReqs = Wire(Vec(numThreads, new TraceReq)) val laneReqs = Wire(Vec(numLanes, new TraceReq))
threadReqs.zipWithIndex.foreach { case (req, i) => laneReqs.zipWithIndex.foreach { case (req, i) =>
req.valid := (sim.io.trace_read.valid >> i) req.valid := (sim.io.trace_read.valid >> i)
req.address := (sim.io.trace_read.address >> (64 * i)) req.address := (sim.io.trace_read.address >> (64 * i))
req.is_store := (sim.io.trace_read.is_store >> i) req.is_store := (sim.io.trace_read.is_store >> i)
@@ -178,8 +191,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, numThreads: Int)
val sourceIdCounter = Reg(UInt(64.W)) val sourceIdCounter = Reg(UInt(64.W))
sourceIdCounter := sourceIdCounter + 1.U sourceIdCounter := sourceIdCounter + 1.U
// Connect each thread to its respective TL node. // Connect each lane to its respective TL node.
(outer.threadNodes zip threadReqs).foreach { case (node, req) => (outer.laneNodes zip laneReqs).foreach { case (node, req) =>
val (tlOut, edge) = node.out(0) val (tlOut, edge) = node.out(0)
tlOut.a.valid := req.valid tlOut.a.valid := req.valid
@@ -222,9 +235,9 @@ class MemTraceDriverImp(outer: MemTraceDriver, numThreads: Int)
dontTouch(clkcount) dontTouch(clkcount)
} }
class SimMemTrace(val filename: String, numThreads: Int) class SimMemTrace(val filename: String, numLanes: Int)
extends BlackBox( extends BlackBox(
Map("FILENAME" -> filename, "NUM_THREADS" -> numThreads) Map("FILENAME" -> filename, "NUM_LANES" -> numLanes)
) )
with HasBlackBoxResource { with HasBlackBoxResource {
val io = IO(new Bundle { val io = IO(new Bundle {
@@ -235,14 +248,14 @@ class SimMemTrace(val filename: String, numThreads: Int)
// trace_read_address. // trace_read_address.
val trace_read = new Bundle { val trace_read = new Bundle {
val ready = Input(Bool()) val ready = Input(Bool())
val valid = Output(UInt(numThreads.W)) val valid = Output(UInt(numLanes.W))
// Chisel can't interface with Verilog 2D port, so flatten all lanes into // Chisel can't interface with Verilog 2D port, so flatten all lanes into
// single wide 1D array. // single wide 1D array.
// TODO: assumes 64-bit address. // TODO: assumes 64-bit address.
val address = Output(UInt((64 * numThreads).W)) val address = Output(UInt((64 * numLanes).W))
val is_store = Output(UInt(numThreads.W)) val is_store = Output(UInt(numLanes.W))
val store_mask = Output(UInt((8 * numThreads).W)) val store_mask = Output(UInt((8 * numLanes).W))
val data = Output(UInt((64 * numThreads).W)) val data = Output(UInt((64 * numLanes).W))
val finished = Output(Bool()) val finished = Output(Bool())
} }
}) })
@@ -253,16 +266,16 @@ class SimMemTrace(val filename: String, numThreads: Int)
} }
class CoalConnectTrace(implicit p: Parameters) extends LazyModule { class CoalConnectTrace(implicit p: Parameters) extends LazyModule {
// TODO: use parameters for numThreads // TODO: use parameters for numLanes
val numThreads = 4 val numLanes = 4
val coal = LazyModule(new CoalescingUnit(numThreads)) val coal = LazyModule(new CoalescingUnit(numLanes))
val driver = LazyModule(new MemTraceDriver(numThreads)) val driver = LazyModule(new MemTraceDriver(numLanes))
coal.node :=* driver.node coal.node :=* driver.node
// Use TLTestRAM as bogus downstream TL manager nodes // Use TLTestRAM as bogus downstream TL manager nodes
// TODO: swap this out with a memtrace logger // TODO: swap this out with a memtrace logger
val rams = Seq.tabulate(numThreads + 1) { _ => val rams = Seq.tabulate(numLanes + 1) { _ =>
LazyModule( LazyModule(
// TODO: properly propagate beatBytes? // TODO: properly propagate beatBytes?
new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8) new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8)