tensor: Fix out-of-sync enqueue to dpu and metadata queue
This commit is contained in:
@@ -77,7 +77,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
|
||||
// octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16
|
||||
// FIXME: not sure this is the right logic. just filling in what works
|
||||
localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS);
|
||||
localparam METADATA_QUEUE_DEPTH = 4;
|
||||
// this is only a rule of thumb
|
||||
localparam METADATA_QUEUE_DEPTH = `LATENCY_HMMA;
|
||||
|
||||
wire [1:0] step = 2'(execute_if.data.op_type);
|
||||
wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1));
|
||||
@@ -89,7 +90,11 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_0;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1;
|
||||
wire [`NW_WIDTH-1:0] wb_wid;
|
||||
|
||||
|
||||
// valid signal synced between the functional units (octet) and the
|
||||
// metadata queue
|
||||
wire operands_valid_synced;
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
for (genvar i = 0; i < NUM_OCTETS; ++i) begin
|
||||
`else
|
||||
@@ -121,7 +126,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
|
||||
.A_in(octet_A),
|
||||
.B_in(octet_B),
|
||||
.C_in(octet_C),
|
||||
.operands_valid(execute_if.valid),
|
||||
.operands_valid(operands_valid_synced),
|
||||
.operands_wid(execute_if.data.wid),
|
||||
.operands_last_in_pair(last_in_pair),
|
||||
.operands_step(step),
|
||||
@@ -172,8 +177,10 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
|
||||
|
||||
wire execute_if_fire = execute_if.valid && execute_if.ready;
|
||||
wire commit_if_fire = commit_if.valid && commit_if.ready;
|
||||
wire commit_if_ready_override;
|
||||
|
||||
wire operand_enq_fire = operands_valid_synced && execute_if.ready;
|
||||
wire commit_if_fire = commit_if.valid && commit_if_ready_override;
|
||||
wire [DATAW-1:0] execute_if_data_enq = {
|
||||
execute_if.data.uuid,
|
||||
execute_if.data.wid,
|
||||
@@ -184,31 +191,14 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
|
||||
// pid/sop/eop set later
|
||||
};
|
||||
|
||||
// wire [DATAW-1:0] execute_if_data_deq;
|
||||
|
||||
// VX_fifo_queue #(
|
||||
// .DATAW(DATAW),
|
||||
// .DEPTH(4 /* FIXME: arbitrary */)
|
||||
// ) pending_uops (
|
||||
// .clk(clk),
|
||||
// .reset(reset),
|
||||
// .push(execute_if_fire),
|
||||
// .pop(commit_if_fire),
|
||||
// .data_in(execute_if_data_enq),
|
||||
// .data_out(execute_if_data_deq),
|
||||
// `UNUSED_PIN(empty),
|
||||
// `UNUSED_PIN(alm_empty),
|
||||
// `UNUSED_PIN(full), // should be impossible to overflow
|
||||
// `UNUSED_PIN(alm_full),
|
||||
// `UNUSED_PIN(size)
|
||||
// );
|
||||
|
||||
wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
|
||||
|
||||
wire [`NUM_WARPS-1:0] metadata_queue_fulls;
|
||||
// OR not AND, we don't want any warp full
|
||||
wire metadata_queue_full = |(metadata_queue_fulls);
|
||||
|
||||
// need to make sure both metadata and octet issue queues are in sync
|
||||
assign operands_valid_synced = execute_if.valid && !metadata_queue_full;
|
||||
assign execute_if.ready = &(octet_operands_ready) && !metadata_queue_full;
|
||||
|
||||
for (genvar i = 0; i < `NUM_WARPS; i++) begin
|
||||
@@ -220,8 +210,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
|
||||
// ensure two consecutive dequeues are associated with the same warp for
|
||||
// commit. (FIXME: this is not strictly necessary though.)
|
||||
|
||||
wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i));
|
||||
wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i));
|
||||
wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i));
|
||||
wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i));
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW(DATAW),
|
||||
@@ -253,8 +243,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
|
||||
wire all_valid = (& octet_results_valid);
|
||||
|
||||
// define this to inject artificial commit backpressure for debugging
|
||||
`define INJECT_COMMIT_BACKPRESSURE
|
||||
`ifndef INJECT_COMMIT_BACKPRESSURE
|
||||
// `define TENSOR_INJECT_COMMIT_BACKPRESSURE
|
||||
`ifndef TENSOR_INJECT_COMMIT_BACKPRESSURE
|
||||
assign commit_if.valid = all_valid;
|
||||
assign commit_if_ready_override = commit_if.ready;
|
||||
`else
|
||||
@@ -358,47 +348,6 @@ module VX_tensor_octet #(
|
||||
wire operands_last_in_pair_buf;
|
||||
wire [1:0] operands_step_buf;
|
||||
|
||||
// wire inbuf_empty;
|
||||
// wire inbuf_full;
|
||||
// wire inbuf_ready_in;
|
||||
// assign inbuf_ready_in = !inbuf_full;
|
||||
// assign operands_ready = inbuf_ready_in;
|
||||
// assign operands_valid_buf = !inbuf_empty;
|
||||
|
||||
// // wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair;
|
||||
// wire inbuf_enq = operands_ready && operands_valid;
|
||||
// wire inbuf_deq = operands_valid_buf && operands_ready_buf;
|
||||
|
||||
// // the 'issue queue' for the dpu.
|
||||
// // This exists to decouple the input of the dot-product unit from
|
||||
// // execute_if.ready. execute_if can arrive intermittently according to
|
||||
// // the frontend's behavior, and since the dpu can also stall for a fixed
|
||||
// // initiation latency, we need to decouple the two to efficiently feed the
|
||||
// // dpu.
|
||||
// // This only applies to the last instruction in a pair, since the first
|
||||
// // instruction only acts to buffer the operands and can execute
|
||||
// // immediately without backpressure. So we don't enqueue them.
|
||||
// VX_fifo_queue #(
|
||||
// .DATAW ($bits(A_in) + $bits(B_in) + $bits(C_in) +
|
||||
// $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)),
|
||||
// .DEPTH (ISSUE_QUEUE_DEPTH)
|
||||
// ) input_buffer (
|
||||
// .clk (clk),
|
||||
// .reset (reset),
|
||||
// .push (inbuf_enq),
|
||||
// .pop (inbuf_deq),
|
||||
// .data_in ({A_in, B_in, C_in, operands_wid, operands_step, operands_last_in_pair}),
|
||||
// .data_out ({A_in_buf, B_in_buf, C_in_buf, operands_wid_buf, operands_step_buf, operands_last_in_pair_buf}),
|
||||
// .empty (inbuf_empty),
|
||||
// `UNUSED_PIN(alm_empty),
|
||||
// .full (inbuf_full),
|
||||
// `UNUSED_PIN(alm_full),
|
||||
// `UNUSED_PIN(size)
|
||||
// );
|
||||
|
||||
// // FIXME: this shouldn't be necessary
|
||||
// `RUNTIME_ASSERT(reset || !inbuf_full, ("dpu issue queue is full!"))
|
||||
|
||||
assign A_in_buf = A_in;
|
||||
assign B_in_buf = B_in;
|
||||
assign C_in_buf = C_in;
|
||||
@@ -521,7 +470,8 @@ module VX_tensor_octet #(
|
||||
// this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet
|
||||
VX_tensor_dpu #(
|
||||
.ISW(ISW),
|
||||
.OCTET(OCTET)
|
||||
.OCTET(OCTET),
|
||||
.ISSUE_QUEUE_DEPTH(2)
|
||||
) dpu (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
@@ -556,7 +506,7 @@ module VX_tensor_octet #(
|
||||
// TODO: This is probably oversized.
|
||||
VX_fifo_queue #(
|
||||
.DATAW ($bits(D_wid) + $bits(D_out)),
|
||||
.DEPTH (`LATENCY_HMMA)
|
||||
.DEPTH (2 /*`LATENCY_HMMA*/)
|
||||
) output_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
|
||||
module VX_tensor_dpu #(
|
||||
parameter ISW,
|
||||
parameter OCTET
|
||||
parameter OCTET,
|
||||
parameter ISSUE_QUEUE_DEPTH = `LATENCY_HMMA
|
||||
) (
|
||||
input clk,
|
||||
input reset,
|
||||
@@ -62,6 +63,7 @@ module VX_tensor_dpu #(
|
||||
logic [1:0] threadgroup_readys;
|
||||
// B_tile is shared across the two threadgroups; see Figure 13
|
||||
VX_tensor_threadgroup #(
|
||||
.ISSUE_QUEUE_DEPTH(ISSUE_QUEUE_DEPTH)
|
||||
) threadgroup_0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
@@ -75,6 +77,7 @@ module VX_tensor_dpu #(
|
||||
.D_frag (D_tile[1:0])
|
||||
);
|
||||
VX_tensor_threadgroup #(
|
||||
.ISSUE_QUEUE_DEPTH(ISSUE_QUEUE_DEPTH)
|
||||
) threadgroup_1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
@@ -99,7 +102,7 @@ module VX_tensor_dpu #(
|
||||
// need to pass along warp id's to do multithreading
|
||||
VX_fifo_queue #(
|
||||
.DATAW ($bits(wid)),
|
||||
.DEPTH (`LATENCY_HMMA + `LATENCY_HMMA)
|
||||
.DEPTH (ISSUE_QUEUE_DEPTH)
|
||||
) wid_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
@@ -121,6 +124,7 @@ endmodule
|
||||
// does (m,n,k) = (2,4,2) matmul compute over 2 cycles.
|
||||
// matches Figure 10(b) of the paper.
|
||||
module VX_tensor_threadgroup #(
|
||||
parameter ISSUE_QUEUE_DEPTH
|
||||
) (
|
||||
input clk,
|
||||
input reset,
|
||||
@@ -149,9 +153,18 @@ module VX_tensor_threadgroup #(
|
||||
assign ready_in = !full;
|
||||
assign valid_buf = !empty;
|
||||
|
||||
// 'Issue queue' for the FEDP units.
|
||||
// This exists to decouple the execution of the dot-product unit from
|
||||
// the operand arrival. Operands from execute_if can arrive
|
||||
// intermittently according to the frontend's behavior, and since the dpu
|
||||
// can also stall for a fixed initiation latency, we need to decouple the
|
||||
// two to efficiently feed the dpu.
|
||||
//
|
||||
// TODO: better queue design possible; e.g. B_frag is shared by two
|
||||
// threadgroups, so we need only 1 queue per octet for B
|
||||
VX_fifo_queue #(
|
||||
.DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)),
|
||||
.DEPTH (4)
|
||||
.DEPTH (ISSUE_QUEUE_DEPTH)
|
||||
) input_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
Reference in New Issue
Block a user