tensor: Parameterize result buffer depth

This commit is contained in:
Hansung Kim
2024-07-25 16:31:45 -07:00
parent f3afd4a6f9
commit 7f43bab0aa
2 changed files with 47 additions and 32 deletions

View File

@@ -298,7 +298,11 @@ endmodule
module VX_tensor_octet #( module VX_tensor_octet #(
parameter ISW, parameter ISW,
parameter OCTET parameter OCTET,
// RESULT_BUFFER_DEPTH = 2 gives good performance by absorbing commit
// backpressure (result_ready), although the value is arbitrary.
// RESULT_BUFFER_DEPTH = 0 eliminates result buffering.
parameter RESULT_BUFFER_DEPTH = 2
) ( ) (
input clk, input clk,
input reset, input reset,
@@ -488,6 +492,7 @@ module VX_tensor_octet #(
.D_wid(D_wid_dpu) .D_wid(D_wid_dpu)
); );
if (RESULT_BUFFER_DEPTH > 0) begin
wire outbuf_empty; wire outbuf_empty;
wire outbuf_full; wire outbuf_full;
// backpressure from commit // backpressure from commit
@@ -497,14 +502,14 @@ module VX_tensor_octet #(
wire outbuf_enq = outbuf_ready_in && dpu_valid; wire outbuf_enq = outbuf_ready_in && dpu_valid;
wire outbuf_deq = result_valid && result_ready; wire outbuf_deq = result_valid && result_ready;
// result buffer to stage the D tile for 2 cycles until commit/writeback // Result buffer that stages the D tile for 2 cycles until
// is complete. This decouples the irregular dpu output traffic from the // commit/writeback is complete. This decouples the irregular dpu
// regular, every-2-cycle commit traffic to ensure the commit pipeline is // output traffic from the regular, every-2-cycle commit traffic to
// used more efficiently. // ensure the commit pipeline is used more efficiently.
// FIXME: unnecessary? // FIXME: unnecessary?
VX_fifo_queue #( VX_fifo_queue #(
.DATAW ($bits(D_wid) + $bits(D_out)), .DATAW ($bits(D_wid) + $bits(D_out)),
.DEPTH (2 /* arbitrary */) .DEPTH (RESULT_BUFFER_DEPTH) // 2 works well
) output_buffer ( ) output_buffer (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
@@ -519,8 +524,18 @@ module VX_tensor_octet #(
`UNUSED_PIN(size) `UNUSED_PIN(size)
); );
// FIXME: this shouldn't be necessary // FIXME: overly strict; this assertion firing does not necessarily indicate a bug
`RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!")) `RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!"))
end else begin
// XXX: this depends on the assumption that the commit stage only asserts
// result_ready when result_valid is true
assign outbuf_ready_in = !result_valid || result_ready;
assign result_valid = dpu_valid;
// make direct connections
assign D_wid = D_wid_dpu;
assign D_out = D_tile;
end
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
logic [`PERF_CTR_BITS-1:0] perf_tensor_dpu_total; logic [`PERF_CTR_BITS-1:0] perf_tensor_dpu_total;

View File

@@ -250,7 +250,7 @@ module VX_tensor_threadgroup #(
for (genvar i = 0; i < 4; ++i) begin for (genvar i = 0; i < 4; ++i) begin
localparam int d_row = i / 2; localparam int d_row = i / 2;
localparam int d_col = (i % 2) * 2; localparam int d_col = (i % 2) * 2;
// four-element dot product (FEDP) unit // Dot product (FEDP) unit generated from Chisel
TensorDotProductUnit fedp ( TensorDotProductUnit fedp (
.clock (clk), .clock (clk),
.reset (reset), .reset (reset),