tensor: Parameterize result buffer depth

2024-07-25 16:31:45 -07:00
parent f3afd4a6f9
commit 7f43bab0aa
2 changed files with 47 additions and 32 deletions
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -298,7 +298,11 @@ endmodule
 module VX_tensor_octet #(
    parameter ISW,
-    parameter OCTET
+    parameter OCTET,
    // RESULT_BUFFER_DEPTH = 2 gives good performance by absorbing commit
    // backpressure (result_ready), although the value is arbitrary.
    // RESULT_BUFFER_DEPTH = 0 eliminates result buffering.
    parameter RESULT_BUFFER_DEPTH = 2
 ) (
    input clk,
    input reset,
@@ -488,6 +492,7 @@ module VX_tensor_octet #(
        .D_wid(D_wid_dpu)
    );
    if (RESULT_BUFFER_DEPTH > 0) begin
        wire outbuf_empty;
        wire outbuf_full;
        // backpressure from commit
@@ -497,14 +502,14 @@ module VX_tensor_octet #(
        wire outbuf_enq = outbuf_ready_in && dpu_valid;
        wire outbuf_deq = result_valid && result_ready;
-    // result buffer to stage the D tile for 2 cycles until commit/writeback
+        // Result buffer that stages the D tile for 2 cycles until
-    // is complete.  This decouples the irregular dpu output traffic from the
+        // commit/writeback is complete.  This decouples the irregular dpu
-    // regular, every-2-cycle commit traffic to ensure the commit pipeline is
+        // output traffic from the regular, every-2-cycle commit traffic to
-    // used more efficiently.
+        // ensure the commit pipeline is used more efficiently.
        // FIXME: unnecessary?
        VX_fifo_queue #(
            .DATAW   ($bits(D_wid) + $bits(D_out)),
-        .DEPTH   (2 /* arbitrary */)
+            .DEPTH   (RESULT_BUFFER_DEPTH) // 2 works well
        ) output_buffer (
            .clk   (clk),
            .reset (reset),
@@ -519,8 +524,18 @@ module VX_tensor_octet #(
            `UNUSED_PIN(size)
        );
-    // FIXME: this shouldn't be necessary
+        // FIXME: overly strict; this assertion firing doesn't necessarily mean a bug
        `RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!"))
    end else begin
        // XXX: this depends on the assumption that commit stage only asserts
        // result_ready when result_valid is true
        assign outbuf_ready_in = !result_valid || result_ready;
        assign result_valid = dpu_valid;
        // make direct connections
        assign D_wid = D_wid_dpu;
        assign D_out = D_tile;
    end
 `ifdef PERF_ENABLE
    logic [`PERF_CTR_BITS-1:0] perf_tensor_dpu_total;
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -250,7 +250,7 @@ module VX_tensor_threadgroup #(
    for (genvar i = 0; i < 4; ++i) begin
        localparam int d_row = i / 2;
        localparam int d_col = (i % 2) * 2;
-        // four-element dot product (FEDP) unit
+        // Four-element dot product (FEDP) unit, generated from Chisel
        TensorDotProductUnit fedp (
          .clock (clk),
          .reset (reset),