tensor: spurious assert, doc, remove unused param

2024-07-27 20:53:56 -07:00
parent 4e0dcdadac
commit d4d18c2823
2 changed files with 18 additions and 18 deletions
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -324,9 +324,6 @@ endmodule
 module VX_tensor_octet #(
    parameter ISW,
    parameter OCTET,
-    // RESULT_BUFFER_DEPTH = 2 gives good performance by absorbing commit
-    // backpressure (result_ready), although the value is arbitrary.
-    // RESULT_BUFFER_DEPTH = 0 eliminates result buffering.
    parameter RESULT_BUFFER_DEPTH = 2
 ) (
    input clk,
@@ -385,8 +382,11 @@ module VX_tensor_octet #(
    assign operands_ready = operands_ready_buf;

    typedef struct {
+      // single column of A
      logic [3:0][31:0] A_half;
+      // single row of B
      logic [3:0][31:0] B_half;
+      // interleaved elements of C
      logic [7:0][31:0] C_half;
    } half_t;

@@ -477,18 +477,20 @@ module VX_tensor_octet #(
    wire hmma_ready;
    assign operands_ready_buf = hmma_ready;

-    // A is 4x2 fp32 matrix
+    // all *_tiles below are row-major
+    // A is a 4x2 fp32 matrix
    wire [3:0][1:0][31:0] A_tile = {
        { halves_buf.A_half[3], A_buffer[operands_wid_buf][3] },
        { halves_buf.A_half[2], A_buffer[operands_wid_buf][2] },
        { halves_buf.A_half[1], A_buffer[operands_wid_buf][1] },
        { halves_buf.A_half[0], A_buffer[operands_wid_buf][0] }
    };
-    // B is 2x4 fp32 matrix
+    // B is a 2x4 fp32 matrix
    wire [1:0][3:0][31:0] B_tile = {
-        halves_buf.B_half, B_buffer[operands_wid_buf]
+        halves_buf.B_half,
+        B_buffer[operands_wid_buf]
    };
-    // C is 4x4 fp32 matrix
+    // C is a 4x4 fp32 matrix
    logic [3:0][3:0][31:0] C_tile;
    wire  [3:0][3:0][31:0] D_tile;
    wire  [`NW_WIDTH-1:0]  D_wid_dpu;
@@ -538,7 +540,10 @@ module VX_tensor_octet #(
        // commit/writeback is complete.  This decouples the irregular dpu
        // output traffic from the regular, every-2-cycle commit traffic to
        // ensure the commit pipeline is used more efficiently.
-        // FIXME: unnecessary?
+        //
+        // @perf: RESULT_BUFFER_DEPTH == 2 gives good performance by
+        // completely dampening commit backpressure (result_ready).
+        // RESULT_BUFFER_DEPTH = 0 removes the fifo queue altogether.
        VX_fifo_queue #(
            .DATAW   ($bits(D_wid) + $bits(D_out)),
            .DEPTH   (RESULT_BUFFER_DEPTH) // 2 works good
@@ -556,8 +561,8 @@ module VX_tensor_octet #(
            `UNUSED_PIN(size)
        );

-        // FIXME: overly strict; this firing doesn't mean a bug
-        `RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!"))
+        // for perf debug
+        // `RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!"))
    end else begin
        // XXX: this depends on the assumption that commit stage only asserts
        // result_ready when result_valid is true
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -60,12 +60,11 @@ module VX_tensor_dpu #(

    wire empty;
    wire full;
-    // sync between operand buffer and wid buffer
+    // sync operand buffer and wid buffer
    assign ready_in = !full && !wid_full;

    wire [1:0] threadgroup_valids_out;
    wire [1:0] threadgroup_readys_in;
-    // sync operand queue and wid queue
    wire threadgroup_valid_in = !empty;
    wire threadgroup_fire_in  = threadgroup_valid_in && &(threadgroup_readys_in);

@@ -98,11 +97,9 @@ module VX_tensor_dpu #(
    );

    // Split A_tile and C_tile by rows (0-1, 2-3) and parallelize in two
-    // threadgroups
-    //
-    // B_tile is shared across the two threadgroups; see Figure 13
+    // threadgroups; B_tile is shared across the two threadgroups. See Figure
+    // 13 in paper
    VX_tensor_threadgroup #(
-        .OPERAND_BUFFER_DEPTH(OPERAND_BUFFER_DEPTH)
    ) threadgroup_0 (
        .clk   (clk),
        .reset (reset),
@@ -116,7 +113,6 @@ module VX_tensor_dpu #(
        .D_frag    (D_tile[1:0])
    );
    VX_tensor_threadgroup #(
-        .OPERAND_BUFFER_DEPTH(OPERAND_BUFFER_DEPTH)
    ) threadgroup_1 (
        .clk   (clk),
        .reset (reset),
@@ -167,7 +163,6 @@ endmodule
 // does (m,n,k) = (2,4,2) matmul compute over 2 cycles.
 // see Figure 10(b) of the paper.
 module VX_tensor_threadgroup #(
-    parameter OPERAND_BUFFER_DEPTH
 ) (
    input clk,
    input reset,