tensor: Rename & docs

2024-08-23 16:21:45 -07:00
parent 45f6ae5aad
commit 2b1a9b7c16
2 changed files with 20 additions and 9 deletions
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -75,8 +75,9 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #(
 );
    localparam NUM_OCTETS = (`NUM_THREADS / 8);
    // offet in the lane numbers that get mapped to the two threadgroups in an
-    // octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16
-    // FIXME: not sure this is the right logic.  just filling in what works
+    // octet. E.g. if the two threadgroups map to lanes 0-3 and lanes 16-19,
+    // then LANE_OFFSET_THREADGROUP = 16
+    // FIXME: check logic; only verified for single octet
    localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS);
    // this is only a rule of thumb
    localparam METADATA_QUEUE_DEPTH = 2 * `LATENCY_HMMA;
@@ -147,6 +148,10 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #(
        // each octet produces 4x4 output partial sum, but the 8 lanes mapped
        // to the octet can only do 8 fp32 writeback at a time; so we need to
        // split writeback over two cycles
+        //
+        // octet_D matches the mathematical layout of the matrix (4x4 output
+        // per octet).  The logic below replicates the jagged 1x2 mapping in
+        // Figure 7(b) to map values to the lanes.
        assign wb_data_0[4*i+0] = octet_D[0][0];
        assign wb_data_0[4*i+1] = octet_D[1][0];
        assign wb_data_0[4*i+2] = octet_D[0][2];
@@ -511,7 +516,7 @@ module VX_tensor_octet #(
    wire dpu_valid;

    // this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet
-    VX_tensor_dpu #(
+    VX_tensor_threadgroups #(
        .ISW(ISW),
        .OCTET(OCTET),
        .OPERAND_BUFFER_DEPTH(4 /*@perf: arbtirary*/)
@@ -581,14 +586,14 @@ module VX_tensor_octet #(
    end

 `ifdef PERF_ENABLE
-    logic [`PERF_CTR_BITS-1:0] perf_tensor_dpu_total;
+    logic [`PERF_CTR_BITS-1:0] perf_tensor_ops_total;

    always @(posedge clk) begin
        if (reset) begin
-            perf_tensor_dpu_total <= '0;
+            perf_tensor_ops_total <= '0;
        end else begin
            if (do_hmma) begin
-                perf_tensor_dpu_total <= perf_tensor_dpu_total + 2'd2;
+                perf_tensor_ops_total <= perf_tensor_ops_total + 2'd2;
            end
        end
    end
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -1,7 +1,8 @@
 `ifdef EXT_T_ENABLE
 `include "VX_fpu_define.vh"

-module VX_tensor_dpu #(
+// Module that contains the threadgroups with DPUs + operand buffer.
+module VX_tensor_threadgroups #(
    parameter ISW,
    parameter OCTET,
    // @perf: has big impact on throughput.  A rule of thumb is to set it to
@@ -15,6 +16,7 @@ module VX_tensor_dpu #(
    input valid_in,
    output ready_in,
    // [rows][cols][dtype]
+    // (m,n,k) = (4,4,2)
    input [3:0][1:0][31:0] A_tile,
    input [1:0][3:0][31:0] B_tile,
    input [3:0][3:0][31:0] C_tile,
@@ -172,6 +174,7 @@ module VX_tensor_threadgroup #(
    output ready_in,
    input stall,
    // all *_frag are row-major
+    // (m,n,k) = (2,4,2)
    input [1:0][1:0][31:0] A_frag,
    input [1:0][3:0][31:0] B_frag,
    input [1:0][3:0][31:0] C_frag,
@@ -269,8 +272,11 @@ module VX_tensor_threadgroup #(

    // 4 FEDPs per threadgroup
    for (genvar i = 0; i < 4; ++i) begin
-        // at substep == 0, the 0th and 2nd columns of D begins compute;
-        // at substep == 1, the 1st and 3rd columns of D begins compute.
+        // Determine which elements in the D matrix the dot-product units get
+        // mapped to.
+        //
+        // At substep == 0, the 0th and 2nd columns of D begin compute;
+        // At substep == 1, the 1st and 3rd columns of D begin compute.
        // There are two row elements for each column, rounding out to
        // 4 elements computed by 4 FEDPs at every cycle
        // (see Figure 10(b)).