From 574cc0e5f035826745d281820c625cbe678c5bfb Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 18:32:27 -0700
Subject: [PATCH] tensor: Document configuring queue depths

---
 hw/rtl/core/VX_tensor_core.sv | 7 ++-----
 hw/rtl/fpu/VX_tensor_dpu.sv   | 9 +++++++--
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 1f363f45..a5128272 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -318,8 +318,6 @@ module VX_tensor_octet #(
     output result_valid,
     input result_ready
 );
-    localparam ISSUE_QUEUE_DEPTH = 4;
-
     // 512 bits/octet * 4 octets per warp
     logic [`NUM_WARPS-1:0][3:0][31:0] A_buffer, A_buffer_n;
     logic [`NUM_WARPS-1:0][3:0][31:0] B_buffer, B_buffer_n;
@@ -471,7 +469,7 @@ module VX_tensor_octet #(
     VX_tensor_dpu #(
         .ISW(ISW),
         .OCTET(OCTET),
-        .ISSUE_QUEUE_DEPTH(4)
+        .ISSUE_QUEUE_DEPTH(4 /* @perf: arbitrary */)
     ) dpu (
         .clk(clk),
         .reset(reset),
@@ -503,10 +501,9 @@ module VX_tensor_octet #(
     // regular, every-2-cycle commit traffic to ensure the commit pipeline is
     // used more efficiently.
     // FIXME: unnecessary?
-    // TODO: This is probably oversized.
     VX_fifo_queue #(
         .DATAW   ($bits(D_wid) + $bits(D_out)),
-        .DEPTH   (2 /*`LATENCY_HMMA*/)
+        .DEPTH   (2 /* arbitrary */)
     ) output_buffer (
         .clk   (clk),
         .reset (reset),
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 08e37cfa..79ee5757 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -4,6 +4,9 @@
 module VX_tensor_dpu #(
     parameter ISW,
     parameter OCTET,
+    // @perf: has a big impact on throughput.  A rule of thumb is to set it to
+    // the pipeline length of the FEDPs so that there are enough entries to
+    // fully saturate the pipeline, but this is still only a rough guideline.
     parameter ISSUE_QUEUE_DEPTH = `LATENCY_HMMA
 ) (
     input clk,
@@ -105,7 +108,9 @@ module VX_tensor_dpu #(
     // need to pass along warp id's to do multithreading
     VX_fifo_queue #(
         .DATAW   ($bits(wid)),
-        .DEPTH   (ISSUE_QUEUE_DEPTH + ISSUE_QUEUE_DEPTH)
+        // @perf: seems to need to be deeper than the FEDP issue queues in
+        // order to avoid causing stalls.
+        .DEPTH   (2 * ISSUE_QUEUE_DEPTH)
     ) wid_queue (
         .clk   (clk),
         .reset (reset),
@@ -167,7 +172,7 @@ module VX_tensor_threadgroup #(
     // threadgroups, so we need only 1 queue per octet for B
     VX_fifo_queue #(
         .DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)),
-        .DEPTH   (ISSUE_QUEUE_DEPTH)
+        .DEPTH (ISSUE_QUEUE_DEPTH)
     ) input_buffer (
         .clk       (clk),
         .reset     (reset),