From 574cc0e5f035826745d281820c625cbe678c5bfb Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 18:32:27 -0700 Subject: [PATCH] tensor: Document configuring queue depths --- hw/rtl/core/VX_tensor_core.sv | 7 ++----- hw/rtl/fpu/VX_tensor_dpu.sv | 9 +++++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 1f363f45..a5128272 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -318,8 +318,6 @@ module VX_tensor_octet #( output result_valid, input result_ready ); - localparam ISSUE_QUEUE_DEPTH = 4; - // 512 bits/octet * 4 octets per warp logic [`NUM_WARPS-1:0][3:0][31:0] A_buffer, A_buffer_n; logic [`NUM_WARPS-1:0][3:0][31:0] B_buffer, B_buffer_n; @@ -471,7 +469,7 @@ module VX_tensor_octet #( VX_tensor_dpu #( .ISW(ISW), .OCTET(OCTET), - .ISSUE_QUEUE_DEPTH(4) + .ISSUE_QUEUE_DEPTH(4 /*@perf: arbitrary*/) ) dpu ( .clk(clk), .reset(reset), @@ -503,10 +501,9 @@ module VX_tensor_octet #( // regular, every-2-cycle commit traffic to ensure the commit pipeline is // used more efficiently. // FIXME: unnecessary? - // TODO: This is probably oversized. VX_fifo_queue #( .DATAW ($bits(D_wid) + $bits(D_out)), - .DEPTH (2 /*`LATENCY_HMMA*/) + .DEPTH (2 /* arbitrary */) ) output_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 08e37cfa..79ee5757 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -4,6 +4,9 @@ module VX_tensor_dpu #( parameter ISW, parameter OCTET, + // @perf: has big impact on throughput. 
A rule of thumb is to set it to + // the pipeline length of FEDPs in order to make sure there are enough + // entries to fully saturate the pipeline, but this is only a rough guideline. parameter ISSUE_QUEUE_DEPTH = `LATENCY_HMMA ) ( input clk, @@ -105,7 +108,9 @@ module VX_tensor_dpu #( // need to pass along warp id's to do multithreading VX_fifo_queue #( .DATAW ($bits(wid)), - .DEPTH (ISSUE_QUEUE_DEPTH + ISSUE_QUEUE_DEPTH) + // @perf: seems to require deeper depth than the FEDP issue queues to + // not cause stalls. + .DEPTH (2 * ISSUE_QUEUE_DEPTH) ) wid_queue ( .clk (clk), .reset (reset), @@ -167,7 +172,7 @@ module VX_tensor_threadgroup #( // threadgroups, so we need only 1 queue per octet for B VX_fifo_queue #( .DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)), - .DEPTH (ISSUE_QUEUE_DEPTH) + .DEPTH (ISSUE_QUEUE_DEPTH) ) input_buffer ( .clk (clk), .reset (reset),