diff --git a/hw/rtl/VX_core_wrapper.sv b/hw/rtl/VX_core_wrapper.sv index 1ac02d60..367bdb92 100644 --- a/hw/rtl/VX_core_wrapper.sv +++ b/hw/rtl/VX_core_wrapper.sv @@ -4,6 +4,7 @@ module Vortex import VX_gpu_pkg::*; #( parameter CORE_ID = 0, + parameter TENSOR_FP16 = 0, parameter BOOTROM_HANG100 = 32'h10100, parameter NUM_THREADS = 0 ) ( @@ -394,7 +395,8 @@ module Vortex import VX_gpu_pkg::*; #( // TODO: SCOPE_IO_BIND should be socket id VX_core #( - .CORE_ID (CORE_ID) + .CORE_ID (CORE_ID), + .TENSOR_FP16 (TENSOR_FP16) ) core ( `SCOPE_IO_BIND (0) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 6f8d9778..c8f6ac1f 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -18,7 +18,8 @@ `endif module VX_core import VX_gpu_pkg::*; #( - parameter CORE_ID = 0 + parameter CORE_ID = 0, + parameter TENSOR_FP16 = 0 ) ( `SCOPE_IO_DECL @@ -191,7 +192,8 @@ module VX_core import VX_gpu_pkg::*; #( ); VX_execute #( - .CORE_ID (CORE_ID) + .CORE_ID (CORE_ID), + .TENSOR_FP16 (TENSOR_FP16) ) execute ( `SCOPE_IO_BIND (2) diff --git a/hw/rtl/core/VX_execute.sv b/hw/rtl/core/VX_execute.sv index 58aa5e04..723d7c60 100644 --- a/hw/rtl/core/VX_execute.sv +++ b/hw/rtl/core/VX_execute.sv @@ -14,7 +14,8 @@ `include "VX_define.vh" module VX_execute import VX_gpu_pkg::*; #( - parameter CORE_ID = 0 + parameter CORE_ID = 0, + parameter TENSOR_FP16 = 0 ) ( `SCOPE_IO_DECL @@ -144,7 +145,7 @@ module VX_execute import VX_gpu_pkg::*; #( `ifdef EXT_T_ENABLE VX_tensor_core #( - + .FP16 (TENSOR_FP16) ) tensor_core ( .clk(clk), .reset(reset), diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index cf5a0071..730d7855 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -2,7 +2,7 @@ `include "VX_fpu_define.vh" module VX_tensor_core import VX_gpu_pkg::*; #( - + parameter FP16 ) ( input clk, input reset, @@ -52,7 +52,8 @@ module VX_tensor_core import VX_gpu_pkg::*; #( for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin VX_tensor_core_block #( - .ISW(1) // FIXME: not block_idx + .ISW(1), // FIXME: not block_idx + .FP16(FP16) ) tensor_core ( .clk(clk), .reset(reset), @@ -65,7 +66,8 @@ module VX_tensor_core import VX_gpu_pkg::*; #( endmodule module VX_tensor_core_block import VX_gpu_pkg::*; #( - parameter ISW + parameter ISW, + parameter FP16 ) ( input clk, input reset, @@ -121,7 +123,8 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #( VX_tensor_octet #( .ISW(ISW), - .OCTET(i) + .OCTET(i), + .FP16(FP16) ) octet ( .clk(clk), .reset(reset), @@ -329,6 +332,7 @@ endmodule module VX_tensor_octet #( parameter ISW, parameter OCTET, + parameter FP16, parameter RESULT_BUFFER_DEPTH = 2 ) ( input clk, @@ -519,6 +523,7 @@ module VX_tensor_octet #( VX_tensor_threadgroups #( .ISW(ISW), .OCTET(OCTET), + .FP16(FP16), .OPERAND_BUFFER_DEPTH(4 /*@perf: arbtirary*/) ) dpu ( .clk(clk), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 4028b528..1cd2df84 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -5,6 +5,7 @@ module VX_tensor_threadgroups #( parameter ISW, parameter OCTET, + parameter FP16, // @perf: has big impact on throughput. A rule of thumb is to set it to // the pipeline length of FEDPs in order to make sure there are enough // entries to fully saturate the pipeline, but this is still rough @@ -102,6 +103,7 @@ module VX_tensor_threadgroups #( // threadgroup DPUs; B_tile is shared across the two threadgroups. See // Figure 13 in paper VX_tensor_threadgroup #( + .FP16(FP16) ) threadgroup_0 ( .clk (clk), .reset (reset), @@ -115,6 +117,7 @@ module VX_tensor_threadgroups #( .D_frag (D_tile[1:0]) ); VX_tensor_threadgroup #( + .FP16(FP16) ) threadgroup_1 ( .clk (clk), .reset (reset), @@ -165,7 +168,7 @@ endmodule // does (m,n,k) = (2,4,2) matmul compute over 2 cycles. // see Figure 10(b) of the paper. module VX_tensor_threadgroup #( - parameter HALF_PRECISION = 1 + parameter FP16 ) ( input clk, input reset, @@ -297,7 +300,7 @@ module VX_tensor_threadgroup #( wire [31:0] d_col_sel = (substep_in == 1'b0) ? d_col : (d_col + 1); // Dot product (FEDP) unit generated from Chisel - if (HALF_PRECISION != 0) begin + if (FP16 != 0) begin TensorDotProductUnit fedp ( .clock (clk), .reset (reset),