diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index ca0d1064..d1c14588 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -81,6 +81,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam METADATA_QUEUE_DEPTH = 2 * `LATENCY_HMMA; wire [1:0] step = 2'(execute_if.data.op_type); + // op_mod is reused to indicate instruction's id in pair wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1)); logic [NUM_OCTETS-1:0] octet_results_valid; @@ -115,7 +116,6 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( logic result_valid; logic result_ready; - // op_mod is reused to indicate instruction's id in pair VX_tensor_octet #( .ISW(ISW), .OCTET(i) diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv index 130866de..26817b8d 100644 --- a/hw/rtl/core/VX_uop_sequencer.sv +++ b/hw/rtl/core/VX_uop_sequencer.sv @@ -128,9 +128,8 @@ module VX_uop_sequencer import VX_gpu_pkg::*; ( if (uop_sequencer_if.valid && use_uop && uop_sequencer_if.data.rd == `NR_BITS'(1)) begin // a little sketchy? but shouldn't create any loop - ibuffer_if.data.rd = ibuffer_if.data.rd + `NR_BITS'(8); + ibuffer_if.data.rd = ibuffer_if.data.rd + `NR_BITS'(8); // FIXME: 8 is hardcoded ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + `NR_BITS'(8); - $display("yoooooooo! uop rd=%d\n", ibuffer_if.data.rd); end end diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 0155417b..8b7a1c26 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -234,6 +234,9 @@ module VX_tensor_threadgroup #( end end + // TODO: Instead of latching half-result and constructing a full D tile, + // we should be able to send these half fragments down to commit stage + // immediately, saving flop space assign D_frag[0][0] = D_reg[0]; assign D_frag[0][2] = D_reg[1]; assign D_frag[1][0] = D_reg[2];