From 4cac1adf7d2b93e6b02f1bc7c1e7914af5d00944 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 7 Oct 2024 17:10:59 -0700
Subject: [PATCH 01/14] Add dummy code for decoupled Hopper tensor core

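Define EXT_T_HOPPER which, when EXT_T_ENABLE is defined, selects whether
to instantiate the core-coupled Volta-style or the decoupled Hopper-style
Tensor Core.  EXT_T_HOPPER is defined by default whenever EXT_T_ENABLE
is.  Also lower the default NUM_CORES from 8 to 4.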
---
 hw/rtl/VX_config.vh                  |   7 +-
 hw/rtl/core/VX_decode.sv             |  34 ++++++---
 hw/rtl/core/VX_tensor_core.sv        |  18 +++--
 hw/rtl/core/VX_tensor_hopper_core.sv | 102 +++++++++++++++++++++++++++
 4 files changed, 146 insertions(+), 15 deletions(-)
 create mode 100644 hw/rtl/core/VX_tensor_hopper_core.sv

diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index a9ff2742..f309a84a 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -40,8 +40,13 @@
 `define EXT_F_ENABLE
 `endif
 
+// core-coupled tensor core
 `ifndef EXT_T_DISABLE
 `define EXT_T_ENABLE
+// decoupled Hopper-style tensor core
+`ifndef EXT_T_HOPPER
+`define EXT_T_HOPPER
+`endif
 `endif
 
 `ifndef XLEN_32
@@ -83,7 +88,7 @@
 `endif
 
 `ifndef NUM_CORES
-`define NUM_CORES 8
+`define NUM_CORES 4
 `endif
 
 `ifndef NUM_WARPS
diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv
index cf21d72f..62fdde76 100644
--- a/hw/rtl/core/VX_decode.sv
+++ b/hw/rtl/core/VX_decode.sv
@@ -542,16 +542,30 @@ module VX_decode  #(
                 endcase
             end
         `ifdef EXT_T_ENABLE
-            `INST_EXT4: begin
-                ex_type = `EX_TENSOR;
-                op_type = `INST_TENSOR_HMMA;
-                // tensor core macroop is encoded as r-type
-                use_rd = 1;
-                `USED_IREG (rd);
-                `USED_IREG (rs1);
-                `USED_IREG (rs2);
-                `USED_IREG (rs3);
-            end
+            `ifdef EXT_T_HOPPER
+                // TODO
+                `INST_EXT4: begin
+                    ex_type = `EX_TENSOR;
+                    op_type = `INST_TENSOR_HMMA;
+                    // tensor core macroop is encoded as r-type
+                    use_rd = 1;
+                    `USED_IREG (rd);
+                    `USED_IREG (rs1);
+                    `USED_IREG (rs2);
+                    `USED_IREG (rs3);
+                end
+            `else
+                `INST_EXT4: begin
+                    ex_type = `EX_TENSOR;
+                    op_type = `INST_TENSOR_HMMA;
+                    // tensor core macroop is encoded as r-type
+                    use_rd = 1;
+                    `USED_IREG (rd);
+                    `USED_IREG (rs1);
+                    `USED_IREG (rs2);
+                    `USED_IREG (rs3);
+                end
+            `endif
         `endif
             default:;
         endcase
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 730d7855..802af43d 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -12,7 +12,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
 );
     localparam BLOCK_SIZE = 1;
     localparam NUM_LANES  = `NUM_THREADS;
-    // FIXME: @perf: PARTIAL_BW==1 increases power instantiating
+    // @perf: PARTIAL_BW==1 increases power instantiating
     // stream_buffers for ISSUE_WIDTH times
     localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
 
@@ -51,16 +51,27 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
     );
 
     for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
-        VX_tensor_core_block #(
+`ifdef EXT_T_HOPPER
+        VX_tensor_hopper_core_block #(
             .ISW(1), // FIXME: not block_idx
             .FP16(FP16)
+        ) tensor_hopper_core (
+            .clk(clk),
+            .reset(reset),
+            .execute_if(execute_if[block_idx]),
+            .commit_if(commit_block_if[block_idx])
+        );
+`else
+        VX_tensor_core_block #(
+            .ISW(1), // FIXME: use block_idx
+            .FP16(FP16)
         ) tensor_core (
             .clk(clk),
             .reset(reset),
-
             .execute_if(execute_if[block_idx]),
             .commit_if(commit_block_if[block_idx])
         );
+`endif
     end
     
 endmodule
@@ -275,7 +286,6 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #(
     localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
     wire [COMMIT_DATAW-1:0] commit_if_data = {
         execute_if_data_deq[wb_wid], /* uuid ~ rd */
-        // execute_if_data_deq, /* uuid ~ rd */
         subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */
         1'b0, /* pid */
         1'b1, /* sop */
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
new file mode 100644
index 00000000..c79a7994
--- /dev/null
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -0,0 +1,102 @@
+`ifdef EXT_T_ENABLE
+`include "VX_fpu_define.vh"
+
+module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
+    parameter ISW,
+    parameter FP16
+) (
+    input clk,
+    input reset,
+
+    VX_execute_if.slave execute_if,
+    VX_commit_if.master commit_if
+);
+    localparam METADATA_QUEUE_DEPTH = 2; // FIXME: arbitrary
+
+    /* commit_if.data_t parts that we need to keep around:
+        - uuid
+        - wid
+        - tmask
+        - PC
+        - wb
+        - rd
+    */
+
+    localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
+
+    wire operand_enq_fire = execute_if.valid && execute_if.ready;
+    wire commit_if_fire = commit_if.valid && commit_if.ready;
+    wire [DATAW-1:0] execute_if_data_enq = {
+        execute_if.data.uuid,
+        execute_if.data.wid,
+        execute_if.data.tmask,
+        execute_if.data.PC,
+        execute_if.data.wb,
+        execute_if.data.rd
+        // pid/sop/eop set later
+    };
+
+    wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
+
+    wire [`NUM_WARPS-1:0] metadata_queue_fulls;
+    wire [`NUM_WARPS-1:0] metadata_queue_emptys;
+    // OR not AND, we don't want any warp full
+    wire metadata_queue_full = |(metadata_queue_fulls);
+    assign execute_if.ready = !metadata_queue_full;
+
+    `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
+        ("runtime error: WGMMA execute not supported for warps other than 0!"))
+
+    for (genvar i = 0; i < `NUM_WARPS; i++) begin
+        // Metadata queue for commit_if.  This simply copies execute_if's
+        // metadata and pops them in conjunction with commit fire.
+        //
+        // This has to be separated per-warp, as otherwise requests from
+        // multiple warps can be enqueued interleaved, which makes it hard to
+        // ensure two consecutive dequeues are associated with the same warp for
+        // commit. (FIXME: this is not strictly necessary though.)
+
+        wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i));
+        // FIXME: commit only warp 0
+        wire deq =   commit_if_fire && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
+
+        VX_fifo_queue #(
+            .DATAW(DATAW),
+            .DEPTH(METADATA_QUEUE_DEPTH)
+        ) pending_uops (
+            .clk(clk),
+            .reset(reset),
+            .push(enq),
+            .pop(deq),
+            .data_in(execute_if_data_enq),
+            .data_out(execute_if_data_deq[i]),
+            .empty(metadata_queue_emptys[i]),
+            `UNUSED_PIN(alm_empty),
+            .full(metadata_queue_fulls[i]),
+            `UNUSED_PIN(alm_full),
+            `UNUSED_PIN(size)
+        );
+    end
+
+    // this shouldn't really happen unless there's a big contention over
+    // the commit stage
+    `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
+
+    // FIXME: only checks warp 0 for commit!
+    assign commit_if.valid = ~metadata_queue_emptys[0/*FIXME*/];
+
+    wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
+
+    localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
+    wire [COMMIT_DATAW-1:0] commit_if_data = {
+        execute_if_data_deq[0/*FIXME*/], /* uuid ~ rd */
+        wb_data, /* data */
+        1'b0, /* pid */
+        1'b1, /* sop */
+        1'b1  /* eop */
+    };
+
+    assign commit_if.data = commit_if_data;
+endmodule
+
+`endif

From e8ca4677df05960a908ca1951cedc6e079975507 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 7 Oct 2024 20:21:35 -0700
Subject: [PATCH 02/14] Remove old code for pending_instr underflow fix

---
 hw/rtl/core/VX_commit.sv | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv
index f417d64f..a584cace 100644
--- a/hw/rtl/core/VX_commit.sv
+++ b/hw/rtl/core/VX_commit.sv
@@ -173,30 +173,15 @@ module VX_commit import VX_gpu_pkg::*; #(
 
     // Committed instructions
 
-    // temporary hack to not underflow the pending instructions buffer
-    // relies on 1 cycle delay of arbiter and continuous issuing of tensor instructions, 
-    // so probably want to change this at some point 
+    // prevent underflow of the VX_pending_instr buffer
+    // probably want to change this at some point
     // (i.e. pass a "don't count this towards pending instructions" signal down the pipeline)
-    // logic [`ISSUE_WIDTH-1:0][4:0] hmma_ctr, hmma_ctr_n;
     wire [`ISSUE_WIDTH-1:0] final_hmma;
 `ifdef EXT_T_ENABLE
     for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
-        // assign hmma_ctr_n[i] = (tensor_commit_if[i].valid && tensor_commit_if[i].ready) ? hmma_ctr[i] + 5'b1 : hmma_ctr[i];
-        // assign final_hmma[i] = (commit_sel[i] != `EX_BITS'(2) || hmma_ctr == '0);
-        // i suppose this is now a feature and not a bug
         // if PC is 0, this means it is not final step of a wmma, shouldn't be committed
         assign final_hmma[i] = (commit_if[i].data.PC != 32'b0); 
     end
-    /*
-    always @(posedge clk) begin
-        if (reset) begin
-            hmma_ctr <= '0;
-        end
-        else begin
-            hmma_ctr <= hmma_ctr_n;
-        end 
-    end
-    */
 `else
     assign final_hmma = '1;
 `endif

From 7ab14445f0ad9534a4c625374c3f3fda266642f7 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 7 Oct 2024 21:29:44 -0700
Subject: [PATCH 03/14] tensor: Test many-commit per execute with an FSM

The trick is to set commit_if.data.eop to 0 on every commit except the
last one, since the commit module only signals instruction completion to
VX_schedule if the eop bit is 1.  Without this, the extra commits
underflow the pending_instr buffer.

The same eop trick works for VX_scoreboard, which only handles
writebacks that have eop set; this works around the invalid rd writeback
error.
---
 hw/rtl/core/VX_tensor_hopper_core.sv | 55 +++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index c79a7994..a58f4027 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -11,7 +11,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     VX_execute_if.slave execute_if,
     VX_commit_if.master commit_if
 );
-    localparam METADATA_QUEUE_DEPTH = 2; // FIXME: arbitrary
+    localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary
 
     /* commit_if.data_t parts that we need to keep around:
         - uuid
@@ -37,6 +37,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     };
 
     wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
+    logic [DATAW-1:0] execute_if_data_new_rd;
 
     wire [`NUM_WARPS-1:0] metadata_queue_fulls;
     wire [`NUM_WARPS-1:0] metadata_queue_emptys;
@@ -47,6 +48,8 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
         ("runtime error: WGMMA execute not supported for warps other than 0!"))
 
+    logic metadata_deq;
+
     for (genvar i = 0; i < `NUM_WARPS; i++) begin
         // Metadata queue for commit_if.  This simply copies execute_if's
         // metadata and pops them in conjunction with commit fire.
@@ -58,7 +61,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
 
         wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i));
         // FIXME: commit only warp 0
-        wire deq =   commit_if_fire && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
+        wire deq = metadata_deq && commit_if.ready && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
 
         VX_fifo_queue #(
             .DATAW(DATAW),
@@ -82,18 +85,58 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     // the commit stage
     `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
 
-    // FIXME: only checks warp 0 for commit!
-    assign commit_if.valid = ~metadata_queue_emptys[0/*FIXME*/];
+    // dummy FSM that generates commits
+    logic [1:0] state, state_n;
+    localparam STATE_IDLE = 4'd0;
+
+    always @(*) begin
+        state_n = state;
+        metadata_deq = 1'b0;
+
+        // when incremented to 1, count up until wrap-around to 0
+        if (state != STATE_IDLE) begin
+            state_n = state + 1'd1;
+        end else begin
+            // kick-off from idle when execute valid
+            // FIXME: only checks warp 0 for commit!
+            if (~metadata_queue_emptys[0/*FIXME*/]) begin
+                state_n = 4'd1;
+            end
+        end
+
+        // dequeue metadata when wrapping around
+        if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin
+            metadata_deq = 1'b1;
+        end
+
+        // change rd of the commit data according to state
+        execute_if_data_new_rd =
+            {execute_if_data_deq[0/*FIXME*/][DATAW-1:`NR_BITS],
+             (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state))};
+    end
+
+    always @(posedge clk) begin
+        if (reset) begin
+            state <= '0;
+        end else begin
+            state <= state_n;
+        end
+    end
+
+    // assign commit_if.valid = metadata_deq;
+    assign commit_if.valid = (state != STATE_IDLE);
 
     wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
 
     localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
     wire [COMMIT_DATAW-1:0] commit_if_data = {
-        execute_if_data_deq[0/*FIXME*/], /* uuid ~ rd */
+        // write-back to the correct rd only when eop
+        ((state == 2'b11) ? execute_if_data_deq[0/*FIXME*/] : execute_if_data_new_rd), /* uuid ~ rd */
         wb_data, /* data */
         1'b0, /* pid */
         1'b1, /* sop */
-        1'b1  /* eop */
+        (state == 2'b11)  /* eop */
+        // 1'b1  /* eop */
     };
 
     assign commit_if.data = commit_if_data;

From 58c9761829ffedc12d40aad51c326888a8d2f2c2 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 9 Oct 2024 21:53:04 -0700
Subject: [PATCH 04/14] Revert decode change for hopper

Share the same insn as non-hopper TC.
---
 hw/rtl/core/VX_decode.sv | 34 ++++++++++------------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv
index 62fdde76..cf21d72f 100644
--- a/hw/rtl/core/VX_decode.sv
+++ b/hw/rtl/core/VX_decode.sv
@@ -542,30 +542,16 @@ module VX_decode  #(
                 endcase
             end
         `ifdef EXT_T_ENABLE
-            `ifdef EXT_T_HOPPER
-                // TODO
-                `INST_EXT4: begin
-                    ex_type = `EX_TENSOR;
-                    op_type = `INST_TENSOR_HMMA;
-                    // tensor core macroop is encoded as r-type
-                    use_rd = 1;
-                    `USED_IREG (rd);
-                    `USED_IREG (rs1);
-                    `USED_IREG (rs2);
-                    `USED_IREG (rs3);
-                end
-            `else
-                `INST_EXT4: begin
-                    ex_type = `EX_TENSOR;
-                    op_type = `INST_TENSOR_HMMA;
-                    // tensor core macroop is encoded as r-type
-                    use_rd = 1;
-                    `USED_IREG (rd);
-                    `USED_IREG (rs1);
-                    `USED_IREG (rs2);
-                    `USED_IREG (rs3);
-                end
-            `endif
+            `INST_EXT4: begin
+                ex_type = `EX_TENSOR;
+                op_type = `INST_TENSOR_HMMA;
+                // tensor core macroop is encoded as r-type
+                use_rd = 1;
+                `USED_IREG (rd);
+                `USED_IREG (rs1);
+                `USED_IREG (rs2);
+                `USED_IREG (rs3);
+            end
         `endif
             default:;
         endcase

From d9ad4809ec44bc6b225a0b0c8636c283a5253f76 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 15:41:16 -0700
Subject: [PATCH 05/14] Add 'tensor' bit to commit_if and writeback_if

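For use by the asynchronous tensor instruction.  When the bit is 1'b1,
it is meant to set/clear the inuse_tensor status bit in the scoreboard
to signal kickoff/completion of the asynchronous tensor op.

This patch only adds the field, widens the affected data paths, and
declares and resets inuse_tensor in the scoreboard; the logic that sets
and clears inuse_tensor is not part of this patch.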
---
 hw/rtl/core/VX_alu_unit.sv           |  2 +-
 hw/rtl/core/VX_commit.sv             |  3 ++-
 hw/rtl/core/VX_csr_unit.sv           |  6 +++---
 hw/rtl/core/VX_gather_unit.sv        |  3 ++-
 hw/rtl/core/VX_int_unit.sv           |  6 +++---
 hw/rtl/core/VX_lsu_unit.sv           | 14 +++++++-------
 hw/rtl/core/VX_muldiv_unit.sv        |  8 ++++----
 hw/rtl/core/VX_reduce_unit.sv        |  4 ++--
 hw/rtl/core/VX_scoreboard.sv         |  4 ++++
 hw/rtl/core/VX_sfu_unit.sv           |  2 +-
 hw/rtl/core/VX_tensor_core.sv        |  3 ++-
 hw/rtl/core/VX_tensor_hopper_core.sv |  3 ++-
 hw/rtl/core/VX_wctl_unit.sv          |  6 +++---
 hw/rtl/interfaces/VX_commit_if.sv    |  1 +
 hw/rtl/interfaces/VX_writeback_if.sv |  1 +
 15 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/hw/rtl/core/VX_alu_unit.sv b/hw/rtl/core/VX_alu_unit.sv
index 071cc08d..1c089509 100644
--- a/hw/rtl/core/VX_alu_unit.sv
+++ b/hw/rtl/core/VX_alu_unit.sv
@@ -32,7 +32,7 @@ module VX_alu_unit #(
     localparam NUM_LANES    = `NUM_ALU_LANES;
     localparam PID_BITS     = `CLOG2(`NUM_THREADS / NUM_LANES);
     localparam PID_WIDTH    = `UP(PID_BITS);
-    localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
+    localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + 1 + PID_WIDTH + 1 + 1;
     localparam RSP_ARB_SIZE = 2 + `EXT_M_ENABLED;
     localparam PARTIAL_BW   = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
 
diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv
index a584cace..9b930818 100644
--- a/hw/rtl/core/VX_commit.sv
+++ b/hw/rtl/core/VX_commit.sv
@@ -41,7 +41,7 @@ module VX_commit import VX_gpu_pkg::*; #(
     output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value
 );
     `UNUSED_PARAM (CORE_ID)
-    localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
+    localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1 + 1;
     localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1);
     localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1;
 
@@ -210,6 +210,7 @@ module VX_commit import VX_gpu_pkg::*; #(
         assign writeback_if[i].data.tmask= commit_if[i].data.tmask; 
         assign writeback_if[i].data.rd   = commit_if[i].data.rd; 
         assign writeback_if[i].data.data = commit_if[i].data.data; 
+        assign writeback_if[i].data.tensor = commit_if[i].data.tensor;
         assign writeback_if[i].data.sop  = commit_if[i].data.sop; 
         assign writeback_if[i].data.eop  = commit_if[i].data.eop;
         assign commit_if[i].ready = 1'b1; // writeback has no backpressure
diff --git a/hw/rtl/core/VX_csr_unit.sv b/hw/rtl/core/VX_csr_unit.sv
index 9fa373b6..bf229789 100644
--- a/hw/rtl/core/VX_csr_unit.sv
+++ b/hw/rtl/core/VX_csr_unit.sv
@@ -43,7 +43,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
     `UNUSED_PARAM (CORE_ID)
     localparam PID_BITS   = `CLOG2(`NUM_THREADS / NUM_LANES);
     localparam PID_WIDTH  = `UP(PID_BITS);
-    localparam DATAW      = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + PID_WIDTH + 1 + 1;
+    localparam DATAW      = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + 1 + PID_WIDTH + 1 + 1;
 
     `UNUSED_VAR (execute_if.data.rs3_data)
     
@@ -174,8 +174,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
         .reset     (reset),
         .valid_in  (csr_req_valid),
         .ready_in  (csr_req_ready),
-        .data_in   ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
-        .data_out  ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, csr_commit_data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
+        .data_in   ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, 1'b0/*tensor*/, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
+        .data_out  ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, csr_commit_data, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
         .valid_out (commit_if.valid),
         .ready_out (commit_if.ready)
     );
diff --git a/hw/rtl/core/VX_gather_unit.sv b/hw/rtl/core/VX_gather_unit.sv
index 21ae4485..fc8270d4 100644
--- a/hw/rtl/core/VX_gather_unit.sv
+++ b/hw/rtl/core/VX_gather_unit.sv
@@ -31,7 +31,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
     localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
     localparam PID_BITS     = `CLOG2(`NUM_THREADS / NUM_LANES);
     localparam PID_WIDTH    = `UP(PID_BITS);
-    localparam DATAW        = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
+    localparam DATAW        = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + NUM_LANES * `XLEN + 1 + PID_WIDTH + 1 + 1;
     localparam DATA_WIS_OFF = DATAW - (`UUID_WIDTH + `NW_WIDTH);
 
     wire [BLOCK_SIZE-1:0] commit_in_valid;
@@ -119,6 +119,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
             commit_tmp_if.data.wb,
             commit_tmp_if.data.rd,
             commit_data_r,
+            commit_tmp_if.data.tensor,
             1'b0, // PID
             commit_tmp_if.data.sop,
             commit_tmp_if.data.eop
diff --git a/hw/rtl/core/VX_int_unit.sv b/hw/rtl/core/VX_int_unit.sv
index a5e4f394..b8cb78dd 100644
--- a/hw/rtl/core/VX_int_unit.sv
+++ b/hw/rtl/core/VX_int_unit.sv
@@ -136,14 +136,14 @@ module VX_int_unit #(
     end   
 
     VX_elastic_buffer #(
-        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + `XLEN + `XLEN + 1 + `INST_BR_BITS + LANE_WIDTH)
+        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + 1 + (NUM_LANES * `XLEN) + `XLEN + `XLEN + 1 + `INST_BR_BITS + LANE_WIDTH)
     ) rsp_buf (
         .clk      (clk),
         .reset    (reset),
         .valid_in (execute_if.valid),
         .ready_in (execute_if.ready),
-        .data_in  ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, alu_result, execute_if.data.PC, execute_if.data.imm, is_br_op, br_op, tid}),
-        .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, alu_result_r, PC_r, imm_r, is_br_op_r, br_op_r, tid_r}),
+        .data_in  ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, 1'b0/*tensor*/, alu_result, execute_if.data.PC, execute_if.data.imm, is_br_op, br_op, tid}),
+        .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.tensor, alu_result_r, PC_r, imm_r, is_br_op_r, br_op_r, tid_r}),
         .valid_out (commit_if.valid),
         .ready_out (commit_if.ready)
     );
diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv
index 20fac1d1..e8748e39 100644
--- a/hw/rtl/core/VX_lsu_unit.sv
+++ b/hw/rtl/core/VX_lsu_unit.sv
@@ -36,7 +36,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
     localparam NUM_LANES    = `NUM_LSU_LANES;
     localparam PID_BITS     = `CLOG2(`NUM_THREADS / NUM_LANES);
     localparam PID_WIDTH    = `UP(PID_BITS);
-    localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
+    localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + 1 + PID_WIDTH + 1 + 1;
     localparam LSUQ_SIZEW   = `LOG2UP(`LSUQ_SIZE);
     localparam MEM_ASHIFT   = `CLOG2(`MEM_BLOCK_SIZE);    
     localparam MEM_ADDRW    = `XLEN - MEM_ASHIFT;
@@ -527,15 +527,15 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
     // load commit
 
     VX_elastic_buffer #(
-        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
+        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + 1 + PID_WIDTH + 1 + 1),
         .SIZE  (2)
     ) ld_rsp_buf (
         .clk       (clk),
         .reset     (reset),
         .valid_in  (mem_rsp_valid),
         .ready_in  (mem_rsp_ready),
-        .data_in   ({rsp_uuid, rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
-        .data_out  ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
+        .data_in   ({rsp_uuid, rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_data, 1'b0/*tensor*/, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
+        .data_out  ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.tensor, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
         .valid_out (commit_ld_if.valid),
         .ready_out (commit_ld_if.ready)
     );
@@ -545,15 +545,15 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
     // store commit
 
     VX_elastic_buffer #(
-        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + PID_WIDTH + 1 + 1),
+        .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + PID_WIDTH + 1 + 1),
         .SIZE  (2)
     ) st_rsp_buf (
         .clk       (clk),
         .reset     (reset),
         .valid_in  (mem_req_fire && mem_req_rw),
         .ready_in  (st_rsp_ready),
-        .data_in   ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop}),
-        .data_out  ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
+        .data_in   ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, 1'b0/*tensor*/, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop}),
+        .data_out  ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.tensor, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
         .valid_out (commit_st_if.valid),
         .ready_out (commit_st_if.ready)
     );
diff --git a/hw/rtl/core/VX_muldiv_unit.sv b/hw/rtl/core/VX_muldiv_unit.sv
index 6daa3c3d..80168c73 100644
--- a/hw/rtl/core/VX_muldiv_unit.sv
+++ b/hw/rtl/core/VX_muldiv_unit.sv
@@ -323,16 +323,16 @@ module VX_muldiv_unit #(
 
     VX_stream_arb #(
         .NUM_INPUTS (2),
-        .DATAW (TAGW + (NUM_LANES * `XLEN)),
+        .DATAW (1/*tensor field only in commit*/ + TAGW + (NUM_LANES * `XLEN)),
         .OUT_REG (1)
     ) rsp_buf (
         .clk       (clk),
         .reset     (reset),
         .valid_in  ({div_valid_out, mul_valid_out}),
         .ready_in  ({div_ready_out, mul_ready_out}),
-        .data_in   ({{div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_pid_out, div_sop_out, div_eop_out, div_result_out},
-                     {mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out}}),
-        .data_out  ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.data}),
+        .data_in   ({{div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, 1'b0/*tensor*/, div_pid_out, div_sop_out, div_eop_out, div_result_out},
+                     {mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, 1'b0/*tensor*/, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out}}),
+        .data_out  ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.data}),
         .valid_out (commit_if.valid),
         .ready_out (commit_if.ready),
         `UNUSED_PIN (sel_out)
diff --git a/hw/rtl/core/VX_reduce_unit.sv b/hw/rtl/core/VX_reduce_unit.sv
index 8522f8d1..b63e57ae 100644
--- a/hw/rtl/core/VX_reduce_unit.sv
+++ b/hw/rtl/core/VX_reduce_unit.sv
@@ -269,7 +269,7 @@ module VX_reduce_unit #(
     );
 
     VX_elastic_buffer #(
-        .DATAW(`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + (`XLEN * NUM_LANES) + PID_WIDTH + 1 + 1)
+        .DATAW(`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + (`XLEN * NUM_LANES) + 1 + PID_WIDTH + 1 + 1)
     ) output_buffer (
         .clk(clk),
         .reset(reset),
@@ -277,7 +277,7 @@ module VX_reduce_unit #(
         .ready_in(commit_if_ready),
-        .data_in({execute_if.data.uuid, execute_if.data.wid, stored_tmask, execute_if.data.PC, execute_if.data.wb, execute_if.data.rd, broadcasted_accumulator, stored_pid, stored_sop, stored_eop}),
+        .data_in({execute_if.data.uuid, execute_if.data.wid, stored_tmask, execute_if.data.PC, execute_if.data.wb, execute_if.data.rd, broadcasted_accumulator, 1'b0/*tensor*/, stored_pid, stored_sop, stored_eop}),
 
-        .data_out({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.wb, commit_if.data.rd, commit_if.data.data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
+        .data_out({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.wb, commit_if.data.rd, commit_if.data.data, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
         .ready_out(commit_if.ready),
         .valid_out(commit_if.valid)
     );
diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv
index c63a5dcb..42a876f5 100644
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@@ -142,6 +142,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
 
     for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
         reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
+        // busy bit for the asynchronous Tensor unit.  Since the ISA does not
+        // have an explicit destination register, use a separate status bit.
+        reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor;
 
         wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
 
@@ -227,6 +230,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         always @(posedge clk) begin
             if (reset) begin
                 inuse_regs <= '0;
+                inuse_tensor <= '0;
             end else begin
                 if (writeback_fire) begin
                     inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;            
diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv
index ed2023b7..48f1cb8f 100644
--- a/hw/rtl/core/VX_sfu_unit.sv
+++ b/hw/rtl/core/VX_sfu_unit.sv
@@ -49,7 +49,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
     localparam PID_BITS     = `CLOG2(`NUM_THREADS / NUM_LANES);
     localparam PID_WIDTH    = `UP(PID_BITS);
 
-    localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
+    localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + 1 + PID_WIDTH + 1 + 1;
     localparam RSP_ARB_SIZE = 1 + 1;
     localparam RSP_ARB_IDX_WCTL = 0;
     localparam RSP_ARB_IDX_CSRS = 1;
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 802af43d..1f7a95db 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -283,10 +283,11 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #(
     assign commit_if_ready_override = commit_if.ready && (counter == 2'b0);
 `endif
 
-    localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
+    localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
     wire [COMMIT_DATAW-1:0] commit_if_data = {
         execute_if_data_deq[wb_wid], /* uuid ~ rd */
         subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */
+        1'b0, /* tensor */
         1'b0, /* pid */
         1'b1, /* sop */
         1'b1  /* eop */
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index a58f4027..2ecbea70 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -128,11 +128,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
 
     wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
 
-    localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
+    localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
     wire [COMMIT_DATAW-1:0] commit_if_data = {
         // write-back to the correct rd only when eop
         ((state == 2'b11) ? execute_if_data_deq[0/*FIXME*/] : execute_if_data_new_rd), /* uuid ~ rd */
         wb_data, /* data */
+        1'b0, /* tensor */
         1'b0, /* pid */
         1'b1, /* sop */
         (state == 2'b11)  /* eop */
diff --git a/hw/rtl/core/VX_wctl_unit.sv b/hw/rtl/core/VX_wctl_unit.sv
index 5b1ad834..36144018 100644
--- a/hw/rtl/core/VX_wctl_unit.sv
+++ b/hw/rtl/core/VX_wctl_unit.sv
@@ -32,7 +32,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
     localparam PID_BITS   = `CLOG2(`NUM_THREADS / NUM_LANES);
     localparam PID_WIDTH  = `UP(PID_BITS);
     localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
-    localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + PID_WIDTH + 1 + 1;
+    localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + 1 + PID_WIDTH + 1 + 1;
 
     `UNUSED_VAR (execute_if.data.rs3_data)
     
@@ -141,8 +141,8 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
         .reset     (reset),
         .valid_in  (execute_if.valid),
         .ready_in  (execute_if.ready),
-        .data_in   ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}}),
-        .data_out  ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}}),
+        .data_in   ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, 1'b0/*tensor*/, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}}),
+        .data_out  ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}}),
         .valid_out (commit_if.valid),
         .ready_out (commit_if.ready)
     );
diff --git a/hw/rtl/interfaces/VX_commit_if.sv b/hw/rtl/interfaces/VX_commit_if.sv
index e5bfa13a..2eaf5d0e 100644
--- a/hw/rtl/interfaces/VX_commit_if.sv
+++ b/hw/rtl/interfaces/VX_commit_if.sv
@@ -26,6 +26,7 @@ interface VX_commit_if #(
         logic                       wb;
         logic [`NR_BITS-1:0]        rd;
         logic [NUM_LANES-1:0][`XLEN-1:0] data;
+        logic                       tensor;
         logic [PID_WIDTH-1:0]       pid;
         logic                       sop;
         logic                       eop;
diff --git a/hw/rtl/interfaces/VX_writeback_if.sv b/hw/rtl/interfaces/VX_writeback_if.sv
index ce6241ef..72abdb45 100644
--- a/hw/rtl/interfaces/VX_writeback_if.sv
+++ b/hw/rtl/interfaces/VX_writeback_if.sv
@@ -22,6 +22,7 @@ interface VX_writeback_if import VX_gpu_pkg::*; ();
         logic [`XLEN-1:0]               PC;
         logic [`NR_BITS-1:0]            rd;
         logic [`NUM_THREADS-1:0][`XLEN-1:0] data;
+        logic                           tensor;
         logic                           sop;
         logic                           eop;
     } data_t;

From 100d69ef210ea648f958146ac20afd53b2ba4c48 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 15:47:58 -0700
Subject: [PATCH 06/14] Doc update on accumulator regs

---
 hw/rtl/core/VX_uop_sequencer.sv | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv
index 8a53d3f1..ab0298cb 100644
--- a/hw/rtl/core/VX_uop_sequencer.sv
+++ b/hw/rtl/core/VX_uop_sequencer.sv
@@ -225,8 +225,9 @@ end
 
         if (uop_sequencer_if.valid && use_uop &&
             uop_sequencer_if.data.rd  == `NR_BITS'(1)) begin
-            // a little sketchy? but shouldn't create any loop
-            ibuffer_if.data.rd  = ibuffer_if.data.rd  + `NR_BITS'(8); // FIXME: 8 is hardcoded
+            // if rd is '1', use a separate set of 8 fp registers as the
+            // destination accumulator data.
+            ibuffer_if.data.rd  = ibuffer_if.data.rd  + `NR_BITS'(8); // note 8 is hardcoded
             ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + `NR_BITS'(8);
         end
     end

From 72f9dedce3f3cee924856897b5dcaa21db6e6d39 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 15:59:31 -0700
Subject: [PATCH 07/14] tensor: Disable micro-ops for hopper

Have a microarchitectural FSM inside the tensor core handle the set/step
sequencing entirely, instead of expanding the instruction into micro-ops.
---
 hw/rtl/core/VX_uop_sequencer.sv | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv
index ab0298cb..798466bf 100644
--- a/hw/rtl/core/VX_uop_sequencer.sv
+++ b/hw/rtl/core/VX_uop_sequencer.sv
@@ -183,7 +183,13 @@ end
     // merging the 2 always blocks leads to spurious UNOPTFLAT verilator lint,
     // but conceptually they should be linked
     always @(*) begin
+`ifdef EXT_T_HOPPER
+        // for Hopper, disable micro-op blitzing.  Set/step is managed
+        // microarchitecturally in an FSM inside the tensor core.
+        use_uop = 1'b0;
+`else
         use_uop = uop_sequencer_if.valid && uop_sequencer_if.data.ex_type == `EX_BITS'(`EX_TENSOR);
+`endif
 
         if (uop_start) begin
             // 1st cycle of microcoded operation, use op_type to determine entry point into microcode table

From 408a9b5d2adcc64b76beb84c912da57d92b0b63d Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 17:18:01 -0700
Subject: [PATCH 08/14] tensor: Write stall logic for hgmma_wait

The HGMMA_WAIT instruction stalls at issue while inuse_tensor is set.
The bit is set when a preceding HGMMA instruction issues.  Currently
inuse_tensor is never cleared back to zero.
---
 hw/rtl/VX_define.vh          |  5 ++++-
 hw/rtl/core/VX_decode.sv     | 11 +++++++++++
 hw/rtl/core/VX_scoreboard.sv | 16 ++++++++++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh
index bb96a149..61ce41be 100644
--- a/hw/rtl/VX_define.vh
+++ b/hw/rtl/VX_define.vh
@@ -254,7 +254,10 @@
 `define INST_SFU_IS_WCTL(op) (op <= 5)
 `define INST_SFU_IS_CSR(op)  (op >= 6 && op <= 8)
 
-`define INST_TENSOR_HMMA     4'b0000
+`define INST_TENSOR_HMMA       4'b0000
+// Hopper WGMMA-style asynchronous op
+`define INST_TENSOR_HGMMA      4'b0001
+`define INST_TENSOR_HGMMA_WAIT 4'b0010
 
 ///////////////////////////////////////////////////////////////////////////////
 
diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv
index cf21d72f..df3500f2 100644
--- a/hw/rtl/core/VX_decode.sv
+++ b/hw/rtl/core/VX_decode.sv
@@ -543,6 +543,16 @@ module VX_decode  #(
             end
         `ifdef EXT_T_ENABLE
             `INST_EXT4: begin
+            `ifdef EXT_T_HOPPER
+                ex_type = `EX_TENSOR;
+                // tensor core macroop is encoded as r-type
+                if (func3[0]) begin
+                    op_type = `INST_TENSOR_HGMMA_WAIT;
+                end else begin
+                    op_type = `INST_TENSOR_HGMMA;
+                end
+                // rd/rs1/rs2/rs3 unused
+            `else
                 ex_type = `EX_TENSOR;
                 op_type = `INST_TENSOR_HMMA;
                 // tensor core macroop is encoded as r-type
@@ -551,6 +561,7 @@ module VX_decode  #(
                 `USED_IREG (rs1);
                 `USED_IREG (rs2);
                 `USED_IREG (rs3);
+            `endif
             end
         `endif
             default:;
diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv
index 42a876f5..59886098 100644
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@@ -146,6 +146,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         // have an explicit destination register, use a separate status bit.
         reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor;
 
+        wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
+            (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
+
         wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
 
         wire inuse_rd  = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
@@ -208,7 +211,15 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
 
         // NOTE(hansung): why is inuse_rd checked? to prevent WAW?
         wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
+    `ifdef EXT_T_HOPPER
+        wire hgmma_wait = ibuffer_if[i].valid &&
+            (ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
+            (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
+        wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]);
+        wire operands_ready = (~(| operands_busy)) && hgmma_ready;
+    `else
         wire operands_ready = ~(| operands_busy);
+    `endif
         
         wire stg_valid_in, stg_ready_in;
         assign stg_valid_in = ibuffer_if[i].valid && operands_ready;
@@ -238,6 +249,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
                 if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
                     inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
                 end
+            `ifdef EXT_T_HOPPER
+                if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin
+                    inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1;
+                end
+            `endif
             end
         `ifdef PERF_ENABLE
             if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin

From 42b9d23f832d2192231e66dd55a012193ecdc860 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 17:27:51 -0700
Subject: [PATCH 09/14] tensor: Write release logic for hgmma

Upon completion of an op, tensor_core_hopper sends a "ghost" commit
signal down the pipeline with the `wb` and `tensor` bit set in
commit_if.  The scoreboard receives this signal via writeback_if and
resets the inuse_tensor status bit back to zero, which unblocks the
HGMMA_WAIT instruction.
---
 hw/rtl/core/VX_scoreboard.sv         |  7 +++-
 hw/rtl/core/VX_tensor_hopper_core.sv | 56 +++++++++++++---------------
 2 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv
index 59886098..2a39c058 100644
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@@ -146,7 +146,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         // have an explicit destination register, use a separate status bit.
         reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor;
 
-        wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
+        wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
             (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
 
         wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
@@ -213,7 +213,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
     `ifdef EXT_T_HOPPER
         wire hgmma_wait = ibuffer_if[i].valid &&
-            (ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
+            (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
             (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
         wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]);
         wire operands_ready = (~(| operands_busy)) && hgmma_ready;
@@ -250,6 +250,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
                     inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
                 end
             `ifdef EXT_T_HOPPER
+                if (writeback_fire && writeback_if[i].data.tensor) begin
+                    inuse_tensor[writeback_if[i].data.wis] <= 1'b0; // release the warp that wrote back, not the one currently at issue
+                end
                 if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin
                     inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1;
                 end
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index 2ecbea70..dc763d48 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -11,6 +11,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     VX_execute_if.slave execute_if,
     VX_commit_if.master commit_if
 );
+    localparam NUM_LANES = `NUM_THREADS;
     localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary
 
     /* commit_if.data_t parts that we need to keep around:
@@ -21,22 +22,17 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         - wb
         - rd
     */
-
     localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
 
     wire operand_enq_fire = execute_if.valid && execute_if.ready;
     wire commit_if_fire = commit_if.valid && commit_if.ready;
-    wire [DATAW-1:0] execute_if_data_enq = {
-        execute_if.data.uuid,
-        execute_if.data.wid,
-        execute_if.data.tmask,
-        execute_if.data.PC,
-        execute_if.data.wb,
-        execute_if.data.rd
-        // pid/sop/eop set later
-    };
 
-    wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
+    wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
+    wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0]   execute_if_data_wid;
+    wire [`NUM_WARPS-1:0][NUM_LANES-1:0]   execute_if_data_tmask;
+    wire [`NUM_WARPS-1:0][`XLEN-1:0]       execute_if_data_PC;
+    wire [`NUM_WARPS-1:0]                  execute_if_data_wb;
+    wire [`NUM_WARPS-1:0][`NR_BITS-1:0]    execute_if_data_rd;
     logic [DATAW-1:0] execute_if_data_new_rd;
 
     wire [`NUM_WARPS-1:0] metadata_queue_fulls;
@@ -71,8 +67,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
             .reset(reset),
             .push(enq),
             .pop(deq),
-            .data_in(execute_if_data_enq),
-            .data_out(execute_if_data_deq[i]),
+            .data_in({execute_if.data.uuid,  execute_if.data.wid,
+                      execute_if.data.tmask, execute_if.data.PC,
+                      execute_if.data.wb,    execute_if.data.rd}),
+            .data_out({execute_if_data_uuid[i],  execute_if_data_wid[i],
+                       execute_if_data_tmask[i], execute_if_data_PC[i],
+                       execute_if_data_wb[i],    execute_if_data_rd[i]}),
             .empty(metadata_queue_emptys[i]),
             `UNUSED_PIN(alm_empty),
             .full(metadata_queue_fulls[i]),
@@ -108,11 +108,6 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin
             metadata_deq = 1'b1;
         end
-
-        // change rd of the commit data according to state
-        execute_if_data_new_rd =
-            {execute_if_data_deq[0/*FIXME*/][DATAW-1:`NR_BITS],
-             (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state))};
     end
 
     always @(posedge clk) begin
@@ -128,19 +123,18 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
 
     wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
 
-    localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
-    wire [COMMIT_DATAW-1:0] commit_if_data = {
-        // write-back to the correct rd only when eop
-        ((state == 2'b11) ? execute_if_data_deq[0/*FIXME*/] : execute_if_data_new_rd), /* uuid ~ rd */
-        wb_data, /* data */
-        1'b0, /* tensor */
-        1'b0, /* pid */
-        1'b1, /* sop */
-        (state == 2'b11)  /* eop */
-        // 1'b1  /* eop */
-    };
-
-    assign commit_if.data = commit_if_data;
+    assign commit_if.data.uuid   = execute_if_data_uuid[0];
+    assign commit_if.data.wid    = execute_if_data_wid[0];
+    assign commit_if.data.tmask  = execute_if_data_tmask[0];
+    assign commit_if.data.PC     = execute_if_data_PC[0];
+    assign commit_if.data.wb     = (state == 2'b11);
+    // custom rd
+    assign commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state));
+    assign commit_if.data.data   = wb_data;
+    assign commit_if.data.tensor = (state == 2'b11);
+    assign commit_if.data.pid    = 1'b0;
+    assign commit_if.data.sop    = 1'b1;
+    assign commit_if.data.eop    = (state == 2'b11);
 endmodule
 
 `endif

From f7f23e0c05686039cd8a4f0836f82ca653952e25 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 18:00:36 -0700
Subject: [PATCH 10/14] tensor: Doc update

---
 hw/rtl/core/VX_decode.sv | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv
index df3500f2..8d52d450 100644
--- a/hw/rtl/core/VX_decode.sv
+++ b/hw/rtl/core/VX_decode.sv
@@ -551,11 +551,14 @@ module VX_decode  #(
                 end else begin
                     op_type = `INST_TENSOR_HGMMA;
                 end
-                // rd/rs1/rs2/rs3 unused
+                // rd/rs1/rs2/rs3 unused to prevent hazard stalls at the
+                // scoreboard
             `else
                 ex_type = `EX_TENSOR;
                 op_type = `INST_TENSOR_HMMA;
                 // tensor core macroop is encoded as r-type
+                // hazard stall logic in the scoreboard will handle
+                // read-after-write dependency on rd -> rs3
                 use_rd = 1;
                 `USED_IREG (rd);
                 `USED_IREG (rs1);

From 2934b1bd94a670ba0c9588256b55373f555fa5b5 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 20:09:09 -0700
Subject: [PATCH 11/14] tensor: Split execution module from pipeline logic

---
 hw/rtl/core/VX_tensor_core.sv        |   2 +-
 hw/rtl/core/VX_tensor_hopper_core.sv | 113 ++++++++++++++++++---------
 2 files changed, 77 insertions(+), 38 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 1f7a95db..cad70b97 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -55,7 +55,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
         VX_tensor_hopper_core_block #(
             .ISW(1), // FIXME: not block_idx
             .FP16(FP16)
-        ) tensor_hopper_core (
+        ) tensor_hopper_core_block (
             .clk(clk),
             .reset(reset),
             .execute_if(execute_if[block_idx]),
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index dc763d48..8abe463e 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -12,7 +12,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     VX_commit_if.master commit_if
 );
     localparam NUM_LANES = `NUM_THREADS;
-    localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary
+    localparam METADATA_QUEUE_DEPTH = 2; // FIXME: arbitrary
 
     /* commit_if.data_t parts that we need to keep around:
         - uuid
@@ -22,29 +22,23 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         - wb
         - rd
     */
-    localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
-
-    wire operand_enq_fire = execute_if.valid && execute_if.ready;
-    wire commit_if_fire = commit_if.valid && commit_if.ready;
-
     wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
     wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0]   execute_if_data_wid;
     wire [`NUM_WARPS-1:0][NUM_LANES-1:0]   execute_if_data_tmask;
     wire [`NUM_WARPS-1:0][`XLEN-1:0]       execute_if_data_PC;
     wire [`NUM_WARPS-1:0]                  execute_if_data_wb;
     wire [`NUM_WARPS-1:0][`NR_BITS-1:0]    execute_if_data_rd;
-    logic [DATAW-1:0] execute_if_data_new_rd;
 
     wire [`NUM_WARPS-1:0] metadata_queue_fulls;
     wire [`NUM_WARPS-1:0] metadata_queue_emptys;
-    // OR not AND, we don't want any warp full
+    // OR not AND; we don't want any warp to be full
     wire metadata_queue_full = |(metadata_queue_fulls);
     assign execute_if.ready = !metadata_queue_full;
 
     `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
         ("runtime error: WGMMA execute not supported for warps other than 0!"))
 
-    logic metadata_deq;
+    wire metadata_deq;
 
     for (genvar i = 0; i < `NUM_WARPS; i++) begin
         // Metadata queue for commit_if.  This simply copies execute_if's
@@ -55,10 +49,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         // ensure two consecutive dequeues are associated with the same warp for
         // commit. (FIXME: this is not strictly necessary though.)
 
+        wire operand_enq_fire = execute_if.valid && execute_if.ready;
         wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i));
         // FIXME: commit only warp 0
-        wire deq = metadata_deq && commit_if.ready && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
+        wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
 
+        localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
         VX_fifo_queue #(
             .DATAW(DATAW),
             .DEPTH(METADATA_QUEUE_DEPTH)
@@ -85,28 +81,84 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     // the commit stage
     `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
 
+    wire initiate_ready; // FIXME: unused
+    wire writeback_valid;
+    wire writeback_last;
+
+    wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/];
+    // dequeue metadata at the last writeback
+    assign metadata_deq = metadata_valid && writeback_valid && writeback_last;
+
+    VX_tensor_hopper_core #(
+    ) tensor_hopper_core (
+        .clk(clk),
+        .reset(reset),
+
+        .initiate_valid(metadata_valid),
+        .initiate_wid(`NW_WIDTH'(0)/*FIXME*/),
+        .initiate_ready(initiate_ready),
+
+        .writeback_valid(writeback_valid),
+        `UNUSED_PIN(writeback_wid),
+        .writeback_last(writeback_last),
+        .writeback_ready(commit_if.ready)
+    );
+
+    wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
+
+    assign commit_if.valid = writeback_valid;
+    assign commit_if.data.uuid   = execute_if_data_uuid[0];
+    assign commit_if.data.wid    = execute_if_data_wid[0];
+    assign commit_if.data.tmask  = execute_if_data_tmask[0];
+    assign commit_if.data.PC     = execute_if_data_PC[0];
+    assign commit_if.data.wb     = writeback_last;
+    // custom rd
+    assign commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
+    assign commit_if.data.data   = wb_data;
+    assign commit_if.data.tensor = writeback_last;
+    assign commit_if.data.pid    = 1'b0;
+    assign commit_if.data.sop    = 1'b1;
+    // eop is deliberately set so that we don't underflow the pending_instr
+    // buffer in VX_schedule.  An instruction is considered committed only
+    // when the eop bit is set to one (see VX_commit).
+    assign commit_if.data.eop    = writeback_last;
+endmodule
+
+
+// TODO: replace this with a Chisel module
+module VX_tensor_hopper_core #(
+) (
+    input clk,
+    input reset,
+
+    input                 initiate_valid,
+    input [`NW_WIDTH-1:0] initiate_wid,
+    output                initiate_ready,
+
+    output                 writeback_valid,
+    output [`NW_WIDTH-1:0] writeback_wid,
+    // indicates if this is the last writeback for the given wid, in which
+    // case the original HGMMA instruction should be signalled retired
+    output                 writeback_last,
+    input                  writeback_ready
+);
     // dummy FSM that generates commits
-    logic [1:0] state, state_n;
     localparam STATE_IDLE = 4'd0;
+    logic [1:0] state, state_n;
+
+    assign initiate_ready = (state == STATE_IDLE);
 
     always @(*) begin
         state_n = state;
-        metadata_deq = 1'b0;
 
         // when incremented to 1, count up until wrap-around to 0
         if (state != STATE_IDLE) begin
             state_n = state + 1'd1;
-        end else begin
-            // kick-off from idle when execute valid
-            // FIXME: only checks warp 0 for commit!
-            if (~metadata_queue_emptys[0/*FIXME*/]) begin
-                state_n = 4'd1;
-            end
         end
 
-        // dequeue metadata when wrapping around
-        if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin
-            metadata_deq = 1'b1;
+        // kick-off
+        if (initiate_valid && initiate_ready) begin
+            state_n = 4'd1;
         end
     end
 
@@ -118,23 +170,10 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         end
     end
 
-    // assign commit_if.valid = metadata_deq;
-    assign commit_if.valid = (state != STATE_IDLE);
+    assign writeback_valid = (state != STATE_IDLE);
+    assign writeback_wid = '0; // TODO
+    assign writeback_last = (state == 4'd15);
 
-    wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
-
-    assign commit_if.data.uuid   = execute_if_data_uuid[0];
-    assign commit_if.data.wid    = execute_if_data_wid[0];
-    assign commit_if.data.tmask  = execute_if_data_tmask[0];
-    assign commit_if.data.PC     = execute_if_data_PC[0];
-    assign commit_if.data.wb     = (state == 2'b11);
-    // custom rd
-    assign commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state));
-    assign commit_if.data.data   = wb_data;
-    assign commit_if.data.tensor = (state == 2'b11);
-    assign commit_if.data.pid    = 1'b0;
-    assign commit_if.data.sop    = 1'b1;
-    assign commit_if.data.eop    = (state == 2'b11);
 endmodule
 
 `endif

From 717fe7ff2959ba5185a69466b666a6439eb43e13 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 20:24:31 -0700
Subject: [PATCH 12/14] tensor: Fix FSM when commit not ready

---
 hw/rtl/core/VX_tensor_hopper_core.sv | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index 8abe463e..21fad57c 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -144,17 +144,28 @@ module VX_tensor_hopper_core #(
 );
     // dummy FSM that generates commits
     localparam STATE_IDLE = 4'd0;
-    logic [1:0] state, state_n;
+    localparam STATE_FINISH = 4'd15;
+    logic [3:0] state, state_n;
 
     assign initiate_ready = (state == STATE_IDLE);
 
     always @(*) begin
         state_n = state;
 
-        // when incremented to 1, count up until wrap-around to 0
-        if (state != STATE_IDLE) begin
-            state_n = state + 1'd1;
-        end
+        case (state)
+            STATE_IDLE: begin
+                state_n = state;
+            end
+            STATE_FINISH: begin
+                // hold until writeback_ready
+                if (writeback_ready) begin
+                    state_n = STATE_IDLE;
+                end
+            end
+            default: begin
+                state_n = state + 4'd1;
+            end
+        endcase
 
         // kick-off
         if (initiate_valid && initiate_ready) begin
@@ -172,7 +183,7 @@ module VX_tensor_hopper_core #(
 
     assign writeback_valid = (state != STATE_IDLE);
     assign writeback_wid = '0; // TODO
-    assign writeback_last = (state == 4'd15);
+    assign writeback_last = (state == STATE_FINISH);
 
 endmodule
 

From 4dcbc31a88915fff35ccefd00c6e753fa5ef135a Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 21:32:20 -0700
Subject: [PATCH 13/14] tensor: Separate async commit from tensor commit

With this we can prioritize commit of the async hgmma instructions over
the "ghost" commits from the TC.
---
 hw/rtl/core/VX_commit.sv             |   9 ++-
 hw/rtl/core/VX_tensor_hopper_core.sv | 107 +++++++++++++++++++--------
 2 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv
index 9b930818..faca0a2a 100644
--- a/hw/rtl/core/VX_commit.sv
+++ b/hw/rtl/core/VX_commit.sv
@@ -177,17 +177,22 @@ module VX_commit import VX_gpu_pkg::*; #(
     // probably want to change this at some point
     // (i.e. pass a "don't count this towards pending instructions" signal down the pipeline)
     wire [`ISSUE_WIDTH-1:0] final_hmma;
+    // if this is a "ghost" commit generated from the tensor core, don't count
+    // toward committed
+    wire [`ISSUE_WIDTH-1:0] tensor_ghost;
 `ifdef EXT_T_ENABLE
     for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
         // if PC is 0, this means it is not final step of a wmma, shouldn't be committed
         assign final_hmma[i] = (commit_if[i].data.PC != 32'b0); 
+        // handle 'x' with === (NOTE: the compare below uses ==).  FIXME fix uninitialization
+        assign tensor_ghost[i] = (commit_if[i].data.tensor == 1'b1);
     end
 `else
     assign final_hmma = '1;
+    assign tensor_ghost = '0;
 `endif
 
-
-    wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma;
+    wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma & (~tensor_ghost);
 
     VX_pipe_register #(
         .DATAW  (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index 21fad57c..2b2136b6 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -22,12 +22,13 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         - wb
         - rd
     */
-    wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
-    wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0]   execute_if_data_wid;
-    wire [`NUM_WARPS-1:0][NUM_LANES-1:0]   execute_if_data_tmask;
-    wire [`NUM_WARPS-1:0][`XLEN-1:0]       execute_if_data_PC;
-    wire [`NUM_WARPS-1:0]                  execute_if_data_wb;
-    wire [`NUM_WARPS-1:0][`NR_BITS-1:0]    execute_if_data_rd;
+    wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0]    execute_if_data_uuid;
+    wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0]      execute_if_data_wid;
+    wire [`NUM_WARPS-1:0][NUM_LANES-1:0]      execute_if_data_tmask;
+    wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type;
+    wire [`NUM_WARPS-1:0][`XLEN-1:0]          execute_if_data_PC;
+    wire [`NUM_WARPS-1:0]                     execute_if_data_wb;
+    wire [`NUM_WARPS-1:0][`NR_BITS-1:0]       execute_if_data_rd;
 
     wire [`NUM_WARPS-1:0] metadata_queue_fulls;
     wire [`NUM_WARPS-1:0] metadata_queue_emptys;
@@ -38,7 +39,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
         ("runtime error: WGMMA execute not supported for warps other than 0!"))
 
-    wire metadata_deq;
+    logic metadata_deq;
 
     for (genvar i = 0; i < `NUM_WARPS; i++) begin
         // Metadata queue for commit_if.  This simply copies execute_if's
@@ -54,7 +55,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         // FIXME: commit only warp 0
         wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
 
-        localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
+        localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS;
         VX_fifo_queue #(
             .DATAW(DATAW),
             .DEPTH(METADATA_QUEUE_DEPTH)
@@ -64,10 +65,10 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
             .push(enq),
             .pop(deq),
             .data_in({execute_if.data.uuid,  execute_if.data.wid,
-                      execute_if.data.tmask, execute_if.data.PC,
+                      execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
                       execute_if.data.wb,    execute_if.data.rd}),
             .data_out({execute_if_data_uuid[i],  execute_if_data_wid[i],
-                       execute_if_data_tmask[i], execute_if_data_PC[i],
+                       execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i],
                        execute_if_data_wb[i],    execute_if_data_rd[i]}),
             .empty(metadata_queue_emptys[i]),
             `UNUSED_PIN(alm_empty),
@@ -81,47 +82,91 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     // the commit stage
     `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
 
-    wire initiate_ready; // FIXME: unused
+    wire initiate_ready;
     wire writeback_valid;
     wire writeback_last;
+    logic writeback_ready;
 
     wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/];
-    // dequeue metadata at the last writeback
-    assign metadata_deq = metadata_valid && writeback_valid && writeback_last;
+    wire not_wait = metadata_valid && (execute_if_data_op_type[0] != `INST_TENSOR_HGMMA_WAIT);
+    // skip HGMMA_WAIT for kickoff
+    wire initiate_valid = metadata_valid && not_wait;
+
+    // we're recycling execute_if.op_type as operands_if.op_type which might
+    // have a different width; let's be safe
+    `STATIC_ASSERT((`INST_ALU_BITS == `INST_OP_BITS),
+        ("static assertion failed: `INST_ALU_BITS != `INST_OP_BITS"))
 
     VX_tensor_hopper_core #(
     ) tensor_hopper_core (
         .clk(clk),
         .reset(reset),
 
-        .initiate_valid(metadata_valid),
+        .initiate_valid(initiate_valid),
         .initiate_wid(`NW_WIDTH'(0)/*FIXME*/),
         .initiate_ready(initiate_ready),
 
         .writeback_valid(writeback_valid),
         `UNUSED_PIN(writeback_wid),
         .writeback_last(writeback_last),
-        .writeback_ready(commit_if.ready)
+        .writeback_ready(writeback_ready)
     );
 
     wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
 
-    assign commit_if.valid = writeback_valid;
-    assign commit_if.data.uuid   = execute_if_data_uuid[0];
-    assign commit_if.data.wid    = execute_if_data_wid[0];
-    assign commit_if.data.tmask  = execute_if_data_tmask[0];
-    assign commit_if.data.PC     = execute_if_data_PC[0];
-    assign commit_if.data.wb     = writeback_last;
-    // custom rd
-    assign commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
-    assign commit_if.data.data   = wb_data;
-    assign commit_if.data.tensor = writeback_last;
-    assign commit_if.data.pid    = 1'b0;
-    assign commit_if.data.sop    = 1'b1;
-    // eop is deliberately set so that we don't underflow the pending_instr
-    // buffer in VX_schedule.  An instruction is considered committed only
-    // when the eop bit is set to one (see VX_commit).
-    assign commit_if.data.eop    = writeback_last;
+    always @(*) begin
+        // defaults; pid/sop are driven on every path so no latch is inferred
+        metadata_deq       = 1'b0;
+        commit_if.data.pid = 1'b0;
+        commit_if.data.sop = 1'b1;
+
+        // the meta queue gets commit priority over tensor core writeback,
+        // since all HGMMA instructions are asynchronous and should not block
+        if (metadata_valid) begin
+            // block tensor core writeback
+            writeback_ready = 1'b0;
+
+            commit_if.valid       = metadata_valid;
+            commit_if.data.uuid   = execute_if_data_uuid[0];
+            commit_if.data.wid    = execute_if_data_wid[0];
+            commit_if.data.tmask  = execute_if_data_tmask[0];
+            commit_if.data.PC     = execute_if_data_PC[0];
+            commit_if.data.wb     = execute_if_data_wb[0];
+            commit_if.data.rd     = execute_if_data_rd[0];
+            commit_if.data.data   = wb_data; // FIXME ?
+            commit_if.data.tensor = 1'b0; // queued instruction commit, not a ghost
+            commit_if.data.eop    = 1'b1;
+
+            // block meta queue until tensor core is ready.  This will
+            // effectively stall further issue of async HGMMA when tensor core
+            // is busy with too many outstanding requests (depth of meta queue).
+            // be careful to not miss the commit backpressure.
+            metadata_deq = metadata_valid && commit_if.ready && initiate_ready;
+        end else begin
+            // allow tensor core writeback, provided there's no commit
+            // backpressure
+            writeback_ready = commit_if.ready;
+
+            commit_if.valid       = writeback_valid;
+            commit_if.data.uuid   = '0;
+            commit_if.data.wid    = '0; // FIXME
+            commit_if.data.tmask  = {NUM_LANES{1'b1}};
+            commit_if.data.PC     = '0;
+            commit_if.data.wb     = writeback_last;
+            commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
+            commit_if.data.data   = wb_data;
+            // mark as "ghost" commit.  This will prevent this commit from
+            // decrementing from pending_instr buffer
+            commit_if.data.tensor = 1'b1;
+            // eop is deliberately set so that we don't underflow the pending_instr
+            // buffer in VX_schedule.  An instruction is considered committed only
+            // when the eop bit is set to one (see VX_commit).
+            // only the last ghost commit has eop set, which will trigger
+            // scoreboard to clear out the busy bit.
+            commit_if.data.eop    = writeback_last;
+        end
+    end
+
 endmodule
 
 

From 0f06afc3ef7350e82c008f5f25395abf89879213 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 21 Oct 2024 22:37:20 -0700
Subject: [PATCH 14/14] Update doc

---
 hw/rtl/core/VX_commit.sv             | 2 +-
 hw/rtl/core/VX_tensor_hopper_core.sv | 3 ---
 hw/rtl/fpu/VX_tensor_dpu.sv          | 2 +-
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv
index faca0a2a..cf4d92b4 100644
--- a/hw/rtl/core/VX_commit.sv
+++ b/hw/rtl/core/VX_commit.sv
@@ -177,7 +177,7 @@ module VX_commit import VX_gpu_pkg::*; #(
     // probably want to change this at some point
     // (i.e. pass a "don't count this towards pending instructions" signal down the pipeline)
     wire [`ISSUE_WIDTH-1:0] final_hmma;
-    // if this is a "ghost" commit generated from the tensor core, don't count
+    // if this is a "ghost" commit generated at the tensor core, don't count
     // toward committed
     wire [`ISSUE_WIDTH-1:0] tensor_ghost;
 `ifdef EXT_T_ENABLE
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index 2b2136b6..b6302cc3 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -158,9 +158,6 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
             // mark as "ghost" commit.  This will prevent this commit from
             // decrementing from pending_instr buffer
             commit_if.data.tensor = 1'b1;
-            // eop is deliberately set so that we don't underflow the pending_instr
-            // buffer in VX_schedule.  An instruction is considered committed only
-            // when the eop bit is set to one (see VX_commit).
             // only the last ghost commit has eop set, which will trigger
             // scoreboard to clear out the busy bit.
             commit_if.data.eop    = writeback_last;
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 1cd2df84..0b5d846e 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -290,7 +290,7 @@ module VX_tensor_threadgroup #(
         // d_col_sel: 1, 3, 1, 3
         //
         // substep 0:
-        // [ 0 x 2 x ]
+        // [ 0 x 2 x ] (0~3 is 'i', the dpu id)
         // [ 1 x 3 x ]
         // substep 1:
         // [ x 0 x 2 ]