diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv
index c4db29d4..de95144f 100644
--- a/hw/rtl/core/VX_core.sv
+++ b/hw/rtl/core/VX_core.sv
@@ -63,6 +63,7 @@ module VX_core import VX_gpu_pkg::*; #(
     VX_decode_if        decode_if();
     VX_sched_csr_if     sched_csr_if();
     VX_decode_sched_if  decode_sched_if();
+    VX_tc_rf_if         tensor_regfile_if();
     VX_commit_sched_if  commit_sched_if();
     VX_commit_csr_if    commit_csr_if();
     VX_branch_ctl_if    branch_ctl_if[`NUM_ALU_BLOCKS]();
@@ -190,6 +191,9 @@ module VX_core import VX_gpu_pkg::*; #(
     `endif
     `ifdef EXT_T_ENABLE
         .tensor_dispatch_if(tensor_dispatch_if),
+    `ifdef EXT_T_HOPPER
+        .tensor_regfile_if (tensor_regfile_if),
+    `endif
     `endif
         .sfu_dispatch_if(sfu_dispatch_if)
     );
@@ -221,8 +225,9 @@ module VX_core import VX_gpu_pkg::*; #(
         .tensor_dispatch_if (tensor_dispatch_if),
         .tensor_commit_if (tensor_commit_if),
     `ifdef EXT_T_HOPPER
-        .tensor_smem_A_if (tensor_smem_A_if),
-        .tensor_smem_B_if (tensor_smem_B_if),
+        .tensor_regfile_if (tensor_regfile_if),
+        .tensor_smem_A_if  (tensor_smem_A_if),
+        .tensor_smem_B_if  (tensor_smem_B_if),
     `endif
     `endif
 
diff --git a/hw/rtl/core/VX_execute.sv b/hw/rtl/core/VX_execute.sv
index c24fea7c..7cbb4ed2 100644
--- a/hw/rtl/core/VX_execute.sv
+++ b/hw/rtl/core/VX_execute.sv
@@ -59,6 +59,7 @@ module VX_execute import VX_gpu_pkg::*; #(
     VX_dispatch_if.slave    tensor_dispatch_if [`ISSUE_WIDTH],
     VX_commit_if.master     tensor_commit_if [`ISSUE_WIDTH],
 `ifdef EXT_T_HOPPER
+    VX_tc_rf_if.master      tensor_regfile_if,
     VX_tc_bus_if.master     tensor_smem_A_if,
     VX_tc_bus_if.master     tensor_smem_B_if,
 `endif
@@ -156,6 +157,7 @@ module VX_execute import VX_gpu_pkg::*; #(
 
         .dispatch_if(tensor_dispatch_if),
 `ifdef EXT_T_HOPPER
+        .regfile_if(tensor_regfile_if),
         .smem_A_if(tensor_smem_A_if),
         .smem_B_if(tensor_smem_B_if),
 `endif
diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv
index 10a46121..8a641c55 100644
--- a/hw/rtl/core/VX_issue.sv
+++ b/hw/rtl/core/VX_issue.sv
@@ -14,7 +14,7 @@
 `include "VX_define.vh"
 `include "VX_trace.vh"
 
-module VX_issue #(
+module VX_issue import VX_gpu_pkg::*; #(
     parameter CORE_ID = 0
 ) (
     `SCOPE_IO_DECL
@@ -36,6 +36,9 @@ module VX_issue #(
 `endif
 `ifdef EXT_T_ENABLE
     VX_dispatch_if.master   tensor_dispatch_if [`ISSUE_WIDTH],
+`ifdef EXT_T_HOPPER
+    VX_tc_rf_if.slave       tensor_regfile_if,
+`endif
 `endif
     VX_dispatch_if.master   sfu_dispatch_if [`ISSUE_WIDTH]
 );
@@ -75,22 +78,6 @@ module VX_issue #(
         .scoreboard_if  (scoreboard_if)
     );
 
-
-    // /*
-    // fake fsm driving tc output
-    reg [11:0] counter;
-    wire tc_rf_valid;
-    wire [4:0] tc_rf_addr;
-    always @(posedge clk) begin
-        if (reset) begin
-            counter <= 12'd1;
-        end else begin
-            counter <= counter + 12'd1;
-        end
-    end
-    assign tc_rf_valid = (counter[6:0] == 7'd0);
-    assign tc_rf_addr = counter[11:7];
-    // */
 `ifdef GPR_DUPLICATED
     VX_operands_dup #(
 `else
@@ -104,11 +91,7 @@ module VX_issue #(
         .writeback_if   (writeback_if),
         .scoreboard_if  (scoreboard_if),
         .operands_if    (operands_if),
-`ifdef GPR_DUPLICATED
-        .tc_rf_valid    ('{`ISSUE_WIDTH{tc_rf_valid}}),
-        .tc_rf_addr     ('{`ISSUE_WIDTH{tc_rf_addr}}),
-        .tc_rf_data     ()
-`endif
+        .tensor_regfile_if (tensor_regfile_if)
     );
 
     VX_dispatch #(
diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv
index e588655d..cfa6e8e4 100644
--- a/hw/rtl/core/VX_operands_dup.sv
+++ b/hw/rtl/core/VX_operands_dup.sv
@@ -24,11 +24,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
 
     VX_writeback_if.slave   writeback_if [`ISSUE_WIDTH],
     VX_ibuffer_if.slave     scoreboard_if [`ISSUE_WIDTH],
-    VX_operands_if.master   operands_if [`ISSUE_WIDTH],
-
-    input wire                                        tc_rf_valid [`ISSUE_WIDTH],
-    input wire [`LOG2UP(`NUM_REGS * ISSUE_RATIO)-1:0] tc_rf_addr  [`ISSUE_WIDTH],
-    output wire         [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data  [`ISSUE_WIDTH]
+    VX_tc_rf_if.slave       tensor_regfile_if,
+    VX_operands_if.master   operands_if [`ISSUE_WIDTH]
 );
     `UNUSED_PARAM (CORE_ID)
     localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
@@ -47,6 +44,18 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
     logic [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] empty2;
     logic [`ISSUE_WIDTH-1:0][2:0] size1;
 
+    wire                                        tc_rf_valid [`ISSUE_WIDTH];
+    wire [`LOG2UP(`NUM_REGS * ISSUE_RATIO)-1:0] tc_rf_addr  [`ISSUE_WIDTH];
+    // FIXME: don't need full ISSUE_WIDTH; only one warp is read at a time
+    // because NUM_BLOCKS == 1
+    wire         [`NUM_THREADS-1:0][`XLEN-1:0]  tc_rf_data  [`ISSUE_WIDTH];
+
+    `STATIC_ASSERT((ISSUE_RATIO == 1),
+        ("static assertion failed: tensor core only supports ISSUE_RATIO == 1"))
+    assign tc_rf_valid = '{`ISSUE_WIDTH{tensor_regfile_if.req_valid}};
+    assign tc_rf_addr  = '{`ISSUE_WIDTH{tensor_regfile_if.req_data.rs}};
+    assign tensor_regfile_if.rsp_data.data = tc_rf_data[0];
+
     for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
 
         always @(posedge clk) begin
@@ -104,7 +113,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
             .size     (size1[i])
         );
         assign operands_if[i].valid = ~empty1[i];
-        assign scoreboard_if[i].ready = (size1[i] < 2'd2) && ~tc_rf_valid[i];
+        assign scoreboard_if[i].ready = (size1[i] < 3'd2) && ~tc_rf_valid[i];
 
         // assert (full1[i] == full2[i]);
         // assert (empty1[i] == empty2[i]);
@@ -207,10 +216,10 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
         end
         
     `ifdef GPR_RESET
-        reg wr_enabled = 0;
+        reg wr_enabled = 1'b0;
         always @(posedge clk) begin
             if (reset) begin
-                wr_enabled <= 1;
+                wr_enabled <= 1'b1;
             end
         end
     `endif
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 6d3a8a4a..916340c4 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -9,6 +9,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
 
     VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
 `ifdef EXT_T_HOPPER
+    VX_tc_rf_if.master   regfile_if,
     VX_tc_bus_if.master  smem_A_if,
     VX_tc_bus_if.master  smem_B_if,
 `endif
@@ -63,6 +64,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
             .clk        (clk),
             .reset      (reset),
             .execute_if (execute_if[block_idx]),
+            .regfile_if (regfile_if),
             .smem_A_if  (smem_A_if),
             .smem_B_if  (smem_B_if),
             .commit_if  (commit_block_if[block_idx])
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index 40a66640..c6f9d4dd 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -9,6 +9,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     input reset,
 
     VX_execute_if.slave execute_if,
+    VX_tc_rf_if.master  regfile_if,
     VX_tc_bus_if.master smem_A_if,
     VX_tc_bus_if.master smem_B_if,
     VX_commit_if.master commit_if
@@ -104,6 +105,21 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     `STATIC_ASSERT((`XLEN == 32),
         ("static assertion failed: tensor_hopper_core only supports XLEN == 32"))
 
+    // /*
+    // fake fsm driving tc rf port
+    reg [11:0] counter;
+    always @(posedge clk) begin
+        if (reset) begin
+            counter <= 12'd1;
+        end else begin
+            counter <= counter + 12'd1;
+        end
+    end
+    assign regfile_if.req_valid = (counter[3:0] != 4'd0);
+    assign regfile_if.req_data.wis = '0;
+    assign regfile_if.req_data.rs = counter[11:7];
+    // */
+
     TensorCoreDecoupled tensor_hopper_core (
         .clock(clk),
         .reset(reset),
diff --git a/hw/rtl/mem/VX_tc_rf_if.sv b/hw/rtl/mem/VX_tc_rf_if.sv
new file mode 100644
index 00000000..b12cd0dc
--- /dev/null
+++ b/hw/rtl/mem/VX_tc_rf_if.sv
@@ -0,0 +1,46 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_define.vh"
+
+interface VX_tc_rf_if import VX_gpu_pkg::*; ();
+    // Register-file read port: a master issues a request, a slave returns per-thread data.
+    typedef struct packed {
+        logic [ISSUE_WIS_W-1:0] wis; // presumably the warp index within an issue slot -- TODO confirm
+        logic [`NR_BITS-1:0]    rs;  // register index; VX_operands_dup forwards it as its tc_rf_addr
+    } req_data_t;
+    // Response payload: one XLEN-bit word for each of the NUM_THREADS threads.
+    typedef struct packed {
+        logic [`NUM_THREADS-1:0][`XLEN-1:0] data;
+    } rsp_data_t;
+    // Request channel, driven by the master. There is no ready signal, so it cannot be back-pressured.
+    logic  req_valid;
+    req_data_t req_data;
+    // Response channel, driven by the slave. NOTE(review): no rsp_valid exists; read latency is implied -- confirm.
+    rsp_data_t rsp_data;
+
+    modport master ( // requester side (e.g. VX_tensor_hopper_core_block)
+        output req_valid,
+        output req_data,
+
+        input  rsp_data
+    );
+
+    modport slave ( // register-file side (e.g. VX_issue / VX_operands_dup)
+        input  req_valid,
+        input  req_data,
+
+        output rsp_data
+    );
+
+endinterface