diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index ca0d1064..d1c14588 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -81,6 +81,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     localparam METADATA_QUEUE_DEPTH = 2 * `LATENCY_HMMA;
 
     wire [1:0] step = 2'(execute_if.data.op_type);
+    // op_mod is reused to carry the instruction's index within its pair (1 == last)
     wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1));
 
     logic [NUM_OCTETS-1:0] octet_results_valid;
@@ -115,7 +116,6 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
         logic result_valid;
         logic result_ready;
 
-        // op_mod is reused to indicate instruction's id in pair
         VX_tensor_octet #(
             .ISW(ISW),
             .OCTET(i)
diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv
index 130866de..26817b8d 100644
--- a/hw/rtl/core/VX_uop_sequencer.sv
+++ b/hw/rtl/core/VX_uop_sequencer.sv
@@ -128,9 +128,8 @@ module VX_uop_sequencer import VX_gpu_pkg::*; (
         if (uop_sequencer_if.valid && use_uop &&
             uop_sequencer_if.data.rd  == `NR_BITS'(1)) begin
             // a little sketchy? but shouldn't create any loop
-            ibuffer_if.data.rd  = ibuffer_if.data.rd  + `NR_BITS'(8);
+            ibuffer_if.data.rd  = ibuffer_if.data.rd  + `NR_BITS'(8); // FIXME: 8 is hardcoded
             ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + `NR_BITS'(8);
-            $display("yoooooooo! uop rd=%d\n", ibuffer_if.data.rd);
         end
     end
 
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 0155417b..8b7a1c26 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -234,6 +234,9 @@ module VX_tensor_threadgroup #(
         end
     end
 
+    // TODO: Instead of latching each half-result and assembling a full D tile,
+    // we should be able to send the half fragments down to the commit stage
+    // immediately, which would save flop space
     assign D_frag[0][0] = D_reg[0];
     assign D_frag[0][2] = D_reg[1];
     assign D_frag[1][0] = D_reg[2];