Add dummy code for decoupled Hopper tensor core

Define EXT_T_HOPPER that, when EXT_T_ENABLE is defined, distinguishes
whether to instantiate core-coupled Volta-style or decoupled
Hopper-style Tensor Core.
This commit is contained in:
Hansung Kim
2024-10-07 17:10:59 -07:00
parent da54162241
commit 4cac1adf7d
4 changed files with 146 additions and 15 deletions

View File

@@ -40,8 +40,13 @@
`define EXT_F_ENABLE
`endif
// core-coupled tensor core
`ifndef EXT_T_DISABLE
`define EXT_T_ENABLE
// decoupled Hopper-style tensor core
`ifndef EXT_T_HOPPER
`define EXT_T_HOPPER
`endif
`endif
`ifndef XLEN_32
@@ -83,7 +88,7 @@
`endif
`ifndef NUM_CORES
`define NUM_CORES 8
`define NUM_CORES 4
`endif
`ifndef NUM_WARPS

View File

@@ -542,6 +542,8 @@ module VX_decode #(
endcase
end
`ifdef EXT_T_ENABLE
`ifdef EXT_T_HOPPER
// TODO
`INST_EXT4: begin
ex_type = `EX_TENSOR;
op_type = `INST_TENSOR_HMMA;
@@ -552,6 +554,18 @@ module VX_decode #(
`USED_IREG (rs2);
`USED_IREG (rs3);
end
`else
`INST_EXT4: begin
ex_type = `EX_TENSOR;
op_type = `INST_TENSOR_HMMA;
// tensor core macroop is encoded as r-type
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
`USED_IREG (rs3);
end
`endif
`endif
default:;
endcase

View File

@@ -12,7 +12,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
);
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_THREADS;
// FIXME: @perf: PARTIAL_BW==1 increases power instantiating
// @perf: PARTIAL_BW==1 increases power instantiating
// stream_buffers for ISSUE_WIDTH times
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
@@ -51,16 +51,27 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
);
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
VX_tensor_core_block #(
`ifdef EXT_T_HOPPER
VX_tensor_hopper_core_block #(
.ISW(1), // FIXME: not block_idx
.FP16(FP16)
) tensor_hopper_core (
.clk(clk),
.reset(reset),
.execute_if(execute_if[block_idx]),
.commit_if(commit_block_if[block_idx])
);
`else
VX_tensor_core_block #(
.ISW(1), // FIXME: use block_idx
.FP16(FP16)
) tensor_core (
.clk(clk),
.reset(reset),
.execute_if(execute_if[block_idx]),
.commit_if(commit_block_if[block_idx])
);
`endif
end
endmodule
@@ -275,7 +286,6 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #(
localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
wire [COMMIT_DATAW-1:0] commit_if_data = {
execute_if_data_deq[wb_wid], /* uuid ~ rd */
// execute_if_data_deq, /* uuid ~ rd */
subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */
1'b0, /* pid */
1'b1, /* sop */

View File

@@ -0,0 +1,102 @@
`ifdef EXT_T_ENABLE
`include "VX_fpu_define.vh"
module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
parameter ISW,
parameter FP16
) (
input clk,
input reset,
VX_execute_if.slave execute_if,
VX_commit_if.master commit_if
);
localparam METADATA_QUEUE_DEPTH = 2; // FIXME: arbitrary
/* commit_if.data_t parts that we need to keep around:
- uuid
- wid
- tmask
- PC
- wb
- rd
*/
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
wire operand_enq_fire = execute_if.valid && execute_if.ready;
wire commit_if_fire = commit_if.valid && commit_if.ready;
wire [DATAW-1:0] execute_if_data_enq = {
execute_if.data.uuid,
execute_if.data.wid,
execute_if.data.tmask,
execute_if.data.PC,
execute_if.data.wb,
execute_if.data.rd
// pid/sop/eop set later
};
wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
wire [`NUM_WARPS-1:0] metadata_queue_fulls;
wire [`NUM_WARPS-1:0] metadata_queue_emptys;
// OR not AND, we don't want any warp full
wire metadata_queue_full = |(metadata_queue_fulls);
assign execute_if.ready = !metadata_queue_full;
`RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
("runtime error: WGMMA execute not supported for warps other than 0!"))
for (genvar i = 0; i < `NUM_WARPS; i++) begin
// Metadata queue for commit_if. This simply copies execute_if's
// metadata and pops them in conjunction with commit fire.
//
// This has to be separated per-warp, as otherwise requests from
// multiple warps can be enqueued interleaved, which makes it hard to
// ensure two consecutive dequeues are associated with the same warp for
// commit. (FIXME: this is not strictly necessary though.)
wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i));
// FIXME: commit only warp 0
wire deq = commit_if_fire && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
VX_fifo_queue #(
.DATAW(DATAW),
.DEPTH(METADATA_QUEUE_DEPTH)
) pending_uops (
.clk(clk),
.reset(reset),
.push(enq),
.pop(deq),
.data_in(execute_if_data_enq),
.data_out(execute_if_data_deq[i]),
.empty(metadata_queue_emptys[i]),
`UNUSED_PIN(alm_empty),
.full(metadata_queue_fulls[i]),
`UNUSED_PIN(alm_full),
`UNUSED_PIN(size)
);
end
// this shouldn't really happen unless there's a big contention over
// the commit stage
`RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
// FIXME: only checks warp 0 for commit!
assign commit_if.valid = ~metadata_queue_emptys[0/*FIXME*/];
wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
wire [COMMIT_DATAW-1:0] commit_if_data = {
execute_if_data_deq[0/*FIXME*/], /* uuid ~ rd */
wb_data, /* data */
1'b0, /* pid */
1'b1, /* sop */
1'b1 /* eop */
};
assign commit_if.data = commit_if_data;
endmodule
`endif