Add dummy code for decoupled Hopper tensor core
Define EXT_T_HOPPER that, when EXT_T_ENABLE is defined, distinguishes whether to instantiate core-coupled Volta-style or decoupled Hopper-style Tensor Core.
This commit is contained in:
@@ -40,8 +40,13 @@
|
||||
`define EXT_F_ENABLE
|
||||
`endif
|
||||
|
||||
// core-coupled tensor core
|
||||
`ifndef EXT_T_DISABLE
|
||||
`define EXT_T_ENABLE
|
||||
// decoupled Hopper-style tensor core
|
||||
`ifndef EXT_T_HOPPER
|
||||
`define EXT_T_HOPPER
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifndef XLEN_32
|
||||
@@ -83,7 +88,7 @@
|
||||
`endif
|
||||
|
||||
`ifndef NUM_CORES
|
||||
`define NUM_CORES 8
|
||||
`define NUM_CORES 4
|
||||
`endif
|
||||
|
||||
`ifndef NUM_WARPS
|
||||
|
||||
@@ -542,16 +542,30 @@ module VX_decode #(
|
||||
endcase
|
||||
end
|
||||
`ifdef EXT_T_ENABLE
|
||||
`INST_EXT4: begin
|
||||
ex_type = `EX_TENSOR;
|
||||
op_type = `INST_TENSOR_HMMA;
|
||||
// tensor core macroop is encoded as r-type
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
`USED_IREG (rs3);
|
||||
end
|
||||
`ifdef EXT_T_HOPPER
|
||||
// TODO
|
||||
`INST_EXT4: begin
|
||||
ex_type = `EX_TENSOR;
|
||||
op_type = `INST_TENSOR_HMMA;
|
||||
// tensor core macroop is encoded as r-type
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
`USED_IREG (rs3);
|
||||
end
|
||||
`else
|
||||
`INST_EXT4: begin
|
||||
ex_type = `EX_TENSOR;
|
||||
op_type = `INST_TENSOR_HMMA;
|
||||
// tensor core macroop is encoded as r-type
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
`USED_IREG (rs3);
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
default:;
|
||||
endcase
|
||||
|
||||
@@ -12,7 +12,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
||||
);
|
||||
localparam BLOCK_SIZE = 1;
|
||||
localparam NUM_LANES = `NUM_THREADS;
|
||||
// FIXME: @perf: PARTIAL_BW==1 increases power instantiating
|
||||
// @perf: PARTIAL_BW==1 increases power instantiating
|
||||
// stream_buffers for ISSUE_WIDTH times
|
||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||
|
||||
@@ -51,16 +51,27 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
||||
);
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||
VX_tensor_core_block #(
|
||||
`ifdef EXT_T_HOPPER
|
||||
VX_tensor_hopper_core_block #(
|
||||
.ISW(1), // FIXME: not block_idx
|
||||
.FP16(FP16)
|
||||
) tensor_hopper_core (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.execute_if(execute_if[block_idx]),
|
||||
.commit_if(commit_block_if[block_idx])
|
||||
);
|
||||
`else
|
||||
VX_tensor_core_block #(
|
||||
.ISW(1), // FIXME: use block_idx
|
||||
.FP16(FP16)
|
||||
) tensor_core (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
|
||||
.execute_if(execute_if[block_idx]),
|
||||
.commit_if(commit_block_if[block_idx])
|
||||
);
|
||||
`endif
|
||||
end
|
||||
|
||||
endmodule
|
||||
@@ -275,7 +286,6 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #(
|
||||
localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
|
||||
wire [COMMIT_DATAW-1:0] commit_if_data = {
|
||||
execute_if_data_deq[wb_wid], /* uuid ~ rd */
|
||||
// execute_if_data_deq, /* uuid ~ rd */
|
||||
subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */
|
||||
1'b0, /* pid */
|
||||
1'b1, /* sop */
|
||||
|
||||
102
hw/rtl/core/VX_tensor_hopper_core.sv
Normal file
102
hw/rtl/core/VX_tensor_hopper_core.sv
Normal file
@@ -0,0 +1,102 @@
|
||||
`ifdef EXT_T_ENABLE
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
||||
parameter ISW,
|
||||
parameter FP16
|
||||
) (
|
||||
input clk,
|
||||
input reset,
|
||||
|
||||
VX_execute_if.slave execute_if,
|
||||
VX_commit_if.master commit_if
|
||||
);
|
||||
localparam METADATA_QUEUE_DEPTH = 2; // FIXME: arbitrary
|
||||
|
||||
/* commit_if.data_t parts that we need to keep around:
|
||||
- uuid
|
||||
- wid
|
||||
- tmask
|
||||
- PC
|
||||
- wb
|
||||
- rd
|
||||
*/
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
|
||||
|
||||
wire operand_enq_fire = execute_if.valid && execute_if.ready;
|
||||
wire commit_if_fire = commit_if.valid && commit_if.ready;
|
||||
wire [DATAW-1:0] execute_if_data_enq = {
|
||||
execute_if.data.uuid,
|
||||
execute_if.data.wid,
|
||||
execute_if.data.tmask,
|
||||
execute_if.data.PC,
|
||||
execute_if.data.wb,
|
||||
execute_if.data.rd
|
||||
// pid/sop/eop set later
|
||||
};
|
||||
|
||||
wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
|
||||
|
||||
wire [`NUM_WARPS-1:0] metadata_queue_fulls;
|
||||
wire [`NUM_WARPS-1:0] metadata_queue_emptys;
|
||||
// OR not AND, we don't want any warp full
|
||||
wire metadata_queue_full = |(metadata_queue_fulls);
|
||||
assign execute_if.ready = !metadata_queue_full;
|
||||
|
||||
`RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
|
||||
("runtime error: WGMMA execute not supported for warps other than 0!"))
|
||||
|
||||
for (genvar i = 0; i < `NUM_WARPS; i++) begin
|
||||
// Metadata queue for commit_if. This simply copies execute_if's
|
||||
// metadata and pops them in conjunction with commit fire.
|
||||
//
|
||||
// This has to be separated per-warp, as otherwise requests from
|
||||
// multiple warps can be enqueued interleaved, which makes it hard to
|
||||
// ensure two consecutive dequeues are associated with the same warp for
|
||||
// commit. (FIXME: this is not strictly necessary though.)
|
||||
|
||||
wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i));
|
||||
// FIXME: commit only warp 0
|
||||
wire deq = commit_if_fire && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW(DATAW),
|
||||
.DEPTH(METADATA_QUEUE_DEPTH)
|
||||
) pending_uops (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.push(enq),
|
||||
.pop(deq),
|
||||
.data_in(execute_if_data_enq),
|
||||
.data_out(execute_if_data_deq[i]),
|
||||
.empty(metadata_queue_emptys[i]),
|
||||
`UNUSED_PIN(alm_empty),
|
||||
.full(metadata_queue_fulls[i]),
|
||||
`UNUSED_PIN(alm_full),
|
||||
`UNUSED_PIN(size)
|
||||
);
|
||||
end
|
||||
|
||||
// this shouldn't really happen unless there's a big contention over
|
||||
// the commit stage
|
||||
`RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
|
||||
|
||||
// FIXME: only checks warp 0 for commit!
|
||||
assign commit_if.valid = ~metadata_queue_emptys[0/*FIXME*/];
|
||||
|
||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
|
||||
|
||||
localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
|
||||
wire [COMMIT_DATAW-1:0] commit_if_data = {
|
||||
execute_if_data_deq[0/*FIXME*/], /* uuid ~ rd */
|
||||
wb_data, /* data */
|
||||
1'b0, /* pid */
|
||||
1'b1, /* sop */
|
||||
1'b1 /* eop */
|
||||
};
|
||||
|
||||
assign commit_if.data = commit_if_data;
|
||||
endmodule
|
||||
|
||||
`endif
|
||||
Reference in New Issue
Block a user