Merge branch 'tensor-decoupled' into rtl
This commit is contained in:
@@ -40,8 +40,13 @@
|
|||||||
`define EXT_F_ENABLE
|
`define EXT_F_ENABLE
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
// core-coupled tensor core
|
||||||
`ifndef EXT_T_DISABLE
|
`ifndef EXT_T_DISABLE
|
||||||
`define EXT_T_ENABLE
|
`define EXT_T_ENABLE
|
||||||
|
// decoupled Hopper-style tensor core
|
||||||
|
`ifndef EXT_T_HOPPER
|
||||||
|
`define EXT_T_HOPPER
|
||||||
|
`endif
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifndef XLEN_32
|
`ifndef XLEN_32
|
||||||
|
|||||||
@@ -254,7 +254,10 @@
|
|||||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||||
|
|
||||||
`define INST_TENSOR_HMMA 4'b0000
|
`define INST_TENSOR_HMMA 4'b0000
|
||||||
|
// Hopper WGMMA-style asynchronous op
|
||||||
|
`define INST_TENSOR_HGMMA 4'b0001
|
||||||
|
`define INST_TENSOR_HGMMA_WAIT 4'b0010
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ module VX_alu_unit #(
|
|||||||
localparam NUM_LANES = `NUM_ALU_LANES;
|
localparam NUM_LANES = `NUM_ALU_LANES;
|
||||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||||
localparam PID_WIDTH = `UP(PID_BITS);
|
localparam PID_WIDTH = `UP(PID_BITS);
|
||||||
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + 1 + PID_WIDTH + 1 + 1;
|
||||||
localparam RSP_ARB_SIZE = 2 + `EXT_M_ENABLED;
|
localparam RSP_ARB_SIZE = 2 + `EXT_M_ENABLED;
|
||||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||||
|
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value
|
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value
|
||||||
);
|
);
|
||||||
`UNUSED_PARAM (CORE_ID)
|
`UNUSED_PARAM (CORE_ID)
|
||||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
|
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1 + 1;
|
||||||
localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1);
|
localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1);
|
||||||
localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1;
|
localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1;
|
||||||
|
|
||||||
@@ -173,36 +173,26 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
// Committed instructions
|
// Committed instructions
|
||||||
|
|
||||||
// temporary hack to not underflow the pending instructions buffer
|
// prevent underflow of the VX_pending_instr buffer
|
||||||
// relies on 1 cycle delay of arbiter and continuous issuing of tensor instructions,
|
// probably want to change this at some point
|
||||||
// so probably want to change this at some point
|
|
||||||
// (i.e. pass a "don't count this towards pending instructions" signal down the pipeline)
|
// (i.e. pass a "don't count this towards pending instructions" signal down the pipeline)
|
||||||
// logic [`ISSUE_WIDTH-1:0][4:0] hmma_ctr, hmma_ctr_n;
|
|
||||||
wire [`ISSUE_WIDTH-1:0] final_hmma;
|
wire [`ISSUE_WIDTH-1:0] final_hmma;
|
||||||
|
// if this is a "ghost" commit generated at the tensor core, don't count
|
||||||
|
// toward committed
|
||||||
|
wire [`ISSUE_WIDTH-1:0] tensor_ghost;
|
||||||
`ifdef EXT_T_ENABLE
|
`ifdef EXT_T_ENABLE
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
// assign hmma_ctr_n[i] = (tensor_commit_if[i].valid && tensor_commit_if[i].ready) ? hmma_ctr[i] + 5'b1 : hmma_ctr[i];
|
|
||||||
// assign final_hmma[i] = (commit_sel[i] != `EX_BITS'(2) || hmma_ctr == '0);
|
|
||||||
// i suppose this is now a feature and not a bug
|
|
||||||
// if PC is 0, this means it is not final step of a wmma, shouldn't be committed
|
// if PC is 0, this means it is not final step of a wmma, shouldn't be committed
|
||||||
assign final_hmma[i] = (commit_if[i].data.PC != 32'b0);
|
assign final_hmma[i] = (commit_if[i].data.PC != 32'b0);
|
||||||
|
// intended to handle 'x' with === (note: the assign below compares with ==). FIXME: fix uninitialized tensor bit
|
||||||
|
assign tensor_ghost[i] = (commit_if[i].data.tensor == 1'b1);
|
||||||
end
|
end
|
||||||
/*
|
|
||||||
always @(posedge clk) begin
|
|
||||||
if (reset) begin
|
|
||||||
hmma_ctr <= '0;
|
|
||||||
end
|
|
||||||
else begin
|
|
||||||
hmma_ctr <= hmma_ctr_n;
|
|
||||||
end
|
|
||||||
end
|
|
||||||
*/
|
|
||||||
`else
|
`else
|
||||||
assign final_hmma = '1;
|
assign final_hmma = '1;
|
||||||
|
assign tensor_ghost = '0;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma & (~tensor_ghost);
|
||||||
wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma;
|
|
||||||
|
|
||||||
VX_pipe_register #(
|
VX_pipe_register #(
|
||||||
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
||||||
@@ -225,6 +215,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
|
assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
|
||||||
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
||||||
assign writeback_if[i].data.data = commit_if[i].data.data;
|
assign writeback_if[i].data.data = commit_if[i].data.data;
|
||||||
|
assign writeback_if[i].data.tensor = commit_if[i].data.tensor;
|
||||||
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
||||||
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
||||||
assign commit_if[i].ready = 1'b1; // writeback has no backpressure
|
assign commit_if[i].ready = 1'b1; // writeback has no backpressure
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||||||
`UNUSED_PARAM (CORE_ID)
|
`UNUSED_PARAM (CORE_ID)
|
||||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||||
localparam PID_WIDTH = `UP(PID_BITS);
|
localparam PID_WIDTH = `UP(PID_BITS);
|
||||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + PID_WIDTH + 1 + 1;
|
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + 1 + PID_WIDTH + 1 + 1;
|
||||||
|
|
||||||
`UNUSED_VAR (execute_if.data.rs3_data)
|
`UNUSED_VAR (execute_if.data.rs3_data)
|
||||||
|
|
||||||
@@ -174,8 +174,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||||||
.reset (reset),
|
.reset (reset),
|
||||||
.valid_in (csr_req_valid),
|
.valid_in (csr_req_valid),
|
||||||
.ready_in (csr_req_ready),
|
.ready_in (csr_req_ready),
|
||||||
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
|
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, 1'b0/*tensor*/, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
|
||||||
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, csr_commit_data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
|
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, csr_commit_data, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
|
||||||
.valid_out (commit_if.valid),
|
.valid_out (commit_if.valid),
|
||||||
.ready_out (commit_if.ready)
|
.ready_out (commit_if.ready)
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -543,14 +543,28 @@ module VX_decode #(
|
|||||||
end
|
end
|
||||||
`ifdef EXT_T_ENABLE
|
`ifdef EXT_T_ENABLE
|
||||||
`INST_EXT4: begin
|
`INST_EXT4: begin
|
||||||
|
`ifdef EXT_T_HOPPER
|
||||||
|
ex_type = `EX_TENSOR;
|
||||||
|
// tensor core macroop is encoded as r-type
|
||||||
|
if (func3[0]) begin
|
||||||
|
op_type = `INST_TENSOR_HGMMA_WAIT;
|
||||||
|
end else begin
|
||||||
|
op_type = `INST_TENSOR_HGMMA;
|
||||||
|
end
|
||||||
|
// rd/rs1/rs2/rs3 unused to prevent hazard stalls at the
|
||||||
|
// scoreboard
|
||||||
|
`else
|
||||||
ex_type = `EX_TENSOR;
|
ex_type = `EX_TENSOR;
|
||||||
op_type = `INST_TENSOR_HMMA;
|
op_type = `INST_TENSOR_HMMA;
|
||||||
// tensor core macroop is encoded as r-type
|
// tensor core macroop is encoded as r-type
|
||||||
|
// hazard stall logic in the scoreboard will handle
|
||||||
|
// read-after-write dependency on rd -> rs3
|
||||||
use_rd = 1;
|
use_rd = 1;
|
||||||
`USED_IREG (rd);
|
`USED_IREG (rd);
|
||||||
`USED_IREG (rs1);
|
`USED_IREG (rs1);
|
||||||
`USED_IREG (rs2);
|
`USED_IREG (rs2);
|
||||||
`USED_IREG (rs3);
|
`USED_IREG (rs3);
|
||||||
|
`endif
|
||||||
end
|
end
|
||||||
`endif
|
`endif
|
||||||
default:;
|
default:;
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
|||||||
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
|
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
|
||||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||||
localparam PID_WIDTH = `UP(PID_BITS);
|
localparam PID_WIDTH = `UP(PID_BITS);
|
||||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + NUM_LANES * `XLEN + 1 + PID_WIDTH + 1 + 1;
|
||||||
localparam DATA_WIS_OFF = DATAW - (`UUID_WIDTH + `NW_WIDTH);
|
localparam DATA_WIS_OFF = DATAW - (`UUID_WIDTH + `NW_WIDTH);
|
||||||
|
|
||||||
wire [BLOCK_SIZE-1:0] commit_in_valid;
|
wire [BLOCK_SIZE-1:0] commit_in_valid;
|
||||||
@@ -119,6 +119,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
|||||||
commit_tmp_if.data.wb,
|
commit_tmp_if.data.wb,
|
||||||
commit_tmp_if.data.rd,
|
commit_tmp_if.data.rd,
|
||||||
commit_data_r,
|
commit_data_r,
|
||||||
|
commit_tmp_if.data.tensor,
|
||||||
1'b0, // PID
|
1'b0, // PID
|
||||||
commit_tmp_if.data.sop,
|
commit_tmp_if.data.sop,
|
||||||
commit_tmp_if.data.eop
|
commit_tmp_if.data.eop
|
||||||
|
|||||||
@@ -136,14 +136,14 @@ module VX_int_unit #(
|
|||||||
end
|
end
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
VX_elastic_buffer #(
|
||||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + `XLEN + `XLEN + 1 + `INST_BR_BITS + LANE_WIDTH)
|
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + 1 + (NUM_LANES * `XLEN) + `XLEN + `XLEN + 1 + `INST_BR_BITS + LANE_WIDTH)
|
||||||
) rsp_buf (
|
) rsp_buf (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.valid_in (execute_if.valid),
|
.valid_in (execute_if.valid),
|
||||||
.ready_in (execute_if.ready),
|
.ready_in (execute_if.ready),
|
||||||
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, alu_result, execute_if.data.PC, execute_if.data.imm, is_br_op, br_op, tid}),
|
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, 1'b0/*tensor*/, alu_result, execute_if.data.PC, execute_if.data.imm, is_br_op, br_op, tid}),
|
||||||
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, alu_result_r, PC_r, imm_r, is_br_op_r, br_op_r, tid_r}),
|
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.tensor, alu_result_r, PC_r, imm_r, is_br_op_r, br_op_r, tid_r}),
|
||||||
.valid_out (commit_if.valid),
|
.valid_out (commit_if.valid),
|
||||||
.ready_out (commit_if.ready)
|
.ready_out (commit_if.ready)
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
localparam NUM_LANES = `NUM_LSU_LANES;
|
localparam NUM_LANES = `NUM_LSU_LANES;
|
||||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||||
localparam PID_WIDTH = `UP(PID_BITS);
|
localparam PID_WIDTH = `UP(PID_BITS);
|
||||||
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + 1 + PID_WIDTH + 1 + 1;
|
||||||
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_SIZE);
|
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_SIZE);
|
||||||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||||
localparam MEM_ADDRW = `XLEN - MEM_ASHIFT;
|
localparam MEM_ADDRW = `XLEN - MEM_ASHIFT;
|
||||||
@@ -527,15 +527,15 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
// load commit
|
// load commit
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
VX_elastic_buffer #(
|
||||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + 1 + PID_WIDTH + 1 + 1),
|
||||||
.SIZE (2)
|
.SIZE (2)
|
||||||
) ld_rsp_buf (
|
) ld_rsp_buf (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.valid_in (mem_rsp_valid),
|
.valid_in (mem_rsp_valid),
|
||||||
.ready_in (mem_rsp_ready),
|
.ready_in (mem_rsp_ready),
|
||||||
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
|
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_data, 1'b0/*tensor*/, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
|
||||||
.data_out ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
|
.data_out ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.tensor, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
|
||||||
.valid_out (commit_ld_if.valid),
|
.valid_out (commit_ld_if.valid),
|
||||||
.ready_out (commit_ld_if.ready)
|
.ready_out (commit_ld_if.ready)
|
||||||
);
|
);
|
||||||
@@ -545,15 +545,15 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
// store commit
|
// store commit
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
VX_elastic_buffer #(
|
||||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + PID_WIDTH + 1 + 1),
|
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + PID_WIDTH + 1 + 1),
|
||||||
.SIZE (2)
|
.SIZE (2)
|
||||||
) st_rsp_buf (
|
) st_rsp_buf (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.valid_in (mem_req_fire && mem_req_rw),
|
.valid_in (mem_req_fire && mem_req_rw),
|
||||||
.ready_in (st_rsp_ready),
|
.ready_in (st_rsp_ready),
|
||||||
.data_in ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop}),
|
.data_in ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, 1'b0/*tensor*/, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop}),
|
||||||
.data_out ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
|
.data_out ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.tensor, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
|
||||||
.valid_out (commit_st_if.valid),
|
.valid_out (commit_st_if.valid),
|
||||||
.ready_out (commit_st_if.ready)
|
.ready_out (commit_st_if.ready)
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -323,16 +323,16 @@ module VX_muldiv_unit #(
|
|||||||
|
|
||||||
VX_stream_arb #(
|
VX_stream_arb #(
|
||||||
.NUM_INPUTS (2),
|
.NUM_INPUTS (2),
|
||||||
.DATAW (TAGW + (NUM_LANES * `XLEN)),
|
.DATAW (1/*tensor field only in commit*/ + TAGW + (NUM_LANES * `XLEN)),
|
||||||
.OUT_REG (1)
|
.OUT_REG (1)
|
||||||
) rsp_buf (
|
) rsp_buf (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.valid_in ({div_valid_out, mul_valid_out}),
|
.valid_in ({div_valid_out, mul_valid_out}),
|
||||||
.ready_in ({div_ready_out, mul_ready_out}),
|
.ready_in ({div_ready_out, mul_ready_out}),
|
||||||
.data_in ({{div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_pid_out, div_sop_out, div_eop_out, div_result_out},
|
.data_in ({{div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, 1'b0/*tensor*/, div_pid_out, div_sop_out, div_eop_out, div_result_out},
|
||||||
{mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out}}),
|
{mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, 1'b0/*tensor*/, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out}}),
|
||||||
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.data}),
|
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.data}),
|
||||||
.valid_out (commit_if.valid),
|
.valid_out (commit_if.valid),
|
||||||
.ready_out (commit_if.ready),
|
.ready_out (commit_if.ready),
|
||||||
`UNUSED_PIN (sel_out)
|
`UNUSED_PIN (sel_out)
|
||||||
|
|||||||
@@ -269,7 +269,7 @@ module VX_reduce_unit #(
|
|||||||
);
|
);
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
VX_elastic_buffer #(
|
||||||
.DATAW(`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + (`XLEN * NUM_LANES) + PID_WIDTH + 1 + 1)
|
.DATAW(`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + (`XLEN * NUM_LANES) + 1 + PID_WIDTH + 1 + 1)
|
||||||
) output_buffer (
|
) output_buffer (
|
||||||
.clk(clk),
|
.clk(clk),
|
||||||
.reset(reset),
|
.reset(reset),
|
||||||
@@ -277,7 +277,7 @@ module VX_reduce_unit #(
|
|||||||
.ready_in(commit_if_ready),
|
.ready_in(commit_if_ready),
|
||||||
.data_in({execute_if.data.uuid, execute_if.data.wid, stored_tmask, execute_if.data.PC, execute_if.data.wb, execute_if.data.rd, broadcasted_accumulator, stored_pid, stored_sop, stored_eop}),
|
.data_in({execute_if.data.uuid, execute_if.data.wid, stored_tmask, execute_if.data.PC, execute_if.data.wb, execute_if.data.rd, broadcasted_accumulator, 1'b0/*tensor*/, stored_pid, stored_sop, stored_eop}), // tensor bit keeps data_in field-aligned with data_out and the widened DATAW
|
||||||
|
|
||||||
.data_out({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.wb, commit_if.data.rd, commit_if.data.data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
|
.data_out({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.wb, commit_if.data.rd, commit_if.data.data, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
|
||||||
.ready_out(commit_if.ready),
|
.ready_out(commit_if.ready),
|
||||||
.valid_out(commit_if.valid)
|
.valid_out(commit_if.valid)
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -142,6 +142,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
|
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
|
||||||
|
// busy bit for the asynchronous Tensor unit. Since the ISA does not
|
||||||
|
// have an explicit destination register, use a separate status bit.
|
||||||
|
reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor;
|
||||||
|
|
||||||
|
wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
|
||||||
|
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
|
||||||
|
|
||||||
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
||||||
|
|
||||||
@@ -205,7 +211,15 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
// NOTE(hansung): why is inuse_rd checked? to prevent WAW?
|
// NOTE(hansung): why is inuse_rd checked? to prevent WAW?
|
||||||
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
||||||
|
`ifdef EXT_T_HOPPER
|
||||||
|
wire hgmma_wait = ibuffer_if[i].valid &&
|
||||||
|
(ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
|
||||||
|
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
|
||||||
|
wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]);
|
||||||
|
wire operands_ready = (~(| operands_busy)) && hgmma_ready;
|
||||||
|
`else
|
||||||
wire operands_ready = ~(| operands_busy);
|
wire operands_ready = ~(| operands_busy);
|
||||||
|
`endif
|
||||||
|
|
||||||
wire stg_valid_in, stg_ready_in;
|
wire stg_valid_in, stg_ready_in;
|
||||||
assign stg_valid_in = ibuffer_if[i].valid && operands_ready;
|
assign stg_valid_in = ibuffer_if[i].valid && operands_ready;
|
||||||
@@ -227,6 +241,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
inuse_regs <= '0;
|
inuse_regs <= '0;
|
||||||
|
inuse_tensor <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
if (writeback_fire) begin
|
if (writeback_fire) begin
|
||||||
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
|
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
|
||||||
@@ -234,6 +249,14 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
||||||
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
|
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
|
||||||
end
|
end
|
||||||
|
`ifdef EXT_T_HOPPER
|
||||||
|
if (writeback_fire && writeback_if[i].data.tensor) begin
|
||||||
|
inuse_tensor[writeback_if[i].data.wis] <= 1'b0; // clear the busy bit of the warp that is writing back (same index as the inuse_regs clear), not the warp at the ibuffer head
|
||||||
|
end
|
||||||
|
if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin
|
||||||
|
inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1;
|
||||||
|
end
|
||||||
|
`endif
|
||||||
end
|
end
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||||
localparam PID_WIDTH = `UP(PID_BITS);
|
localparam PID_WIDTH = `UP(PID_BITS);
|
||||||
|
|
||||||
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
|
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + 1 + PID_WIDTH + 1 + 1;
|
||||||
localparam RSP_ARB_SIZE = 1 + 1;
|
localparam RSP_ARB_SIZE = 1 + 1;
|
||||||
localparam RSP_ARB_IDX_WCTL = 0;
|
localparam RSP_ARB_IDX_WCTL = 0;
|
||||||
localparam RSP_ARB_IDX_CSRS = 1;
|
localparam RSP_ARB_IDX_CSRS = 1;
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
localparam BLOCK_SIZE = 1;
|
localparam BLOCK_SIZE = 1;
|
||||||
localparam NUM_LANES = `NUM_THREADS;
|
localparam NUM_LANES = `NUM_THREADS;
|
||||||
// FIXME: @perf: PARTIAL_BW==1 increases power instantiating
|
// @perf: PARTIAL_BW==1 increases power instantiating
|
||||||
// stream_buffers for ISSUE_WIDTH times
|
// stream_buffers for ISSUE_WIDTH times
|
||||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||||
|
|
||||||
@@ -51,16 +51,27 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
|
|
||||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||||
VX_tensor_core_block #(
|
`ifdef EXT_T_HOPPER
|
||||||
|
VX_tensor_hopper_core_block #(
|
||||||
.ISW(1), // FIXME: not block_idx
|
.ISW(1), // FIXME: not block_idx
|
||||||
.FP16(FP16)
|
.FP16(FP16)
|
||||||
|
) tensor_hopper_core_block (
|
||||||
|
.clk(clk),
|
||||||
|
.reset(reset),
|
||||||
|
.execute_if(execute_if[block_idx]),
|
||||||
|
.commit_if(commit_block_if[block_idx])
|
||||||
|
);
|
||||||
|
`else
|
||||||
|
VX_tensor_core_block #(
|
||||||
|
.ISW(1), // FIXME: use block_idx
|
||||||
|
.FP16(FP16)
|
||||||
) tensor_core (
|
) tensor_core (
|
||||||
.clk(clk),
|
.clk(clk),
|
||||||
.reset(reset),
|
.reset(reset),
|
||||||
|
|
||||||
.execute_if(execute_if[block_idx]),
|
.execute_if(execute_if[block_idx]),
|
||||||
.commit_if(commit_block_if[block_idx])
|
.commit_if(commit_block_if[block_idx])
|
||||||
);
|
);
|
||||||
|
`endif
|
||||||
end
|
end
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
@@ -272,11 +283,11 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #(
|
|||||||
assign commit_if_ready_override = commit_if.ready && (counter == 2'b0);
|
assign commit_if_ready_override = commit_if.ready && (counter == 2'b0);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
|
localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
|
||||||
wire [COMMIT_DATAW-1:0] commit_if_data = {
|
wire [COMMIT_DATAW-1:0] commit_if_data = {
|
||||||
execute_if_data_deq[wb_wid], /* uuid ~ rd */
|
execute_if_data_deq[wb_wid], /* uuid ~ rd */
|
||||||
// execute_if_data_deq, /* uuid ~ rd */
|
|
||||||
subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */
|
subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */
|
||||||
|
1'b0, /* tensor */
|
||||||
1'b0, /* pid */
|
1'b0, /* pid */
|
||||||
1'b1, /* sop */
|
1'b1, /* sop */
|
||||||
1'b1 /* eop */
|
1'b1 /* eop */
|
||||||
|
|||||||
232
hw/rtl/core/VX_tensor_hopper_core.sv
Normal file
232
hw/rtl/core/VX_tensor_hopper_core.sv
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
`include "VX_fpu_define.vh"
|
||||||
|
|
||||||
|
module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
||||||
|
parameter ISW,
|
||||||
|
parameter FP16
|
||||||
|
) (
|
||||||
|
input clk,
|
||||||
|
input reset,
|
||||||
|
|
||||||
|
VX_execute_if.slave execute_if,
|
||||||
|
VX_commit_if.master commit_if
|
||||||
|
);
|
||||||
|
localparam NUM_LANES = `NUM_THREADS;
|
||||||
|
localparam METADATA_QUEUE_DEPTH = 2; // FIXME: arbitrary
|
||||||
|
|
||||||
|
/* commit_if.data_t parts that we need to keep around:
|
||||||
|
- uuid
|
||||||
|
- wid
|
||||||
|
- tmask
|
||||||
|
- PC
|
||||||
|
- wb
|
||||||
|
- rd
|
||||||
|
*/
|
||||||
|
wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
|
||||||
|
wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid;
|
||||||
|
wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask;
|
||||||
|
wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type;
|
||||||
|
wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC;
|
||||||
|
wire [`NUM_WARPS-1:0] execute_if_data_wb;
|
||||||
|
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd;
|
||||||
|
|
||||||
|
wire [`NUM_WARPS-1:0] metadata_queue_fulls;
|
||||||
|
wire [`NUM_WARPS-1:0] metadata_queue_emptys;
|
||||||
|
// OR not AND; we don't want any warp to be full
|
||||||
|
wire metadata_queue_full = |(metadata_queue_fulls);
|
||||||
|
assign execute_if.ready = !metadata_queue_full;
|
||||||
|
|
||||||
|
`RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
|
||||||
|
("runtime error: WGMMA execute not supported for warps other than 0!"))
|
||||||
|
|
||||||
|
logic metadata_deq;
|
||||||
|
|
||||||
|
for (genvar i = 0; i < `NUM_WARPS; i++) begin
|
||||||
|
// Metadata queue for commit_if. This simply copies execute_if's
|
||||||
|
// metadata and pops them in conjunction with commit fire.
|
||||||
|
//
|
||||||
|
// This has to be separated per-warp, as otherwise requests from
|
||||||
|
// multiple warps can be enqueued interleaved, which makes it hard to
|
||||||
|
// ensure two consecutive dequeues are associated with the same warp for
|
||||||
|
// commit. (FIXME: this is not strictly necessary though.)
|
||||||
|
|
||||||
|
wire operand_enq_fire = execute_if.valid && execute_if.ready;
|
||||||
|
wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i));
|
||||||
|
// FIXME: commit only warp 0
|
||||||
|
wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
|
||||||
|
|
||||||
|
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS;
|
||||||
|
VX_fifo_queue #(
|
||||||
|
.DATAW(DATAW),
|
||||||
|
.DEPTH(METADATA_QUEUE_DEPTH)
|
||||||
|
) pending_uops (
|
||||||
|
.clk(clk),
|
||||||
|
.reset(reset),
|
||||||
|
.push(enq),
|
||||||
|
.pop(deq),
|
||||||
|
.data_in({execute_if.data.uuid, execute_if.data.wid,
|
||||||
|
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
|
||||||
|
execute_if.data.wb, execute_if.data.rd}),
|
||||||
|
.data_out({execute_if_data_uuid[i], execute_if_data_wid[i],
|
||||||
|
execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i],
|
||||||
|
execute_if_data_wb[i], execute_if_data_rd[i]}),
|
||||||
|
.empty(metadata_queue_emptys[i]),
|
||||||
|
`UNUSED_PIN(alm_empty),
|
||||||
|
.full(metadata_queue_fulls[i]),
|
||||||
|
`UNUSED_PIN(alm_full),
|
||||||
|
`UNUSED_PIN(size)
|
||||||
|
);
|
||||||
|
end
|
||||||
|
|
||||||
|
// this shouldn't really happen unless there's a big contention over
|
||||||
|
// the commit stage
|
||||||
|
`RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
|
||||||
|
|
||||||
|
wire initiate_ready;
|
||||||
|
wire writeback_valid;
|
||||||
|
wire writeback_last;
|
||||||
|
logic writeback_ready;
|
||||||
|
|
||||||
|
// Head-of-queue status of the metadata (pending uop) FIFO.
// FIXME: only warp 0's queue is consulted for now.
wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/];
wire not_wait = metadata_valid && (execute_if_data_op_type[0] != `INST_TENSOR_HGMMA_WAIT);
// skip HGMMA_WAIT for kickoff.
// The kickoff is additionally qualified with commit_if.ready so that the
// tensor core is started in exactly the cycle in which the uop is committed
// and dequeued (see metadata_deq below).  Without this, a uop stalled by
// commit backpressure would start the core and still remain at the head of
// the queue.
wire initiate_valid = metadata_valid && not_wait && commit_if.ready;

// we're recycling execute_if.op_type as operands_if.op_type which might
// have a different width; let's be safe
`STATIC_ASSERT((`INST_ALU_BITS == `INST_OP_BITS),
    ("static assertion failed: `INST_ALU_BITS != `INST_OP_BITS"))

VX_tensor_hopper_core #(
) tensor_hopper_core (
    .clk(clk),
    .reset(reset),

    .initiate_valid(initiate_valid),
    .initiate_wid(`NW_WIDTH'(0)/*FIXME*/),
    .initiate_ready(initiate_ready),

    .writeback_valid(writeback_valid),
    `UNUSED_PIN(writeback_wid),
    .writeback_last(writeback_last),
    .writeback_ready(writeback_ready)
);

// placeholder writeback payload: all zeros
wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;

// Commit arbitration between
//   (a) the metadata queue, which retires the HGMMA / HGMMA_WAIT
//       instruction itself, and
//   (b) the tensor core writeback, which emits "ghost" commits.
// Every output is assigned on every path so that this block stays purely
// combinational (no inferred latches).
always @(*) begin
    // defaults shared by both paths
    metadata_deq        = 1'b0;
    writeback_ready     = 1'b0;
    commit_if.data.data = wb_data; // FIXME ?
    commit_if.data.pid  = '0;
    commit_if.data.sop  = 1'b1;

    // If the uop at the head of the meta queue can retire right now, give
    // it priority for commit, since HGMMA instructions are asynchronous
    // and should not block.  A uop can only retire while the tensor core
    // is ready (initiate_ready); this effectively stalls further issue of
    // async HGMMA when the tensor core is busy with too many outstanding
    // requests (depth of meta queue).  While the core is busy we fall
    // through to the writeback path instead, so that the core can drain
    // and become ready again; holding the writeback off in that case would
    // deadlock.
    if (metadata_valid && initiate_ready) begin
        // tensor core writeback stays blocked (writeback_ready = 0)
        commit_if.valid       = 1'b1;
        commit_if.data.uuid   = execute_if_data_uuid[0];
        commit_if.data.wid    = execute_if_data_wid[0];
        commit_if.data.tmask  = execute_if_data_tmask[0];
        commit_if.data.PC     = execute_if_data_PC[0];
        commit_if.data.wb     = execute_if_data_wb[0];
        commit_if.data.rd     = execute_if_data_rd[0];
        commit_if.data.tensor = 1'b0;
        commit_if.data.eop    = 1'b1;

        // dequeue exactly when the commit handshake fires; be careful to
        // not miss the commit backpressure.
        metadata_deq = commit_if.ready;
    end else begin
        // allow tensor core writeback, provided there's no commit
        // backpressure
        writeback_ready = commit_if.ready;

        commit_if.valid      = writeback_valid;
        commit_if.data.uuid  = '0;
        commit_if.data.wid   = '0; // FIXME
        commit_if.data.tmask = {NUM_LANES{1'b1}};
        commit_if.data.PC    = '0;
        commit_if.data.wb    = writeback_last;
        commit_if.data.rd    = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
        // mark as "ghost" commit. This will prevent this commit from
        // decrementing from pending_instr buffer
        commit_if.data.tensor = 1'b1;
        // only the last ghost commit has eop set, which will trigger
        // scoreboard to clear out the busy bit.
        commit_if.data.eop = writeback_last;
    end
end
|
||||||
|
|
||||||
|
endmodule
|
||||||
|
|
||||||
|
|
||||||
|
// TODO: replace this with a Chisel module
|
||||||
|
// Placeholder model of the decoupled (Hopper-style) tensor core.
//
// Once kicked off through the initiate_* handshake, it emits a fixed-length
// burst of writeback beats on the writeback_* valid/ready interface and then
// returns to idle.  No data is computed; only the handshake is modelled.
//
//   initiate_valid/ready : kickoff handshake; ready only while idle
//   initiate_wid         : warp id of the kickoff (currently not used)
//   writeback_valid/ready: one beat is transferred per cycle in which both
//                          are high
//   writeback_wid        : warp id of the writeback (tied to 0, TODO)
//   writeback_last       : high on the final beat of a burst
module VX_tensor_hopper_core #(
) (
    input clk,
    input reset,

    input initiate_valid,
    input [`NW_WIDTH-1:0] initiate_wid,
    output initiate_ready,

    output writeback_valid,
    output [`NW_WIDTH-1:0] writeback_wid,
    // indicates if this is the last writeback for the given wid, in which
    // case the original HGMMA instruction should be signalled retired
    output writeback_last,
    input writeback_ready
);
    // dummy FSM that generates commits: state N (1..15) presents the N-th
    // writeback beat of the burst
    localparam STATE_IDLE   = 4'd0;
    localparam STATE_FIRST  = 4'd1;
    localparam STATE_FINISH = 4'd15;
    logic [3:0] state, state_n;

    assign initiate_ready = (state == STATE_IDLE);

    always @(*) begin
        state_n = state;

        case (state)
            STATE_IDLE: begin
                // kick-off
                if (initiate_valid) begin
                    state_n = STATE_FIRST;
                end
            end
            STATE_FINISH: begin
                // hold the last beat until it is accepted
                if (writeback_ready) begin
                    state_n = STATE_IDLE;
                end
            end
            default: begin
                // writeback_valid is asserted in every non-idle state, so
                // each beat has to be held until writeback_ready accepts
                // it; advancing unconditionally would drop beats under
                // backpressure.
                if (writeback_ready) begin
                    state_n = state + 4'd1;
                end
            end
        endcase
    end

    always @(posedge clk) begin
        if (reset) begin
            state <= '0;
        end else begin
            state <= state_n;
        end
    end

    assign writeback_valid = (state != STATE_IDLE);
    assign writeback_wid   = '0; // TODO
    assign writeback_last  = (state == STATE_FINISH);

endmodule
|
||||||
|
|
||||||
|
`endif
|
||||||
@@ -183,7 +183,13 @@ end
|
|||||||
// merging the 2 always blocks leads to spurious UNOPTFLAT verilator lint,
|
// merging the 2 always blocks leads to spurious UNOPTFLAT verilator lint,
|
||||||
// but conceptually they should be linked
|
// but conceptually they should be linked
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
|
`ifdef EXT_T_HOPPER
|
||||||
|
// for Hopper, disable micro-op blitzing. Set/step is managed
|
||||||
|
// microarchitecturally in an FSM inside the tensor core.
|
||||||
|
use_uop = 1'b0;
|
||||||
|
`else
|
||||||
use_uop = uop_sequencer_if.valid && uop_sequencer_if.data.ex_type == `EX_BITS'(`EX_TENSOR);
|
use_uop = uop_sequencer_if.valid && uop_sequencer_if.data.ex_type == `EX_BITS'(`EX_TENSOR);
|
||||||
|
`endif
|
||||||
|
|
||||||
if (uop_start) begin
|
if (uop_start) begin
|
||||||
// 1st cycle of microcoded operation, use op_type to determine entry point into microcode table
|
// 1st cycle of microcoded operation, use op_type to determine entry point into microcode table
|
||||||
@@ -225,8 +231,9 @@ end
|
|||||||
|
|
||||||
if (uop_sequencer_if.valid && use_uop &&
|
if (uop_sequencer_if.valid && use_uop &&
|
||||||
uop_sequencer_if.data.rd == `NR_BITS'(1)) begin
|
uop_sequencer_if.data.rd == `NR_BITS'(1)) begin
|
||||||
// a little sketchy? but shouldn't create any loop
|
// if rd is '1', use a separate set of 8 fp registers as the
|
||||||
ibuffer_if.data.rd = ibuffer_if.data.rd + `NR_BITS'(8); // FIXME: 8 is hardcoded
|
// destination accumulator data.
|
||||||
|
ibuffer_if.data.rd = ibuffer_if.data.rd + `NR_BITS'(8); // note 8 is hardcoded
|
||||||
ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + `NR_BITS'(8);
|
ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + `NR_BITS'(8);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||||
localparam PID_WIDTH = `UP(PID_BITS);
|
localparam PID_WIDTH = `UP(PID_BITS);
|
||||||
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
|
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
|
||||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + PID_WIDTH + 1 + 1;
|
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + 1 + PID_WIDTH + 1 + 1;
|
||||||
|
|
||||||
`UNUSED_VAR (execute_if.data.rs3_data)
|
`UNUSED_VAR (execute_if.data.rs3_data)
|
||||||
|
|
||||||
@@ -141,8 +141,8 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||||||
.reset (reset),
|
.reset (reset),
|
||||||
.valid_in (execute_if.valid),
|
.valid_in (execute_if.valid),
|
||||||
.ready_in (execute_if.ready),
|
.ready_in (execute_if.ready),
|
||||||
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}}),
|
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, 1'b0/*tensor*/, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}}),
|
||||||
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}}),
|
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}}),
|
||||||
.valid_out (commit_if.valid),
|
.valid_out (commit_if.valid),
|
||||||
.ready_out (commit_if.ready)
|
.ready_out (commit_if.ready)
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -290,7 +290,7 @@ module VX_tensor_threadgroup #(
|
|||||||
// d_col_sel: 1, 3, 1, 3
|
// d_col_sel: 1, 3, 1, 3
|
||||||
//
|
//
|
||||||
// substep 0:
|
// substep 0:
|
||||||
// [ 0 x 2 x ]
|
// [ 0 x 2 x ] (0~3 is 'i', the dpu id)
|
||||||
// [ 1 x 3 x ]
|
// [ 1 x 3 x ]
|
||||||
// substep 1:
|
// substep 1:
|
||||||
// [ x 0 x 2 ]
|
// [ x 0 x 2 ]
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ interface VX_commit_if #(
|
|||||||
logic wb;
|
logic wb;
|
||||||
logic [`NR_BITS-1:0] rd;
|
logic [`NR_BITS-1:0] rd;
|
||||||
logic [NUM_LANES-1:0][`XLEN-1:0] data;
|
logic [NUM_LANES-1:0][`XLEN-1:0] data;
|
||||||
|
logic tensor;
|
||||||
logic [PID_WIDTH-1:0] pid;
|
logic [PID_WIDTH-1:0] pid;
|
||||||
logic sop;
|
logic sop;
|
||||||
logic eop;
|
logic eop;
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ interface VX_writeback_if import VX_gpu_pkg::*; ();
|
|||||||
logic [`XLEN-1:0] PC;
|
logic [`XLEN-1:0] PC;
|
||||||
logic [`NR_BITS-1:0] rd;
|
logic [`NR_BITS-1:0] rd;
|
||||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] data;
|
logic [`NUM_THREADS-1:0][`XLEN-1:0] data;
|
||||||
|
logic tensor;
|
||||||
logic sop;
|
logic sop;
|
||||||
logic eop;
|
logic eop;
|
||||||
} data_t;
|
} data_t;
|
||||||
|
|||||||
Reference in New Issue
Block a user