606 lines
26 KiB
Systemverilog
606 lines
26 KiB
Systemverilog
// Copyright © 2019-2023
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
`include "VX_define.vh"
|
|
|
|
// Execute stage of one core.
//
// Instantiates the scalar ALU / LSU / SFU units (plus the FPU when
// EXT_F_ENABLE is set). When EXT_T_ENABLE is set it additionally instantiates
// a second ("tensor") ALU and LSU, a tensor control unit and the tensor core,
// and merges their commit, D-cache and warp-control traffic with the scalar
// units onto the ports declared below.
//
// Parameters:
//   CORE_ID          - forwarded to the sub-units and printed in assertion /
//                      trace messages.
//   TENSOR_FP16      - forwarded to VX_tensor_core as its FP16 parameter.
//   NUM_TENSOR_CORES - forwarded to VX_tensor_core; also sizes the tensor
//                      register-file / smem / tmem ports.
module VX_execute import VX_gpu_pkg::*; #(
    parameter CORE_ID          = 0,
    parameter TENSOR_FP16      = 0,
    parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS
) (
    `SCOPE_IO_DECL

    input wire              clk,
    input wire              reset,

    // Forwarded to the SFU unit.
    input base_dcrs_t       base_dcrs,
    // Forwarded unchanged to both LSU units.
    input wire              downstream_mem_busy,

    // Dcache interface
    VX_mem_bus_if.master    dcache_bus_if [DCACHE_NUM_REQS],

    // commit interface
    VX_commit_csr_if.slave  commit_csr_if,

    // fetch interface
    VX_sched_csr_if.slave   sched_csr_if,

`ifdef PERF_ENABLE
    VX_mem_perf_if.slave    mem_perf_if,
    VX_pipeline_perf_if.slave pipeline_perf_if,
    // Scalar/tensor LSU arbitration counters (all tied to zero when
    // EXT_T_ENABLE is not defined).
    output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs,
    output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs,
    output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls,
    output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls,
    output wire [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls,
`endif

`ifdef EXT_F_ENABLE
    VX_dispatch_if.slave    fpu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master     fpu_commit_if [`ISSUE_WIDTH],
`endif

    // ALU: the first NUM_ALU_BLOCKS branch links are driven by the scalar
    // ALU, the second NUM_ALU_BLOCKS by the tensor-domain ALU.
    VX_dispatch_if.slave    alu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master     alu_commit_if [`ISSUE_WIDTH],
    VX_branch_ctl_if.master branch_ctl_if [2 * `NUM_ALU_BLOCKS],

    // LSU
    VX_dispatch_if.slave    lsu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master     lsu_commit_if [`ISSUE_WIDTH],

    // SFU
    VX_dispatch_if.slave    sfu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master     sfu_commit_if [`ISSUE_WIDTH],
    VX_warp_ctl_if.master   warp_ctl_if,

`ifdef EXT_T_ENABLE
    // Tensor-domain dispatch/commit
    VX_dispatch_if.slave    tensor_alu_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.slave    tensor_lsu_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.slave    tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.slave    tensor_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master     tensor_commit_if [`ISSUE_WIDTH],

    // Side-band outputs of the tensor control unit.
    output wire                     tensor_csr_unlock_valid,
    output wire [`NW_WIDTH-1:0]     tensor_csr_unlock_wid,
    output wire                     tensor_tmc_valid,
    output wire [`NW_WIDTH-1:0]     tensor_tmc_wid,
    output wire [`NUM_THREADS-1:0]  tensor_tmc_tmask,

`ifdef EXT_T_ASYNC
    // Tensor-core memory ports; all are wired straight to VX_tensor_core.
    // The tmem buses are flattened across cores: 9 address bits and
    // NUM_THREADS*XLEN data bits per core.
    VX_tc_rf_if.master      tensor_regfile_if[NUM_TENSOR_CORES],
    VX_tc_bus_if.master     tensor_smem_A_if[NUM_TENSOR_CORES],

    output logic [NUM_TENSOR_CORES-1:0]                         tensor_tmem_A_ren,
    input  logic [NUM_TENSOR_CORES-1:0]                         tensor_tmem_A_rready,
    output logic [NUM_TENSOR_CORES*9-1:0]                       tensor_tmem_A_raddr,
    input  logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0]      tensor_tmem_A_rdata,

    output logic [NUM_TENSOR_CORES-1:0]                         tensor_tmem_C_ren,
    input  logic [NUM_TENSOR_CORES-1:0]                         tensor_tmem_C_rready,
    output logic [NUM_TENSOR_CORES*9-1:0]                       tensor_tmem_C_raddr,
    input  logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0]      tensor_tmem_C_rdata,

    output logic [NUM_TENSOR_CORES-1:0]                         tensor_tmem_C_wen,
    input  logic [NUM_TENSOR_CORES-1:0]                         tensor_tmem_C_wready,
    output logic [NUM_TENSOR_CORES*9-1:0]                       tensor_tmem_C_waddr,
    output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0]      tensor_tmem_C_wdata,
    // One mask bit per 8 data bits (presumably a byte enable - confirm
    // against VX_tensor_core).
    output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0]    tensor_tmem_C_mask,

    VX_tc_bus_if.master     tensor_smem_B_if[NUM_TENSOR_CORES],
`endif
`endif

    // simulation helper signals
    output wire             sim_ebreak,

    // Accumulator side-band, wired straight to the SFU unit.
    input wire [31:0]       acc_read_in,
    output wire [31:0]      acc_write_out,
    output wire             acc_write_en
);
|
|
|
|
`ifdef EXT_F_ENABLE
// FPU <-> CSR side-band links, one per FPU block; connected to both fpu_unit
// and sfu_unit further down in this module.
VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS]();
`endif

`ifdef EXT_T_ENABLE
// ---------------------------------------------------------------------------
// Warp-control merge (tensor builds only)
//
// The SFU unit and the tensor control unit each produce warp-control events
// on their own interface. The shared warp_ctl_if output carries one event per
// cycle, so each source gets a one-entry holding register and a round-robin
// bit picks between them when both have something to send. A source that
// loses arbitration is parked in its holding register; the RUNTIME_ASSERTs at
// the end fire if a source presents a new event while its holding register is
// still occupied and not being drained (that event would be lost).
// ---------------------------------------------------------------------------
VX_warp_ctl_if scalar_warp_ctl_if();
VX_warp_ctl_if tensor_warp_ctl_if();

localparam WARP_CTL_DATAW = `NW_WIDTH + $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);

// Holding registers (payload + occupancy) and the round-robin pointer.
// warp_ctl_rr == 1 gives the tensor source priority on a conflict.
reg [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_r;
reg [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_r;
reg scalar_warp_ctl_valid_r;
reg tensor_warp_ctl_valid_r;
reg warp_ctl_rr;

// Incoming events flattened into a single vector (same field order as the
// unpacking assignment onto warp_ctl_if below).
wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_in = {
    scalar_warp_ctl_if.wid,
    scalar_warp_ctl_if.tmc,
    scalar_warp_ctl_if.wspawn,
    scalar_warp_ctl_if.split,
    scalar_warp_ctl_if.sjoin,
    scalar_warp_ctl_if.barrier
};
wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_in = {
    tensor_warp_ctl_if.wid,
    tensor_warp_ctl_if.tmc,
    tensor_warp_ctl_if.wspawn,
    tensor_warp_ctl_if.split,
    tensor_warp_ctl_if.sjoin,
    tensor_warp_ctl_if.barrier
};

// A source competes when it has either a parked or a fresh event.
wire scalar_warp_ctl_candidate_valid = scalar_warp_ctl_if.valid || scalar_warp_ctl_valid_r;
wire tensor_warp_ctl_candidate_valid = tensor_warp_ctl_if.valid || tensor_warp_ctl_valid_r;

// Tensor wins when it is the only candidate, or on a conflict when the
// round-robin bit favours it.
wire select_tensor_warp_ctl = tensor_warp_ctl_candidate_valid
                           && (warp_ctl_rr || !scalar_warp_ctl_candidate_valid);

// The parked event always goes out before a fresh one from the same source.
wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_out =
    scalar_warp_ctl_valid_r ? scalar_warp_ctl_data_r : scalar_warp_ctl_data_in;
wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_out =
    tensor_warp_ctl_valid_r ? tensor_warp_ctl_data_r : tensor_warp_ctl_data_in;
wire [WARP_CTL_DATAW-1:0] selected_warp_ctl_data =
    select_tensor_warp_ctl ? tensor_warp_ctl_data_out : scalar_warp_ctl_data_out;

// What leaves each source this cycle: its parked entry, or its live input.
wire consume_scalar_warp_ctl_pending = scalar_warp_ctl_valid_r && !select_tensor_warp_ctl;
wire consume_scalar_warp_ctl_input   = scalar_warp_ctl_if.valid && !scalar_warp_ctl_valid_r && !select_tensor_warp_ctl;
wire consume_tensor_warp_ctl_pending = tensor_warp_ctl_valid_r && select_tensor_warp_ctl;
wire consume_tensor_warp_ctl_input   = tensor_warp_ctl_if.valid && !tensor_warp_ctl_valid_r && select_tensor_warp_ctl;

// Holding-register load enables.
//  - occupied: reload only when the parked entry is drained (the live input,
//    if any, takes its place);
//  - empty:    capture a live input that was not forwarded this cycle.
wire scalar_warp_ctl_load = scalar_warp_ctl_valid_r
                          ? consume_scalar_warp_ctl_pending
                          : (scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_input);
wire tensor_warp_ctl_load = tensor_warp_ctl_valid_r
                          ? consume_tensor_warp_ctl_pending
                          : (tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_input);

assign warp_ctl_if.valid = scalar_warp_ctl_candidate_valid || tensor_warp_ctl_candidate_valid;
assign {warp_ctl_if.wid, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.sjoin, warp_ctl_if.barrier} = selected_warp_ctl_data;

always @(posedge clk) begin
    if (reset) begin
        scalar_warp_ctl_valid_r <= 1'b0;
        tensor_warp_ctl_valid_r <= 1'b0;
        warp_ctl_rr             <= 1'b0;
    end else begin
        // Only a real conflict moves the pointer; the loser goes first next.
        if (scalar_warp_ctl_candidate_valid && tensor_warp_ctl_candidate_valid) begin
            warp_ctl_rr <= !select_tensor_warp_ctl;
        end

        // When loading into an empty register the input is valid by
        // construction, so "<= if.valid" covers both load cases.
        if (scalar_warp_ctl_load) begin
            scalar_warp_ctl_valid_r <= scalar_warp_ctl_if.valid;
            scalar_warp_ctl_data_r  <= scalar_warp_ctl_data_in;
        end

        if (tensor_warp_ctl_load) begin
            tensor_warp_ctl_valid_r <= tensor_warp_ctl_if.valid;
            tensor_warp_ctl_data_r  <= tensor_warp_ctl_data_in;
        end
    end
end

// A new event while the holding register is full and not draining is dropped.
`RUNTIME_ASSERT(
    !(scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_pending),
    ("%t: *** core%0d-scalar-warp-ctl-merge-overflow", $time, CORE_ID)
)
`RUNTIME_ASSERT(
    !(tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_pending),
    ("%t: *** core%0d-tensor-warp-ctl-merge-overflow", $time, CORE_ID)
)
`endif
|
|
|
|
// Per-unit copies of reset for the scalar ALU, LSU and SFU instances.
`RESET_RELAY (alu_reset, reset);
`RESET_RELAY (lsu_reset, reset);
`RESET_RELAY (sfu_reset, reset);

// Commit stream of the scalar ALU before it is merged onto alu_commit_if.
VX_commit_if alu_scalar_commit_if[`ISSUE_WIDTH]();

// Scalar ALU: drives the lower NUM_ALU_BLOCKS entries of branch_ctl_if.
VX_alu_unit #(
    .CORE_ID        (CORE_ID)
) alu_unit (
    .clk            (clk),
    .reset          (alu_reset),
    .dispatch_if    (alu_dispatch_if),
    .commit_if      (alu_scalar_commit_if),
    .branch_ctl_if  (branch_ctl_if[0 +: `NUM_ALU_BLOCKS])
);
|
|
|
|
`ifdef EXT_T_ENABLE
// ---------------------------------------------------------------------------
// Tensor-domain ALU and scalar/tensor ALU commit merge
// ---------------------------------------------------------------------------
VX_commit_if alu_tensor_commit_if[`ISSUE_WIDTH]();

`RESET_RELAY (tensor_alu_reset, reset);

// Second ALU instance fed by the tensor dispatch path; it drives the upper
// NUM_ALU_BLOCKS entries of branch_ctl_if.
VX_alu_unit #(
    .CORE_ID        (CORE_ID)
) tensor_alu_unit (
    .clk            (clk),
    .reset          (tensor_alu_reset),
    .dispatch_if    (tensor_alu_dispatch_if),
    .commit_if      (alu_tensor_commit_if),
    .branch_ctl_if  (branch_ctl_if[`NUM_ALU_BLOCKS +: `NUM_ALU_BLOCKS])
);

// Hand-computed width of the commit payload.
// NOTE(review): must match the packed width of VX_commit_if.data - confirm
// against the interface definition whenever that struct changes.
localparam ALU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;

for (genvar isw = 0; isw < `ISSUE_WIDTH; ++isw) begin : g_alu_domain_commit
    // Two-input arbiter per issue slot: input 0 = scalar ALU, input 1 =
    // tensor ALU; registered output.
    VX_stream_arb #(
        .NUM_INPUTS (2),
        .DATAW      (ALU_COMMIT_DATAW),
        .ARBITER    ("R"),
        .OUT_REG    (1)
    ) alu_commit_arb (
        .clk        (clk),
        .reset      (reset),
        .valid_in   ({alu_tensor_commit_if[isw].valid, alu_scalar_commit_if[isw].valid}),
        .data_in    ({alu_tensor_commit_if[isw].data,  alu_scalar_commit_if[isw].data}),
        .ready_in   ({alu_tensor_commit_if[isw].ready, alu_scalar_commit_if[isw].ready}),
        .valid_out  (alu_commit_if[isw].valid),
        .data_out   (alu_commit_if[isw].data),
        .ready_out  (alu_commit_if[isw].ready),
        `UNUSED_PIN (sel_out)
    );

`ifdef DBG_TRACE_CORE_PIPELINE_VCS
    // Debug-only trace for core 0 of commits at two fixed PCs, printed
    // before the arbiter (scalar, tensor) and after it (merged).
    wire scalar_commit_pc_hit = (alu_scalar_commit_if[isw].data.PC == 32'h80000010)
                             || (alu_scalar_commit_if[isw].data.PC == 32'h80000014);
    wire tensor_commit_pc_hit = (alu_tensor_commit_if[isw].data.PC == 32'h80000010)
                             || (alu_tensor_commit_if[isw].data.PC == 32'h80000014);
    wire domain_commit_pc_hit = (alu_commit_if[isw].data.PC == 32'h80000010)
                             || (alu_commit_if[isw].data.PC == 32'h80000014);

    always @(posedge clk) begin
        if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
            if (alu_scalar_commit_if[isw].valid && scalar_commit_pc_hit) begin
                `TRACE(1, ("%d: core%0d-execute-alu-scalar-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
                    $time, CORE_ID, isw, alu_scalar_commit_if[isw].valid, alu_scalar_commit_if[isw].ready,
                    alu_scalar_commit_if[isw].data.wid, alu_scalar_commit_if[isw].data.PC,
                    alu_scalar_commit_if[isw].data.wb, alu_scalar_commit_if[isw].data.rd,
                    alu_scalar_commit_if[isw].data.sop, alu_scalar_commit_if[isw].data.eop,
                    alu_scalar_commit_if[isw].data.uuid));
            end
            if (alu_tensor_commit_if[isw].valid && tensor_commit_pc_hit) begin
                `TRACE(1, ("%d: core%0d-execute-alu-tensor-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
                    $time, CORE_ID, isw, alu_tensor_commit_if[isw].valid, alu_tensor_commit_if[isw].ready,
                    alu_tensor_commit_if[isw].data.wid, alu_tensor_commit_if[isw].data.PC,
                    alu_tensor_commit_if[isw].data.wb, alu_tensor_commit_if[isw].data.rd,
                    alu_tensor_commit_if[isw].data.sop, alu_tensor_commit_if[isw].data.eop,
                    alu_tensor_commit_if[isw].data.uuid));
            end
            if (alu_commit_if[isw].valid && domain_commit_pc_hit) begin
                `TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
                    $time, CORE_ID, isw, alu_commit_if[isw].valid, alu_commit_if[isw].ready,
                    alu_commit_if[isw].data.wid, alu_commit_if[isw].data.PC,
                    alu_commit_if[isw].data.wb, alu_commit_if[isw].data.rd,
                    alu_commit_if[isw].data.sop, alu_commit_if[isw].data.eop,
                    alu_commit_if[isw].data.uuid));
            end
        end
    end
`endif
end
`else
// No tensor domain: the scalar ALU commit stream is the ALU commit stream.
// NOTE(review): branch_ctl_if is declared with 2*NUM_ALU_BLOCKS entries but
// nothing in this file drives the upper NUM_ALU_BLOCKS entries in this
// configuration - confirm the parent ties them off or ignores them.
for (genvar isw = 0; isw < `ISSUE_WIDTH; ++isw) begin : g_alu_commit_passthru
    assign alu_commit_if[isw].valid        = alu_scalar_commit_if[isw].valid;
    assign alu_commit_if[isw].data         = alu_scalar_commit_if[isw].data;
    assign alu_scalar_commit_if[isw].ready = alu_commit_if[isw].ready;

`ifdef DBG_TRACE_CORE_PIPELINE_VCS
    // Debug-only trace for core 0 of commits at two fixed PCs.
    wire domain_commit_pc_hit = (alu_commit_if[isw].data.PC == 32'h80000010)
                             || (alu_commit_if[isw].data.PC == 32'h80000014);

    always @(posedge clk) begin
        if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
            if (alu_commit_if[isw].valid && domain_commit_pc_hit) begin
                `TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
                    $time, CORE_ID, isw, alu_commit_if[isw].valid, alu_commit_if[isw].ready,
                    alu_commit_if[isw].data.wid, alu_commit_if[isw].data.PC,
                    alu_commit_if[isw].data.wb, alu_commit_if[isw].data.rd,
                    alu_commit_if[isw].data.sop, alu_commit_if[isw].data.eop,
                    alu_commit_if[isw].data.uuid));
            end
        end
    end
`endif
end
`endif
|
|
|
|
// ---------------------------------------------------------------------------
// Scope fan-out
//
// Every unit that uses `SCOPE_IO_BIND needs its own slot on the scope switch.
// The tensor build instantiates two LSU units, so it needs two slots; sharing
// slot 0 between both instances would put two drivers on the same scope
// response wire whenever SCOPE is enabled.
// NOTE(review): this relies on the usual Vortex definitions of
// SCOPE_IO_SWITCH / SCOPE_IO_BIND (switch with N ports, bind to port i);
// confirm against VX_scope.vh in this tree.
// ---------------------------------------------------------------------------
`ifdef EXT_T_ENABLE
`SCOPE_IO_SWITCH (2)
`else
`SCOPE_IO_SWITCH (1)
`endif

// Commit stream of the scalar LSU before it is merged onto lsu_commit_if.
VX_commit_if lsu_scalar_commit_if[`ISSUE_WIDTH]();

// Private D-cache request/response bus of the scalar LSU.
VX_mem_bus_if #(
    .DATA_SIZE (DCACHE_WORD_SIZE),
    .TAG_WIDTH (DCACHE_TAG_WIDTH)
) scalar_lsu_bus_if[DCACHE_NUM_REQS]();

VX_lsu_unit #(
    .CORE_ID (CORE_ID)
) lsu_unit (
    `SCOPE_IO_BIND (0)
    .clk                 (clk),
    .reset               (lsu_reset),
    .downstream_mem_busy (downstream_mem_busy),
    .cache_bus_if        (scalar_lsu_bus_if),
    .dispatch_if         (lsu_dispatch_if),
    .commit_if           (lsu_scalar_commit_if)
);

`ifdef EXT_T_ENABLE
// ---------------------------------------------------------------------------
// Tensor-domain LSU and scalar/tensor LSU commit merge
// ---------------------------------------------------------------------------
VX_commit_if lsu_tensor_commit_if[`ISSUE_WIDTH]();

// Private D-cache request/response bus of the tensor LSU.
VX_mem_bus_if #(
    .DATA_SIZE (DCACHE_WORD_SIZE),
    .TAG_WIDTH (DCACHE_TAG_WIDTH)
) tensor_lsu_bus_if[DCACHE_NUM_REQS]();

`RESET_RELAY (tensor_lsu_reset, reset);

VX_lsu_unit #(
    .CORE_ID (CORE_ID)
) tensor_lsu_unit (
    // Own scope slot; slot 0 belongs to the scalar LSU above.
    `SCOPE_IO_BIND (1)
    .clk                 (clk),
    .reset               (tensor_lsu_reset),
    .downstream_mem_busy (downstream_mem_busy),
    .cache_bus_if        (tensor_lsu_bus_if),
    .dispatch_if         (tensor_lsu_dispatch_if),
    .commit_if           (lsu_tensor_commit_if)
);

// Hand-computed width of the commit payload.
// NOTE(review): must match the packed width of VX_commit_if.data - confirm
// against the interface definition whenever that struct changes.
localparam LSU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;

for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_domain_commit
    // Two-input arbiter per issue slot: input 0 = scalar LSU, input 1 =
    // tensor LSU; registered output.
    VX_stream_arb #(
        .NUM_INPUTS (2),
        .DATAW      (LSU_COMMIT_DATAW),
        .ARBITER    ("R"),
        .OUT_REG    (1)
    ) lsu_commit_arb (
        .clk        (clk),
        .reset      (reset),
        .valid_in   ({lsu_tensor_commit_if[i].valid, lsu_scalar_commit_if[i].valid}),
        .ready_in   ({lsu_tensor_commit_if[i].ready, lsu_scalar_commit_if[i].ready}),
        .data_in    ({lsu_tensor_commit_if[i].data,  lsu_scalar_commit_if[i].data}),
        .data_out   (lsu_commit_if[i].data),
        .valid_out  (lsu_commit_if[i].valid),
        .ready_out  (lsu_commit_if[i].ready),
        `UNUSED_PIN (sel_out)
    );
end
|
|
|
|
// ---------------------------------------------------------------------------
// Scalar/tensor D-cache arbitration
//
// The two LSU units share dcache_bus_if. Responses are steered purely by
// lsu_active_domain (not by tag), so only one domain may have reads in flight
// at a time:
//   * while lsu_pending_reads != 0 the request path stays with the domain
//     that owns those reads;
//   * once nothing is outstanding, the requester with traffic wins, and
//     lsu_domain_rr breaks ties by alternating.
// lsu_select_tensor / lsu_active_domain: 0 = scalar LSU, 1 = tensor LSU.
// NOTE(review): a domain that keeps issuing reads keeps lsu_pending_reads
// non-zero and so keeps ownership; confirm the other domain cannot be starved
// in practice.
// ---------------------------------------------------------------------------
wire scalar_lsu_req_any;
wire tensor_lsu_req_any;
wire [DCACHE_NUM_REQS-1:0] scalar_lsu_req_valids;
wire [DCACHE_NUM_REQS-1:0] tensor_lsu_req_valids;
wire [DCACHE_NUM_REQS-1:0] lsu_req_fires;     // request handshake per lane
wire [DCACHE_NUM_REQS-1:0] lsu_rd_req_fires;  // ... restricted to reads (rw == 0)
wire [DCACHE_NUM_REQS-1:0] lsu_rsp_fires;     // response handshake per lane
reg lsu_domain_rr;                            // tie-break: next winner on a conflict
reg lsu_active_domain;                        // owner of the outstanding reads
reg [15:0] lsu_pending_reads;                 // reads issued minus responses received
logic lsu_select_tensor;                      // request-path mux select
logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rd_req_fire_count;
logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rsp_fire_count;

for (genvar lane = 0; lane < DCACHE_NUM_REQS; ++lane) begin : g_lsu_domain_mem
    assign scalar_lsu_req_valids[lane] = scalar_lsu_bus_if[lane].req_valid;
    assign tensor_lsu_req_valids[lane] = tensor_lsu_bus_if[lane].req_valid;

    // Request path: muxed by lsu_select_tensor; the deselected LSU sees
    // req_ready low.
    assign dcache_bus_if[lane].req_valid = lsu_select_tensor ? tensor_lsu_bus_if[lane].req_valid
                                                             : scalar_lsu_bus_if[lane].req_valid;
    assign dcache_bus_if[lane].req_data  = lsu_select_tensor ? tensor_lsu_bus_if[lane].req_data
                                                             : scalar_lsu_bus_if[lane].req_data;
    assign tensor_lsu_bus_if[lane].req_ready = dcache_bus_if[lane].req_ready &&  lsu_select_tensor;
    assign scalar_lsu_bus_if[lane].req_ready = dcache_bus_if[lane].req_ready && !lsu_select_tensor;

    // Response path: data is broadcast, valid/ready follow the owning domain.
    assign tensor_lsu_bus_if[lane].rsp_data  = dcache_bus_if[lane].rsp_data;
    assign scalar_lsu_bus_if[lane].rsp_data  = dcache_bus_if[lane].rsp_data;
    assign tensor_lsu_bus_if[lane].rsp_valid = dcache_bus_if[lane].rsp_valid &&  lsu_active_domain;
    assign scalar_lsu_bus_if[lane].rsp_valid = dcache_bus_if[lane].rsp_valid && !lsu_active_domain;
    assign dcache_bus_if[lane].rsp_ready     = lsu_active_domain ? tensor_lsu_bus_if[lane].rsp_ready
                                                                 : scalar_lsu_bus_if[lane].rsp_ready;

    // Handshake observers used for bookkeeping below.
    assign lsu_req_fires[lane]    = dcache_bus_if[lane].req_valid && dcache_bus_if[lane].req_ready;
    assign lsu_rd_req_fires[lane] = lsu_req_fires[lane] && !dcache_bus_if[lane].req_data.rw;
    assign lsu_rsp_fires[lane]    = dcache_bus_if[lane].rsp_valid && dcache_bus_if[lane].rsp_ready;
end

assign scalar_lsu_req_any = |scalar_lsu_req_valids;
assign tensor_lsu_req_any = |tensor_lsu_req_valids;

// Request-path owner for this cycle.
always @(*) begin
    // Default: nothing outstanding and at most one requester.
    lsu_select_tensor = tensor_lsu_req_any;
    if (lsu_pending_reads != 0) begin
        // Reads in flight: stay with their owner.
        lsu_select_tensor = lsu_active_domain;
    end else if (scalar_lsu_req_any && tensor_lsu_req_any) begin
        // Conflict with nothing in flight: round-robin.
        lsu_select_tensor = lsu_domain_rr;
    end
end

// Population counts of this cycle's read requests and responses.
always @(*) begin
    lsu_rd_req_fire_count = '0;
    lsu_rsp_fire_count    = '0;
    for (integer k = 0; k < DCACHE_NUM_REQS; ++k) begin
        lsu_rd_req_fire_count = lsu_rd_req_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rd_req_fires[k]);
        lsu_rsp_fire_count    = lsu_rsp_fire_count    + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rsp_fires[k]);
    end
end

always @(posedge clk) begin
    if (reset) begin
        lsu_domain_rr     <= 1'b0;
        lsu_active_domain <= 1'b0;
        lsu_pending_reads <= '0;
    end else begin
        // A grant taken while idle advances the tie-break, and claims
        // response ownership if at least one read was issued.
        if ((lsu_pending_reads == 0) && (|lsu_req_fires)) begin
            lsu_domain_rr <= ~lsu_select_tensor;
            if (|lsu_rd_req_fires) begin
                lsu_active_domain <= lsu_select_tensor;
            end
        end
        lsu_pending_reads <= lsu_pending_reads + 16'(lsu_rd_req_fire_count) - 16'(lsu_rsp_fire_count);
    end
end

// A response with no read outstanding cannot be attributed to a domain.
`RUNTIME_ASSERT(
    !(lsu_pending_reads == 0 && (|lsu_rsp_fires)),
    ("%t: *** core%0d-lsu-domain-arb-unmatched-response", $time, CORE_ID)
)
|
|
|
|
`ifdef PERF_ENABLE
// ---------------------------------------------------------------------------
// D-cache arbitration performance counters (tensor builds)
//
// All five counters advance by at most one per clock cycle:
//   *_lsu_reqs       - cycles in which the selected domain completed at
//                      least one request handshake;
//   *_lsu_stalls     - cycles in which that domain had a request pending
//                      while the other domain owned the request path;
//   mem_merge_stalls - cycles in which both domains had a request pending.
// ---------------------------------------------------------------------------
wire any_lsu_req_fire        = |lsu_req_fires;
wire scalar_lsu_req_fire_any = any_lsu_req_fire && !lsu_select_tensor;
wire tensor_lsu_req_fire_any = any_lsu_req_fire &&  lsu_select_tensor;
wire scalar_lsu_merge_stall  = scalar_lsu_req_any &&  lsu_select_tensor;
wire tensor_lsu_merge_stall  = tensor_lsu_req_any && !lsu_select_tensor;
wire mem_merge_stall         = scalar_lsu_req_any && tensor_lsu_req_any;

reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs_r;
reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs_r;
reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls_r;
reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls_r;
reg [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls_r;

always @(posedge clk) begin
    if (reset) begin
        perf_scalar_lsu_reqs_r   <= '0;
        perf_tensor_lsu_reqs_r   <= '0;
        perf_scalar_lsu_stalls_r <= '0;
        perf_tensor_lsu_stalls_r <= '0;
        perf_mem_merge_stalls_r  <= '0;
    end else begin
        perf_scalar_lsu_reqs_r   <= perf_scalar_lsu_reqs_r   + `PERF_CTR_BITS'(scalar_lsu_req_fire_any);
        perf_tensor_lsu_reqs_r   <= perf_tensor_lsu_reqs_r   + `PERF_CTR_BITS'(tensor_lsu_req_fire_any);
        perf_scalar_lsu_stalls_r <= perf_scalar_lsu_stalls_r + `PERF_CTR_BITS'(scalar_lsu_merge_stall);
        perf_tensor_lsu_stalls_r <= perf_tensor_lsu_stalls_r + `PERF_CTR_BITS'(tensor_lsu_merge_stall);
        perf_mem_merge_stalls_r  <= perf_mem_merge_stalls_r  + `PERF_CTR_BITS'(mem_merge_stall);
    end
end

assign perf_scalar_lsu_reqs   = perf_scalar_lsu_reqs_r;
assign perf_tensor_lsu_reqs   = perf_tensor_lsu_reqs_r;
assign perf_scalar_lsu_stalls = perf_scalar_lsu_stalls_r;
assign perf_tensor_lsu_stalls = perf_tensor_lsu_stalls_r;
assign perf_mem_merge_stalls  = perf_mem_merge_stalls_r;
`endif
|
|
`else
// No tensor domain: the scalar LSU owns lsu_commit_if and dcache_bus_if
// outright, so both are plain pass-throughs.
for (genvar isw = 0; isw < `ISSUE_WIDTH; ++isw) begin : g_lsu_commit_passthru
    assign lsu_commit_if[isw].data         = lsu_scalar_commit_if[isw].data;
    assign lsu_commit_if[isw].valid        = lsu_scalar_commit_if[isw].valid;
    assign lsu_scalar_commit_if[isw].ready = lsu_commit_if[isw].ready;
end

for (genvar lane = 0; lane < DCACHE_NUM_REQS; ++lane) begin : g_lsu_mem_passthru
    `ASSIGN_VX_MEM_BUS_IF(dcache_bus_if[lane], scalar_lsu_bus_if[lane]);
end

`ifdef PERF_ENABLE
// The arbitration counters are not implemented in this configuration; all
// five outputs (including the scalar request count) read as zero.
assign perf_mem_merge_stalls  = '0;
assign perf_tensor_lsu_stalls = '0;
assign perf_scalar_lsu_stalls = '0;
assign perf_tensor_lsu_reqs   = '0;
assign perf_scalar_lsu_reqs   = '0;
`endif
`endif
|
|
|
|
`ifdef EXT_F_ENABLE
// Floating-point unit (only built when EXT_F_ENABLE is defined). It commits
// directly on fpu_commit_if and is linked to the SFU through fpu_to_csr_if.
`RESET_RELAY (fpu_reset, reset);

VX_fpu_unit #(
    .CORE_ID        (CORE_ID)
) fpu_unit (
    .clk            (clk),
    .reset          (fpu_reset),
    .dispatch_if    (fpu_dispatch_if),
    .commit_if      (fpu_commit_if),
    .fpu_to_csr_if  (fpu_to_csr_if)
);
`endif
|
|
|
|
// Special-function unit. In tensor builds its warp-control output goes to the
// intermediate scalar_warp_ctl_if, which is merged with the tensor control
// unit's output; otherwise it drives the module's warp_ctl_if port directly.
VX_sfu_unit #(
    .CORE_ID            (CORE_ID)
) sfu_unit (
    .clk                (clk),
    .reset              (sfu_reset),

`ifdef PERF_ENABLE
    .mem_perf_if        (mem_perf_if),
    .pipeline_perf_if   (pipeline_perf_if),
`endif

    .base_dcrs          (base_dcrs),

    .dispatch_if        (sfu_dispatch_if),

`ifdef EXT_F_ENABLE
    .fpu_to_csr_if      (fpu_to_csr_if),
`endif

    .commit_csr_if      (commit_csr_if),
    .sched_csr_if       (sched_csr_if),
`ifdef EXT_T_ENABLE
    .warp_ctl_if        (scalar_warp_ctl_if),
`else
    .warp_ctl_if        (warp_ctl_if),
`endif
    .commit_if          (sfu_commit_if),

    // Accumulator side-band, passed through from the module ports.
    .acc_read_in        (acc_read_in),
    .acc_write_out      (acc_write_out),
    .acc_write_en       (acc_write_en)
);
|
|
|
|
`ifdef EXT_T_ENABLE
// ---------------------------------------------------------------------------
// Tensor control unit, tensor core and their commit merge
// ---------------------------------------------------------------------------
VX_commit_if tensor_core_commit_if[`ISSUE_WIDTH]();
VX_commit_if tensor_ctrl_commit_if[`ISSUE_WIDTH]();

// Tensor control unit: produces the tensor-side warp-control events (merged
// with the SFU's near the top of the module) and the csr-unlock / tmc
// side-band outputs.
VX_tensor_ctrl_unit #(
    .CORE_ID          (CORE_ID)
) tensor_ctrl_unit (
    .clk              (clk),
    .reset            (reset),
    .dispatch_if      (tensor_ctrl_dispatch_if),
    .commit_if        (tensor_ctrl_commit_if),
    .warp_ctl_if      (tensor_warp_ctl_if),
    .csr_unlock_valid (tensor_csr_unlock_valid),
    .csr_unlock_wid   (tensor_csr_unlock_wid),
    .tmc_valid        (tensor_tmc_valid),
    .tmc_wid          (tensor_tmc_wid),
    .tmc_tmask        (tensor_tmc_tmask)
);

// Tensor core; the register-file / smem / tmem ports only exist when
// EXT_T_ASYNC is defined and are wired straight to the module ports.
VX_tensor_core #(
    .FP16             (TENSOR_FP16),
    .NUM_TENSOR_CORES (NUM_TENSOR_CORES)
) tensor_core (
    .clk              (clk),
    .reset            (reset),

    .dispatch_if      (tensor_dispatch_if),
`ifdef EXT_T_ASYNC
    .regfile_if       (tensor_regfile_if),
    .smem_A_if        (tensor_smem_A_if),
    .tmem_A_ren       (tensor_tmem_A_ren),
    .tmem_A_rready    (tensor_tmem_A_rready),
    .tmem_A_raddr     (tensor_tmem_A_raddr),
    .tmem_A_rdata     (tensor_tmem_A_rdata),
    .tmem_C_ren       (tensor_tmem_C_ren),
    .tmem_C_rready    (tensor_tmem_C_rready),
    .tmem_C_raddr     (tensor_tmem_C_raddr),
    .tmem_C_rdata     (tensor_tmem_C_rdata),
    .tmem_C_wen       (tensor_tmem_C_wen),
    .tmem_C_wready    (tensor_tmem_C_wready),
    .tmem_C_waddr     (tensor_tmem_C_waddr),
    .tmem_C_wdata     (tensor_tmem_C_wdata),
    .tmem_C_mask      (tensor_tmem_C_mask),
    .smem_B_if        (tensor_smem_B_if),
`endif
    .commit_if        (tensor_core_commit_if)
);

// Hand-computed width of the commit payload.
// NOTE(review): must match the packed width of VX_commit_if.data - confirm
// against the interface definition whenever that struct changes.
localparam TENSOR_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;

for (genvar isw = 0; isw < `ISSUE_WIDTH; ++isw) begin : g_tensor_commit_arb
    // Two-input arbiter per issue slot: input 0 = tensor core, input 1 =
    // tensor control unit; registered output.
    VX_stream_arb #(
        .NUM_INPUTS (2),
        .DATAW      (TENSOR_COMMIT_DATAW),
        .ARBITER    ("R"),
        .OUT_REG    (1)
    ) tensor_commit_arb (
        .clk        (clk),
        .reset      (reset),
        .valid_in   ({tensor_ctrl_commit_if[isw].valid, tensor_core_commit_if[isw].valid}),
        .data_in    ({tensor_ctrl_commit_if[isw].data,  tensor_core_commit_if[isw].data}),
        .ready_in   ({tensor_ctrl_commit_if[isw].ready, tensor_core_commit_if[isw].ready}),
        .valid_out  (tensor_commit_if[isw].valid),
        .data_out   (tensor_commit_if[isw].data),
        .ready_out  (tensor_commit_if[isw].ready),
        `UNUSED_PIN (sel_out)
    );
end
`endif
|
|
|
|
// Simulation helper signal to get RISC-V tests Pass/Fail status.
// Asserted in the cycle issue slot 0 of the scalar ALU accepts a dispatch
// with wis == 0 that is a branch-class op of type EBREAK or ECALL.
wire alu0_dispatch_fire = alu_dispatch_if[0].valid && alu_dispatch_if[0].ready;
wire alu0_op_is_trap    = (`INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_EBREAK)
                       || (`INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_ECALL);

assign sim_ebreak = alu0_dispatch_fire
                 && (alu_dispatch_if[0].data.wis == 0)
                 && `INST_ALU_IS_BR(alu_dispatch_if[0].data.op_mod)
                 && alu0_op_is_trap;

endmodule
|