Files
vortex/hw/rtl/core/VX_execute.sv

606 lines
26 KiB
Systemverilog

// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_execute: execute stage of a core.
// Instantiates the ALU, LSU and SFU units, plus the FPU unit when EXT_F_ENABLE
// is defined and the tensor ALU / LSU / control / core units when EXT_T_ENABLE
// is defined, and connects their commit, dcache and warp-control outputs to
// the ports declared below.
module VX_execute import VX_gpu_pkg::*; #(
// Core index; forwarded to the execution units and printed in assert/trace messages.
parameter CORE_ID = 0,
// Forwarded to VX_tensor_core as its FP16 parameter.
parameter TENSOR_FP16 = 0,
// Forwarded to VX_tensor_core; also sizes the EXT_T_ASYNC tensor ports below.
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS
) (
`SCOPE_IO_DECL
input wire clk,
input wire reset,
// Forwarded unchanged to the SFU unit.
input base_dcrs_t base_dcrs,
// Forwarded unchanged to the LSU unit(s).
input wire downstream_mem_busy,
// Dcache interface
// Connected to the scalar LSU; with EXT_T_ENABLE it is shared between the
// scalar and tensor LSUs through a domain multiplexer.
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
// commit interface
VX_commit_csr_if.slave commit_csr_if,
// fetch interface
VX_sched_csr_if.slave sched_csr_if,
`ifdef PERF_ENABLE
// Both perf interfaces are passed straight to the SFU unit.
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
// LSU domain-merge counters. All five are tied to zero when EXT_T_ENABLE is
// not defined.
//   *_lsu_reqs        : cycles in which at least one dcache request lane fired
//                       while that domain was selected.
//   *_lsu_stalls      : cycles in which that domain had a request valid while
//                       the other domain was selected.
//   perf_mem_merge_stalls : cycles in which both domains had a request valid.
output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs,
output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs,
output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls,
output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls,
output wire [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls,
`endif
`ifdef EXT_F_ENABLE
// FPU dispatch/commit; connected directly to the FPU unit.
VX_dispatch_if.slave fpu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master fpu_commit_if [`ISSUE_WIDTH],
`endif
// Scalar ALU dispatch. alu_commit_if carries the scalar ALU commits, merged
// with the tensor ALU commits when EXT_T_ENABLE is defined.
VX_dispatch_if.slave alu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master alu_commit_if [`ISSUE_WIDTH],
// Entries [0 +: NUM_ALU_BLOCKS] are connected to the scalar ALU; entries
// [NUM_ALU_BLOCKS +: NUM_ALU_BLOCKS] are connected to the tensor ALU, which
// exists only with EXT_T_ENABLE.
// NOTE(review): without EXT_T_ENABLE nothing in this module connects the upper
// half of this array - confirm the parent tolerates that.
VX_branch_ctl_if.master branch_ctl_if [2 * `NUM_ALU_BLOCKS],
// Scalar LSU dispatch. lsu_commit_if carries the scalar LSU commits, merged
// with the tensor LSU commits when EXT_T_ENABLE is defined.
VX_dispatch_if.slave lsu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master lsu_commit_if [`ISSUE_WIDTH],
// SFU dispatch/commit; connected directly to the SFU unit.
VX_dispatch_if.slave sfu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master sfu_commit_if [`ISSUE_WIDTH],
// Connected directly to the SFU unit, or (EXT_T_ENABLE) driven by the merge
// of the SFU and tensor-control warp-control streams.
VX_warp_ctl_if.master warp_ctl_if,
`ifdef EXT_T_ENABLE
// Dispatch inputs of the tensor ALU, tensor LSU, tensor control unit and
// tensor core, in that order.
VX_dispatch_if.slave tensor_alu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.slave tensor_lsu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.slave tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
// Arbitrated commits of the tensor core and the tensor control unit.
VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
// Outputs of the tensor control unit (csr_unlock_* and tmc_* ports).
output wire tensor_csr_unlock_valid,
output wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid,
output wire tensor_tmc_valid,
output wire [`NW_WIDTH-1:0] tensor_tmc_wid,
output wire [`NUM_THREADS-1:0] tensor_tmc_tmask,
`ifdef EXT_T_ASYNC
// All EXT_T_ASYNC ports below are wired one-to-one to the same-named ports of
// VX_tensor_core (without the "tensor_" prefix).
VX_tc_rf_if.master tensor_regfile_if[NUM_TENSOR_CORES],
VX_tc_bus_if.master tensor_smem_A_if[NUM_TENSOR_CORES],
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_ren,
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_rready,
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_A_raddr,
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_A_rdata,
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_ren,
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_rready,
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_raddr,
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wen,
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wready,
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_waddr,
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
VX_tc_bus_if.master tensor_smem_B_if[NUM_TENSOR_CORES],
`endif
`endif
// simulation helper signals
// sim_ebreak is derived from scalar ALU dispatch slot 0; the acc_* signals are
// connected directly to the SFU unit.
output wire sim_ebreak,
input wire [31:0] acc_read_in,
output wire [31:0] acc_write_out,
output wire acc_write_en
);
// Link between the FPU and the SFU: this same interface array is connected to
// the fpu_to_csr_if port of both fpu_unit and sfu_unit further down.
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS]();
`endif
// Warp-control merge (EXT_T_ENABLE only).
// The SFU unit drives scalar_warp_ctl_if and the tensor control unit drives
// tensor_warp_ctl_if. This logic forwards at most one of the two messages per
// cycle onto the warp_ctl_if port. Each source has a one-entry holding
// register (*_valid_r / *_data_r) for a message that arrived in a cycle where
// the other source was forwarded.
`ifdef EXT_T_ENABLE
VX_warp_ctl_if scalar_warp_ctl_if();
VX_warp_ctl_if tensor_warp_ctl_if();
// Packed width of one message: wid plus the tmc, wspawn, split, sjoin and
// barrier fields.
localparam WARP_CTL_DATAW = `NW_WIDTH + $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_in;
wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_in;
// Holding registers. The data registers are not reset; they are only read
// when the matching *_valid_r bit is set.
reg [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_r;
reg [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_r;
reg scalar_warp_ctl_valid_r;
reg tensor_warp_ctl_valid_r;
// Tie-break bit: when both sources have a candidate, 1 selects tensor and 0
// selects scalar.
reg warp_ctl_rr;
// Pack the incoming messages; the field order here must match the unpacking
// onto warp_ctl_if below.
assign scalar_warp_ctl_data_in = {scalar_warp_ctl_if.wid, scalar_warp_ctl_if.tmc, scalar_warp_ctl_if.wspawn, scalar_warp_ctl_if.split, scalar_warp_ctl_if.sjoin, scalar_warp_ctl_if.barrier};
assign tensor_warp_ctl_data_in = {tensor_warp_ctl_if.wid, tensor_warp_ctl_if.tmc, tensor_warp_ctl_if.wspawn, tensor_warp_ctl_if.split, tensor_warp_ctl_if.sjoin, tensor_warp_ctl_if.barrier};
// A source is a candidate if it holds a buffered message or presents a new one.
wire scalar_warp_ctl_candidate_valid = scalar_warp_ctl_valid_r || scalar_warp_ctl_if.valid;
wire tensor_warp_ctl_candidate_valid = tensor_warp_ctl_valid_r || tensor_warp_ctl_if.valid;
// Tensor is forwarded when it is the only candidate, or when both are
// candidates and the tie-break bit favors it.
wire select_tensor_warp_ctl = tensor_warp_ctl_candidate_valid && (!scalar_warp_ctl_candidate_valid || warp_ctl_rr);
// Within one source, the buffered message goes out before a newly arriving one.
wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_out = scalar_warp_ctl_valid_r ? scalar_warp_ctl_data_r : scalar_warp_ctl_data_in;
wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_out = tensor_warp_ctl_valid_r ? tensor_warp_ctl_data_r : tensor_warp_ctl_data_in;
wire [WARP_CTL_DATAW-1:0] selected_warp_ctl_data = select_tensor_warp_ctl ? tensor_warp_ctl_data_out : scalar_warp_ctl_data_out;
// "pending" = the buffered message is forwarded this cycle;
// "input"   = the new message is forwarded directly, bypassing the buffer.
wire consume_scalar_warp_ctl_pending = !select_tensor_warp_ctl && scalar_warp_ctl_valid_r;
wire consume_scalar_warp_ctl_input = !select_tensor_warp_ctl && !scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid;
wire consume_tensor_warp_ctl_pending = select_tensor_warp_ctl && tensor_warp_ctl_valid_r;
wire consume_tensor_warp_ctl_input = select_tensor_warp_ctl && !tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid;
// The merged output is valid whenever either source has a candidate; no ready
// signal is used here, so one message is forwarded every such cycle.
assign warp_ctl_if.valid = scalar_warp_ctl_candidate_valid || tensor_warp_ctl_candidate_valid;
assign {warp_ctl_if.wid, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.sjoin, warp_ctl_if.barrier} = selected_warp_ctl_data;
always @(posedge clk) begin
if (reset) begin
scalar_warp_ctl_valid_r <= 1'b0;
tensor_warp_ctl_valid_r <= 1'b0;
warp_ctl_rr <= 1'b0;
end else begin
// On contention, give the next tie-break to the source that was not
// forwarded this cycle.
if (scalar_warp_ctl_candidate_valid && tensor_warp_ctl_candidate_valid) begin
warp_ctl_rr <= !select_tensor_warp_ctl;
end
// Scalar holding register.
if (scalar_warp_ctl_valid_r) begin
// Buffer full: when it drains, refill it in the same cycle with any
// newly arriving message. If it does not drain, it is left unchanged.
if (consume_scalar_warp_ctl_pending) begin
scalar_warp_ctl_valid_r <= scalar_warp_ctl_if.valid;
scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in;
end
// Buffer empty: capture a new message only if it was not forwarded directly.
end else if (scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_input) begin
scalar_warp_ctl_valid_r <= 1'b1;
scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in;
end
// Tensor holding register; same policy as the scalar one.
if (tensor_warp_ctl_valid_r) begin
if (consume_tensor_warp_ctl_pending) begin
tensor_warp_ctl_valid_r <= tensor_warp_ctl_if.valid;
tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in;
end
end else if (tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_input) begin
tensor_warp_ctl_valid_r <= 1'b1;
tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in;
end
end
end
// Overflow checks: a new message arriving while that source's buffer is full
// and not draining has nowhere to go (the update logic above leaves the buffer
// unchanged in that case, so the new message would be dropped).
// RUNTIME_ASSERT is defined elsewhere; presumably it reports the message when
// the condition is false - confirm against its definition.
`RUNTIME_ASSERT(
!(scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_pending),
("%t: *** core%0d-scalar-warp-ctl-merge-overflow", $time, CORE_ID)
)
`RUNTIME_ASSERT(
!(tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_pending),
("%t: *** core%0d-tensor-warp-ctl-merge-overflow", $time, CORE_ID)
)
`endif
// Commit stream produced by the scalar ALU, one entry per issue slot. It is
// routed to alu_commit_if further down (directly, or through an arbiter when
// the tensor extension is enabled).
VX_commit_if alu_scalar_commit_if[`ISSUE_WIDTH]();

// Per-unit reset nets for the scalar ALU, LSU and SFU. RESET_RELAY is defined
// elsewhere; it is used here to produce each named net from the module reset.
`RESET_RELAY (alu_reset, reset);
`RESET_RELAY (lsu_reset, reset);
`RESET_RELAY (sfu_reset, reset);

// Scalar ALU. It uses the lower NUM_ALU_BLOCKS entries of branch_ctl_if.
VX_alu_unit #(
    .CORE_ID (CORE_ID)
) alu_unit (
    .clk           (clk),
    .reset         (alu_reset),
    .dispatch_if   (alu_dispatch_if),
    .commit_if     (alu_scalar_commit_if),
    .branch_ctl_if (branch_ctl_if[0 +: `NUM_ALU_BLOCKS])
);
`ifdef EXT_T_ENABLE
// Tensor-domain ALU: a second VX_alu_unit fed by tensor_alu_dispatch_if. It has
// its own reset net and uses the upper NUM_ALU_BLOCKS entries of branch_ctl_if.
`RESET_RELAY (tensor_alu_reset, reset);
VX_commit_if alu_tensor_commit_if[`ISSUE_WIDTH]();

VX_alu_unit #(
    .CORE_ID (CORE_ID)
) tensor_alu_unit (
    .clk           (clk),
    .reset         (tensor_alu_reset),
    .dispatch_if   (tensor_alu_dispatch_if),
    .commit_if     (alu_tensor_commit_if),
    .branch_ctl_if (branch_ctl_if[`NUM_ALU_BLOCKS +: `NUM_ALU_BLOCKS])
);

// Width of the commit payload handed to the arbiter below.
// NOTE(review): this is spelled out by hand and must equal the width of
// VX_commit_if.data - confirm against the interface definition.
localparam ALU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS
                            + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;

// Per issue slot: merge the scalar (input 0) and tensor (input 1) ALU commit
// streams into alu_commit_if through a two-input stream arbiter.
for (genvar i = 0; i < `ISSUE_WIDTH; i = i + 1) begin : g_alu_domain_commit
    VX_stream_arb #(
        .NUM_INPUTS (2),
        .DATAW      (ALU_COMMIT_DATAW),
        .ARBITER    ("R"),
        .OUT_REG    (1)
    ) alu_commit_arb (
        .clk       (clk),
        .reset     (reset),
        .valid_in  ({alu_tensor_commit_if[i].valid, alu_scalar_commit_if[i].valid}),
        .data_in   ({alu_tensor_commit_if[i].data, alu_scalar_commit_if[i].data}),
        .ready_in  ({alu_tensor_commit_if[i].ready, alu_scalar_commit_if[i].ready}),
        .valid_out (alu_commit_if[i].valid),
        .data_out  (alu_commit_if[i].data),
        .ready_out (alu_commit_if[i].ready),
        `UNUSED_PIN (sel_out)
    );

`ifdef DBG_TRACE_CORE_PIPELINE_VCS
    // Debug-only trace for core 0 after TRACE_STARTTIME: prints the scalar,
    // tensor and merged commit streams whenever a valid commit carries
    // PC 0x80000010 or 0x80000014.
    always @(posedge clk) begin
        if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
            if (alu_scalar_commit_if[i].valid
             && ((alu_scalar_commit_if[i].data.PC == 32'h80000010) || (alu_scalar_commit_if[i].data.PC == 32'h80000014))) begin
                `TRACE(1, ("%d: core%0d-execute-alu-scalar-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
                    $time, CORE_ID, i, alu_scalar_commit_if[i].valid, alu_scalar_commit_if[i].ready,
                    alu_scalar_commit_if[i].data.wid, alu_scalar_commit_if[i].data.PC,
                    alu_scalar_commit_if[i].data.wb, alu_scalar_commit_if[i].data.rd,
                    alu_scalar_commit_if[i].data.sop, alu_scalar_commit_if[i].data.eop,
                    alu_scalar_commit_if[i].data.uuid));
            end
            if (alu_tensor_commit_if[i].valid
             && ((alu_tensor_commit_if[i].data.PC == 32'h80000010) || (alu_tensor_commit_if[i].data.PC == 32'h80000014))) begin
                `TRACE(1, ("%d: core%0d-execute-alu-tensor-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
                    $time, CORE_ID, i, alu_tensor_commit_if[i].valid, alu_tensor_commit_if[i].ready,
                    alu_tensor_commit_if[i].data.wid, alu_tensor_commit_if[i].data.PC,
                    alu_tensor_commit_if[i].data.wb, alu_tensor_commit_if[i].data.rd,
                    alu_tensor_commit_if[i].data.sop, alu_tensor_commit_if[i].data.eop,
                    alu_tensor_commit_if[i].data.uuid));
            end
            if (alu_commit_if[i].valid
             && ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin
                `TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
                    $time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready,
                    alu_commit_if[i].data.wid, alu_commit_if[i].data.PC,
                    alu_commit_if[i].data.wb, alu_commit_if[i].data.rd,
                    alu_commit_if[i].data.sop, alu_commit_if[i].data.eop,
                    alu_commit_if[i].data.uuid));
            end
        end
    end
`endif
end
`else
// Without the tensor extension the scalar ALU commit stream is wired straight
// through to alu_commit_if, one issue slot at a time.
for (genvar i = 0; i < `ISSUE_WIDTH; i = i + 1) begin : g_alu_commit_passthru
    assign alu_scalar_commit_if[i].ready = alu_commit_if[i].ready;
    assign alu_commit_if[i].data         = alu_scalar_commit_if[i].data;
    assign alu_commit_if[i].valid        = alu_scalar_commit_if[i].valid;

`ifdef DBG_TRACE_CORE_PIPELINE_VCS
    // Debug-only trace for core 0 after TRACE_STARTTIME: prints valid commits
    // whose PC is 0x80000010 or 0x80000014.
    always @(posedge clk) begin
        if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
            if (alu_commit_if[i].valid
             && ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin
                `TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
                    $time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready,
                    alu_commit_if[i].data.wid, alu_commit_if[i].data.PC,
                    alu_commit_if[i].data.wb, alu_commit_if[i].data.rd,
                    alu_commit_if[i].data.sop, alu_commit_if[i].data.eop,
                    alu_commit_if[i].data.uuid));
            end
        end
    end
`endif
end
`endif
`SCOPE_IO_SWITCH (1)

// Memory request/response bus driven by the scalar LSU, one entry per dcache
// request lane. It reaches dcache_bus_if further down (directly, or through
// the scalar/tensor domain multiplexer when the tensor extension is enabled).
VX_mem_bus_if #(
    .DATA_SIZE (DCACHE_WORD_SIZE),
    .TAG_WIDTH (DCACHE_TAG_WIDTH)
) scalar_lsu_bus_if[DCACHE_NUM_REQS]();

// Commit stream produced by the scalar LSU, one entry per issue slot.
VX_commit_if lsu_scalar_commit_if[`ISSUE_WIDTH]();

// Scalar load/store unit.
VX_lsu_unit #(
    .CORE_ID (CORE_ID)
) lsu_unit (
    `SCOPE_IO_BIND (0)
    .clk                 (clk),
    .reset               (lsu_reset),
    .downstream_mem_busy (downstream_mem_busy),
    .dispatch_if         (lsu_dispatch_if),
    .commit_if           (lsu_scalar_commit_if),
    .cache_bus_if        (scalar_lsu_bus_if)
);
`ifdef EXT_T_ENABLE
// Tensor-domain LSU: a second VX_lsu_unit fed by tensor_lsu_dispatch_if, with
// its own reset net, memory bus and commit stream.
`RESET_RELAY (tensor_lsu_reset, reset);

VX_mem_bus_if #(
    .DATA_SIZE (DCACHE_WORD_SIZE),
    .TAG_WIDTH (DCACHE_TAG_WIDTH)
) tensor_lsu_bus_if[DCACHE_NUM_REQS]();

VX_commit_if lsu_tensor_commit_if[`ISSUE_WIDTH]();

VX_lsu_unit #(
    .CORE_ID (CORE_ID)
) tensor_lsu_unit (
    `SCOPE_IO_BIND (0)
    .clk                 (clk),
    .reset               (tensor_lsu_reset),
    .downstream_mem_busy (downstream_mem_busy),
    .dispatch_if         (tensor_lsu_dispatch_if),
    .commit_if           (lsu_tensor_commit_if),
    .cache_bus_if        (tensor_lsu_bus_if)
);

// Width of the commit payload handed to the arbiter below.
// NOTE(review): this is spelled out by hand and must equal the width of
// VX_commit_if.data - confirm against the interface definition.
localparam LSU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS
                            + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;

// Per issue slot: merge the scalar (input 0) and tensor (input 1) LSU commit
// streams into lsu_commit_if through a two-input stream arbiter.
for (genvar i = 0; i < `ISSUE_WIDTH; i = i + 1) begin : g_lsu_domain_commit
    VX_stream_arb #(
        .NUM_INPUTS (2),
        .DATAW      (LSU_COMMIT_DATAW),
        .ARBITER    ("R"),
        .OUT_REG    (1)
    ) lsu_commit_arb (
        .clk       (clk),
        .reset     (reset),
        .valid_in  ({lsu_tensor_commit_if[i].valid, lsu_scalar_commit_if[i].valid}),
        .data_in   ({lsu_tensor_commit_if[i].data, lsu_scalar_commit_if[i].data}),
        .ready_in  ({lsu_tensor_commit_if[i].ready, lsu_scalar_commit_if[i].ready}),
        .valid_out (lsu_commit_if[i].valid),
        .data_out  (lsu_commit_if[i].data),
        .ready_out (lsu_commit_if[i].ready),
        `UNUSED_PIN (sel_out)
    );
end
// Dcache domain multiplexer (EXT_T_ENABLE only).
// Connects dcache_bus_if to either the scalar or the tensor LSU bus. All lanes
// switch together. While read requests are outstanding the selection is
// locked to the domain that issued them, so responses can be routed using
// lsu_active_domain alone.
// True when the domain has a request valid on at least one lane.
wire scalar_lsu_req_any;
wire tensor_lsu_req_any;
wire [DCACHE_NUM_REQS-1:0] scalar_lsu_req_valids;
wire [DCACHE_NUM_REQS-1:0] tensor_lsu_req_valids;
// Per-lane handshake completions on dcache_bus_if: any request, read requests
// only (req_data.rw == 0), and responses.
wire [DCACHE_NUM_REQS-1:0] lsu_req_fires;
wire [DCACHE_NUM_REQS-1:0] lsu_rd_req_fires;
wire [DCACHE_NUM_REQS-1:0] lsu_rsp_fires;
// Domain to prefer when both request and no reads are outstanding (1 = tensor).
reg lsu_domain_rr;
// Domain that owns the outstanding reads (1 = tensor); steers the responses.
reg lsu_active_domain;
// Read requests fired minus responses fired.
// NOTE(review): assumes every fired read lane produces exactly one response
// and that writes produce none - confirm against the dcache behavior.
reg [15:0] lsu_pending_reads;
// Domain connected to the dcache request channel this cycle (1 = tensor).
logic lsu_select_tensor;
// Number of lanes that fired a read request / a response this cycle.
logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rd_req_fire_count;
logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rsp_fire_count;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_domain_mem
assign scalar_lsu_req_valids[i] = scalar_lsu_bus_if[i].req_valid;
assign tensor_lsu_req_valids[i] = tensor_lsu_bus_if[i].req_valid;
// Request channel: forward the selected domain, and back-pressure the other
// one by holding its req_ready low.
assign dcache_bus_if[i].req_valid = lsu_select_tensor ? tensor_lsu_bus_if[i].req_valid : scalar_lsu_bus_if[i].req_valid;
assign dcache_bus_if[i].req_data = lsu_select_tensor ? tensor_lsu_bus_if[i].req_data : scalar_lsu_bus_if[i].req_data;
assign scalar_lsu_bus_if[i].req_ready = !lsu_select_tensor && dcache_bus_if[i].req_ready;
assign tensor_lsu_bus_if[i].req_ready = lsu_select_tensor && dcache_bus_if[i].req_ready;
// Response channel: the data is broadcast to both domains, but only the
// active domain sees rsp_valid and supplies rsp_ready.
assign scalar_lsu_bus_if[i].rsp_valid = !lsu_active_domain && dcache_bus_if[i].rsp_valid;
assign scalar_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data;
assign tensor_lsu_bus_if[i].rsp_valid = lsu_active_domain && dcache_bus_if[i].rsp_valid;
assign tensor_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data;
assign dcache_bus_if[i].rsp_ready = lsu_active_domain ? tensor_lsu_bus_if[i].rsp_ready : scalar_lsu_bus_if[i].rsp_ready;
assign lsu_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready;
assign lsu_rd_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && !dcache_bus_if[i].req_data.rw;
assign lsu_rsp_fires[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
end
assign scalar_lsu_req_any = |scalar_lsu_req_valids;
assign tensor_lsu_req_any = |tensor_lsu_req_valids;
always @(*) begin
// Selection priority:
//   1. reads outstanding  -> stay on the domain that owns them;
//   2. both domains valid -> follow the round-robin bit;
//   3. otherwise          -> tensor if it is requesting, else scalar.
// NOTE(review): in cases 2 and 3 the selection is purely combinational, so the
// domain presented on dcache_bus_if can change while a request is still
// stalled (req_valid high, req_ready low), e.g. when the other domain starts
// requesting and lsu_domain_rr favors it. Confirm the downstream bus tolerates
// req_data changing before the handshake completes.
if (lsu_pending_reads != 0) begin
lsu_select_tensor = lsu_active_domain;
end else if (scalar_lsu_req_any && tensor_lsu_req_any) begin
lsu_select_tensor = lsu_domain_rr;
end else begin
lsu_select_tensor = tensor_lsu_req_any;
end
// Population counts of the per-lane fire vectors.
lsu_rd_req_fire_count = '0;
lsu_rsp_fire_count = '0;
for (integer i = 0; i < DCACHE_NUM_REQS; ++i) begin
lsu_rd_req_fire_count = lsu_rd_req_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rd_req_fires[i]);
lsu_rsp_fire_count = lsu_rsp_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rsp_fires[i]);
end
end
always @(posedge clk) begin
if (reset) begin
lsu_domain_rr <= 1'b0;
lsu_active_domain <= 1'b0;
lsu_pending_reads <= '0;
end else begin
// Only when no reads are outstanding can the domain change: on any request
// fire, hand the next preference to the other domain; on a read fire, record
// the issuing domain as the owner of the responses to come.
if (lsu_pending_reads == 0 && (|lsu_req_fires)) begin
lsu_domain_rr <= ~lsu_select_tensor;
if (lsu_rd_req_fire_count != 0) begin
lsu_active_domain <= lsu_select_tensor;
end
end
// Updated every cycle: add the reads issued, subtract the responses accepted.
lsu_pending_reads <= lsu_pending_reads + 16'(lsu_rd_req_fire_count) - 16'(lsu_rsp_fire_count);
end
end
// Checks that no response is accepted while the pending-read count is zero,
// since such a response could not be attributed to a domain.
`RUNTIME_ASSERT(
!(lsu_pending_reads == 0 && (|lsu_rsp_fires)),
("%t: *** core%0d-lsu-domain-arb-unmatched-response", $time, CORE_ID)
)
`ifdef PERF_ENABLE
// Performance counters for the scalar/tensor LSU domain multiplexer. Each
// counter adds its one-bit event every cycle and is cleared by reset.

// Event: at least one dcache request lane fired while this domain was selected.
wire scalar_lsu_req_fire_any;
wire tensor_lsu_req_fire_any;
assign scalar_lsu_req_fire_any = !lsu_select_tensor && (|lsu_req_fires);
assign tensor_lsu_req_fire_any = lsu_select_tensor && (|lsu_req_fires);

// Event: this domain has a request valid while the other domain is selected.
wire scalar_lsu_merge_stall;
wire tensor_lsu_merge_stall;
assign scalar_lsu_merge_stall = lsu_select_tensor && scalar_lsu_req_any;
assign tensor_lsu_merge_stall = !lsu_select_tensor && tensor_lsu_req_any;

// Event: both domains have a request valid in the same cycle.
wire mem_merge_stall;
assign mem_merge_stall = tensor_lsu_req_any && scalar_lsu_req_any;

reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs_r;
reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs_r;
reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls_r;
reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls_r;
reg [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls_r;

always @(posedge clk) begin
    if (reset) begin
        perf_scalar_lsu_reqs_r   <= '0;
        perf_tensor_lsu_reqs_r   <= '0;
        perf_scalar_lsu_stalls_r <= '0;
        perf_tensor_lsu_stalls_r <= '0;
        perf_mem_merge_stalls_r  <= '0;
    end else begin
        perf_scalar_lsu_reqs_r   <= perf_scalar_lsu_reqs_r   + `PERF_CTR_BITS'(scalar_lsu_req_fire_any);
        perf_tensor_lsu_reqs_r   <= perf_tensor_lsu_reqs_r   + `PERF_CTR_BITS'(tensor_lsu_req_fire_any);
        perf_scalar_lsu_stalls_r <= perf_scalar_lsu_stalls_r + `PERF_CTR_BITS'(scalar_lsu_merge_stall);
        perf_tensor_lsu_stalls_r <= perf_tensor_lsu_stalls_r + `PERF_CTR_BITS'(tensor_lsu_merge_stall);
        perf_mem_merge_stalls_r  <= perf_mem_merge_stalls_r  + `PERF_CTR_BITS'(mem_merge_stall);
    end
end

// Expose the counter registers on the module outputs.
assign perf_mem_merge_stalls  = perf_mem_merge_stalls_r;
assign perf_tensor_lsu_stalls = perf_tensor_lsu_stalls_r;
assign perf_scalar_lsu_stalls = perf_scalar_lsu_stalls_r;
assign perf_tensor_lsu_reqs   = perf_tensor_lsu_reqs_r;
assign perf_scalar_lsu_reqs   = perf_scalar_lsu_reqs_r;
`endif
`else
// Without the tensor extension there is only the scalar LSU, so its memory
// bus and commit stream are wired straight through to the module ports.

// Memory bus, one lane at a time (ASSIGN_VX_MEM_BUS_IF is defined elsewhere).
for (genvar i = 0; i < DCACHE_NUM_REQS; i = i + 1) begin : g_lsu_mem_passthru
    `ASSIGN_VX_MEM_BUS_IF(dcache_bus_if[i], scalar_lsu_bus_if[i]);
end

// Commit stream, one issue slot at a time.
for (genvar i = 0; i < `ISSUE_WIDTH; i = i + 1) begin : g_lsu_commit_passthru
    assign lsu_scalar_commit_if[i].ready = lsu_commit_if[i].ready;
    assign lsu_commit_if[i].data         = lsu_scalar_commit_if[i].data;
    assign lsu_commit_if[i].valid        = lsu_scalar_commit_if[i].valid;
end

`ifdef PERF_ENABLE
// No domain multiplexer exists in this configuration, so the merge counters
// are tied to zero.
assign perf_mem_merge_stalls  = '0;
assign perf_tensor_lsu_stalls = '0;
assign perf_scalar_lsu_stalls = '0;
assign perf_tensor_lsu_reqs   = '0;
assign perf_scalar_lsu_reqs   = '0;
`endif
`endif
`ifdef EXT_F_ENABLE
// Floating-point unit (EXT_F_ENABLE only). It has its own reset net, commits
// directly onto fpu_commit_if, and shares fpu_to_csr_if with the SFU unit.
`RESET_RELAY (fpu_reset, reset);

VX_fpu_unit #(
    .CORE_ID (CORE_ID)
) fpu_unit (
    .clk           (clk),
    .reset         (fpu_reset),
    .dispatch_if   (fpu_dispatch_if),
    .commit_if     (fpu_commit_if),
    .fpu_to_csr_if (fpu_to_csr_if)
);
`endif
// Special-function unit. It receives the CSR-related interfaces, the DCR
// values and the acc_* signals, and commits directly onto sfu_commit_if.
// Its warp-control output goes to the module port when the tensor extension
// is disabled, and to the local scalar_warp_ctl_if (one input of the
// warp-control merge) when it is enabled.
VX_sfu_unit #(
    .CORE_ID (CORE_ID)
) sfu_unit (
    .clk              (clk),
    .reset            (sfu_reset),
    .base_dcrs        (base_dcrs),
    .dispatch_if      (sfu_dispatch_if),
    .commit_if        (sfu_commit_if),
    .commit_csr_if    (commit_csr_if),
    .sched_csr_if     (sched_csr_if),
`ifdef PERF_ENABLE
    .mem_perf_if      (mem_perf_if),
    .pipeline_perf_if (pipeline_perf_if),
`endif
`ifdef EXT_F_ENABLE
    .fpu_to_csr_if    (fpu_to_csr_if),
`endif
`ifdef EXT_T_ENABLE
    .warp_ctl_if      (scalar_warp_ctl_if),
`else
    .warp_ctl_if      (warp_ctl_if),
`endif
    .acc_read_in      (acc_read_in),
    .acc_write_out    (acc_write_out),
    .acc_write_en     (acc_write_en)
);
`ifdef EXT_T_ENABLE
// Commit streams of the tensor control unit and the tensor core, one entry
// per issue slot; they are merged into tensor_commit_if below.
VX_commit_if tensor_ctrl_commit_if[`ISSUE_WIDTH]();
VX_commit_if tensor_core_commit_if[`ISSUE_WIDTH]();

// Tensor control unit. It drives tensor_warp_ctl_if (the second input of the
// warp-control merge) and the tensor_csr_unlock_* / tensor_tmc_* outputs.
VX_tensor_ctrl_unit #(
    .CORE_ID (CORE_ID)
) tensor_ctrl_unit (
    .clk              (clk),
    .reset            (reset),
    .dispatch_if      (tensor_ctrl_dispatch_if),
    .commit_if        (tensor_ctrl_commit_if),
    .warp_ctl_if      (tensor_warp_ctl_if),
    .tmc_valid        (tensor_tmc_valid),
    .tmc_wid          (tensor_tmc_wid),
    .tmc_tmask        (tensor_tmc_tmask),
    .csr_unlock_valid (tensor_csr_unlock_valid),
    .csr_unlock_wid   (tensor_csr_unlock_wid)
);

// Tensor core. With EXT_T_ASYNC its register-file, shared-memory and tmem
// ports are wired one-to-one to the matching tensor_* module ports.
VX_tensor_core #(
    .FP16             (TENSOR_FP16),
    .NUM_TENSOR_CORES (NUM_TENSOR_CORES)
) tensor_core (
    .clk           (clk),
    .reset         (reset),
    .dispatch_if   (tensor_dispatch_if),
`ifdef EXT_T_ASYNC
    .regfile_if    (tensor_regfile_if),
    .smem_A_if     (tensor_smem_A_if),
    .smem_B_if     (tensor_smem_B_if),
    .tmem_A_ren    (tensor_tmem_A_ren),
    .tmem_A_rready (tensor_tmem_A_rready),
    .tmem_A_raddr  (tensor_tmem_A_raddr),
    .tmem_A_rdata  (tensor_tmem_A_rdata),
    .tmem_C_ren    (tensor_tmem_C_ren),
    .tmem_C_rready (tensor_tmem_C_rready),
    .tmem_C_raddr  (tensor_tmem_C_raddr),
    .tmem_C_rdata  (tensor_tmem_C_rdata),
    .tmem_C_wen    (tensor_tmem_C_wen),
    .tmem_C_wready (tensor_tmem_C_wready),
    .tmem_C_waddr  (tensor_tmem_C_waddr),
    .tmem_C_wdata  (tensor_tmem_C_wdata),
    .tmem_C_mask   (tensor_tmem_C_mask),
`endif
    .commit_if     (tensor_core_commit_if)
);

// Width of the commit payload handed to the arbiter below.
// NOTE(review): this is spelled out by hand and must equal the width of
// VX_commit_if.data - confirm against the interface definition.
localparam TENSOR_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS
                               + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;

// Per issue slot: merge the tensor core (input 0) and tensor control
// (input 1) commit streams into tensor_commit_if.
for (genvar i = 0; i < `ISSUE_WIDTH; i = i + 1) begin : g_tensor_commit_arb
    VX_stream_arb #(
        .NUM_INPUTS (2),
        .DATAW      (TENSOR_COMMIT_DATAW),
        .ARBITER    ("R"),
        .OUT_REG    (1)
    ) tensor_commit_arb (
        .clk       (clk),
        .reset     (reset),
        .valid_in  ({tensor_ctrl_commit_if[i].valid, tensor_core_commit_if[i].valid}),
        .data_in   ({tensor_ctrl_commit_if[i].data, tensor_core_commit_if[i].data}),
        .ready_in  ({tensor_ctrl_commit_if[i].ready, tensor_core_commit_if[i].ready}),
        .valid_out (tensor_commit_if[i].valid),
        .data_out  (tensor_commit_if[i].data),
        .ready_out (tensor_commit_if[i].ready),
        `UNUSED_PIN (sel_out)
    );
end
`endif
// Simulation helper used to obtain the pass/fail status of RISC-V tests.
// sim_ebreak is high in a cycle where scalar ALU dispatch slot 0 completes a
// handshake (valid && ready) for an instruction with wis == 0 that is flagged
// as a branch-class ALU op and whose op_type is EBREAK or ECALL.
wire sim_slot0_fire = alu_dispatch_if[0].valid && alu_dispatch_if[0].ready;
wire sim_slot0_is_trap = (`INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_EBREAK)
                      || (`INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_ECALL);
assign sim_ebreak = sim_slot0_fire
                 && (alu_dispatch_if[0].data.wis == 0)
                 && `INST_ALU_IS_BR(alu_dispatch_if[0].data.op_mod)
                 && sim_slot0_is_trap;
endmodule