// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

`include "VX_define.vh"

// VX_execute
//
// Execute-stage wrapper. Instantiates the ALU, LSU and SFU units (plus the FPU
// when EXT_F_ENABLE is defined) and wires each unit's dispatch interface to its
// commit interface.
//
// When EXT_T_ENABLE is defined the module additionally:
//   * instantiates a second ("tensor" domain) ALU unit and LSU unit,
//   * merges scalar/tensor ALU commits and scalar/tensor LSU commits through
//     2-input VX_stream_arb instances,
//   * multiplexes the scalar and tensor LSU memory buses onto dcache_bus_if,
//   * merges warp-control messages from the SFU and the tensor control unit
//     onto warp_ctl_if,
//   * instantiates VX_tensor_ctrl_unit and VX_tensor_core and merges their
//     commits onto tensor_commit_if.
//
// Parameters:
//   CORE_ID          - forwarded to the sub-units; also used in assertion/trace text.
//   TENSOR_FP16      - forwarded to VX_tensor_core as .FP16.
//   NUM_TENSOR_CORES - forwarded to VX_tensor_core; sizes the EXT_T_ASYNC ports.
module VX_execute import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0,
    parameter TENSOR_FP16 = 0,
    parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS
) (
    `SCOPE_IO_DECL

    input wire clk,
    input wire reset,

    input base_dcrs_t base_dcrs,
    input wire downstream_mem_busy,

    // Dcache interface
    VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],

    // commit interface
    VX_commit_csr_if.slave commit_csr_if,

    // fetch interface
    VX_sched_csr_if.slave sched_csr_if,

`ifdef PERF_ENABLE
    VX_mem_perf_if.slave mem_perf_if,
    VX_pipeline_perf_if.slave pipeline_perf_if,
    // LSU domain-arbitration counters (driven to '0 when EXT_T_ENABLE is undefined)
    output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs,
    output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs,
    output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls,
    output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls,
    output wire [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls,
`endif

`ifdef EXT_F_ENABLE
    VX_dispatch_if.slave fpu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master fpu_commit_if [`ISSUE_WIDTH],
`endif

    VX_dispatch_if.slave alu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master alu_commit_if [`ISSUE_WIDTH],
    // Entries [0 +: NUM_ALU_BLOCKS] are driven by the scalar ALU unit and entries
    // [NUM_ALU_BLOCKS +: NUM_ALU_BLOCKS] by the tensor ALU unit.
    // NOTE(review): with EXT_T_ENABLE undefined the upper half is not connected
    // anywhere in this module - confirm the parent handles that case.
    VX_branch_ctl_if.master branch_ctl_if [2 * `NUM_ALU_BLOCKS],

    VX_dispatch_if.slave lsu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master lsu_commit_if [`ISSUE_WIDTH],

    VX_dispatch_if.slave sfu_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master sfu_commit_if [`ISSUE_WIDTH],
    VX_warp_ctl_if.master warp_ctl_if,

`ifdef EXT_T_ENABLE
    // Tensor-domain dispatch/commit and control side-band outputs
    VX_dispatch_if.slave tensor_alu_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.slave tensor_lsu_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.slave tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
    VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
    output wire tensor_csr_unlock_valid,
    output wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid,
    output wire tensor_tmc_valid,
    output wire [`NW_WIDTH-1:0] tensor_tmc_wid,
    output wire [`NUM_THREADS-1:0] tensor_tmc_tmask,
`ifdef EXT_T_ASYNC
    // Ports below are passed straight through to VX_tensor_core.
    VX_tc_rf_if.master tensor_regfile_if[NUM_TENSOR_CORES],
    VX_tc_bus_if.master tensor_smem_A_if[NUM_TENSOR_CORES],
    output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_ren,
    input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_rready,
    output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_A_raddr,
    input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_A_rdata,
    output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_ren,
    input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_rready,
    output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_raddr,
    input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
    output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wen,
    input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wready,
    output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_waddr,
    output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
    output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
    VX_tc_bus_if.master tensor_smem_B_if[NUM_TENSOR_CORES],
`endif
`endif

    // simulation helper signals
    output wire sim_ebreak,
    // acc_* signals are passed straight through to the SFU unit
    input wire [31:0] acc_read_in,
    output wire [31:0] acc_write_out,
    output wire acc_write_en
);

`ifdef EXT_F_ENABLE
    // Connects the FPU unit to the SFU unit (both receive this interface array).
    VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS]();
`endif

`ifdef EXT_T_ENABLE
    // ------------------------------------------------------------------------
    // Warp-control merge (scalar SFU + tensor control unit -> warp_ctl_if)
    //
    // Each source has a one-entry pending register (valid_r/data_r). A source is
    // a "candidate" when either its pending register or its live input is valid.
    // One candidate is forwarded per cycle; a live input that is not forwarded
    // in its cycle is captured into that source's pending register.
    // No ready/backpressure signal from warp_ctl_if is used here: the selected
    // candidate is treated as consumed in the cycle it is presented.
    // ------------------------------------------------------------------------
    VX_warp_ctl_if scalar_warp_ctl_if();
    VX_warp_ctl_if tensor_warp_ctl_if();

    // Width of the packed {wid, tmc, wspawn, split, sjoin, barrier} payload.
    localparam WARP_CTL_DATAW = `NW_WIDTH + $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);

    wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_in;
    wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_in;
    reg [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_r;
    reg [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_r;
    reg scalar_warp_ctl_valid_r;
    reg tensor_warp_ctl_valid_r;
    // Priority bit used when both sources are candidates: 1 selects tensor.
    reg warp_ctl_rr;

    assign scalar_warp_ctl_data_in = {scalar_warp_ctl_if.wid, scalar_warp_ctl_if.tmc, scalar_warp_ctl_if.wspawn, scalar_warp_ctl_if.split, scalar_warp_ctl_if.sjoin, scalar_warp_ctl_if.barrier};
    assign tensor_warp_ctl_data_in = {tensor_warp_ctl_if.wid, tensor_warp_ctl_if.tmc, tensor_warp_ctl_if.wspawn, tensor_warp_ctl_if.split, tensor_warp_ctl_if.sjoin, tensor_warp_ctl_if.barrier};

    wire scalar_warp_ctl_candidate_valid = scalar_warp_ctl_valid_r || scalar_warp_ctl_if.valid;
    wire tensor_warp_ctl_candidate_valid = tensor_warp_ctl_valid_r || tensor_warp_ctl_if.valid;
    // Tensor wins when it is the only candidate, or when both are candidates and
    // warp_ctl_rr is set.
    wire select_tensor_warp_ctl = tensor_warp_ctl_candidate_valid && (!scalar_warp_ctl_candidate_valid || warp_ctl_rr);
    // A pending (older) entry is always presented ahead of the live input.
    wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_out = scalar_warp_ctl_valid_r ? scalar_warp_ctl_data_r : scalar_warp_ctl_data_in;
    wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_out = tensor_warp_ctl_valid_r ? tensor_warp_ctl_data_r : tensor_warp_ctl_data_in;
    wire [WARP_CTL_DATAW-1:0] selected_warp_ctl_data = select_tensor_warp_ctl ? tensor_warp_ctl_data_out : scalar_warp_ctl_data_out;
    // What the selected side forwards this cycle: its pending entry or its live input.
    wire consume_scalar_warp_ctl_pending = !select_tensor_warp_ctl && scalar_warp_ctl_valid_r;
    wire consume_scalar_warp_ctl_input = !select_tensor_warp_ctl && !scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid;
    wire consume_tensor_warp_ctl_pending = select_tensor_warp_ctl && tensor_warp_ctl_valid_r;
    wire consume_tensor_warp_ctl_input = select_tensor_warp_ctl && !tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid;

    assign warp_ctl_if.valid = scalar_warp_ctl_candidate_valid || tensor_warp_ctl_candidate_valid;
    assign {warp_ctl_if.wid, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.sjoin, warp_ctl_if.barrier} = selected_warp_ctl_data;

    always @(posedge clk) begin
        if (reset) begin
            // Only the valid bits and the priority bit are reset; data_r is not.
            scalar_warp_ctl_valid_r <= 1'b0;
            tensor_warp_ctl_valid_r <= 1'b0;
            warp_ctl_rr <= 1'b0;
        end else begin
            // On a conflict, hand priority to the side that was not selected.
            if (scalar_warp_ctl_candidate_valid && tensor_warp_ctl_candidate_valid) begin
                warp_ctl_rr <= !select_tensor_warp_ctl;
            end
            if (scalar_warp_ctl_valid_r) begin
                // Pending entry forwarded: refill from the live input (if any).
                if (consume_scalar_warp_ctl_pending) begin
                    scalar_warp_ctl_valid_r <= scalar_warp_ctl_if.valid;
                    scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in;
                end
            end else if (scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_input) begin
                // Live input lost arbitration: park it in the pending register.
                scalar_warp_ctl_valid_r <= 1'b1;
                scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in;
            end
            if (tensor_warp_ctl_valid_r) begin
                // Pending entry forwarded: refill from the live input (if any).
                if (consume_tensor_warp_ctl_pending) begin
                    tensor_warp_ctl_valid_r <= tensor_warp_ctl_if.valid;
                    tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in;
                end
            end else if (tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_input) begin
                // Live input lost arbitration: park it in the pending register.
                tensor_warp_ctl_valid_r <= 1'b1;
                tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in;
            end
        end
    end

    // Overflow checks: a new input arrives while that source's pending register
    // is occupied and is not being forwarded this cycle (the new input would be
    // dropped by the logic above).
    `RUNTIME_ASSERT(
        !(scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_pending),
        ("%t: *** core%0d-scalar-warp-ctl-merge-overflow", $time, CORE_ID)
    )
    `RUNTIME_ASSERT(
        !(tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_pending),
        ("%t: *** core%0d-tensor-warp-ctl-merge-overflow", $time, CORE_ID)
    )
`endif

    // Per-unit resets derived from `reset` via the RESET_RELAY macro (defined
    // elsewhere; presumably declares the named signal - verify in VX_define.vh).
    `RESET_RELAY (alu_reset, reset);
    `RESET_RELAY (lsu_reset, reset);
    `RESET_RELAY (sfu_reset, reset);

    // ------------------------------------------------------------------------
    // ALU
    // ------------------------------------------------------------------------
    VX_commit_if alu_scalar_commit_if[`ISSUE_WIDTH]();

    // Scalar ALU: drives the lower NUM_ALU_BLOCKS entries of branch_ctl_if.
    VX_alu_unit #(
        .CORE_ID (CORE_ID)
    ) alu_unit (
        .clk (clk),
        .reset (alu_reset),
        .dispatch_if (alu_dispatch_if),
        .branch_ctl_if (branch_ctl_if[0 +: `NUM_ALU_BLOCKS]),
        .commit_if (alu_scalar_commit_if)
    );

`ifdef EXT_T_ENABLE
    VX_commit_if alu_tensor_commit_if[`ISSUE_WIDTH]();

    `RESET_RELAY (tensor_alu_reset, reset);

    // Tensor-domain ALU: drives the upper NUM_ALU_BLOCKS entries of branch_ctl_if.
    VX_alu_unit #(
        .CORE_ID (CORE_ID)
    ) tensor_alu_unit (
        .clk (clk),
        .reset (tensor_alu_reset),
        .dispatch_if (tensor_alu_dispatch_if),
        .branch_ctl_if (branch_ctl_if[`NUM_ALU_BLOCKS +: `NUM_ALU_BLOCKS]),
        .commit_if (alu_tensor_commit_if)
    );

    // Bit width passed to the arbiter for VX_commit_if.data.
    // NOTE(review): must match the packed width of the commit data struct, which
    // is declared outside this file - confirm if that struct changes.
    localparam ALU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;

    // Per issue slot: merge scalar (input 0) and tensor (input 1) ALU commits
    // onto alu_commit_if. Uses `reset` directly rather than a relayed reset.
    // ARBITER "R" presumably selects round-robin - TODO confirm in VX_stream_arb.
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_alu_domain_commit
        VX_stream_arb #(
            .NUM_INPUTS (2),
            .DATAW (ALU_COMMIT_DATAW),
            .ARBITER ("R"),
            .OUT_REG (1)
        ) alu_commit_arb (
            .clk (clk),
            .reset (reset),
            .valid_in ({alu_tensor_commit_if[i].valid, alu_scalar_commit_if[i].valid}),
            .ready_in ({alu_tensor_commit_if[i].ready, alu_scalar_commit_if[i].ready}),
            .data_in ({alu_tensor_commit_if[i].data, alu_scalar_commit_if[i].data}),
            .data_out (alu_commit_if[i].data),
            .valid_out (alu_commit_if[i].valid),
            .ready_out (alu_commit_if[i].ready),
            `UNUSED_PIN (sel_out)
        );

    `ifdef DBG_TRACE_CORE_PIPELINE_VCS
        // Debug-only trace for core 0: logs scalar, tensor and merged ALU commits
        // whose PC is 0x80000010 or 0x80000014 (hard-coded debug addresses).
        always @(posedge clk) begin
            if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
                if (alu_scalar_commit_if[i].valid && ((alu_scalar_commit_if[i].data.PC == 32'h80000010) || (alu_scalar_commit_if[i].data.PC == 32'h80000014))) begin
                    `TRACE(1, ("%d: core%0d-execute-alu-scalar-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", $time, CORE_ID, i, alu_scalar_commit_if[i].valid, alu_scalar_commit_if[i].ready, alu_scalar_commit_if[i].data.wid, alu_scalar_commit_if[i].data.PC, alu_scalar_commit_if[i].data.wb, alu_scalar_commit_if[i].data.rd, alu_scalar_commit_if[i].data.sop, alu_scalar_commit_if[i].data.eop, alu_scalar_commit_if[i].data.uuid));
                end
                if (alu_tensor_commit_if[i].valid && ((alu_tensor_commit_if[i].data.PC == 32'h80000010) || (alu_tensor_commit_if[i].data.PC == 32'h80000014))) begin
                    `TRACE(1, ("%d: core%0d-execute-alu-tensor-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", $time, CORE_ID, i, alu_tensor_commit_if[i].valid, alu_tensor_commit_if[i].ready, alu_tensor_commit_if[i].data.wid, alu_tensor_commit_if[i].data.PC, alu_tensor_commit_if[i].data.wb, alu_tensor_commit_if[i].data.rd, alu_tensor_commit_if[i].data.sop, alu_tensor_commit_if[i].data.eop, alu_tensor_commit_if[i].data.uuid));
                end
                if (alu_commit_if[i].valid && ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin
                    `TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", $time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready, alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, alu_commit_if[i].data.sop, alu_commit_if[i].data.eop, alu_commit_if[i].data.uuid));
                end
            end
        end
    `endif
    end
`else
    // No tensor domain: scalar ALU commits pass straight through (no arbiter,
    // no output register).
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_alu_commit_passthru
        assign alu_commit_if[i].valid = alu_scalar_commit_if[i].valid;
        assign alu_commit_if[i].data = alu_scalar_commit_if[i].data;
        assign alu_scalar_commit_if[i].ready = alu_commit_if[i].ready;

    `ifdef DBG_TRACE_CORE_PIPELINE_VCS
        // Debug-only trace for core 0: logs ALU commits whose PC is 0x80000010
        // or 0x80000014 (hard-coded debug addresses).
        always @(posedge clk) begin
            if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
                if (alu_commit_if[i].valid && ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin
                    `TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", $time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready, alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, alu_commit_if[i].data.sop, alu_commit_if[i].data.eop, alu_commit_if[i].data.uuid));
                end
            end
        end
    `endif
    end
`endif

    // ------------------------------------------------------------------------
    // LSU
    // ------------------------------------------------------------------------
    // NOTE(review): SCOPE_IO_SWITCH is given a count of 1, yet both lsu_unit and
    // tensor_lsu_unit (EXT_T_ENABLE) use SCOPE_IO_BIND (0). Confirm the scope
    // macros tolerate two instances bound to the same index.
    `SCOPE_IO_SWITCH (1)

    VX_commit_if lsu_scalar_commit_if[`ISSUE_WIDTH]();
    VX_mem_bus_if #(
        .DATA_SIZE (DCACHE_WORD_SIZE),
        .TAG_WIDTH (DCACHE_TAG_WIDTH)
    ) scalar_lsu_bus_if[DCACHE_NUM_REQS]();

    VX_lsu_unit #(
        .CORE_ID (CORE_ID)
    ) lsu_unit (
        `SCOPE_IO_BIND (0)
        .clk (clk),
        .reset (lsu_reset),
        .downstream_mem_busy (downstream_mem_busy),
        .cache_bus_if (scalar_lsu_bus_if),
        .dispatch_if (lsu_dispatch_if),
        .commit_if (lsu_scalar_commit_if)
    );

`ifdef EXT_T_ENABLE
    VX_commit_if lsu_tensor_commit_if[`ISSUE_WIDTH]();
    VX_mem_bus_if #(
        .DATA_SIZE (DCACHE_WORD_SIZE),
        .TAG_WIDTH (DCACHE_TAG_WIDTH)
    ) tensor_lsu_bus_if[DCACHE_NUM_REQS]();

    `RESET_RELAY (tensor_lsu_reset, reset);

    // Tensor-domain LSU; shares downstream_mem_busy with the scalar LSU.
    VX_lsu_unit #(
        .CORE_ID (CORE_ID)
    ) tensor_lsu_unit (
        `SCOPE_IO_BIND (0)
        .clk (clk),
        .reset (tensor_lsu_reset),
        .downstream_mem_busy (downstream_mem_busy),
        .cache_bus_if (tensor_lsu_bus_if),
        .dispatch_if (tensor_lsu_dispatch_if),
        .commit_if (lsu_tensor_commit_if)
    );

    // Same expression as ALU_COMMIT_DATAW; see the review note there.
    localparam LSU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;

    // Per issue slot: merge scalar (input 0) and tensor (input 1) LSU commits
    // onto lsu_commit_if.
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_domain_commit
        VX_stream_arb #(
            .NUM_INPUTS (2),
            .DATAW (LSU_COMMIT_DATAW),
            .ARBITER ("R"),
            .OUT_REG (1)
        ) lsu_commit_arb (
            .clk (clk),
            .reset (reset),
            .valid_in ({lsu_tensor_commit_if[i].valid, lsu_scalar_commit_if[i].valid}),
            .ready_in ({lsu_tensor_commit_if[i].ready, lsu_scalar_commit_if[i].ready}),
            .data_in ({lsu_tensor_commit_if[i].data, lsu_scalar_commit_if[i].data}),
            .data_out (lsu_commit_if[i].data),
            .valid_out (lsu_commit_if[i].valid),
            .ready_out (lsu_commit_if[i].ready),
            `UNUSED_PIN (sel_out)
        );
    end

    // ------------------------------------------------------------------------
    // LSU memory-bus domain arbitration (scalar/tensor -> dcache_bus_if)
    //
    // A single select (lsu_select_tensor) applies to all DCACHE_NUM_REQS lanes.
    // While lsu_pending_reads is non-zero the request side stays locked to
    // lsu_active_domain, and all responses are routed to that domain.
    // NOTE(review): the lock holds for as long as reads are outstanding, so a
    // domain that keeps issuing reads can keep the other domain waiting.
    // ------------------------------------------------------------------------
    wire scalar_lsu_req_any;
    wire tensor_lsu_req_any;
    wire [DCACHE_NUM_REQS-1:0] scalar_lsu_req_valids;
    wire [DCACHE_NUM_REQS-1:0] tensor_lsu_req_valids;
    wire [DCACHE_NUM_REQS-1:0] lsu_req_fires;
    wire [DCACHE_NUM_REQS-1:0] lsu_rd_req_fires;
    wire [DCACHE_NUM_REQS-1:0] lsu_rsp_fires;
    // Domain to pick next time both request with no reads pending (1 = tensor).
    reg lsu_domain_rr;
    // Domain that responses are routed to (1 = tensor).
    reg lsu_active_domain;
    // Accepted requests with req_data.rw == 0 minus accepted responses.
    reg [15:0] lsu_pending_reads;
    logic lsu_select_tensor;
    logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rd_req_fire_count;
    logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rsp_fire_count;

    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_domain_mem
        assign scalar_lsu_req_valids[i] = scalar_lsu_bus_if[i].req_valid;
        assign tensor_lsu_req_valids[i] = tensor_lsu_bus_if[i].req_valid;

        // Request path: forward the selected domain; only it sees req_ready.
        assign dcache_bus_if[i].req_valid = lsu_select_tensor ? tensor_lsu_bus_if[i].req_valid : scalar_lsu_bus_if[i].req_valid;
        assign dcache_bus_if[i].req_data = lsu_select_tensor ? tensor_lsu_bus_if[i].req_data : scalar_lsu_bus_if[i].req_data;
        assign scalar_lsu_bus_if[i].req_ready = !lsu_select_tensor && dcache_bus_if[i].req_ready;
        assign tensor_lsu_bus_if[i].req_ready = lsu_select_tensor && dcache_bus_if[i].req_ready;

        // Response path: data is broadcast, valid/ready follow lsu_active_domain.
        assign scalar_lsu_bus_if[i].rsp_valid = !lsu_active_domain && dcache_bus_if[i].rsp_valid;
        assign scalar_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data;
        assign tensor_lsu_bus_if[i].rsp_valid = lsu_active_domain && dcache_bus_if[i].rsp_valid;
        assign tensor_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data;
        assign dcache_bus_if[i].rsp_ready = lsu_active_domain ? tensor_lsu_bus_if[i].rsp_ready : scalar_lsu_bus_if[i].rsp_ready;

        // Per-lane handshake events used for bookkeeping below.
        assign lsu_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready;
        assign lsu_rd_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && !dcache_bus_if[i].req_data.rw;
        assign lsu_rsp_fires[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
    end

    assign scalar_lsu_req_any = |scalar_lsu_req_valids;
    assign tensor_lsu_req_any = |tensor_lsu_req_valids;

    always @(*) begin
        if (lsu_pending_reads != 0) begin
            // Reads outstanding: stay on the domain that owns them.
            lsu_select_tensor = lsu_active_domain;
        end else if (scalar_lsu_req_any && tensor_lsu_req_any) begin
            // Both domains requesting: use the alternating priority bit.
            lsu_select_tensor = lsu_domain_rr;
        end else begin
            // At most one domain requesting: tensor if it requests, else scalar.
            lsu_select_tensor = tensor_lsu_req_any;
        end

        // Count the lanes that accepted a non-rw request / a response this cycle.
        lsu_rd_req_fire_count = '0;
        lsu_rsp_fire_count = '0;
        for (integer i = 0; i < DCACHE_NUM_REQS; ++i) begin
            lsu_rd_req_fire_count = lsu_rd_req_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rd_req_fires[i]);
            lsu_rsp_fire_count = lsu_rsp_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rsp_fires[i]);
        end
    end

    always @(posedge clk) begin
        if (reset) begin
            lsu_domain_rr <= 1'b0;
            lsu_active_domain <= 1'b0;
            lsu_pending_reads <= '0;
        end else begin
            // Priority and ownership only change on a request accepted while no
            // reads are pending.
            if (lsu_pending_reads == 0 && (|lsu_req_fires)) begin
                lsu_domain_rr <= ~lsu_select_tensor;
                // Ownership moves only if that request included a non-rw lane.
                if (lsu_rd_req_fire_count != 0) begin
                    lsu_active_domain <= lsu_select_tensor;
                end
            end
            // assumes one response per accepted non-rw request - TODO confirm
            lsu_pending_reads <= lsu_pending_reads + 16'(lsu_rd_req_fire_count) - 16'(lsu_rsp_fire_count);
        end
    end

    // A response accepted while no reads are tracked as pending is unexpected.
    `RUNTIME_ASSERT(
        !(lsu_pending_reads == 0 && (|lsu_rsp_fires)),
        ("%t: *** core%0d-lsu-domain-arb-unmatched-response", $time, CORE_ID)
    )

`ifdef PERF_ENABLE
    // Cycle counters for the domain arbiter. Each increments by at most one per
    // clock, so they count cycles, not individual lane requests.
    reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs_r;
    reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs_r;
    reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls_r;
    reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls_r;
    reg [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls_r;

    // Any lane accepted a request while that domain was selected.
    wire scalar_lsu_req_fire_any = (|lsu_req_fires) && !lsu_select_tensor;
    wire tensor_lsu_req_fire_any = (|lsu_req_fires) && lsu_select_tensor;
    // A domain is requesting while the other domain is selected.
    wire scalar_lsu_merge_stall = scalar_lsu_req_any && lsu_select_tensor;
    wire tensor_lsu_merge_stall = tensor_lsu_req_any && !lsu_select_tensor;
    // Both domains are requesting in the same cycle.
    wire mem_merge_stall = scalar_lsu_req_any && tensor_lsu_req_any;

    always @(posedge clk) begin
        if (reset) begin
            perf_scalar_lsu_reqs_r <= '0;
            perf_tensor_lsu_reqs_r <= '0;
            perf_scalar_lsu_stalls_r <= '0;
            perf_tensor_lsu_stalls_r <= '0;
            perf_mem_merge_stalls_r <= '0;
        end else begin
            perf_scalar_lsu_reqs_r <= perf_scalar_lsu_reqs_r + `PERF_CTR_BITS'(scalar_lsu_req_fire_any);
            perf_tensor_lsu_reqs_r <= perf_tensor_lsu_reqs_r + `PERF_CTR_BITS'(tensor_lsu_req_fire_any);
            perf_scalar_lsu_stalls_r <= perf_scalar_lsu_stalls_r + `PERF_CTR_BITS'(scalar_lsu_merge_stall);
            perf_tensor_lsu_stalls_r <= perf_tensor_lsu_stalls_r + `PERF_CTR_BITS'(tensor_lsu_merge_stall);
            perf_mem_merge_stalls_r <= perf_mem_merge_stalls_r + `PERF_CTR_BITS'(mem_merge_stall);
        end
    end

    assign perf_scalar_lsu_reqs = perf_scalar_lsu_reqs_r;
    assign perf_tensor_lsu_reqs = perf_tensor_lsu_reqs_r;
    assign perf_scalar_lsu_stalls = perf_scalar_lsu_stalls_r;
    assign perf_tensor_lsu_stalls = perf_tensor_lsu_stalls_r;
    assign perf_mem_merge_stalls = perf_mem_merge_stalls_r;
`endif
`else
    // No tensor domain: scalar LSU commits and memory bus pass straight through.
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_commit_passthru
        assign lsu_commit_if[i].valid = lsu_scalar_commit_if[i].valid;
        assign lsu_commit_if[i].data = lsu_scalar_commit_if[i].data;
        assign lsu_scalar_commit_if[i].ready = lsu_commit_if[i].ready;
    end

    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_mem_passthru
        `ASSIGN_VX_MEM_BUS_IF(dcache_bus_if[i], scalar_lsu_bus_if[i]);
    end

`ifdef PERF_ENABLE
    // No domain arbitration in this configuration: counters read as zero.
    assign perf_scalar_lsu_reqs = '0;
    assign perf_tensor_lsu_reqs = '0;
    assign perf_scalar_lsu_stalls = '0;
    assign perf_tensor_lsu_stalls = '0;
    assign perf_mem_merge_stalls = '0;
`endif
`endif

`ifdef EXT_F_ENABLE
    // ------------------------------------------------------------------------
    // FPU
    // ------------------------------------------------------------------------
    `RESET_RELAY (fpu_reset, reset);

    VX_fpu_unit #(
        .CORE_ID (CORE_ID)
    ) fpu_unit (
        .clk (clk),
        .reset (fpu_reset),
        .dispatch_if (fpu_dispatch_if),
        .fpu_to_csr_if (fpu_to_csr_if),
        .commit_if (fpu_commit_if)
    );
`endif

    // ------------------------------------------------------------------------
    // SFU
    // With EXT_T_ENABLE the SFU's warp-control output goes to the merge logic
    // above (scalar_warp_ctl_if); otherwise it drives warp_ctl_if directly.
    // ------------------------------------------------------------------------
    VX_sfu_unit #(
        .CORE_ID (CORE_ID)
    ) sfu_unit (
        .clk (clk),
        .reset (sfu_reset),
    `ifdef PERF_ENABLE
        .mem_perf_if (mem_perf_if),
        .pipeline_perf_if (pipeline_perf_if),
    `endif
        .base_dcrs (base_dcrs),
        .dispatch_if (sfu_dispatch_if),
    `ifdef EXT_F_ENABLE
        .fpu_to_csr_if (fpu_to_csr_if),
    `endif
        .commit_csr_if (commit_csr_if),
        .sched_csr_if (sched_csr_if),
    `ifdef EXT_T_ENABLE
        .warp_ctl_if (scalar_warp_ctl_if),
    `else
        .warp_ctl_if (warp_ctl_if),
    `endif
        .commit_if (sfu_commit_if),
        .acc_read_in (acc_read_in),
        .acc_write_out (acc_write_out),
        .acc_write_en (acc_write_en)
    );

`ifdef EXT_T_ENABLE
    // ------------------------------------------------------------------------
    // Tensor control unit and tensor core
    // Both are reset by `reset` directly (no RESET_RELAY).
    // ------------------------------------------------------------------------
    VX_commit_if tensor_core_commit_if[`ISSUE_WIDTH]();
    VX_commit_if tensor_ctrl_commit_if[`ISSUE_WIDTH]();

    // Its warp-control output feeds the merge logic via tensor_warp_ctl_if.
    VX_tensor_ctrl_unit #(
        .CORE_ID (CORE_ID)
    ) tensor_ctrl_unit (
        .clk (clk),
        .reset (reset),
        .dispatch_if (tensor_ctrl_dispatch_if),
        .commit_if (tensor_ctrl_commit_if),
        .warp_ctl_if (tensor_warp_ctl_if),
        .csr_unlock_valid (tensor_csr_unlock_valid),
        .csr_unlock_wid (tensor_csr_unlock_wid),
        .tmc_valid (tensor_tmc_valid),
        .tmc_wid (tensor_tmc_wid),
        .tmc_tmask (tensor_tmc_tmask)
    );

    VX_tensor_core #(
        .FP16 (TENSOR_FP16),
        .NUM_TENSOR_CORES (NUM_TENSOR_CORES)
    ) tensor_core (
        .clk(clk),
        .reset(reset),
        .dispatch_if(tensor_dispatch_if),
    `ifdef EXT_T_ASYNC
        .regfile_if(tensor_regfile_if),
        .smem_A_if(tensor_smem_A_if),
        .tmem_A_ren(tensor_tmem_A_ren),
        .tmem_A_rready(tensor_tmem_A_rready),
        .tmem_A_raddr(tensor_tmem_A_raddr),
        .tmem_A_rdata(tensor_tmem_A_rdata),
        .tmem_C_ren(tensor_tmem_C_ren),
        .tmem_C_rready(tensor_tmem_C_rready),
        .tmem_C_raddr(tensor_tmem_C_raddr),
        .tmem_C_rdata(tensor_tmem_C_rdata),
        .tmem_C_wen(tensor_tmem_C_wen),
        .tmem_C_wready(tensor_tmem_C_wready),
        .tmem_C_waddr(tensor_tmem_C_waddr),
        .tmem_C_wdata(tensor_tmem_C_wdata),
        .tmem_C_mask(tensor_tmem_C_mask),
        .smem_B_if(tensor_smem_B_if),
    `endif
        .commit_if(tensor_core_commit_if)
    );

    // Same expression as ALU_COMMIT_DATAW; see the review note there.
    localparam TENSOR_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;

    // Per issue slot: merge tensor-core (input 0) and tensor-ctrl (input 1)
    // commits onto tensor_commit_if.
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_commit_arb
        VX_stream_arb #(
            .NUM_INPUTS (2),
            .DATAW (TENSOR_COMMIT_DATAW),
            .ARBITER ("R"),
            .OUT_REG (1)
        ) tensor_commit_arb (
            .clk (clk),
            .reset (reset),
            .valid_in ({tensor_ctrl_commit_if[i].valid, tensor_core_commit_if[i].valid}),
            .ready_in ({tensor_ctrl_commit_if[i].ready, tensor_core_commit_if[i].ready}),
            .data_in ({tensor_ctrl_commit_if[i].data, tensor_core_commit_if[i].data}),
            .data_out (tensor_commit_if[i].data),
            .valid_out (tensor_commit_if[i].valid),
            .ready_out (tensor_commit_if[i].ready),
            `UNUSED_PIN (sel_out)
        );
    end
`endif

    // simulation helper signal to get RISC-V tests Pass/Fail status
    // Asserted when scalar ALU issue slot 0 accepts (valid && ready) a dispatch
    // with wis == 0 that is a branch-class op of type EBREAK or ECALL. Only
    // alu_dispatch_if[0] is observed; the tensor ALU dispatch is not.
    assign sim_ebreak = alu_dispatch_if[0].valid && alu_dispatch_if[0].ready
                     && alu_dispatch_if[0].data.wis == 0
                     && `INST_ALU_IS_BR(alu_dispatch_if[0].data.op_mod)
                     && (`INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_EBREAK
                      || `INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_ECALL);

endmodule