// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

`include "VX_define.vh"

// VX_schedule: per-core warp scheduler.
//
// Keeps, for every warp, an active bit, a stalled bit, a barrier-stall bit,
// a thread mask and a program counter. Each cycle the next-state values are
// computed combinationally from the warp-control, branch, decode, CSR and
// (optionally) tensor-control events, and registered on the clock edge.
//
// Warps are partitioned into a scalar and a tensor domain through the
// `IS_SCALAR_WARP / `IS_TENSOR_WARP macros. Each domain has its own selector
// (VX_lzc_rr) and its own output port (scalar_schedule_if / tensor_schedule_if).
// A warp is eligible when it is active and neither stalled nor waiting on a
// barrier. A warp that issues is stalled until one of the unlock events
// below clears its stalled bit.
module VX_schedule import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0,
    parameter NUM_BRANCHES = `NUM_ALU_BLOCKS
) (
    input wire              clk,
    input wire              reset,

`ifdef PERF_ENABLE
    VX_pipeline_perf_if.schedule perf_schedule_if,
`endif

    // configuration
    input base_dcrs_t       base_dcrs,

    // inputs
    VX_warp_ctl_if.slave    warp_ctl_if,
    VX_branch_ctl_if.slave  branch_ctl_if [NUM_BRANCHES],
    VX_decode_sched_if.slave decode_sched_if,
    VX_commit_sched_if.slave commit_sched_if,

`ifdef EXT_T_ENABLE
    input wire              tensor_csr_unlock_valid,
    input wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid,
    input wire              tensor_tmc_valid,
    input wire [`NW_WIDTH-1:0] tensor_tmc_wid,
    input wire [`NUM_THREADS-1:0] tensor_tmc_tmask,
`endif

    // outputs
    VX_schedule_if.master   scalar_schedule_if,
    VX_schedule_if.master   tensor_schedule_if,
`ifdef GBAR_ENABLE
    VX_gbar_bus_if.master   gbar_bus_if,
`endif
    VX_sched_csr_if.master  sched_csr_if,

    // status
    output wire             busy
);
    `UNUSED_PARAM (CORE_ID)

    // Per-warp scheduler state (registered value, combinational next value).
    reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // updated when a warp is activated or disabled
    reg [`NUM_WARPS-1:0] stalled_warps, stalled_warps_n;  // set when branch/gpgpu instructions are issued

    reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks, thread_masks_n;
    reg [`NUM_WARPS-1:0][`XLEN-1:0] warp_pcs, warp_pcs_n;

    // Output handshakes, one per scheduling domain.
    wire scalar_schedule_fire = scalar_schedule_if.valid && scalar_schedule_if.ready;
    wire tensor_schedule_fire = tensor_schedule_if.valid && tensor_schedule_if.ready;
    wire schedule_fire_any = scalar_schedule_fire || tensor_schedule_fire;

    // split/join
    wire join_valid;
    wire join_is_dvg;
    wire join_is_else;
    wire [`NW_WIDTH-1:0] join_wid;
    wire [`NUM_THREADS-1:0] join_tmask;
    wire [`XLEN-1:0] join_pc;

    // Counts cycles during which 'busy' is asserted; exported via sched_csr_if.
    reg [`PERF_CTR_BITS-1:0] cycles;

    // Per-warp count of issued instructions (incremented on each fire).
    reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;

    wire schedule_if_fire = schedule_fire_any;

    // branch
    // Flatten the interface array into packed vectors so the procedural
    // loop below can index it with a variable.
    wire [NUM_BRANCHES-1:0]                 branch_valid;
    wire [NUM_BRANCHES-1:0][`NW_WIDTH-1:0]  branch_wid;
    wire [NUM_BRANCHES-1:0]                 branch_taken;
    wire [NUM_BRANCHES-1:0][`XLEN-1:0]      branch_dest;
    for (genvar i = 0; i < NUM_BRANCHES; ++i) begin
        assign branch_valid[i] = branch_ctl_if[i].valid;
        assign branch_wid[i]   = branch_ctl_if[i].wid;
        assign branch_taken[i] = branch_ctl_if[i].taken;
        assign branch_dest[i]  = branch_ctl_if[i].dest;
    end

    // barriers
    // barrier_masks[id] records which warps have arrived at barrier 'id';
    // barrier_stalls marks warps currently held at some barrier.
    reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks, barrier_masks_n;
    reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n;
    wire [`CLOG2(`NUM_WARPS+1)-1:0] active_barrier_count;
    wire [`NUM_WARPS-1:0] curr_barrier_mask;
    wire [`NUM_WARPS-1:0] curr_barrier_mask_with_self;
    wire [`NUM_WARPS-1:0] scalar_warp_mask;
    wire [`NUM_WARPS-1:0] tensor_warp_mask;
    wire [`NUM_WARPS-1:0] barrier_domain_mask;
    wire [`NUM_WARPS-1:0] barrier_arrived_mask;
    wire [`CLOG2(`NUM_WARPS+1)-1:0] barrier_arrived_count;
`ifdef GBAR_ENABLE
    // NOTE(review): curr_barrier_mask_n is assigned below but not read
    // anywhere in this module.
    reg [`NUM_WARPS-1:0] curr_barrier_mask_n;
    reg gbar_req_valid;
    reg [`NB_WIDTH-1:0] gbar_req_id;
    reg [`NC_WIDTH-1:0] gbar_req_size_m1;
`endif

    // Static per-warp domain membership.
    for (genvar i = 0; i < `NUM_WARPS; ++i) begin
        assign scalar_warp_mask[i] = `IS_SCALAR_WARP(i);
        assign tensor_warp_mask[i] = `IS_TENSOR_WARP(i);
    end

    // Arrival set of the addressed barrier, before and after adding the
    // warp that is issuing the current barrier request.
    assign curr_barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
    assign curr_barrier_mask_with_self = curr_barrier_mask | (`NUM_WARPS'(1) << warp_ctl_if.wid);

    // Set of active warps that participate in the requested barrier domain;
    // any domain value other than SCALAR/TENSOR/MASK selects all active warps.
    assign barrier_domain_mask =
        (warp_ctl_if.barrier.domain == BARRIER_SCALAR) ? (active_warps & scalar_warp_mask) :
        (warp_ctl_if.barrier.domain == BARRIER_TENSOR) ? (active_warps & tensor_warp_mask) :
        (warp_ctl_if.barrier.domain == BARRIER_MASK)   ? (active_warps & warp_ctl_if.barrier.mask) :
                                                         active_warps;
    assign barrier_arrived_mask = curr_barrier_mask_with_self & barrier_domain_mask;

    `POP_COUNT(active_barrier_count, curr_barrier_mask);
    `POP_COUNT(barrier_arrived_count, barrier_arrived_mask);
    `UNUSED_VAR (active_barrier_count)

    // Next-state logic. Blocking assignments are evaluated top to bottom, so
    // when two events target the same warp in one cycle the later statement
    // wins; in particular the issue stall at the bottom overrides any unlock.
    always @(*) begin
        active_warps_n  = active_warps;
        stalled_warps_n = stalled_warps;
        thread_masks_n  = thread_masks;
        barrier_masks_n = barrier_masks;
        barrier_stalls_n= barrier_stalls;
        warp_pcs_n      = warp_pcs;

        // wspawn handling
        if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
            active_warps_n |= warp_ctl_if.wspawn.wmask;
            for (integer i = 0; i < `NUM_WARPS; ++i) begin
                if (warp_ctl_if.wspawn.wmask[i]) begin
                    // only thread 0's bit is set; other bits keep their value
                    thread_masks_n[i][0] = 1;
                    warp_pcs_n[i] = warp_ctl_if.wspawn.pc;
                end
            end
            stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
        end

        // TMC handling
        if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
            // an all-zero thread mask deactivates the warp
            active_warps_n[warp_ctl_if.wid]  = (warp_ctl_if.tmc.tmask != 0);
            thread_masks_n[warp_ctl_if.wid]  = warp_ctl_if.tmc.tmask;
            stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
        end

        // split handling
        if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
            if (warp_ctl_if.split.is_dvg) begin
                thread_masks_n[warp_ctl_if.wid] = warp_ctl_if.split.then_tmask;
            end
            stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
        end

        // join handling
        if (join_valid) begin
            if (join_is_dvg) begin
                if (join_is_else) begin
                    warp_pcs_n[join_wid] = join_pc;
                end
                thread_masks_n[join_wid] = join_tmask;
            end
            stalled_warps_n[join_wid] = 0; // unlock warp
        end

        // barrier handling
    `ifdef GBAR_ENABLE
        curr_barrier_mask_n = curr_barrier_mask;
        curr_barrier_mask_n[warp_ctl_if.wid] = 1;
    `endif
        if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
            // Local barrier completes when, for BARRIER_MASK, every warp in
            // the requested mask has arrived, or otherwise when the arrived
            // count equals size_m1 + 1 (compared on the low NW_WIDTH bits).
            if (~warp_ctl_if.barrier.is_global
             && ((warp_ctl_if.barrier.domain == BARRIER_MASK) ?
                    ((barrier_arrived_mask & warp_ctl_if.barrier.mask) == warp_ctl_if.barrier.mask) :
                    (barrier_arrived_count[`NW_WIDTH-1:0] == (warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0] + `NW_WIDTH'(1))))) begin
                barrier_masks_n[warp_ctl_if.barrier.id] = '0;
                barrier_stalls_n &= ~barrier_arrived_mask;
            end else begin
                // not complete yet (or global): record arrival and hold the warp
                barrier_masks_n[warp_ctl_if.barrier.id][warp_ctl_if.wid] = 1;
                barrier_stalls_n[warp_ctl_if.wid] = 1;
            end
            stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
        end
    `ifdef GBAR_ENABLE
    `ifdef GBAR_CLUSTER_ENABLE
        // don't check req_id == rsp_id, otherwise it limits us to
        // 1 outstanding request.  instead assume that any response coming
        // back contains a valid id
        //
        // NOTE(hansung): Because every response is broadcasted to all cores,
        // this doesn't work when cores in the cluster use different sets of
        // IDs.  Need a way to keep track of in-use barriers for each core and
        // validate responses accordingly.
        if (gbar_bus_if.rsp_valid) begin
            barrier_masks_n[gbar_bus_if.rsp_id] = '0;
            // instead of unlocking all warps, only unlock those that
            // requested this barrier
            barrier_stalls_n &= ~barrier_masks[gbar_bus_if.rsp_id];
        end
    `else
        if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
            barrier_masks_n[gbar_bus_if.rsp_id] = '0;
            barrier_stalls_n = '0; // unlock all warps
        end
    `endif
    `endif

        // Branch handling
        for (integer i = 0; i < NUM_BRANCHES; ++i) begin
            if (branch_valid[i]) begin
                if (branch_taken[i]) begin
                    warp_pcs_n[branch_wid[i]] = branch_dest[i];
                end
                stalled_warps_n[branch_wid[i]] = 0; // unlock warp
            end
        end

        // decode unlock
        if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
            stalled_warps_n[decode_sched_if.wid] = 0;
        end

        // CSR unlock
        if (sched_csr_if.unlock_warp) begin
            stalled_warps_n[sched_csr_if.unlock_wid] = 0;
        end

    `ifdef EXT_T_ENABLE
        // Tensor control handles a minimal CSR-read/TMC subset without
        // reusing the scalar SFU.
        if (tensor_csr_unlock_valid) begin
            stalled_warps_n[tensor_csr_unlock_wid] = 0;
        end
        if (tensor_tmc_valid) begin
            active_warps_n[tensor_tmc_wid]  = (tensor_tmc_tmask != 0);
            thread_masks_n[tensor_tmc_wid]  = tensor_tmc_tmask;
            stalled_warps_n[tensor_tmc_wid] = 0;
        end
    `endif

        // stall the warp until decode stage
        // Each port stalls its own warp, mirroring the per-port PC advance
        // below, so that a simultaneous fire on both ports can never leave
        // an issued warp unstalled.
        if (scalar_schedule_fire) begin
            stalled_warps_n[scalar_schedule_if.data.wid] = 1;
        end
        if (tensor_schedule_fire) begin
            stalled_warps_n[tensor_schedule_if.data.wid] = 1;
        end

        // advance PC
        if (scalar_schedule_fire) begin
            warp_pcs_n[scalar_schedule_if.data.wid] = scalar_schedule_if.data.PC + 4;
        end
        if (tensor_schedule_fire) begin
            warp_pcs_n[tensor_schedule_if.data.wid] = tensor_schedule_if.data.PC + 4;
        end
    end

    `UNUSED_VAR (base_dcrs)

    // State registers. On reset everything is cleared and then warp 0 /
    // thread 0 is activated at base_dcrs.startup_addr (the later
    // non-blocking assignments to the same registers take precedence).
    always @(posedge clk) begin
        if (reset) begin
            barrier_masks   <= '0;
        `ifdef GBAR_ENABLE
            gbar_req_valid  <= 0;
        `endif
            stalled_warps   <= '0;
            warp_pcs        <= '0;
            active_warps    <= '0;
            thread_masks    <= '0;
            barrier_stalls  <= '0;
            issued_instrs   <= '0;
            cycles          <= '0;

            // activate first warp
            warp_pcs[0]     <= base_dcrs.startup_addr;
            active_warps[0] <= 1;
            thread_masks[0][0] <= 1;
        end else begin
            active_warps   <= active_warps_n;
            stalled_warps  <= stalled_warps_n;
            thread_masks   <= thread_masks_n;
            warp_pcs       <= warp_pcs_n;
            barrier_masks  <= barrier_masks_n;
            barrier_stalls <= barrier_stalls_n;

            // global barrier scheduling
        `ifdef GBAR_ENABLE
            if (warp_ctl_if.valid && warp_ctl_if.barrier.valid && warp_ctl_if.barrier.is_global
        `ifdef GBAR_CLUSTER_ENABLE
             // engage cluster barrier as soon as the barrier count is
             // fulfilled, instead of requiring all warps to be synchronized
             && (barrier_arrived_count[`NW_WIDTH-1:0] == (warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0] + `NW_WIDTH'(1)))) begin
        `else
             && (barrier_arrived_mask == barrier_domain_mask)) begin
        `endif
                gbar_req_valid <= 1;
                gbar_req_id <= warp_ctl_if.barrier.id;
                gbar_req_size_m1 <= warp_ctl_if.barrier.size_m1[`NC_WIDTH-1:0];
            end
            // request accepted by the bus: drop valid
            if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin
                gbar_req_valid <= 0;
            end
        `endif

            if (scalar_schedule_fire) begin
                issued_instrs[scalar_schedule_if.data.wid] <= issued_instrs[scalar_schedule_if.data.wid] + `UUID_WIDTH'(1);
            end
            if (tensor_schedule_fire) begin
                issued_instrs[tensor_schedule_if.data.wid] <= issued_instrs[tensor_schedule_if.data.wid] + `UUID_WIDTH'(1);
            end

            if (busy) begin
                cycles <= cycles + 1;
            end
        end
    end

    // barrier handling

`ifdef GBAR_ENABLE
    assign gbar_bus_if.req_valid   = gbar_req_valid;
    assign gbar_bus_if.req_id      = gbar_req_id;
    assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
    // NOTE(hansung): since CORE_ID is global across multiple clusters, we
    // need the modulo to get the per-cluster local core id
    assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
`endif

    // split/join handling

    `RESET_RELAY (split_join_reset, reset);

    VX_split_join #(
        .CORE_ID (CORE_ID)
    ) split_join (
        .clk          (clk),
        .reset        (split_join_reset),
        .valid        (warp_ctl_if.valid),
        .wid          (warp_ctl_if.wid),
        .split        (warp_ctl_if.split),
        .sjoin        (warp_ctl_if.sjoin),
        .join_valid   (join_valid),
        .join_is_dvg  (join_is_dvg),
        .join_is_else (join_is_else),
        .join_wid     (join_wid),
        .join_tmask   (join_tmask),
        .join_pc      (join_pc)
    );

    // schedule the next ready warp

    // A warp is ready when active and not held by an issue stall or barrier.
    wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
    wire [`NUM_WARPS-1:0] scalar_ready_warps = ready_warps & scalar_warp_mask;
    wire [`NUM_WARPS-1:0] tensor_ready_warps = ready_warps & tensor_warp_mask;

    wire [`NW_WIDTH-1:0] scalar_schedule_wid;
    wire [`NW_WIDTH-1:0] tensor_schedule_wid;
    wire scalar_schedule_valid;
    wire tensor_schedule_valid;
    wire scalar_schedule_ready;
    wire tensor_schedule_ready;

    // One selector per domain picks a warp id out of that domain's ready set.
    // NOTE(review): presumably a round-robin priority encoder, judging by the
    // module name — confirm against VX_lzc_rr.
    VX_lzc_rr #(
        .N (`NUM_WARPS)
    ) scalar_wid_select (
        .clk       (clk),
        .reset     (reset),
        .data_in   (scalar_ready_warps),
        .data_out  (scalar_schedule_wid),
        .valid_out (scalar_schedule_valid)
    );

    VX_lzc_rr #(
        .N (`NUM_WARPS)
    ) tensor_wid_select (
        .clk       (clk),
        .reset     (reset),
        .data_in   (tensor_ready_warps),
        .data_out  (tensor_schedule_wid),
        .valid_out (tensor_schedule_valid)
    );

    // Per-warp {thread mask, PC} bundle, indexed by the selected warp id.
    wire [`NUM_WARPS-1:0][(`NUM_THREADS + `XLEN)-1:0] schedule_data;
    for (genvar i = 0; i < `NUM_WARPS; ++i) begin
        assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
    end

`ifndef NDEBUG
    localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);

    // Debug builds: build the instruction UUID from the global warp id
    // (core id shifted by NW_BITS plus warp id) and the low 16 bits of the PC.
    function automatic [`UUID_WIDTH-1:0] schedule_uuid (
        input logic [`NW_WIDTH-1:0] wid,
        input logic [`XLEN-1:0] pc
    );
        logic [GNW_WIDTH-1:0] g_wid;
        begin
            g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(wid);
            schedule_uuid = `UUID_WIDTH'({g_wid, 16'(pc)});
        end
    endfunction
`else
    // Non-debug builds: UUIDs are not generated and are tied to zero.
    function automatic [`UUID_WIDTH-1:0] schedule_uuid (
        input logic [`NW_WIDTH-1:0] wid,
        input logic [`XLEN-1:0] pc
    );
        begin
            `UNUSED_VAR (wid)
            `UNUSED_VAR (pc)
            schedule_uuid = '0;
        end
    endfunction
`endif

    // Output stages, one per domain. valid_in is additionally gated by
    // !reset so nothing is offered while reset is asserted.
    VX_elastic_buffer #(
        .DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH),
        .SIZE  (0)
    ) scalar_out_buf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (!reset && scalar_schedule_valid),
        .ready_in  (scalar_schedule_ready),
        .data_in   ({schedule_data[scalar_schedule_wid], scalar_schedule_wid}),
        .data_out  ({scalar_schedule_if.data.tmask, scalar_schedule_if.data.PC, scalar_schedule_if.data.wid}),
        .valid_out (scalar_schedule_if.valid),
        .ready_out (scalar_schedule_if.ready)
    );

    VX_elastic_buffer #(
        .DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH),
        .SIZE  (0)
    ) tensor_out_buf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (!reset && tensor_schedule_valid),
        .ready_in  (tensor_schedule_ready),
        .data_in   ({schedule_data[tensor_schedule_wid], tensor_schedule_wid}),
        .data_out  ({tensor_schedule_if.data.tmask, tensor_schedule_if.data.PC, tensor_schedule_if.data.wid}),
        .valid_out (tensor_schedule_if.valid),
        .ready_out (tensor_schedule_if.ready)
    );

    assign scalar_schedule_if.data.uuid = schedule_uuid(scalar_schedule_if.data.wid, scalar_schedule_if.data.PC);
    assign tensor_schedule_if.data.uuid = schedule_uuid(tensor_schedule_if.data.wid, tensor_schedule_if.data.PC);

    // Both domains firing in the same cycle is flagged as an error.
    `RUNTIME_ASSERT(
        !(scalar_schedule_fire && tensor_schedule_fire),
        ("%t: *** core%0d-schedule-two-domain-fire-with-single-fetch", $time, CORE_ID)
    )

    // Each port must only ever carry warps of its own domain.
    `RUNTIME_ASSERT(
        !scalar_schedule_if.valid || `IS_SCALAR_WARP(scalar_schedule_if.data.wid),
        ("%t: *** core%0d-scalar-scheduler-issued-tensor-warp wid=%0d", $time, CORE_ID, scalar_schedule_if.data.wid)
    )

    `RUNTIME_ASSERT(
        !tensor_schedule_if.valid || `IS_TENSOR_WARP(tensor_schedule_if.data.wid),
        ("%t: *** core%0d-tensor-scheduler-issued-scalar-warp wid=%0d", $time, CORE_ID, tensor_schedule_if.data.wid)
    )

    `RESET_RELAY (pending_instr_reset, reset);

    wire no_pending_instr;

    // Tracks in-flight instructions: incremented on decode, decremented on
    // commit; also drives the alm_empty status exported on sched_csr_if.
    VX_pending_instr #(
        .CTR_WIDTH  (12),
        .DECR_COUNT (`ISSUE_WIDTH),
        .ALM_EMPTY  (1)
    ) pending_instr(
        .clk       (clk),
        .reset     (pending_instr_reset),
        .incr      (decode_sched_if.valid),
        .incr_wid  (decode_sched_if.wid),
        .decr      (commit_sched_if.committed),
        .decr_wid  (commit_sched_if.committed_wid),
        .alm_empty_wid (sched_csr_if.alm_empty_wid),
        .alm_empty (sched_csr_if.alm_empty),
        .empty     (no_pending_instr)
    );

    // The core is busy while any warp is active, stalled or at a barrier,
    // or while instructions are still pending.
    `BUFFER_EX(busy, (active_warps != 0 || stalled_warps != 0 || barrier_stalls != 0 || ~no_pending_instr), 1'b1, 1);

    // export CSRs
    assign sched_csr_if.cycles = cycles;
    assign sched_csr_if.active_warps = active_warps;
    assign sched_csr_if.thread_masks = thread_masks;

    // timeout handling
    // Once the first non-wstall decode has been seen, count consecutive
    // cycles in which every active warp is stalled; the counter clears as
    // soon as that condition no longer holds.
    reg [31:0] timeout_ctr;
    reg timeout_enable;
    always @(posedge clk) begin
        if (reset) begin
            timeout_ctr    <= '0;
            timeout_enable <= 0;
        end else begin
            if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
                timeout_enable <= 1;
            end
            if (timeout_enable && active_warps !=0 && active_warps == stalled_warps) begin
                timeout_ctr <= timeout_ctr + 1;
            end else if (active_warps == 0 || active_warps != stalled_warps) begin
                timeout_ctr <= '0;
            end
        end
    end
    `RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));

    // A barrier request must target a non-empty domain ...
    `RUNTIME_ASSERT(
        !(warp_ctl_if.valid && warp_ctl_if.barrier.valid) || barrier_domain_mask != '0,
        ("%t: *** core%0d-invalid-barrier-empty-domain: wid=%0d id=%0d domain=%0d active=%b mask=%b",
            $time, CORE_ID, warp_ctl_if.wid, warp_ctl_if.barrier.id, warp_ctl_if.barrier.domain, active_warps, warp_ctl_if.barrier.mask)
    )

    // ... and the requesting warp must itself belong to that domain.
    `RUNTIME_ASSERT(
        !(warp_ctl_if.valid && warp_ctl_if.barrier.valid) || barrier_domain_mask[warp_ctl_if.wid],
        ("%t: *** core%0d-invalid-barrier-wid-domain: wid=%0d id=%0d domain=%0d active=%b mask=%b",
            $time, CORE_ID, warp_ctl_if.wid, warp_ctl_if.barrier.id, warp_ctl_if.barrier.domain, active_warps, warp_ctl_if.barrier.mask)
    )

`ifdef PERF_ENABLE
    // Performance counters, accumulated every cycle out of reset.
    reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
    reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
    reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_idles;
    reg [`PERF_CTR_BITS-1:0] perf_scalar_sched_ready_cycles;
    reg [`PERF_CTR_BITS-1:0] perf_tensor_sched_ready_cycles;
    reg [`PERF_CTR_BITS-1:0] perf_scalar_sched_issued_cycles;
    reg [`PERF_CTR_BITS-1:0] perf_tensor_sched_issued_cycles;

    // idle: neither port offers a warp; stall: a port offers one but is not ready
    wire schedule_idle = ~(scalar_schedule_if.valid || tensor_schedule_if.valid);
    wire schedule_stall = (scalar_schedule_if.valid && ~scalar_schedule_if.ready)
                       || (tensor_schedule_if.valid && ~tensor_schedule_if.ready);

    // number of warps held at a barrier this cycle
    wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_idle;
    `POP_COUNT(schedule_barrier_idle, barrier_stalls);

    always @(posedge clk) begin
        if (reset) begin
            perf_sched_idles  <= '0;
            perf_sched_barrier_idles <= '0;
            perf_sched_stalls <= '0;
            perf_scalar_sched_ready_cycles <= '0;
            perf_tensor_sched_ready_cycles <= '0;
            perf_scalar_sched_issued_cycles <= '0;
            perf_tensor_sched_issued_cycles <= '0;
        end else begin
            perf_sched_idles  <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
            perf_sched_barrier_idles <= perf_sched_barrier_idles + `PERF_CTR_BITS'(schedule_barrier_idle);
            perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
            perf_scalar_sched_ready_cycles <= perf_scalar_sched_ready_cycles + `PERF_CTR_BITS'(scalar_schedule_valid);
            perf_tensor_sched_ready_cycles <= perf_tensor_sched_ready_cycles + `PERF_CTR_BITS'(tensor_schedule_valid);
            perf_scalar_sched_issued_cycles <= perf_scalar_sched_issued_cycles + `PERF_CTR_BITS'(scalar_schedule_fire);
            perf_tensor_sched_issued_cycles <= perf_tensor_sched_issued_cycles + `PERF_CTR_BITS'(tensor_schedule_fire);
        end
    end

    assign perf_schedule_if.sched_idles = perf_sched_idles;
    assign perf_schedule_if.sched_barrier_idles = perf_sched_barrier_idles;
    assign perf_schedule_if.sched_stalls = perf_sched_stalls;
    assign perf_schedule_if.scalar_sched_ready_cycles = perf_scalar_sched_ready_cycles;
    assign perf_schedule_if.tensor_sched_ready_cycles = perf_tensor_sched_ready_cycles;
    assign perf_schedule_if.scalar_sched_issued_cycles = perf_scalar_sched_issued_cycles;
    assign perf_schedule_if.tensor_sched_issued_cycles = perf_tensor_sched_issued_cycles;
`endif

endmodule