diff --git a/hw/rtl/VX_commit.v b/hw/rtl/VX_commit.v index 95c52c59..807d874c 100644 --- a/hw/rtl/VX_commit.v +++ b/hw/rtl/VX_commit.v @@ -17,8 +17,8 @@ module VX_commit #( VX_commit_if gpu_commit_if, // outputs - VX_writeback_if writeback_if, - VX_cmt_to_csr_if cmt_to_csr_if + VX_writeback_if writeback_if, + VX_cmt_to_csr_if cmt_to_csr_if ); localparam CMTW = $clog2(3*`NUM_THREADS+1); diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index a294a6ff..e28a41b1 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -223,6 +223,7 @@ `define CSR_LWID 12'hCC3 `define CSR_GWID `CSR_MHARTID `define CSR_GCID 12'hCC5 +`define CSR_TMASK 12'hCC4 // Machine SIMT CSRs `define CSR_NT 12'hFC0 diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index 5dc6d6d8..3aa3a17e 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -12,6 +12,7 @@ module VX_csr_data #( `endif VX_cmt_to_csr_if cmt_to_csr_if, + VX_fetch_to_csr_if fetch_to_csr_if, `ifdef EXT_F_ENABLE VX_fpu_to_csr_if fpu_to_csr_if, @@ -62,15 +63,15 @@ module VX_csr_data #( `CSR_FRM: fcsr[write_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0]; `CSR_FCSR: fcsr[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0]; - `CSR_SATP: csr_satp <= write_data; + `CSR_SATP: csr_satp <= write_data; - `CSR_MSTATUS: csr_mstatus <= write_data; - `CSR_MEDELEG: csr_medeleg <= write_data; - `CSR_MIDELEG: csr_mideleg <= write_data; - `CSR_MIE: csr_mie <= write_data; - `CSR_MTVEC: csr_mtvec <= write_data; + `CSR_MSTATUS: csr_mstatus <= write_data; + `CSR_MEDELEG: csr_medeleg <= write_data; + `CSR_MIDELEG: csr_mideleg <= write_data; + `CSR_MIE: csr_mie <= write_data; + `CSR_MTVEC: csr_mtvec <= write_data; - `CSR_MEPC: csr_mepc <= write_data; + `CSR_MEPC: csr_mepc <= write_data; `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data; `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data; @@ -114,6 +115,9 @@ module VX_csr_data #( /*`CSR_MHARTID ,*/ `CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid); `CSR_GCID : read_data_r 
= CORE_ID; + + `CSR_TMASK : read_data_r = 32'(fetch_to_csr_if.thread_masks[read_wid]); + `CSR_NT : read_data_r = `NUM_THREADS; `CSR_NW : read_data_r = `NUM_WARPS; `CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS; diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index 9be2ec9a..54cad7e7 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -12,6 +12,7 @@ module VX_csr_unit #( `endif VX_cmt_to_csr_if cmt_to_csr_if, + VX_fetch_to_csr_if fetch_to_csr_if, VX_csr_req_if csr_req_if, VX_commit_if csr_commit_if, @@ -42,6 +43,7 @@ module VX_csr_unit #( .perf_pipeline_if (perf_pipeline_if), `endif .cmt_to_csr_if (cmt_to_csr_if), + .fetch_to_csr_if(fetch_to_csr_if), `ifdef EXT_F_ENABLE .fpu_to_csr_if (fpu_to_csr_if), `endif diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index 96fb1f1a..dfb45259 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -12,9 +12,12 @@ module VX_execute #( VX_dcache_req_if dcache_req_if, VX_dcache_rsp_if dcache_rsp_if, - // commit status + // commit interface VX_cmt_to_csr_if cmt_to_csr_if, + // fetch interface + VX_fetch_to_csr_if fetch_to_csr_if, + `ifdef PERF_ENABLE VX_perf_memsys_if perf_memsys_if, VX_perf_pipeline_if perf_pipeline_if, @@ -84,9 +87,10 @@ module VX_execute #( .reset (csr_reset), `ifdef PERF_ENABLE .perf_memsys_if (perf_memsys_if), - .perf_pipeline_if (perf_pipeline_if), + .perf_pipeline_if(perf_pipeline_if), `endif .cmt_to_csr_if (cmt_to_csr_if), + .fetch_to_csr_if(fetch_to_csr_if), .csr_req_if (csr_req_if), .csr_commit_if (csr_commit_if), `ifdef EXT_F_ENABLE diff --git a/hw/rtl/VX_fetch.v b/hw/rtl/VX_fetch.v index 5760beea..30f786e3 100644 --- a/hw/rtl/VX_fetch.v +++ b/hw/rtl/VX_fetch.v @@ -21,6 +21,10 @@ module VX_fetch #( // outputs VX_ifetch_rsp_if ifetch_rsp_if, + // csr interface + VX_fetch_to_csr_if fetch_to_csr_if, + + // busy status output wire busy ); @@ -32,13 +36,18 @@ module VX_fetch #( `SCOPE_BIND_VX_fetch_warp_sched .clk (clk), - .reset (reset), + .reset (reset), + .warp_ctl_if 
(warp_ctl_if), .wstall_if (wstall_if), .join_if (join_if), .branch_ctl_if (branch_ctl_if), + .ifetch_req_if (ifetch_req_if), .ifetch_rsp_if (ifetch_rsp_if), + + .fetch_to_csr_if (fetch_to_csr_if), + .busy (busy) ); diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index b6a575ff..a1e991c4 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -3,14 +3,10 @@ module VX_fpu_unit #( parameter CORE_ID = 0 ) ( - // inputs input wire clk, input wire reset, - // inputs VX_fpu_req_if fpu_req_if, - - // outputs VX_fpu_to_csr_if fpu_to_csr_if, VX_commit_if fpu_commit_if, diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index 9b02db2a..3836bb10 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -32,19 +32,15 @@ module VX_gpu_unit #( // tmc - wire [`NUM_THREADS-1:0] tmc_new_mask; - for (genvar i = 0; i < `NUM_THREADS; i++) begin - assign tmc_new_mask[i] = (i < gpu_req_if.rs1_data[0]); - end assign tmc.valid = is_tmc; - assign tmc.tmask = tmc_new_mask; + assign tmc.tmask = `NUM_THREADS'(gpu_req_if.rs1_data[gpu_req_if.tid]); // wspawn wire [31:0] wspawn_pc = gpu_req_if.rs2_data; wire [`NUM_WARPS-1:0] wspawn_wmask; for (genvar i = 0; i < `NUM_WARPS; i++) begin - assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[0]); + assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[gpu_req_if.tid]); end assign wspawn.valid = is_wspawn; assign wspawn.wmask = wspawn_wmask; @@ -56,7 +52,7 @@ module VX_gpu_unit #( wire [`NUM_THREADS-1:0] split_else_mask; for (genvar i = 0; i < `NUM_THREADS; i++) begin - wire taken = gpu_req_if.rs1_data[i][0]; + wire taken = gpu_req_if.rs1_data[i][0]; /* predicate is bit 0 of thread i's own rs1; gpu_req_if.tid is a thread index, not a bit position */ assign split_then_mask[i] = gpu_req_if.tmask[i] & taken; assign split_else_mask[i] = gpu_req_if.tmask[i] & ~taken; end @@ -70,7 +66,7 @@ module VX_gpu_unit #( // barrier assign barrier.valid = is_bar; - assign barrier.id = gpu_req_if.rs1_data[0][`NB_BITS-1:0]; + assign barrier.id = gpu_req_if.rs1_data[gpu_req_if.tid][`NB_BITS-1:0]; assign barrier.size_m1 = 
(`NW_BITS)'(gpu_req_if.rs2_data - 1); // output diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 403be8b8..1fd79f37 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -119,15 +119,15 @@ module VX_instr_demux ( wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32)), + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)), .OUTPUT_REG (1) ) gpu_buffer ( .clk (clk), .reset (reset), .valid_in (gpu_req_valid), .ready_in (gpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}), - .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}), + .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}), .valid_out (gpu_req_if.valid), .ready_out (gpu_req_if.ready) ); diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index c5f76f6e..53a88e41 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -108,6 +108,7 @@ module VX_pipeline #( /////////////////////////////////////////////////////////////////////////// + VX_fetch_to_csr_if fetch_to_csr_if(); VX_cmt_to_csr_if cmt_to_csr_if(); VX_decode_if decode_if(); VX_branch_ctl_if branch_ctl_if(); @@ -155,6 +156,7 @@ module VX_pipeline #( .warp_ctl_if (warp_ctl_if), .branch_ctl_if (branch_ctl_if), .ifetch_rsp_if (ifetch_rsp_if), + 
.fetch_to_csr_if(fetch_to_csr_if), .busy (busy) ); @@ -209,7 +211,8 @@ module VX_pipeline #( .dcache_req_if (dcache_req_if), .dcache_rsp_if (dcache_rsp_if), - .cmt_to_csr_if (cmt_to_csr_if), + .cmt_to_csr_if (cmt_to_csr_if), + .fetch_to_csr_if(fetch_to_csr_if), .alu_req_if (alu_req_if), .lsu_req_if (lsu_req_if), diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 85ec9753..0b140a12 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -16,6 +16,8 @@ module VX_warp_sched #( VX_ifetch_rsp_if ifetch_rsp_if, VX_ifetch_req_if ifetch_req_if, + VX_fetch_to_csr_if fetch_to_csr_if, + output wire busy ); @@ -153,6 +155,9 @@ module VX_warp_sched #( end end + // export thread mask register + assign fetch_to_csr_if.thread_masks = thread_masks; + // calculate active barrier status `IGNORE_UNUSED_BEGIN diff --git a/hw/rtl/interfaces/VX_fetch_to_csr_if.v b/hw/rtl/interfaces/VX_fetch_to_csr_if.v new file mode 100644 index 00000000..e2593fe5 --- /dev/null +++ b/hw/rtl/interfaces/VX_fetch_to_csr_if.v @@ -0,0 +1,12 @@ +`ifndef VX_FETCH_TO_CSR_IF +`define VX_FETCH_TO_CSR_IF + +`include "VX_define.vh" + +interface VX_fetch_to_csr_if (); + + wire [`NUM_THREADS-1:0] thread_masks [`NUM_WARPS-1:0]; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_gpu_req_if.v b/hw/rtl/interfaces/VX_gpu_req_if.v index 5f024ae9..1b49fda0 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.v +++ b/hw/rtl/interfaces/VX_gpu_req_if.v @@ -12,6 +12,7 @@ interface VX_gpu_req_if(); wire [31:0] PC; wire [31:0] next_PC; wire [`GPU_BITS-1:0] op_type; + wire [`NT_BITS-1:0] tid; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [31:0] rs2_data; wire [`NR_BITS-1:0] rd; diff --git a/runtime/include/vx_intrinsics.h b/runtime/include/vx_intrinsics.h index 41e1b5a5..cb780c3d 100644 --- a/runtime/include/vx_intrinsics.h +++ b/runtime/include/vx_intrinsics.h @@ -121,6 +121,13 @@ inline int vx_core_id() { return result; } +// Return current thread mask +inline int 
vx_thread_mask() { + int result; + asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_TMASK)); + return result; +} + // Return the number of threads in a warp inline int vx_num_threads() { int result;