thread mask redesign

This commit is contained in:
Blaise Tine
2021-08-05 17:32:58 -07:00
parent 7b8fe11e6a
commit e4d9fd8a00
14 changed files with 68 additions and 28 deletions

View File

@@ -223,6 +223,7 @@
`define CSR_LWID 12'hCC3 `define CSR_LWID 12'hCC3
`define CSR_GWID `CSR_MHARTID `define CSR_GWID `CSR_MHARTID
`define CSR_GCID 12'hCC5 `define CSR_GCID 12'hCC5
`define CSR_TMASK 12'hCC4
// Machine SIMT CSRs // Machine SIMT CSRs
`define CSR_NT 12'hFC0 `define CSR_NT 12'hFC0

View File

@@ -12,6 +12,7 @@ module VX_csr_data #(
`endif `endif
VX_cmt_to_csr_if cmt_to_csr_if, VX_cmt_to_csr_if cmt_to_csr_if,
VX_fetch_to_csr_if fetch_to_csr_if,
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
VX_fpu_to_csr_if fpu_to_csr_if, VX_fpu_to_csr_if fpu_to_csr_if,
@@ -114,6 +115,9 @@ module VX_csr_data #(
/*`CSR_MHARTID ,*/ /*`CSR_MHARTID ,*/
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid); `CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
`CSR_GCID : read_data_r = CORE_ID; `CSR_GCID : read_data_r = CORE_ID;
`CSR_TMASK : read_data_r = 32'(fetch_to_csr_if.thread_masks[read_wid]);
`CSR_NT : read_data_r = `NUM_THREADS; `CSR_NT : read_data_r = `NUM_THREADS;
`CSR_NW : read_data_r = `NUM_WARPS; `CSR_NW : read_data_r = `NUM_WARPS;
`CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS; `CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS;

View File

@@ -12,6 +12,7 @@ module VX_csr_unit #(
`endif `endif
VX_cmt_to_csr_if cmt_to_csr_if, VX_cmt_to_csr_if cmt_to_csr_if,
VX_fetch_to_csr_if fetch_to_csr_if,
VX_csr_req_if csr_req_if, VX_csr_req_if csr_req_if,
VX_commit_if csr_commit_if, VX_commit_if csr_commit_if,
@@ -42,6 +43,7 @@ module VX_csr_unit #(
.perf_pipeline_if (perf_pipeline_if), .perf_pipeline_if (perf_pipeline_if),
`endif `endif
.cmt_to_csr_if (cmt_to_csr_if), .cmt_to_csr_if (cmt_to_csr_if),
.fetch_to_csr_if(fetch_to_csr_if),
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if), .fpu_to_csr_if (fpu_to_csr_if),
`endif `endif

View File

@@ -12,9 +12,12 @@ module VX_execute #(
VX_dcache_req_if dcache_req_if, VX_dcache_req_if dcache_req_if,
VX_dcache_rsp_if dcache_rsp_if, VX_dcache_rsp_if dcache_rsp_if,
// commit status // commit interface
VX_cmt_to_csr_if cmt_to_csr_if, VX_cmt_to_csr_if cmt_to_csr_if,
// fetch interface
VX_fetch_to_csr_if fetch_to_csr_if,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if, VX_perf_memsys_if perf_memsys_if,
VX_perf_pipeline_if perf_pipeline_if, VX_perf_pipeline_if perf_pipeline_if,
@@ -84,9 +87,10 @@ module VX_execute #(
.reset (csr_reset), .reset (csr_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.perf_memsys_if (perf_memsys_if), .perf_memsys_if (perf_memsys_if),
.perf_pipeline_if (perf_pipeline_if), .perf_pipeline_if(perf_pipeline_if),
`endif `endif
.cmt_to_csr_if (cmt_to_csr_if), .cmt_to_csr_if (cmt_to_csr_if),
.fetch_to_csr_if(fetch_to_csr_if),
.csr_req_if (csr_req_if), .csr_req_if (csr_req_if),
.csr_commit_if (csr_commit_if), .csr_commit_if (csr_commit_if),
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE

View File

@@ -21,6 +21,10 @@ module VX_fetch #(
// outputs // outputs
VX_ifetch_rsp_if ifetch_rsp_if, VX_ifetch_rsp_if ifetch_rsp_if,
// csr interface
VX_fetch_to_csr_if fetch_to_csr_if,
// busy status
output wire busy output wire busy
); );
@@ -33,12 +37,17 @@ module VX_fetch #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.warp_ctl_if (warp_ctl_if), .warp_ctl_if (warp_ctl_if),
.wstall_if (wstall_if), .wstall_if (wstall_if),
.join_if (join_if), .join_if (join_if),
.branch_ctl_if (branch_ctl_if), .branch_ctl_if (branch_ctl_if),
.ifetch_req_if (ifetch_req_if), .ifetch_req_if (ifetch_req_if),
.ifetch_rsp_if (ifetch_rsp_if), .ifetch_rsp_if (ifetch_rsp_if),
.fetch_to_csr_if (fetch_to_csr_if),
.busy (busy) .busy (busy)
); );

View File

@@ -3,14 +3,10 @@
module VX_fpu_unit #( module VX_fpu_unit #(
parameter CORE_ID = 0 parameter CORE_ID = 0
) ( ) (
// inputs
input wire clk, input wire clk,
input wire reset, input wire reset,
// inputs
VX_fpu_req_if fpu_req_if, VX_fpu_req_if fpu_req_if,
// outputs
VX_fpu_to_csr_if fpu_to_csr_if, VX_fpu_to_csr_if fpu_to_csr_if,
VX_commit_if fpu_commit_if, VX_commit_if fpu_commit_if,

View File

@@ -32,19 +32,15 @@ module VX_gpu_unit #(
// tmc // tmc
wire [`NUM_THREADS-1:0] tmc_new_mask;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign tmc_new_mask[i] = (i < gpu_req_if.rs1_data[0]);
end
assign tmc.valid = is_tmc; assign tmc.valid = is_tmc;
assign tmc.tmask = tmc_new_mask; assign tmc.tmask = `NUM_THREADS'(gpu_req_if.rs1_data[gpu_req_if.tid]);
// wspawn // wspawn
wire [31:0] wspawn_pc = gpu_req_if.rs2_data; wire [31:0] wspawn_pc = gpu_req_if.rs2_data;
wire [`NUM_WARPS-1:0] wspawn_wmask; wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; i++) begin for (genvar i = 0; i < `NUM_WARPS; i++) begin
assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[0]); assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[gpu_req_if.tid]);
end end
assign wspawn.valid = is_wspawn; assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask; assign wspawn.wmask = wspawn_wmask;
@@ -56,7 +52,7 @@ module VX_gpu_unit #(
wire [`NUM_THREADS-1:0] split_else_mask; wire [`NUM_THREADS-1:0] split_else_mask;
for (genvar i = 0; i < `NUM_THREADS; i++) begin for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire taken = gpu_req_if.rs1_data[i][0]; wire taken = gpu_req_if.rs1_data[i][gpu_req_if.tid];
assign split_then_mask[i] = gpu_req_if.tmask[i] & taken; assign split_then_mask[i] = gpu_req_if.tmask[i] & taken;
assign split_else_mask[i] = gpu_req_if.tmask[i] & ~taken; assign split_else_mask[i] = gpu_req_if.tmask[i] & ~taken;
end end
@@ -70,7 +66,7 @@ module VX_gpu_unit #(
// barrier // barrier
assign barrier.valid = is_bar; assign barrier.valid = is_bar;
assign barrier.id = gpu_req_if.rs1_data[0][`NB_BITS-1:0]; assign barrier.id = gpu_req_if.rs1_data[gpu_req_if.tid][`NB_BITS-1:0];
assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1); assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1);
// output // output

View File

@@ -119,15 +119,15 @@ module VX_instr_demux (
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU); wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
VX_skid_buffer #( VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32)), .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)),
.OUTPUT_REG (1) .OUTPUT_REG (1)
) gpu_buffer ( ) gpu_buffer (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.valid_in (gpu_req_valid), .valid_in (gpu_req_valid),
.ready_in (gpu_req_ready), .ready_in (gpu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}), .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data}), .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
.valid_out (gpu_req_if.valid), .valid_out (gpu_req_if.valid),
.ready_out (gpu_req_if.ready) .ready_out (gpu_req_if.ready)
); );

View File

@@ -108,6 +108,7 @@ module VX_pipeline #(
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
VX_fetch_to_csr_if fetch_to_csr_if();
VX_cmt_to_csr_if cmt_to_csr_if(); VX_cmt_to_csr_if cmt_to_csr_if();
VX_decode_if decode_if(); VX_decode_if decode_if();
VX_branch_ctl_if branch_ctl_if(); VX_branch_ctl_if branch_ctl_if();
@@ -155,6 +156,7 @@ module VX_pipeline #(
.warp_ctl_if (warp_ctl_if), .warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if), .branch_ctl_if (branch_ctl_if),
.ifetch_rsp_if (ifetch_rsp_if), .ifetch_rsp_if (ifetch_rsp_if),
.fetch_to_csr_if(fetch_to_csr_if),
.busy (busy) .busy (busy)
); );
@@ -210,6 +212,7 @@ module VX_pipeline #(
.dcache_rsp_if (dcache_rsp_if), .dcache_rsp_if (dcache_rsp_if),
.cmt_to_csr_if (cmt_to_csr_if), .cmt_to_csr_if (cmt_to_csr_if),
.fetch_to_csr_if(fetch_to_csr_if),
.alu_req_if (alu_req_if), .alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if), .lsu_req_if (lsu_req_if),

View File

@@ -16,6 +16,8 @@ module VX_warp_sched #(
VX_ifetch_rsp_if ifetch_rsp_if, VX_ifetch_rsp_if ifetch_rsp_if,
VX_ifetch_req_if ifetch_req_if, VX_ifetch_req_if ifetch_req_if,
VX_fetch_to_csr_if fetch_to_csr_if,
output wire busy output wire busy
); );
@@ -153,6 +155,9 @@ module VX_warp_sched #(
end end
end end
// export thread mask register
assign fetch_to_csr_if.thread_masks = thread_masks;
// calculate active barrier status // calculate active barrier status
`IGNORE_UNUSED_BEGIN `IGNORE_UNUSED_BEGIN

View File

@@ -0,0 +1,12 @@
// Include guard so the interface is only declared once per compilation unit.
`ifndef VX_FETCH_TO_CSR_IF
`define VX_FETCH_TO_CSR_IF
// Provides the `NUM_THREADS / `NUM_WARPS macros used below.
`include "VX_define.vh"
// Interface carrying the per-warp thread masks from the fetch stage to the
// CSR logic. The warp scheduler drives `thread_masks`, and the CSR data
// module reads the entry selected by the requesting warp id to serve
// CSR_TMASK reads.
interface VX_fetch_to_csr_if ();
// One `NUM_THREADS-bit mask per warp; indexed by warp id.
wire [`NUM_THREADS-1:0] thread_masks [`NUM_WARPS-1:0];
endinterface
`endif

View File

@@ -12,6 +12,7 @@ interface VX_gpu_req_if();
wire [31:0] PC; wire [31:0] PC;
wire [31:0] next_PC; wire [31:0] next_PC;
wire [`GPU_BITS-1:0] op_type; wire [`GPU_BITS-1:0] op_type;
wire [`NT_BITS-1:0] tid;
wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [31:0] rs2_data; wire [31:0] rs2_data;
wire [`NR_BITS-1:0] rd; wire [`NR_BITS-1:0] rd;

View File

@@ -121,6 +121,13 @@ inline int vx_core_id() {
return result; return result;
} }
// Return the current thread mask.
// Reads the CSR_TMASK control/status register with a `csrr` instruction and
// hands the raw register value back to the caller as an int.
inline int vx_thread_mask() {
    int tmask;
    // CSR_TMASK is passed as an immediate operand ("i"), so it must be a
    // compile-time constant.
    asm volatile ("csrr %0, %1" : "=r"(tmask) : "i"(CSR_TMASK));
    return tmask;
}
// Return the number of threads in a warp // Return the number of threads in a warp
inline int vx_num_threads() { inline int vx_num_threads() {
int result; int result;