From 9f128085d590c34a53cdd294b282fec40cfb1d9e Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 30 Dec 2020 06:47:56 -0800 Subject: [PATCH] scoreboard optimization - using writeback's end-of-packet status --- hw/rtl/VX_alu_unit.v | 2 ++ hw/rtl/VX_csr_io_arb.v | 1 + hw/rtl/VX_csr_unit.v | 2 ++ hw/rtl/VX_fpu_unit.v | 2 ++ hw/rtl/VX_gpu_unit.v | 2 ++ hw/rtl/VX_ibuffer.v | 3 ++ hw/rtl/VX_issue.v | 6 +++- hw/rtl/VX_lsu_unit.v | 7 ++-- hw/rtl/VX_mul_unit.v | 2 ++ hw/rtl/VX_scoreboard.v | 53 ++++++++++++++--------------- hw/rtl/VX_writeback.v | 35 ++++++++++--------- hw/rtl/fp_cores/VX_fp_ncomp.v | 15 ++++---- hw/rtl/interfaces/VX_commit_if.v | 1 + hw/rtl/interfaces/VX_writeback_if.v | 1 + hw/scripts/scope.json | 1 + 15 files changed, 76 insertions(+), 57 deletions(-) diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index 1077de02..4db50618 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -106,6 +106,8 @@ module VX_alu_unit #( .data_in ({alu_req_if.valid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, br_dest, cmp_result}), .data_out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.tmask, alu_commit_if.PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_r, br_op_r, branch_ctl_if.dest, cmp_result_r}) ); + + assign alu_commit_if.eop = 1'b1; wire is_less = cmp_result_r[32]; wire is_equal = ~(| cmp_result_r[31:0]); diff --git a/hw/rtl/VX_csr_io_arb.v b/hw/rtl/VX_csr_io_arb.v index b43afc44..50f23797 100644 --- a/hw/rtl/VX_csr_io_arb.v +++ b/hw/rtl/VX_csr_io_arb.v @@ -63,6 +63,7 @@ module VX_csr_io_arb ( assign csr_commit_if.PC = csr_pipe_rsp_if.PC; assign csr_commit_if.rd = csr_pipe_rsp_if.rd; assign csr_commit_if.wb = csr_pipe_rsp_if.wb; + assign csr_commit_if.eop = csr_pipe_rsp_if.eop; assign csr_commit_if.data = csr_pipe_rsp_if.data; assign csr_pipe_rsp_if.ready = select_io_rsp ? 
csr_io_rsp_ready : csr_commit_if.ready; diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index 8ff25b4a..d58f5180 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -125,6 +125,8 @@ module VX_csr_unit #( csr_read_data_s1; end + assign csr_pipe_rsp_if.eop = 1'b1; + // can accept new request? assign csr_pipe_req_if.ready = ~(stall_out || stall_in); diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index ede9a8db..1ca5c66c 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -161,6 +161,8 @@ module VX_fpu_unit #( .data_out ({fpu_commit_if.valid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r}) ); + assign fpu_commit_if.eop = 1'b1; + assign ready_out = ~stall_out; // CSR fflags Update diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index c3bce8a0..0c3b1a52 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -86,6 +86,8 @@ module VX_gpu_unit #( .data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier}) ); + assign gpu_commit_if.eop = 1'b1; + assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready; assign warp_ctl_if.wid = gpu_commit_if.wid; diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index f08f1f16..131074ef 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -11,6 +11,7 @@ module VX_ibuffer #( VX_decode_if ibuf_enq_if, // outputs + output wire [`NW_BITS-1:0] deq_wid_next, VX_decode_if ibuf_deq_if ); localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `OP_BITS + `FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + `NUM_REGS; @@ -194,6 +195,8 @@ module VX_ibuffer #( end end + assign deq_wid_next = deq_wid_n; + assign ibuf_enq_if.ready = ~q_full[ibuf_enq_if.wid]; assign q_data_in = {ibuf_enq_if.tmask, ibuf_enq_if.PC, diff --git a/hw/rtl/VX_issue.v 
b/hw/rtl/VX_issue.v index b519f6f4..78455220 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -27,6 +27,7 @@ module VX_issue #( VX_gpr_req_if gpr_req_if(); VX_gpr_rsp_if gpr_rsp_if(); + wire [`NW_BITS-1:0] deq_wid_next; wire scoreboard_delay; VX_ibuffer #( @@ -36,7 +37,8 @@ module VX_issue #( .reset (reset), .freeze (1'b0), .ibuf_enq_if (decode_if), - .ibuf_deq_if (ibuf_deq_if) + .ibuf_deq_if (ibuf_deq_if), + .deq_wid_next (deq_wid_next) ); VX_scoreboard #( @@ -46,6 +48,7 @@ module VX_issue #( .reset (reset), .ibuf_deq_if (ibuf_deq_if), .writeback_if (writeback_if), + .deq_wid_next (deq_wid_next), .delay (scoreboard_delay) ); @@ -120,6 +123,7 @@ module VX_issue #( `SCOPE_ASSIGN (writeback_pc, writeback_if.PC); `SCOPE_ASSIGN (writeback_rd, writeback_if.rd); `SCOPE_ASSIGN (writeback_data, writeback_if.data); + `SCOPE_ASSIGN (writeback_eop, writeback_if.eop); `ifdef PERF_ENABLE reg [63:0] perf_ibf_stalls; diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 7eb74eb5..4cd4017d 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -201,6 +201,7 @@ module VX_lsu_unit #( assign st_commit_if.PC = req_pc; assign st_commit_if.rd = 0; assign st_commit_if.wb = 0; + assign st_commit_if.eop = 1'b1; assign st_commit_if.data = 0; // send load commit @@ -210,14 +211,14 @@ module VX_lsu_unit #( wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1), .RESETW (1) ) rsp_pipe_reg ( .clk (clk), .reset (reset), .enable (!load_rsp_stall), - .data_in ({is_load_rsp, rsp_wid, dcache_rsp_if.valid, rsp_pc, rsp_rd, rsp_wb, rsp_data}), - .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data}) + .data_in ({is_load_rsp, rsp_wid, dcache_rsp_if.valid, rsp_pc, rsp_rd, rsp_wb, rsp_data, 
mbuf_pop}), + .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) ); // Can accept new cache response? diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index 3f86e81b..a60e337c 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -138,6 +138,8 @@ module VX_mul_unit #( .data_out ({mul_commit_if.valid, mul_commit_if.wid, mul_commit_if.tmask, mul_commit_if.PC, mul_commit_if.rd, mul_commit_if.wb, mul_commit_if.data}) ); + assign mul_commit_if.eop = 1'b1; + // can accept new request? assign mul_req_if.ready = is_div_op ? div_ready_in : mul_ready_in; diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index 063f2e73..fd4efee0 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -8,44 +8,41 @@ module VX_scoreboard #( VX_decode_if ibuf_deq_if, VX_writeback_if writeback_if, + input wire [`NW_BITS-1:0] deq_wid_next, output wire delay ); - reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0]; - reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_reg_mask; - wire [`NUM_REGS-1:0] inuse_regs; - wire [`NUM_THREADS-1:0] inuse_registers_n; + reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n; + reg [`NUM_REGS-1:0] deq_inuse_regs; + wire [`NUM_REGS-1:0] deq_real_inuse_regs; - assign inuse_regs = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs; + assign deq_real_inuse_regs = deq_inuse_regs & ibuf_deq_if.used_regs; - assign delay = (| inuse_regs); + assign delay = (| deq_real_inuse_regs); wire reserve_reg = ibuf_deq_if.valid && ibuf_deq_if.ready && (ibuf_deq_if.wb != 0); - wire release_reg = writeback_if.valid && writeback_if.ready; + wire release_reg = writeback_if.valid && writeback_if.ready && writeback_if.eop; - assign inuse_registers_n = inuse_registers[{writeback_if.wid, writeback_if.rd}] & ~writeback_if.tmask; + always @(*) begin + inuse_regs_n = inuse_regs; + if (reserve_reg) begin + 
inuse_regs_n[ibuf_deq_if.wid][ibuf_deq_if.rd] = 1; + end + if (release_reg) begin + inuse_regs_n[writeback_if.wid][writeback_if.rd] = 0; + assert(inuse_regs[writeback_if.wid][writeback_if.rd] != 0) + else $error("*** %t: core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d", + $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd); + end + end always @(posedge clk) begin if (reset) begin - for (integer w = 0; w < `NUM_WARPS; w++) begin - for (integer i = 0; i < `NUM_REGS; i++) begin - inuse_registers[w * `NUM_REGS + i] <= 0; - end - inuse_reg_mask[w] <= `NUM_REGS'(0); - end + inuse_regs <= (`NUM_WARPS*`NUM_REGS)'(0); end else begin - if (reserve_reg) begin - inuse_registers[{ibuf_deq_if.wid, ibuf_deq_if.rd}] <= ibuf_deq_if.tmask; - inuse_reg_mask[ibuf_deq_if.wid][ibuf_deq_if.rd] <= 1; - end - if (release_reg) begin - assert(inuse_reg_mask[writeback_if.wid][writeback_if.rd] != 0) - else $error("*** %t: core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d", - $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd); - inuse_registers[{writeback_if.wid, writeback_if.rd}] <= inuse_registers_n; - inuse_reg_mask[writeback_if.wid][writeback_if.rd] <= (| inuse_registers_n); - end - end + inuse_regs <= inuse_regs_n; + end + deq_inuse_regs <= inuse_regs_n[deq_wid_next]; end `ifdef DBG_PRINT_PIPELINE @@ -53,7 +50,7 @@ module VX_scoreboard #( if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin $display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", $time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb, - inuse_regs[ibuf_deq_if.rd], inuse_regs[ibuf_deq_if.rs1], inuse_regs[ibuf_deq_if.rs2], inuse_regs[ibuf_deq_if.rs3]); + deq_real_inuse_regs[ibuf_deq_if.rd], deq_real_inuse_regs[ibuf_deq_if.rs1], deq_real_inuse_regs[ibuf_deq_if.rs2], deq_real_inuse_regs[ibuf_deq_if.rs3]); end end `endif @@ -66,7 +63,7 @@ module VX_scoreboard #( stall_ctr <= stall_ctr + 1; assert(stall_ctr < 100000) else 
$error("*** %t: core%0d-stalled: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", $time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb, - inuse_regs[ibuf_deq_if.rd], inuse_regs[ibuf_deq_if.rs1], inuse_regs[ibuf_deq_if.rs2], inuse_regs[ibuf_deq_if.rs3]); + deq_real_inuse_regs[ibuf_deq_if.rd], deq_real_inuse_regs[ibuf_deq_if.rs1], deq_real_inuse_regs[ibuf_deq_if.rs2], deq_real_inuse_regs[ibuf_deq_if.rs3]); end else if (ibuf_deq_if.valid && ibuf_deq_if.ready) begin stall_ctr <= 0; end diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 7caff91f..62794375 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -20,68 +20,69 @@ module VX_writeback #( wire ld_valid = ld_commit_if.valid && ld_commit_if.wb; wire csr_valid = csr_commit_if.valid && csr_commit_if.wb; wire mul_valid = mul_commit_if.valid && mul_commit_if.wb; - wire fpu_valid = fpu_commit_if.valid && fpu_commit_if.wb; + /*wire fpu_valid = fpu_commit_if.valid && fpu_commit_if.wb;*/ wire wb_valid; wire [`NW_BITS-1:0] wb_wid; - wire [31:0] wb_PC; + wire [31:0] wb_PC; wire [`NUM_THREADS-1:0] wb_tmask; wire [`NR_BITS-1:0] wb_rd; wire [`NUM_THREADS-1:0][31:0] wb_data; + wire wb_eop; assign wb_valid = alu_valid ? alu_commit_if.valid : ld_valid ? ld_commit_if.valid : csr_valid ? csr_commit_if.valid : mul_valid ? mul_commit_if.valid : - fpu_valid ? fpu_commit_if.valid : - 0; + /*fpu_valid ?*/ fpu_commit_if.valid; assign wb_wid = alu_valid ? alu_commit_if.wid : ld_valid ? ld_commit_if.wid : csr_valid ? csr_commit_if.wid : mul_valid ? mul_commit_if.wid : - fpu_valid ? fpu_commit_if.wid : - 0; + /*fpu_valid ?*/ fpu_commit_if.wid; assign wb_PC = alu_valid ? alu_commit_if.PC : ld_valid ? ld_commit_if.PC : csr_valid ? csr_commit_if.PC : mul_valid ? mul_commit_if.PC : - fpu_valid ? fpu_commit_if.PC : - 0; + /*fpu_valid ?*/ fpu_commit_if.PC; assign wb_tmask = alu_valid ? alu_commit_if.tmask : ld_valid ? ld_commit_if.tmask : csr_valid ? csr_commit_if.tmask : mul_valid ? 
mul_commit_if.tmask : - fpu_valid ? fpu_commit_if.tmask : - 0; + /*fpu_valid ?*/ fpu_commit_if.tmask; assign wb_rd = alu_valid ? alu_commit_if.rd : ld_valid ? ld_commit_if.rd : csr_valid ? csr_commit_if.rd : mul_valid ? mul_commit_if.rd : - fpu_valid ? fpu_commit_if.rd : - 0; + /*fpu_valid ?*/ fpu_commit_if.rd; assign wb_data = alu_valid ? alu_commit_if.data : ld_valid ? ld_commit_if.data : csr_valid ? csr_commit_if.data : mul_valid ? mul_commit_if.data : - fpu_valid ? fpu_commit_if.data : - 0; + /*fpu_valid ?*/ fpu_commit_if.data; + + assign wb_eop = alu_valid ? alu_commit_if.eop : + ld_valid ? ld_commit_if.eop : + csr_valid ? csr_commit_if.eop : + mul_valid ? mul_commit_if.eop : + /*fpu_valid ?*/ fpu_commit_if.eop; wire stall = ~writeback_if.ready && writeback_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32)), + .DATAW (1 + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall), - .data_in ({wb_valid, wb_wid, wb_PC, wb_tmask, wb_rd, wb_data}), - .data_out ({writeback_if.valid, writeback_if.wid, writeback_if.PC, writeback_if.tmask, writeback_if.rd, writeback_if.data}) + .data_in ({wb_valid, wb_wid, wb_PC, wb_tmask, wb_rd, wb_data, wb_eop}), + .data_out ({writeback_if.valid, writeback_if.wid, writeback_if.PC, writeback_if.tmask, writeback_if.rd, writeback_if.data, writeback_if.eop}) ); assign alu_commit_if.ready = !stall; diff --git a/hw/rtl/fp_cores/VX_fp_ncomp.v b/hw/rtl/fp_cores/VX_fp_ncomp.v index bdd4b199..a804f1a7 100644 --- a/hw/rtl/fp_cores/VX_fp_ncomp.v +++ b/hw/rtl/fp_cores/VX_fp_ncomp.v @@ -44,12 +44,6 @@ module VX_fp_ncomp #( fp_type_t [LANES-1:0] tmp_a_type, tmp_b_type; wire [LANES-1:0] tmp_a_smaller, tmp_ab_equal; - wire [LANES-1:0][31:0] fclass_mask; // generate a 10-bit mask for integer reg - wire [LANES-1:0][31:0] fminmax_res; // result of fmin/fmax - wire [LANES-1:0][31:0] fsgnj_res; // result of sign 
injection - wire [LANES-1:0][31:0] fcmp_res; // result of comparison - fflags_t [LANES-1:0] fcmp_fflags; // comparison fflags - // Setup for (genvar i = 0; i < LANES; i++) begin assign tmp_a_sign[i] = dataa[i][31]; @@ -103,6 +97,7 @@ module VX_fp_ncomp #( ); // FCLASS + reg [LANES-1:0][31:0] fclass_mask; // generate a 10-bit mask for integer reg for (genvar i = 0; i < LANES; i++) begin always @(*) begin if (a_type_s0[i].is_normal) begin @@ -126,7 +121,8 @@ module VX_fp_ncomp #( end end - // Min/Max + // Min/Max + reg [LANES-1:0][31:0] fminmax_res; // result of fmin/fmax for (genvar i = 0; i < LANES; i++) begin always @(*) begin if (a_type_s0[i].is_nan && b_type_s0[i].is_nan) @@ -145,7 +141,8 @@ module VX_fp_ncomp #( end end - // Sign injection + // Sign injection + reg [LANES-1:0][31:0] fsgnj_res; // result of sign injection for (genvar i = 0; i < LANES; i++) begin always @(*) begin case (frm_s0) @@ -158,6 +155,8 @@ module VX_fp_ncomp #( end // Comparison + reg [LANES-1:0][31:0] fcmp_res; // result of comparison + fflags_t [LANES-1:0] fcmp_fflags; // comparison fflags for (genvar i = 0; i < LANES; i++) begin always @(*) begin case (frm_s0) diff --git a/hw/rtl/interfaces/VX_commit_if.v b/hw/rtl/interfaces/VX_commit_if.v index 273408fe..05d0f11c 100644 --- a/hw/rtl/interfaces/VX_commit_if.v +++ b/hw/rtl/interfaces/VX_commit_if.v @@ -12,6 +12,7 @@ interface VX_commit_if (); wire [`NUM_THREADS-1:0][31:0] data; wire [`NR_BITS-1:0] rd; wire wb; + wire eop; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_writeback_if.v b/hw/rtl/interfaces/VX_writeback_if.v index dbc0efba..6e2c9cc3 100644 --- a/hw/rtl/interfaces/VX_writeback_if.v +++ b/hw/rtl/interfaces/VX_writeback_if.v @@ -15,6 +15,7 @@ interface VX_writeback_if (); wire [`NR_BITS-1:0] rd; wire [`NUM_THREADS-1:0][31:0] data; + wire eop; wire ready; endinterface diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json index deb6ba85..92e11ec8 100644 --- a/hw/scripts/scope.json +++ b/hw/scripts/scope.json @@ 
-185,6 +185,7 @@ "writeback_tmask":"`NUM_THREADS", "writeback_rd":"`NR_BITS", "writeback_data":"`NUM_THREADS * 32", + "writeback_eop": 1, "!scoreboard_delay": 1, "!execute_delay": 1 },