From b459192dec6c709ec94b164bf3a22ba03276bd4c Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 26 Dec 2020 03:28:32 -0800 Subject: [PATCH] critical path optimization - fpga fmax @4c = ~212 mhz --- hw/rtl/VX_config.vh | 2 +- hw/rtl/VX_decode.v | 33 ++++--- hw/rtl/VX_ibuffer.v | 95 +++++++++++-------- hw/rtl/VX_ipdom_stack.v | 3 +- hw/rtl/VX_issue.v | 13 ++- hw/rtl/VX_scoreboard.v | 54 ++++------- hw/rtl/VX_writeback.v | 2 +- hw/rtl/cache/VX_cache.v | 33 +++++-- hw/rtl/cache/VX_cache_core_req_bank_sel.v | 110 +++++++++++++++------- hw/rtl/cache/VX_cache_core_rsp_merge.v | 8 +- hw/rtl/libs/VX_generic_queue.v | 32 +++---- 11 files changed, 216 insertions(+), 169 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 7fc6da0e..0d17e6da 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -327,7 +327,7 @@ // Size of cache in bytes `ifndef SMEM_SIZE -`define SMEM_SIZE 4096 +`define SMEM_SIZE 8192 `endif // Number of banks diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index ac4f45d7..f0e86471 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -307,21 +307,20 @@ module VX_decode #( /////////////////////////////////////////////////////////////////////////// - assign decode_if.valid = ifetch_rsp_if.valid - && (decode_if.ex_type != `EX_NOP); // skip noop + assign decode_if.valid = ifetch_rsp_if.valid; assign decode_if.wid = ifetch_rsp_if.wid; assign decode_if.tmask = ifetch_rsp_if.tmask; assign decode_if.PC = ifetch_rsp_if.PC; assign decode_if.ex_type = is_lsu ? `EX_LSU : - is_csr ? `EX_CSR : - is_mul ? `EX_MUL : - is_fpu ? `EX_FPU : - is_gpu ? `EX_GPU : - is_br ? `EX_ALU : - (is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU : - `EX_NOP; + is_csr ? `EX_CSR : + is_mul ? `EX_MUL : + is_fpu ? `EX_FPU : + is_gpu ? `EX_GPU : + is_br ? `EX_ALU : + (is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU : + `EX_NOP; assign decode_if.op_type = is_lsu ? `OP_BITS'(lsu_op) : is_csr ? `OP_BITS'(csr_op) : @@ -367,17 +366,17 @@ module VX_decode #( /////////////////////////////////////////////////////////////////////////// - wire decode_fire = decode_if.valid && decode_if.ready; + wire decode_fire_unqual = ifetch_rsp_if.valid && decode_if.ready; - assign join_if.valid = decode_fire && is_gpu && (gpu_op == `GPU_JOIN); + assign join_if.valid = decode_fire_unqual && is_gpu && (gpu_op == `GPU_JOIN); assign join_if.wid = ifetch_rsp_if.wid; - assign wstall_if.valid = decode_fire && (is_btype - || is_jal - || is_jalr - || (is_gpu && (gpu_op == `GPU_TMC - || gpu_op == `GPU_SPLIT - || gpu_op == `GPU_BAR))); + assign wstall_if.valid = decode_fire_unqual && (is_btype + || is_jal + || is_jalr + || (is_gpu && (gpu_op == `GPU_TMC + || gpu_op == `GPU_SPLIT + || gpu_op == `GPU_BAR))); assign wstall_if.wid = ifetch_rsp_if.wid; /////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index c367d002..83d63287 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -11,19 +11,18 @@ module VX_ibuffer #( VX_decode_if ibuf_enq_if, // outputs - output wire [`NW_BITS-1:0] deq_wid_next, VX_decode_if ibuf_deq_if ); localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `OP_BITS + `FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + `NUM_REGS; localparam SIZE = `IBUF_SIZE; - localparam SIZEW = $clog2(SIZE+1); localparam ADDRW = $clog2(SIZE); localparam NWARPSW = $clog2(`NUM_WARPS+1); - reg [`NUM_WARPS-1:0][SIZEW-1:0] size_r; + reg [`NUM_WARPS-1:0][ADDRW-1:0] used_r; + reg [`NUM_WARPS-1:0] full_r, empty_r, sizeMany_r; - wire [`NUM_WARPS-1:0] q_full; - wire [`NUM_WARPS-1:0][SIZEW-1:0] q_size; + wire [`NUM_WARPS-1:0] q_full, q_empty; + wire [`NUM_WARPS-1:0] q_sizeMany; wire [DATAW-1:0] q_data_in; wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev; reg [`NUM_WARPS-1:0][DATAW-1:0] q_data_out; @@ -36,15 +35,16 @@ module VX_ibuffer #( wire writing = enq_fire && (i == ibuf_enq_if.wid); wire reading = deq_fire && (i == ibuf_deq_if.wid); - wire is_slot0 = (0 == size_r[i]) || ((1 == size_r[i]) && reading); + wire is_slot0 = (0 == used_r[i]) || ((1 == used_r[i]) && reading); wire push = writing && !is_slot0; - wire pop = reading && (size_r[i] != 1); + wire pop = reading && sizeMany_r[i]; VX_generic_queue #( - .DATAW (DATAW), - .SIZE (SIZE), - .FASTRAM (1) + .DATAW (DATAW), + .SIZE (SIZE), + .BUFFERED (1), + .FASTRAM (1) ) queue ( .clk (clk), .reset (reset), @@ -58,27 +58,44 @@ module VX_ibuffer #( ); always @(posedge clk) begin - if (reset) begin - size_r[i] <= 0; - end else begin - if (writing && !reading) begin - size_r[i] <= size_r[i] + SIZEW'(1); + if (reset) begin + used_r[i] <= 0; + full_r[i] <= 0; + empty_r[i] <= 1; + sizeMany_r[i] <= 0; + end else begin + if (writing && !reading) begin + empty_r[i] <= 0; + if (used_r[i] == ADDRW'(SIZE-1)) begin + full_r[i] <= 1; + end + if (used_r[i] == 1) begin + sizeMany_r[i] <= 1; + end end - if (reading && !writing) begin - size_r[i] <= size_r[i] - SIZEW'(1); + if (reading && !writing) begin + full_r[i] <= 0; + if (used_r[i] == ADDRW'(1)) begin + empty_r[i] <= 1; + end + if (used_r[i] == ADDRW'(2)) begin + sizeMany_r[i] <= 0; + end end + used_r[i] <= used_r[i] + ADDRW'($signed(2'(writing) - 2'(reading))); end if (writing && is_slot0) begin q_data_out[i] <= q_data_in; end - if (reading && (size_r[i] != 1)) begin + if (pop) begin q_data_out[i] <= q_data_prev[i]; end end - assign q_full[i] = (size_r[i] == SIZE); - assign q_size[i] = size_r[i]; + assign q_full[i] = full_r[i]; + assign q_empty[i] = empty_r[i]; + assign q_sizeMany[i] = sizeMany_r[i]; end /////////////////////////////////////////////////////////////////////////// @@ -93,7 +110,7 @@ module VX_ibuffer #( always @(*) begin valid_table_n = valid_table; if (deq_fire) begin - valid_table_n[deq_wid] = (q_size[deq_wid] != SIZEW'(1)); + valid_table_n[deq_wid] = q_sizeMany[deq_wid]; end if (enq_fire) begin valid_table_n[ibuf_enq_if.wid] = 1; @@ -103,24 +120,24 @@ module VX_ibuffer #( // schedule the next instruction to issue // do round-robin when multiple warps are active always @(*) begin - deq_valid_n = 0; - deq_wid_n = 'x; - deq_instr_n = 'x; + deq_valid_n = 0; + deq_wid_n = 'x; + deq_instr_n = 'x; schedule_table_n = schedule_table; if (0 == num_warps) begin - deq_valid_n = enq_fire; - deq_wid_n = ibuf_enq_if.wid; - deq_instr_n = q_data_in; + deq_valid_n = enq_fire; + deq_wid_n = ibuf_enq_if.wid; + deq_instr_n = q_data_in; end else if ((1 == num_warps) || freeze) begin - deq_valid_n = (!deq_fire || (q_size[deq_wid] != SIZEW'(1))) || enq_fire; - deq_wid_n = (!deq_fire || (q_size[deq_wid] != SIZEW'(1))) ? deq_wid : ibuf_enq_if.wid; - deq_instr_n = deq_fire ? ((q_size[deq_wid] != SIZEW'(1)) ? q_data_prev[deq_wid] : q_data_in) : q_data_out[deq_wid]; + deq_valid_n = (!deq_fire || q_sizeMany[deq_wid]) || enq_fire; + deq_wid_n = (!deq_fire || q_sizeMany[deq_wid]) ? deq_wid : ibuf_enq_if.wid; + deq_instr_n = deq_fire ? (q_sizeMany[deq_wid] ? q_data_prev[deq_wid] : q_data_in) : q_data_out[deq_wid]; end else begin + deq_valid_n = (| schedule_table_n); for (integer i = 0; i < `NUM_WARPS; i++) begin - if (schedule_table_n[i]) begin - deq_valid_n = 1; + if (schedule_table_n[i]) begin deq_wid_n = `NW_BITS'(i); deq_instr_n = q_data_out[i]; schedule_table_n[i] = 0; @@ -130,8 +147,8 @@ module VX_ibuffer #( end end - wire warp_added = enq_fire && (0 == q_size[ibuf_enq_if.wid]); - wire warp_removed = deq_fire && ~(enq_fire && ibuf_enq_if.wid == deq_wid) && ~(q_size[deq_wid] != SIZEW'(1)); + wire warp_added = enq_fire && q_empty[ibuf_enq_if.wid]; + wire warp_removed = deq_fire && ~(enq_fire && ibuf_enq_if.wid == deq_wid) && ~q_sizeMany[deq_wid]; always @(posedge clk) begin if (reset) begin @@ -162,23 +179,21 @@ module VX_ibuffer #( `ifdef VERILATOR /*if (enq_fire || deq_fire || deq_valid) begin $display("*** %t: cur=%b(%0d), nxt=%b(%0d), enq=%b(%0d), deq=%b(%0d), nw=%0d(%0d,%0d,%0d,%0d), sched=%b, sched_n=%b", - $time, deq_valid, deq_wid, deq_valid_n, deq_wid_n, enq_fire, ibuf_enq_if.wid, deq_fire, ibuf_deq_if.wid, num_warps, size_r[0], size_r[1], size_r[2], size_r[3], schedule_table, schedule_table_n); + $time, deq_valid, deq_wid, deq_valid_n, deq_wid_n, enq_fire, ibuf_enq_if.wid, deq_fire, ibuf_deq_if.wid, num_warps, used_r[0], used_r[1], used_r[2], used_r[3], schedule_table, schedule_table_n); end*/ begin // verify 'num_warps' integer nw = 0; for (integer i = 0; i < `NUM_WARPS; i++) begin - nw += 32'(q_size[i] != 0); + nw += 32'(!q_empty[i]); end assert(nw == 32'(num_warps)) else $error("%t: error: invalid num_warps: nw=%0d, ref=%0d", $time, num_warps, nw); - assert(~deq_valid || (q_size[deq_wid] != 0)) else $error("%t: error: invalid schedule: wid=%0d", $time, deq_wid); - assert(~deq_fire || (q_size[deq_wid] != 0)) else $error("%t: error: invalid dequeu: wid=%0d", $time, deq_wid); + assert(~deq_valid || !q_empty[deq_wid]) else $error("%t: error: invalid schedule: wid=%0d", $time, deq_wid); + assert(~deq_fire || !q_empty[deq_wid]) else $error("%t: error: invalid dequeu: wid=%0d", $time, deq_wid); end `endif end end - assign deq_wid_next = deq_wid_n; - assign ibuf_enq_if.ready = ~q_full[ibuf_enq_if.wid]; assign q_data_in = {ibuf_enq_if.tmask, ibuf_enq_if.PC, diff --git a/hw/rtl/VX_ipdom_stack.v b/hw/rtl/VX_ipdom_stack.v index 4dccbf6f..19608d99 100644 --- a/hw/rtl/VX_ipdom_stack.v +++ b/hw/rtl/VX_ipdom_stack.v @@ -40,7 +40,8 @@ module VX_ipdom_stack #( VX_dp_ram #( .DATAW(WIDTH * 2), .SIZE(DEPTH), - .RWCHECK(0) + .RWCHECK(1), + .FASTRAM(1) ) store ( .clk(clk), .waddr(wr_ptr), diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index a25ed23b..b519f6f4 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -28,7 +28,6 @@ module VX_issue #( VX_gpr_rsp_if gpr_rsp_if(); wire scoreboard_delay; - wire [`NW_BITS-1:0] deq_wid_next; VX_ibuffer #( .CORE_ID(CORE_ID) @@ -37,7 +36,6 @@ module VX_issue #( .reset (reset), .freeze (1'b0), .ibuf_enq_if (decode_if), - .deq_wid_next (deq_wid_next), .ibuf_deq_if (ibuf_deq_if) ); @@ -48,8 +46,6 @@ module VX_issue #( .reset (reset), .ibuf_deq_if (ibuf_deq_if), .writeback_if (writeback_if), - .deq_wid_next (deq_wid_next), - .exe_delay (~execute_if.ready), .delay (scoreboard_delay) ); @@ -93,7 +89,10 @@ module VX_issue #( .mul_req_if (mul_req_if), .fpu_req_if (fpu_req_if), .gpu_req_if (gpu_req_if) - ); + ); + + // issue the instruction + assign ibuf_deq_if.ready = !scoreboard_delay && execute_if.ready; `SCOPE_ASSIGN (issue_fire, ibuf_deq_if.valid && ibuf_deq_if.ready); `SCOPE_ASSIGN (issue_wid, ibuf_deq_if.wid); @@ -123,8 +122,8 @@ module VX_issue #( `SCOPE_ASSIGN (writeback_data, writeback_if.data); `ifdef PERF_ENABLE - reg [63:0] perf_ibf_stalls ; - reg [63:0] perf_scb_stalls ; + reg [63:0] perf_ibf_stalls; + reg [63:0] perf_scb_stalls; reg [63:0] perf_alu_stalls; reg [63:0] perf_lsu_stalls; reg [63:0] perf_csr_stalls; diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index 4b4f4138..063f2e73 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -3,21 +3,19 @@ module VX_scoreboard #( parameter CORE_ID = 0 ) ( - input wire clk, - input wire reset, + input wire clk, + input wire reset, - VX_decode_if ibuf_deq_if, - VX_writeback_if writeback_if, - input wire [`NW_BITS-1:0] deq_wid_next, - input wire exe_delay, - - output wire delay + VX_decode_if ibuf_deq_if, + VX_writeback_if writeback_if, + output wire delay ); reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0]; - reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_reg_mask, inuse_reg_mask_n; - reg [`NUM_REGS-1:0] deq_used_regs; + reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_reg_mask; + wire [`NUM_REGS-1:0] inuse_regs; + wire [`NUM_THREADS-1:0] inuse_registers_n; - wire [`NUM_REGS-1:0] inuse_regs = deq_used_regs & ibuf_deq_if.used_regs; + assign inuse_regs = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs; assign delay = (| inuse_regs); @@ -25,17 +23,7 @@ module VX_scoreboard #( wire release_reg = writeback_if.valid && writeback_if.ready; - wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[{writeback_if.wid, writeback_if.rd}] & ~writeback_if.tmask; - - always @(*) begin - inuse_reg_mask_n = inuse_reg_mask; - if (reserve_reg) begin - inuse_reg_mask_n[ibuf_deq_if.wid][ibuf_deq_if.rd] = 1; - end - if (release_reg) begin - inuse_reg_mask_n[writeback_if.wid][writeback_if.rd] = (| inuse_registers_n); - end - end + assign inuse_registers_n = inuse_registers[{writeback_if.wid, writeback_if.rd}] & ~writeback_if.tmask; always @(posedge clk) begin if (reset) begin @@ -48,28 +36,24 @@ module VX_scoreboard #( end else begin if (reserve_reg) begin inuse_registers[{ibuf_deq_if.wid, ibuf_deq_if.rd}] <= ibuf_deq_if.tmask; + inuse_reg_mask[ibuf_deq_if.wid][ibuf_deq_if.rd] <= 1; end if (release_reg) begin assert(inuse_reg_mask[writeback_if.wid][writeback_if.rd] != 0) else $error("*** %t: core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d", - $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd); + $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd); inuse_registers[{writeback_if.wid, writeback_if.rd}] <= inuse_registers_n; - end - inuse_reg_mask <= inuse_reg_mask_n; - end - - deq_used_regs <= inuse_reg_mask_n[deq_wid_next]; + inuse_reg_mask[writeback_if.wid][writeback_if.rd] <= (| inuse_registers_n); + end + end end - // issue the instruction - assign ibuf_deq_if.ready = ~(delay || exe_delay); - `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin - $display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b, exe=%b", + $display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", $time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb, - inuse_regs[ibuf_deq_if.rd], inuse_regs[ibuf_deq_if.rs1], inuse_regs[ibuf_deq_if.rs2], inuse_regs[ibuf_deq_if.rs3], exe_delay); + inuse_regs[ibuf_deq_if.rd], inuse_regs[ibuf_deq_if.rs1], inuse_regs[ibuf_deq_if.rs2], inuse_regs[ibuf_deq_if.rs3]); end end `endif @@ -80,9 +64,9 @@ module VX_scoreboard #( stall_ctr <= 0; end else if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin stall_ctr <= stall_ctr + 1; - assert(stall_ctr < 100000) else $error("*** %t: core%0d-stalled: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b, exe=%b", + assert(stall_ctr < 100000) else $error("*** %t: core%0d-stalled: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", $time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb, - inuse_regs[ibuf_deq_if.rd], inuse_regs[ibuf_deq_if.rs1], inuse_regs[ibuf_deq_if.rs2], inuse_regs[ibuf_deq_if.rs3], exe_delay); + inuse_regs[ibuf_deq_if.rd], inuse_regs[ibuf_deq_if.rs1], inuse_regs[ibuf_deq_if.rs2], inuse_regs[ibuf_deq_if.rs3]); end else if (ibuf_deq_if.valid && ibuf_deq_if.ready) begin stall_ctr <= 0; end diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index ecd6f807..f97d933d 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -17,7 +17,7 @@ module VX_writeback #( VX_writeback_if writeback_if ); wire alu_valid = alu_commit_if.valid && alu_commit_if.wb; - wire ld_valid = ld_commit_if.valid /*&& ld_commit_if.wb*/; + wire ld_valid = ld_commit_if.valid && ld_commit_if.wb; wire csr_valid = csr_commit_if.valid && csr_commit_if.wb; wire mul_valid = mul_commit_if.valid && mul_commit_if.wb; wire fpu_valid = fpu_commit_if.valid && fpu_commit_if.wb; diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index a8371460..2e40e1cc 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -91,6 +91,11 @@ module VX_cache #( wire [NUM_BANKS-1:0] per_bank_core_req_valid; wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid; + wire [NUM_BANKS-1:0] per_bank_core_req_rw; + wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen; + wire [NUM_BANKS-1:0][`WORD_ADDR_WIDTH-1:0] per_bank_core_req_addr; + wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag; + wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data; wire [NUM_BANKS-1:0] per_bank_core_req_ready; wire [NUM_BANKS-1:0] per_bank_core_rsp_valid; @@ -122,7 +127,8 @@ module VX_cache #( .BANK_LINE_SIZE (BANK_LINE_SIZE), .NUM_BANKS (NUM_BANKS), .WORD_SIZE (WORD_SIZE), - .NUM_REQS (NUM_REQS) + .NUM_REQS (NUM_REQS), + .CORE_TAG_WIDTH (CORE_TAG_WIDTH) ) cache_core_req_bank_sel ( .clk (clk), .reset (reset), @@ -132,11 +138,20 @@ module VX_cache #( `UNUSED_PIN (bank_stalls), `endif .core_req_valid (core_req_valid), + .core_req_rw (core_req_rw), + .core_req_byteen(core_req_byteen), .core_req_addr (core_req_addr), + .core_req_data (core_req_data), + .core_req_tag (core_req_tag), .core_req_ready (core_req_ready), - .per_bank_valid (per_bank_core_req_valid), - .per_bank_tid (per_bank_core_req_tid), - .per_bank_ready (per_bank_core_req_ready) + .per_bank_core_req_valid (per_bank_core_req_valid), + .per_bank_core_req_tid (per_bank_core_req_tid), + .per_bank_core_req_rw (per_bank_core_req_rw), + .per_bank_core_req_byteen(per_bank_core_req_byteen), + .per_bank_core_req_addr (per_bank_core_req_addr), + .per_bank_core_req_tag (per_bank_core_req_tag), + .per_bank_core_req_data (per_bank_core_req_data), + .per_bank_core_req_ready (per_bank_core_req_ready) ); assign dram_req_tag = dram_req_addr; @@ -179,11 +194,11 @@ module VX_cache #( // Core Req assign curr_bank_core_req_valid = per_bank_core_req_valid[i]; assign curr_bank_core_req_tid = per_bank_core_req_tid[i]; - assign curr_bank_core_req_addr = core_req_addr[per_bank_core_req_tid[i]]; - assign curr_bank_core_req_rw = core_req_rw[per_bank_core_req_tid[i]]; - assign curr_bank_core_req_byteen = core_req_byteen[per_bank_core_req_tid[i]]; - assign curr_bank_core_req_data = core_req_data[per_bank_core_req_tid[i]]; - assign curr_bank_core_req_tag = core_req_tag[per_bank_core_req_tid[i]]; + assign curr_bank_core_req_addr = per_bank_core_req_addr[i]; + assign curr_bank_core_req_rw = per_bank_core_req_rw[i]; + assign curr_bank_core_req_byteen = per_bank_core_req_byteen[i]; + assign curr_bank_core_req_data = per_bank_core_req_data[i]; + assign curr_bank_core_req_tag = per_bank_core_req_tag[i]; assign per_bank_core_req_ready[i] = curr_bank_core_req_ready; // Core WB diff --git a/hw/rtl/cache/VX_cache_core_req_bank_sel.v b/hw/rtl/cache/VX_cache_core_req_bank_sel.v index 960f13cc..12f0d997 100644 --- a/hw/rtl/cache/VX_cache_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_cache_core_req_bank_sel.v @@ -8,48 +8,80 @@ module VX_cache_core_req_bank_sel #( // Number of banks parameter NUM_BANKS = 1, // Number of Word requests per cycle - parameter NUM_REQS = 1 + parameter NUM_REQS = 1, + // core request tag size + parameter CORE_TAG_WIDTH = 1 ) ( - input wire clk, - input wire reset, - input wire [NUM_REQS-1:0] core_req_valid, - input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, - output wire [NUM_REQS-1:0] core_req_ready, - output wire [NUM_BANKS-1:0] per_bank_valid, - output wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_tid, - input wire [NUM_BANKS-1:0] per_bank_ready, - output wire [63:0] bank_stalls + input wire clk, + input wire reset, + + output wire [63:0] bank_stalls, + + input wire [NUM_REQS-1:0] core_req_valid, + input wire [NUM_REQS-1:0] core_req_rw, + input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, + input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, + input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data, + input wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag, + output wire [NUM_REQS-1:0] core_req_ready, + + output wire [NUM_BANKS-1:0] per_bank_core_req_valid, + output wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid, + output wire [NUM_BANKS-1:0] per_bank_core_req_rw, + output wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen, + output wire [NUM_BANKS-1:0][`WORD_ADDR_WIDTH-1:0] per_bank_core_req_addr, + output wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag, + output wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data, + input wire [NUM_BANKS-1:0] per_bank_core_req_ready ); - if (NUM_BANKS > 1) begin - reg [NUM_BANKS-1:0] per_bank_valid_r; - reg [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_tid_r; - reg [NUM_REQS-1:0] core_req_ready_r; - reg [NUM_BANKS-1:0] core_req_sel_r; - wire [NUM_REQS-1:0][`BANK_BITS-1:0] core_req_bid; + if (NUM_BANKS > 1) begin + + reg [NUM_BANKS-1:0] per_bank_core_req_valid_r; + reg [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid_r; + reg [NUM_BANKS-1:0] per_bank_core_req_rw_r; + reg [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen_r; + reg [NUM_BANKS-1:0][`WORD_ADDR_WIDTH-1:0] per_bank_core_req_addr_r; + reg [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag_r; + reg [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data_r; + reg [NUM_REQS-1:0] core_req_ready_r; + reg [NUM_BANKS-1:0] core_req_sel_r; + wire [NUM_REQS-1:0][`BANK_BITS-1:0] core_req_bid; for (genvar i = 0; i < NUM_REQS; ++i) begin assign core_req_bid[i] = core_req_addr[i][`BANK_SELECT_ADDR_RNG]; end always @(*) begin - per_bank_valid_r = 0; - per_bank_tid_r = 'x; + per_bank_core_req_valid_r = 0; + per_bank_core_req_tid_r = 'x; + per_bank_core_req_rw_r = 'x; + per_bank_core_req_byteen_r= 'x; + per_bank_core_req_addr_r = 'x; + per_bank_core_req_tag_r = 'x; + per_bank_core_req_data_r = 'x; + for (integer i = NUM_REQS-1; i >= 0; --i) begin if (core_req_valid[i]) begin - per_bank_valid_r[core_req_bid[i]] = 1; - per_bank_tid_r[core_req_bid[i]] = `REQS_BITS'(i); + per_bank_core_req_valid_r[core_req_bid[i]] = 1; + per_bank_core_req_tid_r[core_req_bid[i]] = `REQS_BITS'(i); + per_bank_core_req_rw_r[core_req_bid[i]] = core_req_rw[i]; + per_bank_core_req_byteen_r[core_req_bid[i]]= core_req_byteen[i]; + per_bank_core_req_addr_r[core_req_bid[i]] = core_req_addr[i]; + per_bank_core_req_tag_r[core_req_bid[i]] = core_req_tag[i]; + per_bank_core_req_data_r[core_req_bid[i]] = core_req_data[i]; end end end always @(*) begin core_req_ready_r = 0; - core_req_sel_r = 0; + core_req_sel_r = 0; + for (integer j = 0; j < NUM_BANKS; ++j) begin for (integer i = 0; i < NUM_REQS; ++i) begin if (core_req_valid[i] && (core_req_bid[i] == `BANK_BITS'(j))) begin - core_req_ready_r[i] = per_bank_ready[j]; - core_req_sel_r[i] = 1; + core_req_ready_r[i] = per_bank_core_req_ready[j]; + core_req_sel_r[i] = 1; break; end end @@ -65,20 +97,30 @@ module VX_cache_core_req_bank_sel #( end end - assign per_bank_valid = per_bank_valid_r; - assign per_bank_tid = per_bank_tid_r; - assign core_req_ready = core_req_ready_r; - assign bank_stalls = bank_stalls_r; + assign bank_stalls = bank_stalls_r; + assign per_bank_core_req_valid = per_bank_core_req_valid_r; + assign per_bank_core_req_tid = per_bank_core_req_tid_r; + assign per_bank_core_req_rw = per_bank_core_req_rw_r; + assign per_bank_core_req_byteen = per_bank_core_req_byteen_r; + assign per_bank_core_req_addr = per_bank_core_req_addr_r; + assign per_bank_core_req_tag = per_bank_core_req_tag_r; + assign per_bank_core_req_data = per_bank_core_req_data_r; + assign core_req_ready = core_req_ready_r; + + end else begin - end else begin `UNUSED_VAR (clk) `UNUSED_VAR (reset) - `UNUSED_VAR (core_req_valid) - `UNUSED_VAR (core_req_addr) - assign per_bank_valid = core_req_valid; - assign per_bank_tid = 0; - assign core_req_ready[0] = per_bank_ready; - assign bank_stalls = 0; + assign bank_stalls = 0; + assign per_bank_core_req_valid = core_req_valid; + assign per_bank_core_req_tid[0] = 0; + assign per_bank_core_req_rw[0] = core_req_rw; + assign per_bank_core_req_byteen[0] = core_req_byteen; + assign per_bank_core_req_addr[0] = core_req_addr; + assign per_bank_core_req_tag[0] = core_req_tag; + assign per_bank_core_req_data[0] = core_req_data; + assign core_req_ready[0] = per_bank_core_req_ready; + end endmodule \ No newline at end of file diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index 9aa4496d..8b410bdb 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -37,7 +37,6 @@ module VX_cache_core_rsp_merge #( if (CORE_TAG_ID_BITS != 0) begin reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; - reg [CORE_TAG_ID_BITS-1:0] sel_tag_id; reg core_rsp_valid_unaual_any; wire core_rsp_ready_unqual; @@ -46,21 +45,18 @@ module VX_cache_core_rsp_merge #( core_rsp_valid_unaual_any = 0; core_rsp_tag_unqual = 'x; core_rsp_data_unqual = 'x; - core_rsp_bank_select = 0; + core_rsp_bank_select = 0; - sel_tag_id = 'x; - for (integer i = 0; i < NUM_BANKS; i++) begin if (per_bank_core_rsp_valid[i]) begin core_rsp_tag_unqual = per_bank_core_rsp_tag[i]; - sel_tag_id = per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0]; break; end end for (integer i = 0; i < NUM_BANKS; i++) begin if (per_bank_core_rsp_valid[i] - && (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == sel_tag_id)) begin + && (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == core_rsp_tag_unqual[CORE_TAG_ID_BITS-1:0])) begin core_rsp_valid_unaual_any = 1; core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index 0a951a6b..63698cb6 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -56,29 +56,25 @@ module VX_generic_queue #( always @(posedge clk) begin if (reset) begin - empty_r <= 1; - full_r <= 0; + empty_r <= 1; + full_r <= 0; used_r <= 0; end else begin - if (push) begin - assert(!full); - if (!pop) begin - empty_r <= 0; - if (used_r == ADDRW'(SIZE-1)) begin - full_r <= 1; - end + assert(!push || !full); + assert(!pop || !empty); + if (push && !pop) begin + empty_r <= 0; + if (used_r == ADDRW'(SIZE-1)) begin + full_r <= 1; end end - if (pop) begin - assert(!empty); - if (!push) begin - full_r <= 0; - if (used_r == ADDRW'(1)) begin - empty_r <= 1; - end; - end + if (pop && !push) begin + full_r <= 0; + if (used_r == ADDRW'(1)) begin + empty_r <= 1; + end; end - used_r <= used_r + (ADDRW'(push) - ADDRW'(pop)); + used_r <= used_r + ADDRW'($signed(2'(push) - 2'(pop))); end end