From 725322807e3b693bc14d7200a0764b776161b9c9 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 10 Nov 2020 05:24:57 -0800 Subject: [PATCH] fixed DRAM response backpressure inside Cache --- driver/opae/vlsim/Makefile | 6 +++--- driver/opae/vlsim/opae_sim.cpp | 20 +++++++++++++++++++- hw/rtl/VX_cluster.v | 2 +- hw/rtl/VX_config.vh | 16 ++++++++-------- hw/rtl/VX_core.v | 2 +- hw/rtl/VX_csr_io_arb.v | 2 +- hw/rtl/VX_gpr_ram.v | 4 +++- hw/rtl/VX_io_arb.v | 3 ++- hw/rtl/VX_mem_arb.v | 3 ++- hw/rtl/VX_platform.vh | 4 ++-- hw/rtl/VX_scoreboard.v | 16 +++++++++++++--- hw/rtl/VX_writeback.v | 2 +- hw/rtl/Vortex.v | 2 +- hw/rtl/cache/VX_bank.v | 2 +- hw/rtl/cache/VX_cache.v | 2 +- hw/rtl/cache/VX_cache_miss_resrv.v | 2 +- hw/rtl/cache/VX_snp_forwarder.v | 2 +- hw/rtl/fp_cores/VX_fpnew.v | 11 +++++------ hw/rtl/libs/VX_dp_ram.v | 4 +++- 19 files changed, 69 insertions(+), 36 deletions(-) diff --git a/driver/opae/vlsim/Makefile b/driver/opae/vlsim/Makefile index cfe398af..279d4b8f 100644 --- a/driver/opae/vlsim/Makefile +++ b/driver/opae/vlsim/Makefile @@ -20,11 +20,11 @@ DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO #CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 -DL3_ENABLE=1 -CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 -#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 +#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 +CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DEBUG=1 +#DEBUG=1 #SCOPE=1 CFLAGS += -fPIC diff --git a/driver/opae/vlsim/opae_sim.cpp b/driver/opae/vlsim/opae_sim.cpp index 1883126a..5339de61 100644 --- a/driver/opae/vlsim/opae_sim.cpp +++ b/driver/opae/vlsim/opae_sim.cpp @@ -263,7 +263,16 @@ void opae_sim::avs_bus() { if (dram_rd_it != dram_reads_.end()) { vortex_afu_->avs_readdatavalid = 1; memcpy(vortex_afu_->avs_readdata, dram_rd_it->block.data(), CACHE_BLOCK_SIZE); + uint32_t tag = dram_rd_it->tag; dram_reads_.erase(dram_rd_it); + /*printf("%0ld: VLSIM: DRAM rsp: addr=%x, pending={", timestamp, tag); + for (auto& req : dram_reads_) { + if (req.cycles_left != 0) + printf(" !%0x", req.tag); + else + printf(" %0x", req.tag); + } + printf("}\n");*/ } // handle DRAM stalls @@ -293,10 +302,19 @@ void opae_sim::avs_bus() { if (vortex_afu_->avs_read) { assert(0 == vortex_afu_->mem_bank_select); dram_rd_req_t dram_req; - dram_req.cycles_left = DRAM_LATENCY; + dram_req.cycles_left = DRAM_LATENCY; unsigned base_addr = (vortex_afu_->avs_address * CACHE_BLOCK_SIZE); ram_.read(base_addr, CACHE_BLOCK_SIZE, dram_req.block.data()); + dram_req.tag = base_addr; dram_reads_.emplace_back(dram_req); + /*printf("%0ld: VLSIM: DRAM req: addr=%x, pending={", timestamp, base_addr); + for (auto& req : dram_reads_) { + if (req.cycles_left != 0) + printf(" !%0x", req.tag); + else + printf(" %0x", req.tag); + } + printf("}\n");*/ } } diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 4f96db36..1a654566 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -275,7 +275,7 @@ module VX_cluster #( ); assign busy = (| per_core_busy); - assign ebreak = (& per_core_ebreak); + assign ebreak = (| per_core_ebreak); if (`L2_ENABLE) begin diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 7772f032..8b6da4aa 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -8,7 +8,7 @@ `endif `ifndef NUM_CORES -`define NUM_CORES 2 +`define NUM_CORES 4 `endif `ifndef NUM_WARPS @@ -223,16 +223,16 @@ `define DDREQ_SIZE 8 `endif -// Snoop Response Queue Size -`ifndef DSNPQ_SIZE -`define DSNPQ_SIZE 8 -`endif - // DRAM Response Queue Size `ifndef DDRFQ_SIZE `define DDRFQ_SIZE 8 `endif +// Snoop Response Queue Size +`ifndef DSNPQ_SIZE +`define DSNPQ_SIZE 8 +`endif + // Snoop Req Queue Size `ifndef DSNRQ_SIZE `define DSNRQ_SIZE 8 @@ -359,7 +359,7 @@ `define L2DRFQ_SIZE 8 `endif -// Snoop Req Queue Size +// Snoop Request Queue Size `ifndef L2SNRQ_SIZE `define L2SNRQ_SIZE 8 `endif @@ -416,7 +416,7 @@ `define L3DRFQ_SIZE 8 `endif -// Snoop Req Queue Size +// Snoop Request Queue Size `ifndef L3SNRQ_SIZE `define L3SNRQ_SIZE 8 `endif diff --git a/hw/rtl/VX_core.v b/hw/rtl/VX_core.v index 689b21c2..cfff5700 100644 --- a/hw/rtl/VX_core.v +++ b/hw/rtl/VX_core.v @@ -276,7 +276,7 @@ module VX_core #( ); // select io bus - wire is_io_addr = ({core_dcache_req_if.addr[0], 2'b0} >= `IO_BUS_BASE_ADDR); + wire is_io_addr = ({core_dcache_req_if.addr[0], 2'b0} >= `IO_BUS_BASE_ADDR); wire io_req_select = (| core_dcache_req_if.valid) ? is_io_addr : 0; wire io_rsp_select = (| arb_io_rsp_if.valid); diff --git a/hw/rtl/VX_csr_io_arb.v b/hw/rtl/VX_csr_io_arb.v index facd417d..4155e9bd 100644 --- a/hw/rtl/VX_csr_io_arb.v +++ b/hw/rtl/VX_csr_io_arb.v @@ -60,7 +60,7 @@ module VX_csr_io_arb #( .grant_onehot (rsp_1hot) ); - wire stall = csr_io_rsp_valid_out && ~csr_io_rsp_ready_out; + wire stall = ~csr_io_rsp_ready_out && csr_io_rsp_valid_out; VX_generic_register #( .N(1 + 32), diff --git a/hw/rtl/VX_gpr_ram.v b/hw/rtl/VX_gpr_ram.v index 05833b9d..473a8c74 100644 --- a/hw/rtl/VX_gpr_ram.v +++ b/hw/rtl/VX_gpr_ram.v @@ -1,5 +1,6 @@ `include "VX_define.vh" +`TRACING_OFF module VX_gpr_ram ( input wire clk, input wire [`NUM_THREADS-1:0] we, @@ -30,4 +31,5 @@ module VX_gpr_ram ( assign rs1_data = q1; assign rs2_data = q2; -endmodule \ No newline at end of file +endmodule +`TRACING_ON \ No newline at end of file diff --git a/hw/rtl/VX_io_arb.v b/hw/rtl/VX_io_arb.v index 32d06587..1742e7a5 100644 --- a/hw/rtl/VX_io_arb.v +++ b/hw/rtl/VX_io_arb.v @@ -64,7 +64,7 @@ module VX_io_arb #( .grant_onehot (req_1hot) ); - wire stall = (| io_req_valid_out) && ~io_req_ready_out; + wire stall = ~io_req_ready_out && (| io_req_valid_out); VX_generic_register #( .N(`NUM_THREADS + TAG_OUT_WIDTH + (`NUM_THREADS * ADDR_WIDTH) + 1 + (`NUM_THREADS * WORD_SIZE) + (`NUM_THREADS * WORD_WIDTH)), @@ -91,6 +91,7 @@ module VX_io_arb #( assign io_rsp_tag_in[i] = io_rsp_tag_out[REQS_BITS +: TAG_IN_WIDTH]; assign io_rsp_data_in[i] = io_rsp_data_out; end + assign io_rsp_ready_out = io_rsp_ready_in[rsp_sel]; end else begin diff --git a/hw/rtl/VX_mem_arb.v b/hw/rtl/VX_mem_arb.v index 937231e1..08d84b11 100644 --- a/hw/rtl/VX_mem_arb.v +++ b/hw/rtl/VX_mem_arb.v @@ -59,7 +59,7 @@ module VX_mem_arb #( .grant_onehot (req_1hot) ); - wire stall = mem_req_valid_out && ~mem_req_ready_out; + wire stall = ~mem_req_ready_out && mem_req_valid_out; VX_generic_register #( .N(1 + TAG_OUT_WIDTH + ADDR_WIDTH + 1 + WORD_SIZE + WORD_WIDTH), @@ -86,6 +86,7 @@ module VX_mem_arb #( assign mem_rsp_tag_in[i] = mem_rsp_tag_out[REQS_BITS +: TAG_IN_WIDTH]; assign mem_rsp_data_in[i] = mem_rsp_data_out; end + assign mem_rsp_ready_out = mem_rsp_ready_in[rsp_sel]; end else begin diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index e5db8d3c..45cc01fb 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -46,8 +46,8 @@ if (!(cond)) $error msg; \ endgenerate -`define ENABLE_TRACING /* verilator tracing_on */ -`define DISABLE_TRACING /* verilator tracing_off */ +`define TRACING_ON /* verilator tracing_on */ +`define TRACING_OFF /* verilator tracing_off */ /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index 2370745c..63e7b1d1 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -64,13 +64,23 @@ module VX_scoreboard #( assign ibuf_deq_if.ready = ~(delay || exe_delay || gpr_delay); `ifdef DBG_PRINT_PIPELINE + reg [31:0] stall_ctr; always @(posedge clk) begin - if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin + if (reset) begin + stall_ctr <= 0; + end else if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin $display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b, exe=%b, gpr=%b", $time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb, - inuse_regs[ibuf_deq_if.rd], inuse_regs[ibuf_deq_if.rs1], inuse_regs[ibuf_deq_if.rs2], inuse_regs[ibuf_deq_if.rs3], exe_delay, gpr_delay); + inuse_regs[ibuf_deq_if.rd], inuse_regs[ibuf_deq_if.rs1], inuse_regs[ibuf_deq_if.rs2], inuse_regs[ibuf_deq_if.rs3], exe_delay, gpr_delay); + stall_ctr <= stall_ctr + 1; + if (stall_ctr >= 2000) begin + $fflush(); + assert(0); + end + end else if (ibuf_deq_if.valid && ibuf_deq_if.ready) begin + stall_ctr <= 0; end - end + end `endif endmodule \ No newline at end of file diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 772ac3c0..55df0466 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -73,7 +73,7 @@ module VX_writeback #( 0; always @(*) assert(writeback_if.ready); - wire stall = 0/*~writeback_if.ready && writeback_if.valid*/; + wire stall =~writeback_if.ready && writeback_if.valid; VX_generic_register #( .N(1 + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32)) diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 05c2b8db..578c83a1 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -316,7 +316,7 @@ module Vortex ( ); assign busy = (| per_cluster_busy); - assign ebreak = (& per_cluster_ebreak); + assign ebreak = (| per_cluster_ebreak); // L3 Cache /////////////////////////////////////////////////////////// diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 60d8fe16..05eb476b 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -561,7 +561,7 @@ module VX_bank #( wire[WORD_SIZE-1:0] req_byteen_st3; wire msrq_push_unqual = miss_st3 || force_miss_st3; - assign msrq_push_stall = (miss_st3 || force_miss_st3) && msrq_full; + assign msrq_push_stall = msrq_push_unqual && msrq_full; wire msrq_push = msrq_push_unqual && !msrq_full diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 13109130..803bd4bf 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -221,7 +221,7 @@ module VX_cache #( ); assign dram_req_tag = dram_req_addr; - assign dram_rsp_ready = (| per_bank_dram_rsp_ready); + assign dram_rsp_ready = (& per_bank_dram_rsp_ready); for (genvar i = 0; i < NUM_BANKS; i++) begin wire [NUM_REQUESTS-1:0] curr_bank_core_req_valid; diff --git a/hw/rtl/cache/VX_cache_miss_resrv.v b/hw/rtl/cache/VX_cache_miss_resrv.v index 73d958fc..7e65bb8e 100644 --- a/hw/rtl/cache/VX_cache_miss_resrv.v +++ b/hw/rtl/cache/VX_cache_miss_resrv.v @@ -178,7 +178,7 @@ module VX_cache_miss_resrv #( `ifdef DBG_PRINT_CACHE_MSRQ always @(posedge clk) begin - if (schedule_st0 || enqueue_st3 || dequeue_st3) begin + if (update_ready_st0 || schedule_st0 || enqueue_st3 || dequeue_st3) begin if (schedule_st0) $display("%t: cache%0d:%0d msrq-schedule: addr%0d=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, schedule_ptr, `LINE_TO_BYTE_ADDR(dequeue_addr_st0, BANK_ID), debug_wid_st0, debug_pc_st0); if (enqueue_st3) begin diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index d1aab523..ed9845d8 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -109,7 +109,7 @@ module VX_snp_forwarder #( .grant_onehot (sel_1hot) ); - wire stall = fwdin_valid && ~fwdin_ready; + wire stall = ~fwdin_ready && fwdin_valid; VX_generic_register #( .N(1 + `LOG2UP(SNRQ_SIZE)), diff --git a/hw/rtl/fp_cores/VX_fpnew.v b/hw/rtl/fp_cores/VX_fpnew.v index 0808fc80..d7dd399e 100644 --- a/hw/rtl/fp_cores/VX_fpnew.v +++ b/hw/rtl/fp_cores/VX_fpnew.v @@ -2,7 +2,9 @@ `include "fpnew_pkg.sv" `include "defs_div_sqrt_mvp.sv" -module VX_fpnew #( +`TRACING_OFF +module VX_fpnew +#( parameter TAGW = 1, parameter FMULADD = 1, parameter FDIVSQRT = 1, @@ -130,8 +132,6 @@ module VX_fpnew #( default:; endcase end - -`DISABLE_TRACING for (genvar i = 0; i < `NUM_THREADS; i++) begin if (0 == i) begin @@ -191,8 +191,6 @@ module VX_fpnew #( end end -`ENABLE_TRACING - assign fpu_valid_in = valid_in; assign ready_in = fpu_ready_in; @@ -207,4 +205,5 @@ module VX_fpnew #( assign valid_out = fpu_valid_out; assign fpu_ready_out = ready_out; -endmodule \ No newline at end of file +endmodule +`TRACING_ON \ No newline at end of file diff --git a/hw/rtl/libs/VX_dp_ram.v b/hw/rtl/libs/VX_dp_ram.v index 7cc4fae2..9c9366cc 100644 --- a/hw/rtl/libs/VX_dp_ram.v +++ b/hw/rtl/libs/VX_dp_ram.v @@ -1,5 +1,6 @@ `include "VX_platform.vh" +`TRACING_OFF module VX_dp_ram #( parameter DATAW = 1, parameter SIZE = 1, @@ -284,4 +285,5 @@ module VX_dp_ram #( end end -endmodule \ No newline at end of file +endmodule +`TRACING_ON \ No newline at end of file