From d5fa82f5e4d451386a2bb8097050363b5f7df303 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 8 Dec 2020 02:58:08 -0800 Subject: [PATCH] cache req datapath optimizations --- driver/opae/vx_scope.cpp | 6 +- hw/opae/vortex_afu.sv | 2 +- hw/rtl/VX_cluster.v | 91 ++++++++--------- hw/rtl/VX_csr_io_arb.v | 45 +++----- hw/rtl/VX_databus_arb.v | 88 ++++++---------- hw/rtl/VX_dcache_arb.v | 10 +- hw/rtl/VX_mem_arb.v | 56 ++++------ hw/rtl/Vortex.v | 92 ++++++++--------- hw/rtl/cache/VX_bank.v | 19 ++-- hw/rtl/cache/VX_cache.v | 65 ++++++------ hw/rtl/cache/VX_cache_config.vh | 2 +- hw/rtl/cache/VX_cache_core_req_bank_sel.v | 73 ++++++++----- hw/rtl/cache/VX_cache_core_rsp_merge.v | 77 +++++++++----- hw/rtl/cache/VX_miss_resrv.v | 21 ++-- hw/rtl/cache/VX_snp_forwarder.v | 33 ++---- hw/rtl/libs/VX_cam_buffer.v | 4 +- hw/rtl/libs/VX_stream_arbiter.v | 119 +++++++++++++--------- 17 files changed, 393 insertions(+), 410 deletions(-) diff --git a/driver/opae/vx_scope.cpp b/driver/opae/vx_scope.cpp index b40d74ee..e46fee69 100644 --- a/driver/opae/vx_scope.cpp +++ b/driver/opae/vx_scope.cpp @@ -149,8 +149,6 @@ int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { ofs << "$upscope $end" << std::endl; ofs << "enddefinitions $end" << std::endl; - std::cout << "OK" << std::flush << std::endl; - uint64_t frame_width, max_frames, data_valid, offset, delta; uint64_t timestamp = 0; uint64_t frame_offset = 0; @@ -167,8 +165,6 @@ int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { std::this_thread::sleep_for(std::chrono::seconds(1)); } while (true); - std::cout << "OK" << std::flush << std::endl; - // get frame width CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_WIDTH)); CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &frame_width)); @@ -239,7 +235,7 @@ int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { signal_id = num_taps; if (0 == (frame_no % FRAME_FLUSH_SIZE)) { ofs << std::flush; - std::cout << "*** " << frame_no << " frames, timestamp=" << timestamp << std::flush << std::endl; + std::cout << "*** " << frame_no << "/" << max_frames << " frames" << std::endl; } } } diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 5881b11a..59d54136 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -548,7 +548,7 @@ VX_mem_arb #( .ADDR_WIDTH ($bits(t_local_mem_addr)), .TAG_IN_WIDTH (AVS_REQ_TAGW), .TAG_OUT_WIDTH (AVS_REQ_TAGW+1) -) vx_cci_avs_arb ( +) dram_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 5b1fc22f..5df7e9dc 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -204,17 +204,17 @@ module VX_cluster #( .req_tag_out (io_req_tag), .req_ready_out (io_req_ready), - // input responses - .rsp_valid_in (per_core_io_rsp_valid), - .rsp_data_in (per_core_io_rsp_data), - .rsp_tag_in (per_core_io_rsp_tag), - .rsp_ready_in (per_core_io_rsp_ready), - - // output response - .rsp_valid_out (io_rsp_valid), - .rsp_tag_out (io_rsp_tag), - .rsp_data_out (io_rsp_data), - .rsp_ready_out (io_rsp_ready) + // input response + .rsp_valid_in (io_rsp_valid), + .rsp_tag_in (io_rsp_tag), + .rsp_data_in (io_rsp_data), + .rsp_ready_in (io_rsp_ready), + + // output responses + .rsp_valid_out (per_core_io_rsp_valid), + .rsp_data_out (per_core_io_rsp_data), + .rsp_tag_out (per_core_io_rsp_tag), + .rsp_ready_out (per_core_io_rsp_ready) ); VX_csr_io_arb #( @@ -298,35 +298,30 @@ module VX_cluster #( if (`L2_ENABLE) begin - wire [`NUM_CORES-1:0] core_dram_rsp_valid; - wire [`NUM_CORES-1:0][`DDRAM_LINE_WIDTH-1:0] core_dram_rsp_data; - wire [`NUM_CORES-1:0][`XDRAM_TAG_WIDTH-1:0] core_dram_rsp_tag; - wire core_dram_rsp_ready; + wire [`NUM_CORES-1:0] per_core_dram_req_valid_qual; + wire [`NUM_CORES-1:0] per_core_dram_req_rw_qual; + wire [`NUM_CORES-1:0][`DDRAM_BYTEEN_WIDTH-1:0] per_core_dram_req_byteen_qual; + wire [`NUM_CORES-1:0][`DDRAM_ADDR_WIDTH-1:0] per_core_dram_req_addr_qual; + wire [`NUM_CORES-1:0][`DDRAM_LINE_WIDTH-1:0] per_core_dram_req_data_qual; + wire [`NUM_CORES-1:0][`XDRAM_TAG_WIDTH-1:0] per_core_dram_req_tag_qual; + wire [`NUM_CORES-1:0] per_core_dram_req_ready_qual; - reg [`NUM_CORES-1:0] core_dram_rsp_ready_other; - always @(*) begin - core_dram_rsp_ready_other = {`NUM_CORES{1'b1}}; - for (integer i = 0; i < `NUM_CORES; i++) begin - for (integer j = 0; j < `NUM_CORES; j++) begin - if (i != j) begin - core_dram_rsp_ready_other[i] &= (per_core_dram_rsp_ready [j] | !core_dram_rsp_valid [j]); - end - end - end + for (genvar i = 0; i < `NUM_CORES; i++) begin + VX_skid_buffer #( + .DATAW (1 + `DDRAM_BYTEEN_WIDTH + `DDRAM_ADDR_WIDTH + `DDRAM_LINE_WIDTH + `XDRAM_TAG_WIDTH), + .PASSTHRU (`NUM_CORES < 4) + ) dram_req_buffer ( + .clk (clk), + .reset (reset), + .valid_in (per_core_dram_req_valid[i]), + .data_in ({per_core_dram_req_rw[i], per_core_dram_req_byteen[i], per_core_dram_req_addr[i], per_core_dram_req_data[i], per_core_dram_req_tag[i]}), + .ready_in (per_core_dram_req_ready[i]), + .valid_out (per_core_dram_req_valid_qual[i]), + .data_out ({per_core_dram_req_rw_qual[i], per_core_dram_req_byteen_qual[i], per_core_dram_req_addr_qual[i], per_core_dram_req_data_qual[i], per_core_dram_req_tag_qual[i]}), + .ready_out (per_core_dram_req_ready_qual[i]) + ); end - for (genvar i = 0; i < `NUM_CORES; i++) begin - assign per_core_dram_rsp_valid [i] = core_dram_rsp_valid[i] & core_dram_rsp_ready_other [i]; - assign per_core_dram_rsp_data [i] = core_dram_rsp_data[i]; - assign per_core_dram_rsp_tag [i] = core_dram_rsp_tag[i]; - end - assign core_dram_rsp_ready = & (per_core_dram_rsp_ready | ~core_dram_rsp_valid); - - wire core_dram_req_ready; - for (genvar i = 0; i < `NUM_CORES; i++) begin - assign per_core_dram_req_ready[i] = core_dram_req_ready; - end - VX_cache #( .CACHE_ID (`L2CACHE_ID), .CACHE_SIZE (`L2CACHE_SIZE), @@ -355,19 +350,19 @@ module VX_cluster #( .reset (reset), // Core request - .core_req_valid (per_core_dram_req_valid), - .core_req_rw (per_core_dram_req_rw), - .core_req_byteen (per_core_dram_req_byteen), - .core_req_addr (per_core_dram_req_addr), - .core_req_data (per_core_dram_req_data), - .core_req_tag (per_core_dram_req_tag), - .core_req_ready (core_dram_req_ready), + .core_req_valid (per_core_dram_req_valid_qual), + .core_req_rw (per_core_dram_req_rw_qual), + .core_req_byteen (per_core_dram_req_byteen_qual), + .core_req_addr (per_core_dram_req_addr_qual), + .core_req_data (per_core_dram_req_data_qual), + .core_req_tag (per_core_dram_req_tag_qual), + .core_req_ready (per_core_dram_req_ready_qual), // Core response - .core_rsp_valid (core_dram_rsp_valid), - .core_rsp_data (core_dram_rsp_data), - .core_rsp_tag (core_dram_rsp_tag), - .core_rsp_ready (core_dram_rsp_ready), + .core_rsp_valid (per_core_dram_rsp_valid), + .core_rsp_data (per_core_dram_rsp_data), + .core_rsp_tag (per_core_dram_rsp_tag), + .core_rsp_ready (per_core_dram_rsp_ready), // DRAM request .dram_req_valid (dram_req_valid), diff --git a/hw/rtl/VX_csr_io_arb.v b/hw/rtl/VX_csr_io_arb.v index 935006f5..d596ee9b 100644 --- a/hw/rtl/VX_csr_io_arb.v +++ b/hw/rtl/VX_csr_io_arb.v @@ -4,14 +4,14 @@ module VX_csr_io_arb #( parameter NUM_REQS = 1, parameter DATA_WIDTH = 1, - parameter DATA_SIZE = (DATA_WIDTH / 8), - parameter ADDR_WIDTH = 32 - `CLOG2(DATA_SIZE), - parameter REQS_BITS = `LOG2UP(NUM_REQS) + parameter DATA_SIZE = (DATA_WIDTH / 8), + parameter ADDR_WIDTH = 32 - `CLOG2(DATA_SIZE), + parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS) ) ( input wire clk, input wire reset, - input wire [REQS_BITS-1:0] request_id, + input wire [LOG_NUM_REQS-1:0] request_id, // input requests input wire req_valid_in, @@ -40,7 +40,7 @@ module VX_csr_io_arb #( if (NUM_REQS > 1) begin for (genvar i = 0; i < NUM_REQS; i++) begin - assign req_valid_out[i] = req_valid_in && (request_id == `REQS_BITS'(i)); + assign req_valid_out[i] = req_valid_in && (request_id == LOG_NUM_REQS'(i)); assign req_addr_out[i] = req_addr_in; assign req_rw_out[i] = req_rw_in; assign req_data_out[i] = req_data_in; @@ -50,8 +50,6 @@ module VX_csr_io_arb #( end else begin - `UNUSED_VAR (clk) - `UNUSED_VAR (reset) `UNUSED_VAR (request_id) assign req_valid_out = req_valid_in; @@ -64,36 +62,17 @@ module VX_csr_io_arb #( /////////////////////////////////////////////////////////////////////// - // Inputs buffering - wire [NUM_REQS-1:0] rsp_valid_in_qual; - wire [NUM_REQS-1:0][DATA_WIDTH-1:0] rsp_data_in_qual; - wire [NUM_REQS-1:0] rsp_ready_in_qual; - for (genvar i = 0; i < NUM_REQS; ++i) begin - VX_skid_buffer #( - .DATAW (DATA_WIDTH), - .PASSTHRU (NUM_REQS < 4) - ) rsp_buffer ( - .clk (clk), - .reset (reset), - .valid_in (rsp_valid_in[i]), - .data_in (rsp_data_in[i]), - .ready_in (rsp_ready_in[i]), - .valid_out (rsp_valid_in_qual[i]), - .data_out (rsp_data_in_qual[i]), - .ready_out (rsp_ready_in_qual[i]) - ); - end - VX_stream_arbiter #( - .NUM_REQS(NUM_REQS), - .DATAW(DATA_WIDTH), - .BUFFERED(NUM_REQS >= 4) + .NUM_REQS (NUM_REQS), + .DATAW (DATA_WIDTH), + .IN_BUFFER (NUM_REQS >= 4), + .OUT_BUFFER (NUM_REQS >= 4) ) rsp_arb ( .clk (clk), .reset (reset), - .valid_in (rsp_valid_in_qual), - .data_in (rsp_data_in_qual), - .ready_in (rsp_ready_in_qual), + .valid_in (rsp_valid_in), + .data_in (rsp_data_in), + .ready_in (rsp_ready_in), .valid_out (rsp_valid_out), .data_out (rsp_data_out), .ready_out (rsp_ready_out) diff --git a/hw/rtl/VX_databus_arb.v b/hw/rtl/VX_databus_arb.v index 83980542..03e67ce0 100644 --- a/hw/rtl/VX_databus_arb.v +++ b/hw/rtl/VX_databus_arb.v @@ -6,9 +6,9 @@ module VX_databus_arb #( parameter TAG_IN_WIDTH = 1, parameter TAG_OUT_WIDTH = 1, - parameter WORD_WIDTH = WORD_SIZE * 8, - parameter ADDR_WIDTH = 32 - `CLOG2(WORD_SIZE), - parameter REQS_BITS = `CLOG2(NUM_REQS) + parameter WORD_WIDTH = WORD_SIZE * 8, + parameter ADDR_WIDTH = 32 - `CLOG2(WORD_SIZE), + parameter LOG_NUM_REQS = `CLOG2(NUM_REQS) ) ( input wire clk, input wire reset, @@ -32,64 +32,42 @@ module VX_databus_arb #( input wire req_ready_out, // input response - output wire [NUM_REQS-1:0] rsp_valid_in, - output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_in, - output wire [NUM_REQS-1:0][WORD_WIDTH-1:0] rsp_data_in, - input wire [NUM_REQS-1:0] rsp_ready_in, + input wire rsp_valid_in, + input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in, + input wire [WORD_WIDTH-1:0] rsp_data_in, + output wire rsp_ready_in, - // output response - input wire rsp_valid_out, - input wire [TAG_OUT_WIDTH-1:0] rsp_tag_out, - input wire [WORD_WIDTH-1:0] rsp_data_out, - output wire rsp_ready_out + // output responses + output wire [NUM_REQS-1:0] rsp_valid_out, + output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out, + output wire [NUM_REQS-1:0][WORD_WIDTH-1:0] rsp_data_out, + input wire [NUM_REQS-1:0] rsp_ready_out ); localparam DATAW = `NUM_THREADS + TAG_OUT_WIDTH + (`NUM_THREADS * ADDR_WIDTH) + 1 + (`NUM_THREADS * WORD_SIZE) + (`NUM_THREADS * WORD_WIDTH); if (NUM_REQS > 1) begin wire [NUM_REQS-1:0] valids; - for (genvar i = 0; i < NUM_REQS; i++) begin - assign valids[i] = (| req_valid_in[i]); - end - wire [NUM_REQS-1:0][DATAW-1:0] data_in; - for (genvar i = 0; i < NUM_REQS; i++) begin - assign data_in[i] = {req_valid_in[i], {req_tag_in[i], REQS_BITS'(i)}, req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; - end - - // Inputs buffering - wire [NUM_REQS-1:0] req_valid_in_qual; - wire [NUM_REQS-1:0][DATAW-1:0] req_data_in_qual; - wire [NUM_REQS-1:0] req_ready_in_qual; - for (genvar i = 0; i < NUM_REQS; ++i) begin - VX_skid_buffer #( - .DATAW (DATAW), - .PASSTHRU (NUM_REQS < 4) - ) req_buffer ( - .clk (clk), - .reset (reset), - .valid_in (valids[i]), - .data_in (data_in[i]), - .ready_in (req_ready_in[i]), - .valid_out (req_valid_in_qual[i]), - .data_out (req_data_in_qual[i]), - .ready_out (req_ready_in_qual[i]) - ); - end - wire [`NUM_THREADS-1:0] req_tmask_out; wire req_valid_out_unqual; + for (genvar i = 0; i < NUM_REQS; i++) begin + assign valids[i] = (| req_valid_in[i]); + assign data_in[i] = {req_valid_in[i], {req_tag_in[i], LOG_NUM_REQS'(i)}, req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; + end + VX_stream_arbiter #( - .NUM_REQS (NUM_REQS), - .DATAW (DATAW), - .BUFFERED (NUM_REQS >= 4) + .NUM_REQS (NUM_REQS), + .DATAW (DATAW), + .IN_BUFFER (NUM_REQS >= 4), + .OUT_BUFFER (NUM_REQS >= 4) ) req_arb ( .clk (clk), .reset (reset), - .valid_in (req_valid_in_qual), - .data_in (req_data_in_qual), - .ready_in (req_ready_in_qual), + .valid_in (valids), + .data_in (data_in), + .ready_in (req_ready_in), .valid_out (req_valid_out_unqual), .data_out ({req_tmask_out, req_tag_out, req_addr_out, req_rw_out, req_byteen_out, req_data_out}), .ready_out (req_ready_out) @@ -99,15 +77,15 @@ module VX_databus_arb #( /////////////////////////////////////////////////////////////////////// - wire [REQS_BITS-1:0] rsp_sel = rsp_tag_out[REQS_BITS-1:0]; + wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[LOG_NUM_REQS-1:0]; for (genvar i = 0; i < NUM_REQS; i++) begin - assign rsp_valid_in[i] = rsp_valid_out && (rsp_sel == REQS_BITS'(i)); - assign rsp_tag_in[i] = rsp_tag_out[REQS_BITS +: TAG_IN_WIDTH]; - assign rsp_data_in[i] = rsp_data_out; + assign rsp_valid_out[i] = rsp_valid_in && (rsp_sel == LOG_NUM_REQS'(i)); + assign rsp_tag_out[i] = rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH]; + assign rsp_data_out[i] = rsp_data_in; end - assign rsp_ready_out = rsp_ready_in[rsp_sel]; + assign rsp_ready_in = rsp_ready_out[rsp_sel]; end else begin @@ -122,10 +100,10 @@ module VX_databus_arb #( assign req_data_out = req_data_in; assign req_ready_in = req_ready_out; - assign rsp_valid_in = rsp_valid_out; - assign rsp_tag_in = rsp_tag_out; - assign rsp_data_in = rsp_data_out; - assign rsp_ready_out = rsp_ready_in; + assign rsp_valid_out = rsp_valid_in; + assign rsp_tag_out = rsp_tag_in; + assign rsp_data_out = rsp_data_in; + assign rsp_ready_in = rsp_ready_out; end diff --git a/hw/rtl/VX_dcache_arb.v b/hw/rtl/VX_dcache_arb.v index c8367f2b..45d1dfec 100644 --- a/hw/rtl/VX_dcache_arb.v +++ b/hw/rtl/VX_dcache_arb.v @@ -36,7 +36,7 @@ module VX_dcache_arb ( wire core_req_valid; VX_skid_buffer #( - .DATAW (REQ_DATAW) + .DATAW (REQ_DATAW) ) req_buffer ( .clk (clk), .reset (reset), @@ -121,9 +121,10 @@ module VX_dcache_arb ( assign rsp_valid_in[2] = (| io_rsp_if.valid); VX_stream_arbiter #( - .NUM_REQS (3), - .DATAW (RSP_DATAW), - .BUFFERED (1) + .NUM_REQS (3), + .DATAW (RSP_DATAW), + .IN_BUFFER (1), + .OUT_BUFFER (1) ) rsp_arb ( .clk (clk), .reset (reset), @@ -138,6 +139,7 @@ module VX_dcache_arb ( assign cache_rsp_if.ready = rsp_ready_in[0]; assign smem_rsp_if.ready = rsp_ready_in[1]; assign io_rsp_if.ready = rsp_ready_in[2]; + assign core_rsp_if.valid = core_rsp_tmask & {`NUM_THREADS{core_rsp_valid}}; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_mem_arb.v b/hw/rtl/VX_mem_arb.v index b7f1f30e..f0869ab7 100644 --- a/hw/rtl/VX_mem_arb.v +++ b/hw/rtl/VX_mem_arb.v @@ -6,9 +6,9 @@ module VX_mem_arb #( parameter TAG_IN_WIDTH = 1, parameter TAG_OUT_WIDTH = 1, - parameter DATA_SIZE = (DATA_WIDTH / 8), - parameter ADDR_WIDTH = 32 - `CLOG2(DATA_SIZE), - parameter REQS_BITS = `CLOG2(NUM_REQS) + parameter DATA_SIZE = (DATA_WIDTH / 8), + parameter ADDR_WIDTH = 32 - `CLOG2(DATA_SIZE), + parameter LOG_NUM_REQS = `CLOG2(NUM_REQS) ) ( input wire clk, input wire reset, @@ -43,45 +43,27 @@ module VX_mem_arb #( output wire [NUM_REQS-1:0][DATA_WIDTH-1:0] rsp_data_out, input wire [NUM_REQS-1:0] rsp_ready_out ); - localparam DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH; + localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH; + localparam RSP_DATAW = TAG_IN_WIDTH + DATA_WIDTH; if (NUM_REQS > 1) begin - wire [NUM_REQS-1:0][DATAW-1:0] data_in; + wire [NUM_REQS-1:0][REQ_DATAW-1:0] data_in; for (genvar i = 0; i < NUM_REQS; i++) begin - assign data_in[i] = {{req_tag_in[i], REQS_BITS'(i)}, req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; - end - - // Inputs buffering - wire [NUM_REQS-1:0] req_valid_in_qual; - wire [NUM_REQS-1:0][DATAW-1:0] req_data_in_qual; - wire [NUM_REQS-1:0] req_ready_in_qual; - for (genvar i = 0; i < NUM_REQS; ++i) begin - VX_skid_buffer #( - .DATAW (DATAW), - .PASSTHRU (NUM_REQS < 4) - ) req_buffer ( - .clk (clk), - .reset (reset), - .valid_in (req_valid_in[i]), - .data_in (data_in[i]), - .ready_in (req_ready_in[i]), - .valid_out (req_valid_in_qual[i]), - .data_out (req_data_in_qual[i]), - .ready_out (req_ready_in_qual[i]) - ); + assign data_in[i] = {{req_tag_in[i], LOG_NUM_REQS'(i)}, req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; end VX_stream_arbiter #( - .NUM_REQS (NUM_REQS), - .DATAW (DATAW), - .BUFFERED (NUM_REQS >= 4) + .NUM_REQS (NUM_REQS), + .DATAW (REQ_DATAW), + .IN_BUFFER (NUM_REQS >= 4), + .OUT_BUFFER (NUM_REQS >= 4) ) req_arb ( .clk (clk), .reset (reset), - .valid_in (req_valid_in_qual), - .data_in (req_data_in_qual), - .ready_in (req_ready_in_qual), + .valid_in (req_valid_in), + .data_in (data_in), + .ready_in (req_ready_in), .valid_out (req_valid_out), .data_out ({req_tag_out, req_addr_out, req_rw_out, req_byteen_out, req_data_out}), .ready_out (req_ready_out) @@ -89,15 +71,15 @@ module VX_mem_arb #( /////////////////////////////////////////////////////////////////////// - wire [REQS_BITS-1:0] rsp_sel = rsp_tag_in [REQS_BITS-1:0]; + wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in [LOG_NUM_REQS-1:0]; for (genvar i = 0; i < NUM_REQS; i++) begin - assign rsp_valid_out [i] = rsp_valid_in && (rsp_sel == REQS_BITS'(i)); - assign rsp_tag_out [i] = rsp_tag_in[REQS_BITS +: TAG_IN_WIDTH]; - assign rsp_data_out [i] = rsp_data_in; + assign rsp_valid_out [i] = rsp_valid_in && (rsp_sel == LOG_NUM_REQS'(i)); + assign rsp_tag_out [i] = rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH]; + assign rsp_data_out [i] = rsp_data_in; end - assign rsp_ready_in = rsp_ready_out [rsp_sel]; + assign rsp_ready_in = rsp_ready_out [rsp_sel]; end else begin diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 7efb54bd..c7be43cb 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -206,17 +206,17 @@ module Vortex ( .req_tag_out (io_req_tag), .req_ready_out (io_req_ready), - // input responses - .rsp_valid_in (per_cluster_io_rsp_valid), - .rsp_data_in (per_cluster_io_rsp_data), - .rsp_tag_in (per_cluster_io_rsp_tag), - .rsp_ready_in (per_cluster_io_rsp_ready), - - // output response - .rsp_valid_out (io_rsp_valid), - .rsp_tag_out (io_rsp_tag), - .rsp_data_out (io_rsp_data), - .rsp_ready_out (io_rsp_ready) + // input response + .rsp_valid_in (io_rsp_valid), + .rsp_tag_in (io_rsp_tag), + .rsp_data_in (io_rsp_data), + .rsp_ready_in (io_rsp_ready), + + // output responses + .rsp_valid_out (per_cluster_io_rsp_valid), + .rsp_data_out (per_cluster_io_rsp_data), + .rsp_tag_out (per_cluster_io_rsp_tag), + .rsp_ready_out (per_cluster_io_rsp_ready) ); VX_csr_io_arb #( @@ -300,36 +300,30 @@ module Vortex ( if (`L3_ENABLE) begin - wire [`NUM_CLUSTERS-1:0] cluster_dram_rsp_valid; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] cluster_dram_rsp_data; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] cluster_dram_rsp_tag; - wire cluster_dram_rsp_ready; + wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_valid_qual; + wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_rw_qual; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_BYTEEN_WIDTH-1:0] per_cluster_dram_req_byteen_qual; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_ADDR_WIDTH-1:0] per_cluster_dram_req_addr_qual; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_req_data_qual; + wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_req_tag_qual; + wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_ready_qual; - reg [`NUM_CLUSTERS-1:0] cluster_dram_rsp_ready_other; - always @(*) begin - cluster_dram_rsp_ready_other = {`NUM_CLUSTERS{1'b1}}; - for (integer i = 0; i < `NUM_CLUSTERS; i++) begin - for (integer j = 0; j < `NUM_CLUSTERS; j++) begin - if (i != j) begin - cluster_dram_rsp_ready_other[i] &= (per_cluster_dram_rsp_ready [j] | !cluster_dram_rsp_valid [j]); - end - end - end + for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin + VX_skid_buffer #( + .DATAW (1 + `L2DRAM_BYTEEN_WIDTH + `L2DRAM_ADDR_WIDTH + `L2DRAM_LINE_WIDTH + `L2DRAM_TAG_WIDTH), + .PASSTHRU (`NUM_CLUSTERS < 4) + ) dram_req_buffer ( + .clk (clk), + .reset (reset), + .valid_in (per_cluster_dram_req_valid[i]), + .data_in ({per_cluster_dram_req_rw[i], per_cluster_dram_req_byteen[i], per_cluster_dram_req_addr[i], per_cluster_dram_req_data[i], per_cluster_dram_req_tag[i]}), + .ready_in (per_cluster_dram_req_ready[i]), + .valid_out (per_cluster_dram_req_valid_qual[i]), + .data_out ({per_cluster_dram_req_rw_qual[i], per_cluster_dram_req_byteen_qual[i], per_cluster_dram_req_addr_qual[i], per_cluster_dram_req_data_qual[i], per_cluster_dram_req_tag_qual[i]}), + .ready_out (per_cluster_dram_req_ready_qual[i]) + ); end - for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin - // Core Response - assign per_cluster_dram_rsp_valid [i] = cluster_dram_rsp_valid [i] & cluster_dram_rsp_ready_other [i]; - assign per_cluster_dram_rsp_data [i] = cluster_dram_rsp_data [i]; - assign per_cluster_dram_rsp_tag [i] = cluster_dram_rsp_tag [i]; - end - assign cluster_dram_rsp_ready = & (per_cluster_dram_rsp_ready | ~cluster_dram_rsp_valid); - - wire cluster_dram_req_ready; - for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin - assign per_cluster_dram_req_ready[i] = cluster_dram_req_ready; - end - VX_cache #( .CACHE_ID (`L3CACHE_ID), .CACHE_SIZE (`L3CACHE_SIZE), @@ -358,19 +352,19 @@ module Vortex ( .reset (reset), // Core request - .core_req_valid (per_cluster_dram_req_valid), - .core_req_rw (per_cluster_dram_req_rw), - .core_req_byteen (per_cluster_dram_req_byteen), - .core_req_addr (per_cluster_dram_req_addr), - .core_req_data (per_cluster_dram_req_data), - .core_req_tag (per_cluster_dram_req_tag), - .core_req_ready (cluster_dram_req_ready), + .core_req_valid (per_cluster_dram_req_valid_qual), + .core_req_rw (per_cluster_dram_req_rw_qual), + .core_req_byteen (per_cluster_dram_req_byteen_qual), + .core_req_addr (per_cluster_dram_req_addr_qual), + .core_req_data (per_cluster_dram_req_data_qual), + .core_req_tag (per_cluster_dram_req_tag_qual), + .core_req_ready (per_cluster_dram_req_ready_qual), // Core response - .core_rsp_valid (cluster_dram_rsp_valid), - .core_rsp_data (cluster_dram_rsp_data), - .core_rsp_tag (cluster_dram_rsp_tag), - .core_rsp_ready (cluster_dram_rsp_ready), + .core_rsp_valid (per_cluster_dram_rsp_valid), + .core_rsp_data (per_cluster_dram_rsp_data), + .core_rsp_tag (per_cluster_dram_rsp_tag), + .core_rsp_ready (per_cluster_dram_rsp_ready), // DRAM request .dram_req_valid (dram_req_valid), diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 34f6c9c4..b5758eef 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -264,7 +264,9 @@ module VX_bank #( .full (creq_full) ); - reg [$clog2(MSHR_SIZE+1)-1:0] mshr_pending_size; + reg [$clog2(MSHR_SIZE+1)-1:0] mshr_pending_size; + wire [$clog2(MSHR_SIZE+1)-1:0] mshr_pending_size_n; + reg mshr_going_full; wire mshr_pop; wire mshr_valid_st0; wire[`REQS_BITS-1:0] mshr_tid_st0; @@ -346,14 +348,12 @@ module VX_bank #( wire dreq_push_stall; wire srsq_push_stall; wire pipeline_stall; - + wire is_mshr_miss_st2 = valid_st2 && is_mshr_st2 && (miss_st2 || force_miss_st2); wire is_mshr_miss_st3 = valid_st3 && is_mshr_st3 && (miss_st3 || force_miss_st3); wire creq_commit = valid_st1 && core_req_hit_st1 && !pipeline_stall; - wire mshr_going_full = (mshr_pending_size == MSHR_SIZE); - // determine which queue to pop next in piority order wire mshr_pop_unqual = mshr_valid_st0; wire drsq_pop_unqual = !mshr_pop_unqual && !drsq_empty; @@ -367,13 +367,16 @@ module VX_bank #( assign sreq_pop = sreq_pop_unqual && !pipeline_stall; // MSHR pending size + assign mshr_pending_size_n = mshr_pending_size + + ((creq_pop && !creq_commit) ? 1 : ((creq_commit && !creq_pop) ? -1 : 0)); always @(posedge clk) begin if (reset) begin mshr_pending_size <= 0; + mshr_going_full <= 0; end else begin - mshr_pending_size <= mshr_pending_size + - ((creq_pop && !creq_commit) ? 1 : ((creq_commit && !creq_pop) ? -1 : 0)); - end + mshr_pending_size <= mshr_pending_size_n; + mshr_going_full <= (mshr_pending_size_n == MSHR_SIZE); + end end assign is_mshr_st0 = mshr_pop_unqual; @@ -736,7 +739,7 @@ end .enqueue_byteen_st3 (req_byteen_st3), .enqueue_is_snp_st3 (is_snp_st3), .enqueue_snp_inv_st3(snp_inv_st3), - .enqueue_mshr_st3 (is_mshr_st3), + .enqueue_is_mshr_st3(is_mshr_st3), .enqueue_ready_st3 (mshr_init_ready_state_st3), .enqueue_full (mshr_full), diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index f65027fa..5dc12063 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -39,11 +39,11 @@ module VX_cache #( // Enable cache flush parameter FLUSH_ENABLE = 1, - // size of tag id in core request tag - parameter CORE_TAG_ID_BITS = $clog2(MSHR_SIZE), - // core request tag size - parameter CORE_TAG_WIDTH = CORE_TAG_ID_BITS, + parameter CORE_TAG_WIDTH = $clog2(MSHR_SIZE), + + // size of tag id in core request tag + parameter CORE_TAG_ID_BITS = 0, // dram request tag size parameter DRAM_TAG_WIDTH = (32 - $clog2(BANK_LINE_SIZE)), @@ -63,13 +63,13 @@ module VX_cache #( input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data, input wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_req_tag, - output wire core_req_ready, + output wire [`CORE_REQ_TAG_COUNT-1:0] core_req_ready, // Core response output wire [NUM_REQS-1:0] core_rsp_valid, output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data, output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, - input wire core_rsp_ready, + input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready, // DRAM request output wire dram_req_valid, @@ -139,9 +139,10 @@ module VX_cache #( VX_cache_core_req_bank_sel #( .BANK_LINE_SIZE (BANK_LINE_SIZE), - .NUM_BANKS (NUM_BANKS), - .WORD_SIZE (WORD_SIZE), - .NUM_REQS (NUM_REQS) + .NUM_BANKS (NUM_BANKS), + .WORD_SIZE (WORD_SIZE), + .NUM_REQS (NUM_REQS), + .CORE_TAG_ID_BITS (CORE_TAG_ID_BITS) ) cache_core_req_bank_sel ( .core_req_valid (core_req_valid), .core_req_addr (core_req_addr), @@ -197,7 +198,7 @@ module VX_cache #( wire curr_bank_miss; // Core Req - assign curr_bank_core_req_valid = (per_bank_valid[i] & {NUM_REQS{core_req_ready}}); + assign curr_bank_core_req_valid = per_bank_valid[i]; assign curr_bank_core_req_addr = core_req_addr; assign curr_bank_core_req_rw = core_req_rw; assign curr_bank_core_req_byteen = core_req_byteen; @@ -355,18 +356,18 @@ module VX_cache #( end VX_stream_arbiter #( - .NUM_REQS(NUM_BANKS), - .DATAW(`DRAM_ADDR_WIDTH + 1 + BANK_LINE_SIZE + `BANK_LINE_WIDTH), - .BUFFERED(NUM_BANKS >= 4) + .NUM_REQS (NUM_BANKS), + .DATAW (`DRAM_ADDR_WIDTH + 1 + BANK_LINE_SIZE + `BANK_LINE_WIDTH), + .OUT_BUFFER (NUM_BANKS >= 4) ) dram_req_arb ( - .clk (clk), - .reset (reset), - .valid_in (per_bank_dram_req_valid), - .data_in (data_in), - .ready_in (per_bank_dram_req_ready), - .valid_out (dram_req_valid), - .data_out ({dram_req_addr, dram_req_rw, dram_req_byteen, dram_req_data}), - .ready_out (dram_req_ready) + .clk (clk), + .reset (reset), + .valid_in (per_bank_dram_req_valid), + .data_in (data_in), + .ready_in (per_bank_dram_req_ready), + .valid_out (dram_req_valid), + .data_out ({dram_req_addr, dram_req_rw, dram_req_byteen, dram_req_data}), + .ready_out (dram_req_ready) ); end else begin `UNUSED_VAR (per_bank_dram_req_valid) @@ -385,18 +386,18 @@ module VX_cache #( if (FLUSH_ENABLE) begin VX_stream_arbiter #( - .NUM_REQS(NUM_BANKS), - .DATAW(SNP_TAG_WIDTH), - .BUFFERED(NUM_BANKS >= 4) + .NUM_REQS (NUM_BANKS), + .DATAW (SNP_TAG_WIDTH), + .OUT_BUFFER (NUM_BANKS >= 4) ) snp_rsp_arb ( - .clk (clk), - .reset (reset), - .valid_in (per_bank_snp_rsp_valid), - .data_in (per_bank_snp_rsp_tag), - .ready_in (per_bank_snp_rsp_ready), - .valid_out (snp_rsp_valid), - .data_out (snp_rsp_tag), - .ready_out (snp_rsp_ready) + .clk (clk), + .reset (reset), + .valid_in (per_bank_snp_rsp_valid), + .data_in (per_bank_snp_rsp_tag), + .ready_in (per_bank_snp_rsp_ready), + .valid_out (snp_rsp_valid), + .data_out (snp_rsp_tag), + .ready_out (snp_rsp_ready) ); end else begin `UNUSED_VAR (per_bank_snp_rsp_valid) diff --git a/hw/rtl/cache/VX_cache_config.vh b/hw/rtl/cache/VX_cache_config.vh index ff90463f..e37cd912 100644 --- a/hw/rtl/cache/VX_cache_config.vh +++ b/hw/rtl/cache/VX_cache_config.vh @@ -15,7 +15,7 @@ `define REQ_INST_META_WIDTH (`REQ_TAG_WIDTH + 1 + WORD_SIZE + `REQS_BITS) // data metadata word_sel is_snp snp_inv -`define MSHR_METADATA_WIDTH (`WORD_WIDTH + `REQ_INST_META_WIDTH + `UP(`WORD_SELECT_WIDTH) + 1 + 1) +`define MSHR_DATA_WIDTH (`WORD_WIDTH + `REQ_INST_META_WIDTH + `UP(`WORD_SELECT_WIDTH) + 1 + 1) `define BANK_BITS `LOG2UP(NUM_BANKS) diff --git a/hw/rtl/cache/VX_cache_core_req_bank_sel.v b/hw/rtl/cache/VX_cache_core_req_bank_sel.v index e48db901..e7a2ce66 100644 --- a/hw/rtl/cache/VX_cache_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_cache_core_req_bank_sel.v @@ -8,53 +8,72 @@ module VX_cache_core_req_bank_sel #( // Number of banks parameter NUM_BANKS = 1, // Number of Word requests per cycle - parameter NUM_REQS = 1 + parameter NUM_REQS = 1, + // size of tag id in core request tag + parameter CORE_TAG_ID_BITS = 1 ) ( input wire [NUM_REQS-1:0] core_req_valid, input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, - output wire core_req_ready, + output wire [`CORE_REQ_TAG_COUNT-1:0] core_req_ready, output wire [NUM_BANKS-1:0][NUM_REQS-1:0] per_bank_valid, input wire [NUM_BANKS-1:0] per_bank_ready ); if (NUM_BANKS > 1) begin - - reg [NUM_BANKS-1:0][NUM_REQS-1:0] per_bank_valid_r; - reg [NUM_BANKS-1:0] per_bank_ready_ignore; - reg [NUM_BANKS-1:0] per_bank_ready_other; - - always @(*) begin - per_bank_valid_r = 0; - per_bank_ready_other = {NUM_BANKS{1'b1}}; - per_bank_ready_ignore = {NUM_BANKS{1'b1}}; - - for (integer i = 0; i < NUM_BANKS; i++) begin - for (integer j = 0; j < NUM_BANKS; j++) begin - if (i != j) begin - per_bank_ready_other[i] &= (per_bank_ready[j] | per_bank_ready_ignore[j]); - end - end - end + reg [NUM_BANKS-1:0][NUM_REQS-1:0] per_bank_valid_r; + + always @(*) begin + per_bank_valid_r = 0; for (integer i = 0; i < NUM_REQS; i++) begin per_bank_valid_r[core_req_addr[i][`BANK_SELECT_ADDR_RNG]][i] = core_req_valid[i]; - per_bank_ready_ignore[core_req_addr[i][`BANK_SELECT_ADDR_RNG]] = 1'b0; end end - for (genvar i = 0; i < NUM_BANKS; i++) begin - for (genvar j = 0; j < NUM_REQS; j++) begin - assign per_bank_valid[i][j] = per_bank_valid_r[i][j] & per_bank_ready_other[i]; - end - end + if (CORE_TAG_ID_BITS != 0) begin + + reg [NUM_BANKS-1:0] per_bank_ready_other, per_bank_ready_ignore; + + always @(*) begin + per_bank_ready_other = {NUM_BANKS{1'b1}}; + per_bank_ready_ignore = {NUM_BANKS{1'b1}}; - assign core_req_ready = & (per_bank_ready | per_bank_ready_ignore); + for (integer i = 0; i < NUM_REQS; i++) begin + per_bank_ready_ignore[core_req_addr[i][`BANK_SELECT_ADDR_RNG]] = 1'b0; + end + + for (integer i = 0; i < NUM_BANKS; i++) begin + for (integer j = 0; j < NUM_BANKS; j++) begin + if (i != j) begin + per_bank_ready_other[i] &= (per_bank_ready[j] | per_bank_ready_ignore[j]); + end + end + end + end + + for (genvar i = 0; i < NUM_BANKS; i++) begin + for (genvar j = 0; j < NUM_REQS; j++) begin + assign per_bank_valid[i][j] = per_bank_valid_r[i][j] && per_bank_ready_other[i]; + end + end + + assign core_req_ready[0] = & (per_bank_ready | per_bank_ready_ignore); + + end else begin + + assign per_bank_valid = per_bank_valid_r; + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign core_req_ready[i] = per_bank_ready[core_req_addr[i][`BANK_SELECT_ADDR_RNG]]; + end + + end end else begin `UNUSED_VAR (core_req_addr) assign per_bank_valid = core_req_valid; - assign core_req_ready = per_bank_ready; + assign core_req_ready[0] = per_bank_ready; end diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index 2feb90e6..b670b3d3 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -26,18 +26,20 @@ module VX_cache_core_rsp_merge #( output wire [NUM_REQS-1:0] core_rsp_valid, output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data, - input wire core_rsp_ready + input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready ); if (NUM_BANKS > 1) begin reg [NUM_REQS-1:0] core_rsp_valid_unqual; - reg [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual; reg [NUM_BANKS-1:0] core_rsp_bank_select; if (CORE_TAG_ID_BITS != 0) begin + reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; reg [CORE_TAG_ID_BITS-1:0] sel_tag_id; + + wire stall = ~core_rsp_ready && (| core_rsp_valid); always @(*) begin core_rsp_valid_unqual = 0; @@ -60,13 +62,32 @@ module VX_cache_core_rsp_merge #( && (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == sel_tag_id)) begin core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; - core_rsp_bank_select[i] = 1; + core_rsp_bank_select[i] = ~stall; end end + end + + VX_generic_register #( + .N(NUM_REQS + (NUM_REQS *`WORD_WIDTH) + CORE_TAG_WIDTH), + .R(NUM_REQS) + ) pipe_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (1'b0), + .data_in ({core_rsp_valid_unqual, core_rsp_data_unqual, core_rsp_tag_unqual}), + .data_out ({core_rsp_valid, core_rsp_data, core_rsp_tag}) + ); + + for (genvar i = 0; i < NUM_BANKS; i++) begin + assign per_bank_core_rsp_ready[i] = core_rsp_bank_select[i]; end end else begin + reg [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; + reg [NUM_REQS-1:0] stall; + always @(*) begin core_rsp_valid_unqual = 0; core_rsp_tag_unqual = 'x; @@ -79,29 +100,32 @@ module VX_cache_core_rsp_merge #( core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; core_rsp_tag_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i]; core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; - core_rsp_bank_select[i] = 1; + core_rsp_bank_select[i] = ~stall[per_bank_core_rsp_tid[i]]; end end end - end + for (genvar i = 0; i < NUM_REQS; i++) begin - wire stall = ~core_rsp_ready && (| core_rsp_valid); + assign stall[i] = ~core_rsp_ready[i] && core_rsp_valid[i]; - VX_generic_register #( - .N(NUM_REQS + (NUM_REQS *`WORD_WIDTH) + (`CORE_REQ_TAG_COUNT * CORE_TAG_WIDTH)), - .R(NUM_REQS) - ) pipe_reg ( - .clk (clk), - .reset (reset), - .stall (stall), - .flush (1'b0), - .data_in ({core_rsp_valid_unqual, core_rsp_data_unqual, core_rsp_tag_unqual}), - .data_out ({core_rsp_valid, core_rsp_data, core_rsp_tag}) - ); + VX_generic_register #( + .N(1 + `WORD_WIDTH + CORE_TAG_WIDTH), + .R(1) + ) pipe_reg ( + .clk (clk), + .reset (reset), + .stall (stall[i]), + .flush (1'b0), + .data_in ({core_rsp_valid_unqual[i], core_rsp_data_unqual[i], core_rsp_tag_unqual[i]}), + .data_out ({core_rsp_valid[i], core_rsp_data[i], core_rsp_tag[i]}) + ); + end + + for (genvar i = 0; i < NUM_BANKS; i++) begin + assign per_bank_core_rsp_ready[i] = core_rsp_bank_select[i]; + end - for (genvar i = 0; i < NUM_BANKS; i++) begin - assign per_bank_core_rsp_ready[i] = core_rsp_bank_select[i] && ~stall; end end else begin @@ -116,14 +140,19 @@ module VX_cache_core_rsp_merge #( reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual; if (CORE_TAG_ID_BITS != 0) begin + always @(*) begin core_rsp_valid_unqual = 0; core_rsp_tag_unqual = per_bank_core_rsp_tag[0]; core_rsp_data_unqual = 'x; core_rsp_valid_unqual[per_bank_core_rsp_tid[0]] = per_bank_core_rsp_valid; core_rsp_data_unqual[per_bank_core_rsp_tid[0]] = per_bank_core_rsp_data[0]; - end + end + + assign per_bank_core_rsp_ready[0] = core_rsp_ready; + end else begin + always @(*) begin core_rsp_valid_unqual = 0; core_rsp_tag_unqual = 'x; @@ -131,14 +160,16 @@ module VX_cache_core_rsp_merge #( core_rsp_valid_unqual[per_bank_core_rsp_tid[0]] = per_bank_core_rsp_valid; core_rsp_tag_unqual[per_bank_core_rsp_tid[0]] = per_bank_core_rsp_tag[0]; core_rsp_data_unqual[per_bank_core_rsp_tid[0]] = per_bank_core_rsp_data[0]; - end + end + + assign per_bank_core_rsp_ready[0] = core_rsp_ready[per_bank_core_rsp_tid[0]]; + end assign core_rsp_valid = core_rsp_valid_unqual; assign core_rsp_tag = core_rsp_tag_unqual; assign core_rsp_data = core_rsp_data_unqual; - assign per_bank_core_rsp_ready[0] = core_rsp_ready; - + end else begin `UNUSED_VAR(per_bank_core_rsp_tid) diff --git a/hw/rtl/cache/VX_miss_resrv.v b/hw/rtl/cache/VX_miss_resrv.v index ee3bc71a..fbe3e954 100644 --- a/hw/rtl/cache/VX_miss_resrv.v +++ b/hw/rtl/cache/VX_miss_resrv.v @@ -48,7 +48,7 @@ module VX_miss_resrv #( input wire[WORD_SIZE-1:0] enqueue_byteen_st3, input wire enqueue_is_snp_st3, input wire enqueue_snp_inv_st3, - input wire enqueue_mshr_st3, + input wire enqueue_is_mshr_st3, input wire enqueue_ready_st3, output wire enqueue_full, @@ -71,7 +71,7 @@ module VX_miss_resrv #( output wire dequeue_snp_inv_st0, input wire dequeue_st3 ); - wire [`MSHR_METADATA_WIDTH-1:0] metadata_table; + wire [`MSHR_DATA_WIDTH-1:0] data_table; reg [`LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0]; @@ -91,7 +91,7 @@ module VX_miss_resrv #( assign pending_hazard_st0 = (| valid_address_match); - wire dequeue_ready = valid_table[schedule_ptr] && ready_table[schedule_ptr]; + wire dequeue_ready = ready_table[schedule_ptr]; assign dequeue_valid_st0 = dequeue_ready; assign dequeue_addr_st0 = addr_table[schedule_ptr]; @@ -102,9 +102,9 @@ module VX_miss_resrv #( dequeue_byteen_st0, dequeue_wsel_st0, dequeue_is_snp_st0, - dequeue_snp_inv_st0} = metadata_table; + dequeue_snp_inv_st0} = data_table; - wire mshr_push = enqueue_st3 && !enqueue_mshr_st3; + wire mshr_push = enqueue_st3 && !enqueue_is_mshr_st3; wire [`LOG2UP(MSHR_SIZE)-1:0] head_ptr_n = head_ptr + $bits(head_ptr)'(1); @@ -124,7 +124,7 @@ module VX_miss_resrv #( if (enqueue_st3) begin assert(!enqueue_full); - if (enqueue_mshr_st3) begin + if (enqueue_is_mshr_st3) begin // returning missed msrq entry, restore schedule valid_table[restore_ptr] <= 1; ready_table[restore_ptr] <= enqueue_ready_st3; @@ -146,19 +146,20 @@ module VX_miss_resrv #( if (schedule_st0) begin assert(dequeue_valid_st0); valid_table[schedule_ptr] <= 0; + ready_table[schedule_ptr] <= 0; schedule_ptr <= schedule_ptr + $bits(schedule_ptr)'(1); end end end always @(posedge clk) begin - if (enqueue_st3 && !enqueue_mshr_st3) begin + if (enqueue_st3 && !enqueue_is_mshr_st3) begin addr_table[tail_ptr] <= enqueue_addr_st3; end end VX_dp_ram #( - .DATAW(`MSHR_METADATA_WIDTH), + .DATAW(`MSHR_DATA_WIDTH), .SIZE(MSHR_SIZE), .BYTEENW(1), .BUFFERED(0), @@ -171,7 +172,7 @@ module VX_miss_resrv #( .byteen(1'b1), .rden(1'b1), .din({enqueue_data_st3, enqueue_tid_st3, enqueue_tag_st3, enqueue_rw_st3, enqueue_byteen_st3, enqueue_wsel_st3, enqueue_is_snp_st3, enqueue_snp_inv_st3}), - .dout(metadata_table) + .dout(data_table) ); `ifdef DBG_PRINT_CACHE_MSHR @@ -180,7 +181,7 @@ module VX_miss_resrv #( if (schedule_st0) $display("%t: cache%0d:%0d msrq-schedule: addr%0d=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, schedule_ptr, `LINE_TO_BYTE_ADDR(dequeue_addr_st0, BANK_ID), debug_wid_st0, debug_pc_st0); if (enqueue_st3) begin - if (enqueue_mshr_st3) + if (enqueue_is_mshr_st3) $display("%t: cache%0d:%0d msrq-restore: addr%0d=%0h, ready=%b", $time, CACHE_ID, BANK_ID, restore_ptr, `LINE_TO_BYTE_ADDR(enqueue_addr_st3, BANK_ID), enqueue_ready_st3); else $display("%t: cache%0d:%0d msrq-enq: addr%0d=%0h, ready=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, tail_ptr, `LINE_TO_BYTE_ADDR(enqueue_addr_st3, BANK_ID), enqueue_ready_st3, debug_wid_st3, debug_pc_st3); diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index 08aed6d8..bd2c6a30 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -44,26 +44,6 @@ module VX_snp_forwarder #( if (NUM_REQS > 1) begin - // Inputs buffering - wire [NUM_REQS-1:0] snp_fwdin_valid_qual; - wire [NUM_REQS-1:0][TAG_OUT_WIDTH-1:0] snp_fwdin_tag_qual; - wire [NUM_REQS-1:0] snp_fwdin_ready_qual; - for (genvar i = 0; i < NUM_REQS; ++i) begin - VX_skid_buffer #( - .DATAW (TAG_OUT_WIDTH), - .PASSTHRU (NUM_REQS < 4) - ) snp_fwdin_buffer ( - .clk (clk), - .reset (reset), - .valid_in (snp_fwdin_valid[i]), - .data_in (snp_fwdin_tag[i]), - .ready_in (snp_fwdin_ready[i]), - .valid_out (snp_fwdin_valid_qual[i]), - .data_out (snp_fwdin_tag_qual[i]), - .ready_out (snp_fwdin_ready_qual[i]) - ); - end - reg [REQ_QUAL_BITS:0] pending_cntrs [SREQ_SIZE-1:0]; wire [TAG_OUT_WIDTH-1:0] sfq_write_addr, sfq_read_addr; @@ -181,15 +161,16 @@ module VX_snp_forwarder #( assign snp_req_ready = fwdout_ready && !sfq_full && !dispatch_hold; VX_stream_arbiter #( - .NUM_REQS(NUM_REQS), - .DATAW(TAG_OUT_WIDTH), - .BUFFERED(NUM_REQS >= 4) + .NUM_REQS (NUM_REQS), + .DATAW (TAG_OUT_WIDTH), + .IN_BUFFER (NUM_REQS >= 4), + .OUT_BUFFER (NUM_REQS >= 4) ) snp_fwdin_arb ( .clk (clk), .reset (reset), - .valid_in (snp_fwdin_valid_qual), - .data_in (snp_fwdin_tag_qual), - .ready_in (snp_fwdin_ready_qual), + .valid_in (snp_fwdin_valid), + .data_in (snp_fwdin_tag), + .ready_in (snp_fwdin_ready), .valid_out (fwdin_valid), .data_out (fwdin_tag), .ready_out (fwdin_ready) diff --git a/hw/rtl/libs/VX_cam_buffer.v b/hw/rtl/libs/VX_cam_buffer.v index fd38e6a4..fae05b2f 100644 --- a/hw/rtl/libs/VX_cam_buffer.v +++ b/hw/rtl/libs/VX_cam_buffer.v @@ -52,9 +52,7 @@ module VX_cam_buffer #( write_addr_r <= ADDRW'(1'b0); end else begin if (release_slot) begin - assert(0 == free_slots[release_addr]) else begin - $display("%t: releasing invalid slot at port %d", $time, release_addr); - end + assert(0 == free_slots[release_addr]) else $error("%t: releasing invalid slot at port %d", $time, release_addr); end free_slots <= free_slots_n; write_addr_r <= free_index; diff --git a/hw/rtl/libs/VX_stream_arbiter.v b/hw/rtl/libs/VX_stream_arbiter.v index 26a704a5..b17fc81b 100644 --- a/hw/rtl/libs/VX_stream_arbiter.v +++ b/hw/rtl/libs/VX_stream_arbiter.v @@ -1,10 +1,11 @@ `include "VX_platform.vh" module VX_stream_arbiter #( - parameter NUM_REQS = 1, - parameter DATAW = 1, - parameter TYPE = "R", - parameter BUFFERED = 0 + parameter NUM_REQS = 1, + parameter DATAW = 1, + parameter TYPE = "R", + parameter IN_BUFFER = 0, + parameter OUT_BUFFER = 0 ) ( input wire clk, input wire reset, @@ -17,18 +18,30 @@ module VX_stream_arbiter #( output wire [DATAW-1:0] data_out, input wire ready_out ); + localparam LOG_NUM_REQS = $clog2(NUM_REQS); - if (NUM_REQS == 1) begin + if (NUM_REQS > 1) begin - `UNUSED_VAR (clk) - `UNUSED_VAR (reset) - - assign valid_out = valid_in; - assign data_out = data_in; - assign ready_in = ready_out; + wire [NUM_REQS-1:0] valid_in_qual; + wire [NUM_REQS-1:0][DATAW-1:0] data_in_qual; + wire [NUM_REQS-1:0] ready_in_qual; - end else begin + for (genvar i = 0; i < NUM_REQS; ++i) begin + VX_skid_buffer #( + .DATAW (DATAW), + .PASSTHRU (!IN_BUFFER) + ) req_buffer ( + .clk (clk), + .reset (reset), + .valid_in (valid_in[i]), + .data_in (data_in[i]), + .ready_in (ready_in[i]), + .valid_out (valid_in_qual[i]), + .data_out (data_in_qual[i]), + .ready_out (ready_in_qual[i]) + ); + end wire sel_enable; wire sel_valid; @@ -41,13 +54,13 @@ module VX_stream_arbiter #( .NUM_REQS(NUM_REQS), .LOCK_ENABLE(1) ) sel_arb ( - .clk (clk), - .reset (reset), - .requests (valid_in), - .enable (sel_enable), - .grant_valid (sel_valid), - .grant_index (sel_idx), - .grant_onehot(sel_1hot) + .clk (clk), + .reset (reset), + .requests (valid_in_qual), + .enable (sel_enable), + .grant_valid (sel_valid), + .grant_index (sel_idx), + .grant_onehot (sel_1hot) ); end else if (TYPE == "R") begin @@ -56,13 +69,13 @@ module VX_stream_arbiter #( .NUM_REQS(NUM_REQS), .LOCK_ENABLE(1) ) sel_arb ( - .clk (clk), - .reset (reset), - .requests (valid_in), - .enable (sel_enable), - .grant_valid (sel_valid), - .grant_index (sel_idx), - .grant_onehot(sel_1hot) + .clk (clk), + .reset (reset), + .requests (valid_in_qual), + .enable (sel_enable), + .grant_valid (sel_valid), + .grant_index (sel_idx), + .grant_onehot (sel_1hot) ); end else if (TYPE == "F") begin @@ -71,13 +84,13 @@ module VX_stream_arbiter #( .NUM_REQS(NUM_REQS), .LOCK_ENABLE(1) ) sel_arb ( - .clk (clk), - .reset (reset), - .requests (valid_in), - .enable (sel_enable), - .grant_valid (sel_valid), - .grant_index (sel_idx), - .grant_onehot(sel_1hot) + .clk (clk), + .reset (reset), + .requests (valid_in_qual), + .enable (sel_enable), + .grant_valid (sel_valid), + .grant_index (sel_idx), + .grant_onehot (sel_1hot) ); end else if (TYPE == "M") begin @@ -86,18 +99,18 @@ module VX_stream_arbiter #( .NUM_REQS(NUM_REQS), .LOCK_ENABLE(1) ) sel_arb ( - .clk (clk), - .reset (reset), - .requests (valid_in), - .enable (sel_enable), - .grant_valid (sel_valid), - .grant_index (sel_idx), - .grant_onehot(sel_1hot) + .clk (clk), + .reset (reset), + .requests (valid_in_qual), + .enable (sel_enable), + .grant_valid (sel_valid), + .grant_index (sel_idx), + .grant_onehot (sel_1hot) ); end - if (BUFFERED) begin + if (OUT_BUFFER) begin wire stall = ~ready_out && valid_out; assign sel_enable = ~stall; @@ -110,25 +123,35 @@ module VX_stream_arbiter #( .reset (reset), .stall (stall), .flush (1'b0), - .data_in ({sel_valid, data_in[sel_idx]}), + .data_in ({sel_valid, data_in_qual[sel_idx]}), .data_out ({valid_out, data_out}) ); for (genvar i = 0; i < NUM_REQS; i++) begin - assign ready_in[i] = sel_1hot[i] && ~stall; + assign ready_in_qual[i] = sel_1hot[i] && ~stall; end end else begin assign sel_enable = ready_out; + assign valid_out = sel_valid; + assign data_out = data_in_qual[sel_idx]; - assign valid_out = sel_valid; - assign data_out = data_in[sel_idx]; - for (genvar i = 0; i < NUM_REQS; i++) begin - assign ready_in[i] = sel_1hot[i] && ready_out; + assign ready_in_qual[i] = sel_1hot[i] && ready_out; end - end + + end + + end else begin + + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + assign valid_out = valid_in; + assign data_out = data_in; + assign ready_in = ready_out; + end endmodule \ No newline at end of file