diff --git a/benchmarks/opencl/sgemm/Makefile b/benchmarks/opencl/sgemm/Makefile index 1c842209..eca87e70 100644 --- a/benchmarks/opencl/sgemm/Makefile +++ b/benchmarks/opencl/sgemm/Makefile @@ -4,7 +4,7 @@ SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf POCL_CC_PATH ?= /opt/pocl/compiler POCL_RT_PATH ?= /opt/pocl/runtime -OPTS ?= -n64 +OPTS ?= -n32 VORTEX_DRV_PATH ?= $(realpath ../../../driver) VORTEX_RT_PATH ?= $(realpath ../../../runtime) diff --git a/benchmarks/opencl/sgemm/main.cc b/benchmarks/opencl/sgemm/main.cc index 1b2e6293..1f92a14a 100644 --- a/benchmarks/opencl/sgemm/main.cc +++ b/benchmarks/opencl/sgemm/main.cc @@ -101,7 +101,7 @@ static void cleanup() { if (h_c) free(h_c); } -int size = 64; +int size = 32; static void show_usage() { printf("Usage: [-n size] [-h: help]\n"); diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 03260f2c..c79a8777 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -172,7 +172,6 @@ module VX_cluster #( .DRSQ_SIZE (`L2DRSQ_SIZE), .CRSQ_SIZE (`L2CRSQ_SIZE), .DREQ_SIZE (`L2DREQ_SIZE), - .DRAM_ENABLE (1), .WRITE_ENABLE (1), .CORE_TAG_WIDTH (`XDRAM_TAG_WIDTH), .CORE_TAG_ID_BITS (0), diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index f7e7462f..407bd223 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -299,14 +299,11 @@ // Cache ID `define SCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2) -// Block size in bytes -`define SCACHE_LINE_SIZE 4 - // Word size in bytes `define SWORD_SIZE 4 // bank address offset -`define SBANK_ADDR_OFFSET `CLOG2(`STACK_SIZE / `SCACHE_LINE_SIZE) +`define SBANK_ADDR_OFFSET `CLOG2(`STACK_SIZE / `SWORD_SIZE) // Core request size `define SNUM_REQUESTS `NUM_THREADS diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 52289b8f..14f1caa1 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -103,7 +103,6 @@ module VX_mem_unit # ( .DRSQ_SIZE (`IDRSQ_SIZE), .CRSQ_SIZE (`ICRSQ_SIZE), .DREQ_SIZE (`IDREQ_SIZE), - .DRAM_ENABLE (1), .WRITE_ENABLE (0), .CORE_TAG_WIDTH (`ICORE_TAG_WIDTH), .CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS), @@ -160,8 +159,7 @@ module VX_mem_unit # ( .MSHR_SIZE (`DMSHR_SIZE), .DRSQ_SIZE (`DDRSQ_SIZE), .CRSQ_SIZE (`DCRSQ_SIZE), - .DREQ_SIZE (`DDREQ_SIZE), - .DRAM_ENABLE (1), + .DREQ_SIZE (`DDREQ_SIZE), .WRITE_ENABLE (1), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), @@ -215,22 +213,16 @@ module VX_mem_unit # ( .clk (clk), .reset (reset), .reset_out (scache_reset) - ); + ); - VX_cache #( + VX_shared_mem #( .CACHE_ID (`SCACHE_ID), .CACHE_SIZE (`SMEM_SIZE), - .CACHE_LINE_SIZE (`SCACHE_LINE_SIZE), .NUM_BANKS (`SNUM_BANKS), .WORD_SIZE (`SWORD_SIZE), .NUM_REQS (`SNUM_REQUESTS), .CREQ_SIZE (`SCREQ_SIZE), - .MSHR_SIZE (8), - .DRSQ_SIZE (1), .CRSQ_SIZE (`SCRSQ_SIZE), - .DREQ_SIZE (1), - .DRAM_ENABLE (0), - .WRITE_ENABLE (1), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), .BANK_ADDR_OFFSET (`SBANK_ADDR_OFFSET) @@ -240,6 +232,10 @@ module VX_mem_unit # ( .clk (clk), .reset (scache_reset), + `ifdef PERF_ENABLE + .perf_cache_if (perf_smem_if), + `endif + // Core request .core_req_valid (smem_req_if.valid), .core_req_rw (smem_req_if.rw), @@ -253,26 +249,7 @@ module VX_mem_unit # ( .core_rsp_valid (smem_rsp_if.valid), .core_rsp_data (smem_rsp_if.data), .core_rsp_tag (smem_rsp_if.tag), - .core_rsp_ready (smem_rsp_if.ready), - - `ifdef PERF_ENABLE - .perf_cache_if (perf_smem_if), - `endif - - // DRAM request - `UNUSED_PIN (dram_req_valid), - `UNUSED_PIN (dram_req_rw), - `UNUSED_PIN (dram_req_byteen), - `UNUSED_PIN (dram_req_addr), - `UNUSED_PIN (dram_req_data), - `UNUSED_PIN (dram_req_tag), - .dram_req_ready (1'b0), - - // DRAM response - .dram_rsp_valid (0), - .dram_rsp_data (0), - .dram_rsp_tag (0), - `UNUSED_PIN (dram_rsp_ready) + .core_rsp_ready (smem_rsp_if.ready) ); end diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 3402d997..1bf0464a 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -174,7 +174,6 @@ module Vortex ( .DRSQ_SIZE (`L3DRSQ_SIZE), .CRSQ_SIZE (`L3CRSQ_SIZE), .DREQ_SIZE (`L3DREQ_SIZE), - .DRAM_ENABLE (1), .WRITE_ENABLE (1), .CORE_TAG_WIDTH (`L2DRAM_TAG_WIDTH), .CORE_TAG_ID_BITS (0), diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index dd0c2a84..4e968583 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -27,9 +27,6 @@ module VX_bank #( // DRAM Request Queue Size parameter DREQ_SIZE = 1, - // Enable dram update - parameter DRAM_ENABLE = 1, - // Enable cache writeable parameter WRITE_ENABLE = 1, @@ -103,38 +100,27 @@ module VX_bank #( wire drsq_push = dram_rsp_valid && dram_rsp_ready; - if (DRAM_ENABLE) begin - wire drsq_full; - assign dram_rsp_ready = !drsq_full; + wire drsq_full; + assign dram_rsp_ready = !drsq_full; - VX_fifo_queue_xt #( - .DATAW (`LINE_ADDR_WIDTH + $bits(dram_rsp_data)), - .SIZE (DRSQ_SIZE), - .FASTRAM (1) - ) dram_rsp_queue ( - .clk (clk), - .reset (reset), - .push (drsq_push), - .pop (drsq_pop), - .data_in ({dram_rsp_addr, dram_rsp_data}), - `UNUSED_PIN (data_out), - .empty (drsq_empty), - .data_out_next ({drsq_addr_next, drsq_filldata_next}), - .empty_next (drsq_empty_next), - .full (drsq_full), - `UNUSED_PIN (almost_full), - `UNUSED_PIN (size) - ); - end else begin - `UNUSED_VAR (dram_rsp_valid) - `UNUSED_VAR (dram_rsp_addr) - `UNUSED_VAR (dram_rsp_data) - assign drsq_empty = 1; - assign drsq_empty_next = 1; - assign drsq_addr_next = 0; - assign drsq_filldata_next = 0; - assign dram_rsp_ready = 0; - end + VX_fifo_queue_xt #( + .DATAW (`LINE_ADDR_WIDTH + $bits(dram_rsp_data)), + .SIZE (DRSQ_SIZE), + .FASTRAM (1) + ) dram_rsp_queue ( + .clk (clk), + .reset (reset), + .push (drsq_push), + .pop (drsq_pop), + .data_in ({dram_rsp_addr, dram_rsp_data}), + `UNUSED_PIN (data_out), + .empty (drsq_empty), + .data_out_next ({drsq_addr_next, drsq_filldata_next}), + .empty_next (drsq_empty_next), + .full (drsq_full), + `UNUSED_PIN (almost_full), + `UNUSED_PIN (size) + ); wire creq_pop; wire creq_full, creq_empty; @@ -221,14 +207,6 @@ module VX_bank #( wire dreq_push_unqual_st0, dreq_push_unqual_st1; wire writeen_st1; wire core_req_hit_st1; - - wire valid_st01; - wire writeen_st01; - wire [`LINE_ADDR_WIDTH-1:0] addr_st01; - wire [`UP(`WORD_SELECT_BITS)-1:0] wsel_st01; - wire [WORD_SIZE-1:0] byteen_st01; - wire [`WORD_WIDTH-1:0] writeword_st01; - wire [`REQ_TAG_WIDTH-1:0] tag_st01; wire mshr_push_stall; wire crsq_push_stall; @@ -278,8 +256,7 @@ module VX_bank #( assign {debug_pc_st0, debug_wid_st0} = 0; end `endif - -if (DRAM_ENABLE) begin + VX_tag_access #( .BANK_ID (BANK_ID), .CACHE_ID (CACHE_ID), @@ -290,7 +267,7 @@ if (DRAM_ENABLE) begin .WORD_SIZE (WORD_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .BANK_ADDR_OFFSET (BANK_ADDR_OFFSET) - ) tag_access ( + ) tag_access ( .clk (clk), .reset (reset), @@ -314,66 +291,28 @@ if (DRAM_ENABLE) begin .writeen_in (valid_st1 && writeen_st1) ); - assign valid_st01 = valid_st1; - assign writeen_st01 = writeen_st1; - assign addr_st01 = addr_st1; - assign wsel_st01 = wsel_st1; - assign byteen_st01 = byteen_st1; - assign writeword_st01 = writeword_st1; - assign tag_st01 = tag_st1; - // redundant fills wire is_redundant_fill = is_fill_st0 && !miss_st0; // we have a miss in mshr or going to it for the current address wire mshr_pending_st0 = mshr_pending_unqual_st0 - || (valid_st1 && (miss_st1 || force_miss_st1) && (addr_st0 == addr_st1)); + || (valid_st1 && (miss_st1 || force_miss_st1) && (addr_st0 == addr_st1)); // force miss to ensure commit order when a new request has pending previous requests to same block assign force_miss_st0 = !is_mshr_st0 && !is_fill_st0 && mshr_pending_st0; - + assign writeen_unqual_st0 = (!is_fill_st0 && !miss_st0 && mem_rw_st0) - || (is_fill_st0 && !is_redundant_fill); + || (is_fill_st0 && !is_redundant_fill); wire send_fill_req_st0 = !is_fill_st0 && miss_st0 - && !(WRITE_THROUGH && mem_rw_st0); + && !(WRITE_THROUGH && mem_rw_st0); assign do_writeback_st0 = (WRITE_THROUGH && !is_fill_st0 && mem_rw_st0) - || (!WRITE_THROUGH && is_fill_st0 && dirty_st0 && !is_redundant_fill); + || (!WRITE_THROUGH && is_fill_st0 && dirty_st0 && !is_redundant_fill); assign dreq_push_unqual_st0 = send_fill_req_st0 || do_writeback_st0; - assign mshr_push_unqual_st0 = !is_fill_st0 && !(WRITE_THROUGH && mem_rw_st0); - -end else begin - - `UNUSED_VAR (mshr_pending_unqual_st0) - `UNUSED_VAR (drsq_push) - `UNUSED_VAR (dirty_st0) - `UNUSED_VAR (writeen_st1) - -`ifdef DBG_CACHE_REQ_INFO - assign debug_pc_st1 = debug_pc_st0; - assign debug_wid_st1 = debug_wid_st0; -`endif - - assign valid_st01 = valid_st0; - assign writeen_st01 = mem_rw_st0; - assign addr_st01 = addr_st0; - assign wsel_st01 = wsel_st0; - assign byteen_st01 = byteen_st0; - assign writeword_st01 = writeword_st0; - assign tag_st01 = tag_st0; - - assign miss_st0 = 0; - assign dirty_st0 = 0; - assign force_miss_st0 = 0; - assign readtag_st0 = 0; - assign do_writeback_st0 = 0; - assign writeen_unqual_st0 = mem_rw_st0; - assign dreq_push_unqual_st0 = 0; - assign mshr_push_unqual_st0 = 0; -end + assign mshr_push_unqual_st0 = !is_fill_st0 && !(WRITE_THROUGH && mem_rw_st0); VX_pipe_register #( .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_BITS) + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH + `WORD_WIDTH + `TAG_SELECT_BITS + `CACHE_LINE_WIDTH + 1 + WORD_SIZE + `REQS_BITS + `REQ_TAG_WIDTH), @@ -403,7 +342,6 @@ end assign {debug_pc_st01, debug_wid_st01} = 0; end `endif - `UNUSED_VAR (tag_st01) VX_data_access #( .BANK_ID (BANK_ID), @@ -412,7 +350,6 @@ end .CACHE_SIZE (CACHE_SIZE), .CACHE_LINE_SIZE (CACHE_LINE_SIZE), .NUM_BANKS (NUM_BANKS), - .DRAM_ENABLE (DRAM_ENABLE), .WORD_SIZE (WORD_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .WRITE_THROUGH (WRITE_THROUGH) @@ -435,12 +372,12 @@ end .dirtyb_out (dirtyb_st0), // writing - .writeen_in (valid_st01 && writeen_st01), - .waddr_in (addr_st01), + .writeen_in (valid_st1 && writeen_st1), + .waddr_in (addr_st1), .wfill_in (is_fill_st1), - .wwsel_in (wsel_st01), - .wbyteen_in (byteen_st01), - .writeword_in (writeword_st01), + .wwsel_in (wsel_st1), + .wbyteen_in (byteen_st1), + .writeword_in (writeword_st1), .filldata_in (filldata_st1) ); @@ -461,81 +398,59 @@ end wire incoming_fill_st1 = valid_st0 && is_fill_st0 && (addr_st1 == addr_st0); - if (DRAM_ENABLE) begin + wire mshr_dequeue_st1 = valid_st1 && is_mshr_st1 && !mshr_push_unqual && !pipeline_stall; - wire mshr_dequeue_st1 = valid_st1 && is_mshr_st1 && !mshr_push_unqual && !pipeline_stall; + // push a missed request as 'ready' if it was a forced miss that actually had a hit + // or the fill request for this block is comming + wire mshr_init_ready_state_st1 = !miss_st1 || incoming_fill_st1; - // push a missed request as 'ready' if it was a forced miss that actually had a hit - // or the fill request for this block is comming - wire mshr_init_ready_state_st1 = !miss_st1 || incoming_fill_st1; + VX_miss_resrv #( + .BANK_ID (BANK_ID), + .CACHE_ID (CACHE_ID), + .CORE_TAG_ID_BITS (CORE_TAG_ID_BITS), + .CACHE_LINE_SIZE (CACHE_LINE_SIZE), + .NUM_BANKS (NUM_BANKS), + .WORD_SIZE (WORD_SIZE), + .NUM_REQS (NUM_REQS), + .MSHR_SIZE (MSHR_SIZE), + .ALM_FULL (MSHR_SIZE-1), + .CORE_TAG_WIDTH (CORE_TAG_WIDTH) + ) miss_resrv ( + .clk (clk), + .reset (reset), - VX_miss_resrv #( - .BANK_ID (BANK_ID), - .CACHE_ID (CACHE_ID), - .CORE_TAG_ID_BITS (CORE_TAG_ID_BITS), - .CACHE_LINE_SIZE (CACHE_LINE_SIZE), - .NUM_BANKS (NUM_BANKS), - .WORD_SIZE (WORD_SIZE), - .NUM_REQS (NUM_REQS), - .MSHR_SIZE (MSHR_SIZE), - .ALM_FULL (MSHR_SIZE-1), - .CORE_TAG_WIDTH (CORE_TAG_WIDTH) - ) miss_resrv ( - .clk (clk), - .reset (reset), + `ifdef DBG_CACHE_REQ_INFO + .deq_debug_pc (debug_pc_st0), + .deq_debug_wid (debug_wid_st0), + .enq_debug_pc (debug_pc_st1), + .enq_debug_wid (debug_wid_st1), + `endif - `ifdef DBG_CACHE_REQ_INFO - .deq_debug_pc (debug_pc_st0), - .deq_debug_wid (debug_wid_st0), - .enq_debug_pc (debug_pc_st1), - .enq_debug_wid (debug_wid_st1), - `endif + // enqueue + .enqueue (mshr_push), + .enqueue_addr (addr_st1), + .enqueue_data ({writeword_st1, req_tid_st1, tag_st1, mem_rw_st1, byteen_st1, wsel_st1}), + .enqueue_is_mshr (is_mshr_st1), + .enqueue_as_ready (mshr_init_ready_state_st1), + .enqueue_almfull (mshr_almost_full), - // enqueue - .enqueue (mshr_push), - .enqueue_addr (addr_st1), - .enqueue_data ({writeword_st1, req_tid_st1, tag_st1, mem_rw_st1, byteen_st1, wsel_st1}), - .enqueue_is_mshr (is_mshr_st1), - .enqueue_as_ready (mshr_init_ready_state_st1), - .enqueue_almfull (mshr_almost_full), + // lookup + .lookup_ready (drsq_pop), + .lookup_addr (addr_st0), + .lookup_match (mshr_pending_unqual_st0), + + // schedule + .schedule (mshr_pop), + .schedule_valid (mshr_valid), + `UNUSED_PIN (schedule_addr), + `UNUSED_PIN (schedule_data), + .schedule_valid_next(mshr_valid_next), + .schedule_addr_next (mshr_addr_next), + .schedule_data_next ({mshr_writeword_next, mshr_tid_next, mshr_tag_next, mshr_rw_next, mshr_byteen_next, mshr_wsel_next}), - // lookup - .lookup_ready (drsq_pop), - .lookup_addr (addr_st0), - .lookup_match (mshr_pending_unqual_st0), - - // schedule - .schedule (mshr_pop), - .schedule_valid (mshr_valid), - `UNUSED_PIN (schedule_addr), - `UNUSED_PIN (schedule_data), - .schedule_valid_next(mshr_valid_next), - .schedule_addr_next (mshr_addr_next), - .schedule_data_next ({mshr_writeword_next, mshr_tid_next, mshr_tag_next, mshr_rw_next, mshr_byteen_next, mshr_wsel_next}), - - // dequeue - .dequeue (mshr_dequeue_st1) - ); - end else begin - `UNUSED_VAR (valid_st1) - `UNUSED_VAR (mshr_push) - `UNUSED_VAR (wsel_st1) - `UNUSED_VAR (writeword_st1) - `UNUSED_VAR (mem_rw_st1) - `UNUSED_VAR (byteen_st1) - `UNUSED_VAR (incoming_fill_st1) - assign mshr_almost_full = 0; - assign mshr_pending_unqual_st0 = 0; - assign mshr_valid = 0; - assign mshr_valid_next = 0; - assign mshr_addr_next = 0; - assign mshr_wsel_next = 0; - assign mshr_writeword_next = 0; - assign mshr_tid_next = 0; - assign mshr_tag_next = 0; - assign mshr_rw_next = 0; - assign mshr_byteen_next = 0; - end + // dequeue + .dequeue (mshr_dequeue_st1) + ); // Enqueue core response @@ -625,44 +540,25 @@ end assign dreq_byteen = writeback ? dreq_byteen_unqual : {CACHE_LINE_SIZE{1'b1}}; - if (DRAM_ENABLE) begin - VX_fifo_queue_xt #( - .DATAW (1 + CACHE_LINE_SIZE + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH), - .SIZE (DREQ_SIZE), - .ALM_FULL (DREQ_SIZE-1), - .FASTRAM (1) - ) dram_req_queue ( - .clk (clk), - .reset (reset), - .push (dreq_push), - .pop (dreq_pop), - .data_in ({writeback, dreq_byteen, dreq_addr, dreq_data}), - .data_out({dram_req_rw, dram_req_byteen, dram_req_addr, dram_req_data}), - .empty (dreq_empty), - .almost_full (dreq_almost_full), - `UNUSED_PIN (full), - `UNUSED_PIN (data_out_next), - `UNUSED_PIN (empty_next), - `UNUSED_PIN (size) - ); - end else begin - `UNUSED_VAR (dreq_push) - `UNUSED_VAR (dreq_pop) - `UNUSED_VAR (dreq_addr) - `UNUSED_VAR (dreq_data) - `UNUSED_VAR (dreq_byteen) - `UNUSED_VAR (readtag_st1) - `UNUSED_VAR (dirtyb_st1) - `UNUSED_VAR (readdata_st1) - `UNUSED_VAR (writeback) - `UNUSED_VAR (dram_req_ready) - assign dreq_empty = 1; - assign dreq_almost_full = 0; - assign dram_req_rw = 0; - assign dram_req_byteen = 0; - assign dram_req_addr = 0; - assign dram_req_data = 0; - end + VX_fifo_queue_xt #( + .DATAW (1 + CACHE_LINE_SIZE + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH), + .SIZE (DREQ_SIZE), + .ALM_FULL (DREQ_SIZE-1), + .FASTRAM (1) + ) dram_req_queue ( + .clk (clk), + .reset (reset), + .push (dreq_push), + .pop (dreq_pop), + .data_in ({writeback, dreq_byteen, dreq_addr, dreq_data}), + .data_out({dram_req_rw, dram_req_byteen, dram_req_addr, dram_req_data}), + .empty (dreq_empty), + .almost_full (dreq_almost_full), + `UNUSED_PIN (full), + `UNUSED_PIN (data_out_next), + `UNUSED_PIN (empty_next), + `UNUSED_PIN (size) + ); assign dram_req_valid = !dreq_empty; diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index e75490c4..c4f2dbad 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -24,10 +24,7 @@ module VX_cache #( // Core Response Queue Size parameter CRSQ_SIZE = 4, // DRAM Request Queue Size - parameter DREQ_SIZE = 4, - - // Enable dram update - parameter DRAM_ENABLE = 1, + parameter DREQ_SIZE = 4, // Enable cache writeable parameter WRITE_ENABLE = 1, @@ -129,8 +126,8 @@ module VX_cache #( .NUM_REQS (NUM_REQS), .CORE_TAG_WIDTH (CORE_TAG_WIDTH), .BANK_ADDR_OFFSET(BANK_ADDR_OFFSET), - .BUFFERED ((NUM_BANKS > 1) && DRAM_ENABLE) - ) cache_core_req_bank_sel ( + .BUFFERED (NUM_BANKS > 1) + ) core_req_bank_sel ( .clk (clk), .reset (reset), `ifdef PERF_ENABLE @@ -244,7 +241,6 @@ module VX_cache #( .DRSQ_SIZE (DRSQ_SIZE), .CRSQ_SIZE (CRSQ_SIZE), .DREQ_SIZE (DREQ_SIZE), - .DRAM_ENABLE (DRAM_ENABLE), .WRITE_ENABLE (WRITE_ENABLE), .WRITE_THROUGH (WRITE_THROUGH), .CORE_TAG_WIDTH (CORE_TAG_WIDTH), @@ -302,7 +298,7 @@ module VX_cache #( .NUM_REQS (NUM_REQS), .CORE_TAG_WIDTH (CORE_TAG_WIDTH), .CORE_TAG_ID_BITS (CORE_TAG_ID_BITS) - ) cache_core_rsp_merge ( + ) core_rsp_merge ( .clk (clk), .reset (reset), .per_bank_core_rsp_valid (per_bank_core_rsp_valid), @@ -316,41 +312,26 @@ module VX_cache #( .core_rsp_ready (core_rsp_ready) ); - if (DRAM_ENABLE) begin - wire [NUM_BANKS-1:0][(`DRAM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in; - for (genvar i = 0; i < NUM_BANKS; i++) begin - assign data_in[i] = {per_bank_dram_req_addr[i], per_bank_dram_req_rw[i], per_bank_dram_req_byteen[i], per_bank_dram_req_data[i]}; - end - - VX_stream_arbiter #( - .NUM_REQS (NUM_BANKS), - .DATAW (`DRAM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH), - .BUFFERED (1) - ) dram_req_arb ( - .clk (clk), - .reset (reset), - .valid_in (per_bank_dram_req_valid), - .data_in (data_in), - .ready_in (per_bank_dram_req_ready), - .valid_out (dram_req_valid), - .data_out ({dram_req_addr, dram_req_rw, dram_req_byteen, dram_req_data}), - .ready_out (dram_req_ready) - ); - end else begin - `UNUSED_VAR (per_bank_dram_req_valid) - `UNUSED_VAR (per_bank_dram_req_rw) - `UNUSED_VAR (per_bank_dram_req_byteen) - `UNUSED_VAR (per_bank_dram_req_addr) - `UNUSED_VAR (per_bank_dram_req_data) - assign per_bank_dram_req_ready = 0; - assign dram_req_valid = 0; - assign dram_req_rw = 0; - assign dram_req_byteen = 0; - assign dram_req_addr = 0; - assign dram_req_data = 0; - `UNUSED_VAR (dram_req_ready) + wire [NUM_BANKS-1:0][(`DRAM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in; + for (genvar i = 0; i < NUM_BANKS; i++) begin + assign data_in[i] = {per_bank_dram_req_addr[i], per_bank_dram_req_rw[i], per_bank_dram_req_byteen[i], per_bank_dram_req_data[i]}; end + VX_stream_arbiter #( + .NUM_REQS (NUM_BANKS), + .DATAW (`DRAM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH), + .BUFFERED (1) + ) dram_req_arb ( + .clk (clk), + .reset (reset), + .valid_in (per_bank_dram_req_valid), + .data_in (data_in), + .ready_in (per_bank_dram_req_ready), + .valid_out (dram_req_valid), + .data_out ({dram_req_addr, dram_req_rw, dram_req_byteen, dram_req_data}), + .ready_out (dram_req_ready) + ); + `ifdef PERF_ENABLE // per cycle: core_reads, core_writes reg [($clog2(NUM_REQS+1)-1):0] perf_core_reads_per_cycle, perf_core_writes_per_cycle; diff --git a/hw/rtl/cache/VX_cache_core_req_bank_sel.v b/hw/rtl/cache/VX_cache_core_req_bank_sel.v index 29a9b75b..d9dc24cd 100644 --- a/hw/rtl/cache/VX_cache_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_cache_core_req_bank_sel.v @@ -112,7 +112,7 @@ module VX_cache_core_req_bank_sel #( end for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign per_bank_core_req_stall[i] = ~per_bank_core_req_ready[i] & per_bank_core_req_valid[i]; + assign per_bank_core_req_stall[i] = ~per_bank_core_req_ready[i] && (!BUFFERED || per_bank_core_req_valid[i]); VX_pipe_register #( .DATAW (1 + `REQS_BITS + 1 + WORD_SIZE + `WORD_ADDR_WIDTH + CORE_TAG_WIDTH + `WORD_WIDTH), .RESETW (1), diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index 8b410bdb..47cfb5ec 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -50,7 +50,6 @@ module VX_cache_core_rsp_merge #( for (integer i = 0; i < NUM_BANKS; i++) begin if (per_bank_core_rsp_valid[i]) begin core_rsp_tag_unqual = per_bank_core_rsp_tag[i]; - break; end end diff --git a/hw/rtl/cache/VX_data_access.v b/hw/rtl/cache/VX_data_access.v index 5c9c79d8..0ac55f7c 100644 --- a/hw/rtl/cache/VX_data_access.v +++ b/hw/rtl/cache/VX_data_access.v @@ -2,8 +2,7 @@ module VX_data_access #( parameter CACHE_ID = 0, - parameter BANK_ID = 0, - + parameter BANK_ID = 0, // Size of cache in bytes parameter CACHE_SIZE = 1, // Size of line inside a bank in bytes @@ -11,17 +10,11 @@ module VX_data_access #( // Number of banks parameter NUM_BANKS = 1, // Size of a word in bytes - parameter WORD_SIZE = 1, - - // Enable dram update - parameter DRAM_ENABLE = 1, - + parameter WORD_SIZE = 1, // Enable cache writeable parameter WRITE_ENABLE = 1, - // Enable write-through parameter WRITE_THROUGH = 1, - // size of tag id in core request tag parameter CORE_TAG_ID_BITS = 0 ) ( @@ -111,7 +104,7 @@ module VX_data_access #( assign byte_enable = wfill_in ? {CACHE_LINE_SIZE{1'b1}} : wbyteen_qual; assign write_data = wfill_in ? filldata_in : writedata_qual; - wire rw_hazard = DRAM_ENABLE && (raddr == waddr) && writeen_in; + wire rw_hazard = (raddr == waddr) && writeen_in; if (`WORD_SELECT_BITS != 0) begin for (genvar i = 0; i < `WORDS_PER_LINE; i++) begin diff --git a/hw/rtl/cache/VX_shared_mem.v b/hw/rtl/cache/VX_shared_mem.v new file mode 100644 index 00000000..22428ec9 --- /dev/null +++ b/hw/rtl/cache/VX_shared_mem.v @@ -0,0 +1,259 @@ +`include "VX_cache_config.vh" + +module VX_shared_mem #( + parameter CACHE_ID = 0, + + // Size of cache in bytes + parameter CACHE_SIZE = 16384, + // Number of banks + parameter NUM_BANKS = 4, + // Size of a word in bytes + parameter WORD_SIZE = 4, + // Number of Word requests per cycle + parameter NUM_REQS = NUM_BANKS, + + // Core Request Queue Size + parameter CREQ_SIZE = 4, + + // Core Response Queue Size + parameter CRSQ_SIZE = 4, + + // core request tag size + parameter CORE_TAG_WIDTH = 1, + + // size of tag id in core request tag + parameter CORE_TAG_ID_BITS = 0, + + // bank offset from beginning of index range + parameter BANK_ADDR_OFFSET = 0 + ) ( + `SCOPE_IO_VX_cache + + input wire clk, + input wire reset, + + // PERF +`ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_if, +`endif + + // Core request + input wire [NUM_REQS-1:0] core_req_valid, + input wire [NUM_REQS-1:0] core_req_rw, + input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, + input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, + input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data, + input wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag, + output wire [NUM_REQS-1:0] core_req_ready, + + // Core response + output wire [NUM_REQS-1:0] core_rsp_valid, + output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data, + output wire [CORE_TAG_WIDTH-1:0] core_rsp_tag, + input wire core_rsp_ready +); + + `STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid value")) + + localparam CACHE_LINE_SIZE = WORD_SIZE; + +`ifdef DBG_CACHE_REQ_INFO + /* verilator lint_off UNUSED */ + wire [31:0] debug_pc_st0; + wire [`NW_BITS-1:0] debug_wid_st0; + /* verilator lint_on UNUSED */ +`endif + + wire [NUM_BANKS-1:0] per_bank_core_req_valid_unqual; + wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid_unqual; + wire [NUM_BANKS-1:0] per_bank_core_req_rw_unqual; + wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen_unqual; + wire [NUM_BANKS-1:0][`WORD_ADDR_WIDTH-1:0] per_bank_core_req_addr_unqual; + wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data_unqual; + wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag_unqual; + wire [NUM_BANKS-1:0] per_bank_core_req_ready_unqual; + + VX_cache_core_req_bank_sel #( + .CACHE_LINE_SIZE (WORD_SIZE), + .NUM_BANKS (NUM_BANKS), + .WORD_SIZE (WORD_SIZE), + .NUM_REQS (NUM_REQS), + .CORE_TAG_WIDTH (CORE_TAG_WIDTH), + .BANK_ADDR_OFFSET(BANK_ADDR_OFFSET), + .BUFFERED (0) + ) core_req_bank_sel ( + .clk (clk), + .reset (reset), + `ifdef PERF_ENABLE + .bank_stalls(perf_cache_if.bank_stalls), + `else + `UNUSED_PIN (bank_stalls), + `endif + .core_req_valid (core_req_valid), + .core_req_rw (core_req_rw), + .core_req_byteen(core_req_byteen), + .core_req_addr (core_req_addr), + .core_req_data (core_req_data), + .core_req_tag (core_req_tag), + .core_req_ready (core_req_ready), + .per_bank_core_req_valid (per_bank_core_req_valid_unqual), + .per_bank_core_req_tid (per_bank_core_req_tid_unqual), + .per_bank_core_req_rw (per_bank_core_req_rw_unqual), + .per_bank_core_req_byteen(per_bank_core_req_byteen_unqual), + .per_bank_core_req_addr (per_bank_core_req_addr_unqual), + .per_bank_core_req_tag (per_bank_core_req_tag_unqual), + .per_bank_core_req_data (per_bank_core_req_data_unqual), + .per_bank_core_req_ready (per_bank_core_req_ready_unqual) + ); + + `UNUSED_VAR (per_bank_core_req_tag_unqual) + `UNUSED_VAR (per_bank_core_req_rw_unqual) + + wire [NUM_BANKS-1:0] per_bank_core_req_valid; + wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid; + wire [NUM_REQS-1:0] per_bank_core_req_rw; + wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen; + wire [NUM_BANKS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr; + wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data; + wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag; + + wire creq_push, creq_pop, creq_empty, creq_full; + wire crsq_full; + + assign creq_push = (| core_req_valid) && !creq_full; + assign creq_pop = ~creq_empty && ~crsq_full; + + assign per_bank_core_req_ready_unqual = {NUM_BANKS{~creq_full}}; + + wire [NUM_REQS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr_qual; + `UNUSED_VAR (per_bank_core_req_addr_unqual) + for (genvar i = 0; i < NUM_REQS; i++) begin + wire [`LINE_ADDR_WIDTH-1:0] tmp = `LINE_SELECT_ADDRX(per_bank_core_req_addr_unqual[i]); + assign per_bank_core_req_addr_qual[i] = tmp[`LINE_SELECT_BITS-1:0]; + `UNUSED_VAR (tmp) + end + + VX_fifo_queue #( + .DATAW (NUM_BANKS * (1 + `REQS_BITS + 1 + WORD_SIZE + `LINE_SELECT_BITS + `WORD_WIDTH + CORE_TAG_WIDTH)), + .SIZE (CREQ_SIZE), + .FASTRAM (1) + ) core_req_queue ( + .clk (clk), + .reset (reset), + .push (creq_push), + .pop (creq_pop), + .data_in ({per_bank_core_req_valid_unqual, + per_bank_core_req_tid_unqual, + per_bank_core_req_rw_unqual, + per_bank_core_req_byteen_unqual, + per_bank_core_req_addr_qual, + per_bank_core_req_data_unqual, + per_bank_core_req_tag_unqual}), + .data_out({per_bank_core_req_valid, + per_bank_core_req_tid, + per_bank_core_req_rw, + per_bank_core_req_byteen, + per_bank_core_req_addr, + per_bank_core_req_data, + per_bank_core_req_tag}), + .empty (creq_empty), + .full (creq_full), + `UNUSED_PIN (size) + ); + + wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data; + + for (genvar i = 0; i < NUM_BANKS; i++) begin + VX_sp_ram #( + .DATAW(`WORD_WIDTH), + .SIZE(`LINES_PER_BANK), + .BYTEENW(WORD_SIZE), + .RWCHECK(1) + ) data ( + .clk(clk), + .addr(per_bank_core_req_addr[i]), + .wren(per_bank_core_req_valid[i] && per_bank_core_req_rw[i] && ~crsq_full), + .byteen(per_bank_core_req_byteen[i]), + .rden(1'b1), + .din(per_bank_core_req_data[i]), + .dout(per_bank_core_rsp_data[i]) + ); + end + + reg [NUM_REQS-1:0] core_rsp_valid_unqual; + reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual; + reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; + + always @(*) begin + core_rsp_valid_unqual = 0; + core_rsp_data_unqual = 'x; + core_rsp_tag_unqual = 'x; + for (integer i = 0; i < NUM_BANKS; i++) begin + if (per_bank_core_req_valid[i]) begin + core_rsp_valid_unqual[per_bank_core_req_tid[i]] = 1; + core_rsp_data_unqual[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i]; + core_rsp_tag_unqual = per_bank_core_req_tag[i]; + end + end + end + +`ifdef DBG_CACHE_REQ_INFO + if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin + assign {debug_pc_st0, debug_wid_st0} = core_rsp_tag_unqual[CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS]; + end else begin + assign {debug_pc_st0, debug_wid_st0} = 0; + end +`endif + + wire [NUM_REQS-1:0] core_rsp_valid_tmask; + wire crsq_push, crsq_pop, crsq_empty; + + wire core_rsp_rw = | (per_bank_core_req_valid & per_bank_core_req_rw); + + assign crsq_push = ~creq_empty && ~core_rsp_rw && ~crsq_full; + assign crsq_pop = ~crsq_empty && core_rsp_ready; + + VX_fifo_queue #( + .DATAW (NUM_BANKS * (1 + `WORD_WIDTH) + CORE_TAG_WIDTH), + .SIZE (CRSQ_SIZE), + .BUFFERED (1), + .FASTRAM (1) + ) core_rsp_queue ( + .clk (clk), + .reset (reset), + .push (crsq_push), + .pop (crsq_pop), + .data_in ({core_rsp_valid_unqual, core_rsp_data_unqual, core_rsp_tag_unqual}), + .data_out({core_rsp_valid_tmask, core_rsp_data, core_rsp_tag}), + .empty (crsq_empty), + .full (crsq_full), + `UNUSED_PIN (size) + ); + + assign core_rsp_valid = core_rsp_valid_tmask & {NUM_REQS{~crsq_empty}}; + +`ifdef DBG_PRINT_CACHE_BANK + always @(posedge clk) begin + if (crsq_full) begin + $display("%t: cache%0d pipeline-stall", $time, CACHE_ID); + end + if (creq_pop) begin + if (core_rsp_rw) + $display("%t: cache%0d core-wr-req: tmask=%0b, addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, per_bank_core_req_valid, per_bank_core_req_addr, per_bank_core_req_tag, per_bank_core_req_byteen, per_bank_core_req_data, debug_wid_st0, debug_pc_st0); + else + $display("%t: cache%0d core-rd-req: tmask=%0b, addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, per_bank_core_req_valid, per_bank_core_req_addr, per_bank_core_req_tag, per_bank_core_req_byteen, per_bank_core_rsp_data, debug_wid_st0, debug_pc_st0); + end + end +`endif + +`ifdef PERF_ENABLE + assign perf_cache_if.reads = '0; + assign perf_cache_if.writes = '0; + assign perf_cache_if.read_misses = '0; + assign perf_cache_if.write_misses = '0; + assign perf_cache_if.mshr_stalls = '0; + assign perf_cache_if.pipe_stalls = '0; + assign perf_cache_if.crsp_stalls = '0; +`endif + +endmodule