diff --git a/ci/blackbox.sh b/ci/blackbox.sh index 2dd2a905..4a8ca943 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -9,6 +9,9 @@ show_usage() echo "Usage: [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=rtlsim|vlsim] [--debug] [--scope] [--perf] [--app=vecadd|sgemm|basic|demo|dogfood] [--args=] [--help]]" } +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +VORTEX_HOME=$SCRIPT_DIR/.. + DRIVER=vlsim APP=sgemm CLUSTERS=1 @@ -62,6 +65,7 @@ case $i in ;; --scope) SCOPE=1 + CORES=1 shift ;; --perf) @@ -86,19 +90,19 @@ done case $DRIVER in rtlsim) - DRIVER_PATH=driver/rtlsim + DRIVER_PATH=$VORTEX_HOME/driver/rtlsim DRIVER_EXTRA= ;; vlsim) - DRIVER_PATH=driver/opae + DRIVER_PATH=$VORTEX_HOME/driver/opae DRIVER_EXTRA=vlsim ;; asesim) - DRIVER_PATH=driver/opae + DRIVER_PATH=$VORTEX_HOME/driver/opae DRIVER_EXTRA=asesim ;; fpga) - DRIVER_PATH=driver/opae + DRIVER_PATH=$VORTEX_HOME/driver/opae DRIVER_EXTRA=fpga ;; *) @@ -109,19 +113,19 @@ esac case $APP in sgemm) - APP_PATH=benchmarks/opencl/sgemm + APP_PATH=$VORTEX_HOME/benchmarks/opencl/sgemm ;; vecadd) - APP_PATH=benchmarks/opencl/vacadd + APP_PATH=$VORTEX_HOME/benchmarks/opencl/vacadd ;; basic) - APP_PATH=driver/tests/basic + APP_PATH=$VORTEX_HOME/driver/tests/basic ;; demo) - APP_PATH=driver/tests/demo + APP_PATH=$VORTEX_HOME/driver/tests/demo ;; dogfood) - APP_PATH=driver/tests/dogfood + APP_PATH=$VORTEX_HOME/driver/tests/dogfood ;; *) echo "invalid app: $APP" diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index 174b601b..584ce270 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -172,32 +172,32 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // alu_stall uint64_t alu_stalls_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_ALU_ST, CSR_MPM_ALU_ST_H, &alu_stalls_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: alu stalls=%ld\n", core_id, alu_stalls_per_core); + 
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core); alu_stalls += alu_stalls_per_core; // lsu_stall uint64_t lsu_stalls_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_LSU_ST, CSR_MPM_LSU_ST_H, &lsu_stalls_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu stalls=%ld\n", core_id, lsu_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core); lsu_stalls += lsu_stalls_per_core; // csr_stall uint64_t csr_stalls_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_CSR_ST, CSR_MPM_CSR_ST_H, &csr_stalls_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: csr stalls=%ld\n", core_id, csr_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: csr unit stalls=%ld\n", core_id, csr_stalls_per_core); csr_stalls += csr_stalls_per_core; // mul_stall uint64_t mul_stalls_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_MUL_ST, CSR_MPM_MUL_ST_H, &mul_stalls_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: mul stalls=%ld\n", core_id, mul_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: mul unit stalls=%ld\n", core_id, mul_stalls_per_core); mul_stalls += mul_stalls_per_core; // fpu_stall uint64_t fpu_stalls_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_FPU_ST, CSR_MPM_FPU_ST_H, &fpu_stalls_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu stalls=%ld\n", core_id, fpu_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core); fpu_stalls += fpu_stalls_per_core; // gpu_stall uint64_t gpu_stalls_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_GPU_ST, CSR_MPM_GPU_ST_H, &gpu_stalls_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu stalls=%ld\n", core_id, gpu_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu unit stalls=%ld\n", core_id, 
gpu_stalls_per_core); gpu_stalls += gpu_stalls_per_core; // PERF: Icache @@ -300,12 +300,12 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { fprintf(stream, "PERF: icache stalls=%ld\n", icache_stalls); fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); - fprintf(stream, "PERF: alu stalls=%ld\n", alu_stalls); - fprintf(stream, "PERF: lsu stalls=%ld\n", lsu_stalls); - fprintf(stream, "PERF: csr stalls=%ld\n", csr_stalls); - fprintf(stream, "PERF: mul stalls=%ld\n", mul_stalls); - fprintf(stream, "PERF: fpu stalls=%ld\n", fpu_stalls); - fprintf(stream, "PERF: gpu stalls=%ld\n", gpu_stalls); + fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls); + fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls); + fprintf(stream, "PERF: csr unit stalls=%ld\n", csr_stalls); + fprintf(stream, "PERF: mul unit stalls=%ld\n", mul_stalls); + fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls); + fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls); fprintf(stream, "PERF: icache reads=%ld\n", icache_reads); fprintf(stream, "PERF: icache read misses=%ld\n", icache_read_misses); fprintf(stream, "PERF: icache reponse stalls=%ld\n", icache_rsp_stalls); diff --git a/driver/opae/vlsim/opae_sim.cpp b/driver/opae/vlsim/opae_sim.cpp index 4b8cb468..ac92999a 100644 --- a/driver/opae/vlsim/opae_sim.cpp +++ b/driver/opae/vlsim/opae_sim.cpp @@ -9,7 +9,7 @@ #define CCI_WQ_SIZE 16 #define ENABLE_DRAM_STALLS -#define DRAM_LATENCY 100 +#define DRAM_LATENCY 24 #define DRAM_RQ_SIZE 16 #define DRAM_STALLS_MODULO 16 diff --git a/hw/opae/README b/hw/opae/README index da93a61e..483a93fa 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -66,7 +66,7 @@ make ase ./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd # modify "vsim_run.tcl" to dump VCD trace -vcd file vortex.vcd +vcd file trace.vcd vcd add -r /*/Vortex/hw/rtl/* run -all @@ -104,8 +104,11 @@ lsof +D build_ase_1c make 
-C pipeline clean && make -C pipeline > pipeline/build.log 2>&1 & make -C cache clean && make -C cache > cache/build.log 2>&1 & make -C core clean && make -C core > core/build.log 2>&1 & +make -C core8 clean && make -C core8 > core8/build.log 2>&1 & make -C vortex clean && make -C vortex > vortex/build.log 2>&1 & make -C top clean && make -C top > top/build.log 2>&1 & +make -C top1 clean && make -C top1 > top1/build.log 2>&1 & +make -C top8 clean && make -C top8 > top8/build.log 2>&1 & # How to calculate the maximum operating frequency? 200 Mhz -> period = 1/200x10^6 = 5ns diff --git a/hw/opae/run_ase.sh b/hw/opae/run_ase.sh index 9e3b5d6c..866e1490 100755 --- a/hw/opae/run_ase.sh +++ b/hw/opae/run_ase.sh @@ -1,6 +1,6 @@ #!/bin/bash -SCRIPT_DIR=$PWD +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" BUILD_DIR=$1 diff --git a/hw/opae/sources_1c.txt b/hw/opae/sources_1c.txt index ad731859..4671c87a 100644 --- a/hw/opae/sources_1c.txt +++ b/hw/opae/sources_1c.txt @@ -4,21 +4,21 @@ +define+QUARTUS +define+FPU_FAST #+define+SCOPE -+define+PERF_ENABLE +#+define+PERF_ENABLE -+define+DBG_PRINT_CORE_ICACHE -+define+DBG_PRINT_CORE_DCACHE -+define+DBG_PRINT_CACHE_BANK -+define+DBG_PRINT_CACHE_SNP -+define+DBG_PRINT_CACHE_MSRQ -+define+DBG_PRINT_CACHE_TAG -+define+DBG_PRINT_CACHE_DATA -+define+DBG_PRINT_DRAM -+define+DBG_PRINT_PIPELINE -+define+DBG_PRINT_OPAE -+define+DBG_PRINT_AVS -+define+DBG_PRINT_SCOPE -+define+DBG_CACHE_REQ_INFO +#+define+DBG_PRINT_CORE_ICACHE +#+define+DBG_PRINT_CORE_DCACHE +#+define+DBG_PRINT_CACHE_BANK +#+define+DBG_PRINT_CACHE_SNP +#+define+DBG_PRINT_CACHE_MSRQ +#+define+DBG_PRINT_CACHE_TAG +#+define+DBG_PRINT_CACHE_DATA +#+define+DBG_PRINT_DRAM +#+define+DBG_PRINT_PIPELINE +#+define+DBG_PRINT_OPAE +#+define+DBG_PRINT_AVS +#+define+DBG_PRINT_SCOPE +#+define+DBG_CACHE_REQ_INFO vortex_afu.json QI:vortex_afu.qsf diff --git a/hw/opae/sources_2c.txt b/hw/opae/sources_2c.txt index 460daddd..ef988ec6 100644 --- 
a/hw/opae/sources_2c.txt +++ b/hw/opae/sources_2c.txt @@ -1,5 +1,5 @@ +define+NUM_CORES=2 -+define+L2_ENABLE=0 + +define+SYNTHESIS +define+QUARTUS +define+FPU_FAST diff --git a/hw/opae/sources_4c.txt b/hw/opae/sources_4c.txt index 918020cb..fa3b1aa2 100644 --- a/hw/opae/sources_4c.txt +++ b/hw/opae/sources_4c.txt @@ -1,5 +1,5 @@ +define+NUM_CORES=4 -+define+L2_ENABLE=1 + +define+SYNTHESIS +define+QUARTUS +define+FPU_FAST diff --git a/hw/opae/vortex_afu.qsf b/hw/opae/vortex_afu.qsf index c24f3549..5c79c605 100644 --- a/hw/opae/vortex_afu.qsf +++ b/hw/opae/vortex_afu.qsf @@ -1,5 +1,7 @@ # Analysis & Synthesis Assignments + +set_global_assignment -name NUM_PARALLEL_PROCESSORS ALL set_global_assignment -name VERILOG_INPUT_VERSION SYSTEMVERILOG_2009 set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS ON set_global_assignment -name VERILOG_MACRO QUARTUS @@ -7,7 +9,14 @@ set_global_assignment -name VERILOG_MACRO SYNTHESIS set_global_assignment -name VERILOG_MACRO NDEBUG set_global_assignment -name MESSAGE_DISABLE 16818 set_global_assignment -name VERILOG_MACRO FPU_FAST +set_global_assignment -name TIMEQUEST_DO_REPORT_TIMING ON +set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED +set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE" +set_global_assignment -name FITTER_EFFORT "STANDARD FIT" +set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS" +set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM +set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0 set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100 set_global_assignment -name POWER_BOARD_THERMAL_MODEL "NONE (CONSERVATIVE)" @@ -17,10 +26,4 @@ set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON set_global_assignment -name POWER_USE_TA_VALUE 65 -set_global_assignment -name 
SEED 1 -set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON -set_global_assignment -name FITTER_EFFORT "STANDARD FIT" -set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS" -set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED -set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM -set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE" \ No newline at end of file +set_global_assignment -name SEED 1 \ No newline at end of file diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 71ffb1c9..61a6a978 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -181,7 +181,9 @@ module VX_cluster #( .NUM_REQS (`NUM_CORES), .WORD_SIZE (4), .TAG_IN_WIDTH (`DCORE_TAG_WIDTH), - .TAG_OUT_WIDTH (`L2CORE_TAG_WIDTH) + .TAG_OUT_WIDTH (`L2CORE_TAG_WIDTH), + .BUFFERED_REQ (`NUM_CORES >= 4), + .BUFFERED_RSP (1) ) io_arb ( .clk (clk), .reset (reset), @@ -218,9 +220,11 @@ module VX_cluster #( ); VX_csr_io_arb #( - .NUM_REQS (`NUM_CORES), - .DATA_WIDTH (32), - .ADDR_WIDTH (12) + .NUM_REQS (`NUM_CORES), + .DATA_WIDTH (32), + .ADDR_WIDTH (12), + .BUFFERED_REQ (1), + .BUFFERED_RSP (`NUM_CORES >= 4) ) csr_io_arb ( .clk (clk), .reset (reset), @@ -268,7 +272,8 @@ module VX_cluster #( .DST_ADDR_WIDTH (`DDRAM_ADDR_WIDTH), .SREQ_SIZE (`L2SREQ_SIZE), .TAG_IN_WIDTH (`L2SNP_TAG_WIDTH), - .TAG_OUT_WIDTH (`DSNP_TAG_WIDTH) + .TAG_OUT_WIDTH (`DSNP_TAG_WIDTH), + .BUFFERED (`NUM_CORES >= 4) ) snp_forwarder ( .clk (clk), .reset (reset), @@ -301,49 +306,6 @@ module VX_cluster #( VX_perf_cache_if perf_l2cache_if(); `endif - wire [`NUM_CORES-1:0] per_core_dram_req_valid_qual; - wire [`NUM_CORES-1:0] per_core_dram_req_rw_qual; - wire [`NUM_CORES-1:0][`DDRAM_BYTEEN_WIDTH-1:0] per_core_dram_req_byteen_qual; - wire [`NUM_CORES-1:0][`DDRAM_ADDR_WIDTH-1:0] per_core_dram_req_addr_qual; - wire [`NUM_CORES-1:0][`DDRAM_LINE_WIDTH-1:0] per_core_dram_req_data_qual; - wire [`NUM_CORES-1:0][`XDRAM_TAG_WIDTH-1:0] per_core_dram_req_tag_qual; - wire [`NUM_CORES-1:0] 
per_core_dram_req_ready_qual; - - wire [`NUM_CORES-1:0] per_core_dram_rsp_valid_unqual; - wire [`NUM_CORES-1:0][`DDRAM_LINE_WIDTH-1:0] per_core_dram_rsp_data_unqual; - wire [`NUM_CORES-1:0][`XDRAM_TAG_WIDTH-1:0] per_core_dram_rsp_tag_unqual; - wire [`NUM_CORES-1:0] per_core_dram_rsp_ready_unqual; - - for (genvar i = 0; i < `NUM_CORES; i++) begin - VX_skid_buffer #( - .DATAW (1 + `DDRAM_BYTEEN_WIDTH + `DDRAM_ADDR_WIDTH + `DDRAM_LINE_WIDTH + `XDRAM_TAG_WIDTH), - .PASSTHRU (`NUM_CORES < 4) - ) core_req_buffer ( - .clk (clk), - .reset (reset), - .valid_in (per_core_dram_req_valid[i]), - .data_in ({per_core_dram_req_rw[i], per_core_dram_req_byteen[i], per_core_dram_req_addr[i], per_core_dram_req_data[i], per_core_dram_req_tag[i]}), - .ready_in (per_core_dram_req_ready[i]), - .valid_out (per_core_dram_req_valid_qual[i]), - .data_out ({per_core_dram_req_rw_qual[i], per_core_dram_req_byteen_qual[i], per_core_dram_req_addr_qual[i], per_core_dram_req_data_qual[i], per_core_dram_req_tag_qual[i]}), - .ready_out (per_core_dram_req_ready_qual[i]) - ); - - VX_skid_buffer #( - .DATAW (`DDRAM_LINE_WIDTH + `XDRAM_TAG_WIDTH), - .PASSTHRU (1) - ) core_rsp_buffer ( - .clk (clk), - .reset (reset), - .valid_in (per_core_dram_rsp_valid_unqual[i]), - .data_in ({per_core_dram_rsp_data_unqual[i], per_core_dram_rsp_tag_unqual[i]}), - .ready_in (per_core_dram_rsp_ready_unqual[i]), - .valid_out (per_core_dram_rsp_valid[i]), - .data_out ({per_core_dram_rsp_data[i], per_core_dram_rsp_tag[i]}), - .ready_out (per_core_dram_rsp_ready[i]) - ); - end - VX_cache #( .CACHE_ID (`L2CACHE_ID), .CACHE_SIZE (`L2CACHE_SIZE), @@ -376,19 +338,19 @@ module VX_cluster #( `endif // Core request - .core_req_valid (per_core_dram_req_valid_qual), - .core_req_rw (per_core_dram_req_rw_qual), - .core_req_byteen (per_core_dram_req_byteen_qual), - .core_req_addr (per_core_dram_req_addr_qual), - .core_req_data (per_core_dram_req_data_qual), - .core_req_tag (per_core_dram_req_tag_qual), - .core_req_ready 
(per_core_dram_req_ready_qual), + .core_req_valid (per_core_dram_req_valid), + .core_req_rw (per_core_dram_req_rw), + .core_req_byteen (per_core_dram_req_byteen), + .core_req_addr (per_core_dram_req_addr), + .core_req_data (per_core_dram_req_data), + .core_req_tag (per_core_dram_req_tag), + .core_req_ready (per_core_dram_req_ready), // Core response - .core_rsp_valid (per_core_dram_rsp_valid_unqual), - .core_rsp_data (per_core_dram_rsp_data_unqual), - .core_rsp_tag (per_core_dram_rsp_tag_unqual), - .core_rsp_ready (per_core_dram_rsp_ready_unqual), + .core_rsp_valid (per_core_dram_rsp_valid), + .core_rsp_data (per_core_dram_rsp_data), + .core_rsp_tag (per_core_dram_rsp_tag), + .core_rsp_ready (per_core_dram_rsp_ready), // DRAM request .dram_req_valid (dram_req_valid), @@ -427,7 +389,9 @@ module VX_cluster #( .NUM_REQS (`NUM_CORES), .DATA_WIDTH (`L2DRAM_LINE_WIDTH), .TAG_IN_WIDTH (`XDRAM_TAG_WIDTH), - .TAG_OUT_WIDTH (`L2DRAM_TAG_WIDTH) + .TAG_OUT_WIDTH (`L2DRAM_TAG_WIDTH), + .BUFFERED_REQ (`NUM_CORES >= 4), + .BUFFERED_RSP (1) ) dram_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 984b0bc0..a1b13b38 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -24,11 +24,15 @@ `endif `ifndef L2_ENABLE -`define L2_ENABLE (`NUM_CORES >= 4) +`define L2_ENABLE 0 `endif `ifndef L3_ENABLE -`define L3_ENABLE (`NUM_CLUSTERS >= 4) +`define L3_ENABLE 0 +`endif + +`ifndef SM_ENABLE +`define SM_ENABLE 0 `endif `ifndef GLOBAL_BLOCK_SIZE @@ -253,7 +257,7 @@ // Size of cache in bytes `ifndef ICACHE_SIZE -`define ICACHE_SIZE 4096 +`define ICACHE_SIZE 2048 `endif // Core Request Queue Size @@ -285,7 +289,7 @@ // Size of cache in bytes `ifndef DCACHE_SIZE -`define DCACHE_SIZE 8192 +`define DCACHE_SIZE 4096 `endif // Number of banks @@ -332,7 +336,7 @@ // Size of cache in bytes `ifndef SMEM_SIZE -`define SMEM_SIZE 4096 +`define SMEM_SIZE 2048 `endif // Number of banks diff --git a/hw/rtl/VX_csr_arb.v b/hw/rtl/VX_csr_arb.v index 
2b57df24..f90270cb 100644 --- a/hw/rtl/VX_csr_arb.v +++ b/hw/rtl/VX_csr_arb.v @@ -43,8 +43,19 @@ module VX_csr_arb ( assign csr_io_req_if.ready = csr_pipe_req_if.ready && !csr_core_req_if.valid; // responses - assign csr_io_rsp_if.valid = csr_pipe_rsp_if.valid & select_io_rsp; - assign csr_io_rsp_if.data = csr_pipe_rsp_if.data[0]; + wire csr_io_rsp_ready; + VX_skid_buffer #( + .DATAW (32) + ) csr_io_out_buffer ( + .clk (clk), + .reset (reset), + .valid_in (csr_pipe_rsp_if.valid & select_io_rsp), + .data_in (csr_pipe_rsp_if.data[0]), + .ready_in (csr_io_rsp_ready), + .valid_out (csr_io_rsp_if.valid), + .data_out (csr_io_rsp_if.data), + .ready_out (csr_io_rsp_if.ready) + ); assign csr_commit_if.valid = csr_pipe_rsp_if.valid & ~select_io_rsp; assign csr_commit_if.wid = csr_pipe_rsp_if.wid; @@ -54,6 +65,6 @@ module VX_csr_arb ( assign csr_commit_if.wb = csr_pipe_rsp_if.wb; assign csr_commit_if.data = csr_pipe_rsp_if.data; - assign csr_pipe_rsp_if.ready = select_io_rsp ? csr_io_rsp_if.ready : csr_commit_if.ready; + assign csr_pipe_rsp_if.ready = select_io_rsp ? 
csr_io_rsp_ready : csr_commit_if.ready; endmodule diff --git a/hw/rtl/VX_csr_io_arb.v b/hw/rtl/VX_csr_io_arb.v index d596ee9b..b70ff3ad 100644 --- a/hw/rtl/VX_csr_io_arb.v +++ b/hw/rtl/VX_csr_io_arb.v @@ -1,24 +1,26 @@ `include "VX_define.vh" module VX_csr_io_arb #( - parameter NUM_REQS = 1, - parameter DATA_WIDTH = 1, + parameter NUM_REQS = 1, + parameter DATA_WIDTH = 1, + parameter BUFFERED_REQ = 0, + parameter BUFFERED_RSP = 0, parameter DATA_SIZE = (DATA_WIDTH / 8), parameter ADDR_WIDTH = 32 - `CLOG2(DATA_SIZE), parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS) ) ( - input wire clk, - input wire reset, + input wire clk, + input wire reset, - input wire [LOG_NUM_REQS-1:0] request_id, + input wire [LOG_NUM_REQS-1:0] request_id, // input requests - input wire req_valid_in, - input wire [ADDR_WIDTH-1:0] req_addr_in, - input wire req_rw_in, - input wire [DATA_WIDTH-1:0] req_data_in, - output wire req_ready_in, + input wire req_valid_in, + input wire [ADDR_WIDTH-1:0] req_addr_in, + input wire req_rw_in, + input wire [DATA_WIDTH-1:0] req_data_in, + output wire req_ready_in, // output request output wire [NUM_REQS-1:0] req_valid_out, @@ -33,40 +35,38 @@ module VX_csr_io_arb #( output wire [NUM_REQS-1:0] rsp_ready_in, // output response - output wire rsp_valid_out, - output wire [DATA_WIDTH-1:0] rsp_data_out, - input wire rsp_ready_out + output wire rsp_valid_out, + output wire [DATA_WIDTH-1:0] rsp_data_out, + input wire rsp_ready_out ); - if (NUM_REQS > 1) begin + localparam REQ_DATAW = ADDR_WIDTH + 1 + DATA_WIDTH; + localparam RSP_DATAW = DATA_WIDTH; - for (genvar i = 0; i < NUM_REQS; i++) begin - assign req_valid_out[i] = req_valid_in && (request_id == LOG_NUM_REQS'(i)); - assign req_addr_out[i] = req_addr_in; - assign req_rw_out[i] = req_rw_in; - assign req_data_out[i] = req_data_in; - end - - assign req_ready_in = req_ready_out[request_id]; - - end else begin - - `UNUSED_VAR (request_id) - - assign req_valid_out = req_valid_in; - assign req_addr_out = req_addr_in; - 
assign req_rw_out = req_rw_in; - assign req_data_out = req_data_in; - assign req_ready_in = req_ready_out; - + wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_merged_data_out; + for (genvar i = 0; i < NUM_REQS; i++) begin + assign {req_addr_out[i], req_rw_out[i], req_data_out[i]} = req_merged_data_out[i]; end - /////////////////////////////////////////////////////////////////////// + VX_stream_demux #( + .NUM_REQS (NUM_REQS), + .DATAW (REQ_DATAW), + .BUFFERED (BUFFERED_REQ) + ) req_demux ( + .clk (clk), + .reset (reset), + .sel (request_id), + .valid_in (req_valid_in), + .data_in ({req_addr_in, req_rw_in, req_data_in}), + .ready_in (req_ready_in), + .valid_out (req_valid_out), + .data_out (req_merged_data_out), + .ready_out (req_ready_out) + ); VX_stream_arbiter #( - .NUM_REQS (NUM_REQS), - .DATAW (DATA_WIDTH), - .IN_BUFFER (NUM_REQS >= 4), - .OUT_BUFFER (NUM_REQS >= 4) + .NUM_REQS (NUM_REQS), + .DATAW (RSP_DATAW), + .BUFFERED (BUFFERED_RSP) ) rsp_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_databus_arb.v b/hw/rtl/VX_databus_arb.v index 03e67ce0..6830d0f2 100644 --- a/hw/rtl/VX_databus_arb.v +++ b/hw/rtl/VX_databus_arb.v @@ -5,6 +5,8 @@ module VX_databus_arb #( parameter WORD_SIZE = 1, parameter TAG_IN_WIDTH = 1, parameter TAG_OUT_WIDTH = 1, + parameter BUFFERED_REQ = 0, + parameter BUFFERED_RSP = 0, parameter WORD_WIDTH = WORD_SIZE * 8, parameter ADDR_WIDTH = 32 - `CLOG2(WORD_SIZE), @@ -43,12 +45,13 @@ module VX_databus_arb #( output wire [NUM_REQS-1:0][WORD_WIDTH-1:0] rsp_data_out, input wire [NUM_REQS-1:0] rsp_ready_out ); - localparam DATAW = `NUM_THREADS + TAG_OUT_WIDTH + (`NUM_THREADS * ADDR_WIDTH) + 1 + (`NUM_THREADS * WORD_SIZE) + (`NUM_THREADS * WORD_WIDTH); + localparam REQ_DATAW = `NUM_THREADS + TAG_OUT_WIDTH + (`NUM_THREADS * ADDR_WIDTH) + 1 + (`NUM_THREADS * WORD_SIZE) + (`NUM_THREADS * WORD_WIDTH); + localparam RSP_DATAW = TAG_IN_WIDTH + WORD_WIDTH; if (NUM_REQS > 1) begin wire [NUM_REQS-1:0] valids; - wire [NUM_REQS-1:0][DATAW-1:0] data_in; + wire 
[NUM_REQS-1:0][REQ_DATAW-1:0] data_in; wire [`NUM_THREADS-1:0] req_tmask_out; wire req_valid_out_unqual; @@ -58,34 +61,46 @@ module VX_databus_arb #( end VX_stream_arbiter #( - .NUM_REQS (NUM_REQS), - .DATAW (DATAW), - .IN_BUFFER (NUM_REQS >= 4), - .OUT_BUFFER (NUM_REQS >= 4) + .NUM_REQS (NUM_REQS), + .DATAW (REQ_DATAW), + .BUFFERED (BUFFERED_REQ) ) req_arb ( - .clk (clk), - .reset (reset), - .valid_in (valids), - .data_in (data_in), - .ready_in (req_ready_in), - .valid_out (req_valid_out_unqual), - .data_out ({req_tmask_out, req_tag_out, req_addr_out, req_rw_out, req_byteen_out, req_data_out}), - .ready_out (req_ready_out) + .clk (clk), + .reset (reset), + .valid_in (valids), + .data_in (data_in), + .ready_in (req_ready_in), + .valid_out (req_valid_out_unqual), + .data_out ({req_tmask_out, req_tag_out, req_addr_out, req_rw_out, req_byteen_out, req_data_out}), + .ready_out (req_ready_out) ); assign req_valid_out = {`NUM_THREADS{req_valid_out_unqual}} & req_tmask_out; /////////////////////////////////////////////////////////////////////// - wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[LOG_NUM_REQS-1:0]; - - for (genvar i = 0; i < NUM_REQS; i++) begin - assign rsp_valid_out[i] = rsp_valid_in && (rsp_sel == LOG_NUM_REQS'(i)); - assign rsp_tag_out[i] = rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH]; - assign rsp_data_out[i] = rsp_data_in; + wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[LOG_NUM_REQS-1:0]; + + wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_merged_data_out; + for (genvar i = 0; i < NUM_REQS; i++) begin + assign {rsp_tag_out[i], rsp_data_out[i]} = rsp_merged_data_out[i]; end - - assign rsp_ready_in = rsp_ready_out[rsp_sel]; + + VX_stream_demux #( + .NUM_REQS (NUM_REQS), + .DATAW (RSP_DATAW), + .BUFFERED (BUFFERED_RSP) + ) rsp_demux ( + .clk (clk), + .reset (reset), + .sel (rsp_sel), + .valid_in (rsp_valid_in), + .data_in ({rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH], rsp_data_in}), + .ready_in (rsp_ready_in), + .valid_out (rsp_valid_out), + .data_out (rsp_merged_data_out), 
+ .ready_out (rsp_ready_out) + ); end else begin diff --git a/hw/rtl/VX_dcache_arb.v b/hw/rtl/VX_dcache_arb.v index d6670ca9..1926ceb7 100644 --- a/hw/rtl/VX_dcache_arb.v +++ b/hw/rtl/VX_dcache_arb.v @@ -20,7 +20,8 @@ module VX_dcache_arb ( // output response VX_cache_core_rsp_if core_rsp_if ); - localparam REQ_DATAW = `NUM_THREADS + 1 + `NUM_THREADS * `DWORD_SIZE + `NUM_THREADS * (32-`CLOG2(`DWORD_SIZE)) + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; + localparam REQ_ADDRW = 32 - `CLOG2(`DWORD_SIZE); + localparam REQ_DATAW = `NUM_THREADS + 1 + `NUM_THREADS * `DWORD_SIZE + `NUM_THREADS * REQ_ADDRW + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; localparam RSP_DATAW = `NUM_THREADS + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; // @@ -28,54 +29,90 @@ module VX_dcache_arb ( // // select shared memory bus - wire is_smem_addr = (| core_req_if.valid) - && ({core_req_if.addr[0], 2'b0} >= `SHARED_MEM_BASE_ADDR) - && ({core_req_if.addr[0], 2'b0} < (`SHARED_MEM_BASE_ADDR + `SMEM_SIZE)); + wire is_smem_addr = core_req_if.valid[0] && `SM_ENABLE + && (core_req_if.addr[0] >= REQ_ADDRW'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> 2)) + && (core_req_if.addr[0] < REQ_ADDRW'(`SHARED_MEM_BASE_ADDR >> 2)); // select io bus - wire is_io_addr = (| core_req_if.valid) - && ({core_req_if.addr[0], 2'b0} >= `IO_BUS_BASE_ADDR); + wire is_io_addr = core_req_if.valid[0] + && (core_req_if.addr[0] >= REQ_ADDRW'(`IO_BUS_BASE_ADDR >> 2)); + + wire cache_req_valid_out; + wire [`NUM_THREADS-1:0] cache_req_tmask; + wire cache_req_ready_in; + + wire smem_req_valid_out; + wire [`NUM_THREADS-1:0] smem_req_tmask; + wire smem_req_ready_in; + + wire io_req_valid_out; + wire [`NUM_THREADS-1:0] io_req_tmask; + wire io_req_ready_in; reg [2:0] req_select; reg req_ready; - assign cache_req_if.valid = core_req_if.valid & {`NUM_THREADS{req_select[0]}}; - assign cache_req_if.rw = core_req_if.rw; - assign cache_req_if.byteen = core_req_if.byteen; - assign cache_req_if.addr = core_req_if.addr; - assign 
cache_req_if.data = core_req_if.data; - assign cache_req_if.tag = core_req_if.tag; + VX_skid_buffer #( + .DATAW (REQ_DATAW) + ) cache_out_buffer ( + .clk (clk), + .reset (reset), + .valid_in (req_select[0]), + .data_in ({core_req_if.valid, core_req_if.addr, core_req_if.rw, core_req_if.byteen, core_req_if.data, core_req_if.tag}), + .ready_in (cache_req_ready_in), + .valid_out (cache_req_valid_out), + .data_out ({cache_req_tmask, cache_req_if.addr, cache_req_if.rw, cache_req_if.byteen, cache_req_if.data, cache_req_if.tag}), + .ready_out (cache_req_if.ready) + ); - assign smem_req_if.valid = core_req_if.valid & {`NUM_THREADS{req_select[1]}}; - assign smem_req_if.rw = core_req_if.rw; - assign smem_req_if.byteen = core_req_if.byteen; - assign smem_req_if.addr = core_req_if.addr; - assign smem_req_if.data = core_req_if.data; - assign smem_req_if.tag = core_req_if.tag; + assign cache_req_if.valid = cache_req_tmask & {`NUM_THREADS{cache_req_valid_out}}; - assign io_req_if.valid = core_req_if.valid & {`NUM_THREADS{req_select[2]}}; - assign io_req_if.rw = core_req_if.rw; - assign io_req_if.byteen = core_req_if.byteen; - assign io_req_if.addr = core_req_if.addr; - assign io_req_if.data = core_req_if.data; - assign io_req_if.tag = core_req_if.tag; + VX_skid_buffer #( + .DATAW (REQ_DATAW) + ) smem_out_buffer ( + .clk (clk), + .reset (reset), + .valid_in (req_select[1]), + .data_in ({core_req_if.valid, core_req_if.addr, core_req_if.rw, core_req_if.byteen, core_req_if.data, core_req_if.tag}), + .ready_in (smem_req_ready_in), + .valid_out (smem_req_valid_out), + .data_out ({smem_req_tmask, smem_req_if.addr, smem_req_if.rw, smem_req_if.byteen, smem_req_if.data, smem_req_if.tag}), + .ready_out (smem_req_if.ready) + ); - assign core_req_if.ready = req_ready; + assign smem_req_if.valid = smem_req_tmask & {`NUM_THREADS{smem_req_valid_out}}; + + VX_skid_buffer #( + .DATAW (REQ_DATAW) + ) io_out_buffer ( + .clk (clk), + .reset (reset), + .valid_in (req_select[2]), + .data_in 
({core_req_if.valid, core_req_if.addr, core_req_if.rw, core_req_if.byteen, core_req_if.data, core_req_if.tag}), + .ready_in (io_req_ready_in), + .valid_out (io_req_valid_out), + .data_out ({io_req_tmask, io_req_if.addr, io_req_if.rw, io_req_if.byteen, io_req_if.data, io_req_if.tag}), + .ready_out (io_req_if.ready) + ); + + assign io_req_if.valid = io_req_tmask & {`NUM_THREADS{io_req_valid_out}}; always @(*) begin req_select = 0; if (is_smem_addr) begin req_select[1] = 1; - req_ready = smem_req_if.ready; + req_ready = smem_req_ready_in; end else if (is_io_addr) begin req_select[2] = 1; - req_ready = io_req_if.ready; + req_ready = io_req_ready_in; end else begin req_select[0] = 1; - req_ready = cache_req_if.ready; + req_ready = cache_req_ready_in; end end + assign core_req_if.ready = req_ready; + // // select response // @@ -92,14 +129,13 @@ module VX_dcache_arb ( assign rsp_data_in[2] = {io_rsp_if.valid, io_rsp_if.data, io_rsp_if.tag}; assign rsp_valid_in[0] = (| cache_rsp_if.valid); - assign rsp_valid_in[1] = (| smem_rsp_if.valid); + assign rsp_valid_in[1] = (| smem_rsp_if.valid) & `SM_ENABLE; assign rsp_valid_in[2] = (| io_rsp_if.valid); VX_stream_arbiter #( - .NUM_REQS (3), - .DATAW (RSP_DATAW), - .IN_BUFFER (1), - .OUT_BUFFER (1) + .NUM_REQS (3), + .DATAW (RSP_DATAW), + .BUFFERED (1) ) rsp_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index d44e8966..2604bce1 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -40,19 +40,20 @@ module VX_fpu_unit #( wire fpuq_pop = valid_out && ready_out; VX_cam_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), - .SIZE (`FPUQ_SIZE) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), + .SIZE (`FPUQ_SIZE), + .FASTRAM (1) ) req_metadata_buf ( - .clk (clk), - .reset (reset), - .acquire_slot (fpuq_push), - .write_addr (tag_in), - .read_addr (tag_out), - .release_addr (tag_out), - .write_data ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, 
fpu_req_if.wb}), - .read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), - .release_slot (fpuq_pop), - .full (fpuq_full) + .clk (clk), + .reset (reset), + .acquire_slot (fpuq_push), + .write_addr (tag_in), + .read_addr (tag_out), + .release_addr (tag_out), + .write_data ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}), + .read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), + .release_slot (fpuq_pop), + .full (fpuq_full) ); // can accept new request? diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index 3564d851..688c767c 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -99,7 +99,6 @@ module VX_gpu_unit #( `SCOPE_ASSIGN (gpu_req_op_type, gpu_req_if.op_type); `SCOPE_ASSIGN (gpu_req_rs1, gpu_req_if.rs1_data[0]); `SCOPE_ASSIGN (gpu_req_rs2, gpu_req_if.rs2_data); - `SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid); `SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid); `SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc); diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index 4488323c..c367d002 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -42,9 +42,9 @@ module VX_ibuffer #( wire pop = reading && (size_r[i] != 1); VX_generic_queue #( - .DATAW(DATAW), - .SIZE(SIZE), - .BUFFERED(1) + .DATAW (DATAW), + .SIZE (SIZE), + .FASTRAM (1) ) queue ( .clk (clk), .reset (reset), @@ -101,7 +101,7 @@ module VX_ibuffer #( end // schedule the next instruction to issue - // does round-robin scheduling when multiple warps are present + // do round-robin when multiple warps are active always @(*) begin deq_valid_n = 0; deq_wid_n = 'x; diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index c4857834..12127ccf 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -28,15 +28,14 @@ module VX_icache_stage #( VX_dp_ram #( .DATAW(32 + `NUM_THREADS), .SIZE(`NUM_WARPS), - .BUFFERED(0), - .RWCHECK(0) + .FASTRAM(1) ) req_metadata ( .clk(clk), .waddr(req_tag), .raddr(rsp_tag), 
.wren(icache_req_fire), .byteen(1'b1), - .rden(1'b1), + .rden(ifetch_rsp_if.valid), .din({ifetch_req_if.PC, ifetch_req_if.tmask}), .dout({ifetch_rsp_if.PC, ifetch_rsp_if.tmask}) ); @@ -68,7 +67,6 @@ module VX_icache_stage #( `SCOPE_ASSIGN (icache_req_wid, ifetch_req_if.wid); `SCOPE_ASSIGN (icache_req_addr, {icache_req_if.addr, 2'b0}); `SCOPE_ASSIGN (icache_req_tag, req_tag); - `SCOPE_ASSIGN (icache_rsp_fire, icache_rsp_if.valid && icache_rsp_if.ready); `SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data[0]); `SCOPE_ASSIGN (icache_rsp_tag, rsp_tag); diff --git a/hw/rtl/VX_ipdom_stack.v b/hw/rtl/VX_ipdom_stack.v index 88bdb18d..4dccbf6f 100644 --- a/hw/rtl/VX_ipdom_stack.v +++ b/hw/rtl/VX_ipdom_stack.v @@ -14,13 +14,13 @@ module VX_ipdom_stack #( output wire empty, output wire full ); - localparam STACK_SIZE = 2 ** DEPTH; + localparam ADDRW = $clog2(DEPTH); - reg is_part [STACK_SIZE-1:0]; + reg is_part [DEPTH-1:0]; - reg [DEPTH-1:0] rd_ptr, wr_ptr; + reg [ADDRW-1:0] rd_ptr, wr_ptr; - wire [WIDTH - 1:0] d1, d2; + wire [WIDTH-1:0] d1, d2; always @(posedge clk) begin if (reset) begin @@ -29,18 +29,17 @@ module VX_ipdom_stack #( end else begin if (push) begin rd_ptr <= wr_ptr; - wr_ptr <= wr_ptr + DEPTH'(1); + wr_ptr <= wr_ptr + ADDRW'(1); end else if (pop) begin - wr_ptr <= wr_ptr - DEPTH'(is_part[rd_ptr]); - rd_ptr <= rd_ptr - DEPTH'(is_part[rd_ptr]); + wr_ptr <= wr_ptr - ADDRW'(is_part[rd_ptr]); + rd_ptr <= rd_ptr - ADDRW'(is_part[rd_ptr]); end end end VX_dp_ram #( .DATAW(WIDTH * 2), - .SIZE(STACK_SIZE), - .BUFFERED(0), + .SIZE(DEPTH), .RWCHECK(0) ) store ( .clk(clk), @@ -48,7 +47,7 @@ module VX_ipdom_stack #( .raddr(rd_ptr), .wren(push), .byteen(1'b1), - .rden(1'b1), + .rden(pop), .din({q2, q1}), .dout({d2, d1}) ); @@ -64,6 +63,6 @@ module VX_ipdom_stack #( assign d = p ? 
d1 : d2; assign empty = ~(| wr_ptr); - assign full = ((STACK_SIZE-1) == wr_ptr); + assign full = (ADDRW'(DEPTH-1) == wr_ptr); endmodule \ No newline at end of file diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 8cb02fe8..80b994ea 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -110,15 +110,13 @@ module VX_issue #( `SCOPE_ASSIGN (issue_imm, ibuf_deq_if.imm); `SCOPE_ASSIGN (issue_rs1_is_pc, ibuf_deq_if.rs1_is_PC); `SCOPE_ASSIGN (issue_rs2_is_imm, ibuf_deq_if.rs2_is_imm); - `SCOPE_ASSIGN (scoreboard_delay, scoreboard_delay); - `SCOPE_ASSIGN (execute_delay, ~execute_if.ready); - + `SCOPE_ASSIGN (execute_delay, ~execute_if.ready); `SCOPE_ASSIGN (gpr_rsp_a, gpr_rsp_if.rs1_data); `SCOPE_ASSIGN (gpr_rsp_b, gpr_rsp_if.rs2_data); `SCOPE_ASSIGN (gpr_rsp_c, gpr_rsp_if.rs3_data); - `SCOPE_ASSIGN (writeback_valid, writeback_if.valid); + `SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask); `SCOPE_ASSIGN (writeback_wid, writeback_if.wid); `SCOPE_ASSIGN (writeback_pc, writeback_if.PC); `SCOPE_ASSIGN (writeback_rd, writeback_if.rd); diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index a67a6a38..ecacd1e2 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -77,7 +77,7 @@ module VX_lsu_unit #( VX_generic_register #( .N(1 + `NW_BITS + `NUM_THREADS + 32 + 1 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 2 + (`NUM_THREADS * (30 + 2 + 4 + 32))), .R(1) - ) pipe_reg0 ( + ) req_pipe_reg ( .clk (clk), .reset (reset), .stall (stall_in), @@ -111,8 +111,9 @@ module VX_lsu_unit #( wire lsuq_pop = lsuq_pop_part && (0 == mem_rsp_mask_n); VX_cam_buffer #( - .DATAW (`NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 2) + 2), - .SIZE (`LSUQ_SIZE) + .DATAW (`NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 2) + 2), + .SIZE (`LSUQ_SIZE), + .FASTRAM (1) ) req_metadata_buf ( .clk (clk), .reset (reset), @@ -192,7 +193,7 @@ module VX_lsu_unit #( VX_generic_register #( .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .R(1) - ) pipe_reg1 ( + ) rsp_pipe_reg ( 
.clk (clk), .reset (reset), .stall (load_rsp_stall), @@ -213,7 +214,6 @@ module VX_lsu_unit #( `SCOPE_ASSIGN (dcache_req_byteen,dcache_req_if.byteen); `SCOPE_ASSIGN (dcache_req_data, dcache_req_if.data); `SCOPE_ASSIGN (dcache_req_tag, req_tag); - `SCOPE_ASSIGN (dcache_rsp_fire, dcache_rsp_if.valid & {`NUM_THREADS{dcache_rsp_if.ready}}); `SCOPE_ASSIGN (dcache_rsp_data, dcache_rsp_if.data); `SCOPE_ASSIGN (dcache_rsp_tag, rsp_tag); @@ -222,11 +222,11 @@ module VX_lsu_unit #( always @(posedge clk) begin if ((| dcache_req_if.valid) && dcache_req_if.ready) begin if (dcache_req_if.rw) - $display("%t: D$%0d Rw Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, data=%0h", + $display("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, data=%0h", $time, CORE_ID, req_wid, req_pc, dcache_req_if.valid, req_address, dcache_req_if.tag, dcache_req_if.byteen, dcache_req_if.data); else - $display("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, rd=%0d, byteen=%0h", - $time, CORE_ID, req_wid, req_pc, dcache_req_if.valid, req_address, dcache_req_if.tag, req_rd, dcache_req_if.byteen, dcache_req_if.data); + $display("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, rd=%0d", + $time, CORE_ID, req_wid, req_pc, dcache_req_if.valid, req_address, dcache_req_if.tag, dcache_req_if.byteen, req_rd); end if ((| dcache_rsp_if.valid) && dcache_rsp_if.ready) begin $display("%t: D$%0d Rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h", diff --git a/hw/rtl/VX_mem_arb.v b/hw/rtl/VX_mem_arb.v index f0869ab7..958146a2 100644 --- a/hw/rtl/VX_mem_arb.v +++ b/hw/rtl/VX_mem_arb.v @@ -5,6 +5,8 @@ module VX_mem_arb #( parameter DATA_WIDTH = 1, parameter TAG_IN_WIDTH = 1, parameter TAG_OUT_WIDTH = 1, + parameter BUFFERED_REQ = 0, + parameter BUFFERED_RSP = 0, parameter DATA_SIZE = (DATA_WIDTH / 8), parameter ADDR_WIDTH = 32 - `CLOG2(DATA_SIZE), @@ -48,38 +50,50 @@ module VX_mem_arb #( if (NUM_REQS > 1) begin - wire 
[NUM_REQS-1:0][REQ_DATAW-1:0] data_in; + wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_merged_data_in; for (genvar i = 0; i < NUM_REQS; i++) begin - assign data_in[i] = {{req_tag_in[i], LOG_NUM_REQS'(i)}, req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; + assign req_merged_data_in[i] = {{req_tag_in[i], LOG_NUM_REQS'(i)}, req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; end VX_stream_arbiter #( - .NUM_REQS (NUM_REQS), - .DATAW (REQ_DATAW), - .IN_BUFFER (NUM_REQS >= 4), - .OUT_BUFFER (NUM_REQS >= 4) + .NUM_REQS (NUM_REQS), + .DATAW (REQ_DATAW), + .BUFFERED (BUFFERED_REQ) ) req_arb ( - .clk (clk), - .reset (reset), - .valid_in (req_valid_in), - .data_in (data_in), - .ready_in (req_ready_in), - .valid_out (req_valid_out), - .data_out ({req_tag_out, req_addr_out, req_rw_out, req_byteen_out, req_data_out}), - .ready_out (req_ready_out) + .clk (clk), + .reset (reset), + .valid_in (req_valid_in), + .data_in (req_merged_data_in), + .ready_in (req_ready_in), + .valid_out (req_valid_out), + .data_out ({req_tag_out, req_addr_out, req_rw_out, req_byteen_out, req_data_out}), + .ready_out (req_ready_out) ); /////////////////////////////////////////////////////////////////////// wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in [LOG_NUM_REQS-1:0]; - - for (genvar i = 0; i < NUM_REQS; i++) begin - assign rsp_valid_out [i] = rsp_valid_in && (rsp_sel == LOG_NUM_REQS'(i)); - assign rsp_tag_out [i] = rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH]; - assign rsp_data_out [i] = rsp_data_in; + + wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_merged_data_out; + for (genvar i = 0; i < NUM_REQS; i++) begin + assign {rsp_tag_out[i], rsp_data_out[i]} = rsp_merged_data_out[i]; end - - assign rsp_ready_in = rsp_ready_out [rsp_sel]; + + VX_stream_demux #( + .NUM_REQS (NUM_REQS), + .DATAW (RSP_DATAW), + .BUFFERED (BUFFERED_RSP) + ) rsp_demux ( + .clk (clk), + .reset (reset), + .sel (rsp_sel), + .valid_in (rsp_valid_in), + .data_in ({rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH], rsp_data_in}), + 
.ready_in (rsp_ready_in), + .valid_out (rsp_valid_out), + .data_out (rsp_merged_data_out), + .ready_out (rsp_ready_out) + ); end else begin diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 06d886b8..f4faa31a 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -242,90 +242,96 @@ module VX_mem_unit # ( // Miss status `UNUSED_PIN (miss_vec) - ); + ); - VX_cache #( - .CACHE_ID (`SCACHE_ID), - .CACHE_SIZE (`SMEM_SIZE), - .BANK_LINE_SIZE (`SBANK_LINE_SIZE), - .NUM_BANKS (`SNUM_BANKS), - .WORD_SIZE (`SWORD_SIZE), - .NUM_REQS (`SNUM_REQUESTS), - .CREQ_SIZE (`SCREQ_SIZE), - .MSHR_SIZE (8), - .DRSQ_SIZE (1), - .SREQ_SIZE (1), - .CRSQ_SIZE (`SCRSQ_SIZE), - .DREQ_SIZE (1), - .SRSQ_SIZE (1), - .DRAM_ENABLE (0), - .FLUSH_ENABLE (0), - .WRITE_ENABLE (1), - .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), - .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), - .DRAM_TAG_WIDTH (`SDRAM_TAG_WIDTH) - ) smem ( - `SCOPE_BIND_VX_mem_unit_smem - - .clk (clk), - .reset (reset), + if (`SM_ENABLE) begin - // Core request - .core_req_valid (smem_req_if.valid), - .core_req_rw (smem_req_if.rw), - .core_req_byteen (smem_req_if.byteen), - .core_req_addr (smem_req_if.addr), - .core_req_data (smem_req_if.data), - .core_req_tag (smem_req_if.tag), - .core_req_ready (smem_req_if.ready), + VX_cache #( + .CACHE_ID (`SCACHE_ID), + .CACHE_SIZE (`SMEM_SIZE), + .BANK_LINE_SIZE (`SBANK_LINE_SIZE), + .NUM_BANKS (`SNUM_BANKS), + .WORD_SIZE (`SWORD_SIZE), + .NUM_REQS (`SNUM_REQUESTS), + .CREQ_SIZE (`SCREQ_SIZE), + .MSHR_SIZE (8), + .DRSQ_SIZE (1), + .SREQ_SIZE (1), + .CRSQ_SIZE (`SCRSQ_SIZE), + .DREQ_SIZE (1), + .SRSQ_SIZE (1), + .DRAM_ENABLE (0), + .FLUSH_ENABLE (0), + .WRITE_ENABLE (1), + .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), + .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), + .DRAM_TAG_WIDTH (`SDRAM_TAG_WIDTH) + ) smem ( + `SCOPE_BIND_VX_mem_unit_smem + + .clk (clk), + .reset (reset), - // Core response - .core_rsp_valid (smem_rsp_if.valid), - .core_rsp_data (smem_rsp_if.data), - .core_rsp_tag (smem_rsp_if.tag), - 
.core_rsp_ready (smem_rsp_if.ready), + // Core request + .core_req_valid (smem_req_if.valid), + .core_req_rw (smem_req_if.rw), + .core_req_byteen (smem_req_if.byteen), + .core_req_addr (smem_req_if.addr), + .core_req_data (smem_req_if.data), + .core_req_tag (smem_req_if.tag), + .core_req_ready (smem_req_if.ready), - `ifdef PERF_ENABLE - .perf_cache_if (perf_smem_if), - `endif + // Core response + .core_rsp_valid (smem_rsp_if.valid), + .core_rsp_data (smem_rsp_if.data), + .core_rsp_tag (smem_rsp_if.tag), + .core_rsp_ready (smem_rsp_if.ready), - // DRAM request - `UNUSED_PIN (dram_req_valid), - `UNUSED_PIN (dram_req_rw), - `UNUSED_PIN (dram_req_byteen), - `UNUSED_PIN (dram_req_addr), - `UNUSED_PIN (dram_req_data), - `UNUSED_PIN (dram_req_tag), - .dram_req_ready (1'b0), + `ifdef PERF_ENABLE + .perf_cache_if (perf_smem_if), + `endif - // DRAM response - .dram_rsp_valid (0), - .dram_rsp_data (0), - .dram_rsp_tag (0), - `UNUSED_PIN (dram_rsp_ready), + // DRAM request + `UNUSED_PIN (dram_req_valid), + `UNUSED_PIN (dram_req_rw), + `UNUSED_PIN (dram_req_byteen), + `UNUSED_PIN (dram_req_addr), + `UNUSED_PIN (dram_req_data), + `UNUSED_PIN (dram_req_tag), + .dram_req_ready (1'b0), - // Snoop request - .snp_req_valid (1'b0), - .snp_req_addr (0), - .snp_req_inv (0), - .snp_req_tag (0), - `UNUSED_PIN (snp_req_ready), + // DRAM response + .dram_rsp_valid (0), + .dram_rsp_data (0), + .dram_rsp_tag (0), + `UNUSED_PIN (dram_rsp_ready), - // Snoop response - `UNUSED_PIN (snp_rsp_valid), - `UNUSED_PIN (snp_rsp_tag), - .snp_rsp_ready (1'b0), + // Snoop request + .snp_req_valid (1'b0), + .snp_req_addr (0), + .snp_req_inv (0), + .snp_req_tag (0), + `UNUSED_PIN (snp_req_ready), - // Miss status - `UNUSED_PIN (miss_vec) - ); + // Snoop response + `UNUSED_PIN (snp_rsp_valid), + `UNUSED_PIN (snp_rsp_tag), + .snp_rsp_ready (1'b0), + + // Miss status + `UNUSED_PIN (miss_vec) + ); + + end VX_mem_arb #( .NUM_REQS (2), .DATA_WIDTH (`DDRAM_LINE_WIDTH), .ADDR_WIDTH (`DDRAM_ADDR_WIDTH), .TAG_IN_WIDTH 
(`DDRAM_TAG_WIDTH), - .TAG_OUT_WIDTH (`XDRAM_TAG_WIDTH) + .TAG_OUT_WIDTH (`XDRAM_TAG_WIDTH), + .BUFFERED_REQ (1), + .BUFFERED_RSP (0) ) dram_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index 14a46e54..b0b9cb7c 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -32,19 +32,20 @@ module VX_mul_unit #( wire mulq_pop = valid_out && ready_out; VX_cam_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), - .SIZE (`MULQ_SIZE) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), + .SIZE (`MULQ_SIZE), + .FASTRAM (1) ) req_metadata_buf ( - .clk (clk), - .reset (reset), - .acquire_slot (mulq_push), - .write_addr (tag_in), - .read_addr (tag_out), - .release_addr (tag_out), - .write_data ({mul_req_if.wid, mul_req_if.tmask, mul_req_if.PC, mul_req_if.rd, mul_req_if.wb}), - .read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), - .release_slot (mulq_pop), - .full (mulq_full) + .clk (clk), + .reset (reset), + .acquire_slot (mulq_push), + .write_addr (tag_in), + .read_addr (tag_out), + .release_addr (tag_out), + .write_data ({mul_req_if.wid, mul_req_if.tmask, mul_req_if.PC, mul_req_if.rd, mul_req_if.wb}), + .read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), + .release_slot (mulq_pop), + .full (mulq_full) ); wire valid_in = mul_req_if.valid && ~mulq_full; diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 45cc01fb..1b6b4717 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -51,7 +51,7 @@ /////////////////////////////////////////////////////////////////////////////// -`define USE_FAST_BRAM (* ramstyle="mlab" *) +`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *) `define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 9c9fd6c0..326e83b8 100644 --- a/hw/rtl/VX_warp_sched.v +++ 
b/hw/rtl/VX_warp_sched.v @@ -29,8 +29,8 @@ module VX_warp_sched #( // Lock warp until instruction decode to resolve branches reg [`NUM_WARPS-1:0] fetch_lock; - reg [`NUM_THREADS-1:0] thread_masks[`NUM_WARPS-1:0]; - reg [31:0] warp_pcs[`NUM_WARPS-1:0]; + reg [`NUM_THREADS-1:0] thread_masks [`NUM_WARPS-1:0]; + reg [31:0] warp_pcs [`NUM_WARPS-1:0]; // barriers reg [`NUM_WARPS-1:0] barrier_stall_mask[`NUM_BARRIERS-1:0]; // warps waiting on barrier @@ -180,11 +180,11 @@ module VX_warp_sched #( // split/join stack management - wire [(1+32+`NUM_THREADS-1):0] ipdom[`NUM_WARPS-1:0]; + wire [(1+32+`NUM_THREADS-1):0] ipdom [`NUM_WARPS-1:0]; wire [(1+32+`NUM_THREADS-1):0] q1 = {1'b1, 32'b0, thread_masks[warp_ctl_if.wid]}; wire [(1+32+`NUM_THREADS-1):0] q2 = {1'b0, warp_ctl_if.split.pc, warp_ctl_if.split.else_mask}; - assign {join_fall, join_pc, join_tm} = ipdom[join_if.wid]; + assign {join_fall, join_pc, join_tm} = ipdom [join_if.wid]; for (genvar i = 0; i < `NUM_WARPS; i++) begin wire push = warp_ctl_if.valid @@ -196,7 +196,7 @@ module VX_warp_sched #( VX_ipdom_stack #( .WIDTH(1+32+`NUM_THREADS), - .DEPTH(`NT_BITS+1) + .DEPTH(2 ** (`NT_BITS+1)) ) ipdom_stack ( .clk (clk), .reset(reset), diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 7581f1c3..ecd6f807 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -71,8 +71,7 @@ module VX_writeback #( fpu_valid ? 
fpu_commit_if.data : 0; - wire stall =~writeback_if.ready && writeback_if.valid; - always @(*) assert(writeback_if.ready); // the writeback currently has no backpressure from issue stage + wire stall = ~writeback_if.ready && writeback_if.valid; VX_generic_register #( .N(1 + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32)), diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index cf9d8dff..dbac8308 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -183,7 +183,9 @@ module Vortex ( .NUM_REQS (`NUM_CLUSTERS), .WORD_SIZE (4), .TAG_IN_WIDTH (`L2CORE_TAG_WIDTH), - .TAG_OUT_WIDTH (`L3CORE_TAG_WIDTH) + .TAG_OUT_WIDTH (`L3CORE_TAG_WIDTH), + .BUFFERED_REQ (1), + .BUFFERED_RSP (`NUM_CLUSTERS >= 4) ) io_arb ( .clk (clk), .reset (reset), @@ -220,9 +222,11 @@ module Vortex ( ); VX_csr_io_arb #( - .NUM_REQS (`NUM_CLUSTERS), - .DATA_WIDTH (32), - .ADDR_WIDTH (12) + .NUM_REQS (`NUM_CLUSTERS), + .DATA_WIDTH (32), + .ADDR_WIDTH (12), + .BUFFERED_REQ (`NUM_CLUSTERS >= 4), + .BUFFERED_RSP (1) ) csr_io_arb ( .clk (clk), .reset (reset), @@ -270,7 +274,8 @@ module Vortex ( .DST_ADDR_WIDTH (`L2DRAM_ADDR_WIDTH), .TAG_IN_WIDTH (`L3SNP_TAG_WIDTH), .TAG_OUT_WIDTH (`L2SNP_TAG_WIDTH), - .SREQ_SIZE (`L3SREQ_SIZE) + .SREQ_SIZE (`L3SREQ_SIZE), + .BUFFERED (`NUM_CLUSTERS >= 4) ) snp_forwarder ( .clk (clk), .reset (reset), @@ -303,49 +308,6 @@ module Vortex ( VX_perf_cache_if perf_l3cache_if(); `endif - wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_valid_qual; - wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_rw_qual; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_BYTEEN_WIDTH-1:0] per_cluster_dram_req_byteen_qual; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_ADDR_WIDTH-1:0] per_cluster_dram_req_addr_qual; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_req_data_qual; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_req_tag_qual; - wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_ready_qual; - - wire [`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_valid_unqual; - wire 
[`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_rsp_data_unqual; - wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_rsp_tag_unqual; - wire [`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_ready_unqual; - - for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin - VX_skid_buffer #( - .DATAW (1 + `L2DRAM_BYTEEN_WIDTH + `L2DRAM_ADDR_WIDTH + `L2DRAM_LINE_WIDTH + `L2DRAM_TAG_WIDTH), - .PASSTHRU (`NUM_CLUSTERS < 4) - ) dram_req_buffer ( - .clk (clk), - .reset (reset), - .valid_in (per_cluster_dram_req_valid[i]), - .data_in ({per_cluster_dram_req_rw[i], per_cluster_dram_req_byteen[i], per_cluster_dram_req_addr[i], per_cluster_dram_req_data[i], per_cluster_dram_req_tag[i]}), - .ready_in (per_cluster_dram_req_ready[i]), - .valid_out (per_cluster_dram_req_valid_qual[i]), - .data_out ({per_cluster_dram_req_rw_qual[i], per_cluster_dram_req_byteen_qual[i], per_cluster_dram_req_addr_qual[i], per_cluster_dram_req_data_qual[i], per_cluster_dram_req_tag_qual[i]}), - .ready_out (per_cluster_dram_req_ready_qual[i]) - ); - - VX_skid_buffer #( - .DATAW (`L2DRAM_LINE_WIDTH + `L2DRAM_TAG_WIDTH), - .PASSTHRU (1) - ) core_rsp_buffer ( - .clk (clk), - .reset (reset), - .valid_in (per_cluster_dram_rsp_valid_unqual[i]), - .data_in ({per_cluster_dram_rsp_data_unqual[i], per_cluster_dram_rsp_tag_unqual[i]}), - .ready_in (per_cluster_dram_rsp_ready_unqual[i]), - .valid_out (per_cluster_dram_rsp_valid[i]), - .data_out ({per_cluster_dram_rsp_data[i], per_cluster_dram_rsp_tag[i]}), - .ready_out (per_cluster_dram_rsp_ready[i]) - ); - end - VX_cache #( .CACHE_ID (`L3CACHE_ID), .CACHE_SIZE (`L3CACHE_SIZE), @@ -378,19 +340,19 @@ module Vortex ( `endif // Core request - .core_req_valid (per_cluster_dram_req_valid_qual), - .core_req_rw (per_cluster_dram_req_rw_qual), - .core_req_byteen (per_cluster_dram_req_byteen_qual), - .core_req_addr (per_cluster_dram_req_addr_qual), - .core_req_data (per_cluster_dram_req_data_qual), - .core_req_tag (per_cluster_dram_req_tag_qual), - .core_req_ready 
(per_cluster_dram_req_ready_qual), + .core_req_valid (per_cluster_dram_req_valid), + .core_req_rw (per_cluster_dram_req_rw), + .core_req_byteen (per_cluster_dram_req_byteen), + .core_req_addr (per_cluster_dram_req_addr), + .core_req_data (per_cluster_dram_req_data), + .core_req_tag (per_cluster_dram_req_tag), + .core_req_ready (per_cluster_dram_req_ready), // Core response - .core_rsp_valid (per_cluster_dram_rsp_valid_unqual), - .core_rsp_data (per_cluster_dram_rsp_data_unqual), - .core_rsp_tag (per_cluster_dram_rsp_tag_unqual), - .core_rsp_ready (per_cluster_dram_rsp_ready_unqual), + .core_rsp_valid (per_cluster_dram_rsp_valid), + .core_rsp_data (per_cluster_dram_rsp_data), + .core_rsp_tag (per_cluster_dram_rsp_tag), + .core_rsp_ready (per_cluster_dram_rsp_ready), // DRAM request .dram_req_valid (dram_req_valid), @@ -429,7 +391,9 @@ module Vortex ( .NUM_REQS (`NUM_CLUSTERS), .DATA_WIDTH (`L3DRAM_LINE_WIDTH), .TAG_IN_WIDTH (`L2DRAM_TAG_WIDTH), - .TAG_OUT_WIDTH (`L3DRAM_TAG_WIDTH) + .TAG_OUT_WIDTH (`L3DRAM_TAG_WIDTH), + .BUFFERED_REQ (1), + .BUFFERED_RSP (`NUM_CLUSTERS >= 4) ) dram_arb ( .clk (clk), .reset (reset), @@ -476,28 +440,23 @@ module Vortex ( `SCOPE_ASSIGN (reset, reset); - `SCOPE_ASSIGN (dram_req_fire, dram_req_valid && dram_req_ready); - `SCOPE_ASSIGN (dram_req_addr, `TO_FULL_ADDR(dram_req_addr)); - `SCOPE_ASSIGN (dram_req_rw, dram_req_rw); - `SCOPE_ASSIGN (dram_req_byteen,dram_req_byteen); - `SCOPE_ASSIGN (dram_req_data, dram_req_data); - `SCOPE_ASSIGN (dram_req_tag, dram_req_tag); - - `SCOPE_ASSIGN (dram_rsp_fire, dram_rsp_valid && dram_rsp_ready); - `SCOPE_ASSIGN (dram_rsp_data, dram_rsp_data); - `SCOPE_ASSIGN (dram_rsp_tag, dram_rsp_tag); - + `SCOPE_ASSIGN (dram_req_fire, dram_req_valid && dram_req_ready); + `SCOPE_ASSIGN (dram_req_addr, `TO_FULL_ADDR(dram_req_addr)); + `SCOPE_ASSIGN (dram_req_rw, dram_req_rw); + `SCOPE_ASSIGN (dram_req_byteen, dram_req_byteen); + `SCOPE_ASSIGN (dram_req_data, dram_req_data); + `SCOPE_ASSIGN (dram_req_tag, 
dram_req_tag); + `SCOPE_ASSIGN (dram_rsp_fire, dram_rsp_valid && dram_rsp_ready); + `SCOPE_ASSIGN (dram_rsp_data, dram_rsp_data); + `SCOPE_ASSIGN (dram_rsp_tag, dram_rsp_tag); `SCOPE_ASSIGN (snp_req_fire, snp_req_valid && snp_req_ready); `SCOPE_ASSIGN (snp_req_addr, `TO_FULL_ADDR(snp_req_addr)); `SCOPE_ASSIGN (snp_req_inv, snp_req_inv); `SCOPE_ASSIGN (snp_req_tag, snp_req_tag); - `SCOPE_ASSIGN (snp_rsp_fire, snp_rsp_valid && snp_rsp_ready); `SCOPE_ASSIGN (snp_rsp_tag, snp_rsp_tag); - `SCOPE_ASSIGN (snp_rsp_fire, snp_rsp_valid && snp_rsp_ready); `SCOPE_ASSIGN (snp_rsp_tag, snp_rsp_tag); - `SCOPE_ASSIGN (busy, busy); `ifdef DBG_PRINT_DRAM diff --git a/hw/rtl/afu/VX_avs_wrapper.v b/hw/rtl/afu/VX_avs_wrapper.v index ca814e47..eced0d34 100644 --- a/hw/rtl/afu/VX_avs_wrapper.v +++ b/hw/rtl/afu/VX_avs_wrapper.v @@ -59,20 +59,25 @@ module VX_avs_wrapper #( + RD_QUEUE_ADDRW'((avs_reqq_push && !avs_rspq_pop) ? 1 : (avs_rspq_pop && !avs_reqq_push) ? -1 : 0); + reg rsp_queue_ready; + always @(posedge clk) begin if (reset) begin avs_burstcount_r <= 1; avs_bankselect_r <= 0; avs_pending_reads <= 0; + rsp_queue_ready <= 1; end else begin avs_pending_reads <= avs_pending_reads_n; + rsp_queue_ready <= (avs_pending_reads_n != RD_QUEUE_SIZE); end end VX_generic_queue #( - .DATAW (REQ_TAGW), - .SIZE (RD_QUEUE_SIZE), - .BUFFERED (1) + .DATAW (REQ_TAGW), + .SIZE (RD_QUEUE_SIZE), + .BUFFERED(1), + .FASTRAM (1) ) rd_req_queue ( .clk (clk), .reset (reset), @@ -86,9 +91,10 @@ module VX_avs_wrapper #( ); VX_generic_queue #( - .DATAW (AVS_DATAW), - .SIZE (RD_QUEUE_SIZE), - .BUFFERED (1) + .DATAW (AVS_DATAW), + .SIZE (RD_QUEUE_SIZE), + .BUFFERED(1), + .FASTRAM (1) ) rd_rsp_queue ( .clk (clk), .reset (reset), @@ -101,8 +107,6 @@ module VX_avs_wrapper #( `UNUSED_PIN (size) ); - wire rsp_queue_ready = (avs_pending_reads != RD_QUEUE_SIZE); - assign avs_read = dram_req_valid && !dram_req_rw && rsp_queue_ready; assign avs_write = dram_req_valid && dram_req_rw && rsp_queue_ready; assign avs_address = 
dram_req_addr; diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index 59d54136..474a3cf3 100644 --- a/hw/rtl/afu/vortex_afu.sv +++ b/hw/rtl/afu/vortex_afu.sv @@ -700,8 +700,8 @@ always @(posedge clk) begin end cci_rd_req_enable <= (STATE_WRITE == state) - && (cci_rd_req_ctr_next < cmd_data_size) - && (cci_pending_reads_next < CCI_RD_QUEUE_SIZE) + && (cci_rd_req_ctr_next != cmd_data_size) + && (cci_pending_reads_next != CCI_RD_QUEUE_SIZE) && !cp2af_sRxPort.c0TxAlmFull; if (cci_rd_req_fire) begin @@ -741,8 +741,9 @@ always @(posedge clk) begin end VX_generic_queue #( - .DATAW(CCI_RD_RQ_DATAW), - .SIZE(CCI_RD_QUEUE_SIZE) + .DATAW (CCI_RD_RQ_DATAW), + .SIZE (CCI_RD_QUEUE_SIZE), + .FASTRAM (1) ) cci_rd_req_queue ( .clk (clk), .reset (reset), @@ -898,7 +899,7 @@ always @(posedge clk) begin end if ((STATE_CLFLUSH == state) - && (snp_req_ctr_next >= snp_req_size)) begin + && (snp_req_ctr_next == snp_req_size)) begin vx_snp_req_valid <= 0; end diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 43232d90..0eec3696 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -111,36 +111,24 @@ module VX_bank #( `ifdef DBG_CACHE_REQ_INFO /* verilator lint_off UNUSED */ - wire[31:0] debug_pc_st0; - wire[`NR_BITS-1:0] debug_rd_st0; - wire[`NW_BITS-1:0] debug_wid_st0; - wire debug_rw_st0; - wire[WORD_SIZE-1:0] debug_byteen_st0; - wire[`REQS_BITS-1:0] debug_tid_st0; + wire [31:0] debug_pc_st0; + wire [`NR_BITS-1:0] debug_rd_st0; + wire [`NW_BITS-1:0] debug_wid_st0; wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st0; - - wire[31:0] debug_pc_st1; - wire[`NR_BITS-1:0] debug_rd_st1; - wire[`NW_BITS-1:0] debug_wid_st1; - wire debug_rw_st1; - wire[WORD_SIZE-1:0] debug_byteen_st1; - wire[`REQS_BITS-1:0] debug_tid_st1; + + wire [31:0] debug_pc_st1; + wire [`NR_BITS-1:0] debug_rd_st1; + wire [`NW_BITS-1:0] debug_wid_st1; wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st1; - wire[31:0] debug_pc_st2; - wire[`NR_BITS-1:0] debug_rd_st2; - wire[`NW_BITS-1:0] 
debug_wid_st2; - wire debug_rw_st2; - wire[WORD_SIZE-1:0] debug_byteen_st2; - wire[`REQS_BITS-1:0] debug_tid_st2; + wire [31:0] debug_pc_st2; + wire [`NR_BITS-1:0] debug_rd_st2; + wire [`NW_BITS-1:0] debug_wid_st2; wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st2; - wire[31:0] debug_pc_st3; - wire[`NR_BITS-1:0] debug_rd_st3; - wire[`NW_BITS-1:0] debug_wid_st3; - wire debug_rw_st3; - wire[WORD_SIZE-1:0] debug_byteen_st3; - wire[`REQS_BITS-1:0] debug_tid_st3; + wire [31:0] debug_pc_st3; + wire [`NR_BITS-1:0] debug_rd_st3; + wire [`NW_BITS-1:0] debug_wid_st3; wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st3; /* verilator lint_on UNUSED */ `endif @@ -159,9 +147,10 @@ module VX_bank #( wire sreq_push = snp_req_valid && snp_req_ready; VX_generic_queue #( - .DATAW(`LINE_ADDR_WIDTH + 1 + SNP_TAG_WIDTH), - .SIZE(SREQ_SIZE), - .BUFFERED(1) + .DATAW (`LINE_ADDR_WIDTH + 1 + SNP_TAG_WIDTH), + .SIZE (SREQ_SIZE), + .BUFFERED (1), + .FASTRAM (1) ) snp_req_queue ( .clk (clk), .reset (reset), @@ -200,9 +189,10 @@ module VX_bank #( assign dram_rsp_ready = !drsq_full; VX_generic_queue #( - .DATAW(`LINE_ADDR_WIDTH + $bits(dram_rsp_data)), - .SIZE(DRSQ_SIZE), - .BUFFERED(1) + .DATAW (`LINE_ADDR_WIDTH + $bits(dram_rsp_data)), + .SIZE (DRSQ_SIZE), + .BUFFERED (1), + .FASTRAM (1) ) dram_rsp_queue ( .clk (clk), .reset (reset), @@ -246,30 +236,30 @@ module VX_bank #( .CORE_TAG_WIDTH (CORE_TAG_WIDTH), .CORE_TAG_ID_BITS (CORE_TAG_ID_BITS) ) core_req_queue ( - .clk (clk), - .reset (reset), + .clk (clk), + .reset (reset), // Enqueue - .push (creq_push), - .tag_in (core_req_tag), - .valids_in (core_req_valid), - .rw_in (core_req_rw), - .byteen_in (core_req_byteen), - .addr_in (core_req_addr), - .writedata_in (core_req_data), + .push (creq_push), + .tag_in (core_req_tag), + .valids_in (core_req_valid), + .rw_in (core_req_rw), + .byteen_in (core_req_byteen), + .addr_in (core_req_addr), + .wdata_in (core_req_data), // Dequeue - .pop (creq_pop), - .tag_out (creq_tag_st0), - .tid_out (creq_tid_st0), - 
.rw_out (creq_rw_st0), - .byteen_out (creq_byteen_st0), - .addr_out (creq_addr_st0), - .writedata_out (creq_writeword_st0), + .pop (creq_pop), + .tag_out (creq_tag_st0), + .tid_out (creq_tid_st0), + .rw_out (creq_rw_st0), + .byteen_out (creq_byteen_st0), + .addr_out (creq_addr_st0), + .wdata_out (creq_writeword_st0), // States - .empty (creq_empty), - .full (creq_full) + .empty (creq_empty), + .full (creq_full) ); reg [$clog2(MSHR_SIZE+1)-1:0] mshr_pending_size; @@ -277,7 +267,7 @@ module VX_bank #( reg mshr_going_full; wire mshr_pop; wire mshr_valid_st0; - wire[`REQS_BITS-1:0] mshr_tid_st0; + wire [`REQS_BITS-1:0] mshr_tid_st0; wire [`LINE_ADDR_WIDTH-1:0] mshr_addr_st0; wire [`UP(`WORD_SELECT_WIDTH)-1:0] mshr_wsel_st0; wire [`WORD_WIDTH-1:0] mshr_writeword_st0; @@ -286,6 +276,7 @@ module VX_bank #( wire [WORD_SIZE-1:0] mshr_byteen_st0; wire mshr_is_snp_st0; wire mshr_snp_inv_st0; + wire mshr_pending_hazard_unqual_st0; wire is_fill_st0; wire is_mshr_st0; @@ -295,9 +286,11 @@ module VX_bank #( wire [`UP(`WORD_SELECT_WIDTH)-1:0] wsel_st0; wire [`WORD_WIDTH-1:0] writeword_st0; wire [`BANK_LINE_WIDTH-1:0] writedata_st0; - wire [`REQ_INST_META_WIDTH-1:0] inst_meta_st0; wire snp_inv_st0; - wire mshr_pending_hazard_unqual_st0; + wire [`REQ_TAG_WIDTH-1:0] tag_st0; + wire mem_rw_st0; + wire [WORD_SIZE-1:0] byteen_st0; + wire [`REQS_BITS-1:0] req_tid_st0; wire is_fill_st1; wire is_mshr_st1; @@ -306,32 +299,26 @@ module VX_bank #( wire [`LINE_ADDR_WIDTH-1:0] addr_st1; wire [`UP(`WORD_SELECT_WIDTH)-1:0] wsel_st1; wire [`WORD_WIDTH-1:0] writeword_st1; - wire [`REQ_INST_META_WIDTH-1:0] inst_meta_st1; wire [`BANK_LINE_WIDTH-1:0] writedata_st1; wire snp_inv_st1; - wire [`TAG_SELECT_BITS-1:0] readtag_st1; wire miss_st1; wire force_miss_st1; wire dirty_st1; - wire [WORD_SIZE-1:0] mem_byteen_st1; wire writeen_st1; - wire mem_rw_st1; -`DEBUG_BEGIN wire [`REQ_TAG_WIDTH-1:0] tag_st1; - wire [`REQS_BITS-1:0] tid_st1; -`DEBUG_END + wire mem_rw_st1; + wire [WORD_SIZE-1:0] byteen_st1; + 
wire [`REQS_BITS-1:0] req_tid_st1; wire valid_st2; wire [`UP(`WORD_SELECT_WIDTH)-1:0] wsel_st2; - wire [`WORD_WIDTH-1:0] writeword_st2; wire [`WORD_WIDTH-1:0] readword_st2; + wire [`WORD_WIDTH-1:0] writeword_st2; wire [`BANK_LINE_WIDTH-1:0] readdata_st2; wire [`BANK_LINE_WIDTH-1:0] writedata_st2; - wire [WORD_SIZE-1:0] mem_byteen_st2; wire dirty_st2; wire [BANK_LINE_SIZE-1:0] dirtyb_st2; - wire [`REQ_INST_META_WIDTH-1:0] inst_meta_st2; wire [`TAG_SELECT_BITS-1:0] readtag_st2; wire is_fill_st2; wire is_snp_st2; @@ -342,15 +329,22 @@ module VX_bank #( wire[`LINE_ADDR_WIDTH-1:0] addr_st2; wire writeen_st2; wire core_req_hit_st2; + wire incoming_fill_st2; + wire [`REQ_TAG_WIDTH-1:0] tag_st2; + wire mem_rw_st2; + wire [WORD_SIZE-1:0] byteen_st2; + wire [`REQS_BITS-1:0] req_tid_st2; wire valid_st3; wire is_mshr_st3; wire miss_st3; wire force_miss_st3; wire [`LINE_ADDR_WIDTH-1:0] addr_st3; - - wire core_req_hit_st1; - + wire [`REQ_TAG_WIDTH-1:0] tag_st3; + wire mem_rw_st3; + wire [WORD_SIZE-1:0] byteen_st3; + wire [`REQS_BITS-1:0] req_tid_st3; + wire mshr_push_stall; wire crsq_push_stall; wire dreq_push_stall; @@ -360,7 +354,7 @@ module VX_bank #( wire is_mshr_miss_st2 = valid_st2 && is_mshr_st2 && (miss_st2 || force_miss_st2); wire is_mshr_miss_st3 = valid_st3 && is_mshr_st3 && (miss_st3 || force_miss_st3); - wire creq_commit = valid_st1 && core_req_hit_st1 && !pipeline_stall; + wire creq_commit = valid_st2 && core_req_hit_st2 && !pipeline_stall; // determine which queue to pop next in piority order wire mshr_pop_unqual = mshr_valid_st0; @@ -383,7 +377,7 @@ module VX_bank #( mshr_going_full <= 0; end else begin mshr_pending_size <= mshr_pending_size_n; - mshr_going_full <= (mshr_pending_size_n == MSHR_SIZE); + mshr_going_full <= (mshr_pending_size_n == MSHR_SIZE); end end @@ -409,10 +403,25 @@ module VX_bank #( assign writedata_st0 = drsq_filldata_st0; - assign inst_meta_st0 = mshr_pop_unqual ? 
{`REQ_TAG_WIDTH'(mshr_tag_st0), mshr_rw_st0, mshr_byteen_st0, mshr_tid_st0} : - creq_pop_unqual ? {`REQ_TAG_WIDTH'(creq_tag_st0), creq_rw_st0, creq_byteen_st0, creq_tid_st0} : - sreq_pop_unqual ? {`REQ_TAG_WIDTH'(sreq_tag_st0), 1'b0, WORD_SIZE'(0), `REQS_BITS'(0)} : - 0; + assign tag_st0 = mshr_pop_unqual ? `REQ_TAG_WIDTH'(mshr_tag_st0) : + creq_pop_unqual ? `REQ_TAG_WIDTH'(creq_tag_st0) : + sreq_pop_unqual ? `REQ_TAG_WIDTH'(sreq_tag_st0) : + 0; + + assign mem_rw_st0 = mshr_pop_unqual ? mshr_rw_st0 : + creq_pop_unqual ? creq_rw_st0 : + sreq_pop_unqual ? 1'b0 : + 0; + + assign byteen_st0 = mshr_pop_unqual ? mshr_byteen_st0 : + creq_pop_unqual ? creq_byteen_st0 : + sreq_pop_unqual ? WORD_SIZE'(0) : + 0; + + assign req_tid_st0 = mshr_pop_unqual ? mshr_tid_st0 : + creq_pop_unqual ? creq_tid_st0 : + sreq_pop_unqual ? `REQS_BITS'(0) : + 0; assign is_snp_st0 = mshr_pop_unqual ? mshr_is_snp_st0 : sreq_pop_unqual ? 1 : @@ -428,9 +437,9 @@ module VX_bank #( `ifdef DBG_CACHE_REQ_INFO if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_pc_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0, debug_rw_st0, debug_byteen_st0, debug_tid_st0} = inst_meta_st0; + assign {debug_pc_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0} = tag_st0; end else begin - assign {debug_pc_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0, debug_rw_st0, debug_byteen_st0, debug_tid_st0} = 0; + assign {debug_pc_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0} = 0; end `endif @@ -443,27 +452,25 @@ if (DRAM_ENABLE) begin || (valid_st3 && (miss_st3 || force_miss_st3) && (addr_st3 == addr_st0)); VX_generic_register #( - .N(1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + `REQ_INST_META_WIDTH + 1 + `BANK_LINE_WIDTH), + .N(1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + 1 + `BANK_LINE_WIDTH + 1 + WORD_SIZE + `REQS_BITS + `REQ_TAG_WIDTH), .R(1) ) pipe_reg0 ( .clk (clk), .reset (reset), .stall (pipeline_stall), 
.flush (1'b0), - .data_in ({valid_st0, is_mshr_st0, is_snp_st0, snp_inv_st0, mshr_pending_hazard_st0, addr_st0, wsel_st0, writeword_st0, inst_meta_st0, is_fill_st0, writedata_st0}), - .data_out ({valid_st1, is_mshr_st1, is_snp_st1, snp_inv_st1, mshr_pending_hazard_st1, addr_st1, wsel_st1, writeword_st1, inst_meta_st1, is_fill_st1, writedata_st1}) + .data_in ({valid_st0, is_mshr_st0, is_snp_st0, snp_inv_st0, mshr_pending_hazard_st0, addr_st0, wsel_st0, writeword_st0, is_fill_st0, writedata_st0, mem_rw_st0, byteen_st0, req_tid_st0, tag_st0}), + .data_out ({valid_st1, is_mshr_st1, is_snp_st1, snp_inv_st1, mshr_pending_hazard_st1, addr_st1, wsel_st1, writeword_st1, is_fill_st1, writedata_st1, mem_rw_st1, byteen_st1, req_tid_st1, tag_st1}) ); `ifdef DBG_CACHE_REQ_INFO if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1, debug_rw_st1, debug_byteen_st1, debug_tid_st1} = inst_meta_st1; + assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1} = tag_st1; end else begin - assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1, debug_rw_st1, debug_byteen_st1, debug_tid_st1} = 0; + assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1} = 0; end `endif - assign {tag_st1, mem_rw_st1, mem_byteen_st1, tid_st1} = inst_meta_st1; - // force miss to ensure commit order when a new request has pending previous requests to same block // also force a miss for msrq requests when previous requests got a miss wire st2_pending_hazard_st1 = valid_st2 && (miss_st2 || force_miss_st2) && (addr_st2 == addr_st1); @@ -511,20 +518,22 @@ if (DRAM_ENABLE) begin .writeen_out (writeen_st1) ); - assign core_req_hit_st1 = !is_fill_st1 && !is_snp_st1 && !miss_st1 && !force_miss_st1; - assign misses = miss_st1; - + + wire core_req_hit_st1 = !is_fill_st1 && !is_snp_st1 && !miss_st1 && !force_miss_st1; + + wire incoming_fill_st1 = !drsq_empty && (addr_st1 == drsq_addr_st0); + 
VX_generic_register #( - .N(1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + `TAG_SELECT_BITS + 1 + `BANK_LINE_WIDTH + WORD_SIZE + `REQ_INST_META_WIDTH), + .N(1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + `TAG_SELECT_BITS + 1 + `BANK_LINE_WIDTH + 1 + WORD_SIZE + `REQS_BITS + `REQ_TAG_WIDTH), .R(1) ) pipe_reg1 ( .clk (clk), .reset (reset), .stall (pipeline_stall), .flush (1'b0), - .data_in ({valid_st1, core_req_hit_st1, is_mshr_st1, writeen_st1, force_miss_st1, dirty_st1, is_snp_st1, snp_inv_st1, is_fill_st1, addr_st1, wsel_st1, writeword_st1, readtag_st1, miss_st1, writedata_st1, mem_byteen_st1, inst_meta_st1}), - .data_out ({valid_st2, core_req_hit_st2, is_mshr_st2, writeen_st2, force_miss_st2, dirty_st2, is_snp_st2, snp_inv_st2, is_fill_st2, addr_st2, wsel_st2, writeword_st2, readtag_st2, miss_st2, writedata_st2, mem_byteen_st2, inst_meta_st2}) + .data_in ({valid_st1, incoming_fill_st1, core_req_hit_st1, is_mshr_st1, writeen_st1, force_miss_st1, dirty_st1, is_snp_st1, snp_inv_st1, is_fill_st1, addr_st1, wsel_st1, writeword_st1, readtag_st1, miss_st1, writedata_st1, mem_rw_st1, byteen_st1, req_tid_st1, tag_st1}), + .data_out ({valid_st2, incoming_fill_st2, core_req_hit_st2, is_mshr_st2, writeen_st2, force_miss_st2, dirty_st2, is_snp_st2, snp_inv_st2, is_fill_st2, addr_st2, wsel_st2, writeword_st2, readtag_st2, miss_st2, writedata_st2, mem_rw_st2, byteen_st2, req_tid_st2, tag_st2}) ); end else begin @@ -532,9 +541,7 @@ end else begin `UNUSED_VAR (mshr_pending_hazard_unqual_st0) `UNUSED_VAR (drsq_push) `UNUSED_VAR (addr_st0) - - assign {tag_st1, mem_rw_st1, mem_byteen_st1, tid_st1} = inst_meta_st1; - + assign is_fill_st1 = is_fill_st0; assign is_mshr_st1 = is_mshr_st0; assign is_snp_st1 = is_snp_st0; @@ -542,14 +549,17 @@ end else begin assign wsel_st1 = wsel_st0; assign writeword_st1= writeword_st0; assign writedata_st1= writedata_st0; - assign inst_meta_st1= 
inst_meta_st0; assign snp_inv_st1 = snp_inv_st0; assign addr_st1 = creq_addr_st0[`LINE_SELECT_ADDR_RNG]; assign dirty_st1 = 0; assign readtag_st1 = 0; assign miss_st1 = 0; - assign writeen_st1 = valid_st1 && mem_rw_st1; + assign writeen_st1 = mem_rw_st1; assign force_miss_st1 = 0; + assign tag_st1 = tag_st0; + assign mem_rw_st1 = mem_rw_st0; + assign byteen_st1 = byteen_st0; + assign req_tid_st1 = req_tid_st0; assign is_fill_st2 = is_fill_st1; assign is_mshr_st2 = is_mshr_st1; @@ -558,20 +568,19 @@ end else begin assign wsel_st2 = wsel_st1; assign writeword_st2= writeword_st1; assign writedata_st2= writedata_st1; - assign inst_meta_st2= inst_meta_st1; assign snp_inv_st2 = snp_inv_st1; assign addr_st2 = addr_st1; assign dirty_st2 = dirty_st1; - assign mem_byteen_st2 = mem_byteen_st1; assign readtag_st2 = readtag_st1; assign miss_st2 = miss_st1; assign writeen_st2 = writeen_st1; assign force_miss_st2 = force_miss_st1; + assign tag_st2 = tag_st1; + assign mem_rw_st2 = mem_rw_st1; + assign byteen_st2 = byteen_st1; + assign req_tid_st2 = req_tid_st1; - assign core_req_hit_st1 = 0; - assign core_req_hit_st2 = 0; - assign send_dwb_req_st2 = 0; - assign do_writeback_st2 = 0; + assign core_req_hit_st2 = 1; assign incoming_fill_st2 = 0; assign misses = 0; @@ -579,9 +588,9 @@ end `ifdef DBG_CACHE_REQ_INFO if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_pc_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2, debug_rw_st2, debug_byteen_st2, debug_tid_st2} = inst_meta_st2; + assign {debug_pc_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2} = tag_st2; end else begin - assign {debug_pc_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2, debug_rw_st2, debug_byteen_st2, debug_tid_st2} = 0; + assign {debug_pc_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2} = 0; end `endif @@ -613,7 +622,7 @@ end .writeen_in (writeen_st2), .is_fill_in (is_fill_st2), .wordsel_in (wsel_st2), - .byteen_in (mem_byteen_st2), + .byteen_in (byteen_st2), .writeword_in 
(writeword_st2), .writedata_in (writedata_st2), @@ -628,62 +637,58 @@ end wire [`WORD_WIDTH-1:0] readword_st3; wire [`BANK_LINE_WIDTH-1:0] readdata_st3; wire [BANK_LINE_SIZE-1:0] dirtyb_st3; - wire [`REQ_INST_META_WIDTH-1:0] inst_meta_st3; wire [`TAG_SELECT_BITS-1:0] readtag_st3; wire is_snp_st3; wire snp_inv_st3; - wire core_req_hit_st3; - wire send_dwb_req_st3; wire do_writeback_st3; wire incoming_fill_st3; + wire mshr_push_st3; + wire crsq_push_st3; + wire dreq_push_st3; + wire srsq_push_st3; - // check if a matching fill request is comming - wire incoming_fill_dfp_st2 = drsq_push && (addr_st2 == dram_rsp_addr); - wire incoming_fill_st0_st2 = !drsq_empty && (addr_st2 == drsq_addr_st0); - wire incoming_fill_st1_st2 = is_fill_st1 && (addr_st2 == addr_st1); - wire incoming_fill_st2 = incoming_fill_dfp_st2 - || incoming_fill_st0_st2 - || incoming_fill_st1_st2; + wire incoming_fill_qual_st2 = (!drsq_empty && (addr_st2 == drsq_addr_st0)) || incoming_fill_st2; + + wire do_fill_req_st2 = miss_st2 + && (!force_miss_st2 + || (is_mshr_st2 && addr_st2 != addr_st3)) + && !incoming_fill_qual_st2; - wire send_fill_req_st2 = miss_st2 - && (!force_miss_st2 - || (is_mshr_st2 && addr_st2 != addr_st3)) - && !incoming_fill_st2; + wire do_writeback_st2 = dirty_st2 + && (is_fill_st2 + || (!force_miss_st2 && is_snp_st2)); - wire do_writeback_st2 = dirty_st2 - && (is_fill_st2 - || (!force_miss_st2 && is_snp_st2)); + wire mshr_push_st2 = miss_st2 || force_miss_st2; - wire send_dwb_req_st2 = send_fill_req_st2 || do_writeback_st2; + wire crsq_push_st2 = core_req_hit_st2 && !mem_rw_st2; + + wire dreq_push_st2 = do_fill_req_st2 || do_writeback_st2; + + wire srsq_push_st2 = is_snp_st2 && !force_miss_st2; VX_generic_register #( - .N(1 + 1+ 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + `WORD_WIDTH + `BANK_LINE_WIDTH + `TAG_SELECT_BITS + 1 + 1 + BANK_LINE_SIZE + `REQ_INST_META_WIDTH), + .N(1 + 1+ 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + 
`UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + `TAG_SELECT_BITS + 1 + 1 + BANK_LINE_SIZE + 1 + WORD_SIZE + `WORD_WIDTH + `BANK_LINE_WIDTH + `REQS_BITS + `REQ_TAG_WIDTH), .R(1) ) pipe_reg2 ( .clk (clk), .reset (reset), .stall (pipeline_stall), .flush (1'b0), - .data_in ({valid_st2, core_req_hit_st2, send_dwb_req_st2, do_writeback_st2, incoming_fill_st2, force_miss_st2, is_mshr_st2, is_snp_st2, snp_inv_st2, addr_st2, wsel_st2, writeword_st2, readword_st2, readdata_st2, readtag_st2, miss_st2, dirtyb_st2, inst_meta_st2}), - .data_out ({valid_st3, core_req_hit_st3, send_dwb_req_st3, do_writeback_st3, incoming_fill_st3, force_miss_st3, is_mshr_st3, is_snp_st3, snp_inv_st3, addr_st3, wsel_st3, writeword_st3, readword_st3, readdata_st3, readtag_st3, miss_st3, dirtyb_st3, inst_meta_st3}) + .data_in ({valid_st2, mshr_push_st2, crsq_push_st2, dreq_push_st2, srsq_push_st2, do_writeback_st2, incoming_fill_qual_st2, force_miss_st2, is_mshr_st2, is_snp_st2, snp_inv_st2, addr_st2, wsel_st2, writeword_st2, readtag_st2, miss_st2, dirtyb_st2, mem_rw_st2, byteen_st2, readword_st2, readdata_st2, req_tid_st2, tag_st2}), + .data_out ({valid_st3, mshr_push_st3, crsq_push_st3, dreq_push_st3, srsq_push_st3, do_writeback_st3, incoming_fill_st3, force_miss_st3, is_mshr_st3, is_snp_st3, snp_inv_st3, addr_st3, wsel_st3, writeword_st3, readtag_st3, miss_st3, dirtyb_st3, mem_rw_st3, byteen_st3, readword_st3, readdata_st3, req_tid_st3, tag_st3}) ); `ifdef DBG_CACHE_REQ_INFO if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_pc_st3, debug_rd_st3, debug_wid_st3, debug_tagid_st3, debug_rw_st3, debug_byteen_st3, debug_tid_st3} = inst_meta_st3; + assign {debug_pc_st3, debug_rd_st3, debug_wid_st3, debug_tagid_st3} = tag_st3; end else begin - assign {debug_pc_st3, debug_rd_st3, debug_wid_st3, debug_tagid_st3, debug_rw_st3, debug_byteen_st3, debug_tid_st3} = 0; + assign {debug_pc_st3, debug_rd_st3, debug_wid_st3, debug_tagid_st3} = 0; end `endif // Enqueue to miss reserv if 
it's a valid miss - wire[`REQS_BITS-1:0] req_tid_st3; - wire[`REQ_TAG_WIDTH-1:0] req_tag_st3; - wire req_rw_st3; - wire[WORD_SIZE-1:0] req_byteen_st3; - - wire mshr_push_unqual = valid_st3 && (miss_st3 || force_miss_st3); + wire mshr_push_unqual = valid_st3 && mshr_push_st3; assign mshr_push_stall = 0; wire mshr_push = mshr_push_unqual @@ -696,7 +701,7 @@ end assert(!mshr_push || !mshr_full); // mmshr stall is detected before issuing new requests end - assign {req_tag_st3, req_rw_st3, req_byteen_st3, req_tid_st3} = inst_meta_st3; + wire incoming_fill_qual_st3 = (!drsq_empty && (addr_st3 == drsq_addr_st0)) || incoming_fill_st3; if (DRAM_ENABLE) begin @@ -707,7 +712,7 @@ end // push missed requests as 'ready' if it was a forced miss but actually had a hit // or the fill request is comming for the missed block - wire mshr_init_ready_state_st3 = valid_st3 && (!miss_st3 || incoming_fill_st3); + wire mshr_init_ready_state_st3 = valid_st3 && (!miss_st3 || incoming_fill_qual_st3); VX_miss_resrv #( .BANK_ID (BANK_ID), @@ -738,14 +743,7 @@ end // enqueue .enqueue_st3 (mshr_push), .enqueue_addr_st3 (addr_st3), - .enqueue_wsel_st3 (wsel_st3), - .enqueue_data_st3 (writeword_st3), - .enqueue_tid_st3 (req_tid_st3), - .enqueue_tag_st3 (req_tag_st3), - .enqueue_rw_st3 (req_rw_st3), - .enqueue_byteen_st3 (req_byteen_st3), - .enqueue_is_snp_st3 (is_snp_st3), - .enqueue_snp_inv_st3(snp_inv_st3), + .enqueue_data_st3 ({writeword_st3, req_tid_st3, tag_st3, mem_rw_st3, byteen_st3, wsel_st3, is_snp_st3, snp_inv_st3}), .enqueue_is_mshr_st3(is_mshr_st3), .enqueue_ready_st3 (mshr_init_ready_state_st3), .enqueue_full (mshr_full), @@ -759,14 +757,7 @@ end .schedule_st0 (mshr_pop), .dequeue_valid_st0 (mshr_valid_st0), .dequeue_addr_st0 (mshr_addr_st0), - .dequeue_wsel_st0 (mshr_wsel_st0), - .dequeue_data_st0 (mshr_writeword_st0), - .dequeue_tid_st0 (mshr_tid_st0), - .dequeue_tag_st0 (mshr_tag_st0), - .dequeue_rw_st0 (mshr_rw_st0), - .dequeue_byteen_st0 (mshr_byteen_st0), - .dequeue_is_snp_st0 
(mshr_is_snp_st0), - .dequeue_snp_inv_st0(mshr_snp_inv_st0), + .dequeue_data_st0 ({mshr_writeword_st0, mshr_tid_st0, mshr_tag_st0, mshr_rw_st0, mshr_byteen_st0, mshr_wsel_st0, mshr_is_snp_st0, mshr_snp_inv_st0}), .dequeue_st3 (mshr_dequeue_st3) ); end else begin @@ -775,7 +766,8 @@ end `UNUSED_VAR (wsel_st3) `UNUSED_VAR (writeword_st3) `UNUSED_VAR (snp_inv_st3) - `UNUSED_VAR (req_byteen_st3) + `UNUSED_VAR (mem_rw_st3) + `UNUSED_VAR (byteen_st3) `UNUSED_VAR (is_snp_st3) `UNUSED_VAR (incoming_fill_st3) assign mshr_pending_hazard_unqual_st0 = 0; @@ -796,7 +788,7 @@ end wire crsq_empty, crsq_full; - wire crsq_push_unqual = valid_st3 && core_req_hit_st3 && !req_rw_st3; + wire crsq_push_unqual = valid_st3 && crsq_push_st3; assign crsq_push_stall = crsq_push_unqual && crsq_full; wire crsq_push = crsq_push_unqual @@ -808,13 +800,14 @@ end wire crsq_pop = core_rsp_valid && core_rsp_ready; wire [`REQS_BITS-1:0] crsq_tid_st3 = req_tid_st3; - wire [CORE_TAG_WIDTH-1:0] crsq_tag_st3 = CORE_TAG_WIDTH'(req_tag_st3); + wire [CORE_TAG_WIDTH-1:0] crsq_tag_st3 = CORE_TAG_WIDTH'(tag_st3); wire [`WORD_WIDTH-1:0] crsq_data_st3 = readword_st3; VX_generic_queue #( - .DATAW(`REQS_BITS + CORE_TAG_WIDTH + `WORD_WIDTH), - .SIZE(CRSQ_SIZE), - .BUFFERED(1) + .DATAW (`REQS_BITS + CORE_TAG_WIDTH + `WORD_WIDTH), + .SIZE (CRSQ_SIZE), + .BUFFERED (1), + .FASTRAM (1) ) core_rsp_queue ( .clk (clk), .reset (reset), @@ -833,11 +826,11 @@ end wire dreq_empty, dreq_full; - wire dreq_push_unqual = valid_st3 && send_dwb_req_st3; - - assign dreq_push_stall = dreq_push_unqual && dreq_full; + wire dreq_push_unqual = valid_st3 && dreq_push_st3; + assign dreq_push_stall = dreq_push_unqual && dreq_full; wire dreq_push = dreq_push_unqual + && (do_writeback_st3 || !incoming_fill_qual_st3) && !dreq_full && !mshr_push_stall && !crsq_push_stall @@ -854,9 +847,10 @@ end if (DRAM_ENABLE) begin VX_generic_queue #( - .DATAW(1 + BANK_LINE_SIZE + `LINE_ADDR_WIDTH + `BANK_LINE_WIDTH), - .SIZE(DREQ_SIZE), - .BUFFERED(1) + 
.DATAW (1 + BANK_LINE_SIZE + `LINE_ADDR_WIDTH + `BANK_LINE_WIDTH), + .SIZE (DREQ_SIZE), + .BUFFERED (1), + .FASTRAM (1) ) dram_req_queue ( .clk (clk), .reset (reset), @@ -892,8 +886,7 @@ end wire srsq_empty, srsq_full; - wire srsq_push_unqual = valid_st3 && is_snp_st3 && !force_miss_st3; - + wire srsq_push_unqual = valid_st3 && srsq_push_st3; assign srsq_push_stall = srsq_push_unqual && srsq_full; wire srsq_push = srsq_push_unqual @@ -904,13 +897,14 @@ end wire srsq_pop = snp_rsp_valid && snp_rsp_ready; - wire [SNP_TAG_WIDTH-1:0] srsq_tag_st3 = SNP_TAG_WIDTH'(req_tag_st3); + wire [SNP_TAG_WIDTH-1:0] srsq_tag_st3 = SNP_TAG_WIDTH'(tag_st3); if (FLUSH_ENABLE) begin VX_generic_queue #( - .DATAW (SNP_TAG_WIDTH), - .SIZE (SRSQ_SIZE), - .BUFFERED(1) + .DATAW (SNP_TAG_WIDTH), + .SIZE (SRSQ_SIZE), + .BUFFERED (1), + .FASTRAM (1) ) snp_rsp_queue ( .clk (clk), .reset (reset), @@ -945,14 +939,14 @@ end `SCOPE_ASSIGN (valid_st1, valid_st1); `SCOPE_ASSIGN (valid_st2, valid_st2); `SCOPE_ASSIGN (valid_st3, valid_st3); - + `SCOPE_ASSIGN (is_fill_st0, is_fill_st0); + `SCOPE_ASSIGN (is_snp_st0, is_snp_st0); `SCOPE_ASSIGN (is_mshr_st0, is_mshr_st0); - - `SCOPE_ASSIGN (miss_st1, miss_st1); - `SCOPE_ASSIGN (dirty_st1, dirty_st1); + `SCOPE_ASSIGN (miss_st1, miss_st1); + `SCOPE_ASSIGN (dirty_st1, dirty_st1); `SCOPE_ASSIGN (force_miss_st1, force_miss_st1); + `SCOPE_ASSIGN (mshr_push, mshr_push); `SCOPE_ASSIGN (pipeline_stall, pipeline_stall); - `SCOPE_ASSIGN (addr_st0, `LINE_TO_BYTE_ADDR(addr_st0, BANK_ID)); `SCOPE_ASSIGN (addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID)); `SCOPE_ASSIGN (addr_st2, `LINE_TO_BYTE_ADDR(addr_st2, BANK_ID)); @@ -961,8 +955,8 @@ end `ifdef PERF_ENABLE assign perf_pipe_stall = pipeline_stall; assign perf_mshr_stall = mshr_going_full; - assign perf_read_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & !mem_rw_st1; - assign perf_write_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & mem_rw_st1; + assign perf_read_miss = !pipeline_stall & miss_st2 & !is_mshr_st2 
& !mem_rw_st2; + assign perf_write_miss = !pipeline_stall & miss_st2 & !is_mshr_st2 & mem_rw_st2; if (DRAM_ENABLE) begin assign perf_evict = dreq_push & do_writeback_st3 & !is_snp_st3; end else begin diff --git a/hw/rtl/cache/VX_bank_core_req_queue.v b/hw/rtl/cache/VX_bank_core_req_queue.v index 4fd6ae66..ba1b458f 100644 --- a/hw/rtl/cache/VX_bank_core_req_queue.v +++ b/hw/rtl/cache/VX_bank_core_req_queue.v @@ -22,7 +22,7 @@ module VX_bank_core_req_queue #( input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] addr_in, input wire [`CORE_REQ_TAG_COUNT-1:0] rw_in, input wire [NUM_REQS-1:0][WORD_SIZE-1:0] byteen_in, - input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] writedata_in, + input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] wdata_in, // Dequeue input wire pop, @@ -30,7 +30,7 @@ module VX_bank_core_req_queue #( output wire [`WORD_ADDR_WIDTH-1:0] addr_out, output wire rw_out, output wire [WORD_SIZE-1:0] byteen_out, - output wire [`WORD_WIDTH-1:0] writedata_out, + output wire [`WORD_WIDTH-1:0] wdata_out, output wire [`REQS_BITS-1:0] tid_out, // States @@ -43,7 +43,7 @@ module VX_bank_core_req_queue #( wire [`CORE_REQ_TAG_COUNT-1:0] q_rw; wire [NUM_REQS-1:0][WORD_SIZE-1:0] q_byteen; wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] q_addr; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] q_writedata; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] q_wdata; wire q_push; wire q_pop; wire q_empty; @@ -56,16 +56,17 @@ module VX_bank_core_req_queue #( end VX_generic_queue #( - .DATAW($bits(valids_in) + $bits(tag_in) + $bits(addr_in) + $bits(rw_in) + $bits(byteen_in) + $bits(writedata_in)), - .SIZE(CREQ_SIZE), - .BUFFERED(1) + .DATAW ($bits(valids_in) + $bits(tag_in) + $bits(addr_in) + $bits(rw_in) + $bits(byteen_in) + $bits(wdata_in)), + .SIZE (CREQ_SIZE), + .BUFFERED (1), + .FASTRAM (1) ) req_queue ( .clk (clk), .reset (reset), .push (q_push), .pop (q_pop), - .data_in ({valids_in, tag_in, addr_in, rw_in, byteen_in, writedata_in}), - .data_out ({q_valids, q_tag, q_addr, q_rw, q_byteen, q_writedata}), + .data_in 
({valids_in, tag_in, addr_in, rw_in, byteen_in, wdata_in}), + .data_out ({q_valids, q_tag, q_addr, q_rw, q_byteen, q_wdata}), .empty (q_empty), .full (q_full), `UNUSED_PIN (size) @@ -78,37 +79,42 @@ module VX_bank_core_req_queue #( reg [`WORD_ADDR_WIDTH-1:0] sel_addr, sel_addr_r; reg sel_rw, sel_rw_r; reg [WORD_SIZE-1:0] sel_byteen, sel_byteen_r; - reg [`WORD_WIDTH-1:0] sel_writedata, sel_writedata_r; + reg [`WORD_WIDTH-1:0] sel_wdata, sel_wdata_r; reg [$clog2(NUM_REQS+1)-1:0] q_valids_cnt_r; + wire [$clog2(NUM_REQS+1)-1:0] q_valids_cnt_n; wire [$clog2(NUM_REQS+1)-1:0] q_valids_cnt; - reg [NUM_REQS-1:0] pop_mask; - reg fast_track; - + reg [NUM_REQS-1:0] pop_mask; + reg fast_track; + wire fast_track_n; + + reg req_eop; // request end of packet + reg empty_r; + assign q_push = push; - assign q_pop = pop && (q_valids_cnt_r == 1 || q_valids_cnt_r == 2) && !fast_track; + assign q_pop = pop && req_eop; wire [NUM_REQS-1:0] requests = q_valids & ~pop_mask; always @(*) begin - sel_idx = 0; - sel_tag = 'x; - sel_addr = 'x; - sel_rw = 'x; - sel_byteen = 'x; - sel_writedata = 'x; + sel_idx = 0; + sel_tag = 'x; + sel_addr = 'x; + sel_rw = 'x; + sel_byteen = 'x; + sel_wdata = 'x; for (integer i = 0; i < NUM_REQS; i++) begin if (requests[i]) begin sel_idx = `REQS_BITS'(i); sel_addr = q_addr[i]; if (0 == CORE_TAG_ID_BITS) begin - sel_tag = q_tag[i]; - sel_rw = q_rw[i]; + sel_tag = q_tag[i]; + sel_rw = q_rw[i]; end - sel_byteen = q_byteen[i]; - sel_writedata = q_writedata[i]; + sel_byteen = q_byteen[i]; + sel_wdata = q_wdata[i]; break; end end @@ -121,33 +127,43 @@ module VX_bank_core_req_queue #( .count (q_valids_cnt) ); + assign fast_track_n = (!q_empty && (empty_r || (pop && fast_track))) ? 0 : + pop ? (q_valids_cnt_r == 2) : + fast_track; + + assign q_valids_cnt_n = (!q_empty && (empty_r || (pop && fast_track))) ? q_valids_cnt : + pop ? 
(q_valids_cnt_r - 1) : + q_valids_cnt_r; + always @(posedge clk) begin if (reset) begin pop_mask <= 0; fast_track <= 0; q_valids_cnt_r <= 0; + req_eop <= 0; + empty_r <= 1; end else begin if (!q_empty - && ((0 == q_valids_cnt_r) || (pop && fast_track))) begin - q_valids_cnt_r <= q_valids_cnt; - pop_mask <= (NUM_REQS'(1) << sel_idx); - fast_track <= 0; + && (empty_r || (pop && fast_track))) begin + pop_mask <= (NUM_REQS'(1) << sel_idx); end else if (pop) begin - q_valids_cnt_r <= q_valids_cnt_r - 1; - fast_track <= (q_valids_cnt_r == 2); - if (q_valids_cnt_r == 1 || q_valids_cnt_r == 2) begin + if (q_valids_cnt_r == 1 || q_valids_cnt_r == 2) begin pop_mask <= 0; end else begin pop_mask[sel_idx] <= 1; end end + q_valids_cnt_r <= q_valids_cnt_n; + fast_track <= fast_track_n; + req_eop <= (q_valids_cnt_n == 1 || q_valids_cnt_n == 2) && !fast_track_n; + empty_r <= (0 == q_valids_cnt_n); end - if ((0 == q_valids_cnt_r) || pop) begin - sel_idx_r <= sel_idx; - sel_byteen_r <= sel_byteen; - sel_addr_r <= sel_addr; - sel_writedata_r <= sel_writedata; + if (empty_r || pop) begin + sel_idx_r <= sel_idx; + sel_byteen_r <= sel_byteen; + sel_addr_r <= sel_addr; + sel_wdata_r <= sel_wdata; end end @@ -155,45 +171,45 @@ module VX_bank_core_req_queue #( `UNUSED_VAR (sel_tag) `UNUSED_VAR (sel_rw) always @(posedge clk) begin - if ((0 == q_valids_cnt_r) || pop) begin + if (empty_r || pop) begin sel_tag_r <= q_tag; sel_rw_r <= q_rw; end end end else begin always @(posedge clk) begin - if ((0 == q_valids_cnt_r) || pop) begin + if (empty_r || pop) begin sel_tag_r <= sel_tag; sel_rw_r <= sel_rw; end end end - assign tag_out = sel_tag_r; - assign addr_out = sel_addr_r; - assign rw_out = sel_rw_r; - assign byteen_out = sel_byteen_r; - assign writedata_out = sel_writedata_r; - assign tid_out = sel_idx_r; + assign tag_out = sel_tag_r; + assign addr_out = sel_addr_r; + assign rw_out = sel_rw_r; + assign byteen_out = sel_byteen_r; + assign wdata_out = sel_wdata_r; + assign tid_out = sel_idx_r; - 
assign empty = (0 == q_valids_cnt_r); - assign full = q_full; + assign full = q_full; + assign empty = empty_r; end else begin `UNUSED_VAR (q_valids) - assign q_push = push; - assign q_pop = pop; + assign q_push = push; + assign q_pop = pop; - assign tag_out = q_tag; - assign addr_out = q_addr; - assign rw_out = q_rw; - assign byteen_out = q_byteen; - assign writedata_out = q_writedata; - assign tid_out = 0; + assign tag_out = q_tag; + assign addr_out = q_addr; + assign rw_out = q_rw; + assign byteen_out = q_byteen; + assign wdata_out = q_wdata; + assign tid_out = 0; - assign empty = q_empty; - assign full = q_full; + assign empty = q_empty; + assign full = q_full; end endmodule \ No newline at end of file diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 3127420e..7a7444e1 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -378,9 +378,9 @@ module VX_cache #( end VX_stream_arbiter #( - .NUM_REQS (NUM_BANKS), - .DATAW (`DRAM_ADDR_WIDTH + 1 + BANK_LINE_SIZE + `BANK_LINE_WIDTH), - .OUT_BUFFER (NUM_BANKS >= 4) + .NUM_REQS (NUM_BANKS), + .DATAW (`DRAM_ADDR_WIDTH + 1 + BANK_LINE_SIZE + `BANK_LINE_WIDTH), + .BUFFERED (1) ) dram_req_arb ( .clk (clk), .reset (reset), @@ -408,9 +408,9 @@ module VX_cache #( if (FLUSH_ENABLE) begin VX_stream_arbiter #( - .NUM_REQS (NUM_BANKS), - .DATAW (SNP_TAG_WIDTH), - .OUT_BUFFER (NUM_BANKS >= 4) + .NUM_REQS (NUM_BANKS), + .DATAW (SNP_TAG_WIDTH), + .BUFFERED (1) ) snp_rsp_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/cache/VX_cache_config.vh b/hw/rtl/cache/VX_cache_config.vh index e37cd912..15b18010 100644 --- a/hw/rtl/cache/VX_cache_config.vh +++ b/hw/rtl/cache/VX_cache_config.vh @@ -15,7 +15,7 @@ `define REQ_INST_META_WIDTH (`REQ_TAG_WIDTH + 1 + WORD_SIZE + `REQS_BITS) // data metadata word_sel is_snp snp_inv -`define MSHR_DATA_WIDTH (`WORD_WIDTH + `REQ_INST_META_WIDTH + `UP(`WORD_SELECT_WIDTH) + 1 + 1) +`define MSHR_DATA_WIDTH (`WORD_WIDTH + `REQ_INST_META_WIDTH + `UP(`WORD_SELECT_WIDTH) + 1 
+ 1) `define BANK_BITS `LOG2UP(NUM_BANKS) diff --git a/hw/rtl/cache/VX_data_access.v b/hw/rtl/cache/VX_data_access.v index f4dd1100..56788607 100644 --- a/hw/rtl/cache/VX_data_access.v +++ b/hw/rtl/cache/VX_data_access.v @@ -40,25 +40,25 @@ module VX_data_access #( `IGNORE_WARNINGS_END input wire writeen_in, input wire is_fill_in, - input wire[`WORD_WIDTH-1:0] writeword_in, - input wire[`BANK_LINE_WIDTH-1:0] writedata_in, - input wire[WORD_SIZE-1:0] byteen_in, - input wire[`UP(`WORD_SELECT_WIDTH)-1:0] wordsel_in, + input wire [`WORD_WIDTH-1:0] writeword_in, + input wire [`BANK_LINE_WIDTH-1:0] writedata_in, + input wire [WORD_SIZE-1:0] byteen_in, + input wire [`UP(`WORD_SELECT_WIDTH)-1:0] wordsel_in, // Outputs output wire[`WORD_WIDTH-1:0] readword_out, - output wire[`BANK_LINE_WIDTH-1:0] readdata_out, - output wire[BANK_LINE_SIZE-1:0] dirtyb_out + output wire [`BANK_LINE_WIDTH-1:0] readdata_out, + output wire [BANK_LINE_SIZE-1:0] dirtyb_out ); - wire[BANK_LINE_SIZE-1:0] read_dirtyb_out; - wire[`BANK_LINE_WIDTH-1:0] read_data; + wire [BANK_LINE_SIZE-1:0] read_dirtyb_out; + wire [`BANK_LINE_WIDTH-1:0] read_data; - wire[`BANK_LINE_WORDS-1:0][WORD_SIZE-1:0] byte_enable; + wire [`BANK_LINE_WORDS-1:0][WORD_SIZE-1:0] byte_enable; wire write_enable; - wire[`BANK_LINE_WIDTH-1:0] write_data; + wire [`BANK_LINE_WIDTH-1:0] write_data; - wire[`LINE_SELECT_BITS-1:0] addrline = addr_in[`LINE_SELECT_BITS-1:0]; + wire [`LINE_SELECT_BITS-1:0] addrline = addr_in[`LINE_SELECT_BITS-1:0]; VX_data_store #( .CACHE_SIZE (CACHE_SIZE), @@ -68,7 +68,6 @@ module VX_data_access #( .WRITE_ENABLE (WRITE_ENABLE) ) data_store ( .clk (clk), - .reset (reset), .read_addr (addrline), @@ -81,7 +80,7 @@ module VX_data_access #( .write_addr (addrline), .write_data (write_data) ); - + if (`WORD_SELECT_WIDTH != 0) begin wire [`WORD_WIDTH-1:0] readword = read_data[wordsel_in * `WORD_WIDTH +: `WORD_WIDTH]; for (genvar i = 0; i < WORD_SIZE; i++) begin @@ -97,16 +96,12 @@ module VX_data_access #( wire word_sel 
= (`WORD_SELECT_WIDTH == 0) || (wordsel_in == `UP(`WORD_SELECT_WIDTH)'(i)); assign byte_enable[i] = is_fill_in ? {WORD_SIZE{1'b1}} : - word_sel ? byteen_in : - {WORD_SIZE{1'b0}}; + word_sel ? byteen_in : {WORD_SIZE{1'b0}}; assign write_data[i * `WORD_WIDTH +: `WORD_WIDTH] = is_fill_in ? writedata_in[i * `WORD_WIDTH +: `WORD_WIDTH] : writeword_in; end - assign write_enable = valid_in - && writeen_in - && !stall; - + assign write_enable = valid_in && writeen_in && !stall; assign dirtyb_out = read_dirtyb_out; assign readdata_out = read_data; diff --git a/hw/rtl/cache/VX_data_store.v b/hw/rtl/cache/VX_data_store.v index a3f4f4f3..6593de29 100644 --- a/hw/rtl/cache/VX_data_store.v +++ b/hw/rtl/cache/VX_data_store.v @@ -46,7 +46,6 @@ module VX_data_store #( .DATAW(BANK_LINE_SIZE * 8), .SIZE(`BANK_LINE_COUNT), .BYTEENW(BANK_LINE_SIZE), - .BUFFERED(0), .RWCHECK(1) ) data ( .clk(clk), diff --git a/hw/rtl/cache/VX_miss_resrv.v b/hw/rtl/cache/VX_miss_resrv.v index e697f807..25556ec0 100644 --- a/hw/rtl/cache/VX_miss_resrv.v +++ b/hw/rtl/cache/VX_miss_resrv.v @@ -39,36 +39,22 @@ module VX_miss_resrv #( // enqueue input wire enqueue_st3, - input wire[`LINE_ADDR_WIDTH-1:0] enqueue_addr_st3, - input wire[`UP(`WORD_SELECT_WIDTH)-1:0] enqueue_wsel_st3, - input wire[`WORD_WIDTH-1:0] enqueue_data_st3, - input wire[`REQS_BITS-1:0] enqueue_tid_st3, - input wire[`REQ_TAG_WIDTH-1:0] enqueue_tag_st3, - input wire enqueue_rw_st3, - input wire[WORD_SIZE-1:0] enqueue_byteen_st3, - input wire enqueue_is_snp_st3, - input wire enqueue_snp_inv_st3, + input wire [`LINE_ADDR_WIDTH-1:0] enqueue_addr_st3, + input wire [`MSHR_DATA_WIDTH-1:0] enqueue_data_st3, input wire enqueue_is_mshr_st3, input wire enqueue_ready_st3, output wire enqueue_full, // fill input wire update_ready_st0, - input wire[`LINE_ADDR_WIDTH-1:0] addr_st0, + input wire [`LINE_ADDR_WIDTH-1:0] addr_st0, output wire pending_hazard_st0, // dequeue input wire schedule_st0, output wire dequeue_valid_st0, - output 
wire[`LINE_ADDR_WIDTH-1:0] dequeue_addr_st0, - output wire[`UP(`WORD_SELECT_WIDTH)-1:0] dequeue_wsel_st0, - output wire[`WORD_WIDTH-1:0] dequeue_data_st0, - output wire[`REQS_BITS-1:0] dequeue_tid_st0, - output wire[`REQ_TAG_WIDTH-1:0] dequeue_tag_st0, - output wire dequeue_rw_st0, - output wire[WORD_SIZE-1:0] dequeue_byteen_st0, - output wire dequeue_is_snp_st0, - output wire dequeue_snp_inv_st0, + output wire [`LINE_ADDR_WIDTH-1:0] dequeue_addr_st0, + output wire [`MSHR_DATA_WIDTH-1:0] dequeue_data_st0, input wire dequeue_st3 ); reg [`LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0]; @@ -76,8 +62,7 @@ module VX_miss_resrv #( reg [MSHR_SIZE-1:0] valid_table; reg [MSHR_SIZE-1:0] ready_table; reg [`LOG2UP(MSHR_SIZE)-1:0] schedule_ptr, restore_ptr; - reg [`LOG2UP(MSHR_SIZE)-1:0] head_ptr; - reg [`LOG2UP(MSHR_SIZE)-1:0] tail_ptr; + reg [`LOG2UP(MSHR_SIZE)-1:0] head_ptr, tail_ptr; reg [`LOG2UP(MSHR_SIZE+1)-1:0] size; assign enqueue_full = (size == $bits(size)'(MSHR_SIZE)); @@ -151,8 +136,6 @@ module VX_miss_resrv #( VX_dp_ram #( .DATAW(`MSHR_DATA_WIDTH), .SIZE(MSHR_SIZE), - .BYTEENW(1), - .BUFFERED(0), .RWCHECK(1) ) datatable ( .clk(clk), @@ -161,8 +144,8 @@ module VX_miss_resrv #( .wren(mshr_push), .byteen(1'b1), .rden(1'b1), - .din({enqueue_data_st3, enqueue_tid_st3, enqueue_tag_st3, enqueue_rw_st3, enqueue_byteen_st3, enqueue_wsel_st3, enqueue_is_snp_st3, enqueue_snp_inv_st3}), - .dout({dequeue_data_st0, dequeue_tid_st0, dequeue_tag_st0, dequeue_rw_st0, dequeue_byteen_st0, dequeue_wsel_st0, dequeue_is_snp_st0, dequeue_snp_inv_st0}) + .din(enqueue_data_st3), + .dout(dequeue_data_st0) ); `ifdef DBG_PRINT_CACHE_MSHR diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index 27268895..1b273539 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -7,7 +7,8 @@ module VX_snp_forwarder #( parameter NUM_REQS = 1, parameter SREQ_SIZE = 1, parameter TAG_IN_WIDTH = 1, - parameter TAG_OUT_WIDTH = `LOG2UP(SREQ_SIZE) + 
parameter TAG_OUT_WIDTH = `LOG2UP(SREQ_SIZE), + parameter BUFFERED = 0 ) ( input wire clk, input wire reset, @@ -23,7 +24,7 @@ module VX_snp_forwarder #( output wire snp_rsp_valid, output wire [SRC_ADDR_WIDTH-1:0] snp_rsp_addr, output wire snp_rsp_inv, - output wire [TAG_IN_WIDTH-1:0] snp_rsp_tag, + output wire [TAG_IN_WIDTH-1:0] snp_rsp_tag, input wire snp_rsp_ready, // Snoop Forwarding out @@ -45,6 +46,11 @@ module VX_snp_forwarder #( if (NUM_REQS > 1) begin reg [REQ_QUAL_BITS:0] pending_cntrs [SREQ_SIZE-1:0]; + + wire [TAG_IN_WIDTH-1:0] snp_rsp_tag_unqual; + wire [SRC_ADDR_WIDTH-1:0] snp_rsp_addr_unqual; + wire snp_rsp_inv_unqual; + wire snp_rsp_ready_unqual; wire [TAG_OUT_WIDTH-1:0] sfq_write_addr, sfq_read_addr; wire sfq_full; @@ -52,30 +58,31 @@ module VX_snp_forwarder #( wire [TAG_OUT_WIDTH-1:0] fwdin_tag; wire fwdin_valid; - wire fwdin_ready = snp_rsp_ready || (1 != pending_cntrs[sfq_read_addr]); + wire fwdin_ready = snp_rsp_ready_unqual || (1 != pending_cntrs[sfq_read_addr]); wire fwdin_fire = fwdin_valid && fwdin_ready; - assign snp_rsp_valid = fwdin_valid && (1 == pending_cntrs[sfq_read_addr]); + wire snp_rsp_valid_unqual = fwdin_valid && (1 == pending_cntrs[sfq_read_addr]); assign sfq_read_addr = fwdin_tag; wire sfq_acquire = snp_req_valid && snp_req_ready; - wire sfq_release = snp_rsp_valid && snp_rsp_ready; + wire sfq_release = snp_rsp_valid_unqual && snp_rsp_ready_unqual; VX_cam_buffer #( - .DATAW (SRC_ADDR_WIDTH + 1 + TAG_IN_WIDTH), - .SIZE (SREQ_SIZE) + .DATAW (SRC_ADDR_WIDTH + 1 + TAG_IN_WIDTH), + .SIZE (SREQ_SIZE), + .FASTRAM (1) ) req_metadata_buf ( - .clk (clk), - .reset (reset), - .write_addr (sfq_write_addr), - .acquire_slot (sfq_acquire), - .read_addr (sfq_read_addr), - .write_data ({snp_req_addr, snp_req_inv, snp_req_tag}), - .read_data ({snp_rsp_addr, snp_rsp_inv, snp_rsp_tag}), - .release_addr (sfq_read_addr), - .release_slot (sfq_release), - .full (sfq_full) + .clk (clk), + .reset (reset), + .write_addr (sfq_write_addr), + .acquire_slot 
(sfq_acquire), + .read_addr (sfq_read_addr), + .write_data ({snp_req_tag, snp_req_addr, snp_req_inv}), + .read_data ({snp_rsp_tag_unqual, snp_rsp_addr_unqual, snp_rsp_inv_unqual}), + .release_addr (sfq_read_addr), + .release_slot (sfq_release), + .full (sfq_full) ); wire fwdout_valid; @@ -115,21 +122,21 @@ module VX_snp_forwarder #( fwdout_tag_r <= sfq_write_addr; end end - assign fwdout_valid = dispatch_hold_r || (snp_req_valid && !sfq_full); - assign fwdout_tag = dispatch_hold_r ? fwdout_tag_r : sfq_write_addr; - assign fwdout_addr = dispatch_hold_r ? fwdout_addr_r : {snp_req_addr, ADDR_DIFF'(0)}; - assign fwdout_inv = dispatch_hold_r ? fwdout_inv_r : snp_req_inv; - assign dispatch_hold= dispatch_hold_r; + assign fwdout_valid = dispatch_hold_r || (snp_req_valid && !sfq_full); + assign fwdout_tag = dispatch_hold_r ? fwdout_tag_r : sfq_write_addr; + assign fwdout_addr = dispatch_hold_r ? fwdout_addr_r : {snp_req_addr, ADDR_DIFF'(0)}; + assign fwdout_inv = dispatch_hold_r ? fwdout_inv_r : snp_req_inv; + assign dispatch_hold = dispatch_hold_r; end else begin - assign fwdout_valid = snp_req_valid && !sfq_full; - assign fwdout_tag = sfq_write_addr; - assign fwdout_addr = snp_req_addr; - assign fwdout_inv = snp_req_inv; - assign dispatch_hold= 1'b0; + assign fwdout_valid = snp_req_valid && !sfq_full; + assign fwdout_tag = sfq_write_addr; + assign fwdout_addr = snp_req_addr; + assign fwdout_inv = snp_req_inv; + assign dispatch_hold = 1'b0; end always @(posedge clk) begin - if (sfq_acquire) begin + if (sfq_acquire) begin pending_cntrs[sfq_write_addr] <= NUM_REQUESTS_QUAL; end if (fwdin_fire) begin @@ -143,7 +150,7 @@ module VX_snp_forwarder #( for (genvar i = 0; i < NUM_REQS; i++) begin VX_skid_buffer #( .DATAW (DST_ADDR_WIDTH + 1 + TAG_OUT_WIDTH), - .PASSTHRU (NUM_REQS >= 4) + .PASSTHRU (!BUFFERED) ) fwdout_buffer ( .clk (clk), .reset (reset), @@ -171,19 +178,31 @@ module VX_snp_forwarder #( assign snp_req_ready = fwdout_ready && !sfq_full && !dispatch_hold; 
VX_stream_arbiter #( - .NUM_REQS (NUM_REQS), - .DATAW (TAG_OUT_WIDTH), - .IN_BUFFER (NUM_REQS >= 4), - .OUT_BUFFER (NUM_REQS >= 4) + .NUM_REQS (NUM_REQS), + .DATAW (TAG_OUT_WIDTH) ) snp_fwdin_arb ( - .clk (clk), - .reset (reset), - .valid_in (snp_fwdin_valid), - .data_in (snp_fwdin_tag), - .ready_in (snp_fwdin_ready), - .valid_out (fwdin_valid), - .data_out (fwdin_tag), - .ready_out (fwdin_ready) + .clk (clk), + .reset (reset), + .valid_in (snp_fwdin_valid), + .data_in (snp_fwdin_tag), + .ready_in (snp_fwdin_ready), + .valid_out (fwdin_valid), + .data_out (fwdin_tag), + .ready_out (fwdin_ready) + ); + + VX_skid_buffer #( + .DATAW (TAG_IN_WIDTH + SRC_ADDR_WIDTH + 1), + .PASSTHRU (!BUFFERED) + ) rsp_buffer ( + .clk (clk), + .reset (reset), + .valid_in (snp_rsp_valid_unqual), + .data_in ({snp_rsp_tag_unqual, snp_rsp_addr_unqual, snp_rsp_inv_unqual}), + .ready_in (snp_rsp_ready_unqual), + .valid_out (snp_rsp_valid), + .data_out ({snp_rsp_tag, snp_rsp_addr, snp_rsp_inv}), + .ready_out (snp_rsp_ready) ); `ifdef DBG_PRINT_CACHE_SNP diff --git a/hw/rtl/cache/VX_tag_access.v b/hw/rtl/cache/VX_tag_access.v index ad593fdd..7050403e 100644 --- a/hw/rtl/cache/VX_tag_access.v +++ b/hw/rtl/cache/VX_tag_access.v @@ -54,7 +54,7 @@ module VX_tag_access #( wire read_valid; wire read_dirty; - wire[`TAG_SELECT_BITS-1:0] read_tag; + wire [`TAG_SELECT_BITS-1:0] read_tag; wire do_fill; wire do_write; diff --git a/hw/rtl/cache/VX_tag_store.v b/hw/rtl/cache/VX_tag_store.v index e5dc1d16..20d49d03 100644 --- a/hw/rtl/cache/VX_tag_store.v +++ b/hw/rtl/cache/VX_tag_store.v @@ -48,8 +48,6 @@ module VX_tag_store #( VX_dp_ram #( .DATAW(`TAG_SELECT_BITS), .SIZE(`BANK_LINE_COUNT), - .BYTEENW(1), - .BUFFERED(0), .RWCHECK(1) ) tags ( .clk(clk), diff --git a/hw/rtl/interfaces/VX_alu_req_if.v b/hw/rtl/interfaces/VX_alu_req_if.v index d940cc7a..3f507f81 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.v +++ b/hw/rtl/interfaces/VX_alu_req_if.v @@ -6,7 +6,6 @@ interface VX_alu_req_if (); wire valid; - wire 
[`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,8 +19,7 @@ interface VX_alu_req_if (); wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; wire [`NR_BITS-1:0] rd; - wire wb; - + wire wb; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cache_core_rsp_if.v b/hw/rtl/interfaces/VX_cache_core_rsp_if.v index 22150e90..265d19cd 100644 --- a/hw/rtl/interfaces/VX_cache_core_rsp_if.v +++ b/hw/rtl/interfaces/VX_cache_core_rsp_if.v @@ -10,12 +10,10 @@ interface VX_cache_core_rsp_if #( parameter CORE_TAG_ID_BITS = 0 ) (); - wire [NUM_REQS-1:0] valid; - - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] data; + wire [NUM_REQS-1:0] valid; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] data; wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] tag; - - wire ready; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cache_dram_req_if.v b/hw/rtl/interfaces/VX_cache_dram_req_if.v index d02c1f57..5591bf9f 100644 --- a/hw/rtl/interfaces/VX_cache_dram_req_if.v +++ b/hw/rtl/interfaces/VX_cache_dram_req_if.v @@ -9,14 +9,12 @@ interface VX_cache_dram_req_if #( parameter DRAM_TAG_WIDTH = 1 ) (); - wire valid; - + wire valid; wire rw; wire [(DRAM_LINE_WIDTH/8)-1:0] byteen; wire [DRAM_ADDR_WIDTH-1:0] addr; wire [DRAM_LINE_WIDTH-1:0] data; - wire [DRAM_TAG_WIDTH-1:0] tag; - + wire [DRAM_TAG_WIDTH-1:0] tag; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cache_dram_rsp_if.v b/hw/rtl/interfaces/VX_cache_dram_rsp_if.v index c7b36eca..2a3d15e0 100644 --- a/hw/rtl/interfaces/VX_cache_dram_rsp_if.v +++ b/hw/rtl/interfaces/VX_cache_dram_rsp_if.v @@ -8,12 +8,10 @@ interface VX_cache_dram_rsp_if #( parameter DRAM_TAG_WIDTH = 1 ) (); - wire valid; - - wire [DRAM_LINE_WIDTH-1:0] data; - wire [DRAM_TAG_WIDTH-1:0] tag; - - wire ready; + wire valid; + wire [DRAM_LINE_WIDTH-1:0] data; + wire [DRAM_TAG_WIDTH-1:0] tag; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cache_snp_req_if.v b/hw/rtl/interfaces/VX_cache_snp_req_if.v index 
99fa0cf1..bc154568 100644 --- a/hw/rtl/interfaces/VX_cache_snp_req_if.v +++ b/hw/rtl/interfaces/VX_cache_snp_req_if.v @@ -8,13 +8,11 @@ interface VX_cache_snp_req_if #( parameter SNP_TAG_WIDTH = 0 ) (); - wire valid; - - wire [DRAM_ADDR_WIDTH-1:0] addr; - wire invalidate; - wire [SNP_TAG_WIDTH-1:0] tag; - - wire ready; + wire valid; + wire [DRAM_ADDR_WIDTH-1:0] addr; + wire invalidate; + wire [SNP_TAG_WIDTH-1:0] tag; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cache_snp_rsp_if.v b/hw/rtl/interfaces/VX_cache_snp_rsp_if.v index d1b619ce..c9235ebe 100644 --- a/hw/rtl/interfaces/VX_cache_snp_rsp_if.v +++ b/hw/rtl/interfaces/VX_cache_snp_rsp_if.v @@ -7,10 +7,8 @@ interface VX_cache_snp_rsp_if #( parameter SNP_TAG_WIDTH = 0 ) (); - wire valid; - + wire valid; wire [SNP_TAG_WIDTH-1:0] tag; - wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_cmt_to_csr_if.v b/hw/rtl/interfaces/VX_cmt_to_csr_if.v index 366bde1e..d805f5b4 100644 --- a/hw/rtl/interfaces/VX_cmt_to_csr_if.v +++ b/hw/rtl/interfaces/VX_cmt_to_csr_if.v @@ -5,7 +5,7 @@ interface VX_cmt_to_csr_if (); - wire valid; + wire valid; wire [$clog2(3*`NUM_THREADS+1)-1:0] commit_size; endinterface diff --git a/hw/rtl/interfaces/VX_commit_if.v b/hw/rtl/interfaces/VX_commit_if.v index 1108e811..273408fe 100644 --- a/hw/rtl/interfaces/VX_commit_if.v +++ b/hw/rtl/interfaces/VX_commit_if.v @@ -6,14 +6,12 @@ interface VX_commit_if (); wire valid; - wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; wire [`NUM_THREADS-1:0][31:0] data; wire [`NR_BITS-1:0] rd; wire wb; - wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_csr_io_req_if.v b/hw/rtl/interfaces/VX_csr_io_req_if.v index 3b4d4806..225fcd1f 100644 --- a/hw/rtl/interfaces/VX_csr_io_req_if.v +++ b/hw/rtl/interfaces/VX_csr_io_req_if.v @@ -5,12 +5,10 @@ interface VX_csr_io_req_if (); - wire valid; - + wire valid; wire [`CSR_ADDR_BITS-1:0] addr; wire rw; wire [31:0] data; - wire ready; endinterface diff --git 
a/hw/rtl/interfaces/VX_csr_io_rsp_if.v b/hw/rtl/interfaces/VX_csr_io_rsp_if.v index 2183edd7..333894e3 100644 --- a/hw/rtl/interfaces/VX_csr_io_rsp_if.v +++ b/hw/rtl/interfaces/VX_csr_io_rsp_if.v @@ -5,10 +5,8 @@ interface VX_csr_io_rsp_if (); - wire valid; - + wire valid; wire [31:0] data; - wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_csr_pipe_req_if.v b/hw/rtl/interfaces/VX_csr_pipe_req_if.v index 16e368a2..7cfa03bb 100644 --- a/hw/rtl/interfaces/VX_csr_pipe_req_if.v +++ b/hw/rtl/interfaces/VX_csr_pipe_req_if.v @@ -6,7 +6,6 @@ interface VX_csr_pipe_req_if (); wire valid; - wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -15,8 +14,7 @@ interface VX_csr_pipe_req_if (); wire [31:0] csr_mask; wire [`NR_BITS-1:0] rd; wire wb; - wire is_io; - + wire is_io; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_csr_req_if.v b/hw/rtl/interfaces/VX_csr_req_if.v index 9bc17668..ae59ede3 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.v +++ b/hw/rtl/interfaces/VX_csr_req_if.v @@ -6,7 +6,6 @@ interface VX_csr_req_if (); wire valid; - wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -16,8 +15,7 @@ interface VX_csr_req_if (); wire rs2_is_imm; wire [`NR_BITS-1:0] rs1; wire [`NR_BITS-1:0] rd; - wire wb; - + wire wb; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v index b864efa9..44bf00bc 100644 --- a/hw/rtl/interfaces/VX_decode_if.v +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -6,7 +6,6 @@ interface VX_decode_if (); wire valid; - wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -21,8 +20,7 @@ interface VX_decode_if (); wire [31:0] imm; wire rs1_is_PC; wire rs2_is_imm; - wire [`NUM_REGS-1:0] used_regs; - + wire [`NUM_REGS-1:0] used_regs; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_fpu_req_if.v b/hw/rtl/interfaces/VX_fpu_req_if.v index 369a5cea..a569e059 100644 --- a/hw/rtl/interfaces/VX_fpu_req_if.v +++ 
b/hw/rtl/interfaces/VX_fpu_req_if.v @@ -9,8 +9,7 @@ interface VX_fpu_req_if (); - wire valid; - + wire valid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,8 +19,7 @@ interface VX_fpu_req_if (); wire [`NUM_THREADS-1:0][31:0] rs2_data; wire [`NUM_THREADS-1:0][31:0] rs3_data; wire [`NR_BITS-1:0] rd; - wire wb; - + wire wb; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_ifetch_req_if.v b/hw/rtl/interfaces/VX_ifetch_req_if.v index 923209a2..b99ed5da 100644 --- a/hw/rtl/interfaces/VX_ifetch_req_if.v +++ b/hw/rtl/interfaces/VX_ifetch_req_if.v @@ -5,12 +5,10 @@ interface VX_ifetch_req_if (); - wire valid; - + wire valid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; - wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_ifetch_rsp_if.v b/hw/rtl/interfaces/VX_ifetch_rsp_if.v index 8f6c38ae..4991f462 100644 --- a/hw/rtl/interfaces/VX_ifetch_rsp_if.v +++ b/hw/rtl/interfaces/VX_ifetch_rsp_if.v @@ -6,12 +6,10 @@ interface VX_ifetch_rsp_if (); wire valid; - wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; wire [31:0] instr; - wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_lsu_req_if.v b/hw/rtl/interfaces/VX_lsu_req_if.v index 9527cc04..1bfb5a36 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.v +++ b/hw/rtl/interfaces/VX_lsu_req_if.v @@ -6,7 +6,6 @@ interface VX_lsu_req_if (); wire valid; - wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -16,8 +15,7 @@ interface VX_lsu_req_if (); wire [`NUM_THREADS-1:0][31:0] base_addr; wire [31:0] offset; wire [`NR_BITS-1:0] rd; - wire wb; - + wire wb; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_mul_req_if.v b/hw/rtl/interfaces/VX_mul_req_if.v index 713761eb..60f9c798 100644 --- a/hw/rtl/interfaces/VX_mul_req_if.v +++ b/hw/rtl/interfaces/VX_mul_req_if.v @@ -10,7 +10,6 @@ interface VX_mul_req_if (); wire valid; - wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -18,8 
+17,7 @@ interface VX_mul_req_if (); wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; wire [`NR_BITS-1:0] rd; - wire wb; - + wire wb; wire ready; endinterface diff --git a/hw/rtl/libs/VX_cam_buffer.v b/hw/rtl/libs/VX_cam_buffer.v index 3e4ec4f8..7f44ca85 100644 --- a/hw/rtl/libs/VX_cam_buffer.v +++ b/hw/rtl/libs/VX_cam_buffer.v @@ -1,9 +1,10 @@ `include "VX_platform.vh" module VX_cam_buffer #( - parameter DATAW = 1, - parameter SIZE = 1, - parameter ADDRW = `LOG2UP(SIZE) + parameter DATAW = 1, + parameter SIZE = 1, + parameter FASTRAM = 0, + parameter ADDRW = `LOG2UP(SIZE) ) ( input wire clk, input wire reset, @@ -63,8 +64,8 @@ module VX_cam_buffer #( VX_dp_ram #( .DATAW(DATAW), .SIZE(SIZE), - .BUFFERED(0), - .RWCHECK(0) + .RWCHECK(1), + .FASTRAM(FASTRAM) ) data_table ( .clk(clk), .waddr(write_addr), diff --git a/hw/rtl/libs/VX_dp_ram.v b/hw/rtl/libs/VX_dp_ram.v index 6ab88847..90bdb547 100644 --- a/hw/rtl/libs/VX_dp_ram.v +++ b/hw/rtl/libs/VX_dp_ram.v @@ -26,9 +26,7 @@ module VX_dp_ram #( localparam DATA32W = DATAW / 32; localparam BYTEEN32W = BYTEENW / 4; -//`ifndef QUARTUS - - if (FASTRAM) begin + if (FASTRAM) begin if (BUFFERED) begin reg [DATAW-1:0] dout_r; @@ -57,72 +55,36 @@ module VX_dp_ram #( dout_r <= mem[raddr]; end end - assign dout = dout_r; - end else begin - `UNUSED_VAR (rden) - if (RWCHECK) begin + if (BYTEENW > 1) begin + `USE_FAST_BRAM reg [DATA32W-1:0][3:0][7:0] mem [SIZE-1:0]; - if (BYTEENW > 1) begin - `USE_FAST_BRAM reg [DATA32W-1:0][3:0][7:0] mem [SIZE-1:0]; - - always @(posedge clk) begin - if (wren) begin - for (integer j = 0; j < BYTEEN32W; j++) begin - for (integer i = 0; i < 4; i++) begin - if (byteen[j * 4 + i]) - mem[waddr][j][i] <= din[j * 32 + i * 8 +: 8]; - end + always @(posedge clk) begin + if (wren) begin + for (integer j = 0; j < BYTEEN32W; j++) begin + for (integer i = 0; i < 4; i++) begin + if (byteen[j * 4 + i]) + mem[waddr][j][i] <= din[j * 32 + i * 8 +: 8]; end end end - assign dout = 
mem[raddr]; - - end else begin - `USE_FAST_BRAM reg [DATAW-1:0] mem [SIZE-1:0]; - - always @(posedge clk) begin - if (wren && byteen) - mem[waddr] <= din; - end - assign dout = mem[raddr]; end - + assign dout = mem[raddr]; end else begin + `USE_FAST_BRAM reg [DATAW-1:0] mem [SIZE-1:0]; - if (BYTEENW > 1) begin - `USE_FAST_BRAM `NO_RW_RAM_CHECK reg [DATA32W-1:0][3:0][7:0] mem [SIZE-1:0]; - - always @(posedge clk) begin - if (wren) begin - for (integer j = 0; j < BYTEEN32W; j++) begin - for (integer i = 0; i < 4; i++) begin - if (byteen[j * 4 + i]) - mem[waddr][j][i] <= din[j * 32 + i * 8 +: 8]; - end - end - end - end - assign dout = mem[raddr]; - end else begin - `USE_FAST_BRAM `NO_RW_RAM_CHECK reg [DATAW-1:0] mem [SIZE-1:0]; - - always @(posedge clk) begin - if (wren && byteen) - mem[waddr] <= din; - end - assign dout = mem[raddr]; - end - end + always @(posedge clk) begin + if (wren && byteen) + mem[waddr] <= din; + end + assign dout = mem[raddr]; + end end - end else begin - if (BUFFERED) begin - reg [DATAW-1:0] dout_r; if (BYTEENW > 1) begin @@ -150,14 +112,11 @@ module VX_dp_ram #( dout_r <= mem[raddr]; end end - assign dout = dout_r; end else begin - `UNUSED_VAR (rden) if (RWCHECK) begin - if (BYTEENW > 1) begin reg [DATA32W-1:0][3:0][7:0] mem [SIZE-1:0]; @@ -208,96 +167,6 @@ module VX_dp_ram #( end end end - -/*`else - - localparam OUTDATA_REG_B = BUFFERED ? "CLOCK0" : "UNREGISTERED"; - localparam RAM_BLOCK_TYPE = FASTRAM ? 
"MLAB" : "AUTO"; - - if (RWCHECK) begin - - altsyncram #( - .init_file (), - .operation_mode ("DUAL_PORT"), - .numwords_a (SIZE), - .numwords_b (SIZE), - .widthad_a (ADDRW), - .widthad_b (ADDRW), - .width_a (DATAW), - .width_b (DATAW), - .width_byteena_a(BYTEENW), - .address_reg_b ("CLOCK0"), - .outdata_reg_b (OUTDATA_REG_B), - .ram_block_type (RAM_BLOCK_TYPE) - ) mem ( - .clocken0 (1'b1), - .clocken1 (), - .clocken2 (), - .clocken3 (), - .clock0 (clk), - .clock1 (), - .address_a (waddr), - .address_b (raddr), - .byteena_a (byteen), - .byteena_b (1'b1), - .wren_a (wren), - .wren_b (1'b0), - .data_a (din), - .data_b (), - .rden_a (), - .rden_b (1'b1), - .q_a (), - .q_b (dout), - .addressstall_a (1'b0), - .addressstall_b (1'b0), - .aclr0 (1'b0), - .aclr1 (1'b0), - .eccstatus () - ); - - end else begin - - `NO_RW_RAM_CHECK altsyncram #( - .init_file (), - .operation_mode ("DUAL_PORT"), - .numwords_a (SIZE), - .numwords_b (SIZE), - .widthad_a (ADDRW), - .widthad_b (ADDRW), - .width_a (DATAW), - .width_b (DATAW), - .width_byteena_a(BYTEENW), - .outdata_reg_b (OUTDATA_REG_B), - .ram_block_type (RAM_BLOCK_TYPE) - ) mem ( - .clocken0 (1'b1), - .clocken1 (1'b1), - .clocken2 (1'b1), - .clocken3 (1'b1), - .clock0 (clk), - .clock1 (clk), - .address_a (waddr), - .address_b (raddr), - .byteena_a (byteen), - .byteena_b (1'b1), - .wren_a (wren), - .wren_b (1'b0), - .data_a (din), - .data_b (), - .rden_a (), - .rden_b (1'b1), - .q_a (), - .q_b (dout), - .addressstall_a (1'b0), - .addressstall_b (1'b0), - .aclr0 (1'b0), - .aclr1 (1'b0), - .eccstatus () - ); - - end - -`endif*/ - + endmodule `TRACING_ON \ No newline at end of file diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index f3796fdb..0a951a6b 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -3,10 +3,10 @@ module VX_generic_queue #( parameter DATAW = 1, parameter SIZE = 2, - parameter BUFFERED = 0, parameter ADDRW = $clog2(SIZE), parameter SIZEW = 
$clog2(SIZE+1), - parameter FASTRAM = 0 + parameter BUFFERED = 0, + parameter FASTRAM = 1 ) ( input wire clk, input wire reset, @@ -78,25 +78,22 @@ module VX_generic_queue #( end; end end - used_r <= used_r + ADDRW'(push) - ADDRW'(pop); + used_r <= used_r + (ADDRW'(push) - ADDRW'(pop)); end end if (0 == BUFFERED) begin - reg [ADDRW:0] rd_ptr_r; - reg [ADDRW:0] wr_ptr_r; - - wire [ADDRW-1:0] rd_ptr_a = rd_ptr_r[ADDRW-1:0]; - wire [ADDRW-1:0] wr_ptr_a = wr_ptr_r[ADDRW-1:0]; + reg [ADDRW-1:0] rd_ptr_r; + reg [ADDRW-1:0] wr_ptr_r; always @(posedge clk) begin if (reset) begin rd_ptr_r <= 0; wr_ptr_r <= 0; end else begin - wr_ptr_r <= wr_ptr_r + (ADDRW+1)'(push); - rd_ptr_r <= rd_ptr_r + (ADDRW+1)'(pop); + wr_ptr_r <= wr_ptr_r + ADDRW'(push); + rd_ptr_r <= rd_ptr_r + ADDRW'(pop); end end @@ -108,8 +105,8 @@ module VX_generic_queue #( .FASTRAM(FASTRAM) ) dp_ram ( .clk(clk), - .waddr(wr_ptr_a), - .raddr(rd_ptr_a), + .waddr(wr_ptr_r), + .raddr(rd_ptr_r), .wren(push), .byteen(1'b1), .rden(1'b1), @@ -149,7 +146,7 @@ module VX_generic_queue #( .DATAW(DATAW), .SIZE(SIZE), .BUFFERED(0), - .RWCHECK(0), + .RWCHECK(1), .FASTRAM(FASTRAM) ) dp_ram ( .clk(clk), @@ -166,7 +163,7 @@ module VX_generic_queue #( if (push && (empty_r || ((used_r == ADDRW'(1)) && pop))) begin dout_r <= data_in; end else if (pop) begin - dout_r <= dout; + dout_r <= dout; // BRAM R/W collision end end @@ -178,4 +175,4 @@ module VX_generic_queue #( assign size = {full_r, used_r}; end -endmodule +endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_scope.v b/hw/rtl/libs/VX_scope.v index 1b227e79..a8f45f5b 100644 --- a/hw/rtl/libs/VX_scope.v +++ b/hw/rtl/libs/VX_scope.v @@ -143,7 +143,7 @@ module VX_scope #( end if (stop - || (waddr >= waddr_end)) begin + || (waddr == waddr_end)) begin waddr <= waddr; // keep last address recording <= 0; data_valid <= 1; diff --git a/hw/rtl/libs/VX_stream_arbiter.v b/hw/rtl/libs/VX_stream_arbiter.v index b17fc81b..562375f0 100644 --- a/hw/rtl/libs/VX_stream_arbiter.v 
+++ b/hw/rtl/libs/VX_stream_arbiter.v @@ -1,11 +1,10 @@ `include "VX_platform.vh" module VX_stream_arbiter #( - parameter NUM_REQS = 1, - parameter DATAW = 1, - parameter TYPE = "R", - parameter IN_BUFFER = 0, - parameter OUT_BUFFER = 0 + parameter NUM_REQS = 1, + parameter DATAW = 1, + parameter TYPE = "R", + parameter BUFFERED = 0 ) ( input wire clk, input wire reset, @@ -22,27 +21,6 @@ module VX_stream_arbiter #( localparam LOG_NUM_REQS = $clog2(NUM_REQS); if (NUM_REQS > 1) begin - - wire [NUM_REQS-1:0] valid_in_qual; - wire [NUM_REQS-1:0][DATAW-1:0] data_in_qual; - wire [NUM_REQS-1:0] ready_in_qual; - - for (genvar i = 0; i < NUM_REQS; ++i) begin - VX_skid_buffer #( - .DATAW (DATAW), - .PASSTHRU (!IN_BUFFER) - ) req_buffer ( - .clk (clk), - .reset (reset), - .valid_in (valid_in[i]), - .data_in (data_in[i]), - .ready_in (ready_in[i]), - .valid_out (valid_in_qual[i]), - .data_out (data_in_qual[i]), - .ready_out (ready_in_qual[i]) - ); - end - wire sel_enable; wire sel_valid; wire [LOG_NUM_REQS-1:0] sel_idx; @@ -56,7 +34,7 @@ module VX_stream_arbiter #( ) sel_arb ( .clk (clk), .reset (reset), - .requests (valid_in_qual), + .requests (valid_in), .enable (sel_enable), .grant_valid (sel_valid), .grant_index (sel_idx), @@ -71,7 +49,7 @@ module VX_stream_arbiter #( ) sel_arb ( .clk (clk), .reset (reset), - .requests (valid_in_qual), + .requests (valid_in), .enable (sel_enable), .grant_valid (sel_valid), .grant_index (sel_idx), @@ -86,7 +64,7 @@ module VX_stream_arbiter #( ) sel_arb ( .clk (clk), .reset (reset), - .requests (valid_in_qual), + .requests (valid_in), .enable (sel_enable), .grant_valid (sel_valid), .grant_index (sel_idx), @@ -101,47 +79,36 @@ module VX_stream_arbiter #( ) sel_arb ( .clk (clk), .reset (reset), - .requests (valid_in_qual), + .requests (valid_in), .enable (sel_enable), .grant_valid (sel_valid), .grant_index (sel_idx), .grant_onehot (sel_1hot) ); - end + end - if (OUT_BUFFER) begin + wire ready_out_unqual; - wire stall = ~ready_out && 
valid_out; - assign sel_enable = ~stall; + VX_skid_buffer #( + .DATAW (DATAW), + .PASSTHRU (!BUFFERED) + ) out_buffer ( + .clk (clk), + .reset (reset), + .valid_in (sel_valid), + .data_in (data_in[sel_idx]), + .ready_in (ready_out_unqual), + .valid_out (valid_out), + .data_out (data_out), + .ready_out (ready_out) + ); - VX_generic_register #( - .N(1 + DATAW), - .R(1) - ) pipe_reg ( - .clk (clk), - .reset (reset), - .stall (stall), - .flush (1'b0), - .data_in ({sel_valid, data_in_qual[sel_idx]}), - .data_out ({valid_out, data_out}) - ); + assign sel_enable = ready_out_unqual; - for (genvar i = 0; i < NUM_REQS; i++) begin - assign ready_in_qual[i] = sel_1hot[i] && ~stall; - end - - end else begin - - assign sel_enable = ready_out; - assign valid_out = sel_valid; - assign data_out = data_in_qual[sel_idx]; - - for (genvar i = 0; i < NUM_REQS; i++) begin - assign ready_in_qual[i] = sel_1hot[i] && ready_out; - end - - end + for (genvar i = 0; i < NUM_REQS; i++) begin + assign ready_in[i] = sel_1hot[i] && ready_out_unqual; + end end else begin diff --git a/hw/rtl/libs/VX_stream_demux.v b/hw/rtl/libs/VX_stream_demux.v new file mode 100644 index 00000000..7eea6ad3 --- /dev/null +++ b/hw/rtl/libs/VX_stream_demux.v @@ -0,0 +1,68 @@ +`include "VX_platform.vh" + +module VX_stream_demux #( + parameter NUM_REQS = 1, + parameter DATAW = 1, + parameter BUFFERED = 0, + localparam LOG_NUM_REQS = `LOG2UP(NUM_REQS) +) ( + input wire clk, + input wire reset, + + input wire [LOG_NUM_REQS-1:0] sel, + + input wire valid_in, + input wire [DATAW-1:0] data_in, + output wire ready_in, + + output wire [NUM_REQS-1:0] valid_out, + output wire [NUM_REQS-1:0][DATAW-1:0] data_out, + input wire [NUM_REQS-1:0] ready_out + ); + + if (NUM_REQS > 1) begin + + reg [NUM_REQS-1:0] valid_out_unqual; + wire [NUM_REQS-1:0][DATAW-1:0] data_out_unqual; + wire [NUM_REQS-1:0] ready_out_unqual; + + always @(*) begin + valid_out_unqual = '0; + valid_out_unqual[sel] = valid_in; + end + + for (genvar i = 0; i < 
NUM_REQS; i++) begin + assign data_out_unqual[i] = data_in; + end + + assign ready_in = ready_out_unqual[sel]; + + for (genvar i = 0; i < NUM_REQS; i++) begin + VX_skid_buffer #( + .DATAW (DATAW), + .PASSTHRU (!BUFFERED) + ) out_buffer ( + .clk (clk), + .reset (reset), + .valid_in (valid_out_unqual[i]), + .data_in (data_out_unqual[i]), + .ready_in (ready_out_unqual[i]), + .valid_out (valid_out[i]), + .data_out (data_out[i]), + .ready_out (ready_out[i]) + ); + end + + end else begin + + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + `UNUSED_VAR (sel) + + assign valid_out = valid_in; + assign data_out = data_in; + assign ready_in = ready_out; + + end + +endmodule \ No newline at end of file diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json index 32b5d965..a269a926 100644 --- a/hw/scripts/scope.json +++ b/hw/scripts/scope.json @@ -9,7 +9,7 @@ "modules": { "afu": { "submodules": { - "vortex": {"type":"Vortex", "enabled":false} + "vortex": {"type":"Vortex", "enabled":true} } }, "Vortex": { @@ -190,6 +190,7 @@ "?writeback_valid": 1, "writeback_wid":"`NW_BITS", "writeback_pc": 32, + "writeback_tmask":"`NUM_THREADS", "writeback_rd":"`NR_BITS", "writeback_data":"`NUM_THREADS * 32", "!scoreboard_delay": 1, @@ -204,11 +205,14 @@ "addr_st1": 32, "addr_st2": 32, "addr_st3": 32, + "is_fill_st0": 1, + "is_snp_st0": 1, "is_mshr_st0": 1, "miss_st1": 1, + "force_miss_st1": 1, "dirty_st1": 1, - "!force_miss_st1": 1, - "!pipeline_stall": 1 + "mshr_push": 1, + "?pipeline_stall": 1 } } } diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 91fe3930..7106989f 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -4,7 +4,7 @@ #include #define ENABLE_DRAM_STALLS -#define DRAM_LATENCY 100 +#define DRAM_LATENCY 24 #define DRAM_RQ_SIZE 16 #define DRAM_STALLS_MODULO 16 diff --git a/hw/syn/quartus/.gitignore b/hw/syn/quartus/.gitignore index eac68fed..e1a705fb 100644 --- a/hw/syn/quartus/.gitignore +++ b/hw/syn/quartus/.gitignore @@ -12,3 +12,15 @@ 
/core/* !/core/Makefile + +/core8/* +!/core8/Makefile + +/top1/* +!/top1/Makefile + +/top2/* +!/top2/Makefile + +/top8/* +!/top8/Makefile \ No newline at end of file diff --git a/hw/syn/quartus/core8/Makefile b/hw/syn/quartus/core8/Makefile new file mode 100644 index 00000000..9cbb35c7 --- /dev/null +++ b/hw/syn/quartus/core8/Makefile @@ -0,0 +1,72 @@ +PROJECT = Core +TOP_LEVEL_ENTITY = VX_core +SRC_FILE = VX_core.v +FPU_INCLUDE = ../../../rtl/fp_cores;../../../rtl/fp_cores/altera;../../../rtl/fp_cores/fpnew/src;../../../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;../../../rtl/fp_cores/fpnew/src/common_cells/include;../../../rtl/fp_cores/fpnew/src/common_cells/src +RTL_INCLUDE = $(FPU_INCLUDE);../../../rtl;../../../rtl/libs;../../../rtl/interfaces;../../../rtl/cache +PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf + +# Part, Family +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG + +# Executable Configuration +SYN_ARGS = --parallel --read_settings_files=on +FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on +ASM_ARGS = +STA_ARGS = --parallel --do_report_timing + +# Build targets +all: $(PROJECT).sta.rpt + +syn: $(PROJECT).syn.rpt + +fit: $(PROJECT).fit.rpt + +asm: $(PROJECT).asm.rpt + +sta: $(PROJECT).sta.rpt + +smart: smart.log + +# Target implementations +STAMP = echo done > + +$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES) + quartus_syn $(PROJECT) $(SYN_ARGS) + $(STAMP) fit.chg + +$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt + quartus_fit $(PROJECT) $(FIT_ARGS) + $(STAMP) asm.chg + $(STAMP) sta.chg + +$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt + quartus_asm $(PROJECT) $(ASM_ARGS) + +$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt + quartus_sta $(PROJECT) $(STA_ARGS) + +smart.log: $(PROJECT_FILES) + quartus_sh --determine_smart_action $(PROJECT) > smart.log + +# Project initialization +$(PROJECT_FILES): + quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top 
$(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NUM_THREADS=8" + +syn.chg: + $(STAMP) syn.chg + +fit.chg: + $(STAMP) fit.chg + +sta.chg: + $(STAMP) sta.chg + +asm.chg: + $(STAMP) asm.chg + +program: $(PROJECT).sof + quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof" + +clean: + rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox diff --git a/hw/syn/quartus/project.sdc b/hw/syn/quartus/project.sdc index 59686a41..a8170852 100644 --- a/hw/syn/quartus/project.sdc +++ b/hw/syn/quartus/project.sdc @@ -1,6 +1,6 @@ set_time_format -unit ns -decimal_places 3 -create_clock -name {clk} -period "200 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] +create_clock -name {clk} -period "220 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] derive_pll_clocks -create_base_clocks derive_clock_uncertainty diff --git a/hw/syn/quartus/project.tcl b/hw/syn/quartus/project.tcl index 0e85bf48..8edc3278 100644 --- a/hw/syn/quartus/project.tcl +++ b/hw/syn/quartus/project.tcl @@ -31,6 +31,7 @@ set_global_assignment -name FAMILY $opts(family) set_global_assignment -name DEVICE $opts(device) set_global_assignment -name TOP_LEVEL_ENTITY $opts(top) set_global_assignment -name PROJECT_OUTPUT_DIRECTORY bin + set_global_assignment -name NUM_PARALLEL_PROCESSORS ALL set_global_assignment -name VERILOG_INPUT_VERSION SYSTEMVERILOG_2009 set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS ON @@ -39,7 +40,14 @@ set_global_assignment -name VERILOG_MACRO SYNTHESIS set_global_assignment -name VERILOG_MACRO NDEBUG set_global_assignment -name MESSAGE_DISABLE 16818 set_global_assignment -name VERILOG_MACRO FPU_FAST +set_global_assignment -name TIMEQUEST_DO_REPORT_TIMING ON +set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED +set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE" +set_global_assignment -name FITTER_EFFORT "STANDARD FIT" +set_global_assignment -name 
OPTIMIZE_HOLD_TIMING "ALL PATHS" +set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM +set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0 set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100 set_global_assignment -name POWER_BOARD_THERMAL_MODEL "NONE (CONSERVATIVE)" @@ -50,12 +58,6 @@ set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON set_global_assignment -name POWER_USE_TA_VALUE 65 set_global_assignment -name SEED 1 -set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON -set_global_assignment -name FITTER_EFFORT "STANDARD FIT" -set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS" -set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED -set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM -set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE" set idx 0 foreach arg $q_args_orig { diff --git a/hw/syn/quartus/timing-html.tcl b/hw/syn/quartus/timing-html.tcl new file mode 100644 index 00000000..20d3fb52 --- /dev/null +++ b/hw/syn/quartus/timing-html.tcl @@ -0,0 +1,40 @@ +package require cmdline + +set options { + { "project.arg" "" "Project name" } + { "outdir.arg" "timing-html" "Output directory" } +} + +array set opts [::cmdline::getoptions quartus(args) $options] + +# Verify required parameters +set requiredParameters {project} +foreach p $requiredParameters { + if {$opts($p) == ""} { + puts stderr "Missing required parameter: -$p" + exit 1 + } +} + +project_open $opts(project) + +set_global_assignment -name NUM_PARALLEL_PROCESSORS ALL + +create_timing_netlist +read_sdc +update_timing_netlist + +foreach_in_collection op [get_available_operating_conditions] { + set_operating_conditions $op + + report_timing -setup -npaths 150 -detail full_path -multi_corner -pairs_only -nworst 8 \ + -file "$opts(outdir)/timing_paths_$op.html" \ + -panel_name "Critical paths for $op" + 
+ create_slack_histogram -num_bins 50 -clock clk -multi_corner -file "$opts(outdir)/slack_histogram_$op.html" +} + + + + + diff --git a/hw/syn/quartus/timing.tcl b/hw/syn/quartus/timing.tcl deleted file mode 100644 index 5e9def4d..00000000 --- a/hw/syn/quartus/timing.tcl +++ /dev/null @@ -1,24 +0,0 @@ -project_open VX_pipeline - -set_global_assignment -name NUM_PARALLEL_PROCESSORS ALL - -create_timing_netlist -read_sdc -update_timing_netlist - -foreach_in_collection op [get_available_operating_conditions] { - set_operating_conditions $op - - report_timing -setup -npaths 150 -detail full_path -multi_corner -pairs_only -nworst 8 \ - -file "bin/timing_paths_$op.html" \ - -panel_name "Critical paths for $op" - - create_slack_histogram -num_bins 50 -clock clk -multi_corner -file "bin/slack_histogram_$op.html" - - -} - - - - - diff --git a/hw/syn/quartus/top1/Makefile b/hw/syn/quartus/top1/Makefile new file mode 100644 index 00000000..34db9257 --- /dev/null +++ b/hw/syn/quartus/top1/Makefile @@ -0,0 +1,72 @@ +PROJECT = vortex_afu +TOP_LEVEL_ENTITY = vortex_afu +SRC_FILE = vortex_afu.sv +FPU_INCLUDE = ../../../rtl/fp_cores;../../../rtl/fp_cores/altera;../../../rtl/fp_cores/fpnew/src;../../../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;../../../rtl/fp_cores/fpnew/src/common_cells/include;../../../rtl/fp_cores/fpnew/src/common_cells/src +RTL_INCLUDE = $(FPU_INCLUDE);../../../rtl;../../../rtl/libs;../../../rtl/interfaces;../../../rtl/cache;../../../rtl/afu;../../../rtl/afu/ccip +PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf + +# Part, Family +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG + +# Executable Configuration +SYN_ARGS = --parallel --read_settings_files=on --set=VERILOG_MACRO=NOPAE=1 +FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on +ASM_ARGS = +STA_ARGS = --parallel --do_report_timing + +# Build targets +all: $(PROJECT).sta.rpt + +syn: $(PROJECT).syn.rpt + +fit: $(PROJECT).fit.rpt + +asm: $(PROJECT).asm.rpt + +sta: $(PROJECT).sta.rpt + +smart: 
smart.log
+
+# Target implementations
+STAMP = echo done >
+
+$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES)
+	quartus_syn $(PROJECT) $(SYN_ARGS)
+	$(STAMP) fit.chg
+
+$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt
+	quartus_fit $(PROJECT) $(FIT_ARGS)
+	$(STAMP) asm.chg
+	$(STAMP) sta.chg
+
+$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt
+	quartus_asm $(PROJECT) $(ASM_ARGS)
+
+$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt
+	quartus_sta $(PROJECT) $(STA_ARGS)
+
+smart.log: $(PROJECT_FILES)
+	quartus_sh --determine_smart_action $(PROJECT) > smart.log
+
+# Project initialization
+$(PROJECT_FILES):
+	quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=1"
+
+syn.chg:
+	$(STAMP) syn.chg
+
+fit.chg:
+	$(STAMP) fit.chg
+
+sta.chg:
+	$(STAMP) sta.chg
+
+asm.chg:
+	$(STAMP) asm.chg
+
+program: $(PROJECT).sof
+	quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof"
+
+clean:
+	rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox
diff --git a/hw/syn/quartus/top2/Makefile b/hw/syn/quartus/top2/Makefile
new file mode 100644
index 00000000..5c02d012
--- /dev/null
+++ b/hw/syn/quartus/top2/Makefile
@@ -0,0 +1,89 @@
+PROJECT = vortex_afu
+TOP_LEVEL_ENTITY = vortex_afu
+SRC_FILE = vortex_afu.sv
+FPU_INCLUDE = ../../../rtl/fp_cores;../../../rtl/fp_cores/altera;../../../rtl/fp_cores/fpnew/src;../../../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;../../../rtl/fp_cores/fpnew/src/common_cells/include;../../../rtl/fp_cores/fpnew/src/common_cells/src
+RTL_INCLUDE = $(FPU_INCLUDE);../../../rtl;../../../rtl/libs;../../../rtl/interfaces;../../../rtl/cache;../../../rtl/afu;../../../rtl/afu/ccip
+PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
+
+# Part, Family
+FAMILY = "Arria 10"
+DEVICE = 10AX115N3F40E2SG
+
+# Executable Configuration
+SYN_ARGS = --parallel --read_settings_files=on --set=VERILOG_MACRO=NOPAE=1
+FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on
+ASM_ARGS =
+STA_ARGS = --parallel --do_report_timing
+
+# Build targets
+# None of these names is a file this Makefile produces, so declare them phony;
+# otherwise a stray file called e.g. "clean" or "all" would silently disable the target.
+.PHONY: all syn fit asm sta smart program clean
+
+all: $(PROJECT).sta.rpt
+
+syn: $(PROJECT).syn.rpt
+
+fit: $(PROJECT).fit.rpt
+
+asm: $(PROJECT).asm.rpt
+
+sta: $(PROJECT).sta.rpt
+
+smart: smart.log
+
+# Target implementations
+# "$(STAMP) <file>" rewrites <file>, refreshing its timestamp so that the stage
+# listing it as a prerequisite is considered out of date.
+STAMP = echo done >
+
+# Runs quartus_syn; on success it stamps fit.chg so the fit report is rebuilt.
+# NOTE(review): SOURCE_FILES is not assigned anywhere in this Makefile, so it
+# expands to nothing unless supplied from outside - confirm this is intended.
+$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES)
+	quartus_syn $(PROJECT) $(SYN_ARGS)
+	$(STAMP) fit.chg
+
+# Runs quartus_fit; on success it stamps asm.chg and sta.chg so both follow-on reports are rebuilt.
+$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt
+	quartus_fit $(PROJECT) $(FIT_ARGS)
+	$(STAMP) asm.chg
+	$(STAMP) sta.chg
+
+# Runs quartus_asm once the fit report is up to date.
+$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt
+	quartus_asm $(PROJECT) $(ASM_ARGS)
+
+# Runs quartus_sta once the fit report is up to date.
+$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt
+	quartus_sta $(PROJECT) $(STA_ARGS)
+
+# Records the output of "quartus_sh --determine_smart_action" in smart.log.
+smart.log: $(PROJECT_FILES)
+	quartus_sh --determine_smart_action $(PROJECT) > $@
+
+# Project initialization
+# Runs ../project.tcl through quartus_sh for the project files, passing NOPAE and NUM_CORES=2.
+$(PROJECT_FILES):
+	quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=2"
+
+# Seed stamps: having no prerequisites, each is created by its own rule only when missing.
+syn.chg:
+	$(STAMP) $@
+
+fit.chg:
+	$(STAMP) $@
+
+sta.chg:
+	$(STAMP) $@
+
+asm.chg:
+	$(STAMP) $@
+
+# Invokes quartus_pgm in JTAG mode, passing it $(PROJECT).sof.
+program: $(PROJECT).sof
+	quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof"
+
+# Deletes the listed directories and every file matching the listed patterns.
+clean:
+	rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox
diff --git a/hw/syn/quartus/top8/Makefile b/hw/syn/quartus/top8/Makefile
new file mode 100644
index 00000000..373e8b74
--- /dev/null
+++ b/hw/syn/quartus/top8/Makefile
@@ -0,0 +1,91 @@
+PROJECT = vortex_afu
+TOP_LEVEL_ENTITY = vortex_afu
+SRC_FILE = vortex_afu.sv
+FPU_INCLUDE = ../../../rtl/fp_cores;../../../rtl/fp_cores/altera;../../../rtl/fp_cores/fpnew/src;../../../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;../../../rtl/fp_cores/fpnew/src/common_cells/include;../../../rtl/fp_cores/fpnew/src/common_cells/src
+RTL_INCLUDE = $(FPU_INCLUDE);../../../rtl;../../../rtl/libs;../../../rtl/interfaces;../../../rtl/cache;../../../rtl/afu;../../../rtl/afu/ccip
+PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
+
+# Part, Family
+FAMILY = "Arria 10"
+
+#DEVICE = 1SX280HN2F43E2VG
+DEVICE = 10AX115N3F40E2SG
+
+# Executable Configuration
+SYN_ARGS = --parallel --read_settings_files=on --set=VERILOG_MACRO=NOPAE=1
+FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on
+ASM_ARGS =
+STA_ARGS = --parallel --do_report_timing
+
+# Build targets
+# None of these names is a file this Makefile produces, so declare them phony;
+# otherwise a stray file called e.g. "clean" or "all" would silently disable the target.
+.PHONY: all syn fit asm sta smart program clean
+
+all: $(PROJECT).sta.rpt
+
+syn: $(PROJECT).syn.rpt
+
+fit: $(PROJECT).fit.rpt
+
+asm: $(PROJECT).asm.rpt
+
+sta: $(PROJECT).sta.rpt
+
+smart: smart.log
+
+# Target implementations
+# "$(STAMP) <file>" rewrites <file>, refreshing its timestamp so that the stage
+# listing it as a prerequisite is considered out of date.
+STAMP = echo done >
+
+# Runs quartus_syn; on success it stamps fit.chg so the fit report is rebuilt.
+# NOTE(review): SOURCE_FILES is not assigned anywhere in this Makefile, so it
+# expands to nothing unless supplied from outside - confirm this is intended.
+$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES)
+	quartus_syn $(PROJECT) $(SYN_ARGS)
+	$(STAMP) fit.chg
+
+# Runs quartus_fit; on success it stamps asm.chg and sta.chg so both follow-on reports are rebuilt.
+$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt
+	quartus_fit $(PROJECT) $(FIT_ARGS)
+	$(STAMP) asm.chg
+	$(STAMP) sta.chg
+
+# Runs quartus_asm once the fit report is up to date.
+$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt
+	quartus_asm $(PROJECT) $(ASM_ARGS)
+
+# Runs quartus_sta once the fit report is up to date.
+$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt
+	quartus_sta $(PROJECT) $(STA_ARGS)
+
+# Records the output of "quartus_sh --determine_smart_action" in smart.log.
+smart.log: $(PROJECT_FILES)
+	quartus_sh --determine_smart_action $(PROJECT) > $@
+
+# Project initialization
+# Runs ../project.tcl through quartus_sh for the project files, passing NOPAE and NUM_CORES=8.
+$(PROJECT_FILES):
+	quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=8"
+
+# Seed stamps: having no prerequisites, each is created by its own rule only when missing.
+syn.chg:
+	$(STAMP) $@
+
+fit.chg:
+	$(STAMP) $@
+
+sta.chg:
+	$(STAMP) $@
+
+asm.chg:
+	$(STAMP) $@
+
+# Invokes quartus_pgm in JTAG mode, passing it $(PROJECT).sof.
+program: $(PROJECT).sof
+	quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof"
+
+# Deletes the listed directories and every file matching the listed patterns.
+clean:
+	rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox