From 41d7e6c63afd8a90f69559a22426ca71e98a6c12 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Nov 2021 07:08:15 -0500 Subject: [PATCH] cumulative fixes, RTL uuid trace, texture unit fixes, simx timing fixes --- ci/blackbox.sh | 12 +- ci/regression.sh | 9 + driver/common/vx_utils.cpp | 81 ++-- hw/rtl/VX_alu_unit.sv | 16 +- hw/rtl/VX_commit.sv | 46 +- hw/rtl/VX_config.vh | 96 ++-- hw/rtl/VX_csr_data.sv | 57 ++- hw/rtl/VX_csr_unit.sv | 21 +- hw/rtl/VX_decode.sv | 49 +- hw/rtl/VX_define.vh | 3 +- hw/rtl/VX_dispatch.sv | 30 +- hw/rtl/VX_execute.sv | 10 + hw/rtl/VX_fpu_unit.sv | 13 +- hw/rtl/VX_gpu_unit.sv | 31 +- hw/rtl/VX_ibuffer.sv | 8 +- hw/rtl/VX_icache_stage.sv | 33 +- hw/rtl/VX_issue.sv | 68 ++- hw/rtl/VX_lsu_unit.sv | 47 +- hw/rtl/VX_mem_unit.sv | 30 +- hw/rtl/VX_muldiv.sv | 35 +- hw/rtl/VX_pipeline.sv | 5 +- hw/rtl/VX_scoreboard.sv | 12 +- hw/rtl/VX_trace_instr.vh | 4 +- hw/rtl/VX_warp_sched.sv | 20 +- hw/rtl/VX_writeback.sv | 27 +- hw/rtl/afu/VX_to_mem.sv | 3 +- hw/rtl/cache/VX_bank.sv | 2 - hw/rtl/cache/VX_cache.sv | 244 +++++----- hw/rtl/cache/VX_shared_mem.sv | 25 +- hw/rtl/interfaces/VX_alu_req_if.sv | 5 +- hw/rtl/interfaces/VX_cmt_to_csr_if.sv | 9 +- hw/rtl/interfaces/VX_commit_if.sv | 3 + hw/rtl/interfaces/VX_csr_req_if.sv | 3 + hw/rtl/interfaces/VX_decode_if.sv | 7 +- hw/rtl/interfaces/VX_fpu_req_if.sv | 3 + hw/rtl/interfaces/VX_gpu_req_if.sv | 5 +- hw/rtl/interfaces/VX_ibuffer_if.sv | 3 + hw/rtl/interfaces/VX_ifetch_req_if.sv | 9 +- hw/rtl/interfaces/VX_ifetch_rsp_if.sv | 7 +- hw/rtl/interfaces/VX_lsu_req_if.sv | 3 + hw/rtl/interfaces/VX_perf_cache_if.sv | 6 +- hw/rtl/interfaces/VX_perf_memsys_if.sv | 24 +- hw/rtl/interfaces/VX_perf_pipeline_if.sv | 32 +- hw/rtl/interfaces/VX_perf_tex_if.sv | 23 + hw/rtl/interfaces/VX_tex_req_if.sv | 3 + hw/rtl/interfaces/VX_tex_rsp_if.sv | 3 + hw/rtl/interfaces/VX_writeback_if.sv | 3 + hw/rtl/libs/VX_axi_adapter.sv | 4 +- hw/rtl/libs/VX_index_queue.sv | 2 +- hw/rtl/libs/VX_popcount.sv | 11 +- 
hw/rtl/libs/VX_skid_buffer.sv | 2 +- hw/rtl/tex_unit/VX_tex_unit.sv | 147 ++++-- hw/scripts/scope.json | 76 +-- runtime/src/vx_start.S | 12 +- sim/common/mempool.h | 47 ++ sim/common/simobject.h | 254 +++++----- sim/simX/archdef.h | 38 +- sim/simX/args.h | 4 +- sim/simX/cache.cpp | 79 ++- sim/simX/cache.h | 69 ++- sim/simX/constants.h | 6 +- sim/simX/core.cpp | 594 ++++++++++++++--------- sim/simX/core.h | 85 +++- sim/simX/decode.cpp | 38 +- sim/simX/execute.cpp | 120 +++-- sim/simX/exeunit.cpp | 305 +++++++----- sim/simX/exeunit.h | 69 +-- sim/simX/memsim.cpp | 12 +- sim/simX/memsim.h | 49 +- sim/simX/pipeline.h | 42 +- sim/simX/processor.cpp | 8 +- sim/simX/scoreboard.h | 2 +- sim/simX/sharedmem.h | 93 ++++ sim/simX/tex_unit.cpp | 12 +- sim/simX/tex_unit.h | 2 +- sim/simX/types.h | 125 ++++- sim/simX/warp.cpp | 4 +- sim/simX/warp.h | 4 + tests/regression/tex/kernel.c | 17 +- 79 files changed, 2148 insertions(+), 1372 deletions(-) create mode 100644 hw/rtl/interfaces/VX_perf_tex_if.sv create mode 100644 sim/common/mempool.h create mode 100644 sim/simX/sharedmem.h diff --git a/ci/blackbox.sh b/ci/blackbox.sh index 5ba7a29a..f2c6ec2b 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -124,7 +124,17 @@ CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_TH echo "CONFIGS=$CONFIGS" -make -C $DRIVER_PATH clean +if [ -f "blackbox.cache" ] +then + LAST_CONFIGS=`cat blackbox.cache` +fi + +if [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; +then + make -C $DRIVER_PATH clean +fi + +echo "$CONFIGS+$DEBUG+$SCOPE" > blackbox.cache status=0 diff --git a/ci/regression.sh b/ci/regression.sh index 936ca13b..2be58140 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -27,8 +27,11 @@ tex() echo "begin texture tests..." 
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-isoccer.png -osoccer_result.png -g0" CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-irainbow.png -orainbow_result.png -g2" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" --perf +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-itoad.png -otoad_result.png -g1" --perf echo "coverage texture done!" } @@ -58,7 +61,9 @@ debug() echo "begin debugging tests..." ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1" ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1" echo "debugging tests done!" 
@@ -73,9 +78,13 @@ CONFIGS=-DEXT_M_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_e # disabling F extension CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext +CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf +CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf # disable shared memory CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem +CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf +CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf # using Default FPU core FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index 5b70e09b..a69df27c 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -114,11 +114,13 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t csr_stalls = 0; uint64_t alu_stalls = 0; uint64_t gpu_stalls = 0; + // PERF: decode + uint64_t loads = 0; + uint64_t stores = 0; + uint64_t branches = 0; // PERF: Icache uint64_t icache_reads = 0; uint64_t icache_read_misses = 0; - uint64_t icache_pipe_stalls = 0; - uint64_t icache_rsp_stalls = 0; // PERF: Dcache uint64_t dcache_reads = 0; uint64_t dcache_writes = 0; @@ -126,17 +128,19 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t dcache_write_misses = 0; uint64_t dcache_bank_stalls = 0; uint64_t dcache_mshr_stalls = 0; - uint64_t dcache_pipe_stalls = 0; - uint64_t dcache_rsp_stalls = 0; - // PERF: SMEM + // PERF: shared memory uint64_t smem_reads = 0; uint64_t smem_writes = 0; uint64_t smem_bank_stalls = 0; // PERF: memory uint64_t mem_reads = 0; uint64_t mem_writes = 0; - uint64_t mem_stalls = 0; uint64_t mem_lat = 0; +#ifdef EXT_TEX_ENABLE + // PERF: texunit + uint64_t tex_mem_reads = 0; + uint64_t tex_mem_lat = 0; +#endif #endif 
uint64_t num_cores; @@ -196,6 +200,20 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu unit stalls=%ld\n", core_id, gpu_stalls_per_core); gpu_stalls += gpu_stalls_per_core; + // PERF: decode + // loads + uint64_t loads_per_core = get_csr_64(staging_ptr, CSR_MPM_LOADS); + if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); + loads += loads_per_core; + // stores + uint64_t stores_per_core = get_csr_64(staging_ptr, CSR_MPM_STORES); + if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core); + stores += stores_per_core; + // branches + uint64_t branches_per_core = get_csr_64(staging_ptr, CSR_MPM_BRANCHES); + if (num_cores > 1) fprintf(stream, "PERF: core%d: branches=%ld\n", core_id, branches_per_core); + branches += branches_per_core; + // PERF: Icache // total reads uint64_t icache_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_READS); @@ -204,16 +222,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // read misses uint64_t icache_miss_r_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_MISS_R); int icache_read_hit_ratio = (int)((1.0 - (double(icache_miss_r_per_core) / double(icache_reads_per_core))) * 100); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio); icache_read_misses += icache_miss_r_per_core; - // pipeline stalls - uint64_t icache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_PIPE_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache pipeline stalls=%ld\n", core_id, icache_pipe_st_per_core); - icache_pipe_stalls += icache_pipe_st_per_core; - // response stalls - uint64_t icache_crsp_st_per_core = get_csr_64(staging_ptr, 
CSR_MPM_ICACHE_CRSP_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reponse stalls=%ld\n", core_id, icache_crsp_st_per_core); - icache_rsp_stalls += icache_crsp_st_per_core; // PERF: Dcache // total reads @@ -243,14 +253,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t dcache_mshr_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MSHR_ST); if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core); dcache_mshr_stalls += dcache_mshr_st_per_core; - // pipeline stalls - uint64_t dcache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_PIPE_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core); - dcache_pipe_stalls += dcache_pipe_st_per_core; - // response stalls - uint64_t dcache_crsp_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_CRSP_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core); - dcache_rsp_stalls += dcache_crsp_st_per_core; // PERF: SMEM // total reads @@ -270,17 +272,26 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // PERF: memory uint64_t mem_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_READS); uint64_t mem_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_WRITES); - uint64_t mem_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_ST); uint64_t mem_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_LAT); - int mem_utilization = (int)((double(mem_reads_per_core + mem_writes_per_core) / double(mem_reads_per_core + mem_writes_per_core + mem_stalls_per_core)) * 100); int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core)); if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld (reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: 
memory stalls=%ld (utilization=%d%%)\n", core_id, mem_stalls_per_core, mem_utilization); - if (num_cores > 1) fprintf(stream, "PERF: core%d: memory average latency=%d cycles\n", core_id, mem_avg_lat); + if (num_cores > 1) fprintf(stream, "PERF: core%d: memory latency=%d cycles\n", core_id, mem_avg_lat); mem_reads += mem_reads_per_core; mem_writes += mem_writes_per_core; - mem_stalls += mem_stalls_per_core; mem_lat += mem_lat_per_core; + + #ifdef EXT_TEX_ENABLE + // total reads + uint64_t tex_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_READS); + if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory reads=%ld\n", core_id, tex_reads_per_core); + tex_mem_reads += tex_reads_per_core; + + // read latency + uint64_t tex_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_LAT); + int tex_avg_lat = (int)(double(tex_lat_per_core) / double(tex_reads_per_core)); + if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory latency=%d cycles\n", core_id, tex_avg_lat); + tex_mem_lat += tex_lat_per_core; + #endif #endif } @@ -293,7 +304,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100); int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100); int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100); - int mem_utilization = (int)((double(mem_reads + mem_writes) / double(mem_reads + mem_writes + mem_stalls)) * 100); int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads)); fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); @@ -302,24 +312,27 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { fprintf(stream, "PERF: csr unit stalls=%ld\n", csr_stalls); fprintf(stream, "PERF: fpu unit stalls=%ld\n", 
fpu_stalls); fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls); + fprintf(stream, "PERF: loads=%ld\n", loads); + fprintf(stream, "PERF: stores=%ld\n", stores); + fprintf(stream, "PERF: branches=%ld\n", branches); fprintf(stream, "PERF: icache reads=%ld\n", icache_reads); fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio); - fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls); - fprintf(stream, "PERF: icache reponse stalls=%ld\n", icache_rsp_stalls); fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads); fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes); fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio); fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio); fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization); fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls); - fprintf(stream, "PERF: dcache pipeline stalls=%ld\n", dcache_pipe_stalls); - fprintf(stream, "PERF: dcache reponse stalls=%ld\n", dcache_rsp_stalls); fprintf(stream, "PERF: smem reads=%ld\n", smem_reads); fprintf(stream, "PERF: smem writes=%ld\n", smem_writes); fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization); fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); - fprintf(stream, "PERF: memory stalls=%ld (utilization=%d%%)\n", mem_stalls, mem_utilization); fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat); +#ifdef EXT_TEX_ENABLE + int tex_avg_lat = (int)(double(tex_mem_lat) / double(tex_mem_reads)); + fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads); + fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat); +#endif #endif // release 
allocated resources diff --git a/hw/rtl/VX_alu_unit.sv b/hw/rtl/VX_alu_unit.sv index 8840f044..da20eb6d 100644 --- a/hw/rtl/VX_alu_unit.sv +++ b/hw/rtl/VX_alu_unit.sv @@ -96,6 +96,7 @@ module VX_alu_unit #( wire alu_ready_in; wire alu_valid_out; wire alu_ready_out; + wire [63:0] alu_uuid; wire [`NW_BITS-1:0] alu_wid; wire [`NUM_THREADS-1:0] alu_tmask; wire [31:0] alu_PC; @@ -112,14 +113,14 @@ module VX_alu_unit #( assign alu_ready_in = alu_ready_out || ~alu_valid_out; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (alu_ready_in), - .data_in ({alu_valid_in, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}), - .data_out ({alu_valid_out, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r}) + .data_in ({alu_valid_in, alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}), + .data_out ({alu_valid_out, alu_uuid, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r}) ); `UNUSED_VAR (br_op_r) @@ -138,6 +139,7 @@ module VX_alu_unit #( wire mul_ready_in; wire mul_valid_out; wire mul_ready_out; + wire [63:0] mul_uuid; wire [`NW_BITS-1:0] mul_wid; wire [`NUM_THREADS-1:0] mul_tmask; wire [31:0] mul_PC; @@ -153,6 +155,7 @@ module VX_alu_unit #( // Inputs .alu_op (mul_op), + .uuid_in (alu_req_if.uuid), .wid_in (alu_req_if.wid), .tmask_in (alu_req_if.tmask), .PC_in (alu_req_if.PC), @@ -163,6 +166,7 @@ module VX_alu_unit #( // Outputs .wid_out (mul_wid), + .uuid_out (mul_uuid), .tmask_out (mul_tmask), .PC_out 
(mul_PC), .rd_out (mul_rd), @@ -184,6 +188,7 @@ module VX_alu_unit #( assign mul_valid_in = alu_req_if.valid && is_mul_op; assign alu_commit_if.valid = alu_valid_out || mul_valid_out; + assign alu_commit_if.uuid = alu_valid_out ? alu_uuid : mul_uuid; assign alu_commit_if.wid = alu_valid_out ? alu_wid : mul_wid; assign alu_commit_if.tmask = alu_valid_out ? alu_tmask : mul_tmask; assign alu_commit_if.PC = alu_valid_out ? alu_PC : mul_PC; @@ -201,6 +206,7 @@ module VX_alu_unit #( assign alu_valid_in = alu_req_if.valid; assign alu_commit_if.valid = alu_valid_out; + assign alu_commit_if.uuid = alu_uuid; assign alu_commit_if.wid = alu_wid; assign alu_commit_if.tmask = alu_tmask; assign alu_commit_if.PC = alu_PC; @@ -220,8 +226,8 @@ module VX_alu_unit #( `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (branch_ctl_if.valid) begin - dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n", - $time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest); + dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h (#%0d)\n", + $time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest, alu_uuid); end end `endif diff --git a/hw/rtl/VX_commit.sv b/hw/rtl/VX_commit.sv index 07b83df0..574ed36e 100644 --- a/hw/rtl/VX_commit.sv +++ b/hw/rtl/VX_commit.sv @@ -40,27 +40,35 @@ module VX_commit #( `endif || gpu_commit_fire; - wire [`NUM_THREADS-1:0] commit_tmask; - assign commit_tmask = alu_commit_fire ? alu_commit_if.tmask: - ld_commit_fire ? ld_commit_if.tmask: - st_commit_fire ? st_commit_if.tmask: - csr_commit_fire ? csr_commit_if.tmask: - `ifdef EXT_F_ENABLE - fpu_commit_fire ? 
fpu_commit_if.tmask: - `endif - /*gpu_commit_fire ?*/ gpu_commit_if.tmask; +`ifdef EXT_F_ENABLE + wire [(6*`NUM_THREADS)-1:0] commit_tmask; +`else + wire [(5*`NUM_THREADS)-1:0] commit_tmask; +`endif - wire [$clog2(`NUM_THREADS+1)-1:0] commit_cnt; - `POP_COUNT(commit_cnt, commit_tmask); + wire [$clog2($bits(commit_tmask)+1)-1:0] commit_size; + + assign commit_tmask = { + {`NUM_THREADS{alu_commit_fire}} & alu_commit_if.tmask, + {`NUM_THREADS{ld_commit_fire}} & ld_commit_if.tmask, + {`NUM_THREADS{st_commit_fire}} & st_commit_if.tmask, + {`NUM_THREADS{csr_commit_fire}} & csr_commit_if.tmask, + `ifdef EXT_F_ENABLE + {`NUM_THREADS{fpu_commit_fire}} & fpu_commit_if.tmask, + `endif + {`NUM_THREADS{gpu_commit_fire}} & gpu_commit_if.tmask + }; + + `POP_COUNT(commit_size, commit_tmask); VX_pipe_register #( - .DATAW (1 + $clog2(`NUM_THREADS+1)), + .DATAW (1 + $bits(commit_size)), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (1'b1), - .data_in ({commit_fire, commit_cnt}), + .data_in ({commit_fire, commit_size}), .data_out ({cmt_to_csr_if.valid, cmt_to_csr_if.commit_size}) ); @@ -90,32 +98,32 @@ module VX_commit #( if (alu_commit_if.valid && alu_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd); `TRACE_ARRAY1D(alu_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", alu_commit_if.uuid); end if (ld_commit_if.valid && ld_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.wb, ld_commit_if.rd); `TRACE_ARRAY1D(ld_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", ld_commit_if.uuid); end if (st_commit_if.valid && st_commit_if.ready) begin - dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, 
wb=%0d, rd=%0d\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd); + dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d (#%0d)\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd, st_commit_if.uuid); end if (csr_commit_if.valid && csr_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.wb, csr_commit_if.rd); `TRACE_ARRAY1D(csr_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", csr_commit_if.uuid); end `ifdef EXT_F_ENABLE if (fpu_commit_if.valid && fpu_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.wb, fpu_commit_if.rd); `TRACE_ARRAY1D(fpu_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", fpu_commit_if.uuid); end `endif if (gpu_commit_if.valid && gpu_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.wb, gpu_commit_if.rd); `TRACE_ARRAY1D(gpu_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", gpu_commit_if.uuid); end end `endif diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 82da10c2..8e0bbaa8 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -171,48 +171,50 @@ `define CSR_MPM_FPU_ST_H 12'hB88 `define CSR_MPM_GPU_ST 12'hB09 `define CSR_MPM_GPU_ST_H 12'hB89 +// PERF: decode +`define CSR_MPM_LOADS 12'hB0A +`define CSR_MPM_LOADS_H 12'hB8A +`define CSR_MPM_STORES 12'hB0B +`define CSR_MPM_STORES_H 12'hB8B +`define CSR_MPM_BRANCHES 12'hB0C +`define CSR_MPM_BRANCHES_H 12'hB8C // PERF: icache 
-`define CSR_MPM_ICACHE_READS 12'hB0A // total reads -`define CSR_MPM_ICACHE_READS_H 12'hB8A -`define CSR_MPM_ICACHE_MISS_R 12'hB0B // total misses -`define CSR_MPM_ICACHE_MISS_R_H 12'hB8B -`define CSR_MPM_ICACHE_PIPE_ST 12'hB0C // pipeline stalls -`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB8C -`define CSR_MPM_ICACHE_CRSP_ST 12'hB0D // core response stalls -`define CSR_MPM_ICACHE_CRSP_ST_H 12'hB8D +`define CSR_MPM_ICACHE_READS 12'hB0D // total reads +`define CSR_MPM_ICACHE_READS_H 12'hB8D +`define CSR_MPM_ICACHE_MISS_R 12'hB0E // read misses +`define CSR_MPM_ICACHE_MISS_R_H 12'hB8E // PERF: dcache -`define CSR_MPM_DCACHE_READS 12'hB0E // total reads -`define CSR_MPM_DCACHE_READS_H 12'hB8E -`define CSR_MPM_DCACHE_WRITES 12'hB0F // total writes -`define CSR_MPM_DCACHE_WRITES_H 12'hB8F -`define CSR_MPM_DCACHE_MISS_R 12'hB10 // read misses -`define CSR_MPM_DCACHE_MISS_R_H 12'hB90 -`define CSR_MPM_DCACHE_MISS_W 12'hB11 // write misses -`define CSR_MPM_DCACHE_MISS_W_H 12'hB91 -`define CSR_MPM_DCACHE_BANK_ST 12'hB12 // bank conflicts stalls -`define CSR_MPM_DCACHE_BANK_ST_H 12'hB92 -`define CSR_MPM_DCACHE_MSHR_ST 12'hB13 // MSHR stalls -`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB93 -`define CSR_MPM_DCACHE_PIPE_ST 12'hB14 // pipeline stalls -`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB94 -`define CSR_MPM_DCACHE_CRSP_ST 12'hB15 // core response stalls -`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB95 +`define CSR_MPM_DCACHE_READS 12'hB0F // total reads +`define CSR_MPM_DCACHE_READS_H 12'hB8F +`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes +`define CSR_MPM_DCACHE_WRITES_H 12'hB90 +`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses +`define CSR_MPM_DCACHE_MISS_R_H 12'hB91 +`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses +`define CSR_MPM_DCACHE_MISS_W_H 12'hB92 +`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts +`define CSR_MPM_DCACHE_BANK_ST_H 12'hB93 +`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls +`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94 // PERF: smem -`define 
CSR_MPM_SMEM_READS 12'hB16 // total reads -`define CSR_MPM_SMEM_READS_H 12'hB96 -`define CSR_MPM_SMEM_WRITES 12'hB17 // total writes -`define CSR_MPM_SMEM_WRITES_H 12'hB97 -`define CSR_MPM_SMEM_BANK_ST 12'hB18 // bank conflicts stalls -`define CSR_MPM_SMEM_BANK_ST_H 12'hB98 +`define CSR_MPM_SMEM_READS 12'hB15 // total reads +`define CSR_MPM_SMEM_READS_H 12'hB95 +`define CSR_MPM_SMEM_WRITES 12'hB16 // total writes +`define CSR_MPM_SMEM_WRITES_H 12'hB96 +`define CSR_MPM_SMEM_BANK_ST 12'hB17 // bank conflicts +`define CSR_MPM_SMEM_BANK_ST_H 12'hB97 // PERF: memory -`define CSR_MPM_MEM_READS 12'hB19 // memory reads -`define CSR_MPM_MEM_READS_H 12'hB99 -`define CSR_MPM_MEM_WRITES 12'hB1A // memory writes -`define CSR_MPM_MEM_WRITES_H 12'hB9A -`define CSR_MPM_MEM_ST 12'hB1B // memory request stalls -`define CSR_MPM_MEM_ST_H 12'hB9B -`define CSR_MPM_MEM_LAT 12'hB1C // memory latency (total) -`define CSR_MPM_MEM_LAT_H 12'hB9C +`define CSR_MPM_MEM_READS 12'hB18 // memory reads +`define CSR_MPM_MEM_READS_H 12'hB98 +`define CSR_MPM_MEM_WRITES 12'hB19 // memory writes +`define CSR_MPM_MEM_WRITES_H 12'hB99 +`define CSR_MPM_MEM_LAT 12'hB1A // memory latency +`define CSR_MPM_MEM_LAT_H 12'hB9A +// PERF: texunit +`define CSR_MPM_TEX_READS 12'hB1B // texture accesses +`define CSR_MPM_TEX_READS_H 12'hB9B +`define CSR_MPM_TEX_LAT 12'hB1C // texture latency +`define CSR_MPM_TEX_LAT_H 12'hB9C // Machine Information Registers `define CSR_MVENDORID 12'hF11 @@ -254,12 +256,22 @@ `define TEX_STATE_WRAPU 5 `define TEX_STATE_WRAPV 6 `define TEX_STATE_MIPOFF(lod) (7+(lod)) +`define NUM_TEX_STATES (`TEX_STATE_MIPOFF(`TEX_LOD_MAX)+1) -`define NUM_TEX_STATES (7+`TEX_LOD_MAX) +`define CSR_TEX_UNIT 12'hFD0 -`define CSR_TEX(unit,state) (12'hFD0 + ((unit) * `NUM_TEX_STATES) + (state)) -`define CSR_TEX_UNIT(csr) (((csr) - 12'hFD0) / `NUM_TEX_STATES) -`define CSR_TEX_STATE(csr) (((csr) - 12'hFD0) % `NUM_TEX_STATES) +`define CSR_TEX_STATE_BEGIN 12'hFD1 +`define CSR_TEX_ADDR 
(`CSR_TEX_STATE_BEGIN+`TEX_STATE_ADDR) +`define CSR_TEX_WIDTH (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WIDTH) +`define CSR_TEX_HEIGHT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_HEIGHT) +`define CSR_TEX_FORMAT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FORMAT) +`define CSR_TEX_FILTER (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FILTER) +`define CSR_TEX_WRAPU (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPU) +`define CSR_TEX_WRAPV (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPV) +`define CSR_TEX_MIPOFF(lod) (`CSR_TEX_STATE_BEGIN+`TEX_STATE_MIPOFF(lod)) +`define CSR_TEX_STATE_END (`CSR_TEX_STATE_BEGIN + `NUM_TEX_STATES) + +`define CSR_TEX_STATE(addr) ((addr) - `CSR_TEX_STATE_BEGIN) // Pipeline Queues //////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_csr_data.sv b/hw/rtl/VX_csr_data.sv index 396358d1..d63d1b7d 100644 --- a/hw/rtl/VX_csr_data.sv +++ b/hw/rtl/VX_csr_data.sv @@ -7,6 +7,9 @@ module VX_csr_data #( input wire reset, `ifdef PERF_ENABLE +`ifdef EXT_TEX_ENABLE + VX_perf_tex_if.slave perf_tex_if, +`endif VX_perf_memsys_if.slave perf_memsys_if, VX_perf_pipeline_if.slave perf_pipeline_if, `endif @@ -22,11 +25,13 @@ module VX_csr_data #( `endif input wire read_enable, + input wire [63:0] read_uuid, input wire[`CSR_ADDR_BITS-1:0] read_addr, input wire[`NW_BITS-1:0] read_wid, output wire[31:0] read_data, input wire write_enable, + input wire [63:0] write_uuid, input wire[`CSR_ADDR_BITS-1:0] write_addr, input wire[`NW_BITS-1:0] write_wid, input wire[31:0] write_data, @@ -56,7 +61,7 @@ module VX_csr_data #( `ifdef EXT_F_ENABLE if (fpu_to_csr_if.write_enable) begin fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] - | fpu_to_csr_if.write_fflags; + | fpu_to_csr_if.write_fflags; end `endif if (write_enable) begin @@ -75,11 +80,12 @@ module VX_csr_data #( `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; default: begin `ifdef EXT_TEX_ENABLE - `ASSERT(write_addr >= `CSR_TEX(0,0) - && write_addr < `CSR_TEX(`NUM_TEX_UNITS, 0), - ("%t: 
invalid CSR write address: %0h", $time, write_addr)); + `ASSERT((write_addr == `CSR_TEX_UNIT) + || (write_addr >= `CSR_TEX_STATE_BEGIN + && write_addr < `CSR_TEX_STATE_END), + ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid)); `else - `ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, write_addr)); + `ASSERT(~write_enable, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid)); `endif end endcase @@ -152,20 +158,28 @@ module VX_csr_data #( `CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0]; `CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]); + `ifdef EXT_F_ENABLE `CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0]; `CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]); + `else + `CSR_MPM_FPU_ST : read_data_r = '0; + `CSR_MPM_FPU_ST_H : read_data_r = '0; + `endif `CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0]; `CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]); + // PERF: decode + `CSR_MPM_LOADS : read_data_r = perf_pipeline_if.loads[31:0]; + `CSR_MPM_LOADS_H : read_data_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]); + `CSR_MPM_STORES : read_data_r = perf_pipeline_if.stores[31:0]; + `CSR_MPM_STORES_H : read_data_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]); + `CSR_MPM_BRANCHES : read_data_r = perf_pipeline_if.branches[31:0]; + `CSR_MPM_BRANCHES_H : read_data_r = 32'(perf_pipeline_if.branches[`PERF_CTR_BITS-1:32]); // PERF: icache `CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0]; `CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0]; `CSR_MPM_ICACHE_MISS_R_H : read_data_r = 
32'(perf_memsys_if.icache_read_misses[`PERF_CTR_BITS-1:32]); - `CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0]; - `CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[`PERF_CTR_BITS-1:32]); - `CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0]; - `CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[`PERF_CTR_BITS-1:32]); - // PERF: dcache + // PERF: dcache `CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0]; `CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0]; @@ -178,26 +192,27 @@ module VX_csr_data #( `CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0]; `CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]); - `CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0]; - `CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[`PERF_CTR_BITS-1:32]); - `CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0]; - `CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[`PERF_CTR_BITS-1:32]); - // PERF: smem + // PERF: smem `CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0]; `CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0]; `CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]); `CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0]; `CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]); - // PERF: MEM + // PERF: memory 
`CSR_MPM_MEM_READS : read_data_r = perf_memsys_if.mem_reads[31:0]; `CSR_MPM_MEM_READS_H : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_MEM_WRITES : read_data_r = perf_memsys_if.mem_writes[31:0]; `CSR_MPM_MEM_WRITES_H : read_data_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]); - `CSR_MPM_MEM_ST : read_data_r = perf_memsys_if.mem_stalls[31:0]; - `CSR_MPM_MEM_ST_H : read_data_r = 32'(perf_memsys_if.mem_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_MEM_LAT : read_data_r = perf_memsys_if.mem_latency[31:0]; `CSR_MPM_MEM_LAT_H : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]); + `ifdef EXT_TEX_ENABLE + // PERF: texunit + `CSR_MPM_TEX_READS : read_data_r = perf_tex_if.mem_reads[31:0]; + `CSR_MPM_TEX_READS_H : read_data_r = 32'(perf_tex_if.mem_reads[`PERF_CTR_BITS-1:32]); + `CSR_MPM_TEX_LAT : read_data_r = perf_tex_if.mem_latency[31:0]; + `CSR_MPM_TEX_LAT_H : read_data_r = 32'(perf_tex_if.mem_latency[`PERF_CTR_BITS-1:32]); + `endif // PERF: reserved `CSR_MPM_RESERVED : read_data_r = '0; `CSR_MPM_RESERVED_H : read_data_r = '0; @@ -227,7 +242,9 @@ module VX_csr_data #( read_addr_valid_r = 1; end else `ifdef EXT_TEX_ENABLE - if (read_addr >= `CSR_TEX(0,0) && read_addr < `CSR_TEX(`NUM_TEX_UNITS,0)) begin + if ((read_addr == `CSR_TEX_UNIT) + || (read_addr >= `CSR_TEX_STATE_BEGIN + && read_addr < `CSR_TEX_STATE_END)) begin read_addr_valid_r = 1; end else `endif @@ -236,7 +253,7 @@ module VX_csr_data #( endcase end - `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("invalid CSR read address: %0h", read_addr)) + `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: %0h (#%0d)", $time, read_addr, read_uuid)) assign read_data = read_data_r; diff --git a/hw/rtl/VX_csr_unit.sv b/hw/rtl/VX_csr_unit.sv index 0b05ca9c..6f7b35c9 100644 --- a/hw/rtl/VX_csr_unit.sv +++ b/hw/rtl/VX_csr_unit.sv @@ -7,6 +7,9 @@ module VX_csr_unit #( input wire reset, `ifdef PERF_ENABLE +`ifdef EXT_TEX_ENABLE + 
VX_perf_tex_if.slave perf_tex_if, +`endif VX_perf_memsys_if.slave perf_memsys_if, VX_perf_pipeline_if.slave perf_pipeline_if, `endif @@ -29,7 +32,8 @@ module VX_csr_unit #( ); wire csr_we_s1; wire [`CSR_ADDR_BITS-1:0] csr_addr_s1; - wire [31:0] csr_read_data, csr_read_data_s1; + wire [31:0] csr_read_data; + wire [31:0] csr_read_data_s1; wire [31:0] csr_updated_data_s1; wire write_enable = csr_commit_if.valid && csr_we_s1; @@ -42,8 +46,11 @@ module VX_csr_unit #( .clk (clk), .reset (reset), `ifdef PERF_ENABLE - .perf_memsys_if (perf_memsys_if), - .perf_pipeline_if (perf_pipeline_if), + `ifdef EXT_TEX_ENABLE + .perf_tex_if (perf_tex_if), + `endif + .perf_memsys_if (perf_memsys_if), + .perf_pipeline_if(perf_pipeline_if), `endif .cmt_to_csr_if (cmt_to_csr_if), .fetch_to_csr_if(fetch_to_csr_if), @@ -54,10 +61,12 @@ module VX_csr_unit #( .tex_csr_if (tex_csr_if), `endif .read_enable (csr_req_if.valid), + .read_uuid (csr_req_if.uuid), .read_addr (csr_req_if.addr), .read_wid (csr_req_if.wid), .read_data (csr_read_data), .write_enable (write_enable), + .write_uuid (csr_commit_if.uuid), .write_addr (csr_addr_s1), .write_wid (csr_commit_if.wid), .write_data (csr_updated_data_s1), @@ -101,14 +110,14 @@ module VX_csr_unit #( wire stall_out = ~csr_commit_if.ready && csr_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({csr_req_valid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}), - .data_out ({csr_commit_if.valid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1}) + .data_in ({csr_req_valid, csr_req_if.uuid, 
csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}), + .data_out ({csr_commit_if.valid, csr_commit_if.uuid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1}) ); for (genvar i = 0; i < `NUM_THREADS; i++) begin diff --git a/hw/rtl/VX_decode.sv b/hw/rtl/VX_decode.sv index 2c6f09fb..3f9af431 100644 --- a/hw/rtl/VX_decode.sv +++ b/hw/rtl/VX_decode.sv @@ -20,6 +20,10 @@ module VX_decode #( input wire clk, input wire reset, +`ifdef PERF_ENABLE + VX_perf_pipeline_if.decode perf_decode_if, +`endif + // inputs VX_ifetch_rsp_if.slave ifetch_rsp_if, @@ -57,7 +61,6 @@ module VX_decode #( wire [11:0] s_imm = {func7, rd}; wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0}; wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0}; - wire [11:0] jalr_imm = {func7, rs2}; `UNUSED_VAR (rs3) @@ -169,7 +172,7 @@ module VX_decode #( use_rd = 1; use_imm = 1; is_wstall = 1; - imm = {{20{jalr_imm[11]}}, jalr_imm}; + imm = {{20{u_12[11]}}, u_12}; `USED_IREG (rd); `USED_IREG (rs1); end @@ -192,7 +195,7 @@ module VX_decode #( `USED_IREG (rs1); `USED_IREG (rs2); end - `INST_F: begin + `INST_FENCE: begin ex_type = `EX_LSU; op_mod = `INST_MOD_BITS'(1); end @@ -411,6 +414,7 @@ module VX_decode #( wire wb = use_rd && (| rd_r); assign decode_if.valid = ifetch_rsp_if.valid; + assign decode_if.uuid = ifetch_rsp_if.uuid; assign decode_if.wid = ifetch_rsp_if.wid; assign decode_if.tmask = ifetch_rsp_if.tmask; assign decode_if.PC = ifetch_rsp_if.PC; @@ -439,6 +443,42 @@ module VX_decode #( assign ifetch_rsp_if.ready = decode_if.ready; +`ifdef PERF_ENABLE + wire [$clog2(`NUM_THREADS+1)-1:0] perf_loads_per_cycle; + wire [$clog2(`NUM_THREADS+1)-1:0] perf_stores_per_cycle; + wire [$clog2(`NUM_THREADS+1)-1:0] perf_branches_per_cycle; + + wire 
[`NUM_THREADS-1:0] perf_loads_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && decode_if.wb}}; + wire [`NUM_THREADS-1:0] perf_stores_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && ~decode_if.wb}}; + wire [`NUM_THREADS-1:0] perf_branches_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_ALU && `INST_ALU_IS_BR(decode_if.op_mod)}}; + + `POP_COUNT(perf_loads_per_cycle, perf_loads_per_mask); + `POP_COUNT(perf_stores_per_cycle, perf_stores_per_mask); + `POP_COUNT(perf_branches_per_cycle, perf_branches_per_mask); + + reg [`PERF_CTR_BITS-1:0] perf_loads; + reg [`PERF_CTR_BITS-1:0] perf_stores; + reg [`PERF_CTR_BITS-1:0] perf_branches; + + always @(posedge clk) begin + if (reset) begin + perf_loads <= 0; + perf_stores <= 0; + perf_branches <= 0; + end else begin + if (decode_if.valid && decode_if.ready) begin + perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_loads_per_cycle); + perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_stores_per_cycle); + perf_branches <= perf_branches + `PERF_CTR_BITS'(perf_branches_per_cycle); + end + end + end + + assign perf_decode_if.loads = perf_loads; + assign perf_decode_if.stores = perf_stores; + assign perf_decode_if.branches = perf_branches; +`endif + `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (decode_if.valid && decode_if.ready) begin @@ -446,7 +486,8 @@ module VX_decode #( trace_ex_type(decode_if.ex_type); dpi_trace(", op="); trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod); - dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm); + dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b (#%0d)\n", 
+ decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.uuid); end end `endif diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 696b6eaa..d4cf83fa 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -55,7 +55,7 @@ `define INST_S 7'b0100011 // store instructions `define INST_I 7'b0010011 // immediate instructions `define INST_R 7'b0110011 // register instructions -`define INST_F 7'b0001111 // Fence instructions +`define INST_FENCE 7'b0001111 // Fence instructions `define INST_SYS 7'b1110011 // system instructions `define INST_FL 7'b0000111 // float load instruction @@ -155,6 +155,7 @@ `define INST_LSU_BITS 4 `define INST_LSU_FMT(x) x[2:0] `define INST_LSU_WSIZE(x) x[1:0] +`define INST_LSU_IS_MEM(x) (3'h0 == x) `define INST_LSU_IS_FENCE(x) (3'h1 == x) `define INST_LSU_IS_PREFETCH(x) (3'h2 == x) diff --git a/hw/rtl/VX_dispatch.sv b/hw/rtl/VX_dispatch.sv index 008a7c62..5715d14b 100644 --- a/hw/rtl/VX_dispatch.sv +++ b/hw/rtl/VX_dispatch.sv @@ -42,15 +42,15 @@ module VX_dispatch ( wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), .OUT_REG (1) ) alu_buffer ( .clk (clk), .reset (reset), .valid_in (alu_req_valid), .ready_in (alu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), - .data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, 
alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), + .data_out ({alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}), .valid_out (alu_req_if.valid), .ready_out (alu_req_if.ready) ); @@ -63,15 +63,15 @@ module VX_dispatch ( wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), .OUT_REG (1) ) lsu_buffer ( .clk (clk), .reset (reset), .valid_in (lsu_req_valid), .ready_in (lsu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}), - .data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}), + .data_out ({lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, 
lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}), .valid_out (lsu_req_if.valid), .ready_out (lsu_req_if.ready) ); @@ -85,15 +85,15 @@ module VX_dispatch ( wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid]; VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), .OUT_REG (1) ) csr_buffer ( .clk (clk), .reset (reset), .valid_in (csr_req_valid), .ready_in (csr_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}), - .data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}), + .data_out ({csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}), .valid_out (csr_req_if.valid), .ready_out (csr_req_if.ready) ); @@ -105,15 +105,15 @@ module VX_dispatch ( wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), .OUT_REG (1) ) fpu_buffer ( .clk (clk), .reset (reset), .valid_in (fpu_req_valid), .ready_in (fpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, 
ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), - .data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), .valid_out (fpu_req_if.valid), .ready_out (fpu_req_if.ready) ); @@ -127,15 +127,15 @@ module VX_dispatch ( wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)), .OUT_REG (1) ) gpu_buffer ( .clk (clk), .reset (reset), .valid_in (gpu_req_valid), .ready_in (gpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), - .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({gpu_req_if.uuid, gpu_req_if.wid, gpu_req_if.tmask, 
gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}), .valid_out (gpu_req_if.valid), .ready_out (gpu_req_if.ready) ); diff --git a/hw/rtl/VX_execute.sv b/hw/rtl/VX_execute.sv index 029d58ab..3549465a 100644 --- a/hw/rtl/VX_execute.sv +++ b/hw/rtl/VX_execute.sv @@ -75,6 +75,10 @@ module VX_execute #( VX_tex_csr_if tex_csr_if(); +`ifdef PERF_ENABLE + VX_perf_tex_if perf_tex_if(); +`endif + VX_cache_arb #( .NUM_REQS (2), .LANES (`NUM_THREADS), @@ -165,6 +169,9 @@ module VX_execute #( .clk (clk), .reset (csr_reset), `ifdef PERF_ENABLE + `ifdef EXT_TEX_ENABLE + .perf_tex_if (perf_tex_if), + `endif .perf_memsys_if (perf_memsys_if), .perf_pipeline_if(perf_pipeline_if), `endif @@ -209,6 +216,9 @@ module VX_execute #( .reset (gpu_reset), .gpu_req_if (gpu_req_if), `ifdef EXT_TEX_ENABLE + `ifdef PERF_ENABLE + .perf_tex_if (perf_tex_if), + `endif .tex_csr_if (tex_csr_if), .dcache_req_if (tex_dcache_req_if), .dcache_rsp_if (tex_dcache_rsp_if), diff --git a/hw/rtl/VX_fpu_unit.sv b/hw/rtl/VX_fpu_unit.sv index 7b0f07cc..84af116b 100644 --- a/hw/rtl/VX_fpu_unit.sv +++ b/hw/rtl/VX_fpu_unit.sv @@ -22,6 +22,7 @@ module VX_fpu_unit #( wire valid_out; wire ready_out; + wire [63:0] rsp_uuid; wire [`NW_BITS-1:0] rsp_wid; wire [`NUM_THREADS-1:0] rsp_tmask; wire [31:0] rsp_PC; @@ -39,7 +40,7 @@ module VX_fpu_unit #( wire fpuq_pop = valid_out && ready_out; VX_index_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), .SIZE (`FPUQ_SIZE) ) req_metadata ( .clk (clk), @@ -48,8 +49,8 @@ module VX_fpu_unit #( .write_addr (tag_in), .read_addr (tag_out), .release_addr (tag_out), - .write_data ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}), - .read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), + .write_data ({fpu_req_if.uuid, fpu_req_if.wid, 
fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}), + .read_data ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), .release_slot (fpuq_pop), .full (fpuq_full), `UNUSED_PIN (empty) @@ -180,14 +181,14 @@ module VX_fpu_unit #( wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({valid_out, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}), - .data_out ({fpu_commit_if.valid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r}) + .data_in ({valid_out, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}), + .data_out ({fpu_commit_if.valid, fpu_commit_if.uuid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r}) ); assign fpu_commit_if.eop = 1'b1; diff --git a/hw/rtl/VX_gpu_unit.sv b/hw/rtl/VX_gpu_unit.sv index 06d5fbc7..6db637a2 100644 --- a/hw/rtl/VX_gpu_unit.sv +++ b/hw/rtl/VX_gpu_unit.sv @@ -12,6 +12,10 @@ module VX_gpu_unit #( VX_gpu_req_if.slave gpu_req_if, `ifdef EXT_TEX_ENABLE + // PERF +`ifdef PERF_ENABLE + VX_perf_tex_if.master perf_tex_if, +`endif VX_dcache_req_if.master dcache_req_if, VX_dcache_rsp_if.slave dcache_rsp_if, VX_tex_csr_if.slave tex_csr_if, @@ -28,12 +32,13 @@ module VX_gpu_unit #( localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS; localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW); - wire rsp_valid; - wire [`NW_BITS-1:0] rsp_wid; - wire [`NUM_THREADS-1:0] rsp_tmask; - wire [31:0] rsp_PC; - wire [`NR_BITS-1:0] rsp_rd; - wire 
rsp_wb; + wire rsp_valid; + wire [63:0] rsp_uuid; + wire [`NW_BITS-1:0] rsp_wid; + wire [`NUM_THREADS-1:0] rsp_tmask; + wire [31:0] rsp_PC; + wire [`NR_BITS-1:0] rsp_rd; + wire rsp_wb; wire [RSP_DATAW-1:0] rsp_data, rsp_data_r; @@ -112,6 +117,7 @@ module VX_gpu_unit #( wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX); assign tex_req_if.valid = gpu_req_if.valid && is_tex; + assign tex_req_if.uuid = gpu_req_if.uuid; assign tex_req_if.wid = gpu_req_if.wid; assign tex_req_if.tmask = gpu_req_if.tmask; assign tex_req_if.PC = gpu_req_if.PC; @@ -128,6 +134,9 @@ module VX_gpu_unit #( ) tex_unit ( .clk (clk), .reset (reset), + `ifdef PERF_ENABLE + .perf_tex_if (perf_tex_if), + `endif .tex_req_if (tex_req_if), .tex_csr_if (tex_csr_if), .tex_rsp_if (tex_rsp_if), @@ -143,6 +152,7 @@ module VX_gpu_unit #( assign is_warp_ctl = !(is_tex || tex_rsp_if.valid); assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex); + assign rsp_uuid = tex_rsp_if.valid ? tex_rsp_if.uuid : gpu_req_if.uuid; assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid; assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask; assign rsp_PC = tex_rsp_if.valid ? 
tex_rsp_if.PC : gpu_req_if.PC; @@ -161,6 +171,7 @@ module VX_gpu_unit #( assign is_warp_ctl = 1; assign rsp_valid = gpu_req_if.valid; + assign rsp_uuid = gpu_req_if.uuid; assign rsp_wid = gpu_req_if.wid; assign rsp_tmask = gpu_req_if.tmask; assign rsp_PC = gpu_req_if.PC; @@ -176,14 +187,14 @@ module VX_gpu_unit #( assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}), - .data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r}) + .data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}), + .data_out ({gpu_commit_if.valid, gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r}) ); assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0]; @@ -200,7 +211,7 @@ module VX_gpu_unit #( assign gpu_req_if.ready = ~stall_in; `SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid); - `SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid); + `SCOPE_ASSIGN (gpu_rsp_uuid, gpu_commit_if.uuid); `SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc.valid); `SCOPE_ASSIGN (gpu_rsp_wspawn, warp_ctl_if.wspawn.valid); `SCOPE_ASSIGN (gpu_rsp_split, warp_ctl_if.split.valid); diff --git a/hw/rtl/VX_ibuffer.sv b/hw/rtl/VX_ibuffer.sv index 9b9fd397..953f1426 100644 --- a/hw/rtl/VX_ibuffer.sv +++ b/hw/rtl/VX_ibuffer.sv @@ -15,7 +15,7 @@ module VX_ibuffer #( `UNUSED_PARAM (CORE_ID) - localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1; + localparam DATAW = 64 + `NUM_THREADS + 32 + 
`EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1; localparam ADDRW = $clog2(`IBUF_SIZE+1); localparam NWARPSW = $clog2(`NUM_WARPS+1); @@ -168,7 +168,8 @@ module VX_ibuffer #( assign decode_if.ready = ~q_full[decode_if.wid]; - assign q_data_in = {decode_if.tmask, + assign q_data_in = {decode_if.uuid, + decode_if.tmask, decode_if.PC, decode_if.ex_type, decode_if.op_type, @@ -184,7 +185,8 @@ module VX_ibuffer #( assign ibuffer_if.valid = deq_valid; assign ibuffer_if.wid = deq_wid; - assign {ibuffer_if.tmask, + assign {ibuffer_if.uuid, + ibuffer_if.tmask, ibuffer_if.PC, ibuffer_if.ex_type, ibuffer_if.op_type, diff --git a/hw/rtl/VX_icache_stage.sv b/hw/rtl/VX_icache_stage.sv index ad296649..77a20b47 100644 --- a/hw/rtl/VX_icache_stage.sv +++ b/hw/rtl/VX_icache_stage.sv @@ -25,35 +25,36 @@ module VX_icache_stage #( localparam OUT_REG = 0; reg [`DBG_CACHE_REQ_IDW-1:0] req_id; - wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id; + wire [`DBG_CACHE_REQ_IDW-1:0] rsp_id; wire [`NW_BITS-1:0] req_tag, rsp_tag; - `UNUSED_VAR (rsp_req_id) + `UNUSED_VAR (rsp_id) wire icache_req_fire = icache_req_if.valid && icache_req_if.ready; - assign req_tag = ifetch_req_if.wid; - assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; - assign rsp_req_id = icache_rsp_if.tag[`NW_BITS +: `DBG_CACHE_REQ_IDW]; + assign req_tag = ifetch_req_if.wid; + assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; + assign rsp_id = icache_rsp_if.tag[`NW_BITS +: `DBG_CACHE_REQ_IDW]; + wire [63:0] rsp_uuid; wire [31:0] rsp_PC; wire [`NUM_THREADS-1:0] rsp_tmask; VX_dp_ram #( - .DATAW (32 + `NUM_THREADS), + .DATAW (32 + `NUM_THREADS + 64), .SIZE (`NUM_WARPS), .LUTRAM (1) ) req_metadata ( .clk (clk), .wren (icache_req_fire), .waddr (req_tag), - .wdata ({ifetch_req_if.PC, ifetch_req_if.tmask}), + .wdata ({ifetch_req_if.PC, ifetch_req_if.tmask, ifetch_req_if.uuid}), .raddr (rsp_tag), - .rdata ({rsp_PC, rsp_tmask}) + .rdata ({rsp_PC, rsp_tmask, rsp_uuid}) ); `RUNTIME_ASSERT((!ifetch_req_if.valid || 
ifetch_req_if.PC >= `STARTUP_ADDR), - ("invalid PC=%0h, wid=%0d, tmask=%b", ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask)) + ("%t: *** invalid PC=%0h, wid=%0d, tmask=%b (#%0d)", $time, ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask, ifetch_req_if.uuid)) // Icache Request assign icache_req_if.valid = ifetch_req_if.valid; @@ -78,35 +79,37 @@ module VX_icache_stage #( wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid); VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32), + .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32 + 64), .RESETW (1), .DEPTH (OUT_REG) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data}), - .data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data}) + .data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data, rsp_uuid}), + .data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid}) ); // Can accept new response? 
assign icache_rsp_if.ready = ~stall_out; `SCOPE_ASSIGN (icache_req_fire, icache_req_fire); - `SCOPE_ASSIGN (icache_req_wid, ifetch_req_if.wid); + `SCOPE_ASSIGN (icache_req_uuid, ifetch_req_if.uuid); `SCOPE_ASSIGN (icache_req_addr, {icache_req_if.addr, 2'b0}); `SCOPE_ASSIGN (icache_req_tag, req_tag); + `SCOPE_ASSIGN (icache_rsp_fire, icache_rsp_if.valid && icache_rsp_if.ready); + `SCOPE_ASSIGN (icache_rsp_uuid, rsp_uuid); `SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data); `SCOPE_ASSIGN (icache_rsp_tag, rsp_tag); `ifdef DBG_TRACE_CORE_ICACHE always @(posedge clk) begin if (icache_req_fire) begin - dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h, req_id=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, req_id); + dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h, req_id=%0h (#%0d)\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, req_id, ifetch_req_if.uuid); end if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin - dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, req_id=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, rsp_req_id, ifetch_rsp_if.data); + dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, req_id=%0h, data=%0h (#%0d)\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, rsp_id, ifetch_rsp_if.data, ifetch_rsp_if.uuid); end end `endif diff --git a/hw/rtl/VX_issue.sv b/hw/rtl/VX_issue.sv index abbb5241..e20f5fce 100644 --- a/hw/rtl/VX_issue.sv +++ b/hw/rtl/VX_issue.sv @@ -9,7 +9,7 @@ module VX_issue #( input wire reset, `ifdef PERF_ENABLE - VX_perf_pipeline_if.master perf_pipeline_if, + VX_perf_pipeline_if.issue perf_issue_if, `endif VX_decode_if.slave decode_if, @@ -38,6 +38,7 @@ module VX_issue #( // scoreboard writeback interface assign sboard_wb_if.valid = writeback_if.valid; + assign sboard_wb_if.uuid = writeback_if.uuid; assign sboard_wb_if.wid = writeback_if.wid; assign sboard_wb_if.PC = writeback_if.PC; assign sboard_wb_if.rd = writeback_if.rd; @@ -45,6 +46,7 @@ module VX_issue #( // scoreboard interface assign scoreboard_if.valid 
= ibuffer_if.valid && dispatch_if.ready; + assign scoreboard_if.uuid = ibuffer_if.uuid; assign scoreboard_if.wid = ibuffer_if.wid; assign scoreboard_if.PC = ibuffer_if.PC; assign scoreboard_if.wb = ibuffer_if.wb; @@ -57,6 +59,7 @@ module VX_issue #( // dispatch interface assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready; + assign dispatch_if.uuid = ibuffer_if.uuid; assign dispatch_if.wid = ibuffer_if.wid; assign dispatch_if.tmask = ibuffer_if.tmask; assign dispatch_if.PC = ibuffer_if.PC; @@ -121,9 +124,8 @@ module VX_issue #( ); `SCOPE_ASSIGN (issue_fire, ibuffer_if.valid && ibuffer_if.ready); - `SCOPE_ASSIGN (issue_wid, ibuffer_if.wid); + `SCOPE_ASSIGN (issue_uuid, ibuffer_if.uuid); `SCOPE_ASSIGN (issue_tmask, ibuffer_if.tmask); - `SCOPE_ASSIGN (issue_pc, ibuffer_if.PC); `SCOPE_ASSIGN (issue_ex_type, ibuffer_if.ex_type); `SCOPE_ASSIGN (issue_op_type, ibuffer_if.op_type); `SCOPE_ASSIGN (issue_op_mod, ibuffer_if.op_mod); @@ -140,10 +142,9 @@ module VX_issue #( `SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data); `SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data); `SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data); - `SCOPE_ASSIGN (writeback_valid, writeback_if.valid); + `SCOPE_ASSIGN (writeback_valid, writeback_if.valid); + `SCOPE_ASSIGN (writeback_uuid, writeback_if.uuid); `SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask); - `SCOPE_ASSIGN (writeback_wid, writeback_if.wid); - `SCOPE_ASSIGN (writeback_pc, writeback_if.PC); `SCOPE_ASSIGN (writeback_rd, writeback_if.rd); `SCOPE_ASSIGN (writeback_data, writeback_if.data); `SCOPE_ASSIGN (writeback_eop, writeback_if.eop); @@ -171,40 +172,35 @@ module VX_issue #( perf_fpu_stalls <= 0; `endif end else begin - if (decode_if.valid & !decode_if.ready) begin + if (decode_if.valid & ~decode_if.ready) begin perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1; end - if (scoreboard_if.valid & !scoreboard_if.ready) begin + if (scoreboard_if.valid & ~scoreboard_if.ready) begin perf_scb_stalls <= perf_scb_stalls + 
`PERF_CTR_BITS'd1; end - if (alu_req_if.valid & !alu_req_if.ready) begin - perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1; + if (dispatch_if.valid & ~dispatch_if.ready) begin + case (dispatch_if.ex_type) + `EX_ALU: perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1; + `ifdef EXT_F_ENABLE + `EX_FPU: perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1; + `endif + `EX_LSU: perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1; + `EX_CSR: perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1; + //`EX_GPU: + default: perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1; + endcase end - if (lsu_req_if.valid & !lsu_req_if.ready) begin - perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1; - end - if (csr_req_if.valid & !csr_req_if.ready) begin - perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1; - end - if (gpu_req_if.valid & !gpu_req_if.ready) begin - perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1; - end - `ifdef EXT_F_ENABLE - if (fpu_req_if.valid & !fpu_req_if.ready) begin - perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1; - end - `endif end end - assign perf_pipeline_if.ibf_stalls = perf_ibf_stalls; - assign perf_pipeline_if.scb_stalls = perf_scb_stalls; - assign perf_pipeline_if.alu_stalls = perf_alu_stalls; - assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls; - assign perf_pipeline_if.csr_stalls = perf_csr_stalls; - assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls; + assign perf_issue_if.ibf_stalls = perf_ibf_stalls; + assign perf_issue_if.scb_stalls = perf_scb_stalls; + assign perf_issue_if.alu_stalls = perf_alu_stalls; + assign perf_issue_if.lsu_stalls = perf_lsu_stalls; + assign perf_issue_if.csr_stalls = perf_csr_stalls; + assign perf_issue_if.gpu_stalls = perf_gpu_stalls; `ifdef EXT_F_ENABLE - assign perf_pipeline_if.fpu_stalls = perf_fpu_stalls; + assign perf_issue_if.fpu_stalls = perf_fpu_stalls; `endif `endif @@ -216,7 +212,7 @@ module VX_issue #( `TRACE_ARRAY1D(alu_req_if.rs1_data, `NUM_THREADS); dpi_trace(", 
rs2_data="); `TRACE_ARRAY1D(alu_req_if.rs2_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", alu_req_if.uuid); end if (lsu_req_if.valid && lsu_req_if.ready) begin dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, offset=%0h, addr=", @@ -224,13 +220,13 @@ module VX_issue #( `TRACE_ARRAY1D(lsu_req_if.base_addr, `NUM_THREADS); dpi_trace(", data="); `TRACE_ARRAY1D(lsu_req_if.store_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", lsu_req_if.uuid); end if (csr_req_if.valid && csr_req_if.ready) begin dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=", $time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.addr); `TRACE_ARRAY1D(csr_req_if.rs1_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", csr_req_if.uuid); end `ifdef EXT_F_ENABLE if (fpu_req_if.valid && fpu_req_if.ready) begin @@ -241,7 +237,7 @@ module VX_issue #( `TRACE_ARRAY1D(fpu_req_if.rs2_data, `NUM_THREADS); dpi_trace(", rs3_data="); `TRACE_ARRAY1D(fpu_req_if.rs3_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", fpu_req_if.uuid); end `endif if (gpu_req_if.valid && gpu_req_if.ready) begin @@ -252,7 +248,7 @@ module VX_issue #( `TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS); dpi_trace(", rs3_data="); `TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", gpu_req_if.uuid); end end `endif diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv index de47dca0..e0ed73b5 100644 --- a/hw/rtl/VX_lsu_unit.sv +++ b/hw/rtl/VX_lsu_unit.sv @@ -21,7 +21,6 @@ module VX_lsu_unit #( ); localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE); localparam MEM_ADDRW = 32 - MEM_ASHIFT; - localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE); `STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) @@ -29,6 +28,7 @@ module VX_lsu_unit #( `STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid 
parameter")) wire req_valid; + wire [63:0] req_uuid; wire [`NUM_THREADS-1:0] req_tmask; wire [`NUM_THREADS-1:0][31:0] req_addr; wire [`INST_LSU_BITS-1:0] req_type; @@ -54,16 +54,16 @@ module VX_lsu_unit #( for (genvar i = 0; i < (`NUM_THREADS-1); i++) begin assign addr_matches[i] = (lsu_req_if.base_addr[i+1] == lsu_req_if.base_addr[0]) || ~lsu_req_if.tmask[i+1]; end + wire lsu_is_dup = lsu_req_if.tmask[0] && (& addr_matches); for (genvar i = 0; i < `NUM_THREADS; i++) begin // is non-cacheable address wire is_addr_nc = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'(`IO_BASE_ADDR >> MEM_ASHIFT)); - if (`SM_ENABLE) begin // is shared memory address wire is_addr_sm = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'((`SMEM_BASE_ADDR - `SMEM_SIZE) >> MEM_ASHIFT)) - & (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT)); + & (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT)); assign lsu_addr_type[i] = {is_addr_nc, is_addr_sm}; end else begin assign lsu_addr_type[i] = is_addr_nc; @@ -81,19 +81,20 @@ module VX_lsu_unit #( wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch; VX_pipe_register #( - .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 1 + 1 + 64 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) req_pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_in), - .data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}), - .data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) + .data_in ({lsu_valid, lsu_is_dup, 
lsu_req_if.is_prefetch, lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}), + .data_out ({req_valid, req_is_dup, req_is_prefetch, req_uuid, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) ); // Can accept new request? assign lsu_req_if.ready = ~stall_in && ~fence_wait; + wire [63:0] rsp_uuid; wire [`NW_BITS-1:0] rsp_wid; wire [31:0] rsp_pc; wire [`NR_BITS-1:0] rsp_rd; @@ -146,7 +147,7 @@ module VX_lsu_unit #( wire req_wb2 = req_wb && ~req_is_prefetch; VX_index_buffer #( - .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), + .DATAW (64 + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), .SIZE (`LSUQ_SIZE) ) req_metadata ( .clk (clk), @@ -154,8 +155,8 @@ module VX_lsu_unit #( .write_addr (mbuf_waddr), .acquire_slot (mbuf_push), .read_addr (mbuf_raddr), - .write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}), - .read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}), + .write_data ({req_uuid, req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}), + .read_data ({rsp_uuid, rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}), .release_addr (mbuf_raddr), .release_slot (mbuf_pop), .full (mbuf_full), @@ -259,6 +260,7 @@ module VX_lsu_unit #( wire is_store_rsp = req_valid && ~req_wb && dcache_req_ready; assign st_commit_if.valid = is_store_rsp; + assign st_commit_if.uuid = req_uuid; assign st_commit_if.wid = req_wid; assign st_commit_if.tmask = req_tmask; assign st_commit_if.PC = req_pc; @@ -295,14 +297,14 @@ module VX_lsu_unit #( wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid; VX_pipe_register #( - .DATAW (1 + 
`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1), .RESETW (1) ) rsp_pipe_reg ( .clk (clk), .reset (reset), .enable (!load_rsp_stall), - .data_in ({dcache_rsp_if.valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), - .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) + .data_in ({dcache_rsp_if.valid, rsp_uuid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), + .data_out ({ld_commit_if.valid, ld_commit_if.uuid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) ); // Can accept new cache response? @@ -310,19 +312,19 @@ module VX_lsu_unit #( // scope registration `SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire); - `SCOPE_ASSIGN (dcache_req_wid, req_wid); - `SCOPE_ASSIGN (dcache_req_pc, req_pc); + `SCOPE_ASSIGN (dcache_req_uuid, req_uuid); `SCOPE_ASSIGN (dcache_req_addr, req_addr); `SCOPE_ASSIGN (dcache_req_rw, ~req_wb); `SCOPE_ASSIGN (dcache_req_byteen,dcache_req_if.byteen); `SCOPE_ASSIGN (dcache_req_data, dcache_req_if.data); `SCOPE_ASSIGN (dcache_req_tag, req_tag); `SCOPE_ASSIGN (dcache_rsp_fire, dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_fire}}); + `SCOPE_ASSIGN (dcache_rsp_uuid, rsp_uuid); `SCOPE_ASSIGN (dcache_rsp_data, dcache_rsp_if.data); `SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr); `ifndef SYNTHESIS - reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + 64 + 1)-1:0] pending_reqs; + reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + 64 + 64 + 1)-1:0] pending_reqs; wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE)); always @(posedge clk) begin @@ -330,7 +332,7 @@ module VX_lsu_unit #( pending_reqs <= '0; end begin if (mbuf_push) begin - pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, $time, 1'b1}; + 
pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, req_uuid, $time, 1'b1}; end if (mbuf_pop) begin pending_reqs[mbuf_raddr] <= '0; @@ -340,8 +342,11 @@ module VX_lsu_unit #( for (integer i = 0; i < `LSUQ_SIZE; ++i) begin if (pending_reqs[i][0]) begin `ASSERT(($time - pending_reqs[i][1 +: 64]) < delay_timeout, - ("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d", - $time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+32+`NR_BITS +: `NW_BITS], pending_reqs[i][1+64+`NR_BITS +: 32], pending_reqs[i][1+64 +: `NR_BITS])); + ("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d (#%0d)", + $time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+64+32+`NR_BITS +: `NW_BITS], + pending_reqs[i][1+64+64+`NR_BITS +: 32], + pending_reqs[i][1+64+64 +: `NR_BITS], + pending_reqs[i][1+64 +: 64])); end end end @@ -360,20 +365,20 @@ module VX_lsu_unit #( `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); dpi_trace(", data="); `TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS); - dpi_trace(", req_id=%0h\n", req_id); + dpi_trace(", (#%0d)\n", req_uuid); end else begin dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire, req_id); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); - dpi_trace(", rd=%0d, is_dup=%b\n", req_rd, req_is_dup); + dpi_trace(", rd=%0d, is_dup=%b (#%0d)\n", req_rd, req_is_dup, req_uuid); end end if (dcache_rsp_fire) begin dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, rd=%0d, data=", $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, rsp_req_id, mbuf_raddr, rsp_rd); `TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); - dpi_trace(", is_dup=%b\n", rsp_is_dup); + dpi_trace(", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid); end end `endif diff --git a/hw/rtl/VX_mem_unit.sv 
b/hw/rtl/VX_mem_unit.sv index 56de47ef..ade9600f 100644 --- a/hw/rtl/VX_mem_unit.sv +++ b/hw/rtl/VX_mem_unit.sv @@ -358,19 +358,17 @@ module VX_mem_unit # ( `ifdef PERF_ENABLE + `UNUSED_VAR (perf_dcache_if.mem_stalls) + `UNUSED_VAR (perf_dcache_if.crsp_stalls) + assign perf_memsys_if.icache_reads = perf_icache_if.reads; assign perf_memsys_if.icache_read_misses = perf_icache_if.read_misses; - assign perf_memsys_if.icache_pipe_stalls = perf_icache_if.pipe_stalls; - assign perf_memsys_if.icache_crsp_stalls = perf_icache_if.crsp_stalls; - assign perf_memsys_if.dcache_reads = perf_dcache_if.reads; assign perf_memsys_if.dcache_writes = perf_dcache_if.writes; assign perf_memsys_if.dcache_read_misses = perf_dcache_if.read_misses; assign perf_memsys_if.dcache_write_misses= perf_dcache_if.write_misses; assign perf_memsys_if.dcache_bank_stalls = perf_dcache_if.bank_stalls; assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls; - assign perf_memsys_if.dcache_pipe_stalls = perf_dcache_if.pipe_stalls; - assign perf_memsys_if.dcache_crsp_stalls = perf_dcache_if.crsp_stalls; if (`SM_ENABLE) begin assign perf_memsys_if.smem_reads = perf_smem_if.reads; @@ -382,47 +380,41 @@ end else begin assign perf_memsys_if.smem_bank_stalls = 0; end - reg [`PERF_CTR_BITS-1:0] perf_mem_lat_per_cycle; + reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; always @(posedge clk) begin if (reset) begin - perf_mem_lat_per_cycle <= 0; + perf_mem_pending_reads <= 0; end else begin - perf_mem_lat_per_cycle <= perf_mem_lat_per_cycle + - `PERF_CTR_BITS'($signed(2'((mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready) && !(mem_rsp_if.valid && mem_rsp_if.ready)) - - 2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready)))); + perf_mem_pending_reads <= perf_mem_pending_reads + + `PERF_CTR_BITS'($signed(2'((mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) && !(mem_rsp_if.valid && mem_rsp_if.ready)) - + 2'((mem_rsp_if.valid && 
mem_rsp_if.ready) && !(mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw)))); end end reg [`PERF_CTR_BITS-1:0] perf_mem_reads; reg [`PERF_CTR_BITS-1:0] perf_mem_writes; reg [`PERF_CTR_BITS-1:0] perf_mem_lat; - reg [`PERF_CTR_BITS-1:0] perf_mem_stalls; always @(posedge clk) begin if (reset) begin perf_mem_reads <= 0; perf_mem_writes <= 0; perf_mem_lat <= 0; - perf_mem_stalls <= 0; end else begin if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'd1; end if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'd1; - end - if (mem_req_if.valid && !mem_req_if.ready) begin - perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'd1; - end - perf_mem_lat <= perf_mem_lat + perf_mem_lat_per_cycle; + end + perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads; end end assign perf_memsys_if.mem_reads = perf_mem_reads; assign perf_memsys_if.mem_writes = perf_mem_writes; - assign perf_memsys_if.mem_latency = perf_mem_lat; - assign perf_memsys_if.mem_stalls = perf_mem_stalls; + assign perf_memsys_if.mem_latency = perf_mem_lat; `endif endmodule diff --git a/hw/rtl/VX_muldiv.sv b/hw/rtl/VX_muldiv.sv index 5cd13f5c..c4dda93b 100644 --- a/hw/rtl/VX_muldiv.sv +++ b/hw/rtl/VX_muldiv.sv @@ -6,6 +6,7 @@ module VX_muldiv ( // Inputs input wire [`INST_MUL_BITS-1:0] alu_op, + input wire [63:0] uuid_in, input wire [`NW_BITS-1:0] wid_in, input wire [`NUM_THREADS-1:0] tmask_in, input wire [31:0] PC_in, @@ -15,6 +16,7 @@ module VX_muldiv ( input wire [`NUM_THREADS-1:0][31:0] alu_in2, // Outputs + output wire [63:0] uuid_out, output wire [`NW_BITS-1:0] wid_out, output wire [`NUM_THREADS-1:0] tmask_out, output wire [31:0] PC_out, @@ -32,6 +34,7 @@ module VX_muldiv ( wire is_div_op = `INST_MUL_IS_DIV(alu_op); wire [`NUM_THREADS-1:0][31:0] mul_result; + wire [63:0] mul_uuid_out; wire [`NW_BITS-1:0] mul_wid_out; wire [`NUM_THREADS-1:0] mul_tmask_out; wire [31:0] 
mul_PC_out; @@ -63,15 +66,15 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .DEPTH (`LATENCY_IMUL), .RESETW (1) ) mul_shift_reg ( .clk(clk), .reset (reset), .enable (mul_ready_in), - .data_in ({mul_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}), - .data_out ({mul_valid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result}) + .data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}), + .data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result}) ); `else @@ -103,15 +106,15 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1), .DEPTH (`LATENCY_IMUL), .RESETW (1) ) mul_shift_reg ( .clk(clk), .reset (reset), .enable (mul_ready_in), - .data_in ({mul_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}), - .data_out ({mul_valid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out}) + .data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}), + .data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out}) ); `endif @@ -119,6 +122,7 @@ module VX_muldiv ( /////////////////////////////////////////////////////////////////////////// wire [`NUM_THREADS-1:0][31:0] div_result; + wire [63:0] div_uuid_out; wire [`NW_BITS-1:0] div_wid_out; wire [`NUM_THREADS-1:0] div_tmask_out; wire [31:0] div_PC_out; @@ -147,15 +151,15 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .DEPTH 
(`LATENCY_IMUL), .RESETW (1) ) div_shift_reg ( .clk(clk), .reset (reset), .enable (div_ready_in), - .data_in ({div_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}), - .data_out ({div_valid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result}) + .data_in ({div_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}), + .data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result}) ); assign div_ready_in = div_ready_out || ~div_valid_out; @@ -171,21 +175,21 @@ module VX_muldiv ( .WIDTHQ (32), .WIDTHR (32), .LANES (`NUM_THREADS), - .TAGW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1) + .TAGW (64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1) ) divide ( .clk (clk), .reset (reset), .valid_in (div_valid_in), .ready_in (div_ready_in), .signed_mode(is_signed_div), - .tag_in ({wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}), + .tag_in ({uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}), .numer (alu_in1), .denom (alu_in2), .quotient (div_result_tmp), .remainder (rem_result_tmp), .ready_out (div_ready_out), .valid_out (div_valid_out), - .tag_out ({div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out}) + .tag_out ({div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out}) ); assign div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp; @@ -195,6 +199,7 @@ module VX_muldiv ( /////////////////////////////////////////////////////////////////////////// wire rsp_valid = mul_valid_out || div_valid_out; + wire [63:0] rsp_uuid = mul_valid_out ? mul_uuid_out : div_uuid_out; wire [`NW_BITS-1:0] rsp_wid = mul_valid_out ? mul_wid_out : div_wid_out; wire [`NUM_THREADS-1:0] rsp_tmask = mul_valid_out ? mul_tmask_out : div_tmask_out; wire [31:0] rsp_PC = mul_valid_out ? 
mul_PC_out : div_PC_out; @@ -205,14 +210,14 @@ module VX_muldiv ( assign stall_out = ~ready_out && valid_out; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}), - .data_out ({valid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out}) + .data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}), + .data_out ({valid_out, uuid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out}) ); // can accept new request? diff --git a/hw/rtl/VX_pipeline.sv b/hw/rtl/VX_pipeline.sv index 8bbc7ead..1ab20c4a 100644 --- a/hw/rtl/VX_pipeline.sv +++ b/hw/rtl/VX_pipeline.sv @@ -165,6 +165,9 @@ module VX_pipeline #( ) decode ( .clk (clk), .reset (decode_reset), + `ifdef PERF_ENABLE + .perf_decode_if (perf_pipeline_if.decode), + `endif .ifetch_rsp_if (ifetch_rsp_if), .decode_if (decode_if), .wstall_if (wstall_if), @@ -180,7 +183,7 @@ module VX_pipeline #( .reset (issue_reset), `ifdef PERF_ENABLE - .perf_pipeline_if (perf_pipeline_if), + .perf_issue_if (perf_pipeline_if.issue), `endif .decode_if (decode_if), diff --git a/hw/rtl/VX_scoreboard.sv b/hw/rtl/VX_scoreboard.sv index 6ba4e998..9a3fed37 100644 --- a/hw/rtl/VX_scoreboard.sv +++ b/hw/rtl/VX_scoreboard.sv @@ -60,22 +60,22 @@ module VX_scoreboard #( end else begin `ifdef DBG_TRACE_PIPELINE if (ibuffer_if.valid && ~ibuffer_if.ready) begin - dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n", + dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)\n", $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb, - deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3); + deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, 
deq_inuse_rs3, ibuffer_if.uuid); end `endif if (release_reg) begin `ASSERT(inuse_regs[writeback_if.wid][writeback_if.rd] != 0, - ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d", - $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd)); + ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d (#%0d)", + $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd,writeback_if.uuid)); end if (ibuffer_if.valid && ~ibuffer_if.ready) begin deadlock_ctr <= deadlock_ctr + 1; `ASSERT(deadlock_ctr < deadlock_timeout, - ("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", + ("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)", $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb, - deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3)); + deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, ibuffer_if.uuid)); end else if (ibuffer_if.valid && ibuffer_if.ready) begin deadlock_ctr <= 0; end diff --git a/hw/rtl/VX_trace_instr.vh b/hw/rtl/VX_trace_instr.vh index e228179e..5e8e031e 100644 --- a/hw/rtl/VX_trace_instr.vh +++ b/hw/rtl/VX_trace_instr.vh @@ -35,9 +35,9 @@ task trace_ex_op ( `INST_BR_JALR: dpi_trace("JALR"); `INST_BR_ECALL: dpi_trace("ECALL"); `INST_BR_EBREAK:dpi_trace("EBREAK"); - `INST_BR_MRET: dpi_trace("MRET"); + `INST_BR_URET: dpi_trace("URET"); `INST_BR_SRET: dpi_trace("SRET"); - `INST_BR_DRET: dpi_trace("DRET"); + `INST_BR_MRET: dpi_trace("MRET"); default: dpi_trace("?"); endcase end else if (`INST_ALU_IS_MUL(op_mod)) begin diff --git a/hw/rtl/VX_warp_sched.sv b/hw/rtl/VX_warp_sched.sv index 979a3536..b8ec17bf 100644 --- a/hw/rtl/VX_warp_sched.sv +++ b/hw/rtl/VX_warp_sched.sv @@ -46,6 +46,8 @@ module VX_warp_sched #( wire schedule_valid; wire warp_scheduled; + reg [63:0] issued_instrs; + wire ifetch_req_fire = ifetch_req_if.valid && ifetch_req_if.ready; wire tmc_active = (warp_ctl_if.tmc.tmask != 0); @@ -62,12 
+64,13 @@ module VX_warp_sched #( always @(posedge clk) begin if (reset) begin - barrier_masks <= 0; - use_wspawn <= 0; - stalled_warps <= 0; + barrier_masks <= '0; + use_wspawn <= '0; + stalled_warps <= '0; warp_pcs <= '0; active_warps <= '0; thread_masks <= '0; + issued_instrs <= '0; // activate first warp warp_pcs[0] <= `STARTUP_ADDR; @@ -117,6 +120,8 @@ module VX_warp_sched #( if (use_wspawn[schedule_wid]) begin thread_masks[schedule_wid] <= 1; end + + issued_instrs <= issued_instrs + 1; end if (ifetch_req_fire) begin @@ -223,20 +228,23 @@ module VX_warp_sched #( assign warp_scheduled = schedule_valid && ~stall_out; + wire [63:0] instr_uuid = (issued_instrs * `NUM_CORES * `NUM_CLUSTERS) + 64'(CORE_ID); + VX_pipe_register #( - .DATAW (1 + `NUM_THREADS + 32 + `NW_BITS), + .DATAW (1 + 64 + `NUM_THREADS + 32 + `NW_BITS), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({schedule_valid, schedule_tmask, schedule_pc, schedule_wid}), - .data_out ({ifetch_req_if.valid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid}) + .data_in ({schedule_valid, instr_uuid, schedule_tmask, schedule_pc, schedule_wid}), + .data_out ({ifetch_req_if.valid, ifetch_req_if.uuid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid}) ); assign busy = (active_warps != 0); `SCOPE_ASSIGN (wsched_scheduled, warp_scheduled); + `SCOPE_ASSIGN (wsched_schedule_uuid, instr_uuid); `SCOPE_ASSIGN (wsched_active_warps, active_warps); `SCOPE_ASSIGN (wsched_stalled_warps, stalled_warps); `SCOPE_ASSIGN (wsched_schedule_wid, schedule_wid); diff --git a/hw/rtl/VX_writeback.sv b/hw/rtl/VX_writeback.sv index cdf7f988..5b67256c 100644 --- a/hw/rtl/VX_writeback.sv +++ b/hw/rtl/VX_writeback.sv @@ -23,17 +23,9 @@ module VX_writeback #( localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1; `ifdef EXT_F_ENABLE -`ifdef EXT_TEX_ENABLE localparam NUM_RSPS = 5; `else localparam NUM_RSPS = 4; -`endif -`else -`ifdef EXT_TEX_ENABLE - localparam 
NUM_RSPS = 4; -`else - localparam NUM_RSPS = 3; -`endif `endif wire wb_valid; @@ -50,9 +42,7 @@ module VX_writeback #( wire stall; assign rsp_valid = { - `ifdef EXT_TEX_ENABLE gpu_commit_if.valid && gpu_commit_if.wb, - `endif csr_commit_if.valid && csr_commit_if.wb, alu_commit_if.valid && alu_commit_if.wb, `ifdef EXT_F_ENABLE @@ -62,9 +52,7 @@ module VX_writeback #( }; assign rsp_data = { - `ifdef EXT_TEX_ENABLE {gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop}, - `endif {csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop}, {alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop}, `ifdef EXT_F_ENABLE @@ -88,28 +76,17 @@ module VX_writeback #( .ready_out (~stall) ); - assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb; + assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb; `ifdef EXT_F_ENABLE assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb; assign alu_commit_if.ready = rsp_ready[2] || ~alu_commit_if.wb; assign csr_commit_if.ready = rsp_ready[3] || ~csr_commit_if.wb; + assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb; `else assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb; assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb; -`ifdef EXT_TEX_ENABLE assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb; `endif -`endif - -`ifdef EXT_TEX_ENABLE -`ifdef EXT_F_ENABLE - assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb; -`else - assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb; -`endif -`else - assign gpu_commit_if.ready = 1; -`endif assign stall = ~writeback_if.ready && writeback_if.valid; diff --git a/hw/rtl/afu/VX_to_mem.sv b/hw/rtl/afu/VX_to_mem.sv index 472f8cb3..acc2899b 100644 --- a/hw/rtl/afu/VX_to_mem.sv +++ b/hw/rtl/afu/VX_to_mem.sv @@ -124,7 +124,8 @@ module VX_to_mem #( 
end end assign mem_rsp_tag_in_w = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_in; - `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in), ("out-of-order memory reponse! cur=%d, expected=%d", mem_rsp_tag_in_w, mem_rsp_tag_in)) + `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in), + ("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_w, mem_rsp_tag_in)) wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr}; diff --git a/hw/rtl/cache/VX_bank.sv b/hw/rtl/cache/VX_bank.sv index 2dfc51fe..9e1f3552 100644 --- a/hw/rtl/cache/VX_bank.sv +++ b/hw/rtl/cache/VX_bank.sv @@ -48,7 +48,6 @@ module VX_bank #( output wire perf_read_misses, output wire perf_write_misses, output wire perf_mshr_stalls, - output wire perf_pipe_stalls, `endif // Core Request @@ -470,7 +469,6 @@ module VX_bank #( `ifdef PERF_ENABLE assign perf_read_misses = do_read_st1 && miss_st1; assign perf_write_misses = do_write_st1 && miss_st1; - assign perf_pipe_stalls = crsq_stall || mreq_alm_full || mshr_alm_full; assign perf_mshr_stalls = mshr_alm_full; `endif diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 6b6841dd..1b7d7abf 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -102,7 +102,6 @@ module VX_cache #( wire [NUM_BANKS-1:0] perf_read_miss_per_bank; wire [NUM_BANKS-1:0] perf_write_miss_per_bank; wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank; - wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank; `endif /////////////////////////////////////////////////////////////////////////// @@ -219,37 +218,37 @@ module VX_cache #( /////////////////////////////////////////////////////////////////////////// // Core request - wire [NUM_REQS-1:0] core_req_valid_nc; - wire [NUM_REQS-1:0] core_req_rw_nc; - wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_nc; - wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_nc; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_nc; - wire 
[NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_nc; - wire [NUM_REQS-1:0] core_req_ready_nc; + wire [NUM_REQS-1:0] core_req_valid_c; + wire [NUM_REQS-1:0] core_req_rw_c; + wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_c; + wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_c; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_c; + wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_c; + wire [NUM_REQS-1:0] core_req_ready_c; // Core response - wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_nc; - wire [NUM_REQS-1:0] core_rsp_tmask_nc; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_nc; - wire [`CORE_RSP_TAGS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_nc; - wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_nc; + wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_c; + wire [NUM_REQS-1:0] core_rsp_tmask_c; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_c; + wire [`CORE_RSP_TAGS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_c; + wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_c; // Memory request - wire mem_req_valid_nc; - wire mem_req_rw_nc; - wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_nc; - wire [NUM_PORTS-1:0] mem_req_pmask_nc; - wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_nc; - wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_nc; - wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_nc; - wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_nc; - wire mem_req_ready_nc; + wire mem_req_valid_c; + wire mem_req_rw_c; + wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_c; + wire [NUM_PORTS-1:0] mem_req_pmask_c; + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_c; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_c; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_c; + wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_c; + wire mem_req_ready_c; // Memory response - wire mem_rsp_valid_nc; - wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_nc; - wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_nc; - wire mem_rsp_ready_nc; + wire mem_rsp_valid_c; + wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_c; + wire 
[MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_c; + wire mem_rsp_ready_c; if (NC_ENABLE) begin VX_nc_bypass #( @@ -280,20 +279,20 @@ module VX_cache #( .core_req_ready_in (core_req_ready), // Core request out - .core_req_valid_out (core_req_valid_nc), - .core_req_rw_out (core_req_rw_nc), - .core_req_byteen_out(core_req_byteen_nc), - .core_req_addr_out (core_req_addr_nc), - .core_req_data_out (core_req_data_nc), - .core_req_tag_out (core_req_tag_nc), - .core_req_ready_out (core_req_ready_nc), + .core_req_valid_out (core_req_valid_c), + .core_req_rw_out (core_req_rw_c), + .core_req_byteen_out(core_req_byteen_c), + .core_req_addr_out (core_req_addr_c), + .core_req_data_out (core_req_data_c), + .core_req_tag_out (core_req_tag_c), + .core_req_ready_out (core_req_ready_c), // Core response in - .core_rsp_valid_in (core_rsp_valid_nc), - .core_rsp_tmask_in (core_rsp_tmask_nc), - .core_rsp_data_in (core_rsp_data_nc), - .core_rsp_tag_in (core_rsp_tag_nc), - .core_rsp_ready_in (core_rsp_ready_nc), + .core_rsp_valid_in (core_rsp_valid_c), + .core_rsp_tmask_in (core_rsp_tmask_c), + .core_rsp_data_in (core_rsp_data_c), + .core_rsp_tag_in (core_rsp_tag_c), + .core_rsp_ready_in (core_rsp_ready_c), // Core response out .core_rsp_valid_out (core_rsp_valid_sb), @@ -303,15 +302,15 @@ module VX_cache #( .core_rsp_ready_out (core_rsp_ready_sb), // Memory request in - .mem_req_valid_in (mem_req_valid_nc), - .mem_req_rw_in (mem_req_rw_nc), - .mem_req_addr_in (mem_req_addr_nc), - .mem_req_pmask_in (mem_req_pmask_nc), - .mem_req_byteen_in (mem_req_byteen_nc), - .mem_req_wsel_in (mem_req_wsel_nc), - .mem_req_data_in (mem_req_data_nc), - .mem_req_tag_in (mem_req_tag_nc), - .mem_req_ready_in (mem_req_ready_nc), + .mem_req_valid_in (mem_req_valid_c), + .mem_req_rw_in (mem_req_rw_c), + .mem_req_addr_in (mem_req_addr_c), + .mem_req_pmask_in (mem_req_pmask_c), + .mem_req_byteen_in (mem_req_byteen_c), + .mem_req_wsel_in (mem_req_wsel_c), + .mem_req_data_in (mem_req_data_c), + .mem_req_tag_in (mem_req_tag_c), + 
.mem_req_ready_in (mem_req_ready_c), // Memory request out .mem_req_valid_out (mem_req_valid_sb), @@ -331,40 +330,40 @@ module VX_cache #( .mem_rsp_ready_in (mem_rsp_ready), // Memory response out - .mem_rsp_valid_out (mem_rsp_valid_nc), - .mem_rsp_data_out (mem_rsp_data_nc), - .mem_rsp_tag_out (mem_rsp_tag_nc), - .mem_rsp_ready_out (mem_rsp_ready_nc) + .mem_rsp_valid_out (mem_rsp_valid_c), + .mem_rsp_data_out (mem_rsp_data_c), + .mem_rsp_tag_out (mem_rsp_tag_c), + .mem_rsp_ready_out (mem_rsp_ready_c) ); end else begin - assign core_req_valid_nc = core_req_valid; - assign core_req_rw_nc = core_req_rw; - assign core_req_addr_nc = core_req_addr; - assign core_req_byteen_nc = core_req_byteen; - assign core_req_data_nc = core_req_data; - assign core_req_tag_nc = core_req_tag; - assign core_req_ready = core_req_ready_nc; + assign core_req_valid_c = core_req_valid; + assign core_req_rw_c = core_req_rw; + assign core_req_addr_c = core_req_addr; + assign core_req_byteen_c = core_req_byteen; + assign core_req_data_c = core_req_data; + assign core_req_tag_c = core_req_tag; + assign core_req_ready = core_req_ready_c; - assign core_rsp_valid_sb = core_rsp_valid_nc; - assign core_rsp_tmask_sb = core_rsp_tmask_nc; - assign core_rsp_data_sb = core_rsp_data_nc; - assign core_rsp_tag_sb = core_rsp_tag_nc; - assign core_rsp_ready_nc = core_rsp_ready_sb; + assign core_rsp_valid_sb = core_rsp_valid_c; + assign core_rsp_tmask_sb = core_rsp_tmask_c; + assign core_rsp_data_sb = core_rsp_data_c; + assign core_rsp_tag_sb = core_rsp_tag_c; + assign core_rsp_ready_c = core_rsp_ready_sb; - assign mem_req_valid_sb = mem_req_valid_nc; - assign mem_req_addr_sb = mem_req_addr_nc; - assign mem_req_rw_p = mem_req_rw_nc; - assign mem_req_pmask_p = mem_req_pmask_nc; - assign mem_req_byteen_p = mem_req_byteen_nc; - assign mem_req_wsel_p = mem_req_wsel_nc; - assign mem_req_data_p = mem_req_data_nc; - assign mem_req_tag_sb = mem_req_tag_nc; - assign mem_req_ready_nc = mem_req_ready_sb; + assign 
mem_req_valid_sb = mem_req_valid_c; + assign mem_req_addr_sb = mem_req_addr_c; + assign mem_req_rw_p = mem_req_rw_c; + assign mem_req_pmask_p = mem_req_pmask_c; + assign mem_req_byteen_p = mem_req_byteen_c; + assign mem_req_wsel_p = mem_req_wsel_c; + assign mem_req_data_p = mem_req_data_c; + assign mem_req_tag_sb = mem_req_tag_c; + assign mem_req_ready_c = mem_req_ready_sb; - assign mem_rsp_valid_nc = mem_rsp_valid; - assign mem_rsp_data_nc = mem_rsp_data; - assign mem_rsp_tag_nc = mem_rsp_tag; - assign mem_rsp_ready = mem_rsp_ready_nc; + assign mem_rsp_valid_c = mem_rsp_valid; + assign mem_rsp_data_c = mem_rsp_data; + assign mem_rsp_tag_c = mem_rsp_tag; + assign mem_rsp_ready = mem_rsp_ready_c; end /////////////////////////////////////////////////////////////////////////// @@ -383,15 +382,15 @@ module VX_cache #( ) mem_rsp_queue ( .clk (clk), .reset (mrsq_reset), - .ready_in (mem_rsp_ready_nc), - .valid_in (mem_rsp_valid_nc), - .data_in ({mem_rsp_tag_nc, mem_rsp_data_nc}), + .ready_in (mem_rsp_ready_c), + .valid_in (mem_rsp_valid_c), + .data_in ({mem_rsp_tag_c, mem_rsp_data_c}), .data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}), .ready_out (mrsq_out_ready), .valid_out (mrsq_out_valid) ); - `UNUSED_VAR (mem_rsp_tag_nc) + `UNUSED_VAR (mem_rsp_tag_c) /////////////////////////////////////////////////////////////////////////// @@ -464,13 +463,13 @@ module VX_cache #( `ifdef PERF_ENABLE .bank_stalls(perf_cache_if.bank_stalls), `endif - .core_req_valid (core_req_valid_nc), - .core_req_rw (core_req_rw_nc), - .core_req_addr (core_req_addr_nc), - .core_req_byteen (core_req_byteen_nc), - .core_req_data (core_req_data_nc), - .core_req_tag (core_req_tag_nc), - .core_req_ready (core_req_ready_nc), + .core_req_valid (core_req_valid_c), + .core_req_rw (core_req_rw_c), + .core_req_addr (core_req_addr_c), + .core_req_byteen (core_req_byteen_c), + .core_req_data (core_req_data_c), + .core_req_tag (core_req_tag_c), + .core_req_ready (core_req_ready_c), .per_bank_core_req_valid 
(per_bank_core_req_valid), .per_bank_core_req_pmask (per_bank_core_req_pmask), .per_bank_core_req_rw (per_bank_core_req_rw), @@ -592,7 +591,6 @@ module VX_cache #( .perf_read_misses (perf_read_miss_per_bank[i]), .perf_write_misses (perf_write_miss_per_bank[i]), .perf_mshr_stalls (perf_mshr_stall_per_bank[i]), - .perf_pipe_stalls (perf_pipe_stall_per_bank[i]), `endif // Core request @@ -655,11 +653,11 @@ module VX_cache #( .per_bank_core_rsp_tag (per_bank_core_rsp_tag), .per_bank_core_rsp_tid (per_bank_core_rsp_tid), .per_bank_core_rsp_ready (per_bank_core_rsp_ready), - .core_rsp_valid (core_rsp_valid_nc), - .core_rsp_tmask (core_rsp_tmask_nc), - .core_rsp_tag (core_rsp_tag_nc), - .core_rsp_data (core_rsp_data_nc), - .core_rsp_ready (core_rsp_ready_nc) + .core_rsp_valid (core_rsp_valid_c), + .core_rsp_tmask (core_rsp_tmask_c), + .core_rsp_tag (core_rsp_tag_c), + .core_rsp_data (core_rsp_data_c), + .core_rsp_ready (core_rsp_ready_c) ); wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH))-1:0] data_in; @@ -681,15 +679,15 @@ module VX_cache #( .valid_in (per_bank_mem_req_valid), .data_in (data_in), .ready_in (per_bank_mem_req_ready), - .valid_out (mem_req_valid_nc), - .data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_pmask_nc, mem_req_byteen_nc, mem_req_wsel_nc, mem_req_data_nc}), - .ready_out (mem_req_ready_nc) + .valid_out (mem_req_valid_c), + .data_out ({mem_req_addr_c, mem_req_id, mem_req_rw_c, mem_req_pmask_c, mem_req_byteen_c, mem_req_wsel_c, mem_req_data_c}), + .ready_out (mem_req_ready_c) ); if (NUM_BANKS == 1) begin - assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'(mem_req_id); + assign mem_req_tag_c = MEM_TAG_IN_WIDTH'(mem_req_id); end else begin - assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_nc), mem_req_id}); + assign mem_req_tag_c = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_c), mem_req_id}); end `ifdef PERF_ENABLE @@ -697,12 +695,21 @@ 
module VX_cache #( wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; - - wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw; - wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw; - - `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask); + + wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid_c & core_req_ready_c & ~core_req_rw; + wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid_c & core_req_ready_c & core_req_rw; + + // per cycle: read misses, write misses, msrq stalls, pipeline stalls + wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; + wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; + wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; + wire [$clog2(NUM_BANKS+1)-1:0] perf_crsp_stall_per_cycle; + + `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask); + `POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank); + `POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank); + `POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank); if (CORE_TAG_ID_BITS != 0) begin wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}}; @@ -712,23 +719,14 @@ module VX_cache #( `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask); end - // per cycle: read misses, write misses, msrq stalls, pipeline stalls - wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; - wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; - wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; - wire [$clog2(NUM_BANKS+1)-1:0] perf_pipe_stall_per_cycle; - - `POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank); - `POP_COUNT(perf_write_miss_per_cycle, 
perf_write_miss_per_bank); - `POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank); - `POP_COUNT(perf_pipe_stall_per_cycle, perf_pipe_stall_per_bank); + wire perf_mem_stall_per_cycle = mem_req_valid & ~mem_req_ready; reg [`PERF_CTR_BITS-1:0] perf_core_reads; reg [`PERF_CTR_BITS-1:0] perf_core_writes; reg [`PERF_CTR_BITS-1:0] perf_read_misses; reg [`PERF_CTR_BITS-1:0] perf_write_misses; reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls; - reg [`PERF_CTR_BITS-1:0] perf_pipe_stalls; + reg [`PERF_CTR_BITS-1:0] perf_mem_stalls; reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; always @(posedge clk) begin @@ -738,16 +736,16 @@ module VX_cache #( perf_read_misses <= 0; perf_write_misses <= 0; perf_mshr_stalls <= 0; - perf_pipe_stalls <= 0; + perf_mem_stalls <= 0; perf_crsp_stalls <= 0; end else begin - perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); - perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); - perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle); - perf_write_misses <= perf_write_misses+ `PERF_CTR_BITS'(perf_write_miss_per_cycle); - perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle); - perf_pipe_stalls <= perf_pipe_stalls + `PERF_CTR_BITS'(perf_pipe_stall_per_cycle); - perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); + perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); + perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle); + perf_write_misses <= perf_write_misses + `PERF_CTR_BITS'(perf_write_miss_per_cycle); + perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle); + perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle); + perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); end end @@ 
-756,7 +754,7 @@ module VX_cache #( assign perf_cache_if.read_misses = perf_read_misses; assign perf_cache_if.write_misses = perf_write_misses; assign perf_cache_if.mshr_stalls = perf_mshr_stalls; - assign perf_cache_if.pipe_stalls = perf_pipe_stalls; + assign perf_cache_if.mem_stalls = perf_mem_stalls; assign perf_cache_if.crsp_stalls = perf_crsp_stalls; `endif diff --git a/hw/rtl/cache/VX_shared_mem.sv b/hw/rtl/cache/VX_shared_mem.sv index 257cf295..971795e0 100644 --- a/hw/rtl/cache/VX_shared_mem.sv +++ b/hw/rtl/cache/VX_shared_mem.sv @@ -335,21 +335,13 @@ module VX_shared_mem #( // per cycle: core_reads, core_writes wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; - wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw; wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw; `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask); - - if (CORE_TAG_ID_BITS != 0) begin - wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}}; - `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask); - end else begin - wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_valid & ~core_rsp_ready; - `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask); - end + wire perf_crsp_stall_per_cycle = core_rsp_valid & ~core_rsp_ready; reg [`PERF_CTR_BITS-1:0] perf_core_reads; reg [`PERF_CTR_BITS-1:0] perf_core_writes; @@ -357,13 +349,13 @@ module VX_shared_mem #( always @(posedge clk) begin if (reset) begin - perf_core_reads <= 0; - perf_core_writes <= 0; - perf_crsp_stalls <= 0; + perf_core_reads <= 0; + perf_core_writes <= 0; + perf_crsp_stalls <= 0; end else begin - perf_core_reads <= perf_core_reads + 
`PERF_CTR_BITS'(perf_core_reads_per_cycle); - perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); - perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); + perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); + perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); end end @@ -371,7 +363,8 @@ module VX_shared_mem #( assign perf_cache_if.writes = perf_core_writes; assign perf_cache_if.read_misses = '0; assign perf_cache_if.write_misses = '0; - assign perf_cache_if.pipe_stalls = '0; + assign perf_cache_if.mshr_stalls = '0; + assign perf_cache_if.mem_stalls = '0; assign perf_cache_if.crsp_stalls = perf_crsp_stalls; `endif diff --git a/hw/rtl/interfaces/VX_alu_req_if.sv b/hw/rtl/interfaces/VX_alu_req_if.sv index 2c6ffd5e..35049542 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.sv +++ b/hw/rtl/interfaces/VX_alu_req_if.sv @@ -5,7 +5,8 @@ interface VX_alu_req_if (); - wire valid; + wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -24,6 +25,7 @@ interface VX_alu_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -43,6 +45,7 @@ interface VX_alu_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_cmt_to_csr_if.sv b/hw/rtl/interfaces/VX_cmt_to_csr_if.sv index 800d428d..ed5ffc24 100644 --- a/hw/rtl/interfaces/VX_cmt_to_csr_if.sv +++ b/hw/rtl/interfaces/VX_cmt_to_csr_if.sv @@ -5,9 +5,12 @@ interface VX_cmt_to_csr_if (); - wire valid; - wire [$clog2(`NUM_THREADS+1)-1:0] commit_size; - + wire valid; +`ifdef EXT_F_ENABLE + wire [$clog2(6*`NUM_THREADS+1)-1:0] commit_size; +`else + wire [$clog2(5*`NUM_THREADS+1)-1:0] commit_size; +`endif modport master ( output valid, output commit_size diff --git 
a/hw/rtl/interfaces/VX_commit_if.sv b/hw/rtl/interfaces/VX_commit_if.sv index 4b6844d6..e85d310f 100644 --- a/hw/rtl/interfaces/VX_commit_if.sv +++ b/hw/rtl/interfaces/VX_commit_if.sv @@ -6,6 +6,7 @@ interface VX_commit_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -17,6 +18,7 @@ interface VX_commit_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -29,6 +31,7 @@ interface VX_commit_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_csr_req_if.sv b/hw/rtl/interfaces/VX_csr_req_if.sv index 23345d53..0639f3aa 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.sv +++ b/hw/rtl/interfaces/VX_csr_req_if.sv @@ -6,6 +6,7 @@ interface VX_csr_req_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,6 +21,7 @@ interface VX_csr_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -35,6 +37,7 @@ interface VX_csr_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_decode_if.sv b/hw/rtl/interfaces/VX_decode_if.sv index 90c5d70e..23039847 100644 --- a/hw/rtl/interfaces/VX_decode_if.sv +++ b/hw/rtl/interfaces/VX_decode_if.sv @@ -6,6 +6,7 @@ interface VX_decode_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -23,7 +24,8 @@ interface VX_decode_if (); wire ready; modport master ( - output valid, + output valid, + output uuid, output wid, output tmask, output PC, @@ -42,7 +44,8 @@ interface VX_decode_if (); ); modport slave ( - input valid, + input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_fpu_req_if.sv b/hw/rtl/interfaces/VX_fpu_req_if.sv index 25867e42..2b7d69f0 100644 --- a/hw/rtl/interfaces/VX_fpu_req_if.sv +++ 
b/hw/rtl/interfaces/VX_fpu_req_if.sv @@ -6,6 +6,7 @@ interface VX_fpu_req_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,6 +21,7 @@ interface VX_fpu_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -35,6 +37,7 @@ interface VX_fpu_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_gpu_req_if.sv b/hw/rtl/interfaces/VX_gpu_req_if.sv index 50ac8c7c..06ef6cc7 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.sv +++ b/hw/rtl/interfaces/VX_gpu_req_if.sv @@ -6,7 +6,7 @@ interface VX_gpu_req_if(); wire valid; - + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -19,11 +19,11 @@ interface VX_gpu_req_if(); wire [`NUM_THREADS-1:0][31:0] rs3_data; wire [`NR_BITS-1:0] rd; wire wb; - wire ready; modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -41,6 +41,7 @@ interface VX_gpu_req_if(); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_ibuffer_if.sv b/hw/rtl/interfaces/VX_ibuffer_if.sv index bb791737..a436ae7b 100644 --- a/hw/rtl/interfaces/VX_ibuffer_if.sv +++ b/hw/rtl/interfaces/VX_ibuffer_if.sv @@ -6,6 +6,7 @@ interface VX_ibuffer_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -31,6 +32,7 @@ interface VX_ibuffer_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -55,6 +57,7 @@ interface VX_ibuffer_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_ifetch_req_if.sv b/hw/rtl/interfaces/VX_ifetch_req_if.sv index 3d75e736..4132f90b 100644 --- a/hw/rtl/interfaces/VX_ifetch_req_if.sv +++ b/hw/rtl/interfaces/VX_ifetch_req_if.sv @@ -5,14 +5,16 @@ interface VX_ifetch_req_if (); - wire valid; + 
wire valid; + wire [63:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; wire ready; modport master ( - output valid, + output valid, + output uuid, output tmask, output wid, output PC, @@ -20,7 +22,8 @@ interface VX_ifetch_req_if (); ); modport slave ( - input valid, + input valid, + input uuid, input tmask, input wid, input PC, diff --git a/hw/rtl/interfaces/VX_ifetch_rsp_if.sv b/hw/rtl/interfaces/VX_ifetch_rsp_if.sv index a2f04fe4..350af081 100644 --- a/hw/rtl/interfaces/VX_ifetch_rsp_if.sv +++ b/hw/rtl/interfaces/VX_ifetch_rsp_if.sv @@ -6,6 +6,7 @@ interface VX_ifetch_rsp_if (); wire valid; + wire [63:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; @@ -13,7 +14,8 @@ interface VX_ifetch_rsp_if (); wire ready; modport master ( - output valid, + output valid, + output uuid, output tmask, output wid, output PC, @@ -22,7 +24,8 @@ interface VX_ifetch_rsp_if (); ); modport slave ( - input valid, + input valid, + input uuid, input tmask, input wid, input PC, diff --git a/hw/rtl/interfaces/VX_lsu_req_if.sv b/hw/rtl/interfaces/VX_lsu_req_if.sv index 4f31b17c..128b3c20 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.sv +++ b/hw/rtl/interfaces/VX_lsu_req_if.sv @@ -6,6 +6,7 @@ interface VX_lsu_req_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -21,6 +22,7 @@ interface VX_lsu_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -37,6 +39,7 @@ interface VX_lsu_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_perf_cache_if.sv b/hw/rtl/interfaces/VX_perf_cache_if.sv index d9efb2cc..0ec8d582 100644 --- a/hw/rtl/interfaces/VX_perf_cache_if.sv +++ b/hw/rtl/interfaces/VX_perf_cache_if.sv @@ -11,7 +11,7 @@ interface VX_perf_cache_if (); wire [`PERF_CTR_BITS-1:0] write_misses; wire [`PERF_CTR_BITS-1:0] bank_stalls; wire [`PERF_CTR_BITS-1:0] 
mshr_stalls; - wire [`PERF_CTR_BITS-1:0] pipe_stalls; + wire [`PERF_CTR_BITS-1:0] mem_stalls; wire [`PERF_CTR_BITS-1:0] crsp_stalls; modport master ( @@ -21,7 +21,7 @@ interface VX_perf_cache_if (); output write_misses, output bank_stalls, output mshr_stalls, - output pipe_stalls, + output mem_stalls, output crsp_stalls ); @@ -32,7 +32,7 @@ interface VX_perf_cache_if (); input write_misses, input bank_stalls, input mshr_stalls, - input pipe_stalls, + input mem_stalls, input crsp_stalls ); diff --git a/hw/rtl/interfaces/VX_perf_memsys_if.sv b/hw/rtl/interfaces/VX_perf_memsys_if.sv index f0e27ed6..9a38dc26 100644 --- a/hw/rtl/interfaces/VX_perf_memsys_if.sv +++ b/hw/rtl/interfaces/VX_perf_memsys_if.sv @@ -7,68 +7,50 @@ interface VX_perf_memsys_if (); wire [`PERF_CTR_BITS-1:0] icache_reads; wire [`PERF_CTR_BITS-1:0] icache_read_misses; - wire [`PERF_CTR_BITS-1:0] icache_pipe_stalls; - wire [`PERF_CTR_BITS-1:0] icache_crsp_stalls; - wire [`PERF_CTR_BITS-1:0] dcache_reads; - wire [`PERF_CTR_BITS-1:0] dcache_writes; + wire [`PERF_CTR_BITS-1:0] dcache_writes; wire [`PERF_CTR_BITS-1:0] dcache_read_misses; wire [`PERF_CTR_BITS-1:0] dcache_write_misses; wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls; wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls; - wire [`PERF_CTR_BITS-1:0] dcache_pipe_stalls; - wire [`PERF_CTR_BITS-1:0] dcache_crsp_stalls; - wire [`PERF_CTR_BITS-1:0] smem_reads; wire [`PERF_CTR_BITS-1:0] smem_writes; wire [`PERF_CTR_BITS-1:0] smem_bank_stalls; - wire [`PERF_CTR_BITS-1:0] mem_reads; wire [`PERF_CTR_BITS-1:0] mem_writes; - wire [`PERF_CTR_BITS-1:0] mem_stalls; wire [`PERF_CTR_BITS-1:0] mem_latency; modport master ( output icache_reads, output icache_read_misses, - output icache_pipe_stalls, - output icache_crsp_stalls, output dcache_reads, - output dcache_writes, + output dcache_writes, output dcache_read_misses, output dcache_write_misses, output dcache_bank_stalls, output dcache_mshr_stalls, - output dcache_pipe_stalls, - output dcache_crsp_stalls, output 
smem_reads, output smem_writes, output smem_bank_stalls, output mem_reads, output mem_writes, - output mem_stalls, output mem_latency ); modport slave ( input icache_reads, input icache_read_misses, - input icache_pipe_stalls, - input icache_crsp_stalls, input dcache_reads, - input dcache_writes, + input dcache_writes, input dcache_read_misses, input dcache_write_misses, input dcache_bank_stalls, input dcache_mshr_stalls, - input dcache_pipe_stalls, - input dcache_crsp_stalls, input smem_reads, input smem_writes, input smem_bank_stalls, input mem_reads, input mem_writes, - input mem_stalls, input mem_latency ); diff --git a/hw/rtl/interfaces/VX_perf_pipeline_if.sv b/hw/rtl/interfaces/VX_perf_pipeline_if.sv index 19cc15c3..a4470e4c 100644 --- a/hw/rtl/interfaces/VX_perf_pipeline_if.sv +++ b/hw/rtl/interfaces/VX_perf_pipeline_if.sv @@ -4,18 +4,27 @@ `include "VX_define.vh" interface VX_perf_pipeline_if (); - - wire [`PERF_CTR_BITS-1:0] ibf_stalls; - wire [`PERF_CTR_BITS-1:0] scb_stalls; - wire [`PERF_CTR_BITS-1:0] lsu_stalls; - wire [`PERF_CTR_BITS-1:0] csr_stalls; - wire [`PERF_CTR_BITS-1:0] alu_stalls; + wire [`PERF_CTR_BITS-1:0] loads; + wire [`PERF_CTR_BITS-1:0] stores; + wire [`PERF_CTR_BITS-1:0] branches; + + wire [`PERF_CTR_BITS-1:0] ibf_stalls; + wire [`PERF_CTR_BITS-1:0] scb_stalls; + wire [`PERF_CTR_BITS-1:0] lsu_stalls; + wire [`PERF_CTR_BITS-1:0] csr_stalls; + wire [`PERF_CTR_BITS-1:0] alu_stalls; `ifdef EXT_F_ENABLE - wire [`PERF_CTR_BITS-1:0] fpu_stalls; + wire [`PERF_CTR_BITS-1:0] fpu_stalls; `endif - wire [`PERF_CTR_BITS-1:0] gpu_stalls; + wire [`PERF_CTR_BITS-1:0] gpu_stalls; - modport master ( + modport decode ( + output loads, + output stores, + output branches + ); + + modport issue ( output ibf_stalls, output scb_stalls, output lsu_stalls, @@ -25,9 +34,12 @@ interface VX_perf_pipeline_if (); output fpu_stalls, `endif output gpu_stalls - ); + ); modport slave ( + input loads, + input stores, + input branches, input ibf_stalls, input scb_stalls, 
input lsu_stalls, diff --git a/hw/rtl/interfaces/VX_perf_tex_if.sv b/hw/rtl/interfaces/VX_perf_tex_if.sv new file mode 100644 index 00000000..222ade53 --- /dev/null +++ b/hw/rtl/interfaces/VX_perf_tex_if.sv @@ -0,0 +1,23 @@ +`ifndef VX_PERF_TEX_IF +`define VX_PERF_TEX_IF + +`include "VX_define.vh" + +interface VX_perf_tex_if (); + + wire [`PERF_CTR_BITS-1:0] mem_reads; + wire [`PERF_CTR_BITS-1:0] mem_latency; + + modport master ( + output mem_reads, + output mem_latency + ); + + modport slave ( + input mem_reads, + input mem_latency + ); + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_tex_req_if.sv b/hw/rtl/interfaces/VX_tex_req_if.sv index f1eaa1be..0059de59 100644 --- a/hw/rtl/interfaces/VX_tex_req_if.sv +++ b/hw/rtl/interfaces/VX_tex_req_if.sv @@ -6,6 +6,7 @@ interface VX_tex_req_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,6 +21,7 @@ interface VX_tex_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -33,6 +35,7 @@ interface VX_tex_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_tex_rsp_if.sv b/hw/rtl/interfaces/VX_tex_rsp_if.sv index b3dbd65d..5966124c 100644 --- a/hw/rtl/interfaces/VX_tex_rsp_if.sv +++ b/hw/rtl/interfaces/VX_tex_rsp_if.sv @@ -6,6 +6,7 @@ interface VX_tex_rsp_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -16,6 +17,7 @@ interface VX_tex_rsp_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -27,6 +29,7 @@ interface VX_tex_rsp_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_writeback_if.sv b/hw/rtl/interfaces/VX_writeback_if.sv index 8f05fc7a..00cab3b8 100644 --- a/hw/rtl/interfaces/VX_writeback_if.sv +++ 
b/hw/rtl/interfaces/VX_writeback_if.sv @@ -6,6 +6,7 @@ interface VX_writeback_if (); wire valid; + wire [63:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; @@ -16,6 +17,7 @@ interface VX_writeback_if (); modport master ( output valid, + output uuid, output tmask, output wid, output PC, @@ -27,6 +29,7 @@ interface VX_writeback_if (); modport slave ( input valid, + input uuid, input tmask, input wid, input PC, diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index 2788c315..9e96eedb 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -125,7 +125,7 @@ module VX_axi_adapter #( // AXI write response channel `UNUSED_VAR (m_axi_bid); - `RUNTIME_ASSERT(~m_axi_bvalid || m_axi_bresp == 0, ("AXI response error")); + `RUNTIME_ASSERT(~m_axi_bvalid || m_axi_bresp == 0, ("%t: *** AXI response error", $time)); assign m_axi_bready = 1'b1; // AXI read request channel @@ -144,7 +144,7 @@ module VX_axi_adapter #( assign mem_rsp_valid = m_axi_rvalid; assign mem_rsp_tag = m_axi_rid; assign mem_rsp_data = m_axi_rdata; - `RUNTIME_ASSERT(~m_axi_rvalid || m_axi_rresp == 0, ("AXI response error")); + `RUNTIME_ASSERT(~m_axi_rvalid || m_axi_rresp == 0, ("%t: *** AXI response error", $time)); `UNUSED_VAR (m_axi_rlast); assign m_axi_rready = mem_rsp_ready; diff --git a/hw/rtl/libs/VX_index_queue.sv b/hw/rtl/libs/VX_index_queue.sv index 66307d74..201287fb 100644 --- a/hw/rtl/libs/VX_index_queue.sv +++ b/hw/rtl/libs/VX_index_queue.sv @@ -32,7 +32,7 @@ module VX_index_queue #( assign enqueue = push; assign dequeue = !empty && !valid[rd_a]; // auto-remove when head is invalid - `RUNTIME_ASSERT(!push || !full, ("invalid inputs")); + `RUNTIME_ASSERT(!push || !full, ("%t: *** invalid inputs", $time)); always @(posedge clk) begin if (reset) begin diff --git a/hw/rtl/libs/VX_popcount.sv b/hw/rtl/libs/VX_popcount.sv index 3144f106..8c8b08d3 100644 --- a/hw/rtl/libs/VX_popcount.sv +++ b/hw/rtl/libs/VX_popcount.sv @@ 
-4,12 +4,17 @@ module VX_popcount #( parameter MODEL = 1, parameter N = 1, - parameter LOGN = $clog2(N), - parameter M = LOGN+1 + parameter M = $clog2(N+1) ) ( input wire [N-1:0] in_i, output wire [M-1:0] cnt_o ); +`ifndef SYNTHESIS + assign cnt_o = $countones(in_i); +`else +`ifdef QUARTUS + assign cnt_o = $countones(in_i); +`else if (N == 1) begin assign cnt_o = in_i; @@ -53,6 +58,8 @@ module VX_popcount #( assign cnt_o = cnt_r; end +`endif +`endif endmodule `TRACING_ON \ No newline at end of file diff --git a/hw/rtl/libs/VX_skid_buffer.sv b/hw/rtl/libs/VX_skid_buffer.sv index ba6c8b6c..c6820f75 100644 --- a/hw/rtl/libs/VX_skid_buffer.sv +++ b/hw/rtl/libs/VX_skid_buffer.sv @@ -30,7 +30,7 @@ module VX_skid_buffer #( end else if (NOBACKPRESSURE) begin - `RUNTIME_ASSERT(ready_out, ("ready_out should always be asserted")) + `RUNTIME_ASSERT(ready_out, ("%t: *** ready_out should always be asserted", $time)) wire stall = valid_out && ~ready_out; diff --git a/hw/rtl/tex_unit/VX_tex_unit.sv b/hw/rtl/tex_unit/VX_tex_unit.sv index 38f93eb2..c9510827 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.sv +++ b/hw/rtl/tex_unit/VX_tex_unit.sv @@ -6,6 +6,11 @@ module VX_tex_unit #( input wire clk, input wire reset, + // PERF +`ifdef PERF_ENABLE + VX_perf_tex_if.master perf_tex_if, +`endif + // Texture unit <-> Memory Unit VX_dcache_req_if.master dcache_req_if, VX_dcache_rsp_if.slave dcache_rsp_if, @@ -18,10 +23,11 @@ module VX_tex_unit #( VX_tex_rsp_if.master tex_rsp_if ); - localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32; + localparam REQ_INFOW_S = 64 + `NR_BITS + 1 + `NW_BITS + 32; localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S; localparam REQ_INFOW_M = (2 * `NUM_THREADS * `TEX_BLEND_FRAC) + REQ_INFOW_A; + reg [$clog2(`NUM_TEX_UNITS)-1:0] csr_tex_unit; reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][`TEX_LOD_MAX+1-1:0]; reg [1:0][`TEX_LOD_BITS-1:0] tex_logdims [`NUM_TEX_UNITS-1:0]; reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0]; @@ -29,57 +35,60 @@ 
module VX_tex_unit #( reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0]; reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0]; - // CSRs programming + // CSRs programming - reg [`NUM_TEX_UNITS-1:0] csrs_dirty; + reg csrs_dirty [`NUM_TEX_UNITS-1:0]; `UNUSED_VAR (csrs_dirty) - for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin - always @(posedge clk) begin - if (tex_csr_if.write_enable) begin - case (tex_csr_if.write_addr) - `CSR_TEX(i, `TEX_STATE_ADDR) : begin - tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_FORMAT) : begin - tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_WRAPU) : begin - tex_wraps[i][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_WRAPV) : begin - tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_FILTER) : begin - tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_WIDTH) : begin - tex_logdims[i][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_HEIGHT) : begin - tex_logdims[i][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; - csrs_dirty[i] <= 1; - end - default: begin - for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin - `IGNORE_WARNINGS_BEGIN - if (tex_csr_if.write_addr == `CSR_ADDR_BITS'(`CSR_TEX(i, `TEX_STATE_MIPOFF(j)))) begin - `IGNORE_WARNINGS_END - tex_mipoff[i][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; - csrs_dirty[i] <= 1; - end + always @(posedge clk) begin + if (tex_csr_if.write_enable) begin + case (tex_csr_if.write_addr) + `CSR_TEX_UNIT: begin + csr_tex_unit <= tex_csr_if.write_data[$clog2(`NUM_TEX_UNITS)-1:0]; + end + `CSR_TEX_ADDR: begin + tex_baddr[csr_tex_unit] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + 
`CSR_TEX_FORMAT: begin + tex_format[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + `CSR_TEX_WRAPU: begin + tex_wraps[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + `CSR_TEX_WRAPV: begin + tex_wraps[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + `CSR_TEX_FILTER: begin + tex_filter[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + `CSR_TEX_WIDTH: begin + tex_logdims[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + `CSR_TEX_HEIGHT: begin + tex_logdims[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + default: begin + for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin + `IGNORE_WARNINGS_BEGIN + if (tex_csr_if.write_addr == `CSR_TEX_MIPOFF(j)) begin + `IGNORE_WARNINGS_END + tex_mipoff[csr_tex_unit][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; end end - endcase - end - if (reset || (tex_req_if.valid && tex_req_if.ready)) begin - csrs_dirty[i] <= '0; + end + endcase + end + if (reset || (tex_req_if.valid && tex_req_if.ready)) begin + for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin + csrs_dirty[i] <= 0; end end end @@ -125,7 +134,7 @@ module VX_tex_unit #( .req_baseaddr(tex_baddr[tex_req_if.unit]), .req_mipoff (sel_mipoff), .req_logdims(sel_logdims), - .req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), + .req_info ({tex_format[tex_req_if.unit], tex_req_if.uuid, tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), .req_ready (tex_req_if.ready), .rsp_valid (mem_req_valid), @@ -204,9 +213,47 @@ module VX_tex_unit #( .rsp_valid (tex_rsp_if.valid), .rsp_tmask (tex_rsp_if.tmask), .rsp_data (tex_rsp_if.data), - .rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, 
tex_rsp_if.wid, tex_rsp_if.PC}), + .rsp_info ({tex_rsp_if.uuid, tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}), .rsp_ready (tex_rsp_if.ready) - ); + ); + +`ifdef PERF_ENABLE + wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_req_per_cycle; + wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_rsp_per_cycle; + + wire [`NUM_THREADS-1:0] perf_mem_req_per_mask = dcache_req_if.valid & dcache_req_if.ready; + wire [`NUM_THREADS-1:0] perf_mem_rsp_per_mask = dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_if.valid & dcache_rsp_if.ready}}; + + `POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_per_mask); + `POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_per_mask); + + reg [`PERF_CTR_BITS-1:0] perf_pending_reads; + wire [$clog2(`NUM_THREADS+1)+1-1:0] perf_pending_reads_cycle = perf_mem_req_per_cycle - perf_mem_rsp_per_cycle; + + always @(posedge clk) begin + if (reset) begin + perf_pending_reads <= 0; + end else begin + perf_pending_reads <= perf_pending_reads + `PERF_CTR_BITS'($signed(perf_pending_reads_cycle)); + end + end + + reg [`PERF_CTR_BITS-1:0] perf_mem_reads; + reg [`PERF_CTR_BITS-1:0] perf_mem_latency; + + always @(posedge clk) begin + if (reset) begin + perf_mem_reads <= 0; + perf_mem_latency <= 0; + end else begin + perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(perf_mem_req_per_cycle); + perf_mem_latency <= perf_mem_latency + `PERF_CTR_BITS'(perf_pending_reads); + end + end + + assign perf_tex_if.mem_reads = perf_mem_reads; + assign perf_tex_if.mem_latency = perf_mem_latency; +`endif `ifdef DBG_TRACE_TEX always @(posedge clk) begin diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json index 2c9f8355..c9c49ebe 100644 --- a/hw/scripts/scope.json +++ b/hw/scripts/scope.json @@ -123,9 +123,9 @@ "!cci_pending_writes_full": 1, "?afu_mem_req_fire": 1, "afu_mem_req_addr": 26, - "afu_mem_req_tag": 27, + "afu_mem_req_tag": "`VX_MEM_TAG_WIDTH+1", "?afu_mem_rsp_fire": 1, - "afu_mem_rsp_tag": 27 + "afu_mem_rsp_tag": "`VX_MEM_TAG_WIDTH+1" }, "afu/vortex": { "!reset": 1, 
@@ -140,49 +140,29 @@ "mem_rsp_tag":"`VX_MEM_TAG_WIDTH", "busy": 1 }, - "afu/vortex/cluster/core/pipeline/fetch/icache_stage": { - "?icache_req_fire": 1, - "icache_req_wid":"`NW_BITS", - "icache_req_addr": 32, - "icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS", - "?icache_rsp_fire": 1, - "icache_rsp_data": 32, - "icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS" - }, "afu/vortex/cluster/core/pipeline/fetch/warp_sched": { "?wsched_scheduled": 1, + "wsched_schedule_uuid": 64, "wsched_active_warps": "`NUM_WARPS", "wsched_stalled_warps": "`NUM_WARPS", "wsched_schedule_tmask": "`NUM_THREADS", "wsched_schedule_wid": "`NW_BITS", - "wsched_schedule_pc": "32" + "wsched_schedule_pc": 32 }, - "afu/vortex/cluster/core/pipeline/execute/gpu_unit": { - "?gpu_rsp_valid": 1, - "gpu_rsp_wid": "`NW_BITS", - "gpu_rsp_tmc": 1, - "gpu_rsp_wspawn": 1, - "gpu_rsp_split": 1, - "gpu_rsp_barrier": 1 - }, - "afu/vortex/cluster/core/pipeline/execute/lsu_unit": { - "?dcache_req_fire":"`NUM_THREADS", - "dcache_req_wid":"`NW_BITS", - "dcache_req_pc": 32, - "dcache_req_addr":"`NUM_THREADS * 32", - "dcache_req_rw": 1, - "dcache_req_byteen":"`NUM_THREADS * 4", - "dcache_req_data": "`NUM_THREADS * 32", - "dcache_req_tag":"`LSUQ_ADDR_BITS", - "?dcache_rsp_fire":"`NUM_THREADS", - "dcache_rsp_data":"`NUM_THREADS * 32", - "dcache_rsp_tag":"`LSUQ_ADDR_BITS" + "afu/vortex/cluster/core/pipeline/fetch/icache_stage": { + "?icache_req_fire": 1, + "icache_req_uuid": 64, + "icache_req_addr": 32, + "icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS", + "?icache_rsp_fire": 1, + "icache_rsp_uuid": 64, + "icache_rsp_data": 32, + "icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS" }, "afu/vortex/cluster/core/pipeline/issue": { "?issue_fire": 1, - "issue_wid":"`NW_BITS", - "issue_tmask":"`NUM_THREADS", - "issue_pc": 32, + "issue_uuid": 64, + "issue_tmask":"`NUM_THREADS", "issue_ex_type":"`EX_BITS", "issue_op_type":"`INST_OP_BITS", "issue_op_mod":"`INST_MOD_BITS", @@ -198,15 +178,35 @@ "gpr_rs2":"`NUM_THREADS * 32", "gpr_rs3":"`NUM_THREADS * 32", 
"?writeback_valid": 1, - "writeback_wid":"`NW_BITS", - "writeback_pc": 32, + "writeback_uuid": 64, "writeback_tmask":"`NUM_THREADS", "writeback_rd":"`NR_BITS", "writeback_data":"`NUM_THREADS * 32", "writeback_eop": 1, "!scoreboard_delay": 1, "!dispatch_delay": 1 - }, + }, + "afu/vortex/cluster/core/pipeline/execute/lsu_unit": { + "?dcache_req_fire":"`NUM_THREADS", + "dcache_req_uuid": 64, + "dcache_req_addr":"`NUM_THREADS * 32", + "dcache_req_rw": 1, + "dcache_req_byteen":"`NUM_THREADS * 4", + "dcache_req_data":"`NUM_THREADS * 32", + "dcache_req_tag":"`LSUQ_ADDR_BITS", + "?dcache_rsp_fire":"`NUM_THREADS", + "dcache_rsp_uuid": 64, + "dcache_rsp_data":"`NUM_THREADS * 32", + "dcache_rsp_tag":"`LSUQ_ADDR_BITS" + }, + "afu/vortex/cluster/core/pipeline/execute/gpu_unit": { + "?gpu_rsp_valid": 1, + "gpu_rsp_uuid": 64, + "gpu_rsp_tmc": 1, + "gpu_rsp_wspawn": 1, + "gpu_rsp_split": 1, + "gpu_rsp_barrier": 1 + }, "afu/vortex/l3cache/bank, afu/vortex/cluster/l2cache/bank, afu/vortex/cluster/core/mem_unit/dcache/bank, afu/vortex/cluster/core/mem_unit/icache/bank": { "?valid_st0": 1, "?valid_st1": 1, diff --git a/runtime/src/vx_start.S b/runtime/src/vx_start.S index 0d2a0078..16e91a15 100644 --- a/runtime/src/vx_start.S +++ b/runtime/src/vx_start.S @@ -42,15 +42,9 @@ _start: .type _exit, @function .global _exit _exit: - beqz a0, label_exit_next - mv gp, a0 - ecall; - -label_exit_next: - # dump performance CSRs - call vx_perf_dump - - # disable all threads in current warp + mv s0, a0 + call vx_perf_dump + mv gp, s0 li a0, 0 .insn s 0x6b, 0, x0, 0(a0) # tmc a0 diff --git a/sim/common/mempool.h b/sim/common/mempool.h new file mode 100644 index 00000000..a5c0429d --- /dev/null +++ b/sim/common/mempool.h @@ -0,0 +1,47 @@ +#pragma once + +#include + +template +class MemoryPool { +public: + MemoryPool(uint32_t max_size) : max_size_(max_size) {} + + MemoryPool(MemoryPool && other) + : free_list_(std::move(other.free_list_)) + {} + + ~MemoryPool() { + this->flush(); + } + + void* 
allocate() { + void* mem; + if (!free_list_.empty()) { + mem = static_cast(free_list_.top()); + free_list_.pop(); + } else { + mem = ::operator new(sizeof(T)); + } + return mem; + } + + void deallocate(void * object) { + if (free_list_.size() < max_size_) { + free_list_.push(static_cast(object)); + } else { + ::operator delete(object); + } + } + + void flush() { + while (!free_list_.empty()) { + ::operator delete(free_list_.top()); + free_list_.pop(); + } + } + +private: + std::stack free_list_; + uint32_t max_size_; +}; \ No newline at end of file diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 369a3503..3a5ab2b6 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -7,6 +7,7 @@ #include #include #include +#include "mempool.h" class SimObjectBase; @@ -20,37 +21,14 @@ public: return module_; } - SimPortBase* peer() const { - return peer_; - } - - bool connected() const { - return (peer_ != nullptr); - } - protected: SimPortBase(SimObjectBase* module) : module_(module) - , peer_(nullptr) {} - void connect(SimPortBase* peer) { - assert(peer_ == nullptr); - peer_ = peer; - } - - void disconnect() { - assert(peer_ == nullptr); - peer_ = nullptr; - } - SimPortBase& operator=(const SimPortBase&) = delete; SimObjectBase* module_; - SimPortBase* peer_; - - template friend class SlavePort; - template friend class MasterPort; }; /////////////////////////////////////////////////////////////////////////////// @@ -58,72 +36,92 @@ protected: template class SimPort : public SimPortBase { public: - void send(const Pkt& pkt, uint64_t delay) const; + typedef std::function TxCallback; + + SimPort(SimObjectBase* module) + : SimPortBase(module) + , peer_(nullptr) + , tx_cb_(nullptr) + {} + + void send(const Pkt& pkt, uint64_t delay = 1) const; void bind(SimPort* peer) { - this->connect(peer); + assert(peer_ == nullptr); + peer_ = peer; } void unbind() { - this->disconnect(); + assert(peer_ == nullptr); + peer_ = nullptr; + } + + bool connected() const { 
+ return (peer_ != nullptr); + } + + SimPort* peer() const { + return peer_; } bool empty() const { return queue_.empty(); } - const Pkt& top() const { + const Pkt& front() const { return queue_.front(); } - Pkt& top() { - return queue_.front(); + Pkt& front() { + return queue_.front().pkt; } - void pop() { + const Pkt& back() const { + return queue_.back(); + } + + Pkt& back() { + return queue_.back().pkt; + } + + uint64_t pop() { + auto cycle = queue_.front().cycle; queue_.pop(); - } + return cycle; + } + + void tx_callback(const TxCallback& callback) { + tx_cb_ = callback; + } protected: - SimPort(SimObjectBase* module) - : SimPortBase(module) - {} + struct timed_pkt_t { + Pkt pkt; + uint64_t cycle; + }; - void push(const Pkt& data) { - queue_.push(data); + std::queue queue_; + SimPort* peer_; + TxCallback tx_cb_; + + void push(const Pkt& data, uint64_t cycle) { + if (tx_cb_) { + tx_cb_(data, cycle); + } + if (peer_) { + peer_->push(data, cycle); + } else { + queue_.push({data, cycle}); + } } SimPort& operator=(const SimPort&) = delete; - std::queue queue_; - template friend class SimPortEvent; }; /////////////////////////////////////////////////////////////////////////////// -template -class SlavePort : public SimPort { -public: - SlavePort(SimObjectBase* module) : SimPort(module) {} - -protected: - SlavePort& operator=(const SlavePort&) = delete; -}; - -/////////////////////////////////////////////////////////////////////////////// - -template -class MasterPort : public SimPort { -public: - MasterPort(SimObjectBase* module) : SimPort(module) {} - -protected: - MasterPort& operator=(const MasterPort&) = delete; -}; - -/////////////////////////////////////////////////////////////////////////////// - class SimEventBase { public: typedef std::shared_ptr Ptr; @@ -132,14 +130,14 @@ public: virtual void fire() const = 0; - bool step() { - return (0 == --delay_); + uint64_t time() const { + return time_; } protected: - SimEventBase(uint64_t delay) : delay_(delay) {} + 
SimEventBase(uint64_t time) : time_(time) {} - uint64_t delay_; + uint64_t time_; }; /////////////////////////////////////////////////////////////////////////////// @@ -147,26 +145,34 @@ protected: template class SimCallEvent : public SimEventBase { public: - typedef std::function Func; - - template - static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) { - return std::make_shared(func, pkt, delay); - } - - SimCallEvent(const Func& func, const Pkt& pkt, uint64_t delay) - : SimEventBase(delay) - , func_(func) - , pkt_(pkt) - {} - void fire() const override { func_(pkt_); } -protected: + typedef std::function Func; + + SimCallEvent(const Func& func, const Pkt& pkt, uint64_t time) + : SimEventBase(time) + , func_(func) + , pkt_(pkt) + {} + + void* operator new(size_t /*size*/) { + return allocator().allocate(); + } + + void operator delete(void* ptr) { + allocator().deallocate(ptr); + } + +protected: Func func_; - Pkt pkt_; + Pkt pkt_; + + static MemoryPool>& allocator() { + static MemoryPool> instance(64); + return instance; + } }; /////////////////////////////////////////////////////////////////////////////// @@ -174,23 +180,32 @@ protected: template class SimPortEvent : public SimEventBase { public: - static Ptr Create(const SimPort* port, const Pkt& pkt, uint64_t delay) { - return std::make_shared(port, pkt, delay); + void fire() const override { + const_cast*>(port_)->push(pkt_, time_); } - SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t delay) - : SimEventBase(delay) + SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t time) + : SimEventBase(time) , port_(port) , pkt_(pkt) {} - - void fire() const override { - const_cast*>(port_)->push(pkt_); + + void* operator new(size_t /*size*/) { + return allocator().allocate(); } -private: + void operator delete(void* ptr) { + allocator().deallocate(ptr); + } + +protected: const SimPort* port_; Pkt pkt_; + + static MemoryPool>& allocator() { + static MemoryPool> instance(64); + return 
instance; + } }; /////////////////////////////////////////////////////////////////////////////// @@ -203,24 +218,17 @@ public: virtual ~SimObjectBase() {} - template - void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay); - const std::string& name() const { return name_; } -protected: - virtual void step(uint64_t cycle) = 0; - SimObjectBase(const SimContext& ctx, const char* name); +protected: + + SimObjectBase(const SimContext& ctx, const char* name); -private: std::string name_; - - friend class SimPlatform; - friend class SimPortBase; }; /////////////////////////////////////////////////////////////////////////////// @@ -228,14 +236,16 @@ private: template class SimObject : public SimObjectBase { public: - typedef std::shared_ptr Ptr; + typedef std::shared_ptr Ptr; template static Ptr Create(Args&&... args); protected: - SimObject(const SimContext& ctx, const char* name) : SimObjectBase(ctx, name) {} + SimObject(const SimContext& ctx, const char* name) + : SimObjectBase(ctx, name) + {} void step(uint64_t cycle) override { this->impl().step(cycle); @@ -255,8 +265,8 @@ private: class SimContext { private: SimContext() {} - template template - friend typename SimObject::Ptr SimObject::Create(Args&&... args); + + friend class SimPlatform; }; /////////////////////////////////////////////////////////////////////////////// @@ -281,25 +291,19 @@ public: instance().clear(); } - void register_object(const SimObjectBase::Ptr& obj) { + template + typename SimObject::Ptr CreateObject(Args&&... 
args) { + auto obj = std::make_shared(SimContext{}, std::forward(args)...); objects_.push_back(obj); + return obj; } template - void schedule(const typename SimCallEvent::Func& callback, + void schedule(const typename SimCallEvent::Func& callback, const Pkt& pkt, uint64_t delay) { - auto evt = SimCallEvent::Create(callback, pkt, delay); - assert(delay != 0); - events_.emplace_back(evt); - } - - template - void schedule(const SimPort* port, - const Pkt& pkt, - uint64_t delay) { - auto evt = SimPortEvent::Create(port, pkt, delay); assert(delay != 0); + auto evt = std::make_shared>(callback, pkt, cycles_ + delay); events_.emplace_back(evt); } @@ -309,7 +313,7 @@ public: auto evt_it_end = events_.end(); while (evt_it != evt_it_end) { auto& event = *evt_it; - if (event->step()) { + if (cycles_ >= event->time()) { event->fire(); evt_it = events_.erase(evt_it); } else { @@ -341,9 +345,19 @@ private: events_.clear(); } + template + void schedule(const SimPort* port, const Pkt& pkt, uint64_t delay) { + assert(delay != 0); + auto evt = SimEventBase::Ptr(new SimPortEvent(port, pkt, cycles_ + delay)); + events_.emplace_back(evt); + } + std::vector objects_; std::list events_; uint64_t cycles_; + + template friend class SimPort; + friend class SimObjectBase; }; /////////////////////////////////////////////////////////////////////////////// @@ -355,22 +369,14 @@ inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) template template typename SimObject::Ptr SimObject::Create(Args&&... 
args) { - auto obj = std::make_shared(SimContext{}, std::forward(args)...); - SimPlatform::instance().register_object(obj); - return obj; + return SimPlatform::instance().CreateObject(std::forward(args)...); } template void SimPort::send(const Pkt& pkt, uint64_t delay) const { - if (peer_) { + if (peer_ && !tx_cb_) { reinterpret_cast*>(peer_)->send(pkt, delay); } else { SimPlatform::instance().schedule(this, pkt, delay); } -} - -template -void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) { - auto callback = std::bind(entry, obj, std::placeholders::_1); - SimPlatform::instance().schedule(callback, pkt, delay); } \ No newline at end of file diff --git a/sim/simX/archdef.h b/sim/simX/archdef.h index c6728831..c2a28f78 100644 --- a/sim/simX/archdef.h +++ b/sim/simX/archdef.h @@ -11,20 +11,20 @@ namespace vortex { class ArchDef { private: - int num_cores_; - int num_warps_; - int num_threads_; - int wsize_; - int vsize_; - int num_regs_; - int num_csrs_; - int num_barriers_; + uint16_t num_cores_; + uint16_t num_warps_; + uint16_t num_threads_; + uint16_t wsize_; + uint16_t vsize_; + uint16_t num_regs_; + uint16_t num_csrs_; + uint16_t num_barriers_; public: ArchDef(const std::string& /*arch*/, - int num_cores, - int num_warps, - int num_threads) + uint16_t num_cores, + uint16_t num_warps, + uint16_t num_threads) : num_cores_(num_cores) , num_warps_(num_warps) , num_threads_(num_threads) @@ -35,35 +35,35 @@ public: , num_barriers_(NUM_BARRIERS) {} - int wsize() const { + uint16_t wsize() const { return wsize_; } - int vsize() const { + uint16_t vsize() const { return vsize_; } - int num_regs() const { + uint16_t num_regs() const { return num_regs_; } - int num_csrs() const { + uint16_t num_csrs() const { return num_csrs_; } - int num_barriers() const { + uint16_t num_barriers() const { return num_barriers_; } - int num_threads() const { + uint16_t num_threads() const { return num_threads_; } - int num_warps() const { + 
uint16_t num_warps() const { return num_warps_; } - int num_cores() const { + uint16_t num_cores() const { return num_cores_; } }; diff --git a/sim/simX/args.h b/sim/simX/args.h index aeaba4e5..fd7de5bc 100644 --- a/sim/simX/args.h +++ b/sim/simX/args.h @@ -35,7 +35,7 @@ public: CommandLineArg(l, ht), arg_(x) {} int read(int argc, char **argv) { - __unused(argc); + __unused (argc); std::istringstream iss(argv[1]); iss >> arg_; return 1; @@ -53,7 +53,7 @@ public: CommandLineArg(l, ht), arg_(x) { arg_ = false; } int read(int argc, char **argv) { - __unused(argc, argv); + __unused (argc, argv); arg_ = true; return 0; } diff --git a/sim/simX/cache.cpp b/sim/simX/cache.cpp index da69cf3a..36da1b27 100644 --- a/sim/simX/cache.cpp +++ b/sim/simX/cache.cpp @@ -27,7 +27,7 @@ struct params_t { uint32_t tag_select_addr_start; uint32_t tag_select_addr_end; - params_t(const CacheConfig& config) { + params_t(const Cache::Config& config) { uint32_t bank_bits = log2ceil(config.num_banks); uint32_t offset_bits = config.B - config.W; uint32_t log2_bank_size = config.C - bank_bits; @@ -214,7 +214,7 @@ struct bank_t { std::vector sets; MSHR mshr; - bank_t(const CacheConfig& config, + bank_t(const Cache::Config& config, const params_t& params) : sets(params.sets_per_bank, params.blocks_per_set) , mshr(config.mshr_size) @@ -226,22 +226,30 @@ struct bank_t { class Cache::Impl { private: Cache* const simobject_; - CacheConfig config_; + Config config_; params_t params_; std::vector banks_; Switch::Ptr mem_switch_; Switch::Ptr bypass_switch_; - std::vector> mem_req_ports_; - std::vector> mem_rsp_ports_; + std::vector> mem_req_ports_; + std::vector> mem_rsp_ports_; + PerfStats perf_stats_; + uint64_t pending_read_reqs_; + uint64_t pending_write_reqs_; + uint64_t pending_fill_reqs_; + uint32_t flush_cycles_; public: - Impl(Cache* simobject, const CacheConfig& config) + Impl(Cache* simobject, const Config& config) : simobject_(simobject) , config_(config) , params_(config) , 
banks_(config.num_banks, {config, params_}) , mem_req_ports_(config.num_banks, simobject) , mem_rsp_ports_(config.num_banks, simobject) + , pending_read_reqs_(0) + , pending_write_reqs_(0) + , pending_fill_reqs_(0) { bypass_switch_ = Switch::Create("bypass_arb", ArbiterType::Priority, 2); bypass_switch_->ReqOut.bind(&simobject->MemReqPort); @@ -259,13 +267,29 @@ public: mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0)); } + + // calculate tag flush cycles + flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set; + } + + const PerfStats& perf_stats() const { + return perf_stats_; } - void step(uint64_t /*cycle*/) { + void step(uint64_t cycle) { + // wait on flush cycles + if (flush_cycles_ != 0) { + --flush_cycles_; + return; + } + + // calculate memory latency + perf_stats_.mem_latency += pending_fill_reqs_; + // handle bypasss responses auto& bypass_port = bypass_switch_->RspOut.at(1); if (!bypass_port.empty()) { - auto& mem_rsp = bypass_port.top(); + auto& mem_rsp = bypass_port.front(); uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1); uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; MemRsp core_rsp(tag); @@ -287,7 +311,7 @@ public: for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { auto& mem_rsp_port = mem_rsp_ports_.at(bank_id); if (!mem_rsp_port.empty()) { - auto& mem_rsp = mem_rsp_port.top(); + auto& mem_rsp = mem_rsp_port.front(); this->processMemoryFill(bank_id, mem_rsp.tag); pending_fill_req.at(bank_id) = true; mem_rsp_port.pop(); @@ -300,7 +324,7 @@ public: if (core_req_port.empty()) continue; - auto& core_req = core_req_port.top(); + auto& core_req = core_req_port.front(); // check cache bypassing if (core_req.is_io) { @@ -345,7 +369,7 @@ public: // check MSHR capacity if read or writeback if ((!core_req.write || !config_.write_through) && bank.mshr.full()) { - // stall + ++perf_stats_.mshr_stalls; continue; } @@ -356,7 +380,7 @@ 
public: || pipeline_req.set_id != set_id || pipeline_req.tag != tag || pipeline_req.infos[port_id].valid) { - // stall + ++perf_stats_.bank_stalls; continue; } // update pending request infos @@ -365,8 +389,15 @@ public: // schedule new request pipeline_req = bank_req; } + + if (core_req.write) + ++perf_stats_.writes; + else + ++perf_stats_.reads; + // remove request - core_req_port.pop(); + auto time = core_req_port.pop(); + perf_stats_.pipeline_stalls += (cycle - time); } // process active request @@ -393,6 +424,7 @@ public: auto& block = set.blocks.at(entry.block_id); block.valid = true; block.tag = entry.tag; + --pending_fill_reqs_; } void processBankRequest(const std::vector& pipeline_reqs) { @@ -438,7 +470,7 @@ public: if (hit) { // - // MISS handling + // Hit handling // if (pipeline_req.write) { // handle write hit @@ -462,8 +494,13 @@ public: } } else { // - // MISS handling - // + // Miss handling + // + if (pipeline_req.write) + ++perf_stats_.write_misses; + else + ++perf_stats_.read_misses; + if (!found_free_block && !config_.write_through) { // write back dirty block auto& repl_block = set.blocks.at(repl_block_id); @@ -472,6 +509,7 @@ public: mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag); mem_req.write = true; mem_req_ports_.at(bank_id).send(mem_req, 1); + ++perf_stats_.evictions; } } @@ -500,9 +538,10 @@ public: if (pending == -1) { MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); - mem_req.write = pipeline_req.write; + mem_req.write = false; mem_req.tag = mshr_id; mem_req_ports_.at(bank_id).send(mem_req, 1); + ++pending_fill_reqs_; } } } @@ -513,7 +552,7 @@ public: /////////////////////////////////////////////////////////////////////////////// -Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config) +Cache::Cache(const SimContext& ctx, const char* name, const Config& config) : SimObject(ctx, name) , CoreReqPorts(config.num_inputs, this) , 
CoreRspPorts(config.num_inputs, this) @@ -528,4 +567,8 @@ Cache::~Cache() { void Cache::step(uint64_t cycle) { impl_->step(cycle); +} + +const Cache::PerfStats& Cache::perf_stats() const { + return impl_->perf_stats(); } \ No newline at end of file diff --git a/sim/simX/cache.h b/sim/simX/cache.h index 0be8cf6e..8f4b3932 100644 --- a/sim/simX/cache.h +++ b/sim/simX/cache.h @@ -5,33 +5,58 @@ namespace vortex { -struct CacheConfig { - uint8_t C; // log2 cache size - uint8_t B; // log2 block size - uint8_t W; // log2 word size - uint8_t A; // log2 associativity - uint8_t addr_width; // word address bits - uint8_t num_banks; // number of banks - uint8_t ports_per_bank; // number of ports per bank - uint8_t num_inputs; // number of inputs - bool write_through; // is write-through - bool write_reponse; // enable write response - uint16_t victim_size; // victim cache size - uint16_t mshr_size; // MSHR buffer size - uint8_t latency; // pipeline latency -}; - -class Cache : public SimObject { +class Cache : public SimObject { public: - Cache(const SimContext& ctx, const char* name, const CacheConfig& config); + struct Config { + uint8_t C; // log2 cache size + uint8_t B; // log2 block size + uint8_t W; // log2 word size + uint8_t A; // log2 associativity + uint8_t addr_width; // word address bits + uint8_t num_banks; // number of banks + uint8_t ports_per_bank; // number of ports per bank + uint8_t num_inputs; // number of inputs + bool write_through; // is write-through + bool write_reponse; // enable write response + uint16_t victim_size; // victim cache size + uint16_t mshr_size; // MSHR buffer size + uint8_t latency; // pipeline latency + }; + struct PerfStats { + uint64_t reads; + uint64_t writes; + uint64_t read_misses; + uint64_t write_misses; + uint64_t evictions; + uint64_t pipeline_stalls; + uint64_t bank_stalls; + uint64_t mshr_stalls; + uint64_t mem_latency; + + PerfStats() + : reads(0) + , writes(0) + , read_misses(0) + , write_misses(0) + , evictions(0) + , 
pipeline_stalls(0) + , bank_stalls(0) + , mshr_stalls(0) + , mem_latency(0) + {} + }; + + std::vector> CoreReqPorts; + std::vector> CoreRspPorts; + SimPort MemReqPort; + SimPort MemRspPort; + + Cache(const SimContext& ctx, const char* name, const Config& config); ~Cache(); void step(uint64_t cycle); - std::vector> CoreReqPorts; - std::vector> CoreRspPorts; - MasterPort MemReqPort; - SlavePort MemRspPort; + const PerfStats& perf_stats() const; private: class Impl; diff --git a/sim/simX/constants.h b/sim/simX/constants.h index 218fa5f9..b173a03f 100644 --- a/sim/simX/constants.h +++ b/sim/simX/constants.h @@ -3,14 +3,14 @@ #include "types.h" #ifndef MEM_LATENCY -#define MEM_LATENCY 18 +#define MEM_LATENCY 24 #endif namespace vortex { -struct Constants { +enum Constants { -static constexpr uint32_t SMEM_DELAY = 1 + SM_ENABLE; + SMEM_BANK_OFFSET = log2ceil(sizeof(Word)) + log2ceil(STACK_SIZE / sizeof(Word)), }; diff --git a/sim/simX/core.cpp b/sim/simX/core.cpp index 19b20967..7c6cbffa 100644 --- a/sim/simX/core.cpp +++ b/sim/simX/core.cpp @@ -9,16 +9,18 @@ #include "decode.h" #include "core.h" #include "debug.h" +#include "constants.h" using namespace vortex; Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) : SimObject(ctx, "Core") + , MemRspPort(this) + , MemReqPort(this) , id_(id) , arch_(arch) , decoder_(arch) , mmu_(0, arch.wsize(), true) - , shared_mem_(4096) , tex_units_(NUM_TEX_UNITS, this) , warps_(arch.num_warps()) , barriers_(arch.num_barriers(), 0) @@ -27,7 +29,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , ibuffers_(arch.num_warps(), IBUF_SIZE) , scoreboard_(arch_) , exe_units_((int)ExeType::MAX) - , icache_(Cache::Create("Icache", CacheConfig{ + , icache_(Cache::Create("Icache", Cache::Config{ log2ceil(ICACHE_SIZE), // C log2ceil(L1_BLOCK_SIZE),// B 2, // W @@ -42,7 +44,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) NUM_WARPS, // mshr 2, // pipeline latency })) - , 
dcache_(Cache::Create("Dcache", CacheConfig{ + , dcache_(Cache::Create("Dcache", Cache::Config{ log2ceil(DCACHE_SIZE), // C log2ceil(L1_BLOCK_SIZE),// B 2, // W @@ -55,37 +57,41 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) false, // write response 0, // victim size DCACHE_MSHR_SIZE, // mshr - 2, // pipeline latency + 4, // pipeline latency + })) + , shared_mem_(SharedMem::Create("sharedmem", SharedMem::Config{ + arch.num_threads(), + arch.num_threads(), + Constants::SMEM_BANK_OFFSET, + 1, + false })) , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) , dcache_switch_(arch.num_threads()) - , fetch_stage_("fetch") - , decode_stage_("decode") - , issue_stage_("issue") - , execute_stage_("execute") - , commit_stage_("writeback") + , fetch_latch_("fetch") + , decode_latch_("decode") , pending_icache_(arch_.num_warps()) + , active_warps_(1) , stalled_warps_(0) , last_schedule_wid_(0) , issued_instrs_(0) , committed_instrs_(0) + , csr_tex_unit_(0) , ecall_(false) , ebreak_(false) - , stats_insts_(0) - , MemRspPort(this) - , MemReqPort(this) + , perf_mem_pending_reads_(0) { for (int i = 0; i < arch_.num_warps(); ++i) { warps_.at(i) = std::make_shared(this, i); } // register execute units - exe_units_.at((int)ExeType::NOP) = std::make_shared(this); - exe_units_.at((int)ExeType::ALU) = std::make_shared(this); - exe_units_.at((int)ExeType::LSU) = std::make_shared(this); - exe_units_.at((int)ExeType::CSR) = std::make_shared(this); - exe_units_.at((int)ExeType::FPU) = std::make_shared(this); - exe_units_.at((int)ExeType::GPU) = std::make_shared(this); + exe_units_.at((int)ExeType::NOP) = SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::ALU) = SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::CSR) = SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::FPU) = 
SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::GPU) = SimPlatform::instance().CreateObject(this); // connect l1 switch icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]); @@ -109,6 +115,18 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) // activate warp0 warps_.at(0)->setTmask(0, true); + + // memory perf callbacks + MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){ + __unused (cycle); + perf_stats_.mem_reads += !req.write; + perf_stats_.mem_writes += req.write; + perf_mem_pending_reads_ += !req.write; + }); + MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){ + __unused (cycle); + --perf_mem_pending_reads_; + }); } Core::~Core() { @@ -128,23 +146,26 @@ void Core::attach_ram(RAM* ram) { void Core::step(uint64_t cycle) { this->commit(cycle); this->execute(cycle); - this->issue(cycle); this->decode(cycle); this->fetch(cycle); + this->schedule(cycle); + + // update perf counter + perf_stats_.mem_latency += perf_mem_pending_reads_; DPN(2, std::flush); } -void Core::warp_scheduler(uint64_t cycle) { +void Core::schedule(uint64_t cycle) { __unused (cycle); bool foundSchedule = false; int scheduled_warp = last_schedule_wid_; // round robin scheduling - for (size_t wid = 0; wid < warps_.size(); ++wid) { - scheduled_warp = (scheduled_warp + 1) % warps_.size(); - bool warp_active = warps_.at(scheduled_warp)->active(); + for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) { + scheduled_warp = (scheduled_warp + 1) % nw; + bool warp_active = active_warps_.test(scheduled_warp); bool warp_stalled = stalled_warps_.test(scheduled_warp); if (warp_active && !warp_stalled) { last_schedule_wid_ = scheduled_warp; @@ -159,85 +180,91 @@ void Core::warp_scheduler(uint64_t cycle) { // suspend warp until decode stalled_warps_.set(scheduled_warp); - auto& warp = warps_.at(scheduled_warp); - stats_insts_ += warp->getActiveThreads(); - - auto trace = new pipeline_trace_t((issued_instrs_++ * arch_.num_cores()) + id_, 
arch_); + auto& warp = warps_.at(scheduled_warp); + + uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_; + + auto trace = new pipeline_trace_t(uuid, arch_); warp->eval(trace); DT(3, cycle, "pipeline-schedule: " << *trace); // advance to fetch stage - fetch_stage_.push(trace); + fetch_latch_.push(trace); } void Core::fetch(uint64_t cycle) { + __unused (cycle); + // handle icache reponse auto& icache_rsp_port = icache_->CoreRspPorts.at(0); if (!icache_rsp_port.empty()){ - auto& mem_rsp = icache_rsp_port.top(); + auto& mem_rsp = icache_rsp_port.front(); auto trace = pending_icache_.at(mem_rsp.tag); - auto latency = (SimPlatform::instance().cycles() - trace->icache_latency); - trace->icache_latency = latency; - decode_stage_.push(trace); + decode_latch_.push(trace); DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); pending_icache_.release(mem_rsp.tag); icache_rsp_port.pop(); } // send icache request - if (!fetch_stage_.empty()) { - auto trace = fetch_stage_.top(); - trace->icache_latency = SimPlatform::instance().cycles(); + if (!fetch_latch_.empty()) { + auto trace = fetch_latch_.front(); MemReq mem_req; mem_req.addr = trace->PC; mem_req.write = false; mem_req.tag = pending_icache_.allocate(trace); icache_->CoreReqPorts.at(0).send(mem_req, 1); DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); - fetch_stage_.pop(); - } - - // schedule next warp - this->warp_scheduler(cycle); + fetch_latch_.pop(); + } } void Core::decode(uint64_t cycle) { __unused (cycle); - if (decode_stage_.empty()) + if (decode_latch_.empty()) return; - auto trace = decode_stage_.top(); + auto trace = decode_latch_.front(); + + // check ibuffer capacity + auto& ibuffer = ibuffers_.at(trace->wid); + if (ibuffer.full()) { + if (!trace->suspend()) { + DT(3, cycle, "*** ibuffer-stall: " << *trace); + } + ++perf_stats_.ibuf_stalls; + return; + } else { + trace->resume(); + } // 
release warp if (!trace->fetch_stall) { stalled_warps_.reset(trace->wid); } + // update perf counters + uint32_t active_threads = trace->tmask.count(); + if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::LOAD) + perf_stats_.loads += active_threads; + if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::STORE) + perf_stats_.stores += active_threads; + if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH) + perf_stats_.branches += active_threads; + DT(3, cycle, "pipeline-decode: " << *trace); - - // advance to issue stage - issue_stage_.push(trace); - decode_stage_.pop(); + + // insert to ibuffer + ibuffer.push(trace); + + decode_latch_.pop(); } -void Core::issue(uint64_t cycle) { - __unused (cycle); - - if (!issue_stage_.empty()) { - // insert to ibuffer - auto trace = issue_stage_.top(); - auto& ibuffer = ibuffers_.at(trace->wid); - if (!trace->check_stalled(ibuffer.full())) { - DT(3, cycle, "*** ibuffer-stall: " << *trace); - } - if (!ibuffer.full()) { - ibuffer.push(trace); - issue_stage_.pop(); - } - } +void Core::execute(uint64_t cycle) { + __unused (cycle); // issue ibuffer instructions for (auto& ibuffer : ibuffers_) { @@ -247,180 +274,102 @@ void Core::issue(uint64_t cycle) { auto trace = ibuffer.top(); // check scoreboard - if (!trace->check_stalled(scoreboard_.in_use(trace))) { - DTH(3, cycle, "*** scoreboard-stall: dependents={"); - auto uses = scoreboard_.get_uses(trace); - for (uint32_t i = 0, n = uses.size(); i < n; ++i) { - auto& use = uses.at(i); - __unused(use); - if (i) DTN(3, ", "); - DTN(3, use.type << use.reg << "(#" << use.owner << ")"); + if (scoreboard_.in_use(trace)) { + if (!trace->suspend()) { + DTH(3, cycle, "*** scoreboard-stall: dependents={"); + auto uses = scoreboard_.get_uses(trace); + for (uint32_t i = 0, n = uses.size(); i < n; ++i) { + auto& use = uses.at(i); + __unused (use); + if (i) DTN(3, ", "); + DTN(3, use.type << use.reg << "(#" << use.owner << ")"); + } + DTN(3, "}, " << 
*trace << std::endl); } - DTN(3, "}, " << *trace << std::endl); - } - if (scoreboard_.in_use(trace)) + ++perf_stats_.scrb_stalls; continue; - - DT(3, cycle, "pipeline-issue: " << *trace); + } else { + trace->resume(); + } // update scoreboard scoreboard_.reserve(trace); - // advance to execute stage - execute_stage_.push(trace); + DT(3, cycle, "pipeline-issue: " << *trace); + + // push to execute units + auto& exe_unit = exe_units_.at((int)trace->exe_type); + exe_unit->Input.send(trace, 1); ibuffer.pop(); break; } } -void Core::execute(uint64_t cycle) { - // process stage inputs - if (!execute_stage_.empty()) { - auto trace = execute_stage_.top(); - auto& exe_unit = exe_units_.at((int)trace->exe_type); - exe_unit->push(trace); - DT(3, cycle, "pipeline-execute: " << *trace); - execute_stage_.pop(); - } - - // advance execute units - for (auto& exe_unit : exe_units_) { - exe_unit->step(cycle); - } - - // commit completed instructions - for (auto& exe_unit : exe_units_) { - if (!exe_unit->empty()) { - auto trace = exe_unit->top(); - if (trace->fetch_stall) { - stalled_warps_.reset(trace->wid); - } - // advance to commit stage - commit_stage_.push(trace); - exe_unit->pop(); - } - } -} - void Core::commit(uint64_t cycle) { __unused (cycle); - if (commit_stage_.empty()) - return; + // commit completed instructions + bool wb = false; + for (auto& exe_unit : exe_units_) { + if (!exe_unit->Output.empty()) { + auto trace = exe_unit->Output.front(); - auto trace = commit_stage_.top(); + // allow only one commit that updates registers + if (trace->wb && wb) + continue; + wb |= trace->wb; - DT(3, cycle, "pipeline-commit: " << *trace); + // advance to commit stage + DT(3, cycle, "pipeline-commit: " << *trace); - // update scoreboard - scoreboard_.release(trace); + // update scoreboard + scoreboard_.release(trace); - assert(committed_instrs_ <= issued_instrs_); - ++committed_instrs_; + assert(committed_instrs_ <= issued_instrs_); + ++committed_instrs_; - commit_stage_.pop(); + 
perf_stats_.instrs += trace->tmask.count(); - // delete the trace - delete trace; -} + // delete the trace + delete trace; -bool Core::running() const { - bool is_running = (committed_instrs_ != issued_instrs_); - return is_running; -} - -Word Core::get_csr(Addr addr, int tid, int wid) { - if (addr == CSR_FFLAGS) { - return fcsrs_.at(wid) & 0x1F; - } else if (addr == CSR_FRM) { - return (fcsrs_.at(wid) >> 5); - } else if (addr == CSR_FCSR) { - return fcsrs_.at(wid); - } else if (addr == CSR_WTID) { - // Warp threadID - return tid; - } else if (addr == CSR_LTID) { - // Core threadID - return tid + (wid * arch_.num_threads()); - } else if (addr == CSR_GTID) { - // Processor threadID - return tid + (wid * arch_.num_threads()) + - (arch_.num_threads() * arch_.num_warps() * id_); - } else if (addr == CSR_LWID) { - // Core warpID - return wid; - } else if (addr == CSR_GWID) { - // Processor warpID - return wid + (arch_.num_warps() * id_); - } else if (addr == CSR_GCID) { - // Processor coreID - return id_; - } else if (addr == CSR_TMASK) { - // Processor coreID - return warps_.at(wid)->getTmask(); - } else if (addr == CSR_NT) { - // Number of threads per warp - return arch_.num_threads(); - } else if (addr == CSR_NW) { - // Number of warps per core - return arch_.num_warps(); - } else if (addr == CSR_NC) { - // Number of cores - return arch_.num_cores(); - } else if (addr == CSR_MINSTRET) { - // NumInsts - return stats_insts_; - } else if (addr == CSR_MINSTRET_H) { - // NumInsts - return (Word)(stats_insts_ >> 32); - } else if (addr == CSR_MCYCLE) { - // NumCycles - return (Word)SimPlatform::instance().cycles(); - } else if (addr == CSR_MCYCLE_H) { - // NumCycles - return (Word)(SimPlatform::instance().cycles() >> 32); - } else { - if (addr >= CSR_TEX(0,0) - && addr < CSR_TEX(NUM_TEX_UNITS,0)) { - uint32_t unit = CSR_TEX_UNIT(addr); - uint32_t state = CSR_TEX_STATE(addr); - return tex_units_.at(unit).get_state(state); + exe_unit->Output.pop(); } - return csrs_.at(addr); 
} } -void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) { - if (addr == CSR_FFLAGS) { - fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F); - } else if (addr == CSR_FRM) { - fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5); - } else if (addr == CSR_FCSR) { - fcsrs_.at(wid) = value & 0xff; - } else { - if (addr >= CSR_TEX(0,0) - && addr < CSR_TEX(NUM_TEX_UNITS,0)) { - uint32_t unit = CSR_TEX_UNIT(addr); - uint32_t state = CSR_TEX_STATE(addr); - tex_units_.at(unit).set_state(state, value); - return; - } - csrs_.at(addr) = value; +WarpMask Core::wspawn(int num_warps, int nextPC) { + WarpMask ret(1); + int active_warps = std::min(num_warps, arch_.num_warps()); + DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC); + for (int i = 1; i < active_warps; ++i) { + auto warp = warps_.at(i); + warp->setPC(nextPC); + warp->setTmask(0, true); + ret.set(i); } + return std::move(ret); } -void Core::barrier(int bar_id, int count, int warp_id) { +WarpMask Core::barrier(int bar_id, int count, int warp_id) { + WarpMask ret(0); auto& barrier = barriers_.at(bar_id); barrier.set(warp_id); - if (barrier.count() < (size_t)count) - return; + if (barrier.count() < (size_t)count) { + warps_.at(warp_id)->suspend(); + DP(3, "*** Suspend warp #" << warp_id << " at barrier #" << bar_id); + return std::move(ret); + } for (int i = 0; i < arch_.num_warps(); ++i) { if (barrier.test(i)) { + DP(3, "*** Resume warp #" << i << " at barrier #" << bar_id); warps_.at(i)->activate(); + ret.set(i); } } barrier.reset(); + return std::move(ret); } Word Core::icache_read(Addr addr, Size size) { @@ -430,35 +379,21 @@ Word Core::icache_read(Addr addr, Size size) { } Word Core::dcache_read(Addr addr, Size size) { - Word data = 0; - if (SM_ENABLE) { - if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SMEM_BASE_ADDR)) { - shared_mem_.read(&data, addr & (SMEM_SIZE-1), size); - return data; - } - } + Word data; mmu_.read(&data, addr, size, 0); 
return data; } void Core::dcache_write(Addr addr, Word data, Size size) { - if (SM_ENABLE) { - if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SMEM_BASE_ADDR)) { - shared_mem_.write(&data, addr & (SMEM_SIZE-1), size); - return; - } - } if (addr >= IO_COUT_ADDR && addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { this->writeToStdOut(addr, data); - return; + } else { + mmu_.write(&data, addr, size, 0); } - mmu_.write(&data, addr, size, 0); } -Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector* mem_addrs) { +Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector* mem_addrs) { return tex_units_.at(unit).read(u, v, lod, mem_addrs); } @@ -473,6 +408,228 @@ void Core::writeToStdOut(Addr addr, Word data) { } } +Word Core::get_csr(Addr addr, int tid, int wid) { + switch (addr) { + case CSR_SATP: + case CSR_PMPCFG0: + case CSR_PMPADDR0: + case CSR_MSTATUS: + case CSR_MISA: + case CSR_MEDELEG: + case CSR_MIDELEG: + case CSR_MIE: + case CSR_MTVEC: + case CSR_MEPC: + return 0; + + case CSR_FFLAGS: + return fcsrs_.at(wid) & 0x1F; + case CSR_FRM: + return (fcsrs_.at(wid) >> 5); + case CSR_FCSR: + return fcsrs_.at(wid); + case CSR_WTID: + // Warp threadID + return tid; + case CSR_LTID: + // Core threadID + return tid + (wid * arch_.num_threads()); + case CSR_GTID: + // Processor threadID + return tid + (wid * arch_.num_threads()) + + (arch_.num_threads() * arch_.num_warps() * id_); + case CSR_LWID: + // Core warpID + return wid; + case CSR_GWID: + // Processor warpID + return wid + (arch_.num_warps() * id_); + case CSR_GCID: + // Processor coreID + return id_; + case CSR_TMASK: + // Processor coreID + return warps_.at(wid)->getTmask(); + case CSR_NT: + // Number of threads per warp + return arch_.num_threads(); + case CSR_NW: + // Number of warps per core + return arch_.num_warps(); + case CSR_NC: + // Number of cores + return arch_.num_cores(); + case CSR_MINSTRET: + // NumInsts + return perf_stats_.instrs & 0xffffffff; + case 
CSR_MINSTRET_H: + // NumInsts + return (Word)(perf_stats_.instrs >> 32); + case CSR_MCYCLE: + // NumCycles + return (Word)SimPlatform::instance().cycles(); + case CSR_MCYCLE_H: + // NumCycles + return (Word)(SimPlatform::instance().cycles() >> 32); + case CSR_MPM_IBUF_ST: + return perf_stats_.ibuf_stalls & 0xffffffff; + case CSR_MPM_IBUF_ST_H: + return perf_stats_.ibuf_stalls >> 32; + case CSR_MPM_SCRB_ST: + return perf_stats_.scrb_stalls & 0xffffffff; + case CSR_MPM_SCRB_ST_H: + return perf_stats_.scrb_stalls >> 32; + case CSR_MPM_ALU_ST: + return perf_stats_.alu_stalls & 0xffffffff; + case CSR_MPM_ALU_ST_H: + return perf_stats_.alu_stalls >> 32; + case CSR_MPM_LSU_ST: + return perf_stats_.lsu_stalls & 0xffffffff; + case CSR_MPM_LSU_ST_H: + return perf_stats_.lsu_stalls >> 32; + case CSR_MPM_CSR_ST: + return perf_stats_.csr_stalls & 0xffffffff; + case CSR_MPM_CSR_ST_H: + return perf_stats_.csr_stalls >> 32; + case CSR_MPM_FPU_ST: + return perf_stats_.fpu_stalls & 0xffffffff; + case CSR_MPM_FPU_ST_H: + return perf_stats_.fpu_stalls >> 32; + case CSR_MPM_GPU_ST: + return perf_stats_.gpu_stalls & 0xffffffff; + case CSR_MPM_GPU_ST_H: + return perf_stats_.gpu_stalls >> 32; + + case CSR_MPM_LOADS: + return perf_stats_.loads & 0xffffffff; + case CSR_MPM_LOADS_H: + return perf_stats_.loads >> 32; + case CSR_MPM_STORES: + return perf_stats_.stores & 0xffffffff; + case CSR_MPM_STORES_H: + return perf_stats_.stores >> 32; + case CSR_MPM_BRANCHES: + return perf_stats_.branches & 0xffffffff; + case CSR_MPM_BRANCHES_H: + return perf_stats_.branches >> 32; + + case CSR_MPM_ICACHE_READS: + return icache_->perf_stats().reads & 0xffffffff; + case CSR_MPM_ICACHE_READS_H: + return icache_->perf_stats().reads >> 32; + case CSR_MPM_ICACHE_MISS_R: + return icache_->perf_stats().read_misses & 0xffffffff; + case CSR_MPM_ICACHE_MISS_R_H: + return icache_->perf_stats().read_misses >> 32; + + case CSR_MPM_DCACHE_READS: + return dcache_->perf_stats().reads & 0xffffffff; + case 
CSR_MPM_DCACHE_READS_H: + return dcache_->perf_stats().reads >> 32; + case CSR_MPM_DCACHE_WRITES: + return dcache_->perf_stats().writes & 0xffffffff; + case CSR_MPM_DCACHE_WRITES_H: + return dcache_->perf_stats().writes >> 32; + case CSR_MPM_DCACHE_MISS_R: + return dcache_->perf_stats().read_misses & 0xffffffff; + case CSR_MPM_DCACHE_MISS_R_H: + return dcache_->perf_stats().read_misses >> 32; + case CSR_MPM_DCACHE_MISS_W: + return dcache_->perf_stats().write_misses & 0xffffffff; + case CSR_MPM_DCACHE_MISS_W_H: + return dcache_->perf_stats().write_misses >> 32; + case CSR_MPM_DCACHE_BANK_ST: + return dcache_->perf_stats().bank_stalls & 0xffffffff; + case CSR_MPM_DCACHE_BANK_ST_H: + return dcache_->perf_stats().bank_stalls >> 32; + case CSR_MPM_DCACHE_MSHR_ST: + return dcache_->perf_stats().mshr_stalls & 0xffffffff; + case CSR_MPM_DCACHE_MSHR_ST_H: + return dcache_->perf_stats().mshr_stalls >> 32; + + case CSR_MPM_SMEM_READS: + return shared_mem_->perf_stats().reads & 0xffffffff; + case CSR_MPM_SMEM_READS_H: + return shared_mem_->perf_stats().reads >> 32; + case CSR_MPM_SMEM_WRITES: + return shared_mem_->perf_stats().writes & 0xffffffff; + case CSR_MPM_SMEM_WRITES_H: + return shared_mem_->perf_stats().writes >> 32; + case CSR_MPM_SMEM_BANK_ST: + return shared_mem_->perf_stats().bank_stalls & 0xffffffff; + case CSR_MPM_SMEM_BANK_ST_H: + return shared_mem_->perf_stats().bank_stalls >> 32; + + case CSR_MPM_MEM_READS: + return perf_stats_.mem_reads & 0xffffffff; + case CSR_MPM_MEM_READS_H: + return perf_stats_.mem_reads >> 32; + case CSR_MPM_MEM_WRITES: + return perf_stats_.mem_writes & 0xffffffff; + case CSR_MPM_MEM_WRITES_H: + return perf_stats_.mem_writes >> 32; + case CSR_MPM_MEM_LAT: + return perf_stats_.mem_latency & 0xffffffff; + case CSR_MPM_MEM_LAT_H: + return perf_stats_.mem_latency >> 32; + +#ifdef EXT_TEX_ENABLE + case CSR_MPM_TEX_READS: + return perf_stats_.tex_reads & 0xffffffff; + case CSR_MPM_TEX_READS_H: + return perf_stats_.tex_reads >> 32; + case 
CSR_MPM_TEX_LAT: + return perf_stats_.tex_latency & 0xffffffff; + case CSR_MPM_TEX_LAT_H: + return perf_stats_.tex_latency >> 32; +#endif + default: + if ((addr >= CSR_MPM_BASE && addr < (CSR_MPM_BASE + 32)) + || (addr >= CSR_MPM_BASE_H && addr < (CSR_MPM_BASE_H + 32))) { + // user-defined MPM CSRs + } else + #ifdef EXT_TEX_ENABLE + if (addr == CSR_TEX_UNIT) { + return csr_tex_unit_; + } else + if (addr >= CSR_TEX_STATE_BEGIN + && addr < CSR_TEX_STATE_END) { + uint32_t state = CSR_TEX_STATE(addr); + return tex_units_.at(csr_tex_unit_).get_state(state); + } else + #endif + { + std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl; + std::abort(); + } + } + return 0; +} + +void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) { + if (addr == CSR_FFLAGS) { + fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F); + } else if (addr == CSR_FRM) { + fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5); + } else if (addr == CSR_FCSR) { + fcsrs_.at(wid) = value & 0xff; + } else +#ifdef EXT_TEX_ENABLE + if (addr == CSR_TEX_UNIT) { + csr_tex_unit_ = value; + } else + if (addr >= CSR_TEX_STATE_BEGIN + && addr < CSR_TEX_STATE_END) { + uint32_t state = CSR_TEX_STATE(addr); + tex_units_.at(csr_tex_unit_).set_state(state, value); + return; + } else +#endif + { + csrs_.at(addr) = value; + } +} + void Core::trigger_ecall() { ecall_ = true; } @@ -483,4 +640,9 @@ void Core::trigger_ebreak() { bool Core::check_exit() const { return ebreak_ || ecall_; +} + +bool Core::running() const { + bool is_running = (committed_instrs_ != issued_instrs_); + return is_running; } \ No newline at end of file diff --git a/sim/simX/core.h b/sim/simX/core.h index 5066d8af..e4a6034e 100644 --- a/sim/simX/core.h +++ b/sim/simX/core.h @@ -17,6 +17,7 @@ #include "warp.h" #include "pipeline.h" #include "cache.h" +#include "sharedmem.h" #include "ibuffer.h" #include "scoreboard.h" #include "exeunit.h" @@ -26,6 +27,47 @@ namespace vortex { class Core : public 
SimObject { public: + struct PerfStats { + uint64_t instrs; + uint64_t ibuf_stalls; + uint64_t scrb_stalls; + uint64_t alu_stalls; + uint64_t lsu_stalls; + uint64_t csr_stalls; + uint64_t fpu_stalls; + uint64_t gpu_stalls; + uint64_t loads; + uint64_t stores; + uint64_t branches; + uint64_t mem_reads; + uint64_t mem_writes; + uint64_t mem_latency; + uint64_t tex_reads; + uint64_t tex_latency; + + PerfStats() + : instrs(0) + , ibuf_stalls(0) + , scrb_stalls(0) + , alu_stalls(0) + , lsu_stalls(0) + , csr_stalls(0) + , fpu_stalls(0) + , gpu_stalls(0) + , loads(0) + , stores(0) + , branches(0) + , mem_reads(0) + , mem_writes(0) + , mem_latency(0) + , tex_reads(0) + , tex_latency(0) + {} + }; + + SimPort MemRspPort; + SimPort MemReqPort; + Core(const SimContext& ctx, const ArchDef &arch, Word id); ~Core(); @@ -51,8 +93,8 @@ public: return arch_; } - unsigned long stats_insts() const { - return stats_insts_; + const PerfStats& perf_stats() const { + return perf_stats_; } Word getIRegValue(int reg) const { @@ -63,7 +105,9 @@ public: void set_csr(Addr addr, Word value, int tid, int wid); - void barrier(int bar_id, int count, int warp_id); + WarpMask wspawn(int num_warps, int nextPC); + + WarpMask barrier(int bar_id, int count, int warp_id); Word icache_read(Addr, Size); @@ -71,7 +115,7 @@ public: void dcache_write(Addr, Word, Size); - Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector* mem_addrs); + Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector* mem_addrs); void trigger_ecall(); @@ -81,21 +125,18 @@ public: private: + void schedule(uint64_t cycle); void fetch(uint64_t cycle); void decode(uint64_t cycle); - void issue(uint64_t cycle); void execute(uint64_t cycle); void commit(uint64_t cycle); - - void warp_scheduler(uint64_t cycle); - + void writeToStdOut(Addr addr, Word data); Word id_; const ArchDef arch_; const Decoder decoder_; MemoryUnit mmu_; - RAM shared_mem_; std::vector tex_units_; std::vector> warps_; @@ -107,33 +148,33 @@ 
private: std::vector exe_units_; Cache::Ptr icache_; Cache::Ptr dcache_; + SharedMem::Ptr shared_mem_; Switch::Ptr l1_mem_switch_; std::vector::Ptr> dcache_switch_; - PipelineStage fetch_stage_; - PipelineStage decode_stage_; - PipelineStage issue_stage_; - PipelineStage execute_stage_; - PipelineStage commit_stage_; + PipelineLatch fetch_latch_; + PipelineLatch decode_latch_; HashTable pending_icache_; - WarpMask stalled_warps_; + WarpMask active_warps_; + WarpMask stalled_warps_; uint32_t last_schedule_wid_; - uint32_t issued_instrs_; - uint32_t committed_instrs_; + uint64_t issued_instrs_; + uint64_t committed_instrs_; + uint32_t csr_tex_unit_; bool ecall_; bool ebreak_; std::unordered_map print_bufs_; - uint64_t stats_insts_; + PerfStats perf_stats_; + uint64_t perf_mem_pending_reads_; friend class LsuUnit; + friend class AluUnit; + friend class CsrUnit; + friend class FpuUnit; friend class GpuUnit; - -public: - SlavePort MemRspPort; - MasterPort MemReqPort; }; } // namespace vortex \ No newline at end of file diff --git a/sim/simX/decode.cpp b/sim/simX/decode.cpp index a2957c64..f890d2f9 100644 --- a/sim/simX/decode.cpp +++ b/sim/simX/decode.cpp @@ -359,14 +359,28 @@ std::shared_ptr Decoder::decode(Word code) const { instr->setDestReg(rd); } instr->setFunc3(func3); - instr->setFunc7(func7); - if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) { - instr->setImm(sext32(rs2, 5)); - } else { + instr->setFunc7(func7); + switch (op) { + case Opcode::SYS_INST: + case Opcode::FENCE: + // uint12 + instr->setImm(code >> shift_rs2_); + break; + case Opcode::I_INST: + if (func3 == 0x1 || func3 == 0x5) { + // int5 + instr->setImm(sext32(rs2, 5)); + } else { + // int12 + instr->setImm(sext32(code >> shift_rs2_, 12)); + } + break; + default: + // int12 instr->setImm(sext32(code >> shift_rs2_, 12)); + break; } } break; - case InstType::S_TYPE: { instr->setSrcReg(rs1); if (op == Opcode::FS) { @@ -375,8 +389,8 @@ std::shared_ptr Decoder::decode(Word code) const { 
instr->setSrcReg(rs2); } instr->setFunc3(func3); - Word imeed = (func7 << reg_s_) | rd; - instr->setImm(sext32(imeed, 12)); + Word imm = (func7 << reg_s_) | rd; + instr->setImm(sext32(imm, 12)); } break; case InstType::B_TYPE: { @@ -387,8 +401,8 @@ std::shared_ptr Decoder::decode(Word code) const { Word bits_4_1 = rd >> 1; Word bit_10_5 = func7 & 0x3f; Word bit_12 = func7 >> 6; - Word imeed = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12); - instr->setImm(sext32(imeed, 13)); + Word imm = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12); + instr->setImm(sext32(imm, 13)); } break; case InstType::U_TYPE: @@ -403,11 +417,11 @@ std::shared_ptr Decoder::decode(Word code) const { Word bit_11 = (unordered >> 8) & 0x1; Word bits_10_1 = (unordered >> 9) & 0x3ff; Word bit_20 = (unordered >> 19) & 0x1; - Word imeed = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20); + Word imm = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20); if (bit_20) { - imeed |= ~j_imm_mask_; + imm |= ~j_imm_mask_; } - instr->setImm(imeed); + instr->setImm(imm); } break; case InstType::V_TYPE: diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp index d55ba2f9..be172830 100644 --- a/sim/simX/execute.cpp +++ b/sim/simX/execute.cpp @@ -428,7 +428,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; Word data_read = core_->dcache_read(memAddr, 4); - trace->mem_addrs.at(t).push_back(memAddr); + trace->mem_addrs.at(t).push_back({memAddr, 4}); DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); switch (func3) { case 0: @@ -491,7 +491,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { if (!tmask_.test(t)) continue; Word memAddr = rsdata[t][0] + immsrc; - trace->mem_addrs.at(t).push_back(memAddr); + 
trace->mem_addrs.at(t).push_back({memAddr, (1u << func3)}); DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); switch (func3) { case 0: @@ -528,14 +528,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case SYS_INST: - trace->exe_type = ExeType::CSR; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; - Word csr_addr = immsrc & 0x00000FFF; - Word csr_value = core_->get_csr(csr_addr, t, id_); - switch (func3) { - case 0: + Word csr_addr = immsrc; + Word csr_value; + if (func3 == 0) { + trace->exe_type = ExeType::ALU; + trace->fetch_stall = true; switch (csr_addr) { case 0: // ECALL core_->trigger_ecall(); @@ -549,56 +549,59 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { break; default: std::abort(); - } - break; - case 1: - // CSRRW - rddata[t] = csr_value; - core_->set_csr(csr_addr, rsdata[t][0], t, id_); - trace->used_iregs.set(rsrc0); - rd_write = true; - break; - case 2: - // CSRRS - rddata[t] = csr_value; - core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_); - trace->used_iregs.set(rsrc0); - rd_write = true; - break; - case 3: - // CSRRC - rddata[t] = csr_value; - core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_); - trace->used_iregs.set(rsrc0); - rd_write = true; - break; - case 5: - // CSRRWI - rddata[t] = csr_value; - core_->set_csr(csr_addr, rsrc0, t, id_); - rd_write = true; - break; - case 6: - // CSRRSI - rddata[t] = csr_value; - core_->set_csr(csr_addr, csr_value | rsrc0, t, id_); - rd_write = true; - break; - case 7: - // CSRRCI - rddata[t] = csr_value; - core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_); - rd_write = true; - break; - default: - break; + } + } else { + trace->exe_type = ExeType::CSR; + csr_value = core_->get_csr(csr_addr, t, id_); + switch (func3) { + case 1: + // CSRRW + rddata[t] = csr_value; + core_->set_csr(csr_addr, rsdata[t][0], t, id_); + trace->used_iregs.set(rsrc0); + rd_write = true; + break; + case 2: + // CSRRS + rddata[t] = 
csr_value; + core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_); + trace->used_iregs.set(rsrc0); + rd_write = true; + break; + case 3: + // CSRRC + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_); + trace->used_iregs.set(rsrc0); + rd_write = true; + break; + case 5: + // CSRRWI + rddata[t] = csr_value; + core_->set_csr(csr_addr, rsrc0, t, id_); + rd_write = true; + break; + case 6: + // CSRRSI; + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value | rsrc0, t, id_); + rd_write = true; + break; + case 7: + // CSRRCI + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_); + rd_write = true; + break; + default: + break; + } } } break; case FENCE: trace->exe_type = ExeType::LSU; trace->lsu.type = LsuType::FENCE; - trace->fetch_stall = true; break; case FCI: trace->exe_type = ExeType::FPU; @@ -797,6 +800,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DPN(3, std::endl); active_ = tmask_.any(); + trace->gpu.active_warps.reset(); + trace->gpu.active_warps.set(id_, active_); } break; case 1: { // WSPAWN @@ -805,13 +810,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { trace->used_iregs.set(rsrc0); trace->used_iregs.set(rsrc1); trace->fetch_stall = true; - int active_warps = std::min(rsdata.at(ts)[0], core_->arch().num_warps()); - DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]); - for (int i = 1; i < active_warps; ++i) { - Warp &newWarp = core_->warp(i); - newWarp.setPC(rsdata[ts][1]); - newWarp.setTmask(0, true); - } + trace->gpu.active_warps = core_->wspawn(rsdata.at(ts)[0], rsdata.at(ts)[1]); } break; case 2: { // SPLIT @@ -877,9 +876,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { trace->gpu.type = GpuType::BAR; trace->used_iregs.set(rsrc0); trace->used_iregs.set(rsrc1); - trace->fetch_stall = true; - active_ = false; - core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); + 
trace->fetch_stall = true; + trace->gpu.active_warps = core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); } break; case 5: { // PREFETCH diff --git a/sim/simX/exeunit.cpp b/sim/simX/exeunit.cpp index 1d0a3cfc..1736101c 100644 --- a/sim/simX/exeunit.cpp +++ b/sim/simX/exeunit.cpp @@ -10,64 +10,78 @@ using namespace vortex; -NopUnit::NopUnit(Core*) : ExeUnit("NOP") {} +NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {} void NopUnit::step(uint64_t /*cycle*/) { - if (inputs_.empty()) + if (Input.empty()) return; - auto trace = inputs_.top(); - this->schedule_output(trace, 1); - inputs_.pop(); + auto trace = Input.front(); + Output.send(trace, 1); + Input.pop(); } /////////////////////////////////////////////////////////////////////////////// -LsuUnit::LsuUnit(Core* core) - : ExeUnit("LSU") - , core_(core) +LsuUnit::LsuUnit(const SimContext& ctx, Core* core) + : ExeUnit(ctx, core, "LSU") , num_threads_(core->arch().num_threads()) , pending_dcache_(LSUQ_SIZE) , fence_lock_(false) {} void LsuUnit::step(uint64_t cycle) { - __unused (cycle); - // handle dcache response for (uint32_t t = 0; t < num_threads_; ++t) { auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0); if (dcache_rsp_port.empty()) continue; - auto& mem_rsp = dcache_rsp_port.top(); + auto& mem_rsp = dcache_rsp_port.front(); auto& entry = pending_dcache_.at(mem_rsp.tag); auto trace = entry.first; DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); assert(entry.second); --entry.second; // track remaining blocks - if (0 == entry.second) { - auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency); - trace->dcache_latency = latency; - this->schedule_output(trace, 1); + if (0 == entry.second) { + Output.send(trace, 1); pending_dcache_.release(mem_rsp.tag); } dcache_rsp_port.pop(); } + // handle shared memory response + for (uint32_t t = 0; t < num_threads_; ++t) { + auto& smem_rsp_port = 
core_->shared_mem_->Outputs.at(t); + if (smem_rsp_port.empty()) + continue; + auto& mem_rsp = smem_rsp_port.front(); + auto& entry = pending_dcache_.at(mem_rsp.tag); + auto trace = entry.first; + DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type + << ", tid=" << t << ", " << *trace); + assert(entry.second); + --entry.second; // track remaining blocks + if (0 == entry.second) { + Output.send(trace, 1); + pending_dcache_.release(mem_rsp.tag); + } + smem_rsp_port.pop(); + } + if (fence_lock_) { // wait for all pending memory operations to complete if (!pending_dcache_.empty()) return; - this->schedule_output(fence_state_, 1); + Output.send(fence_state_, 1); fence_lock_ = false; DT(3, cycle, "fence-unlock: " << fence_state_); } // check input queue - if (inputs_.empty()) + if (Input.empty()) return; - auto trace = inputs_.top(); + auto trace = Input.front(); if (trace->lsu.type == LsuType::FENCE) { // schedule fence lock @@ -75,179 +89,188 @@ void LsuUnit::step(uint64_t cycle) { fence_lock_ = true; DT(3, cycle, "fence-lock: " << *trace); // remove input - inputs_.pop(); + auto time = Input.pop(); + core_->perf_stats_.lsu_stalls += (cycle - time); return; } - // check pending queue capacity - if (!trace->check_stalled(pending_dcache_.full())) { - DT(3, cycle, "*** lsu-queue-stall: " << *trace); - } - if (pending_dcache_.full()) + // check pending queue capacity + if (pending_dcache_.full()) { + if (!trace->suspend()) { + DT(3, cycle, "*** lsu-queue-stall: " << *trace); + } return; - - // send memory request - - bool has_shared_memory = false; - bool mem_rsp_pending = false; + } else { + trace->resume(); + } + bool is_write = (trace->lsu.type == LsuType::STORE); - uint32_t valid_addrs = 0; - for (auto& mem_addr : trace->mem_addrs) { - valid_addrs += mem_addr.size(); - } + // duplicates detection + bool is_dup = false; + if (trace->tmask.test(0)) { + uint64_t addr_mask = sizeof(Word)-1; + Word addr0 = trace->mem_addrs.at(0).at(0).addr & 
~addr_mask; + uint32_t matches = 1; + for (uint32_t t = 1; t < num_threads_; ++t) { + if (!trace->tmask.test(t)) + continue; + auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask; + matches += (addr0 == mem_addr); + } + is_dup = (matches == trace->tmask.count()); + } + + uint32_t valid_addrs = 0; + if (is_dup) { + valid_addrs = 1; + } else { + for (auto& mem_addr : trace->mem_addrs) { + valid_addrs += mem_addr.size(); + } + } - trace->dcache_latency = SimPlatform::instance().cycles(); auto tag = pending_dcache_.allocate({trace, valid_addrs}); for (uint32_t t = 0; t < num_threads_; ++t) { if (!trace->tmask.test(t)) continue; + + auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0); + auto mem_addr = trace->mem_addrs.at(t).at(0); + auto type = get_addr_type(mem_addr.addr, mem_addr.size); - auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0); - for (auto mem_addr : trace->mem_addrs.at(t)) { - // check shared memory address - if (SM_ENABLE) { - if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE)) - && (mem_addr < SMEM_BASE_ADDR)) { - DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag - << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); - has_shared_memory = true; - continue; - } - } - - bool is_io = (mem_addr >= IO_BASE_ADDR); - - MemReq mem_req; - mem_req.addr = mem_addr; - mem_req.write = is_write; - mem_req.tag = tag; - mem_req.is_io = is_io; - dcache_req_port.send(mem_req, 1); - DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag - << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace); - // do not wait on writes - mem_rsp_pending = !is_write; - } + MemReq mem_req; + mem_req.addr = mem_addr.addr; + mem_req.write = is_write; + mem_req.tag = tag; + mem_req.is_io = (type == AddrType::IO); + + if (type == AddrType::Shared) { + core_->shared_mem_->Inputs.at(t).send(mem_req, 2); + DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", 
tag=" << tag + << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); + } else { + dcache_req_port.send(mem_req, 2); + DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << mem_req.is_io << ", " << *trace); + } + + if (is_dup) + break; } - // do not wait - if (!mem_rsp_pending) { + // do not wait on writes + if (is_write) { pending_dcache_.release(tag); - uint32_t delay = 1; - if (has_shared_memory) { - // all threads accessed shared memory - delay += Constants::SMEM_DELAY; - } - this->schedule_output(trace, delay); + Output.send(trace, 1); } // remove input - inputs_.pop(); + auto time = Input.pop(); + core_->perf_stats_.lsu_stalls += (cycle - time); } /////////////////////////////////////////////////////////////////////////////// -AluUnit::AluUnit(Core*) : ExeUnit("ALU") {} +AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {} -void AluUnit::step(uint64_t /*cycle*/) { - if (inputs_.empty()) +void AluUnit::step(uint64_t cycle) { + if (Input.empty()) return; - auto trace = inputs_.top(); + auto trace = Input.front(); switch (trace->alu.type) { case AluType::ARITH: case AluType::BRANCH: case AluType::CMOV: - this->schedule_output(trace, 1); - inputs_.pop(); + Output.send(trace, 1); break; case AluType::IMUL: - this->schedule_output(trace, LATENCY_IMUL); - inputs_.pop(); + Output.send(trace, LATENCY_IMUL+1); break; case AluType::IDIV: - this->schedule_output(trace, XLEN); - inputs_.pop(); + Output.send(trace, XLEN+1); break; default: std::abort(); } + DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace); + if (trace->fetch_stall) { + core_->stalled_warps_.reset(trace->wid); + } + auto time = Input.pop(); + core_->perf_stats_.alu_stalls += (cycle - time); } /////////////////////////////////////////////////////////////////////////////// -CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {} +CsrUnit::CsrUnit(const 
SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {} -void CsrUnit::step(uint64_t /*cycle*/) { - if (inputs_.empty()) +void CsrUnit::step(uint64_t cycle) { + if (Input.empty()) return; - auto trace = inputs_.top(); - this->schedule_output(trace, 1); - inputs_.pop(); + auto trace = Input.front(); + Output.send(trace, 1); + auto time = Input.pop(); + core_->perf_stats_.csr_stalls += (cycle - time); + DT(3, cycle, "pipeline-execute: op=CSR, " << *trace); } /////////////////////////////////////////////////////////////////////////////// -FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {} +FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {} -void FpuUnit::step(uint64_t /*cycle*/) { - if (inputs_.empty()) +void FpuUnit::step(uint64_t cycle) { + if (Input.empty()) return; - auto trace = inputs_.top(); + auto trace = Input.front(); switch (trace->fpu.type) { case FpuType::FNCP: - this->schedule_output(trace, 1); - inputs_.pop(); + Output.send(trace, 2); break; case FpuType::FMA: - this->schedule_output(trace, LATENCY_FMA); - inputs_.pop(); + Output.send(trace, LATENCY_FMA+1); break; case FpuType::FDIV: - this->schedule_output(trace, LATENCY_FDIV); - inputs_.pop(); + Output.send(trace, LATENCY_FDIV+1); break; case FpuType::FSQRT: - this->schedule_output(trace, LATENCY_FSQRT); - inputs_.pop(); + Output.send(trace, LATENCY_FSQRT+1); break; case FpuType::FCVT: - this->schedule_output(trace, LATENCY_FCVT); - inputs_.pop(); + Output.send(trace, LATENCY_FCVT+1); break; default: std::abort(); - } + } + DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace); + auto time = Input.pop(); + core_->perf_stats_.fpu_stalls += (cycle - time); } /////////////////////////////////////////////////////////////////////////////// -GpuUnit::GpuUnit(Core* core) - : ExeUnit("GPU") - , core_(core) +GpuUnit::GpuUnit(const SimContext& ctx, Core* core) + : ExeUnit(ctx, core, "GPU") , num_threads_(core->arch().num_threads()) , pending_tex_reqs_(TEXQ_SIZE) 
{} void GpuUnit::step(uint64_t cycle) { - __unused (cycle); #ifdef EXT_TEX_ENABLE // handle memory response for (uint32_t t = 0; t < num_threads_; ++t) { auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1); if (dcache_rsp_port.empty()) continue; - auto& mem_rsp = dcache_rsp_port.top(); + auto& mem_rsp = dcache_rsp_port.front(); auto& entry = pending_tex_reqs_.at(mem_rsp.tag); auto trace = entry.first; DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace); assert(entry.second); --entry.second; // track remaining blocks - if (0 == entry.second) { - auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency); - trace->dcache_latency = latency; - this->schedule_output(trace, 1); + if (0 == entry.second) { + Output.send(trace, 1); pending_tex_reqs_.release(mem_rsp.tag); } dcache_rsp_port.pop(); @@ -255,38 +278,67 @@ void GpuUnit::step(uint64_t cycle) { #endif // check input queue - if (inputs_.empty()) + if (Input.empty()) return; - auto trace = inputs_.top(); + auto trace = Input.front(); + + bool issued = false; switch (trace->gpu.type) { case GpuType::TMC: + Output.send(trace, 1); + core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid)); + issued = true; + break; case GpuType::WSPAWN: + Output.send(trace, 1); + core_->active_warps_ = trace->gpu.active_warps; + issued = true; + break; case GpuType::SPLIT: case GpuType::JOIN: - case GpuType::BAR: - this->schedule_output(trace, 1); - inputs_.pop(); + Output.send(trace, 1); + issued = true; break; - case GpuType::TEX: { + case GpuType::BAR: + Output.send(trace, 1); + if (trace->gpu.active_warps != 0) + core_->active_warps_ |= trace->gpu.active_warps; + else + core_->active_warps_.reset(trace->wid); + issued = true; + break; + case GpuType::TEX: if (this->processTexRequest(cycle, trace)) - inputs_.pop(); - } break; + issued = true; + break; default: std::abort(); } + + if (issued) { + DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", 
" << *trace); + if (trace->fetch_stall) { + core_->stalled_warps_.reset(trace->wid); + } + auto time = Input.pop(); + core_->perf_stats_.fpu_stalls += (cycle - time); + } } bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { __unused (cycle); - // check pending queue capacity - if (!trace->check_stalled(pending_tex_reqs_.full())) { - DT(3, cycle, "*** tex-queue-stall: " << *trace); - } - if (pending_tex_reqs_.full()) + // check pending queue capacity + if (pending_tex_reqs_.full()) { + if (!trace->suspend()) { + DT(3, cycle, "*** tex-queue-stall: " << *trace); + } return false; + } else { + trace->resume(); + } // send memory request @@ -295,7 +347,6 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { valid_addrs += mem_addr.size(); } - trace->tex_latency = SimPlatform::instance().cycles(); auto tag = pending_tex_reqs_.allocate({trace, valid_addrs}); for (uint32_t t = 0; t < num_threads_; ++t) { @@ -305,12 +356,14 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1); for (auto mem_addr : trace->mem_addrs.at(t)) { MemReq mem_req; - mem_req.addr = mem_addr; + mem_req.addr = mem_addr.addr; mem_req.write = (trace->lsu.type == LsuType::STORE); mem_req.tag = tag; - dcache_req_port.send(mem_req, 1); - DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag + dcache_req_port.send(mem_req, 3); + DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag << ", tid=" << t << ", "<< trace); + ++ core_->perf_stats_.tex_reads; + ++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size(); } } diff --git a/sim/simX/exeunit.h b/sim/simX/exeunit.h index 83e69463..bea714ea 100644 --- a/sim/simX/exeunit.h +++ b/sim/simX/exeunit.h @@ -8,56 +8,29 @@ namespace vortex { class Core; -class ExeUnit { -protected: - const char* name_; - Queue inputs_; - Queue outputs_; +class ExeUnit : public SimObject { +public: + 
SimPort Input; + SimPort Output; - void schedule_output(pipeline_trace_t* trace, uint32_t delay) { - if (delay > 1) { - SimPlatform::instance().schedule( - [&](pipeline_trace_t* req) { - outputs_.push(req); - }, - trace, - (delay - 1) - ); - } else { - outputs_.push(trace); - } - } - -public: - typedef std::shared_ptr Ptr; - - ExeUnit(const char* name) : name_(name) {} + ExeUnit(const SimContext& ctx, Core* core, const char* name) + : SimObject(ctx, name) + , Input(this) + , Output(this) + , core_(core) + {} + virtual ~ExeUnit() {} - void push(pipeline_trace_t* trace) { - inputs_.push(trace); - } - - bool empty() const { - return outputs_.empty(); - } - - pipeline_trace_t* top() const { - return outputs_.top(); - } - - void pop() { - outputs_.pop(); - } - - virtual void step(uint64_t cycle) = 0; +protected: + Core* core_; }; /////////////////////////////////////////////////////////////////////////////// class NopUnit : public ExeUnit { public: - NopUnit(Core*); + NopUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; @@ -65,15 +38,14 @@ public: /////////////////////////////////////////////////////////////////////////////// class LsuUnit : public ExeUnit { -private: - Core* core_; +private: uint32_t num_threads_; HashTable> pending_dcache_; pipeline_trace_t* fence_state_; bool fence_lock_; public: - LsuUnit(Core*); + LsuUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; @@ -82,7 +54,7 @@ public: class AluUnit : public ExeUnit { public: - AluUnit(Core*); + AluUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; @@ -91,7 +63,7 @@ public: class CsrUnit : public ExeUnit { public: - CsrUnit(Core*); + CsrUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; @@ -100,7 +72,7 @@ public: class FpuUnit : public ExeUnit { public: - FpuUnit(Core*); + FpuUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; @@ -109,14 +81,13 @@ public: class GpuUnit : public ExeUnit { private: - Core* core_; uint32_t num_threads_; 
HashTable> pending_tex_reqs_; bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace); public: - GpuUnit(Core*); + GpuUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; diff --git a/sim/simX/memsim.cpp b/sim/simX/memsim.cpp index 6559000d..012082d9 100644 --- a/sim/simX/memsim.cpp +++ b/sim/simX/memsim.cpp @@ -10,6 +10,7 @@ private: MemSim* simobject_; uint32_t num_banks_; uint32_t latency_; + PerfStats perf_stats_; public: Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency) @@ -18,16 +19,23 @@ public: , latency_(latency) {} + const PerfStats& perf_stats() const { + return perf_stats_; + } + void step(uint64_t /*cycle*/) { for (uint32_t i = 0, n = num_banks_; i < n; ++i) { auto& mem_req_port = simobject_->MemReqPorts.at(i); if (mem_req_port.empty()) continue; - auto& mem_req = mem_req_port.top(); + auto& mem_req = mem_req_port.front(); if (!mem_req.write) { MemRsp mem_rsp; mem_rsp.tag = mem_req.tag; simobject_->MemRspPorts.at(i).send(mem_rsp, latency_); + ++perf_stats_.reads; + } else { + ++perf_stats_.writes; } mem_req_port.pop(); } @@ -40,9 +48,9 @@ MemSim::MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency) : SimObject(ctx, "MemSim") - , impl_(new Impl(this, num_banks, latency)) , MemReqPorts(num_banks, this) , MemRspPorts(num_banks, this) + , impl_(new Impl(this, num_banks, latency)) {} MemSim::~MemSim() { diff --git a/sim/simX/memsim.h b/sim/simX/memsim.h index 3d5b33fe..c48361bc 100644 --- a/sim/simX/memsim.h +++ b/sim/simX/memsim.h @@ -1,47 +1,36 @@ #pragma once #include +#include "types.h" #include -#include namespace vortex { -struct MemReq { - uint64_t addr; - uint32_t tag; - bool write; - bool is_io; - - MemReq(uint64_t _addr = 0, - uint64_t _tag = 0, - bool _write = false, - bool _is_io = false - ) : addr(_addr) - , tag(_tag) - , write(_write) - , is_io(_is_io) - {} -}; - -struct MemRsp { - uint64_t tag; - MemRsp(uint64_t _tag = 0) : tag (_tag) {} -}; - class MemSim : public SimObject{ -private: - class 
Impl; - Impl* impl_; - public: + struct PerfStats { + uint64_t reads; + uint64_t writes; - MemSim(const SimContext& ctx, uint32_t num_inputs, uint32_t latency); + PerfStats() + : reads(0) + , writes(0) + {} + }; + + std::vector> MemReqPorts; + std::vector> MemRspPorts; + + MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency); ~MemSim(); void step(uint64_t cycle); - std::vector> MemReqPorts; - std::vector> MemRspPorts; + const PerfStats& perf_stats() const; + +private: + class Impl; + Impl* impl_; }; }; \ No newline at end of file diff --git a/sim/simX/pipeline.h b/sim/simX/pipeline.h index a5bf6d52..9ac09352 100644 --- a/sim/simX/pipeline.h +++ b/sim/simX/pipeline.h @@ -12,7 +12,7 @@ namespace vortex { struct pipeline_trace_t { //-- - uint64_t id; + uint64_t uuid; //-- int cid; @@ -22,7 +22,6 @@ struct pipeline_trace_t { //-- bool fetch_stall; - bool pipeline_stall; //-- bool wb; @@ -38,7 +37,7 @@ struct pipeline_trace_t { ExeType exe_type; //-- - std::vector> mem_addrs; + std::vector> mem_addrs; //-- union { @@ -53,22 +52,19 @@ struct pipeline_trace_t { } fpu; struct { GpuType type; + WarpMask active_warps; } gpu; }; - // stats - uint64_t icache_latency; - uint64_t dcache_latency; - uint64_t tex_latency; + bool stalled; - pipeline_trace_t(uint64_t id_, const ArchDef& arch) { - id = id_; + pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) { + uuid = uuid_; cid = 0; wid = 0; tmask.reset(); - PC = 0; + PC = 0; fetch_stall = false; - pipeline_stall = false; wb = false; rdest = 0; rdest_type = RegType::None; @@ -76,16 +72,18 @@ struct pipeline_trace_t { used_fregs.reset(); used_vregs.reset(); exe_type = ExeType::NOP; - mem_addrs.resize(arch.num_threads()); - icache_latency = 0; - dcache_latency = 0; - tex_latency = 0; + mem_addrs.resize(arch.num_threads()); + stalled = false; } - bool check_stalled(bool stall) { - bool old = pipeline_stall; - pipeline_stall = stall; - return stall ? 
old : true; + bool suspend() { + bool old = stalled; + stalled = true; + return old; + } + + void resume() { + stalled = false; } }; @@ -96,16 +94,16 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) os << ", rd=" << state.rdest_type << std::dec << state.rdest; } os << ", ex=" << state.exe_type; - os << " (#" << std::dec << state.id << ")"; + os << " (#" << std::dec << state.uuid << ")"; return os; } -class PipelineStage : public Queue { +class PipelineLatch : public Queue { protected: const char* name_; public: - PipelineStage(const char* name = nullptr) + PipelineLatch(const char* name = nullptr) : name_(name) {} }; diff --git a/sim/simX/processor.cpp b/sim/simX/processor.cpp index 7b54b505..ca9d46a6 100644 --- a/sim/simX/processor.cpp +++ b/sim/simX/processor.cpp @@ -18,13 +18,13 @@ Processor::Processor(const ArchDef& arch) // connect memory sub-systen memsim_ = MemSim::Create(1, MEM_LATENCY); - std::vector*> mem_req_ports(1); - std::vector*> mem_rsp_ports(1); + std::vector*> mem_req_ports(1); + std::vector*> mem_rsp_ports(1); mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); if (L3_ENABLE) { - l3cache_ = Cache::Create("l3cache", CacheConfig{ + l3cache_ = Cache::Create("l3cache", Cache::Config{ log2ceil(L3_CACHE_SIZE), // C log2ceil(MEM_BLOCK_SIZE), // B 2, // W @@ -66,7 +66,7 @@ Processor::Processor(const ArchDef& arch) for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { if (L2_ENABLE) { auto& l2cache = l2caches_.at(i); - l2cache = Cache::Create("l2cache", CacheConfig{ + l2cache = Cache::Create("l2cache", Cache::Config{ log2ceil(L2_CACHE_SIZE), // C log2ceil(MEM_BLOCK_SIZE), // B 2, // W diff --git a/sim/simX/scoreboard.h b/sim/simX/scoreboard.h index 95ba0700..b36d60b3 100644 --- a/sim/simX/scoreboard.h +++ b/sim/simX/scoreboard.h @@ -96,7 +96,7 @@ public: } uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; assert(owners_.count(tag) == 0); - 
owners_[tag] = state->id; + owners_[tag] = state->uuid; } void release(pipeline_trace_t* state) { diff --git a/sim/simX/sharedmem.h b/sim/simX/sharedmem.h new file mode 100644 index 00000000..d984422d --- /dev/null +++ b/sim/simX/sharedmem.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include +#include "types.h" + +namespace vortex { + +class Core; + +class SharedMem : public SimObject { +public: + struct Config { + uint32_t num_reqs; + uint32_t num_banks; + uint32_t bank_offset; + uint32_t latency; + bool write_reponse; + }; + + struct PerfStats { + uint64_t reads; + uint64_t writes; + uint64_t bank_stalls; + + PerfStats() + : reads(0) + , writes(0) + , bank_stalls(0) + {} + }; + + std::vector> Inputs; + std::vector> Outputs; + + SharedMem(const SimContext& ctx, const char* name, const Config& config) + : SimObject(ctx, name) + , Inputs(config.num_reqs, this) + , Outputs(config.num_reqs, this) + , config_(config) + , bank_sel_addr_start_(config.bank_offset) + , bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1) + {} + + virtual ~SharedMem() {} + + void step(uint64_t /*cycle*/) { + std::vector in_used_banks(config_.num_banks); + for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) { + auto& core_req_port = this->Inputs.at(req_id); + if (core_req_port.empty()) + continue; + + auto& core_req = core_req_port.front(); + + uint32_t bank_id = (uint32_t)bit_getw( + core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_); + + // bank conflict check + if (in_used_banks.at(bank_id)) + continue; + + in_used_banks.at(bank_id) = true; + + if (!core_req.write || config_.write_reponse) { + // send response + MemRsp core_rsp; + core_rsp.tag = core_req.tag; + this->Outputs.at(req_id).send(core_rsp, 1); + } + + // update perf counters + perf_stats_.reads += !core_req.write; + perf_stats_.writes += core_req.write; + + // remove input + core_req_port.pop(); + } + } + + const PerfStats& perf_stats() const { + return perf_stats_; + } + +protected: 
+ Config config_; + uint32_t bank_sel_addr_start_; + uint32_t bank_sel_addr_end_; + PerfStats perf_stats_; +}; + +} \ No newline at end of file diff --git a/sim/simX/tex_unit.cpp b/sim/simX/tex_unit.cpp index bfbcef1a..8dedef38 100644 --- a/sim/simX/tex_unit.cpp +++ b/sim/simX/tex_unit.cpp @@ -27,7 +27,7 @@ void TexUnit::set_state(uint32_t state, uint32_t value) { uint32_t TexUnit::read(int32_t u, int32_t v, int32_t lod, - std::vector* mem_addrs) { + std::vector* mem_addrs) { //-- auto xu = Fixed::make(u); auto xv = Fixed::make(v); @@ -60,10 +60,10 @@ uint32_t TexUnit::read(int32_t u, uint32_t texel10 = core_->dcache_read(addr10, stride); uint32_t texel11 = core_->dcache_read(addr11, stride); - mem_addrs->push_back(addr00); - mem_addrs->push_back(addr01); - mem_addrs->push_back(addr10); - mem_addrs->push_back(addr11); + mem_addrs->push_back({addr00, stride}); + mem_addrs->push_back({addr01, stride}); + mem_addrs->push_back({addr10, stride}); + mem_addrs->push_back({addr11, stride}); // filtering auto color = TexFilterLinear( @@ -79,7 +79,7 @@ uint32_t TexUnit::read(int32_t u, // memory lookup uint32_t texel = core_->dcache_read(addr, stride); - mem_addrs->push_back(addr); + mem_addrs->push_back({addr, stride}); // filtering auto color = TexFilterPoint(format, texel); diff --git a/sim/simX/tex_unit.h b/sim/simX/tex_unit.h index 759dda2a..b41cd8c7 100644 --- a/sim/simX/tex_unit.h +++ b/sim/simX/tex_unit.h @@ -15,7 +15,7 @@ public: void set_state(uint32_t state, uint32_t value); - uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector* mem_addrs); + uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector* mem_addrs); private: diff --git a/sim/simX/types.h b/sim/simX/types.h index d4feb1cb..7675ab82 100644 --- a/sim/simX/types.h +++ b/sim/simX/types.h @@ -21,6 +21,8 @@ typedef std::bitset<32> RegMask; typedef std::bitset<32> ThreadMask; typedef std::bitset<32> WarpMask; +/////////////////////////////////////////////////////////////////////////////// + enum 
class RegType { None, Integer, @@ -38,6 +40,8 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + enum class ExeType { NOP, ALU, @@ -61,6 +65,8 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + enum class AluType { ARITH, BRANCH, @@ -80,6 +86,8 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + enum class LsuType { LOAD, STORE, @@ -97,6 +105,47 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + +enum class AddrType { + Global, + Shared, + IO, +}; + +inline std::ostream &operator<<(std::ostream &os, const AddrType& type) { + switch (type) { + case AddrType::Global: os << "Global"; break; + case AddrType::Shared: os << "Shared"; break; + case AddrType::IO: os << "IO"; break; + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +struct mem_addr_size_t { + uint64_t addr; + uint32_t size; +}; + +inline AddrType get_addr_type(Word addr, uint32_t size) { + __unused (size); + if (SM_ENABLE) { + if (addr >= (SMEM_BASE_ADDR - SMEM_SIZE) + && addr < SMEM_BASE_ADDR) { + assert((addr + size) <= SMEM_BASE_ADDR); + return AddrType::Shared; + } + } + if (addr >= IO_BASE_ADDR) { + return AddrType::IO; + } + return AddrType::Global; +} + +/////////////////////////////////////////////////////////////////////////////// + enum class FpuType { FNCP, FMA, @@ -116,6 +165,8 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + enum class GpuType { 
TMC, WSPAWN, @@ -137,6 +188,8 @@ inline std::ostream &operator<<(std::ostream &os, const GpuType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + enum class ArbiterType { Priority, RoundRobin @@ -152,6 +205,30 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) { /////////////////////////////////////////////////////////////////////////////// +struct MemReq { + uint64_t addr; + uint32_t tag; + bool write; + bool is_io; + + MemReq(uint64_t _addr = 0, + uint64_t _tag = 0, + bool _write = false, + bool _is_io = false + ) : addr(_addr) + , tag(_tag) + , write(_write) + , is_io(_is_io) + {} +}; + +struct MemRsp { + uint64_t tag; + MemRsp(uint64_t _tag = 0) : tag (_tag) {} +}; + +/////////////////////////////////////////////////////////////////////////////// + template class Queue { protected: @@ -164,21 +241,29 @@ public: return queue_.empty(); } - const T& top() const { + const T& front() const { return queue_.front(); } - T& top() { + T& front() { return queue_.front(); } - void pop() { - queue_.pop(); + const T& back() const { + return queue_.back(); + } + + T& back() { + return queue_.back(); } void push(const T& value) { queue_.push(value); } + + void pop() { + queue_.pop(); + } }; /////////////////////////////////////////////////////////////////////////////// @@ -187,20 +272,24 @@ template class HashTable { private: std::vector> entries_; - uint32_t capacity_; + uint32_t size_; public: - HashTable(uint32_t size) - : entries_(size) - , capacity_(0) + HashTable(uint32_t capacity) + : entries_(capacity) + , size_(0) {} bool empty() const { - return (0 == capacity_); + return (0 == size_); } bool full() const { - return (capacity_ == entries_.size()); + return (size_ == entries_.size()); + } + + uint32_t size() const { + return size_; } bool contains(uint32_t index) const { @@ -225,7 +314,7 @@ public: if (!entry.first) { entry.first = true; entry.second = value; - ++capacity_; + ++size_; 
return i; } } @@ -237,7 +326,7 @@ public: auto& entry = entries_.at(index); assert(entry.first); entry.first = false; - --capacity_; + --size_; } }; @@ -287,7 +376,7 @@ public: uint32_t j = (cursor_ + i) % n; auto& req_in = ReqIn.at(j); if (!req_in.empty()) { - auto& req = req_in.top(); + auto& req = req_in.front(); if (tag_shift_) { req.tag = (req.tag << tag_shift_) | j; } @@ -300,7 +389,7 @@ public: // process incoming reponses if (!RspIn.empty()) { - auto& rsp = RspIn.top(); + auto& rsp = RspIn.front(); uint32_t port_id = 0; if (tag_shift_) { port_id = rsp.tag & ((1 << tag_shift_)-1); @@ -317,10 +406,10 @@ public: } } - std::vector> ReqIn; - MasterPort ReqOut; - SlavePort RspIn; - std::vector> RspOut; + std::vector> ReqIn; + SimPort ReqOut; + SimPort RspIn; + std::vector> RspOut; }; } \ No newline at end of file diff --git a/sim/simX/warp.cpp b/sim/simX/warp.cpp index 0392c1b9..df0c0e75 100644 --- a/sim/simX/warp.cpp +++ b/sim/simX/warp.cpp @@ -27,7 +27,7 @@ void Warp::eval(pipeline_trace_t *trace) { DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask="); for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) DPN(2, tmask_.test(n-i-1)); - DPN(2, ", PC=0x" << std::hex << PC_ << std::endl); + DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl); /* Fetch and decode. 
*/ @@ -38,7 +38,7 @@ void Warp::eval(pipeline_trace_t *trace) { std::abort(); } - DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr << " (#" << trace->id << ")"); + DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr); // Update trace trace->cid = core_->id(); diff --git a/sim/simX/warp.h b/sim/simX/warp.h index 5af5eb02..c5a54205 100644 --- a/sim/simX/warp.h +++ b/sim/simX/warp.h @@ -46,6 +46,10 @@ public: return active_; } + void suspend() { + active_ = false; + } + void activate() { active_ = true; } diff --git a/tests/regression/tex/kernel.c b/tests/regression/tex/kernel.c index 88aec50c..9a36d8cb 100644 --- a/tests/regression/tex/kernel.c +++ b/tests/regression/tex/kernel.c @@ -62,15 +62,16 @@ int main() { kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; // configure texture unit - csr_write(CSR_TEX(0, TEX_STATE_WIDTH), arg->src_logwidth); - csr_write(CSR_TEX(0, TEX_STATE_HEIGHT), arg->src_logheight); - csr_write(CSR_TEX(0, TEX_STATE_FORMAT), arg->format); - csr_write(CSR_TEX(0, TEX_STATE_WRAPU), arg->wrapu); - csr_write(CSR_TEX(0, TEX_STATE_WRAPV), arg->wrapv); - csr_write(CSR_TEX(0, TEX_STATE_FILTER), (arg->filter ? 1 : 0)); - csr_write(CSR_TEX(0, TEX_STATE_ADDR), arg->src_addr); + csr_write(CSR_TEX_UNIT, 0); + csr_write(CSR_TEX_WIDTH, arg->src_logwidth); + csr_write(CSR_TEX_HEIGHT, arg->src_logheight); + csr_write(CSR_TEX_FORMAT, arg->format); + csr_write(CSR_TEX_WRAPU, arg->wrapu); + csr_write(CSR_TEX_WRAPV, arg->wrapv); + csr_write(CSR_TEX_FILTER, (arg->filter ? 1 : 0)); + csr_write(CSR_TEX_ADDR, arg->src_addr); static_for_t()([&](int i) { - csr_write(CSR_TEX(0, TEX_STATE_MIPOFF(i)), arg->mip_offs[i]); + csr_write(CSR_TEX_MIPOFF(i), arg->mip_offs[i]); }); tile_arg_t targ;