From 18762dffce0fd57a09f7d4cad7571d6f2e652e5f Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 24 Nov 2021 00:00:17 -0500 Subject: [PATCH] fixes: texture unit mem access sometimes going to smem, bilinear texture filtering; new: cache req_id, --- ci/regression.sh | 2 +- driver/rtlsim/vortex.cpp | 7 +- driver/simx/vortex.cpp | 82 +-- hw/rtl/VX_config.vh | 37 +- hw/rtl/VX_csr_data.sv | 72 +-- hw/rtl/VX_decode.sv | 28 +- hw/rtl/VX_define.vh | 51 +- hw/rtl/VX_execute.sv | 34 +- hw/rtl/VX_icache_stage.sv | 34 +- hw/rtl/VX_lsu_unit.sv | 48 +- hw/rtl/cache/VX_bank.sv | 71 +-- hw/rtl/cache/VX_cache.sv | 3 +- hw/rtl/cache/VX_cache_define.vh | 7 +- hw/rtl/cache/VX_data_access.sv | 9 +- hw/rtl/cache/VX_miss_resrv.sv | 26 +- hw/rtl/cache/VX_shared_mem.sv | 29 +- hw/rtl/cache/VX_tag_access.sv | 9 +- hw/rtl/tex_unit/VX_tex_addr.sv | 133 +++-- hw/rtl/tex_unit/VX_tex_define.vh | 23 +- hw/rtl/tex_unit/VX_tex_mem.sv | 58 +- hw/rtl/tex_unit/VX_tex_sampler.sv | 10 +- hw/rtl/tex_unit/VX_tex_stride.sv | 4 +- hw/rtl/tex_unit/VX_tex_unit.sv | 78 +-- hw/rtl/tex_unit/VX_tex_wrap.sv | 16 +- hw/syn/opae/Makefile | 1 - runtime/Makefile | 2 +- runtime/include/vx_intrinsics.h | 178 +++--- runtime/src/tinyprintf.c | 890 ++++++++++++++++++++++++++++++ runtime/src/tinyprintf.h | 86 +++ runtime/src/vx_perf.c | 8 +- runtime/src/vx_print.c | 63 ++- runtime/src/vx_syscalls.c | 5 +- sim/common/bitmanip.h | 79 +++ sim/common/fixed.h | 419 ++++++++++++++ sim/common/simobject.h | 69 +-- sim/common/texturing.h | 221 ++++++++ sim/common/util.h | 75 +-- sim/rtlsim/Makefile | 13 +- sim/simX/Makefile | 2 +- sim/simX/cache.cpp | 391 +++++++------ sim/simX/cache.h | 3 +- sim/simX/constants.h | 6 +- sim/simX/core.cpp | 224 ++++---- sim/simX/core.h | 20 +- sim/simX/decode.cpp | 89 ++- sim/simX/execute.cpp | 542 +++++++++--------- sim/simX/exeunit.cpp | 269 ++++++--- sim/simX/exeunit.h | 42 +- sim/simX/ibuffer.h | 8 +- sim/simX/instr.h | 6 +- sim/simX/memsim.cpp | 6 +- sim/simX/memsim.h | 14 +- 
sim/simX/pipeline.h | 39 +- sim/simX/processor.cpp | 10 +- sim/simX/scoreboard.h | 66 ++- sim/simX/tex_unit.cpp | 91 +++ sim/simX/tex_unit.h | 26 + sim/simX/types.h | 108 ++-- sim/simX/warp.cpp | 20 +- sim/simX/warp.h | 6 +- sim/vlsim/Makefile | 8 +- tests/regression/tex/Makefile | 8 +- tests/regression/tex/common.h | 34 +- tests/regression/tex/kernel.c | 73 ++- tests/regression/tex/main.cpp | 73 ++- tests/regression/tex/texsw.h | 247 ++++----- tests/regression/tex/utils.cpp | 108 ++++ tests/regression/tex/utils.h | 16 +- tests/runtime/fibonacci/main.cpp | 7 +- tests/runtime/hello/main.cpp | 3 +- 70 files changed, 3818 insertions(+), 1727 deletions(-) create mode 100644 runtime/src/tinyprintf.c create mode 100644 runtime/src/tinyprintf.h create mode 100644 sim/common/bitmanip.h create mode 100644 sim/common/fixed.h create mode 100644 sim/common/texturing.h create mode 100644 sim/simX/tex_unit.cpp create mode 100644 sim/simX/tex_unit.h diff --git a/ci/regression.sh b/ci/regression.sh index 073c0ed1..936ca13b 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -28,7 +28,7 @@ echo "begin texture tests..." CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0" CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" -CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-irainbow.png -orainbow_result.png -g1" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-irainbow.png -orainbow_result.png -g2" echo "coverage texture done!" 
} diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index bed5c807..cc16f0d3 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -116,9 +116,11 @@ public: } int start() { + // ensure prior run completed if (future_.valid()) { - future_.wait(); // ensure prior run completed + future_.wait(); } + // start new run simulator_.attach_ram(&ram_); future_ = std::async(std::launch::async, [&]{ simulator_.reset(); @@ -135,7 +137,8 @@ public: uint64_t timeout_sec = timeout / 1000; std::chrono::seconds wait_time(1); for (;;) { - auto status = future_.wait_for(wait_time); // wait for 1 sec and check status + // wait for 1 sec and check status + auto status = future_.wait_for(wait_time); if (status == std::future_status::ready || 0 == timeout_sec--) break; diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 1bd15e07..d63005d6 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -3,8 +3,7 @@ #include #include #include -#include -#include +#include #include #include @@ -60,18 +59,14 @@ class vx_device { public: vx_device() : arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS) - , is_done_(false) - , is_running_(false) - , mem_allocation_(ALLOC_BASE_ADDR) - , thread_(__thread_proc__, this) , ram_(RAM_PAGE_SIZE) + , mem_allocation_(ALLOC_BASE_ADDR) {} ~vx_device() { - mutex_.lock(); - is_done_ = true; - mutex_.unlock(); - thread_.join(); + if (future_.valid()) { + future_.wait(); + } } int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) { @@ -115,72 +110,41 @@ public: } int start() { - mutex_.lock(); + // ensure prior run completed + if (future_.valid()) { + future_.wait(); + } + // start new run SimPlatform::instance().flush(); processor_ = std::make_shared(arch_); processor_->attach_ram(&ram_); - is_running_ = true; - mutex_.unlock(); + future_ = std::async(std::launch::async, [&]{ + processor_->run(); + }); return 0; } int wait(uint64_t timeout) { + if (!future_.valid()) + return 0; uint64_t timeout_sec = timeout 
/ 1000; + std::chrono::seconds wait_time(1); for (;;) { - mutex_.lock(); - bool is_running = is_running_; - mutex_.unlock(); - - if (!is_running || 0 == timeout_sec--) + // wait for 1 sec and check status + auto status = future_.wait_for(wait_time); + if (status == std::future_status::ready + || 0 == timeout_sec--) break; - - std::this_thread::sleep_for(std::chrono::seconds(1)); } return 0; } private: - - void thread_proc() { - std::cout << "Device ready..." << std::flush << std::endl; - - for (;;) { - mutex_.lock(); - bool is_done = is_done_; - bool is_running = is_running_; - mutex_.unlock(); - - if (is_done) - break; - - if (is_running) { - std::cout << "Device running..." << std::flush << std::endl; - - processor_->run(); - - mutex_.lock(); - is_running_ = false; - mutex_.unlock(); - - std::cout << "Device ready..." << std::flush << std::endl; - } - } - - std::cout << "Device shutdown..." << std::flush << std::endl; - } - - static void __thread_proc__(vx_device* device) { - device->thread_proc(); - } - ArchDef arch_; - Processor::Ptr processor_; - bool is_done_; - bool is_running_; - uint64_t mem_allocation_; - std::thread thread_; RAM ram_; - std::mutex mutex_; + Processor::Ptr processor_; + uint64_t mem_allocation_; + std::future future_; }; /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index de58d9ee..82da10c2 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -236,18 +236,30 @@ ////////// Texture Units ////////////////////////////////////////////////////// -`define NUM_TEX_UNITS 2 +`define NUM_TEX_UNITS 2 +`define TEX_SUBPIXEL_BITS 8 -`define CSR_TEX_STATES 7 -`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES) +`define TEX_DIM_BITS 15 +`define TEX_LOD_MAX `TEX_DIM_BITS +`define TEX_LOD_BITS 4 -`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00) -`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01) -`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 
12'h02) -`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03) -`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04) -`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05) -`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06) +`define TEX_FXD_BITS 32 +`define TEX_FXD_FRAC (`TEX_DIM_BITS+`TEX_SUBPIXEL_BITS) + +`define TEX_STATE_ADDR 0 +`define TEX_STATE_WIDTH 1 +`define TEX_STATE_HEIGHT 2 +`define TEX_STATE_FORMAT 3 +`define TEX_STATE_FILTER 4 +`define TEX_STATE_WRAPU 5 +`define TEX_STATE_WRAPV 6 +`define TEX_STATE_MIPOFF(lod) (7+(lod)) + +`define NUM_TEX_STATES (7+`TEX_LOD_MAX) + +`define CSR_TEX(unit,state) (12'hFD0 + ((unit) * `NUM_TEX_STATES) + (state)) +`define CSR_TEX_UNIT(csr) (((csr) - 12'hFD0) / `NUM_TEX_STATES) +`define CSR_TEX_STATE(csr) (((csr) - 12'hFD0) % `NUM_TEX_STATES) // Pipeline Queues //////////////////////////////////////////////////////////// @@ -266,6 +278,11 @@ `define FPUQ_SIZE 8 `endif +// Texture Unit Request Queue +`ifndef TEXQ_SIZE +`define TEXQ_SIZE (`NUM_WARPS * 2) +`endif + // Icache Configurable Knobs ////////////////////////////////////////////////// // Size of cache in bytes diff --git a/hw/rtl/VX_csr_data.sv b/hw/rtl/VX_csr_data.sv index b071a347..396358d1 100644 --- a/hw/rtl/VX_csr_data.sv +++ b/hw/rtl/VX_csr_data.sv @@ -50,35 +50,40 @@ module VX_csr_data #( reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr; always @(posedge clk) begin - `ifdef EXT_F_ENABLE if (reset) begin fcsr <= '0; - end - if (fpu_to_csr_if.write_enable) begin - fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] - | fpu_to_csr_if.write_fflags; - end - `endif - if (write_enable) begin - case (write_addr) - `CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0]; - `CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0]; - `CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0]; - `CSR_SATP: csr_satp <= 
write_data[`CSR_WIDTH-1:0]; - `CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0]; - `CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0]; - `CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0]; - `CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0]; - `CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0]; - `CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0]; - `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0]; - `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; - default: begin - `ASSERT(write_addr >= `CSR_TEX_BEGIN(0) - && write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES), - ("%t: invalid CSR write address: %0h", $time, write_addr)); - end - endcase + end else begin + `ifdef EXT_F_ENABLE + if (fpu_to_csr_if.write_enable) begin + fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] + | fpu_to_csr_if.write_fflags; + end + `endif + if (write_enable) begin + case (write_addr) + `CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0]; + `CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0]; + `CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0]; + `CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0]; + `CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0]; + `CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0]; + `CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0]; + `CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0]; + `CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0]; + `CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0]; + `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0]; + `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; + default: begin + `ifdef EXT_TEX_ENABLE + `ASSERT(write_addr >= `CSR_TEX(0,0) + && write_addr < `CSR_TEX(`NUM_TEX_UNITS, 0), + ("%t: invalid CSR write address: %0h", $time, write_addr)); + `else + `ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, 
write_addr)); + `endif + end + endcase + end end end @@ -217,11 +222,16 @@ module VX_csr_data #( `CSR_MIMPID : read_data_r = `IMPLEMENTATION_ID; default: begin - if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32)) - || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32) - || (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))))) begin + if ((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32)) + || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32))) begin + read_addr_valid_r = 1; + end else + `ifdef EXT_TEX_ENABLE + if (read_addr >= `CSR_TEX(0,0) && read_addr < `CSR_TEX(`NUM_TEX_UNITS,0)) begin + read_addr_valid_r = 1; + end else + `endif read_addr_valid_r = 0; - end end endcase end diff --git a/hw/rtl/VX_decode.sv b/hw/rtl/VX_decode.sv index 89d70d7a..2c6f09fb 100644 --- a/hw/rtl/VX_decode.sv +++ b/hw/rtl/VX_decode.sv @@ -214,9 +214,9 @@ module VX_decode #( case (u_12) 12'h000: op_type = `INST_OP_BITS'(`INST_BR_ECALL); 12'h001: op_type = `INST_OP_BITS'(`INST_BR_EBREAK); + 12'h002: op_type = `INST_OP_BITS'(`INST_BR_URET); + 12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET); 12'h302: op_type = `INST_OP_BITS'(`INST_BR_MRET); - 12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET); - 12'h7B2: op_type = `INST_OP_BITS'(`INST_BR_DRET); default:; endcase op_mod = 1; @@ -347,7 +347,7 @@ module VX_decode #( endcase end `endif - `INST_GPU: begin + `INST_GPGPU: begin ex_type = `EX_GPU; case (func3) 3'h0: begin @@ -374,9 +374,21 @@ module VX_decode #( is_wstall = 1; `USED_IREG (rs1); `USED_IREG (rs2); - end - `ifdef EXT_TEX_ENABLE + end 3'h5: begin + ex_type = `EX_LSU; + op_type = `INST_OP_BITS'(`INST_LSU_LW); + op_mod = `INST_MOD_BITS'(2); + `USED_IREG (rs1); + end + default:; + endcase + end + `INST_GPU: begin + case (func3) + `ifdef EXT_TEX_ENABLE + 3'h0: begin + ex_type = `EX_GPU; op_type = `INST_OP_BITS'(`INST_GPU_TEX); op_mod = `INST_MOD_BITS'(func2); use_rd = 1; @@ -386,12 +398,6 @@ 
module VX_decode #( `USED_IREG (rs3); end `endif - 3'h6: begin - ex_type = `EX_LSU; - op_type = `INST_OP_BITS'(`INST_LSU_LW); - op_mod = `INST_MOD_BITS'(2); - `USED_IREG (rs1); - end default:; endcase end diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index c3706000..696b6eaa 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -66,7 +66,8 @@ `define INST_FNMADD 7'b1001111 `define INST_FCI 7'b1010011 // float common instructions -`define INST_GPU 7'b1101011 +`define INST_GPGPU 7'b1101011 +`define INST_GPU 7'b1011011 `define INST_TEX 7'b0101011 @@ -117,9 +118,9 @@ `define INST_BR_JALR 4'b1001 `define INST_BR_ECALL 4'b1010 `define INST_BR_EBREAK 4'b1011 -`define INST_BR_MRET 4'b1100 +`define INST_BR_URET 4'b1100 `define INST_BR_SRET 4'b1101 -`define INST_BR_DRET 4'b1110 +`define INST_BR_MRET 4'b1110 `define INST_BR_OTHER 4'b1111 `define INST_BR_BITS 4 `define INST_BR_NEG(x) x[1] @@ -185,14 +186,14 @@ `define INST_FPU_NMADD 4'hF `define INST_FPU_BITS 4 -`define INST_GPU_TMC 3'h0 -`define INST_GPU_WSPAWN 3'h1 -`define INST_GPU_SPLIT 3'h2 -`define INST_GPU_JOIN 3'h3 -`define INST_GPU_BAR 3'h4 -`define INST_GPU_PRED 3'h5 -`define INST_GPU_TEX 3'h6 -`define INST_GPU_BITS 3 +`define INST_GPU_TMC 4'h0 +`define INST_GPU_WSPAWN 4'h1 +`define INST_GPU_SPLIT 4'h2 +`define INST_GPU_JOIN 4'h3 +`define INST_GPU_BAR 4'h4 +`define INST_GPU_PRED 4'h5 +`define INST_GPU_TEX 4'h6 +`define INST_GPU_BITS 4 /////////////////////////////////////////////////////////////////////////////// @@ -237,11 +238,9 @@ /////////////////////////////////////////////////////////////////////////////// -`ifdef DBG_CACHE_REQ_INFO // wid PC -`define DBG_CACHE_REQ_MDATAW (`NW_BITS + 32) -`else -`define DBG_CACHE_REQ_MDATAW 0 -`endif +// cache request identifier +`define DBG_CACHE_REQ_IDW 48 +`define DBG_CACHE_REQ_ID(type, ctr) {4'(type), {`DBG_CACHE_REQ_IDW-4{1'b0}}} + ctr // non-cacheable tag bits `define NC_TAG_BIT 1 @@ -249,6 +248,9 @@ // texture tag bits `define TEX_TAG_BIT 1 +// cache 
address type bits +`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BIT + `SM_ENABLE) + ////////////////////////// Icache Configurable Knobs ////////////////////////// // Cache ID @@ -264,7 +266,7 @@ `define ICACHE_CORE_TAG_ID_BITS `NW_BITS // Core request tag bits -`define ICACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICACHE_CORE_TAG_ID_BITS) +`define ICACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_IDW + `ICACHE_CORE_TAG_ID_BITS) // Memory request data bits `define ICACHE_MEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8) @@ -289,17 +291,14 @@ // Core request tag bits `define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) `ifdef EXT_TEX_ENABLE -`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE) -`define TEX_TAG_ID_BITS (2) -`define LSU_TEX_TAG_ID_BITS `MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS) -`define DCACHE_CORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + `TEX_TAG_BIT) -`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS) -`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS) -`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TEX_TAG_ID_BITS) +`define LSU_TAG_ID_BITS `MAX(`LSUQ_ADDR_BITS, 2) +`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_IDW + `LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS) +`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS + `TEX_TAG_BIT) `else -`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE) +`define LSU_TAG_ID_BITS `LSUQ_ADDR_BITS +`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS) `endif -`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS) +`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_IDW + `DCACHE_CORE_TAG_ID_BITS) // Memory request data bits `define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8) diff --git a/hw/rtl/VX_execute.sv b/hw/rtl/VX_execute.sv index f0cdd37e..029d58ab 100644 --- a/hw/rtl/VX_execute.sv +++ b/hw/rtl/VX_execute.sv @@ -52,51 +52,29 @@ module VX_execute #( VX_dcache_req_if #( .NUM_REQS (`NUM_THREADS), 
.WORD_SIZE (4), - .TAG_WIDTH (`LSU_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) lsu_dcache_req_if(); VX_dcache_rsp_if #( .NUM_REQS (`NUM_THREADS), .WORD_SIZE (4), - .TAG_WIDTH (`LSU_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) lsu_dcache_rsp_if(); VX_dcache_req_if #( .NUM_REQS (`NUM_THREADS), .WORD_SIZE (4), - .TAG_WIDTH (`TEX_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) tex_dcache_req_if(); VX_dcache_rsp_if #( .NUM_REQS (`NUM_THREADS), .WORD_SIZE (4), - .TAG_WIDTH (`TEX_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) tex_dcache_rsp_if(); VX_tex_csr_if tex_csr_if(); - wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in, lsu_tag_in; - wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_out, lsu_tag_out; - - `UNUSED_VAR (tex_tag_out) - `UNUSED_VAR (lsu_tag_out) - - for (genvar i = 0; i < `NUM_THREADS; ++i) begin - assign tex_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(tex_dcache_req_if.tag[i][`TEX_TAG_ID_BITS-1:0]); - assign lsu_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(lsu_dcache_req_if.tag[i][`LSU_TAG_ID_BITS-1:0]); - `ifdef DBG_CACHE_REQ_INFO - assign tex_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = tex_dcache_req_if.tag[i][`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS]; - assign lsu_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = lsu_dcache_req_if.tag[i][`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS]; - `endif - end - - assign tex_dcache_rsp_if.tag[`TEX_TAG_ID_BITS-1:0] = tex_tag_out[`TEX_TAG_ID_BITS-1:0]; - assign lsu_dcache_rsp_if.tag[`LSU_TAG_ID_BITS-1:0] = lsu_tag_out[`LSU_TAG_ID_BITS-1:0]; -`ifdef DBG_CACHE_REQ_INFO - assign tex_dcache_rsp_if.tag[`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS] = tex_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS]; - assign lsu_dcache_rsp_if.tag[`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS] = lsu_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS]; -`endif - VX_cache_arb #( .NUM_REQS (2), .LANES (`NUM_THREADS), @@ 
-113,7 +91,7 @@ module VX_execute #( .req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}), .req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}), .req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}), - .req_tag_in ({tex_tag_in, lsu_tag_in}), + .req_tag_in ({tex_dcache_req_if.tag, lsu_dcache_req_if.tag}), .req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}), // Dcache request @@ -136,7 +114,7 @@ module VX_execute #( .rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}), .rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}), .rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}), - .rsp_tag_out ({tex_tag_out, lsu_tag_out}), + .rsp_tag_out ({tex_dcache_rsp_if.tag, lsu_dcache_rsp_if.tag}), .rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready}) ); diff --git a/hw/rtl/VX_icache_stage.sv b/hw/rtl/VX_icache_stage.sv index cb33b82d..ad296649 100644 --- a/hw/rtl/VX_icache_stage.sv +++ b/hw/rtl/VX_icache_stage.sv @@ -24,10 +24,17 @@ module VX_icache_stage #( localparam OUT_REG = 0; + reg [`DBG_CACHE_REQ_IDW-1:0] req_id; + wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id; + wire [`NW_BITS-1:0] req_tag, rsp_tag; + + `UNUSED_VAR (rsp_req_id) + wire icache_req_fire = icache_req_if.valid && icache_req_if.ready; - wire [`NW_BITS-1:0] req_tag = ifetch_req_if.wid; - wire [`NW_BITS-1:0] rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; + assign req_tag = ifetch_req_if.wid; + assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; + assign rsp_req_id = icache_rsp_if.tag[`NW_BITS +: `DBG_CACHE_REQ_IDW]; wire [31:0] rsp_PC; wire [`NUM_THREADS-1:0] rsp_tmask; @@ -51,16 +58,21 @@ module VX_icache_stage #( // Icache Request assign icache_req_if.valid = ifetch_req_if.valid; assign icache_req_if.addr = ifetch_req_if.PC[31:2]; + assign icache_req_if.tag = {req_id, req_tag}; + + always @(posedge clk) begin + if (reset) begin + req_id <= `DBG_CACHE_REQ_ID(0, 0); + end else begin + if (icache_req_fire) begin + 
req_id <= req_id + 1; + end + end + end // Can accept new request? assign ifetch_req_if.ready = icache_req_if.ready; -`ifdef DBG_CACHE_REQ_INFO - assign icache_req_if.tag = {ifetch_req_if.wid, ifetch_req_if.PC, req_tag}; -`else - assign icache_req_if.tag = req_tag; -`endif - wire [`NW_BITS-1:0] rsp_wid = rsp_tag; wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid); @@ -90,11 +102,11 @@ module VX_icache_stage #( `ifdef DBG_TRACE_CORE_ICACHE always @(posedge clk) begin - if (icache_req_if.valid && icache_req_if.ready) begin - dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC); + if (icache_req_fire) begin + dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h, req_id=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, req_id); end if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin - dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data); + dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, req_id=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, rsp_req_id, ifetch_rsp_if.data); end end `endif diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv index 8541f4c6..de47dca0 100644 --- a/hw/rtl/VX_lsu_unit.sv +++ b/hw/rtl/VX_lsu_unit.sv @@ -24,8 +24,6 @@ module VX_lsu_unit #( localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE); - localparam ADDR_TYPEW = `NC_TAG_BIT + `SM_ENABLE; - `STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) `STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) `STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter")) @@ -44,7 +42,7 @@ module VX_lsu_unit #( wire mbuf_empty; - wire [`NUM_THREADS-1:0][ADDR_TYPEW-1:0] lsu_addr_type, req_addr_type; + wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type; wire [`NUM_THREADS-1:0][31:0] full_addr; for (genvar i = 0; i < 
`NUM_THREADS; i++) begin @@ -83,7 +81,7 @@ module VX_lsu_unit #( wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch; VX_pipe_register #( - .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) req_pipe_reg ( .clk (clk), @@ -104,19 +102,22 @@ module VX_lsu_unit #( wire rsp_is_dup; wire rsp_is_prefetch; - `UNUSED_VAR (rsp_type) - `UNUSED_VAR (rsp_is_prefetch) - reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask; wire [`NUM_THREADS-1:0] rsp_rem_mask_n; wire [`NUM_THREADS-1:0] rsp_tmask; + reg [`DBG_CACHE_REQ_IDW-1:0] req_id; + wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id; reg [`NUM_THREADS-1:0] req_sent_mask; reg is_req_start; wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr; wire mbuf_full; + `UNUSED_VAR (rsp_type) + `UNUSED_VAR (rsp_is_prefetch) + `UNUSED_VAR (rsp_req_id) + wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_offset, rsp_offset; for (genvar i = 0; i < `NUM_THREADS; i++) begin assign req_offset[i] = req_addr[i][1:0]; @@ -124,6 +125,8 @@ module VX_lsu_unit #( wire [`NUM_THREADS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready; + wire dcache_req_fire_any = (| dcache_req_fire); + wire dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready; wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1}; @@ -135,7 +138,8 @@ module VX_lsu_unit #( wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); - assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS]; + assign mbuf_raddr = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: `LSUQ_ADDR_BITS]; + assign rsp_req_id = dcache_rsp_if.tag[(`CACHE_ADDR_TYPE_BITS + `LSU_TAG_ID_BITS) +: `DBG_CACHE_REQ_IDW]; `UNUSED_VAR (dcache_rsp_if.tag) // do not writeback from software prefetch 
@@ -214,7 +218,7 @@ module VX_lsu_unit #( 0: mem_req_byteen[req_offset[i]] = 1; 1: begin mem_req_byteen[req_offset[i]] = 1; - mem_req_byteen[{req_addr[i][1], 1'b1}] = 1; + mem_req_byteen[{req_offset[i][1], 1'b1}] = 1; end default : mem_req_byteen = {4{1'b1}}; endcase @@ -235,12 +239,17 @@ module VX_lsu_unit #( assign dcache_req_if.addr[i] = req_addr[i][31:2]; assign dcache_req_if.byteen[i] = mem_req_byteen; assign dcache_req_if.data[i] = mem_req_data; + assign dcache_req_if.tag[i] = {req_id, `LSU_TAG_ID_BITS'(req_tag), req_addr_type[i]}; + end - `ifdef DBG_CACHE_REQ_INFO - assign dcache_req_if.tag[i] = {req_wid, req_pc, req_tag, req_addr_type[i]}; - `else - assign dcache_req_if.tag[i] = {req_tag, req_addr_type[i]}; - `endif + always @(posedge clk) begin + if (reset) begin + req_id <= `DBG_CACHE_REQ_ID(1, 0); + end else begin + if (dcache_req_fire_any) begin + req_id <= req_id + 1; + end + end end assign ready_in = req_dep_ready && dcache_req_ready; @@ -339,22 +348,21 @@ module VX_lsu_unit #( `endif `ifdef DBG_TRACE_CORE_DCACHE - wire dcache_req_fire_any = (| dcache_req_fire); always @(posedge clk) begin if (lsu_req_if.valid && fence_wait) begin dpi_trace("%d: *** D$%0d fence wait\n", $time, CORE_ID); end if (dcache_req_fire_any) begin if (dcache_req_if.rw[0]) begin - dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire); + dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_id); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); dpi_trace(", data="); `TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(", req_id=%0h\n", req_id); end else begin - dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire); + 
dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire, req_id); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); @@ -362,8 +370,8 @@ module VX_lsu_unit #( end end if (dcache_rsp_fire) begin - dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=", - $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd); + dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, rd=%0d, data=", + $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, rsp_req_id, mbuf_raddr, rsp_rd); `TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); dpi_trace(", is_dup=%b\n", rsp_is_dup); end diff --git a/hw/rtl/cache/VX_bank.sv b/hw/rtl/cache/VX_bank.sv index 14d50e29..2dfc51fe 100644 --- a/hw/rtl/cache/VX_bank.sv +++ b/hw/rtl/cache/VX_bank.sv @@ -33,9 +33,6 @@ module VX_bank #( // core request tag size parameter CORE_TAG_WIDTH = 1, - // size of tag id in core request tag - parameter CORE_TAG_ID_BITS = 0, - // bank offset from beginning of index range parameter BANK_ADDR_OFFSET = 0, @@ -96,14 +93,9 @@ module VX_bank #( input wire [`LINE_SELECT_BITS-1:0] flush_addr ); - `UNUSED_PARAM (CORE_TAG_ID_BITS) - -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - wire [31:0] debug_pc_sel, debug_pc_st0, debug_pc_st1; - wire [`NW_BITS-1:0] debug_wid_sel, debug_wid_st0, debug_wid_st1; + wire [`DBG_CACHE_REQ_IDW-1:0] req_id_sel, req_id_st0, req_id_st1; `IGNORE_UNUSED_END -`endif wire [NUM_PORTS-1:0] creq_pmask; wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel; @@ -197,13 +189,7 @@ module VX_bank #( wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; wire creq_fire = creq_valid && creq_ready; -`ifdef DBG_CACHE_REQ_INFO - if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) 
begin - assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[0][`CACHE_REQ_INFO_RNG] : creq_tag[0][`CACHE_REQ_INFO_RNG]; - end else begin - assign {debug_wid_sel, debug_pc_sel} = 0; - end -`endif + assign req_id_sel = mshr_enable ? mshr_tag[0][`CACHE_REQ_ID_RNG] : creq_tag[0][`CACHE_REQ_ID_RNG]; wire [`CACHE_LINE_WIDTH-1:0] wdata_sel; assign wdata_sel[(NUM_PORTS * `WORD_WIDTH)-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[(NUM_PORTS * `WORD_WIDTH)-1:0] : creq_data; @@ -237,13 +223,7 @@ module VX_bank #( .data_out ({valid_st0, is_flush_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0}) ); -`ifdef DBG_CACHE_REQ_INFO - if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st0, debug_pc_st0} = tag_st0[0][`CACHE_REQ_INFO_RNG]; - end else begin - assign {debug_wid_st0, debug_pc_st0} = 0; - end -`endif + assign req_id_st0 = tag_st0[0][`CACHE_REQ_ID_RNG]; wire do_fill_st0 = valid_st0 && is_fill_st0; wire do_flush_st0 = valid_st0 && is_flush_st0; @@ -263,11 +243,9 @@ module VX_bank #( .clk (clk), .reset (reset), - `ifdef DBG_CACHE_REQ_INFO - .debug_pc (debug_pc_st0), - .debug_wid (debug_wid_st0), - `endif - .stall (crsq_stall), + .req_id (req_id_st0), + + .stall (crsq_stall), // read/Fill .lookup (do_lookup_st0), @@ -293,13 +271,7 @@ module VX_bank #( .data_out ({valid_st1, is_mshr_st1, is_fill_st1, is_read_st1, is_write_st1, miss_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1}) ); -`ifdef DBG_CACHE_REQ_INFO - if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st1, debug_pc_st1} = tag_st1[0][`CACHE_REQ_INFO_RNG]; - end else begin - assign {debug_wid_st1, debug_pc_st1} = 0; - end -`endif + assign req_id_st1 = tag_st1[0][`CACHE_REQ_ID_RNG]; wire do_read_st0 = valid_st0 && is_read_st0; wire do_read_st1 = valid_st1 && 
is_read_st1; @@ -323,10 +295,8 @@ module VX_bank #( .clk (clk), .reset (reset), - `ifdef DBG_CACHE_REQ_INFO - .debug_pc (debug_pc_st1), - .debug_wid (debug_wid_st1), - `endif + .req_id (req_id_st1), + .stall (crsq_stall), .read (do_read_st1 || do_mshr_st1), @@ -372,14 +342,9 @@ module VX_bank #( .clk (clk), .reset (reset), - `ifdef DBG_CACHE_REQ_INFO - .deq_debug_pc (debug_pc_sel), - .deq_debug_wid (debug_wid_sel), - .lkp_debug_pc (debug_pc_st0), - .lkp_debug_wid (debug_wid_st0), - .rel_debug_pc (debug_pc_st1), - .rel_debug_wid (debug_wid_st1), - `endif + .deq_req_id (req_id_sel), + .lkp_req_id (req_id_st0), + .rel_req_id (req_id_st1), // allocate .allocate_valid (mshr_allocate), @@ -525,22 +490,22 @@ module VX_bank #( dpi_trace("%d: cache%0d:%0d fill-rsp: addr=%0h, id=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data); end if (mshr_fire) begin - dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, debug_wid_sel, debug_pc_sel); + dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, req_id_sel); end if (creq_fire) begin if (creq_rw) - dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel); + dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, req_id_sel); else - dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, 
byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel); + dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, req_id_sel); end if (crsq_fire) begin - dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1); + dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, req_id_st1); end if (mreq_push) begin if (is_write_st1) - dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1); + dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, req_id_st1); else - dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, debug_wid_st1, debug_pc_st1); + dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, req_id_st1); end end `endif diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 29e14892..6b6841dd 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -580,8 +580,7 @@ module VX_cache #( .MSHR_SIZE (MSHR_SIZE), .MREQ_SIZE (MREQ_SIZE), .WRITE_ENABLE (WRITE_ENABLE), - .CORE_TAG_WIDTH (CORE_TAG_X_WIDTH), - 
.CORE_TAG_ID_BITS (CORE_TAG_ID_X_BITS), + .CORE_TAG_WIDTH (CORE_TAG_X_WIDTH), .BANK_ADDR_OFFSET (BANK_ADDR_OFFSET) ) bank ( `SCOPE_BIND_VX_cache_bank(i) diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index 8af2921b..b8f2fdbc 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -3,9 +3,8 @@ `include "VX_platform.vh" -`ifdef DBG_CACHE_REQ_INFO -`include "VX_define.vh" -`endif +// cache request identifier +`define DBG_CACHE_REQ_IDW 48 `define REQS_BITS `LOG2UP(NUM_REQS) @@ -52,7 +51,7 @@ `define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS] -`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_MDATAW) +`define CACHE_REQ_ID_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_IDW) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/cache/VX_data_access.sv b/hw/rtl/cache/VX_data_access.sv index a1a5247b..887b4095 100644 --- a/hw/rtl/cache/VX_data_access.sv +++ b/hw/rtl/cache/VX_data_access.sv @@ -21,12 +21,9 @@ module VX_data_access #( input wire clk, input wire reset, -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - input wire[31:0] debug_pc, - input wire[`NW_BITS-1:0] debug_wid, + input wire[`DBG_CACHE_REQ_IDW-1:0] req_id, `IGNORE_UNUSED_END -`endif input wire stall, @@ -125,10 +122,10 @@ module VX_data_access #( dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data); end if (read && ~stall) begin - dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, read_data); + dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, req_id=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, line_addr, read_data); end if (write && ~stall) begin - 
dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, byteen, line_addr, write_data); + dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, req_id=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, byteen, line_addr, write_data); end end `endif diff --git a/hw/rtl/cache/VX_miss_resrv.sv b/hw/rtl/cache/VX_miss_resrv.sv index bda63bb1..08b76add 100644 --- a/hw/rtl/cache/VX_miss_resrv.sv +++ b/hw/rtl/cache/VX_miss_resrv.sv @@ -25,16 +25,11 @@ module VX_miss_resrv #( input wire clk, input wire reset, -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - input wire[31:0] deq_debug_pc, - input wire[`NW_BITS-1:0] deq_debug_wid, - input wire[31:0] lkp_debug_pc, - input wire[`NW_BITS-1:0] lkp_debug_wid, - input wire[31:0] rel_debug_pc, - input wire[`NW_BITS-1:0] rel_debug_wid, + input wire[`DBG_CACHE_REQ_IDW-1:0] deq_req_id, + input wire[`DBG_CACHE_REQ_IDW-1:0] lkp_req_id, + input wire[`DBG_CACHE_REQ_IDW-1:0] rel_req_id, `IGNORE_UNUSED_END -`endif // allocate input wire allocate_valid, @@ -206,23 +201,22 @@ module VX_miss_resrv #( always @(posedge clk) begin if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin if (allocate_fire) - dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_debug_wid, deq_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_req_id); if (fill_valid) dpi_trace("%d: cache%0d:%0d mshr-fill: addr=%0h, id=%0d, addr=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id, `LINE_TO_BYTE_ADDR(fill_addr, BANK_ID)); if (dequeue_fire) - dpi_trace("%d: 
cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_debug_wid, deq_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_id); if (lookup_replay) dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id); if (lookup_valid) - dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_debug_wid, lkp_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b, req_id=%0h\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_req_id); if (release_valid) - dpi_trace("%d: cache%0d:%0d mshr-release id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - release_id, rel_debug_wid, rel_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-release id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, release_id, rel_req_id); dpi_trace("%d: cache%0d:%0d mshr-table", $time, CACHE_ID, BANK_ID); for (integer i = 0; i < MSHR_SIZE; ++i) begin if (valid_table[i]) begin diff --git a/hw/rtl/cache/VX_shared_mem.sv b/hw/rtl/cache/VX_shared_mem.sv index 46ea0cfc..257cf295 100644 --- a/hw/rtl/cache/VX_shared_mem.sv +++ b/hw/rtl/cache/VX_shared_mem.sv @@ -254,22 +254,19 @@ module VX_shared_mem #( .ready_out (core_rsp_ready) ); -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - wire [NUM_BANKS-1:0][31:0] debug_pc_st0, debug_pc_st1; - wire [NUM_BANKS-1:0][`NW_BITS-1:0] debug_wid_st0, debug_wid_st1; + wire [NUM_BANKS-1:0][`DBG_CACHE_REQ_IDW-1:0] req_id_st0, req_id_st1; `IGNORE_UNUSED_END for (genvar i = 0; i < NUM_BANKS; ++i) begin if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - 
assign {debug_wid_st0[i], debug_pc_st0[i]} = per_bank_core_req_tag_unqual[i][`CACHE_REQ_INFO_RNG]; - assign {debug_wid_st1[i], debug_pc_st1[i]} = per_bank_core_req_tag[i][`CACHE_REQ_INFO_RNG]; + assign req_id_st0[i] = per_bank_core_req_tag_unqual[i][`CACHE_REQ_ID_RNG]; + assign req_id_st1[i] = per_bank_core_req_tag[i][`CACHE_REQ_ID_RNG]; end else begin - assign {debug_wid_st0[i], debug_pc_st0[i]} = 0; - assign {debug_wid_st1[i], debug_pc_st1[i]} = 0; + assign req_id_st0[i] = 0; + assign req_id_st1[i] = 0; end end -`endif `ifdef DBG_TRACE_CACHE_BANK @@ -309,11 +306,11 @@ module VX_shared_mem #( for (integer i = 0; i < NUM_BANKS; ++i) begin if (per_bank_core_req_valid_unqual[i]) begin if (per_bank_core_req_rw_unqual[i]) begin - dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], debug_wid_st0[i], debug_pc_st0[i]); + dpi_trace("%d: smem%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, req_id=%0h\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], req_id_st0[i]); end else begin - dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], debug_wid_st0[i], debug_pc_st0[i]); + dpi_trace("%d: smem%0d:%0d core-rd-req: addr=%0h, tag=%0h, req_id=%0h\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], req_id_st0[i]); end end end @@ -322,11 +319,11 @@ module VX_shared_mem #( for (integer i = 0; i < NUM_BANKS; ++i) begin if (per_bank_core_req_valid[i]) begin if 
(per_bank_core_req_rw[i]) begin - dpi_trace("%d: cache%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_req_data[i], debug_wid_st1[i], debug_pc_st1[i]); + dpi_trace("%d: smem%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, data=%0h, req_id=%0h\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_data[i], req_id_st1[i]); end else begin - dpi_trace("%d: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_rsp_data[i], debug_wid_st1[i], debug_pc_st1[i]); + dpi_trace("%d: smem%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, data=%0h, req_id=%0h\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_rsp_data[i], req_id_st1[i]); end end end diff --git a/hw/rtl/cache/VX_tag_access.sv b/hw/rtl/cache/VX_tag_access.sv index 55124a65..808008d5 100644 --- a/hw/rtl/cache/VX_tag_access.sv +++ b/hw/rtl/cache/VX_tag_access.sv @@ -17,12 +17,9 @@ module VX_tag_access #( input wire clk, input wire reset, -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - input wire[31:0] debug_pc, - input wire[`NW_BITS-1:0] debug_wid, + input wire[`DBG_CACHE_REQ_IDW-1:0] req_id, `IGNORE_UNUSED_END -`endif input wire stall, @@ -71,9 +68,9 @@ module VX_tag_access #( end if (lookup && ~stall) begin if (tag_match) begin - dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag); + dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, req_id=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, 
BANK_ID), req_id, line_addr, line_tag); end else begin - dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag, read_tag); + dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, req_id=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, line_addr, line_tag, read_tag); end end end diff --git a/hw/rtl/tex_unit/VX_tex_addr.sv b/hw/rtl/tex_unit/VX_tex_addr.sv index 26a20566..c33cc47a 100644 --- a/hw/rtl/tex_unit/VX_tex_addr.sv +++ b/hw/rtl/tex_unit/VX_tex_addr.sv @@ -12,13 +12,13 @@ module VX_tex_addr #( input wire req_valid, input wire [NUM_REQS-1:0] req_tmask, - input wire [1:0][NUM_REQS-1:0][31:0] req_coords, + input wire [1:0][NUM_REQS-1:0][`TEX_FXD_BITS-1:0] req_coords, input wire [`TEX_FORMAT_BITS-1:0] req_format, input wire [`TEX_FILTER_BITS-1:0] req_filter, input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps, input wire [`TEX_ADDR_BITS-1:0] req_baseaddr, input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff, - input wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] req_logdims, + input wire [NUM_REQS-1:0][1:0][`TEX_LOD_BITS-1:0] req_logdims, input wire [REQ_INFOW-1:0] req_info, output wire req_ready, @@ -27,31 +27,33 @@ module VX_tex_addr #( output wire rsp_valid, output wire [NUM_REQS-1:0] rsp_tmask, output wire [`TEX_FILTER_BITS-1:0] rsp_filter, - output wire [`TEX_STRIDE_BITS-1:0] rsp_stride, + output wire [`TEX_LGSTRIDE_BITS-1:0] rsp_lgstride, output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr, - output wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends, + output wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends, output wire [REQ_INFOW-1:0] rsp_info, input wire rsp_ready ); `UNUSED_PARAM (CORE_ID) - localparam PITCH_BITS = `MAX(`TEX_DIM_BITS, `TEX_STRIDE_BITS) + 1; - localparam SCALED_U_W = `FIXED_INT + `TEX_STRIDE_BITS; - localparam 
SCALED_X_W = (2 * `FIXED_INT); - localparam SCALED_V_W = SCALED_X_W + `TEX_STRIDE_BITS; + localparam SHIFT_BITS = $clog2(`TEX_FXD_FRAC+1); + localparam PITCH_BITS = `MAX(`TEX_LOD_BITS, `TEX_LGSTRIDE_BITS) + 1; + localparam SCALED_X_W = `TEX_DIM_BITS + `TEX_BLEND_FRAC; + localparam OFFSET_U_W = `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX; + localparam OFFSET_V_W = `TEX_DIM_BITS + `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX; wire valid_s0; wire [NUM_REQS-1:0] tmask_s0; wire [`TEX_FILTER_BITS-1:0] filter_s0; wire [REQ_INFOW-1:0] req_info_s0; - wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_lo, clamped_lo_s0; - wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_hi, clamped_hi_s0; - wire [`TEX_STRIDE_BITS-1:0] log_stride, log_stride_s0; + wire [NUM_REQS-1:0][1:0][`TEX_FXD_FRAC-1:0] clamped_lo, clamped_lo_s0; + wire [NUM_REQS-1:0][1:0][`TEX_FXD_FRAC-1:0] clamped_hi, clamped_hi_s0; + wire [NUM_REQS-1:0][1:0][SHIFT_BITS-1:0] dim_shift, dim_shift_s0; + wire [`TEX_LGSTRIDE_BITS-1:0] log_stride, log_stride_s0; wire [NUM_REQS-1:0][31:0] mip_addr, mip_addr_s0; - wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] log_dims_s0; wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0; - + wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0; + wire stall_out; // stride @@ -67,9 +69,9 @@ module VX_tex_addr #( for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar j = 0; j < 2; ++j) begin - wire [`FIXED_FRAC-1:0] delta = (`FIXED_HALF >> req_logdims[i][j]); - wire [31:0] coord_lo = req_filter ? (req_coords[j][i] - 32'(delta)) : req_coords[j][i]; - wire [31:0] coord_hi = req_filter ? (req_coords[j][i] + 32'(delta)) : req_coords[j][i]; + wire [`TEX_FXD_FRAC-1:0] delta = (`TEX_FXD_HALF >> req_logdims[i][j]); + wire [`TEX_FXD_BITS-1:0] coord_lo = req_filter ? (req_coords[j][i] - `TEX_FXD_BITS'(delta)) : req_coords[j][i]; + wire [`TEX_FXD_BITS-1:0] coord_hi = req_filter ? 
(req_coords[j][i] + `TEX_FXD_BITS'(delta)) : req_coords[j][i]; VX_tex_wrap #( .CORE_ID (CORE_ID) @@ -86,66 +88,72 @@ module VX_tex_addr #( .coord_i (coord_hi), .coord_o (clamped_hi[i][j]) ); + + assign dim_shift[i][j] = (`TEX_FXD_FRAC - `TEX_BLEND_FRAC - req_logdims[i][j]); end assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride); assign mip_addr[i] = req_baseaddr + 32'(req_mipoff[i]); end VX_pipe_register #( - .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)), + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * SHIFT_BITS + 32 + 2 * 2 * `TEX_FXD_FRAC)), .RESETW (1) ) pipe_reg0 ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, req_logdims, mip_addr, clamped_lo, clamped_hi}), - .data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0}) + .data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, dim_shift, mip_addr, clamped_lo, clamped_hi}), + .data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, dim_shift_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0}) ); // addresses generation - wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_lo; - wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_hi; - wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] blends; + wire [NUM_REQS-1:0][1:0][SCALED_X_W-1:0] scaled_lo; + wire [NUM_REQS-1:0][1:0][SCALED_X_W-1:0] scaled_hi; + wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_lo; + wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_hi; + wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_lo; + wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_hi; + wire [NUM_REQS-1:0][31:0] base_addr_lo; + wire [NUM_REQS-1:0][31:0] base_addr_hi; + wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] blends; 
wire [NUM_REQS-1:0][3:0][31:0] addr; for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar j = 0; j < 2; ++j) begin - assign scaled_lo[i][j] = scale_to_dim(clamped_lo_s0[i][j], log_dims_s0[i][j]); - assign scaled_hi[i][j] = scale_to_dim(clamped_hi_s0[i][j], log_dims_s0[i][j]); - assign blends[i][j] = filter_s0 ? clamped_lo_s0[i][j][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0); + assign scaled_lo[i][j] = SCALED_X_W'(clamped_lo_s0[i][j] >> dim_shift_s0[i][j]); + assign scaled_hi[i][j] = SCALED_X_W'(clamped_hi_s0[i][j] >> dim_shift_s0[i][j]); + assign blends[i][j] = filter_s0 ? scaled_lo[i][j][`TEX_BLEND_FRAC-1:0] : `TEX_BLEND_FRAC'(0); end end - `UNUSED_VAR (log_pitch_s0) - for (genvar i = 0; i < NUM_REQS; ++i) begin - wire [SCALED_U_W-1:0] offset_u_lo = SCALED_U_W'(scaled_lo[i][0]) << log_stride_s0; - wire [SCALED_U_W-1:0] offset_u_hi = SCALED_U_W'(scaled_hi[i][0]) << log_stride_s0; + assign offset_u_lo[i] = OFFSET_U_W'(scaled_lo[i][0][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_stride_s0; + assign offset_u_hi[i] = OFFSET_U_W'(scaled_hi[i][0][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_stride_s0; - wire [SCALED_V_W-1:0] offset_v_lo = SCALED_V_W'(scaled_lo[i][1]) << log_pitch_s0[i]; - wire [SCALED_V_W-1:0] offset_v_hi = SCALED_V_W'(scaled_hi[i][1]) << log_pitch_s0[i]; + assign offset_v_lo[i] = OFFSET_V_W'(scaled_lo[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i]; + assign offset_v_hi[i] = OFFSET_V_W'(scaled_hi[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i]; - wire [31:0] base_addr_lo = mip_addr_s0[i] + 32'(offset_v_lo); - wire [31:0] base_addr_hi = mip_addr_s0[i] + 32'(offset_v_hi); + assign base_addr_lo[i] = mip_addr_s0[i] + 32'(offset_v_lo[i]); + assign base_addr_hi[i] = mip_addr_s0[i] + 32'(offset_v_hi[i]); - assign addr[i][0] = base_addr_lo + 32'(offset_u_lo); - assign addr[i][1] = base_addr_lo + 32'(offset_u_hi); - assign addr[i][2] = base_addr_hi + 32'(offset_u_lo); - assign addr[i][3] = base_addr_hi + 32'(offset_u_hi); + assign addr[i][0] = 
base_addr_lo[i] + 32'(offset_u_lo[i]); + assign addr[i][1] = base_addr_lo[i] + 32'(offset_u_hi[i]); + assign addr[i][2] = base_addr_hi[i] + 32'(offset_u_lo[i]); + assign addr[i][3] = base_addr_hi[i] + 32'(offset_u_hi[i]); end assign stall_out = rsp_valid && ~rsp_ready; VX_pipe_register #( - .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFOW), + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `TEX_BLEND_FRAC) + REQ_INFOW), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~stall_out), .data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}), - .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride, rsp_addr, rsp_blends, rsp_info}) + .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_lgstride, rsp_addr, rsp_blends, rsp_info}) ); assign req_ready = ~stall_out; @@ -157,22 +165,47 @@ module VX_tex_addr #( assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; always @(posedge clk) begin + if (req_valid && ~stall_out) begin + dpi_trace("%d: *** log_pitch=", $time); + `TRACE_ARRAY1D(log_pitch, NUM_REQS); + dpi_trace(", mip_addr="); + `TRACE_ARRAY1D(mip_addr, NUM_REQS); + dpi_trace(", req_logdims="); + `TRACE_ARRAY2D(req_logdims, 2, NUM_REQS); + dpi_trace(", clamped_lo="); + `TRACE_ARRAY2D(clamped_lo, 2, NUM_REQS); + dpi_trace(", clamped_hi="); + `TRACE_ARRAY2D(clamped_hi, 2, NUM_REQS); + dpi_trace("\n"); + end + + if (valid_s0 && ~stall_out) begin + dpi_trace("%d: *** scaled_lo=", $time); + `TRACE_ARRAY2D(scaled_lo, 2, NUM_REQS); + dpi_trace(", scaled_hi="); + `TRACE_ARRAY2D(scaled_hi, 2, NUM_REQS); + dpi_trace(", offset_u_lo="); + `TRACE_ARRAY1D(offset_u_lo, NUM_REQS); + dpi_trace(", offset_u_hi="); + `TRACE_ARRAY1D(offset_u_hi, NUM_REQS); + dpi_trace(", offset_v_lo="); + `TRACE_ARRAY1D(offset_v_lo, NUM_REQS); + dpi_trace(", offset_v_hi="); + `TRACE_ARRAY1D(offset_v_hi, NUM_REQS); + dpi_trace(", 
base_addr_lo="); + `TRACE_ARRAY1D(base_addr_lo, NUM_REQS); + dpi_trace(", base_addr_hi="); + `TRACE_ARRAY1D(base_addr_hi, NUM_REQS); + dpi_trace("\n"); + end + if (rsp_valid && rsp_ready) begin - dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, tride=%0d, addr=", - $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride); + dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, lgstride=%0d, addr=", + $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_lgstride); `TRACE_ARRAY2D(rsp_addr, 4, NUM_REQS); dpi_trace("\n"); end end `endif -function logic [`FIXED_INT-1:0] scale_to_dim (input logic [`FIXED_FRAC-1:0] src, - input logic [`TEX_DIM_BITS-1:0] dim); -`IGNORE_WARNINGS_BEGIN - logic [`FIXED_BITS-1:0] out; -`IGNORE_WARNINGS_END - out = `FIXED_BITS'(src) << dim; - return out[`FIXED_FRAC +: `FIXED_INT]; -endfunction - endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_define.vh b/hw/rtl/tex_unit/VX_tex_define.vh index 16272fc9..34564b39 100644 --- a/hw/rtl/tex_unit/VX_tex_define.vh +++ b/hw/rtl/tex_unit/VX_tex_define.vh @@ -3,31 +3,26 @@ `include "VX_define.vh" -`define FIXED_BITS 32 -`define FIXED_FRAC 20 -`define FIXED_INT (`FIXED_BITS - `FIXED_FRAC) -`define FIXED_ONE (2 ** `FIXED_FRAC) -`define FIXED_HALF (`FIXED_ONE >> 1) -`define FIXED_MASK (`FIXED_ONE - 1) +`define TEX_FXD_INT (`TEX_FXD_BITS - `TEX_FXD_FRAC) +`define TEX_FXD_ONE (2 ** `TEX_FXD_FRAC) +`define TEX_FXD_HALF (`TEX_FXD_ONE >> 1) +`define TEX_FXD_MASK (`TEX_FXD_ONE - 1) `define TEX_ADDR_BITS 32 `define TEX_FORMAT_BITS 3 `define TEX_WRAP_BITS 2 -`define TEX_DIM_BITS 4 `define TEX_FILTER_BITS 1 +`define TEX_MIPOFF_BITS (2*`TEX_DIM_BITS+1) -`define TEX_MIPOFF_BITS (2*12+1) -`define TEX_STRIDE_BITS 2 - -`define TEX_LOD_BITS 4 -`define TEX_MIP_BITS (`NTEX_BITS + `TEX_LOD_BITS) +`define TEX_LGSTRIDE_MAX 2 +`define TEX_LGSTRIDE_BITS 2 `define TEX_WRAP_CLAMP 0 `define TEX_WRAP_REPEAT 1 `define TEX_WRAP_MIRROR 2 
-`define BLEND_FRAC 8 -`define BLEND_ONE (2 ** `BLEND_FRAC) +`define TEX_BLEND_FRAC 8 +`define TEX_BLEND_ONE (2 ** `TEX_BLEND_FRAC) `define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0) `define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1) diff --git a/hw/rtl/tex_unit/VX_tex_mem.sv b/hw/rtl/tex_unit/VX_tex_mem.sv index 91aa0438..fc99466e 100644 --- a/hw/rtl/tex_unit/VX_tex_mem.sv +++ b/hw/rtl/tex_unit/VX_tex_mem.sv @@ -15,7 +15,7 @@ module VX_tex_mem #( input wire req_valid, input wire [NUM_REQS-1:0] req_tmask, input wire [`TEX_FILTER_BITS-1:0] req_filter, - input wire [`TEX_STRIDE_BITS-1:0] req_stride, + input wire [`TEX_LGSTRIDE_BITS-1:0] req_lgstride, input wire [NUM_REQS-1:0][3:0][31:0] req_addr, input wire [REQ_INFOW-1:0] req_info, output wire req_ready, @@ -63,23 +63,23 @@ module VX_tex_mem #( wire [NUM_REQS-1:0] q_req_tmask; wire [`TEX_FILTER_BITS-1:0] q_req_filter; wire [REQ_INFOW-1:0] q_req_info; - wire [`TEX_STRIDE_BITS-1:0] q_req_stride; + wire [`TEX_LGSTRIDE_BITS-1:0] q_req_lgstride; wire [3:0][NUM_REQS-1:0][1:0] q_align_offs; wire [3:0] q_dup_reqs; assign reqq_push = req_valid && req_ready; VX_fifo_queue #( - .DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4), - .SIZE (`LSUQ_SIZE), + .DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (4 * NUM_REQS * 2) + 4), + .SIZE (`TEXQ_SIZE), .OUT_REG (1) ) req_queue ( .clk (clk), .reset (reset), .push (reqq_push), .pop (reqq_pop), - .data_in ({req_addr_w, req_tmask, req_info, req_filter, req_stride, align_offs, dup_reqs}), - .data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}), + .data_in ({req_addr_w, req_tmask, req_info, req_filter, req_lgstride, align_offs, dup_reqs}), + .data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_lgstride, q_align_offs, q_dup_reqs}), .empty (reqq_empty), .full (reqq_full), `UNUSED_PIN (alm_full), @@ -96,8 +96,12 @@ module 
VX_tex_mem #( wire sent_all_ready, last_texel_sent; wire req_texel_dup; wire [NUM_REQS-1:0][29:0] req_texel_addr; + reg [`DBG_CACHE_REQ_IDW-1:0] req_id; + wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id; reg [1:0] req_texel_idx; reg req_texels_done; + + `UNUSED_VAR (rsp_req_id) always @(posedge clk) begin if (reset || last_texel_sent) begin @@ -146,14 +150,19 @@ module VX_tex_mem #( assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask; assign dcache_req_if.rw = {NUM_REQS{1'b0}}; assign dcache_req_if.addr = req_texel_addr; - assign dcache_req_if.byteen = {NUM_REQS{4'b1111}}; + assign dcache_req_if.byteen = {NUM_REQS{4'b0}}; assign dcache_req_if.data = 'x; + assign dcache_req_if.tag = {NUM_REQS{req_id, `LSU_TAG_ID_BITS'(req_texel_idx), `CACHE_ADDR_TYPE_BITS'(0)}}; -`ifdef DBG_CACHE_REQ_INFO - assign dcache_req_if.tag = {NUM_REQS{q_req_info[`DBG_CACHE_REQ_MDATAW-1:0], req_texel_idx}}; -`else - assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}}; -`endif + always @(posedge clk) begin + if (reset) begin + req_id <= `DBG_CACHE_REQ_ID(2, 0); + end else begin + if (dcache_req_fire_any) begin + req_id <= req_id + 1; + end + end + end // Dcache Response @@ -162,14 +171,17 @@ module VX_tex_mem #( reg [NUM_REQS-1:0][31:0] rsp_data_qual; reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init; wire [RSP_CTR_W-1:0] rsp_rem_ctr_n; + wire [NUM_REQS-1:0][1:0] rsp_align_offs; wire dcache_rsp_fire; wire [1:0] rsp_texel_idx; wire rsp_texel_dup; - - assign rsp_texel_idx = dcache_rsp_if.tag[1:0]; + + assign rsp_texel_idx = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: 2]; + assign rsp_req_id = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS + `LSU_TAG_ID_BITS +: `DBG_CACHE_REQ_IDW]; `UNUSED_VAR (dcache_rsp_if.tag) assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx]; + assign rsp_align_offs = q_align_offs[rsp_texel_idx]; assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready; @@ -180,12 +192,12 @@ module VX_tex_mem #( reg [31:0] rsp_data_shifted; 
always @(*) begin rsp_data_shifted[31:16] = src_data[31:16]; - rsp_data_shifted[15:0] = q_align_offs[rsp_texel_idx][i][1] ? src_data[31:16] : src_data[15:0]; - rsp_data_shifted[7:0] = q_align_offs[rsp_texel_idx][i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0]; + rsp_data_shifted[15:0] = rsp_align_offs[i][1] ? src_data[31:16] : src_data[15:0]; + rsp_data_shifted[7:0] = rsp_align_offs[i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0]; end always @(*) begin - case (q_req_stride) + case (q_req_lgstride) 0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]); 1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]); default: rsp_data_qual[i] = rsp_data_shifted; @@ -266,20 +278,20 @@ module VX_tex_mem #( always @(posedge clk) begin if (dcache_req_fire_any) begin - dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=", - $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_texel_idx); + dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, addr=", + $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_id, req_texel_idx); `TRACE_ARRAY1D(req_texel_addr, NUM_REQS); dpi_trace(", is_dup=%b\n", req_texel_dup); end if (dcache_rsp_fire) begin - dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=", - $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_texel_idx); + dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, data=", + $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_req_id, rsp_texel_idx); `TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS); dpi_trace("\n"); end if (req_valid && req_ready) begin - dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=", - $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride); + dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, lgstride=%0d, addr=", + $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_lgstride); 
`TRACE_ARRAY2D(req_addr, 4, NUM_REQS); dpi_trace("\n"); end diff --git a/hw/rtl/tex_unit/VX_tex_sampler.sv b/hw/rtl/tex_unit/VX_tex_sampler.sv index ac0f1496..63371337 100644 --- a/hw/rtl/tex_unit/VX_tex_sampler.sv +++ b/hw/rtl/tex_unit/VX_tex_sampler.sv @@ -12,7 +12,7 @@ module VX_tex_sampler #( input wire req_valid, input wire [NUM_REQS-1:0] req_tmask, input wire [`TEX_FORMAT_BITS-1:0] req_format, - input wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] req_blends, + input wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] req_blends, input wire [NUM_REQS-1:0][3:0][31:0] req_data, input wire [REQ_INFOW-1:0] req_info, output wire req_ready, @@ -32,7 +32,7 @@ module VX_tex_sampler #( wire [REQ_INFOW-1:0] req_info_s0; wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh; wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0; - wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] blend_v, blend_v_s0; + wire [NUM_REQS-1:0][`TEX_BLEND_FRAC-1:0] blend_v, blend_v_s0; wire [NUM_REQS-1:0][31:0] texel_v; wire stall_out; @@ -52,7 +52,7 @@ module VX_tex_sampler #( end wire [7:0] beta = req_blends[i][0]; - wire [8:0] alpha = `BLEND_ONE - beta; + wire [8:0] alpha = `TEX_BLEND_ONE - beta; VX_tex_lerp #( ) tex_lerp_ul ( @@ -76,7 +76,7 @@ module VX_tex_sampler #( end VX_pipe_register #( - .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)), + .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `TEX_BLEND_FRAC) + (2 * NUM_REQS * 32)), .RESETW (1) ) pipe_reg0 ( .clk (clk), @@ -88,7 +88,7 @@ module VX_tex_sampler #( for (genvar i = 0; i < NUM_REQS; i++) begin wire [7:0] beta = blend_v_s0[i]; - wire [8:0] alpha = `BLEND_ONE - beta; + wire [8:0] alpha = `TEX_BLEND_ONE - beta; VX_tex_lerp #( ) tex_lerp_v ( diff --git a/hw/rtl/tex_unit/VX_tex_stride.sv b/hw/rtl/tex_unit/VX_tex_stride.sv index 50393fe9..0e1eca6a 100644 --- a/hw/rtl/tex_unit/VX_tex_stride.sv +++ b/hw/rtl/tex_unit/VX_tex_stride.sv @@ -4,11 +4,11 @@ module VX_tex_stride #( parameter CORE_ID = 0 ) ( input wire [`TEX_FORMAT_BITS-1:0] 
format, - output wire [`TEX_STRIDE_BITS-1:0] log_stride + output wire [`TEX_LGSTRIDE_BITS-1:0] log_stride ); `UNUSED_PARAM (CORE_ID) - reg [`TEX_STRIDE_BITS-1:0] log_stride_r; + reg [`TEX_LGSTRIDE_BITS-1:0] log_stride_r; always @(*) begin case (format) diff --git a/hw/rtl/tex_unit/VX_tex_unit.sv b/hw/rtl/tex_unit/VX_tex_unit.sv index 6be6aa43..38f93eb2 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.sv +++ b/hw/rtl/tex_unit/VX_tex_unit.sv @@ -20,13 +20,13 @@ module VX_tex_unit #( localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32; localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S; - localparam REQ_INFOW_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFOW_A; + localparam REQ_INFOW_M = (2 * `NUM_THREADS * `TEX_BLEND_FRAC) + REQ_INFOW_A; - reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; - reg [1:0][`TEX_DIM_BITS-1:0] tex_dims [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; + reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][`TEX_LOD_MAX+1-1:0]; + reg [1:0][`TEX_LOD_BITS-1:0] tex_logdims [`NUM_TEX_UNITS-1:0]; + reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0]; reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0]; reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0]; - reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0]; reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0]; // CSRs programming @@ -35,38 +35,46 @@ module VX_tex_unit #( `UNUSED_VAR (csrs_dirty) for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin - wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS]; always @(posedge clk) begin if (tex_csr_if.write_enable) begin case (tex_csr_if.write_addr) - `CSR_TEX_ADDR(i) : begin + `CSR_TEX(i, `TEX_STATE_ADDR) : begin tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; csrs_dirty[i] <= 1; end - `CSR_TEX_FORMAT(i) : begin + `CSR_TEX(i, `TEX_STATE_FORMAT) : begin tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; csrs_dirty[i] <= 1; end - 
`CSR_TEX_WRAP(i) : begin - tex_wraps[i][0] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS]; - tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS]; + `CSR_TEX(i, `TEX_STATE_WRAPU) : begin + tex_wraps[i][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; csrs_dirty[i] <= 1; end - `CSR_TEX_FILTER(i) : begin + `CSR_TEX(i, `TEX_STATE_WRAPV) : begin + tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; + csrs_dirty[i] <= 1; + end + `CSR_TEX(i, `TEX_STATE_FILTER) : begin tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; csrs_dirty[i] <= 1; end - `CSR_TEX_MIPOFF(i) : begin - tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; + `CSR_TEX(i, `TEX_STATE_WIDTH) : begin + tex_logdims[i][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; csrs_dirty[i] <= 1; end - `CSR_TEX_WIDTH(i) : begin - tex_dims[i][mip_level][0] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; + `CSR_TEX(i, `TEX_STATE_HEIGHT) : begin + tex_logdims[i][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; csrs_dirty[i] <= 1; end - `CSR_TEX_HEIGHT(i) : begin - tex_dims[i][mip_level][1] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; - csrs_dirty[i] <= 1; + default: begin + for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin + `IGNORE_WARNINGS_BEGIN + if (tex_csr_if.write_addr == `CSR_ADDR_BITS'(`CSR_TEX(i, `TEX_STATE_MIPOFF(j)))) begin + `IGNORE_WARNINGS_END + tex_mipoff[i][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; + csrs_dirty[i] <= 1; + end + end end endcase end @@ -78,14 +86,15 @@ module VX_tex_unit #( // mipmap attributes - wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; - wire [`NUM_THREADS-1:0][1:0][`TEX_DIM_BITS-1:0] sel_dims; + wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; + wire [`NUM_THREADS-1:0][1:0][`TEX_LOD_BITS-1:0] sel_logdims; for (genvar i = 0; i < `NUM_THREADS; ++i) begin wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0]; - wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS]; - assign 
sel_mipoff[i] = tex_mipoff[unit][mip_level]; - assign sel_dims[i] = tex_dims[unit][mip_level]; + wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][`TEX_LOD_BITS-1:0]; + assign sel_mipoff[i] = tex_mipoff[unit][mip_level]; + assign sel_logdims[i][0] = (tex_logdims[unit][0] - mip_level); + assign sel_logdims[i][1] = (tex_logdims[unit][1] - mip_level); end // address generation @@ -93,8 +102,8 @@ module VX_tex_unit #( wire mem_req_valid; wire [`NUM_THREADS-1:0] mem_req_tmask; wire [`TEX_FILTER_BITS-1:0] mem_req_filter; - wire [`TEX_STRIDE_BITS-1:0] mem_req_stride; - wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] mem_req_blends; + wire [`TEX_LGSTRIDE_BITS-1:0] mem_req_lgstride; + wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] mem_req_blends; wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr; wire [REQ_INFOW_A-1:0] mem_req_info; wire mem_req_ready; @@ -113,16 +122,16 @@ module VX_tex_unit #( .req_format (tex_format[tex_req_if.unit]), .req_filter (tex_filter[tex_req_if.unit]), .req_wraps (tex_wraps[tex_req_if.unit]), - .req_baseaddr (tex_baddr[tex_req_if.unit]), + .req_baseaddr(tex_baddr[tex_req_if.unit]), .req_mipoff (sel_mipoff), - .req_logdims (sel_dims), + .req_logdims(sel_logdims), .req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), .req_ready (tex_req_if.ready), .rsp_valid (mem_req_valid), .rsp_tmask (mem_req_tmask), .rsp_filter (mem_req_filter), - .rsp_stride (mem_req_stride), + .rsp_lgstride(mem_req_lgstride), .rsp_addr (mem_req_addr), .rsp_blends (mem_req_blends), .rsp_info (mem_req_info), @@ -142,8 +151,8 @@ module VX_tex_unit #( .REQ_INFOW (REQ_INFOW_M), .NUM_REQS (`NUM_THREADS) ) tex_mem ( - .clk (clk), - .reset (reset), + .clk (clk), + .reset (reset), // memory interface .dcache_req_if (dcache_req_if), @@ -153,7 +162,7 @@ module VX_tex_unit #( .req_valid (mem_req_valid), .req_tmask (mem_req_tmask), .req_filter(mem_req_filter), - .req_stride(mem_req_stride), + .req_lgstride(mem_req_lgstride), .req_addr 
(mem_req_addr), .req_info ({mem_req_blends, mem_req_info}), .req_ready (mem_req_ready), @@ -168,7 +177,7 @@ module VX_tex_unit #( // apply sampler - wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends; + wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends; wire [`TEX_FORMAT_BITS-1:0] rsp_format; wire [REQ_INFOW_S-1:0] rsp_info; @@ -205,13 +214,12 @@ module VX_tex_unit #( for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin if (csrs_dirty[i]) begin dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_logwidth=%0h\n", $time, CORE_ID, i, tex_logdims[i][0]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_logheight=%0h\n", $time, CORE_ID, i, tex_logdims[i][1]); dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]); dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]); dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]); dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", $time, CORE_ID, i, tex_filter[i]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_mipoff[0]=%0h\n", $time, CORE_ID, i, tex_mipoff[i][0]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_width[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][0]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_height[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][1]); end end diff --git a/hw/rtl/tex_unit/VX_tex_wrap.sv b/hw/rtl/tex_unit/VX_tex_wrap.sv index 8cc7b2f5..fe2110ba 100644 --- a/hw/rtl/tex_unit/VX_tex_wrap.sv +++ b/hw/rtl/tex_unit/VX_tex_wrap.sv @@ -4,19 +4,19 @@ module VX_tex_wrap #( parameter CORE_ID = 0 ) ( input wire [`TEX_WRAP_BITS-1:0] wrap_i, - input wire [31:0] coord_i, - output wire [`FIXED_FRAC-1:0] coord_o + input wire [`TEX_FXD_BITS-1:0] coord_i, + output wire [`TEX_FXD_FRAC-1:0] coord_o ); `UNUSED_PARAM (CORE_ID) - reg [`FIXED_FRAC-1:0] coord_r; + reg [`TEX_FXD_FRAC-1:0] coord_r; - wire [`FIXED_FRAC-1:0] clamp; + wire 
[`TEX_FXD_FRAC-1:0] clamp; VX_tex_sat #( - .IN_W (32), - .OUT_W (`FIXED_FRAC) + .IN_W (`TEX_FXD_BITS), + .OUT_W (`TEX_FXD_FRAC) ) sat_fx ( .data_in (coord_i), .data_out (clamp) @@ -27,9 +27,9 @@ module VX_tex_wrap #( `TEX_WRAP_CLAMP: coord_r = clamp; `TEX_WRAP_MIRROR: - coord_r = coord_i[`FIXED_FRAC-1:0] ^ {`FIXED_FRAC{coord_i[`FIXED_FRAC]}}; + coord_r = coord_i[`TEX_FXD_FRAC-1:0] ^ {`TEX_FXD_FRAC{coord_i[`TEX_FXD_FRAC]}}; default: //`TEX_WRAP_REPEAT - coord_r = coord_i[`FIXED_FRAC-1:0]; + coord_r = coord_i[`TEX_FXD_FRAC-1:0]; endcase end diff --git a/hw/syn/opae/Makefile b/hw/syn/opae/Makefile index 010baea3..29b6a922 100644 --- a/hw/syn/opae/Makefile +++ b/hw/syn/opae/Makefile @@ -23,7 +23,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_FLAGS += $(DBG_TRACE_FLAGS) -DBG_FLAGS += -DDBG_CACHE_REQ_INFO CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) CONFIG2 := -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) diff --git a/runtime/Makefile b/runtime/Makefile index 60c3b398..c329e531 100644 --- a/runtime/Makefile +++ b/runtime/Makefile @@ -10,7 +10,7 @@ CFLAGS += -I./include -I../hw PROJECT = libvortexrt -SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c +SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c OBJS := $(addsuffix .o, $(notdir $(SRCS))) diff --git a/runtime/include/vx_intrinsics.h b/runtime/include/vx_intrinsics.h index abbca493..f3562872 100644 --- a/runtime/include/vx_intrinsics.h +++ b/runtime/include/vx_intrinsics.h @@ -5,62 +5,7 @@ #ifdef __cplusplus extern "C" { - #endif -#ifdef __ASSEMBLY__ -#define __ASM_STR(x) x -#else -#define __ASM_STR(x) #x -#endif - -#define vx_csr_swap(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" 
(__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_read(csr) ({ \ - register unsigned __v; \ - __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \ - __v; \ -}) - -#define vx_csr_write(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -#define vx_csr_read_set(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_set(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -#define vx_csr_read_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -// Texture load -#define vx_tex(unit, u, v, l) ({ \ - unsigned __r; \ - unsigned __u = u; \ - unsigned __v = v; \ - unsigned __l = l; \ - __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ - __r; \ -}) #ifdef __ASSEMBLY__ #define __ASM_STR(x) x @@ -68,72 +13,77 @@ extern "C" { #define __ASM_STR(x) #x #endif -#define vx_csr_swap(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_read(csr) ({ \ - register unsigned __v; \ - __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \ - __v; \ -}) - -#define vx_csr_write(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -#define vx_csr_read_set(csr, val) 
({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_set(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -#define vx_csr_read_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -// Texture load -#define vx_tex(unit, u, v, l) ({ \ - unsigned __r; \ - unsigned __u = u; \ - unsigned __v = v; \ - unsigned __l = l; \ - __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ +#define csr_read(csr) ({ \ + unsigned __r; \ + __asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \ __r; \ }) -// Lerp instruction -#define vx_lerp(a, b, s) ({ \ - unsigned __r; \ - unsigned __a = a; \ - unsigned __b = b; \ - unsigned __s = s; \ - __asm__ __volatile__ (".insn r4 0x6b, 7, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r : "r"(__a), "r"(__b), "r"(__s)); \ +#define csr_write(csr, val) ({ \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \ +}) + +#define csr_swap(csr, val) ({ \ + unsigned __r; \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ + __r; \ +}) + +#define csr_read_set(csr, val) ({ \ + unsigned __r; \ + unsigned __v = 
(unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ + __r; \ +}) + +#define csr_set(csr, val) ({ \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \ +}) + +#define csr_read_clear(csr, val) ({ \ + unsigned __r; \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ + __r; \ +}) + +#define csr_clear(csr, val) ({ \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \ +}) + +// Texture load +#define vx_tex(unit, u, v, lod) ({ \ + unsigned __r; \ + __asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(unit), "r"(u), "r"(v), "r"(lod)); \ __r; \ }) // Conditional move -#define vx_cmov(c, t, f) ({ \ +#define vx_cmov(c, t, f) ({ \ unsigned __r; \ - unsigned __c = c; \ - unsigned __t = t; \ - unsigned __f = f; \ - __asm__ __volatile__ (".insn r4 0x6b, 6, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r : "r"(__c), "r"(__t), "r"(__f)); \ + __asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r) : "r"(c), "r"(t), "r"(f)); \ __r; \ }) @@ -171,7 +121,7 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) { // Prefetch inline void vx_prefetch(unsigned addr) { - asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) ); + asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) ); } // Return active
warp's thread id diff --git a/runtime/src/tinyprintf.c b/runtime/src/tinyprintf.c new file mode 100644 index 00000000..4c88ef29 --- /dev/null +++ b/runtime/src/tinyprintf.c @@ -0,0 +1,890 @@ +/////////////////////////////////////////////////////////////////////////////// +// \author (c) Marco Paland (info@paland.com) +// 2014-2019, PALANDesign Hannover, Germany +// +// \license The MIT License (MIT) +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on +// embedded systems with a very limited resources. These routines are thread +// safe and reentrant! +// Use this instead of the bloated standard/newlib printf cause these use +// malloc for printf (and may not be thread safe). 
+// +/////////////////////////////////////////////////////////////////////////////// + +#include <stdbool.h> +#include <stdint.h> +#include "tinyprintf.h" +#include "vx_print.h" + + +// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...) to include the +// printf_config.h header file +// default: undefined +#ifdef PRINTF_INCLUDE_CONFIG_H +#include "printf_config.h" +#endif + + +// 'ntoa' conversion buffer size, this must be big enough to hold one converted +// numeric number including padded zeros (dynamically created on stack) +// default: 32 byte +#ifndef PRINTF_NTOA_BUFFER_SIZE +#define PRINTF_NTOA_BUFFER_SIZE 32U +#endif + +// 'ftoa' conversion buffer size, this must be big enough to hold one converted +// float number including padded zeros (dynamically created on stack) +// default: 32 byte +#ifndef PRINTF_FTOA_BUFFER_SIZE +#define PRINTF_FTOA_BUFFER_SIZE 32U +#endif + +// support for the floating point type (%f) +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_FLOAT +#define PRINTF_SUPPORT_FLOAT +#endif + +// support for exponential floating point notation (%e/%g) +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL +#define PRINTF_SUPPORT_EXPONENTIAL +#endif + +// define the default floating point precision +// default: 6 digits +#ifndef PRINTF_DEFAULT_FLOAT_PRECISION +#define PRINTF_DEFAULT_FLOAT_PRECISION 6U +#endif + +// define the largest float suitable to print with %f +// default: 1e9 +#ifndef PRINTF_MAX_FLOAT +#define PRINTF_MAX_FLOAT 1e9 +#endif + +// support for the long long types (%llu or %p) +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG +#define PRINTF_SUPPORT_LONG_LONG +#endif + +// support for the ptrdiff_t type (%t) +// ptrdiff_t is normally defined in <stddef.h> as long or long long type +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T +#define PRINTF_SUPPORT_PTRDIFF_T +#endif + +/////////////////////////////////////////////////////////////////////////////// + +// internal flag definitions +#define
FLAGS_ZEROPAD (1U << 0U) +#define FLAGS_LEFT (1U << 1U) +#define FLAGS_PLUS (1U << 2U) +#define FLAGS_SPACE (1U << 3U) +#define FLAGS_HASH (1U << 4U) +#define FLAGS_UPPERCASE (1U << 5U) +#define FLAGS_CHAR (1U << 6U) +#define FLAGS_SHORT (1U << 7U) +#define FLAGS_LONG (1U << 8U) +#define FLAGS_LONG_LONG (1U << 9U) +#define FLAGS_PRECISION (1U << 10U) +#define FLAGS_ADAPT_EXP (1U << 11U) + + +// import float.h for DBL_MAX +#if defined(PRINTF_SUPPORT_FLOAT) +#include <float.h> +#endif + + +// output function type +typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen); + + +// wrapper (used as buffer) for output function type +typedef struct { + void (*fct)(char character, void* arg); + void* arg; +} out_fct_wrap_type; + + +// internal buffer output +static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen) +{ + if (idx < maxlen) { + ((char*)buffer)[idx] = character; + } +} + + +// internal null output +static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen) +{ + (void)character; (void)buffer; (void)idx; (void)maxlen; +} + + +// internal _putchar wrapper +static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen) +{ + (void)buffer; (void)idx; (void)maxlen; + if (character) { + vx_putchar(character); + } +} + + +// internal output function wrapper +static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen) +{ + (void)idx; (void)maxlen; + if (character) { + // buffer is the output fct pointer + ((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg); + } +} + + +// internal secure strlen +// \return The length of the string (excluding the terminating 0) limited by 'maxsize' +static inline unsigned int _strnlen_s(const char* str, size_t maxsize) +{ + const char* s; + for (s = str; *s && maxsize--; ++s); + return (unsigned int)(s - str); +} + + +// internal test if char is a digit (0-9) +// \return true if char is a
digit +static inline bool _is_digit(char ch) +{ + return (ch >= '0') && (ch <= '9'); +} + + +// internal ASCII string to unsigned int conversion +static unsigned int _atoi(const char** str) +{ + unsigned int i = 0U; + while (_is_digit(**str)) { + i = i * 10U + (unsigned int)(*((*str)++) - '0'); + } + return i; +} + + +// output the specified string in reverse, taking care of any zero-padding +static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags) +{ + const size_t start_idx = idx; + + // pad spaces up to given width + if (!(flags & FLAGS_LEFT) && !(flags & FLAGS_ZEROPAD)) { + for (size_t i = len; i < width; i++) { + out(' ', buffer, idx++, maxlen); + } + } + + // reverse string + while (len) { + out(buf[--len], buffer, idx++, maxlen); + } + + // append pad spaces up to given width + if (flags & FLAGS_LEFT) { + while (idx - start_idx < width) { + out(' ', buffer, idx++, maxlen); + } + } + + return idx; +} + + +// internal itoa format +static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags) +{ + // pad leading zeros + if (!(flags & FLAGS_LEFT)) { + if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) { + width--; + } + while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = '0'; + } + while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = '0'; + } + } + + // handle hash + if (flags & FLAGS_HASH) { + if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) { + len--; + if (len && (base == 16U)) { + len--; + } + } + if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = 'x'; + } + else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) 
{ + buf[len++] = 'X'; + } + else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = 'b'; + } + if (len < PRINTF_NTOA_BUFFER_SIZE) { + buf[len++] = '0'; + } + } + + if (len < PRINTF_NTOA_BUFFER_SIZE) { + if (negative) { + buf[len++] = '-'; + } + else if (flags & FLAGS_PLUS) { + buf[len++] = '+'; // ignore the space if the '+' exists + } + else if (flags & FLAGS_SPACE) { + buf[len++] = ' '; + } + } + + return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags); +} + + +// internal itoa for 'long' type +static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags) +{ + char buf[PRINTF_NTOA_BUFFER_SIZE]; + size_t len = 0U; + + // no hash for 0 values + if (!value) { + flags &= ~FLAGS_HASH; + } + + // write if precision != 0 and value is != 0 + if (!(flags & FLAGS_PRECISION) || value) { + do { + const char digit = (char)(value % base); + buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10; + value /= base; + } while (value && (len < PRINTF_NTOA_BUFFER_SIZE)); + } + + return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags); +} + + +// internal itoa for 'long long' type +#if defined(PRINTF_SUPPORT_LONG_LONG) +static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags) +{ + char buf[PRINTF_NTOA_BUFFER_SIZE]; + size_t len = 0U; + + // no hash for 0 values + if (!value) { + flags &= ~FLAGS_HASH; + } + + // write if precision != 0 and value is != 0 + if (!(flags & FLAGS_PRECISION) || value) { + do { + const char digit = (char)(value % base); + buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 
'A' : 'a') + digit - 10; + value /= base; + } while (value && (len < PRINTF_NTOA_BUFFER_SIZE)); + } + + return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags); +} +#endif // PRINTF_SUPPORT_LONG_LONG + + +#if defined(PRINTF_SUPPORT_FLOAT) + +#if defined(PRINTF_SUPPORT_EXPONENTIAL) +// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT +static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags); +#endif + + +// internal ftoa for fixed decimal floating point +static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags) +{ + char buf[PRINTF_FTOA_BUFFER_SIZE]; + size_t len = 0U; + double diff = 0.0; + + // powers of 10 + static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 }; + + // test for special values + if (value != value) + return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags); + if (value < -DBL_MAX) + return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags); + if (value > DBL_MAX) + return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 
4U : 3U, width, flags); + + // test for very large values + // standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad + if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) { +#if defined(PRINTF_SUPPORT_EXPONENTIAL) + return _etoa(out, buffer, idx, maxlen, value, prec, width, flags); +#else + return 0U; +#endif + } + + // test for negative + bool negative = false; + if (value < 0) { + negative = true; + value = 0 - value; + } + + // set default precision, if not set explicitly + if (!(flags & FLAGS_PRECISION)) { + prec = PRINTF_DEFAULT_FLOAT_PRECISION; + } + // limit precision to 9, cause a prec >= 10 can lead to overflow errors + while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) { + buf[len++] = '0'; + prec--; + } + + int whole = (int)value; + double tmp = (value - whole) * pow10[prec]; + unsigned long frac = (unsigned long)tmp; + diff = tmp - frac; + + if (diff > 0.5) { + ++frac; + // handle rollover, e.g. 
case 0.99 with prec 1 is 1.0 + if (frac >= pow10[prec]) { + frac = 0; + ++whole; + } + } + else if (diff < 0.5) { + } + else if ((frac == 0U) || (frac & 1U)) { + // if halfway, round up if odd OR if last digit is 0 + ++frac; + } + + if (prec == 0U) { + diff = value - (double)whole; + if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) { + // exactly 0.5 and ODD, then round up + // 1.5 -> 2, but 2.5 -> 2 + ++whole; + } + } + else { + unsigned int count = prec; + // now do fractional part, as an unsigned number + while (len < PRINTF_FTOA_BUFFER_SIZE) { + --count; + buf[len++] = (char)(48U + (frac % 10U)); + if (!(frac /= 10U)) { + break; + } + } + // add extra 0s + while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) { + buf[len++] = '0'; + } + if (len < PRINTF_FTOA_BUFFER_SIZE) { + // add decimal + buf[len++] = '.'; + } + } + + // do whole part, number is reversed + while (len < PRINTF_FTOA_BUFFER_SIZE) { + buf[len++] = (char)(48 + (whole % 10)); + if (!(whole /= 10)) { + break; + } + } + + // pad leading zeros + if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) { + if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) { + width--; + } + while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) { + buf[len++] = '0'; + } + } + + if (len < PRINTF_FTOA_BUFFER_SIZE) { + if (negative) { + buf[len++] = '-'; + } + else if (flags & FLAGS_PLUS) { + buf[len++] = '+'; // ignore the space if the '+' exists + } + else if (flags & FLAGS_SPACE) { + buf[len++] = ' '; + } + } + + return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags); +} + + +#if defined(PRINTF_SUPPORT_EXPONENTIAL) +// internal ftoa variant for exponential floating-point type, contributed by Martijn Jasperse +static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags) +{ + // check for NaN and special values + if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) { + return _ftoa(out, 
buffer, idx, maxlen, value, prec, width, flags); + } + + // determine the sign + const bool negative = value < 0; + if (negative) { + value = -value; + } + + // default precision + if (!(flags & FLAGS_PRECISION)) { + prec = PRINTF_DEFAULT_FLOAT_PRECISION; + } + + // determine the decimal exponent + // based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c) + union { + uint64_t U; + double F; + } conv; + + conv.F = value; + int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023; // effectively log2 + conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U); // drop the exponent so conv.F is now in [1,2) + // now approximate log10 from the log2 integer part and an expansion of ln around 1.5 + int expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168); + // now we want to compute 10^expval but we want to be sure it won't overflow + exp2 = (int)(expval * 3.321928094887362 + 0.5); + const double z = expval * 2.302585092994046 - exp2 * 0.6931471805599453; + const double z2 = z * z; + conv.U = (uint64_t)(exp2 + 1023) << 52U; + // compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex + conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14))))); + // correct for rounding errors + if (value < conv.F) { + expval--; + conv.F /= 10; + } + + // the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters + unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U; + + // in "%g" mode, "prec" is the number of *significant figures* not decimals + if (flags & FLAGS_ADAPT_EXP) { + // do we want to fall-back to "%f" mode? 
+ if ((value >= 1e-4) && (value < 1e6)) { + if ((int)prec > expval) { + prec = (unsigned)((int)prec - expval - 1); + } + else { + prec = 0; + } + flags |= FLAGS_PRECISION; // make sure _ftoa respects precision + // no characters in exponent + minwidth = 0U; + expval = 0; + } + else { + // we use one sigfig for the whole part + if ((prec > 0) && (flags & FLAGS_PRECISION)) { + --prec; + } + } + } + + // will everything fit? + unsigned int fwidth = width; + if (width > minwidth) { + // we didn't fall-back so subtract the characters required for the exponent + fwidth -= minwidth; + } else { + // not enough characters, so go back to default sizing + fwidth = 0U; + } + if ((flags & FLAGS_LEFT) && minwidth) { + // if we're padding on the right, DON'T pad the floating part + fwidth = 0U; + } + + // rescale the float value + if (expval) { + value /= conv.F; + } + + // output the floating part + const size_t start_idx = idx; + idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP); + + // output the exponent part + if (minwidth) { + // output the exponential symbol + out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen); + // output the exponent value + idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS); + // might need to right-pad spaces + if (flags & FLAGS_LEFT) { + while (idx - start_idx < width) out(' ', buffer, idx++, maxlen); + } + } + return idx; +} +#endif // PRINTF_SUPPORT_EXPONENTIAL +#endif // PRINTF_SUPPORT_FLOAT + + +// internal vsnprintf +static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va) { + unsigned int flags, width, precision, n; + size_t idx = 0U; + + if (!buffer) { + // use null output function + out = _out_null; + } + + while (*format) + { + // format specifier? 
%[flags][width][.precision][length] + if (*format != '%') { + // no + out(*format, buffer, idx++, maxlen); + format++; + continue; + } + else { + // yes, evaluate it + format++; + } + + // evaluate flags + flags = 0U; + do { + switch (*format) { + case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break; + case '-': flags |= FLAGS_LEFT; format++; n = 1U; break; + case '+': flags |= FLAGS_PLUS; format++; n = 1U; break; + case ' ': flags |= FLAGS_SPACE; format++; n = 1U; break; + case '#': flags |= FLAGS_HASH; format++; n = 1U; break; + default : n = 0U; break; + } + } while (n); + + // evaluate width field + width = 0U; + if (_is_digit(*format)) { + width = _atoi(&format); + } + else if (*format == '*') { + const int w = va_arg(va, int); + if (w < 0) { + flags |= FLAGS_LEFT; // reverse padding + width = (unsigned int)-w; + } + else { + width = (unsigned int)w; + } + format++; + } + + // evaluate precision field + precision = 0U; + if (*format == '.') { + flags |= FLAGS_PRECISION; + format++; + if (_is_digit(*format)) { + precision = _atoi(&format); + } + else if (*format == '*') { + const int prec = (int)va_arg(va, int); + precision = prec > 0 ? (unsigned int)prec : 0U; + format++; + } + } + + // evaluate length field + switch (*format) { + case 'l' : + flags |= FLAGS_LONG; + format++; + if (*format == 'l') { + flags |= FLAGS_LONG_LONG; + format++; + } + break; + case 'h' : + flags |= FLAGS_SHORT; + format++; + if (*format == 'h') { + flags |= FLAGS_CHAR; + format++; + } + break; +#if defined(PRINTF_SUPPORT_PTRDIFF_T) + case 't' : + flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG); + format++; + break; +#endif + case 'j' : + flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG); + format++; + break; + case 'z' : + flags |= (sizeof(size_t) == sizeof(long) ? 
FLAGS_LONG : FLAGS_LONG_LONG); + format++; + break; + default : + break; + } + + // evaluate specifier + switch (*format) { + case 'd' : + case 'i' : + case 'u' : + case 'x' : + case 'X' : + case 'o' : + case 'b' : { + // set the base + unsigned int base; + if (*format == 'x' || *format == 'X') { + base = 16U; + } + else if (*format == 'o') { + base = 8U; + } + else if (*format == 'b') { + base = 2U; + } + else { + base = 10U; + flags &= ~FLAGS_HASH; // no hash for dec format + } + // uppercase + if (*format == 'X') { + flags |= FLAGS_UPPERCASE; + } + + // no plus or space flag for u, x, X, o, b + if ((*format != 'i') && (*format != 'd')) { + flags &= ~(FLAGS_PLUS | FLAGS_SPACE); + } + + // ignore '0' flag when precision is given + if (flags & FLAGS_PRECISION) { + flags &= ~FLAGS_ZEROPAD; + } + + // convert the integer + if ((*format == 'i') || (*format == 'd')) { + // signed + if (flags & FLAGS_LONG_LONG) { +#if defined(PRINTF_SUPPORT_LONG_LONG) + const long long value = va_arg(va, long long); + idx = _ntoa_long_long(out, buffer, idx, maxlen, (unsigned long long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags); +#endif + } + else if (flags & FLAGS_LONG) { + const long value = va_arg(va, long); + idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags); + } + else { + const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int); + idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned int)(value > 0 ? 
value : 0 - value), value < 0, base, precision, width, flags); + } + } + else { + // unsigned + if (flags & FLAGS_LONG_LONG) { +#if defined(PRINTF_SUPPORT_LONG_LONG) + idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags); +#endif + } + else if (flags & FLAGS_LONG) { + idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags); + } + else { + const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int); + idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags); + } + } + format++; + break; + } +#if defined(PRINTF_SUPPORT_FLOAT) + case 'f' : + case 'F' : + if (*format == 'F') flags |= FLAGS_UPPERCASE; + idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags); + format++; + break; +#if defined(PRINTF_SUPPORT_EXPONENTIAL) + case 'e': + case 'E': + case 'g': + case 'G': + if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP; + if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE; + idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags); + format++; + break; +#endif // PRINTF_SUPPORT_EXPONENTIAL +#endif // PRINTF_SUPPORT_FLOAT + case 'c' : { + unsigned int l = 1U; + // pre padding + if (!(flags & FLAGS_LEFT)) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + // char output + out((char)va_arg(va, int), buffer, idx++, maxlen); + // post padding + if (flags & FLAGS_LEFT) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + format++; + break; + } + + case 's' : { + const char* p = va_arg(va, char*); + unsigned int l = _strnlen_s(p, precision ? precision : (size_t)-1); + // pre padding + if (flags & FLAGS_PRECISION) { + l = (l < precision ? 
l : precision); + } + if (!(flags & FLAGS_LEFT)) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + // string output + while ((*p != 0) && (!(flags & FLAGS_PRECISION) || precision--)) { + out(*(p++), buffer, idx++, maxlen); + } + // post padding + if (flags & FLAGS_LEFT) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + format++; + break; + } + + case 'p' : { + width = sizeof(void*) * 2U; + flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE; +#if defined(PRINTF_SUPPORT_LONG_LONG) + const bool is_ll = sizeof(uintptr_t) == sizeof(long long); + if (is_ll) { + idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags); + } + else { +#endif + idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags); +#if defined(PRINTF_SUPPORT_LONG_LONG) + } +#endif + format++; + break; + } + + case '%' : + out('%', buffer, idx++, maxlen); + format++; + break; + + default : + out(*format, buffer, idx++, maxlen); + format++; + break; + } + } + + // termination + out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen); + + // return written chars without terminating \0 + return (int)idx; +} + +int tiny_printf(const char* format, ...) { + va_list va; + va_start(va, format); + char buffer[1]; + const int ret = _vsnprintf(_out_char, buffer, (size_t)-1, format, va); + va_end(va); + return ret; +} + +int tiny_sprintf(char* buffer, const char* format, ...) { + va_list va; + va_start(va, format); + const int ret = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, va); + va_end(va); + return ret; +} + +int tiny_snprintf(char* buffer, size_t count, const char* format, ...) 
{ + va_list va; + va_start(va, format); + const int ret = _vsnprintf(_out_buffer, buffer, count, format, va); + va_end(va); + return ret; +} + +int tiny_vprintf(const char* format, va_list va) { + char buffer[1]; + return _vsnprintf(_out_char, buffer, (size_t)-1, format, va); +} + +int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va) { + return _vsnprintf(_out_buffer, buffer, count, format, va); +} \ No newline at end of file diff --git a/runtime/src/tinyprintf.h b/runtime/src/tinyprintf.h new file mode 100644 index 00000000..9aa79d9a --- /dev/null +++ b/runtime/src/tinyprintf.h @@ -0,0 +1,86 @@ +/////////////////////////////////////////////////////////////////////////////// +// \author (c) Marco Paland (info@paland.com) +// 2014-2019, PALANDesign Hannover, Germany +// +// \license The MIT License (MIT) +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +// \brief Tiny printf, sprintf and snprintf implementation, optimized for speed on +// embedded systems with very limited resources. +// Use this instead of bloated standard/newlib printf. +// These routines are thread safe and reentrant. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _TINYPRINTF_H_ +#define _TINYPRINTF_H_ + +#include <stdarg.h> +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Tiny printf implementation + * You have to implement _putchar if you use printf() + * To avoid conflicts with the regular printf() API, the functions are exported under + * tiny_-prefixed names (tiny_printf(), tiny_sprintf(), ...) rather than overriding printf() via macros + * \param format A string that specifies the format of the output + * \return The number of characters that are written, not counting the terminating null character + */ +int tiny_printf(const char* format, ...); + +/** + * Tiny sprintf implementation + * Due to security reasons (buffer overflow) YOU SHOULD CONSIDER USING (V)SNPRINTF INSTEAD! + * \param buffer A pointer to the buffer where to store the formatted string. MUST be big enough to store the output! + * \param format A string that specifies the format of the output + * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character + */ +int tiny_sprintf(char* buffer, const char* format, ...); + +/** + * Tiny snprintf/vsnprintf implementation + * \param buffer A pointer to the buffer where to store the formatted string + * \param count The maximum number of characters to store in the buffer, including a terminating null character + * \param format A string that specifies the format of the output + * \param va A value identifying a variable arguments list + * \return The number of characters that COULD have been written into the buffer, not counting the terminating + * null character. A value equal or larger than count indicates truncation. 
Only when the returned value + * is non-negative and less than count, the string has been completely written. + */ +int tiny_snprintf(char* buffer, size_t count, const char* format, ...); +int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va); + +/** + * Tiny vprintf implementation + * \param format A string that specifies the format of the output + * \param va A value identifying a variable arguments list + * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character + */ +int tiny_vprintf(const char* format, va_list va); + +#ifdef __cplusplus +} +#endif + +#endif // _TINYPRINTF_H_ \ No newline at end of file diff --git a/runtime/src/vx_perf.c b/runtime/src/vx_perf.c index edfecdeb..0fe74375 100644 --- a/runtime/src/vx_perf.c +++ b/runtime/src/vx_perf.c @@ -4,10 +4,10 @@ #include #define DUMP_CSR_4(d, s) \ - csr_mem[d + 0] = vx_csr_read(s + 0); \ - csr_mem[d + 1] = vx_csr_read(s + 1); \ - csr_mem[d + 2] = vx_csr_read(s + 2); \ - csr_mem[d + 3] = vx_csr_read(s + 3); + csr_mem[d + 0] = csr_read(s + 0); \ + csr_mem[d + 1] = csr_read(s + 1); \ + csr_mem[d + 2] = csr_read(s + 2); \ + csr_mem[d + 3] = csr_read(s + 3); #define DUMP_CSR_32(d, s) \ DUMP_CSR_4(d + 0, s + 0) \ diff --git a/runtime/src/vx_print.c b/runtime/src/vx_print.c index 86458644..e75993e2 100644 --- a/runtime/src/vx_print.c +++ b/runtime/src/vx_print.c @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include "tinyprintf.h" #ifdef __cplusplus extern "C" { @@ -26,46 +28,19 @@ typedef struct { int precision; } putfloat_arg_t; -static void __printf_cb(printf_arg_t* arg) { - arg->ret = vprintf(arg->format, *arg->va); -} - -int vx_vprintf(const char* format, va_list va) { - printf_arg_t arg; - arg.format = format; - arg.va = &va; - vx_serial((vx_serial_cb)__printf_cb, &arg); - return arg.ret; -} - -int vx_printf(const char * format, ...) 
{ - int ret; - va_list va; - va_start(va, format); - ret = vx_vprintf(format, va); - va_end(va); - return ret; -} - -static void __putint_cb(const putint_arg_t* arg) { +static void __putint_cb(const putint_arg_t* arg) { char tmp[33]; float value = arg->value; int base = arg->base; itoa(value, tmp, base); for (int i = 0; i < 33; ++i) { int c = tmp[i]; - if (!c) break; + if (!c) + break; vx_putchar(c); } } -void vx_putint(int value, int base) { - putint_arg_t arg; - arg.value = value; - arg.base = base; - vx_serial((vx_serial_cb)__putint_cb, &arg); -} - static void __putfloat_cb(const putfloat_arg_t* arg) { float value = arg->value; int precision = arg->precision; @@ -79,6 +54,17 @@ static void __putfloat_cb(const putfloat_arg_t* arg) { } } +static void __vprintf_cb(printf_arg_t* arg) { + arg->ret = tiny_vprintf(arg->format, *arg->va); +} + +void vx_putint(int value, int base) { + putint_arg_t arg; + arg.value = value; + arg.base = base; + vx_serial((vx_serial_cb)__putint_cb, &arg); +} + void vx_putfloat(float value, int precision) { putfloat_arg_t arg; arg.value = value; @@ -86,6 +72,23 @@ void vx_putfloat(float value, int precision) { vx_serial((vx_serial_cb)__putfloat_cb, &arg); } +int vx_vprintf(const char* format, va_list va) { + printf_arg_t arg; + arg.format = format; + arg.va = &va; + vx_serial((vx_serial_cb)__vprintf_cb, &arg); + return arg.ret; +} + +int vx_printf(const char * format, ...) 
{ + int ret; + va_list va; + va_start(va, format); + ret = vx_vprintf(format, va); + va_end(va); + return ret; +} + #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/runtime/src/vx_syscalls.c b/runtime/src/vx_syscalls.c index 37d60b8d..37e4d193 100644 --- a/runtime/src/vx_syscalls.c +++ b/runtime/src/vx_syscalls.c @@ -16,7 +16,10 @@ int _open(const char *name, int flags, int mode) { return -1; } int _read(int file, char *ptr, int len) { return -1; } -caddr_t _sbrk(int incr) { return 0; } +caddr_t _sbrk(int incr) { + __asm__ __volatile__("ebreak"); + return 0; +} int _write(int file, char *ptr, int len) { int i; diff --git a/sim/common/bitmanip.h b/sim/common/bitmanip.h new file mode 100644 index 00000000..f485cd6d --- /dev/null +++ b/sim/common/bitmanip.h @@ -0,0 +1,79 @@ +#pragma once + +#include +#include +#include + +constexpr uint32_t count_leading_zeros(uint32_t value) { + return value ? __builtin_clz(value) : 32; +} + +constexpr uint32_t count_trailing_zeros(uint32_t value) { + return value ? 
__builtin_ctz(value) : 32; +} + +constexpr bool ispow2(uint32_t value) { + return value && !(value & (value - 1)); +} + +constexpr uint32_t log2ceil(uint32_t value) { + return 32 - count_leading_zeros(value - 1); +} + +inline unsigned log2up(uint32_t value) { + return std::max<uint32_t>(1, log2ceil(value)); +} + +constexpr unsigned log2floor(uint32_t value) { + return 31 - count_leading_zeros(value); +} + +constexpr unsigned ceil2(uint32_t value) { + return 32 - count_leading_zeros(value); +} + +inline uint64_t bit_clr(uint64_t bits, uint32_t index) { + assert(index <= 63); + return bits & ~(1ull << index); +} + +inline uint64_t bit_set(uint64_t bits, uint32_t index) { + assert(index <= 63); + return bits | (1ull << index); +} + +inline bool bit_get(uint64_t bits, uint32_t index) { + assert(index <= 63); + return (bits >> index) & 0x1; +} + +inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + uint64_t mask = (0xffffffffffffffff << (shift + start)) >> shift; + return bits & ~mask; +} + +inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + uint64_t dirty = (value << (shift + start)) >> shift; + return bit_clrw(bits, start, end) | dirty; +} + +inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + return (bits << shift) >> (shift + start); +} + +// Apply integer sign extension +inline uint32_t sext32(uint32_t word, uint32_t width) { + assert(width > 1); + assert(width <= 32); + uint32_t mask = (width < 32) ? ((1u << width) - 1) : ~0u; // shifting by 32 is undefined; width == 32 needs an all-ones mask + return ((word >> (width - 1)) & 0x1) ? 
(word | ~mask) : word; +} \ No newline at end of file diff --git a/sim/common/fixed.h b/sim/common/fixed.h new file mode 100644 index 00000000..8ef60d9a --- /dev/null +++ b/sim/common/fixed.h @@ -0,0 +1,419 @@ +#pragma once + +#include +#include +#include + +template +class Fixed { +private: + + template + struct Cast { + private: + template struct Tag {}; + + inline static T Convert(T2 value, Tag) { + return static_cast(value) << (F - F2); + } + + inline static T Convert(T2 value, Tag) { + return static_cast(value) >> (F2 - F); + } + + inline static T Convert(T2 value, Tag) { + return static_cast(value << (F - F2)); + } + + inline static T Convert(T2 value, Tag) { + return static_cast(value >> (F2 - F)); + } + + public: + inline static T Convert(T2 value) { + return Convert(value, Tag<(sizeof(T2) > sizeof(T)), (F2 > F)>{}); + } + }; + +public: + using data_type = T; + + static constexpr uint32_t FRAC = F; + static constexpr uint32_t INT = sizeof(T) * 8 - FRAC; + static constexpr uint32_t HFRAC = FRAC >> 1; + static constexpr T ONE = static_cast(1) << FRAC; + static constexpr T MASK = ONE - 1; + static constexpr T IMASK = ~MASK; + static constexpr T HALF = ONE >> 1; + static constexpr T TWO = ONE << 1; + + Fixed() {} + + explicit Fixed(int64_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(uint64_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(int32_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(uint32_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(int16_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(uint16_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(int8_t rhs) + 
: data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(uint8_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + template + explicit Fixed(Fixed rhs) + : data_(Cast::Convert(rhs.data())) + {} + + explicit Fixed(float rhs) + : data_(static_cast(rhs * ONE)) { + assert(data_ == static_cast(rhs * ONE)); + } + + bool operator==(Fixed rhs) const { + return (data_ == rhs.data_); + } + + bool operator!=(Fixed rhs) const { + return (data_ != rhs.data_); + } + + bool operator<(Fixed rhs) const { + return (data_ < rhs.data_); + } + + bool operator<=(Fixed rhs) const { + return (data_ <= rhs.data_); + } + + bool operator>(Fixed rhs) const { + return (data_ > rhs.data_); + } + + bool operator>=(Fixed rhs) const { + return (data_ >= rhs.data_); + } + + Fixed operator-() const { + return make(-data_); + } + + Fixed operator+=(Fixed rhs) { + *this = (*this) + rhs; + return *this; + } + + Fixed operator-=(Fixed rhs) { + *this = (*this) - rhs; + return *this; + } + + Fixed operator*=(Fixed rhs) { + *this = (*this) * rhs; + return *this; + } + + Fixed operator/=(Fixed rhs) { + *this = (*this) / rhs; + return *this; + } + + template + Fixed operator*=(Fixed rhs) { + *this = (*this) * rhs; + return *this; + } + + template + Fixed operator/=(Fixed rhs) { + *this = (*this) / rhs; + return *this; + } + + Fixed operator*=(int32_t rhs) { + *this = (*this) * rhs; + return *this; + } + + Fixed operator*=(uint32_t rhs) { + *this = (*this) * rhs; + return *this; + } + + Fixed operator*=(float rhs) { + *this = (*this) * rhs; + return *this; + } + + Fixed operator/=(int32_t rhs) { + *this = (*this) / rhs; + return *this; + } + + Fixed operator/=(uint32_t rhs) { + *this = (*this) / rhs; + return *this; + } + + Fixed operator/=(float rhs) { + *this = (*this) / rhs; + return *this; + } + + friend Fixed operator+(Fixed lhs, Fixed rhs) { + assert((static_cast(lhs.data_) + rhs.data_) == + (lhs.data_ + 
rhs.data_)); + return Fixed::make(lhs.data_ + rhs.data_); + } + + friend Fixed operator-(Fixed lhs, Fixed rhs) { + assert((static_cast(lhs.data_) - rhs.data_) == + (lhs.data_ - rhs.data_)); + return Fixed::make(lhs.data_ - rhs.data_); + } + + friend Fixed operator*(Fixed lhs, Fixed rhs) { + return Fixed::make((static_cast(lhs.data_) * rhs.data_) >> FRAC); + } + + template + friend Fixed operator*(Fixed lhs, Fixed rhs) { + return Fixed::make((static_cast(lhs.data_) * rhs.data()) >> F2); + } + + friend Fixed operator/(Fixed lhs, Fixed rhs) { + assert(rhs.data_ != 0); + return Fixed::make((static_cast(lhs.data_) << FRAC) / rhs.data_); + } + + template + friend Fixed operator/(Fixed lhs, Fixed rhs) { + assert(rhs.data() != 0); + return Fixed::make((static_cast(lhs.data_) << F2) / rhs.data()); + } + + friend Fixed operator*(Fixed lhs, float rhs) { + return static_cast(lhs) * rhs; + } + + friend Fixed operator*(float lhs, Fixed rhs) { + return lhs * static_cast(rhs); + } + + friend Fixed operator/(Fixed lhs, float rhs) { + return static_cast(lhs) / rhs; + } + + friend Fixed operator/(float lhs, Fixed rhs) { + return lhs / static_cast(rhs); + } + + friend Fixed operator*(Fixed lhs, char rhs) { + return lhs * static_cast(rhs); + } + + friend Fixed operator*(char lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed operator/(Fixed lhs, char rhs) { + return lhs / static_cast(rhs); + } + + friend Fixed operator/(char lhs, Fixed rhs) { + return Fixed(lhs) / rhs; // division is not commutative: promote lhs to Fixed, then divide + } + + friend Fixed operator*(Fixed lhs, uint8_t rhs) { + return lhs * static_cast(rhs); + } + + friend Fixed operator*(uint8_t lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed operator/(Fixed lhs, uint8_t rhs) { + return lhs / static_cast(rhs); + } + + friend Fixed operator/(uint8_t lhs, Fixed rhs) { + return Fixed(lhs) / rhs; // division is not commutative: promote lhs to Fixed, then divide + } + + friend Fixed operator*(Fixed lhs, short rhs) { + return lhs * static_cast(rhs); + } + + friend Fixed operator*(short lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed 
operator/(Fixed lhs, short rhs) { + return lhs / static_cast(rhs); + } + + friend Fixed operator/(short lhs, Fixed rhs) { + return Fixed(lhs) / rhs; // division is not commutative: promote lhs to Fixed, then divide + } + + friend Fixed operator*(Fixed lhs, uint16_t rhs) { + return lhs * static_cast(rhs); + } + + friend Fixed operator*(uint16_t lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed operator/(Fixed lhs, uint16_t rhs) { + return lhs / static_cast(rhs); + } + + friend Fixed operator/(uint16_t lhs, Fixed rhs) { + return Fixed(lhs) / rhs; // division is not commutative: promote lhs to Fixed, then divide + } + + friend Fixed operator*(Fixed lhs, int32_t rhs) { + auto value = static_cast(lhs.data_ * rhs); + assert((lhs.data_ * static_cast(rhs)) == value); + return Fixed::make(value); + } + + friend Fixed operator*(int32_t lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed operator/(Fixed lhs, int32_t rhs) { + assert(rhs); + auto value = static_cast(lhs.data_ / rhs); + return Fixed::make(value); + } + + friend Fixed operator/(int32_t lhs, Fixed rhs) { + return Fixed(lhs) / rhs; // division is not commutative: promote lhs to Fixed, then divide + } + + friend Fixed operator*(Fixed lhs, uint32_t rhs) { + auto value = static_cast(lhs.data_ * rhs); // scale by rhs (a shift here would multiply by 2^rhs) + assert((lhs.data_ * static_cast(rhs)) == value); + return Fixed::make(value); + } + + friend Fixed operator*(uint32_t lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed operator/(Fixed lhs, uint32_t rhs) { + assert(rhs); + auto value = static_cast(lhs.data_ / rhs); + return Fixed::make(value); + } + + friend Fixed operator/(uint32_t lhs, Fixed rhs) { + return Fixed(lhs) / rhs; // division is not commutative: promote lhs to Fixed, then divide + } + + friend Fixed operator<<(Fixed lhs, int32_t rhs) { + auto value = static_cast(lhs.data_ << rhs); + assert((lhs.data_ << static_cast(rhs)) == value); + return Fixed::make(value); + } + + friend Fixed operator>>(Fixed lhs, int32_t rhs) { + auto value = static_cast(lhs.data_ >> rhs); + return Fixed::make(value); + } + + friend Fixed operator<<(Fixed lhs, uint32_t rhs) { + auto value = static_cast(lhs.data_ << rhs); + assert((lhs.data_ << static_cast(rhs)) == value); + return Fixed::make(value); + } + + friend Fixed operator>>(Fixed lhs, 
uint32_t rhs) { + auto value = static_cast(lhs.data_ >> rhs); + return Fixed::make(value); + } + + static Fixed make(T value) { + Fixed ret; + ret.data_ = value; + return ret; + } + + explicit operator int64_t() const { + return static_cast(data_ >> F); + } + + explicit operator uint64_t() const { + return static_cast(data_ >> F); + } + + explicit operator int32_t() const { + return static_cast(data_ >> F); + } + + explicit operator uint32_t() const { + return static_cast(data_ >> F); + } + + explicit operator int16_t() const { + return static_cast(data_ >> F); + } + + explicit operator uint16_t() const { + return static_cast(data_ >> F); + } + + explicit operator int8_t() const { + return static_cast(data_ >> F); + } + + explicit operator uint8_t() const { + return static_cast(data_ >> F); + } + + template + explicit operator Fixed() const { + return Fixed(*this); + } + + explicit operator float() const { + return static_cast(data_) / (static_cast(1) << F); + } + + T data() const { + return data_; + } + +private: + T data_; +}; \ No newline at end of file diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 52c74643..369a3503 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -5,10 +5,9 @@ #include #include #include +#include #include -namespace vortex { - class SimObjectBase; /////////////////////////////////////////////////////////////////////////////// @@ -59,32 +58,44 @@ protected: template class SimPort : public SimPortBase { public: - void send(const Pkt& pkt, uint64_t delay) const; + void send(const Pkt& pkt, uint64_t delay) const; - bool read(Pkt* out) { - if (!valid_) - return false; - *out = data_; - valid_ = false; - return true; + void bind(SimPort* peer) { + this->connect(peer); } + void unbind() { + this->disconnect(); + } + + bool empty() const { + return queue_.empty(); + } + + const Pkt& top() const { + return queue_.front(); + } + + Pkt& top() { + return queue_.front(); + } + + void pop() { + queue_.pop(); + } + 
protected: SimPort(SimObjectBase* module) : SimPortBase(module) - , valid_(false) {} - void write(const Pkt& data) { - assert(!valid_); - data_ = data; - valid_ = true; + void push(const Pkt& data) { + queue_.push(data); } SimPort& operator=(const SimPort&) = delete; - Pkt data_; - bool valid_; + std::queue queue_; template friend class SimPortEvent; }; @@ -94,15 +105,7 @@ protected: template class SlavePort : public SimPort { public: - SlavePort(SimObjectBase* module) : SimPort(module) {} - - void bind(SlavePort* peer) { - this->connect(peer); - } - - void unbind() { - this->disconnect(); - } + SlavePort(SimObjectBase* module) : SimPort(module) {} protected: SlavePort& operator=(const SlavePort&) = delete; @@ -115,18 +118,6 @@ class MasterPort : public SimPort { public: MasterPort(SimObjectBase* module) : SimPort(module) {} - void bind(SlavePort* peer) { - this->connect(peer); - } - - void bind(MasterPort* peer) { - this->connect(peer); - } - - void unbind() { - this->disconnect(); - } - protected: MasterPort& operator=(const MasterPort&) = delete; }; @@ -194,7 +185,7 @@ public: {} void fire() const override { - const_cast*>(port_)->write(pkt_); + const_cast*>(port_)->push(pkt_); } private: @@ -382,6 +373,4 @@ template void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) { auto callback = std::bind(entry, obj, std::placeholders::_1); SimPlatform::instance().schedule(callback, pkt, delay); -} - } \ No newline at end of file diff --git a/sim/common/texturing.h b/sim/common/texturing.h new file mode 100644 index 00000000..8d76519e --- /dev/null +++ b/sim/common/texturing.h @@ -0,0 +1,221 @@ +#pragma once + +#include +#include +#include +#include + +enum class WrapMode { + Clamp, + Repeat, + Mirror, +}; + +enum class TexFormat { + R8G8B8A8, + R5G6B5, + R4G4B4A4, + L8A8, + L8, + A8, +}; + +template +T Clamp(Fixed fx, WrapMode mode) { + switch (mode) { + case WrapMode::Clamp: return (fx.data() < 0) ? 
0 : ((fx.data() > Fixed::MASK) ? Fixed::MASK : fx.data()); + case WrapMode::Repeat: return (fx.data() & Fixed::MASK); + case WrapMode::Mirror: return (bit_get(fx.data(), Fixed::FRAC) ? ~fx.data() : fx.data()); + default: + std::abort(); + return 0; + } +} + +inline uint32_t Stride(TexFormat format) { + switch (format) { + case TexFormat::R8G8B8A8: + return 4; + case TexFormat::R5G6B5: + case TexFormat::R4G4B4A4: + case TexFormat::L8A8: + return 2; + case TexFormat::L8: + case TexFormat::A8: + return 1; + default: + std::abort(); + return 0; + } +} + +inline void Unpack8888(TexFormat format, + uint32_t texel, + uint32_t* lo, + uint32_t* hi) { + switch (format) { + case TexFormat::R8G8B8A8: + *lo = texel & 0x00ff00ff; + *hi = (texel >> 8) & 0x00ff00ff; + break; + case TexFormat::R5G6B5: + case TexFormat::R4G4B4A4: + *lo = texel; + *hi= 0; + break; + case TexFormat::L8A8: + *lo = (texel | (texel << 8)) & 0x00ff00ff; + *hi = 0; + break; + case TexFormat::L8: + *lo = (texel | (texel << 16)) & 0x07e0f81f; + *hi = 0; + break; + case TexFormat::A8: + *lo = (texel | (texel << 12)) & 0x0f0f0f0f; + *hi = 0; + break; + default: + std::abort(); + } +} + +inline uint32_t Pack8888(TexFormat format, uint32_t lo, uint32_t hi) { + switch (format) { + case TexFormat::R8G8B8A8: + return (hi << 8) | lo; + case TexFormat::R5G6B5: + case TexFormat::R4G4B4A4: + return lo; + case TexFormat::L8A8: + return (lo | (lo >> 8)) & 0xffff; + case TexFormat::L8: + return (lo | (lo >> 16)) & 0xffff; + case TexFormat::A8: + return (lo | (lo >> 12)) & 0xffff; + default: + std::abort(); + return 0; + } +} + +inline void Lerp8888(uint32_t al, + uint32_t ah, + uint32_t bl, + uint32_t bh, + uint32_t frac, + uint32_t* lo, + uint32_t* hi) { + *lo = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; + *hi = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; +} + +template +void TexAddressLinear(Fixed fu, + Fixed fv, + uint32_t log_width, + uint32_t log_height, + WrapMode wrapu, + WrapMode wrapv, + uint32_t* addr00, 
+ uint32_t* addr01, + uint32_t* addr10, + uint32_t* addr11, + uint32_t* alpha, + uint32_t* beta +) { + auto delta_x = Fixed::make(Fixed::HALF >> log_width); + auto delta_y = Fixed::make(Fixed::HALF >> log_height); + + uint32_t u0 = Clamp(fu - delta_x, wrapu); + uint32_t u1 = Clamp(fu + delta_x, wrapu); + uint32_t v0 = Clamp(fv - delta_y, wrapv); + uint32_t v1 = Clamp(fv + delta_y, wrapv); + + uint32_t shift_u = (Fixed::FRAC - log_width); + uint32_t shift_v = (Fixed::FRAC - log_height); + + uint32_t x0s = (u0 << 8) >> shift_u; + uint32_t y0s = (v0 << 8) >> shift_v; + + uint32_t x0 = x0s >> 8; + uint32_t y0 = y0s >> 8; + uint32_t x1 = u1 >> shift_u; + uint32_t y1 = v1 >> shift_v; + + *addr00 = x0 + (y0 << log_width); + *addr01 = x1 + (y0 << log_width); + *addr10 = x0 + (y1 << log_width); + *addr11 = x1 + (y1 << log_width); + + *alpha = x0s & 0xff; + *beta = y0s & 0xff; + + //printf("*** fu=0x%x, fv=0x%x, u0=0x%x, u1=0x%x, v0=0x%x, v1=0x%x, x0=0x%x, x1=0x%x, y0=0x%x, y1=0x%x, addr00=0x%x, addr01=0x%x, addr10=0x%x, addr11=0x%x\n", fu.data(), fv.data(), u0, u1, v0, v1, x0, x1, y0, y1, *addr00, *addr01, *addr10, *addr11); +} + +template +void TexAddressPoint(Fixed fu, + Fixed fv, + uint32_t log_width, + uint32_t log_height, + WrapMode wrapu, + WrapMode wrapv, + uint32_t* addr +) { + uint32_t u = Clamp(fu, wrapu); + uint32_t v = Clamp(fv, wrapv); + + uint32_t x = u >> (Fixed::FRAC - log_width); + uint32_t y = v >> (Fixed::FRAC - log_height); + + *addr = x + (y << log_width); + + //printf("*** fu=0x%x, fv=0x%x, u=0x%x, v=0x%x, x=0x%x, y=0x%x, addr=0x%x\n", fu.data(), fv.data(), u, v, x, y, *addr); +} + +inline uint32_t TexFilterLinear( + TexFormat format, + uint32_t texel00, + uint32_t texel01, + uint32_t texel10, + uint32_t texel11, + uint32_t alpha, + uint32_t beta +) { + uint32_t c01l, c01h; + { + uint32_t c0l, c0h; + uint32_t c1l, c1h; + Unpack8888(format, texel00, &c0l, &c0h); + Unpack8888(format, texel01, &c1l, &c1h); + Lerp8888(c0l, c0h, c1l, c1h, alpha, &c01l, 
&c01h); + } + + uint32_t c23l, c23h; + { + uint32_t c2l, c2h; + uint32_t c3l, c3h; + Unpack8888(format, texel10, &c2l, &c2h); + Unpack8888(format, texel11, &c3l, &c3h); + Lerp8888(c2l, c2h, c3l, c3h, alpha, &c23l, &c23h); + } + + uint32_t cl, ch; + Lerp8888(c01l, c01h, c23l, c23h, beta, &cl, &ch); + uint32_t color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + + //printf("*** texel00=0x%x, texel01=0x%x, texel10=0x%x, texel11=0x%x, color=0x%x\n", texel00, texel01, texel10, texel11, color); + + return color; +} + +inline uint32_t TexFilterPoint(TexFormat format, uint32_t texel) { + uint32_t cl, ch; + Unpack8888(format, texel, &cl, &ch); + uint32_t color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + + //printf("*** texel=0x%x, color=0x%x\n", texel, color); + + return color; +} \ No newline at end of file diff --git a/sim/common/util.h b/sim/common/util.h index b6137199..d66305ee 100644 --- a/sim/common/util.h +++ b/sim/common/util.h @@ -3,85 +3,12 @@ #include #include #include +#include template void unused(Args&&...) {} #define __unused(...) unused(__VA_ARGS__) -constexpr uint32_t count_leading_zeros(uint32_t value) { - return value ? __builtin_clz(value) : 32; -} - -constexpr uint32_t count_trailing_zeros(uint32_t value) { - return value ? 
__builtin_ctz(value) : 32; -} - -constexpr bool ispow2(uint32_t value) { - return value && !(value & (value - 1)); -} - -constexpr uint32_t log2ceil(uint32_t value) { - return 32 - count_leading_zeros(value - 1); -} - -inline unsigned log2up(uint32_t value) { - return std::max(1, log2ceil(value)); -} - -constexpr unsigned log2floor(uint32_t value) { - return 31 - count_leading_zeros(value); -} - -constexpr unsigned ceil2(uint32_t value) { - return 32 - count_leading_zeros(value); -} - -inline uint64_t bit_clr(uint64_t bits, uint32_t index) { - assert(index <= 63); - return bits & ~(1ull << index); -} - -inline uint64_t bit_set(uint64_t bits, uint32_t index) { - assert(index <= 63); - return bits | (1ull << index); -} - -inline bool bit_get(uint64_t bits, uint32_t index) { - assert(index <= 63); - return (bits >> index) & 0x1; -} - -inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) { - assert(end >= start); - assert(end <= 63); - uint32_t shift = 63 - end; - uint64_t mask = (0xffffffffffffffff << (shift + start)) >> shift; - return bits & ~mask; -} - -inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) { - assert(end >= start); - assert(end <= 63); - uint32_t shift = 63 - end; - uint64_t dirty = (value << (shift + start)) >> shift; - return bit_clrw(bits, start, end) | dirty; -} - -inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) { - assert(end >= start); - assert(end <= 63); - uint32_t shift = 63 - end; - return (bits << shift) >> (shift + start); -} - -// Apply integer sign extension -inline uint32_t sext32(uint32_t word, uint32_t width) { - assert(width > 1); - assert(width <= 32); - uint32_t mask = (1 << width) - 1; - return ((word >> (width - 1)) & 0x1) ? 
(word | ~mask) : word; -} - // return file extension const char* fileExtension(const char* filepath); \ No newline at end of file diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index a0c8d339..662fbf1d 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -23,8 +23,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_FLAGS += $(DBG_TRACE_FLAGS) -DBG_FLAGS += -DDBG_CACHE_REQ_INFO -DBG_FLAGS += -DVCD_OUTPUT FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src TEX_INCLUDE = -I$(RTL_DIR)/tex_unit @@ -51,10 +49,17 @@ VL_FLAGS += $(RTL_INCLUDE) VL_FLAGS += $(CONFIGS) CXXFLAGS += $(CONFIGS) +# Enable Verilator multithreaded simulation +#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') +#VL_FLAGS += --threads $(THREADS) + +# Enable VCD trace +VCD_TRACE = -DVCD_OUTPUT + # Debugigng ifdef DEBUG - VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS) - CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS) + VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS) + CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG CXXFLAGS += -O2 -DNDEBUG diff --git a/sim/simX/Makefile b/sim/simX/Makefile index 75a4a495..7ea54863 100644 --- a/sim/simX/Makefile +++ b/sim/simX/Makefile @@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a TOP = vx_cache_sim SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp -SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp +SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp main.cpp OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS))) VPATH := $(sort $(dir $(SRCS))) diff --git a/sim/simX/cache.cpp 
b/sim/simX/cache.cpp index 503d32c5..da69cf3a 100644 --- a/sim/simX/cache.cpp +++ b/sim/simX/cache.cpp @@ -13,6 +13,7 @@ struct params_t { uint32_t sets_per_bank; uint32_t blocks_per_set; uint32_t words_per_block; + uint32_t log2_num_inputs; uint32_t word_select_addr_start; uint32_t word_select_addr_end; @@ -31,8 +32,10 @@ struct params_t { uint32_t offset_bits = config.B - config.W; uint32_t log2_bank_size = config.C - bank_bits; uint32_t index_bits = log2_bank_size - (config.B << config.A); - assert(log2_bank_size >= config.B); - + assert(log2_bank_size >= config.B); + + this->log2_num_inputs = log2ceil(config.num_inputs); + this->words_per_block = 1 << offset_bits; this->blocks_per_set = 1 << config.A; this->sets_per_bank = 1 << index_bits; @@ -104,7 +107,7 @@ struct set_t { struct bank_req_info_t { bool valid; uint32_t req_id; - uint32_t req_tag; + uint64_t req_tag; }; struct bank_req_t { @@ -194,7 +197,7 @@ public: return root_entry; } - bool try_pop(bank_req_t* out) { + bool pop(bank_req_t* out) { for (auto& entry : entries_) { if (entry.valid && entry.mshr_replay) { *out = entry; @@ -208,16 +211,13 @@ public: }; struct bank_t { - std::vector sets; - MSHR mshr; - std::queue stall_buffer; - bank_req_t active_req; + std::vector sets; + MSHR mshr; bank_t(const CacheConfig& config, const params_t& params) : sets(params.sets_per_bank, params.blocks_per_set) , mshr(config.mshr_size) - , active_req(config.ports_per_bank) {} }; @@ -229,8 +229,8 @@ private: CacheConfig config_; params_t params_; std::vector banks_; - std::vector> core_rsps_; - Switch::Ptr mem_switch_; + Switch::Ptr mem_switch_; + Switch::Ptr bypass_switch_; std::vector> mem_req_ports_; std::vector> mem_rsp_ports_; @@ -240,241 +240,270 @@ public: , config_(config) , params_(config) , banks_(config.num_banks, {config, params_}) - , core_rsps_(config.num_inputs) , mem_req_ports_(config.num_banks, simobject) , mem_rsp_ports_(config.num_banks, simobject) { + bypass_switch_ = Switch::Create("bypass_arb", 
ArbiterType::Priority, 2); + bypass_switch_->ReqOut.bind(&simobject->MemReqPort); + simobject->MemRspPort.bind(&bypass_switch_->RspIn); + if (config.num_banks > 1) { mem_switch_ = Switch::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks); for (uint32_t i = 0, n = config.num_banks; i < n; ++i) { mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i)); mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i)); } - mem_switch_->ReqOut.bind(&simobject->MemReqPort); - simobject->MemRspPort.bind(&mem_switch_->RspIn); + mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn); } else { - mem_req_ports_.at(0).bind(&simobject->MemReqPort); - simobject->MemRspPort.bind(&mem_rsp_ports_.at(0)); + mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0)); } } void step(uint64_t /*cycle*/) { - // process core response - for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) { - auto& core_rsp = core_rsps_.at(req_id); - if (!core_rsp.empty()) { - simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_rsp.front()}, config_.latency); - core_rsp.pop(); - } + // handle bypass responses + auto& bypass_port = bypass_switch_->RspOut.at(1); + if (!bypass_port.empty()) { + auto& mem_rsp = bypass_port.top(); + uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1); + uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; + MemRsp core_rsp(tag); + simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency); + bypass_port.pop(); } - for (auto& bank : banks_) { - auto& active_req = bank.active_req; + std::vector pipeline_reqs(config_.num_banks, config_.ports_per_bank); - // try chedule mshr replay - if (!active_req.valid) { - bank.mshr.try_pop(&active_req); - } - - // try schedule stall queue if MSHR has space - if (!active_req.valid - && !bank.stall_buffer.empty() - && !bank.mshr.full()) { - active_req = bank.stall_buffer.front(); -
bank.stall_buffer.pop(); - } - } + // handle MSHR replay + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + auto& bank = banks_.at(bank_id); + auto& pipeline_req = pipeline_reqs.at(bank_id); + bank.mshr.pop(&pipeline_req); + } // handle memory fills - for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) { - MemRsp mem_rsp; - if (mem_rsp_ports_.at(i).read(&mem_rsp)) { - this->processMemoryFill(i, mem_rsp.tag); + std::vector pending_fill_req(config_.num_banks, false); + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + auto& mem_rsp_port = mem_rsp_ports_.at(bank_id); + if (!mem_rsp_port.empty()) { + auto& mem_rsp = mem_rsp_port.top(); + this->processMemoryFill(bank_id, mem_rsp.tag); + pending_fill_req.at(bank_id) = true; + mem_rsp_port.pop(); } } // handle incoming core requests - for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) { - MemReq core_req; - if (!simobject_->CoreReqPorts.at(i).read(&core_req)) + for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) { + auto& core_req_port = simobject_->CoreReqPorts.at(req_id); + if (core_req_port.empty()) continue; - auto bank_id = params_.addr_bank_id(core_req.addr); - auto set_id = params_.addr_set_id(core_req.addr); - auto tag = params_.addr_tag(core_req.addr); - auto port_id = i % config_.ports_per_bank; + auto& core_req = core_req_port.top(); + + // check cache bypassing + if (core_req.is_io) { + // send IO request + this->processIORequest(core_req, req_id); + + // remove request + core_req_port.pop(); + continue; + } + + auto bank_id = params_.addr_bank_id(core_req.addr); + auto set_id = params_.addr_set_id(core_req.addr); + auto tag = params_.addr_tag(core_req.addr); + auto port_id = req_id % config_.ports_per_bank; - // create abnk request + // create bank request bank_req_t bank_req(config_.ports_per_bank); bank_req.valid = true; bank_req.write = core_req.write; bank_req.mshr_replay = false; bank_req.tag = tag; bank_req.set_id = 
set_id; - bank_req.infos.at(port_id) = {true, i, core_req.tag}; + bank_req.infos.at(port_id) = {true, req_id, core_req.tag}; - auto& bank = banks_.at(bank_id); - - // check MSHR capacity - if (bank.mshr.full()) { - // add to stall buffer - bank.stall_buffer.emplace(bank_req); + auto& bank = banks_.at(bank_id); + auto& pipeline_req = pipeline_reqs.at(bank_id); + + // check pending MSHR replay + if (pipeline_req.valid + && pipeline_req.mshr_replay) { + // stall + continue; + } + + // check pending fill request + if (pending_fill_req.at(bank_id)) { + // stall continue; } - - auto& active_req = bank.active_req; - - // check pending MSHR request - if (active_req.valid - && active_req.mshr_replay) { - // add to stall buffer - bank.stall_buffer.emplace(bank_req); + + // check MSHR capacity if read or writeback + if ((!core_req.write || !config_.write_through) + && bank.mshr.full()) { + // stall continue; - } + } // check bank conflicts - if (active_req.valid) { + if (pipeline_req.valid) { // check port conflict - if (active_req.write != core_req.write - || active_req.set_id != set_id - || active_req.tag != tag - || active_req.infos[port_id].valid) { - // add to stall buffer - bank.stall_buffer.emplace(bank_req); + if (pipeline_req.write != core_req.write + || pipeline_req.set_id != set_id + || pipeline_req.tag != tag + || pipeline_req.infos[port_id].valid) { + // stall continue; } // update pending request infos - active_req.infos[port_id] = bank_req.infos[port_id]; + pipeline_req.infos[port_id] = bank_req.infos[port_id]; } else { // schedule new request - active_req = bank_req; + pipeline_req = bank_req; } + // remove request + core_req_port.pop(); } - // process active request - for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { - this->processBankRequest(bank_id); + // process active request + this->processBankRequest(pipeline_reqs); + } + + void processIORequest(const MemReq& core_req, uint32_t req_id) { + { + MemReq mem_req(core_req); + 
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id; + bypass_switch_->ReqIn.at(1).send(mem_req, 1); + } + + if (core_req.write && config_.write_reponse) { + simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1); } } void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) { // update block - auto& bank = banks_.at(bank_id); - auto& root_entry = bank.mshr.replay(mshr_id); - auto& set = bank.sets.at(root_entry.set_id); - auto& block = set.blocks.at(root_entry.block_id); + auto& bank = banks_.at(bank_id); + auto& entry = bank.mshr.replay(mshr_id); + auto& set = bank.sets.at(entry.set_id); + auto& block = set.blocks.at(entry.block_id); block.valid = true; - block.tag = root_entry.tag; + block.tag = entry.tag; } - void processBankRequest(uint32_t bank_id) { - auto& bank = banks_.at(bank_id); - auto& active_req = bank.active_req; - if (!active_req.valid) - return; + void processBankRequest(const std::vector& pipeline_reqs) { + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + auto& pipeline_req = pipeline_reqs.at(bank_id); + if (!pipeline_req.valid) + continue; - active_req.valid = false; + auto& bank = banks_.at(bank_id); + auto& set = bank.sets.at(pipeline_req.set_id); - auto& set = bank.sets.at(active_req.set_id); - - if (active_req.mshr_replay) { - // send core response - for (auto& info : active_req.infos) { - core_rsps_.at(info.req_id).emplace(info.req_tag); - } - } else { - bool hit = false; - bool found_free_block = false; - int hit_block_id = 0; - int repl_block_id = 0; - uint32_t max_cnt = 0; - - for (int i = 0, n = set.blocks.size(); i < n; ++i) { - auto& block = set.blocks.at(i); - if (block.valid) { - if (block.tag == active_req.tag) { - block.lru_ctr = 0; - hit_block_id = i; - hit = true; - } else { - ++block.lru_ctr; - } - if (max_cnt < block.lru_ctr) { - max_cnt = block.lru_ctr; + if (pipeline_req.mshr_replay) { + // send core response + for (auto& info : pipeline_req.infos) { + 
simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency); + } + } else { + bool hit = false; + bool found_free_block = false; + int hit_block_id = 0; + int repl_block_id = 0; + uint32_t max_cnt = 0; + + for (int i = 0, n = set.blocks.size(); i < n; ++i) { + auto& block = set.blocks.at(i); + if (block.valid) { + if (block.tag == pipeline_req.tag) { + block.lru_ctr = 0; + hit_block_id = i; + hit = true; + } else { + ++block.lru_ctr; + } + if (max_cnt < block.lru_ctr) { + max_cnt = block.lru_ctr; + repl_block_id = i; + } + } else { + found_free_block = true; repl_block_id = i; } - } else { - found_free_block = true; - repl_block_id = i; - } - } - - if (hit) { - // - // MISS handling - // - if (active_req.write) { - // handle write hit - auto& hit_block = set.blocks.at(hit_block_id); - if (config_.write_through) { - // forward write request to memory - MemReq mem_req; - mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag); - mem_req.write = true; - mem_req.tag = 0; - mem_req_ports_.at(bank_id).send(mem_req, 1); - } else { - // mark block as dirty - hit_block.dirty = true; - } - } - // send core response - for (auto& info : active_req.infos) { - core_rsps_.at(info.req_id).emplace(info.req_tag); - } - } else { - // - // MISS handling - // - if (!found_free_block && !config_.write_through) { - // write back dirty block - auto& repl_block = set.blocks.at(repl_block_id); - if (repl_block.dirty) { - MemReq mem_req; - mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag); - mem_req.write = true; - mem_req.tag = 0; - mem_req_ports_.at(bank_id).send(mem_req, 1); - } } - if (active_req.write && config_.write_through) { - // forward write request to memory - { - MemReq mem_req; - mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); - mem_req.write = true; - mem_req.tag = 0; - mem_req_ports_.at(bank_id).send(mem_req, 1); + if (hit) { + // + // HIT handling + // + if
(pipeline_req.write) { + // handle write hit + auto& hit_block = set.blocks.at(hit_block_id); + if (config_.write_through) { + // forward write request to memory + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag); + mem_req.write = true; + mem_req_ports_.at(bank_id).send(mem_req, 1); + } else { + // mark block as dirty + hit_block.dirty = true; + } } // send core response - for (auto& info : active_req.infos) { - core_rsps_.at(info.req_id).emplace(info.req_tag); + if (!pipeline_req.write || config_.write_reponse) { + for (auto& info : pipeline_req.infos) { + simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency); + } + } + } else { + // + // MISS handling + // + if (!found_free_block && !config_.write_through) { + // write back dirty block + auto& repl_block = set.blocks.at(repl_block_id); + if (repl_block.dirty) { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag); + mem_req.write = true; + mem_req_ports_.at(bank_id).send(mem_req, 1); + } } - } else { - // lookup - int pending = bank.mshr.lookup(active_req); - // allocate MSHR - int mshr_id = bank.mshr.allocate(active_req, repl_block_id); - - // send fill request - if (pending == -1) { - MemReq mem_req; - mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); - mem_req.write = active_req.write; - mem_req.tag = mshr_id; - mem_req_ports_.at(bank_id).send(mem_req, 1); + if (pipeline_req.write && config_.write_through) { + // forward write request to memory + { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); + mem_req.write = true; + mem_req_ports_.at(bank_id).send(mem_req, 1); + } + // send core response + if (config_.write_reponse) { + for (auto& info : pipeline_req.infos) { + simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency); + } + } + } else { + // MSHR lookup + int pending = 
bank.mshr.lookup(pipeline_req); + + // allocate MSHR + int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id); + + // send fill request + if (pending == -1) { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); + mem_req.write = pipeline_req.write; + mem_req.tag = mshr_id; + mem_req_ports_.at(bank_id).send(mem_req, 1); + } } } } diff --git a/sim/simX/cache.h b/sim/simX/cache.h index 58767d9f..0be8cf6e 100644 --- a/sim/simX/cache.h +++ b/sim/simX/cache.h @@ -14,7 +14,8 @@ struct CacheConfig { uint8_t num_banks; // number of banks uint8_t ports_per_bank; // number of ports per bank uint8_t num_inputs; // number of inputs - bool write_through; // is write-through cache + bool write_through; // is write-through + bool write_reponse; // enable write response uint16_t victim_size; // victim cache size uint16_t mshr_size; // MSHR buffer size uint8_t latency; // pipeline latency diff --git a/sim/simX/constants.h b/sim/simX/constants.h index d9171b8d..218fa5f9 100644 --- a/sim/simX/constants.h +++ b/sim/simX/constants.h @@ -10,11 +10,7 @@ namespace vortex { struct Constants { -static constexpr uint32_t CORE_TO_DCACHE_DELAY = 1 + SM_ENABLE; -static constexpr uint32_t CORE_TO_ICACHE_DELAY = 1; - -static constexpr uint32_t ICACHE_TO_MEM_DELAY = 2; -static constexpr uint32_t DCACHE_TO_MEM_DELAY = 2; +static constexpr uint32_t SMEM_DELAY = 1 + SM_ENABLE; }; diff --git a/sim/simX/core.cpp b/sim/simX/core.cpp index e1333dac..19b20967 100644 --- a/sim/simX/core.cpp +++ b/sim/simX/core.cpp @@ -19,6 +19,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , decoder_(arch) , mmu_(0, arch.wsize(), true) , shared_mem_(4096) + , tex_units_(NUM_TEX_UNITS, this) , warps_(arch.num_warps()) , barriers_(arch.num_barriers(), 0) , csrs_(arch.num_csrs(), 0) @@ -35,7 +36,8 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) 1, // number of banks 1, // number of ports 1, // request size - true, // write-throught 
+ true, // write-through + false, // write response 0, // victim size NUM_WARPS, // mshr 2, // pipeline latency @@ -49,12 +51,14 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) DCACHE_NUM_BANKS, // number of banks DCACHE_NUM_PORTS, // number of ports (uint8_t)arch.num_threads(), // request size - true, // write-throught + true, // write-through + false, // write response 0, // victim size DCACHE_MSHR_SIZE, // mshr 2, // pipeline latency })) - , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) + , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) + , dcache_switch_(arch.num_threads()) , fetch_stage_("fetch") , decode_stage_("decode") , issue_stage_("issue") @@ -65,10 +69,9 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , last_schedule_wid_(0) , issued_instrs_(0) , committed_instrs_(0) + , ecall_(false) , ebreak_(false) , stats_insts_(0) - , stats_loads_(0) - , stats_stores_(0) , MemRspPort(this) , MemReqPort(this) { @@ -92,6 +95,18 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) this->MemRspPort.bind(&l1_mem_switch_->RspIn); l1_mem_switch_->ReqOut.bind(&this->MemReqPort); + // lsu/tex switch + for (uint32_t i = 0, n = arch.num_threads(); i < n; ++i) { + auto& sw = dcache_switch_.at(i); +#ifdef EXT_TEX_ENABLE + sw = Switch::Create("lsu_arb", ArbiterType::Priority, 2); +#else + sw = Switch::Create("lsu_arb", ArbiterType::Priority, 1); +#endif + sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i)); + dcache_->CoreRspPorts.at(i).bind(&sw->RspIn); + } + // activate warp0 warps_.at(0)->setTmask(0, true); } @@ -147,44 +162,41 @@ void Core::warp_scheduler(uint64_t cycle) { auto& warp = warps_.at(scheduled_warp); stats_insts_ += warp->getActiveThreads(); - pipeline_state_t state; - state.clear(); - state.id = (issued_instrs_++ * arch_.num_cores()) + id_; + auto trace = new pipeline_trace_t((issued_instrs_++ * arch_.num_cores()) + id_, arch_); - warp->eval(&state); + warp->eval(trace); - 
DT(3, cycle, "pipeline-schedule: " << state); + DT(3, cycle, "pipeline-schedule: " << *trace); // advance to fetch stage - fetch_stage_.push(state); + fetch_stage_.push(trace); } void Core::fetch(uint64_t cycle) { // handle icache reponse - { - MemRsp mem_rsp; - if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){ - pipeline_state_t state; - pending_icache_.remove(mem_rsp.tag, &state); - auto latency = (SimPlatform::instance().cycles() - state.icache_latency); - state.icache_latency = latency; - decode_stage_.push(state); - DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state); - } + auto& icache_rsp_port = icache_->CoreRspPorts.at(0); + if (!icache_rsp_port.empty()){ + auto& mem_rsp = icache_rsp_port.top(); + auto trace = pending_icache_.at(mem_rsp.tag); + auto latency = (SimPlatform::instance().cycles() - trace->icache_latency); + trace->icache_latency = latency; + decode_stage_.push(trace); + DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); + pending_icache_.release(mem_rsp.tag); + icache_rsp_port.pop(); } // send icache request - { - pipeline_state_t state; - if (fetch_stage_.try_pop(&state)) { - state.icache_latency = SimPlatform::instance().cycles(); - MemReq mem_req; - mem_req.addr = state.PC; - mem_req.write = false; - mem_req.tag = pending_icache_.allocate(state); - icache_->CoreReqPorts.at(0).send(mem_req, 1); - DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state); - } + if (!fetch_stage_.empty()) { + auto trace = fetch_stage_.top(); + trace->icache_latency = SimPlatform::instance().cycles(); + MemReq mem_req; + mem_req.addr = trace->PC; + mem_req.write = false; + mem_req.tag = pending_icache_.allocate(trace); + icache_->CoreReqPorts.at(0).send(mem_req, 1); + DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); + fetch_stage_.pop(); } // schedule next 
warp @@ -194,19 +206,21 @@ void Core::fetch(uint64_t cycle) { void Core::decode(uint64_t cycle) { __unused (cycle); - pipeline_state_t state; - if (!decode_stage_.try_pop(&state)) - return; + if (decode_stage_.empty()) + return; + + auto trace = decode_stage_.top(); // release warp - if (!state.stall_warp) { - stalled_warps_.reset(state.wid); + if (!trace->fetch_stall) { + stalled_warps_.reset(trace->wid); } - DT(3, cycle, "pipeline-decode: " << state); + DT(3, cycle, "pipeline-decode: " << *trace); // advance to issue stage - issue_stage_.push(state); + issue_stage_.push(trace); + decode_stage_.pop(); } void Core::issue(uint64_t cycle) { @@ -214,12 +228,13 @@ void Core::issue(uint64_t cycle) { if (!issue_stage_.empty()) { // insert to ibuffer - auto& state = issue_stage_.top(); - auto& ibuffer = ibuffers_.at(state.wid); - if (ibuffer.full()) { - DT(3, cycle, "*** ibuffer-stall: " << state); - } else { - ibuffer.push(state); + auto trace = issue_stage_.top(); + auto& ibuffer = ibuffers_.at(trace->wid); + if (!trace->check_stalled(ibuffer.full())) { + DT(3, cycle, "*** ibuffer-stall: " << *trace); + } + if (!ibuffer.full()) { + ibuffer.push(trace); issue_stage_.pop(); } } @@ -229,27 +244,30 @@ void Core::issue(uint64_t cycle) { if (ibuffer.empty()) continue; - auto& state = ibuffer.top(); + auto trace = ibuffer.top(); // check scoreboard - if (scoreboard_.in_use(state)) { + if (!trace->check_stalled(scoreboard_.in_use(trace))) { DTH(3, cycle, "*** scoreboard-stall: dependents={"); - auto owners = scoreboard_.owners(state); - for (uint32_t i = 0, n = owners.size(); i < n; ++i) { - if (i) DTN(3, ", "); - DTN(3, "#" << owners.at(i)); + auto uses = scoreboard_.get_uses(trace); + for (uint32_t i = 0, n = uses.size(); i < n; ++i) { + auto& use = uses.at(i); + __unused(use); + if (i) DTN(3, ", "); + DTN(3, use.type << use.reg << "(#" << use.owner << ")"); } - DTN(3, "}, " << state << std::endl); - continue; + DTN(3, "}, " << *trace << std::endl); } + if 
(scoreboard_.in_use(trace)) + continue; - DT(3, cycle, "pipeline-issue: " << state); + DT(3, cycle, "pipeline-issue: " << *trace); // update scoreboard - scoreboard_.reserve(state); + scoreboard_.reserve(trace); // advance to execute stage - execute_stage_.push(state); + execute_stage_.push(trace); ibuffer.pop(); break; @@ -259,11 +277,11 @@ void Core::issue(uint64_t cycle) { void Core::execute(uint64_t cycle) { // process stage inputs if (!execute_stage_.empty()) { - auto& state = execute_stage_.top(); - auto& exe_unit = exe_units_.at((int)state.exe_type); - exe_unit->push_input(state); + auto trace = execute_stage_.top(); + auto& exe_unit = exe_units_.at((int)trace->exe_type); + exe_unit->push(trace); + DT(3, cycle, "pipeline-execute: " << *trace); execute_stage_.pop(); - DT(3, cycle, "pipeline-execute: " << state); } // advance execute units @@ -273,13 +291,14 @@ void Core::execute(uint64_t cycle) { // commit completed instructions for (auto& exe_unit : exe_units_) { - pipeline_state_t state; - if (exe_unit->pop_output(&state)) { - if (state.stall_warp) { - stalled_warps_.reset(state.wid); + if (!exe_unit->empty()) { + auto trace = exe_unit->top(); + if (trace->fetch_stall) { + stalled_warps_.reset(trace->wid); } // advance to commit stage - commit_stage_.push(state); + commit_stage_.push(trace); + exe_unit->pop(); } } } @@ -287,21 +306,28 @@ void Core::execute(uint64_t cycle) { void Core::commit(uint64_t cycle) { __unused (cycle); - pipeline_state_t state; - if (!commit_stage_.try_pop(&state)) + if (commit_stage_.empty()) return; - DT(3, cycle, "pipeline-commit: " << state); + auto trace = commit_stage_.top(); + + DT(3, cycle, "pipeline-commit: " << *trace); // update scoreboard - scoreboard_.release(state); + scoreboard_.release(trace); assert(committed_instrs_ <= issued_instrs_); ++committed_instrs_; + + commit_stage_.pop(); + + // delete the trace + delete trace; } bool Core::running() const { - return (committed_instrs_ != issued_instrs_); + bool is_running 
= (committed_instrs_ != issued_instrs_); + return is_running; } Word Core::get_csr(Addr addr, int tid, int wid) { @@ -355,6 +381,12 @@ Word Core::get_csr(Addr addr, int tid, int wid) { // NumCycles return (Word)(SimPlatform::instance().cycles() >> 32); } else { + if (addr >= CSR_TEX(0,0) + && addr < CSR_TEX(NUM_TEX_UNITS,0)) { + uint32_t unit = CSR_TEX_UNIT(addr); + uint32_t state = CSR_TEX_STATE(addr); + return tex_units_.at(unit).get_state(state); + } return csrs_.at(addr); } } @@ -367,6 +399,13 @@ void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) { } else if (addr == CSR_FCSR) { fcsrs_.at(wid) = value & 0xff; } else { + if (addr >= CSR_TEX(0,0) + && addr < CSR_TEX(NUM_TEX_UNITS,0)) { + uint32_t unit = CSR_TEX_UNIT(addr); + uint32_t state = CSR_TEX_STATE(addr); + tex_units_.at(unit).set_state(state, value); + return; + } csrs_.at(addr) = value; } } @@ -390,29 +429,27 @@ Word Core::icache_read(Addr addr, Size size) { return data; } -Word Core::dcache_read(Addr addr, Size size) { - ++stats_loads_; +Word Core::dcache_read(Addr addr, Size size) { Word data = 0; -#ifdef SM_ENABLE - if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SMEM_BASE_ADDR)) { - shared_mem_.read(&data, addr & (SMEM_SIZE-1), size); - return data; + if (SM_ENABLE) { + if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) + && ((addr + 3) < SMEM_BASE_ADDR)) { + shared_mem_.read(&data, addr & (SMEM_SIZE-1), size); + return data; + } } -#endif mmu_.read(&data, addr, size, 0); return data; } -void Core::dcache_write(Addr addr, Word data, Size size) { - ++stats_stores_; -#ifdef SM_ENABLE - if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SMEM_BASE_ADDR)) { - shared_mem_.write(&data, addr & (SMEM_SIZE-1), size); - return; +void Core::dcache_write(Addr addr, Word data, Size size) { + if (SM_ENABLE) { + if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) + && ((addr + 3) < SMEM_BASE_ADDR)) { + shared_mem_.write(&data, addr & (SMEM_SIZE-1), size); + return; + } } -#endif if (addr >= 
IO_COUT_ADDR && addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { this->writeToStdOut(addr, data); @@ -421,11 +458,8 @@ void Core::dcache_write(Addr addr, Word data, Size size) { mmu_.write(&data, addr, size, 0); } -void Core::printStats() const { - std::cout << "Cycles: " << SimPlatform::instance().cycles() << std::endl - << "Insts : " << stats_insts_ << std::endl - << "Loads : " << stats_loads_ << std::endl - << "Stores: " << stats_stores_ << std::endl; +Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector* mem_addrs) { + return tex_units_.at(unit).read(u, v, lod, mem_addrs); } void Core::writeToStdOut(Addr addr, Word data) { @@ -439,10 +473,14 @@ void Core::writeToStdOut(Addr addr, Word data) { } } +void Core::trigger_ecall() { + ecall_ = true; +} + void Core::trigger_ebreak() { ebreak_ = true; } -bool Core::check_ebreak() const { - return ebreak_; +bool Core::check_exit() const { + return ebreak_ || ecall_; } \ No newline at end of file diff --git a/sim/simX/core.h b/sim/simX/core.h index ea1a6582..5066d8af 100644 --- a/sim/simX/core.h +++ b/sim/simX/core.h @@ -20,6 +20,7 @@ #include "ibuffer.h" #include "scoreboard.h" #include "exeunit.h" +#include "tex_unit.h" namespace vortex { @@ -34,8 +35,6 @@ public: void step(uint64_t cycle); - void printStats() const; - Word id() const { return id_; } @@ -72,9 +71,13 @@ public: void dcache_write(Addr, Word, Size); + Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector* mem_addrs); + + void trigger_ecall(); + void trigger_ebreak(); - bool check_ebreak() const; + bool check_exit() const; private: @@ -92,10 +95,8 @@ private: const ArchDef arch_; const Decoder decoder_; MemoryUnit mmu_; - -#ifdef SM_ENABLE RAM shared_mem_; -#endif + std::vector tex_units_; std::vector> warps_; std::vector barriers_; @@ -107,6 +108,7 @@ private: Cache::Ptr icache_; Cache::Ptr dcache_; Switch::Ptr l1_mem_switch_; + std::vector::Ptr> dcache_switch_; PipelineStage fetch_stage_; PipelineStage decode_stage_; @@ 
-114,20 +116,20 @@ private: PipelineStage execute_stage_; PipelineStage commit_stage_; - HashTable pending_icache_; + HashTable pending_icache_; WarpMask stalled_warps_; uint32_t last_schedule_wid_; uint32_t issued_instrs_; uint32_t committed_instrs_; + bool ecall_; bool ebreak_; std::unordered_map print_bufs_; uint64_t stats_insts_; - uint64_t stats_loads_; - uint64_t stats_stores_; friend class LsuUnit; + friend class GpuUnit; public: SlavePort MemRspPort; diff --git a/sim/simX/decode.cpp b/sim/simX/decode.cpp index 6530d223..a2957c64 100644 --- a/sim/simX/decode.cpp +++ b/sim/simX/decode.cpp @@ -41,14 +41,18 @@ static const std::unordered_map sc_instTable = { {Opcode::FMNMSUB, {false, InstType::R4_TYPE}}, {Opcode::VSET, {false, InstType::V_TYPE}}, {Opcode::GPGPU, {false, InstType::R_TYPE}}, + {Opcode::GPU, {false, InstType::R4_TYPE}}, }; -static const char* op_string(const Instr &instr) { - Word func3 = instr.getFunc3(); - Word func7 = instr.getFunc7(); - Word rs2 = instr.getRSrc(1); - Word imm = instr.getImm(); - switch (instr.getOpcode()) { +static const char* op_string(const Instr &instr) { + auto opcode = instr.getOpcode(); + Word func2 = instr.getFunc2(); + Word func3 = instr.getFunc3(); + Word func7 = instr.getFunc7(); + Word rs2 = instr.getRSrc(1); + Word imm = instr.getImm(); + + switch (opcode) { case Opcode::NOP: return "NOP"; case Opcode::LUI_INST: return "LUI"; case Opcode::AUIPC_INST: return "AUIPC"; @@ -120,7 +124,16 @@ static const char* op_string(const Instr &instr) { } case Opcode::SYS_INST: switch (func3) { - case 0: return imm ? 
"EBREAK" : "ECALL"; + case 0: + switch (imm) { + case 0x000: return "ECALL"; + case 0x001: return "EBREAK"; + case 0x002: return "URET"; + case 0x102: return "SRET"; + case 0x302: return "MRET"; + default: + std::abort(); + } case 1: return "CSRRW"; case 2: return "CSRRS"; case 3: return "CSRRC"; @@ -181,29 +194,43 @@ static const char* op_string(const Instr &instr) { case 1: return "WSPAWN"; case 2: return "SPLIT"; case 3: return "JOIN"; - case 4: return "BAR"; - case 6: return "PREFETCH"; + case 4: return "BAR"; + default: + std::abort(); + } + case Opcode::GPU: + switch (func3) { + case 0: return "TEX"; + case 1: { + switch (func2) { + case 0: return "CMOV"; + default: + std::abort(); + } + } default: std::abort(); } default: std::abort(); - } + } } namespace vortex { -std::ostream &operator<<(std::ostream &os, const Instr &instr) { - os << op_string(instr) << ": "; +std::ostream &operator<<(std::ostream &os, const Instr &instr) { auto opcode = instr.getOpcode(); + Word func2 = instr.getFunc2(); + Word func3 = instr.getFunc3(); + + os << op_string(instr) << ": "; + if (opcode == S_INST - || opcode == FS - || opcode == VS) { + || opcode == FS) { os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- "; os << instr.getRSType(1) << std::dec << instr.getRSrc(1); } else if (opcode == L_INST - || opcode == FL - || opcode == VL) { + || opcode == FL) { os << instr.getRDType() << std::dec << instr.getRDest() << " <- "; os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]"; } else { @@ -219,8 +246,10 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) { if (i) os << ", "; os << "imm=0x" << std::hex << instr.getImm(); } - } - + if (opcode == GPU && func3 == 0) { + os << ", unit=" << std::dec << func2; + } + } return os; } } @@ -239,6 +268,7 @@ Decoder::Decoder(const ArchDef &arch) { shift_func3_ = shift_rd_ + reg_s_; shift_rs1_ = shift_func3_ + func3_s_; shift_rs2_ = shift_rs1_ + 
reg_s_; + shift_func2_ = shift_rs2_ + reg_s_; shift_func7_ = shift_rs2_ + reg_s_; shift_rs3_ = shift_func7_ + func2_s_; shift_vmop_ = shift_func7_ + vmask_s_; @@ -247,7 +277,7 @@ Decoder::Decoder(const ArchDef &arch) { shift_vset_ = shift_func7_ + 6; reg_mask_ = 0x1f; - func2_mask_ = 0x2; + func2_mask_ = 0x3; func3_mask_ = 0x7; func6_mask_ = 0x3f; func7_mask_ = 0x7f; @@ -265,6 +295,7 @@ std::shared_ptr Decoder::decode(Word code) const { Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_); instr->setOpcode(op); + Word func2 = (code >> shift_func2_) & func2_mask_; Word func3 = (code >> shift_func3_) & func3_mask_; Word func6 = (code >> shift_func6_) & func6_mask_; Word func7 = (code >> shift_func7_) & func7_mask_; @@ -403,7 +434,7 @@ std::shared_ptr Decoder::decode(Word code) const { } } break; - case Opcode::VL: + case Opcode::FL: instr->setDestVReg(rd); instr->setSrcVReg(rs1); instr->setVlsWidth(func3); @@ -413,7 +444,7 @@ std::shared_ptr Decoder::decode(Word code) const { instr->setVnf((code >> shift_vnf_) & func3_mask_); break; - case Opcode::VS: + case Opcode::FS: instr->setVs3(rd); instr->setSrcVReg(rs1); instr->setVlsWidth(func3); @@ -428,10 +459,18 @@ std::shared_ptr Decoder::decode(Word code) const { } break; case R4_TYPE: - instr->setDestFReg(rd); - instr->setSrcFReg(rs1); - instr->setSrcFReg(rs2); - instr->setSrcFReg(rs3); + if (op == Opcode::GPU) { + instr->setDestReg(rd); + instr->setSrcReg(rs1); + instr->setSrcReg(rs2); + instr->setSrcReg(rs3); + } else { + instr->setDestFReg(rd); + instr->setSrcFReg(rs1); + instr->setSrcFReg(rs2); + instr->setSrcFReg(rs3); + } + instr->setFunc2(func2); instr->setFunc3(func3); break; default: diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp index ff705d82..d55ba2f9 100644 --- a/sim/simX/execute.cpp +++ b/sim/simX/execute.cpp @@ -49,11 +49,12 @@ inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid) } } -void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { 
+void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { assert(tmask_.any()); Word nextPC = PC_ + core_->arch().wsize(); + Word func2 = instr.getFunc2(); Word func3 = instr.getFunc3(); Word func6 = instr.getFunc6(); Word func7 = instr.getFunc7(); @@ -117,8 +118,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { case NOP: break; case LUI_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::ARITH; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -127,8 +128,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rd_write = true; break; case AUIPC_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::ARITH; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -137,10 +138,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rd_write = true; break; case R_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::ARITH; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -149,7 +150,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { case 0: // MUL rddata[t] = ((WordI)rsdata[t][0]) * ((WordI)rsdata[t][1]); - pipeline_state->alu.type = AluType::IMUL; + trace->alu.type = AluType::IMUL; break; case 1: { // MULH @@ -163,7 +164,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } uint64_t result = first * second; rddata[t] = (result >> 32) & 0xFFFFFFFF; - pipeline_state->alu.type = AluType::IMUL; + trace->alu.type = 
AluType::IMUL; } break; case 2: { // MULHSU @@ -173,14 +174,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } int64_t second = (int64_t)rsdata[t][1]; rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; - pipeline_state->alu.type = AluType::IMUL; + trace->alu.type = AluType::IMUL; } break; case 3: { // MULHU uint64_t first = (uint64_t)rsdata[t][0]; uint64_t second = (uint64_t)rsdata[t][1]; rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; - pipeline_state->alu.type = AluType::IMUL; + trace->alu.type = AluType::IMUL; } break; case 4: { // DIV @@ -193,7 +194,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } else { rddata[t] = dividen / divisor; } - pipeline_state->alu.type = AluType::IDIV; + trace->alu.type = AluType::IDIV; } break; case 5: { // DIVU @@ -204,7 +205,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } else { rddata[t] = dividen / divisor; } - pipeline_state->alu.type = AluType::IDIV; + trace->alu.type = AluType::IDIV; } break; case 6: { // REM @@ -217,7 +218,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } else { rddata[t] = dividen % divisor; } - pipeline_state->alu.type = AluType::IDIV; + trace->alu.type = AluType::IDIV; } break; case 7: { // REMU @@ -228,7 +229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } else { rddata[t] = dividen % divisor; } - pipeline_state->alu.type = AluType::IDIV; + trace->alu.type = AluType::IDIV; } break; default: std::abort(); @@ -285,9 +286,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rd_write = true; break; case I_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::ARITH; - pipeline_state->used_iregs[rsrc0] = 1; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; + trace->used_iregs.set(rsrc0); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ 
-336,10 +337,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rd_write = true; break; case B_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::BRANCH; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::BRANCH; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -385,107 +386,149 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; // runonce } - pipeline_state->stall_warp = true; + trace->fetch_stall = true; break; case JAL_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::BRANCH; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::BRANCH; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; rddata[t] = nextPC; nextPC = PC_ + immsrc; - pipeline_state->stall_warp = true; + trace->fetch_stall = true; break; // runonce } rd_write = true; break; case JALR_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::BRANCH; - pipeline_state->used_iregs[rsrc0] = 1; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::BRANCH; + trace->used_iregs.set(rsrc0); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; rddata[t] = nextPC; nextPC = rsdata[t][0] + immsrc; - pipeline_state->stall_warp = true; + trace->fetch_stall = true; break; // runOnce } rd_write = true; break; case L_INST: - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::LOAD; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->mem_addrs.resize(num_threads); - for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned - Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; - Word 
data_read = core_->dcache_read(memAddr, 4); - pipeline_state->mem_addrs.at(t) = memAddr; - DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); - switch (func3) { - case 0: - // LBI - rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8); - break; - case 1: - // LHI - rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16); - break; - case 2: - // LW - rddata[t] = data_read; - break; - case 4: - // LBU - rddata[t] = Word((data_read >> shift_by) & 0xFF); - break; - case 5: - // LHU - rddata[t] = Word((data_read >> shift_by) & 0xFFFF); - break; - default: - std::abort(); + case FL: + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::LOAD; + trace->used_iregs.set(rsrc0); + if (opcode == L_INST + || (opcode == FL && func3 == 2)) { + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned + Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; + Word data_read = core_->dcache_read(memAddr, 4); + trace->mem_addrs.at(t).push_back(memAddr); + DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); + switch (func3) { + case 0: + // LBI + rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8); + break; + case 1: + // LHI + rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16); + break; + case 2: + // LW + rddata[t] = data_read; + break; + case 4: + // LBU + rddata[t] = Word((data_read >> shift_by) & 0xFF); + break; + case 5: + // LHU + rddata[t] = Word((data_read >> shift_by) & 0xFFFF); + break; + default: + std::abort(); + } } - } - rd_write = true; - break; - case S_INST: - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::STORE; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - pipeline_state->mem_addrs.resize(num_threads); - for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - Word memAddr = rsdata[t][0] + immsrc; - 
pipeline_state->mem_addrs.at(t) = memAddr; - DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - switch (func3) { - case 0: - // SB - core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1); - break; - case 1: - // SH - core_->dcache_write(memAddr, rsdata[t][1], 2); - break; - case 2: - // SW - core_->dcache_write(memAddr, rsdata[t][1], 4); - break; + } else { + DP(4, "Executing vector load"); + DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); + DP(4, "dest: v" << rdest); + DP(4, "width" << instr.getVlsWidth()); + auto &vd = vRegFile_.at(rdest); + switch (instr.getVlsWidth()) { + case 6: { + // load word and unit strided (not checking for unit stride) + for (int i = 0; i < vl_; i++) { + Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); + DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr); + Word data_read = core_->dcache_read(memAddr, 4); + DP(4, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); + int *result_ptr = (int *)(vd.data() + i); + *result_ptr = data_read; + } + } break; default: std::abort(); } } + rd_write = true; + break; + case S_INST: + case FS: + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::STORE; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + if (opcode == S_INST + || (opcode == FS && func3 == 2)) { + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = rsdata[t][0] + immsrc; + trace->mem_addrs.at(t).push_back(memAddr); + DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + switch (func3) { + case 0: + // SB + core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1); + break; + case 1: + // SH + core_->dcache_write(memAddr, rsdata[t][1], 2); + break; + case 2: + // SW + core_->dcache_write(memAddr, rsdata[t][1], 4); + break; + default: + std::abort(); + } + } + } else { + for (int i = 0; i < vl_; i++) { + Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8); + DP(4, "STORE 
MEM: ADDRESS=0x" << std::hex << memAddr); + switch (instr.getVlsWidth()) { + case 6: { + // store word and unit strided (not checking for unit stride) + uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i); + core_->dcache_write(memAddr, value, 4); + DP(4, "store: " << memAddr << " value:" << value); + } break; + default: + std::abort(); + } + } + } break; case SYS_INST: - pipeline_state->exe_type = ExeType::CSR; + trace->exe_type = ExeType::CSR; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -493,30 +536,40 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { Word csr_value = core_->get_csr(csr_addr, t, id_); switch (func3) { case 0: - if (csr_addr < 2) { - // ECALL/EBREAK + switch (csr_addr) { + case 0: // ECALL + core_->trigger_ecall(); + break; + case 1: // EBREAK core_->trigger_ebreak(); - } + break; + case 0x002: // URET + case 0x102: // SRET + case 0x302: // MRET + break; + default: + std::abort(); + } break; case 1: // CSRRW rddata[t] = csr_value; core_->set_csr(csr_addr, rsdata[t][0], t, id_); - pipeline_state->used_iregs[rsrc0] = 1; + trace->used_iregs.set(rsrc0); rd_write = true; break; case 2: // CSRRS rddata[t] = csr_value; core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_); - pipeline_state->used_iregs[rsrc0] = 1; + trace->used_iregs.set(rsrc0); rd_write = true; break; case 3: // CSRRC rddata[t] = csr_value; core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_); - pipeline_state->used_iregs[rsrc0] = 1; + trace->used_iregs.set(rsrc0); rd_write = true; break; case 5: @@ -543,88 +596,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; case FENCE: - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::FENCE; - pipeline_state->stall_warp = true; - break; - case (FL | VL): - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::LOAD; - pipeline_state->used_iregs[rsrc0] = 1; - if (func3 
== 0x2) { - pipeline_state->mem_addrs.resize(num_threads); - for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - Word memAddr = rsdata[t][0] + immsrc; - pipeline_state->mem_addrs.at(t) = memAddr; - Word data_read = core_->dcache_read(memAddr, 4); - DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); - rddata[t] = data_read; - } - } else { - DP(3, "Executing vector load"); - DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); - DP(3, "dest: v" << rdest); - DP(3, "width" << instr.getVlsWidth()); - pipeline_state->mem_addrs.resize(vl_); - auto &vd = vRegFile_.at(rdest); - switch (instr.getVlsWidth()) { - case 6: { - // load word and unit strided (not checking for unit stride) - for (int i = 0; i < vl_; i++) { - Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); - pipeline_state->mem_addrs.at(i) = memAddr; - DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - Word data_read = core_->dcache_read(memAddr, 4); - DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); - int *result_ptr = (int *)(vd.data() + i); - *result_ptr = data_read; - } - } break; - default: - std::abort(); - } - break; - } - rd_write = true; - break; - case (FS | VS): - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::STORE; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - if (func3 == 0x2) { - pipeline_state->mem_addrs.resize(num_threads); - for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - Word memAddr = rsdata[t][0] + immsrc; - pipeline_state->mem_addrs.at(t) = memAddr; - core_->dcache_write(memAddr, rsdata[t][1], 4); - DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - } - } else { - pipeline_state->mem_addrs.resize(vl_); - for (int i = 0; i < vl_; i++) { - Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8); - pipeline_state->mem_addrs.at(i) = memAddr; - 
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - switch (instr.getVlsWidth()) { - case 6: { - //store word and unit strided (not checking for unit stride) - uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i); - core_->dcache_write(memAddr, value, 4); - DP(3, "store: " << memAddr << " value:" << value); - } break; - default: - std::abort(); - } - } - } - break; + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::FENCE; + trace->fetch_stall = true; + break; case FCI: - pipeline_state->exe_type = ExeType::FPU; + trace->exe_type = ExeType::FPU; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -633,32 +610,32 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { switch (func7) { case 0x00: //FADD rddata[t] = rv_fadd(rsdata[t][0], rsdata[t][1], frm, &fflags); - pipeline_state->fpu.type = FpuType::FMA; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x04: //FSUB rddata[t] = rv_fsub(rsdata[t][0], rsdata[t][1], frm, &fflags); - pipeline_state->fpu.type = FpuType::FMA; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x08: //FMUL rddata[t] = rv_fmul(rsdata[t][0], rsdata[t][1], frm, &fflags); - pipeline_state->fpu.type = FpuType::FMA; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x0c: //FDIV rddata[t] = rv_fdiv(rsdata[t][0], rsdata[t][1], frm, &fflags); - pipeline_state->fpu.type = FpuType::FDIV; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FDIV; + trace->used_fregs.set(rsrc0); + 
trace->used_fregs.set(rsrc1); break; case 0x2c: //FSQRT rddata[t] = rv_fsqrt(rsdata[t][0], frm, &fflags); - pipeline_state->fpu.type = FpuType::FSQRT; - pipeline_state->used_fregs[rsrc0] = 1; + trace->fpu.type = FpuType::FSQRT; + trace->used_fregs.set(rsrc0); break; case 0x10: switch (func3) { @@ -672,9 +649,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rddata[t] = rv_fsgnjx(rsdata[t][0], rsdata[t][1]); break; } - pipeline_state->fpu.type = FpuType::FNCP; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x14: if (func3) { @@ -684,9 +661,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { // FMIN.S rddata[t] = rv_fmin(rsdata[t][0], rsdata[t][1], &fflags); } - pipeline_state->fpu.type = FpuType::FNCP; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x60: if (rsrc1 == 0) { @@ -696,8 +673,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { // FCVT.WU.S rddata[t] = rv_ftou(rsdata[t][0], frm, &fflags); } - pipeline_state->fpu.type = FpuType::FCVT; - pipeline_state->used_fregs[rsrc0] = 1; + trace->fpu.type = FpuType::FCVT; + trace->used_fregs.set(rsrc0); break; case 0x70: if (func3) { @@ -707,8 +684,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { // FMV.X.W rddata[t] = rsdata[t][0]; } - pipeline_state->fpu.type = FpuType::FNCP; - pipeline_state->used_fregs[rsrc0] = 1; + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); break; case 0x50: switch(func3) { @@ -725,9 +702,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rddata[t] = rv_feq(rsdata[t][0], rsdata[t][1], &fflags); break; } - pipeline_state->fpu.type = FpuType::FNCP; 
- pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x68: if (rsrc1) { @@ -737,14 +714,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { // FCVT.S.W: rddata[t] = rv_itof(rsdata[t][0], frm, &fflags); } - pipeline_state->fpu.type = FpuType::FCVT; - pipeline_state->used_iregs[rsrc0] = 1; + trace->fpu.type = FpuType::FCVT; + trace->used_iregs.set(rsrc0); break; case 0x78: // FMV.W.X rddata[t] = rsdata[t][0]; - pipeline_state->fpu.type = FpuType::FNCP; - pipeline_state->used_iregs[rsrc0] = 1; + trace->fpu.type = FpuType::FNCP; + trace->used_iregs.set(rsrc0); break; } update_fcrs(fflags, core_, t, id_); @@ -755,10 +732,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { case FMSUB: case FMNMADD: case FMNMSUB: - pipeline_state->fpu.type = FpuType::FMA; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; - pipeline_state->used_fregs[rsrc2] = 1; + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); + trace->used_fregs.set(rsrc2); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -784,8 +761,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } rd_write = true; break; - case GPGPU: { - pipeline_state->exe_type = ExeType::GPU; + case GPGPU: { int ts = 0; for (int t = 0; t < num_threads; ++t) { if (tmask_.test(t)) { @@ -795,10 +771,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } switch (func3) { case 0: { - // TMC - pipeline_state->gpu.type = GpuType::TMC; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->stall_warp = true; + // TMC + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::TMC; + trace->used_iregs.set(rsrc0); + trace->fetch_stall = true; if (rsrc1) { // predicate mode ThreadMask pred; @@ 
-823,10 +800,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; case 1: { // WSPAWN - pipeline_state->gpu.type = GpuType::WSPAWN; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - pipeline_state->stall_warp = true; + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::WSPAWN; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + trace->fetch_stall = true; int active_warps = std::min(rsdata.at(ts)[0], core_->arch().num_warps()); DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]); for (int i = 1; i < active_warps; ++i) { @@ -837,9 +815,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; case 2: { // SPLIT - pipeline_state->gpu.type = GpuType::SPLIT; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->stall_warp = true; + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::SPLIT; + trace->used_iregs.set(rsrc0); + trace->fetch_stall = true; if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) { ThreadMask tmask; for (int i = 0; i < num_threads; ++i) { @@ -868,8 +847,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; case 3: { // JOIN - pipeline_state->gpu.type = GpuType::JOIN; - pipeline_state->stall_warp = true; + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::JOIN; + trace->fetch_stall = true; if (!domStack_.empty() && domStack_.top().unanimous) { DP(3, "*** Uninimous branch at join"); tmask_ = domStack_.top().tmask; @@ -893,18 +873,19 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; case 4: { // BAR - pipeline_state->gpu.type = GpuType::BAR; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - pipeline_state->stall_warp = true; + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::BAR; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + 
trace->fetch_stall = true; active_ = false; core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); } break; - case 6: { + case 5: { // PREFETCH - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::PREFETCH; - pipeline_state->used_iregs[rsrc0] = 1; + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::PREFETCH; + trace->used_iregs.set(rsrc0); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -915,7 +896,50 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { default: std::abort(); } - } break; + } break; + case GPU: { + switch (func3) { + case 0: { // TEX + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::TEX; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + trace->used_iregs.set(rsrc2); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + auto unit = func2; + auto u = rsdata[t][0]; + auto v = rsdata[t][1]; + auto lod = rsdata[t][2]; + auto color = core_->tex_read(unit, u, v, lod, &trace->mem_addrs.at(t)); + rddata[t] = color; + } + rd_write = true; + } break; + case 1: + switch (func2) { + case 0: { // CMOV + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::CMOV; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + trace->used_iregs.set(rsrc2); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = rsdata[t][0] ? 
rsdata[t][1] : rsdata[t][2]; + } + rd_write = true; + } break; + default: + std::abort(); + } + break; + default: + std::abort(); + } + } break; case VSET: { int VLEN = core_->arch().vsize() * 8; int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); @@ -966,7 +990,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 24: { - //vmseq + // vmseq auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -997,7 +1021,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 25: { - //vmsne + // vmsne auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1028,7 +1052,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 26: { - //vmsltu + // vmsltu auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1059,7 +1083,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 27: { - //vmslt + // vmslt auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1090,7 +1114,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 28: { - //vmsleu + // vmsleu auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1121,7 +1145,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 29: { - //vmsle + // vmsle auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1152,7 +1176,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 30: { - //vmsgtu + // vmsgtu auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1183,7 +1207,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t 
*pipeline_state) { } } break; case 31: { - //vmsgt + // vmsgt auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1356,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 27: { - //vmxor + // vmxor auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1402,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 28: { - //vmornot + // vmornot auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1448,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 29: { - //vmnand + // vmnand auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1494,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 30: { - //vmnor + // vmnor auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1540,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 31: { - //vmxnor + // vmxnor auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1586,7 +1610,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 37: { - //vmul + // vmul auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1769,7 +1793,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } if (rd_write) { - pipeline_state->wb = true; + trace->wb = true; DPH(2, "Dest Reg: "); auto rdt = instr.getRDType(); switch (rdt) { @@ -1786,7 +1810,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); - 
pipeline_state->used_iregs[rdest] = 1; + trace->used_iregs[rdest] = 1; } break; case RegType::Float: @@ -1801,7 +1825,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); - pipeline_state->used_fregs[rdest] = 1; + trace->used_fregs[rdest] = 1; break; default: std::abort(); diff --git a/sim/simX/exeunit.cpp b/sim/simX/exeunit.cpp index ba280812..1d0a3cfc 100644 --- a/sim/simX/exeunit.cpp +++ b/sim/simX/exeunit.cpp @@ -6,16 +6,18 @@ #include #include "debug.h" #include "core.h" +#include "constants.h" using namespace vortex; NopUnit::NopUnit(Core*) : ExeUnit("NOP") {} void NopUnit::step(uint64_t /*cycle*/) { - pipeline_state_t state; - if (!inputs_.try_pop(&state)) + if (inputs_.empty()) return; - this->schedule_output(state, 1); + auto trace = inputs_.top(); + this->schedule_output(trace, 1); + inputs_.pop(); } /////////////////////////////////////////////////////////////////////////////// @@ -33,19 +35,23 @@ void LsuUnit::step(uint64_t cycle) { // handle dcache response for (uint32_t t = 0; t < num_threads_; ++t) { - MemRsp mem_rsp; - if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp)) + auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0); + if (dcache_rsp_port.empty()) continue; - auto& entry = pending_dcache_.at(mem_rsp.tag); - DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first); - assert(entry.second.test(t)); - entry.second.reset(t); // track remaining blocks - if (!entry.second.any()) { - auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); - entry.first.dcache_latency = latency; - this->schedule_output(entry.first, 1); + auto& mem_rsp = dcache_rsp_port.top(); + auto& entry = pending_dcache_.at(mem_rsp.tag); + auto trace = entry.first; + DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" 
<< trace->lsu.type + << ", tid=" << t << ", " << *trace); + assert(entry.second); + --entry.second; // track remaining blocks + if (0 == entry.second) { + auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency); + trace->dcache_latency = latency; + this->schedule_output(trace, 1); pending_dcache_.release(mem_rsp.tag); - } + } + dcache_rsp_port.pop(); } if (fence_lock_) { @@ -61,36 +67,83 @@ void LsuUnit::step(uint64_t cycle) { if (inputs_.empty()) return; - auto state = inputs_.top(); + auto trace = inputs_.top(); - if (state.lsu.type == LsuType::FENCE) { + if (trace->lsu.type == LsuType::FENCE) { // schedule fence lock - fence_state_ = state; - fence_lock_ = true; - inputs_.pop(); - DT(3, cycle, "fence-lock: " << state); + fence_state_ = trace; + fence_lock_ = true; + DT(3, cycle, "fence-lock: " << *trace); + // remove input + inputs_.pop(); return; } // check pending queue capacity - if (pending_dcache_.full()) { - DT(3, cycle, "*** lsu-queue-stall: " << state); + if (!trace->check_stalled(pending_dcache_.full())) { + DT(3, cycle, "*** lsu-queue-stall: " << *trace); + } + if (pending_dcache_.full()) return; + + // send memory request + + bool has_shared_memory = false; + bool mem_rsp_pending = false; + bool is_write = (trace->lsu.type == LsuType::STORE); + + uint32_t valid_addrs = 0; + for (auto& mem_addr : trace->mem_addrs) { + valid_addrs += mem_addr.size(); + } + + trace->dcache_latency = SimPlatform::instance().cycles(); + auto tag = pending_dcache_.allocate({trace, valid_addrs}); + + for (uint32_t t = 0; t < num_threads_; ++t) { + if (!trace->tmask.test(t)) + continue; + + auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0); + for (auto mem_addr : trace->mem_addrs.at(t)) { + // check shared memory address + if (SM_ENABLE) { + if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE)) + && (mem_addr < SMEM_BASE_ADDR)) { + DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag + << ", type=" << trace->lsu.type << ", tid=" 
<< t << ", " << *trace); + has_shared_memory = true; + continue; + } + } + + bool is_io = (mem_addr >= IO_BASE_ADDR); + + MemReq mem_req; + mem_req.addr = mem_addr; + mem_req.write = is_write; + mem_req.tag = tag; + mem_req.is_io = is_io; + dcache_req_port.send(mem_req, 1); + DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag + << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace); + // do not wait on writes + mem_rsp_pending = !is_write; + } } - // send dcache request - state.dcache_latency = SimPlatform::instance().cycles(); - auto tag = pending_dcache_.allocate({state, state.tmask}); - for (uint32_t t = 0; t < num_threads_; ++t) { - if (!state.tmask.test(t)) - continue; - MemReq mem_req; - mem_req.addr = state.mem_addrs.at(t); - mem_req.write = (state.lsu.type == LsuType::STORE); - mem_req.tag = tag; - core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1); - DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state); - } + // do not wait + if (!mem_rsp_pending) { + pending_dcache_.release(tag); + uint32_t delay = 1; + if (has_shared_memory) { + // all threads accessed shared memory + delay += Constants::SMEM_DELAY; + } + this->schedule_output(trace, delay); + } + + // remove input inputs_.pop(); } @@ -98,23 +151,27 @@ void LsuUnit::step(uint64_t cycle) { AluUnit::AluUnit(Core*) : ExeUnit("ALU") {} -void AluUnit::step(uint64_t /*cycle*/) { - pipeline_state_t state; - if (!inputs_.try_pop(&state)) +void AluUnit::step(uint64_t /*cycle*/) { + if (inputs_.empty()) return; - switch (state.alu.type) { - case AluType::ARITH: - this->schedule_output(state, 1); - break; + auto trace = inputs_.top(); + switch (trace->alu.type) { + case AluType::ARITH: case AluType::BRANCH: - this->schedule_output(state, 1); + case AluType::CMOV: + this->schedule_output(trace, 1); + inputs_.pop(); break; case AluType::IMUL: - 
this->schedule_output(state, LATENCY_IMUL); + this->schedule_output(trace, LATENCY_IMUL); + inputs_.pop(); break; case AluType::IDIV: - this->schedule_output(state, XLEN); + this->schedule_output(trace, XLEN); + inputs_.pop(); break; + default: + std::abort(); } } @@ -123,10 +180,11 @@ void AluUnit::step(uint64_t /*cycle*/) { CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {} void CsrUnit::step(uint64_t /*cycle*/) { - pipeline_state_t state; - if (!inputs_.try_pop(&state)) + if (inputs_.empty()) return; - this->schedule_output(state, 1); + auto trace = inputs_.top(); + this->schedule_output(trace, 1); + inputs_.pop(); } /////////////////////////////////////////////////////////////////////////////// @@ -134,46 +192,127 @@ void CsrUnit::step(uint64_t /*cycle*/) { FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {} void FpuUnit::step(uint64_t /*cycle*/) { - pipeline_state_t state; - if (!inputs_.try_pop(&state)) + if (inputs_.empty()) return; - switch (state.fpu.type) { + auto trace = inputs_.top(); + switch (trace->fpu.type) { case FpuType::FNCP: - this->schedule_output(state, 1); + this->schedule_output(trace, 1); + inputs_.pop(); break; case FpuType::FMA: - this->schedule_output(state, LATENCY_FMA); + this->schedule_output(trace, LATENCY_FMA); + inputs_.pop(); break; case FpuType::FDIV: - this->schedule_output(state, LATENCY_FDIV); + this->schedule_output(trace, LATENCY_FDIV); + inputs_.pop(); break; case FpuType::FSQRT: - this->schedule_output(state, LATENCY_FSQRT); + this->schedule_output(trace, LATENCY_FSQRT); + inputs_.pop(); break; case FpuType::FCVT: - this->schedule_output(state, LATENCY_FCVT); + this->schedule_output(trace, LATENCY_FCVT); + inputs_.pop(); break; + default: + std::abort(); } } /////////////////////////////////////////////////////////////////////////////// -GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {} +GpuUnit::GpuUnit(Core* core) + : ExeUnit("GPU") + , core_(core) + , num_threads_(core->arch().num_threads()) + , pending_tex_reqs_(TEXQ_SIZE) +{} -void 
GpuUnit::step(uint64_t /*cycle*/) { - pipeline_state_t state; - if (!inputs_.try_pop(&state)) +void GpuUnit::step(uint64_t cycle) { + __unused (cycle); +#ifdef EXT_TEX_ENABLE + // handle memory response + for (uint32_t t = 0; t < num_threads_; ++t) { + auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1); + if (dcache_rsp_port.empty()) + continue; + auto& mem_rsp = dcache_rsp_port.top(); + auto& entry = pending_tex_reqs_.at(mem_rsp.tag); + auto trace = entry.first; + DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace); + assert(entry.second); + --entry.second; // track remaining blocks + if (0 == entry.second) { + auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency); + trace->dcache_latency = latency; + this->schedule_output(trace, 1); + pending_tex_reqs_.release(mem_rsp.tag); + } + dcache_rsp_port.pop(); + } +#endif + + // check input queue + if (inputs_.empty()) return; - switch (state.gpu.type) { + + auto trace = inputs_.top(); + + switch (trace->gpu.type) { case GpuType::TMC: case GpuType::WSPAWN: case GpuType::SPLIT: case GpuType::JOIN: case GpuType::BAR: - this->schedule_output(state, 1); - break; - case GpuType::TEX: - /* TODO */ + this->schedule_output(trace, 1); + inputs_.pop(); break; + case GpuType::TEX: { + if (this->processTexRequest(cycle, trace)) + inputs_.pop(); + } break; + default: + std::abort(); } +} + +bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { + __unused (cycle); + + // check pending queue capacity + if (!trace->check_stalled(pending_tex_reqs_.full())) { + DT(3, cycle, "*** tex-queue-stall: " << *trace); + } + if (pending_tex_reqs_.full()) + return false; + + // send memory request + + uint32_t valid_addrs = 0; + for (auto& mem_addr : trace->mem_addrs) { + valid_addrs += mem_addr.size(); + } + + trace->tex_latency = SimPlatform::instance().cycles(); + auto tag = pending_tex_reqs_.allocate({trace, valid_addrs}); + + for (uint32_t t = 0; t < 
num_threads_; ++t) { + if (!trace->tmask.test(t)) + continue; + + auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1); + for (auto mem_addr : trace->mem_addrs.at(t)) { + MemReq mem_req; + mem_req.addr = mem_addr; + mem_req.write = (trace->lsu.type == LsuType::STORE); + mem_req.tag = tag; + dcache_req_port.send(mem_req, 1); + DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag + << ", tid=" << t << ", "<< trace); + } + } + + return true; } \ No newline at end of file diff --git a/sim/simX/exeunit.h b/sim/simX/exeunit.h index 3b2bbf91..83e69463 100644 --- a/sim/simX/exeunit.h +++ b/sim/simX/exeunit.h @@ -11,36 +11,43 @@ class Core; class ExeUnit { protected: const char* name_; - Queue inputs_; - Queue outputs_; + Queue inputs_; + Queue outputs_; - void schedule_output(const pipeline_state_t& state, uint32_t delay) { + void schedule_output(pipeline_trace_t* trace, uint32_t delay) { if (delay > 1) { SimPlatform::instance().schedule( - [&](const pipeline_state_t& req) { + [&](pipeline_trace_t* req) { outputs_.push(req); }, - state, + trace, (delay - 1) ); } else { - outputs_.push(state); + outputs_.push(trace); } } public: typedef std::shared_ptr Ptr; - ExeUnit(const char* name) : name_(name) {} - + ExeUnit(const char* name) : name_(name) {} virtual ~ExeUnit() {} - void push_input(const pipeline_state_t& state) { - inputs_.push(state); + void push(pipeline_trace_t* trace) { + inputs_.push(trace); } - bool pop_output(pipeline_state_t* state) { - return outputs_.try_pop(state); + bool empty() const { + return outputs_.empty(); + } + + pipeline_trace_t* top() const { + return outputs_.top(); + } + + void pop() { + outputs_.pop(); } virtual void step(uint64_t cycle) = 0; @@ -61,8 +68,8 @@ class LsuUnit : public ExeUnit { private: Core* core_; uint32_t num_threads_; - HashTable> pending_dcache_; - pipeline_state_t fence_state_; + HashTable> pending_dcache_; + pipeline_trace_t* fence_state_; bool fence_lock_; public: @@ -101,6 +108,13 @@ 
public: /////////////////////////////////////////////////////////////////////////////// class GpuUnit : public ExeUnit { +private: + Core* core_; + uint32_t num_threads_; + HashTable> pending_tex_reqs_; + + bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace); + public: GpuUnit(Core*); diff --git a/sim/simX/ibuffer.h b/sim/simX/ibuffer.h index 86bdeed7..b4c6f51e 100644 --- a/sim/simX/ibuffer.h +++ b/sim/simX/ibuffer.h @@ -7,7 +7,7 @@ namespace vortex { class IBuffer { private: - std::queue entries_; + std::queue entries_; uint32_t capacity_; public: @@ -23,12 +23,12 @@ public: return (entries_.size() == capacity_); } - const pipeline_state_t& top() const { + pipeline_trace_t* top() const { return entries_.front(); } - void push(const pipeline_state_t& state) { - entries_.emplace(state); + void push(pipeline_trace_t* trace) { + entries_.emplace(trace); } void pop() { diff --git a/sim/simX/instr.h b/sim/simX/instr.h index 5deace6c..334b8565 100644 --- a/sim/simX/instr.h +++ b/sim/simX/instr.h @@ -29,10 +29,9 @@ enum Opcode { FMNMADD = 0x4f, // Vector Extension VSET = 0x57, - VL = 0x7, - VS = 0x27, // GPGPU Extension GPGPU = 0x6b, + GPU = 0x5b, }; enum InstType { @@ -70,6 +69,7 @@ public: void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; } void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; } void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; } + void setFunc2(Word func2) { func2_ = func2; } void setFunc3(Word func3) { func3_ = func3; } void setFunc7(Word func7) { func7_ = func7; } void setImm(Word imm) { has_imm_ = true; imm_ = imm; } @@ -85,6 +85,7 @@ public: /* Getters used by encoders. 
*/ Opcode getOpcode() const { return opcode_; } + Word getFunc2() const { return func2_; } Word getFunc3() const { return func3_; } Word getFunc6() const { return func6_; } Word getFunc7() const { return func7_; } @@ -118,6 +119,7 @@ private: RegType rsrc_type_[MAX_REG_SOURCES]; int rsrc_[MAX_REG_SOURCES]; int rdest_; + Word func2_; Word func3_; Word func6_; diff --git a/sim/simX/memsim.cpp b/sim/simX/memsim.cpp index 63ba571a..6559000d 100644 --- a/sim/simX/memsim.cpp +++ b/sim/simX/memsim.cpp @@ -20,14 +20,16 @@ public: void step(uint64_t /*cycle*/) { for (uint32_t i = 0, n = num_banks_; i < n; ++i) { - MemReq mem_req; - if (!simobject_->MemReqPorts.at(i).read(&mem_req)) + auto& mem_req_port = simobject_->MemReqPorts.at(i); + if (mem_req_port.empty()) continue; + auto& mem_req = mem_req_port.top(); if (!mem_req.write) { MemRsp mem_rsp; mem_rsp.tag = mem_req.tag; simobject_->MemRspPorts.at(i).send(mem_rsp, latency_); } + mem_req_port.pop(); } } }; diff --git a/sim/simX/memsim.h b/sim/simX/memsim.h index 24d8e6ca..3d5b33fe 100644 --- a/sim/simX/memsim.h +++ b/sim/simX/memsim.h @@ -10,10 +10,22 @@ struct MemReq { uint64_t addr; uint32_t tag; bool write; + bool is_io; + + MemReq(uint64_t _addr = 0, + uint64_t _tag = 0, + bool _write = false, + bool _is_io = false + ) : addr(_addr) + , tag(_tag) + , write(_write) + , is_io(_is_io) + {} }; struct MemRsp { - uint32_t tag; + uint64_t tag; + MemRsp(uint64_t _tag = 0) : tag (_tag) {} }; class MemSim : public SimObject{ diff --git a/sim/simX/pipeline.h b/sim/simX/pipeline.h index b5937b29..a5bf6d52 100644 --- a/sim/simX/pipeline.h +++ b/sim/simX/pipeline.h @@ -5,11 +5,12 @@ #include #include #include "types.h" +#include "archdef.h" #include "debug.h" namespace vortex { -struct pipeline_state_t { +struct pipeline_trace_t { //-- uint64_t id; @@ -20,17 +21,24 @@ struct pipeline_state_t { Word PC; //-- - bool stall_warp; + bool fetch_stall; + bool pipeline_stall; + + //-- bool wb; RegType rdest_type; int rdest; + + //-- RegMask 
used_iregs; RegMask used_fregs; RegMask used_vregs; //- ExeType exe_type; - std::vector mem_addrs; + + //-- + std::vector> mem_addrs; //-- union { @@ -51,27 +59,37 @@ struct pipeline_state_t { // stats uint64_t icache_latency; uint64_t dcache_latency; + uint64_t tex_latency; - void clear() { + pipeline_trace_t(uint64_t id_, const ArchDef& arch) { + id = id_; cid = 0; wid = 0; tmask.reset(); - PC = 0; - stall_warp = false; - wb = false; + PC = 0; + fetch_stall = false; + pipeline_stall = false; + wb = false; rdest = 0; rdest_type = RegType::None; used_iregs.reset(); used_fregs.reset(); used_vregs.reset(); exe_type = ExeType::NOP; - mem_addrs.clear(); + mem_addrs.resize(arch.num_threads()); icache_latency = 0; dcache_latency = 0; + tex_latency = 0; + } + + bool check_stalled(bool stall) { + bool old = pipeline_stall; + pipeline_stall = stall; + return stall ? old : true; } }; -inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) { +inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) { os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC; os << ", wb=" << state.wb; if (state.wb) { @@ -82,10 +100,9 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) return os; } -class PipelineStage : public Queue { +class PipelineStage : public Queue { protected: const char* name_; - friend std::ostream &operator<<(std::ostream &, const pipeline_state_t&); public: PipelineStage(const char* name = nullptr) diff --git a/sim/simX/processor.cpp b/sim/simX/processor.cpp index be5cd4f4..7b54b505 100644 --- a/sim/simX/processor.cpp +++ b/sim/simX/processor.cpp @@ -33,7 +33,8 @@ Processor::Processor(const ArchDef& arch) L3_NUM_BANKS, // number of banks L3_NUM_PORTS, // number of ports NUM_CLUSTERS, // request size - true, // write-throught + true, // write-through + false, // write response 0, // victim size L3_MSHR_SIZE, // mshr 2, // pipeline latency @@ -74,7 +75,8 @@ 
Processor::Processor(const ArchDef& arch) L2_NUM_BANKS, // number of banks L2_NUM_PORTS, // number of ports NUM_CORES, // request size - true, // write-throught + true, // write-through + false, // write response 0, // victim size L2_MSHR_SIZE, // mshr 2, // pipeline latency @@ -129,7 +131,7 @@ int Processor::run() { if (core->running()) { running = true; } - if (core->check_ebreak()) { + if (core->check_exit()) { exitcode = core->getIRegValue(3); running = false; break; @@ -137,5 +139,7 @@ int Processor::run() { } } while (running); + std::cout << std::flush; + return exitcode; } \ No newline at end of file diff --git a/sim/simX/scoreboard.h b/sim/simX/scoreboard.h index 46bf3bdc..95ba0700 100644 --- a/sim/simX/scoreboard.h +++ b/sim/simX/scoreboard.h @@ -7,6 +7,12 @@ namespace vortex { class Scoreboard { private: + struct reg_use_t { + RegType type; + uint32_t reg; + uint64_t owner; + }; + std::vector in_use_iregs_; std::vector in_use_fregs_; std::vector in_use_vregs_; @@ -25,21 +31,21 @@ public: } } - bool in_use(const pipeline_state_t& state) const { - return (state.used_iregs & in_use_iregs_.at(state.wid)) != 0 - || (state.used_fregs & in_use_fregs_.at(state.wid)) != 0 - || (state.used_vregs & in_use_vregs_.at(state.wid)) != 0; + bool in_use(pipeline_trace_t* state) const { + return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0 + || (state->used_fregs & in_use_fregs_.at(state->wid)) != 0 + || (state->used_vregs & in_use_vregs_.at(state->wid)) != 0; } - std::vector owners(const pipeline_state_t& state) const { - std::vector out; + std::vector get_uses(pipeline_trace_t* state) const { + std::vector out; { uint32_t r = 0; - auto used_iregs = state.used_iregs & in_use_iregs_.at(state.wid); + auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid); while (used_iregs.any()) { if (used_iregs.test(0)) { - uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Integer; - out.push_back(owners_.at(tag)); + uint32_t tag = (r << 16) | (state->wid 
<< 4) | (int)RegType::Integer; + out.push_back({RegType::Integer, r, owners_.at(tag)}); } used_iregs >>= 1; ++r; @@ -47,11 +53,11 @@ public: } { uint32_t r = 0; - auto used_fregs = state.used_fregs & in_use_fregs_.at(state.wid); + auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid); while (used_fregs.any()) { if (used_fregs.test(0)) { - uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Float; - out.push_back(owners_.at(tag)); + uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float; + out.push_back({RegType::Float, r, owners_.at(tag)}); } used_fregs >>= 1; ++r; @@ -59,11 +65,11 @@ public: } { uint32_t r = 0; - auto used_vregs = state.used_vregs & in_use_vregs_.at(state.wid); + auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid); while (used_vregs.any()) { if (used_vregs.test(0)) { - uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Vector; - out.push_back(owners_.at(tag)); + uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector; + out.push_back({RegType::Vector, r, owners_.at(tag)}); } used_vregs >>= 1; ++r; @@ -72,44 +78,44 @@ public: return std::move(out); } - void reserve(const pipeline_state_t& state) { - if (!state.wb) + void reserve(pipeline_trace_t* state) { + if (!state->wb) return; - switch (state.rdest_type) { + switch (state->rdest_type) { case RegType::Integer: - in_use_iregs_.at(state.wid).set(state.rdest); + in_use_iregs_.at(state->wid).set(state->rdest); break; case RegType::Float: - in_use_fregs_.at(state.wid).set(state.rdest); + in_use_fregs_.at(state->wid).set(state->rdest); break; case RegType::Vector: - in_use_vregs_.at(state.wid).set(state.rdest); + in_use_vregs_.at(state->wid).set(state->rdest); break; default: break; } - uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type; + uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; assert(owners_.count(tag) == 0); - owners_[tag] = state.id; + owners_[tag] = state->id; 
} - void release(const pipeline_state_t& state) { - if (!state.wb) + void release(pipeline_trace_t* state) { + if (!state->wb) return; - switch (state.rdest_type) { + switch (state->rdest_type) { case RegType::Integer: - in_use_iregs_.at(state.wid).reset(state.rdest); + in_use_iregs_.at(state->wid).reset(state->rdest); break; case RegType::Float: - in_use_fregs_.at(state.wid).reset(state.rdest); + in_use_fregs_.at(state->wid).reset(state->rdest); break; case RegType::Vector: - in_use_vregs_.at(state.wid).reset(state.rdest); + in_use_vregs_.at(state->wid).reset(state->rdest); break; default: break; } - uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type; + uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; owners_.erase(tag); } }; diff --git a/sim/simX/tex_unit.cpp b/sim/simX/tex_unit.cpp new file mode 100644 index 00000000..d73bd728 --- /dev/null +++ b/sim/simX/tex_unit.cpp @@ -0,0 +1,91 @@ +#include "tex_unit.h" +#include "core.h" +#include +#include + +using namespace vortex; + +enum class FilterMode { + Point, + Bilinear, + Trilinear, +}; + +TexUnit::TexUnit(Core* core) : core_(core) {} + +TexUnit::~TexUnit() {} + +uint32_t TexUnit::get_state(uint32_t state) { + return states_.at(state); +} + +void TexUnit::set_state(uint32_t state, uint32_t value) { + states_.at(state) = value; +} + +uint32_t TexUnit::read(int32_t u, + int32_t v, + int32_t lod, + std::vector* mem_addrs) { + //-- + auto xu = Fixed::make(u); + auto xv = Fixed::make(v); + uint32_t base_addr = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod)); + uint32_t log_width = std::max(states_.at(TEX_STATE_WIDTH) - lod, 0); + uint32_t log_height = std::max(states_.at(TEX_STATE_HEIGHT) - lod, 0); + auto format = (TexFormat)states_.at(TEX_STATE_FORMAT); + auto filter = (FilterMode)states_.at(TEX_STATE_FILTER); + auto wrapu = (WrapMode)states_.at(TEX_STATE_WRAPU); + auto wrapv = (WrapMode)states_.at(TEX_STATE_WRAPV); + + auto stride = 
Stride(format); + + switch (filter) { + case FilterMode::Bilinear: { + // addressing + uint32_t offset00, offset01, offset10, offset11; + uint32_t alpha, beta; + TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, + &offset00, &offset01, &offset10, &offset11, &alpha, &beta); + + uint32_t addr00 = base_addr + offset00 * stride; + uint32_t addr01 = base_addr + offset01 * stride; + uint32_t addr10 = base_addr + offset10 * stride; + uint32_t addr11 = base_addr + offset11 * stride; + + // memory lookup + uint32_t texel00 = core_->dcache_read(addr00, stride); + uint32_t texel01 = core_->dcache_read(addr01, stride); + uint32_t texel10 = core_->dcache_read(addr10, stride); + uint32_t texel11 = core_->dcache_read(addr11, stride); + + mem_addrs->push_back(addr00); + mem_addrs->push_back(addr01); + mem_addrs->push_back(addr10); + mem_addrs->push_back(addr11); + + // filtering + auto color = TexFilterLinear( + format, texel00, texel01, texel10, texel11, alpha, beta); + return color; + } + case FilterMode::Point: { + // addressing + uint32_t offset; + TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset); + + uint32_t addr = base_addr + offset * stride; + + // memory lookup + uint32_t texel = core_->dcache_read(addr, stride); + mem_addrs->push_back(addr); + + // filtering + auto color = TexFilterPoint(format, texel); + return color; + } + default: + std::abort(); + return 0; + } +} \ No newline at end of file diff --git a/sim/simX/tex_unit.h b/sim/simX/tex_unit.h new file mode 100644 index 00000000..759dda2a --- /dev/null +++ b/sim/simX/tex_unit.h @@ -0,0 +1,26 @@ +#pragma once + +#include "types.h" + +namespace vortex { + +class Core; + +class TexUnit { +public: + TexUnit(Core* core); + ~TexUnit(); + + uint32_t get_state(uint32_t state); + + void set_state(uint32_t state, uint32_t value); + + uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector* mem_addrs); + +private: + + std::array states_; + Core* core_; +}; + +} \ No newline at end of file 
diff --git a/sim/simX/types.h b/sim/simX/types.h index f53c3754..d4feb1cb 100644 --- a/sim/simX/types.h +++ b/sim/simX/types.h @@ -66,6 +66,7 @@ enum class AluType { BRANCH, IMUL, IDIV, + CMOV, }; inline std::ostream &operator<<(std::ostream &os, const AluType& type) { @@ -74,6 +75,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) { case AluType::BRANCH: os << "BRANCH"; break; case AluType::IMUL: os << "IMUL"; break; case AluType::IDIV: os << "IDIV"; break; + case AluType::CMOV: os << "CMOV"; break; } return os; } @@ -155,8 +157,6 @@ class Queue { protected: std::queue queue_; - uint32_t count; - public: Queue() {} @@ -168,21 +168,16 @@ public: return queue_.front(); } - void push(const T& value) { - ++count; - queue_.push(value); + T& top() { + return queue_.front(); } void pop() { queue_.pop(); } - bool try_pop(T* value) { - if (queue_.empty()) - return false; - *value = queue_.front(); - queue_.pop(); - return true; + void push(const T& value) { + queue_.push(value); } }; @@ -244,14 +239,6 @@ public: entry.first = false; --capacity_; } - - void remove(uint32_t index, T* value) { - auto& entry = entries_.at(index); - assert(entry.first); - *value = entry.second; - entry.first = false; - --capacity_; - } }; /////////////////////////////////////////////////////////////////////////////// @@ -259,18 +246,7 @@ public: template class Switch : public SimObject> { private: - struct req_batch_t { - std::vector data; - std::bitset valid; - req_batch_t() {} - req_batch_t(uint32_t size) - : data(size) - , valid(0) - {} - }; - ArbiterType type_; - std::queue reqq_; uint32_t delay_; uint32_t cursor_; uint32_t tag_shift_; @@ -295,55 +271,43 @@ public: { assert(delay_ != 0); assert(num_inputs <= MaxInputs); + if (num_inputs == 1) { + // bypass + ReqIn.at(0).bind(&ReqOut); + RspIn.bind(&RspOut.at(0)); + } } - void step(uint64_t /*cycle*/) { - // process incomming requests - { - req_batch_t req_batch(ReqIn.size()); - for (uint32_t i = 0, n = ReqIn.size(); 
i < n; ++i) { - Req req; - if (ReqIn.at(i).read(&req)) { - req_batch.data.at(i) = req; - req_batch.valid.set(i); + void step(uint64_t /*cycle*/) { + if (ReqIn.size() == 1) + return; + + // process incomming requests + for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) { + uint32_t j = (cursor_ + i) % n; + auto& req_in = ReqIn.at(j); + if (!req_in.empty()) { + auto& req = req_in.top(); + if (tag_shift_) { + req.tag = (req.tag << tag_shift_) | j; } + ReqOut.send(req, delay_); + req_in.pop(); + this->update_cursor(j); + break; } - if (req_batch.valid.any()) { - reqq_.push(req_batch); - } - } - - // apply arbitration - if (!reqq_.empty()) { - auto& req_batch = reqq_.front(); - for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) { - auto j = (cursor_ + i) % n; - if (req_batch.valid.test(j)) { - auto& req = req_batch.data.at(j); - if (tag_shift_) { - req.tag = (req.tag << tag_shift_) | j; - } - ReqOut.send(req, delay_); - req_batch.valid.reset(j); - this->update_cursor(j); - if (!req_batch.valid.any()) - reqq_.pop(); // pop when empty - break; - } - } } // process incoming reponses - { - Rsp rsp; - if (RspIn.read(&rsp)) { - uint32_t port_id = 0; - if (tag_shift_) { - port_id = rsp.tag & ((1 << tag_shift_)-1); - rsp.tag >>= tag_shift_; - } - RspOut.at(port_id).send(rsp, 1); - } + if (!RspIn.empty()) { + auto& rsp = RspIn.top(); + uint32_t port_id = 0; + if (tag_shift_) { + port_id = rsp.tag & ((1 << tag_shift_)-1); + rsp.tag >>= tag_shift_; + } + RspOut.at(port_id).send(rsp, 1); + RspIn.pop(); } } diff --git a/sim/simX/warp.cpp b/sim/simX/warp.cpp index 89b9cc39..0392c1b9 100644 --- a/sim/simX/warp.cpp +++ b/sim/simX/warp.cpp @@ -21,7 +21,7 @@ Warp::Warp(Core *core, Word id) vRegFile_.resize(core_->arch().num_regs(), std::vector(core_->arch().vsize(), 0)); } -void Warp::eval(pipeline_state_t *pipeline_state) { +void Warp::eval(pipeline_trace_t *trace) { assert(tmask_.any()); DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask="); @@ -38,18 +38,18 
@@ void Warp::eval(pipeline_state_t *pipeline_state) { std::abort(); } - DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr); + DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr << " (#" << trace->id << ")"); - // Update state - pipeline_state->cid = core_->id(); - pipeline_state->wid = id_; - pipeline_state->PC = PC_; - pipeline_state->tmask = tmask_; - pipeline_state->rdest = instr->getRDest(); - pipeline_state->rdest_type = instr->getRDType(); + // Update trace + trace->cid = core_->id(); + trace->wid = id_; + trace->PC = PC_; + trace->tmask = tmask_; + trace->rdest = instr->getRDest(); + trace->rdest_type = instr->getRDType(); // Execute - this->execute(*instr, pipeline_state); + this->execute(*instr, trace); DP(4, "Register state:"); for (int i = 0; i < core_->arch().num_regs(); ++i) { diff --git a/sim/simX/warp.h b/sim/simX/warp.h index 99b372ca..5af5eb02 100644 --- a/sim/simX/warp.h +++ b/sim/simX/warp.h @@ -9,7 +9,7 @@ namespace vortex { class Core; class Instr; -class pipeline_state_t; +class pipeline_trace_t; struct DomStackEntry { DomStackEntry(const ThreadMask &tmask, Word PC) : tmask(tmask) @@ -83,11 +83,11 @@ public: return iRegFile_.at(0).at(reg); } - void eval(pipeline_state_t *); + void eval(pipeline_trace_t *); private: - void execute(const Instr &instr, pipeline_state_t *pipeline_state); + void execute(const Instr &instr, pipeline_trace_t *trace); Word id_; Core *core_; diff --git a/sim/vlsim/Makefile b/sim/vlsim/Makefile index ce01395d..57e114a2 100644 --- a/sim/vlsim/Makefile +++ b/sim/vlsim/Makefile @@ -24,7 +24,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_FLAGS += $(DBG_TRACE_FLAGS) -DBG_FLAGS += -DDBG_CACHE_REQ_INFO SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp @@ -51,10 +50,13 @@ CXXFLAGS += $(CONFIGS) #THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') 
#VL_FLAGS += --threads $(THREADS) +# Enable VCD trace +#VCD_TRACE = -DVCD_OUTPUT + # Debugigng ifdef DEBUG - VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS) - CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS) + VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS) + CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG CXXFLAGS += -O2 -DNDEBUG diff --git a/tests/regression/tex/Makefile b/tests/regression/tex/Makefile index 8b313d25..1a771373 100644 --- a/tests/regression/tex/Makefile +++ b/tests/regression/tex/Makefile @@ -9,8 +9,8 @@ VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy -VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections -VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw +VX_CFLAGS += -std=c++11 -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections +VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a @@ -21,7 +21,7 @@ CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors CXXFLAGS += -DLUPNG_USE_ZLIB -CXXFLAGS += -I$(VORTEX_DRV_PATH)/include +CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex -lz @@ -38,7 +38,7 @@ kernel.bin: kernel.elf $(VX_CP) -O binary kernel.elf kernel.bin kernel.elf: $(VX_SRCS) - $(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf + $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ diff --git a/tests/regression/tex/common.h b/tests/regression/tex/common.h 
index 2abb7234..1a7f53d0 100644 --- a/tests/regression/tex/common.h +++ b/tests/regression/tex/common.h @@ -1,25 +1,27 @@ #ifndef _COMMON_H_ #define _COMMON_H_ +#include + #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 typedef struct { - uint32_t num_tasks; - uint8_t format; - uint8_t filter; - uint8_t wrap; - uint8_t use_sw; - uint32_t lod; - uint8_t src_logWidth; - uint8_t src_logHeight; - uint8_t src_stride; - uint8_t src_pitch; - uint32_t src_ptr; - uint32_t dst_width; - uint32_t dst_height; - uint8_t dst_stride; - uint32_t dst_pitch; - uint32_t dst_ptr; + bool use_sw; + uint32_t num_tasks; + uint8_t format; + uint8_t filter; + uint8_t wrapu; + uint8_t wrapv; + uint8_t src_logwidth; + uint8_t src_logheight; + uint32_t src_addr; + float lod; + uint32_t mip_offs[TEX_LOD_MAX+1]; + uint32_t dst_width; + uint32_t dst_height; + uint8_t dst_stride; + uint32_t dst_pitch; + uint32_t dst_addr; } kernel_arg_t; #endif \ No newline at end of file diff --git a/tests/regression/tex/kernel.c b/tests/regression/tex/kernel.c index bd0cebb4..9aaaad24 100644 --- a/tests/regression/tex/kernel.c +++ b/tests/regression/tex/kernel.c @@ -1,11 +1,9 @@ #include #include #include -#include "common.h" +#include #include "texsw.h" -#define ENABLE_SW - typedef struct { kernel_arg_t* state; uint32_t tile_width; @@ -14,29 +12,50 @@ typedef struct { float deltaY; } tile_arg_t; +template +struct static_for_t { + template + inline void operator()(const Fn& callback) const { + callback(Start); + static_for_t()(callback); + } +}; + +template +struct static_for_t { + template + inline void operator()(const Fn& callback) const {} +}; + void kernel_body(int task_id, tile_arg_t* arg) { kernel_arg_t* state = arg->state; uint32_t xoffset = 0; - uint32_t yoffset = task_id * arg->tile_height; - uint8_t* dst_ptr = (uint8_t*)(state->dst_ptr + xoffset * state->dst_stride + yoffset * state->dst_pitch); + uint32_t yoffset = task_id * arg->tile_height; - float fv = yoffset * arg->deltaY; + uint8_t* dst_ptr = 
(uint8_t*)(state->dst_addr + xoffset * state->dst_stride + yoffset * state->dst_pitch); + + Fixed<16> xlod(state->lod); + + /*vx_printf("task_id=%d, deltaX=%f, deltaY=%f, tile_width=%d, tile_height=%d\n", + task_id, arg->deltaX, arg->deltaY, arg->tile_width, arg->tile_height);*/ + + float fv = (yoffset + 0.5f) * arg->deltaY; for (uint32_t y = 0; y < arg->tile_height; ++y) { uint32_t* dst_row = (uint32_t*)dst_ptr; - float fu = xoffset * arg->deltaX; + float fu = (xoffset + 0.5f) * arg->deltaX; for (uint32_t x = 0; x < arg->tile_width; ++x) { - int32_t u = (int32_t)(fu * (1<<20)); - int32_t v = (int32_t)(fv * (1<<20)); + Fixed xu(fu); + Fixed xv(fv); + uint32_t color; #ifdef ENABLE_SW - if (state->use_sw) { - dst_row[x] = (state->filter == 2) ? tex3_sw(state, 0, u, v, state->lod) : tex_sw(state, 0, u, v, state->lod); - } else { - #endif - dst_row[x] = (state->filter == 2) ? vx_tex3(0, u, v, state->lod) : vx_tex(0, u, v, state->lod); - #ifdef ENABLE_SW - } + if (state->use_sw) + color = tex_load_sw(state, xu, xv, xlod); + else #endif + color = tex_load_hw(state, xu, xv, xlod); + //vx_printf("task_id=%d, x=%d, y=%d, fu=%f, fv=%f, xu=0x%x, xv=0x%x, color=0x%x\n", task_id, x, y, fu, fv, xu.data(), xv.data(), color); + dst_row[x] = color; fu += arg->deltaX; } dst_ptr += state->dst_pitch; @@ -48,13 +67,16 @@ int main() { kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; // configure texture unit - vx_csr_write(CSR_TEX_ADDR(0), arg->src_ptr); - vx_csr_write(CSR_TEX_MIPOFF(0), 0); - vx_csr_write(CSR_TEX_WIDTH(0), arg->src_logWidth); - vx_csr_write(CSR_TEX_HEIGHT(0), arg->src_logHeight); - vx_csr_write(CSR_TEX_FORMAT(0), arg->format); - vx_csr_write(CSR_TEX_WRAP(0), (arg->wrap << 2) | arg->wrap); - vx_csr_write(CSR_TEX_FILTER(0), (arg->filter ? 
1 : 0)); + csr_write(CSR_TEX(0, TEX_STATE_WIDTH), arg->src_logwidth); + csr_write(CSR_TEX(0, TEX_STATE_HEIGHT), arg->src_logheight); + csr_write(CSR_TEX(0, TEX_STATE_FORMAT), arg->format); + csr_write(CSR_TEX(0, TEX_STATE_WRAPU), arg->wrapu); + csr_write(CSR_TEX(0, TEX_STATE_WRAPV), arg->wrapv); + csr_write(CSR_TEX(0, TEX_STATE_FILTER), (arg->filter ? 1 : 0)); + csr_write(CSR_TEX(0, TEX_STATE_ADDR), arg->src_addr); + static_for_t()([&](int i) { + csr_write(CSR_TEX(0, TEX_STATE_MIPOFF(i)), arg->mip_offs[i]); + }); tile_arg_t targ; targ.state = arg; @@ -64,4 +86,9 @@ int main() { targ.deltaY = 1.0f / arg->dst_height; vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, &targ); + /*for (uint32_t t=0; t < arg->num_tasks; ++t) { + kernel_body(t, &targ); + }*/ + + return 0; } \ No newline at end of file diff --git a/tests/regression/tex/main.cpp b/tests/regression/tex/main.cpp index a83651ee..ffdfb593 100644 --- a/tests/regression/tex/main.cpp +++ b/tests/regression/tex/main.cpp @@ -25,10 +25,11 @@ const char* kernel_file = "kernel.bin"; const char* input_file = "palette64.png"; const char* output_file = "output.png"; int wrap = 0; -int filter = 0; +int filter = 0; // 0-> point, 1->bilinear, 2->trilinear float scale = 1.0f; int format = 0; bool use_sw = false; +float lod = 1.0f; // >= 1.0f ePixelFormat eformat = FORMAT_A8R8G8B8; vx_device_h device = nullptr; @@ -36,7 +37,7 @@ vx_buffer_h buffer = nullptr; static void show_usage() { std::cout << "Vortex Texture Test." 
<< std::endl; - std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-z no_hw] [-h: help]" << std::endl; + std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-l lod] [-z no_hw] [-h: help]" << std::endl; } static void parse_args(int argc, char **argv) { @@ -55,6 +56,9 @@ static void parse_args(int argc, char **argv) { case 'w': wrap = std::atoi(optarg); break; + case 'l': + lod = std::stof(optarg, NULL); + break; case 'z': use_sw = true; break; @@ -118,7 +122,7 @@ int run_test(const kernel_arg_t& kernel_arg, // download destination buffer std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); + RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_addr, buf_size, 0)); std::vector dst_pixels(buf_size); auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); @@ -137,25 +141,39 @@ int run_test(const kernel_arg_t& kernel_arg, int main(int argc, char *argv[]) { kernel_arg_t kernel_arg; std::vector src_pixels; + std::vector mip_offsets; uint32_t src_width; uint32_t src_height; // parse command arguments parse_args(argc, argv); - RT_CHECK(LoadImage(input_file, eformat, src_pixels, &src_width, &src_height)); + { + std::vector staging; + RT_CHECK(LoadImage(input_file, eformat, staging, &src_width, &src_height)); + + RT_CHECK(GenerateMipmaps(src_pixels, mip_offsets, staging, eformat, src_width, src_height)); + + //uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel; + //dump_image(src_pixels, src_pixels.size() / src_bpp, 1, src_bpp); + } // check power of two support - if (!ISPOW2(src_width) || !ISPOW2(src_height)) { + if (!ispow2(src_width) || !ispow2(src_height)) { std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl; return -1; } - uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel; - - //dump_image(src_pixels, src_width, src_height, 
src_bpp); + uint32_t src_logwidth = log2ceil(src_width); + uint32_t src_logheight = log2ceil(src_height); - uint32_t src_bufsize = src_bpp * src_width * src_height; + uint32_t src_max_lod = std::max(src_logwidth, src_logheight); + if (lod > src_max_lod) { + std::cout << "Error: out-of-bound level-of-detail: lod=" << lod << ", source image=" << src_max_lod << std::endl; + return -1; + } + + uint32_t src_bufsize = src_pixels.size(); uint32_t dst_width = (uint32_t)(src_width * scale); uint32_t dst_height = (uint32_t)(src_height * scale); @@ -183,7 +201,7 @@ int main(int argc, char *argv[]) { // allocate device memory std::cout << "allocate device memory" << std::endl; - size_t src_addr, dst_addr; + uint64_t src_addr, dst_addr; RT_CHECK(vx_alloc_dev_mem(device, src_bufsize, &src_addr)); RT_CHECK(vx_alloc_dev_mem(device, dst_bufsize, &dst_addr)); @@ -192,32 +210,37 @@ int main(int argc, char *argv[]) { // allocate staging shared memory std::cout << "allocate shared memory" << std::endl; - uint32_t alloc_size = std::max(sizeof(kernel_arg_t), std::max(src_bufsize, dst_bufsize)); + uint32_t alloc_size = std::max(sizeof(kernel_arg_t), + std::max(src_bufsize, dst_bufsize)); RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer)); // upload kernel argument std::cout << "upload kernel argument" << std::endl; { + kernel_arg.use_sw = use_sw; kernel_arg.num_tasks = std::min(num_tasks, dst_height); kernel_arg.format = format; kernel_arg.filter = filter; - kernel_arg.wrap = wrap; - kernel_arg.use_sw = use_sw; - kernel_arg.lod = 0x0; + kernel_arg.wrapu = wrap; + kernel_arg.wrapv = wrap; - kernel_arg.src_logWidth = (uint32_t)std::log2(src_width); - kernel_arg.src_logHeight = (uint32_t)std::log2(src_height); - kernel_arg.src_stride = src_bpp; - kernel_arg.src_pitch = src_bpp * src_width; - kernel_arg.src_ptr = src_addr; + kernel_arg.src_logwidth = src_logwidth; + kernel_arg.src_logheight = src_logheight; + kernel_arg.src_addr = src_addr; + kernel_arg.lod = lod; + + for (uint32_t i 
= 0; i < mip_offsets.size(); ++i) { + assert(i < TEX_LOD_MAX); + kernel_arg.mip_offs[i] = mip_offsets.at(i); + } kernel_arg.dst_width = dst_width; kernel_arg.dst_height = dst_height; kernel_arg.dst_stride = dst_bpp; kernel_arg.dst_pitch = dst_bpp * dst_width; - kernel_arg.dst_ptr = dst_addr; + kernel_arg.dst_addr = dst_addr; - auto buf_ptr = (int*)vx_host_ptr(buffer); + auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); } @@ -225,21 +248,21 @@ int main(int argc, char *argv[]) { // upload source buffer std::cout << "upload source buffer" << std::endl; { - auto buf_ptr = (int8_t*)vx_host_ptr(buffer); + auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); for (uint32_t i = 0; i < src_bufsize; ++i) { buf_ptr[i] = src_pixels[i]; } - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, src_bufsize, 0)); + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_addr, src_bufsize, 0)); } // clear destination buffer std::cout << "clear destination buffer" << std::endl; { - auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + auto buf_ptr = (uint32_t*)vx_host_ptr(buffer); for (uint32_t i = 0; i < (dst_bufsize/4); ++i) { buf_ptr[i] = 0xdeadbeef; } - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, dst_bufsize, 0)); + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_addr, dst_bufsize, 0)); } // run tests diff --git a/tests/regression/tex/texsw.h b/tests/regression/tex/texsw.h index 96b9a19e..c9961ab8 100644 --- a/tests/regression/tex/texsw.h +++ b/tests/regression/tex/texsw.h @@ -1,167 +1,122 @@ -#ifndef _TEXSW_H_ +#pragma once +#include +#include #include "common.h" -#define TEX_LOD_MAX 11 - -#define MIN(x, y) ((x < y) ? (x) : (y)) - -#define MAX(x, y) ((x > y) ? 
(x) : (y)) - -inline int address(int wrap, int value) { - switch (wrap) { - case 1: return value & 0xfffff; - default: - case 0: return MIN(MAX(value, 0), 0xfffff); +inline uint32_t texel_read(uint8_t* address, uint32_t stride) { + switch (stride) { + case 1: return *(uint8_t*)address; + case 2: return *(uint16_t*)address; + case 4: return *(uint32_t*)address; + default: + std::abort(); + return 0; } } -inline void unpack(int format, int value, int* l, int* h) { - switch (format) { - case 1: - case 2: - *l = value; - *h = 0; - break; - case 3: - *l = (value | (value << 8)) & 0x00ff00ff; - *h = 0; - break; - case 4: - *l = (value | (value << 16)) & 0x07e0f81f; - *h = 0; - break; - case 5: - *l = (value | (value << 12)) & 0x0f0f0f0f; - *h = 0; - break; - default: - case 0: - *l = value & 0x00ff00ff; - *h = (value >> 8) & 0x00ff00ff; - break; - } -} +inline uint32_t vx_tex_sw(kernel_arg_t* state, + Fixed xu, + Fixed xv, + uint32_t lod) { + uint8_t* base_addr = ((uint8_t*)state->src_addr) + state->mip_offs[lod]; + uint32_t log_width = std::max(state->src_logwidth - lod, 0); + uint32_t log_height = std::max(state->src_logheight - lod, 0); + auto format = (TexFormat)state->format; + auto wrapu = (WrapMode)state->wrapu; + auto wrapv = (WrapMode)state->wrapv; + auto filter = state->filter; + auto stride = Stride(format); -inline void lerp(int al, int ah, int bl, int bh, int frac, int* l, int* h) { - *l = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; - *h = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; -} - -inline int pack(int format, int l, int h) { - switch (format) { - case 1: - case 2: - return l; - case 3: - return (l | (l >> 8)) & 0xffff; - case 4: - return (l | (l >> 16)) & 0xffff; - case 5: - return (l | (l >> 12)) & 0xffff; - default: - case 0: - return (h << 8) | l; - } -} - -inline int tex_sw(kernel_arg_t* state, int stage, int u, int v, int lod) { - int base_addr = state->src_ptr; - int mip_offset = 0; - int log_width = state->src_logWidth; - int log_height = 
state->src_logHeight; - int format = state->format; - int wrap = state->wrap; - int filter = state->filter; - - int32_t* pBits = ((uint32_t*)base_addr) + mip_offset; + uint32_t color; if (filter) { - int u0 = address(wrap, u - (0x80000 >> log_width)); - int v0 = address(wrap, v - (0x80000 >> log_height)); - int u1 = address(wrap, u + (0x80000 >> log_width)); - int v1 = address(wrap, v + (0x80000 >> log_height)); + // addressing + uint32_t offset00, offset01, offset10, offset11; + uint32_t alpha, beta; + TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, + &offset00, &offset01, &offset10, &offset11, &alpha, &beta); - int x0 = u0 >> (20 - log_width); - int y0 = v0 >> (20 - log_height); - int x1 = u1 >> (20 - log_width); - int y1 = v1 >> (20 - log_height); + uint8_t* addr00 = base_addr + offset00 * stride; + uint8_t* addr01 = base_addr + offset01 * stride; + uint8_t* addr10 = base_addr + offset10 * stride; + uint8_t* addr11 = base_addr + offset11 * stride; // memory lookup - - int c0 = pBits[x0 + (y0 << log_width)]; - int c1 = pBits[x1 + (y0 << log_width)]; - int c2 = pBits[x0 + (y1 << log_width)]; - int c3 = pBits[x1 + (y1 << log_width)]; + uint32_t texel00 = texel_read(addr00, stride); + uint32_t texel01 = texel_read(addr01, stride); + uint32_t texel10 = texel_read(addr10, stride); + uint32_t texel11 = texel_read(addr11, stride); // filtering - - int alpha = x0 & 0xff; - int beta = y0 & 0xff; - - int c0a, c0b; - int c1a, c1b; - int c01a, c01b; - - unpack(format, c0, &c0a, &c0b); - unpack(format, c1, &c1a, &c1b); - lerp(c0a, c0b, c1a, c1b, alpha, &c01a, &c01b); - - int c2a, c2b; - int c3a, c3b; - int c23a, c23b; - - unpack(format, c2, &c2a, &c2b); - unpack(format, c3, &c3a, &c3b); - lerp(c2a, c2b, c3a, c3b, alpha, &c23a, &c23b); - - int c4a, c4b; - lerp(c01a, c01b, c23a, c23b, beta, &c4a, &c4b); - return pack(format, c4a, c4b); + color = TexFilterLinear( + format, texel00, texel01, texel10, texel11, alpha, beta); } else { - int u0 = address(wrap, u); - int 
v0 = address(wrap, v); + // addressing + uint32_t offset; + TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset); + + uint8_t* addr = base_addr + offset * stride; + + // memory lookup + uint32_t texel = texel_read(addr, stride); - int x0 = u0 >> (20 - log_width); - int y0 = v0 >> (20 - log_height); - - int c0 = pBits[x0 + (y0 <> 8) & 0x00ff00ff; - int bl = b & 0x00ff00ff; - int bh = (b >> 8) & 0x00ff00ff; - int frac = (lod >> 12) & 0xff; - int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; - int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; - int c = al | (ah << 8); - return c; +inline uint32_t tex_load_hw(kernel_arg_t* state, + Fixed xu, + Fixed xv, + Fixed<16> xlod) { + uint32_t color; + int32_t ilod = std::max(xlod.data(), Fixed<16>::ONE); + uint32_t lod = std::min(log2floor(ilod) - 16, TEX_LOD_MAX); + if (state->filter == 2) { + uint32_t lod_n = std::min(lod + 1, TEX_LOD_MAX); + uint32_t frac = ilod >> (lod + 16 - 8); + uint32_t texel0 = vx_tex(0, xu.data(), xv.data(), lod); + uint32_t texel1 = vx_tex(0, xu.data(), xv.data(), lod_n); + uint32_t cl, ch; + { + uint32_t c0l, c0h; + uint32_t c1l, c1h; + Unpack8888(TexFormat::R8G8B8A8, texel0, &c0l, &c0h); + Unpack8888(TexFormat::R8G8B8A8, texel1, &c1l, &c1h); + Lerp8888(c0l, c0h, c1l, c1h, frac, &cl, &ch); + } + color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + } else { + color = vx_tex(0, xu.data(), xv.data(), lod); + } + return color; } -inline int tex3_sw(kernel_arg_t* state, int stage, int u, int v, int lod) { - int lodn = MIN(lod + 0x10000, TEX_LOD_MAX); - int a = tex_sw(state, 0, u, v, lod); - int b = tex_sw(state, 0, u, v, lodn); - int al = a & 0x00ff00ff; - int ah = (a >> 8) & 0x00ff00ff; - - int bl = b & 0x00ff00ff; - int bh = (b >> 8) & 0x00ff00ff; - int frac = (lod >> 12) & 0xff; - int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; - int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; - int c = al | (ah << 8); - return c; -} - -#endif \ No newline at end of file 
+inline uint32_t tex_load_sw(kernel_arg_t* state, + Fixed xu, + Fixed xv, + Fixed<16> xlod) { + uint32_t color; + int32_t ilod = std::max(xlod.data(), Fixed<16>::ONE); + uint32_t lod = std::min(log2floor(ilod) - 16, TEX_LOD_MAX); + if (state->filter == 2) { + uint32_t lod_n = std::min(lod + 1, TEX_LOD_MAX); + uint32_t frac = ilod >> (lod + 16 - 8); + uint32_t texel0 = vx_tex_sw(state, xu, xv, lod); + uint32_t texel1 = vx_tex_sw(state, xu, xv, lod_n); + uint32_t cl, ch; + { + uint32_t c0l, c0h; + uint32_t c1l, c1h; + Unpack8888(TexFormat::R8G8B8A8, texel0, &c0l, &c0h); + Unpack8888(TexFormat::R8G8B8A8, texel1, &c1l, &c1h); + Lerp8888(c0l, c0h, c1l, c1h, frac, &cl, &ch); + } + color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + } else { + color = vx_tex_sw(state, xu, xv, lod); + } + return color; +} \ No newline at end of file diff --git a/tests/regression/tex/utils.cpp b/tests/regression/tex/utils.cpp index 8a2ff760..81a47158 100644 --- a/tests/regression/tex/utils.cpp +++ b/tests/regression/tex/utils.cpp @@ -191,4 +191,112 @@ int ConvertImage(std::vector& dst_pixels, SurfaceDesc dstDesc{dst_format, dst_pixels.data(), width, height, dst_pitch}; return CopyBuffers(dstDesc, 0, 0, width, height, srcDesc, 0, 0); +} + + + +int GenerateMipmaps(std::vector& dst_pixels, + std::vector& mip_offsets, + const std::vector& src_pixels, + ePixelFormat format, + uint32_t src_width, + uint32_t src_height) { + std::vector src_staging, dst_staging; + const std::vector *pSrcPixels; + std::vector *pDstPixels; + + // convert source image if needed + bool need_conversion = (format != FORMAT_A8R8G8B8); + if (need_conversion) { + ConvertImage(src_staging, src_pixels, src_width, src_height, format, FORMAT_A8R8G8B8); + pSrcPixels = &src_staging; + pDstPixels = &dst_staging; + } else { + pSrcPixels = &src_pixels; + pDstPixels = &dst_pixels; + } + + uint32_t src_logwidth = log2ceil(src_width); + uint32_t src_logheight = log2ceil(src_height); + uint32_t max_lod = std::max(src_logwidth, 
src_logheight) + 1; + + mip_offsets.resize(max_lod); + + // Calculate mipmaps buffer size + uint32_t dst_height = 1; + uint32_t dst_width = 0; + for (uint32_t lod = 0, w = src_width, h = src_height; lod < max_lod; ++lod) { + assert((w > 0) || (w > 0)); + uint32_t pw = std::max(w, 1); + uint32_t ph = std::max(h, 1); + mip_offsets.at(lod) = dst_width; + dst_width += pw * ph; + w >>= 1; + h >>= 1; + } + + // allocate mipmap + pDstPixels->resize(dst_width * 4); + + // generate mipmaps + { + auto pSrc = reinterpret_cast(pSrcPixels->data()); + auto pDst = reinterpret_cast(pDstPixels->data()); + + // copy level 0 + memcpy(pDst, pSrc, pSrcPixels->size()); + assert(pSrcPixels->size() == 4 * src_width * src_height); + pSrc = pDst; + pDst += src_width * src_height; + + // copy lower levels + for (uint32_t lod = 1, w = (src_width/2), h = (src_height/2); lod < max_lod;) { + assert((w > 0) || (w > 0)); + uint32_t pw = std::max(w, 1); + uint32_t ph = std::max(h, 1); + for (uint32_t y = 0; y < pw; ++y) { + auto v0 = 2 * y; + auto v1 = 2 * y + ((ph > 1) ? 1 : 0); + auto pSrc0 = pSrc + v0 * (2 * pw); + auto pSrc1 = pSrc + v1 * (2 * pw); + + for (uint32_t x = 0; x 1) ? 
1 : 0); + + auto c00 = Format::ConvertFrom(pSrc0 + u0); + auto c01 = Format::ConvertFrom(pSrc0 + u1); + auto c10 = Format::ConvertFrom(pSrc1 + u0); + auto c11 = Format::ConvertFrom(pSrc1 + u1); + + const ColorARGB color((c00.a + c01.a + c10.a + c11.a+2) >> 2, + (c00.r + c01.r + c10.r + c11.r+2) >> 2, + (c00.g + c01.g + c10.g + c11.g+2) >> 2, + (c00.b + c01.b + c10.b + c11.b+2) >> 2); + + uint32_t ncolor; + Format::ConvertTo(&ncolor, color); + pDst[x + y * pw] = ncolor; + } + } + ++lod; + pSrc = pDst; + pDst += pw * ph; + w >>= 1; + h >>= 1; + } + assert((pDst - reinterpret_cast(pDstPixels->data())) == dst_width); + } + + // convert destination image if needed + if (need_conversion) { + ConvertImage(dst_staging, dst_staging, dst_width, dst_height, FORMAT_A8R8G8B8, format); + } + + uint32_t bpp = Format::GetInfo(format).BytePerPixel; + for (auto& offset : mip_offsets) { + offset *= bpp; + } + + return 0; } \ No newline at end of file diff --git a/tests/regression/tex/utils.h b/tests/regression/tex/utils.h index 48b1ad55..7ce58941 100644 --- a/tests/regression/tex/utils.h +++ b/tests/regression/tex/utils.h @@ -1,14 +1,9 @@ #include #include #include +#include #include "surfacedesc.h" -#define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1)))) - -inline uint32_t ilog2 (uint32_t value) { - return (uint32_t)(sizeof(uint32_t) * 8UL) - (uint32_t)__builtin_clzl((value << 1) - 1UL) - 1; -} - int LoadImage(const char *filename, ePixelFormat format, std::vector &pixels, @@ -37,7 +32,14 @@ int ConvertImage(std::vector& dst_pixels, ePixelFormat src_format, ePixelFormat dst_format); +int GenerateMipmaps(std::vector& dst_pixels, + std::vector& mip_offsets, + const std::vector& src_pixels, + ePixelFormat format, + uint32_t src_width, + uint32_t src_height); + void dump_image(const std::vector& pixels, uint32_t width, uint32_t height, - uint32_t bpp); + uint32_t bpp); \ No newline at end of file diff --git a/tests/runtime/fibonacci/main.cpp b/tests/runtime/fibonacci/main.cpp index 
f6612c29..c6fc036a 100644 --- a/tests/runtime/fibonacci/main.cpp +++ b/tests/runtime/fibonacci/main.cpp @@ -1,4 +1,5 @@ #include +#include const int Num = 9; const int Ans = 34; @@ -14,12 +15,12 @@ int main() { int fib = fibonacci(Num); - printf("fibonacci(%d) = %d\n", Num, fib); + vx_printf("fibonacci(%d) = %d\n", Num, fib); if (fib == Ans) { - printf("Passed!\n"); + vx_printf("Passed!\n"); } else { - printf("Failed! value=%d, expected=%d\n", fib, Ans); + vx_printf("Failed! value=%d, expected=%d\n", fib, Ans); errors = 1; } diff --git a/tests/runtime/hello/main.cpp b/tests/runtime/hello/main.cpp index 69904cfd..94aff07e 100644 --- a/tests/runtime/hello/main.cpp +++ b/tests/runtime/hello/main.cpp @@ -1,8 +1,9 @@ #include +#include int main() { - printf("Hello World!\n"); + vx_printf("Hello World!\n"); return 0; } \ No newline at end of file