diff --git a/.gitmodules b/.gitmodules index 96aeefdb..0db51e41 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,12 @@ -[submodule "hw/rtl/fp_cores/fpnew"] - path = hw/rtl/fp_cores/fpnew +[submodule "third_party/fpnew"] + path = third_party/fpnew url = https://github.com/pulp-platform/fpnew.git -[submodule "sim/common/softfloat"] - path = sim/common/softfloat +[submodule "third_party/softfloat"] + path = third_party/softfloat url = https://github.com/ucb-bar/berkeley-softfloat-3.git +[submodule "third_party/cocogfx"] + path = third_party/cocogfx + url = https://github.com/gtcasl/cocogfx.git +[submodule "third_party/ramulator"] + path = third_party/ramulator + url = https://github.com/CMU-SAFARI/ramulator.git diff --git a/Makefile b/Makefile index 859c597d..8142a1be 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,11 @@ all: + $(MAKE) -C third_party $(MAKE) -C hw $(MAKE) -C sim $(MAKE) -C driver $(MAKE) -C runtime $(MAKE) -C tests - + clean: $(MAKE) -C hw clean $(MAKE) -C sim clean diff --git a/ci/blackbox.sh b/ci/blackbox.sh index 5ba7a29a..88930faf 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -124,7 +124,19 @@ CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_TH echo "CONFIGS=$CONFIGS" -make -C $DRIVER_PATH clean +BLACKBOX_CACHE=blackbox.$DRIVER.cache + +if [ -f "$BLACKBOX_CACHE" ] +then + LAST_CONFIGS=`cat $BLACKBOX_CACHE` +fi + +if [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; +then + make -C $DRIVER_PATH clean +fi + +echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE status=0 diff --git a/ci/regression.sh b/ci/regression.sh index 073c0ed1..b99754af 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -27,8 +27,11 @@ tex() echo "begin texture tests..." 
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-isoccer.png -osoccer_result.png -g0" CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" -CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-irainbow.png -orainbow_result.png -g1" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-irainbow.png -orainbow_result.png -g2" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" --perf +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-itoad.png -otoad_result.png -g1" --perf echo "coverage texture done!" } @@ -40,15 +43,21 @@ echo "begin clustering tests..." # warp/threads configurations ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo +./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=demo # cores clustering ./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1" +./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=demo --args="-n1" # L2/L3 ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 
--clusters=4 --l2cache --l3cache --app=demo --args="-n1" echo "clustering tests done!" } @@ -58,7 +67,9 @@ debug() echo "begin debugging tests..." ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1" ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1" echo "debugging tests done!" @@ -73,9 +84,13 @@ CONFIGS=-DEXT_M_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_e # disabling F extension CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext +CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf +CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf # disable shared memory CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem +CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf +CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf # using Default FPU core FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood @@ -87,33 +102,29 @@ FPU_CORE=FPU_FPNEW ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo # adjust l1 block size to match l2 -CONFIGS="-DMEM_BLOCK_SIZE=16 -DL1_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1" +CONFIGS="-DL1_BLOCK_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1" # test cache banking CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh 
--driver=rtlsim --cores=1 --app=io_addr +CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=io_addr # test cache multi-porting CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --debug --args="-n1" CONFIGS="-DL2_NUM_PORTS=2 -DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=io_addr +CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=io_addr # test 128-bit MEM block CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo -# test 128-bit MEM and DRAM block -CONFIGS="-DMEM_BLOCK_SIZE=16 -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo +# test single-bank DRAM +CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo # test 27-bit DRAM address CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo -# test 128-bit DRAM block -CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo - -# test long memory latency -CONFIGS="-DMEM_LATENCY=100 -DMEM_RQ_SIZE=4 -DMEM_STALLS_MODULO=4" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo - echo "configuration tests done!" 
} diff --git a/driver/common/opae.cpp b/driver/common/opae.cpp index de6e0fbb..e0f9ad09 100755 --- a/driver/common/opae.cpp +++ b/driver/common/opae.cpp @@ -17,6 +17,7 @@ #include #endif +#include "vx_utils.h" #include #include #include "vortex_afu.h" @@ -52,7 +53,7 @@ typedef struct vx_device_ { fpga_handle fpga; - size_t mem_allocation; + uint64_t mem_allocation; unsigned version; unsigned num_cores; unsigned num_warps; @@ -64,19 +65,9 @@ typedef struct vx_buffer_ { void* host_ptr; uint64_t io_addr; vx_device_h hdevice; - size_t size; + uint64_t size; } vx_buffer_t; -inline size_t align_size(size_t size, size_t alignment) { - assert(0 == (alignment & (alignment - 1))); - return (size + alignment - 1) & ~(alignment - 1); -} - -inline bool is_aligned(size_t addr, size_t alignment) { - assert(0 == (alignment & (alignment - 1))); - return 0 == (addr & (alignment - 1)); -} - /////////////////////////////////////////////////////////////////////////////// #ifdef DUMP_PERF_STATS @@ -107,7 +98,7 @@ AutoPerfDump gAutoPerfDump; /////////////////////////////////////////////////////////////////////////////// -extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { +extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; @@ -279,7 +270,7 @@ extern int vx_dev_close(vx_device_h hdevice) { return 0; } -extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { +extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr || 0 >= size) @@ -288,7 +279,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) vx_device_t *device = ((vx_device_t*)hdevice); size_t dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (device->mem_allocation + asize > dev_mem_size) return -1; @@ -299,7 +290,7 
@@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) return 0; } -extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) { +extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) { fpga_result res; void* host_ptr; uint64_t wsid; @@ -313,7 +304,7 @@ extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hb vx_device_t *device = ((vx_device_t*)hdevice); - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); res = fpgaPrepareBuffer(device->fpga, asize, &host_ptr, &wsid, 0); if (FPGA_OK != res) { @@ -367,7 +358,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) { return 0; } -extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { +extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) return -1; @@ -386,7 +377,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { #endif // to milliseconds - long long sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000); + uint64_t sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000); for (;;) { uint64_t status; @@ -430,7 +421,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { return 0; } -extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) { +extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -438,8 +429,8 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer); vx_device_t *device = ((vx_device_t*)buffer->hdevice); - size_t dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); // check 
alignment if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE)) @@ -454,7 +445,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si return -1; // Ensure ready for new command - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE); @@ -465,13 +456,13 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE)); // Wait for the write operation to finish - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; return 0; } -extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) { +extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -479,8 +470,8 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer); vx_device_t *device = ((vx_device_t*)buffer->hdevice); - size_t dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); // check alignment if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE)) @@ -495,7 +486,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, return -1; // Ensure ready for new command - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE); @@ -506,7 +497,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ)); // Wait for the write operation to finish - if (vx_ready_wait(buffer->hdevice, -1) != 0) 
+ if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; return 0; @@ -519,7 +510,7 @@ extern int vx_start(vx_device_h hdevice) { vx_device_t *device = ((vx_device_t*)hdevice); // Ensure ready for new command - if (vx_ready_wait(hdevice, -1) != 0) + if (vx_ready_wait(hdevice, MAX_TIMEOUT) != 0) return -1; // start execution diff --git a/driver/common/vx_scope.h b/driver/common/vx_scope.h index dfc53520..0e2ae081 100644 --- a/driver/common/vx_scope.h +++ b/driver/common/vx_scope.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #ifdef USE_VLSIM #include diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index 79853aa1..a69df27c 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -1,17 +1,29 @@ +#include "vx_utils.h" #include #include #include #include #include +#include -extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size) { +uint64_t aligned_size(uint64_t size, uint64_t alignment) { + assert(0 == (alignment & (alignment - 1))); + return (size + alignment - 1) & ~(alignment - 1); +} + +bool is_aligned(uint64_t addr, uint64_t alignment) { + assert(0 == (alignment & (alignment - 1))); + return 0 == (addr & (alignment - 1)); +} + +extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t size) { int err = 0; if (NULL == content || 0 == size) return -1; uint32_t buffer_transfer_size = 65536; - unsigned kernel_base_addr; + uint64_t kernel_base_addr; err = vx_dev_caps(device, VX_CAPS_KERNEL_BASE_ADDR, &kernel_base_addr); if (err != 0) return -1; @@ -29,9 +41,9 @@ extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_ // upload content // - size_t offset = 0; + uint64_t offset = 0; while (offset < size) { - auto chunk_size = std::min(buffer_transfer_size, size - offset); + auto chunk_size = std::min(buffer_transfer_size, size - offset); std::memcpy(buf_ptr, (uint8_t*)content + offset, chunk_size); /*printf("*** Upload Kernel to 0x%0x: 
data=", kernel_base_addr + offset); @@ -102,11 +114,13 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t csr_stalls = 0; uint64_t alu_stalls = 0; uint64_t gpu_stalls = 0; + // PERF: decode + uint64_t loads = 0; + uint64_t stores = 0; + uint64_t branches = 0; // PERF: Icache uint64_t icache_reads = 0; uint64_t icache_read_misses = 0; - uint64_t icache_pipe_stalls = 0; - uint64_t icache_rsp_stalls = 0; // PERF: Dcache uint64_t dcache_reads = 0; uint64_t dcache_writes = 0; @@ -114,20 +128,22 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t dcache_write_misses = 0; uint64_t dcache_bank_stalls = 0; uint64_t dcache_mshr_stalls = 0; - uint64_t dcache_pipe_stalls = 0; - uint64_t dcache_rsp_stalls = 0; - // PERF: SMEM + // PERF: shared memory uint64_t smem_reads = 0; uint64_t smem_writes = 0; uint64_t smem_bank_stalls = 0; // PERF: memory uint64_t mem_reads = 0; uint64_t mem_writes = 0; - uint64_t mem_stalls = 0; uint64_t mem_lat = 0; +#ifdef EXT_TEX_ENABLE + // PERF: texunit + uint64_t tex_mem_reads = 0; + uint64_t tex_mem_lat = 0; +#endif #endif - unsigned num_cores; + uint64_t num_cores; ret = vx_dev_caps(device, VX_CAPS_MAX_CORES, &num_cores); if (ret != 0) return ret; @@ -184,6 +200,20 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu unit stalls=%ld\n", core_id, gpu_stalls_per_core); gpu_stalls += gpu_stalls_per_core; + // PERF: decode + // loads + uint64_t loads_per_core = get_csr_64(staging_ptr, CSR_MPM_LOADS); + if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); + loads += loads_per_core; + // stores + uint64_t stores_per_core = get_csr_64(staging_ptr, CSR_MPM_STORES); + if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core); + stores += stores_per_core; + // branches + uint64_t branches_per_core = get_csr_64(staging_ptr, CSR_MPM_BRANCHES); + if (num_cores > 1) fprintf(stream, 
"PERF: core%d: branches=%ld\n", core_id, branches_per_core); + branches += branches_per_core; + // PERF: Icache // total reads uint64_t icache_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_READS); @@ -192,16 +222,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // read misses uint64_t icache_miss_r_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_MISS_R); int icache_read_hit_ratio = (int)((1.0 - (double(icache_miss_r_per_core) / double(icache_reads_per_core))) * 100); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio); icache_read_misses += icache_miss_r_per_core; - // pipeline stalls - uint64_t icache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_PIPE_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache pipeline stalls=%ld\n", core_id, icache_pipe_st_per_core); - icache_pipe_stalls += icache_pipe_st_per_core; - // response stalls - uint64_t icache_crsp_st_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_CRSP_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reponse stalls=%ld\n", core_id, icache_crsp_st_per_core); - icache_rsp_stalls += icache_crsp_st_per_core; // PERF: Dcache // total reads @@ -231,14 +253,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t dcache_mshr_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MSHR_ST); if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core); dcache_mshr_stalls += dcache_mshr_st_per_core; - // pipeline stalls - uint64_t dcache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_PIPE_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core); - dcache_pipe_stalls += 
dcache_pipe_st_per_core; - // response stalls - uint64_t dcache_crsp_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_CRSP_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core); - dcache_rsp_stalls += dcache_crsp_st_per_core; // PERF: SMEM // total reads @@ -258,17 +272,26 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // PERF: memory uint64_t mem_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_READS); uint64_t mem_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_WRITES); - uint64_t mem_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_ST); uint64_t mem_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_LAT); - int mem_utilization = (int)((double(mem_reads_per_core + mem_writes_per_core) / double(mem_reads_per_core + mem_writes_per_core + mem_stalls_per_core)) * 100); int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core)); if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld (reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: memory stalls=%ld (utilization=%d%%)\n", core_id, mem_stalls_per_core, mem_utilization); - if (num_cores > 1) fprintf(stream, "PERF: core%d: memory average latency=%d cycles\n", core_id, mem_avg_lat); + if (num_cores > 1) fprintf(stream, "PERF: core%d: memory latency=%d cycles\n", core_id, mem_avg_lat); mem_reads += mem_reads_per_core; mem_writes += mem_writes_per_core; - mem_stalls += mem_stalls_per_core; mem_lat += mem_lat_per_core; + + #ifdef EXT_TEX_ENABLE + // total reads + uint64_t tex_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_READS); + if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory reads=%ld\n", core_id, tex_reads_per_core); + tex_mem_reads += tex_reads_per_core; + + // read latency + uint64_t tex_lat_per_core = get_csr_64(staging_ptr, 
CSR_MPM_TEX_LAT); + int tex_avg_lat = (int)(double(tex_lat_per_core) / double(tex_reads_per_core)); + if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory latency=%d cycles\n", core_id, tex_avg_lat); + tex_mem_lat += tex_lat_per_core; + #endif #endif } @@ -281,7 +304,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100); int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100); int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100); - int mem_utilization = (int)((double(mem_reads + mem_writes) / double(mem_reads + mem_writes + mem_stalls)) * 100); int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads)); fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); @@ -290,24 +312,27 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { fprintf(stream, "PERF: csr unit stalls=%ld\n", csr_stalls); fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls); fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls); + fprintf(stream, "PERF: loads=%ld\n", loads); + fprintf(stream, "PERF: stores=%ld\n", stores); + fprintf(stream, "PERF: branches=%ld\n", branches); fprintf(stream, "PERF: icache reads=%ld\n", icache_reads); fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio); - fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls); - fprintf(stream, "PERF: icache reponse stalls=%ld\n", icache_rsp_stalls); fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads); fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes); fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio); 
fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio); fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization); fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls); - fprintf(stream, "PERF: dcache pipeline stalls=%ld\n", dcache_pipe_stalls); - fprintf(stream, "PERF: dcache reponse stalls=%ld\n", dcache_rsp_stalls); fprintf(stream, "PERF: smem reads=%ld\n", smem_reads); fprintf(stream, "PERF: smem writes=%ld\n", smem_writes); fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization); fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); - fprintf(stream, "PERF: memory stalls=%ld (utilization=%d%%)\n", mem_stalls, mem_utilization); fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat); +#ifdef EXT_TEX_ENABLE + int tex_avg_lat = (int)(double(tex_mem_lat) / double(tex_mem_reads)); + fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads); + fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat); +#endif #endif // release allocated resources diff --git a/driver/common/vx_utils.h b/driver/common/vx_utils.h new file mode 100644 index 00000000..b86c75af --- /dev/null +++ b/driver/common/vx_utils.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +uint64_t aligned_size(uint64_t size, uint64_t alignment); + +bool is_aligned(uint64_t addr, uint64_t alignment); + +#define CACHE_BLOCK_SIZE 64 +#define ALLOC_BASE_ADDR 0x00000000 +#define LOCAL_MEM_SIZE 4294967296 // 4 GB \ No newline at end of file diff --git a/driver/include/vortex.h b/driver/include/vortex.h index 05648671..0fc9c5ce 100644 --- a/driver/include/vortex.h +++ b/driver/include/vortex.h @@ -2,6 +2,7 @@ #define __VX_DRIVER_H__ #include +#include #include #ifdef __cplusplus @@ -22,9 +23,7 @@ typedef void* vx_buffer_h; #define 
VX_CAPS_ALLOC_BASE_ADDR 0x6 #define VX_CAPS_KERNEL_BASE_ADDR 0x7 -#define CACHE_BLOCK_SIZE 64 -#define ALLOC_BASE_ADDR 0x00000000 -#define LOCAL_MEM_SIZE 0xffffffff +#define MAX_TIMEOUT (60*60*1000) // 1hr // open the device and connect to it int vx_dev_open(vx_device_h* hdevice); @@ -33,10 +32,10 @@ int vx_dev_open(vx_device_h* hdevice); int vx_dev_close(vx_device_h hdevice); // return device configurations -int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value); +int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value); // Allocate shared buffer with device -int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer); +int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer); // Get host pointer address void* vx_host_ptr(vx_buffer_h hbuffer); @@ -45,24 +44,24 @@ void* vx_host_ptr(vx_buffer_h hbuffer); int vx_buf_release(vx_buffer_h hbuffer); // allocate device memory and return address -int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr); +int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr); // Copy bytes from buffer to device local memory -int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset); +int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset); // Copy bytes from device local memory to buffer -int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dst_offset); +int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dst_offset); // Start device execution int vx_start(vx_device_h hdevice); // Wait for device ready with milliseconds timeout -int vx_ready_wait(vx_device_h hdevice, long long timeout); +int vx_ready_wait(vx_device_h hdevice, uint64_t timeout); ////////////////////////////// UTILITY FUNCIONS /////////////////////////////// // upload kernel bytes to device -int vx_upload_kernel_bytes(vx_device_h 
device, const void* content, size_t size); +int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t size); // upload kernel file to device int vx_upload_kernel_file(vx_device_h device, const char* filename); diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index cf0a184d..72d3a07a 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -3,9 +3,7 @@ RTLSIM_DIR = ../../sim/rtlsim CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors #CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors -CXXFLAGS += -I../include -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common - -LDFLAGS += $(RTLSIM_DIR)/librtlsim.a +CXXFLAGS += -I../include -I../common -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common # Position independent code CXXFLAGS += -fPIC @@ -17,6 +15,7 @@ CXXFLAGS += $(CONFIGS) CXXFLAGS += -DDUMP_PERF_STATS LDFLAGS += -shared -pthread +LDFLAGS += -L. -lrtlsim SRCS = vortex.cpp ../common/vx_utils.cpp @@ -30,9 +29,9 @@ PROJECT = libvortex.so all: $(PROJECT) $(PROJECT): $(SRCS) - $(MAKE) -C $(RTLSIM_DIR) static + DESTDIR=../../driver/rtlsim $(MAKE) -C $(RTLSIM_DIR) ../../driver/rtlsim/librtlsim.so $(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT) clean: - $(MAKE) -C $(RTLSIM_DIR) clean-static + DESTDIR=../../driver/rtlsim $(MAKE) -C $(RTLSIM_DIR) clean rm -rf $(PROJECT) *.o \ No newline at end of file diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index 64fcd72e..85f7054c 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -4,13 +4,17 @@ #include #include #include +#include #include #include +#include #include #include #include -#include +#include + +#define RAM_PAGE_SIZE 4096 using namespace vortex; @@ -19,10 +23,10 @@ using namespace vortex; class vx_device; class vx_buffer { public: - vx_buffer(size_t size, vx_device* device) + vx_buffer(uint64_t size, vx_device* device) : size_(size) , device_(device) { - auto aligned_asize = align_size(size, 
CACHE_BLOCK_SIZE); + auto aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); data_ = malloc(aligned_asize); } @@ -36,7 +40,7 @@ public: return data_; } - size_t size() const { + uint64_t size() const { return size_; } @@ -45,7 +49,7 @@ public: } private: - size_t size_; + uint64_t size_; vx_device* device_; void* data_; }; @@ -54,9 +58,12 @@ private: class vx_device { public: - vx_device() : ram_((1<<12), (1<<20)) { - mem_allocation_ = ALLOC_BASE_ADDR; - } + vx_device() + : ram_(RAM_PAGE_SIZE) + , mem_allocation_(ALLOC_BASE_ADDR) + { + processor_.attach_ram(&ram_); + } ~vx_device() { if (future_.valid()) { @@ -64,9 +71,9 @@ public: } } - int alloc_local_mem(size_t size, size_t* dev_maddr) { - auto dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) { + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = mem_allocation_; @@ -74,9 +81,9 @@ public: return 0; } - int upload(const void* src, size_t dest_addr, size_t size, size_t src_offset) { - size_t asize = align_size(size, CACHE_BLOCK_SIZE); - if (dest_addr + asize > ram_.size()) + int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (dest_addr + asize > LOCAL_MEM_SIZE) return -1; /*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src + src_offset)); @@ -92,9 +99,9 @@ public: return 0; } - int download(void* dest, size_t src_addr, size_t size, size_t dest_offset) { - size_t asize = align_size(size, CACHE_BLOCK_SIZE); - if (src_addr + asize > ram_.size()) + int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (src_addr + asize > LOCAL_MEM_SIZE) return -1; ram_.read((uint8_t*)dest + dest_offset, src_addr, 
asize); @@ -112,26 +119,25 @@ public: } int start() { + // ensure prior run completed if (future_.valid()) { - future_.wait(); // ensure prior run completed + future_.wait(); } - simulator_.attach_ram(&ram_); - future_ = std::async(std::launch::async, [&]{ - simulator_.reset(); - while (simulator_.is_busy()) { - simulator_.step(); - } + // start new run + future_ = std::async(std::launch::async, [&]{ + processor_.run(); }); return 0; } - int wait(long long timeout) { + int wait(uint64_t timeout) { if (!future_.valid()) return 0; - auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000); + uint64_t timeout_sec = timeout / 1000; std::chrono::seconds wait_time(1); for (;;) { - auto status = future_.wait_for(wait_time); // wait for 1 sec and check status + // wait for 1 sec and check status + auto status = future_.wait_for(wait_time); if (status == std::future_status::ready || 0 == timeout_sec--) break; @@ -141,9 +147,9 @@ public: private: - size_t mem_allocation_; RAM ram_; - Simulator simulator_; + Processor processor_; + uint64_t mem_allocation_; std::future future_; }; @@ -177,7 +183,7 @@ AutoPerfDump gAutoPerfDump; /////////////////////////////////////////////////////////////////////////////// -extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { +extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; @@ -198,10 +204,10 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { *value = CACHE_BLOCK_SIZE; break; case VX_CAPS_LOCAL_MEM_SIZE: - *value = 0xffffffff; + *value = LOCAL_MEM_SIZE; break; case VX_CAPS_ALLOC_BASE_ADDR: - *value = 0x10000000; + *value = ALLOC_BASE_ADDR; break; case VX_CAPS_KERNEL_BASE_ADDR: *value = STARTUP_ADDR; @@ -244,7 +250,7 @@ extern int vx_dev_close(vx_device_h hdevice) { return 0; } -extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { +extern int vx_alloc_dev_mem(vx_device_h hdevice, 
uint64_t size, uint64_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr || 0 >= size) @@ -255,7 +261,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) } -extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) { +extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) { if (nullptr == hdevice || 0 >= size || nullptr == hbuffer) @@ -294,7 +300,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) { return 0; } -extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) { +extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -307,7 +313,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset); } -extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) { +extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -329,7 +335,7 @@ extern int vx_start(vx_device_h hdevice) { return device->start(); } -extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { +extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) return -1; diff --git a/driver/simx/Makefile b/driver/simx/Makefile index 82bf6e32..b5723972 100644 --- a/driver/simx/Makefile +++ b/driver/simx/Makefile @@ -1,15 +1,15 @@ -SIMX_DIR = ../../sim/simX +SIMX_DIR = ../../sim/simx CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors CXXFLAGS += -fPIC -Wno-maybe-uninitialized -CXXFLAGS += -I../include -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common +CXXFLAGS += -I../include -I../common -I../../hw -I$(SIMX_DIR) 
-I$(SIMX_DIR)/../common CXXFLAGS += $(CONFIGS) CXXFLAGS += -DDUMP_PERF_STATS LDFLAGS += -shared -pthread -LDFLAGS += $(SIMX_DIR)/libsimX.a +LDFLAGS += -L. -lsimx SRCS = vortex.cpp ../common/vx_utils.cpp @@ -18,9 +18,9 @@ PROJECT = libvortex.so all: $(PROJECT) $(PROJECT): $(SRCS) - $(MAKE) -C $(SIMX_DIR) static + DESTDIR=../../driver/simx $(MAKE) -C $(SIMX_DIR) ../../driver/simx/libsimx.so $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ clean: - $(MAKE) -C $(SIMX_DIR) clean-static - rm -rf $(PROJECT) *.o \ No newline at end of file + DESTDIR=../../driver/simx $(MAKE) -C $(SIMX_DIR) clean + rm -rf libsimx.so $(PROJECT) *.o \ No newline at end of file diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 5ad4242b..e1897139 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -3,16 +3,21 @@ #include #include #include -#include -#include +#include #include #include -#include +#include + #include + #include -#define PAGE_SIZE 4096 +#include +#include +#include +#include + using namespace vortex; @@ -22,10 +27,10 @@ class vx_device; class vx_buffer { public: - vx_buffer(size_t size, vx_device* device) + vx_buffer(uint64_t size, vx_device* device) : size_(size) , device_(device) { - auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE); + uint64_t aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); data_ = malloc(aligned_asize); } @@ -39,7 +44,7 @@ public: return data_; } - size_t size() const { + uint64_t size() const { return size_; } @@ -48,7 +53,7 @@ public: } private: - size_t size_; + uint64_t size_; vx_device* device_; void* data_; }; @@ -58,33 +63,24 @@ private: class vx_device { public: vx_device() - : arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS) - , decoder_(arch_) - , mmu_(PAGE_SIZE, arch_.wsize(), true) - , cores_(arch_.num_cores()) - , is_done_(false) - , is_running_(false) - , thread_(__thread_proc__, this) - , ram_((1<<12), (1<<20)) { - - mem_allocation_ = ALLOC_BASE_ADDR; - mmu_.attach(ram_, 0, 0xffffffff); - for (int i = 0; i < 
arch_.num_cores(); ++i) { - cores_[i] = std::make_shared(arch_, decoder_, mmu_, i); - } + : arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS) + , ram_(RAM_PAGE_SIZE) + , processor_(arch_) + , mem_allocation_(ALLOC_BASE_ADDR) + { + // attach memory module + processor_.attach_ram(&ram_); } ~vx_device() { - mutex_.lock(); - is_done_ = true; - mutex_.unlock(); - - thread_.join(); + if (future_.valid()) { + future_.wait(); + } } - int alloc_local_mem(size_t size, size_t* dev_maddr) { - auto dev_mem_size = LOCAL_MEM_SIZE; - auto asize = align_size(size, CACHE_BLOCK_SIZE); + int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) { + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = mem_allocation_; @@ -92,9 +88,9 @@ public: return 0; } - int upload(const void* src, size_t dest_addr, size_t size, size_t src_offset) { - auto asize = align_size(size, CACHE_BLOCK_SIZE); - if (dest_addr + asize > ram_.size()) + int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (dest_addr + asize > LOCAL_MEM_SIZE) return -1; ram_.write((const uint8_t*)src + src_offset, dest_addr, asize); @@ -107,9 +103,9 @@ public: return 0; } - int download(void* dest, size_t src_addr, size_t size, size_t dest_offset) { - size_t asize = align_size(size, CACHE_BLOCK_SIZE); - if (src_addr + asize > ram_.size()) + int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (src_addr + asize > LOCAL_MEM_SIZE) return -1; ram_.read((uint8_t*)dest + dest_offset, src_addr, asize); @@ -123,98 +119,40 @@ public: } int start() { - - mutex_.lock(); - for (int i = 0; i < arch_.num_cores(); ++i) { - cores_[i]->clear(); + // ensure prior run completed + if (future_.valid()) { + future_.wait(); } - is_running_ = true; 
- mutex_.unlock(); - + + // start new run + future_ = std::async(std::launch::async, [&]{ + processor_.run(); + }); + return 0; } - int wait(long long timeout) { - auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000); + int wait(uint64_t timeout) { + if (!future_.valid()) + return 0; + uint64_t timeout_sec = timeout / 1000; + std::chrono::seconds wait_time(1); for (;;) { - mutex_.lock(); - bool is_running = is_running_; - mutex_.unlock(); - - if (!is_running || 0 == timeout_sec--) + // wait for 1 sec and check status + auto status = future_.wait_for(wait_time); + if (status == std::future_status::ready + || 0 == timeout_sec--) break; - - std::this_thread::sleep_for(std::chrono::seconds(1)); } return 0; - } - - int get_csr(int core_id, int addr, unsigned *value) { - *value = cores_.at(core_id)->get_csr(addr, 0, 0); - return 0; - } - - int set_csr(int core_id, int addr, unsigned value) { - cores_.at(core_id)->set_csr(addr, value, 0, 0); - return 0; - } + } private: - - void run() { - bool running; - do { - running = false; - for (auto& core : cores_) { - core->step(); - if (core->running()) - running = true; - } - } while (running); - } - - void thread_proc() { - std::cout << "Device ready..." << std::flush << std::endl; - - for (;;) { - mutex_.lock(); - bool is_done = is_done_; - bool is_running = is_running_; - mutex_.unlock(); - - if (is_done) - break; - - if (is_running) { - std::cout << "Device running..." << std::flush << std::endl; - - this->run(); - - mutex_.lock(); - is_running_ = false; - mutex_.unlock(); - - std::cout << "Device ready..." << std::flush << std::endl; - } - } - - std::cout << "Device shutdown..." 
<< std::flush << std::endl; - } - - static void __thread_proc__(vx_device* device) { - device->thread_proc(); - } - ArchDef arch_; - Decoder decoder_; - MemoryUnit mmu_; - std::vector> cores_; - bool is_done_; - bool is_running_; - size_t mem_allocation_; - std::thread thread_; RAM ram_; - std::mutex mutex_; + Processor processor_; + uint64_t mem_allocation_; + std::future future_; }; /////////////////////////////////////////////////////////////////////////////// @@ -276,7 +214,7 @@ extern int vx_dev_close(vx_device_h hdevice) { return 0; } -extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { +extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; @@ -314,7 +252,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { return 0; } -extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { +extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr || 0 >= size) @@ -324,7 +262,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) return device->alloc_local_mem(size, dev_maddr); } -extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) { +extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) { if (nullptr == hdevice || 0 >= size || nullptr == hbuffer) @@ -363,7 +301,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) { return 0; } -extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) { +extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -376,7 +314,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset); 
} -extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) { +extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -398,7 +336,7 @@ extern int vx_start(vx_device_h hdevice) { return device->start(); } -extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { +extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) return -1; diff --git a/driver/stub/vortex.cpp b/driver/stub/vortex.cpp index f5079500..95777257 100644 --- a/driver/stub/vortex.cpp +++ b/driver/stub/vortex.cpp @@ -8,15 +8,15 @@ extern int vx_dev_close(vx_device_h /*hdevice*/) { return -1; } -extern int vx_dev_caps(vx_device_h /*hdevice*/, unsigned /*caps_id*/, unsigned* /*value*/) { +extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t* /*value*/) { return -1; } -extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, size_t /*size*/, size_t* /*dev_maddr*/) { +extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, uint64_t /*size*/, uint64_t* /*dev_maddr*/) { return -1; } -extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, size_t /*size*/, vx_buffer_h* /*hbuffer*/) { +extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, uint64_t /*size*/, vx_buffer_h* /*hbuffer*/) { return -1; } @@ -28,11 +28,11 @@ extern int vx_buf_release(vx_buffer_h /*hbuffer*/) { return -1; } -extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*src_offset*/) { +extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, uint64_t /*dev_maddr*/, uint64_t /*size*/, uint64_t /*src_offset*/) { return -1; } -extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*dest_offset*/) { +extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, uint64_t /*dev_maddr*/, uint64_t /*size*/, uint64_t /*dest_offset*/) { return -1; } @@ -40,6 +40,6 @@ 
extern int vx_start(vx_device_h /*hdevice*/) { return -1; } -extern int vx_ready_wait(vx_device_h /*hdevice*/, long long /*timeout*/) { +extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) { return -1; } \ No newline at end of file diff --git a/driver/vlsim/Makefile b/driver/vlsim/Makefile index 5608ad11..23c07635 100644 --- a/driver/vlsim/Makefile +++ b/driver/vlsim/Makefile @@ -9,8 +9,6 @@ CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -I. -I../include -I../../hw -I$(VLSIM_DIR) -LDFLAGS += $(VLSIM_DIR)/libopae-c-vlsim.a - # Position independent code CXXFLAGS += -fPIC @@ -21,6 +19,7 @@ CXXFLAGS += $(CONFIGS) CXXFLAGS += -DDUMP_PERF_STATS LDFLAGS += -shared -pthread +LDFLAGS += -L. -lopae-c-vlsim SRCS = ../common/opae.cpp ../common/vx_utils.cpp @@ -47,9 +46,9 @@ scope-defs.h: $(SCRIPT_DIR)/scope.json scope: scope-defs.h $(PROJECT): $(SRCS) $(SCOPE_H) - $(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C $(VLSIM_DIR) static + DESTDIR=../../driver/vlsim $(MAKE) -C $(VLSIM_DIR) ../../driver/vlsim/libopae-c-vlsim.so $(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT) clean: - $(MAKE) -C $(VLSIM_DIR) clean-static - rm -rf $(PROJECT) *.o scope-defs.h \ No newline at end of file + DESTDIR=../../driver/vlsim $(MAKE) -C $(VLSIM_DIR) clean + rm -rf libopae-c-vlsim.so $(PROJECT) *.o scope-defs.h \ No newline at end of file diff --git a/hw/rtl/VX_alu_unit.sv b/hw/rtl/VX_alu_unit.sv index 8840f044..72d36184 100644 --- a/hw/rtl/VX_alu_unit.sv +++ b/hw/rtl/VX_alu_unit.sv @@ -96,6 +96,7 @@ module VX_alu_unit #( wire alu_ready_in; wire alu_valid_out; wire alu_ready_out; + wire [`UUID_BITS-1:0] alu_uuid; wire [`NW_BITS-1:0] alu_wid; wire [`NUM_THREADS-1:0] alu_tmask; wire [31:0] alu_PC; @@ -112,14 +113,14 @@ module VX_alu_unit #( assign alu_ready_in = alu_ready_out || ~alu_valid_out; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32), + 
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (alu_ready_in), - .data_in ({alu_valid_in, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}), - .data_out ({alu_valid_out, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r}) + .data_in ({alu_valid_in, alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}), + .data_out ({alu_valid_out, alu_uuid, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r}) ); `UNUSED_VAR (br_op_r) @@ -138,6 +139,7 @@ module VX_alu_unit #( wire mul_ready_in; wire mul_valid_out; wire mul_ready_out; + wire [`UUID_BITS-1:0] mul_uuid; wire [`NW_BITS-1:0] mul_wid; wire [`NUM_THREADS-1:0] mul_tmask; wire [31:0] mul_PC; @@ -153,6 +155,7 @@ module VX_alu_unit #( // Inputs .alu_op (mul_op), + .uuid_in (alu_req_if.uuid), .wid_in (alu_req_if.wid), .tmask_in (alu_req_if.tmask), .PC_in (alu_req_if.PC), @@ -163,6 +166,7 @@ module VX_alu_unit #( // Outputs .wid_out (mul_wid), + .uuid_out (mul_uuid), .tmask_out (mul_tmask), .PC_out (mul_PC), .rd_out (mul_rd), @@ -184,6 +188,7 @@ module VX_alu_unit #( assign mul_valid_in = alu_req_if.valid && is_mul_op; assign alu_commit_if.valid = alu_valid_out || mul_valid_out; + assign alu_commit_if.uuid = alu_valid_out ? alu_uuid : mul_uuid; assign alu_commit_if.wid = alu_valid_out ? alu_wid : mul_wid; assign alu_commit_if.tmask = alu_valid_out ? alu_tmask : mul_tmask; assign alu_commit_if.PC = alu_valid_out ? 
alu_PC : mul_PC; @@ -201,6 +206,7 @@ module VX_alu_unit #( assign alu_valid_in = alu_req_if.valid; assign alu_commit_if.valid = alu_valid_out; + assign alu_commit_if.uuid = alu_uuid; assign alu_commit_if.wid = alu_wid; assign alu_commit_if.tmask = alu_tmask; assign alu_commit_if.PC = alu_PC; @@ -220,8 +226,8 @@ module VX_alu_unit #( `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (branch_ctl_if.valid) begin - dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n", - $time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest); + dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h (#%0d)\n", + $time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest, alu_uuid); end end `endif diff --git a/hw/rtl/VX_commit.sv b/hw/rtl/VX_commit.sv index 07b83df0..574ed36e 100644 --- a/hw/rtl/VX_commit.sv +++ b/hw/rtl/VX_commit.sv @@ -40,27 +40,35 @@ module VX_commit #( `endif || gpu_commit_fire; - wire [`NUM_THREADS-1:0] commit_tmask; - assign commit_tmask = alu_commit_fire ? alu_commit_if.tmask: - ld_commit_fire ? ld_commit_if.tmask: - st_commit_fire ? st_commit_if.tmask: - csr_commit_fire ? csr_commit_if.tmask: - `ifdef EXT_F_ENABLE - fpu_commit_fire ? 
fpu_commit_if.tmask: - `endif - /*gpu_commit_fire ?*/ gpu_commit_if.tmask; +`ifdef EXT_F_ENABLE + wire [(6*`NUM_THREADS)-1:0] commit_tmask; +`else + wire [(5*`NUM_THREADS)-1:0] commit_tmask; +`endif - wire [$clog2(`NUM_THREADS+1)-1:0] commit_cnt; - `POP_COUNT(commit_cnt, commit_tmask); + wire [$clog2($bits(commit_tmask)+1)-1:0] commit_size; + + assign commit_tmask = { + {`NUM_THREADS{alu_commit_fire}} & alu_commit_if.tmask, + {`NUM_THREADS{ld_commit_fire}} & ld_commit_if.tmask, + {`NUM_THREADS{st_commit_fire}} & st_commit_if.tmask, + {`NUM_THREADS{csr_commit_fire}} & csr_commit_if.tmask, + `ifdef EXT_F_ENABLE + {`NUM_THREADS{fpu_commit_fire}} & fpu_commit_if.tmask, + `endif + {`NUM_THREADS{gpu_commit_fire}} & gpu_commit_if.tmask + }; + + `POP_COUNT(commit_size, commit_tmask); VX_pipe_register #( - .DATAW (1 + $clog2(`NUM_THREADS+1)), + .DATAW (1 + $bits(commit_size)), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (1'b1), - .data_in ({commit_fire, commit_cnt}), + .data_in ({commit_fire, commit_size}), .data_out ({cmt_to_csr_if.valid, cmt_to_csr_if.commit_size}) ); @@ -90,32 +98,32 @@ module VX_commit #( if (alu_commit_if.valid && alu_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd); `TRACE_ARRAY1D(alu_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", alu_commit_if.uuid); end if (ld_commit_if.valid && ld_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.wb, ld_commit_if.rd); `TRACE_ARRAY1D(ld_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", ld_commit_if.uuid); end if (st_commit_if.valid && st_commit_if.ready) begin - dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, 
wb=%0d, rd=%0d\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd); + dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d (#%0d)\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd, st_commit_if.uuid); end if (csr_commit_if.valid && csr_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.wb, csr_commit_if.rd); `TRACE_ARRAY1D(csr_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", csr_commit_if.uuid); end `ifdef EXT_F_ENABLE if (fpu_commit_if.valid && fpu_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.wb, fpu_commit_if.rd); `TRACE_ARRAY1D(fpu_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", fpu_commit_if.uuid); end `endif if (gpu_commit_if.valid && gpu_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.wb, gpu_commit_if.rd); `TRACE_ARRAY1D(gpu_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", gpu_commit_if.uuid); end end `endif diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index b52a1ab2..e9e57b03 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -1,6 +1,10 @@ `ifndef VX_CONFIG `define VX_CONFIG +`ifndef XLEN +`define XLEN 32 +`endif + `ifndef NUM_CLUSTERS `define NUM_CLUSTERS 1 `endif @@ -167,48 +171,50 @@ `define CSR_MPM_FPU_ST_H 12'hB88 `define CSR_MPM_GPU_ST 12'hB09 `define CSR_MPM_GPU_ST_H 12'hB89 +// PERF: decode +`define CSR_MPM_LOADS 12'hB0A +`define CSR_MPM_LOADS_H 12'hB8A 
+`define CSR_MPM_STORES 12'hB0B +`define CSR_MPM_STORES_H 12'hB8B +`define CSR_MPM_BRANCHES 12'hB0C +`define CSR_MPM_BRANCHES_H 12'hB8C // PERF: icache -`define CSR_MPM_ICACHE_READS 12'hB0A // total reads -`define CSR_MPM_ICACHE_READS_H 12'hB8A -`define CSR_MPM_ICACHE_MISS_R 12'hB0B // total misses -`define CSR_MPM_ICACHE_MISS_R_H 12'hB8B -`define CSR_MPM_ICACHE_PIPE_ST 12'hB0C // pipeline stalls -`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB8C -`define CSR_MPM_ICACHE_CRSP_ST 12'hB0D // core response stalls -`define CSR_MPM_ICACHE_CRSP_ST_H 12'hB8D +`define CSR_MPM_ICACHE_READS 12'hB0D // total reads +`define CSR_MPM_ICACHE_READS_H 12'hB8D +`define CSR_MPM_ICACHE_MISS_R 12'hB0E // read misses +`define CSR_MPM_ICACHE_MISS_R_H 12'hB8E // PERF: dcache -`define CSR_MPM_DCACHE_READS 12'hB0E // total reads -`define CSR_MPM_DCACHE_READS_H 12'hB8E -`define CSR_MPM_DCACHE_WRITES 12'hB0F // total writes -`define CSR_MPM_DCACHE_WRITES_H 12'hB8F -`define CSR_MPM_DCACHE_MISS_R 12'hB10 // read misses -`define CSR_MPM_DCACHE_MISS_R_H 12'hB90 -`define CSR_MPM_DCACHE_MISS_W 12'hB11 // write misses -`define CSR_MPM_DCACHE_MISS_W_H 12'hB91 -`define CSR_MPM_DCACHE_BANK_ST 12'hB12 // bank conflicts stalls -`define CSR_MPM_DCACHE_BANK_ST_H 12'hB92 -`define CSR_MPM_DCACHE_MSHR_ST 12'hB13 // MSHR stalls -`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB93 -`define CSR_MPM_DCACHE_PIPE_ST 12'hB14 // pipeline stalls -`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB94 -`define CSR_MPM_DCACHE_CRSP_ST 12'hB15 // core response stalls -`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB95 +`define CSR_MPM_DCACHE_READS 12'hB0F // total reads +`define CSR_MPM_DCACHE_READS_H 12'hB8F +`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes +`define CSR_MPM_DCACHE_WRITES_H 12'hB90 +`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses +`define CSR_MPM_DCACHE_MISS_R_H 12'hB91 +`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses +`define CSR_MPM_DCACHE_MISS_W_H 12'hB92 +`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts +`define 
CSR_MPM_DCACHE_BANK_ST_H 12'hB93 +`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls +`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94 // PERF: smem -`define CSR_MPM_SMEM_READS 12'hB16 // total reads -`define CSR_MPM_SMEM_READS_H 12'hB96 -`define CSR_MPM_SMEM_WRITES 12'hB17 // total writes -`define CSR_MPM_SMEM_WRITES_H 12'hB97 -`define CSR_MPM_SMEM_BANK_ST 12'hB18 // bank conflicts stalls -`define CSR_MPM_SMEM_BANK_ST_H 12'hB98 +`define CSR_MPM_SMEM_READS 12'hB15 // total reads +`define CSR_MPM_SMEM_READS_H 12'hB95 +`define CSR_MPM_SMEM_WRITES 12'hB16 // total writes +`define CSR_MPM_SMEM_WRITES_H 12'hB96 +`define CSR_MPM_SMEM_BANK_ST 12'hB17 // bank conflicts +`define CSR_MPM_SMEM_BANK_ST_H 12'hB97 // PERF: memory -`define CSR_MPM_MEM_READS 12'hB19 // memory reads -`define CSR_MPM_MEM_READS_H 12'hB99 -`define CSR_MPM_MEM_WRITES 12'hB1A // memory writes -`define CSR_MPM_MEM_WRITES_H 12'hB9A -`define CSR_MPM_MEM_ST 12'hB1B // memory request stalls -`define CSR_MPM_MEM_ST_H 12'hB9B -`define CSR_MPM_MEM_LAT 12'hB1C // memory latency (total) -`define CSR_MPM_MEM_LAT_H 12'hB9C +`define CSR_MPM_MEM_READS 12'hB18 // memory reads +`define CSR_MPM_MEM_READS_H 12'hB98 +`define CSR_MPM_MEM_WRITES 12'hB19 // memory writes +`define CSR_MPM_MEM_WRITES_H 12'hB99 +`define CSR_MPM_MEM_LAT 12'hB1A // memory latency +`define CSR_MPM_MEM_LAT_H 12'hB9A +// PERF: texunit +`define CSR_MPM_TEX_READS 12'hB1B // texture accesses +`define CSR_MPM_TEX_READS_H 12'hB9B +`define CSR_MPM_TEX_LAT 12'hB1C // texture latency +`define CSR_MPM_TEX_LAT_H 12'hB9C // Machine Information Registers `define CSR_MVENDORID 12'hF11 @@ -232,18 +238,40 @@ ////////// Texture Units ////////////////////////////////////////////////////// -`define NUM_TEX_UNITS 2 +`define NUM_TEX_UNITS 2 +`define TEX_SUBPIXEL_BITS 8 -`define CSR_TEX_STATES 7 -`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES) +`define TEX_DIM_BITS 15 +`define TEX_LOD_MAX `TEX_DIM_BITS +`define TEX_LOD_BITS 4 -`define CSR_TEX_ADDR(x) 
(`CSR_TEX_BEGIN(x) + 12'h00) -`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01) -`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 12'h02) -`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03) -`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04) -`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05) -`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06) +`define TEX_FXD_BITS 32 +`define TEX_FXD_FRAC (`TEX_DIM_BITS+`TEX_SUBPIXEL_BITS) + +`define TEX_STATE_ADDR 0 +`define TEX_STATE_WIDTH 1 +`define TEX_STATE_HEIGHT 2 +`define TEX_STATE_FORMAT 3 +`define TEX_STATE_FILTER 4 +`define TEX_STATE_WRAPU 5 +`define TEX_STATE_WRAPV 6 +`define TEX_STATE_MIPOFF(lod) (7+(lod)) +`define NUM_TEX_STATES (`TEX_STATE_MIPOFF(`TEX_LOD_MAX)+1) + +`define CSR_TEX_UNIT 12'hFD0 + +`define CSR_TEX_STATE_BEGIN 12'hFD1 +`define CSR_TEX_ADDR (`CSR_TEX_STATE_BEGIN+`TEX_STATE_ADDR) +`define CSR_TEX_WIDTH (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WIDTH) +`define CSR_TEX_HEIGHT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_HEIGHT) +`define CSR_TEX_FORMAT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FORMAT) +`define CSR_TEX_FILTER (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FILTER) +`define CSR_TEX_WRAPU (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPU) +`define CSR_TEX_WRAPV (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPV) +`define CSR_TEX_MIPOFF(lod) (`CSR_TEX_STATE_BEGIN+`TEX_STATE_MIPOFF(lod)) +`define CSR_TEX_STATE_END (`CSR_TEX_STATE_BEGIN+`NUM_TEX_STATES) + +`define CSR_TEX_STATE(addr) ((addr) - `CSR_TEX_STATE_BEGIN) // Pipeline Queues //////////////////////////////////////////////////////////// @@ -262,6 +290,11 @@ `define FPUQ_SIZE 8 `endif +// Texture Unit Request Queue +`ifndef TEXQ_SIZE +`define TEXQ_SIZE (`NUM_WARPS * 2) +`endif + // Icache Configurable Knobs ////////////////////////////////////////////////// // Size of cache in bytes @@ -373,7 +406,7 @@ // Number of banks `ifndef L2_NUM_BANKS -`define L2_NUM_BANKS `MIN(`NUM_CORES, 4) +`define L2_NUM_BANKS ((`NUM_CORES < 4) ? 
`NUM_CORES : 4) `endif // Number of ports per bank @@ -415,7 +448,7 @@ // Number of banks `ifndef L3_NUM_BANKS -`define L3_NUM_BANKS `MIN(`NUM_CLUSTERS, 4) +`define L3_NUM_BANKS ((`NUM_CLUSTERS < 4) ? `NUM_CORES : 4) `endif // Number of ports per bank diff --git a/hw/rtl/VX_csr_data.sv b/hw/rtl/VX_csr_data.sv index b071a347..6d4a82c9 100644 --- a/hw/rtl/VX_csr_data.sv +++ b/hw/rtl/VX_csr_data.sv @@ -7,6 +7,9 @@ module VX_csr_data #( input wire reset, `ifdef PERF_ENABLE +`ifdef EXT_TEX_ENABLE + VX_perf_tex_if.slave perf_tex_if, +`endif VX_perf_memsys_if.slave perf_memsys_if, VX_perf_pipeline_if.slave perf_pipeline_if, `endif @@ -22,11 +25,13 @@ module VX_csr_data #( `endif input wire read_enable, + input wire [`UUID_BITS-1:0] read_uuid, input wire[`CSR_ADDR_BITS-1:0] read_addr, input wire[`NW_BITS-1:0] read_wid, output wire[31:0] read_data, input wire write_enable, + input wire [`UUID_BITS-1:0] write_uuid, input wire[`CSR_ADDR_BITS-1:0] write_addr, input wire[`NW_BITS-1:0] write_wid, input wire[31:0] write_data, @@ -50,35 +55,41 @@ module VX_csr_data #( reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr; always @(posedge clk) begin - `ifdef EXT_F_ENABLE if (reset) begin fcsr <= '0; - end - if (fpu_to_csr_if.write_enable) begin - fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] - | fpu_to_csr_if.write_fflags; - end - `endif - if (write_enable) begin - case (write_addr) - `CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0]; - `CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0]; - `CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0]; - `CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0]; - `CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0]; - `CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0]; - `CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0]; - `CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0]; - 
`CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0]; - `CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0]; - `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0]; - `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; - default: begin - `ASSERT(write_addr >= `CSR_TEX_BEGIN(0) - && write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES), - ("%t: invalid CSR write address: %0h", $time, write_addr)); - end - endcase + end else begin + `ifdef EXT_F_ENABLE + if (fpu_to_csr_if.write_enable) begin + fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] + | fpu_to_csr_if.write_fflags; + end + `endif + if (write_enable) begin + case (write_addr) + `CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0]; + `CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0]; + `CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0]; + `CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0]; + `CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0]; + `CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0]; + `CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0]; + `CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0]; + `CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0]; + `CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0]; + `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0]; + `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; + default: begin + `ifdef EXT_TEX_ENABLE + `ASSERT((write_addr == `CSR_TEX_UNIT) + || (write_addr >= `CSR_TEX_STATE_BEGIN + && write_addr < `CSR_TEX_STATE_END), + ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid)); + `else + `ASSERT(~write_enable, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid)); + `endif + end + endcase + end end end @@ -89,6 +100,7 @@ module VX_csr_data #( assign tex_csr_if.write_enable = write_enable; assign tex_csr_if.write_addr = 
write_addr; assign tex_csr_if.write_data = write_data; + assign tex_csr_if.write_uuid = write_uuid; `endif always @(posedge clk) begin @@ -147,20 +159,28 @@ module VX_csr_data #( `CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0]; `CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]); + `ifdef EXT_F_ENABLE `CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0]; `CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]); + `else + `CSR_MPM_FPU_ST : read_data_r = '0; + `CSR_MPM_FPU_ST_H : read_data_r = '0; + `endif `CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0]; `CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]); + // PERF: decode + `CSR_MPM_LOADS : read_data_r = perf_pipeline_if.loads[31:0]; + `CSR_MPM_LOADS_H : read_data_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]); + `CSR_MPM_STORES : read_data_r = perf_pipeline_if.stores[31:0]; + `CSR_MPM_STORES_H : read_data_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]); + `CSR_MPM_BRANCHES : read_data_r = perf_pipeline_if.branches[31:0]; + `CSR_MPM_BRANCHES_H : read_data_r = 32'(perf_pipeline_if.branches[`PERF_CTR_BITS-1:32]); // PERF: icache `CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0]; `CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0]; `CSR_MPM_ICACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.icache_read_misses[`PERF_CTR_BITS-1:32]); - `CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0]; - `CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[`PERF_CTR_BITS-1:32]); - `CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0]; - `CSR_MPM_ICACHE_CRSP_ST_H : 
read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[`PERF_CTR_BITS-1:32]); - // PERF: dcache + // PERF: dcache `CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0]; `CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0]; @@ -173,26 +193,27 @@ module VX_csr_data #( `CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0]; `CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]); - `CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0]; - `CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[`PERF_CTR_BITS-1:32]); - `CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0]; - `CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[`PERF_CTR_BITS-1:32]); - // PERF: smem + // PERF: smem `CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0]; `CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0]; `CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]); `CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0]; `CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]); - // PERF: MEM + // PERF: memory `CSR_MPM_MEM_READS : read_data_r = perf_memsys_if.mem_reads[31:0]; `CSR_MPM_MEM_READS_H : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_MEM_WRITES : read_data_r = perf_memsys_if.mem_writes[31:0]; `CSR_MPM_MEM_WRITES_H : read_data_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]); - `CSR_MPM_MEM_ST : read_data_r = 
perf_memsys_if.mem_stalls[31:0]; - `CSR_MPM_MEM_ST_H : read_data_r = 32'(perf_memsys_if.mem_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_MEM_LAT : read_data_r = perf_memsys_if.mem_latency[31:0]; `CSR_MPM_MEM_LAT_H : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]); + `ifdef EXT_TEX_ENABLE + // PERF: texunit + `CSR_MPM_TEX_READS : read_data_r = perf_tex_if.mem_reads[31:0]; + `CSR_MPM_TEX_READS_H : read_data_r = 32'(perf_tex_if.mem_reads[`PERF_CTR_BITS-1:32]); + `CSR_MPM_TEX_LAT : read_data_r = perf_tex_if.mem_latency[31:0]; + `CSR_MPM_TEX_LAT_H : read_data_r = 32'(perf_tex_if.mem_latency[`PERF_CTR_BITS-1:32]); + `endif // PERF: reserved `CSR_MPM_RESERVED : read_data_r = '0; `CSR_MPM_RESERVED_H : read_data_r = '0; @@ -217,16 +238,23 @@ module VX_csr_data #( `CSR_MIMPID : read_data_r = `IMPLEMENTATION_ID; default: begin - if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32)) - || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32) - || (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))))) begin + if ((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32)) + || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32))) begin + read_addr_valid_r = 1; + end else + `ifdef EXT_TEX_ENABLE + if ((read_addr == `CSR_TEX_UNIT) + || (read_addr >= `CSR_TEX_STATE_BEGIN + && read_addr < `CSR_TEX_STATE_END)) begin + read_addr_valid_r = 1; + end else + `endif read_addr_valid_r = 0; - end end endcase end - `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("invalid CSR read address: %0h", read_addr)) + `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: %0h (#%0d)", $time, read_addr, read_uuid)) assign read_data = read_data_r; diff --git a/hw/rtl/VX_csr_unit.sv b/hw/rtl/VX_csr_unit.sv index 0b05ca9c..9186586a 100644 --- a/hw/rtl/VX_csr_unit.sv +++ b/hw/rtl/VX_csr_unit.sv @@ -7,6 +7,9 @@ module VX_csr_unit #( input wire reset, `ifdef PERF_ENABLE +`ifdef 
EXT_TEX_ENABLE + VX_perf_tex_if.slave perf_tex_if, +`endif VX_perf_memsys_if.slave perf_memsys_if, VX_perf_pipeline_if.slave perf_pipeline_if, `endif @@ -29,7 +32,8 @@ module VX_csr_unit #( ); wire csr_we_s1; wire [`CSR_ADDR_BITS-1:0] csr_addr_s1; - wire [31:0] csr_read_data, csr_read_data_s1; + wire [31:0] csr_read_data; + wire [31:0] csr_read_data_s1; wire [31:0] csr_updated_data_s1; wire write_enable = csr_commit_if.valid && csr_we_s1; @@ -42,8 +46,11 @@ module VX_csr_unit #( .clk (clk), .reset (reset), `ifdef PERF_ENABLE - .perf_memsys_if (perf_memsys_if), - .perf_pipeline_if (perf_pipeline_if), + `ifdef EXT_TEX_ENABLE + .perf_tex_if (perf_tex_if), + `endif + .perf_memsys_if (perf_memsys_if), + .perf_pipeline_if(perf_pipeline_if), `endif .cmt_to_csr_if (cmt_to_csr_if), .fetch_to_csr_if(fetch_to_csr_if), @@ -54,10 +61,12 @@ module VX_csr_unit #( .tex_csr_if (tex_csr_if), `endif .read_enable (csr_req_if.valid), + .read_uuid (csr_req_if.uuid), .read_addr (csr_req_if.addr), .read_wid (csr_req_if.wid), .read_data (csr_read_data), .write_enable (write_enable), + .write_uuid (csr_commit_if.uuid), .write_addr (csr_addr_s1), .write_wid (csr_commit_if.wid), .write_data (csr_updated_data_s1), @@ -101,14 +110,14 @@ module VX_csr_unit #( wire stall_out = ~csr_commit_if.ready && csr_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({csr_req_valid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}), - .data_out ({csr_commit_if.valid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1}) + .data_in 
({csr_req_valid, csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}), + .data_out ({csr_commit_if.valid, csr_commit_if.uuid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1}) ); for (genvar i = 0; i < `NUM_THREADS; i++) begin diff --git a/hw/rtl/VX_decode.sv b/hw/rtl/VX_decode.sv index 89d70d7a..3f9af431 100644 --- a/hw/rtl/VX_decode.sv +++ b/hw/rtl/VX_decode.sv @@ -20,6 +20,10 @@ module VX_decode #( input wire clk, input wire reset, +`ifdef PERF_ENABLE + VX_perf_pipeline_if.decode perf_decode_if, +`endif + // inputs VX_ifetch_rsp_if.slave ifetch_rsp_if, @@ -57,7 +61,6 @@ module VX_decode #( wire [11:0] s_imm = {func7, rd}; wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0}; wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0}; - wire [11:0] jalr_imm = {func7, rs2}; `UNUSED_VAR (rs3) @@ -169,7 +172,7 @@ module VX_decode #( use_rd = 1; use_imm = 1; is_wstall = 1; - imm = {{20{jalr_imm[11]}}, jalr_imm}; + imm = {{20{u_12[11]}}, u_12}; `USED_IREG (rd); `USED_IREG (rs1); end @@ -192,7 +195,7 @@ module VX_decode #( `USED_IREG (rs1); `USED_IREG (rs2); end - `INST_F: begin + `INST_FENCE: begin ex_type = `EX_LSU; op_mod = `INST_MOD_BITS'(1); end @@ -214,9 +217,9 @@ module VX_decode #( case (u_12) 12'h000: op_type = `INST_OP_BITS'(`INST_BR_ECALL); 12'h001: op_type = `INST_OP_BITS'(`INST_BR_EBREAK); + 12'h002: op_type = `INST_OP_BITS'(`INST_BR_URET); + 12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET); 12'h302: op_type = `INST_OP_BITS'(`INST_BR_MRET); - 12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET); - 12'h7B2: op_type = `INST_OP_BITS'(`INST_BR_DRET); default:; endcase op_mod = 1; @@ -347,7 +350,7 @@ module VX_decode #( endcase end `endif - `INST_GPU: begin + `INST_GPGPU: begin ex_type = `EX_GPU; case 
(func3) 3'h0: begin @@ -374,9 +377,21 @@ module VX_decode #( is_wstall = 1; `USED_IREG (rs1); `USED_IREG (rs2); - end - `ifdef EXT_TEX_ENABLE + end 3'h5: begin + ex_type = `EX_LSU; + op_type = `INST_OP_BITS'(`INST_LSU_LW); + op_mod = `INST_MOD_BITS'(2); + `USED_IREG (rs1); + end + default:; + endcase + end + `INST_GPU: begin + case (func3) + `ifdef EXT_TEX_ENABLE + 3'h0: begin + ex_type = `EX_GPU; op_type = `INST_OP_BITS'(`INST_GPU_TEX); op_mod = `INST_MOD_BITS'(func2); use_rd = 1; @@ -386,12 +401,6 @@ module VX_decode #( `USED_IREG (rs3); end `endif - 3'h6: begin - ex_type = `EX_LSU; - op_type = `INST_OP_BITS'(`INST_LSU_LW); - op_mod = `INST_MOD_BITS'(2); - `USED_IREG (rs1); - end default:; endcase end @@ -405,6 +414,7 @@ module VX_decode #( wire wb = use_rd && (| rd_r); assign decode_if.valid = ifetch_rsp_if.valid; + assign decode_if.uuid = ifetch_rsp_if.uuid; assign decode_if.wid = ifetch_rsp_if.wid; assign decode_if.tmask = ifetch_rsp_if.tmask; assign decode_if.PC = ifetch_rsp_if.PC; @@ -433,6 +443,42 @@ module VX_decode #( assign ifetch_rsp_if.ready = decode_if.ready; +`ifdef PERF_ENABLE + wire [$clog2(`NUM_THREADS+1)-1:0] perf_loads_per_cycle; + wire [$clog2(`NUM_THREADS+1)-1:0] perf_stores_per_cycle; + wire [$clog2(`NUM_THREADS+1)-1:0] perf_branches_per_cycle; + + wire [`NUM_THREADS-1:0] perf_loads_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && decode_if.wb}}; + wire [`NUM_THREADS-1:0] perf_stores_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && ~decode_if.wb}}; + wire [`NUM_THREADS-1:0] perf_branches_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_ALU && `INST_ALU_IS_BR(decode_if.op_mod)}}; + + `POP_COUNT(perf_loads_per_cycle, perf_loads_per_mask); + `POP_COUNT(perf_stores_per_cycle, perf_stores_per_mask); + `POP_COUNT(perf_branches_per_cycle, perf_branches_per_mask); + + reg [`PERF_CTR_BITS-1:0] perf_loads; + 
reg [`PERF_CTR_BITS-1:0] perf_stores; + reg [`PERF_CTR_BITS-1:0] perf_branches; + + always @(posedge clk) begin + if (reset) begin + perf_loads <= 0; + perf_stores <= 0; + perf_branches <= 0; + end else begin + if (decode_if.valid && decode_if.ready) begin + perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_loads_per_cycle); + perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_stores_per_cycle); + perf_branches <= perf_branches + `PERF_CTR_BITS'(perf_branches_per_cycle); + end + end + end + + assign perf_decode_if.loads = perf_loads; + assign perf_decode_if.stores = perf_stores; + assign perf_decode_if.branches = perf_branches; +`endif + `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (decode_if.valid && decode_if.ready) begin @@ -440,7 +486,8 @@ module VX_decode #( trace_ex_type(decode_if.ex_type); dpi_trace(", op="); trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod); - dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm); + dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b (#%0d)\n", + decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.uuid); end end `endif diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index c3706000..2badf7f8 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -34,6 +34,8 @@ `define PERF_CTR_BITS 44 +`define UUID_BITS 44 + /////////////////////////////////////////////////////////////////////////////// `define EX_NOP 3'h0 @@ -55,7 +57,7 @@ `define INST_S 7'b0100011 // store instructions `define INST_I 7'b0010011 // immediate instructions `define INST_R 7'b0110011 // register instructions -`define INST_F 7'b0001111 // 
Fence instructions +`define INST_FENCE 7'b0001111 // Fence instructions `define INST_SYS 7'b1110011 // system instructions `define INST_FL 7'b0000111 // float load instruction @@ -66,7 +68,8 @@ `define INST_FNMADD 7'b1001111 `define INST_FCI 7'b1010011 // float common instructions -`define INST_GPU 7'b1101011 +`define INST_GPGPU 7'b1101011 +`define INST_GPU 7'b1011011 `define INST_TEX 7'b0101011 @@ -117,9 +120,9 @@ `define INST_BR_JALR 4'b1001 `define INST_BR_ECALL 4'b1010 `define INST_BR_EBREAK 4'b1011 -`define INST_BR_MRET 4'b1100 +`define INST_BR_URET 4'b1100 `define INST_BR_SRET 4'b1101 -`define INST_BR_DRET 4'b1110 +`define INST_BR_MRET 4'b1110 `define INST_BR_OTHER 4'b1111 `define INST_BR_BITS 4 `define INST_BR_NEG(x) x[1] @@ -154,6 +157,7 @@ `define INST_LSU_BITS 4 `define INST_LSU_FMT(x) x[2:0] `define INST_LSU_WSIZE(x) x[1:0] +`define INST_LSU_IS_MEM(x) (3'h0 == x) `define INST_LSU_IS_FENCE(x) (3'h1 == x) `define INST_LSU_IS_PREFETCH(x) (3'h2 == x) @@ -185,14 +189,14 @@ `define INST_FPU_NMADD 4'hF `define INST_FPU_BITS 4 -`define INST_GPU_TMC 3'h0 -`define INST_GPU_WSPAWN 3'h1 -`define INST_GPU_SPLIT 3'h2 -`define INST_GPU_JOIN 3'h3 -`define INST_GPU_BAR 3'h4 -`define INST_GPU_PRED 3'h5 -`define INST_GPU_TEX 3'h6 -`define INST_GPU_BITS 3 +`define INST_GPU_TMC 4'h0 +`define INST_GPU_WSPAWN 4'h1 +`define INST_GPU_SPLIT 4'h2 +`define INST_GPU_JOIN 4'h3 +`define INST_GPU_BAR 4'h4 +`define INST_GPU_PRED 4'h5 +`define INST_GPU_TEX 4'h6 +`define INST_GPU_BITS 4 /////////////////////////////////////////////////////////////////////////////// @@ -237,18 +241,15 @@ /////////////////////////////////////////////////////////////////////////////// -`ifdef DBG_CACHE_REQ_INFO // wid PC -`define DBG_CACHE_REQ_MDATAW (`NW_BITS + 32) -`else -`define DBG_CACHE_REQ_MDATAW 0 -`endif - // non-cacheable tag bits `define NC_TAG_BIT 1 // texture tag bits `define TEX_TAG_BIT 1 +// cache address type bits +`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BIT + `SM_ENABLE) + 
////////////////////////// Icache Configurable Knobs ////////////////////////// // Cache ID @@ -264,7 +265,7 @@ `define ICACHE_CORE_TAG_ID_BITS `NW_BITS // Core request tag bits -`define ICACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICACHE_CORE_TAG_ID_BITS) +`define ICACHE_CORE_TAG_WIDTH (`UUID_BITS + `ICACHE_CORE_TAG_ID_BITS) // Memory request data bits `define ICACHE_MEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8) @@ -289,17 +290,14 @@ // Core request tag bits `define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) `ifdef EXT_TEX_ENABLE -`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE) -`define TEX_TAG_ID_BITS (2) -`define LSU_TEX_TAG_ID_BITS `MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS) -`define DCACHE_CORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + `TEX_TAG_BIT) -`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS) -`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS) -`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TEX_TAG_ID_BITS) +`define LSU_TAG_ID_BITS `MAX(`LSUQ_ADDR_BITS, 2) +`define LSU_TEX_DCACHE_TAG_BITS (`UUID_BITS + `LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS) +`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS + `TEX_TAG_BIT) `else -`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE) +`define LSU_TAG_ID_BITS `LSUQ_ADDR_BITS +`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS) `endif -`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS) +`define DCACHE_CORE_TAG_WIDTH (`UUID_BITS + `DCACHE_CORE_TAG_ID_BITS) // Memory request data bits `define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8) diff --git a/hw/rtl/VX_dispatch.sv b/hw/rtl/VX_dispatch.sv index 008a7c62..9b8b88c8 100644 --- a/hw/rtl/VX_dispatch.sv +++ b/hw/rtl/VX_dispatch.sv @@ -42,15 +42,15 @@ module VX_dispatch ( wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + 
`INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), .OUT_REG (1) ) alu_buffer ( .clk (clk), .reset (reset), .valid_in (alu_req_valid), .ready_in (alu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), - .data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), + .data_out ({alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}), .valid_out (alu_req_if.valid), .ready_out (alu_req_if.ready) ); @@ -63,15 +63,15 @@ module VX_dispatch ( wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), .OUT_REG (1) ) lsu_buffer ( .clk (clk), .reset (reset), .valid_in (lsu_req_valid), .ready_in (lsu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, 
ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}), - .data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}), + .data_out ({lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}), .valid_out (lsu_req_if.valid), .ready_out (lsu_req_if.ready) ); @@ -85,15 +85,15 @@ module VX_dispatch ( wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid]; VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), .OUT_REG (1) ) csr_buffer ( .clk (clk), .reset (reset), .valid_in (csr_req_valid), .ready_in (csr_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}), - .data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}), + .data_out ({csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, 
csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}), .valid_out (csr_req_if.valid), .ready_out (csr_req_if.ready) ); @@ -105,15 +105,15 @@ module VX_dispatch ( wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), .OUT_REG (1) ) fpu_buffer ( .clk (clk), .reset (reset), .valid_in (fpu_req_valid), .ready_in (fpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), - .data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), .valid_out (fpu_req_if.valid), .ready_out (fpu_req_if.ready) ); @@ -127,15 +127,15 @@ module VX_dispatch ( wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)), .OUT_REG (1) ) gpu_buffer ( .clk (clk), .reset (reset), .valid_in 
(gpu_req_valid), .ready_in (gpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), - .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({gpu_req_if.uuid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}), .valid_out (gpu_req_if.valid), .ready_out (gpu_req_if.ready) ); diff --git a/hw/rtl/VX_execute.sv b/hw/rtl/VX_execute.sv index f0cdd37e..3549465a 100644 --- a/hw/rtl/VX_execute.sv +++ b/hw/rtl/VX_execute.sv @@ -52,49 +52,31 @@ module VX_execute #( VX_dcache_req_if #( .NUM_REQS (`NUM_THREADS), .WORD_SIZE (4), - .TAG_WIDTH (`LSU_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) lsu_dcache_req_if(); VX_dcache_rsp_if #( .NUM_REQS (`NUM_THREADS), .WORD_SIZE (4), - .TAG_WIDTH (`LSU_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) lsu_dcache_rsp_if(); VX_dcache_req_if #( .NUM_REQS (`NUM_THREADS), .WORD_SIZE (4), - .TAG_WIDTH (`TEX_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) tex_dcache_req_if(); VX_dcache_rsp_if #( .NUM_REQS (`NUM_THREADS), .WORD_SIZE (4), - .TAG_WIDTH (`TEX_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) tex_dcache_rsp_if(); VX_tex_csr_if tex_csr_if(); - wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in, lsu_tag_in; - wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] 
tex_tag_out, lsu_tag_out; - - `UNUSED_VAR (tex_tag_out) - `UNUSED_VAR (lsu_tag_out) - - for (genvar i = 0; i < `NUM_THREADS; ++i) begin - assign tex_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(tex_dcache_req_if.tag[i][`TEX_TAG_ID_BITS-1:0]); - assign lsu_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(lsu_dcache_req_if.tag[i][`LSU_TAG_ID_BITS-1:0]); - `ifdef DBG_CACHE_REQ_INFO - assign tex_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = tex_dcache_req_if.tag[i][`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS]; - assign lsu_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = lsu_dcache_req_if.tag[i][`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS]; - `endif - end - - assign tex_dcache_rsp_if.tag[`TEX_TAG_ID_BITS-1:0] = tex_tag_out[`TEX_TAG_ID_BITS-1:0]; - assign lsu_dcache_rsp_if.tag[`LSU_TAG_ID_BITS-1:0] = lsu_tag_out[`LSU_TAG_ID_BITS-1:0]; -`ifdef DBG_CACHE_REQ_INFO - assign tex_dcache_rsp_if.tag[`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS] = tex_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS]; - assign lsu_dcache_rsp_if.tag[`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS] = lsu_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS]; +`ifdef PERF_ENABLE + VX_perf_tex_if perf_tex_if(); `endif VX_cache_arb #( @@ -113,7 +95,7 @@ module VX_execute #( .req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}), .req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}), .req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}), - .req_tag_in ({tex_tag_in, lsu_tag_in}), + .req_tag_in ({tex_dcache_req_if.tag, lsu_dcache_req_if.tag}), .req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}), // Dcache request @@ -136,7 +118,7 @@ module VX_execute #( .rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}), .rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}), .rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}), - .rsp_tag_out ({tex_tag_out, lsu_tag_out}), + .rsp_tag_out 
({tex_dcache_rsp_if.tag, lsu_dcache_rsp_if.tag}), .rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready}) ); @@ -187,6 +169,9 @@ module VX_execute #( .clk (clk), .reset (csr_reset), `ifdef PERF_ENABLE + `ifdef EXT_TEX_ENABLE + .perf_tex_if (perf_tex_if), + `endif .perf_memsys_if (perf_memsys_if), .perf_pipeline_if(perf_pipeline_if), `endif @@ -231,6 +216,9 @@ module VX_execute #( .reset (gpu_reset), .gpu_req_if (gpu_req_if), `ifdef EXT_TEX_ENABLE + `ifdef PERF_ENABLE + .perf_tex_if (perf_tex_if), + `endif .tex_csr_if (tex_csr_if), .dcache_req_if (tex_dcache_req_if), .dcache_rsp_if (tex_dcache_rsp_if), diff --git a/hw/rtl/VX_fpu_unit.sv b/hw/rtl/VX_fpu_unit.sv index 7b0f07cc..342bf36d 100644 --- a/hw/rtl/VX_fpu_unit.sv +++ b/hw/rtl/VX_fpu_unit.sv @@ -22,6 +22,7 @@ module VX_fpu_unit #( wire valid_out; wire ready_out; + wire [`UUID_BITS-1:0] rsp_uuid; wire [`NW_BITS-1:0] rsp_wid; wire [`NUM_THREADS-1:0] rsp_tmask; wire [31:0] rsp_PC; @@ -39,7 +40,7 @@ module VX_fpu_unit #( wire fpuq_pop = valid_out && ready_out; VX_index_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), .SIZE (`FPUQ_SIZE) ) req_metadata ( .clk (clk), @@ -48,8 +49,8 @@ module VX_fpu_unit #( .write_addr (tag_in), .read_addr (tag_out), .release_addr (tag_out), - .write_data ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}), - .read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), + .write_data ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}), + .read_data ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), .release_slot (fpuq_pop), .full (fpuq_full), `UNUSED_PIN (empty) @@ -180,14 +181,14 @@ module VX_fpu_unit #( wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS), + .DATAW (1 + `UUID_BITS 
+ `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({valid_out, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}), - .data_out ({fpu_commit_if.valid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r}) + .data_in ({valid_out, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}), + .data_out ({fpu_commit_if.valid, fpu_commit_if.uuid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r}) ); assign fpu_commit_if.eop = 1'b1; diff --git a/hw/rtl/VX_gpu_unit.sv b/hw/rtl/VX_gpu_unit.sv index 06d5fbc7..b4047830 100644 --- a/hw/rtl/VX_gpu_unit.sv +++ b/hw/rtl/VX_gpu_unit.sv @@ -12,6 +12,10 @@ module VX_gpu_unit #( VX_gpu_req_if.slave gpu_req_if, `ifdef EXT_TEX_ENABLE + // PERF +`ifdef PERF_ENABLE + VX_perf_tex_if.master perf_tex_if, +`endif VX_dcache_req_if.master dcache_req_if, VX_dcache_rsp_if.slave dcache_rsp_if, VX_tex_csr_if.slave tex_csr_if, @@ -28,12 +32,13 @@ module VX_gpu_unit #( localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS; localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW); - wire rsp_valid; - wire [`NW_BITS-1:0] rsp_wid; - wire [`NUM_THREADS-1:0] rsp_tmask; - wire [31:0] rsp_PC; - wire [`NR_BITS-1:0] rsp_rd; - wire rsp_wb; + wire rsp_valid; + wire [`UUID_BITS-1:0] rsp_uuid; + wire [`NW_BITS-1:0] rsp_wid; + wire [`NUM_THREADS-1:0] rsp_tmask; + wire [31:0] rsp_PC; + wire [`NR_BITS-1:0] rsp_rd; + wire rsp_wb; wire [RSP_DATAW-1:0] rsp_data, rsp_data_r; @@ -112,6 +117,7 @@ module VX_gpu_unit #( wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX); assign tex_req_if.valid = gpu_req_if.valid && is_tex; + assign tex_req_if.uuid = gpu_req_if.uuid; assign 
tex_req_if.wid = gpu_req_if.wid; assign tex_req_if.tmask = gpu_req_if.tmask; assign tex_req_if.PC = gpu_req_if.PC; @@ -128,6 +134,9 @@ module VX_gpu_unit #( ) tex_unit ( .clk (clk), .reset (reset), + `ifdef PERF_ENABLE + .perf_tex_if (perf_tex_if), + `endif .tex_req_if (tex_req_if), .tex_csr_if (tex_csr_if), .tex_rsp_if (tex_rsp_if), @@ -143,6 +152,7 @@ module VX_gpu_unit #( assign is_warp_ctl = !(is_tex || tex_rsp_if.valid); assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex); + assign rsp_uuid = tex_rsp_if.valid ? tex_rsp_if.uuid : gpu_req_if.uuid; assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid; assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask; assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC; @@ -161,6 +171,7 @@ module VX_gpu_unit #( assign is_warp_ctl = 1; assign rsp_valid = gpu_req_if.valid; + assign rsp_uuid = gpu_req_if.uuid; assign rsp_wid = gpu_req_if.wid; assign rsp_tmask = gpu_req_if.tmask; assign rsp_PC = gpu_req_if.PC; @@ -176,14 +187,14 @@ module VX_gpu_unit #( assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}), - .data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r}) + .data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}), + .data_out ({gpu_commit_if.valid, gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r}) ); assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0]; @@ 
-200,7 +211,7 @@ module VX_gpu_unit #( assign gpu_req_if.ready = ~stall_in; `SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid); - `SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid); + `SCOPE_ASSIGN (gpu_rsp_uuid, gpu_commit_if.uuid); `SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc.valid); `SCOPE_ASSIGN (gpu_rsp_wspawn, warp_ctl_if.wspawn.valid); `SCOPE_ASSIGN (gpu_rsp_split, warp_ctl_if.split.valid); diff --git a/hw/rtl/VX_ibuffer.sv b/hw/rtl/VX_ibuffer.sv index 9b9fd397..6231ac5f 100644 --- a/hw/rtl/VX_ibuffer.sv +++ b/hw/rtl/VX_ibuffer.sv @@ -15,7 +15,7 @@ module VX_ibuffer #( `UNUSED_PARAM (CORE_ID) - localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1; + localparam DATAW = `UUID_BITS + `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1; localparam ADDRW = $clog2(`IBUF_SIZE+1); localparam NWARPSW = $clog2(`NUM_WARPS+1); @@ -168,7 +168,8 @@ module VX_ibuffer #( assign decode_if.ready = ~q_full[decode_if.wid]; - assign q_data_in = {decode_if.tmask, + assign q_data_in = {decode_if.uuid, + decode_if.tmask, decode_if.PC, decode_if.ex_type, decode_if.op_type, @@ -184,7 +185,8 @@ module VX_ibuffer #( assign ibuffer_if.valid = deq_valid; assign ibuffer_if.wid = deq_wid; - assign {ibuffer_if.tmask, + assign {ibuffer_if.uuid, + ibuffer_if.tmask, ibuffer_if.PC, ibuffer_if.ex_type, ibuffer_if.op_type, diff --git a/hw/rtl/VX_icache_stage.sv b/hw/rtl/VX_icache_stage.sv index cb33b82d..be096c5f 100644 --- a/hw/rtl/VX_icache_stage.sv +++ b/hw/rtl/VX_icache_stage.sv @@ -24,77 +24,77 @@ module VX_icache_stage #( localparam OUT_REG = 0; + wire [`NW_BITS-1:0] req_tag, rsp_tag; + wire icache_req_fire = icache_req_if.valid && icache_req_if.ready; - wire [`NW_BITS-1:0] req_tag = ifetch_req_if.wid; - wire [`NW_BITS-1:0] rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; + assign req_tag = ifetch_req_if.wid; + assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; + wire [`UUID_BITS-1:0] rsp_uuid; wire 
[31:0] rsp_PC; wire [`NUM_THREADS-1:0] rsp_tmask; VX_dp_ram #( - .DATAW (32 + `NUM_THREADS), + .DATAW (32 + `NUM_THREADS + `UUID_BITS), .SIZE (`NUM_WARPS), .LUTRAM (1) ) req_metadata ( .clk (clk), .wren (icache_req_fire), .waddr (req_tag), - .wdata ({ifetch_req_if.PC, ifetch_req_if.tmask}), + .wdata ({ifetch_req_if.PC, ifetch_req_if.tmask, ifetch_req_if.uuid}), .raddr (rsp_tag), - .rdata ({rsp_PC, rsp_tmask}) + .rdata ({rsp_PC, rsp_tmask, rsp_uuid}) ); `RUNTIME_ASSERT((!ifetch_req_if.valid || ifetch_req_if.PC >= `STARTUP_ADDR), - ("invalid PC=%0h, wid=%0d, tmask=%b", ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask)) + ("%t: *** invalid PC=%0h, wid=%0d, tmask=%b (#%0d)", $time, ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask, ifetch_req_if.uuid)) // Icache Request assign icache_req_if.valid = ifetch_req_if.valid; assign icache_req_if.addr = ifetch_req_if.PC[31:2]; + assign icache_req_if.tag = {ifetch_req_if.uuid, req_tag}; // Can accept new request? assign ifetch_req_if.ready = icache_req_if.ready; -`ifdef DBG_CACHE_REQ_INFO - assign icache_req_if.tag = {ifetch_req_if.wid, ifetch_req_if.PC, req_tag}; -`else - assign icache_req_if.tag = req_tag; -`endif - wire [`NW_BITS-1:0] rsp_wid = rsp_tag; wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid); VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32), + .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `UUID_BITS), .RESETW (1), .DEPTH (OUT_REG) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data}), - .data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data}) + .data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data, rsp_uuid}), + .data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid}) ); // Can accept new response? 
assign icache_rsp_if.ready = ~stall_out; `SCOPE_ASSIGN (icache_req_fire, icache_req_fire); - `SCOPE_ASSIGN (icache_req_wid, ifetch_req_if.wid); + `SCOPE_ASSIGN (icache_req_uuid, ifetch_req_if.uuid); `SCOPE_ASSIGN (icache_req_addr, {icache_req_if.addr, 2'b0}); `SCOPE_ASSIGN (icache_req_tag, req_tag); + `SCOPE_ASSIGN (icache_rsp_fire, icache_rsp_if.valid && icache_rsp_if.ready); + `SCOPE_ASSIGN (icache_rsp_uuid, rsp_uuid); `SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data); `SCOPE_ASSIGN (icache_rsp_tag, rsp_tag); `ifdef DBG_TRACE_CORE_ICACHE always @(posedge clk) begin - if (icache_req_if.valid && icache_req_if.ready) begin - dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC); + if (icache_req_fire) begin + dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h (#%0d)\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, ifetch_req_if.uuid); end if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin - dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data); + dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h (#%0d)\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid); end end `endif diff --git a/hw/rtl/VX_issue.sv b/hw/rtl/VX_issue.sv index abbb5241..e20f5fce 100644 --- a/hw/rtl/VX_issue.sv +++ b/hw/rtl/VX_issue.sv @@ -9,7 +9,7 @@ module VX_issue #( input wire reset, `ifdef PERF_ENABLE - VX_perf_pipeline_if.master perf_pipeline_if, + VX_perf_pipeline_if.issue perf_issue_if, `endif VX_decode_if.slave decode_if, @@ -38,6 +38,7 @@ module VX_issue #( // scoreboard writeback interface assign sboard_wb_if.valid = writeback_if.valid; + assign sboard_wb_if.uuid = writeback_if.uuid; assign sboard_wb_if.wid = writeback_if.wid; assign sboard_wb_if.PC = writeback_if.PC; assign sboard_wb_if.rd = writeback_if.rd; @@ -45,6 +46,7 @@ module VX_issue #( // scoreboard interface assign scoreboard_if.valid = ibuffer_if.valid && 
dispatch_if.ready; + assign scoreboard_if.uuid = ibuffer_if.uuid; assign scoreboard_if.wid = ibuffer_if.wid; assign scoreboard_if.PC = ibuffer_if.PC; assign scoreboard_if.wb = ibuffer_if.wb; @@ -57,6 +59,7 @@ module VX_issue #( // dispatch interface assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready; + assign dispatch_if.uuid = ibuffer_if.uuid; assign dispatch_if.wid = ibuffer_if.wid; assign dispatch_if.tmask = ibuffer_if.tmask; assign dispatch_if.PC = ibuffer_if.PC; @@ -121,9 +124,8 @@ module VX_issue #( ); `SCOPE_ASSIGN (issue_fire, ibuffer_if.valid && ibuffer_if.ready); - `SCOPE_ASSIGN (issue_wid, ibuffer_if.wid); + `SCOPE_ASSIGN (issue_uuid, ibuffer_if.uuid); `SCOPE_ASSIGN (issue_tmask, ibuffer_if.tmask); - `SCOPE_ASSIGN (issue_pc, ibuffer_if.PC); `SCOPE_ASSIGN (issue_ex_type, ibuffer_if.ex_type); `SCOPE_ASSIGN (issue_op_type, ibuffer_if.op_type); `SCOPE_ASSIGN (issue_op_mod, ibuffer_if.op_mod); @@ -140,10 +142,9 @@ module VX_issue #( `SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data); `SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data); `SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data); - `SCOPE_ASSIGN (writeback_valid, writeback_if.valid); + `SCOPE_ASSIGN (writeback_valid, writeback_if.valid); + `SCOPE_ASSIGN (writeback_uuid, writeback_if.uuid); `SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask); - `SCOPE_ASSIGN (writeback_wid, writeback_if.wid); - `SCOPE_ASSIGN (writeback_pc, writeback_if.PC); `SCOPE_ASSIGN (writeback_rd, writeback_if.rd); `SCOPE_ASSIGN (writeback_data, writeback_if.data); `SCOPE_ASSIGN (writeback_eop, writeback_if.eop); @@ -171,40 +172,35 @@ module VX_issue #( perf_fpu_stalls <= 0; `endif end else begin - if (decode_if.valid & !decode_if.ready) begin + if (decode_if.valid & ~decode_if.ready) begin perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1; end - if (scoreboard_if.valid & !scoreboard_if.ready) begin + if (scoreboard_if.valid & ~scoreboard_if.ready) begin perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1; end - if 
(alu_req_if.valid & !alu_req_if.ready) begin - perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1; + if (dispatch_if.valid & ~dispatch_if.ready) begin + case (dispatch_if.ex_type) + `EX_ALU: perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1; + `ifdef EXT_F_ENABLE + `EX_FPU: perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1; + `endif + `EX_LSU: perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1; + `EX_CSR: perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1; + //`EX_GPU: + default: perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1; + endcase end - if (lsu_req_if.valid & !lsu_req_if.ready) begin - perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1; - end - if (csr_req_if.valid & !csr_req_if.ready) begin - perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1; - end - if (gpu_req_if.valid & !gpu_req_if.ready) begin - perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1; - end - `ifdef EXT_F_ENABLE - if (fpu_req_if.valid & !fpu_req_if.ready) begin - perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1; - end - `endif end end - assign perf_pipeline_if.ibf_stalls = perf_ibf_stalls; - assign perf_pipeline_if.scb_stalls = perf_scb_stalls; - assign perf_pipeline_if.alu_stalls = perf_alu_stalls; - assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls; - assign perf_pipeline_if.csr_stalls = perf_csr_stalls; - assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls; + assign perf_issue_if.ibf_stalls = perf_ibf_stalls; + assign perf_issue_if.scb_stalls = perf_scb_stalls; + assign perf_issue_if.alu_stalls = perf_alu_stalls; + assign perf_issue_if.lsu_stalls = perf_lsu_stalls; + assign perf_issue_if.csr_stalls = perf_csr_stalls; + assign perf_issue_if.gpu_stalls = perf_gpu_stalls; `ifdef EXT_F_ENABLE - assign perf_pipeline_if.fpu_stalls = perf_fpu_stalls; + assign perf_issue_if.fpu_stalls = perf_fpu_stalls; `endif `endif @@ -216,7 +212,7 @@ module VX_issue #( `TRACE_ARRAY1D(alu_req_if.rs1_data, `NUM_THREADS); dpi_trace(", rs2_data="); 
`TRACE_ARRAY1D(alu_req_if.rs2_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", alu_req_if.uuid); end if (lsu_req_if.valid && lsu_req_if.ready) begin dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, offset=%0h, addr=", @@ -224,13 +220,13 @@ module VX_issue #( `TRACE_ARRAY1D(lsu_req_if.base_addr, `NUM_THREADS); dpi_trace(", data="); `TRACE_ARRAY1D(lsu_req_if.store_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", lsu_req_if.uuid); end if (csr_req_if.valid && csr_req_if.ready) begin dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=", $time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.addr); `TRACE_ARRAY1D(csr_req_if.rs1_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", csr_req_if.uuid); end `ifdef EXT_F_ENABLE if (fpu_req_if.valid && fpu_req_if.ready) begin @@ -241,7 +237,7 @@ module VX_issue #( `TRACE_ARRAY1D(fpu_req_if.rs2_data, `NUM_THREADS); dpi_trace(", rs3_data="); `TRACE_ARRAY1D(fpu_req_if.rs3_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", fpu_req_if.uuid); end `endif if (gpu_req_if.valid && gpu_req_if.ready) begin @@ -252,7 +248,7 @@ module VX_issue #( `TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS); dpi_trace(", rs3_data="); `TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", gpu_req_if.uuid); end end `endif diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv index 8541f4c6..5116035f 100644 --- a/hw/rtl/VX_lsu_unit.sv +++ b/hw/rtl/VX_lsu_unit.sv @@ -21,16 +21,14 @@ module VX_lsu_unit #( ); localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE); localparam MEM_ADDRW = 32 - MEM_ASHIFT; - localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE); - localparam ADDR_TYPEW = `NC_TAG_BIT + `SM_ENABLE; - `STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) `STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) 
`STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter")) wire req_valid; + wire [`UUID_BITS-1:0] req_uuid; wire [`NUM_THREADS-1:0] req_tmask; wire [`NUM_THREADS-1:0][31:0] req_addr; wire [`INST_LSU_BITS-1:0] req_type; @@ -44,8 +42,9 @@ module VX_lsu_unit #( wire mbuf_empty; - wire [`NUM_THREADS-1:0][ADDR_TYPEW-1:0] lsu_addr_type, req_addr_type; + wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type; + // full address calculation wire [`NUM_THREADS-1:0][31:0] full_addr; for (genvar i = 0; i < `NUM_THREADS; i++) begin assign full_addr[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset; @@ -56,16 +55,16 @@ module VX_lsu_unit #( for (genvar i = 0; i < (`NUM_THREADS-1); i++) begin assign addr_matches[i] = (lsu_req_if.base_addr[i+1] == lsu_req_if.base_addr[0]) || ~lsu_req_if.tmask[i+1]; end + wire lsu_is_dup = lsu_req_if.tmask[0] && (& addr_matches); for (genvar i = 0; i < `NUM_THREADS; i++) begin // is non-cacheable address wire is_addr_nc = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'(`IO_BASE_ADDR >> MEM_ASHIFT)); - if (`SM_ENABLE) begin // is shared memory address wire is_addr_sm = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'((`SMEM_BASE_ADDR - `SMEM_SIZE) >> MEM_ASHIFT)) - & (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT)); + & (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT)); assign lsu_addr_type[i] = {is_addr_nc, is_addr_sm}; end else begin assign lsu_addr_type[i] = is_addr_nc; @@ -83,19 +82,20 @@ module VX_lsu_unit #( wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch; VX_pipe_register #( - .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 1 + 1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + 
(`NUM_THREADS * 32)), .RESETW (1) ) req_pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_in), - .data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}), - .data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) + .data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}), + .data_out ({req_valid, req_is_dup, req_is_prefetch, req_uuid, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) ); // Can accept new request? assign lsu_req_if.ready = ~stall_in && ~fence_wait; + wire [`UUID_BITS-1:0] rsp_uuid; wire [`NW_BITS-1:0] rsp_wid; wire [31:0] rsp_pc; wire [`NR_BITS-1:0] rsp_rd; @@ -104,9 +104,6 @@ module VX_lsu_unit #( wire rsp_is_dup; wire rsp_is_prefetch; - `UNUSED_VAR (rsp_type) - `UNUSED_VAR (rsp_is_prefetch) - reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask; wire [`NUM_THREADS-1:0] rsp_rem_mask_n; wire [`NUM_THREADS-1:0] rsp_tmask; @@ -117,6 +114,9 @@ module VX_lsu_unit #( wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr; wire mbuf_full; + `UNUSED_VAR (rsp_type) + `UNUSED_VAR (rsp_is_prefetch) + wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_offset, rsp_offset; for (genvar i = 0; i < `NUM_THREADS; i++) begin assign req_offset[i] = req_addr[i][1:0]; @@ -135,14 +135,14 @@ module VX_lsu_unit #( wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); - assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS]; + assign mbuf_raddr = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: `LSUQ_ADDR_BITS]; `UNUSED_VAR (dcache_rsp_if.tag) // do not writeback from software prefetch wire req_wb2 = req_wb && ~req_is_prefetch; VX_index_buffer #( - 
.DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), + .DATAW (`UUID_BITS + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), .SIZE (`LSUQ_SIZE) ) req_metadata ( .clk (clk), @@ -150,8 +150,8 @@ module VX_lsu_unit #( .write_addr (mbuf_waddr), .acquire_slot (mbuf_push), .read_addr (mbuf_raddr), - .write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}), - .read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}), + .write_data ({req_uuid, req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}), + .read_data ({rsp_uuid, rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}), .release_addr (mbuf_raddr), .release_slot (mbuf_pop), .full (mbuf_full), @@ -214,7 +214,7 @@ module VX_lsu_unit #( 0: mem_req_byteen[req_offset[i]] = 1; 1: begin mem_req_byteen[req_offset[i]] = 1; - mem_req_byteen[{req_addr[i][1], 1'b1}] = 1; + mem_req_byteen[{req_offset[i][1], 1'b1}] = 1; end default : mem_req_byteen = {4{1'b1}}; endcase @@ -235,14 +235,9 @@ module VX_lsu_unit #( assign dcache_req_if.addr[i] = req_addr[i][31:2]; assign dcache_req_if.byteen[i] = mem_req_byteen; assign dcache_req_if.data[i] = mem_req_data; - - `ifdef DBG_CACHE_REQ_INFO - assign dcache_req_if.tag[i] = {req_wid, req_pc, req_tag, req_addr_type[i]}; - `else - assign dcache_req_if.tag[i] = {req_tag, req_addr_type[i]}; - `endif + assign dcache_req_if.tag[i] = {req_uuid, `LSU_TAG_ID_BITS'(req_tag), req_addr_type[i]}; end - + assign ready_in = req_dep_ready && dcache_req_ready; // send store commit @@ -250,6 +245,7 @@ module VX_lsu_unit #( wire is_store_rsp = req_valid && ~req_wb && dcache_req_ready; assign st_commit_if.valid = is_store_rsp; + assign st_commit_if.uuid = req_uuid; assign st_commit_if.wid = req_wid; assign 
st_commit_if.tmask = req_tmask; assign st_commit_if.PC = req_pc; @@ -286,14 +282,14 @@ module VX_lsu_unit #( wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1), .RESETW (1) ) rsp_pipe_reg ( .clk (clk), .reset (reset), .enable (!load_rsp_stall), - .data_in ({dcache_rsp_if.valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), - .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) + .data_in ({dcache_rsp_if.valid, rsp_uuid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), + .data_out ({ld_commit_if.valid, ld_commit_if.uuid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) ); // Can accept new cache response? 
@@ -301,19 +297,19 @@ module VX_lsu_unit #( // scope registration `SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire); - `SCOPE_ASSIGN (dcache_req_wid, req_wid); - `SCOPE_ASSIGN (dcache_req_pc, req_pc); + `SCOPE_ASSIGN (dcache_req_uuid, req_uuid); `SCOPE_ASSIGN (dcache_req_addr, req_addr); `SCOPE_ASSIGN (dcache_req_rw, ~req_wb); `SCOPE_ASSIGN (dcache_req_byteen,dcache_req_if.byteen); `SCOPE_ASSIGN (dcache_req_data, dcache_req_if.data); `SCOPE_ASSIGN (dcache_req_tag, req_tag); `SCOPE_ASSIGN (dcache_rsp_fire, dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_fire}}); + `SCOPE_ASSIGN (dcache_rsp_uuid, rsp_uuid); `SCOPE_ASSIGN (dcache_rsp_data, dcache_rsp_if.data); `SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr); `ifndef SYNTHESIS - reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + 64 + 1)-1:0] pending_reqs; + reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + `UUID_BITS + 64 + 1)-1:0] pending_reqs; wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE)); always @(posedge clk) begin @@ -321,7 +317,7 @@ module VX_lsu_unit #( pending_reqs <= '0; end begin if (mbuf_push) begin - pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, $time, 1'b1}; + pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, req_uuid, $time, 1'b1}; end if (mbuf_pop) begin pending_reqs[mbuf_raddr] <= '0; @@ -331,8 +327,11 @@ module VX_lsu_unit #( for (integer i = 0; i < `LSUQ_SIZE; ++i) begin if (pending_reqs[i][0]) begin `ASSERT(($time - pending_reqs[i][1 +: 64]) < delay_timeout, - ("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d", - $time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+32+`NR_BITS +: `NW_BITS], pending_reqs[i][1+64+`NR_BITS +: 32], pending_reqs[i][1+64 +: `NR_BITS])); + ("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d (#%0d)", + $time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+`UUID_BITS+`NR_BITS+32 +: `NW_BITS], + pending_reqs[i][1+64+`UUID_BITS+`NR_BITS +: 32], + pending_reqs[i][1+64+`UUID_BITS +: `NR_BITS], + 
pending_reqs[i][1+64 +: `UUID_BITS])); end end end @@ -352,20 +351,20 @@ module VX_lsu_unit #( `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); dpi_trace(", data="); `TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(", (#%0d)\n", req_uuid); end else begin dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); - dpi_trace(", rd=%0d, is_dup=%b\n", req_rd, req_is_dup); + dpi_trace(", rd=%0d, is_dup=%b (#%0d)\n", req_rd, req_is_dup, req_uuid); end end if (dcache_rsp_fire) begin dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=", $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd); `TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); - dpi_trace(", is_dup=%b\n", rsp_is_dup); + dpi_trace(", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid); end end `endif diff --git a/hw/rtl/VX_mem_unit.sv b/hw/rtl/VX_mem_unit.sv index 56de47ef..ade9600f 100644 --- a/hw/rtl/VX_mem_unit.sv +++ b/hw/rtl/VX_mem_unit.sv @@ -358,19 +358,17 @@ module VX_mem_unit # ( `ifdef PERF_ENABLE + `UNUSED_VAR (perf_dcache_if.mem_stalls) + `UNUSED_VAR (perf_dcache_if.crsp_stalls) + assign perf_memsys_if.icache_reads = perf_icache_if.reads; assign perf_memsys_if.icache_read_misses = perf_icache_if.read_misses; - assign perf_memsys_if.icache_pipe_stalls = perf_icache_if.pipe_stalls; - assign perf_memsys_if.icache_crsp_stalls = perf_icache_if.crsp_stalls; - assign perf_memsys_if.dcache_reads = perf_dcache_if.reads; assign perf_memsys_if.dcache_writes = perf_dcache_if.writes; assign perf_memsys_if.dcache_read_misses = perf_dcache_if.read_misses; assign perf_memsys_if.dcache_write_misses= perf_dcache_if.write_misses; assign perf_memsys_if.dcache_bank_stalls = 
perf_dcache_if.bank_stalls; assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls; - assign perf_memsys_if.dcache_pipe_stalls = perf_dcache_if.pipe_stalls; - assign perf_memsys_if.dcache_crsp_stalls = perf_dcache_if.crsp_stalls; if (`SM_ENABLE) begin assign perf_memsys_if.smem_reads = perf_smem_if.reads; @@ -382,47 +380,41 @@ end else begin assign perf_memsys_if.smem_bank_stalls = 0; end - reg [`PERF_CTR_BITS-1:0] perf_mem_lat_per_cycle; + reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; always @(posedge clk) begin if (reset) begin - perf_mem_lat_per_cycle <= 0; + perf_mem_pending_reads <= 0; end else begin - perf_mem_lat_per_cycle <= perf_mem_lat_per_cycle + - `PERF_CTR_BITS'($signed(2'((mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready) && !(mem_rsp_if.valid && mem_rsp_if.ready)) - - 2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready)))); + perf_mem_pending_reads <= perf_mem_pending_reads + + `PERF_CTR_BITS'($signed(2'((mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) && !(mem_rsp_if.valid && mem_rsp_if.ready)) - + 2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw)))); end end reg [`PERF_CTR_BITS-1:0] perf_mem_reads; reg [`PERF_CTR_BITS-1:0] perf_mem_writes; reg [`PERF_CTR_BITS-1:0] perf_mem_lat; - reg [`PERF_CTR_BITS-1:0] perf_mem_stalls; always @(posedge clk) begin if (reset) begin perf_mem_reads <= 0; perf_mem_writes <= 0; perf_mem_lat <= 0; - perf_mem_stalls <= 0; end else begin if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'd1; end if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'd1; - end - if (mem_req_if.valid && !mem_req_if.ready) begin - perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'd1; - end - perf_mem_lat <= perf_mem_lat + perf_mem_lat_per_cycle; + end + perf_mem_lat <= perf_mem_lat 
+ perf_mem_pending_reads; end end assign perf_memsys_if.mem_reads = perf_mem_reads; assign perf_memsys_if.mem_writes = perf_mem_writes; - assign perf_memsys_if.mem_latency = perf_mem_lat; - assign perf_memsys_if.mem_stalls = perf_mem_stalls; + assign perf_memsys_if.mem_latency = perf_mem_lat; `endif endmodule diff --git a/hw/rtl/VX_muldiv.sv b/hw/rtl/VX_muldiv.sv index 5cd13f5c..ea992825 100644 --- a/hw/rtl/VX_muldiv.sv +++ b/hw/rtl/VX_muldiv.sv @@ -6,6 +6,7 @@ module VX_muldiv ( // Inputs input wire [`INST_MUL_BITS-1:0] alu_op, + input wire [`UUID_BITS-1:0] uuid_in, input wire [`NW_BITS-1:0] wid_in, input wire [`NUM_THREADS-1:0] tmask_in, input wire [31:0] PC_in, @@ -15,6 +16,7 @@ module VX_muldiv ( input wire [`NUM_THREADS-1:0][31:0] alu_in2, // Outputs + output wire [`UUID_BITS-1:0] uuid_out, output wire [`NW_BITS-1:0] wid_out, output wire [`NUM_THREADS-1:0] tmask_out, output wire [31:0] PC_out, @@ -32,6 +34,7 @@ module VX_muldiv ( wire is_div_op = `INST_MUL_IS_DIV(alu_op); wire [`NUM_THREADS-1:0][31:0] mul_result; + wire [`UUID_BITS-1:0] mul_uuid_out; wire [`NW_BITS-1:0] mul_wid_out; wire [`NUM_THREADS-1:0] mul_tmask_out; wire [31:0] mul_PC_out; @@ -63,15 +66,15 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .DEPTH (`LATENCY_IMUL), .RESETW (1) ) mul_shift_reg ( .clk(clk), .reset (reset), .enable (mul_ready_in), - .data_in ({mul_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}), - .data_out ({mul_valid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result}) + .data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}), + .data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result}) ); `else @@ -103,15 +106,15 @@ module VX_muldiv ( end VX_shift_register #( - 
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1), .DEPTH (`LATENCY_IMUL), .RESETW (1) ) mul_shift_reg ( .clk(clk), .reset (reset), .enable (mul_ready_in), - .data_in ({mul_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}), - .data_out ({mul_valid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out}) + .data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}), + .data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out}) ); `endif @@ -119,6 +122,7 @@ module VX_muldiv ( /////////////////////////////////////////////////////////////////////////// wire [`NUM_THREADS-1:0][31:0] div_result; + wire [`UUID_BITS-1:0] div_uuid_out; wire [`NW_BITS-1:0] div_wid_out; wire [`NUM_THREADS-1:0] div_tmask_out; wire [31:0] div_PC_out; @@ -147,15 +151,15 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .DEPTH (`LATENCY_IMUL), .RESETW (1) ) div_shift_reg ( .clk(clk), .reset (reset), .enable (div_ready_in), - .data_in ({div_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}), - .data_out ({div_valid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result}) + .data_in ({div_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}), + .data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result}) ); assign div_ready_in = div_ready_out || ~div_valid_out; @@ -171,21 +175,21 @@ module VX_muldiv ( .WIDTHQ (32), .WIDTHR (32), .LANES (`NUM_THREADS), - .TAGW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1) + .TAGW (64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1) ) divide ( .clk (clk), .reset 
(reset), .valid_in (div_valid_in), .ready_in (div_ready_in), .signed_mode(is_signed_div), - .tag_in ({wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}), + .tag_in ({uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}), .numer (alu_in1), .denom (alu_in2), .quotient (div_result_tmp), .remainder (rem_result_tmp), .ready_out (div_ready_out), .valid_out (div_valid_out), - .tag_out ({div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out}) + .tag_out ({div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out}) ); assign div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp; @@ -195,6 +199,7 @@ module VX_muldiv ( /////////////////////////////////////////////////////////////////////////// wire rsp_valid = mul_valid_out || div_valid_out; + wire [`UUID_BITS-1:0] rsp_uuid = mul_valid_out ? mul_uuid_out : div_uuid_out; wire [`NW_BITS-1:0] rsp_wid = mul_valid_out ? mul_wid_out : div_wid_out; wire [`NUM_THREADS-1:0] rsp_tmask = mul_valid_out ? mul_tmask_out : div_tmask_out; wire [31:0] rsp_PC = mul_valid_out ? mul_PC_out : div_PC_out; @@ -205,14 +210,14 @@ module VX_muldiv ( assign stall_out = ~ready_out && valid_out; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}), - .data_out ({valid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out}) + .data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}), + .data_out ({valid_out, uuid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out}) ); // can accept new request? 
diff --git a/hw/rtl/VX_pipeline.sv b/hw/rtl/VX_pipeline.sv index 8bbc7ead..1ab20c4a 100644 --- a/hw/rtl/VX_pipeline.sv +++ b/hw/rtl/VX_pipeline.sv @@ -165,6 +165,9 @@ module VX_pipeline #( ) decode ( .clk (clk), .reset (decode_reset), + `ifdef PERF_ENABLE + .perf_decode_if (perf_pipeline_if.decode), + `endif .ifetch_rsp_if (ifetch_rsp_if), .decode_if (decode_if), .wstall_if (wstall_if), @@ -180,7 +183,7 @@ module VX_pipeline #( .reset (issue_reset), `ifdef PERF_ENABLE - .perf_pipeline_if (perf_pipeline_if), + .perf_issue_if (perf_pipeline_if.issue), `endif .decode_if (decode_if), diff --git a/hw/rtl/VX_scoreboard.sv b/hw/rtl/VX_scoreboard.sv index 6ba4e998..9a3fed37 100644 --- a/hw/rtl/VX_scoreboard.sv +++ b/hw/rtl/VX_scoreboard.sv @@ -60,22 +60,22 @@ module VX_scoreboard #( end else begin `ifdef DBG_TRACE_PIPELINE if (ibuffer_if.valid && ~ibuffer_if.ready) begin - dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n", + dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)\n", $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb, - deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3); + deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, ibuffer_if.uuid); end `endif if (release_reg) begin `ASSERT(inuse_regs[writeback_if.wid][writeback_if.rd] != 0, - ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d", - $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd)); + ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d (#%0d)", + $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd,writeback_if.uuid)); end if (ibuffer_if.valid && ~ibuffer_if.ready) begin deadlock_ctr <= deadlock_ctr + 1; `ASSERT(deadlock_ctr < deadlock_timeout, - ("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", + ("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)", $time, 
CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb, - deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3)); + deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, ibuffer_if.uuid)); end else if (ibuffer_if.valid && ibuffer_if.ready) begin deadlock_ctr <= 0; end diff --git a/hw/rtl/VX_trace_instr.vh b/hw/rtl/VX_trace_instr.vh index e228179e..5e8e031e 100644 --- a/hw/rtl/VX_trace_instr.vh +++ b/hw/rtl/VX_trace_instr.vh @@ -35,9 +35,9 @@ task trace_ex_op ( `INST_BR_JALR: dpi_trace("JALR"); `INST_BR_ECALL: dpi_trace("ECALL"); `INST_BR_EBREAK:dpi_trace("EBREAK"); - `INST_BR_MRET: dpi_trace("MRET"); + `INST_BR_URET: dpi_trace("URET"); `INST_BR_SRET: dpi_trace("SRET"); - `INST_BR_DRET: dpi_trace("DRET"); + `INST_BR_MRET: dpi_trace("MRET"); default: dpi_trace("?"); endcase end else if (`INST_ALU_IS_MUL(op_mod)) begin diff --git a/hw/rtl/VX_warp_sched.sv b/hw/rtl/VX_warp_sched.sv index 979a3536..dda8600b 100644 --- a/hw/rtl/VX_warp_sched.sv +++ b/hw/rtl/VX_warp_sched.sv @@ -46,6 +46,8 @@ module VX_warp_sched #( wire schedule_valid; wire warp_scheduled; + reg [`UUID_BITS-1:0] issued_instrs; + wire ifetch_req_fire = ifetch_req_if.valid && ifetch_req_if.ready; wire tmc_active = (warp_ctl_if.tmc.tmask != 0); @@ -62,12 +64,13 @@ module VX_warp_sched #( always @(posedge clk) begin if (reset) begin - barrier_masks <= 0; - use_wspawn <= 0; - stalled_warps <= 0; + barrier_masks <= '0; + use_wspawn <= '0; + stalled_warps <= '0; warp_pcs <= '0; active_warps <= '0; thread_masks <= '0; + issued_instrs <= '0; // activate first warp warp_pcs[0] <= `STARTUP_ADDR; @@ -117,6 +120,8 @@ module VX_warp_sched #( if (use_wspawn[schedule_wid]) begin thread_masks[schedule_wid] <= 1; end + + issued_instrs <= issued_instrs + 1; end if (ifetch_req_fire) begin @@ -223,20 +228,23 @@ module VX_warp_sched #( assign warp_scheduled = schedule_valid && ~stall_out; + wire [`UUID_BITS-1:0] instr_uuid = (issued_instrs * `NUM_CORES * `NUM_CLUSTERS) + `UUID_BITS'(CORE_ID); + 
VX_pipe_register #( - .DATAW (1 + `NUM_THREADS + 32 + `NW_BITS), + .DATAW (1 + `UUID_BITS + `NUM_THREADS + 32 + `NW_BITS), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({schedule_valid, schedule_tmask, schedule_pc, schedule_wid}), - .data_out ({ifetch_req_if.valid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid}) + .data_in ({schedule_valid, instr_uuid, schedule_tmask, schedule_pc, schedule_wid}), + .data_out ({ifetch_req_if.valid, ifetch_req_if.uuid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid}) ); assign busy = (active_warps != 0); `SCOPE_ASSIGN (wsched_scheduled, warp_scheduled); + `SCOPE_ASSIGN (wsched_schedule_uuid, instr_uuid); `SCOPE_ASSIGN (wsched_active_warps, active_warps); `SCOPE_ASSIGN (wsched_stalled_warps, stalled_warps); `SCOPE_ASSIGN (wsched_schedule_wid, schedule_wid); diff --git a/hw/rtl/VX_writeback.sv b/hw/rtl/VX_writeback.sv index cdf7f988..f4471046 100644 --- a/hw/rtl/VX_writeback.sv +++ b/hw/rtl/VX_writeback.sv @@ -23,17 +23,9 @@ module VX_writeback #( localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1; `ifdef EXT_F_ENABLE -`ifdef EXT_TEX_ENABLE localparam NUM_RSPS = 5; `else localparam NUM_RSPS = 4; -`endif -`else -`ifdef EXT_TEX_ENABLE - localparam NUM_RSPS = 4; -`else - localparam NUM_RSPS = 3; -`endif `endif wire wb_valid; @@ -50,9 +42,7 @@ module VX_writeback #( wire stall; assign rsp_valid = { - `ifdef EXT_TEX_ENABLE gpu_commit_if.valid && gpu_commit_if.wb, - `endif csr_commit_if.valid && csr_commit_if.wb, alu_commit_if.valid && alu_commit_if.wb, `ifdef EXT_F_ENABLE @@ -62,9 +52,7 @@ module VX_writeback #( }; assign rsp_data = { - `ifdef EXT_TEX_ENABLE {gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop}, - `endif {csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop}, {alu_commit_if.wid, alu_commit_if.PC, 
alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop}, `ifdef EXT_F_ENABLE @@ -76,7 +64,8 @@ module VX_writeback #( VX_stream_arbiter #( .NUM_REQS (NUM_RSPS), .DATAW (DATAW), - .TYPE ("P") + .BUFFERED (1), + .TYPE ("R") ) rsp_arb ( .clk (clk), .reset (reset), @@ -88,28 +77,17 @@ module VX_writeback #( .ready_out (~stall) ); - assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb; + assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb; `ifdef EXT_F_ENABLE assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb; assign alu_commit_if.ready = rsp_ready[2] || ~alu_commit_if.wb; assign csr_commit_if.ready = rsp_ready[3] || ~csr_commit_if.wb; + assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb; `else assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb; assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb; -`ifdef EXT_TEX_ENABLE assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb; `endif -`endif - -`ifdef EXT_TEX_ENABLE -`ifdef EXT_F_ENABLE - assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb; -`else - assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb; -`endif -`else - assign gpu_commit_if.ready = 1; -`endif assign stall = ~writeback_if.ready && writeback_if.valid; diff --git a/hw/rtl/afu/VX_to_mem.sv b/hw/rtl/afu/VX_to_mem.sv index 472f8cb3..acc2899b 100644 --- a/hw/rtl/afu/VX_to_mem.sv +++ b/hw/rtl/afu/VX_to_mem.sv @@ -124,7 +124,8 @@ module VX_to_mem #( end end assign mem_rsp_tag_in_w = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_in; - `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in), ("out-of-order memory reponse! cur=%d, expected=%d", mem_rsp_tag_in_w, mem_rsp_tag_in)) + `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in), + ("%t: *** out-of-order memory reponse! 
cur=%d, expected=%d", $time, mem_rsp_tag_in_w, mem_rsp_tag_in)) wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr}; diff --git a/hw/rtl/cache/VX_bank.sv b/hw/rtl/cache/VX_bank.sv index 14d50e29..22e5887b 100644 --- a/hw/rtl/cache/VX_bank.sv +++ b/hw/rtl/cache/VX_bank.sv @@ -33,9 +33,6 @@ module VX_bank #( // core request tag size parameter CORE_TAG_WIDTH = 1, - // size of tag id in core request tag - parameter CORE_TAG_ID_BITS = 0, - // bank offset from beginning of index range parameter BANK_ADDR_OFFSET = 0, @@ -51,7 +48,6 @@ module VX_bank #( output wire perf_read_misses, output wire perf_write_misses, output wire perf_mshr_stalls, - output wire perf_pipe_stalls, `endif // Core Request @@ -96,14 +92,9 @@ module VX_bank #( input wire [`LINE_SELECT_BITS-1:0] flush_addr ); - `UNUSED_PARAM (CORE_TAG_ID_BITS) - -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - wire [31:0] debug_pc_sel, debug_pc_st0, debug_pc_st1; - wire [`NW_BITS-1:0] debug_wid_sel, debug_wid_st0, debug_wid_st1; + wire [`DBG_CACHE_REQ_IDW-1:0] req_id_sel, req_id_st0, req_id_st1; `IGNORE_UNUSED_END -`endif wire [NUM_PORTS-1:0] creq_pmask; wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel; @@ -197,13 +188,7 @@ module VX_bank #( wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; wire creq_fire = creq_valid && creq_ready; -`ifdef DBG_CACHE_REQ_INFO - if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[0][`CACHE_REQ_INFO_RNG] : creq_tag[0][`CACHE_REQ_INFO_RNG]; - end else begin - assign {debug_wid_sel, debug_pc_sel} = 0; - end -`endif + assign req_id_sel = mshr_enable ? mshr_tag[0][`CACHE_REQ_ID_RNG] : creq_tag[0][`CACHE_REQ_ID_RNG]; wire [`CACHE_LINE_WIDTH-1:0] wdata_sel; assign wdata_sel[(NUM_PORTS * `WORD_WIDTH)-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? 
mem_rsp_data[(NUM_PORTS * `WORD_WIDTH)-1:0] : creq_data; @@ -237,13 +222,7 @@ module VX_bank #( .data_out ({valid_st0, is_flush_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0}) ); -`ifdef DBG_CACHE_REQ_INFO - if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st0, debug_pc_st0} = tag_st0[0][`CACHE_REQ_INFO_RNG]; - end else begin - assign {debug_wid_st0, debug_pc_st0} = 0; - end -`endif + assign req_id_st0 = tag_st0[0][`CACHE_REQ_ID_RNG]; wire do_fill_st0 = valid_st0 && is_fill_st0; wire do_flush_st0 = valid_st0 && is_flush_st0; @@ -263,11 +242,9 @@ module VX_bank #( .clk (clk), .reset (reset), - `ifdef DBG_CACHE_REQ_INFO - .debug_pc (debug_pc_st0), - .debug_wid (debug_wid_st0), - `endif - .stall (crsq_stall), + .req_id (req_id_st0), + + .stall (crsq_stall), // read/Fill .lookup (do_lookup_st0), @@ -293,13 +270,7 @@ module VX_bank #( .data_out ({valid_st1, is_mshr_st1, is_fill_st1, is_read_st1, is_write_st1, miss_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1}) ); -`ifdef DBG_CACHE_REQ_INFO - if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st1, debug_pc_st1} = tag_st1[0][`CACHE_REQ_INFO_RNG]; - end else begin - assign {debug_wid_st1, debug_pc_st1} = 0; - end -`endif + assign req_id_st1 = tag_st1[0][`CACHE_REQ_ID_RNG]; wire do_read_st0 = valid_st0 && is_read_st0; wire do_read_st1 = valid_st1 && is_read_st1; @@ -323,10 +294,8 @@ module VX_bank #( .clk (clk), .reset (reset), - `ifdef DBG_CACHE_REQ_INFO - .debug_pc (debug_pc_st1), - .debug_wid (debug_wid_st1), - `endif + .req_id (req_id_st1), + .stall (crsq_stall), .read (do_read_st1 || do_mshr_st1), @@ -372,14 +341,9 @@ module VX_bank #( .clk (clk), .reset (reset), - `ifdef DBG_CACHE_REQ_INFO - .deq_debug_pc (debug_pc_sel), - .deq_debug_wid (debug_wid_sel), - .lkp_debug_pc 
(debug_pc_st0), - .lkp_debug_wid (debug_wid_st0), - .rel_debug_pc (debug_pc_st1), - .rel_debug_wid (debug_wid_st1), - `endif + .deq_req_id (req_id_sel), + .lkp_req_id (req_id_st0), + .rel_req_id (req_id_st1), // allocate .allocate_valid (mshr_allocate), @@ -505,7 +469,6 @@ module VX_bank #( `ifdef PERF_ENABLE assign perf_read_misses = do_read_st1 && miss_st1; assign perf_write_misses = do_write_st1 && miss_st1; - assign perf_pipe_stalls = crsq_stall || mreq_alm_full || mshr_alm_full; assign perf_mshr_stalls = mshr_alm_full; `endif @@ -525,22 +488,22 @@ module VX_bank #( dpi_trace("%d: cache%0d:%0d fill-rsp: addr=%0h, id=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data); end if (mshr_fire) begin - dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, debug_wid_sel, debug_pc_sel); + dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, req_id_sel); end if (creq_fire) begin if (creq_rw) - dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel); + dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, req_id_sel); else - dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, 
debug_pc_sel); + dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, req_id_sel); end if (crsq_fire) begin - dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1); + dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, req_id_st1); end if (mreq_push) begin if (is_write_st1) - dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1); + dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, req_id_st1); else - dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, debug_wid_st1, debug_pc_st1); + dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, req_id_st1); end end `endif diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 29e14892..1b7d7abf 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -102,7 +102,6 @@ module VX_cache #( wire [NUM_BANKS-1:0] perf_read_miss_per_bank; wire [NUM_BANKS-1:0] perf_write_miss_per_bank; wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank; - wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank; `endif /////////////////////////////////////////////////////////////////////////// @@ -219,37 +218,37 @@ 
module VX_cache #( /////////////////////////////////////////////////////////////////////////// // Core request - wire [NUM_REQS-1:0] core_req_valid_nc; - wire [NUM_REQS-1:0] core_req_rw_nc; - wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_nc; - wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_nc; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_nc; - wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_nc; - wire [NUM_REQS-1:0] core_req_ready_nc; + wire [NUM_REQS-1:0] core_req_valid_c; + wire [NUM_REQS-1:0] core_req_rw_c; + wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_c; + wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_c; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_c; + wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_c; + wire [NUM_REQS-1:0] core_req_ready_c; // Core response - wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_nc; - wire [NUM_REQS-1:0] core_rsp_tmask_nc; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_nc; - wire [`CORE_RSP_TAGS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_nc; - wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_nc; + wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_c; + wire [NUM_REQS-1:0] core_rsp_tmask_c; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_c; + wire [`CORE_RSP_TAGS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_c; + wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_c; // Memory request - wire mem_req_valid_nc; - wire mem_req_rw_nc; - wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_nc; - wire [NUM_PORTS-1:0] mem_req_pmask_nc; - wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_nc; - wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_nc; - wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_nc; - wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_nc; - wire mem_req_ready_nc; + wire mem_req_valid_c; + wire mem_req_rw_c; + wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_c; + wire [NUM_PORTS-1:0] mem_req_pmask_c; + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_c; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_c; + 
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_c; + wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_c; + wire mem_req_ready_c; // Memory response - wire mem_rsp_valid_nc; - wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_nc; - wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_nc; - wire mem_rsp_ready_nc; + wire mem_rsp_valid_c; + wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_c; + wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_c; + wire mem_rsp_ready_c; if (NC_ENABLE) begin VX_nc_bypass #( @@ -280,20 +279,20 @@ module VX_cache #( .core_req_ready_in (core_req_ready), // Core request out - .core_req_valid_out (core_req_valid_nc), - .core_req_rw_out (core_req_rw_nc), - .core_req_byteen_out(core_req_byteen_nc), - .core_req_addr_out (core_req_addr_nc), - .core_req_data_out (core_req_data_nc), - .core_req_tag_out (core_req_tag_nc), - .core_req_ready_out (core_req_ready_nc), + .core_req_valid_out (core_req_valid_c), + .core_req_rw_out (core_req_rw_c), + .core_req_byteen_out(core_req_byteen_c), + .core_req_addr_out (core_req_addr_c), + .core_req_data_out (core_req_data_c), + .core_req_tag_out (core_req_tag_c), + .core_req_ready_out (core_req_ready_c), // Core response in - .core_rsp_valid_in (core_rsp_valid_nc), - .core_rsp_tmask_in (core_rsp_tmask_nc), - .core_rsp_data_in (core_rsp_data_nc), - .core_rsp_tag_in (core_rsp_tag_nc), - .core_rsp_ready_in (core_rsp_ready_nc), + .core_rsp_valid_in (core_rsp_valid_c), + .core_rsp_tmask_in (core_rsp_tmask_c), + .core_rsp_data_in (core_rsp_data_c), + .core_rsp_tag_in (core_rsp_tag_c), + .core_rsp_ready_in (core_rsp_ready_c), // Core response out .core_rsp_valid_out (core_rsp_valid_sb), @@ -303,15 +302,15 @@ module VX_cache #( .core_rsp_ready_out (core_rsp_ready_sb), // Memory request in - .mem_req_valid_in (mem_req_valid_nc), - .mem_req_rw_in (mem_req_rw_nc), - .mem_req_addr_in (mem_req_addr_nc), - .mem_req_pmask_in (mem_req_pmask_nc), - .mem_req_byteen_in (mem_req_byteen_nc), - .mem_req_wsel_in (mem_req_wsel_nc), - .mem_req_data_in (mem_req_data_nc), - 
.mem_req_tag_in (mem_req_tag_nc), - .mem_req_ready_in (mem_req_ready_nc), + .mem_req_valid_in (mem_req_valid_c), + .mem_req_rw_in (mem_req_rw_c), + .mem_req_addr_in (mem_req_addr_c), + .mem_req_pmask_in (mem_req_pmask_c), + .mem_req_byteen_in (mem_req_byteen_c), + .mem_req_wsel_in (mem_req_wsel_c), + .mem_req_data_in (mem_req_data_c), + .mem_req_tag_in (mem_req_tag_c), + .mem_req_ready_in (mem_req_ready_c), // Memory request out .mem_req_valid_out (mem_req_valid_sb), @@ -331,40 +330,40 @@ module VX_cache #( .mem_rsp_ready_in (mem_rsp_ready), // Memory response out - .mem_rsp_valid_out (mem_rsp_valid_nc), - .mem_rsp_data_out (mem_rsp_data_nc), - .mem_rsp_tag_out (mem_rsp_tag_nc), - .mem_rsp_ready_out (mem_rsp_ready_nc) + .mem_rsp_valid_out (mem_rsp_valid_c), + .mem_rsp_data_out (mem_rsp_data_c), + .mem_rsp_tag_out (mem_rsp_tag_c), + .mem_rsp_ready_out (mem_rsp_ready_c) ); end else begin - assign core_req_valid_nc = core_req_valid; - assign core_req_rw_nc = core_req_rw; - assign core_req_addr_nc = core_req_addr; - assign core_req_byteen_nc = core_req_byteen; - assign core_req_data_nc = core_req_data; - assign core_req_tag_nc = core_req_tag; - assign core_req_ready = core_req_ready_nc; + assign core_req_valid_c = core_req_valid; + assign core_req_rw_c = core_req_rw; + assign core_req_addr_c = core_req_addr; + assign core_req_byteen_c = core_req_byteen; + assign core_req_data_c = core_req_data; + assign core_req_tag_c = core_req_tag; + assign core_req_ready = core_req_ready_c; - assign core_rsp_valid_sb = core_rsp_valid_nc; - assign core_rsp_tmask_sb = core_rsp_tmask_nc; - assign core_rsp_data_sb = core_rsp_data_nc; - assign core_rsp_tag_sb = core_rsp_tag_nc; - assign core_rsp_ready_nc = core_rsp_ready_sb; + assign core_rsp_valid_sb = core_rsp_valid_c; + assign core_rsp_tmask_sb = core_rsp_tmask_c; + assign core_rsp_data_sb = core_rsp_data_c; + assign core_rsp_tag_sb = core_rsp_tag_c; + assign core_rsp_ready_c = core_rsp_ready_sb; - assign mem_req_valid_sb = 
mem_req_valid_nc; - assign mem_req_addr_sb = mem_req_addr_nc; - assign mem_req_rw_p = mem_req_rw_nc; - assign mem_req_pmask_p = mem_req_pmask_nc; - assign mem_req_byteen_p = mem_req_byteen_nc; - assign mem_req_wsel_p = mem_req_wsel_nc; - assign mem_req_data_p = mem_req_data_nc; - assign mem_req_tag_sb = mem_req_tag_nc; - assign mem_req_ready_nc = mem_req_ready_sb; + assign mem_req_valid_sb = mem_req_valid_c; + assign mem_req_addr_sb = mem_req_addr_c; + assign mem_req_rw_p = mem_req_rw_c; + assign mem_req_pmask_p = mem_req_pmask_c; + assign mem_req_byteen_p = mem_req_byteen_c; + assign mem_req_wsel_p = mem_req_wsel_c; + assign mem_req_data_p = mem_req_data_c; + assign mem_req_tag_sb = mem_req_tag_c; + assign mem_req_ready_c = mem_req_ready_sb; - assign mem_rsp_valid_nc = mem_rsp_valid; - assign mem_rsp_data_nc = mem_rsp_data; - assign mem_rsp_tag_nc = mem_rsp_tag; - assign mem_rsp_ready = mem_rsp_ready_nc; + assign mem_rsp_valid_c = mem_rsp_valid; + assign mem_rsp_data_c = mem_rsp_data; + assign mem_rsp_tag_c = mem_rsp_tag; + assign mem_rsp_ready = mem_rsp_ready_c; end /////////////////////////////////////////////////////////////////////////// @@ -383,15 +382,15 @@ module VX_cache #( ) mem_rsp_queue ( .clk (clk), .reset (mrsq_reset), - .ready_in (mem_rsp_ready_nc), - .valid_in (mem_rsp_valid_nc), - .data_in ({mem_rsp_tag_nc, mem_rsp_data_nc}), + .ready_in (mem_rsp_ready_c), + .valid_in (mem_rsp_valid_c), + .data_in ({mem_rsp_tag_c, mem_rsp_data_c}), .data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}), .ready_out (mrsq_out_ready), .valid_out (mrsq_out_valid) ); - `UNUSED_VAR (mem_rsp_tag_nc) + `UNUSED_VAR (mem_rsp_tag_c) /////////////////////////////////////////////////////////////////////////// @@ -464,13 +463,13 @@ module VX_cache #( `ifdef PERF_ENABLE .bank_stalls(perf_cache_if.bank_stalls), `endif - .core_req_valid (core_req_valid_nc), - .core_req_rw (core_req_rw_nc), - .core_req_addr (core_req_addr_nc), - .core_req_byteen (core_req_byteen_nc), - .core_req_data 
(core_req_data_nc), - .core_req_tag (core_req_tag_nc), - .core_req_ready (core_req_ready_nc), + .core_req_valid (core_req_valid_c), + .core_req_rw (core_req_rw_c), + .core_req_addr (core_req_addr_c), + .core_req_byteen (core_req_byteen_c), + .core_req_data (core_req_data_c), + .core_req_tag (core_req_tag_c), + .core_req_ready (core_req_ready_c), .per_bank_core_req_valid (per_bank_core_req_valid), .per_bank_core_req_pmask (per_bank_core_req_pmask), .per_bank_core_req_rw (per_bank_core_req_rw), @@ -580,8 +579,7 @@ module VX_cache #( .MSHR_SIZE (MSHR_SIZE), .MREQ_SIZE (MREQ_SIZE), .WRITE_ENABLE (WRITE_ENABLE), - .CORE_TAG_WIDTH (CORE_TAG_X_WIDTH), - .CORE_TAG_ID_BITS (CORE_TAG_ID_X_BITS), + .CORE_TAG_WIDTH (CORE_TAG_X_WIDTH), .BANK_ADDR_OFFSET (BANK_ADDR_OFFSET) ) bank ( `SCOPE_BIND_VX_cache_bank(i) @@ -593,7 +591,6 @@ module VX_cache #( .perf_read_misses (perf_read_miss_per_bank[i]), .perf_write_misses (perf_write_miss_per_bank[i]), .perf_mshr_stalls (perf_mshr_stall_per_bank[i]), - .perf_pipe_stalls (perf_pipe_stall_per_bank[i]), `endif // Core request @@ -656,11 +653,11 @@ module VX_cache #( .per_bank_core_rsp_tag (per_bank_core_rsp_tag), .per_bank_core_rsp_tid (per_bank_core_rsp_tid), .per_bank_core_rsp_ready (per_bank_core_rsp_ready), - .core_rsp_valid (core_rsp_valid_nc), - .core_rsp_tmask (core_rsp_tmask_nc), - .core_rsp_tag (core_rsp_tag_nc), - .core_rsp_data (core_rsp_data_nc), - .core_rsp_ready (core_rsp_ready_nc) + .core_rsp_valid (core_rsp_valid_c), + .core_rsp_tmask (core_rsp_tmask_c), + .core_rsp_tag (core_rsp_tag_c), + .core_rsp_data (core_rsp_data_c), + .core_rsp_ready (core_rsp_ready_c) ); wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH))-1:0] data_in; @@ -682,15 +679,15 @@ module VX_cache #( .valid_in (per_bank_mem_req_valid), .data_in (data_in), .ready_in (per_bank_mem_req_ready), - .valid_out (mem_req_valid_nc), - .data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, 
mem_req_pmask_nc, mem_req_byteen_nc, mem_req_wsel_nc, mem_req_data_nc}), - .ready_out (mem_req_ready_nc) + .valid_out (mem_req_valid_c), + .data_out ({mem_req_addr_c, mem_req_id, mem_req_rw_c, mem_req_pmask_c, mem_req_byteen_c, mem_req_wsel_c, mem_req_data_c}), + .ready_out (mem_req_ready_c) ); if (NUM_BANKS == 1) begin - assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'(mem_req_id); + assign mem_req_tag_c = MEM_TAG_IN_WIDTH'(mem_req_id); end else begin - assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_nc), mem_req_id}); + assign mem_req_tag_c = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_c), mem_req_id}); end `ifdef PERF_ENABLE @@ -698,12 +695,21 @@ module VX_cache #( wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; - - wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw; - wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw; - - `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask); + + wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid_c & core_req_ready_c & ~core_req_rw; + wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid_c & core_req_ready_c & core_req_rw; + + // per cycle: read misses, write misses, msrq stalls, pipeline stalls + wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; + wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; + wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; + wire [$clog2(NUM_BANKS+1)-1:0] perf_crsp_stall_per_cycle; + + `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask); + `POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank); + `POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank); + `POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank); if 
(CORE_TAG_ID_BITS != 0) begin wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}}; @@ -713,23 +719,14 @@ module VX_cache #( `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask); end - // per cycle: read misses, write misses, msrq stalls, pipeline stalls - wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; - wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; - wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; - wire [$clog2(NUM_BANKS+1)-1:0] perf_pipe_stall_per_cycle; - - `POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank); - `POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank); - `POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank); - `POP_COUNT(perf_pipe_stall_per_cycle, perf_pipe_stall_per_bank); + wire perf_mem_stall_per_cycle = mem_req_valid & ~mem_req_ready; reg [`PERF_CTR_BITS-1:0] perf_core_reads; reg [`PERF_CTR_BITS-1:0] perf_core_writes; reg [`PERF_CTR_BITS-1:0] perf_read_misses; reg [`PERF_CTR_BITS-1:0] perf_write_misses; reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls; - reg [`PERF_CTR_BITS-1:0] perf_pipe_stalls; + reg [`PERF_CTR_BITS-1:0] perf_mem_stalls; reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; always @(posedge clk) begin @@ -739,16 +736,16 @@ module VX_cache #( perf_read_misses <= 0; perf_write_misses <= 0; perf_mshr_stalls <= 0; - perf_pipe_stalls <= 0; + perf_mem_stalls <= 0; perf_crsp_stalls <= 0; end else begin - perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); - perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); - perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle); - perf_write_misses <= perf_write_misses+ `PERF_CTR_BITS'(perf_write_miss_per_cycle); - perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle); - perf_pipe_stalls <= perf_pipe_stalls + `PERF_CTR_BITS'(perf_pipe_stall_per_cycle); - perf_crsp_stalls <= 
perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); + perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); + perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle); + perf_write_misses <= perf_write_misses + `PERF_CTR_BITS'(perf_write_miss_per_cycle); + perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle); + perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle); + perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); end end @@ -757,7 +754,7 @@ module VX_cache #( assign perf_cache_if.read_misses = perf_read_misses; assign perf_cache_if.write_misses = perf_write_misses; assign perf_cache_if.mshr_stalls = perf_mshr_stalls; - assign perf_cache_if.pipe_stalls = perf_pipe_stalls; + assign perf_cache_if.mem_stalls = perf_mem_stalls; assign perf_cache_if.crsp_stalls = perf_crsp_stalls; `endif diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index c0709cce..647ea0be 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -3,9 +3,8 @@ `include "VX_platform.vh" -`ifdef DBG_CACHE_REQ_INFO -`include "VX_define.vh" -`endif +// cache request identifier +`define DBG_CACHE_REQ_IDW 44 `define REQS_BITS `LOG2UP(NUM_REQS) @@ -24,7 +23,7 @@ `define WORD_ADDR_WIDTH (32-`CLOG2(WORD_SIZE)) `define MEM_ADDR_WIDTH (32-`CLOG2(CACHE_LINE_SIZE)) -`define LINE_ADDR_WIDTH (`MEM_ADDR_WIDTH-`BANK_SELECT_BITS) +`define LINE_ADDR_WIDTH (`MEM_ADDR_WIDTH-`CLOG2(NUM_BANKS)) // Word select `define WORD_SELECT_BITS `CLOG2(`WORDS_PER_LINE) @@ -46,14 +45,13 @@ `define TAG_SELECT_ADDR_START (1+`LINE_SELECT_ADDR_END) `define TAG_SELECT_ADDR_END (`WORD_ADDR_WIDTH-1) -`define BANK_SELECT_ADDR(x) x[`BANK_SELECT_ADDR_END : `BANK_SELECT_ADDR_START] - -`define LINE_SELECT_ADDR0(x) x[`WORD_ADDR_WIDTH-1 : 
`LINE_SELECT_ADDR_START] -`define LINE_SELECT_ADDRX(x) {x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START], x[`BANK_SELECT_ADDR_START-1 : 1+`WORD_SELECT_ADDR_END]} +`define SELECT_BANK_ID(x) x[`BANK_SELECT_ADDR_END : `BANK_SELECT_ADDR_START] +`define SELECT_LINE_ADDR0(x) x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START] +`define SELECT_LINE_ADDRX(x) {x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START], x[`BANK_SELECT_ADDR_START-1 : 1+`WORD_SELECT_ADDR_END]} `define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS] -`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_MDATAW) +`define CACHE_REQ_ID_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_IDW) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/cache/VX_core_req_bank_sel.sv b/hw/rtl/cache/VX_core_req_bank_sel.sv index 01c9f12b..1197edfb 100644 --- a/hw/rtl/cache/VX_core_req_bank_sel.sv +++ b/hw/rtl/cache/VX_core_req_bank_sel.sv @@ -57,16 +57,16 @@ module VX_core_req_bank_sel #( for (genvar i = 0; i < NUM_REQS; i++) begin if (BANK_ADDR_OFFSET == 0) begin - assign core_req_line_addr[i] = `LINE_SELECT_ADDR0(core_req_addr[i]); + assign core_req_line_addr[i] = `SELECT_LINE_ADDR0(core_req_addr[i]); end else begin - assign core_req_line_addr[i] = `LINE_SELECT_ADDRX(core_req_addr[i]); + assign core_req_line_addr[i] = `SELECT_LINE_ADDRX(core_req_addr[i]); end assign core_req_wsel[i] = core_req_addr[i][`UP(`WORD_SELECT_BITS)-1:0]; end for (genvar i = 0; i < NUM_REQS; ++i) begin if (NUM_BANKS > 1) begin - assign core_req_bid[i] = `BANK_SELECT_ADDR(core_req_addr[i]); + assign core_req_bid[i] = `SELECT_BANK_ID(core_req_addr[i]); end else begin assign core_req_bid[i] = 0; end @@ -88,6 +88,7 @@ module VX_core_req_bank_sel #( if (NUM_PORTS > 1) begin reg [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_line_addr_r; + reg [NUM_BANKS-1:0] per_bank_rw_r; wire [NUM_REQS-1:0] core_req_line_match; always @(*) begin @@ -95,12 +96,14 @@ module 
VX_core_req_bank_sel #( for (integer i = NUM_REQS-1; i >= 0; --i) begin if (core_req_valid[i]) begin per_bank_line_addr_r[core_req_bid[i]] = core_req_line_addr[i]; + per_bank_rw_r[core_req_bid[i]] = core_req_rw[i]; end end end for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_req_line_match[i] = (core_req_line_addr[i] == per_bank_line_addr_r[core_req_bid[i]]); + assign core_req_line_match[i] = (core_req_line_addr[i] == per_bank_line_addr_r[core_req_bid[i]]) + && (core_req_rw[i] == per_bank_rw_r[core_req_bid[i]]); end if (NUM_PORTS < NUM_REQS) begin diff --git a/hw/rtl/cache/VX_data_access.sv b/hw/rtl/cache/VX_data_access.sv index a1a5247b..f5809644 100644 --- a/hw/rtl/cache/VX_data_access.sv +++ b/hw/rtl/cache/VX_data_access.sv @@ -21,12 +21,9 @@ module VX_data_access #( input wire clk, input wire reset, -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - input wire[31:0] debug_pc, - input wire[`NW_BITS-1:0] debug_wid, + input wire[`DBG_CACHE_REQ_IDW-1:0] req_id, `IGNORE_UNUSED_END -`endif input wire stall, @@ -125,10 +122,10 @@ module VX_data_access #( dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data); end if (read && ~stall) begin - dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, read_data); + dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, blk_addr=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, read_data, req_id); end if (write && ~stall) begin - dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, byteen, line_addr, write_data); + dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, byteen=%b, blk_addr=%0d, data=%0h (#%0d)\n", $time, 
CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), byteen, line_addr, write_data, req_id); end end `endif diff --git a/hw/rtl/cache/VX_miss_resrv.sv b/hw/rtl/cache/VX_miss_resrv.sv index bda63bb1..b9081fdd 100644 --- a/hw/rtl/cache/VX_miss_resrv.sv +++ b/hw/rtl/cache/VX_miss_resrv.sv @@ -25,16 +25,11 @@ module VX_miss_resrv #( input wire clk, input wire reset, -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - input wire[31:0] deq_debug_pc, - input wire[`NW_BITS-1:0] deq_debug_wid, - input wire[31:0] lkp_debug_pc, - input wire[`NW_BITS-1:0] lkp_debug_wid, - input wire[31:0] rel_debug_pc, - input wire[`NW_BITS-1:0] rel_debug_wid, + input wire[`DBG_CACHE_REQ_IDW-1:0] deq_req_id, + input wire[`DBG_CACHE_REQ_IDW-1:0] lkp_req_id, + input wire[`DBG_CACHE_REQ_IDW-1:0] rel_req_id, `IGNORE_UNUSED_END -`endif // allocate input wire allocate_valid, @@ -206,23 +201,22 @@ module VX_miss_resrv #( always @(posedge clk) begin if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin if (allocate_fire) - dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_debug_wid, deq_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_req_id); if (fill_valid) dpi_trace("%d: cache%0d:%0d mshr-fill: addr=%0h, id=%0d, addr=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id, `LINE_TO_BYTE_ADDR(fill_addr, BANK_ID)); if (dequeue_fire) - dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_debug_wid, deq_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), 
dequeue_id_r, deq_req_id); if (lookup_replay) - dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id); + dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lkp_req_id); if (lookup_valid) - dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_debug_wid, lkp_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b (#%0d)\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_req_id); if (release_valid) - dpi_trace("%d: cache%0d:%0d mshr-release id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - release_id, rel_debug_wid, rel_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-release id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, release_id, rel_req_id); dpi_trace("%d: cache%0d:%0d mshr-table", $time, CACHE_ID, BANK_ID); for (integer i = 0; i < MSHR_SIZE; ++i) begin if (valid_table[i]) begin diff --git a/hw/rtl/cache/VX_shared_mem.sv b/hw/rtl/cache/VX_shared_mem.sv index 46ea0cfc..7d6eb275 100644 --- a/hw/rtl/cache/VX_shared_mem.sv +++ b/hw/rtl/cache/VX_shared_mem.sv @@ -254,22 +254,19 @@ module VX_shared_mem #( .ready_out (core_rsp_ready) ); -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - wire [NUM_BANKS-1:0][31:0] debug_pc_st0, debug_pc_st1; - wire [NUM_BANKS-1:0][`NW_BITS-1:0] debug_wid_st0, debug_wid_st1; + wire [NUM_BANKS-1:0][`DBG_CACHE_REQ_IDW-1:0] req_id_st0, req_id_st1; `IGNORE_UNUSED_END for (genvar i = 0; i < NUM_BANKS; ++i) begin if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st0[i], debug_pc_st0[i]} = per_bank_core_req_tag_unqual[i][`CACHE_REQ_INFO_RNG]; - assign {debug_wid_st1[i], debug_pc_st1[i]} = 
per_bank_core_req_tag[i][`CACHE_REQ_INFO_RNG]; + assign req_id_st0[i] = per_bank_core_req_tag_unqual[i][`CACHE_REQ_ID_RNG]; + assign req_id_st1[i] = per_bank_core_req_tag[i][`CACHE_REQ_ID_RNG]; end else begin - assign {debug_wid_st0[i], debug_pc_st0[i]} = 0; - assign {debug_wid_st1[i], debug_pc_st1[i]} = 0; + assign req_id_st0[i] = 0; + assign req_id_st1[i] = 0; end end -`endif `ifdef DBG_TRACE_CACHE_BANK @@ -309,11 +306,11 @@ module VX_shared_mem #( for (integer i = 0; i < NUM_BANKS; ++i) begin if (per_bank_core_req_valid_unqual[i]) begin if (per_bank_core_req_rw_unqual[i]) begin - dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], debug_wid_st0[i], debug_pc_st0[i]); + dpi_trace("%d: smem%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h (#%0d)\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], req_id_st0[i]); end else begin - dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], debug_wid_st0[i], debug_pc_st0[i]); + dpi_trace("%d: smem%0d:%0d core-rd-req: addr=%0h, tag=%0h (#%0d)\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], req_id_st0[i]); end end end @@ -322,11 +319,11 @@ module VX_shared_mem #( for (integer i = 0; i < NUM_BANKS; ++i) begin if (per_bank_core_req_valid[i]) begin if (per_bank_core_req_rw[i]) begin - dpi_trace("%d: cache%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, 
`LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_req_data[i], debug_wid_st1[i], debug_pc_st1[i]); + dpi_trace("%d: smem%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, data=%0h (#%0d)\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_data[i], req_id_st1[i]); end else begin - dpi_trace("%d: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_rsp_data[i], debug_wid_st1[i], debug_pc_st1[i]); + dpi_trace("%d: smem%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, data=%0h (#%0d)\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_rsp_data[i], req_id_st1[i]); end end end @@ -338,21 +335,13 @@ module VX_shared_mem #( // per cycle: core_reads, core_writes wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; - wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw; wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw; `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask); - - if (CORE_TAG_ID_BITS != 0) begin - wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}}; - `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask); - end else begin - wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_valid & ~core_rsp_ready; - `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask); - end + wire perf_crsp_stall_per_cycle = core_rsp_valid & ~core_rsp_ready; reg [`PERF_CTR_BITS-1:0] 
perf_core_reads; reg [`PERF_CTR_BITS-1:0] perf_core_writes; @@ -360,13 +349,13 @@ module VX_shared_mem #( always @(posedge clk) begin if (reset) begin - perf_core_reads <= 0; - perf_core_writes <= 0; - perf_crsp_stalls <= 0; + perf_core_reads <= 0; + perf_core_writes <= 0; + perf_crsp_stalls <= 0; end else begin - perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); - perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); - perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); + perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); + perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); end end @@ -374,7 +363,8 @@ module VX_shared_mem #( assign perf_cache_if.writes = perf_core_writes; assign perf_cache_if.read_misses = '0; assign perf_cache_if.write_misses = '0; - assign perf_cache_if.pipe_stalls = '0; + assign perf_cache_if.mshr_stalls = '0; + assign perf_cache_if.mem_stalls = '0; assign perf_cache_if.crsp_stalls = perf_crsp_stalls; `endif diff --git a/hw/rtl/cache/VX_tag_access.sv b/hw/rtl/cache/VX_tag_access.sv index 55124a65..d8d2a4db 100644 --- a/hw/rtl/cache/VX_tag_access.sv +++ b/hw/rtl/cache/VX_tag_access.sv @@ -17,12 +17,9 @@ module VX_tag_access #( input wire clk, input wire reset, -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - input wire[31:0] debug_pc, - input wire[`NW_BITS-1:0] debug_wid, + input wire[`DBG_CACHE_REQ_IDW-1:0] req_id, `IGNORE_UNUSED_END -`endif input wire stall, @@ -71,9 +68,9 @@ module VX_tag_access #( end if (lookup && ~stall) begin if (tag_match) begin - dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag); + dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, 
blk_addr=%0d, tag_id=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag, req_id); end else begin - dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag, read_tag); + dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag, read_tag, req_id); end end end diff --git a/hw/rtl/fp_cores/fpnew b/hw/rtl/fp_cores/fpnew deleted file mode 160000 index 1def7bb6..00000000 --- a/hw/rtl/fp_cores/fpnew +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1def7bb630ceae2ebc58921f6b5ee3e686fb6d5a diff --git a/hw/rtl/interfaces/VX_alu_req_if.sv b/hw/rtl/interfaces/VX_alu_req_if.sv index 2c6ffd5e..f6818e7d 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.sv +++ b/hw/rtl/interfaces/VX_alu_req_if.sv @@ -5,7 +5,8 @@ interface VX_alu_req_if (); - wire valid; + wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -24,6 +25,7 @@ interface VX_alu_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -43,6 +45,7 @@ interface VX_alu_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_cmt_to_csr_if.sv b/hw/rtl/interfaces/VX_cmt_to_csr_if.sv index 800d428d..ed5ffc24 100644 --- a/hw/rtl/interfaces/VX_cmt_to_csr_if.sv +++ b/hw/rtl/interfaces/VX_cmt_to_csr_if.sv @@ -5,9 +5,12 @@ interface VX_cmt_to_csr_if (); - wire valid; - wire [$clog2(`NUM_THREADS+1)-1:0] commit_size; - + wire valid; +`ifdef EXT_F_ENABLE + wire [$clog2(6*`NUM_THREADS+1)-1:0] commit_size; +`else + wire [$clog2(5*`NUM_THREADS+1)-1:0] commit_size; +`endif modport master ( output valid, output commit_size diff --git 
a/hw/rtl/interfaces/VX_commit_if.sv b/hw/rtl/interfaces/VX_commit_if.sv index 4b6844d6..ddbd9600 100644 --- a/hw/rtl/interfaces/VX_commit_if.sv +++ b/hw/rtl/interfaces/VX_commit_if.sv @@ -6,6 +6,7 @@ interface VX_commit_if (); wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -17,6 +18,7 @@ interface VX_commit_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -29,6 +31,7 @@ interface VX_commit_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_csr_req_if.sv b/hw/rtl/interfaces/VX_csr_req_if.sv index 23345d53..c8eef24a 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.sv +++ b/hw/rtl/interfaces/VX_csr_req_if.sv @@ -6,6 +6,7 @@ interface VX_csr_req_if (); wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,6 +21,7 @@ interface VX_csr_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -35,6 +37,7 @@ interface VX_csr_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_decode_if.sv b/hw/rtl/interfaces/VX_decode_if.sv index 90c5d70e..5c00fb0f 100644 --- a/hw/rtl/interfaces/VX_decode_if.sv +++ b/hw/rtl/interfaces/VX_decode_if.sv @@ -6,6 +6,7 @@ interface VX_decode_if (); wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -23,7 +24,8 @@ interface VX_decode_if (); wire ready; modport master ( - output valid, + output valid, + output uuid, output wid, output tmask, output PC, @@ -42,7 +44,8 @@ interface VX_decode_if (); ); modport slave ( - input valid, + input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_fpu_req_if.sv b/hw/rtl/interfaces/VX_fpu_req_if.sv index 25867e42..62ea9255 100644 --- 
a/hw/rtl/interfaces/VX_fpu_req_if.sv +++ b/hw/rtl/interfaces/VX_fpu_req_if.sv @@ -6,6 +6,7 @@ interface VX_fpu_req_if (); wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,6 +21,7 @@ interface VX_fpu_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -35,6 +37,7 @@ interface VX_fpu_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_gpu_req_if.sv b/hw/rtl/interfaces/VX_gpu_req_if.sv index 50ac8c7c..027f7a2b 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.sv +++ b/hw/rtl/interfaces/VX_gpu_req_if.sv @@ -6,7 +6,7 @@ interface VX_gpu_req_if(); wire valid; - + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -19,11 +19,11 @@ interface VX_gpu_req_if(); wire [`NUM_THREADS-1:0][31:0] rs3_data; wire [`NR_BITS-1:0] rd; wire wb; - wire ready; modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -41,6 +41,7 @@ interface VX_gpu_req_if(); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_ibuffer_if.sv b/hw/rtl/interfaces/VX_ibuffer_if.sv index bb791737..2f9c17b6 100644 --- a/hw/rtl/interfaces/VX_ibuffer_if.sv +++ b/hw/rtl/interfaces/VX_ibuffer_if.sv @@ -6,6 +6,7 @@ interface VX_ibuffer_if (); wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -31,6 +32,7 @@ interface VX_ibuffer_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -55,6 +57,7 @@ interface VX_ibuffer_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_ifetch_req_if.sv b/hw/rtl/interfaces/VX_ifetch_req_if.sv index 3d75e736..95e88223 100644 --- a/hw/rtl/interfaces/VX_ifetch_req_if.sv +++ 
b/hw/rtl/interfaces/VX_ifetch_req_if.sv @@ -5,14 +5,16 @@ interface VX_ifetch_req_if (); - wire valid; + wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; wire ready; modport master ( - output valid, + output valid, + output uuid, output tmask, output wid, output PC, @@ -20,7 +22,8 @@ interface VX_ifetch_req_if (); ); modport slave ( - input valid, + input valid, + input uuid, input tmask, input wid, input PC, diff --git a/hw/rtl/interfaces/VX_ifetch_rsp_if.sv b/hw/rtl/interfaces/VX_ifetch_rsp_if.sv index a2f04fe4..f47e8749 100644 --- a/hw/rtl/interfaces/VX_ifetch_rsp_if.sv +++ b/hw/rtl/interfaces/VX_ifetch_rsp_if.sv @@ -6,6 +6,7 @@ interface VX_ifetch_rsp_if (); wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; @@ -13,7 +14,8 @@ interface VX_ifetch_rsp_if (); wire ready; modport master ( - output valid, + output valid, + output uuid, output tmask, output wid, output PC, @@ -22,7 +24,8 @@ interface VX_ifetch_rsp_if (); ); modport slave ( - input valid, + input valid, + input uuid, input tmask, input wid, input PC, diff --git a/hw/rtl/interfaces/VX_lsu_req_if.sv b/hw/rtl/interfaces/VX_lsu_req_if.sv index 4f31b17c..f52b22da 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.sv +++ b/hw/rtl/interfaces/VX_lsu_req_if.sv @@ -6,6 +6,7 @@ interface VX_lsu_req_if (); wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -21,6 +22,7 @@ interface VX_lsu_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -37,6 +39,7 @@ interface VX_lsu_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_perf_cache_if.sv b/hw/rtl/interfaces/VX_perf_cache_if.sv index d9efb2cc..0ec8d582 100644 --- a/hw/rtl/interfaces/VX_perf_cache_if.sv +++ b/hw/rtl/interfaces/VX_perf_cache_if.sv @@ -11,7 +11,7 @@ 
interface VX_perf_cache_if (); wire [`PERF_CTR_BITS-1:0] write_misses; wire [`PERF_CTR_BITS-1:0] bank_stalls; wire [`PERF_CTR_BITS-1:0] mshr_stalls; - wire [`PERF_CTR_BITS-1:0] pipe_stalls; + wire [`PERF_CTR_BITS-1:0] mem_stalls; wire [`PERF_CTR_BITS-1:0] crsp_stalls; modport master ( @@ -21,7 +21,7 @@ interface VX_perf_cache_if (); output write_misses, output bank_stalls, output mshr_stalls, - output pipe_stalls, + output mem_stalls, output crsp_stalls ); @@ -32,7 +32,7 @@ interface VX_perf_cache_if (); input write_misses, input bank_stalls, input mshr_stalls, - input pipe_stalls, + input mem_stalls, input crsp_stalls ); diff --git a/hw/rtl/interfaces/VX_perf_memsys_if.sv b/hw/rtl/interfaces/VX_perf_memsys_if.sv index f0e27ed6..9a38dc26 100644 --- a/hw/rtl/interfaces/VX_perf_memsys_if.sv +++ b/hw/rtl/interfaces/VX_perf_memsys_if.sv @@ -7,68 +7,50 @@ interface VX_perf_memsys_if (); wire [`PERF_CTR_BITS-1:0] icache_reads; wire [`PERF_CTR_BITS-1:0] icache_read_misses; - wire [`PERF_CTR_BITS-1:0] icache_pipe_stalls; - wire [`PERF_CTR_BITS-1:0] icache_crsp_stalls; - wire [`PERF_CTR_BITS-1:0] dcache_reads; - wire [`PERF_CTR_BITS-1:0] dcache_writes; + wire [`PERF_CTR_BITS-1:0] dcache_writes; wire [`PERF_CTR_BITS-1:0] dcache_read_misses; wire [`PERF_CTR_BITS-1:0] dcache_write_misses; wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls; wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls; - wire [`PERF_CTR_BITS-1:0] dcache_pipe_stalls; - wire [`PERF_CTR_BITS-1:0] dcache_crsp_stalls; - wire [`PERF_CTR_BITS-1:0] smem_reads; wire [`PERF_CTR_BITS-1:0] smem_writes; wire [`PERF_CTR_BITS-1:0] smem_bank_stalls; - wire [`PERF_CTR_BITS-1:0] mem_reads; wire [`PERF_CTR_BITS-1:0] mem_writes; - wire [`PERF_CTR_BITS-1:0] mem_stalls; wire [`PERF_CTR_BITS-1:0] mem_latency; modport master ( output icache_reads, output icache_read_misses, - output icache_pipe_stalls, - output icache_crsp_stalls, output dcache_reads, - output dcache_writes, + output dcache_writes, output dcache_read_misses, output 
dcache_write_misses, output dcache_bank_stalls, output dcache_mshr_stalls, - output dcache_pipe_stalls, - output dcache_crsp_stalls, output smem_reads, output smem_writes, output smem_bank_stalls, output mem_reads, output mem_writes, - output mem_stalls, output mem_latency ); modport slave ( input icache_reads, input icache_read_misses, - input icache_pipe_stalls, - input icache_crsp_stalls, input dcache_reads, - input dcache_writes, + input dcache_writes, input dcache_read_misses, input dcache_write_misses, input dcache_bank_stalls, input dcache_mshr_stalls, - input dcache_pipe_stalls, - input dcache_crsp_stalls, input smem_reads, input smem_writes, input smem_bank_stalls, input mem_reads, input mem_writes, - input mem_stalls, input mem_latency ); diff --git a/hw/rtl/interfaces/VX_perf_pipeline_if.sv b/hw/rtl/interfaces/VX_perf_pipeline_if.sv index 19cc15c3..a4470e4c 100644 --- a/hw/rtl/interfaces/VX_perf_pipeline_if.sv +++ b/hw/rtl/interfaces/VX_perf_pipeline_if.sv @@ -4,18 +4,27 @@ `include "VX_define.vh" interface VX_perf_pipeline_if (); - - wire [`PERF_CTR_BITS-1:0] ibf_stalls; - wire [`PERF_CTR_BITS-1:0] scb_stalls; - wire [`PERF_CTR_BITS-1:0] lsu_stalls; - wire [`PERF_CTR_BITS-1:0] csr_stalls; - wire [`PERF_CTR_BITS-1:0] alu_stalls; + wire [`PERF_CTR_BITS-1:0] loads; + wire [`PERF_CTR_BITS-1:0] stores; + wire [`PERF_CTR_BITS-1:0] branches; + + wire [`PERF_CTR_BITS-1:0] ibf_stalls; + wire [`PERF_CTR_BITS-1:0] scb_stalls; + wire [`PERF_CTR_BITS-1:0] lsu_stalls; + wire [`PERF_CTR_BITS-1:0] csr_stalls; + wire [`PERF_CTR_BITS-1:0] alu_stalls; `ifdef EXT_F_ENABLE - wire [`PERF_CTR_BITS-1:0] fpu_stalls; + wire [`PERF_CTR_BITS-1:0] fpu_stalls; `endif - wire [`PERF_CTR_BITS-1:0] gpu_stalls; + wire [`PERF_CTR_BITS-1:0] gpu_stalls; - modport master ( + modport decode ( + output loads, + output stores, + output branches + ); + + modport issue ( output ibf_stalls, output scb_stalls, output lsu_stalls, @@ -25,9 +34,12 @@ interface VX_perf_pipeline_if (); output 
fpu_stalls, `endif output gpu_stalls - ); + ); modport slave ( + input loads, + input stores, + input branches, input ibf_stalls, input scb_stalls, input lsu_stalls, diff --git a/hw/rtl/interfaces/VX_perf_tex_if.sv b/hw/rtl/interfaces/VX_perf_tex_if.sv new file mode 100644 index 00000000..222ade53 --- /dev/null +++ b/hw/rtl/interfaces/VX_perf_tex_if.sv @@ -0,0 +1,23 @@ +`ifndef VX_PERF_TEX_IF +`define VX_PERF_TEX_IF + +`include "VX_define.vh" + +interface VX_perf_tex_if (); + + wire [`PERF_CTR_BITS-1:0] mem_reads; + wire [`PERF_CTR_BITS-1:0] mem_latency; + + modport master ( + output mem_reads, + output mem_latency + ); + + modport slave ( + input mem_reads, + input mem_latency + ); + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_tex_csr_if.sv b/hw/rtl/interfaces/VX_tex_csr_if.sv index a83c9479..e0c626a5 100644 --- a/hw/rtl/interfaces/VX_tex_csr_if.sv +++ b/hw/rtl/interfaces/VX_tex_csr_if.sv @@ -8,17 +8,20 @@ interface VX_tex_csr_if (); wire write_enable; wire [`CSR_ADDR_BITS-1:0] write_addr; wire [31:0] write_data; + wire [`UUID_BITS-1:0] write_uuid; modport master ( output write_enable, output write_addr, - output write_data + output write_data, + output write_uuid ); modport slave ( input write_enable, input write_addr, - input write_data + input write_data, + input write_uuid ); endinterface diff --git a/hw/rtl/interfaces/VX_tex_req_if.sv b/hw/rtl/interfaces/VX_tex_req_if.sv index f1eaa1be..a3fec613 100644 --- a/hw/rtl/interfaces/VX_tex_req_if.sv +++ b/hw/rtl/interfaces/VX_tex_req_if.sv @@ -6,6 +6,7 @@ interface VX_tex_req_if (); wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,6 +21,7 @@ interface VX_tex_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -33,6 +35,7 @@ interface VX_tex_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git 
a/hw/rtl/interfaces/VX_tex_rsp_if.sv b/hw/rtl/interfaces/VX_tex_rsp_if.sv index b3dbd65d..b6fe625a 100644 --- a/hw/rtl/interfaces/VX_tex_rsp_if.sv +++ b/hw/rtl/interfaces/VX_tex_rsp_if.sv @@ -6,6 +6,7 @@ interface VX_tex_rsp_if (); wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -16,6 +17,7 @@ interface VX_tex_rsp_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -27,6 +29,7 @@ interface VX_tex_rsp_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_writeback_if.sv b/hw/rtl/interfaces/VX_writeback_if.sv index 8f05fc7a..6b93a04f 100644 --- a/hw/rtl/interfaces/VX_writeback_if.sv +++ b/hw/rtl/interfaces/VX_writeback_if.sv @@ -6,6 +6,7 @@ interface VX_writeback_if (); wire valid; + wire [`UUID_BITS-1:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; @@ -16,6 +17,7 @@ interface VX_writeback_if (); modport master ( output valid, + output uuid, output tmask, output wid, output PC, @@ -27,6 +29,7 @@ interface VX_writeback_if (); modport slave ( input valid, + input uuid, input tmask, input wid, input PC, diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index 2788c315..9e96eedb 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -125,7 +125,7 @@ module VX_axi_adapter #( // AXI write response channel `UNUSED_VAR (m_axi_bid); - `RUNTIME_ASSERT(~m_axi_bvalid || m_axi_bresp == 0, ("AXI response error")); + `RUNTIME_ASSERT(~m_axi_bvalid || m_axi_bresp == 0, ("%t: *** AXI response error", $time)); assign m_axi_bready = 1'b1; // AXI read request channel @@ -144,7 +144,7 @@ module VX_axi_adapter #( assign mem_rsp_valid = m_axi_rvalid; assign mem_rsp_tag = m_axi_rid; assign mem_rsp_data = m_axi_rdata; - `RUNTIME_ASSERT(~m_axi_rvalid || m_axi_rresp == 0, ("AXI response error")); + `RUNTIME_ASSERT(~m_axi_rvalid || 
m_axi_rresp == 0, ("%t: *** AXI response error", $time)); `UNUSED_VAR (m_axi_rlast); assign m_axi_rready = mem_rsp_ready; diff --git a/hw/rtl/libs/VX_index_queue.sv b/hw/rtl/libs/VX_index_queue.sv index 66307d74..201287fb 100644 --- a/hw/rtl/libs/VX_index_queue.sv +++ b/hw/rtl/libs/VX_index_queue.sv @@ -32,7 +32,7 @@ module VX_index_queue #( assign enqueue = push; assign dequeue = !empty && !valid[rd_a]; // auto-remove when head is invalid - `RUNTIME_ASSERT(!push || !full, ("invalid inputs")); + `RUNTIME_ASSERT(!push || !full, ("%t: *** invalid inputs", $time)); always @(posedge clk) begin if (reset) begin diff --git a/hw/rtl/libs/VX_popcount.sv b/hw/rtl/libs/VX_popcount.sv index 3144f106..8c8b08d3 100644 --- a/hw/rtl/libs/VX_popcount.sv +++ b/hw/rtl/libs/VX_popcount.sv @@ -4,12 +4,17 @@ module VX_popcount #( parameter MODEL = 1, parameter N = 1, - parameter LOGN = $clog2(N), - parameter M = LOGN+1 + parameter M = $clog2(N+1) ) ( input wire [N-1:0] in_i, output wire [M-1:0] cnt_o ); +`ifndef SYNTHESIS + assign cnt_o = $countones(in_i); +`else +`ifdef QUARTUS + assign cnt_o = $countones(in_i); +`else if (N == 1) begin assign cnt_o = in_i; @@ -53,6 +58,8 @@ module VX_popcount #( assign cnt_o = cnt_r; end +`endif +`endif endmodule `TRACING_ON \ No newline at end of file diff --git a/hw/rtl/libs/VX_skid_buffer.sv b/hw/rtl/libs/VX_skid_buffer.sv index ba6c8b6c..c6820f75 100644 --- a/hw/rtl/libs/VX_skid_buffer.sv +++ b/hw/rtl/libs/VX_skid_buffer.sv @@ -30,7 +30,7 @@ module VX_skid_buffer #( end else if (NOBACKPRESSURE) begin - `RUNTIME_ASSERT(ready_out, ("ready_out should always be asserted")) + `RUNTIME_ASSERT(ready_out, ("%t: *** ready_out should always be asserted", $time)) wire stall = valid_out && ~ready_out; diff --git a/hw/rtl/tex_unit/VX_tex_addr.sv b/hw/rtl/tex_unit/VX_tex_addr.sv index 26a20566..87da9cef 100644 --- a/hw/rtl/tex_unit/VX_tex_addr.sv +++ b/hw/rtl/tex_unit/VX_tex_addr.sv @@ -12,13 +12,14 @@ module VX_tex_addr #( input wire req_valid, input wire 
[NUM_REQS-1:0] req_tmask, - input wire [1:0][NUM_REQS-1:0][31:0] req_coords, + input wire [1:0][NUM_REQS-1:0][`TEX_FXD_BITS-1:0] req_coords, input wire [`TEX_FORMAT_BITS-1:0] req_format, input wire [`TEX_FILTER_BITS-1:0] req_filter, input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps, input wire [`TEX_ADDR_BITS-1:0] req_baseaddr, + input wire [NUM_REQS-1:0][`TEX_LOD_BITS-1:0] mip_level, input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff, - input wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] req_logdims, + input wire [NUM_REQS-1:0][1:0][`TEX_LOD_BITS-1:0] req_logdims, input wire [REQ_INFOW-1:0] req_info, output wire req_ready, @@ -27,31 +28,35 @@ module VX_tex_addr #( output wire rsp_valid, output wire [NUM_REQS-1:0] rsp_tmask, output wire [`TEX_FILTER_BITS-1:0] rsp_filter, - output wire [`TEX_STRIDE_BITS-1:0] rsp_stride, + output wire [`TEX_LGSTRIDE_BITS-1:0] rsp_lgstride, + output wire [NUM_REQS-1:0][31:0] rsp_baseaddr, output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr, - output wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends, + output wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends, output wire [REQ_INFOW-1:0] rsp_info, input wire rsp_ready ); `UNUSED_PARAM (CORE_ID) - localparam PITCH_BITS = `MAX(`TEX_DIM_BITS, `TEX_STRIDE_BITS) + 1; - localparam SCALED_U_W = `FIXED_INT + `TEX_STRIDE_BITS; - localparam SCALED_X_W = (2 * `FIXED_INT); - localparam SCALED_V_W = SCALED_X_W + `TEX_STRIDE_BITS; + localparam SHIFT_BITS = $clog2(`TEX_FXD_FRAC+1); + localparam PITCH_BITS = `MAX(`TEX_LOD_BITS, `TEX_LGSTRIDE_BITS) + 1; + localparam SCALED_DIM = `TEX_FXD_FRAC + `TEX_DIM_BITS; + localparam SCALED_X_W = `TEX_DIM_BITS + `TEX_BLEND_FRAC; + localparam OFFSET_U_W = `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX; + localparam OFFSET_V_W = `TEX_DIM_BITS + `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX; wire valid_s0; wire [NUM_REQS-1:0] tmask_s0; wire [`TEX_FILTER_BITS-1:0] filter_s0; wire [REQ_INFOW-1:0] req_info_s0; - wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_lo, clamped_lo_s0; - wire 
[NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_hi, clamped_hi_s0; - wire [`TEX_STRIDE_BITS-1:0] log_stride, log_stride_s0; + wire [NUM_REQS-1:0][1:0][`TEX_FXD_FRAC-1:0] clamped_lo, clamped_lo_s0; + wire [NUM_REQS-1:0][1:0][`TEX_FXD_FRAC-1:0] clamped_hi, clamped_hi_s0; + wire [NUM_REQS-1:0][1:0][SHIFT_BITS-1:0] dim_shift, dim_shift_s0; + wire [`TEX_LGSTRIDE_BITS-1:0] log_stride, log_stride_s0; wire [NUM_REQS-1:0][31:0] mip_addr, mip_addr_s0; - wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] log_dims_s0; wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0; - + wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0; + wire stall_out; // stride @@ -67,9 +72,9 @@ module VX_tex_addr #( for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar j = 0; j < 2; ++j) begin - wire [`FIXED_FRAC-1:0] delta = (`FIXED_HALF >> req_logdims[i][j]); - wire [31:0] coord_lo = req_filter ? (req_coords[j][i] - 32'(delta)) : req_coords[j][i]; - wire [31:0] coord_hi = req_filter ? (req_coords[j][i] + 32'(delta)) : req_coords[j][i]; + wire [`TEX_FXD_FRAC-1:0] delta = `TEX_FXD_FRAC'((SCALED_DIM'(`TEX_FXD_HALF) << mip_level[i]) >> req_logdims[i][j]); + wire [`TEX_FXD_BITS-1:0] coord_lo = req_filter ? (req_coords[j][i] - `TEX_FXD_BITS'(delta)) : req_coords[j][i]; + wire [`TEX_FXD_BITS-1:0] coord_hi = req_filter ? 
(req_coords[j][i] + `TEX_FXD_BITS'(delta)) : req_coords[j][i]; VX_tex_wrap #( .CORE_ID (CORE_ID) @@ -86,66 +91,67 @@ module VX_tex_addr #( .coord_i (coord_hi), .coord_o (clamped_hi[i][j]) ); + + assign dim_shift[i][j] = (`TEX_FXD_FRAC - `TEX_BLEND_FRAC - (req_logdims[i][j] - mip_level[i])); end - assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride); - assign mip_addr[i] = req_baseaddr + 32'(req_mipoff[i]); + assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0] - mip_level[i]) + PITCH_BITS'(log_stride); + assign mip_addr[i] = req_baseaddr + `TEX_ADDR_BITS'(req_mipoff[i]); end VX_pipe_register #( - .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)), + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * SHIFT_BITS + `TEX_ADDR_BITS + 2 * 2 * `TEX_FXD_FRAC)), .RESETW (1) ) pipe_reg0 ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, req_logdims, mip_addr, clamped_lo, clamped_hi}), - .data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0}) + .data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, dim_shift, mip_addr, clamped_lo, clamped_hi}), + .data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, dim_shift_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0}) ); // addresses generation - wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_lo; - wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_hi; - wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] blends; + wire [NUM_REQS-1:0][1:0][SCALED_X_W-1:0] scaled_lo; + wire [NUM_REQS-1:0][1:0][SCALED_X_W-1:0] scaled_hi; + wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_lo; + wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_hi; + wire [NUM_REQS-1:0][OFFSET_V_W-1:0] 
offset_v_lo; + wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_hi; + wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] blends; wire [NUM_REQS-1:0][3:0][31:0] addr; for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar j = 0; j < 2; ++j) begin - assign scaled_lo[i][j] = scale_to_dim(clamped_lo_s0[i][j], log_dims_s0[i][j]); - assign scaled_hi[i][j] = scale_to_dim(clamped_hi_s0[i][j], log_dims_s0[i][j]); - assign blends[i][j] = filter_s0 ? clamped_lo_s0[i][j][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0); + assign scaled_lo[i][j] = SCALED_X_W'(clamped_lo_s0[i][j] >> dim_shift_s0[i][j]); + assign scaled_hi[i][j] = SCALED_X_W'(clamped_hi_s0[i][j] >> dim_shift_s0[i][j]); + assign blends[i][j] = filter_s0 ? scaled_lo[i][j][`TEX_BLEND_FRAC-1:0] : `TEX_BLEND_FRAC'(0); end end - `UNUSED_VAR (log_pitch_s0) - for (genvar i = 0; i < NUM_REQS; ++i) begin - wire [SCALED_U_W-1:0] offset_u_lo = SCALED_U_W'(scaled_lo[i][0]) << log_stride_s0; - wire [SCALED_U_W-1:0] offset_u_hi = SCALED_U_W'(scaled_hi[i][0]) << log_stride_s0; + assign offset_u_lo[i] = OFFSET_U_W'(scaled_lo[i][0][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_stride_s0; + assign offset_u_hi[i] = OFFSET_U_W'(scaled_hi[i][0][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_stride_s0; - wire [SCALED_V_W-1:0] offset_v_lo = SCALED_V_W'(scaled_lo[i][1]) << log_pitch_s0[i]; - wire [SCALED_V_W-1:0] offset_v_hi = SCALED_V_W'(scaled_hi[i][1]) << log_pitch_s0[i]; + assign offset_v_lo[i] = OFFSET_V_W'(scaled_lo[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i]; + assign offset_v_hi[i] = OFFSET_V_W'(scaled_hi[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i]; - wire [31:0] base_addr_lo = mip_addr_s0[i] + 32'(offset_v_lo); - wire [31:0] base_addr_hi = mip_addr_s0[i] + 32'(offset_v_hi); - - assign addr[i][0] = base_addr_lo + 32'(offset_u_lo); - assign addr[i][1] = base_addr_lo + 32'(offset_u_hi); - assign addr[i][2] = base_addr_hi + 32'(offset_u_lo); - assign addr[i][3] = base_addr_hi + 32'(offset_u_hi); + assign addr[i][0] = 
32'(offset_v_lo[i]) + 32'(offset_u_lo[i]); + assign addr[i][1] = 32'(offset_v_lo[i]) + 32'(offset_u_hi[i]); + assign addr[i][2] = 32'(offset_v_hi[i]) + 32'(offset_u_lo[i]); + assign addr[i][3] = 32'(offset_v_hi[i]) + 32'(offset_u_hi[i]); end assign stall_out = rsp_valid && ~rsp_ready; VX_pipe_register #( - .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFOW), + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (NUM_REQS * 32) + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `TEX_BLEND_FRAC) + REQ_INFOW), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}), - .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride, rsp_addr, rsp_blends, rsp_info}) + .data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, mip_addr_s0, addr, blends, req_info_s0}), + .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_lgstride, rsp_baseaddr, rsp_addr, rsp_blends, rsp_info}) ); assign req_ready = ~stall_out; @@ -157,22 +163,45 @@ module VX_tex_addr #( assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; always @(posedge clk) begin + if (req_valid && ~stall_out) begin + dpi_trace("%d: *** log_pitch=", $time); + `TRACE_ARRAY1D(log_pitch, NUM_REQS); + dpi_trace(", mip_addr="); + `TRACE_ARRAY1D(mip_addr, NUM_REQS); + dpi_trace(", req_logdims="); + `TRACE_ARRAY2D(req_logdims, 2, NUM_REQS); + dpi_trace(", clamped_lo="); + `TRACE_ARRAY2D(clamped_lo, 2, NUM_REQS); + dpi_trace(", clamped_hi="); + `TRACE_ARRAY2D(clamped_hi, 2, NUM_REQS); + dpi_trace(", mip_addr="); + `TRACE_ARRAY1D(mip_addr, NUM_REQS); + dpi_trace("\n"); + end + + if (valid_s0 && ~stall_out) begin + dpi_trace("%d: *** scaled_lo=", $time); + `TRACE_ARRAY2D(scaled_lo, 2, NUM_REQS); + dpi_trace(", scaled_hi="); + `TRACE_ARRAY2D(scaled_hi, 2, NUM_REQS); + dpi_trace(", offset_u_lo="); + `TRACE_ARRAY1D(offset_u_lo, NUM_REQS); + dpi_trace(", 
offset_u_hi="); + `TRACE_ARRAY1D(offset_u_hi, NUM_REQS); + dpi_trace(", offset_v_lo="); + `TRACE_ARRAY1D(offset_v_lo, NUM_REQS); + dpi_trace(", offset_v_hi="); + `TRACE_ARRAY1D(offset_v_hi, NUM_REQS); + dpi_trace("\n"); + end + if (rsp_valid && rsp_ready) begin - dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, tride=%0d, addr=", - $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride); + dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, lgstride=%0d, addr=", + $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_lgstride); `TRACE_ARRAY2D(rsp_addr, 4, NUM_REQS); dpi_trace("\n"); end end `endif -function logic [`FIXED_INT-1:0] scale_to_dim (input logic [`FIXED_FRAC-1:0] src, - input logic [`TEX_DIM_BITS-1:0] dim); -`IGNORE_WARNINGS_BEGIN - logic [`FIXED_BITS-1:0] out; -`IGNORE_WARNINGS_END - out = `FIXED_BITS'(src) << dim; - return out[`FIXED_FRAC +: `FIXED_INT]; -endfunction - endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_define.vh b/hw/rtl/tex_unit/VX_tex_define.vh index 16272fc9..381069fc 100644 --- a/hw/rtl/tex_unit/VX_tex_define.vh +++ b/hw/rtl/tex_unit/VX_tex_define.vh @@ -3,37 +3,49 @@ `include "VX_define.vh" -`define FIXED_BITS 32 -`define FIXED_FRAC 20 -`define FIXED_INT (`FIXED_BITS - `FIXED_FRAC) -`define FIXED_ONE (2 ** `FIXED_FRAC) -`define FIXED_HALF (`FIXED_ONE >> 1) -`define FIXED_MASK (`FIXED_ONE - 1) +`define TEX_FXD_INT (`TEX_FXD_BITS - `TEX_FXD_FRAC) +`define TEX_FXD_ONE (2 ** `TEX_FXD_FRAC) +`define TEX_FXD_HALF (`TEX_FXD_ONE >> 1) +`define TEX_FXD_MASK (`TEX_FXD_ONE - 1) `define TEX_ADDR_BITS 32 `define TEX_FORMAT_BITS 3 `define TEX_WRAP_BITS 2 -`define TEX_DIM_BITS 4 `define TEX_FILTER_BITS 1 +`define TEX_MIPOFF_BITS (2*`TEX_DIM_BITS+1) -`define TEX_MIPOFF_BITS (2*12+1) -`define TEX_STRIDE_BITS 2 - -`define TEX_LOD_BITS 4 -`define TEX_MIP_BITS (`NTEX_BITS + `TEX_LOD_BITS) +`define TEX_LGSTRIDE_MAX 2 +`define TEX_LGSTRIDE_BITS 2 `define 
TEX_WRAP_CLAMP 0 `define TEX_WRAP_REPEAT 1 `define TEX_WRAP_MIRROR 2 -`define BLEND_FRAC 8 -`define BLEND_ONE (2 ** `BLEND_FRAC) +`define TEX_BLEND_FRAC 8 +`define TEX_BLEND_ONE (2 ** `TEX_BLEND_FRAC) -`define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0) +`define TEX_FORMAT_A8R8G8B8 `TEX_FORMAT_BITS'(0) `define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1) -`define TEX_FORMAT_R4G4B4A4 `TEX_FORMAT_BITS'(2) -`define TEX_FORMAT_L8A8 `TEX_FORMAT_BITS'(3) -`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(4) -`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(5) +`define TEX_FORMAT_A1R5G5B5 `TEX_FORMAT_BITS'(2) +`define TEX_FORMAT_A4R4G4B4 `TEX_FORMAT_BITS'(3) +`define TEX_FORMAT_A8L8 `TEX_FORMAT_BITS'(4) +`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(5) +`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(6) + +task trace_tex_state ( + input [`CSR_ADDR_BITS-1:0] state +); + case (state) + `CSR_TEX_ADDR: dpi_trace("ADDR"); + `CSR_TEX_WIDTH: dpi_trace("WIDTH"); + `CSR_TEX_HEIGHT: dpi_trace("HEIGHT"); + `CSR_TEX_FORMAT: dpi_trace("FORMAT"); + `CSR_TEX_FILTER: dpi_trace("FILTER"); + `CSR_TEX_WRAPU: dpi_trace("WRAPU"); + `CSR_TEX_WRAPV: dpi_trace("WRAPV"); + //`CSR_TEX_MIPOFF + default: dpi_trace("MIPOFF"); + endcase +endtask `endif \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_format.sv b/hw/rtl/tex_unit/VX_tex_format.sv index 91e0e6f8..e299ed17 100644 --- a/hw/rtl/tex_unit/VX_tex_format.sv +++ b/hw/rtl/tex_unit/VX_tex_format.sv @@ -13,25 +13,31 @@ module VX_tex_format #( always @(*) begin case (format) - `TEX_FORMAT_R8G8B8A8: begin + `TEX_FORMAT_A8R8G8B8: begin texel_out_r[07:00] = texel_in[7:0]; texel_out_r[15:08] = texel_in[15:8]; texel_out_r[23:16] = texel_in[23:16]; texel_out_r[31:24] = texel_in[31:24]; end `TEX_FORMAT_R5G6B5: begin - texel_out_r[07:00] = {texel_in[15:11], texel_in[15:13]}; + texel_out_r[07:00] = {texel_in[4:0], texel_in[4:2]}; texel_out_r[15:08] = {texel_in[10:5], texel_in[10:9]}; - texel_out_r[23:16] = {texel_in[4:0], texel_in[4:2]}; + texel_out_r[23:16] = {texel_in[15:11], 
texel_in[15:13]}; texel_out_r[31:24] = 8'hff; end - `TEX_FORMAT_R4G4B4A4: begin - texel_out_r[07:00] = {texel_in[11:8], texel_in[15:12]}; + `TEX_FORMAT_A1R5G5B5: begin + texel_out_r[07:00] = {texel_in[4:0], texel_in[4:2]}; + texel_out_r[15:08] = {texel_in[9:5], texel_in[9:7]}; + texel_out_r[23:16] = {texel_in[14:10], texel_in[14:12]}; + texel_out_r[31:24] = {8{texel_in[15]}}; + end + `TEX_FORMAT_A4R4G4B4: begin + texel_out_r[07:00] = {2{texel_in[3:0]}}; texel_out_r[15:08] = {2{texel_in[7:4]}}; - texel_out_r[23:16] = {2{texel_in[3:0]}}; + texel_out_r[23:16] = {2{texel_in[11:8]}}; texel_out_r[31:24] = {2{texel_in[15:12]}}; end - `TEX_FORMAT_L8A8: begin + `TEX_FORMAT_A8L8: begin texel_out_r[07:00] = texel_in[7:0]; texel_out_r[15:08] = texel_in[7:0]; texel_out_r[23:16] = texel_in[7:0]; @@ -45,9 +51,9 @@ module VX_tex_format #( end //`TEX_FORMAT_A8 default: begin - texel_out_r[07:00] = 0; - texel_out_r[15:08] = 0; - texel_out_r[23:16] = 0; + texel_out_r[07:00] = 8'hff; + texel_out_r[15:08] = 8'hff; + texel_out_r[23:16] = 8'hff; texel_out_r[31:24] = texel_in[7:0]; end endcase diff --git a/hw/rtl/tex_unit/VX_tex_lerp.sv b/hw/rtl/tex_unit/VX_tex_lerp.sv index 6dce57e3..7f35ac38 100644 --- a/hw/rtl/tex_unit/VX_tex_lerp.sv +++ b/hw/rtl/tex_unit/VX_tex_lerp.sv @@ -3,12 +3,11 @@ module VX_tex_lerp ( input wire [3:0][7:0] in1, input wire [3:0][7:0] in2, - input wire [8:0] alpha, - input wire [7:0] beta, + input wire [7:0] frac, output wire [3:0][7:0] out -); +); for (genvar i = 0; i < 4; ++i) begin - wire [16:0] sum = in1[i] * alpha + in2[i] * beta; + wire [16:0] sum = in1[i] * 8'(8'hff - frac) + in2[i] * frac; `UNUSED_VAR (sum) assign out[i] = sum[15:8]; end diff --git a/hw/rtl/tex_unit/VX_tex_mem.sv b/hw/rtl/tex_unit/VX_tex_mem.sv index 91aa0438..73f9367c 100644 --- a/hw/rtl/tex_unit/VX_tex_mem.sv +++ b/hw/rtl/tex_unit/VX_tex_mem.sv @@ -15,7 +15,8 @@ module VX_tex_mem #( input wire req_valid, input wire [NUM_REQS-1:0] req_tmask, input wire [`TEX_FILTER_BITS-1:0] req_filter, - 
input wire [`TEX_STRIDE_BITS-1:0] req_stride, + input wire [`TEX_LGSTRIDE_BITS-1:0] req_lgstride, + input wire [NUM_REQS-1:0][31:0] req_baseaddr, input wire [NUM_REQS-1:0][3:0][31:0] req_addr, input wire [REQ_INFOW-1:0] req_info, output wire req_ready, @@ -32,6 +33,14 @@ module VX_tex_mem #( localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1); + // full address calculation + wire [NUM_REQS-1:0][3:0][31:0] full_addr; + for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar j = 0; j < 4; ++j) begin + assign full_addr[i][j] = req_baseaddr[i] + req_addr[i][j]; + end + end + wire [3:0] dup_reqs; wire [3:0][NUM_REQS-1:0][29:0] req_addr_w; wire [3:0][NUM_REQS-1:0][1:0] align_offs; @@ -40,17 +49,17 @@ module VX_tex_mem #( for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar j = 0; j < 4; ++j) begin - assign req_addr_w[j][i] = req_addr[i][j][31:2]; - assign align_offs[j][i] = req_addr[i][j][1:0]; + assign req_addr_w[j][i] = full_addr[i][j][31:2]; + assign align_offs[j][i] = full_addr[i][j][1:0]; end end - // find duplicate addresses + // detect duplicate addresses for (genvar i = 0; i < 4; ++i) begin - wire [NUM_REQS-1:0] addr_matches; - for (genvar j = 0; j < NUM_REQS; j++) begin - assign addr_matches[j] = (req_addr_w[i][0] == req_addr_w[i][j]) || ~req_tmask[j]; + wire [NUM_REQS-2:0] addr_matches; + for (genvar j = 0; j < (NUM_REQS-1); ++j) begin + assign addr_matches[j] = (req_addr_w[i][j+1] == req_addr_w[i][0]) || ~req_tmask[j+1]; end assign dup_reqs[i] = req_tmask[0] && (& addr_matches); end @@ -63,23 +72,26 @@ module VX_tex_mem #( wire [NUM_REQS-1:0] q_req_tmask; wire [`TEX_FILTER_BITS-1:0] q_req_filter; wire [REQ_INFOW-1:0] q_req_info; - wire [`TEX_STRIDE_BITS-1:0] q_req_stride; + wire [`TEX_LGSTRIDE_BITS-1:0] q_req_lgstride; wire [3:0][NUM_REQS-1:0][1:0] q_align_offs; wire [3:0] q_dup_reqs; + wire [`NW_BITS-1:0] q_req_wid; + wire [31:0] q_req_PC; + wire [`UUID_BITS-1:0] q_req_uuid; assign reqq_push = req_valid && req_ready; VX_fifo_queue #( - .DATAW ((NUM_REQS * 4 
* 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4), - .SIZE (`LSUQ_SIZE), + .DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (4 * NUM_REQS * 2) + 4), + .SIZE (`TEXQ_SIZE), .OUT_REG (1) ) req_queue ( .clk (clk), .reset (reset), .push (reqq_push), .pop (reqq_pop), - .data_in ({req_addr_w, req_tmask, req_info, req_filter, req_stride, align_offs, dup_reqs}), - .data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}), + .data_in ({req_addr_w, req_tmask, req_info, req_filter, req_lgstride, align_offs, dup_reqs}), + .data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_lgstride, q_align_offs, q_dup_reqs}), .empty (reqq_empty), .full (reqq_full), `UNUSED_PIN (alm_full), @@ -143,17 +155,16 @@ module VX_tex_mem #( wire [NUM_REQS-1:0] req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1}; + assign {q_req_wid, q_req_PC, q_req_uuid} = q_req_info[`NW_BITS+32+`UUID_BITS-1:0]; + `UNUSED_VAR (q_req_wid) + `UNUSED_VAR (q_req_PC) + assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask; assign dcache_req_if.rw = {NUM_REQS{1'b0}}; assign dcache_req_if.addr = req_texel_addr; - assign dcache_req_if.byteen = {NUM_REQS{4'b1111}}; + assign dcache_req_if.byteen = {NUM_REQS{4'b0}}; assign dcache_req_if.data = 'x; - -`ifdef DBG_CACHE_REQ_INFO - assign dcache_req_if.tag = {NUM_REQS{q_req_info[`DBG_CACHE_REQ_MDATAW-1:0], req_texel_idx}}; -`else - assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}}; -`endif + assign dcache_req_if.tag = {NUM_REQS{q_req_uuid, `LSU_TAG_ID_BITS'(req_texel_idx), `CACHE_ADDR_TYPE_BITS'(0)}}; // Dcache Response @@ -162,14 +173,18 @@ module VX_tex_mem #( reg [NUM_REQS-1:0][31:0] rsp_data_qual; reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init; wire [RSP_CTR_W-1:0] rsp_rem_ctr_n; + wire [NUM_REQS-1:0][1:0] rsp_align_offs; + wire [$clog2(NUM_REQS+1)-1:0] q_req_size; + wire 
[$clog2(NUM_REQS+1)-1:0] dcache_rsp_size; wire dcache_rsp_fire; wire [1:0] rsp_texel_idx; wire rsp_texel_dup; - - assign rsp_texel_idx = dcache_rsp_if.tag[1:0]; + + assign rsp_texel_idx = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: 2]; `UNUSED_VAR (dcache_rsp_if.tag) assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx]; + assign rsp_align_offs = q_align_offs[rsp_texel_idx]; assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready; @@ -180,12 +195,12 @@ module VX_tex_mem #( reg [31:0] rsp_data_shifted; always @(*) begin rsp_data_shifted[31:16] = src_data[31:16]; - rsp_data_shifted[15:0] = q_align_offs[rsp_texel_idx][i][1] ? src_data[31:16] : src_data[15:0]; - rsp_data_shifted[7:0] = q_align_offs[rsp_texel_idx][i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0]; + rsp_data_shifted[15:0] = rsp_align_offs[i][1] ? src_data[31:16] : src_data[15:0]; + rsp_data_shifted[7:0] = rsp_align_offs[i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0]; end always @(*) begin - case (q_req_stride) + case (q_req_lgstride) 0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]); 1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]); default: rsp_data_qual[i] = rsp_data_shifted; @@ -206,16 +221,21 @@ module VX_tex_mem #( end end + `POP_COUNT(q_req_size, q_req_tmask); + always @(*) begin - rsp_rem_ctr_init = RSP_CTR_W'($countones(q_dup_reqs[0] ? NUM_REQS'(1) : q_req_tmask)); + rsp_rem_ctr_init = q_dup_reqs[0] ? RSP_CTR_W'(1) : RSP_CTR_W'(q_req_size); if (q_req_filter) begin for (integer i = 1; i < 4; ++i) begin - rsp_rem_ctr_init += RSP_CTR_W'($countones(q_dup_reqs[i] ? NUM_REQS'(1) : q_req_tmask)); + rsp_rem_ctr_init += q_dup_reqs[i] ? 
RSP_CTR_W'(1) : RSP_CTR_W'(q_req_size); end end end - assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'($countones(dcache_rsp_if.tmask)); + wire [NUM_REQS-1:0] dcache_rsp_tmask = dcache_rsp_if.tmask; + `POP_COUNT(dcache_rsp_size, dcache_rsp_tmask); + + assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'(dcache_rsp_size); always @(posedge clk) begin if (reset) begin @@ -237,7 +257,7 @@ module VX_tex_mem #( wire stall_out = rsp_valid && ~rsp_ready; - wire is_last_rsp = (0 == rsp_rem_ctr_n); + wire is_last_rsp = (rsp_rem_ctr == RSP_CTR_W'(dcache_rsp_size)); wire rsp_texels_done = dcache_rsp_fire && is_last_rsp; @@ -257,37 +277,39 @@ module VX_tex_mem #( // Can accept new cache response? assign dcache_rsp_if.ready = ~(is_last_rsp && stall_out); -`ifdef DBG_TRACE_TEX - wire [`NW_BITS-1:0] q_req_wid, req_wid, rsp_wid; - wire [31:0] q_req_PC, req_PC, rsp_PC; - assign {q_req_wid, q_req_PC} = q_req_info[`NW_BITS+32-1:0]; - assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0]; - assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; +`ifdef DBG_TRACE_TEX + wire [`NW_BITS-1:0] req_wid, rsp_wid; + wire [31:0] req_PC, rsp_PC; + wire [`UUID_BITS-1:0] req_uuid, rsp_uuid; + assign {req_wid, req_PC, req_uuid} = req_info[`NW_BITS+32+`UUID_BITS-1:0]; + assign {rsp_wid, rsp_PC, rsp_uuid} = rsp_info[`NW_BITS+32+`UUID_BITS-1:0]; always @(posedge clk) begin if (dcache_req_fire_any) begin dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=", $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_texel_idx); `TRACE_ARRAY1D(req_texel_addr, NUM_REQS); - dpi_trace(", is_dup=%b\n", req_texel_dup); + dpi_trace(", is_dup=%b (#%0d)\n", req_texel_dup, q_req_uuid); end if (dcache_rsp_fire) begin dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=", $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_texel_idx); `TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", q_req_uuid); end if (req_valid && 
req_ready) begin - dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=", - $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride); + dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, lgstride=%0d, baseaddr=", + $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_lgstride); + `TRACE_ARRAY1D(req_baseaddr, NUM_REQS); + dpi_trace(", addr="); `TRACE_ARRAY2D(req_addr, 4, NUM_REQS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", req_uuid); end if (rsp_valid && rsp_ready) begin dpi_trace("%d: core%0d-tex-mem-rsp: wid=%0d, PC=%0h, tmask=%b, data=", $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask); `TRACE_ARRAY2D(rsp_data, 4, NUM_REQS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", rsp_uuid); end end `endif diff --git a/hw/rtl/tex_unit/VX_tex_sampler.sv b/hw/rtl/tex_unit/VX_tex_sampler.sv index ac0f1496..dffc5cf0 100644 --- a/hw/rtl/tex_unit/VX_tex_sampler.sv +++ b/hw/rtl/tex_unit/VX_tex_sampler.sv @@ -12,7 +12,7 @@ module VX_tex_sampler #( input wire req_valid, input wire [NUM_REQS-1:0] req_tmask, input wire [`TEX_FORMAT_BITS-1:0] req_format, - input wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] req_blends, + input wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] req_blends, input wire [NUM_REQS-1:0][3:0][31:0] req_data, input wire [REQ_INFOW-1:0] req_info, output wire req_ready, @@ -27,75 +27,78 @@ module VX_tex_sampler #( `UNUSED_PARAM (CORE_ID) - wire valid_s0; - wire [NUM_REQS-1:0] tmask_s0; - wire [REQ_INFOW-1:0] req_info_s0; + wire valid_s0, valid_s1; + wire [NUM_REQS-1:0] req_tmask_s0, req_tmask_s1; + wire [REQ_INFOW-1:0] req_info_s0, req_info_s1; wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh; - wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0; - wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] blend_v, blend_v_s0; + wire [NUM_REQS-1:0][31:0] texel_ul_s1, texel_uh_s1; + wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] req_blends_s0; + wire [NUM_REQS-1:0][`TEX_BLEND_FRAC-1:0] blend_v, blend_v_s1; wire 
[NUM_REQS-1:0][31:0] texel_v; + wire [NUM_REQS-1:0][3:0][31:0] fmt_texels, fmt_texels_s0; wire stall_out; for (genvar i = 0; i < NUM_REQS; ++i) begin - - wire [3:0][31:0] fmt_texels; - for (genvar j = 0; j < 4; ++j) begin VX_tex_format #( .CORE_ID (CORE_ID) ) tex_format ( .format (req_format), .texel_in (req_data[i][j]), - .texel_out (fmt_texels[j]) + .texel_out (fmt_texels[i][j]) ); - end - - wire [7:0] beta = req_blends[i][0]; - wire [8:0] alpha = `BLEND_ONE - beta; - - VX_tex_lerp #( - ) tex_lerp_ul ( - .in1 (fmt_texels[0]), - .in2 (fmt_texels[1]), - .alpha (alpha), - .beta (beta), - .out (texel_ul[i]) - ); - - VX_tex_lerp #( - ) tex_lerp_uh ( - .in1 (fmt_texels[2]), - .in2 (fmt_texels[3]), - .alpha (alpha), - .beta (beta), - .out (texel_uh[i]) - ); - - assign blend_v[i] = req_blends[i][1]; + end end VX_pipe_register #( - .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)), + .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 2 * `TEX_BLEND_FRAC) + (NUM_REQS * 4 * 32)), .RESETW (1) ) pipe_reg0 ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({req_valid, req_tmask, req_info, blend_v, texel_ul, texel_uh}), - .data_out ({valid_s0, tmask_s0, req_info_s0, blend_v_s0, texel_ul_s0, texel_uh_s0}) + .data_in ({req_valid, req_tmask, req_info, req_blends, fmt_texels}), + .data_out ({valid_s0, req_tmask_s0, req_info_s0, req_blends_s0, fmt_texels_s0}) + ); + + for (genvar i = 0; i < NUM_REQS; ++i) begin + VX_tex_lerp #( + ) tex_lerp_ul ( + .in1 (fmt_texels_s0[i][0]), + .in2 (fmt_texels_s0[i][1]), + .frac (req_blends_s0[i][0]), + .out (texel_ul[i]) + ); + + VX_tex_lerp #( + ) tex_lerp_uh ( + .in1 (fmt_texels_s0[i][2]), + .in2 (fmt_texels_s0[i][3]), + .frac (req_blends_s0[i][0]), + .out (texel_uh[i]) + ); + + assign blend_v[i] = req_blends_s0[i][1]; + end + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `TEX_BLEND_FRAC) + (2 * NUM_REQS * 32)), + .RESETW (1) + ) pipe_reg1 ( + .clk (clk), + .reset (reset), + 
.enable (~stall_out), + .data_in ({valid_s0, req_tmask_s0, req_info_s0, blend_v, texel_ul, texel_uh}), + .data_out ({valid_s1, req_tmask_s1, req_info_s1, blend_v_s1, texel_ul_s1, texel_uh_s1}) ); for (genvar i = 0; i < NUM_REQS; i++) begin - wire [7:0] beta = blend_v_s0[i]; - wire [8:0] alpha = `BLEND_ONE - beta; - VX_tex_lerp #( ) tex_lerp_v ( - .in1 (texel_ul_s0[i]), - .in2 (texel_uh_s0[i]), - .alpha (alpha), - .beta (beta), + .in1 (texel_ul_s1[i]), + .in2 (texel_uh_s1[i]), + .frac (blend_v_s1[i]), .out (texel_v[i]) ); end @@ -105,12 +108,12 @@ module VX_tex_sampler #( VX_pipe_register #( .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 32)), .RESETW (1) - ) pipe_reg1 ( + ) pipe_reg2 ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({valid_s0, tmask_s0, req_info_s0, texel_v}), - .data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data}) + .data_in ({valid_s1, req_tmask_s1, req_info_s1, texel_v}), + .data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data}) ); // can accept new request? 
diff --git a/hw/rtl/tex_unit/VX_tex_stride.sv b/hw/rtl/tex_unit/VX_tex_stride.sv index 50393fe9..3f1427bb 100644 --- a/hw/rtl/tex_unit/VX_tex_stride.sv +++ b/hw/rtl/tex_unit/VX_tex_stride.sv @@ -4,21 +4,22 @@ module VX_tex_stride #( parameter CORE_ID = 0 ) ( input wire [`TEX_FORMAT_BITS-1:0] format, - output wire [`TEX_STRIDE_BITS-1:0] log_stride + output wire [`TEX_LGSTRIDE_BITS-1:0] log_stride ); `UNUSED_PARAM (CORE_ID) - reg [`TEX_STRIDE_BITS-1:0] log_stride_r; + reg [`TEX_LGSTRIDE_BITS-1:0] log_stride_r; always @(*) begin case (format) - `TEX_FORMAT_A8: log_stride_r = 0; - `TEX_FORMAT_L8: log_stride_r = 0; - `TEX_FORMAT_L8A8: log_stride_r = 1; - `TEX_FORMAT_R5G6B5: log_stride_r = 1; - `TEX_FORMAT_R4G4B4A4: log_stride_r = 1; - //`TEX_FORMAT_R8G8B8A8 - default: log_stride_r = 2; + `TEX_FORMAT_A8R8G8B8: log_stride_r = 2; + `TEX_FORMAT_R5G6B5, + `TEX_FORMAT_A1R5G5B5, + `TEX_FORMAT_A4R4G4B4, + `TEX_FORMAT_A8L8: log_stride_r = 1; + // `TEX_FORMAT_L8: + // `TEX_FORMAT_A8: + default: log_stride_r = 0; endcase end diff --git a/hw/rtl/tex_unit/VX_tex_unit.sv b/hw/rtl/tex_unit/VX_tex_unit.sv index 6be6aa43..9045c5aa 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.sv +++ b/hw/rtl/tex_unit/VX_tex_unit.sv @@ -6,6 +6,11 @@ module VX_tex_unit #( input wire clk, input wire reset, + // PERF +`ifdef PERF_ENABLE + VX_perf_tex_if.master perf_tex_if, +`endif + // Texture unit <-> Memory Unit VX_dcache_req_if.master dcache_req_if, VX_dcache_rsp_if.slave dcache_rsp_if, @@ -18,74 +23,73 @@ module VX_tex_unit #( VX_tex_rsp_if.master tex_rsp_if ); - localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32; - localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S; - localparam REQ_INFOW_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFOW_A; + localparam REQ_INFO_W = `NR_BITS + 1 + `NW_BITS + 32 + `UUID_BITS; + localparam BLEND_FRAC_W = (2 * `NUM_THREADS * `TEX_BLEND_FRAC); - reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; - reg [1:0][`TEX_DIM_BITS-1:0] tex_dims 
[`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; + reg [$clog2(`NUM_TEX_UNITS)-1:0] csr_tex_unit; + reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(`TEX_LOD_MAX+1)-1:0]; + reg [1:0][`TEX_LOD_BITS-1:0] tex_logdims [`NUM_TEX_UNITS-1:0]; + reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0]; reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0]; reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0]; - reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0]; reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0]; - // CSRs programming + // CSRs programming - reg [`NUM_TEX_UNITS-1:0] csrs_dirty; - `UNUSED_VAR (csrs_dirty) - - for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin - wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS]; - always @(posedge clk) begin - if (tex_csr_if.write_enable) begin - case (tex_csr_if.write_addr) - `CSR_TEX_ADDR(i) : begin - tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; - csrs_dirty[i] <= 1; + always @(posedge clk) begin + if (tex_csr_if.write_enable) begin + case (tex_csr_if.write_addr) + `CSR_TEX_UNIT: begin + csr_tex_unit <= tex_csr_if.write_data[$clog2(`NUM_TEX_UNITS)-1:0]; + end + `CSR_TEX_ADDR: begin + tex_baddr[csr_tex_unit] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; + end + `CSR_TEX_FORMAT: begin + tex_format[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; + end + `CSR_TEX_WRAPU: begin + tex_wraps[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; + end + `CSR_TEX_WRAPV: begin + tex_wraps[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; + end + `CSR_TEX_FILTER: begin + tex_filter[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; + end + `CSR_TEX_WIDTH: begin + tex_logdims[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; + end + `CSR_TEX_HEIGHT: begin + tex_logdims[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; + end + default: begin + for (integer j = 0; j <= `TEX_LOD_MAX; 
++j) begin + `IGNORE_WARNINGS_BEGIN + if (tex_csr_if.write_addr == `CSR_TEX_MIPOFF(j)) begin + `IGNORE_WARNINGS_END + tex_mipoff[csr_tex_unit][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; + end end - `CSR_TEX_FORMAT(i) : begin - tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX_WRAP(i) : begin - tex_wraps[i][0] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS]; - tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS]; - csrs_dirty[i] <= 1; - end - `CSR_TEX_FILTER(i) : begin - tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX_MIPOFF(i) : begin - tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX_WIDTH(i) : begin - tex_dims[i][mip_level][0] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX_HEIGHT(i) : begin - tex_dims[i][mip_level][1] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; - csrs_dirty[i] <= 1; - end - endcase - end - if (reset || (tex_req_if.valid && tex_req_if.ready)) begin - csrs_dirty[i] <= '0; - end + end + endcase end end + wire [`UUID_BITS-1:0] write_uuid = tex_csr_if.write_uuid; + `UNUSED_VAR (write_uuid); // mipmap attributes - wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; - wire [`NUM_THREADS-1:0][1:0][`TEX_DIM_BITS-1:0] sel_dims; + wire [`NUM_THREADS-1:0][`TEX_LOD_BITS-1:0] mip_level; + wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; + wire [`NUM_THREADS-1:0][1:0][`TEX_LOD_BITS-1:0] sel_logdims; for (genvar i = 0; i < `NUM_THREADS; ++i) begin wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0]; - wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS]; - assign sel_mipoff[i] = tex_mipoff[unit][mip_level]; - assign sel_dims[i] = tex_dims[unit][mip_level]; + assign mip_level[i] = tex_req_if.lod[i][`TEX_LOD_BITS-1:0]; + assign sel_mipoff[i] = tex_mipoff[unit][mip_level[i]]; + assign 
sel_logdims[i][0] = tex_logdims[unit][0]; + assign sel_logdims[i][1] = tex_logdims[unit][1]; end // address generation @@ -93,15 +97,16 @@ module VX_tex_unit #( wire mem_req_valid; wire [`NUM_THREADS-1:0] mem_req_tmask; wire [`TEX_FILTER_BITS-1:0] mem_req_filter; - wire [`TEX_STRIDE_BITS-1:0] mem_req_stride; - wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] mem_req_blends; + wire [`TEX_LGSTRIDE_BITS-1:0] mem_req_lgstride; + wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] mem_req_blends; wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr; - wire [REQ_INFOW_A-1:0] mem_req_info; + wire [`NUM_THREADS-1:0][31:0] mem_req_baseaddr; + wire [(`TEX_FORMAT_BITS + REQ_INFO_W)-1:0] mem_req_info; wire mem_req_ready; VX_tex_addr #( .CORE_ID (CORE_ID), - .REQ_INFOW (REQ_INFOW_A), + .REQ_INFOW (`TEX_FORMAT_BITS + REQ_INFO_W), .NUM_REQS (`NUM_THREADS) ) tex_addr ( .clk (clk), @@ -113,16 +118,18 @@ module VX_tex_unit #( .req_format (tex_format[tex_req_if.unit]), .req_filter (tex_filter[tex_req_if.unit]), .req_wraps (tex_wraps[tex_req_if.unit]), - .req_baseaddr (tex_baddr[tex_req_if.unit]), + .req_baseaddr(tex_baddr[tex_req_if.unit]), + .mip_level (mip_level), .req_mipoff (sel_mipoff), - .req_logdims (sel_dims), - .req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), + .req_logdims(sel_logdims), + .req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC, tex_req_if.uuid}), .req_ready (tex_req_if.ready), .rsp_valid (mem_req_valid), .rsp_tmask (mem_req_tmask), .rsp_filter (mem_req_filter), - .rsp_stride (mem_req_stride), + .rsp_lgstride(mem_req_lgstride), + .rsp_baseaddr(mem_req_baseaddr), .rsp_addr (mem_req_addr), .rsp_blends (mem_req_blends), .rsp_info (mem_req_info), @@ -134,16 +141,16 @@ module VX_tex_unit #( wire mem_rsp_valid; wire [`NUM_THREADS-1:0] mem_rsp_tmask; wire [`NUM_THREADS-1:0][3:0][31:0] mem_rsp_data; - wire [REQ_INFOW_M-1:0] mem_rsp_info; + wire [(BLEND_FRAC_W + `TEX_FORMAT_BITS 
+ REQ_INFO_W)-1:0] mem_rsp_info; wire mem_rsp_ready; VX_tex_mem #( .CORE_ID (CORE_ID), - .REQ_INFOW (REQ_INFOW_M), + .REQ_INFOW (BLEND_FRAC_W + `TEX_FORMAT_BITS + REQ_INFO_W), .NUM_REQS (`NUM_THREADS) ) tex_mem ( - .clk (clk), - .reset (reset), + .clk (clk), + .reset (reset), // memory interface .dcache_req_if (dcache_req_if), @@ -153,7 +160,8 @@ module VX_tex_unit #( .req_valid (mem_req_valid), .req_tmask (mem_req_tmask), .req_filter(mem_req_filter), - .req_stride(mem_req_stride), + .req_lgstride(mem_req_lgstride), + .req_baseaddr(mem_req_baseaddr), .req_addr (mem_req_addr), .req_info ({mem_req_blends, mem_req_info}), .req_ready (mem_req_ready), @@ -168,15 +176,9 @@ module VX_tex_unit #( // apply sampler - wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends; - wire [`TEX_FORMAT_BITS-1:0] rsp_format; - wire [REQ_INFOW_S-1:0] rsp_info; - - assign {rsp_blends, rsp_format, rsp_info} = mem_rsp_info; - VX_tex_sampler #( .CORE_ID (CORE_ID), - .REQ_INFOW (REQ_INFOW_S), + .REQ_INFOW (REQ_INFO_W), .NUM_REQS (`NUM_THREADS) ) tex_sampler ( .clk (clk), @@ -186,47 +188,77 @@ module VX_tex_unit #( .req_valid (mem_rsp_valid), .req_tmask (mem_rsp_tmask), .req_data (mem_rsp_data), - .req_format (rsp_format), - .req_blends (rsp_blends), - .req_info (rsp_info), + .req_blends (mem_rsp_info[(REQ_INFO_W+`TEX_FORMAT_BITS) +: BLEND_FRAC_W]), + .req_format (mem_rsp_info[REQ_INFO_W +: `TEX_FORMAT_BITS]), + .req_info (mem_rsp_info[0 +: REQ_INFO_W]), .req_ready (mem_rsp_ready), // outputs .rsp_valid (tex_rsp_if.valid), .rsp_tmask (tex_rsp_if.tmask), .rsp_data (tex_rsp_if.data), - .rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}), + .rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.uuid}), .rsp_ready (tex_rsp_if.ready) - ); + ); + +`ifdef PERF_ENABLE + wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_req_per_cycle; + wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_rsp_per_cycle; + + wire [`NUM_THREADS-1:0] perf_mem_req_per_mask = 
dcache_req_if.valid & dcache_req_if.ready; + wire [`NUM_THREADS-1:0] perf_mem_rsp_per_mask = dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_if.valid & dcache_rsp_if.ready}}; + + `POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_per_mask); + `POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_per_mask); + + reg [`PERF_CTR_BITS-1:0] perf_pending_reads; + wire [$clog2(`NUM_THREADS+1)+1-1:0] perf_pending_reads_cycle = perf_mem_req_per_cycle - perf_mem_rsp_per_cycle; + + always @(posedge clk) begin + if (reset) begin + perf_pending_reads <= 0; + end else begin + perf_pending_reads <= perf_pending_reads + `PERF_CTR_BITS'($signed(perf_pending_reads_cycle)); + end + end + + reg [`PERF_CTR_BITS-1:0] perf_mem_reads; + reg [`PERF_CTR_BITS-1:0] perf_mem_latency; + + always @(posedge clk) begin + if (reset) begin + perf_mem_reads <= 0; + perf_mem_latency <= 0; + end else begin + perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(perf_mem_req_per_cycle); + perf_mem_latency <= perf_mem_latency + `PERF_CTR_BITS'(perf_pending_reads); + end + end + + assign perf_tex_if.mem_reads = perf_mem_reads; + assign perf_tex_if.mem_latency = perf_mem_latency; +`endif `ifdef DBG_TRACE_TEX always @(posedge clk) begin + if (tex_csr_if.write_enable) begin + dpi_trace("%d: core%0d-tex-csr: unit=%0d, state=", $time, CORE_ID, csr_tex_unit); + trace_tex_state(tex_csr_if.write_addr); + dpi_trace(", data=%0h (#%0d)\n", tex_csr_if.write_data, tex_csr_if.write_uuid); + end if (tex_req_if.valid && tex_req_if.ready) begin - for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin - if (csrs_dirty[i]) begin - dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", 
$time, CORE_ID, i, tex_filter[i]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_mipoff[0]=%0h\n", $time, CORE_ID, i, tex_mipoff[i][0]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_width[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][0]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_height[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][1]); - end - end - dpi_trace("%d: core%0d-tex-req: wid=%0d, PC=%0h, tmask=%b, unit=%0d, lod=%0h, u=", - $time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod); + $time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod); `TRACE_ARRAY1D(tex_req_if.coords[0], `NUM_THREADS); dpi_trace(", v="); `TRACE_ARRAY1D(tex_req_if.coords[1], `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", tex_req_if.uuid); end if (tex_rsp_if.valid && tex_rsp_if.ready) begin dpi_trace("%d: core%0d-tex-rsp: wid=%0d, PC=%0h, tmask=%b, data=", $time, CORE_ID, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.tmask); `TRACE_ARRAY1D(tex_rsp_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", tex_rsp_if.uuid); end end `endif diff --git a/hw/rtl/tex_unit/VX_tex_wrap.sv b/hw/rtl/tex_unit/VX_tex_wrap.sv index 8cc7b2f5..fe2110ba 100644 --- a/hw/rtl/tex_unit/VX_tex_wrap.sv +++ b/hw/rtl/tex_unit/VX_tex_wrap.sv @@ -4,19 +4,19 @@ module VX_tex_wrap #( parameter CORE_ID = 0 ) ( input wire [`TEX_WRAP_BITS-1:0] wrap_i, - input wire [31:0] coord_i, - output wire [`FIXED_FRAC-1:0] coord_o + input wire [`TEX_FXD_BITS-1:0] coord_i, + output wire [`TEX_FXD_FRAC-1:0] coord_o ); `UNUSED_PARAM (CORE_ID) - reg [`FIXED_FRAC-1:0] coord_r; + reg [`TEX_FXD_FRAC-1:0] coord_r; - wire [`FIXED_FRAC-1:0] clamp; + wire [`TEX_FXD_FRAC-1:0] clamp; VX_tex_sat #( - .IN_W (32), - .OUT_W (`FIXED_FRAC) + .IN_W (`TEX_FXD_BITS), + .OUT_W (`TEX_FXD_FRAC) ) sat_fx ( .data_in (coord_i), .data_out (clamp) @@ -27,9 +27,9 @@ module VX_tex_wrap #( `TEX_WRAP_CLAMP: coord_r = clamp; `TEX_WRAP_MIRROR: - coord_r = 
coord_i[`FIXED_FRAC-1:0] ^ {`FIXED_FRAC{coord_i[`FIXED_FRAC]}}; + coord_r = coord_i[`TEX_FXD_FRAC-1:0] ^ {`TEX_FXD_FRAC{coord_i[`TEX_FXD_FRAC]}}; default: //`TEX_WRAP_REPEAT - coord_r = coord_i[`FIXED_FRAC-1:0]; + coord_r = coord_i[`TEX_FXD_FRAC-1:0]; endcase end diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json index 2c9f8355..d6cfd609 100644 --- a/hw/scripts/scope.json +++ b/hw/scripts/scope.json @@ -123,9 +123,9 @@ "!cci_pending_writes_full": 1, "?afu_mem_req_fire": 1, "afu_mem_req_addr": 26, - "afu_mem_req_tag": 27, + "afu_mem_req_tag": "`VX_MEM_TAG_WIDTH+1", "?afu_mem_rsp_fire": 1, - "afu_mem_rsp_tag": 27 + "afu_mem_rsp_tag": "`VX_MEM_TAG_WIDTH+1" }, "afu/vortex": { "!reset": 1, @@ -140,49 +140,29 @@ "mem_rsp_tag":"`VX_MEM_TAG_WIDTH", "busy": 1 }, - "afu/vortex/cluster/core/pipeline/fetch/icache_stage": { - "?icache_req_fire": 1, - "icache_req_wid":"`NW_BITS", - "icache_req_addr": 32, - "icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS", - "?icache_rsp_fire": 1, - "icache_rsp_data": 32, - "icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS" - }, "afu/vortex/cluster/core/pipeline/fetch/warp_sched": { "?wsched_scheduled": 1, + "wsched_schedule_uuid": "`UUID_BITS", "wsched_active_warps": "`NUM_WARPS", "wsched_stalled_warps": "`NUM_WARPS", "wsched_schedule_tmask": "`NUM_THREADS", "wsched_schedule_wid": "`NW_BITS", - "wsched_schedule_pc": "32" + "wsched_schedule_pc": 32 }, - "afu/vortex/cluster/core/pipeline/execute/gpu_unit": { - "?gpu_rsp_valid": 1, - "gpu_rsp_wid": "`NW_BITS", - "gpu_rsp_tmc": 1, - "gpu_rsp_wspawn": 1, - "gpu_rsp_split": 1, - "gpu_rsp_barrier": 1 - }, - "afu/vortex/cluster/core/pipeline/execute/lsu_unit": { - "?dcache_req_fire":"`NUM_THREADS", - "dcache_req_wid":"`NW_BITS", - "dcache_req_pc": 32, - "dcache_req_addr":"`NUM_THREADS * 32", - "dcache_req_rw": 1, - "dcache_req_byteen":"`NUM_THREADS * 4", - "dcache_req_data": "`NUM_THREADS * 32", - "dcache_req_tag":"`LSUQ_ADDR_BITS", - "?dcache_rsp_fire":"`NUM_THREADS", - "dcache_rsp_data":"`NUM_THREADS * 
32", - "dcache_rsp_tag":"`LSUQ_ADDR_BITS" + "afu/vortex/cluster/core/pipeline/fetch/icache_stage": { + "?icache_req_fire": 1, + "icache_req_uuid": "`UUID_BITS", + "icache_req_addr": 32, + "icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS", + "?icache_rsp_fire": 1, + "icache_rsp_uuid": "`UUID_BITS", + "icache_rsp_data": 32, + "icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS" }, "afu/vortex/cluster/core/pipeline/issue": { "?issue_fire": 1, - "issue_wid":"`NW_BITS", - "issue_tmask":"`NUM_THREADS", - "issue_pc": 32, + "issue_uuid": "`UUID_BITS", + "issue_tmask":"`NUM_THREADS", "issue_ex_type":"`EX_BITS", "issue_op_type":"`INST_OP_BITS", "issue_op_mod":"`INST_MOD_BITS", @@ -198,15 +178,35 @@ "gpr_rs2":"`NUM_THREADS * 32", "gpr_rs3":"`NUM_THREADS * 32", "?writeback_valid": 1, - "writeback_wid":"`NW_BITS", - "writeback_pc": 32, + "writeback_uuid": "`UUID_BITS", "writeback_tmask":"`NUM_THREADS", "writeback_rd":"`NR_BITS", "writeback_data":"`NUM_THREADS * 32", "writeback_eop": 1, "!scoreboard_delay": 1, "!dispatch_delay": 1 - }, + }, + "afu/vortex/cluster/core/pipeline/execute/lsu_unit": { + "?dcache_req_fire":"`NUM_THREADS", + "dcache_req_uuid": "`UUID_BITS", + "dcache_req_addr":"`NUM_THREADS * 32", + "dcache_req_rw": 1, + "dcache_req_byteen":"`NUM_THREADS * 4", + "dcache_req_data":"`NUM_THREADS * 32", + "dcache_req_tag":"`LSUQ_ADDR_BITS", + "?dcache_rsp_fire":"`NUM_THREADS", + "dcache_rsp_uuid": "`UUID_BITS", + "dcache_rsp_data":"`NUM_THREADS * 32", + "dcache_rsp_tag":"`LSUQ_ADDR_BITS" + }, + "afu/vortex/cluster/core/pipeline/execute/gpu_unit": { + "?gpu_rsp_valid": 1, + "gpu_rsp_uuid": "`UUID_BITS", + "gpu_rsp_tmc": 1, + "gpu_rsp_wspawn": 1, + "gpu_rsp_split": 1, + "gpu_rsp_barrier": 1 + }, "afu/vortex/l3cache/bank, afu/vortex/cluster/l2cache/bank, afu/vortex/cluster/core/mem_unit/dcache/bank, afu/vortex/cluster/core/mem_unit/icache/bank": { "?valid_st0": 1, "?valid_st1": 1, diff --git a/hw/syn/opae/Makefile b/hw/syn/opae/Makefile index 010baea3..29b6a922 100644 --- 
a/hw/syn/opae/Makefile +++ b/hw/syn/opae/Makefile @@ -23,7 +23,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_FLAGS += $(DBG_TRACE_FLAGS) -DBG_FLAGS += -DDBG_CACHE_REQ_INFO CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) CONFIG2 := -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) diff --git a/hw/syn/quartus/Makefile b/hw/syn/quartus/Makefile index 662848e1..1dd63335 100644 --- a/hw/syn/quartus/Makefile +++ b/hw/syn/quartus/Makefile @@ -1,6 +1,6 @@ BUILD_DIR ?= build -.PHONY: dogfood unittest pipeline smem cache fpu_core core vortex top1 top2 top4 top8 top16 top32 top64 +.PHONY: dogfood unittest pipeline smem cache fpu_core core vortex top1 top2 top4 top8 top16 top32 top64 texunit dogfood: mkdir -p dogfood/$(BUILD_DIR) @@ -75,4 +75,9 @@ top32: top64: mkdir -p top64/$(BUILD_DIR) cp top64/Makefile top64/$(BUILD_DIR) - $(MAKE) -C top64/$(BUILD_DIR) clean && $(MAKE) -C top64/$(BUILD_DIR) > top64/$(BUILD_DIR)/build.log 2>&1 & \ No newline at end of file + $(MAKE) -C top64/$(BUILD_DIR) clean && $(MAKE) -C top64/$(BUILD_DIR) > top64/$(BUILD_DIR)/build.log 2>&1 & + +texunit: + mkdir -p texunit/$(BUILD_DIR) + cp texunit/Makefile texunit/$(BUILD_DIR) + $(MAKE) -C texunit/$(BUILD_DIR) clean && $(MAKE) -C texunit/$(BUILD_DIR) > texunit/$(BUILD_DIR)/build.log 2>&1 & \ No newline at end of file diff --git a/hw/syn/quartus/core/Makefile b/hw/syn/quartus/core/Makefile index b976110c..d209c80d 100644 --- a/hw/syn/quartus/core/Makefile +++ b/hw/syn/quartus/core/Makefile @@ -2,6 +2,7 @@ PROJECT = Core TOP_LEVEL_ENTITY = VX_core SRC_FILE = VX_core.v RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/fpu_core/Makefile b/hw/syn/quartus/fpu_core/Makefile index 291d8124..26ca51ac 100644 --- a/hw/syn/quartus/fpu_core/Makefile +++ b/hw/syn/quartus/fpu_core/Makefile @@ -2,6 +2,7 @@ PROJECT = VX_fpu_fpga TOP_LEVEL_ENTITY = VX_fpu_fpga SRC_FILE = VX_fpu_fpga.v RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/pipeline/Makefile b/hw/syn/quartus/pipeline/Makefile index e4cad107..665f7829 100644 --- a/hw/syn/quartus/pipeline/Makefile +++ b/hw/syn/quartus/pipeline/Makefile @@ -2,6 +2,7 @@ PROJECT = VX_pipeline TOP_LEVEL_ENTITY = VX_pipeline SRC_FILE = 
VX_pipeline.v RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/texunit/Makefile b/hw/syn/quartus/texunit/Makefile new file mode 100644 index 00000000..3ecfa892 --- /dev/null +++ b/hw/syn/quartus/texunit/Makefile @@ -0,0 +1,81 @@ +PROJECT = Core +TOP_LEVEL_ENTITY = VX_core +SRC_FILE = VX_core.v +RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party + +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 + +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) + +PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf + +# Executable Configuration +SYN_ARGS = --parallel --read_settings_files=on +FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on +ASM_ARGS = +STA_ARGS 
= --parallel --do_report_timing + +# Build targets +all: $(PROJECT).sta.rpt + +syn: $(PROJECT).syn.rpt + +fit: $(PROJECT).fit.rpt + +asm: $(PROJECT).asm.rpt + +sta: $(PROJECT).sta.rpt + +smart: smart.log + +# Target implementations +STAMP = echo done > + +$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES) + quartus_syn $(PROJECT) $(SYN_ARGS) + $(STAMP) fit.chg + +$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt + quartus_fit $(PROJECT) $(FIT_ARGS) + $(STAMP) asm.chg + $(STAMP) sta.chg + +$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt + quartus_asm $(PROJECT) $(ASM_ARGS) + +$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt + quartus_sta $(PROJECT) $(STA_ARGS) + +smart.log: $(PROJECT_FILES) + quartus_sh --determine_smart_action $(PROJECT) > smart.log + +# Project initialization +$(PROJECT_FILES): + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "EXT_TEX_ENABLE=1" + +syn.chg: + $(STAMP) syn.chg + +fit.chg: + $(STAMP) fit.chg + +sta.chg: + $(STAMP) sta.chg + +asm.chg: + $(STAMP) asm.chg + +program: $(PROJECT).sof + quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof" + +clean: + rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox diff --git a/hw/syn/quartus/top1/Makefile b/hw/syn/quartus/top1/Makefile index 374f84e1..9494b2d3 100644 --- a/hw/syn/quartus/top1/Makefile +++ b/hw/syn/quartus/top1/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top16/Makefile b/hw/syn/quartus/top16/Makefile index 78f4df68..836e3558 100644 --- a/hw/syn/quartus/top16/Makefile +++ b/hw/syn/quartus/top16/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top2/Makefile b/hw/syn/quartus/top2/Makefile index f8801373..d4c6abbc 100644 --- a/hw/syn/quartus/top2/Makefile +++ b/hw/syn/quartus/top2/Makefile @@ -2,6 +2,7 @@ PROJECT = 
vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top32/Makefile b/hw/syn/quartus/top32/Makefile index cea702f5..d07a515c 100644 --- a/hw/syn/quartus/top32/Makefile +++ b/hw/syn/quartus/top32/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit 
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top4/Makefile b/hw/syn/quartus/top4/Makefile index bfe734a7..af33661c 100644 --- a/hw/syn/quartus/top4/Makefile +++ b/hw/syn/quartus/top4/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top64/Makefile b/hw/syn/quartus/top64/Makefile index 604f794f..1d60b214 100644 --- a/hw/syn/quartus/top64/Makefile +++ b/hw/syn/quartus/top64/Makefile @@ -1,7 +1,8 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv -RTL_DIR=../../../../rtl +RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party #FAMILY = "Arria 10" #DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FAMILY = "Stratix 10" DEVICE = 1SX280HN2F43E2VG FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top8/Makefile b/hw/syn/quartus/top8/Makefile index 0614e0d5..b2efcc6d 100644 --- a/hw/syn/quartus/top8/Makefile +++ b/hw/syn/quartus/top8/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/unittest/Makefile b/hw/syn/quartus/unittest/Makefile index 3b1bc6da..975ec0a1 100644 --- a/hw/syn/quartus/unittest/Makefile +++ b/hw/syn/quartus/unittest/Makefile @@ -2,6 +2,7 
@@ PROJECT = Unittest TOP_LEVEL_ENTITY = VX_core_req_bank_sel SRC_FILE = VX_core_req_bank_sel.v RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/vortex/Makefile b/hw/syn/quartus/vortex/Makefile index 6874cce3..b2046cf8 100644 --- a/hw/syn/quartus/vortex/Makefile +++ b/hw/syn/quartus/vortex/Makefile @@ -2,6 +2,7 @@ PROJECT = Vortex TOP_LEVEL_ENTITY = Vortex SRC_FILE = Vortex.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit 
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/miscs/patch/ramulator.patch b/miscs/patch/ramulator.patch new file mode 100644 index 00000000..e24b5d23 --- /dev/null +++ b/miscs/patch/ramulator.patch @@ -0,0 +1,46 @@ +diff --git a/Makefile b/Makefile +index ea340c8..d2aac5b 100644 +--- a/Makefile ++++ b/Makefile +@@ -7,16 +7,16 @@ OBJS := $(patsubst $(SRCDIR)/%.cpp, $(OBJDIR)/%.o, $(SRCS)) + + # Ramulator currently supports g++ 5.1+ or clang++ 3.4+. It will NOT work with + # g++ 4.x due to an internal compiler error when processing lambda functions. +-CXX := clang++ ++#CXX := clang++ + # CXX := g++-5 +-CXXFLAGS := -O3 -std=c++11 -g -Wall ++CXXFLAGS := -std=c++11 -O3 -g -Wall -fPIC + + .PHONY: all clean depend + + all: depend ramulator + + clean: +- rm -f ramulator ++ rm -f ramulator libramulator.a + rm -rf $(OBJDIR) + + depend: $(OBJDIR)/.depend +@@ -36,7 +36,7 @@ ramulator: $(MAIN) $(OBJS) $(SRCDIR)/*.h | depend + $(CXX) $(CXXFLAGS) -DRAMULATOR -o $@ $(MAIN) $(OBJS) + + libramulator.a: $(OBJS) $(OBJDIR)/Gem5Wrapper.o +- libtool -static -o $@ $(OBJS) $(OBJDIR)/Gem5Wrapper.o ++ $(AR) rcs $@ $^ + + $(OBJS): | $(OBJDIR) + +diff --git a/src/Request.h b/src/Request.h +index 57abd0d..a5ce061 100644 +--- a/src/Request.h ++++ b/src/Request.h +@@ -36,7 +36,7 @@ public: + + Request(long addr, Type type, int coreid = 0) + : is_first_command(true), addr(addr), coreid(coreid), type(type), +- callback([](Request& req){}) {} ++ callback([](Request&){}) {} + + Request(long addr, Type type, function<void(Request&)> callback, int coreid = 0) + : is_first_command(true), addr(addr), coreid(coreid), type(type), callback(callback) {} diff --git a/miscs/rvvector/basic/Makefile b/miscs/rvvector/basic/Makefile index 5a796fe7..66aece0c 100644 --- a/miscs/rvvector/basic/Makefile +++ b/miscs/rvvector/basic/Makefile @@ -36,6 +36,6 @@ ELF: $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) 
$(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf run: - ../../simX/obj_dir/Vcache_simX -E -a rv32i --core vx_vector_main.hex -s -b 1> emulator.debug + ../../simx/obj_dir/Vcache_simX -E -a rv32i --core vx_vector_main.hex -s -b 1> emulator.debug diff --git a/runtime/Makefile b/runtime/Makefile index 60c3b398..d72eb665 100644 --- a/runtime/Makefile +++ b/runtime/Makefile @@ -10,7 +10,7 @@ CFLAGS += -I./include -I../hw PROJECT = libvortexrt -SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c +SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c OBJS := $(addsuffix .o, $(notdir $(SRCS))) @@ -26,7 +26,7 @@ $(PROJECT).dump: $(PROJECT).a $(CC) $(CFLAGS) -c $< -o $@ $(PROJECT).a: $(OBJS) - $(AR) rcs $(PROJECT).a $^ + $(AR) rcs $@ $^ .depend: $(SRCS) $(CC) $(CFLAGS) -MM $^ > .depend; diff --git a/runtime/include/vx_intrinsics.h b/runtime/include/vx_intrinsics.h index 9c3149d7..f3562872 100644 --- a/runtime/include/vx_intrinsics.h +++ b/runtime/include/vx_intrinsics.h @@ -5,115 +5,85 @@ #ifdef __cplusplus extern "C" { - #endif + #ifdef __ASSEMBLY__ #define __ASM_STR(x) x #else #define __ASM_STR(x) #x #endif -#define vx_csr_swap(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_read(csr) ({ \ - register unsigned __v; \ - __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \ - __v; \ -}) - -#define vx_csr_write(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -#define vx_csr_read_set(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define 
vx_csr_set(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -#define vx_csr_read_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -// Texture load -#define vx_tex(unit, u, v, l) ({ \ - unsigned __r; \ - unsigned __u = u; \ - unsigned __v = v; \ - unsigned __l = l; \ - __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ +#define csr_read(csr) ({ \ + unsigned __r; \ + __asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \ __r; \ }) -#ifdef __ASSEMBLY__ -#define __ASM_STR(x) x -#else -#define __ASM_STR(x) #x -#endif - -#define vx_csr_swap(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ +#define csr_write(csr, val) ({ \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \ }) -#define vx_csr_read(csr) ({ \ - register unsigned __v; \ - __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \ - __v; \ +#define csr_swap(csr, val) ({ \ + unsigned __r; \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ + __r; \ }) -#define vx_csr_write(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrw " 
__ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ +#define csr_read_set(csr, val) ({ \ + unsigned __r; \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ + __r; \ }) -#define vx_csr_read_set(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ +#define csr_set(csr, val) ({ \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \ }) -#define vx_csr_set(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ +#define csr_read_clear(csr, val) ({ \ + unsigned __r; \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ + __r; \ }) -#define vx_csr_read_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ +#define csr_clear(csr, val) ({ \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \ }) // Texture load -#define vx_tex(unit, u, v, l) ({ \ +#define vx_tex(unit, u, v, lod) 
({ \ + unsigned __r; \ + __asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(unit), "r"(u), "r"(v), "r"(lod)); \ + __r; \ +}) + +// Conditional move +#define vx_cmov(c, t, f) ({ \ unsigned __r; \ - unsigned __u = u; \ - unsigned __v = v; \ - unsigned __l = l; \ - __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ + __asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r) : "r"(c), "r"(t), "r"(f)); \ __r; \ }) @@ -151,7 +121,7 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) { // Prefetch inline void vx_prefetch(unsigned addr) { - asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) ); + asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) ); } // Return active warp's thread id diff --git a/runtime/src/tinyprintf.c b/runtime/src/tinyprintf.c new file mode 100644 index 00000000..4c88ef29 --- /dev/null +++ b/runtime/src/tinyprintf.c @@ -0,0 +1,890 @@ +/////////////////////////////////////////////////////////////////////////////// +// \author (c) Marco Paland (info@paland.com) +// 2014-2019, PALANDesign Hannover, Germany +// +// \license The MIT License (MIT) +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on +// embedded systems with a very limited resources. These routines are thread +// safe and reentrant! +// Use this instead of the bloated standard/newlib printf cause these use +// malloc for printf (and may not be thread safe). +// +/////////////////////////////////////////////////////////////////////////////// + +#include <stdbool.h> +#include <stdint.h> +#include "tinyprintf.h" +#include "vx_print.h" + + +// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...)
to include the +// printf_config.h header file +// default: undefined +#ifdef PRINTF_INCLUDE_CONFIG_H +#include "printf_config.h" +#endif + + +// 'ntoa' conversion buffer size, this must be big enough to hold one converted +// numeric number including padded zeros (dynamically created on stack) +// default: 32 byte +#ifndef PRINTF_NTOA_BUFFER_SIZE +#define PRINTF_NTOA_BUFFER_SIZE 32U +#endif + +// 'ftoa' conversion buffer size, this must be big enough to hold one converted +// float number including padded zeros (dynamically created on stack) +// default: 32 byte +#ifndef PRINTF_FTOA_BUFFER_SIZE +#define PRINTF_FTOA_BUFFER_SIZE 32U +#endif + +// support for the floating point type (%f) +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_FLOAT +#define PRINTF_SUPPORT_FLOAT +#endif + +// support for exponential floating point notation (%e/%g) +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL +#define PRINTF_SUPPORT_EXPONENTIAL +#endif + +// define the default floating point precision +// default: 6 digits +#ifndef PRINTF_DEFAULT_FLOAT_PRECISION +#define PRINTF_DEFAULT_FLOAT_PRECISION 6U +#endif + +// define the largest float suitable to print with %f +// default: 1e9 +#ifndef PRINTF_MAX_FLOAT +#define PRINTF_MAX_FLOAT 1e9 +#endif + +// support for the long long types (%llu or %p) +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG +#define PRINTF_SUPPORT_LONG_LONG +#endif + +// support for the ptrdiff_t type (%t) +// ptrdiff_t is normally defined in as long or long long type +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T +#define PRINTF_SUPPORT_PTRDIFF_T +#endif + +/////////////////////////////////////////////////////////////////////////////// + +// internal flag definitions +#define FLAGS_ZEROPAD (1U << 0U) +#define FLAGS_LEFT (1U << 1U) +#define FLAGS_PLUS (1U << 2U) +#define FLAGS_SPACE (1U << 3U) +#define FLAGS_HASH (1U << 4U) +#define FLAGS_UPPERCASE (1U << 5U) +#define FLAGS_CHAR (1U << 6U) +#define FLAGS_SHORT 
(1U << 7U) +#define FLAGS_LONG (1U << 8U) +#define FLAGS_LONG_LONG (1U << 9U) +#define FLAGS_PRECISION (1U << 10U) +#define FLAGS_ADAPT_EXP (1U << 11U) + + +// import float.h for DBL_MAX +#if defined(PRINTF_SUPPORT_FLOAT) +#include <float.h> +#endif + + +// output function type +typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen); + + +// wrapper (used as buffer) for output function type +typedef struct { + void (*fct)(char character, void* arg); + void* arg; +} out_fct_wrap_type; + + +// internal buffer output +static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen) +{ + if (idx < maxlen) { + ((char*)buffer)[idx] = character; + } +} + + +// internal null output +static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen) +{ + (void)character; (void)buffer; (void)idx; (void)maxlen; +} + + +// internal _putchar wrapper +static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen) +{ + (void)buffer; (void)idx; (void)maxlen; + if (character) { + vx_putchar(character); + } +} + + +// internal output function wrapper +static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen) +{ + (void)idx; (void)maxlen; + if (character) { + // buffer is the output fct pointer + ((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg); + } +} + + +// internal secure strlen +// \return The length of the string (excluding the terminating 0) limited by 'maxsize' +static inline unsigned int _strnlen_s(const char* str, size_t maxsize) +{ + const char* s; + for (s = str; *s && maxsize--; ++s); + return (unsigned int)(s - str); +} + + +// internal test if char is a digit (0-9) +// \return true if char is a digit +static inline bool _is_digit(char ch) +{ + return (ch >= '0') && (ch <= '9'); +} + + +// internal ASCII string to unsigned int conversion +static unsigned int _atoi(const char** str) +{ + unsigned int i = 0U; + while
(_is_digit(**str)) { + i = i * 10U + (unsigned int)(*((*str)++) - '0'); + } + return i; +} + + +// output the specified string in reverse, taking care of any zero-padding +static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags) +{ + const size_t start_idx = idx; + + // pad spaces up to given width + if (!(flags & FLAGS_LEFT) && !(flags & FLAGS_ZEROPAD)) { + for (size_t i = len; i < width; i++) { + out(' ', buffer, idx++, maxlen); + } + } + + // reverse string + while (len) { + out(buf[--len], buffer, idx++, maxlen); + } + + // append pad spaces up to given width + if (flags & FLAGS_LEFT) { + while (idx - start_idx < width) { + out(' ', buffer, idx++, maxlen); + } + } + + return idx; +} + + +// internal itoa format +static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags) +{ + // pad leading zeros + if (!(flags & FLAGS_LEFT)) { + if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) { + width--; + } + while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = '0'; + } + while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = '0'; + } + } + + // handle hash + if (flags & FLAGS_HASH) { + if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) { + len--; + if (len && (base == 16U)) { + len--; + } + } + if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = 'x'; + } + else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = 'X'; + } + else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = 'b'; + } + if (len < PRINTF_NTOA_BUFFER_SIZE) { + buf[len++] = '0'; + } + } + + if (len < PRINTF_NTOA_BUFFER_SIZE) { + if 
(negative) { + buf[len++] = '-'; + } + else if (flags & FLAGS_PLUS) { + buf[len++] = '+'; // ignore the space if the '+' exists + } + else if (flags & FLAGS_SPACE) { + buf[len++] = ' '; + } + } + + return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags); +} + + +// internal itoa for 'long' type +static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags) +{ + char buf[PRINTF_NTOA_BUFFER_SIZE]; + size_t len = 0U; + + // no hash for 0 values + if (!value) { + flags &= ~FLAGS_HASH; + } + + // write if precision != 0 and value is != 0 + if (!(flags & FLAGS_PRECISION) || value) { + do { + const char digit = (char)(value % base); + buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10; + value /= base; + } while (value && (len < PRINTF_NTOA_BUFFER_SIZE)); + } + + return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags); +} + + +// internal itoa for 'long long' type +#if defined(PRINTF_SUPPORT_LONG_LONG) +static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags) +{ + char buf[PRINTF_NTOA_BUFFER_SIZE]; + size_t len = 0U; + + // no hash for 0 values + if (!value) { + flags &= ~FLAGS_HASH; + } + + // write if precision != 0 and value is != 0 + if (!(flags & FLAGS_PRECISION) || value) { + do { + const char digit = (char)(value % base); + buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 
'A' : 'a') + digit - 10; + value /= base; + } while (value && (len < PRINTF_NTOA_BUFFER_SIZE)); + } + + return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags); +} +#endif // PRINTF_SUPPORT_LONG_LONG + + +#if defined(PRINTF_SUPPORT_FLOAT) + +#if defined(PRINTF_SUPPORT_EXPONENTIAL) +// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT +static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags); +#endif + + +// internal ftoa for fixed decimal floating point +static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags) +{ + char buf[PRINTF_FTOA_BUFFER_SIZE]; + size_t len = 0U; + double diff = 0.0; + + // powers of 10 + static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 }; + + // test for special values + if (value != value) + return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags); + if (value < -DBL_MAX) + return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags); + if (value > DBL_MAX) + return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 
4U : 3U, width, flags); + + // test for very large values + // standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad + if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) { +#if defined(PRINTF_SUPPORT_EXPONENTIAL) + return _etoa(out, buffer, idx, maxlen, value, prec, width, flags); +#else + return 0U; +#endif + } + + // test for negative + bool negative = false; + if (value < 0) { + negative = true; + value = 0 - value; + } + + // set default precision, if not set explicitly + if (!(flags & FLAGS_PRECISION)) { + prec = PRINTF_DEFAULT_FLOAT_PRECISION; + } + // limit precision to 9, cause a prec >= 10 can lead to overflow errors + while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) { + buf[len++] = '0'; + prec--; + } + + int whole = (int)value; + double tmp = (value - whole) * pow10[prec]; + unsigned long frac = (unsigned long)tmp; + diff = tmp - frac; + + if (diff > 0.5) { + ++frac; + // handle rollover, e.g. 
case 0.99 with prec 1 is 1.0 + if (frac >= pow10[prec]) { + frac = 0; + ++whole; + } + } + else if (diff < 0.5) { + } + else if ((frac == 0U) || (frac & 1U)) { + // if halfway, round up if odd OR if last digit is 0 + ++frac; + } + + if (prec == 0U) { + diff = value - (double)whole; + if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) { + // exactly 0.5 and ODD, then round up + // 1.5 -> 2, but 2.5 -> 2 + ++whole; + } + } + else { + unsigned int count = prec; + // now do fractional part, as an unsigned number + while (len < PRINTF_FTOA_BUFFER_SIZE) { + --count; + buf[len++] = (char)(48U + (frac % 10U)); + if (!(frac /= 10U)) { + break; + } + } + // add extra 0s + while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) { + buf[len++] = '0'; + } + if (len < PRINTF_FTOA_BUFFER_SIZE) { + // add decimal + buf[len++] = '.'; + } + } + + // do whole part, number is reversed + while (len < PRINTF_FTOA_BUFFER_SIZE) { + buf[len++] = (char)(48 + (whole % 10)); + if (!(whole /= 10)) { + break; + } + } + + // pad leading zeros + if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) { + if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) { + width--; + } + while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) { + buf[len++] = '0'; + } + } + + if (len < PRINTF_FTOA_BUFFER_SIZE) { + if (negative) { + buf[len++] = '-'; + } + else if (flags & FLAGS_PLUS) { + buf[len++] = '+'; // ignore the space if the '+' exists + } + else if (flags & FLAGS_SPACE) { + buf[len++] = ' '; + } + } + + return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags); +} + + +#if defined(PRINTF_SUPPORT_EXPONENTIAL) +// internal ftoa variant for exponential floating-point type, contributed by Martijn Jasperse +static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags) +{ + // check for NaN and special values + if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) { + return _ftoa(out, 
buffer, idx, maxlen, value, prec, width, flags); + } + + // determine the sign + const bool negative = value < 0; + if (negative) { + value = -value; + } + + // default precision + if (!(flags & FLAGS_PRECISION)) { + prec = PRINTF_DEFAULT_FLOAT_PRECISION; + } + + // determine the decimal exponent + // based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c) + union { + uint64_t U; + double F; + } conv; + + conv.F = value; + int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023; // effectively log2 + conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U); // drop the exponent so conv.F is now in [1,2) + // now approximate log10 from the log2 integer part and an expansion of ln around 1.5 + int expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168); + // now we want to compute 10^expval but we want to be sure it won't overflow + exp2 = (int)(expval * 3.321928094887362 + 0.5); + const double z = expval * 2.302585092994046 - exp2 * 0.6931471805599453; + const double z2 = z * z; + conv.U = (uint64_t)(exp2 + 1023) << 52U; + // compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex + conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14))))); + // correct for rounding errors + if (value < conv.F) { + expval--; + conv.F /= 10; + } + + // the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters + unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U; + + // in "%g" mode, "prec" is the number of *significant figures* not decimals + if (flags & FLAGS_ADAPT_EXP) { + // do we want to fall-back to "%f" mode? 
+ if ((value >= 1e-4) && (value < 1e6)) { + if ((int)prec > expval) { + prec = (unsigned)((int)prec - expval - 1); + } + else { + prec = 0; + } + flags |= FLAGS_PRECISION; // make sure _ftoa respects precision + // no characters in exponent + minwidth = 0U; + expval = 0; + } + else { + // we use one sigfig for the whole part + if ((prec > 0) && (flags & FLAGS_PRECISION)) { + --prec; + } + } + } + + // will everything fit? + unsigned int fwidth = width; + if (width > minwidth) { + // we didn't fall-back so subtract the characters required for the exponent + fwidth -= minwidth; + } else { + // not enough characters, so go back to default sizing + fwidth = 0U; + } + if ((flags & FLAGS_LEFT) && minwidth) { + // if we're padding on the right, DON'T pad the floating part + fwidth = 0U; + } + + // rescale the float value + if (expval) { + value /= conv.F; + } + + // output the floating part + const size_t start_idx = idx; + idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP); + + // output the exponent part + if (minwidth) { + // output the exponential symbol + out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen); + // output the exponent value + idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS); + // might need to right-pad spaces + if (flags & FLAGS_LEFT) { + while (idx - start_idx < width) out(' ', buffer, idx++, maxlen); + } + } + return idx; +} +#endif // PRINTF_SUPPORT_EXPONENTIAL +#endif // PRINTF_SUPPORT_FLOAT + + +// internal vsnprintf +static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va) { + unsigned int flags, width, precision, n; + size_t idx = 0U; + + if (!buffer) { + // use null output function + out = _out_null; + } + + while (*format) + { + // format specifier? 
%[flags][width][.precision][length] + if (*format != '%') { + // no + out(*format, buffer, idx++, maxlen); + format++; + continue; + } + else { + // yes, evaluate it + format++; + } + + // evaluate flags + flags = 0U; + do { + switch (*format) { + case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break; + case '-': flags |= FLAGS_LEFT; format++; n = 1U; break; + case '+': flags |= FLAGS_PLUS; format++; n = 1U; break; + case ' ': flags |= FLAGS_SPACE; format++; n = 1U; break; + case '#': flags |= FLAGS_HASH; format++; n = 1U; break; + default : n = 0U; break; + } + } while (n); + + // evaluate width field + width = 0U; + if (_is_digit(*format)) { + width = _atoi(&format); + } + else if (*format == '*') { + const int w = va_arg(va, int); + if (w < 0) { + flags |= FLAGS_LEFT; // reverse padding + width = (unsigned int)-w; + } + else { + width = (unsigned int)w; + } + format++; + } + + // evaluate precision field + precision = 0U; + if (*format == '.') { + flags |= FLAGS_PRECISION; + format++; + if (_is_digit(*format)) { + precision = _atoi(&format); + } + else if (*format == '*') { + const int prec = (int)va_arg(va, int); + precision = prec > 0 ? (unsigned int)prec : 0U; + format++; + } + } + + // evaluate length field + switch (*format) { + case 'l' : + flags |= FLAGS_LONG; + format++; + if (*format == 'l') { + flags |= FLAGS_LONG_LONG; + format++; + } + break; + case 'h' : + flags |= FLAGS_SHORT; + format++; + if (*format == 'h') { + flags |= FLAGS_CHAR; + format++; + } + break; +#if defined(PRINTF_SUPPORT_PTRDIFF_T) + case 't' : + flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG); + format++; + break; +#endif + case 'j' : + flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG); + format++; + break; + case 'z' : + flags |= (sizeof(size_t) == sizeof(long) ? 
FLAGS_LONG : FLAGS_LONG_LONG); + format++; + break; + default : + break; + } + + // evaluate specifier + switch (*format) { + case 'd' : + case 'i' : + case 'u' : + case 'x' : + case 'X' : + case 'o' : + case 'b' : { + // set the base + unsigned int base; + if (*format == 'x' || *format == 'X') { + base = 16U; + } + else if (*format == 'o') { + base = 8U; + } + else if (*format == 'b') { + base = 2U; + } + else { + base = 10U; + flags &= ~FLAGS_HASH; // no hash for dec format + } + // uppercase + if (*format == 'X') { + flags |= FLAGS_UPPERCASE; + } + + // no plus or space flag for u, x, X, o, b + if ((*format != 'i') && (*format != 'd')) { + flags &= ~(FLAGS_PLUS | FLAGS_SPACE); + } + + // ignore '0' flag when precision is given + if (flags & FLAGS_PRECISION) { + flags &= ~FLAGS_ZEROPAD; + } + + // convert the integer + if ((*format == 'i') || (*format == 'd')) { + // signed + if (flags & FLAGS_LONG_LONG) { +#if defined(PRINTF_SUPPORT_LONG_LONG) + const long long value = va_arg(va, long long); + idx = _ntoa_long_long(out, buffer, idx, maxlen, (unsigned long long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags); +#endif + } + else if (flags & FLAGS_LONG) { + const long value = va_arg(va, long); + idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags); + } + else { + const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int); + idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned int)(value > 0 ? 
value : 0 - value), value < 0, base, precision, width, flags); + } + } + else { + // unsigned + if (flags & FLAGS_LONG_LONG) { +#if defined(PRINTF_SUPPORT_LONG_LONG) + idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags); +#endif + } + else if (flags & FLAGS_LONG) { + idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags); + } + else { + const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int); + idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags); + } + } + format++; + break; + } +#if defined(PRINTF_SUPPORT_FLOAT) + case 'f' : + case 'F' : + if (*format == 'F') flags |= FLAGS_UPPERCASE; + idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags); + format++; + break; +#if defined(PRINTF_SUPPORT_EXPONENTIAL) + case 'e': + case 'E': + case 'g': + case 'G': + if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP; + if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE; + idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags); + format++; + break; +#endif // PRINTF_SUPPORT_EXPONENTIAL +#endif // PRINTF_SUPPORT_FLOAT + case 'c' : { + unsigned int l = 1U; + // pre padding + if (!(flags & FLAGS_LEFT)) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + // char output + out((char)va_arg(va, int), buffer, idx++, maxlen); + // post padding + if (flags & FLAGS_LEFT) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + format++; + break; + } + + case 's' : { + const char* p = va_arg(va, char*); + unsigned int l = _strnlen_s(p, precision ? precision : (size_t)-1); + // pre padding + if (flags & FLAGS_PRECISION) { + l = (l < precision ? 
l : precision); + } + if (!(flags & FLAGS_LEFT)) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + // string output + while ((*p != 0) && (!(flags & FLAGS_PRECISION) || precision--)) { + out(*(p++), buffer, idx++, maxlen); + } + // post padding + if (flags & FLAGS_LEFT) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + format++; + break; + } + + case 'p' : { + width = sizeof(void*) * 2U; + flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE; +#if defined(PRINTF_SUPPORT_LONG_LONG) + const bool is_ll = sizeof(uintptr_t) == sizeof(long long); + if (is_ll) { + idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags); + } + else { +#endif + idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags); +#if defined(PRINTF_SUPPORT_LONG_LONG) + } +#endif + format++; + break; + } + + case '%' : + out('%', buffer, idx++, maxlen); + format++; + break; + + default : + out(*format, buffer, idx++, maxlen); + format++; + break; + } + } + + // termination + out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen); + + // return written chars without terminating \0 + return (int)idx; +} + +int tiny_printf(const char* format, ...) { + va_list va; + va_start(va, format); + char buffer[1]; + const int ret = _vsnprintf(_out_char, buffer, (size_t)-1, format, va); + va_end(va); + return ret; +} + +int tiny_sprintf(char* buffer, const char* format, ...) { + va_list va; + va_start(va, format); + const int ret = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, va); + va_end(va); + return ret; +} + +int tiny_snprintf(char* buffer, size_t count, const char* format, ...) 
{ + va_list va; + va_start(va, format); + const int ret = _vsnprintf(_out_buffer, buffer, count, format, va); + va_end(va); + return ret; +} + +int tiny_vprintf(const char* format, va_list va) { + char buffer[1]; + return _vsnprintf(_out_char, buffer, (size_t)-1, format, va); +} + +int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va) { + return _vsnprintf(_out_buffer, buffer, count, format, va); +} \ No newline at end of file diff --git a/runtime/src/tinyprintf.h b/runtime/src/tinyprintf.h new file mode 100644 index 00000000..9aa79d9a --- /dev/null +++ b/runtime/src/tinyprintf.h @@ -0,0 +1,86 @@ +/////////////////////////////////////////////////////////////////////////////// +// \author (c) Marco Paland (info@paland.com) +// 2014-2019, PALANDesign Hannover, Germany +// +// \license The MIT License (MIT) +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +// \brief Tiny printf, sprintf and snprintf implementation, optimized for speed on +// embedded systems with a very limited resources. +// Use this instead of bloated standard/newlib printf. +// These routines are thread safe and reentrant. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _TINYPRINTF_H_ +#define _TINYPRINTF_H_ + +#include <stdarg.h> +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Tiny printf implementation + * You have to implement _putchar if you use printf() + * To avoid conflicts with the regular printf() API it is overridden by macro defines + * and internal underscore-appended functions like printf_() are used + * \param format A string that specifies the format of the output + * \return The number of characters that are written into the array, not counting the terminating null character + */ +int tiny_printf(const char* format, ...); + +/** + * Tiny sprintf implementation + * Due to security reasons (buffer overflow) YOU SHOULD CONSIDER USING (V)SNPRINTF INSTEAD! + * \param buffer A pointer to the buffer where to store the formatted string. MUST be big enough to store the output! + * \param format A string that specifies the format of the output + * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character + */ +int tiny_sprintf(char* buffer, const char* format, ...); + +/** + * Tiny snprintf/vsnprintf implementation + * \param buffer A pointer to the buffer where to store the formatted string + * \param count The maximum number of characters to store in the buffer, including a terminating null character + * \param format A string that specifies the format of the output + * \param va A value identifying a variable arguments list + * \return The number of characters that COULD have been written into the buffer, not counting the terminating + * null character. A value equal or larger than count indicates truncation.
Only when the returned value + * is non-negative and less than count, the string has been completely written. + */ +int tiny_snprintf(char* buffer, size_t count, const char* format, ...); +int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va); + +/** + * Tiny vprintf implementation + * \param format A string that specifies the format of the output + * \param va A value identifying a variable arguments list + * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character + */ +int tiny_vprintf(const char* format, va_list va); + +#ifdef __cplusplus +} +#endif + +#endif // _TINYPRINTF_H_ \ No newline at end of file diff --git a/runtime/src/vx_perf.c b/runtime/src/vx_perf.c index edfecdeb..0fe74375 100644 --- a/runtime/src/vx_perf.c +++ b/runtime/src/vx_perf.c @@ -4,10 +4,10 @@ #include #define DUMP_CSR_4(d, s) \ - csr_mem[d + 0] = vx_csr_read(s + 0); \ - csr_mem[d + 1] = vx_csr_read(s + 1); \ - csr_mem[d + 2] = vx_csr_read(s + 2); \ - csr_mem[d + 3] = vx_csr_read(s + 3); + csr_mem[d + 0] = csr_read(s + 0); \ + csr_mem[d + 1] = csr_read(s + 1); \ + csr_mem[d + 2] = csr_read(s + 2); \ + csr_mem[d + 3] = csr_read(s + 3); #define DUMP_CSR_32(d, s) \ DUMP_CSR_4(d + 0, s + 0) \ diff --git a/runtime/src/vx_print.c b/runtime/src/vx_print.c index 86458644..e75993e2 100644 --- a/runtime/src/vx_print.c +++ b/runtime/src/vx_print.c @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include "tinyprintf.h" #ifdef __cplusplus extern "C" { @@ -26,46 +28,19 @@ typedef struct { int precision; } putfloat_arg_t; -static void __printf_cb(printf_arg_t* arg) { - arg->ret = vprintf(arg->format, *arg->va); -} - -int vx_vprintf(const char* format, va_list va) { - printf_arg_t arg; - arg.format = format; - arg.va = &va; - vx_serial((vx_serial_cb)__printf_cb, &arg); - return arg.ret; -} - -int vx_printf(const char * format, ...) 
{ - int ret; - va_list va; - va_start(va, format); - ret = vx_vprintf(format, va); - va_end(va); - return ret; -} - -static void __putint_cb(const putint_arg_t* arg) { +static void __putint_cb(const putint_arg_t* arg) { char tmp[33]; float value = arg->value; int base = arg->base; itoa(value, tmp, base); for (int i = 0; i < 33; ++i) { int c = tmp[i]; - if (!c) break; + if (!c) + break; vx_putchar(c); } } -void vx_putint(int value, int base) { - putint_arg_t arg; - arg.value = value; - arg.base = base; - vx_serial((vx_serial_cb)__putint_cb, &arg); -} - static void __putfloat_cb(const putfloat_arg_t* arg) { float value = arg->value; int precision = arg->precision; @@ -79,6 +54,17 @@ static void __putfloat_cb(const putfloat_arg_t* arg) { } } +static void __vprintf_cb(printf_arg_t* arg) { + arg->ret = tiny_vprintf(arg->format, *arg->va); +} + +void vx_putint(int value, int base) { + putint_arg_t arg; + arg.value = value; + arg.base = base; + vx_serial((vx_serial_cb)__putint_cb, &arg); +} + void vx_putfloat(float value, int precision) { putfloat_arg_t arg; arg.value = value; @@ -86,6 +72,23 @@ void vx_putfloat(float value, int precision) { vx_serial((vx_serial_cb)__putfloat_cb, &arg); } +int vx_vprintf(const char* format, va_list va) { + printf_arg_t arg; + arg.format = format; + arg.va = &va; + vx_serial((vx_serial_cb)__vprintf_cb, &arg); + return arg.ret; +} + +int vx_printf(const char * format, ...) 
{ + int ret; + va_list va; + va_start(va, format); + ret = vx_vprintf(format, va); + va_end(va); + return ret; +} + #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/runtime/src/vx_start.S b/runtime/src/vx_start.S index 0d2a0078..16e91a15 100644 --- a/runtime/src/vx_start.S +++ b/runtime/src/vx_start.S @@ -42,15 +42,9 @@ _start: .type _exit, @function .global _exit _exit: - beqz a0, label_exit_next - mv gp, a0 - ecall; - -label_exit_next: - # dump performance CSRs - call vx_perf_dump - - # disable all threads in current warp + mv s0, a0 + call vx_perf_dump + mv gp, s0 li a0, 0 .insn s 0x6b, 0, x0, 0(a0) # tmc a0 diff --git a/runtime/src/vx_syscalls.c b/runtime/src/vx_syscalls.c index 37d60b8d..37e4d193 100644 --- a/runtime/src/vx_syscalls.c +++ b/runtime/src/vx_syscalls.c @@ -16,7 +16,10 @@ int _open(const char *name, int flags, int mode) { return -1; } int _read(int file, char *ptr, int len) { return -1; } -caddr_t _sbrk(int incr) { return 0; } +caddr_t _sbrk(int incr) { + __asm__ __volatile__("ebreak"); + return 0; +} int _write(int file, char *ptr, int len) { int i; diff --git a/sim/Makefile b/sim/Makefile index eca60c0b..5c4584f4 100644 --- a/sim/Makefile +++ b/sim/Makefile @@ -1,11 +1,9 @@ all: - $(MAKE) -C common - $(MAKE) -C simX + $(MAKE) -C simx $(MAKE) -C rtlsim $(MAKE) -C vlsim clean: - $(MAKE) -C common clean - $(MAKE) -C simX clean + $(MAKE) -C simx clean $(MAKE) -C rtlsim clean $(MAKE) -C vlsim clean \ No newline at end of file diff --git a/sim/common/Makefile b/sim/common/Makefile deleted file mode 100644 index b17dc25b..00000000 --- a/sim/common/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -all: - SPECIALIZE_TYPE=RISCV SOFTFLOAT_OPTS="-fPIC -DSOFTFLOAT_ROUND_ODD -DINLINE_LEVEL=5 -DSOFTFLOAT_FAST_DIV32TO16 -DSOFTFLOAT_FAST_DIV64TO32" $(MAKE) -C softfloat/build/Linux-x86_64-GCC - -clean: - $(MAKE) -C softfloat/build/Linux-x86_64-GCC clean \ No newline at end of file diff --git a/sim/common/bitmanip.h b/sim/common/bitmanip.h new file mode 
100644 index 00000000..f485cd6d --- /dev/null +++ b/sim/common/bitmanip.h @@ -0,0 +1,79 @@ +#pragma once + +#include +#include +#include + +constexpr uint32_t count_leading_zeros(uint32_t value) { + return value ? __builtin_clz(value) : 32; +} + +constexpr uint32_t count_trailing_zeros(uint32_t value) { + return value ? __builtin_ctz(value) : 32; +} + +constexpr bool ispow2(uint32_t value) { + return value && !(value & (value - 1)); +} + +constexpr uint32_t log2ceil(uint32_t value) { + return 32 - count_leading_zeros(value - 1); +} + +inline unsigned log2up(uint32_t value) { + return std::max<uint32_t>(1, log2ceil(value)); // explicit type: an int literal and a uint32_t cannot be deduced together +} + +constexpr unsigned log2floor(uint32_t value) { + return 31 - count_leading_zeros(value); +} + +constexpr unsigned ceil2(uint32_t value) { + return 32 - count_leading_zeros(value); +} + +inline uint64_t bit_clr(uint64_t bits, uint32_t index) { + assert(index <= 63); + return bits & ~(1ull << index); +} + +inline uint64_t bit_set(uint64_t bits, uint32_t index) { + assert(index <= 63); + return bits | (1ull << index); +} + +inline bool bit_get(uint64_t bits, uint32_t index) { + assert(index <= 63); + return (bits >> index) & 0x1; +} + +inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + uint64_t mask = (0xffffffffffffffff << (shift + start)) >> shift; + return bits & ~mask; +} + +inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + uint64_t dirty = (value << (shift + start)) >> shift; + return bit_clrw(bits, start, end) | dirty; +} + +inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + return (bits << shift) >> (shift + start); +} + +// Apply integer sign extension +inline uint32_t sext32(uint32_t word, uint32_t width) { + assert(width > 1); + assert(width
<= 32); + uint32_t mask = uint32_t((uint64_t(1) << width) - 1); // 64-bit shift: (1 << 32) on int is undefined when width == 32 + return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : word; +} \ No newline at end of file diff --git a/sim/common/mem.cpp b/sim/common/mem.cpp index 6c4b94de..ff67489d 100644 --- a/sim/common/mem.cpp +++ b/sim/common/mem.cpp @@ -168,11 +168,12 @@ void MemoryUnit::tlbRm(uint64_t va) { /////////////////////////////////////////////////////////////////////////////// -RAM::RAM(uint32_t num_pages, uint32_t page_size) - : page_bits_(log2ceil(page_size)) { - assert(ispow2(page_size)); - mem_.resize(num_pages, NULL); - size_ = uint64_t(mem_.size()) << page_bits_; +RAM::RAM(uint32_t page_size) + : size_(0) + , page_bits_(log2ceil(page_size)) + , last_page_(nullptr) + , last_page_index_(0) { + assert(ispow2(page_size)); } RAM::~RAM() { @@ -180,31 +181,44 @@ RAM::~RAM() { } void RAM::clear() { - for (auto& page : mem_) { - delete[] page; - page = NULL; + for (auto& page : pages_) { + delete[] page.second; } + // forget the freed pages so a later get() or clear() never touches dangling pointers + pages_.clear(); + last_page_ = nullptr; } uint64_t RAM::size() const { - return size_; + return uint64_t(pages_.size()) << page_bits_; } -uint8_t *RAM::get(uint32_t address) const { - uint32_t page_size = 1 << page_bits_; - uint32_t page_index = address >> page_bits_; - uint32_t byte_offset = address & ((1 << page_bits_) - 1); +uint8_t *RAM::get(uint64_t address) const { + uint32_t page_size = 1 << page_bits_; + uint32_t page_offset = address & (page_size - 1); + uint64_t page_index = address >> page_bits_; - auto &page = mem_.at(page_index); - if (page == NULL) { - uint8_t *ptr = new uint8_t[page_size]; - // set uninitialized data to "baadf00d" - for (uint32_t i = 0; i < page_size; ++i) { - ptr[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff; + uint8_t* page; + if (last_page_ && last_page_index_ == page_index) { + page = last_page_; + } else { + auto it = pages_.find(page_index); + if (it != pages_.end()) { + page = it->second; + } else { + uint8_t *ptr = new uint8_t[page_size]; + // set uninitialized data to "baadf00d" + for (uint32_t i = 0; i < page_size; ++i) { +
ptr[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff; + } + pages_.emplace(page_index, ptr); + page = ptr; } - page = ptr; + last_page_ = page; + last_page_index_ = page_index; } - return page + byte_offset; + + return page + page_offset; } void RAM::read(void *data, uint64_t addr, uint64_t size) { diff --git a/sim/common/mem.h b/sim/common/mem.h index 8929b4d9..d404602d 100644 --- a/sim/common/mem.h +++ b/sim/common/mem.h @@ -130,13 +130,13 @@ private: class RAM : public MemDevice { public: - RAM(uint32_t num_pages, uint32_t page_size); - + RAM(uint32_t page_size); ~RAM(); void clear(); uint64_t size() const override; + void read(void *data, uint64_t addr, uint64_t size) override; void write(const void *data, uint64_t addr, uint64_t size) override; @@ -153,11 +153,13 @@ public: private: - uint8_t *get(uint32_t address) const; + uint8_t *get(uint64_t address) const; - mutable std::vector mem_; - uint32_t page_bits_; uint64_t size_; + uint32_t page_bits_; + mutable std::unordered_map pages_; + mutable uint8_t* last_page_; + mutable uint64_t last_page_index_; }; } // namespace vortex \ No newline at end of file diff --git a/sim/common/mempool.h b/sim/common/mempool.h new file mode 100644 index 00000000..a5c0429d --- /dev/null +++ b/sim/common/mempool.h @@ -0,0 +1,48 @@ +#pragma once + +#include + +template +class MemoryPool { +public: + MemoryPool(uint32_t max_size) : max_size_(max_size) {} + + MemoryPool(MemoryPool && other) + : free_list_(std::move(other.free_list_)) + , max_size_(other.max_size_) // carry the capacity over; deallocate() compares against it + {} + + ~MemoryPool() { + this->flush(); + } + + void* allocate() { + void* mem; + if (!free_list_.empty()) { + mem = static_cast(free_list_.top()); + free_list_.pop(); + } else { + mem = ::operator new(sizeof(T)); + } + return mem; + } + + void deallocate(void * object) { + if (free_list_.size() < max_size_) { + free_list_.push(static_cast(object)); + } else { + ::operator delete(object); + } + } + + void flush() { + while (!free_list_.empty()) { + ::operator delete(free_list_.top()); + free_list_.pop();
+ } + } + +private: + std::stack free_list_; + uint32_t max_size_; +}; \ No newline at end of file diff --git a/sim/common/rvfloats.cpp b/sim/common/rvfloats.cpp index c23cb8da..17fab394 100644 --- a/sim/common/rvfloats.cpp +++ b/sim/common/rvfloats.cpp @@ -3,8 +3,8 @@ extern "C" { #include -#include -#include +#include +#include <../RISCV/specialize.h> } #define F32_SIGN 0x80000000 diff --git a/sim/common/simobject.h b/sim/common/simobject.h new file mode 100644 index 00000000..eb32302d --- /dev/null +++ b/sim/common/simobject.h @@ -0,0 +1,399 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "mempool.h" + +class SimObjectBase; + +/////////////////////////////////////////////////////////////////////////////// + +class SimPortBase { +public: + virtual ~SimPortBase() {} + + SimObjectBase* module() const { + return module_; + } + +protected: + SimPortBase(SimObjectBase* module) + : module_(module) + {} + + SimPortBase& operator=(const SimPortBase&) = delete; + + SimObjectBase* module_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimPort : public SimPortBase { +public: + typedef std::function TxCallback; + + SimPort(SimObjectBase* module) + : SimPortBase(module) + , peer_(nullptr) + , tx_cb_(nullptr) + {} + + void send(const Pkt& pkt, uint64_t delay = 1) const; + + void bind(SimPort* peer) { + assert(peer_ == nullptr); + peer_ = peer; + } + + void unbind() { + peer_ = nullptr; + } + + bool connected() const { + return (peer_ != nullptr); + } + + SimPort* peer() const { + return peer_; + } + + bool empty() const { + return queue_.empty(); + } + + const Pkt& front() const { + return queue_.front().pkt; // queue entries carry pkt and cycle; expose only the payload + } + + Pkt& front() { + return queue_.front().pkt; + } + + const Pkt& back() const { + return queue_.back().pkt; // same as front(): return the payload, not the timed wrapper + } + + Pkt& back() { + return queue_.back().pkt; + } + + uint64_t pop() { + auto cycle = queue_.front().cycle; + queue_.pop(); + return cycle; + } + + void
tx_callback(const TxCallback& callback) { + tx_cb_ = callback; + } + +protected: + struct timed_pkt_t { + Pkt pkt; + uint64_t cycle; + }; + + std::queue queue_; + SimPort* peer_; + TxCallback tx_cb_; + + void push(const Pkt& data, uint64_t cycle) { + if (tx_cb_) { + tx_cb_(data, cycle); + } + if (peer_) { + peer_->push(data, cycle); + } else { + queue_.push({data, cycle}); + } + } + + SimPort& operator=(const SimPort&) = delete; + + template friend class SimPortEvent; +}; + +/////////////////////////////////////////////////////////////////////////////// + +class SimEventBase { +public: + typedef std::shared_ptr Ptr; + + virtual ~SimEventBase() {} + + virtual void fire() const = 0; + + uint64_t time() const { + return time_; + } + +protected: + SimEventBase(uint64_t time) : time_(time) {} + + uint64_t time_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimCallEvent : public SimEventBase { +public: + void fire() const override { + func_(pkt_); + } + + typedef std::function Func; + + SimCallEvent(const Func& func, const Pkt& pkt, uint64_t time) + : SimEventBase(time) + , func_(func) + , pkt_(pkt) + {} + + void* operator new(size_t /*size*/) { + return allocator().allocate(); + } + + void operator delete(void* ptr) { + allocator().deallocate(ptr); + } + +protected: + Func func_; + Pkt pkt_; + + static MemoryPool>& allocator() { + static MemoryPool> instance(64); + return instance; + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimPortEvent : public SimEventBase { +public: + void fire() const override { + const_cast*>(port_)->push(pkt_, time_); + } + + SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t time) + : SimEventBase(time) + , port_(port) + , pkt_(pkt) + {} + + void* operator new(size_t /*size*/) { + return allocator().allocate(); + } + + void operator delete(void* ptr) { + allocator().deallocate(ptr); + } + +protected: + const 
SimPort* port_; + Pkt pkt_; + + static MemoryPool>& allocator() { + static MemoryPool> instance(64); + return instance; + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +class SimContext; + +class SimObjectBase { +public: + typedef std::shared_ptr Ptr; + + virtual ~SimObjectBase() {} + + const std::string& name() const { + return name_; + } + +protected: + + SimObjectBase(const SimContext& ctx, const char* name); + +private: + + virtual void do_reset() = 0; + + virtual void do_tick() = 0; + + std::string name_; + + friend class SimPlatform; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimObject : public SimObjectBase { +public: + typedef std::shared_ptr Ptr; + + template + static Ptr Create(Args&&... args); + +protected: + + SimObject(const SimContext& ctx, const char* name) + : SimObjectBase(ctx, name) + {} + +private: + + const Impl* impl() const { + return static_cast(this); + } + + Impl* impl() { + return static_cast(this); + } + + void do_reset() override { + this->impl()->reset(); + } + + void do_tick() override { + this->impl()->tick(); + } +}; + +class SimContext { +private: + SimContext() {} + + friend class SimPlatform; +}; + +/////////////////////////////////////////////////////////////////////////////// + +class SimPlatform { +public: + static SimPlatform& instance() { + static SimPlatform s_inst; + return s_inst; + } + + bool initialize() { + //-- + return true; + } + + void finalize() { + instance().clear(); + } + + template + typename SimObject::Ptr create_object(Args&&... 
args) { + auto obj = std::make_shared(SimContext{}, std::forward(args)...); + objects_.push_back(obj); + return obj; + } + + void release_object(const SimObjectBase::Ptr& object) { + objects_.remove(object); + } + + template + void schedule(const typename SimCallEvent::Func& callback, + const Pkt& pkt, + uint64_t delay) { + assert(delay != 0); + auto evt = std::make_shared>(callback, pkt, cycles_ + delay); + events_.emplace_back(evt); + } + + void reset() { + events_.clear(); + for (auto& object : objects_) { + object->do_reset(); + } + cycles_ = 0; + } + + void tick() { + // evaluate events + auto evt_it = events_.begin(); + auto evt_it_end = events_.end(); + while (evt_it != evt_it_end) { + auto& event = *evt_it; + if (cycles_ >= event->time()) { + event->fire(); + evt_it = events_.erase(evt_it); + } else { + ++evt_it; + } + } + // evaluate components + for (auto& object : objects_) { + object->do_tick(); + } + // advance clock + ++cycles_; + } + + uint64_t cycles() const { + return cycles_; + } + +private: + + SimPlatform() : cycles_(0) {} + + virtual ~SimPlatform() { + this->clear(); + } + + void clear() { + objects_.clear(); + events_.clear(); + } + + template + void schedule(const SimPort* port, const Pkt& pkt, uint64_t delay) { + assert(delay != 0); + auto evt = SimEventBase::Ptr(new SimPortEvent(port, pkt, cycles_ + delay)); + events_.emplace_back(evt); + } + + std::list objects_; + std::list events_; + uint64_t cycles_; + + template friend class SimPort; + friend class SimObjectBase; +}; + +/////////////////////////////////////////////////////////////////////////////// + +inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) + : name_(name) +{} + +template +template +typename SimObject::Ptr SimObject::Create(Args&&... 
args) { + return SimPlatform::instance().create_object(std::forward(args)...); +} + +template +void SimPort::send(const Pkt& pkt, uint64_t delay) const { + if (peer_ && !tx_cb_) { + reinterpret_cast*>(peer_)->send(pkt, delay); + } else { + SimPlatform::instance().schedule(this, pkt, delay); + } +} \ No newline at end of file diff --git a/sim/common/texturing.h b/sim/common/texturing.h new file mode 100644 index 00000000..5941594e --- /dev/null +++ b/sim/common/texturing.h @@ -0,0 +1,237 @@ +#pragma once + +#include +#include +#include + +using namespace cocogfx; + +enum class WrapMode { + Clamp, + Repeat, + Mirror, +}; + +enum class TexFormat { + A8R8G8B8, + R5G6B5, + A1R5G5B5, + A4R4G4B4, + A8L8, + L8, + A8, +}; + +template +T Clamp(Fixed fx, WrapMode mode) { + switch (mode) { + case WrapMode::Clamp: return (fx.data() < 0) ? 0 : ((fx.data() > Fixed::MASK) ? Fixed::MASK : fx.data()); + case WrapMode::Repeat: return (fx.data() & Fixed::MASK); + case WrapMode::Mirror: return (bit_get(fx.data(), Fixed::FRAC) ? 
~fx.data() : fx.data()); + default: + std::abort(); + return 0; + } +} + +inline uint32_t Stride(TexFormat format) { + switch (format) { + case TexFormat::A8R8G8B8: + return 4; + case TexFormat::R5G6B5: + case TexFormat::A1R5G5B5: + case TexFormat::A4R4G4B4: + case TexFormat::A8L8: + return 2; + case TexFormat::L8: + case TexFormat::A8: + return 1; + default: + std::abort(); + return 0; + } +} + +inline void Unpack8888(TexFormat format, + uint32_t texel, + uint32_t* lo, + uint32_t* hi) { + int r, g, b, a; + switch (format) { + case TexFormat::A8R8G8B8: + r = (texel >> 16) & 0xff; + g = (texel >> 8) & 0xff; + b = texel & 0xff; + a = texel >> 24; + break; + case TexFormat::R5G6B5: + r = ((texel >> 11) << 3) | (texel >> 13); + g = ((texel >> 3) & 0xfc) | ((texel >> 9) & 0x3); + b = ((texel & 0x1f) << 3) | ((texel & 0x1c) >> 2); + a = 0xff; + break; + case TexFormat::A1R5G5B5: + r = ((texel >> 7) & 0xf8) | ((texel >> 12) & 0x7); // mask to the top 3 red bits so the alpha bit (15) cannot leak into red + g = ((texel >> 2) & 0xf8) | ((texel >> 7) & 7); + b = ((texel & 0x1f) << 3) | ((texel & 0x1c) >> 2); + a = 0xff * (texel >> 15); + break; + case TexFormat::A4R4G4B4: + r = ((texel >> 4) & 0xf0) | ((texel >> 8) & 0x0f); + g = ((texel & 0xf0) >> 0) | ((texel & 0xf0) >> 4); + b = ((texel & 0x0f) << 4) | ((texel & 0x0f) >> 0); + a = ((texel >> 8) & 0xf0) | (texel >> 12); + break; + case TexFormat::A8L8: + r = texel & 0xff; + g = r; + b = r; + a = texel >> 8; + break; + case TexFormat::L8: + r = texel & 0xff; + g = r; + b = r; + a = 0xff; + break; + case TexFormat::A8: + r = 0xff; + g = 0xff; + b = 0xff; + a = texel & 0xff; + break; + default: + std::abort(); + } + *lo = (r << 16) + b; + *hi = (a << 16) + g; +} + +inline void Unpack8888(uint32_t texel, uint32_t* lo, uint32_t* hi) { + *lo = texel & 0x00ff00ff; + *hi = (texel >> 8) & 0x00ff00ff; +} + +inline uint32_t Pack8888(uint32_t lo, uint32_t hi) { + return (hi << 8) | lo; +} + +inline uint32_t Lerp8888(uint32_t a, uint32_t b, uint32_t f) { + return (a + (((b - a) * f) >> 8)) & 0x00ff00ff; +} +
+template +void TexAddressLinear(Fixed fu, + Fixed fv, + uint32_t log_width, + uint32_t log_height, + WrapMode wrapu, + WrapMode wrapv, + uint32_t* addr00, + uint32_t* addr01, + uint32_t* addr10, + uint32_t* addr11, + uint32_t* alpha, + uint32_t* beta +) { + auto delta_x = Fixed::make(Fixed::HALF >> log_width); + auto delta_y = Fixed::make(Fixed::HALF >> log_height); + + uint32_t u0 = Clamp(fu - delta_x, wrapu); + uint32_t u1 = Clamp(fu + delta_x, wrapu); + uint32_t v0 = Clamp(fv - delta_y, wrapv); + uint32_t v1 = Clamp(fv + delta_y, wrapv); + + uint32_t shift_u = (Fixed::FRAC - log_width); + uint32_t shift_v = (Fixed::FRAC - log_height); + + uint32_t x0s = (u0 << 8) >> shift_u; + uint32_t y0s = (v0 << 8) >> shift_v; + + uint32_t x0 = x0s >> 8; + uint32_t y0 = y0s >> 8; + uint32_t x1 = u1 >> shift_u; + uint32_t y1 = v1 >> shift_v; + + *addr00 = x0 + (y0 << log_width); + *addr01 = x1 + (y0 << log_width); + *addr10 = x0 + (y1 << log_width); + *addr11 = x1 + (y1 << log_width); + + *alpha = x0s & 0xff; + *beta = y0s & 0xff; + + //printf("*** fu=0x%x, fv=0x%x, u0=0x%x, u1=0x%x, v0=0x%x, v1=0x%x, x0=0x%x, x1=0x%x, y0=0x%x, y1=0x%x, addr00=0x%x, addr01=0x%x, addr10=0x%x, addr11=0x%x\n", fu.data(), fv.data(), u0, u1, v0, v1, x0, x1, y0, y1, *addr00, *addr01, *addr10, *addr11); +} + +template +void TexAddressPoint(Fixed fu, + Fixed fv, + uint32_t log_width, + uint32_t log_height, + WrapMode wrapu, + WrapMode wrapv, + uint32_t* addr +) { + uint32_t u = Clamp(fu, wrapu); + uint32_t v = Clamp(fv, wrapv); + + uint32_t x = u >> (Fixed::FRAC - log_width); + uint32_t y = v >> (Fixed::FRAC - log_height); + + *addr = x + (y << log_width); + + //printf("*** fu=0x%x, fv=0x%x, u=0x%x, v=0x%x, x=0x%x, y=0x%x, addr=0x%x\n", fu.data(), fv.data(), u, v, x, y, *addr); +} + +inline uint32_t TexFilterLinear( + TexFormat format, + uint32_t texel00, + uint32_t texel01, + uint32_t texel10, + uint32_t texel11, + uint32_t alpha, + uint32_t beta +) { + uint32_t c01l, c01h; + { + uint32_t c0l, c0h, 
c1l, c1h; + Unpack8888(format, texel00, &c0l, &c0h); + Unpack8888(format, texel01, &c1l, &c1h); + c01l = Lerp8888(c0l, c1l, alpha); + c01h = Lerp8888(c0h, c1h, alpha); + } + + uint32_t c23l, c23h; + { + uint32_t c2l, c2h, c3l, c3h; + Unpack8888(format, texel10, &c2l, &c2h); + Unpack8888(format, texel11, &c3l, &c3h); + c23l = Lerp8888(c2l, c3l, alpha); + c23h = Lerp8888(c2h, c3h, alpha); + } + + uint32_t color; + { + uint32_t cl = Lerp8888(c01l, c23l, beta); + uint32_t ch = Lerp8888(c01h, c23h, beta); + color = Pack8888(cl, ch); + } + + //printf("*** texel00=0x%x, texel01=0x%x, texel10=0x%x, texel11=0x%x, color=0x%x\n", texel00, texel01, texel10, texel11, color); + + return color; +} + +inline uint32_t TexFilterPoint(TexFormat format, uint32_t texel) { + uint32_t color; + { + uint32_t cl, ch; + Unpack8888(format, texel, &cl, &ch); + color = Pack8888(cl, ch); + } + + //printf("*** texel=0x%x, color=0x%x\n", texel, color); + + return color; +} \ No newline at end of file diff --git a/sim/common/util.h b/sim/common/util.h index dbaeb5fa..171bbe68 100644 --- a/sim/common/util.h +++ b/sim/common/util.h @@ -1,32 +1,52 @@ #pragma once #include +#include #include +#include template void unused(Args&&...) {} #define __unused(...) 
unused(__VA_ARGS__) -constexpr bool ispow2(uint64_t value) { - return value && !(value & (value - 1)); -} - -constexpr unsigned log2ceil(uint32_t value) { - return 32 - __builtin_clz(value - 1); -} - -inline uint64_t align_size(uint64_t size, uint64_t alignment) { - assert(0 == (alignment & (alignment - 1))); - return (size + alignment - 1) & ~(alignment - 1); -} - -// Apply integer sign extension -inline uint32_t signExt(uint32_t w, uint32_t bit, uint32_t mask) { - if (w >> (bit - 1)) - w |= ~mask; - return w; -} - // return file extension -const char* fileExtension(const char* filepath); \ No newline at end of file +const char* fileExtension(const char* filepath); + +#if defined(_MSC_VER) +#define DISABLE_WARNING_PUSH __pragma(warning(push)) +#define DISABLE_WARNING_POP __pragma(warning(pop)) +#define DISABLE_WARNING_UNUSED_PARAMETER \ + __pragma(warning(disable : 4100)) +#define DISABLE_WARNING_UNREFERENCED_FUNCTION __pragma(warning(disable : 4505)) +#define DISABLE_WARNING_ANONYMOUS_STRUCT __pragma(warning(disable : 4201)) +#define DISABLE_WARNING_UNUSED_VARIABLE __pragma(warning(disable : 4189)) +#elif defined(__GNUC__) +#define DISABLE_WARNING_PUSH _Pragma("GCC diagnostic push") +#define DISABLE_WARNING_POP _Pragma("GCC diagnostic pop") +#define DISABLE_WARNING_UNUSED_PARAMETER \ + _Pragma("GCC diagnostic ignored \"-Wunused-parameter\"") +#define DISABLE_WARNING_UNREFERENCED_FUNCTION \ + _Pragma("GCC diagnostic ignored \"-Wunused-function\"") +#define DISABLE_WARNING_ANONYMOUS_STRUCT \ + _Pragma("GCC diagnostic ignored \"-Wpedantic\"") +#define DISABLE_WARNING_UNUSED_VARIABLE \ + _Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"") +#elif defined(__clang__) +#define DISABLE_WARNING_PUSH _Pragma("clang diagnostic push") +#define DISABLE_WARNING_POP _Pragma("clang diagnostic pop") +#define DISABLE_WARNING_UNUSED_PARAMETER \ + _Pragma("clang diagnostic ignored \"-Wunused-parameter\"") +#define DISABLE_WARNING_UNREFERENCED_FUNCTION \ + _Pragma("clang 
diagnostic ignored \"-Wunused-function\"") +#define DISABLE_WARNING_ANONYMOUS_STRUCT \ + _Pragma("clang diagnostic ignored \"-Wgnu-anonymous-struct\"") +#define DISABLE_WARNING_UNUSED_VARIABLE \ + _Pragma("clang diagnostic ignored \"-Wunused-but-set-variable\"") +#else +#define DISABLE_WARNING_PUSH +#define DISABLE_WARNING_POP +#define DISABLE_WARNING_UNUSED_PARAMETER +#define DISABLE_WARNING_UNREFERENCED_FUNCTION +#define DISABLE_WARNING_ANONYMOUS_STRUCT +#endif \ No newline at end of file diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index a0c8d339..607dcf41 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -1,12 +1,16 @@ -RTL_DIR=../../hw/rtl -DPI_DIR=../../hw/dpi +DESTDIR ?= . +RTL_DIR = ../../hw/rtl +DPI_DIR = ../../hw/dpi +THIRD_PARTY_DIR = ../../third_party CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I../../../hw -I../../common -CXXFLAGS += -I../../common/softfloat/source/include +CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include +CXXFLAGS += -I../$(THIRD_PARTY_DIR) -LDFLAGS += ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a +LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a +LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator # control RTL debug tracing states DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE @@ -23,16 +27,14 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_FLAGS += $(DBG_TRACE_FLAGS) -DBG_FLAGS += -DDBG_CACHE_REQ_INFO -DBG_FLAGS += -DVCD_OUTPUT -FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src +FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl 
-I$(THIRD_PARTY_DIR)/fpnew/src TEX_INCLUDE = -I$(RTL_DIR)/tex_unit RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) $(TEX_INCLUDE) SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp -SRCS += main.cpp simulator.cpp +SRCS += processor.cpp ifdef AXI_BUS TOP = Vortex_axi @@ -51,10 +53,17 @@ VL_FLAGS += $(RTL_INCLUDE) VL_FLAGS += $(CONFIGS) CXXFLAGS += $(CONFIGS) +# Enable Verilator multithreaded simulation +#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') +#VL_FLAGS += --threads $(THREADS) + +# Enable VCD trace +VCD_TRACE = -DVCD_OUTPUT + # Debugigng ifdef DEBUG - VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS) - CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS) + VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS) + CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG CXXFLAGS += -O2 -DNDEBUG @@ -80,15 +89,12 @@ PROJECT = rtlsim all: $(PROJECT) -$(PROJECT): $(SRCS) - verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT) +# $^ already expands to $(SRCS) main.cpp, so each source is passed to verilator exactly once +$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp + verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$@ -static: $(SRCS) - verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' - $(AR) rcs lib$(PROJECT).a obj_dir/*.o ../common/softfloat/build/Linux-x86_64-GCC/*.o +$(DESTDIR)/lib$(PROJECT).so: $(SRCS) + verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' -o ../$@ -clean-static: - rm -rf lib$(PROJECT).a obj_dir - -clean: clean-static - rm -rf $(PROJECT) +clean: + rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so diff --git a/sim/rtlsim/main.cpp b/sim/rtlsim/main.cpp index 0f0575f5..a3766604 100644 --- a/sim/rtlsim/main.cpp +++ b/sim/rtlsim/main.cpp @@ -5,7
+5,10 @@ #include #include #include -#include "simulator.h" +#include +#include "processor.h" + +#define RAM_PAGE_SIZE 4096 using namespace vortex; @@ -46,12 +49,12 @@ int main(int argc, char **argv) { parse_args(argc, argv); - for (auto program : programs) { - std::cout << "Running " << program << "..." << std::endl; + vortex::RAM ram(RAM_PAGE_SIZE); + vortex::Processor processor; + processor.attach_ram(&ram); - vortex::RAM ram((1<<12), (1<<20)); - vortex::Simulator simulator; - simulator.attach_ram(&ram); + for (auto program : programs) { + std::cout << "Running " << program << "..." << std::endl; std::string program_ext(fileExtension(program)); if (program_ext == "bin") { @@ -63,7 +66,7 @@ int main(int argc, char **argv) { return -1; } - exitcode = simulator.run(); + exitcode = processor.run(); if (riscv_test) { if (1 == exitcode) { diff --git a/sim/rtlsim/processor.cpp b/sim/rtlsim/processor.cpp new file mode 100644 index 00000000..284d599f --- /dev/null +++ b/sim/rtlsim/processor.cpp @@ -0,0 +1,621 @@ +#include "processor.h" + +#include + +#ifdef AXI_BUS +#include "VVortex_axi.h" +#include "VVortex_axi__Syms.h" +#else +#include "VVortex.h" +#include "VVortex__Syms.h" +#endif + +#ifdef VCD_OUTPUT +#include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define RAMULATOR +#include +#include +#include + +#ifndef MEMORY_BANKS + #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS + #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS + #else + #define MEMORY_BANKS 2 + #endif +#endif + +#ifndef MEM_CYCLE_RATIO +#define MEM_CYCLE_RATIO -1 +#endif + +#ifndef TRACE_START_TIME +#define TRACE_START_TIME 0ull +#endif + +#ifndef TRACE_STOP_TIME +#define TRACE_STOP_TIME -1ull +#endif + +#ifndef VERILATOR_RESET_VALUE +#define VERILATOR_RESET_VALUE 2 +#endif + +#define VL_WDATA_GETW(lwp, i, n, w) \ + VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w) + +using namespace vortex; + +static uint64_t timestamp = 0; + 
+double sc_time_stamp() { + return timestamp; +} + +/////////////////////////////////////////////////////////////////////////////// + +static bool trace_enabled = false; +static uint64_t trace_start_time = TRACE_START_TIME; +static uint64_t trace_stop_time = TRACE_STOP_TIME; + +bool sim_trace_enabled() { + if (timestamp >= trace_start_time + && timestamp < trace_stop_time) + return true; + return trace_enabled; +} + +void sim_trace_enable(bool enable) { + trace_enabled = enable; +} + +/////////////////////////////////////////////////////////////////////////////// + +class Processor::Impl { +public: + Impl() { + // force random values for unitialized signals + Verilated::randReset(VERILATOR_RESET_VALUE); + Verilated::randSeed(50); + + // turn off assertion before reset + Verilated::assertOn(false); + + // create RTL module instance + #ifdef AXI_BUS + device_ = new VVortex_axi(); + #else + device_ = new VVortex(); + #endif + + #ifdef VCD_OUTPUT + Verilated::traceEverOn(true); + trace_ = new VerilatedVcdC(); + device_->trace(trace_, 99); + trace_->open("trace.vcd"); + #endif + + ram_ = nullptr; + + // initialize dram simulator + ramulator::Config ram_config; + ram_config.add("standard", "DDR4"); + ram_config.add("channels", std::to_string(MEMORY_BANKS)); + ram_config.add("ranks", "1"); + ram_config.add("speed", "DDR4_2400R"); + ram_config.add("org", "DDR4_4Gb_x8"); + ram_config.add("mapping", "defaultmapping"); + ram_config.set_core_num(1); + dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE); + Stats::statlist.output("ramulator.ddr4.log"); + + // reset the device + this->reset(); + } + + ~Impl() { + this->cout_flush(); + + #ifdef VCD_OUTPUT + trace_->close(); + delete trace_; + #endif + + delete device_; + + if (dram_) { + dram_->finish(); + Stats::statlist.printall(); + delete dram_; + } + } + + void cout_flush() { + for (auto& buf : print_bufs_) { + auto str = buf.second.str(); + if (!str.empty()) { + std::cout << "#" << buf.first << ": " << str << 
std::endl; + } + } + } + + void attach_ram(RAM* ram) { + ram_ = ram; + } + + int run() { + int exitcode = 0; + + #ifndef NDEBUG + std::cout << std::dec << timestamp << ": [sim] run()" << std::endl; + #endif + + // reset device + this->reset(); + + // execute program + while (device_->busy) { + if (get_ebreak()) { + exitcode = get_last_wb_value(3); + break; + } + this->tick(); + } + + // wait 5 cycles to flush the pipeline + this->wait(5); + + return exitcode; + } + +private: + + void reset() { + print_bufs_.clear(); + + pending_mem_reqs_.clear(); + + mem_rd_rsp_active_ = false; + mem_wr_rsp_active_ = false; + + #ifdef AXI_BUS + this->reset_axi_bus(); + #else + this->reset_avs_bus(); + #endif + + device_->reset = 1; + + for (int i = 0; i < RESET_DELAY; ++i) { + device_->clk = 0; + this->eval(); + device_->clk = 1; + this->eval(); + } + + device_->reset = 0; + + // Turn on assertion after reset + Verilated::assertOn(true); + + this->cout_flush(); + } + + void tick() { + + device_->clk = 0; + this->eval(); + + #ifdef AXI_BUS + this->eval_axi_bus(0); + #else + this->eval_avs_bus(0); + #endif + + device_->clk = 1; + this->eval(); + + #ifdef AXI_BUS + this->eval_axi_bus(1); + #else + this->eval_avs_bus(1); + #endif + + if (MEM_CYCLE_RATIO > 0) { + auto cycle = timestamp / 2; + if ((cycle % MEM_CYCLE_RATIO) == 0) + dram_->tick(); + } else { + for (int i = MEM_CYCLE_RATIO; i <= 0; ++i) + dram_->tick(); + } + + if (!dram_queue_.empty()) { + if (dram_->send(dram_queue_.front())) + dram_queue_.pop(); + } + + #ifndef NDEBUG + fflush(stdout); + #endif + } + + void eval() { + device_->eval(); + #ifdef VCD_OUTPUT + if (sim_trace_enabled()) { + trace_->dump(timestamp); + } + #endif + ++timestamp; + } + +#ifdef AXI_BUS + + void reset_axi_bus() { + device_->m_axi_wready = 0; + device_->m_axi_awready = 0; + device_->m_axi_arready = 0; + device_->m_axi_rvalid = 0; + device_->m_axi_bvalid = 0; + } + + void eval_axi_bus(bool clk) { + if (!clk) { + mem_rd_rsp_ready_ = 
device_->m_axi_rready; + mem_wr_rsp_ready_ = device_->m_axi_bready; + return; + } + + if (ram_ == nullptr) { + device_->m_axi_wready = 0; + device_->m_axi_awready = 0; + device_->m_axi_arready = 0; + return; + } + + // process memory responses + if (mem_rd_rsp_active_ + && device_->m_axi_rvalid && mem_rd_rsp_ready_) { + mem_rd_rsp_active_ = false; + } + if (!mem_rd_rsp_active_) { + if (!pending_mem_reqs_.empty() + && (*pending_mem_reqs_.begin())->ready + && !(*pending_mem_reqs_.begin())->write) { + auto mem_rsp_it = pending_mem_reqs_.begin(); + auto mem_req = *mem_rsp_it; + /* + printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr); + for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]); + } + printf("\n"); + */ + device_->m_axi_rvalid = 1; + device_->m_axi_rid = mem_req->tag; + device_->m_axi_rresp = 0; + device_->m_axi_rlast = 1; + memcpy((uint8_t*)device_->m_axi_rdata, mem_req->block.data(), MEM_BLOCK_SIZE); + pending_mem_reqs_.erase(mem_rsp_it); + mem_rd_rsp_active_ = true; + delete mem_req; + } else { + device_->m_axi_rvalid = 0; + } + } + + // send memory write response + if (mem_wr_rsp_active_ + && device_->m_axi_bvalid && mem_wr_rsp_ready_) { + mem_wr_rsp_active_ = false; + } + if (!mem_wr_rsp_active_) { + if (!pending_mem_reqs_.empty() + && (*pending_mem_reqs_.begin())->ready + && (*pending_mem_reqs_.begin())->write) { + auto mem_rsp_it = pending_mem_reqs_.begin(); + auto mem_req = *mem_rsp_it; + /* + printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_req->addr); + */ + device_->m_axi_bvalid = 1; + device_->m_axi_bid = mem_req->tag; + device_->m_axi_bresp = 0; + pending_mem_reqs_.erase(mem_rsp_it); + mem_wr_rsp_active_ = true; + delete mem_req; + } else { + device_->m_axi_bvalid = 0; + } + } + + // select the memory bank + uint32_t req_addr = device_->m_axi_wvalid ? 
device_->m_axi_awaddr : device_->m_axi_araddr; + + // process memory requests + if (device_->m_axi_wvalid || device_->m_axi_arvalid) { + if (device_->m_axi_wvalid) { + uint64_t byteen = device_->m_axi_wstrb; + unsigned base_addr = device_->m_axi_awaddr; + uint8_t* data = (uint8_t*)(device_->m_axi_wdata); + + // check console output + if (base_addr >= IO_COUT_ADDR + && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { + for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + if ((byteen >> i) & 0x1) { + auto& ss_buf = print_bufs_[i]; + char c = data[i]; + ss_buf << c; + if (c == '\n') { + std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush; + ss_buf.str(""); + } + } + } + } else { + /* + printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen); + for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); + } + printf("\n"); + */ + for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + if ((byteen >> i) & 0x1) { + (*ram_)[base_addr + i] = data[i]; + } + } + + auto mem_req = new mem_req_t(); + mem_req->tag = device_->m_axi_awid; + mem_req->addr = device_->m_axi_awaddr; + mem_req->write = true; + mem_req->ready = true; + pending_mem_reqs_.emplace_back(mem_req); + + // send dram request + ramulator::Request dram_req( + device_->m_axi_awaddr, + ramulator::Request::Type::WRITE, + 0 + ); + dram_queue_.push(dram_req); + } + } else { + // process reads + auto mem_req = new mem_req_t(); + mem_req->tag = device_->m_axi_arid; + mem_req->addr = device_->m_axi_araddr; + ram_->read(mem_req->block.data(), device_->m_axi_araddr, MEM_BLOCK_SIZE); + mem_req->write = false; + mem_req->ready = false; + pending_mem_reqs_.emplace_back(mem_req); + + // send dram request + ramulator::Request dram_req( + device_->m_axi_araddr, + ramulator::Request::Type::READ, + std::bind([](ramulator::Request& dram_req, mem_req_t* mem_req) { + mem_req->ready = true; + }, placeholders::_1, mem_req), + 0 + ); + dram_queue_.push(dram_req); + } + } + 
+ device_->m_axi_wready = 1; + device_->m_axi_awready = 1; + device_->m_axi_arready = 1; + } + +#else + + void reset_avs_bus() { + device_->mem_req_ready = 0; + device_->mem_rsp_valid = 0; + } + + void eval_avs_bus(bool clk) { + if (!clk) { + mem_rd_rsp_ready_ = device_->mem_rsp_ready; + return; + } + + if (ram_ == nullptr) { + device_->mem_req_ready = 0; + return; + } + + // process memory responses + if (mem_rd_rsp_active_ + && device_->mem_rsp_valid && mem_rd_rsp_ready_) { + mem_rd_rsp_active_ = false; + } + if (!mem_rd_rsp_active_) { + if (!pending_mem_reqs_.empty() + && (*pending_mem_reqs_.begin())->ready) { + device_->mem_rsp_valid = 1; + auto mem_rsp_it = pending_mem_reqs_.begin(); + auto mem_req = *mem_rsp_it; + /* + printf("%0ld: [sim] MEM Rd: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr); + for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]); + } + printf("\n"); + */ + memcpy((uint8_t*)device_->mem_rsp_data, mem_req->block.data(), MEM_BLOCK_SIZE); + device_->mem_rsp_tag = mem_req->tag; + pending_mem_reqs_.erase(mem_rsp_it); + mem_rd_rsp_active_ = true; + delete mem_req; + } else { + device_->mem_rsp_valid = 0; + } + } + + // process memory requests + if (device_->mem_req_valid) { + uint32_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE); + if (device_->mem_req_rw) { + // process writes + uint64_t byteen = device_->mem_req_byteen; + uint8_t* data = (uint8_t*)(device_->mem_req_data); + + // check console output + if (byte_addr >= IO_COUT_ADDR + && byte_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { + for (int i = 0; i < IO_COUT_SIZE; i++) { + if ((byteen >> i) & 0x1) { + auto& ss_buf = print_bufs_[i]; + char c = data[i]; + ss_buf << c; + if (c == '\n') { + std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush; + ss_buf.str(""); + } + } + } + } else { + /* + printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, byte_addr, byteen); + for (int i = 0; 
i < MEM_BLOCK_SIZE; i++) { + printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); + } + printf("\n"); + */ + for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + if ((byteen >> i) & 0x1) { + (*ram_)[byte_addr + i] = data[i]; + } + } + + // send dram request + ramulator::Request dram_req( + byte_addr, + ramulator::Request::Type::WRITE, + 0 + ); + dram_queue_.push(dram_req); + } + } else { + // process reads + auto mem_req = new mem_req_t(); + mem_req->tag = device_->mem_req_tag; + mem_req->addr = byte_addr; + mem_req->write = false; + mem_req->ready = false; + ram_->read(mem_req->block.data(), byte_addr, MEM_BLOCK_SIZE); + pending_mem_reqs_.emplace_back(mem_req); + + // send dram request + ramulator::Request dram_req( + byte_addr, + ramulator::Request::Type::READ, + std::bind([](ramulator::Request& dram_req, mem_req_t* mem_req) { + mem_req->ready = true; + }, placeholders::_1, mem_req), + 0 + ); + dram_queue_.push(dram_req); + } + } + + device_->mem_req_ready = 1; + } + +#endif + + void wait(uint32_t cycles) { + for (int i = 0; i < cycles; ++i) { + this->tick(); + } + } + + bool get_ebreak() const { + #ifdef AXI_BUS + return (bool)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak; + #else + return (bool)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak; + #endif + } + + int get_last_wb_value(int reg) const { + #ifdef AXI_BUS + return (int)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg]; + #else + return (int)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg]; + #endif + } + +private: + + typedef struct { + bool ready; + std::array block; + uint64_t addr; + uint64_t tag; + bool write; + } mem_req_t; + +#ifdef AXI_BUS + VVortex_axi *device_; +#else + VVortex 
*device_; +#endif +#ifdef VCD_OUTPUT + VerilatedVcdC *trace_; +#endif + + std::unordered_map print_bufs_; + + std::list pending_mem_reqs_; + + bool mem_rd_rsp_active_; + bool mem_rd_rsp_ready_; + + bool mem_wr_rsp_active_; + bool mem_wr_rsp_ready_; + + RAM *ram_; + + ramulator::Gem5Wrapper* dram_; + + std::queue dram_queue_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +Processor::Processor() + : impl_(new Impl()) +{} + +Processor::~Processor() { + delete impl_; +} + +void Processor::attach_ram(RAM* mem) { + impl_->attach_ram(mem); +} + +int Processor::run() { + return impl_->run(); +} \ No newline at end of file diff --git a/sim/rtlsim/processor.h b/sim/rtlsim/processor.h new file mode 100644 index 00000000..5518990b --- /dev/null +++ b/sim/rtlsim/processor.h @@ -0,0 +1,23 @@ +#pragma once + +namespace vortex { + +class RAM; + +class Processor { +public: + + Processor(); + ~Processor(); + + void attach_ram(RAM* ram); + + int run(); + +private: + + class Impl; + Impl* impl_; +}; + +} \ No newline at end of file diff --git a/sim/rtlsim/simulator.cpp b/sim/rtlsim/simulator.cpp deleted file mode 100644 index 8d3f9acf..00000000 --- a/sim/rtlsim/simulator.cpp +++ /dev/null @@ -1,579 +0,0 @@ -#include "simulator.h" - -#include - -#ifdef AXI_BUS -#include "VVortex_axi.h" -#include "VVortex_axi__Syms.h" -#else -#include "VVortex.h" -#include "VVortex__Syms.h" -#endif - -#ifdef VCD_OUTPUT -#include -#endif - -#include -#include -#include -#include - -#define ENABLE_MEM_STALLS - -#ifndef TRACE_START_TIME -#define TRACE_START_TIME 0ull -#endif - -#ifndef TRACE_STOP_TIME -#define TRACE_STOP_TIME -1ull -#endif - -#ifndef MEM_LATENCY -#define MEM_LATENCY 24 -#endif - -#ifndef MEM_RQ_SIZE -#define MEM_RQ_SIZE 16 -#endif - -#ifndef MEM_STALLS_MODULO -#define MEM_STALLS_MODULO 16 -#endif - -#ifndef VERILATOR_RESET_VALUE -#define VERILATOR_RESET_VALUE 2 -#endif - -#define VL_WDATA_GETW(lwp, i, n, w) \ - VL_SEL_IWII(0, n * w, 0, 0, lwp, i * 
w, w) - -using namespace vortex; - -static uint64_t timestamp = 0; - -double sc_time_stamp() { - return timestamp; -} - -/////////////////////////////////////////////////////////////////////////////// - -static bool trace_enabled = false; -static uint64_t trace_start_time = TRACE_START_TIME; -static uint64_t trace_stop_time = TRACE_STOP_TIME; - -bool sim_trace_enabled() { - if (timestamp >= trace_start_time - && timestamp < trace_stop_time) - return true; - return trace_enabled; -} - -void sim_trace_enable(bool enable) { - trace_enabled = enable; -} - -/////////////////////////////////////////////////////////////////////////////// - -namespace vortex { -class VL_OBJ { -public: -#ifdef AXI_BUS - VVortex_axi *device; -#else - VVortex *device; -#endif -#ifdef VCD_OUTPUT - VerilatedVcdC *trace; -#endif - - VL_OBJ() { - // force random values for unitialized signals - Verilated::randReset(VERILATOR_RESET_VALUE); - Verilated::randSeed(50); - - // Turn off assertion before reset - Verilated::assertOn(false); - - #ifdef AXI_BUS - this->device = new VVortex_axi(); - #else - this->device = new VVortex(); - #endif - - #ifdef VCD_OUTPUT - Verilated::traceEverOn(true); - this->trace = new VerilatedVcdC(); - this->device->trace(this->trace, 99); - this->trace->open("trace.vcd"); - #endif - } - - ~VL_OBJ() { - #ifdef VCD_OUTPUT - this->trace->close(); - delete this->trace; - #endif - delete this->device; - } -}; -} - -/////////////////////////////////////////////////////////////////////////////// - -Simulator::Simulator() { - vl_obj_ = new VL_OBJ(); - ram_ = nullptr; - // reset the device - this->reset(); -} - -Simulator::~Simulator() { - for (auto& buf : print_bufs_) { - auto str = buf.second.str(); - if (!str.empty()) { - std::cout << "#" << buf.first << ": " << str << std::endl; - } - } - delete vl_obj_; -} - -void Simulator::attach_ram(RAM* ram) { - ram_ = ram; - for (int b = 0; b < MEMORY_BANKS; ++b) { - mem_rsp_vec_[b].clear(); - } - last_mem_rsp_bank_ = 0; -} - -void 
Simulator::reset() { - print_bufs_.clear(); - - for (int b = 0; b < MEMORY_BANKS; ++b) { - mem_rsp_vec_[b].clear(); - } - last_mem_rsp_bank_ = 0; - mem_rd_rsp_active_ = false; - mem_wr_rsp_active_ = false; - -#ifdef AXI_BUS - this->reset_axi_bus(); -#else - this->reset_mem_bus(); -#endif - - vl_obj_->device->reset = 1; - - for (int i = 0; i < RESET_DELAY; ++i) { - vl_obj_->device->clk = 0; - this->eval(); - vl_obj_->device->clk = 1; - this->eval(); - } - - vl_obj_->device->reset = 0; - - // Turn on assertion after reset - Verilated::assertOn(true); -} - -void Simulator::step() { - - vl_obj_->device->clk = 0; - this->eval(); - -#ifdef AXI_BUS - this->eval_axi_bus(0); -#else - this->eval_mem_bus(0); -#endif - - vl_obj_->device->clk = 1; - this->eval(); - -#ifdef AXI_BUS - this->eval_axi_bus(1); -#else - this->eval_mem_bus(1); -#endif - -#ifndef NDEBUG - fflush(stdout); -#endif -} - -void Simulator::eval() { - vl_obj_->device->eval(); -#ifdef VCD_OUTPUT - if (sim_trace_enabled()) { - vl_obj_->trace->dump(timestamp); - } -#endif - ++timestamp; -} - -#ifdef AXI_BUS - -void Simulator::reset_axi_bus() { - vl_obj_->device->m_axi_wready = 0; - vl_obj_->device->m_axi_awready = 0; - vl_obj_->device->m_axi_arready = 0; - vl_obj_->device->m_axi_rvalid = 0; - vl_obj_->device->m_axi_bvalid = 0; -} - -void Simulator::eval_axi_bus(bool clk) { - if (!clk) { - mem_rd_rsp_ready_ = vl_obj_->device->m_axi_rready; - mem_wr_rsp_ready_ = vl_obj_->device->m_axi_bready; - return; - } - - if (ram_ == nullptr) { - vl_obj_->device->m_axi_wready = 0; - vl_obj_->device->m_axi_awready = 0; - vl_obj_->device->m_axi_arready = 0; - return; - } - - // update memory responses schedule - for (int b = 0; b < MEMORY_BANKS; ++b) { - for (auto& rsp : mem_rsp_vec_[b]) { - if (rsp.cycles_left > 0) - rsp.cycles_left -= 1; - } - } - - bool has_rd_response = false; - bool has_wr_response = false; - - // schedule memory responses that are ready - for (int i = 0; i < MEMORY_BANKS; ++i) { - uint32_t b = (i + 
last_mem_rsp_bank_ + 1) % MEMORY_BANKS; - if (!mem_rsp_vec_[b].empty()) { - auto mem_rsp_it = mem_rsp_vec_[b].begin(); - if (mem_rsp_it->cycles_left <= 0) { - has_rd_response = !mem_rsp_it->write; - has_wr_response = mem_rsp_it->write; - last_mem_rsp_bank_ = b; - break; - } - } - } - - // send memory read response - if (mem_rd_rsp_active_ - && vl_obj_->device->m_axi_rvalid && mem_rd_rsp_ready_) { - mem_rd_rsp_active_ = false; - } - if (!mem_rd_rsp_active_) { - if (has_rd_response) { - auto mem_rsp_it = mem_rsp_vec_[last_mem_rsp_bank_].begin(); - /* - printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp_it->addr); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", mem_rsp_it->block[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); - */ - vl_obj_->device->m_axi_rvalid = 1; - vl_obj_->device->m_axi_rid = mem_rsp_it->tag; - vl_obj_->device->m_axi_rresp = 0; - vl_obj_->device->m_axi_rlast = 1; - memcpy((uint8_t*)vl_obj_->device->m_axi_rdata, mem_rsp_it->block.data(), MEM_BLOCK_SIZE); - mem_rsp_vec_[last_mem_rsp_bank_].erase(mem_rsp_it); - mem_rd_rsp_active_ = true; - } else { - vl_obj_->device->m_axi_rvalid = 0; - } - } - - // send memory write response - if (mem_wr_rsp_active_ - && vl_obj_->device->m_axi_bvalid && mem_wr_rsp_ready_) { - mem_wr_rsp_active_ = false; - } - if (!mem_wr_rsp_active_) { - if (has_wr_response) { - auto mem_rsp_it = mem_rsp_vec_[last_mem_rsp_bank_].begin(); - /* - printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_rsp_it->addr); - */ - vl_obj_->device->m_axi_bvalid = 1; - vl_obj_->device->m_axi_bid = mem_rsp_it->tag; - vl_obj_->device->m_axi_bresp = 0; - mem_rsp_vec_[last_mem_rsp_bank_].erase(mem_rsp_it); - mem_wr_rsp_active_ = true; - } else { - vl_obj_->device->m_axi_bvalid = 0; - } - } - - // select the memory bank - uint32_t req_addr = vl_obj_->device->m_axi_wvalid ? 
vl_obj_->device->m_axi_awaddr : vl_obj_->device->m_axi_araddr; - uint32_t req_bank = (MEMORY_BANKS >= 2) ? ((req_addr / MEM_BLOCK_SIZE) % MEMORY_BANKS) : 0; - - // handle memory stalls - bool mem_stalled = false; -#ifdef ENABLE_MEM_STALLS - if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { - mem_stalled = true; - } else - if (mem_rsp_vec_[req_bank].size() >= MEM_RQ_SIZE) { - mem_stalled = true; - } -#endif - - // process memory requests - if (!mem_stalled) { - if (vl_obj_->device->m_axi_wvalid || vl_obj_->device->m_axi_arvalid) { - if (vl_obj_->device->m_axi_wvalid) { - uint64_t byteen = vl_obj_->device->m_axi_wstrb; - unsigned base_addr = vl_obj_->device->m_axi_awaddr; - uint8_t* data = (uint8_t*)(vl_obj_->device->m_axi_wdata); - - // detect stdout write - if (base_addr >= IO_COUT_ADDR - && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - auto& ss_buf = print_bufs_[i]; - char c = data[i]; - ss_buf << c; - if (c == '\n') { - std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush; - ss_buf.str(""); - } - } - } - } else { - /* - printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); - */ - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - (*ram_)[base_addr + i] = data[i]; - } - } - mem_req_t mem_req; - mem_req.tag = vl_obj_->device->m_axi_arid; - mem_req.addr = vl_obj_->device->m_axi_araddr; - mem_req.cycles_left = 0; - mem_req.write = 1; - mem_rsp_vec_[req_bank].emplace_back(mem_req); - } - } else { - mem_req_t mem_req; - mem_req.tag = vl_obj_->device->m_axi_arid; - mem_req.addr = vl_obj_->device->m_axi_araddr; - ram_->read(mem_req.block.data(), vl_obj_->device->m_axi_araddr, MEM_BLOCK_SIZE); - mem_req.cycles_left = MEM_LATENCY; - mem_req.write = 0; - for (auto& rsp : mem_rsp_vec_[req_bank]) { - if 
(mem_req.addr == rsp.addr) { - // duplicate requests receive the same cycle delay - mem_req.cycles_left = rsp.cycles_left; - break; - } - } - mem_rsp_vec_[req_bank].emplace_back(mem_req); - } - } - } - - vl_obj_->device->m_axi_wready = !mem_stalled; - vl_obj_->device->m_axi_awready = !mem_stalled; - vl_obj_->device->m_axi_arready = !mem_stalled; -} - -#else - -void Simulator::reset_mem_bus() { - vl_obj_->device->mem_req_ready = 0; - vl_obj_->device->mem_rsp_valid = 0; -} - -void Simulator::eval_mem_bus(bool clk) { - if (!clk) { - mem_rd_rsp_ready_ = vl_obj_->device->mem_rsp_ready; - return; - } - - if (ram_ == nullptr) { - vl_obj_->device->mem_req_ready = 0; - return; - } - - // update memory responses schedule - for (int b = 0; b < MEMORY_BANKS; ++b) { - for (auto& rsp : mem_rsp_vec_[b]) { - if (rsp.cycles_left > 0) - rsp.cycles_left -= 1; - } - } - - bool has_response = false; - - // schedule memory responses that are ready - for (int i = 0; i < MEMORY_BANKS; ++i) { - uint32_t b = (i + last_mem_rsp_bank_ + 1) % MEMORY_BANKS; - if (!mem_rsp_vec_[b].empty() - && (mem_rsp_vec_[b].begin()->cycles_left) <= 0) { - has_response = true; - last_mem_rsp_bank_ = b; - break; - } - } - - // send memory response - if (mem_rd_rsp_active_ - && vl_obj_->device->mem_rsp_valid && mem_rd_rsp_ready_) { - mem_rd_rsp_active_ = false; - } - if (!mem_rd_rsp_active_) { - if (has_response) { - vl_obj_->device->mem_rsp_valid = 1; - auto mem_rsp_it = mem_rsp_vec_[last_mem_rsp_bank_].begin(); - /* - printf("%0ld: [sim] MEM Rd: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp_it->addr); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", mem_rsp_it->block[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); - */ - memcpy((uint8_t*)vl_obj_->device->mem_rsp_data, mem_rsp_it->block.data(), MEM_BLOCK_SIZE); - vl_obj_->device->mem_rsp_tag = mem_rsp_it->tag; - mem_rsp_vec_[last_mem_rsp_bank_].erase(mem_rsp_it); - mem_rd_rsp_active_ = true; - } else { - 
vl_obj_->device->mem_rsp_valid = 0; - } - } - - // select the memory bank - uint32_t req_bank = (MEMORY_BANKS >= 2) ? (vl_obj_->device->mem_req_addr % MEMORY_BANKS) : 0; - - // handle memory stalls - bool mem_stalled = false; -#ifdef ENABLE_MEM_STALLS - if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { - mem_stalled = true; - } else - if (mem_rsp_vec_[req_bank].size() >= MEM_RQ_SIZE) { - mem_stalled = true; - } -#endif - - // process memory requests - if (!mem_stalled) { - if (vl_obj_->device->mem_req_valid) { - if (vl_obj_->device->mem_req_rw) { - uint64_t byteen = vl_obj_->device->mem_req_byteen; - unsigned base_addr = (vl_obj_->device->mem_req_addr * MEM_BLOCK_SIZE); - uint8_t* data = (uint8_t*)(vl_obj_->device->mem_req_data); - if (base_addr >= IO_COUT_ADDR - && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - auto& ss_buf = print_bufs_[i]; - char c = data[i]; - ss_buf << c; - if (c == '\n') { - std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush; - ss_buf.str(""); - } - } - } - } else { - /* - printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); - */ - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - (*ram_)[base_addr + i] = data[i]; - } - } - } - } else { - mem_req_t mem_req; - mem_req.tag = vl_obj_->device->mem_req_tag; - mem_req.addr = (vl_obj_->device->mem_req_addr * MEM_BLOCK_SIZE); - ram_->read(mem_req.block.data(), vl_obj_->device->mem_req_addr * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE); - mem_req.cycles_left = MEM_LATENCY; - for (auto& rsp : mem_rsp_vec_[req_bank]) { - if (mem_req.addr == rsp.addr) { - // duplicate requests receive the same cycle delay - mem_req.cycles_left = rsp.cycles_left; - break; - } - } - mem_rsp_vec_[req_bank].emplace_back(mem_req); - } - } - } - - 
vl_obj_->device->mem_req_ready = !mem_stalled; -} - -#endif - -void Simulator::wait(uint32_t cycles) { - for (int i = 0; i < cycles; ++i) { - this->step(); - } -} - -bool Simulator::is_busy() const { - return vl_obj_->device->busy; -} - -int Simulator::run() { - int exitcode = 0; - -#ifndef NDEBUG - std::cout << std::dec << timestamp << ": [sim] run()" << std::endl; -#endif - - // execute program - while (vl_obj_->device->busy) { - if (get_ebreak()) { - exitcode = get_last_wb_value(3); - break; - } - this->step(); - } - - // wait 5 cycles to flush the pipeline - this->wait(5); - - return exitcode; -} - -bool Simulator::get_ebreak() const { -#ifdef AXI_BUS - return (int)vl_obj_->device->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak; -#else - return (int)vl_obj_->device->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak; -#endif -} - -int Simulator::get_last_wb_value(int reg) const { -#ifdef AXI_BUS - return (int)vl_obj_->device->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg]; -#else - return (int)vl_obj_->device->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg]; -#endif -} - -void Simulator::print_stats(std::ostream& out) { - out << std::left; - out << std::setw(24) << "# of total cycles:" << std::dec << timestamp/2 << std::endl; -} \ No newline at end of file diff --git a/sim/rtlsim/simulator.h b/sim/rtlsim/simulator.h deleted file mode 100644 index 3b36c520..00000000 --- a/sim/rtlsim/simulator.h +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -#ifndef MEMORY_BANKS - #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #else - #define MEMORY_BANKS 2 - #endif 
-#endif - -namespace vortex { - -class VL_OBJ; -class RAM; - -class Simulator { -public: - - Simulator(); - virtual ~Simulator(); - - void attach_ram(RAM* ram); - - bool is_busy() const; - - void reset(); - void step(); - void wait(uint32_t cycles); - - int run(); - - void print_stats(std::ostream& out); - -private: - - typedef struct { - int cycles_left; - std::array block; - uint64_t addr; - uint64_t tag; - bool write; - } mem_req_t; - - std::unordered_map print_bufs_; - - void eval(); - -#ifdef AXI_BUS - void reset_axi_bus(); - void eval_axi_bus(bool clk); -#else - void reset_mem_bus(); - void eval_mem_bus(bool clk); -#endif - - int get_last_wb_value(int reg) const; - - bool get_ebreak() const; - - std::list mem_rsp_vec_ [MEMORY_BANKS]; - uint32_t last_mem_rsp_bank_; - - bool mem_rd_rsp_active_; - bool mem_rd_rsp_ready_; - - bool mem_wr_rsp_active_; - bool mem_wr_rsp_ready_; - - RAM *ram_; - - VL_OBJ* vl_obj_; -}; - -} \ No newline at end of file diff --git a/sim/simX/Makefile b/sim/simX/Makefile deleted file mode 100644 index 29b53fc3..00000000 --- a/sim/simX/Makefile +++ /dev/null @@ -1,50 +0,0 @@ -RTL_DIR = ../hw/rtl - -CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -CXXFLAGS += -fPIC -Wno-maybe-uninitialized -CXXFLAGS += -I. 
-I../common -I../../hw -CXXFLAGS += -I../common/softfloat/source/include -CXXFLAGS += $(CONFIGS) - -LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a - -TOP = vx_cache_sim - -SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp -SRCS += args.cpp pipeline.cpp warp.cpp core.cpp decode.cpp execute.cpp main.cpp - -OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS))) -VPATH := $(sort $(dir $(SRCS))) - -#$(info OBJS is $(OBJS)) -#$(info VPATH is $(VPATH)) - -# Debugigng -ifdef DEBUG - CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -else - CXXFLAGS += -O2 -DNDEBUG -endif - -PROJECT = simX - -all: $(PROJECT) - -$(PROJECT): $(SRCS) - $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ - -obj_dir/%.o: %.cpp - mkdir -p obj_dir - $(CXX) $(CXXFLAGS) -c $< -o $@ - -static: $(OBJS) - $(AR) rcs lib$(PROJECT).a $(OBJS) ../common/softfloat/build/Linux-x86_64-GCC/*.o - -.depend: $(SRCS) - $(CXX) $(CXXFLAGS) -MM $^ > .depend; - -clean-static: - rm -rf lib$(PROJECT).a obj_dir .depend - -clean: clean-static - rm -rf $(PROJECT) \ No newline at end of file diff --git a/sim/simX/archdef.h b/sim/simX/archdef.h deleted file mode 100644 index 75248c1a..00000000 --- a/sim/simX/archdef.h +++ /dev/null @@ -1,72 +0,0 @@ -#pragma once - -#include -#include - -#include -#include -#include "types.h" - -namespace vortex { - -class ArchDef { -public: - ArchDef(const std::string &/*arch*/, - int num_cores, - int num_warps, - int num_threads) { - wsize_ = 4; - vsize_ = 16; - num_regs_ = 32; - num_csrs_ = 4096; - num_barriers_= NUM_BARRIERS; - num_cores_ = num_cores; - num_warps_ = num_warps; - num_threads_ = num_threads; - } - - int wsize() const { - return wsize_; - } - - int vsize() const { - return vsize_; - } - - int num_regs() const { - return num_regs_; - } - - int num_csrs() const { - return num_csrs_; - } - - int num_barriers() const { - return num_barriers_; - } - - int num_threads() const { - return num_threads_; - } - - int num_warps() const { - return num_warps_; - } - - int 
num_cores() const { - return num_cores_; - } - -private: - - int wsize_; - int vsize_; - int num_regs_; - int num_csrs_; - int num_barriers_; - int num_threads_; - int num_warps_; - int num_cores_; -}; - -} \ No newline at end of file diff --git a/sim/simX/core.cpp b/sim/simX/core.cpp deleted file mode 100644 index c68ac854..00000000 --- a/sim/simX/core.cpp +++ /dev/null @@ -1,393 +0,0 @@ -#include -#include -#include -#include -#include -#include "types.h" -#include "archdef.h" -#include "mem.h" -#include "decode.h" -#include "core.h" -#include "debug.h" - -using namespace vortex; - -Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id) - : id_(id) - , arch_(arch) - , decoder_(decoder) - , mem_(mem) - , shared_mem_(1, SMEM_SIZE) - , inst_in_schedule_("schedule") - , inst_in_fetch_("fetch") - , inst_in_decode_("decode") - , inst_in_issue_("issue") - , inst_in_execute_("execute") - , inst_in_writeback_("writeback") { - in_use_iregs_.resize(arch.num_warps(), 0); - in_use_fregs_.resize(arch.num_warps(), 0); - in_use_vregs_.reset(); - - csrs_.resize(arch_.num_csrs(), 0); - - fcsrs_.resize(arch_.num_warps(), 0); - - barriers_.resize(arch_.num_barriers(), 0); - - warps_.resize(arch_.num_warps()); - for (int i = 0; i < arch_.num_warps(); ++i) { - warps_[i] = std::make_shared(this, i); - } - - this->clear(); -} - -Core::~Core() { - for (auto& buf : print_bufs_) { - auto str = buf.second.str(); - if (!str.empty()) { - std::cout << "#" << buf.first << ": " << str << std::endl; - } - } -} - -void Core::clear() { - for (int w = 0; w < arch_.num_warps(); ++w) { - in_use_iregs_[w].reset(); - in_use_fregs_[w].reset(); - } - stalled_warps_.reset(); - - in_use_vregs_.reset(); - - for (auto& csr : csrs_) { - csr = 0; - } - - for (auto& fcsr : fcsrs_) { - fcsr = 0; - } - - for (auto& barrier : barriers_) { - barrier.reset(); - } - - for (auto warp : warps_) { - warp->clear(); - } - - inst_in_schedule_.clear(); - inst_in_fetch_.clear(); - inst_in_decode_.clear(); 
- inst_in_issue_.clear(); - inst_in_execute_.clear(); - inst_in_writeback_.clear(); - print_bufs_.clear(); - - steps_ = 0; - insts_ = 0; - loads_ = 0; - stores_ = 0; - - inst_in_schedule_.valid = true; - warps_[0]->setTmask(0, true); - - ebreak_ = false; -} - -void Core::step() { - D(2, "###########################################################"); - - steps_++; - D(2, std::dec << "Core" << id_ << ": cycle: " << steps_); - - this->writeback(); - this->execute(); - this->issue(); - this->decode(); - this->fetch(); - this->schedule(); - - DPN(2, std::flush); -} - -void Core::schedule() { - if (!inst_in_schedule_.enter(&inst_in_fetch_)) - return; - - bool foundSchedule = false; - int scheduled_warp = inst_in_schedule_.wid; - - for (size_t wid = 0; wid < warps_.size(); ++wid) { - // round robin scheduling - scheduled_warp = (scheduled_warp + 1) % warps_.size(); - bool is_active = warps_[scheduled_warp]->active(); - bool stalled = stalled_warps_[scheduled_warp]; - if (is_active && !stalled) { - foundSchedule = true; - break; - } - } - - if (!foundSchedule) - return; - - D(2, "Schedule: wid=" << scheduled_warp); - inst_in_schedule_.wid = scheduled_warp; - - // advance pipeline - inst_in_schedule_.next(&inst_in_fetch_); -} - -void Core::fetch() { - if (!inst_in_fetch_.enter(&inst_in_issue_)) - return; - - int wid = inst_in_fetch_.wid; - - auto active_threads_b = warps_[wid]->getActiveThreads(); - warps_[wid]->step(&inst_in_fetch_); - auto active_threads_a = warps_[wid]->getActiveThreads(); - - insts_ += active_threads_b; - if (active_threads_b != active_threads_a) { - D(3, "*** warp#" << wid << " active threads changed to " << active_threads_a); - } - - if (inst_in_fetch_.stall_warp) { - D(3, "*** warp#" << wid << " fetch stalled"); - stalled_warps_[wid] = true; - } - - D(4, inst_in_fetch_); - - // advance pipeline - inst_in_fetch_.next(&inst_in_issue_); -} - -void Core::decode() { - if (!inst_in_decode_.enter(&inst_in_issue_)) - return; - - // advance pipeline - 
inst_in_decode_.next(&inst_in_issue_); -} - -void Core::issue() { - if (!inst_in_issue_.enter(&inst_in_execute_)) - return; - - bool in_use_regs = (inst_in_issue_.used_iregs & in_use_iregs_[inst_in_issue_.wid]) != 0 - || (inst_in_issue_.used_fregs & in_use_fregs_[inst_in_issue_.wid]) != 0 - || (inst_in_issue_.used_vregs & in_use_vregs_) != 0; - - if (in_use_regs) { - D(3, "*** Issue: registers not ready!"); - inst_in_issue_.stalled = true; - return; - } - - switch (inst_in_issue_.rdest_type) { - case 1: - if (inst_in_issue_.rdest) - in_use_iregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1; - break; - case 2: - in_use_fregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1; - break; - case 3: - in_use_vregs_[inst_in_issue_.rdest] = 1; - break; - default: - break; - } - - // advance pipeline - inst_in_issue_.next(&inst_in_execute_); -} - -void Core::execute() { - if (!inst_in_execute_.enter(&inst_in_writeback_)) - return; - - // advance pipeline - inst_in_execute_.next(&inst_in_writeback_); -} - -void Core::writeback() { - if (!inst_in_writeback_.enter(NULL)) - return; - - switch (inst_in_writeback_.rdest_type) { - case 1: - in_use_iregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0; - break; - case 2: - in_use_fregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0; - break; - case 3: - in_use_vregs_[inst_in_writeback_.rdest] = 0; - break; - default: - break; - } - - if (inst_in_writeback_.stall_warp) { - stalled_warps_[inst_in_writeback_.wid] = false; - D(3, "*** warp#" << inst_in_writeback_.wid << " fetch released"); - } - - // advance pipeline - inst_in_writeback_.next(NULL); -} - -Word Core::get_csr(Addr addr, int tid, int wid) { - if (addr == CSR_FFLAGS) { - return fcsrs_.at(wid) & 0x1F; - } else if (addr == CSR_FRM) { - return (fcsrs_.at(wid) >> 5); - } else if (addr == CSR_FCSR) { - return fcsrs_.at(wid); - } else if (addr == CSR_WTID) { - // Warp threadID - return tid; - } else if (addr == CSR_LTID) { - // Core threadID - return tid + (wid * 
arch_.num_threads()); - } else if (addr == CSR_GTID) { - // Processor threadID - return tid + (wid * arch_.num_threads()) + - (arch_.num_threads() * arch_.num_warps() * id_); - } else if (addr == CSR_LWID) { - // Core warpID - return wid; - } else if (addr == CSR_GWID) { - // Processor warpID - return wid + (arch_.num_warps() * id_); - } else if (addr == CSR_GCID) { - // Processor coreID - return id_; - } else if (addr == CSR_TMASK) { - // Processor coreID - return warps_.at(wid)->getTmask(); - } else if (addr == CSR_NT) { - // Number of threads per warp - return arch_.num_threads(); - } else if (addr == CSR_NW) { - // Number of warps per core - return arch_.num_warps(); - } else if (addr == CSR_NC) { - // Number of cores - return arch_.num_cores(); - } else if (addr == CSR_MINSTRET) { - // NumInsts - return insts_; - } else if (addr == CSR_MINSTRET_H) { - // NumInsts - return (Word)(insts_ >> 32); - } else if (addr == CSR_MCYCLE) { - // NumCycles - return (Word)steps_; - } else if (addr == CSR_MCYCLE_H) { - // NumCycles - return (Word)(steps_ >> 32); - } else { - return csrs_.at(addr); - } -} - -void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) { - if (addr == CSR_FFLAGS) { - fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F); - } else if (addr == CSR_FRM) { - fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5); - } else if (addr == CSR_FCSR) { - fcsrs_.at(wid) = value & 0xff; - } else { - csrs_.at(addr) = value; - } -} - -void Core::barrier(int bar_id, int count, int warp_id) { - auto& barrier = barriers_.at(bar_id); - barrier.set(warp_id); - if (barrier.count() < (size_t)count) - return; - for (int i = 0; i < arch_.num_warps(); ++i) { - if (barrier.test(i)) { - warps_.at(i)->activate(); - } - } - barrier.reset(); -} - -Word Core::icache_fetch(Addr addr) { - Word data; - mem_.read(&data, addr, sizeof(Word), 0); - return data; -} - -Word Core::dcache_read(Addr addr, Size size) { - ++loads_; - Word data = 0; -#ifdef SM_ENABLE - if ((addr 
>= (SMEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SMEM_BASE_ADDR)) { - shared_mem_.read(&data, addr & (SMEM_SIZE-1), size); - return data; - } -#endif - mem_.read(&data, addr, size, 0); - return data; -} - -void Core::dcache_write(Addr addr, Word data, Size size) { - ++stores_; -#ifdef SM_ENABLE - if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SMEM_BASE_ADDR)) { - shared_mem_.write(&data, addr & (SMEM_SIZE-1), size); - return; - } -#endif - if (addr >= IO_COUT_ADDR - && addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { - this->writeToStdOut(addr, data); - return; - } - mem_.write(&data, addr, size, 0); -} - -bool Core::running() const { - return inst_in_fetch_.valid - || inst_in_decode_.valid - || inst_in_issue_.valid - || inst_in_execute_.valid - || inst_in_writeback_.valid; -} - -void Core::printStats() const { - std::cout << "Steps : " << steps_ << std::endl - << "Insts : " << insts_ << std::endl - << "Loads : " << loads_ << std::endl - << "Stores: " << stores_ << std::endl; -} - -void Core::writeToStdOut(Addr addr, Word data) { - uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1); - auto& ss_buf = print_bufs_[tid]; - char c = (char)data; - ss_buf << c; - if (c == '\n') { - std::cout << std::dec << "#" << tid << ": " << ss_buf.str() << std::flush; - ss_buf.str(""); - } -} - -void Core::trigger_ebreak() { - ebreak_ = true; -} - -bool Core::check_ebreak() const { - return ebreak_; -} \ No newline at end of file diff --git a/sim/simX/core.h b/sim/simX/core.h deleted file mode 100644 index 29de3ec6..00000000 --- a/sim/simX/core.h +++ /dev/null @@ -1,122 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "debug.h" -#include "types.h" -#include "archdef.h" -#include "decode.h" -#include "mem.h" -#include "warp.h" -#include "pipeline.h" - -namespace vortex { - -class Core { -public: - Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id); - - ~Core(); - - void clear(); - - bool 
running() const; - - void step(); - - void printStats() const; - - Word id() const { - return id_; - } - - Warp& warp(int i) { - return *warps_.at(i); - } - - Decoder& decoder() { - return decoder_; - } - - const ArchDef& arch() const { - return arch_; - } - - unsigned long num_insts() const { - return insts_; - } - - unsigned long num_steps() const { - return steps_; - } - - Word getIRegValue(int reg) const { - return warps_[0]->getIRegValue(reg); - } - - Word get_csr(Addr addr, int tid, int wid); - - void set_csr(Addr addr, Word value, int tid, int wid); - - void barrier(int bar_id, int count, int warp_id); - - Word icache_fetch(Addr); - - Word dcache_read(Addr, Size); - - void dcache_write(Addr, Word, Size); - - void trigger_ebreak(); - bool check_ebreak() const; - -private: - - void schedule(); - void fetch(); - void decode(); - void issue(); - void execute(); - void writeback(); - - void writeToStdOut(Addr addr, Word data); - - std::vector in_use_iregs_; - std::vector in_use_fregs_; - RegMask in_use_vregs_; - WarpMask stalled_warps_; - std::vector> warps_; - std::vector barriers_; - std::vector csrs_; - std::vector fcsrs_; - std::unordered_map print_bufs_; - - Word id_; - const ArchDef &arch_; - Decoder &decoder_; - MemoryUnit &mem_; -#ifdef SM_ENABLE - RAM shared_mem_; -#endif - - bool ebreak_; - - Pipeline inst_in_schedule_; - Pipeline inst_in_fetch_; - Pipeline inst_in_decode_; - Pipeline inst_in_issue_; - Pipeline inst_in_execute_; - Pipeline inst_in_writeback_; - - uint64_t steps_; - uint64_t insts_; - uint64_t loads_; - uint64_t stores_; -}; - -} // namespace vortex \ No newline at end of file diff --git a/sim/simX/debug.h b/sim/simX/debug.h deleted file mode 100644 index ad7fd16f..00000000 --- a/sim/simX/debug.h +++ /dev/null @@ -1,43 +0,0 @@ -#pragma once - -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 3 -#endif - -#define DEBUG_HEADER << "DEBUG " -//#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " - -#ifndef NDEBUG - 
-#include -#include - -#define DX(x) x - -#define D(lvl, x) do { \ - if ((lvl) <= DEBUG_LEVEL) { \ - std::cout DEBUG_HEADER << x << std::endl; \ - } \ -} while(0) - -#define DPH(lvl, x) do { \ - if ((lvl) <= DEBUG_LEVEL) { \ - std::cout DEBUG_HEADER << x; \ - } \ -} while(0) - -#define DPN(lvl, x) do { \ - if ((lvl) <= DEBUG_LEVEL) { \ - std::cout << x; \ - } \ -} while(0) - -#else - -#define DX(x) -#define D(lvl, x) do {} while(0) -#define DPH(lvl, x) do {} while(0) -#define DPN(lvl, x) do {} while(0) -#define D_RAW(x) do {} while(0) - -#endif \ No newline at end of file diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp deleted file mode 100644 index 47bf4e04..00000000 --- a/sim/simX/execute.cpp +++ /dev/null @@ -1,1600 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "warp.h" -#include "instr.h" -#include "core.h" - -using namespace vortex; - -static bool HasDivergentThreads(const ThreadMask &thread_mask, - const std::vector> ®_file, - unsigned reg) { - bool cond; - size_t thread_idx = 0; - size_t num_threads = reg_file.size(); - for (; thread_idx < num_threads; ++thread_idx) { - if (thread_mask[thread_idx]) { - cond = bool(reg_file[thread_idx][reg]); - break; - } - } - assert(thread_idx != num_threads); - for (; thread_idx < num_threads; ++thread_idx) { - if (thread_mask[thread_idx]) { - if (cond != (bool(reg_file[thread_idx][reg]))) { - return true; - } - } - } - return false; -} - -inline uint32_t get_fpu_rm(uint32_t func3, Core* core, uint32_t tid, uint32_t wid) { - return (func3 == 0x7) ? 
core->get_csr(CSR_FRM, tid, wid) : func3; -} - -inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid) { - if (fflags) { - core->set_csr(CSR_FCSR, core->get_csr(CSR_FCSR, tid, wid) | fflags, tid, wid); - core->set_csr(CSR_FFLAGS, core->get_csr(CSR_FFLAGS, tid, wid) | fflags, tid, wid); - } -} - -void Warp::execute(const Instr &instr, Pipeline *pipeline) { - assert(tmask_.any()); - - Word nextPC = PC_ + core_->arch().wsize(); - bool runOnce = false; - - Word func3 = instr.getFunc3(); - Word func6 = instr.getFunc6(); - Word func7 = instr.getFunc7(); - - auto opcode = instr.getOpcode(); - int rdest = instr.getRDest(); - int rsrc0 = instr.getRSrc(0); - int rsrc1 = instr.getRSrc(1); - Word immsrc= instr.getImm(); - Word vmask = instr.getVmask(); - - int num_threads = core_->arch().num_threads(); - for (int t = 0; t < num_threads; t++) { - if (!tmask_.test(t) || runOnce) - continue; - - auto &iregs = iRegFile_.at(t); - auto &fregs = fRegFile_.at(t); - - Word rsdata[3]; - Word rddata; - - int num_rsrcs = instr.getNRSrc(); - if (num_rsrcs) { - DPH(2, "[" << std::dec << t << "] Src Regs: "); - for (int i = 0; i < num_rsrcs; ++i) { - int rst = instr.getRSType(i); - int rs = instr.getRSrc(i); - if (i) DPN(2, ", "); - switch (rst) { - case 1: - rsdata[i] = iregs[rs]; - DPN(2, "r" << std::dec << rs << "=0x" << std::hex << rsdata[i]); - break; - case 2: - rsdata[i] = fregs[rs]; - DPN(2, "fr" << std::dec << rs << "=0x" << std::hex << rsdata[i]); - break; - default: break; - } - } - DPN(2, std::endl); - } - - bool rd_write = false; - - switch (opcode) { - case NOP: - break; - case LUI_INST: - rddata = (immsrc << 12) & 0xfffff000; - rd_write = true; - break; - case AUIPC_INST: - rddata = ((immsrc << 12) & 0xfffff000) + PC_; - rd_write = true; - break; - case R_INST: { - if (func7 & 0x1) { - switch (func3) { - case 0: - // MUL - rddata = ((WordI)rsdata[0]) * ((WordI)rsdata[1]); - break; - case 1: { - // MULH - int64_t first = (int64_t)rsdata[0]; - if 
(rsdata[0] & 0x80000000) { - first = first | 0xFFFFFFFF00000000; - } - int64_t second = (int64_t)rsdata[1]; - if (rsdata[1] & 0x80000000) { - second = second | 0xFFFFFFFF00000000; - } - uint64_t result = first * second; - rddata = (result >> 32) & 0xFFFFFFFF; - } break; - case 2: { - // MULHSU - int64_t first = (int64_t)rsdata[0]; - if (rsdata[0] & 0x80000000) { - first = first | 0xFFFFFFFF00000000; - } - int64_t second = (int64_t)rsdata[1]; - rddata = ((first * second) >> 32) & 0xFFFFFFFF; - } break; - case 3: { - // MULHU - uint64_t first = (uint64_t)rsdata[0]; - uint64_t second = (uint64_t)rsdata[1]; - rddata = ((first * second) >> 32) & 0xFFFFFFFF; - } break; - case 4: { - // DIV - WordI dividen = rsdata[0]; - WordI divisor = rsdata[1]; - if (divisor == 0) { - rddata = -1; - } else if (dividen == WordI(0x80000000) && divisor == WordI(0xffffffff)) { - rddata = dividen; - } else { - rddata = dividen / divisor; - } - } break; - case 5: { - // DIVU - Word dividen = rsdata[0]; - Word divisor = rsdata[1]; - if (divisor == 0) { - rddata = -1; - } else { - rddata = dividen / divisor; - } - } break; - case 6: { - // REM - WordI dividen = rsdata[0]; - WordI divisor = rsdata[1]; - if (rsdata[1] == 0) { - rddata = dividen; - } else if (dividen == WordI(0x80000000) && divisor == WordI(0xffffffff)) { - rddata = 0; - } else { - rddata = dividen % divisor; - } - } break; - case 7: { - // REMU - Word dividen = rsdata[0]; - Word divisor = rsdata[1]; - if (rsdata[1] == 0) { - rddata = dividen; - } else { - rddata = dividen % divisor; - } - } break; - default: - std::cout << "unsupported MUL/DIV instr\n"; - std::abort(); - } - } else { - switch (func3) { - case 0: - if (func7) { - rddata = rsdata[0] - rsdata[1]; - } else { - rddata = rsdata[0] + rsdata[1]; - } - break; - case 1: - rddata = rsdata[0] << rsdata[1]; - break; - case 2: - rddata = (WordI(rsdata[0]) < WordI(rsdata[1])); - break; - case 3: - rddata = (Word(rsdata[0]) < Word(rsdata[1])); - break; - case 4: - rddata = 
rsdata[0] ^ rsdata[1]; - break; - case 5: - if (func7) { - rddata = WordI(rsdata[0]) >> WordI(rsdata[1]); - } else { - rddata = Word(rsdata[0]) >> Word(rsdata[1]); - } - break; - case 6: - rddata = rsdata[0] | rsdata[1]; - break; - case 7: - rddata = rsdata[0] & rsdata[1]; - break; - default: - std::abort(); - } - } - rd_write = true; - } break; - case I_INST: - switch (func3) { - case 0: - // ADDI - rddata = rsdata[0] + immsrc; - break; - case 1: - // SLLI - rddata = rsdata[0] << immsrc; - break; - case 2: - // SLTI - rddata = (WordI(rsdata[0]) < WordI(immsrc)); - break; - case 3: { - // SLTIU - rddata = (Word(rsdata[0]) < Word(immsrc)); - } break; - case 4: - // XORI - rddata = rsdata[0] ^ immsrc; - break; - case 5: - if (func7) { - // SRAI - Word result = WordI(rsdata[0]) >> immsrc; - rddata = result; - } else { - // SRLI - Word result = Word(rsdata[0]) >> immsrc; - rddata = result; - } - break; - case 6: - // ORI - rddata = rsdata[0] | immsrc; - break; - case 7: - // ANDI - rddata = rsdata[0] & immsrc; - break; - default: - std::abort(); - } - rd_write = true; - break; - case B_INST: - switch (func3) { - case 0: - // BEQ - if (rsdata[0] == rsdata[1]) { - nextPC = PC_ + immsrc; - } - break; - case 1: - // BNE - if (rsdata[0] != rsdata[1]) { - nextPC = PC_ + immsrc; - } - break; - case 4: - // BLT - if (WordI(rsdata[0]) < WordI(rsdata[1])) { - nextPC = PC_ + immsrc; - } - break; - case 5: - // BGE - if (WordI(rsdata[0]) >= WordI(rsdata[1])) { - nextPC = PC_ + immsrc; - } - break; - case 6: - // BLTU - if (Word(rsdata[0]) < Word(rsdata[1])) { - nextPC = PC_ + immsrc; - } - break; - case 7: - // BGEU - if (Word(rsdata[0]) >= Word(rsdata[1])) { - nextPC = PC_ + immsrc; - } - break; - } - pipeline->stall_warp = true; - runOnce = true; - break; - case JAL_INST: - rddata = nextPC; - nextPC = PC_ + immsrc; - pipeline->stall_warp = true; - runOnce = true; - rd_write = true; - break; - case JALR_INST: - rddata = nextPC; - nextPC = rsdata[0] + immsrc; - 
pipeline->stall_warp = true; - runOnce = true; - rd_write = true; - break; - case L_INST: { - Word memAddr = ((rsdata[0] + immsrc) & 0xFFFFFFFC); // word aligned - Word shift_by = ((rsdata[0] + immsrc) & 0x00000003) * 8; - Word data_read = core_->dcache_read(memAddr, 4); - D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); - switch (func3) { - case 0: - // LBI - rddata = signExt((data_read >> shift_by) & 0xFF, 8, 0xFF); - break; - case 1: - // LHI - rddata = signExt((data_read >> shift_by) & 0xFFFF, 16, 0xFFFF); - break; - case 2: - // LW - rddata = data_read; - break; - case 4: - // LBU - rddata = Word((data_read >> shift_by) & 0xFF); - break; - case 5: - // LHU - rddata = Word((data_read >> shift_by) & 0xFFFF); - break; - default: - std::abort(); - } - rd_write = true; - } break; - case S_INST: { - Word memAddr = rsdata[0] + immsrc; - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - switch (func3) { - case 0: - // SB - core_->dcache_write(memAddr, rsdata[1] & 0x000000FF, 1); - break; - case 1: - // SH - core_->dcache_write(memAddr, rsdata[1], 2); - break; - case 2: - // SW - core_->dcache_write(memAddr, rsdata[1], 4); - break; - default: - std::abort(); - } - } break; - case SYS_INST: { - Word csr_addr = immsrc & 0x00000FFF; - Word csr_value = core_->get_csr(csr_addr, t, id_); - switch (func3) { - case 0: - if (csr_addr < 2) { - // ECALL/EBREAK - core_->trigger_ebreak(); - } - break; - case 1: - // CSRRW - rddata = csr_value; - core_->set_csr(csr_addr, rsdata[0], t, id_); - rd_write = true; - break; - case 2: - // CSRRS - rddata = csr_value; - core_->set_csr(csr_addr, csr_value | rsdata[0], t, id_); - rd_write = true; - break; - case 3: - // CSRRC - rddata = csr_value; - core_->set_csr(csr_addr, csr_value & ~rsdata[0], t, id_); - rd_write = true; - break; - case 5: - // CSRRWI - rddata = csr_value; - core_->set_csr(csr_addr, rsrc0, t, id_); - rd_write = true; - break; - case 6: - // CSRRSI - rddata = csr_value; - 
core_->set_csr(csr_addr, csr_value | rsrc0, t, id_); - rd_write = true; - break; - case 7: - // CSRRCI - rddata = csr_value; - core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_); - rd_write = true; - break; - default: - break; - } - } break; - case FENCE: - pipeline->stall_warp = true; - runOnce = true; - break; - case (FL | VL): - if (func3 == 0x2) { - Word memAddr = rsdata[0] + immsrc; - Word data_read = core_->dcache_read(memAddr, 4); - D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); - rddata = data_read; - } else { - D(3, "Executing vector load"); - D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); - D(3, "src: " << rsrc0 << " " << rsdata[0]); - D(3, "dest" << rdest); - D(3, "width" << instr.getVlsWidth()); - - auto &vd = vRegFile_[rdest]; - - switch (instr.getVlsWidth()) { - case 6: { - //load word and unit strided (not checking for unit stride) - for (int i = 0; i < vl_; i++) { - Word memAddr = ((rsdata[0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - Word data_read = core_->dcache_read(memAddr, 4); - D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); - int *result_ptr = (int *)(vd.data() + i); - *result_ptr = data_read; - } - } break; - default: - std::abort(); - } - break; - } - rd_write = true; - break; - case (FS | VS): - if (func3 == 0x2) { - Word memAddr = rsdata[0] + immsrc; - core_->dcache_write(memAddr, rsdata[1], 4); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - } else { - for (int i = 0; i < vl_; i++) { - Word memAddr = rsdata[0] + (i * vtype_.vsew / 8); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - switch (instr.getVlsWidth()) { - case 6: { - //store word and unit strided (not checking for unit stride) - uint32_t value = *(uint32_t *)(vRegFile_[instr.getVs3()].data() + i); - core_->dcache_write(memAddr, value, 4); - D(3, "store: " << memAddr << " value:" << value); - 
} break; - default: - std::abort(); - } - } - } - break; - case FCI: { - uint32_t frm = get_fpu_rm(func3, core_, t, id_); - uint32_t fflags = 0; - switch (func7) { - case 0x00: //FADD - rddata = rv_fadd(rsdata[0], rsdata[1], frm, &fflags); - break; - case 0x04: //FSUB - rddata = rv_fsub(rsdata[0], rsdata[1], frm, &fflags); - break; - case 0x08: //FMUL - rddata = rv_fmul(rsdata[0], rsdata[1], frm, &fflags); - break; - case 0x0c: //FDIV - rddata = rv_fdiv(rsdata[0], rsdata[1], frm, &fflags); - break; - case 0x2c: //FSQRT - rddata = rv_fsqrt(rsdata[0], frm, &fflags); - break; - case 0x10: - switch (func3) { - case 0: // FSGNJ.S - rddata = rv_fsgnj(rsdata[0], rsdata[1]); - break; - case 1: // FSGNJN.S - rddata = rv_fsgnjn(rsdata[0], rsdata[1]); - break; - case 2: // FSGNJX.S - rddata = rv_fsgnjx(rsdata[0], rsdata[1]); - break; - } - break; - case 0x14: - if (func3) { - // FMAX.S - rddata = rv_fmax(rsdata[0], rsdata[1], &fflags); - } else { - // FMIN.S - rddata = rv_fmin(rsdata[0], rsdata[1], &fflags); - } - break; - case 0x60: - if (rsrc1 == 0) { - // FCVT.W.S - rddata = rv_ftoi(rsdata[0], frm, &fflags); - } else { - // FCVT.WU.S - rddata = rv_ftou(rsdata[0], frm, &fflags); - } - break; - case 0x70: - if (func3) { - // FCLASS.S - rddata = rv_fclss(rsdata[0]); - } else { - // FMV.X.W - rddata = rsdata[0]; - } - break; - case 0x50: - switch(func3) { - case 0: - // FLE.S - rddata = rv_fle(rsdata[0], rsdata[1], &fflags); - break; - case 1: - // FLT.S - rddata = rv_flt(rsdata[0], rsdata[1], &fflags); - break; - case 2: - // FEQ.S - rddata = rv_feq(rsdata[0], rsdata[1], &fflags); - break; - } break; - case 0x68: - if (rsrc1) { - // FCVT.S.WU: - rddata = rv_utof(rsdata[0], frm, &fflags); - } else { - // FCVT.S.W: - rddata = rv_itof(rsdata[0], frm, &fflags); - } - break; - case 0x78: - // FMV.W.X - rddata = rsdata[0]; - break; - } - update_fcrs(fflags, core_, t, id_); - rd_write = true; - } break; - case FMADD: - case FMSUB: - case FMNMADD: - case FMNMSUB: { - int frm = 
get_fpu_rm(func3, core_, t, id_); - Word fflags = 0; - switch (opcode) { - case FMADD: - rddata = rv_fmadd(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); - break; - case FMSUB: - rddata = rv_fmsub(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); - break; - case FMNMADD: - rddata = rv_fnmadd(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); - break; - case FMNMSUB: - rddata = rv_fnmsub(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); - break; - default: - break; - } - update_fcrs(fflags, core_, t, id_); - rd_write = true; - } break; - case GPGPU: - switch (func3) { - case 0: { - // TMC - if (rsrc1) { - // predicate mode - ThreadMask pred; - for (int i = 0; i < num_threads; ++i) { - pred[i] = tmask_[i] ? (iRegFile_[i][rsrc0] != 0) : 0; - } - if (pred.any()) { - tmask_ &= pred; - } - } else { - tmask_.reset(); - for (int i = 0; i < num_threads; ++i) { - tmask_[i] = rsdata[0] & (1 << i); - } - } - D(3, "*** TMC " << tmask_); - active_ = tmask_.any(); - pipeline->stall_warp = true; - runOnce = true; - } break; - case 1: { - // WSPAWN - int active_warps = std::min(rsdata[0], core_->arch().num_warps()); - D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata[1]); - for (int i = 1; i < active_warps; ++i) { - Warp &newWarp = core_->warp(i); - newWarp.setPC(rsdata[1]); - newWarp.setTmask(0, true); - } - pipeline->stall_warp = true; - runOnce = true; - } break; - case 2: { - // SPLIT - if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) { - ThreadMask tmask; - for (int i = 0; i < num_threads; ++i) { - tmask[i] = tmask_[i] && !iRegFile_[i][rsrc0]; - } - - DomStackEntry e(tmask, nextPC); - domStack_.push(tmask_); - domStack_.push(e); - for (size_t i = 0; i < e.tmask.size(); ++i) { - tmask_[i] = !e.tmask[i] && tmask_[i]; - } - active_ = tmask_.any(); - - DPH(3, "*** Split: New TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, tmask_[num_threads-i-1]); - DPN(3, ", Pushed TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask[num_threads-i-1]); 
- DPN(3, ", PC=0x" << std::hex << e.PC << "\n"); - } else { - D(3, "*** Unanimous pred"); - DomStackEntry e(tmask_); - e.unanimous = true; - domStack_.push(e); - } - pipeline->stall_warp = true; - runOnce = true; - } break; - case 3: { - // JOIN - if (!domStack_.empty() && domStack_.top().unanimous) { - D(3, "*** Uninimous branch at join"); - tmask_ = domStack_.top().tmask; - active_ = tmask_.any(); - domStack_.pop(); - } else { - if (!domStack_.top().fallThrough) { - nextPC = domStack_.top().PC; - D(3, "*** Join: next PC: " << std::hex << nextPC << std::dec); - } - - tmask_ = domStack_.top().tmask; - active_ = tmask_.any(); - - DPH(3, "*** Join: New TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, tmask_[num_threads-i-1]); - DPN(3, "\n"); - - domStack_.pop(); - } - pipeline->stall_warp = true; - runOnce = true; - } break; - case 4: { - // BAR - active_ = false; - core_->barrier(rsdata[0], rsdata[1], id_); - pipeline->stall_warp = true; - runOnce = true; - } break; - case 6: { - // PREFETCH - int addr = rsdata[0]; - printf("*** PREFETCHED %d ***\n", addr); - } break; - default: - std::abort(); - } - break; - case VSET: { - int VLEN = core_->arch().vsize() * 8; - int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); - switch (func3) { - case 0: // vector-vector - switch (func6) { - case 0: { - auto& vr1 = vRegFile_[rsrc0]; - auto& vr2 = vRegFile_[rsrc1]; - auto& vd = vRegFile_[rdest]; - auto& mask = vRegFile_[0]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t emask = *(uint8_t *)(mask.data() + i); - uint8_t value = emask & 0x1; - if (vmask || (!vmask && value)) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t emask = *(uint16_t *)(mask.data() + i); - uint16_t 
value = emask & 0x1; - if (vmask || (!vmask && value)) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t emask = *(uint32_t *)(mask.data() + i); - uint32_t value = emask & 0x1; - if (vmask || (!vmask && value)) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } - } break; - case 24: { - //vmseq - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first == second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } break; - case 25: { - //vmsne - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } break; - case 26: { - //vmsltu - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first < second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } break; - case 27: { - //vmslt - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - int8_t first = *(int8_t *)(vr1.data() + i); - int8_t second = *(int8_t *)(vr2.data() + i); - int8_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - int16_t first = *(int16_t *)(vr1.data() + i); - int16_t second = *(int16_t *)(vr2.data() + i); - int16_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - int32_t first = *(int32_t *)(vr1.data() + i); - int32_t second = *(int32_t *)(vr2.data() + i); - int32_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int32_t *)(vd.data() + i) = result; - } - } - } break; - case 28: { - //vmsleu - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first <= second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } break; - case 29: { - //vmsle - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - int8_t first = *(int8_t *)(vr1.data() + i); - int8_t second = *(int8_t *)(vr2.data() + i); - int8_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - int16_t first = *(int16_t *)(vr1.data() + i); - int16_t second = *(int16_t *)(vr2.data() + i); - int16_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - int32_t first = *(int32_t *)(vr1.data() + i); - int32_t second = *(int32_t *)(vr2.data() + i); - int32_t result = (first <= second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int32_t *)(vd.data() + i) = result; - } - } - } break; - case 30: { - //vmsgtu - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } break; - case 31: { - //vmsgt - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - int8_t first = *(int8_t *)(vr1.data() + i); - int8_t second = *(int8_t *)(vr2.data() + i); - int8_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - int16_t first = *(int16_t *)(vr1.data() + i); - int16_t second = *(int16_t *)(vr2.data() + i); - int16_t result = (first > second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - int32_t first = *(int32_t *)(vr1.data() + i); - int32_t second = *(int32_t *)(vr2.data() + i); - int32_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int32_t *)(vd.data() + i) = result; - } - } - } break; - } - break; - case 2: { - switch (func6) { - case 24: { - // vmandnot - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = 
result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 25: { - // vmand - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 26: { - // vmor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = 
(first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 27: { - //vmxor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t 
*)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 28: { - //vmornot - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } 
else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 29: { - //vmnand - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = 
result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 30: { - //vmnor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 31: { - //vmxnor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t 
first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 37: { - //vmul - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t 
result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 45: { - // vmacc - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) += result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) += result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) += result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - } - } break; - case 6: { - switch (func6) { - 
case 0: { - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (rsdata[0] + second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (rsdata[0] + second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (rsdata[0] + second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 37: { - // vmul.vx - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (rsdata[0] * second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (rsdata[0] * second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; 
i++) { - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (rsdata[0] * second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - } - } break; - case 7: { - vtype_.vill = 0; - vtype_.vediv = instr.getVediv(); - vtype_.vsew = instr.getVsew(); - vtype_.vlmul = instr.getVlmul(); - - D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0] << "VLMAX" << VLMAX); - - int s0 = rsdata[0]; - if (s0 <= VLMAX) { - vl_ = s0; - } else if (s0 < (2 * VLMAX)) { - vl_ = (int)ceil((s0 * 1.0) / 2.0); - } else if (s0 >= (2 * VLMAX)) { - vl_ = VLMAX; - } - rddata = vl_; - } break; - default: - std::abort(); - } - } break; - default: - std::abort(); - } - - if (rd_write) { - int rdt = instr.getRDType(); - switch (rdt) { - case 1: - if (rdest) { - D(2, "[" << std::dec << t << "] Dest Regs: r" << rdest << "=0x" << std::hex << std::hex << rddata); - iregs[rdest] = rddata; - } - break; - case 2: - D(2, "[" << std::dec << t << "] Dest Regs: fr" << rdest << "=0x" << std::hex << std::hex << rddata); - fregs[rdest] = rddata; - break; - default: - break; - } - } - } - - PC_ += core_->arch().wsize(); - if (PC_ != nextPC) { - D(3, "*** Next PC: " << std::hex << nextPC << std::dec); - PC_ = nextPC; - } -} diff --git a/sim/simX/pipeline.cpp b/sim/simX/pipeline.cpp deleted file mode 100644 index c54977a0..00000000 --- a/sim/simX/pipeline.cpp +++ /dev/null @@ -1,63 +0,0 @@ -#include -#include "pipeline.h" - -using namespace vortex; - -namespace vortex { -std::ostream &operator<<(std::ostream &os, const Pipeline& pipeline) { - os << pipeline.name_ << ": valid=" << pipeline.valid << std::endl; - os << pipeline.name_ << ": stalled=" << pipeline.stalled << std::endl; - os << pipeline.name_ << ": stall_warp=" << pipeline.stall_warp << std::endl; - os << pipeline.name_ << ": 
wid=" << pipeline.wid << std::endl; - os << pipeline.name_ << ": PC=" << std::hex << pipeline.PC << std::endl; - os << pipeline.name_ << ": used_iregs=" << pipeline.used_iregs << std::endl; - os << pipeline.name_ << ": used_fregs=" << pipeline.used_fregs << std::endl; - os << pipeline.name_ << ": used_vregs=" << pipeline.used_vregs << std::endl; - return os; -} -} - -Pipeline::Pipeline(const char* name) -: name_(name) { - this->clear(); -} - -void Pipeline::clear() { - valid = false; - stalled = false; - stall_warp = false; - wid = 0; - PC = 0; - used_iregs.reset(); - used_fregs.reset(); - used_vregs.reset(); -} - -bool Pipeline::enter(Pipeline *drain) { - if (drain) { - if (drain->stalled) { - this->stalled = true; - return false; - } - drain->valid = false; - } - this->stalled = false; - if (!this->valid) - return false; - return true; -} - -void Pipeline::next(Pipeline *drain) { - if (drain) { - drain->valid = this->valid; - drain->stalled = this->stalled; - drain->stall_warp = this->stall_warp; - drain->wid = this->wid; - drain->PC = this->PC; - drain->rdest = this->rdest; - drain->rdest_type = this->rdest_type; - drain->used_iregs = this->used_iregs; - drain->used_fregs = this->used_fregs; - drain->used_vregs = this->used_vregs; - } -} \ No newline at end of file diff --git a/sim/simX/pipeline.h b/sim/simX/pipeline.h deleted file mode 100644 index f8899a63..00000000 --- a/sim/simX/pipeline.h +++ /dev/null @@ -1,48 +0,0 @@ - -#pragma once - -#include -#include -#include "types.h" -#include "debug.h" - -namespace vortex { - -class Instr; - -class Pipeline { -public: - Pipeline(const char* name); - - void clear(); - - bool enter(Pipeline* drain); - - void next(Pipeline* drain); - - //-- - bool valid; - - //-- - bool stalled; - bool stall_warp; - - //-- - int wid; - Word PC; - - //-- - int rdest_type; - int rdest; - RegMask used_iregs; - RegMask used_fregs; - RegMask used_vregs; - -private: - - const char* name_; - - friend std::ostream &operator<<(std::ostream &, 
const Pipeline&); -}; - -} \ No newline at end of file diff --git a/sim/simX/types.h b/sim/simX/types.h deleted file mode 100644 index ca732040..00000000 --- a/sim/simX/types.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace vortex { - -typedef uint8_t Byte; -typedef uint32_t Word; -typedef int32_t WordI; - -typedef uint32_t Addr; -typedef uint32_t Size; - -typedef std::bitset<32> RegMask; - -typedef std::bitset<32> ThreadMask; - -typedef std::bitset<32> WarpMask; - -} \ No newline at end of file diff --git a/sim/simX/warp.cpp b/sim/simX/warp.cpp deleted file mode 100644 index a505fe5c..00000000 --- a/sim/simX/warp.cpp +++ /dev/null @@ -1,93 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "instr.h" -#include "core.h" - -using namespace vortex; - -Warp::Warp(Core *core, Word id) - : id_(id) - , core_(core) { - iRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); - fRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); - vRegFile_.resize(core_->arch().num_regs(), std::vector(core_->arch().vsize(), 0)); - this->clear(); -} - -void Warp::clear() { - PC_ = STARTUP_ADDR; - tmask_.reset(); - active_ = false; -} - -void Warp::step(Pipeline *pipeline) { - assert(tmask_.any()); - - DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask="); - for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) - DPN(2, tmask_[n-i-1]); - DPN(2, "\n"); - - /* Fetch and decode. 
# Build rules for the simx project (sim/simx).
#
# Variables a caller may set on the command line or in the environment:
#   DESTDIR  directory that receives the binary / shared library (default: .)
#   CONFIGS  extra compiler flags, appended verbatim to CXXFLAGS
#   DEBUG    when defined, build unoptimized with -DDEBUG_LEVEL=$(DEBUG)

DESTDIR ?= .
# This Makefile lives two levels below the repository root (sim/simx), which
# is why the include path below uses ../../hw; RTL_DIR must match that depth.
RTL_DIR = ../../hw/rtl
THIRD_PARTY_DIR = ../../third_party

CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I. -I../common -I../../hw
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I$(THIRD_PARTY_DIR)
CXXFLAGS += $(CONFIGS)

# Libraries are kept in LDFLAGS and placed after the sources on the link line.
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator

TOP = vx_cache_sim

SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp

OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
VPATH := $(sort $(dir $(SRCS)))

# Uncomment to inspect the derived lists while debugging the build:
#$(info OBJS is $(OBJS))
#$(info VPATH is $(VPATH))

# Debugging
ifdef DEBUG
  CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
else
  CXXFLAGS += -O2 -DNDEBUG
endif

PROJECT = simx

# all/clean are commands, not files; declare them so a stray file named
# "all" or "clean" cannot silently disable them.
.PHONY: all clean

all: $(DESTDIR)/$(PROJECT)

# Stand-alone simulator executable (sources are compiled and linked in one step).
$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

# Shared-library flavour of the simulator (no main.cpp).
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
	$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -o $@

# Header dependency listing produced by the compiler.
.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf obj_dir .depend $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
+ + uint16_t vsize() const { + return vsize_; + } + + uint16_t num_regs() const { + return num_regs_; + } + + uint16_t num_csrs() const { + return num_csrs_; + } + + uint16_t num_barriers() const { + return num_barriers_; + } + + uint16_t num_threads() const { + return num_threads_; + } + + uint16_t num_warps() const { + return num_warps_; + } + + uint16_t num_cores() const { + return num_cores_; + } +}; + +} \ No newline at end of file diff --git a/sim/simX/args.cpp b/sim/simx/args.cpp similarity index 100% rename from sim/simX/args.cpp rename to sim/simx/args.cpp diff --git a/sim/simX/args.h b/sim/simx/args.h similarity index 97% rename from sim/simX/args.h rename to sim/simx/args.h index aeaba4e5..fd7de5bc 100644 --- a/sim/simX/args.h +++ b/sim/simx/args.h @@ -35,7 +35,7 @@ public: CommandLineArg(l, ht), arg_(x) {} int read(int argc, char **argv) { - __unused(argc); + __unused (argc); std::istringstream iss(argv[1]); iss >> arg_; return 1; @@ -53,7 +53,7 @@ public: CommandLineArg(l, ht), arg_(x) { arg_ = false; } int read(int argc, char **argv) { - __unused(argc, argv); + __unused (argc, argv); arg_ = true; return 0; } diff --git a/sim/simx/cache.cpp b/sim/simx/cache.cpp new file mode 100644 index 00000000..34c8903c --- /dev/null +++ b/sim/simx/cache.cpp @@ -0,0 +1,637 @@ +#include "cache.h" +#include "debug.h" +#include "types.h" +#include +#include +#include +#include +#include + +using namespace vortex; + +struct params_t { + uint32_t sets_per_bank; + uint32_t blocks_per_set; + uint32_t words_per_block; + uint32_t log2_num_inputs; + + uint32_t word_select_addr_start; + uint32_t word_select_addr_end; + + uint32_t bank_select_addr_start; + uint32_t bank_select_addr_end; + + uint32_t set_select_addr_start; + uint32_t set_select_addr_end; + + uint32_t tag_select_addr_start; + uint32_t tag_select_addr_end; + + params_t(const Cache::Config& config) { + uint32_t bank_bits = log2ceil(config.num_banks); + uint32_t offset_bits = config.B - config.W; + uint32_t 
log2_bank_size = config.C - bank_bits; + uint32_t index_bits = log2_bank_size - (config.B << config.A); + assert(log2_bank_size >= config.B); + + this->log2_num_inputs = log2ceil(config.num_inputs); + + this->words_per_block = 1 << offset_bits; + this->blocks_per_set = 1 << config.A; + this->sets_per_bank = 1 << index_bits; + + assert(config.ports_per_bank <= this->words_per_block); + + // Word select + this->word_select_addr_start = config.W; + this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1); + + // Bank select + this->bank_select_addr_start = (1+this->word_select_addr_end); + this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1); + + // Set select + this->set_select_addr_start = (1+this->bank_select_addr_end); + this->set_select_addr_end = (this->set_select_addr_start+index_bits-1); + + // Tag select + this->tag_select_addr_start = (1+this->set_select_addr_end); + this->tag_select_addr_end = (config.addr_width-1); + } + + uint32_t addr_bank_id(uint64_t word_addr) const { + if (bank_select_addr_end >= bank_select_addr_start) + return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end); + else + return 0; + } + + uint32_t addr_set_id(uint64_t word_addr) const { + if (set_select_addr_end >= set_select_addr_start) + return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end); + else + return 0; + } + + uint64_t addr_tag(uint64_t word_addr) const { + if (tag_select_addr_end >= tag_select_addr_start) + return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end); + else + return 0; + } + + uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const { + uint64_t addr(0); + if (bank_select_addr_end >= bank_select_addr_start) + addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id); + if (set_select_addr_end >= set_select_addr_start) + addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id); + if (tag_select_addr_end >= 
// One cache block's bookkeeping state (no data payload is modelled here).
struct block_t {
  bool     valid;
  bool     dirty;
  uint64_t tag;
  uint32_t lru_ctr;
};

// A cache set: a fixed number of blocks.
struct set_t {
  std::vector<block_t> blocks;

  // size: number of blocks in the set.
  set_t(uint32_t size) : blocks(size) {}

  // Invalidates every block in the set.
  void clear() {
    for (auto& block : blocks) {
      block.valid = false;
    }
  }
};

// Per-port slot of a bank request.
struct bank_req_info_t {
  bool     valid;
  uint32_t req_id;
  uint64_t req_tag;
};

// A request travelling through a bank pipeline.
struct bank_req_t {
  bool     valid;
  bool     write;
  bool     mshr_replay;  // set once the entry has been released for replay
  uint64_t tag;
  uint32_t set_id;
  uint32_t core_id;
  uint64_t uuid;
  std::vector<bank_req_info_t> infos;

  // size: number of per-port slots to allocate in `infos`.
  bank_req_t(uint32_t size)
    : valid(false)
    , write(false)
    , mshr_replay(false)
    , tag(0)
    , set_id(0)
    , core_id(0)
    , uuid(0)
    , infos(size)
  {}
};

// A bank request parked in the MSHR, plus the cache block it is waiting on.
struct mshr_entry_t : public bank_req_t {
  uint32_t block_id;

  mshr_entry_t(uint32_t size = 0)
    : bank_req_t(size)
    , block_id(0)
  {}
};

// Fixed-capacity table of pending requests, searched linearly.
class MSHR {
private:
  std::vector<mshr_entry_t> entries_;
  uint32_t size_;  // number of entries whose `valid` flag is set

public:
  // size: capacity of the table.
  MSHR(uint32_t size)
    : entries_(size)
    , size_(0)
  {}

  bool empty() const {
    return (0 == size_);
  }

  bool full() const {
    return (size_ == entries_.size());
  }

  // Returns the index of the first valid entry with the same set_id and tag
  // as bank_req, or -1 when there is none.
  int lookup(const bank_req_t& bank_req) {
    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
      auto& entry = entries_.at(i);
      if (entry.valid
       && entry.set_id == bank_req.set_id
       && entry.tag == bank_req.tag) {
        return static_cast<int>(i);
      }
    }
    return -1;
  }

  // Copies bank_req into the first free slot and tags it with block_id.
  // Returns the slot index, or -1 when the table is full.
  int allocate(const bank_req_t& bank_req, uint32_t block_id) {
    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
      auto& entry = entries_.at(i);
      if (!entry.valid) {
        // assign only the bank_req_t base part of the entry
        static_cast<bank_req_t&>(entry) = bank_req;
        entry.valid = true;
        entry.mshr_replay = false;
        entry.block_id = block_id;
        ++size_;
        return static_cast<int>(i);
      }
    }
    return -1;
  }

  // Marks entry `id` and every other valid entry with the same set_id/tag as
  // ready for replay, and returns entry `id`. The entry must be valid.
  mshr_entry_t& replay(uint32_t id) {
    auto& root_entry = entries_.at(id);
    assert(root_entry.valid);
    // mark all related mshr entries for replay
    for (auto& entry : entries_) {
      if (entry.valid
       && entry.set_id == root_entry.set_id
       && entry.tag == root_entry.tag) {
        entry.mshr_replay = true;
      }
    }
    return root_entry;
  }

  // Removes the first entry marked for replay and copies its request part into
  // *out. Returns false (leaving *out untouched) when nothing is ready.
  bool pop(bank_req_t* out) {
    for (auto& entry : entries_) {
      if (entry.valid && entry.mshr_replay) {
        *out = entry;
        entry.valid = false;
        --size_;
        return true;
      }
    }
    return false;
  }

  // Drops every pending entry. All entries must be invalidated, not only the
  // ones marked for replay: size_ is reset to zero, so any entry left valid
  // would still match lookup() and would permanently occupy a slot.
  void clear() {
    for (auto& entry : entries_) {
      entry.valid = false;
      entry.mshr_replay = false;
    }
    size_ = 0;
  }
};
mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn); + } else { + mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0)); + } + + // calculate tag flush cycles + flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set; + } + + void reset() { + for (auto& bank : banks_) { + bank.clear(); + } + perf_stats_ = PerfStats(); + pending_read_reqs_ = 0; + pending_write_reqs_ = 0; + pending_fill_reqs_ = 0; + } + + void tick() { + // wait on flush cycles + if (flush_cycles_ != 0) { + --flush_cycles_; + return; + } + + // per-bank pipeline request + std::vector pipeline_reqs(config_.num_banks, config_.ports_per_bank); + + // calculate memory latency + perf_stats_.mem_latency += pending_fill_reqs_; + + // handle bypasss responses + auto& bypass_port = bypass_switch_->RspOut.at(1); + if (!bypass_port.empty()) { + auto& mem_rsp = bypass_port.front(); + uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1); + uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; + MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid}; + simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); + bypass_port.pop(); + } + + // handle MSHR replay + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + auto& bank = banks_.at(bank_id); + auto& pipeline_req = pipeline_reqs.at(bank_id); + bank.mshr.pop(&pipeline_req); + } + + // handle memory fills + std::vector pending_fill_req(config_.num_banks, false); + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + auto& mem_rsp_port = mem_rsp_ports_.at(bank_id); + if (!mem_rsp_port.empty()) { + auto& mem_rsp = mem_rsp_port.front(); + this->processMemoryFill(bank_id, mem_rsp.tag); + pending_fill_req.at(bank_id) = true; + mem_rsp_port.pop(); + } + } + + // handle incoming core requests + for (uint32_t req_id = 0, n = 
config_.num_inputs; req_id < n; ++req_id) { + auto& core_req_port = simobject_->CoreReqPorts.at(req_id); + if (core_req_port.empty()) + continue; + + auto& core_req = core_req_port.front(); + + // check cache bypassing + if (core_req.non_cacheable) { + // send IO request + this->processIORequest(core_req, req_id); + + // remove request + core_req_port.pop(); + continue; + } + + auto bank_id = params_.addr_bank_id(core_req.addr); + auto set_id = params_.addr_set_id(core_req.addr); + auto tag = params_.addr_tag(core_req.addr); + auto port_id = req_id % config_.ports_per_bank; + + // create bank request + bank_req_t bank_req(config_.ports_per_bank); + bank_req.valid = true; + bank_req.write = core_req.write; + bank_req.mshr_replay = false; + bank_req.tag = tag; + bank_req.set_id = set_id; + bank_req.core_id = core_req.core_id; + bank_req.uuid = core_req.uuid; + bank_req.infos.at(port_id) = {true, req_id, core_req.tag}; + + auto& bank = banks_.at(bank_id); + auto& pipeline_req = pipeline_reqs.at(bank_id); + + // check pending MSHR replay + if (pipeline_req.valid + && pipeline_req.mshr_replay) { + // stall + continue; + } + + // check pending fill request + if (pending_fill_req.at(bank_id)) { + // stall + continue; + } + + // check MSHR capacity if read or writeback + if ((!core_req.write || !config_.write_through) + && bank.mshr.full()) { + ++perf_stats_.mshr_stalls; + continue; + } + + // check bank conflicts + if (pipeline_req.valid) { + // check port conflict + if (pipeline_req.write != core_req.write + || pipeline_req.set_id != set_id + || pipeline_req.tag != tag + || pipeline_req.infos[port_id].valid) { + ++perf_stats_.bank_stalls; + continue; + } + // update pending request infos + pipeline_req.infos[port_id] = bank_req.infos[port_id]; + } else { + // schedule new request + pipeline_req = bank_req; + } + + if (core_req.write) + ++perf_stats_.writes; + else + ++perf_stats_.reads; + + // remove request + auto time = core_req_port.pop(); + 
perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time); + } + + // process active request + this->processBankRequest(pipeline_reqs); + } + + const PerfStats& perf_stats() const { + return perf_stats_; + } + +private: + + void processIORequest(const MemReq& core_req, uint32_t req_id) { + { + MemReq mem_req(core_req); + mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id; + bypass_switch_->ReqIn.at(1).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); + } + + if (core_req.write && config_.write_reponse) { + MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid}; + simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1); + DT(3, simobject_->name() << "-" << core_rsp); + } + } + + void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) { + // update block + auto& bank = banks_.at(bank_id); + auto& entry = bank.mshr.replay(mshr_id); + auto& set = bank.sets.at(entry.set_id); + auto& block = set.blocks.at(entry.block_id); + block.valid = true; + block.tag = entry.tag; + --pending_fill_reqs_; + } + + void processBankRequest(const std::vector& pipeline_reqs) { + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + auto& pipeline_req = pipeline_reqs.at(bank_id); + if (!pipeline_req.valid) + continue; + + auto& bank = banks_.at(bank_id); + auto& set = bank.sets.at(pipeline_req.set_id); + + if (pipeline_req.mshr_replay) { + // send core response + for (auto& info : pipeline_req.infos) { + MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid}; + simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); + } + } else { + bool hit = false; + bool found_free_block = false; + int hit_block_id = 0; + int repl_block_id = 0; + uint32_t max_cnt = 0; + + for (int i = 0, n = set.blocks.size(); i < n; ++i) { + auto& block = set.blocks.at(i); + if (block.valid) { + if (block.tag == pipeline_req.tag) { + block.lru_ctr = 0; + 
hit_block_id = i; + hit = true; + } else { + ++block.lru_ctr; + } + if (max_cnt < block.lru_ctr) { + max_cnt = block.lru_ctr; + repl_block_id = i; + } + } else { + found_free_block = true; + repl_block_id = i; + } + } + + if (hit) { + // + // Hit handling + // + if (pipeline_req.write) { + // handle write hit + auto& hit_block = set.blocks.at(hit_block_id); + if (config_.write_through) { + // forward write request to memory + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag); + mem_req.write = true; + mem_req.core_id = pipeline_req.core_id; + mem_req.uuid = pipeline_req.uuid; + mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); + } else { + // mark block as dirty + hit_block.dirty = true; + } + } + // send core response + if (!pipeline_req.write || config_.write_reponse) { + for (auto& info : pipeline_req.infos) { + MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid}; + simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); + } + } + } else { + // + // Miss handling + // + if (pipeline_req.write) + ++perf_stats_.write_misses; + else + ++perf_stats_.read_misses; + + if (!found_free_block && !config_.write_through) { + // write back dirty block + auto& repl_block = set.blocks.at(repl_block_id); + if (repl_block.dirty) { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag); + mem_req.write = true; + mem_req.core_id = pipeline_req.core_id; + mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); + ++perf_stats_.evictions; + } + } + + if (pipeline_req.write && config_.write_through) { + // forward write request to memory + { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); + mem_req.write = true; + mem_req.core_id = pipeline_req.core_id; + mem_req.uuid = 
pipeline_req.uuid; + mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); + } + // send core response + if (config_.write_reponse) { + for (auto& info : pipeline_req.infos) { + MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid}; + simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); + } + } + } else { + // MSHR lookup + int pending = bank.mshr.lookup(pipeline_req); + + // allocate MSHR + int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id); + + // send fill request + if (pending == -1) { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); + mem_req.write = false; + mem_req.tag = mshr_id; + mem_req.core_id = pipeline_req.core_id; + mem_req.uuid = pipeline_req.uuid; + mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); + ++pending_fill_reqs_; + } + } + } + } + } + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +Cache::Cache(const SimContext& ctx, const char* name, const Config& config) + : SimObject(ctx, name) + , CoreReqPorts(config.num_inputs, this) + , CoreRspPorts(config.num_inputs, this) + , MemReqPort(this) + , MemRspPort(this) + , impl_(new Impl(this, config)) +{} + +Cache::~Cache() { + delete impl_; +} + +void Cache::reset() { + impl_->reset(); +} + +void Cache::tick() { + impl_->tick(); +} + +const Cache::PerfStats& Cache::perf_stats() const { + return impl_->perf_stats(); +} \ No newline at end of file diff --git a/sim/simx/cache.h b/sim/simx/cache.h new file mode 100644 index 00000000..a335b483 --- /dev/null +++ b/sim/simx/cache.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include "memsim.h" + +namespace vortex { + +class Cache : public SimObject { +public: + struct Config { + uint8_t C; // log2 cache size + uint8_t B; // log2 block size + uint8_t W; // log2 word size + uint8_t A; // log2 
// Event counters exposed by the cache; every counter starts at zero.
struct PerfStats {
  uint64_t reads           = 0;
  uint64_t writes          = 0;
  uint64_t read_misses     = 0;
  uint64_t write_misses    = 0;
  uint64_t evictions       = 0;
  uint64_t pipeline_stalls = 0;
  uint64_t bank_stalls     = 0;
  uint64_t mshr_stalls     = 0;
  uint64_t mem_latency     = 0;
};
"core.h" +#include "debug.h" +#include "constants.h" + +using namespace vortex; + +Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) + : SimObject(ctx, "Core") + , MemRspPort(this) + , MemReqPort(this) + , id_(id) + , arch_(arch) + , decoder_(arch) + , mmu_(0, arch.wsize(), true) + , smem_(RAM_PAGE_SIZE) + , tex_units_(NUM_TEX_UNITS, this) + , warps_(arch.num_warps()) + , barriers_(arch.num_barriers(), 0) + , csrs_(arch.num_csrs(), 0) + , fcsrs_(arch.num_warps(), 0) + , ibuffers_(arch.num_warps(), IBUF_SIZE) + , scoreboard_(arch_) + , exe_units_((int)ExeType::MAX) + , icache_(Cache::Create("icache", Cache::Config{ + log2ceil(ICACHE_SIZE), // C + log2ceil(L1_BLOCK_SIZE),// B + 2, // W + 0, // A + 32, // address bits + 1, // number of banks + 1, // number of ports + 1, // request size + true, // write-through + false, // write response + 0, // victim size + NUM_WARPS, // mshr + 2, // pipeline latency + })) + , dcache_(Cache::Create("dcache", Cache::Config{ + log2ceil(DCACHE_SIZE), // C + log2ceil(L1_BLOCK_SIZE),// B + 2, // W + 0, // A + 32, // address bits + DCACHE_NUM_BANKS, // number of banks + DCACHE_NUM_PORTS, // number of ports + (uint8_t)arch.num_threads(), // request size + true, // write-through + false, // write response + 0, // victim size + DCACHE_MSHR_SIZE, // mshr + 4, // pipeline latency + })) + , shared_mem_(SharedMem::Create("sharedmem", SharedMem::Config{ + arch.num_threads(), + arch.num_threads(), + Constants::SMEM_BANK_OFFSET, + 1, + false + })) + , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) + , dcache_switch_(arch.num_threads()) + , fetch_latch_("fetch") + , decode_latch_("decode") + , pending_icache_(arch_.num_warps()) +{ + for (int i = 0; i < arch_.num_warps(); ++i) { + warps_.at(i) = std::make_shared(this, i); + } + + // register execute units + exe_units_.at((int)ExeType::NOP) = SimPlatform::instance().create_object(this); + exe_units_.at((int)ExeType::ALU) = SimPlatform::instance().create_object(this); + 
exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object(this); + exe_units_.at((int)ExeType::CSR) = SimPlatform::instance().create_object(this); + exe_units_.at((int)ExeType::FPU) = SimPlatform::instance().create_object(this); + exe_units_.at((int)ExeType::GPU) = SimPlatform::instance().create_object(this); + + // connect l1 switch + icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]); + dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]); + l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort); + l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort); + this->MemRspPort.bind(&l1_mem_switch_->RspIn); + l1_mem_switch_->ReqOut.bind(&this->MemReqPort); + + // lsu/tex switch + for (uint32_t i = 0, n = arch.num_threads(); i < n; ++i) { + auto& sw = dcache_switch_.at(i); +#ifdef EXT_TEX_ENABLE + sw = Switch::Create("lsu_arb", ArbiterType::Priority, 2); +#else + sw = Switch::Create("lsu_arb", ArbiterType::Priority, 1); +#endif + sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i)); + dcache_->CoreRspPorts.at(i).bind(&sw->RspIn); + } + + // memory perf callbacks + MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){ + __unused (cycle); + perf_stats_.mem_reads += !req.write; + perf_stats_.mem_writes += req.write; + perf_mem_pending_reads_ += !req.write; + }); + MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){ + __unused (cycle); + --perf_mem_pending_reads_; + }); + + this->reset(); +} + +Core::~Core() { + this->cout_flush(); +} + +void Core::reset() { + for (auto& warp : warps_) { + warp->clear(); + } + warps_.at(0)->setTmask(0, true); + active_warps_ = 1; + + for (auto& tex_unit : tex_units_) { + tex_unit.clear(); + } + + for ( auto& barrier : barriers_) { + barrier.reset(); + } + + for (auto& csr : csrs_) { + csr = 0; + } + + for (auto& fcsr : fcsrs_) { + fcsr = 0; + } + + for (auto& ibuf : ibuffers_) { + ibuf.clear(); + } + + scoreboard_.clear(); + fetch_latch_.clear(); + decode_latch_.clear(); + pending_icache_.clear(); + 
stalled_warps_.reset(); + last_schedule_wid_ = 0; + issued_instrs_ = 0; + committed_instrs_ = 0; + csr_tex_unit_ = 0; + ecall_ = false; + ebreak_ = false; + perf_mem_pending_reads_ = 0; + perf_stats_ = PerfStats(); +} + +void Core::attach_ram(RAM* ram) { + // bind RAM to memory unit + mmu_.attach(*ram, 0, 0xFFFFFFFF); +} + +void Core::cout_flush() { + for (auto& buf : print_bufs_) { + auto str = buf.second.str(); + if (!str.empty()) { + std::cout << "#" << buf.first << ": " << str << std::endl; + } + } +} + +void Core::tick() { + this->commit(); + this->execute(); + this->decode(); + this->fetch(); + this->schedule(); + + // update perf counter + perf_stats_.mem_latency += perf_mem_pending_reads_; + + DPN(2, std::flush); +} + +void Core::schedule() { + bool foundSchedule = false; + int scheduled_warp = last_schedule_wid_; + + // round robin scheduling + for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) { + scheduled_warp = (scheduled_warp + 1) % nw; + bool warp_active = active_warps_.test(scheduled_warp); + bool warp_stalled = stalled_warps_.test(scheduled_warp); + if (warp_active && !warp_stalled) { + last_schedule_wid_ = scheduled_warp; + foundSchedule = true; + break; + } + } + + if (!foundSchedule) + return; + + // suspend warp until decode + stalled_warps_.set(scheduled_warp); + + uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_; + + auto trace = new pipeline_trace_t(uuid, arch_); + + auto& warp = warps_.at(scheduled_warp); + warp->eval(trace); + + DT(3, "pipeline-schedule: " << *trace); + + // advance to fetch stage + fetch_latch_.push(trace); +} + +void Core::fetch() { + // handle icache reponse + auto& icache_rsp_port = icache_->CoreRspPorts.at(0); + if (!icache_rsp_port.empty()){ + auto& mem_rsp = icache_rsp_port.front(); + auto trace = pending_icache_.at(mem_rsp.tag); + decode_latch_.push(trace); + DT(3, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); + 
pending_icache_.release(mem_rsp.tag); + icache_rsp_port.pop(); + } + + // send icache request + if (!fetch_latch_.empty()) { + auto trace = fetch_latch_.front(); + MemReq mem_req; + mem_req.addr = trace->PC; + mem_req.write = false; + mem_req.tag = pending_icache_.allocate(trace); + mem_req.core_id = trace->cid; + mem_req.uuid = trace->uuid; + icache_->CoreReqPorts.at(0).send(mem_req, 1); + DT(3, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); + fetch_latch_.pop(); + } +} + +void Core::decode() { + if (decode_latch_.empty()) + return; + + auto trace = decode_latch_.front(); + + // check ibuffer capacity + auto& ibuffer = ibuffers_.at(trace->wid); + if (ibuffer.full()) { + if (!trace->suspend()) { + DT(3, "*** ibuffer-stall: " << *trace); + } + ++perf_stats_.ibuf_stalls; + return; + } else { + trace->resume(); + } + + // release warp + if (!trace->fetch_stall) { + stalled_warps_.reset(trace->wid); + } + + // update perf counters + uint32_t active_threads = trace->tmask.count(); + if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::LOAD) + perf_stats_.loads += active_threads; + if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::STORE) + perf_stats_.stores += active_threads; + if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH) + perf_stats_.branches += active_threads; + + DT(3, "pipeline-decode: " << *trace); + + // insert to ibuffer + ibuffer.push(trace); + + decode_latch_.pop(); +} + +void Core::execute() { + // issue ibuffer instructions + for (auto& ibuffer : ibuffers_) { + if (ibuffer.empty()) + continue; + + auto trace = ibuffer.top(); + + // check scoreboard + if (scoreboard_.in_use(trace)) { + if (!trace->suspend()) { + DTH(3, "*** scoreboard-stall: dependents={"); + auto uses = scoreboard_.get_uses(trace); + for (uint32_t i = 0, n = uses.size(); i < n; ++i) { + auto& use = uses.at(i); + __unused (use); + if (i) DTN(3, ", "); + DTN(3, use.type << use.reg << 
"(#" << use.owner << ")"); + } + DTN(3, "}, " << *trace << std::endl); + } + ++perf_stats_.scrb_stalls; + continue; + } else { + trace->resume(); + } + + // update scoreboard + scoreboard_.reserve(trace); + + DT(3, "pipeline-issue: " << *trace); + + // push to execute units + auto& exe_unit = exe_units_.at((int)trace->exe_type); + exe_unit->Input.send(trace, 1); + + ibuffer.pop(); + break; + } +} + +void Core::commit() { + // commit completed instructions + bool wb = false; + for (auto& exe_unit : exe_units_) { + if (!exe_unit->Output.empty()) { + auto trace = exe_unit->Output.front(); + + // allow only one commit that updates registers + if (trace->wb && wb) + continue; + wb |= trace->wb; + + // advance to commit stage + DT(3, "pipeline-commit: " << *trace); + + // update scoreboard + scoreboard_.release(trace); + + assert(committed_instrs_ <= issued_instrs_); + ++committed_instrs_; + + perf_stats_.instrs += trace->tmask.count(); + + // delete the trace + delete trace; + + exe_unit->Output.pop(); + } + } +} + +WarpMask Core::wspawn(int num_warps, int nextPC) { + WarpMask ret(1); + int active_warps = std::min(num_warps, arch_.num_warps()); + DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC); + for (int i = 1; i < active_warps; ++i) { + auto warp = warps_.at(i); + warp->setPC(nextPC); + warp->setTmask(0, true); + ret.set(i); + } + return std::move(ret); +} + +WarpMask Core::barrier(int bar_id, int count, int warp_id) { + WarpMask ret(0); + auto& barrier = barriers_.at(bar_id); + barrier.set(warp_id); + if (barrier.count() < (size_t)count) { + warps_.at(warp_id)->suspend(); + DP(3, "*** Suspend warp #" << warp_id << " at barrier #" << bar_id); + return std::move(ret); + } + for (int i = 0; i < arch_.num_warps(); ++i) { + if (barrier.test(i)) { + DP(3, "*** Resume warp #" << i << " at barrier #" << bar_id); + warps_.at(i)->activate(); + ret.set(i); + } + } + barrier.reset(); + return std::move(ret); +} + +Word Core::icache_read(Addr 
addr, Size size) { + Word data; + mmu_.read(&data, addr, size, 0); + return data; +} + +Word Core::dcache_read(Addr addr, Size size) { + Word data; + auto type = get_addr_type(addr, size); + if (type == AddrType::Shared) { + smem_.read(&data, addr & (SMEM_SIZE-1), size); + } else { + mmu_.read(&data, addr, size, 0); + } + return data; +} + +void Core::dcache_write(Addr addr, Word data, Size size) { + if (addr >= IO_COUT_ADDR + && addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { + this->writeToStdOut(addr, data); + } else { + auto type = get_addr_type(addr, size); + if (type == AddrType::Shared) { + smem_.write(&data, addr & (SMEM_SIZE-1), size); + } else { + mmu_.write(&data, addr, size, 0); + } + } +} + +Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector* mem_addrs) { + return tex_units_.at(unit).read(u, v, lod, mem_addrs); +} + +void Core::writeToStdOut(Addr addr, Word data) { + uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1); + auto& ss_buf = print_bufs_[tid]; + char c = (char)data; + ss_buf << c; + if (c == '\n') { + std::cout << std::dec << "#" << tid << ": " << ss_buf.str() << std::flush; + ss_buf.str(""); + } +} + +Word Core::get_csr(Addr addr, int tid, int wid) { + switch (addr) { + case CSR_SATP: + case CSR_PMPCFG0: + case CSR_PMPADDR0: + case CSR_MSTATUS: + case CSR_MISA: + case CSR_MEDELEG: + case CSR_MIDELEG: + case CSR_MIE: + case CSR_MTVEC: + case CSR_MEPC: + return 0; + + case CSR_FFLAGS: + return fcsrs_.at(wid) & 0x1F; + case CSR_FRM: + return (fcsrs_.at(wid) >> 5); + case CSR_FCSR: + return fcsrs_.at(wid); + case CSR_WTID: + // Warp threadID + return tid; + case CSR_LTID: + // Core threadID + return tid + (wid * arch_.num_threads()); + case CSR_GTID: + // Processor threadID + return tid + (wid * arch_.num_threads()) + + (arch_.num_threads() * arch_.num_warps() * id_); + case CSR_LWID: + // Core warpID + return wid; + case CSR_GWID: + // Processor warpID + return wid + (arch_.num_warps() * id_); + case CSR_GCID: + // 
Processor coreID + return id_; + case CSR_TMASK: + // Processor coreID + return warps_.at(wid)->getTmask(); + case CSR_NT: + // Number of threads per warp + return arch_.num_threads(); + case CSR_NW: + // Number of warps per core + return arch_.num_warps(); + case CSR_NC: + // Number of cores + return arch_.num_cores(); + case CSR_MINSTRET: + // NumInsts + return perf_stats_.instrs & 0xffffffff; + case CSR_MINSTRET_H: + // NumInsts + return (Word)(perf_stats_.instrs >> 32); + case CSR_MCYCLE: + // NumCycles + return (Word)SimPlatform::instance().cycles(); + case CSR_MCYCLE_H: + // NumCycles + return (Word)(SimPlatform::instance().cycles() >> 32); + case CSR_MPM_IBUF_ST: + return perf_stats_.ibuf_stalls & 0xffffffff; + case CSR_MPM_IBUF_ST_H: + return perf_stats_.ibuf_stalls >> 32; + case CSR_MPM_SCRB_ST: + return perf_stats_.scrb_stalls & 0xffffffff; + case CSR_MPM_SCRB_ST_H: + return perf_stats_.scrb_stalls >> 32; + case CSR_MPM_ALU_ST: + return perf_stats_.alu_stalls & 0xffffffff; + case CSR_MPM_ALU_ST_H: + return perf_stats_.alu_stalls >> 32; + case CSR_MPM_LSU_ST: + return perf_stats_.lsu_stalls & 0xffffffff; + case CSR_MPM_LSU_ST_H: + return perf_stats_.lsu_stalls >> 32; + case CSR_MPM_CSR_ST: + return perf_stats_.csr_stalls & 0xffffffff; + case CSR_MPM_CSR_ST_H: + return perf_stats_.csr_stalls >> 32; + case CSR_MPM_FPU_ST: + return perf_stats_.fpu_stalls & 0xffffffff; + case CSR_MPM_FPU_ST_H: + return perf_stats_.fpu_stalls >> 32; + case CSR_MPM_GPU_ST: + return perf_stats_.gpu_stalls & 0xffffffff; + case CSR_MPM_GPU_ST_H: + return perf_stats_.gpu_stalls >> 32; + + case CSR_MPM_LOADS: + return perf_stats_.loads & 0xffffffff; + case CSR_MPM_LOADS_H: + return perf_stats_.loads >> 32; + case CSR_MPM_STORES: + return perf_stats_.stores & 0xffffffff; + case CSR_MPM_STORES_H: + return perf_stats_.stores >> 32; + case CSR_MPM_BRANCHES: + return perf_stats_.branches & 0xffffffff; + case CSR_MPM_BRANCHES_H: + return perf_stats_.branches >> 32; + + case 
CSR_MPM_ICACHE_READS: + return icache_->perf_stats().reads & 0xffffffff; + case CSR_MPM_ICACHE_READS_H: + return icache_->perf_stats().reads >> 32; + case CSR_MPM_ICACHE_MISS_R: + return icache_->perf_stats().read_misses & 0xffffffff; + case CSR_MPM_ICACHE_MISS_R_H: + return icache_->perf_stats().read_misses >> 32; + + case CSR_MPM_DCACHE_READS: + return dcache_->perf_stats().reads & 0xffffffff; + case CSR_MPM_DCACHE_READS_H: + return dcache_->perf_stats().reads >> 32; + case CSR_MPM_DCACHE_WRITES: + return dcache_->perf_stats().writes & 0xffffffff; + case CSR_MPM_DCACHE_WRITES_H: + return dcache_->perf_stats().writes >> 32; + case CSR_MPM_DCACHE_MISS_R: + return dcache_->perf_stats().read_misses & 0xffffffff; + case CSR_MPM_DCACHE_MISS_R_H: + return dcache_->perf_stats().read_misses >> 32; + case CSR_MPM_DCACHE_MISS_W: + return dcache_->perf_stats().write_misses & 0xffffffff; + case CSR_MPM_DCACHE_MISS_W_H: + return dcache_->perf_stats().write_misses >> 32; + case CSR_MPM_DCACHE_BANK_ST: + return dcache_->perf_stats().bank_stalls & 0xffffffff; + case CSR_MPM_DCACHE_BANK_ST_H: + return dcache_->perf_stats().bank_stalls >> 32; + case CSR_MPM_DCACHE_MSHR_ST: + return dcache_->perf_stats().mshr_stalls & 0xffffffff; + case CSR_MPM_DCACHE_MSHR_ST_H: + return dcache_->perf_stats().mshr_stalls >> 32; + + case CSR_MPM_SMEM_READS: + return shared_mem_->perf_stats().reads & 0xffffffff; + case CSR_MPM_SMEM_READS_H: + return shared_mem_->perf_stats().reads >> 32; + case CSR_MPM_SMEM_WRITES: + return shared_mem_->perf_stats().writes & 0xffffffff; + case CSR_MPM_SMEM_WRITES_H: + return shared_mem_->perf_stats().writes >> 32; + case CSR_MPM_SMEM_BANK_ST: + return shared_mem_->perf_stats().bank_stalls & 0xffffffff; + case CSR_MPM_SMEM_BANK_ST_H: + return shared_mem_->perf_stats().bank_stalls >> 32; + + case CSR_MPM_MEM_READS: + return perf_stats_.mem_reads & 0xffffffff; + case CSR_MPM_MEM_READS_H: + return perf_stats_.mem_reads >> 32; + case CSR_MPM_MEM_WRITES: + return 
perf_stats_.mem_writes & 0xffffffff; + case CSR_MPM_MEM_WRITES_H: + return perf_stats_.mem_writes >> 32; + case CSR_MPM_MEM_LAT: + return perf_stats_.mem_latency & 0xffffffff; + case CSR_MPM_MEM_LAT_H: + return perf_stats_.mem_latency >> 32; + +#ifdef EXT_TEX_ENABLE + case CSR_MPM_TEX_READS: + return perf_stats_.tex_reads & 0xffffffff; + case CSR_MPM_TEX_READS_H: + return perf_stats_.tex_reads >> 32; + case CSR_MPM_TEX_LAT: + return perf_stats_.tex_latency & 0xffffffff; + case CSR_MPM_TEX_LAT_H: + return perf_stats_.tex_latency >> 32; +#endif + default: + if ((addr >= CSR_MPM_BASE && addr < (CSR_MPM_BASE + 32)) + || (addr >= CSR_MPM_BASE_H && addr < (CSR_MPM_BASE_H + 32))) { + // user-defined MPM CSRs + } else + #ifdef EXT_TEX_ENABLE + if (addr == CSR_TEX_UNIT) { + return csr_tex_unit_; + } else + if (addr >= CSR_TEX_STATE_BEGIN + && addr < CSR_TEX_STATE_END) { + uint32_t state = CSR_TEX_STATE(addr); + return tex_units_.at(csr_tex_unit_).get_state(state); + } else + #endif + { + std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl; + std::abort(); + } + } + return 0; +} + +void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) { + if (addr == CSR_FFLAGS) { + fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F); + } else if (addr == CSR_FRM) { + fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5); + } else if (addr == CSR_FCSR) { + fcsrs_.at(wid) = value & 0xff; + } else +#ifdef EXT_TEX_ENABLE + if (addr == CSR_TEX_UNIT) { + csr_tex_unit_ = value; + } else + if (addr >= CSR_TEX_STATE_BEGIN + && addr < CSR_TEX_STATE_END) { + uint32_t state = CSR_TEX_STATE(addr); + tex_units_.at(csr_tex_unit_).set_state(state, value); + return; + } else +#endif + { + csrs_.at(addr) = value; + } +} + +void Core::trigger_ecall() { + ecall_ = true; +} + +void Core::trigger_ebreak() { + ebreak_ = true; +} + +bool Core::check_exit() const { + return ebreak_ || ecall_; +} + +bool Core::running() const { + bool is_running = (committed_instrs_ != 
// Per-core event counters; every counter starts at zero.
struct PerfStats {
  uint64_t instrs      = 0;
  uint64_t ibuf_stalls = 0;
  uint64_t scrb_stalls = 0;
  uint64_t alu_stalls  = 0;
  uint64_t lsu_stalls  = 0;
  uint64_t csr_stalls  = 0;
  uint64_t fpu_stalls  = 0;
  uint64_t gpu_stalls  = 0;
  uint64_t loads       = 0;
  uint64_t stores      = 0;
  uint64_t branches    = 0;
  uint64_t mem_reads   = 0;
  uint64_t mem_writes  = 0;
  uint64_t mem_latency = 0;
  uint64_t tex_reads   = 0;
  uint64_t tex_latency = 0;
};
int nextPC); + + WarpMask barrier(int bar_id, int count, int warp_id); + + Word icache_read(Addr, Size); + + Word dcache_read(Addr, Size); + + void dcache_write(Addr, Word, Size); + + Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector* mem_addrs); /* NOTE(review): the call in Warp::execute passes (unit, u, v, lod, mem_addrs) in that order; confirm these parameter names against the definition */ + + void trigger_ecall(); + + void trigger_ebreak(); + + bool check_exit() const; + +private: + + void schedule(); + void fetch(); + void decode(); + void execute(); + void commit(); + + void writeToStdOut(Addr addr, Word data); + + void cout_flush(); + + Word id_; + const ArchDef arch_; + const Decoder decoder_; + MemoryUnit mmu_; + RAM smem_; + std::vector tex_units_; + + std::vector> warps_; + std::vector barriers_; + std::vector csrs_; + std::vector fcsrs_; + std::vector ibuffers_; + Scoreboard scoreboard_; + std::vector exe_units_; + Cache::Ptr icache_; + Cache::Ptr dcache_; + SharedMem::Ptr shared_mem_; + Switch::Ptr l1_mem_switch_; + std::vector::Ptr> dcache_switch_; + + PipelineLatch fetch_latch_; + PipelineLatch decode_latch_; + + HashTable pending_icache_; + WarpMask active_warps_; + WarpMask stalled_warps_; + uint32_t last_schedule_wid_; + uint64_t issued_instrs_; + uint64_t committed_instrs_; + uint32_t csr_tex_unit_; + bool ecall_; + bool ebreak_; + + std::unordered_map print_bufs_; + + PerfStats perf_stats_; + uint64_t perf_mem_pending_reads_; + + friend class LsuUnit; + friend class AluUnit; + friend class CsrUnit; + friend class FpuUnit; + friend class GpuUnit; +}; + +} // namespace vortex \ No newline at end of file diff --git a/sim/simx/debug.h b/sim/simx/debug.h new file mode 100644 index 00000000..688eded4 --- /dev/null +++ b/sim/simx/debug.h @@ -0,0 +1,65 @@ +#pragma once + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 3 +#endif + +#define DEBUG_HEADER << "DEBUG " +//#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " + +#define TRACE_HEADER << "TRACE " +//#define TRACE_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " + +#ifndef NDEBUG + 
+#include +#include + +#define DP(lvl, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout DEBUG_HEADER << x << std::endl; \ + } \ +} while(0) + +#define DPH(lvl, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout DEBUG_HEADER << x; \ + } \ +} while(0) + +#define DPN(lvl, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout << x; \ + } \ +} while(0) + +#define DT(lvl, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x << std::endl; \ + } \ +} while(0) + +#define DTH(lvl, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x; \ + } \ +} while(0) + +#define DTN(lvl, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout << x; \ + } \ +} while(0) + + +#else + +#define DP(lvl, x) do {} while(0) +#define DPH(lvl, x) do {} while(0) +#define DPN(lvl, x) do {} while(0) + +#define DT(lvl, x) do {} while(0) +#define DTH(lvl, x) do {} while(0) +#define DTN(lvl, x) do {} while(0) + +#endif \ No newline at end of file diff --git a/sim/simX/decode.cpp b/sim/simx/decode.cpp similarity index 78% rename from sim/simX/decode.cpp rename to sim/simx/decode.cpp index dbc7115a..86e30266 100644 --- a/sim/simX/decode.cpp +++ b/sim/simx/decode.cpp @@ -41,14 +41,18 @@ static const std::unordered_map sc_instTable = { {Opcode::FMNMSUB, {false, InstType::R4_TYPE}}, {Opcode::VSET, {false, InstType::V_TYPE}}, {Opcode::GPGPU, {false, InstType::R_TYPE}}, + {Opcode::GPU, {false, InstType::R4_TYPE}}, }; -static const char* op_string(const Instr &instr) { - Word func3 = instr.getFunc3(); - Word func7 = instr.getFunc7(); - Word rs2 = instr.getRSrc(1); - Word imm = instr.getImm(); - switch (instr.getOpcode()) { +static const char* op_string(const Instr &instr) { + auto opcode = instr.getOpcode(); + Word func2 = instr.getFunc2(); + Word func3 = instr.getFunc3(); + Word func7 = 
instr.getFunc7(); + Word rs2 = instr.getRSrc(1); + Word imm = instr.getImm(); + + switch (opcode) { case Opcode::NOP: return "NOP"; case Opcode::LUI_INST: return "LUI"; case Opcode::AUIPC_INST: return "AUIPC"; @@ -120,7 +124,16 @@ static const char* op_string(const Instr &instr) { } case Opcode::SYS_INST: switch (func3) { - case 0: return imm ? "EBREAK" : "ECALL"; + case 0: + switch (imm) { + case 0x000: return "ECALL"; + case 0x001: return "EBREAK"; + case 0x002: return "URET"; + case 0x102: return "SRET"; + case 0x302: return "MRET"; + default: + std::abort(); + } case 1: return "CSRRW"; case 2: return "CSRRS"; case 3: return "CSRRC"; @@ -181,67 +194,63 @@ static const char* op_string(const Instr &instr) { case 1: return "WSPAWN"; case 2: return "SPLIT"; case 3: return "JOIN"; - case 4: return "BAR"; - case 6: return "PREFETCH"; + case 4: return "BAR"; + case 5: return "PREFETCH"; + default: + std::abort(); + } + case Opcode::GPU: + switch (func3) { + case 0: return "TEX"; + case 1: { + switch (func2) { + case 0: return "CMOV"; + default: + std::abort(); + } + } default: std::abort(); } default: std::abort(); - } + } } namespace vortex { -std::ostream &operator<<(std::ostream &os, const Instr &instr) { - os << op_string(instr) << ": "; - auto opcode = instr.getOpcode(); - - auto rd_to_string = [&]() { - int rdt = instr.getRDType(); - int rd = instr.getRDest(); - switch (rdt) { - case 1: os << "r" << std::dec << rd << " <- "; break; - case 2: os << "fr" << std::dec << rd << " <- "; break; - case 3: os << "vr" << std::dec << rd << " <- "; break; - default: break; - } - }; +std::ostream &operator<<(std::ostream &os, const Instr &instr) { + auto opcode = instr.getOpcode(); + Word func2 = instr.getFunc2(); + Word func3 = instr.getFunc3(); - auto rs_to_string = [&](int i) { - int rst = instr.getRSType(i); - int rs = instr.getRSrc(i); - switch (rst) { - case 1: os << "r" << std::dec << rs; break; - case 2: os << "fr" << std::dec << rs; break; - case 3: os << "vr" << 
std::dec << rs; break; - default: break; - } - }; + os << op_string(instr) << ": "; if (opcode == S_INST - || opcode == FS - || opcode == VS) { + || opcode == FS) { os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- "; - rs_to_string(1); + os << instr.getRSType(1) << std::dec << instr.getRSrc(1); } else if (opcode == L_INST - || opcode == FL - || opcode == VL) { - rd_to_string(); + || opcode == FL) { + os << instr.getRDType() << std::dec << instr.getRDest() << " <- "; os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]"; } else { - rd_to_string(); + if (instr.getRDType() != RegType::None) { + os << instr.getRDType() << std::dec << instr.getRDest() << " <- "; + } int i = 0; for (; i < instr.getNRSrc(); ++i) { if (i) os << ", "; - rs_to_string(i); + os << instr.getRSType(i) << std::dec << instr.getRSrc(i); } if (instr.hasImm()) { if (i) os << ", "; os << "imm=0x" << std::hex << instr.getImm(); } - } - + if (opcode == GPU && func3 == 0) { + os << ", unit=" << std::dec << func2; + } + } return os; } } @@ -260,6 +269,7 @@ Decoder::Decoder(const ArchDef &arch) { shift_func3_ = shift_rd_ + reg_s_; shift_rs1_ = shift_func3_ + func3_s_; shift_rs2_ = shift_rs1_ + reg_s_; + shift_func2_ = shift_rs2_ + reg_s_; shift_func7_ = shift_rs2_ + reg_s_; shift_rs3_ = shift_func7_ + func2_s_; shift_vmop_ = shift_func7_ + vmask_s_; @@ -268,7 +278,7 @@ Decoder::Decoder(const ArchDef &arch) { shift_vset_ = shift_func7_ + 6; reg_mask_ = 0x1f; - func2_mask_ = 0x2; + func2_mask_ = 0x3; func3_mask_ = 0x7; func6_mask_ = 0x3f; func7_mask_ = 0x7f; @@ -281,11 +291,12 @@ Decoder::Decoder(const ArchDef &arch) { v_imm_mask_ = 0x7ff; } -std::shared_ptr Decoder::decode(Word code, Word PC) { +std::shared_ptr Decoder::decode(Word code) const { auto instr = std::make_shared(); Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_); instr->setOpcode(op); + Word func2 = (code >> shift_func2_) & func2_mask_; Word func3 = 
(code >> shift_func3_) & func3_mask_; Word func6 = (code >> shift_func6_) & func6_mask_; Word func7 = (code >> shift_func7_) & func7_mask_; @@ -297,8 +308,8 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { auto op_it = sc_instTable.find(op); if (op_it == sc_instTable.end()) { - std::cout << std::hex << "invalid opcode: 0x" << op << ", instruction=0x" << code << ", PC=" << PC << std::endl; - std::abort(); + std::cout << std::hex << "Error: invalid opcode: 0x" << op << std::endl; + return nullptr; } auto iType = op_it->second.iType; @@ -349,14 +360,28 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { instr->setDestReg(rd); } instr->setFunc3(func3); - instr->setFunc7(func7); - if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) { - instr->setImm(signExt(rs2, 5, reg_mask_)); - } else { - instr->setImm(signExt(code >> shift_rs2_, 12, i_imm_mask_)); + instr->setFunc7(func7); + switch (op) { + case Opcode::SYS_INST: + case Opcode::FENCE: + // uint12 + instr->setImm(code >> shift_rs2_); + break; + case Opcode::I_INST: + if (func3 == 0x1 || func3 == 0x5) { + // uint5 shift amount: must stay unsigned, a sign-extended value of 16..31 would become a shift count of 32 or more + instr->setImm(rs2 & reg_mask_); + } else { + // int12 + instr->setImm(sext32(code >> shift_rs2_, 12)); + } + break; + default: + // int12 + instr->setImm(sext32(code >> shift_rs2_, 12)); + break; } } break; - case InstType::S_TYPE: { instr->setSrcReg(rs1); if (op == Opcode::FS) { @@ -365,8 +390,8 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { instr->setSrcReg(rs2); } instr->setFunc3(func3); - Word imeed = (func7 << reg_s_) | rd; - instr->setImm(signExt(imeed, 12, s_imm_mask_)); + Word imm = (func7 << reg_s_) | rd; + instr->setImm(sext32(imm, 12)); } break; case InstType::B_TYPE: { @@ -377,13 +402,13 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { Word bits_4_1 = rd >> 1; Word bit_10_5 = func7 & 0x3f; Word bit_12 = func7 >> 6; - Word imeed = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12); - instr->setImm(signExt(imeed, 13, b_imm_mask_)); + Word imm = 
(bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12); + instr->setImm(sext32(imm, 13)); } break; case InstType::U_TYPE: instr->setDestReg(rd); - instr->setImm(signExt(code >> shift_func3_, 20, u_imm_mask_)); + instr->setImm(sext32(code >> shift_func3_, 20)); break; case InstType::J_TYPE: { @@ -393,11 +418,11 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { Word bit_11 = (unordered >> 8) & 0x1; Word bits_10_1 = (unordered >> 9) & 0x3ff; Word bit_20 = (unordered >> 19) & 0x1; - Word imeed = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20); + Word imm = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20); if (bit_20) { - imeed |= ~j_imm_mask_; + imm |= ~j_imm_mask_; } - instr->setImm(imeed); + instr->setImm(imm); } break; case InstType::V_TYPE: @@ -424,7 +449,7 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { } } break; - case Opcode::VL: + case Opcode::FL: instr->setDestVReg(rd); instr->setSrcVReg(rs1); instr->setVlsWidth(func3); @@ -434,7 +459,7 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { instr->setVnf((code >> shift_vnf_) & func3_mask_); break; - case Opcode::VS: + case Opcode::FS: instr->setVs3(rd); instr->setSrcVReg(rs1); instr->setVlsWidth(func3); @@ -449,17 +474,23 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { } break; case R4_TYPE: - instr->setDestFReg(rd); - instr->setSrcFReg(rs1); - instr->setSrcFReg(rs2); - instr->setSrcFReg(rs3); + if (op == Opcode::GPU) { + instr->setDestReg(rd); + instr->setSrcReg(rs1); + instr->setSrcReg(rs2); + instr->setSrcReg(rs3); + } else { + instr->setDestFReg(rd); + instr->setSrcFReg(rs1); + instr->setSrcFReg(rs2); + instr->setSrcFReg(rs3); + } + instr->setFunc2(func2); instr->setFunc3(func3); break; default: std::abort(); } - D(2, "Instr 0x" << std::hex << code << ": " << *instr << std::flush); - return instr; } diff --git a/sim/simX/decode.h b/sim/simx/decode.h similarity index 94% rename from sim/simX/decode.h rename to 
sim/simx/decode.h index f8f3909c..e481cb28 100644 --- a/sim/simX/decode.h +++ b/sim/simx/decode.h @@ -13,7 +13,7 @@ class Decoder { public: Decoder(const ArchDef &); - std::shared_ptr decode(Word code, Word PC); + std::shared_ptr decode(Word code) const; private: diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp new file mode 100644 index 00000000..5df72c6f --- /dev/null +++ b/sim/simx/execute.cpp @@ -0,0 +1,1840 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "warp.h" +#include "instr.h" +#include "core.h" + +using namespace vortex; + +static bool HasDivergentThreads(const ThreadMask &thread_mask, + const std::vector> ®_file, + unsigned reg) { + bool cond; + size_t thread_idx = 0; + size_t num_threads = reg_file.size(); + for (; thread_idx < num_threads; ++thread_idx) { + if (thread_mask[thread_idx]) { + cond = bool(reg_file[thread_idx][reg]); + break; + } + } + assert(thread_idx != num_threads); + for (; thread_idx < num_threads; ++thread_idx) { + if (thread_mask[thread_idx]) { + if (cond != (bool(reg_file[thread_idx][reg]))) { + return true; + } + } + } + return false; +} + +inline uint32_t get_fpu_rm(uint32_t func3, Core* core, uint32_t tid, uint32_t wid) { + return (func3 == 0x7) ? 
core->get_csr(CSR_FRM, tid, wid) : func3; +} + +inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid) { + if (fflags) { + core->set_csr(CSR_FCSR, core->get_csr(CSR_FCSR, tid, wid) | fflags, tid, wid); + core->set_csr(CSR_FFLAGS, core->get_csr(CSR_FFLAGS, tid, wid) | fflags, tid, wid); + } +} + +void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { + assert(tmask_.any()); + + Word nextPC = PC_ + core_->arch().wsize(); + + Word func2 = instr.getFunc2(); + Word func3 = instr.getFunc3(); + Word func6 = instr.getFunc6(); + Word func7 = instr.getFunc7(); + + auto opcode = instr.getOpcode(); + int rdest = instr.getRDest(); + int rsrc0 = instr.getRSrc(0); + int rsrc1 = instr.getRSrc(1); + int rsrc2 = instr.getRSrc(2); + Word immsrc = instr.getImm(); + Word vmask = instr.getVmask(); + + int num_threads = core_->arch().num_threads(); + + std::vector rsdata(num_threads); + std::vector rddata(num_threads); + + int num_rsrcs = instr.getNRSrc(); + if (num_rsrcs) { + for (int i = 0; i < num_rsrcs; ++i) { + DPH(2, "Src Reg [" << std::dec << i << "]: "); + auto type = instr.getRSType(i); + int reg = instr.getRSrc(i); + switch (type) { + case RegType::Integer: + DPN(2, "r" << std::dec << reg << "={"); + for (int t = 0; t < num_threads; ++t) { + if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + rsdata[t][i] = ireg_file_.at(t)[reg]; + DPN(2, std::hex << rsdata[t][i]); + } + DPN(2, "}" << std::endl); + break; + case RegType::Float: + DPN(2, "fr" << std::dec << reg << "={"); + for (int t = 0; t < num_threads; ++t) { + if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + rsdata[t][i] = freg_file_.at(t)[reg]; + DPN(2, std::hex << rsdata[t][i]); + } + DPN(2, "}" << std::endl); + break; + default: + std::abort(); + break; + } + } + } + + bool rd_write = false; + + switch (opcode) { + case NOP: + break; + case LUI_INST: + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; + for (int 
t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = (immsrc << 12) & 0xfffff000; + } + rd_write = true; + break; + case AUIPC_INST: + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = ((immsrc << 12) & 0xfffff000) + PC_; + } + rd_write = true; + break; + case R_INST: + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + if (func7 & 0x1) { + switch (func3) { + case 0: + // MUL + rddata[t] = ((WordI)rsdata[t][0]) * ((WordI)rsdata[t][1]); + trace->alu.type = AluType::IMUL; + break; + case 1: { + // MULH + int64_t first = (int64_t)rsdata[t][0]; + if (rsdata[t][0] & 0x80000000) { + first = first | 0xFFFFFFFF00000000; + } + int64_t second = (int64_t)rsdata[t][1]; + if (rsdata[t][1] & 0x80000000) { + second = second | 0xFFFFFFFF00000000; + } + uint64_t result = first * second; + rddata[t] = (result >> 32) & 0xFFFFFFFF; + trace->alu.type = AluType::IMUL; + } break; + case 2: { + // MULHSU + int64_t first = (int64_t)rsdata[t][0]; + if (rsdata[t][0] & 0x80000000) { + first = first | 0xFFFFFFFF00000000; + } + int64_t second = (int64_t)rsdata[t][1]; + rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; + trace->alu.type = AluType::IMUL; + } break; + case 3: { + // MULHU + uint64_t first = (uint64_t)rsdata[t][0]; + uint64_t second = (uint64_t)rsdata[t][1]; + rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; + trace->alu.type = AluType::IMUL; + } break; + case 4: { + // DIV + WordI dividen = rsdata[t][0]; + WordI divisor = rsdata[t][1]; + if (divisor == 0) { + rddata[t] = -1; + } else if (dividen == WordI(0x80000000) && divisor == WordI(0xffffffff)) { + rddata[t] = dividen; + } else { + rddata[t] = dividen / divisor; + } + trace->alu.type = AluType::IDIV; + } break; + case 5: { + // DIVU 
+ Word dividen = rsdata[t][0]; + Word divisor = rsdata[t][1]; + if (divisor == 0) { + rddata[t] = -1; + } else { + rddata[t] = dividen / divisor; + } + trace->alu.type = AluType::IDIV; + } break; + case 6: { + // REM + WordI dividen = rsdata[t][0]; + WordI divisor = rsdata[t][1]; + if (rsdata[t][1] == 0) { + rddata[t] = dividen; + } else if (dividen == WordI(0x80000000) && divisor == WordI(0xffffffff)) { + rddata[t] = 0; + } else { + rddata[t] = dividen % divisor; + } + trace->alu.type = AluType::IDIV; + } break; + case 7: { + // REMU + Word dividen = rsdata[t][0]; + Word divisor = rsdata[t][1]; + if (rsdata[t][1] == 0) { + rddata[t] = dividen; + } else { + rddata[t] = dividen % divisor; + } + trace->alu.type = AluType::IDIV; + } break; + default: + std::abort(); + } + } else { + switch (func3) { + case 0: + if (func7) { + // SUB + rddata[t] = rsdata[t][0] - rsdata[t][1]; + } else { + // ADD + rddata[t] = rsdata[t][0] + rsdata[t][1]; + } + break; + case 1: + // SHL + rddata[t] = rsdata[t][0] << rsdata[t][1]; + break; + case 2: + // LT + rddata[t] = (WordI(rsdata[t][0]) < WordI(rsdata[t][1])); + break; + case 3: + // LTU + rddata[t] = (Word(rsdata[t][0]) < Word(rsdata[t][1])); + break; + case 4: + // XOR + rddata[t] = rsdata[t][0] ^ rsdata[t][1]; + break; + case 5: + if (func7) { + // SRA + rddata[t] = WordI(rsdata[t][0]) >> WordI(rsdata[t][1]); + } else { + // SHR + rddata[t] = Word(rsdata[t][0]) >> Word(rsdata[t][1]); + } + break; + case 6: + // OR + rddata[t] = rsdata[t][0] | rsdata[t][1]; + break; + case 7: + // AND + rddata[t] = rsdata[t][0] & rsdata[t][1]; + break; + default: + std::abort(); + } + } + } + rd_write = true; + break; + case I_INST: + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; + trace->used_iregs.set(rsrc0); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + switch (func3) { + case 0: + // ADDI + rddata[t] = rsdata[t][0] + immsrc; + break; + case 1: + // SLLI + rddata[t] = rsdata[t][0] << 
immsrc; + break; + case 2: + // SLTI + rddata[t] = (WordI(rsdata[t][0]) < WordI(immsrc)); + break; + case 3: { + // SLTIU + rddata[t] = (Word(rsdata[t][0]) < Word(immsrc)); + } break; + case 4: + // XORI + rddata[t] = rsdata[t][0] ^ immsrc; + break; + case 5: + if (func7) { + // SRAI + Word result = WordI(rsdata[t][0]) >> immsrc; + rddata[t] = result; + } else { + // SRLI + Word result = Word(rsdata[t][0]) >> immsrc; + rddata[t] = result; + } + break; + case 6: + // ORI + rddata[t] = rsdata[t][0] | immsrc; + break; + case 7: + // ANDI + rddata[t] = rsdata[t][0] & immsrc; + break; + } + } + rd_write = true; + break; + case B_INST: + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::BRANCH; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + switch (func3) { + case 0: + // BEQ + if (rsdata[t][0] == rsdata[t][1]) { + nextPC = PC_ + immsrc; + } + break; + case 1: + // BNE + if (rsdata[t][0] != rsdata[t][1]) { + nextPC = PC_ + immsrc; + } + break; + case 4: + // BLT + if (WordI(rsdata[t][0]) < WordI(rsdata[t][1])) { + nextPC = PC_ + immsrc; + } + break; + case 5: + // BGE + if (WordI(rsdata[t][0]) >= WordI(rsdata[t][1])) { + nextPC = PC_ + immsrc; + } + break; + case 6: + // BLTU + if (Word(rsdata[t][0]) < Word(rsdata[t][1])) { + nextPC = PC_ + immsrc; + } + break; + case 7: + // BGEU + if (Word(rsdata[t][0]) >= Word(rsdata[t][1])) { + nextPC = PC_ + immsrc; + } + break; + default: + std::abort(); + } + break; // runonce + } + trace->fetch_stall = true; + break; + case JAL_INST: + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::BRANCH; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = nextPC; + nextPC = PC_ + immsrc; + trace->fetch_stall = true; + break; // runonce + } + rd_write = true; + break; + case JALR_INST: + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::BRANCH; + trace->used_iregs.set(rsrc0); + 
for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = nextPC; + nextPC = rsdata[t][0] + immsrc; + trace->fetch_stall = true; + break; // runOnce + } + rd_write = true; + break; + case L_INST: + case FL: + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::LOAD; + trace->used_iregs.set(rsrc0); + if (opcode == L_INST + || (opcode == FL && func3 == 2)) { + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word mem_addr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned + Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; + Word data_read = core_->dcache_read(mem_addr, 4); + trace->mem_addrs.at(t).push_back({mem_addr, 4}); + DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << data_read); + switch (func3) { + case 0: + // LBI + rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8); + break; + case 1: + // LHI + rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16); + break; + case 2: + // LW + rddata[t] = data_read; + break; + case 4: + // LBU + rddata[t] = Word((data_read >> shift_by) & 0xFF); + break; + case 5: + // LHU + rddata[t] = Word((data_read >> shift_by) & 0xFFFF); + break; + default: + std::abort(); + } + } + } else { + DP(4, "Executing vector load"); + DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); + DP(4, "dest: v" << rdest); + DP(4, "width" << instr.getVlsWidth()); + auto &vd = vreg_file_.at(rdest); + switch (instr.getVlsWidth()) { + case 6: { + // load word and unit strided (not checking for unit stride) + for (int i = 0; i < vl_; i++) { + Word mem_addr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); + DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << mem_addr); + Word data_read = core_->dcache_read(mem_addr, 4); + DP(4, "Mem addr: " << std::hex << mem_addr << " Data read " << data_read); + int *result_ptr = (int *)(vd.data() + i); + *result_ptr = data_read; + } + } break; + default: + 
std::abort(); + } + } + rd_write = true; + break; + case S_INST: + case FS: + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::STORE; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + if (opcode == S_INST + || (opcode == FS && func3 == 2)) { + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word mem_addr = rsdata[t][0] + immsrc; + trace->mem_addrs.at(t).push_back({mem_addr, (1u << func3)}); + DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr); + switch (func3) { + case 0: + // SB + core_->dcache_write(mem_addr, rsdata[t][1] & 0x000000FF, 1); + break; + case 1: + // SH + core_->dcache_write(mem_addr, rsdata[t][1], 2); + break; + case 2: + // SW + core_->dcache_write(mem_addr, rsdata[t][1], 4); + break; + default: + std::abort(); + } + } + } else { + for (int i = 0; i < vl_; i++) { + Word mem_addr = rsdata[i][0] + (i * vtype_.vsew / 8); + DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr); + switch (instr.getVlsWidth()) { + case 6: { + // store word and unit strided (not checking for unit stride) + uint32_t value = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i); + core_->dcache_write(mem_addr, value, 4); + DP(4, "store: " << mem_addr << " value:" << value); + } break; + default: + std::abort(); + } + } + } + break; + case SYS_INST: + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word csr_addr = immsrc; + Word csr_value; + if (func3 == 0) { + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::SYSCALL; + trace->fetch_stall = true; + switch (csr_addr) { + case 0: // ECALL + core_->trigger_ecall(); + break; + case 1: // EBREAK + core_->trigger_ebreak(); + break; + case 0x002: // URET + case 0x102: // SRET + case 0x302: // MRET + break; + default: + std::abort(); + } + } else { + trace->exe_type = ExeType::CSR; + csr_value = core_->get_csr(csr_addr, t, id_); + switch (func3) { + case 1: + // CSRRW + rddata[t] = csr_value; + core_->set_csr(csr_addr, 
rsdata[t][0], t, id_); + trace->used_iregs.set(rsrc0); + rd_write = true; + break; + case 2: + // CSRRS + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_); + trace->used_iregs.set(rsrc0); + rd_write = true; + break; + case 3: + // CSRRC + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_); + trace->used_iregs.set(rsrc0); + rd_write = true; + break; + case 5: + // CSRRWI + rddata[t] = csr_value; + core_->set_csr(csr_addr, rsrc0, t, id_); + rd_write = true; + break; + case 6: + // CSRRSI; + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value | rsrc0, t, id_); + rd_write = true; + break; + case 7: + // CSRRCI + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_); + rd_write = true; + break; + default: + break; + } + } + } + break; + case FENCE: + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::FENCE; + break; + case FCI: + trace->exe_type = ExeType::FPU; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + uint32_t frm = get_fpu_rm(func3, core_, t, id_); + uint32_t fflags = 0; + switch (func7) { + case 0x00: //FADD + rddata[t] = rv_fadd(rsdata[t][0], rsdata[t][1], frm, &fflags); + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); + break; + case 0x04: //FSUB + rddata[t] = rv_fsub(rsdata[t][0], rsdata[t][1], frm, &fflags); + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); + break; + case 0x08: //FMUL + rddata[t] = rv_fmul(rsdata[t][0], rsdata[t][1], frm, &fflags); + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); + break; + case 0x0c: //FDIV + rddata[t] = rv_fdiv(rsdata[t][0], rsdata[t][1], frm, &fflags); + trace->fpu.type = FpuType::FDIV; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); + break; + case 0x2c: //FSQRT + rddata[t] = rv_fsqrt(rsdata[t][0], frm, &fflags); + 
trace->fpu.type = FpuType::FSQRT; + trace->used_fregs.set(rsrc0); + break; + case 0x10: + switch (func3) { + case 0: // FSGNJ.S + rddata[t] = rv_fsgnj(rsdata[t][0], rsdata[t][1]); + break; + case 1: // FSGNJN.S + rddata[t] = rv_fsgnjn(rsdata[t][0], rsdata[t][1]); + break; + case 2: // FSGNJX.S + rddata[t] = rv_fsgnjx(rsdata[t][0], rsdata[t][1]); + break; + } + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); + break; + case 0x14: + if (func3) { + // FMAX.S + rddata[t] = rv_fmax(rsdata[t][0], rsdata[t][1], &fflags); + } else { + // FMIN.S + rddata[t] = rv_fmin(rsdata[t][0], rsdata[t][1], &fflags); + } + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); + break; + case 0x60: + if (rsrc1 == 0) { + // FCVT.W.S + rddata[t] = rv_ftoi(rsdata[t][0], frm, &fflags); + } else { + // FCVT.WU.S + rddata[t] = rv_ftou(rsdata[t][0], frm, &fflags); + } + trace->fpu.type = FpuType::FCVT; + trace->used_fregs.set(rsrc0); + break; + case 0x70: + if (func3) { + // FCLASS.S + rddata[t] = rv_fclss(rsdata[t][0]); + } else { + // FMV.X.W + rddata[t] = rsdata[t][0]; + } + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); + break; + case 0x50: + switch(func3) { + case 0: + // FLE.S + rddata[t] = rv_fle(rsdata[t][0], rsdata[t][1], &fflags); + break; + case 1: + // FLT.S + rddata[t] = rv_flt(rsdata[t][0], rsdata[t][1], &fflags); + break; + case 2: + // FEQ.S + rddata[t] = rv_feq(rsdata[t][0], rsdata[t][1], &fflags); + break; + } + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); + break; + case 0x68: + if (rsrc1) { + // FCVT.S.WU: + rddata[t] = rv_utof(rsdata[t][0], frm, &fflags); + } else { + // FCVT.S.W: + rddata[t] = rv_itof(rsdata[t][0], frm, &fflags); + } + trace->fpu.type = FpuType::FCVT; + trace->used_iregs.set(rsrc0); + break; + case 0x78: + // FMV.W.X + rddata[t] = rsdata[t][0]; + trace->fpu.type = FpuType::FNCP; + 
trace->used_iregs.set(rsrc0); + break; + } + update_fcrs(fflags, core_, t, id_); + } + rd_write = true; + break; + case FMADD: + case FMSUB: + case FMNMADD: + case FMNMSUB: + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); + trace->used_fregs.set(rsrc2); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + int frm = get_fpu_rm(func3, core_, t, id_); + Word fflags = 0; + switch (opcode) { + case FMADD: + rddata[t] = rv_fmadd(rsdata[t][0], rsdata[t][1], rsdata[t][2], frm, &fflags); + break; + case FMSUB: + rddata[t] = rv_fmsub(rsdata[t][0], rsdata[t][1], rsdata[t][2], frm, &fflags); + break; + case FMNMADD: + rddata[t] = rv_fnmadd(rsdata[t][0], rsdata[t][1], rsdata[t][2], frm, &fflags); + break; + case FMNMSUB: + rddata[t] = rv_fnmsub(rsdata[t][0], rsdata[t][1], rsdata[t][2], frm, &fflags); + break; + default: + break; + } + update_fcrs(fflags, core_, t, id_); + } + rd_write = true; + break; + case GPGPU: { + int ts = 0; + for (int t = 0; t < num_threads; ++t) { + if (tmask_.test(t)) { + ts = t; + break; + } + } + switch (func3) { + case 0: { + // TMC + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::TMC; + trace->used_iregs.set(rsrc0); + trace->fetch_stall = true; + if (rsrc1) { + // predicate mode + ThreadMask pred; + for (int i = 0; i < num_threads; ++i) { + pred[i] = tmask_.test(i) ? 
(ireg_file_.at(i).at(rsrc0) != 0) : 0; + } + if (pred.any()) { + tmask_ &= pred; + } + } else { + tmask_.reset(); + for (int i = 0; i < num_threads; ++i) { + tmask_.set(i, rsdata.at(ts)[0] & (1 << i)); + } + } + DPH(3, "*** New TMC: "); + for (int i = 0; i < num_threads; ++i) + DPN(3, tmask_.test(num_threads-i-1)); + DPN(3, std::endl); + + active_ = tmask_.any(); + trace->gpu.active_warps.reset(); + trace->gpu.active_warps.set(id_, active_); + } break; + case 1: { + // WSPAWN + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::WSPAWN; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + trace->fetch_stall = true; + trace->gpu.active_warps = core_->wspawn(rsdata.at(ts)[0], rsdata.at(ts)[1]); + } break; + case 2: { + // SPLIT + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::SPLIT; + trace->used_iregs.set(rsrc0); + trace->fetch_stall = true; + if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) { + ThreadMask tmask; + for (int i = 0; i < num_threads; ++i) { + tmask[i] = tmask_.test(i) && !ireg_file_.at(i).at(rsrc0); + } + + DomStackEntry e(tmask, nextPC); + dom_stack_.push(tmask_); + dom_stack_.push(e); + for (size_t i = 0; i < e.tmask.size(); ++i) { + tmask_.set(i, !e.tmask.test(i) && tmask_.test(i)); + } + active_ = tmask_.any(); + + DPH(3, "*** Split: New TM="); + for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); + DPN(3, ", Pushed TM="); + for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1)); + DPN(3, ", PC=0x" << std::hex << e.PC << "\n"); + } else { + DP(3, "*** Unanimous pred"); + DomStackEntry e(tmask_); + e.unanimous = true; + dom_stack_.push(e); + } + } break; + case 3: { + // JOIN + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::JOIN; + trace->fetch_stall = true; + if (!dom_stack_.empty() && dom_stack_.top().unanimous) { + DP(3, "*** Uninimous branch at join"); + tmask_ = dom_stack_.top().tmask; + active_ = tmask_.any(); + dom_stack_.pop(); + } else { + if 
(!dom_stack_.top().fallThrough) { + nextPC = dom_stack_.top().PC; + DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec); + } + + tmask_ = dom_stack_.top().tmask; + active_ = tmask_.any(); + + DPH(3, "*** Join: New TM="); + for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); + DPN(3, "\n"); + + dom_stack_.pop(); + } + } break; + case 4: { + // BAR + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::BAR; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + trace->fetch_stall = true; + trace->gpu.active_warps = core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); + } break; + case 5: { + // PREFETCH + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::PREFETCH; + trace->used_iregs.set(rsrc0); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + auto mem_addr = rsdata[t][0]; + trace->mem_addrs.at(t).push_back({mem_addr, 4}); + } + } break; + default: + std::abort(); + } + } break; + case GPU: { + switch (func3) { + case 0: { // TEX + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::TEX; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + trace->used_iregs.set(rsrc2); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + auto unit = func2; + auto u = rsdata[t][0]; + auto v = rsdata[t][1]; + auto lod = rsdata[t][2]; + auto color = core_->tex_read(unit, u, v, lod, &trace->mem_addrs.at(t)); + rddata[t] = color; + } + rd_write = true; + } break; + case 1: + switch (func2) { + case 0: { // CMOV + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::CMOV; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + trace->used_iregs.set(rsrc2); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = rsdata[t][0] ? 
rsdata[t][1] : rsdata[t][2]; + } + rd_write = true; + } break; + default: + std::abort(); + } + break; + default: + std::abort(); + } + } break; + case VSET: { + int VLEN = core_->arch().vsize() * 8; + int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); + switch (func3) { + case 0: // vector-vector + switch (func6) { + case 0: { + auto& vr1 = vreg_file_.at(rsrc0); + auto& vr2 = vreg_file_.at(rsrc1); + auto& vd = vreg_file_.at(rdest); + auto& mask = vreg_file_.at(0); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t emask = *(uint8_t *)(mask.data() + i); + uint8_t value = emask & 0x1; + if (vmask || (!vmask && value)) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = first + second; + DP(3, "Adding " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t emask = *(uint16_t *)(mask.data() + i); + uint16_t value = emask & 0x1; + if (vmask || (!vmask && value)) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = first + second; + DP(3, "Adding " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t emask = *(uint32_t *)(mask.data() + i); + uint32_t value = emask & 0x1; + if (vmask || (!vmask && value)) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = first + second; + DP(3, "Adding " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } + } break; + case 24: { + // vmseq + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t 
first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first == second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first == second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first == second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 25: { + // vmsne + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first != second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first != second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first != second) ? 
1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 26: { + // vmsltu + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first < second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first < second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first < second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 27: { + // vmslt + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first < second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first < second) ? 
1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first < second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; + } + } + } break; + case 28: { + // vmsleu + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first <= second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first <= second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first <= second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 29: { + // vmsle + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first <= second) ? 
1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first <= second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first <= second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; + } + } + } break; + case 30: { + // vmsgtu + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first > second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first > second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first > second) ? 
1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 31: { + // vmsgt + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first > second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first > second) ? 1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first > second) ? 
1 : 0; + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; + } + } + } break; + } + break; + case 2: { + switch (func6) { + case 24: { + // vmandnot + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value & !second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value & !second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value & !second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 25: { + // vmand + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t 
first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value & second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value & second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value & second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 26: { + // vmor + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value | second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t 
*)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value | second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value | second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 27: { + // vmxor + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value ^ second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value ^ second_value); + DP(3, "Comparing " << first << " + " << second << " = " << 
result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value ^ second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 28: { + // vmornot + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value | !second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value | !second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + 
uint32_t result = (first_value | !second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 29: { + // vmnand + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = !(first_value & second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = !(first_value & second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = !(first_value & second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 30: { + // vmnor + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = 
vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = !(first_value | second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = !(first_value | second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = !(first_value | second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 31: { + // vmxnor + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = !(first_value ^ second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + 
*(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = !(first_value ^ second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = !(first_value ^ second_value); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 37: { + // vmul + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first * second); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first * second); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { 
+ *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first * second); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 45: { + // vmacc + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first * second); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) += result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first * second); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) += result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first * second); + DP(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) += result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + } + } break; + case 6: { + switch (func6) { + case 0: { + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t second = 
*(uint8_t *)(vr2.data() + i); + uint8_t result = (rsdata[i][0] + second); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (rsdata[i][0] + second); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (rsdata[i][0] + second); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 37: { + // vmul.vx + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (rsdata[i][0] * second); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (rsdata[i][0] * second); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (rsdata[i][0] * second); + DP(3, 
"Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + } + } break; + case 7: { + vtype_.vill = 0; + vtype_.vediv = instr.getVediv(); + vtype_.vsew = instr.getVsew(); + vtype_.vlmul = instr.getVlmul(); + + DP(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX); + + int s0 = rsdata[0][0]; + if (s0 <= VLMAX) { + vl_ = s0; + } else if (s0 < (2 * VLMAX)) { + vl_ = (int)ceil((s0 * 1.0) / 2.0); + } else if (s0 >= (2 * VLMAX)) { + vl_ = VLMAX; + } + rddata[0] = vl_; + } break; + default: + std::abort(); + } + } break; + default: + std::abort(); + } + + if (rd_write) { + trace->wb = true; + DPH(2, "Dest Reg: "); + auto rdt = instr.getRDType(); + switch (rdt) { + case RegType::Integer: + if (rdest) { + DPN(2, "r" << std::dec << rdest << "={"); + for (int t = 0; t < num_threads; ++t) { + if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + ireg_file_.at(t)[rdest] = rddata[t]; + DPN(2, "0x" << std::hex << rddata[t]); + } + DPN(2, "}" << std::endl); + trace->used_iregs[rdest] = 1; + } + break; + case RegType::Float: + DPN(2, "fr" << std::dec << rdest << "={"); + for (int t = 0; t < num_threads; ++t) { + if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + freg_file_.at(t)[rdest] = rddata[t]; + DPN(2, "0x" << std::hex << rddata[t]); + } + DPN(2, "}" << std::endl); + trace->used_fregs[rdest] = 1; + break; + default: + std::abort(); + break; + } + } + + PC_ += core_->arch().wsize(); + if (PC_ != nextPC) { + DP(3, "*** Next PC: " << std::hex << nextPC << std::dec); + PC_ = nextPC; + } +} diff --git a/sim/simx/exeunit.cpp b/sim/simx/exeunit.cpp new file mode 100644 index 00000000..5a47dc06 --- /dev/null +++ b/sim/simx/exeunit.cpp @@ -0,0 +1,383 @@ +#include "exeunit.h" +#include +#include +#include 
+#include +#include +#include "debug.h" +#include "core.h" +#include "constants.h" + +using namespace vortex; + +NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {} + +void NopUnit::tick() { + if (Input.empty()) + return; + auto trace = Input.front(); + Output.send(trace, 1); + Input.pop(); +} + +/////////////////////////////////////////////////////////////////////////////// + +LsuUnit::LsuUnit(const SimContext& ctx, Core* core) + : ExeUnit(ctx, core, "LSU") + , num_threads_(core->arch().num_threads()) + , pending_rd_reqs_(LSUQ_SIZE) + , fence_lock_(false) +{} + +void LsuUnit::reset() { + pending_rd_reqs_.clear(); + fence_lock_ = false; +} + +void LsuUnit::tick() { + // handle dcache response + for (uint32_t t = 0; t < num_threads_; ++t) { + auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0); + if (dcache_rsp_port.empty()) + continue; + auto& mem_rsp = dcache_rsp_port.front(); + auto& entry = pending_rd_reqs_.at(mem_rsp.tag); + auto trace = entry.first; + DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type + << ", tid=" << t << ", " << *trace); + assert(entry.second); + --entry.second; // track remaining blocks + if (0 == entry.second) { + Output.send(trace, 1); + pending_rd_reqs_.release(mem_rsp.tag); + } + dcache_rsp_port.pop(); + } + + // handle shared memory response + for (uint32_t t = 0; t < num_threads_; ++t) { + auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t); + if (smem_rsp_port.empty()) + continue; + auto& mem_rsp = smem_rsp_port.front(); + auto& entry = pending_rd_reqs_.at(mem_rsp.tag); + auto trace = entry.first; + DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type + << ", tid=" << t << ", " << *trace); + assert(entry.second); + --entry.second; // track remaining blocks + if (0 == entry.second) { + Output.send(trace, 1); + pending_rd_reqs_.release(mem_rsp.tag); + } + smem_rsp_port.pop(); + } + + if (fence_lock_) { + // wait for all pending memory operations to 
complete + if (!pending_rd_reqs_.empty()) + return; + Output.send(fence_state_, 1); + fence_lock_ = false; + DT(3, "fence-unlock: " << fence_state_); + } + + // check input queue + if (Input.empty()) + return; + + auto trace = Input.front(); + + if (trace->lsu.type == LsuType::FENCE) { + // schedule fence lock + fence_state_ = trace; + fence_lock_ = true; + DT(3, "fence-lock: " << *trace); + // remove input + auto time = Input.pop(); + core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time); + return; + } + + // check pending queue capacity + if (pending_rd_reqs_.full()) { + if (!trace->suspend()) { + DT(3, "*** lsu-queue-stall: " << *trace); + } + return; + } else { + trace->resume(); + } + + bool is_write = (trace->lsu.type == LsuType::STORE); + + // duplicates detection + bool is_dup = false; + if (trace->tmask.test(0)) { + uint64_t addr_mask = sizeof(Word)-1; + Word addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask; + uint32_t matches = 1; + for (uint32_t t = 1; t < num_threads_; ++t) { + if (!trace->tmask.test(t)) + continue; + auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask; + matches += (addr0 == mem_addr); + } + is_dup = (matches == trace->tmask.count()); + } + + uint32_t valid_addrs = 0; + if (is_dup) { + valid_addrs = 1; + } else { + for (auto& mem_addr : trace->mem_addrs) { + valid_addrs += mem_addr.size(); + } + } + + auto tag = pending_rd_reqs_.allocate({trace, valid_addrs}); + + for (uint32_t t = 0; t < num_threads_; ++t) { + if (!trace->tmask.test(t)) + continue; + + auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0); + auto mem_addr = trace->mem_addrs.at(t).at(0); + auto type = get_addr_type(mem_addr.addr, mem_addr.size); + + MemReq mem_req; + mem_req.addr = mem_addr.addr; + mem_req.write = is_write; + mem_req.non_cacheable = (type == AddrType::IO); + mem_req.tag = tag; + mem_req.core_id = trace->cid; + mem_req.uuid = trace->uuid; + + if (type == AddrType::Shared) { + 
core_->shared_mem_->Inputs.at(t).send(mem_req, 2); + DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); + } else { + dcache_req_port.send(mem_req, 2); + DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + << ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace); + } + + if (is_dup) + break; + } + + // do not wait on writes + if (is_write) { + pending_rd_reqs_.release(tag); + Output.send(trace, 1); + } + + // remove input + auto time = Input.pop(); + core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time); +} + +/////////////////////////////////////////////////////////////////////////////// + +AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {} + +void AluUnit::tick() { + if (Input.empty()) + return; + auto trace = Input.front(); + switch (trace->alu.type) { + case AluType::ARITH: + case AluType::BRANCH: + case AluType::SYSCALL: + case AluType::CMOV: + Output.send(trace, 1); + break; + case AluType::IMUL: + Output.send(trace, LATENCY_IMUL+1); + break; + case AluType::IDIV: + Output.send(trace, XLEN+1); + break; + default: + std::abort(); + } + DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace); + if (trace->fetch_stall) { + core_->stalled_warps_.reset(trace->wid); + } + auto time = Input.pop(); + core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time); +} + +/////////////////////////////////////////////////////////////////////////////// + +CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {} + +void CsrUnit::tick() { + if (Input.empty()) + return; + auto trace = Input.front(); + Output.send(trace, 1); + auto time = Input.pop(); + core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time); + DT(3, "pipeline-execute: op=CSR, " << *trace); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {} + +void FpuUnit::tick() { + if (Input.empty()) + return; + auto trace = Input.front(); + switch (trace->fpu.type) { + case FpuType::FNCP: + Output.send(trace, 2); + break; + case FpuType::FMA: + Output.send(trace, LATENCY_FMA+1); + break; + case FpuType::FDIV: + Output.send(trace, LATENCY_FDIV+1); + break; + case FpuType::FSQRT: + Output.send(trace, LATENCY_FSQRT+1); + break; + case FpuType::FCVT: + Output.send(trace, LATENCY_FCVT+1); + break; + default: + std::abort(); + } + DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace); + auto time = Input.pop(); + core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time); +} + +/////////////////////////////////////////////////////////////////////////////// + +GpuUnit::GpuUnit(const SimContext& ctx, Core* core) + : ExeUnit(ctx, core, "GPU") + , num_threads_(core->arch().num_threads()) + , pending_tex_reqs_(TEXQ_SIZE) +{} + +void GpuUnit::reset() { + pending_tex_reqs_.clear(); +} + +void GpuUnit::tick() { +#ifdef EXT_TEX_ENABLE + // handle memory response + for (uint32_t t = 0; t < num_threads_; ++t) { + auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1); + if (dcache_rsp_port.empty()) + continue; + auto& mem_rsp = dcache_rsp_port.front(); + auto& entry = pending_tex_reqs_.at(mem_rsp.tag); + auto trace = entry.first; + DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace); + assert(entry.second); + --entry.second; // track remaining blocks + if (0 == entry.second) { + Output.send(trace, 1); + pending_tex_reqs_.release(mem_rsp.tag); + } + dcache_rsp_port.pop(); + } +#endif + + // check input queue + if (Input.empty()) + return; + + auto trace = Input.front(); + + bool issued = false; + + switch (trace->gpu.type) { + case GpuType::TMC: + Output.send(trace, 1); + 
core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid)); + issued = true; + break; + case GpuType::WSPAWN: + Output.send(trace, 1); + core_->active_warps_ = trace->gpu.active_warps; + issued = true; + break; + case GpuType::SPLIT: + case GpuType::JOIN: + Output.send(trace, 1); + issued = true; + break; + case GpuType::BAR: + Output.send(trace, 1); + if (trace->gpu.active_warps != 0) + core_->active_warps_ |= trace->gpu.active_warps; + else + core_->active_warps_.reset(trace->wid); + issued = true; + break; + case GpuType::TEX: + if (this->processTexRequest(trace)) + issued = true; + break; + default: + std::abort(); + } + + if (issued) { + DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace); + if (trace->fetch_stall) { + core_->stalled_warps_.reset(trace->wid); + } + auto time = Input.pop(); + core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time); + } +} + +bool GpuUnit::processTexRequest(pipeline_trace_t* trace) { + // check pending queue capacity + if (pending_tex_reqs_.full()) { + if (!trace->suspend()) { + DT(3, "*** tex-queue-stall: " << *trace); + } + return false; + } else { + trace->resume(); + } + + // send memory request + + uint32_t valid_addrs = 0; + for (auto& mem_addr : trace->mem_addrs) { + valid_addrs += mem_addr.size(); + } + + auto tag = pending_tex_reqs_.allocate({trace, valid_addrs}); + + for (uint32_t t = 0; t < num_threads_; ++t) { + if (!trace->tmask.test(t)) + continue; + + auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1); + for (auto& mem_addr : trace->mem_addrs.at(t)) { + MemReq mem_req; + mem_req.addr = mem_addr.addr; + mem_req.write = (trace->lsu.type == LsuType::STORE); + mem_req.tag = tag; + mem_req.core_id = core_->id(); + mem_req.uuid = trace->uuid; + dcache_req_port.send(mem_req, 3); + DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + << ", tid=" << t << ", "<< trace); + ++ core_->perf_stats_.tex_reads; + ++ 
core_->perf_stats_.tex_latency += pending_tex_reqs_.size(); + } + } + + return true; +} \ No newline at end of file diff --git a/sim/simx/exeunit.h b/sim/simx/exeunit.h new file mode 100644 index 00000000..78990369 --- /dev/null +++ b/sim/simx/exeunit.h @@ -0,0 +1,103 @@ +#pragma once + +#include +#include "pipeline.h" +#include "cache.h" + +namespace vortex { + +class Core; + +class ExeUnit : public SimObject { +public: + SimPort Input; + SimPort Output; + + ExeUnit(const SimContext& ctx, Core* core, const char* name) + : SimObject(ctx, name) + , Input(this) + , Output(this) + , core_(core) + {} + + virtual ~ExeUnit() {} + + virtual void reset() {} + + virtual void tick() = 0; + +protected: + Core* core_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +class NopUnit : public ExeUnit { +public: + NopUnit(const SimContext& ctx, Core*); + + void tick(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class LsuUnit : public ExeUnit { +private: + uint32_t num_threads_; + HashTable> pending_rd_reqs_; + pipeline_trace_t* fence_state_; + bool fence_lock_; + +public: + LsuUnit(const SimContext& ctx, Core*); + + void reset(); + + void tick(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class AluUnit : public ExeUnit { +public: + AluUnit(const SimContext& ctx, Core*); + + void tick(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class CsrUnit : public ExeUnit { +public: + CsrUnit(const SimContext& ctx, Core*); + + void tick(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class FpuUnit : public ExeUnit { +public: + FpuUnit(const SimContext& ctx, Core*); + + void tick(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class GpuUnit : public ExeUnit { +private: + uint32_t num_threads_; + HashTable> pending_tex_reqs_; + + 
bool processTexRequest(pipeline_trace_t* trace); + +public: + GpuUnit(const SimContext& ctx, Core*); + + void reset(); + + void tick(); +}; + +} \ No newline at end of file diff --git a/sim/simx/ibuffer.h b/sim/simx/ibuffer.h new file mode 100644 index 00000000..7362195f --- /dev/null +++ b/sim/simx/ibuffer.h @@ -0,0 +1,44 @@ +#pragma once + +#include "pipeline.h" +#include + +namespace vortex { + +class IBuffer { +private: + std::queue entries_; + uint32_t capacity_; + +public: + IBuffer(uint32_t size) + : capacity_(size) + {} + + bool empty() const { + return entries_.empty(); + } + + bool full() const { + return (entries_.size() == capacity_); + } + + pipeline_trace_t* top() const { + return entries_.front(); + } + + void push(pipeline_trace_t* trace) { + entries_.emplace(trace); + } + + void pop() { + return entries_.pop(); + } + + void clear() { + std::queue empty; + std::swap(entries_, empty ); + } +}; + +} \ No newline at end of file diff --git a/sim/simX/instr.h b/sim/simx/instr.h similarity index 73% rename from sim/simX/instr.h rename to sim/simx/instr.h index a93dd61b..334b8565 100644 --- a/sim/simX/instr.h +++ b/sim/simx/instr.h @@ -29,10 +29,9 @@ enum Opcode { FMNMADD = 0x4f, // Vector Extension VSET = 0x57, - VL = 0x7, - VS = 0x27, // GPGPU Extension GPGPU = 0x6b, + GPU = 0x5b, }; enum InstType { @@ -53,22 +52,24 @@ public: : opcode_(Opcode::NOP) , num_rsrcs_(0) , has_imm_(false) + , rdest_type_(RegType::None) , rdest_(0) , func3_(0) , func7_(0) { for (int i = 0; i < MAX_REG_SOURCES; ++i) { - rsrc_type_[i] = 0; + rsrc_type_[i] = RegType::None; } } /* Setters used to "craft" the instruction. 
*/ void setOpcode(Opcode opcode) { opcode_ = opcode; } - void setDestReg(int destReg) { rdest_type_ = 1; rdest_ = destReg; } - void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = 1; rsrc_[num_rsrcs_++] = srcReg; } - void setDestFReg(int destReg) { rdest_type_ = 2; rdest_ = destReg; } - void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = 2; rsrc_[num_rsrcs_++] = srcReg; } - void setDestVReg(int destReg) { rdest_type_ = 3; rdest_ = destReg; } - void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = 3; rsrc_[num_rsrcs_++] = srcReg; } + void setDestReg(int destReg) { rdest_type_ = RegType::Integer; rdest_ = destReg; } + void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Integer; rsrc_[num_rsrcs_++] = srcReg; } + void setDestFReg(int destReg) { rdest_type_ = RegType::Float; rdest_ = destReg; } + void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; } + void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; } + void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; } + void setFunc2(Word func2) { func2_ = func2; } void setFunc3(Word func3) { func3_ = func3; } void setFunc7(Word func7) { func7_ = func7; } void setImm(Word imm) { has_imm_ = true; imm_ = imm; } @@ -84,14 +85,15 @@ public: /* Getters used by encoders. 
*/ Opcode getOpcode() const { return opcode_; } + Word getFunc2() const { return func2_; } Word getFunc3() const { return func3_; } Word getFunc6() const { return func6_; } Word getFunc7() const { return func7_; } int getNRSrc() const { return num_rsrcs_; } int getRSrc(int i) const { return rsrc_[i]; } - int getRSType(int i) const { return rsrc_type_[i]; } + RegType getRSType(int i) const { return rsrc_type_[i]; } int getRDest() const { return rdest_; } - int getRDType() const { return rdest_type_; } + RegType getRDType() const { return rdest_type_; } bool hasImm() const { return has_imm_; } Word getImm() const { return imm_; } Word getVlsWidth() const { return vlsWidth_; } @@ -112,18 +114,16 @@ private: Opcode opcode_; int num_rsrcs_; bool has_imm_; - int rdest_type_; - int isrc_mask_; - int fsrc_mask_; - int vsrc_mask_; + RegType rdest_type_; Word imm_; - int rsrc_type_[MAX_REG_SOURCES]; + RegType rsrc_type_[MAX_REG_SOURCES]; int rsrc_[MAX_REG_SOURCES]; int rdest_; + Word func2_; Word func3_; - Word func7_; + Word func6_; - //Vector + // Vector Word vmask_; Word vlsWidth_; Word vMop_; @@ -132,7 +132,7 @@ private: Word vlmul_; Word vsew_; Word vediv_; - Word func6_; + Word func7_; friend std::ostream &operator<<(std::ostream &, const Instr&); }; diff --git a/sim/simX/main.cpp b/sim/simx/main.cpp similarity index 61% rename from sim/simX/main.cpp rename to sim/simx/main.cpp index 9af8ff02..89999c8f 100644 --- a/sim/simX/main.cpp +++ b/sim/simx/main.cpp @@ -5,28 +5,30 @@ #include #include #include - -#include "debug.h" -#include "types.h" -#include "core.h" +#include "processor.h" +#include "archdef.h" +#include "mem.h" +#include "constants.h" +#include #include "args.h" using namespace vortex; int main(int argc, char **argv) { + int exitcode = 0; - std::string archString("rv32imf"); + std::string archStr("rv32imf"); + std::string imgFileName; int num_cores(NUM_CORES * NUM_CLUSTERS); int num_warps(NUM_WARPS); - int num_threads(NUM_THREADS); - std::string 
imgFileName; + int num_threads(NUM_THREADS); bool showHelp(false); bool showStats(false); bool riscv_test(false); /* Read the command line arguments. */ CommandLineArgFlag fh("-h", "--help", "", showHelp); - CommandLineArgSetter fa("-a", "--arch", "", archString); + CommandLineArgSetter fa("-a", "--arch", "", archStr); CommandLineArgSetter fi("-i", "--image", "", imgFileName); CommandLineArgSetter fc("-c", "--cores", "", num_cores); CommandLineArgSetter fw("-w", "--warps", "", num_warps); @@ -48,49 +50,37 @@ int main(int argc, char **argv) { return 0; } - ArchDef arch(archString, num_cores, num_warps, num_threads); - - Decoder decoder(arch); - MemoryUnit mu(0, arch.wsize(), true); + std::cout << "Running " << imgFileName << "..." << std::endl; - RAM ram((1<<12), (1<<20)); + { + // create processor configuation + ArchDef arch(archStr, num_cores, num_warps, num_threads); - std::string program_ext(fileExtension(imgFileName.c_str())); - if (program_ext == "bin") { - ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR); - } else if (program_ext == "hex") { - ram.loadHexImage(imgFileName.c_str()); - } else { - std::cout << "*** error: only *.bin or *.hex images supported." 
<< std::endl; - return -1; - } + // create memory module + RAM ram(RAM_PAGE_SIZE); - mu.attach(ram, 0, 0xFFFFFFFF); - - struct stat hello; - fstat(0, &hello); - - std::vector> cores(num_cores); - for (int i = 0; i < num_cores; ++i) { - cores[i] = std::make_shared(arch, decoder, mu, i); - } - - bool running; - int exitcode = 0; - do { - running = false; - for (auto& core : cores) { - core->step(); - if (core->running()) { - running = true; - } - if (core->check_ebreak()) { - exitcode = core->getIRegValue(3); - running = false; - break; + // load program + { + std::string program_ext(fileExtension(imgFileName.c_str())); + if (program_ext == "bin") { + ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR); + } else if (program_ext == "hex") { + ram.loadHexImage(imgFileName.c_str()); + } else { + std::cout << "*** error: only *.bin or *.hex images supported." << std::endl; + return -1; } } - } while (running); + + // create processor + Processor processor(arch); + + // attach memory module + processor.attach_ram(&ram); + + // run simulation + processor.run(); + } if (riscv_test) { if (1 == exitcode) { @@ -103,7 +93,7 @@ int main(int argc, char **argv) { if (exitcode != 0) { std::cout << "*** error: exitcode=" << exitcode << std::endl; } - } + } return exitcode; } diff --git a/sim/simx/memsim.cpp b/sim/simx/memsim.cpp new file mode 100644 index 00000000..a69df4b9 --- /dev/null +++ b/sim/simx/memsim.cpp @@ -0,0 +1,123 @@ +#include "memsim.h" +#include +#include +#include + +DISABLE_WARNING_PUSH +DISABLE_WARNING_UNUSED_PARAMETER +#define RAMULATOR +#include +#include +#include +DISABLE_WARNING_POP + +#include "constants.h" +#include "types.h" +#include "debug.h" + +using namespace vortex; + +class MemSim::Impl { +private: + MemSim* simobject_; + Config config_; + PerfStats perf_stats_; + ramulator::Gem5Wrapper* dram_; + +public: + + Impl(MemSim* simobject, const Config& config) + : simobject_(simobject) + , config_(config) + { + ramulator::Config ram_config; + 
ram_config.add("standard", "DDR4"); + ram_config.add("channels", std::to_string(config.channels)); + ram_config.add("ranks", "1"); + ram_config.add("speed", "DDR4_2400R"); + ram_config.add("org", "DDR4_4Gb_x8"); + ram_config.add("mapping", "defaultmapping"); + ram_config.set_core_num(config.num_cores); + dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE); + Stats::statlist.output("ramulator.ddr4.log"); + } + + ~Impl() { + dram_->finish(); + Stats::statlist.printall(); + delete dram_; + } + + const PerfStats& perf_stats() const { + return perf_stats_; + } + + void dram_callback(ramulator::Request& req, uint32_t tag, uint64_t uuid) { + if (req.type == ramulator::Request::Type::WRITE) + return; + MemRsp mem_rsp{tag, (uint32_t)req.coreid, uuid}; + simobject_->MemRspPort.send(mem_rsp, 1); + DT(3, simobject_->name() << "-" << mem_rsp); + } + + void reset() { + perf_stats_ = PerfStats(); + } + + void tick() { + if (MEM_CYCLE_RATIO > 0) { + auto cycle = SimPlatform::instance().cycles(); + if ((cycle % MEM_CYCLE_RATIO) == 0) + dram_->tick(); + } else { + for (int i = MEM_CYCLE_RATIO; i <= 0; ++i) + dram_->tick(); + } + + if (simobject_->MemReqPort.empty()) + return; + + auto& mem_req = simobject_->MemReqPort.front(); + + ramulator::Request dram_req( + mem_req.addr, + mem_req.write ? 
ramulator::Request::Type::WRITE : ramulator::Request::Type::READ, + std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid), + mem_req.core_id + ); + + if (!dram_->send(dram_req)) + return; + + if (mem_req.write) { + ++perf_stats_.writes; + } else { + ++perf_stats_.reads; + } + + DT(3, simobject_->name() << "-" << mem_req); + + simobject_->MemReqPort.pop(); + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config) + : SimObject(ctx, name) + , MemReqPort(this) + , MemRspPort(this) + , impl_(new Impl(this, config)) +{} + +MemSim::~MemSim() { + delete impl_; +} + +void MemSim::reset() { + impl_->reset(); +} + +void MemSim::tick() { + impl_->tick(); +} \ No newline at end of file diff --git a/sim/simx/memsim.h b/sim/simx/memsim.h new file mode 100644 index 00000000..26e21a34 --- /dev/null +++ b/sim/simx/memsim.h @@ -0,0 +1,43 @@ +#pragma once + +#include +#include "types.h" +#include + +namespace vortex { + +class MemSim : public SimObject{ +public: + struct Config { + uint32_t channels; + uint32_t num_cores; + }; + + struct PerfStats { + uint64_t reads; + uint64_t writes; + + PerfStats() + : reads(0) + , writes(0) + {} + }; + + SimPort MemReqPort; + SimPort MemRspPort; + + MemSim(const SimContext& ctx, const char* name, const Config& config); + ~MemSim(); + + void reset(); + + void tick(); + + const PerfStats& perf_stats() const; + +private: + class Impl; + Impl* impl_; +}; + +}; \ No newline at end of file diff --git a/sim/simx/pipeline.h b/sim/simx/pipeline.h new file mode 100644 index 00000000..18d54e21 --- /dev/null +++ b/sim/simx/pipeline.h @@ -0,0 +1,137 @@ + +#pragma once + +#include +#include +#include +#include "types.h" +#include "archdef.h" +#include "debug.h" + +namespace vortex { + +struct pipeline_trace_t { + //-- + uint64_t uuid; + + //-- + int cid; + int wid; + ThreadMask tmask; + Word PC; + + //-- + bool 
fetch_stall; + + //-- + bool wb; + RegType rdest_type; + int rdest; + + //-- + RegMask used_iregs; + RegMask used_fregs; + RegMask used_vregs; + + //- + ExeType exe_type; + + //-- + std::vector> mem_addrs; + + //-- + union { + struct { + LsuType type; + } lsu; + struct { + AluType type; + } alu; + struct { + FpuType type; + } fpu; + struct { + GpuType type; + WarpMask active_warps; + } gpu; + }; + + bool stalled; + + pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) { + uuid = uuid_; + cid = 0; + wid = 0; + tmask.reset(); + PC = 0; + fetch_stall = false; + wb = false; + rdest = 0; + rdest_type = RegType::None; + used_iregs.reset(); + used_fregs.reset(); + used_vregs.reset(); + exe_type = ExeType::NOP; + mem_addrs.resize(arch.num_threads()); + stalled = false; + } + + bool suspend() { + bool old = stalled; + stalled = true; + return old; + } + + void resume() { + stalled = false; + } +}; + +inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) { + os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC; + os << ", wb=" << state.wb; + if (state.wb) { + os << ", rd=" << state.rdest_type << std::dec << state.rdest; + } + os << ", ex=" << state.exe_type; + os << " (#" << std::dec << state.uuid << ")"; + return os; +} + +class PipelineLatch { +protected: + const char* name_; + std::queue queue_; + +public: + PipelineLatch(const char* name = nullptr) + : name_(name) + {} + + bool empty() const { + return queue_.empty(); + } + + pipeline_trace_t* front() { + return queue_.front(); + } + + pipeline_trace_t* back() { + return queue_.back(); + } + + void push(pipeline_trace_t* value) { + queue_.push(value); + } + + void pop() { + queue_.pop(); + } + + void clear() { + std::queue empty; + std::swap(queue_, empty ); + } +}; + +} \ No newline at end of file diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp new file mode 100644 index 00000000..a7314687 --- /dev/null +++ b/sim/simx/processor.cpp @@ -0,0 
+1,178 @@ +#include "processor.h" +#include "core.h" +#include "constants.h" + +using namespace vortex; + +class Processor::Impl { +private: + std::vector cores_; + std::vector l2caches_; + std::vector::Ptr> l2_mem_switches_; + Cache::Ptr l3cache_; + Switch::Ptr l3_mem_switch_; + +public: + Impl(const ArchDef& arch) + : cores_(arch.num_cores()) + , l2caches_(NUM_CLUSTERS) + , l2_mem_switches_(NUM_CLUSTERS) + { + SimPlatform::instance().initialize(); + + uint32_t num_cores = arch.num_cores(); + uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; + + // create cores + for (uint32_t i = 0; i < num_cores; ++i) { + cores_.at(i) = Core::Create(arch, i); + } + + // setup memory simulator + auto memsim = MemSim::Create("dram", MemSim::Config{ + MEMORY_BANKS, + arch.num_cores() + }); + + std::vector*> mem_req_ports(1, &memsim->MemReqPort); + std::vector*> mem_rsp_ports(1, &memsim->MemRspPort); + + if (L3_ENABLE) { + l3cache_ = Cache::Create("l3cache", Cache::Config{ + log2ceil(L3_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L3_NUM_BANKS, // number of banks + L3_NUM_PORTS, // number of ports + NUM_CLUSTERS, // request size + true, // write-through + false, // write response + 0, // victim size + L3_MSHR_SIZE, // mshr + 2, // pipeline latency + } + ); + l3cache_->MemReqPort.bind(mem_req_ports.at(0)); + mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort); + + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); + + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); + mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); + } + } else if (NUM_CLUSTERS > 1) { + l3_mem_switch_ = Switch::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS); + l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0)); + mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn); + + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); + + for (uint32_t i = 0; i < 
NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); + mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); + } + } + + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + std::vector*> cluster_mem_req_ports(cores_per_cluster); + std::vector*> cluster_mem_rsp_ports(cores_per_cluster); + + if (L2_ENABLE) { + auto& l2cache = l2caches_.at(i); + l2cache = Cache::Create("l2cache", Cache::Config{ + log2ceil(L2_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L2_NUM_BANKS, // number of banks + L2_NUM_PORTS, // number of ports + (uint8_t)cores_per_cluster, // request size + true, // write-through + false, // write response + 0, // victim size + L2_MSHR_SIZE, // mshr + 2, // pipeline latency + }); + l2cache->MemReqPort.bind(mem_req_ports.at(i)); + mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); + + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); + cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); + } + } else { + auto& l2_mem_switch = l2_mem_switches_.at(i); + l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster); + l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); + mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); + + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); + cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); + } + } + + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + auto& core = cores_.at((i * cores_per_cluster) + j); + core->MemReqPort.bind(cluster_mem_req_ports.at(j)); + cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort); + } + } + } + + ~Impl() { + SimPlatform::instance().finalize(); + } + + void attach_ram(RAM* ram) { + for (auto core : cores_) { + core->attach_ram(ram); + } + } + + int run() { + SimPlatform::instance().reset(); + bool running; + int exitcode = 0; + do { + SimPlatform::instance().tick(); + 
running = false; + for (auto& core : cores_) { + if (core->running()) { + running = true; + } + if (core->check_exit()) { + exitcode = core->getIRegValue(3); + running = false; + break; + } + } + } while (running); + + return exitcode; + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +Processor::Processor(const ArchDef& arch) + : impl_(new Impl(arch)) +{} + +Processor::~Processor() { + delete impl_; +} + +void Processor::attach_ram(RAM* mem) { + impl_->attach_ram(mem); +} + +int Processor::run() { + return impl_->run(); +} \ No newline at end of file diff --git a/sim/simx/processor.h b/sim/simx/processor.h new file mode 100644 index 00000000..46bcd735 --- /dev/null +++ b/sim/simx/processor.h @@ -0,0 +1,22 @@ +#pragma once + +namespace vortex { + +class ArchDef; +class RAM; + +class Processor { +public: + Processor(const ArchDef& arch); + ~Processor(); + + void attach_ram(RAM* mem); + + int run(); + +private: + class Impl; + Impl* impl_; +}; + +} \ No newline at end of file diff --git a/sim/simx/scoreboard.h b/sim/simx/scoreboard.h new file mode 100644 index 00000000..c468860d --- /dev/null +++ b/sim/simx/scoreboard.h @@ -0,0 +1,128 @@ +#pragma once + +#include "pipeline.h" +#include + +namespace vortex { + +class Scoreboard { +private: + struct reg_use_t { + RegType type; + uint32_t reg; + uint64_t owner; + }; + + std::vector in_use_iregs_; + std::vector in_use_fregs_; + std::vector in_use_vregs_; + std::unordered_map owners_; + +public: + Scoreboard(const ArchDef &arch) + : in_use_iregs_(arch.num_warps()) + , in_use_fregs_(arch.num_warps()) + , in_use_vregs_(arch.num_warps()) + { + this->clear(); + } + + void clear() { + for (int i = 0, n = in_use_iregs_.size(); i < n; ++i) { + in_use_iregs_.at(i).reset(); + in_use_fregs_.at(i).reset(); + in_use_vregs_.at(i).reset(); + } + owners_.clear(); + } + + bool in_use(pipeline_trace_t* state) const { + return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0 + || 
(state->used_fregs & in_use_fregs_.at(state->wid)) != 0 + || (state->used_vregs & in_use_vregs_.at(state->wid)) != 0; + } + + std::vector get_uses(pipeline_trace_t* state) const { + std::vector out; + { + uint32_t r = 0; + auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid); + while (used_iregs.any()) { + if (used_iregs.test(0)) { + uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer; + out.push_back({RegType::Integer, r, owners_.at(tag)}); + } + used_iregs >>= 1; + ++r; + } + } + { + uint32_t r = 0; + auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid); + while (used_fregs.any()) { + if (used_fregs.test(0)) { + uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float; + out.push_back({RegType::Float, r, owners_.at(tag)}); + } + used_fregs >>= 1; + ++r; + } + } + { + uint32_t r = 0; + auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid); + while (used_vregs.any()) { + if (used_vregs.test(0)) { + uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector; + out.push_back({RegType::Vector, r, owners_.at(tag)}); + } + used_vregs >>= 1; + ++r; + } + } + return std::move(out); + } + + void reserve(pipeline_trace_t* state) { + if (!state->wb) + return; + switch (state->rdest_type) { + case RegType::Integer: + in_use_iregs_.at(state->wid).set(state->rdest); + break; + case RegType::Float: + in_use_fregs_.at(state->wid).set(state->rdest); + break; + case RegType::Vector: + in_use_vregs_.at(state->wid).set(state->rdest); + break; + default: + break; + } + uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; + assert(owners_.count(tag) == 0); + owners_[tag] = state->uuid; + } + + void release(pipeline_trace_t* state) { + if (!state->wb) + return; + switch (state->rdest_type) { + case RegType::Integer: + in_use_iregs_.at(state->wid).reset(state->rdest); + break; + case RegType::Float: + in_use_fregs_.at(state->wid).reset(state->rdest); + break; + case RegType::Vector: 
+ in_use_vregs_.at(state->wid).reset(state->rdest); + break; + default: + break; + } + uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; + owners_.erase(tag); + } +}; + +} \ No newline at end of file diff --git a/sim/simx/sharedmem.h b/sim/simx/sharedmem.h new file mode 100644 index 00000000..c76a29d3 --- /dev/null +++ b/sim/simx/sharedmem.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include +#include +#include "types.h" + +namespace vortex { + +class Core; + +class SharedMem : public SimObject { +public: + struct Config { + uint32_t num_reqs; + uint32_t num_banks; + uint32_t bank_offset; + uint32_t latency; + bool write_reponse; + }; + + struct PerfStats { + uint64_t reads; + uint64_t writes; + uint64_t bank_stalls; + + PerfStats() + : reads(0) + , writes(0) + , bank_stalls(0) + {} + }; + + std::vector> Inputs; + std::vector> Outputs; + + SharedMem(const SimContext& ctx, const char* name, const Config& config) + : SimObject(ctx, name) + , Inputs(config.num_reqs, this) + , Outputs(config.num_reqs, this) + , config_(config) + , bank_sel_addr_start_(config.bank_offset) + , bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1) + {} + + virtual ~SharedMem() {} + + void reset() { + perf_stats_ = PerfStats(); + } + + void tick() { + std::vector in_used_banks(config_.num_banks); + for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) { + auto& core_req_port = this->Inputs.at(req_id); + if (core_req_port.empty()) + continue; + + auto& core_req = core_req_port.front(); + + uint32_t bank_id = (uint32_t)bit_getw( + core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_); + + // bank conflict check + if (in_used_banks.at(bank_id)) + continue; + + in_used_banks.at(bank_id) = true; + + if (!core_req.write || config_.write_reponse) { + // send response + MemRsp core_rsp{core_req.tag, core_req.core_id}; + this->Outputs.at(req_id).send(core_rsp, 1); + } + + // update perf counters + perf_stats_.reads += !core_req.write; + 
perf_stats_.writes += core_req.write; + + // remove input + core_req_port.pop(); + } + } + + const PerfStats& perf_stats() const { + return perf_stats_; + } + +protected: + Config config_; + uint32_t bank_sel_addr_start_; + uint32_t bank_sel_addr_end_; + PerfStats perf_stats_; +}; + +} \ No newline at end of file diff --git a/sim/simx/tex_unit.cpp b/sim/simx/tex_unit.cpp new file mode 100644 index 00000000..763f37a6 --- /dev/null +++ b/sim/simx/tex_unit.cpp @@ -0,0 +1,98 @@ +#include "tex_unit.h" +#include "core.h" +#include +#include + +using namespace vortex; +using namespace cocogfx; + +enum class FilterMode { + Point, + Bilinear, + Trilinear, +}; + +TexUnit::TexUnit(Core* core) : core_(core) {} + +TexUnit::~TexUnit() {} + +void TexUnit::clear() { + for (auto& state : states_) { + state = 0; + } +} + +uint32_t TexUnit::get_state(uint32_t state) { + return states_.at(state); +} + +void TexUnit::set_state(uint32_t state, uint32_t value) { + states_.at(state) = value; +} + +uint32_t TexUnit::read(int32_t u, + int32_t v, + int32_t lod, + std::vector* mem_addrs) { + //-- + auto xu = Fixed::make(u); + auto xv = Fixed::make(v); + uint32_t base_addr = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod)); + uint32_t log_width = std::max(states_.at(TEX_STATE_WIDTH) - lod, 0); + uint32_t log_height = std::max(states_.at(TEX_STATE_HEIGHT) - lod, 0); + auto format = (TexFormat)states_.at(TEX_STATE_FORMAT); + auto filter = (FilterMode)states_.at(TEX_STATE_FILTER); + auto wrapu = (WrapMode)states_.at(TEX_STATE_WRAPU); + auto wrapv = (WrapMode)states_.at(TEX_STATE_WRAPV); + + auto stride = Stride(format); + + switch (filter) { + case FilterMode::Bilinear: { + // addressing + uint32_t offset00, offset01, offset10, offset11; + uint32_t alpha, beta; + TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, + &offset00, &offset01, &offset10, &offset11, &alpha, &beta); + + uint32_t addr00 = base_addr + offset00 * stride; + uint32_t addr01 = base_addr + offset01 * 
stride; + uint32_t addr10 = base_addr + offset10 * stride; + uint32_t addr11 = base_addr + offset11 * stride; + + // memory lookup + uint32_t texel00 = core_->dcache_read(addr00, stride); + uint32_t texel01 = core_->dcache_read(addr01, stride); + uint32_t texel10 = core_->dcache_read(addr10, stride); + uint32_t texel11 = core_->dcache_read(addr11, stride); + + mem_addrs->push_back({addr00, stride}); + mem_addrs->push_back({addr01, stride}); + mem_addrs->push_back({addr10, stride}); + mem_addrs->push_back({addr11, stride}); + + // filtering + auto color = TexFilterLinear( + format, texel00, texel01, texel10, texel11, alpha, beta); + return color; + } + case FilterMode::Point: { + // addressing + uint32_t offset; + TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset); + + uint32_t addr = base_addr + offset * stride; + + // memory lookup + uint32_t texel = core_->dcache_read(addr, stride); + mem_addrs->push_back({addr, stride}); + + // filtering + auto color = TexFilterPoint(format, texel); + return color; + } + default: + std::abort(); + return 0; + } +} \ No newline at end of file diff --git a/sim/simx/tex_unit.h b/sim/simx/tex_unit.h new file mode 100644 index 00000000..5bca8098 --- /dev/null +++ b/sim/simx/tex_unit.h @@ -0,0 +1,28 @@ +#pragma once + +#include "types.h" + +namespace vortex { + +class Core; + +class TexUnit { +public: + TexUnit(Core* core); + ~TexUnit(); + + void clear(); + + uint32_t get_state(uint32_t state); + + void set_state(uint32_t state, uint32_t value); + + uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector* mem_addrs); + +private: + + std::array states_; + Core* core_; +}; + +} \ No newline at end of file diff --git a/sim/simx/types.h b/sim/simx/types.h new file mode 100644 index 00000000..9177dba4 --- /dev/null +++ b/sim/simx/types.h @@ -0,0 +1,417 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace vortex { + +typedef uint8_t Byte; +typedef uint32_t Word; 
+typedef int32_t WordI; + +typedef uint32_t Addr; +typedef uint32_t Size; + +typedef std::bitset<32> RegMask; +typedef std::bitset<32> ThreadMask; +typedef std::bitset<32> WarpMask; + +/////////////////////////////////////////////////////////////////////////////// + +enum class RegType { + None, + Integer, + Float, + Vector +}; + +inline std::ostream &operator<<(std::ostream &os, const RegType& type) { + switch (type) { + case RegType::None: break; + case RegType::Integer: os << "r"; break; + case RegType::Float: os << "fr"; break; + case RegType::Vector: os << "vr"; break; + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +enum class ExeType { + NOP, + ALU, + LSU, + CSR, + FPU, + GPU, + MAX, +}; + +inline std::ostream &operator<<(std::ostream &os, const ExeType& type) { + switch (type) { + case ExeType::NOP: os << "NOP"; break; + case ExeType::ALU: os << "ALU"; break; + case ExeType::LSU: os << "LSU"; break; + case ExeType::CSR: os << "CSR"; break; + case ExeType::FPU: os << "FPU"; break; + case ExeType::GPU: os << "GPU"; break; + case ExeType::MAX: break; + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +enum class AluType { + ARITH, + BRANCH, + SYSCALL, + IMUL, + IDIV, + CMOV, +}; + +inline std::ostream &operator<<(std::ostream &os, const AluType& type) { + switch (type) { + case AluType::ARITH: os << "ARITH"; break; + case AluType::BRANCH: os << "BRANCH"; break; + case AluType::SYSCALL: os << "SYSCALL"; break; + case AluType::IMUL: os << "IMUL"; break; + case AluType::IDIV: os << "IDIV"; break; + case AluType::CMOV: os << "CMOV"; break; + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +enum class LsuType { + LOAD, + STORE, + FENCE, + PREFETCH, +}; + +inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { + switch (type) { + case LsuType::LOAD: os << "LOAD"; break; + case 
LsuType::STORE: os << "STORE"; break; + case LsuType::FENCE: os << "FENCE"; break; + case LsuType::PREFETCH: os << "PREFETCH"; break; + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +enum class AddrType { + Global, + Shared, + IO, +}; + +inline std::ostream &operator<<(std::ostream &os, const AddrType& type) { + switch (type) { + case AddrType::Global: os << "Global"; break; + case AddrType::Shared: os << "Shared"; break; + case AddrType::IO: os << "IO"; break; + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +struct mem_addr_size_t { + uint64_t addr; + uint32_t size; +}; + +inline AddrType get_addr_type(Word addr, uint32_t size) { + __unused (size); + if (SM_ENABLE) { + if (addr >= (SMEM_BASE_ADDR - SMEM_SIZE) + && addr < SMEM_BASE_ADDR) { + assert((addr + size) <= SMEM_BASE_ADDR); + return AddrType::Shared; + } + } + if (addr >= IO_BASE_ADDR) { + return AddrType::IO; + } + return AddrType::Global; +} + +/////////////////////////////////////////////////////////////////////////////// + +enum class FpuType { + FNCP, + FMA, + FDIV, + FSQRT, + FCVT, +}; + +inline std::ostream &operator<<(std::ostream &os, const FpuType& type) { + switch (type) { + case FpuType::FNCP: os << "FNCP"; break; + case FpuType::FMA: os << "FMA"; break; + case FpuType::FDIV: os << "FDIV"; break; + case FpuType::FSQRT: os << "FSQRT"; break; + case FpuType::FCVT: os << "FCVT"; break; + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +enum class GpuType { + TMC, + WSPAWN, + SPLIT, + JOIN, + BAR, + TEX, +}; + +inline std::ostream &operator<<(std::ostream &os, const GpuType& type) { + switch (type) { + case GpuType::TMC: os << "TMC"; break; + case GpuType::WSPAWN: os << "WSPAWN"; break; + case GpuType::SPLIT: os << "SPLIT"; break; + case GpuType::JOIN: os << "JOIN"; break; + case GpuType::BAR: os << "BAR"; break; + case GpuType::TEX: 
os << "TEX"; break; + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +enum class ArbiterType { + Priority, + RoundRobin +}; + +inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) { + switch (type) { + case ArbiterType::Priority: os << "Priority"; break; + case ArbiterType::RoundRobin: os << "RoundRobin"; break; + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +struct MemReq { + uint64_t addr; + bool write; + bool non_cacheable; + uint32_t tag; + uint32_t core_id; + uint64_t uuid; + + MemReq(uint64_t _addr = 0, + bool _write = false, + bool _non_cacheable = false, + uint64_t _tag = 0, + uint32_t _core_id = 0, + uint64_t _uuid = 0 + ) : addr(_addr) + , write(_write) + , non_cacheable(_non_cacheable) + , tag(_tag) + , core_id(_core_id) + , uuid(_uuid) + {} +}; + +inline std::ostream &operator<<(std::ostream &os, const MemReq& req) { + os << "mem-" << (req.write ? 
"wr" : "rd") << ": "; + os << "addr=" << req.addr << ", tag=" << req.tag << ", core_id=" << req.core_id; + os << " (#" << std::dec << req.uuid << ")"; + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +struct MemRsp { + uint64_t tag; + uint32_t core_id; + uint64_t uuid; + MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0) + : tag (_tag) + , core_id(_core_id) + , uuid(_uuid) + {} +}; + +inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) { + os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id; + os << " (#" << std::dec << rsp.uuid << ")"; + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +template +class HashTable { +private: + std::vector> entries_; + uint32_t size_; + +public: + HashTable(uint32_t capacity) + : entries_(capacity) + , size_(0) + {} + + bool empty() const { + return (0 == size_); + } + + bool full() const { + return (size_ == entries_.size()); + } + + uint32_t size() const { + return size_; + } + + bool contains(uint32_t index) const { + return entries_.at(index).first; + } + + const T& at(uint32_t index) const { + auto& entry = entries_.at(index); + assert(entry.first); + return entry.second; + } + + T& at(uint32_t index) { + auto& entry = entries_.at(index); + assert(entry.first); + return entry.second; + } + + uint32_t allocate(const T& value) { + for (uint32_t i = 0, n = entries_.size(); i < n; ++i) { + auto& entry = entries_.at(i); + if (!entry.first) { + entry.first = true; + entry.second = value; + ++size_; + return i; + } + } + assert(false); + return -1; + } + + void release(uint32_t index) { + auto& entry = entries_.at(index); + assert(entry.first); + entry.first = false; + --size_; + } + + void clear() { + for (uint32_t i = 0, n = entries_.size(); i < n; ++i) { + auto& entry = entries_.at(i); + entry.first = false; + } + size_ = 0; + } +}; + 
+/////////////////////////////////////////////////////////////////////////////// + +template +class Switch : public SimObject> { +private: + ArbiterType type_; + uint32_t delay_; + uint32_t cursor_; + uint32_t tag_shift_; + +public: + Switch( + const SimContext& ctx, + const char* name, + ArbiterType type, + uint32_t num_inputs, + uint32_t delay = 1 + ) + : SimObject>(ctx, name) + , type_(type) + , delay_(delay) + , cursor_(0) + , tag_shift_(log2ceil(num_inputs)) + , ReqIn(num_inputs, this) + , ReqOut(this) + , RspIn(this) + , RspOut(num_inputs, this) + { + assert(delay_ != 0); + assert(num_inputs <= MaxInputs); + if (num_inputs == 1) { + // bypass + ReqIn.at(0).bind(&ReqOut); + RspIn.bind(&RspOut.at(0)); + } + } + + void reset() { + cursor_ = 0; + } + + void tick() { + if (ReqIn.size() == 1) + return; + + // process incomming requests + for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) { + uint32_t j = (cursor_ + i) % n; + auto& req_in = ReqIn.at(j); + if (!req_in.empty()) { + auto& req = req_in.front(); + if (tag_shift_) { + req.tag = (req.tag << tag_shift_) | j; + } + ReqOut.send(req, delay_); + req_in.pop(); + this->update_cursor(j); + break; + } + } + + // process incoming reponses + if (!RspIn.empty()) { + auto& rsp = RspIn.front(); + uint32_t port_id = 0; + if (tag_shift_) { + port_id = rsp.tag & ((1 << tag_shift_)-1); + rsp.tag >>= tag_shift_; + } + RspOut.at(port_id).send(rsp, 1); + RspIn.pop(); + } + } + + void update_cursor(uint32_t grant) { + if (type_ == ArbiterType::RoundRobin) { + cursor_ = grant + 1; + } + } + + std::vector> ReqIn; + SimPort ReqOut; + SimPort RspIn; + std::vector> RspOut; +}; + +} \ No newline at end of file diff --git a/sim/simx/warp.cpp b/sim/simx/warp.cpp new file mode 100644 index 00000000..b05b1246 --- /dev/null +++ b/sim/simx/warp.cpp @@ -0,0 +1,78 @@ +#include +#include +#include +#include +#include +#include + +#include "instr.h" +#include "core.h" + +using namespace vortex; + +Warp::Warp(Core *core, Word id) + : id_(id) + 
, core_(core) + , ireg_file_(core->arch().num_threads(), std::vector(core->arch().num_regs())) + , freg_file_(core->arch().num_threads(), std::vector(core->arch().num_regs())) + , vreg_file_(core->arch().num_threads(), std::vector(core->arch().vsize())) +{ + this->clear(); +} + +void Warp::clear() { + active_ = false; + PC_ = STARTUP_ADDR; + tmask_.reset(); + for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) { + for (auto& reg : ireg_file_.at(i)) { + reg = 0; + } + for (auto& reg : freg_file_.at(i)) { + reg = 0; + } + for (auto& reg : vreg_file_.at(i)) { + reg = 0; + } + } +} + +void Warp::eval(pipeline_trace_t *trace) { + assert(tmask_.any()); + + DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask="); + for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) + DPN(2, tmask_.test(n-i-1)); + DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl); + + /* Fetch and decode. */ + + Word instr_code = core_->icache_read(PC_, sizeof(Word)); + auto instr = core_->decoder().decode(instr_code); + if (!instr) { + std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl; + std::abort(); + } + + DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr); + + // Update trace + trace->cid = core_->id(); + trace->wid = id_; + trace->PC = PC_; + trace->tmask = tmask_; + trace->rdest = instr->getRDest(); + trace->rdest_type = instr->getRDType(); + + // Execute + this->execute(*instr, trace); + + DP(4, "Register state:"); + for (int i = 0; i < core_->arch().num_regs(); ++i) { + DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); + for (int j = 0; j < core_->arch().num_threads(); ++j) { + DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' '); + } + DPN(4, std::endl); + } +} \ No newline at end of file diff --git a/sim/simX/warp.h b/sim/simx/warp.h similarity index 76% rename from 
sim/simX/warp.h rename to sim/simx/warp.h index 7473d858..9e9970f3 100644 --- a/sim/simX/warp.h +++ b/sim/simx/warp.h @@ -9,7 +9,7 @@ namespace vortex { class Core; class Instr; -class Pipeline; +class pipeline_trace_t; struct DomStackEntry { DomStackEntry(const ThreadMask &tmask, Word PC) : tmask(tmask) @@ -48,6 +48,10 @@ public: return active_; } + void suspend() { + active_ = false; + } + void activate() { active_ = true; } @@ -71,7 +75,7 @@ public: } void setTmask(size_t index, bool value) { - tmask_[index] = value; + tmask_.set(index, value); active_ = tmask_.any(); } @@ -82,26 +86,26 @@ public: } Word getIRegValue(int reg) const { - return iRegFile_[0][reg]; + return ireg_file_.at(0).at(reg); } - void step(Pipeline *); + void eval(pipeline_trace_t *); private: - void execute(const Instr &instr, Pipeline *); + void execute(const Instr &instr, pipeline_trace_t *trace); Word id_; - bool active_; Core *core_; + bool active_; Word PC_; ThreadMask tmask_; - std::vector> iRegFile_; - std::vector> fRegFile_; - std::vector> vRegFile_; - std::stack domStack_; + std::vector> ireg_file_; + std::vector> freg_file_; + std::vector> vreg_file_; + std::stack dom_stack_; struct vtype vtype_; int vl_; diff --git a/sim/vlsim/Makefile b/sim/vlsim/Makefile index ce01395d..bd34e60f 100644 --- a/sim/vlsim/Makefile +++ b/sim/vlsim/Makefile @@ -1,13 +1,17 @@ +DESTDIR ?= . RTL_DIR = ../../hw/rtl DPI_DIR = ../../hw/dpi -SCRIPT_DIR=../../hw/scripts +SCRIPT_DIR = ../../hw/scripts +THIRD_PARTY_DIR = ../../third_party CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I.. 
-I../../../hw -I../../common -CXXFLAGS += -I../../common/softfloat/source/include +CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include +CXXFLAGS += -I../$(THIRD_PARTY_DIR) -LDFLAGS += -shared ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a +LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a +LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator # control RTL debug tracing states DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE @@ -24,13 +28,12 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_FLAGS += $(DBG_TRACE_FLAGS) -DBG_FLAGS += -DDBG_CACHE_REQ_INFO SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += fpga.cpp opae_sim.cpp -FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src +FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src TEX_INCLUDE = -I$(RTL_DIR)/tex_unit RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE) RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip @@ -51,10 +54,13 @@ CXXFLAGS += $(CONFIGS) #THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') #VL_FLAGS += --threads $(THREADS) +# Enable VCD trace +#VCD_TRACE = -DVCD_OUTPUT + # Debugigng ifdef DEBUG - VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS) - CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS) + VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS) + CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG CXXFLAGS += -O2 -DNDEBUG @@ -84,22 +90,15 @@ 
VL_FLAGS += -DIDIV_DPI FPU_CORE ?= FPU_DPI VL_FLAGS += -D$(FPU_CORE) -PROJECT = libopae-c-vlsim +PROJECT = libopae-c-vlsim.so -all: $(PROJECT).so +all: $(PROJECT) vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh $(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h -$(PROJECT).so: $(SRCS) vortex_afu.h - verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT).so +$(DESTDIR)/$(PROJECT): $(SRCS) vortex_afu.h + verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT) -static: $(SRCS) vortex_afu.h - verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' - $(AR) rcs $(PROJECT).a obj_dir/*.o ../common/softfloat/build/Linux-x86_64-GCC/*.o - -clean-static: - rm -rf $(PROJECT).a obj_dir vortex_afu.h - -clean: clean-static - rm -rf $(PROJECT).so +clean: + rm -rf obj_dir $(DESTDIR)/$(PROJECT) diff --git a/sim/vlsim/opae_sim.cpp b/sim/vlsim/opae_sim.cpp index ced1e233..ff632bf4 100644 --- a/sim/vlsim/opae_sim.cpp +++ b/sim/vlsim/opae_sim.cpp @@ -13,13 +13,41 @@ #include #include +#define RAMULATOR +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#ifndef MEMORY_BANKS + #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS + #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS + #else + #define MEMORY_BANKS 2 + #endif +#endif + +#ifndef MEM_CYCLE_RATIO +#define MEM_CYCLE_RATIO -1 +#endif + +#undef MEM_BLOCK_SIZE +#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8) + +#define CACHE_BLOCK_SIZE 64 + #define CCI_LATENCY 8 #define CCI_RAND_MOD 8 #define CCI_RQ_SIZE 16 #define CCI_WQ_SIZE 16 -#define ENABLE_MEM_STALLS - #ifndef TRACE_START_TIME #define TRACE_START_TIME 0ull #endif @@ -28,22 +56,12 @@ #define TRACE_STOP_TIME -1ull #endif -#ifndef MEM_LATENCY -#define MEM_LATENCY 24 -#endif - -#ifndef MEM_RQ_SIZE -#define MEM_RQ_SIZE 16 -#endif - -#ifndef MEM_STALLS_MODULO -#define 
MEM_STALLS_MODULO 16 -#endif - #ifndef VERILATOR_RESET_VALUE #define VERILATOR_RESET_VALUE 2 #endif +#define RAM_PAGE_SIZE 4096 + using namespace vortex; static uint64_t timestamp = 0; @@ -86,357 +104,431 @@ void sim_trace_enable(bool enable) { /////////////////////////////////////////////////////////////////////////////// -namespace vortex { -class VL_OBJ { +class opae_sim::Impl { public: -#ifdef AXI_BUS - VVortex_axi *device; -#else - Vvortex_afu_shim *device; -#endif -#ifdef VCD_OUTPUT - VerilatedVcdC *trace; -#endif - - VL_OBJ() { + Impl() + : stop_(false) + , host_buffer_ids_(0) { // force random values for unitialized signals Verilated::randReset(VERILATOR_RESET_VALUE); Verilated::randSeed(50); - // Turn off assertion before reset + // turn off assertion before reset Verilated::assertOn(false); - #ifdef AXI_BUS - this->device = new Vvortex_afu_shim(); - #else - this->device = new Vvortex_afu_shim(); - #endif + // create RTL module instance + device_ = new Vvortex_afu_shim(); #ifdef VCD_OUTPUT Verilated::traceEverOn(true); - this->trace = new VerilatedVcdC(); - this->device->trace(this->trace, 99); - this->trace->open("trace.vcd"); + trace_ = new VerilatedVcdC(); + device_->trace(this->trace, 99); + trace_->open("trace.vcd"); #endif + + ram_ = new RAM(RAM_PAGE_SIZE); + + // initialize dram simulator + ramulator::Config ram_config; + ram_config.add("standard", "DDR4"); + ram_config.add("channels", std::to_string(MEMORY_BANKS)); + ram_config.add("ranks", "1"); + ram_config.add("speed", "DDR4_2400R"); + ram_config.add("org", "DDR4_4Gb_x8"); + ram_config.add("mapping", "defaultmapping"); + ram_config.set_core_num(1); + dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE); + Stats::statlist.output("ramulator.ddr4.log"); + + // reset the device + this->reset(); + + // launch execution thread + future_ = std::async(std::launch::async, [&]{ + while (!stop_) { + std::lock_guard guard(mutex_); + this->tick(); + } + }); } - ~VL_OBJ() { + ~Impl() { + stop_ = 
true; + if (future_.valid()) { + future_.wait(); + } + for (auto& buffer : host_buffers_) { + __aligned_free(buffer.second.data); + } #ifdef VCD_OUTPUT - this->trace->close(); - delete this->trace; + trace_->close(); + delete trace_; #endif - delete this->device; - } -}; -} + delete device_; + + delete ram_; -/////////////////////////////////////////////////////////////////////////////// - -opae_sim::opae_sim() - : stop_(false) - , host_buffer_ids_(0) { - vl_obj_ = new VL_OBJ(); - ram_ = new RAM((1<<12), (1<<20)); - - // reset the device - this->reset(); - - // launch execution thread - future_ = std::async(std::launch::async, [&]{ - while (!stop_) { - std::lock_guard guard(mutex_); - this->step(); - } - }); -} - -opae_sim::~opae_sim() { - stop_ = true; - if (future_.valid()) { - future_.wait(); - } - for (auto& buffer : host_buffers_) { - __aligned_free(buffer.second.data); - } - delete vl_obj_; - delete ram_; -} - -int opae_sim::prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) { - auto alloc = __aligned_malloc(CACHE_BLOCK_SIZE, len); - if (alloc == NULL) - return -1; - host_buffer_t buffer; - buffer.data = (uint64_t*)alloc; - buffer.size = len; - buffer.ioaddr = uintptr_t(alloc); - auto buffer_id = host_buffer_ids_++; - host_buffers_.emplace(buffer_id, buffer); - *buf_addr = alloc; - *wsid = buffer_id; - return 0; -} - -void opae_sim::release_buffer(uint64_t wsid) { - auto it = host_buffers_.find(wsid); - if (it != host_buffers_.end()) { - __aligned_free(it->second.data); - host_buffers_.erase(it); - } -} - -void opae_sim::get_io_address(uint64_t wsid, uint64_t *ioaddr) { - *ioaddr = host_buffers_[wsid].ioaddr; -} - -void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) { - std::lock_guard guard(mutex_); - - vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid = 1; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; - 
vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; - this->step(); - vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid = 0; - assert(vl_obj_->device->af2cp_sTxPort_c2_mmioRdValid); - *value = vl_obj_->device->af2cp_sTxPort_c2_data; -} - -void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) { - std::lock_guard guard(mutex_); - - vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid = 1; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; - memcpy(vl_obj_->device->vcp2af_sRxPort_c0_data, &value, 8); - this->step(); - vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid = 0; -} - -/////////////////////////////////////////////////////////////////////////////// - -void opae_sim::reset() { - cci_reads_.clear(); - cci_writes_.clear(); - vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid = 0; - vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid = 0; - vl_obj_->device->vcp2af_sRxPort_c0_rspValid = 0; - vl_obj_->device->vcp2af_sRxPort_c1_rspValid = 0; - vl_obj_->device->vcp2af_sRxPort_c0_TxAlmFull = 0; - vl_obj_->device->vcp2af_sRxPort_c1_TxAlmFull = 0; - - for (int b = 0; b < MEMORY_BANKS; ++b) { - mem_reads_[b].clear(); - vl_obj_->device->avs_readdatavalid[b] = 0; - vl_obj_->device->avs_waitrequest[b] = 0; + if (dram_) { + dram_->finish(); + Stats::statlist.printall(); + delete dram_; + } } - vl_obj_->device->reset = 1; + int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) { + auto alloc = __aligned_malloc(CACHE_BLOCK_SIZE, len); + if (alloc == NULL) + return -1; + host_buffer_t buffer; + buffer.data = (uint64_t*)alloc; + buffer.size = len; + buffer.ioaddr = uintptr_t(alloc); + auto buffer_id = host_buffer_ids_++; + host_buffers_.emplace(buffer_id, buffer); + *buf_addr = alloc; + *wsid = buffer_id; + return 0; + } - for (int i = 0; i < RESET_DELAY; ++i) { - vl_obj_->device->clk = 0; + void 
release_buffer(uint64_t wsid) { + auto it = host_buffers_.find(wsid); + if (it != host_buffers_.end()) { + __aligned_free(it->second.data); + host_buffers_.erase(it); + } + } + + void get_io_address(uint64_t wsid, uint64_t *ioaddr) { + *ioaddr = host_buffers_[wsid].ioaddr; + } + + void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) { + std::lock_guard guard(mutex_); + + device_->vcp2af_sRxPort_c0_mmioRdValid = 1; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; + this->tick(); + device_->vcp2af_sRxPort_c0_mmioRdValid = 0; + assert(device_->af2cp_sTxPort_c2_mmioRdValid); + *value = device_->af2cp_sTxPort_c2_data; + } + + void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) { + std::lock_guard guard(mutex_); + + device_->vcp2af_sRxPort_c0_mmioWrValid = 1; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; + memcpy(device_->vcp2af_sRxPort_c0_data, &value, 8); + this->tick(); + device_->vcp2af_sRxPort_c0_mmioWrValid = 0; + } + +private: + + void reset() { + cci_reads_.clear(); + cci_writes_.clear(); + device_->vcp2af_sRxPort_c0_mmioRdValid = 0; + device_->vcp2af_sRxPort_c0_mmioWrValid = 0; + device_->vcp2af_sRxPort_c0_rspValid = 0; + device_->vcp2af_sRxPort_c1_rspValid = 0; + device_->vcp2af_sRxPort_c0_TxAlmFull = 0; + device_->vcp2af_sRxPort_c1_TxAlmFull = 0; + + for (int b = 0; b < MEMORY_BANKS; ++b) { + pending_mem_reqs_[b].clear(); + device_->avs_readdatavalid[b] = 0; + device_->avs_waitrequest[b] = 0; + } + + device_->reset = 1; + + for (int i = 0; i < RESET_DELAY; ++i) { + device_->clk = 0; + this->eval(); + device_->clk = 1; + this->eval(); + } + + device_->reset = 0; + + // Turn on assertion after reset + Verilated::assertOn(true); + } + + void tick() { + this->sRxPort_bus(); + this->sTxPort_bus(); + 
this->avs_bus(); + + if (!dram_queue_.empty()) { + if (dram_->send(dram_queue_.front())) + dram_queue_.pop(); + } + + device_->clk = 0; this->eval(); - vl_obj_->device->clk = 1; + device_->clk = 1; this->eval(); - } - vl_obj_->device->reset = 0; - - // Turn on assertion after reset - Verilated::assertOn(true); -} - -void opae_sim::step() { - this->sRxPort_bus(); - this->sTxPort_bus(); - this->avs_bus(); - - vl_obj_->device->clk = 0; - this->eval(); - vl_obj_->device->clk = 1; - this->eval(); - -#ifndef NDEBUG - fflush(stdout); -#endif -} - -void opae_sim::eval() { - vl_obj_->device->eval(); -#ifdef VCD_OUTPUT - if (sim_trace_enabled()) { - vl_obj_->trace->dump(timestamp); - } -#endif - ++timestamp; -} - -void opae_sim::sRxPort_bus() { - // check mmio request - bool mmio_req_enabled = vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid - || vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid; - - // schedule CCI read responses - std::list::iterator cci_rd_it(cci_reads_.end()); - for (auto it = cci_reads_.begin(), ie = cci_reads_.end(); it != ie; ++it) { - if (it->cycles_left > 0) - it->cycles_left -= 1; - if ((cci_rd_it == ie) && (it->cycles_left == 0)) { - cci_rd_it = it; - } - } - - // schedule CCI write responses - std::list::iterator cci_wr_it(cci_writes_.end()); - for (auto it = cci_writes_.begin(), ie = cci_writes_.end(); it != ie; ++it) { - if (it->cycles_left > 0) - it->cycles_left -= 1; - if ((cci_wr_it == ie) && (it->cycles_left == 0)) { - cci_wr_it = it; - } - } - - // send CCI write response - vl_obj_->device->vcp2af_sRxPort_c1_rspValid = 0; - if (cci_wr_it != cci_writes_.end()) { - vl_obj_->device->vcp2af_sRxPort_c1_rspValid = 1; - vl_obj_->device->vcp2af_sRxPort_c1_hdr_resp_type = 0; - vl_obj_->device->vcp2af_sRxPort_c1_hdr_mdata = cci_wr_it->mdata; - cci_writes_.erase(cci_wr_it); - } - - // send CCI read response (ensure mmio disabled) - vl_obj_->device->vcp2af_sRxPort_c0_rspValid = 0; - if (!mmio_req_enabled - && (cci_rd_it != cci_reads_.end())) { - 
vl_obj_->device->vcp2af_sRxPort_c0_rspValid = 1; - vl_obj_->device->vcp2af_sRxPort_c0_hdr_resp_type = 0; - memcpy(vl_obj_->device->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE); - vl_obj_->device->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata; - /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata); - for (int i = 0; i < CACHE_BLOCK_SIZE; ++i) - printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]); - printf("\n");*/ - cci_reads_.erase(cci_rd_it); - } -} - -void opae_sim::sTxPort_bus() { - // process read requests - if (vl_obj_->device->af2cp_sTxPort_c0_valid) { - assert(!vl_obj_->device->vcp2af_sRxPort_c0_TxAlmFull); - cci_rd_req_t cci_req; - cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); - cci_req.addr = vl_obj_->device->af2cp_sTxPort_c0_hdr_address; - cci_req.mdata = vl_obj_->device->af2cp_sTxPort_c0_hdr_mdata; - auto host_ptr = (uint64_t*)(vl_obj_->device->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE); - memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE); - //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, vl_obj_->device->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); - cci_reads_.emplace_back(cci_req); - } - - // process write requests - if (vl_obj_->device->af2cp_sTxPort_c1_valid) { - assert(!vl_obj_->device->vcp2af_sRxPort_c1_TxAlmFull); - cci_wr_req_t cci_req; - cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); - cci_req.mdata = vl_obj_->device->af2cp_sTxPort_c1_hdr_mdata; - auto host_ptr = (uint64_t*)(vl_obj_->device->af2cp_sTxPort_c1_hdr_address * CACHE_BLOCK_SIZE); - memcpy(host_ptr, vl_obj_->device->af2cp_sTxPort_c1_data, CACHE_BLOCK_SIZE); - cci_writes_.emplace_back(cci_req); - } - - // check queues overflow - vl_obj_->device->vcp2af_sRxPort_c0_TxAlmFull = (cci_reads_.size() >= (CCI_RQ_SIZE-1)); - vl_obj_->device->vcp2af_sRxPort_c1_TxAlmFull = (cci_writes_.size() >= (CCI_WQ_SIZE-1)); -} - -void opae_sim::avs_bus() { - for 
(int b = 0; b < MEMORY_BANKS; ++b) { - // update memory responses schedule - for (auto& rsp : mem_reads_[b]) { - if (rsp.cycles_left > 0) - rsp.cycles_left -= 1; + if (MEM_CYCLE_RATIO > 0) { + auto cycle = timestamp / 2; + if ((cycle % MEM_CYCLE_RATIO) == 0) + dram_->tick(); + } else { + for (int i = MEM_CYCLE_RATIO; i <= 0; ++i) + dram_->tick(); } - // schedule memory responses in FIFO order - std::list::iterator mem_rd_it(mem_reads_[b].end()); - if (!mem_reads_[b].empty() - && (0 == mem_reads_[b].begin()->cycles_left)) { - mem_rd_it = mem_reads_[b].begin(); - } + #ifndef NDEBUG + fflush(stdout); + #endif + } - // send memory response - vl_obj_->device->avs_readdatavalid[b] = 0; - if (mem_rd_it != mem_reads_[b].end()) { - vl_obj_->device->avs_readdatavalid[b] = 1; - memcpy(vl_obj_->device->avs_readdata[b], mem_rd_it->data.data(), MEM_BLOCK_SIZE); - uint32_t addr = mem_rd_it->addr; - mem_reads_[b].erase(mem_rd_it); - /*printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%x, pending={", timestamp, b, addr * MEM_BLOCK_SIZE); - for (auto& req : mem_reads_[b]) { - if (req.cycles_left != 0) - printf(" !%0x", req.addr * MEM_BLOCK_SIZE); - else - printf(" %0x", req.addr * MEM_BLOCK_SIZE); - } - printf("}\n");*/ - } - - // handle memory stalls - bool mem_stalled = false; - #ifdef ENABLE_MEM_STALLS - if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { - mem_stalled = true; - } else - if (mem_reads_[b].size() >= MEM_RQ_SIZE) { - mem_stalled = true; + void eval() { + device_->eval(); + #ifdef VCD_OUTPUT + if (sim_trace_enabled()) { + trace_->dump(timestamp); } #endif + ++timestamp; + } - // process memory requests - if (!mem_stalled) { - assert(!vl_obj_->device->avs_read[b] || !vl_obj_->device->avs_write[b]); - if (vl_obj_->device->avs_write[b]) { - uint64_t byteen = vl_obj_->device->avs_byteenable[b]; - unsigned base_addr = vl_obj_->device->avs_address[b] * MEM_BLOCK_SIZE; - uint8_t* data = (uint8_t*)(vl_obj_->device->avs_writedata[b]); + void sRxPort_bus() { + // check mmio request + 
bool mmio_req_enabled = device_->vcp2af_sRxPort_c0_mmioRdValid + || device_->vcp2af_sRxPort_c0_mmioWrValid; + + // schedule CCI read responses + std::list::iterator cci_rd_it(cci_reads_.end()); + for (auto it = cci_reads_.begin(), ie = cci_reads_.end(); it != ie; ++it) { + if (it->cycles_left > 0) + it->cycles_left -= 1; + if ((cci_rd_it == ie) && (it->cycles_left == 0)) { + cci_rd_it = it; + } + } + + // schedule CCI write responses + std::list::iterator cci_wr_it(cci_writes_.end()); + for (auto it = cci_writes_.begin(), ie = cci_writes_.end(); it != ie; ++it) { + if (it->cycles_left > 0) + it->cycles_left -= 1; + if ((cci_wr_it == ie) && (it->cycles_left == 0)) { + cci_wr_it = it; + } + } + + // send CCI write response + device_->vcp2af_sRxPort_c1_rspValid = 0; + if (cci_wr_it != cci_writes_.end()) { + device_->vcp2af_sRxPort_c1_rspValid = 1; + device_->vcp2af_sRxPort_c1_hdr_resp_type = 0; + device_->vcp2af_sRxPort_c1_hdr_mdata = cci_wr_it->mdata; + cci_writes_.erase(cci_wr_it); + } + + // send CCI read response (ensure mmio disabled) + device_->vcp2af_sRxPort_c0_rspValid = 0; + if (!mmio_req_enabled + && (cci_rd_it != cci_reads_.end())) { + device_->vcp2af_sRxPort_c0_rspValid = 1; + device_->vcp2af_sRxPort_c0_hdr_resp_type = 0; + memcpy(device_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE); + device_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata; + /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata); + for (int i = 0; i < CACHE_BLOCK_SIZE; ++i) + printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]); + printf("\n");*/ + cci_reads_.erase(cci_rd_it); + } + } + + void sTxPort_bus() { + // process read requests + if (device_->af2cp_sTxPort_c0_valid) { + assert(!device_->vcp2af_sRxPort_c0_TxAlmFull); + cci_rd_req_t cci_req; + cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); + cci_req.addr = device_->af2cp_sTxPort_c0_hdr_address; + cci_req.mdata = 
device_->af2cp_sTxPort_c0_hdr_mdata; + auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE); + memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE); + //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); + cci_reads_.emplace_back(cci_req); + } + + // process write requests + if (device_->af2cp_sTxPort_c1_valid) { + assert(!device_->vcp2af_sRxPort_c1_TxAlmFull); + cci_wr_req_t cci_req; + cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); + cci_req.mdata = device_->af2cp_sTxPort_c1_hdr_mdata; + auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c1_hdr_address * CACHE_BLOCK_SIZE); + memcpy(host_ptr, device_->af2cp_sTxPort_c1_data, CACHE_BLOCK_SIZE); + cci_writes_.emplace_back(cci_req); + } + + // check queues overflow + device_->vcp2af_sRxPort_c0_TxAlmFull = (cci_reads_.size() >= (CCI_RQ_SIZE-1)); + device_->vcp2af_sRxPort_c1_TxAlmFull = (cci_writes_.size() >= (CCI_WQ_SIZE-1)); + } + + void avs_bus() { + for (int b = 0; b < MEMORY_BANKS; ++b) { + // process memory responses + device_->avs_readdatavalid[b] = 0; + if (!pending_mem_reqs_[b].empty() + && (*pending_mem_reqs_[b].begin())->ready) { + auto mem_rd_it = pending_mem_reqs_[b].begin(); + auto mem_req = *mem_rd_it; + device_->avs_readdatavalid[b] = 1; + memcpy(device_->avs_readdata[b], mem_req->data.data(), MEM_BLOCK_SIZE); + uint32_t addr = mem_req->addr; + pending_mem_reqs_[b].erase(mem_rd_it); + delete mem_req; + } + + // process memory requests + assert(!device_->avs_read[b] || !device_->avs_write[b]); + unsigned byte_addr = device_->avs_address[b] * MEM_BLOCK_SIZE; + if (device_->avs_write[b]) { + uint64_t byteen = device_->avs_byteenable[b]; + uint8_t* data = (uint8_t*)(device_->avs_writedata[b]); for (int i = 0; i < MEM_BLOCK_SIZE; i++) { if ((byteen >> i) & 0x1) { - (*ram_)[base_addr + i] = data[i]; + (*ram_)[byte_addr + i] = data[i]; } } - /*printf("%0ld: [sim] MEM Wr Req: bank=%d, 
addr=%x, data=", timestamp, b, base_addr); + + /*printf("%0ld: [sim] MEM Wr Req: bank=%d, addr=%x, data=", timestamp, b, byte_addr); for (int i = 0; i < MEM_BLOCK_SIZE; i++) { printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); } printf("\n");*/ + + // send dram request + ramulator::Request dram_req( + byte_addr, + ramulator::Request::Type::WRITE, + 0 + ); + dram_queue_.push(dram_req); } - if (vl_obj_->device->avs_read[b]) { - mem_rd_req_t mem_req; - mem_req.addr = vl_obj_->device->avs_address[b]; - ram_->read(mem_req.data.data(), vl_obj_->device->avs_address[b] * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE); - mem_req.cycles_left = MEM_LATENCY; - for (auto& rsp : mem_reads_[b]) { - if (mem_req.addr == rsp.addr) { - // duplicate requests receive the same cycle delay - mem_req.cycles_left = rsp.cycles_left; - break; - } - } - mem_reads_[b].emplace_back(mem_req); + + if (device_->avs_read[b]) { + auto mem_req = new mem_rd_req_t(); + mem_req->addr = device_->avs_address[b]; + ram_->read(mem_req->data.data(), byte_addr, MEM_BLOCK_SIZE); + mem_req->ready = false; + pending_mem_reqs_[b].emplace_back(mem_req); + /*printf("%0ld: [sim] MEM Rd Req: bank=%d, addr=%x, pending={", timestamp, b, mem_req.addr * MEM_BLOCK_SIZE); - for (auto& req : mem_reads_[b]) { + for (auto& req : pending_mem_reqs_[b]) { if (req.cycles_left != 0) printf(" !%0x", req.addr * MEM_BLOCK_SIZE); else printf(" %0x", req.addr * MEM_BLOCK_SIZE); } printf("}\n");*/ - } - } - vl_obj_->device->avs_waitrequest[b] = mem_stalled; + // send dram request + ramulator::Request dram_req( + byte_addr, + ramulator::Request::Type::READ, + std::bind([](ramulator::Request& dram_req, mem_rd_req_t* mem_req) { + mem_req->ready = true; + }, placeholders::_1, mem_req), + 0 + ); + dram_queue_.push(dram_req); + } + + device_->avs_waitrequest[b] = false; + } } + + typedef struct { + bool ready; + std::array data; + uint32_t addr; + } mem_rd_req_t; + + typedef struct { + int cycles_left; + std::array data; + uint64_t addr; + uint32_t mdata; + } 
cci_rd_req_t; + + typedef struct { + int cycles_left; + uint32_t mdata; + } cci_wr_req_t; + + typedef struct { + uint64_t* data; + size_t size; + uint64_t ioaddr; + } host_buffer_t; + + std::future future_; + bool stop_; + + std::unordered_map host_buffers_; + int64_t host_buffer_ids_; + + std::list pending_mem_reqs_[MEMORY_BANKS]; + + std::list cci_reads_; + + std::list cci_writes_; + + std::mutex mutex_; + + RAM *ram_; + + ramulator::Gem5Wrapper* dram_; + + std::queue dram_queue_; + + Vvortex_afu_shim *device_; +#ifdef VCD_OUTPUT + VerilatedVcdC *trace_; +#endif +}; + +/////////////////////////////////////////////////////////////////////////////// + +opae_sim::opae_sim() + : impl_(new Impl()) +{} + +opae_sim::~opae_sim() { + delete impl_; +} + +int opae_sim::prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) { + return impl_->prepare_buffer(len, buf_addr, wsid, flags); +} + +void opae_sim::release_buffer(uint64_t wsid) { + impl_->release_buffer(wsid); +} + +void opae_sim::get_io_address(uint64_t wsid, uint64_t *ioaddr) { + impl_->get_io_address(wsid, ioaddr); +} + +void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) { + impl_->write_mmio64(mmio_num, offset, value); +} + +void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) { + impl_->read_mmio64(mmio_num, offset, value); } \ No newline at end of file diff --git a/sim/vlsim/opae_sim.h b/sim/vlsim/opae_sim.h index aa19532f..21010b94 100644 --- a/sim/vlsim/opae_sim.h +++ b/sim/vlsim/opae_sim.h @@ -1,29 +1,8 @@ #pragma once -#include -#include - -#include -#include -#include -#include - -#ifndef MEMORY_BANKS - #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #else - #define MEMORY_BANKS 2 - #endif -#endif - -#undef MEM_BLOCK_SIZE -#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8) - -#define CACHE_BLOCK_SIZE 64 - +#include namespace vortex { -class VL_OBJ; class RAM; class 
opae_sim { @@ -44,57 +23,8 @@ public: private: - typedef struct { - int cycles_left; - std::array data; - uint32_t addr; - } mem_rd_req_t; - - typedef struct { - int cycles_left; - std::array data; - uint64_t addr; - uint32_t mdata; - } cci_rd_req_t; - - typedef struct { - int cycles_left; - uint32_t mdata; - } cci_wr_req_t; - - typedef struct { - uint64_t* data; - size_t size; - uint64_t ioaddr; - } host_buffer_t; - - void reset(); - - void eval(); - - void step(); - - void sRxPort_bus(); - void sTxPort_bus(); - void avs_bus(); - - std::future future_; - bool stop_; - - std::unordered_map host_buffers_; - int64_t host_buffer_ids_; - - std::list mem_reads_ [MEMORY_BANKS]; - - std::list cci_reads_; - - std::list cci_writes_; - - std::mutex mutex_; - - RAM *ram_; - - VL_OBJ* vl_obj_; + class Impl; + Impl* impl_; }; } \ No newline at end of file diff --git a/tests/opencl/BlackScholes/Makefile b/tests/opencl/BlackScholes/Makefile index 54ffe7ab..30091c87 100644 --- a/tests/opencl/BlackScholes/Makefile +++ b/tests/opencl/BlackScholes/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/DotProduct/Makefile b/tests/opencl/DotProduct/Makefile index 44eaf258..3f3a68f3 100644 --- a/tests/opencl/DotProduct/Makefile +++ b/tests/opencl/DotProduct/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = 
$(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/VectorHypot/Makefile b/tests/opencl/VectorHypot/Makefile index bd5cf982..e58561ca 100644 --- a/tests/opencl/VectorHypot/Makefile +++ b/tests/opencl/VectorHypot/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/cutcp/Makefile b/tests/opencl/cutcp/Makefile index 7d4ed97b..3d694a63 100644 --- a/tests/opencl/cutcp/Makefile +++ b/tests/opencl/cutcp/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/lbm/Makefile b/tests/opencl/lbm/Makefile index 2f0116a8..ffa85d1a 100644 --- a/tests/opencl/lbm/Makefile +++ b/tests/opencl/lbm/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/mri-q/Makefile b/tests/opencl/mri-q/Makefile index ff9f420c..0aa409b6 100644 --- 
a/tests/opencl/mri-q/Makefile +++ b/tests/opencl/mri-q/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/reduce0/Makefile b/tests/opencl/reduce0/Makefile index b4aede1d..bb72241f 100644 --- a/tests/opencl/reduce0/Makefile +++ b/tests/opencl/reduce0/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/sad/Makefile b/tests/opencl/sad/Makefile index a8314b9c..129996be 100644 --- a/tests/opencl/sad/Makefile +++ b/tests/opencl/sad/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/spmv/Makefile b/tests/opencl/spmv/Makefile index eedbed22..f3c7a13f 100644 --- a/tests/opencl/spmv/Makefile +++ b/tests/opencl/spmv/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= 
$(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/stencil/Makefile b/tests/opencl/stencil/Makefile index ba69490b..41e05787 100644 --- a/tests/opencl/stencil/Makefile +++ b/tests/opencl/stencil/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/regression/basic/main.cpp b/tests/regression/basic/main.cpp index c92bae8d..fcea1fda 100755 --- a/tests/regression/basic/main.cpp +++ b/tests/regression/basic/main.cpp @@ -169,7 +169,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg, std::cout << "start execution" << std::endl; auto t2 = std::chrono::high_resolution_clock::now(); RT_CHECK(vx_start(device)); - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); auto t3 = std::chrono::high_resolution_clock::now(); // read destination buffer from local memory @@ -228,7 +228,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores; + uint64_t max_cores; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); uint32_t num_points = count; uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64; diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp index 2961b517..29cc7d85 100644 --- a/tests/regression/demo/main.cpp +++ b/tests/regression/demo/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // 
wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/diverge/main.cpp b/tests/regression/diverge/main.cpp index 7b27760c..778d118f 100644 --- a/tests/regression/diverge/main.cpp +++ b/tests/regression/diverge/main.cpp @@ -121,7 +121,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/dogfood/main.cpp b/tests/regression/dogfood/main.cpp index 804609ae..71ae6624 100644 --- a/tests/regression/dogfood/main.cpp +++ b/tests/regression/dogfood/main.cpp @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); @@ -245,7 +245,7 @@ int main(int argc, char *argv[]) { // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, 
MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/fence/main.cpp b/tests/regression/fence/main.cpp index 2961b517..29cc7d85 100644 --- a/tests/regression/fence/main.cpp +++ b/tests/regression/fence/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/io_addr/main.cpp b/tests/regression/io_addr/main.cpp index 7899aa2a..42dcd7c0 100644 --- a/tests/regression/io_addr/main.cpp +++ b/tests/regression/io_addr/main.cpp @@ -101,7 +101,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/mstress/main.cpp b/tests/regression/mstress/main.cpp index bbb4660f..c2354edc 100644 --- a/tests/regression/mstress/main.cpp +++ b/tests/regression/mstress/main.cpp @@ -136,7 +136,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << 
"download destination buffer" << std::endl; @@ -194,7 +194,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/no_mf_ext/main.cpp b/tests/regression/no_mf_ext/main.cpp index 01bcfb90..01ae744c 100644 --- a/tests/regression/no_mf_ext/main.cpp +++ b/tests/regression/no_mf_ext/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/no_smem/main.cpp b/tests/regression/no_smem/main.cpp index 01bcfb90..01ae744c 100644 --- a/tests/regression/no_smem/main.cpp +++ b/tests/regression/no_smem/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/prefetch/Makefile b/tests/regression/prefetch/Makefile index 0627bd36..af58821c 100644 --- a/tests/regression/prefetch/Makefile +++ b/tests/regression/prefetch/Makefile @@ -2,7 +2,7 @@ RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain VORTEX_DRV_PATH ?= $(realpath ../../../driver) VORTEX_RT_PATH ?= $(realpath ../../../runtime) -OPTS ?= -n64 +OPTS ?= -n32 VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff 
--git a/tests/regression/prefetch/kernel.c b/tests/regression/prefetch/kernel.c index c6d76e75..b852f582 100644 --- a/tests/regression/prefetch/kernel.c +++ b/tests/regression/prefetch/kernel.c @@ -1,24 +1,43 @@ #include #include #include +#include #include "common.h" -void kernel_body(int task_id, const kernel_arg_t* arg) { - uint32_t count = arg->task_size; - int32_t* src0_ptr = (int32_t*)arg->src0_ptr; - int32_t* src1_ptr = (int32_t*)arg->src1_ptr; - int32_t* dst_ptr = (int32_t*)arg->dst_ptr; - +#define BLOCK_SIZE 64 + +void kernel_body(int task_id, kernel_arg_t* arg) { + uint32_t count = arg->task_size; uint32_t offset = task_id * count; + uint32_t num_blocks = (count * 4 + BLOCK_SIZE-1) / BLOCK_SIZE; + + int32_t* src0_ptr = (int32_t*)arg->src0_ptr + offset; + int32_t* src1_ptr = (int32_t*)arg->src1_ptr + offset; + int32_t* dst_ptr = (int32_t*)arg->dst_ptr + offset; + + uint32_t src0_end = (uint32_t)(src0_ptr + count); + uint32_t src1_end = (uint32_t)(src1_ptr + count); for (uint32_t i = 0; i < count; ++i) { - vx_prefetch((uint32_t)(src0_ptr) + offset + i); - vx_prefetch((uint32_t)(src1_ptr) + offset + i); - dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i]; + dst_ptr[i] = src0_ptr[i] + src1_ptr[i]; + + uint32_t src0_mask = ((uint32_t)(src0_ptr + i)) % BLOCK_SIZE; + uint32_t src0_next = (uint32_t)(src0_ptr + i + BLOCK_SIZE/4); + if (src0_mask == 0 && src0_next < src0_end) { + //vx_printf("src0_next=%d\n", src0_next); + vx_prefetch(src0_next); + } + + uint32_t src1_mask = ((uint32_t)(src1_ptr + i)) % BLOCK_SIZE; + uint32_t src1_next = (uint32_t)(src1_ptr + i + BLOCK_SIZE/4); + if (src1_mask == 0 && src1_next < src1_end) { + //vx_printf("src1_next=%d\n", src1_next); + vx_prefetch(src1_next); + } } } void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_tasks, kernel_body, arg); + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_tasks, 
(vx_spawn_tasks_cb)kernel_body, arg); } \ No newline at end of file diff --git a/tests/regression/prefetch/main.cpp b/tests/regression/prefetch/main.cpp index 2961b517..8be0d2a4 100644 --- a/tests/regression/prefetch/main.cpp +++ b/tests/regression/prefetch/main.cpp @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/printf/main.cpp b/tests/regression/printf/main.cpp index 11b9fc50..b9d4db38 100644 --- a/tests/regression/printf/main.cpp +++ b/tests/regression/printf/main.cpp @@ -65,7 +65,7 @@ int run_test() { // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); return 0; } @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/sort/main.cpp b/tests/regression/sort/main.cpp index c5f23141..96032a91 100644 --- a/tests/regression/sort/main.cpp +++ b/tests/regression/sort/main.cpp @@ -98,7 +98,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; 
diff --git a/tests/regression/tex/Makefile b/tests/regression/tex/Makefile index 8b313d25..ff38c514 100644 --- a/tests/regression/tex/Makefile +++ b/tests/regression/tex/Makefile @@ -9,8 +9,8 @@ VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy -VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections -VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw +VX_CFLAGS += -std=c++11 -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections +VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a @@ -19,15 +19,13 @@ VX_SRCS = kernel.c #CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors -CXXFLAGS += -DLUPNG_USE_ZLIB +CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party -CXXFLAGS += -I$(VORTEX_DRV_PATH)/include - -LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex -lz +LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex $(VORTEX_RT_PATH)/../third_party/cocogfx/libcocogfx.a -lz PROJECT = tex -SRCS = main.cpp utils.cpp tga.cpp lupng.c +SRCS = main.cpp utils.cpp all: $(PROJECT) kernel.bin kernel.dump @@ -38,7 +36,7 @@ kernel.bin: kernel.elf $(VX_CP) -O binary kernel.elf kernel.bin kernel.elf: $(VX_SRCS) - $(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf + $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ diff --git a/tests/regression/tex/blitter.h 
b/tests/regression/tex/blitter.h deleted file mode 100644 index e05f64b8..00000000 --- a/tests/regression/tex/blitter.h +++ /dev/null @@ -1,268 +0,0 @@ -// -// Copyright (c) Blaise Tine. All rights reserved. -// -// -// Use of this sample source code is subject to the terms of the Microsoft -// license agreement under which you licensed this sample source code. If -// you did not accept the terms of the license agreement, you are not -// authorized to use this sample source code. For the terms of the license, -// please see the license agreement between you and Microsoft or, if applicable, -// see the LICENSE.RTF on your install media or the root of your tools -// installation. -// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR -// INDEMNITIES. -// -#pragma once - -#include "surfacedesc.h" - -class BlitTable { -public: - typedef int (*PfnCopy)(const SurfaceDesc &dstDesc, - uint32_t dstOffsetX, - uint32_t dstOffsetY, - uint32_t copyWidth, - uint32_t copyHeight, - const SurfaceDesc &srcDesc, - uint32_t srcOffsetX, - uint32_t srcOffsetY); - - BlitTable() { - for (uint32_t s = 0; s < FORMAT_COLOR_SIZE_; ++s) { - for (uint32_t d = 0; d < FORMAT_COLOR_SIZE_; ++d) { - copyFuncs_[s][d] = CopyInvalid; - } - } - - for (uint32_t s = 0; s < FORMAT_COLOR_SIZE_; ++s) { - switch (s) { - case FORMAT_A8: - case FORMAT_L8: - copyFuncs_[s][s] = CopyFast; - break; - - case FORMAT_A8L8: - copyFuncs_[FORMAT_A8L8][FORMAT_A8] = Copy; - copyFuncs_[FORMAT_A8L8][FORMAT_A8L8] = CopyFast; - break; - - case FORMAT_R5G6B5: - copyFuncs_[FORMAT_R5G6B5][FORMAT_L8] = Copy; - copyFuncs_[FORMAT_R5G6B5][FORMAT_R5G6B5] = CopyFast; - copyFuncs_[FORMAT_R5G6B5][FORMAT_R8G8B8] = - Copy; - copyFuncs_[FORMAT_R5G6B5][FORMAT_B8G8R8] = - Copy; - copyFuncs_[FORMAT_R5G6B5][FORMAT_A8B8G8R8] = - Copy; - copyFuncs_[FORMAT_R5G6B5][FORMAT_A8R8G8B8] = - Copy; - break; - - case FORMAT_A1R5G5B5: - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_L8] = - Copy; - 
copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R8G8B8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8R8G8B8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R5G5B5A1] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R4G4B4A4] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_B8G8R8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8B8G8R8] = - Copy; - break; - - case FORMAT_A4R4G4B4: - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_L8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R8G8B8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8R8G8B8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R5G5B5A1] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R4G4B4A4] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_B8G8R8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8B8G8R8] = - Copy; - break; - - case FORMAT_R8G8B8: - copyFuncs_[FORMAT_R8G8B8][FORMAT_L8] = Copy; - copyFuncs_[FORMAT_R8G8B8][FORMAT_R5G6B5] = - Copy; - copyFuncs_[FORMAT_R8G8B8][FORMAT_R8G8B8] = CopyFast; - copyFuncs_[FORMAT_R8G8B8][FORMAT_B8G8R8] = - Copy; - copyFuncs_[FORMAT_R8G8B8][FORMAT_A8B8G8R8] = - Copy; - copyFuncs_[FORMAT_R8G8B8][FORMAT_A8R8G8B8] = - Copy; - break; - - case FORMAT_A8R8G8B8: - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_L8] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R5G6B5] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R8G8B8] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8R8G8B8] = CopyFast; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R5G5B5A1] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R4G4B4A4] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_B8G8R8] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8B8G8R8] = - Copy; - break; - - case FORMAT_R5G5B5A1: - copyFuncs_[FORMAT_R5G5B5A1][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_R5G5B5A1][FORMAT_L8] = - Copy; - 
copyFuncs_[FORMAT_R5G5B5A1][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_R5G5B5A1][FORMAT_RGB] = - Copy; - copyFuncs_[FORMAT_R5G5B5A1][FORMAT_ARGB] = - Copy; - break; - - case FORMAT_R4G4B4A4: - copyFuncs_[FORMAT_R4G4B4A4][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_R4G4B4A4][FORMAT_L8] = - Copy; - copyFuncs_[FORMAT_R4G4B4A4][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_R4G4B4A4][FORMAT_RGB] = - Copy; - copyFuncs_[FORMAT_R4G4B4A4][FORMAT_ARGB] = - Copy; - break; - - case FORMAT_B8G8R8: - copyFuncs_[FORMAT_B8G8R8][FORMAT_L8] = Copy; - copyFuncs_[FORMAT_B8G8R8][FORMAT_RGB] = Copy; - break; - - case FORMAT_A8B8G8R8: - copyFuncs_[FORMAT_A8B8G8R8][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_A8B8G8R8][FORMAT_L8] = - Copy; - copyFuncs_[FORMAT_A8B8G8R8][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_A8B8G8R8][FORMAT_RGB] = - Copy; - copyFuncs_[FORMAT_A8B8G8R8][FORMAT_ARGB] = - Copy; - break; - } - } - } - - PfnCopy get(uint32_t srcFormat, uint32_t dstFormat) const { - assert(srcFormat < FORMAT_COLOR_SIZE_); - assert(dstFormat < FORMAT_COLOR_SIZE_); - return copyFuncs_[srcFormat][dstFormat]; - } - -private: - template - static int Copy(const SurfaceDesc &dstDesc, - uint32_t dstOffsetX, - uint32_t dstOffsetY, - uint32_t copyWidth, - uint32_t copyHeight, - const SurfaceDesc &srcDesc, - uint32_t srcOffsetX, - uint32_t srcOffsetY) { - auto srcBPP = TFormatInfo::CBSIZE; - auto dstBPP = TFormatInfo::CBSIZE; - auto srcNextLine = srcDesc.Pitch; - auto dstNextLine = dstDesc.Pitch; - - auto pbSrc = srcDesc.pBits + srcOffsetX * srcBPP + srcOffsetY * srcDesc.Pitch; - auto pbDst = dstDesc.pBits + dstOffsetX * dstBPP + dstOffsetY * dstDesc.Pitch; - - while (copyHeight--) { - auto pSrc = reinterpret_cast::TYPE *>(pbSrc); - for (auto *pDst = reinterpret_cast::TYPE *>( - pbDst), - *const pEnd = pDst + copyWidth; - pDst != pEnd; ++pDst, ++pSrc) { - auto tmp = Format::ConvertFrom(pSrc); - Format::ConvertTo(pDst, tmp); - } - - pbSrc += srcNextLine; - pbDst += dstNextLine; - } - return 0; - } - - template - 
static int CopyFast(const SurfaceDesc &dstDesc, - uint32_t dstOffsetX, - uint32_t dstOffsetY, - uint32_t copyWidth, - uint32_t copyHeight, - const SurfaceDesc &srcDesc, - uint32_t srcOffsetX, - uint32_t srcOffsetY) { - auto nBPP = sizeof(Type); - auto srcNextLine = srcDesc.Pitch; - auto dstNextLine = dstDesc.Pitch; - - auto pbSrc = srcDesc.pBits + srcOffsetX * nBPP + srcOffsetY * srcDesc.Pitch; - auto pbDst = dstDesc.pBits + dstOffsetX * nBPP + dstOffsetY * dstDesc.Pitch; - - while (copyHeight--) { - auto pSrc = reinterpret_cast(pbSrc); - for (auto *pDst = reinterpret_cast(pbDst), *const pEnd = pDst + copyWidth; - pDst != pEnd; ++pDst, ++pSrc) { - *pDst = *pSrc; - } - pbSrc += srcNextLine; - pbDst += dstNextLine; - } - return 0; - } - - static int CopyInvalid(const SurfaceDesc & /*dstDesc*/, - uint32_t /*dstOffsetX*/, - uint32_t /*dstOffsetY*/, - uint32_t /*copyWidth*/, - uint32_t /*copyHeight*/, - const SurfaceDesc & /*srcDesc*/, - uint32_t /*srcOffsetX*/, - uint32_t /*srcOffsetY*/) - { - std::cout << "Error: invalid format" << std::endl; - return -1; - } - - PfnCopy copyFuncs_[FORMAT_COLOR_SIZE_][FORMAT_COLOR_SIZE_]; -}; \ No newline at end of file diff --git a/tests/regression/tex/color.h b/tests/regression/tex/color.h deleted file mode 100644 index 708565a3..00000000 --- a/tests/regression/tex/color.h +++ /dev/null @@ -1,68 +0,0 @@ -// -// Copyright (c) Blaise Tine. All rights reserved. -// -// -// Use of this sample source code is subject to the terms of the Microsoft -// license agreement under which you licensed this sample source code. If -// you did not accept the terms of the license agreement, you are not -// authorized to use this sample source code. For the terms of the license, -// please see the license agreement between you and Microsoft or, if applicable, -// see the LICENSE.RTF on your install media or the root of your tools -// installation. -// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR -// INDEMNITIES. 
-// -#pragma once - -#include -#include - -struct ColorARGB { - union { - struct { - uint32_t value; - }; - struct { - uint8_t b, g, r, a; - }; - struct { - uint8_t m[4]; - }; - }; - - ColorARGB() {} - - ColorARGB(int a, int r, int g, int b) { - assert((a >= 0) && (a <= 0xff)); - assert((r >= 0) && (r <= 0xff)); - assert((g >= 0) && (g <= 0xff)); - assert((b >= 0) && (b <= 0xff)); - - this->b = static_cast(b); - this->g = static_cast(g); - this->r = static_cast(r); - this->a = static_cast(a); - } - - ColorARGB(int r, int g, int b) { - assert((r >= 0) && (r <= 0xff)); - assert((g >= 0) && (g <= 0xff)); - assert((b >= 0) && (b <= 0xff)); - - this->b = static_cast(b); - this->g = static_cast(g); - this->r = static_cast(r); - } - - ColorARGB(int value) { - this->value = value; - } - - void operator=(const ColorARGB &rhs) { - this->value = rhs.value; - } - - operator uint32_t() const { - return this->value; - } -}; \ No newline at end of file diff --git a/tests/regression/tex/common.h b/tests/regression/tex/common.h index 2abb7234..00d7148f 100644 --- a/tests/regression/tex/common.h +++ b/tests/regression/tex/common.h @@ -1,25 +1,26 @@ #ifndef _COMMON_H_ #define _COMMON_H_ +#include + #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 typedef struct { - uint32_t num_tasks; - uint8_t format; - uint8_t filter; - uint8_t wrap; - uint8_t use_sw; - uint32_t lod; - uint8_t src_logWidth; - uint8_t src_logHeight; - uint8_t src_stride; - uint8_t src_pitch; - uint32_t src_ptr; - uint32_t dst_width; - uint32_t dst_height; - uint8_t dst_stride; - uint32_t dst_pitch; - uint32_t dst_ptr; + bool use_sw; + uint32_t num_tasks; + uint8_t format; + uint8_t filter; + uint8_t wrapu; + uint8_t wrapv; + uint8_t src_logwidth; + uint8_t src_logheight; + uint32_t src_addr; + uint32_t mip_offs[TEX_LOD_MAX+1]; + uint32_t dst_width; + uint32_t dst_height; + uint8_t dst_stride; + uint32_t dst_pitch; + uint32_t dst_addr; } kernel_arg_t; #endif \ No newline at end of file diff --git 
a/tests/regression/tex/format.h b/tests/regression/tex/format.h deleted file mode 100644 index 4ee8268e..00000000 --- a/tests/regression/tex/format.h +++ /dev/null @@ -1,1022 +0,0 @@ -// -// Copyright (c) Blaise Tine. All rights reserved. -// -// -// Use of this sample source code is subject to the terms of the Microsoft -// license agreement under which you licensed this sample source code. If -// you did not accept the terms of the license agreement, you are not -// authorized to use this sample source code. For the terms of the license, -// please see the license agreement between you and Microsoft or, if applicable, -// see the LICENSE.RTF on your install media or the root of your tools -// installation. -// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR -// INDEMNITIES. -// -#pragma once - -#include "int24.h" -#include "color.h" -#include - -enum ePixelFormat { - FORMAT_UNKNOWN, - FORMAT_A8, - FORMAT_L8, - FORMAT_A8L8, - FORMAT_R5G6B5, - FORMAT_A8R8G8B8, - FORMAT_A1R5G5B5, - FORMAT_R8G8B8, - FORMAT_A4R4G4B4, - FORMAT_A8B8G8R8, - FORMAT_R5G5B5A1, - FORMAT_B8G8R8, - FORMAT_R4G4B4A4, - FORMAT_COLOR_SIZE_, - FORMAT_D16 = FORMAT_COLOR_SIZE_, - FORMAT_X8S8D16, - FORMAT_PAL4_B8G8R8, - FORMAT_PAL4_A8B8G8R8, - FORMAT_PAL4_R5G6B5, - FORMAT_PAL4_R4G4B4A4, - FORMAT_PAL4_R5G5B5A1, - FORMAT_PAL8_B8G8R8, - FORMAT_PAL8_A8B8G8R8, - FORMAT_PAL8_R5G6B5, - FORMAT_PAL8_R4G4B4A4, - FORMAT_PAL8_R5G5B5A1, - FORMAT_SIZE_, -}; - -#define FORMAT_A FORMAT_A8 -#define FORMAT_RGB FORMAT_R5G6B5 -#define FORMAT_RGB_ FORMAT_R8G8B8 -#define FORMAT_ARGB FORMAT_A8R8G8B8 -#define FORMAT_ARGB_ FORMAT_A4R4G4B4 - -template -struct TFormatInfo {}; - -template <> -struct TFormatInfo { - typedef uint8_t TYPE; - - enum { - CBSIZE = 0, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 4, - RED = 4, - GREEN = 4, - BLUE = 4, - LERP = 4, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - 
ALPHA = 4, - RED = 4, - GREEN = 4, - BLUE = 4, - LERP = 4, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 1, - RED = 5, - GREEN = 5, - BLUE = 5, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 1, - RED = 5, - GREEN = 5, - BLUE = 5, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - RED = 5, - GREEN = 6, - BLUE = 5, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint24_t TYPE; - - enum { - CBSIZE = 3, - RED = 8, - GREEN = 8, - BLUE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint24_t TYPE; - - enum { - CBSIZE = 3, - RED = 8, - GREEN = 8, - BLUE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint32_t TYPE; - - enum { - CBSIZE = 4, - ALPHA = 8, - RED = 8, - GREEN = 8, - BLUE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint32_t TYPE; - - enum { - CBSIZE = 4, - ALPHA = 8, - RED = 8, - GREEN = 8, - BLUE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint8_t TYPE; - - enum { - CBSIZE = 1, - ALPHA = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint8_t TYPE; - - enum { - CBSIZE = 1, - LUMINANCE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 8, - LUMINANCE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - DEPTH = 16, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 4, - DEPTH = 16, - STENCIL = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 3, - RED = 8, - GREEN = 8, - BLUE = 8, - PALETTE = 4, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 4, - ALPHA 
= 8, - RED = 8, - GREEN = 8, - BLUE = 8, - PALETTE = 4, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - RED = 5, - GREEN = 6, - BLUE = 5, - PALETTE = 4, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 4, - RED = 4, - GREEN = 4, - BLUE = 4, - PALETTE = 4, - LERP = 4, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 1, - RED = 5, - GREEN = 5, - BLUE = 5, - PALETTE = 4, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 3, - RED = 8, - GREEN = 8, - BLUE = 8, - PALETTE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 4, - ALPHA = 8, - RED = 8, - GREEN = 8, - BLUE = 8, - PALETTE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - RED = 5, - GREEN = 6, - BLUE = 5, - PALETTE = 8, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 4, - RED = 4, - GREEN = 4, - BLUE = 4, - PALETTE = 8, - LERP = 4, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 1, - RED = 5, - GREEN = 5, - BLUE = 5, - PALETTE = 8, - LERP = 5, - }; -}; - -/////////////////////////////////////////////////////////////////////////////// - -#define DEF_GET_ENUM_VALUE(Name, Default) \ - template \ - struct enum_get_##Name { \ - static constexpr int value = Default; \ - }; \ - template \ - struct enum_get_##Name::type> { \ - static constexpr int value = T::Name; \ - } - -#define __formatInfo(format) \ - { \ - TFormatInfo::CBSIZE, FormatSize>::RED, \ - FormatSize>::GREEN, \ - FormatSize>::BLUE, \ - FormatSize>::ALPHA, \ - FormatSize>::LUMINANCE, \ - FormatSize>::DEPTH, \ - FormatSize>::STENCIL, \ - FormatSize>::PALETTE, \ - 
FormatSize>::LERP \ - } - -/////////////////////////////////////////////////////////////////////////////// - -struct FormatInfo { - uint8_t BytePerPixel; - uint8_t Red; - uint8_t Green; - uint8_t Blue; - uint8_t Alpha; - uint8_t Luminance; - uint8_t Depth; - uint8_t Stencil; - uint8_t PaletteBits; - uint8_t LerpBits; -}; - -template -class FormatSize { -protected: - DEF_GET_ENUM_VALUE(RED, 0); - DEF_GET_ENUM_VALUE(GREEN, 0); - DEF_GET_ENUM_VALUE(BLUE, 0); - DEF_GET_ENUM_VALUE(ALPHA, 0); - DEF_GET_ENUM_VALUE(LUMINANCE, 0); - DEF_GET_ENUM_VALUE(DEPTH, 0); - DEF_GET_ENUM_VALUE(STENCIL, 0); - DEF_GET_ENUM_VALUE(PALETTE, 0); - DEF_GET_ENUM_VALUE(LERP, 0); - -public: - enum { - RED = enum_get_RED::value, - GREEN = enum_get_GREEN::value, - BLUE = enum_get_BLUE::value, - ALPHA = enum_get_ALPHA::value, - LUMINANCE = enum_get_LUMINANCE::value, - DEPTH = enum_get_DEPTH::value, - STENCIL = enum_get_STENCIL::value, - PALETTE = enum_get_PALETTE::value, - LERP = enum_get_LERP::value, - - RGB = RED + GREEN + BLUE + LUMINANCE, - RGBA = RGB + ALPHA - }; -}; - -namespace Format { - -inline static const FormatInfo &GetInfo(ePixelFormat pixelFormat) { - static const FormatInfo sc_formatInfos[FORMAT_SIZE_] = { - __formatInfo(FORMAT_UNKNOWN), - __formatInfo(FORMAT_A8), - __formatInfo(FORMAT_L8), - __formatInfo(FORMAT_A8L8), - __formatInfo(FORMAT_RGB), - __formatInfo(FORMAT_ARGB), - __formatInfo(FORMAT_A1R5G5B5), - __formatInfo(FORMAT_RGB_), - __formatInfo(FORMAT_ARGB_), - __formatInfo(FORMAT_R4G4B4A4), - __formatInfo(FORMAT_R5G5B5A1), - __formatInfo(FORMAT_B8G8R8), - __formatInfo(FORMAT_A8B8G8R8), - __formatInfo(FORMAT_D16), - __formatInfo(FORMAT_X8S8D16), - __formatInfo(FORMAT_PAL4_B8G8R8), - __formatInfo(FORMAT_PAL4_A8B8G8R8), - __formatInfo(FORMAT_PAL4_R5G6B5), - __formatInfo(FORMAT_PAL4_R4G4B4A4), - __formatInfo(FORMAT_PAL4_R5G5B5A1), - __formatInfo(FORMAT_PAL8_B8G8R8), - __formatInfo(FORMAT_PAL8_A8B8G8R8), - __formatInfo(FORMAT_PAL8_R5G6B5), - __formatInfo(FORMAT_PAL8_R4G4B4A4), - 
__formatInfo(FORMAT_PAL8_R5G5B5A1), - }; - assert(pixelFormat < FORMAT_SIZE_); - return sc_formatInfos[pixelFormat]; -} - -#undef __formatInfo -#undef DEF_GET_ENUM_VALUE - -typedef ColorARGB (*pfn_convert_from)(const void *pIn); - -typedef void (*pfn_convert_to)(void *pOut, const ColorARGB &in); - -template -static uint32_t ConvertTo(const ColorARGB &color); - -template -static void ConvertTo(void *pOut, const ColorARGB &in) { - *reinterpret_cast::TYPE *>(pOut) = - static_cast::TYPE>( - ConvertTo(in)); -} - -template -static ColorARGB ConvertFrom(uint32_t in); - -template -static ColorARGB ConvertFrom(const void *pIn) { - return ConvertFrom( - *reinterpret_cast::TYPE *>(pIn)); -} - -inline static pfn_convert_to GetConvertTo(ePixelFormat pixelFormat) { - switch (pixelFormat) { - case FORMAT_A8: - return &ConvertTo; - case FORMAT_L8: - return &ConvertTo; - case FORMAT_A8L8: - return &ConvertTo; - case FORMAT_R5G6B5: - return &ConvertTo; - case FORMAT_A1R5G5B5: - return &ConvertTo; - case FORMAT_A4R4G4B4: - return &ConvertTo; - case FORMAT_R8G8B8: - return &ConvertTo; - case FORMAT_A8R8G8B8: - return &ConvertTo; - case FORMAT_R5G5B5A1: - return &ConvertTo; - case FORMAT_R4G4B4A4: - return &ConvertTo; - case FORMAT_B8G8R8: - return &ConvertTo; - case FORMAT_A8B8G8R8: - return &ConvertTo; - case FORMAT_D16: - return &ConvertTo; - case FORMAT_X8S8D16: - return &ConvertTo; - default: - return &ConvertTo; - } - return nullptr; -} - -inline static pfn_convert_from GetConvertFrom(ePixelFormat pixelFormat, - bool bForceAlpha) { - if (bForceAlpha) { - switch (pixelFormat) { - case FORMAT_A8: - return &ConvertFrom; - case FORMAT_L8: - return &ConvertFrom; - case FORMAT_A8L8: - return &ConvertFrom; - case FORMAT_R5G6B5: - return &ConvertFrom; - case FORMAT_A1R5G5B5: - return &ConvertFrom; - case FORMAT_A4R4G4B4: - return &ConvertFrom; - case FORMAT_R8G8B8: - return &ConvertFrom; - case FORMAT_A8R8G8B8: - return &ConvertFrom; - case FORMAT_R5G5B5A1: - return &ConvertFrom; - case 
FORMAT_R4G4B4A4: - return &ConvertFrom; - case FORMAT_B8G8R8: - return &ConvertFrom; - case FORMAT_A8B8G8R8: - return &ConvertFrom; - case FORMAT_D16: - return &ConvertFrom; - case FORMAT_X8S8D16: - return &ConvertFrom; - default: - return &ConvertFrom; - } - } else { - switch (pixelFormat) { - case FORMAT_A8: - return &ConvertFrom; - case FORMAT_L8: - return &ConvertFrom; - case FORMAT_A8L8: - return &ConvertFrom; - case FORMAT_R5G6B5: - return &ConvertFrom; - case FORMAT_A1R5G5B5: - return &ConvertFrom; - case FORMAT_A4R4G4B4: - return &ConvertFrom; - case FORMAT_R8G8B8: - return &ConvertFrom; - case FORMAT_A8R8G8B8: - return &ConvertFrom; - case FORMAT_R5G5B5A1: - return &ConvertFrom; - case FORMAT_R4G4B4A4: - return &ConvertFrom; - case FORMAT_B8G8R8: - return &ConvertFrom; - case FORMAT_A8B8G8R8: - return &ConvertFrom; - case FORMAT_D16: - return &ConvertFrom; - case FORMAT_X8S8D16: - return &ConvertFrom; - default: - return &ConvertFrom; - } - } - - return nullptr; -} - -inline static uint32_t GetNativeFormat(ePixelFormat pixelFormat) { - switch (pixelFormat) { - case FORMAT_PAL4_B8G8R8: - case FORMAT_PAL8_B8G8R8: - return FORMAT_B8G8R8; - - case FORMAT_PAL4_A8B8G8R8: - case FORMAT_PAL8_A8B8G8R8: - return FORMAT_A8B8G8R8; - - case FORMAT_PAL4_R5G6B5: - case FORMAT_PAL8_R5G6B5: - return FORMAT_R5G6B5; - - case FORMAT_PAL4_R4G4B4A4: - case FORMAT_PAL8_R4G4B4A4: - return FORMAT_R4G4B4A4; - - case FORMAT_PAL4_R5G5B5A1: - case FORMAT_PAL8_R5G5B5A1: - return FORMAT_R5G5B5A1; - - default: - return pixelFormat; - } -} - -/////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &/*in*/) { - return 0; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t /*in*/) { - return 0; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t /*in*/) { - return 0; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const 
ColorARGB &in) { - return ((in.r & 0xf8) << 8) | ((in.g & 0xfc) << 3) | (in.b >> 3); -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.r = ((in >> 11) << 3) | (in >> 13); - ret.g = ((in >> 3) & 0xfc) | ((in >> 9) & 0x3); - ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff; - ret.r = ((in >> 11) << 3) | (in >> 13); - ret.g = ((in >> 3) & 0xfc) | ((in >> 9) & 0x3); - ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return (in.a ? 0x8000 : 0) | ((in.r & 0xf8) << 7) | ((in.g & 0xf8) << 2) | - (in.b >> 3); -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff * (in >> 15); - ret.r = ((in >> 7) & 0xf8) | ((in << 1) >> 13); - ret.g = ((in >> 2) & 0xf8) | ((in >> 7) & 7); - ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff * (in >> 15); - ret.r = ((in >> 7) & 0xf8) | ((in << 1) >> 13); - ret.g = ((in >> 2) & 0xf8) | ((in >> 7) & 7); - ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return ((in.r & 0xf8) << 8) | ((in.g & 0xf8) << 3) | ((in.b & 0xf8) >> 2) | - (in.a ? 
0x1 : 0); -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff * (in & 0x1); - ret.r = ((in >> 8) & 0xf8) | (in >> 13); - ret.g = ((in >> 3) & 0xf8) | ((in >> 8) & 7); - ret.b = ((in & 0x3e) << 2) | ((in & 0x3e) >> 3); - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff * (in & 0x1); - ret.r = ((in >> 8) & 0xf8) | (in >> 13); - ret.g = ((in >> 3) & 0xf8) | ((in >> 8) & 7); - ret.b = ((in & 0x3e) << 2) | ((in & 0x3e) >> 3); - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return ((in.a & 0xf0) << 8) | ((in.r & 0xf0) << 4) | ((in.g & 0xf0) << 0) | - (in.b >> 4); -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = ((in >> 8) & 0xf0) | (in >> 12); - ret.r = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); - ret.g = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); - ret.b = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = ((in >> 8) & 0xf0) | (in >> 12); - ret.r = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); - ret.g = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); - ret.b = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return ((in.r & 0xf0) << 8) | ((in.g & 0xf0) << 4) | ((in.b & 0xf0) << 0) | - (in.a >> 4); -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); - ret.r = ((in >> 8) & 0xf0) | (in >> 12); - ret.g = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); - ret.b = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = ((in & 0x0f) 
<< 4) | ((in & 0x0f) >> 0); - ret.r = ((in >> 8) & 0xf0) | (in >> 12); - ret.g = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); - ret.b = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return (in.r << 16) | (in.g << 8) | in.b; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.r = in >> 16; - ret.g = (in >> 8) & 0xff; - ret.b = in & 0xff; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff; - ret.r = in >> 16; - ret.g = (in >> 8) & 0xff; - ret.b = in & 0xff; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return (in.b << 16) | (in.g << 8) | in.r; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.r = in & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = in >> 16; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff; - ret.r = in & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = in >> 16; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return (in.a << 24) | (in.r << 16) | (in.g << 8) | in.b; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 24; - ret.r = (in >> 16) & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = in & 0xff; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 24; - ret.r = (in >> 16) & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = in & 0xff; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) 
{ - return (in.a << 24) | (in.b << 16) | (in.g << 8) | in.r; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 24; - ret.r = in & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = (in >> 16) & 0xff; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 24; - ret.r = in & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = (in >> 16) & 0xff; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return in.a; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return in.r; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.r = in; - ret.g = in; - ret.b = in; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff; - ret.r = in; - ret.g = in; - ret.b = in; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return (in.a << 8) | in.r; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 8; - ret.r = in & 0xff; - ret.g = in & 0xff; - ret.b = in & 0xff; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 8; - ret.r = in & 0xff; - ret.g = in & 0xff; - ret.b = in & 0xff; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return in.value; -} - -template <> 
-inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.value = in; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return in.b; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.value = in; - return ret; -} - -} // namespace Format \ No newline at end of file diff --git a/tests/regression/tex/int24.h b/tests/regression/tex/int24.h deleted file mode 100644 index b08537a7..00000000 --- a/tests/regression/tex/int24.h +++ /dev/null @@ -1,37 +0,0 @@ -// -// Copyright (c) Blaise Tine. All rights reserved. -// -// -// Use of this sample source code is subject to the terms of the Microsoft -// license agreement under which you licensed this sample source code. If -// you did not accept the terms of the license agreement, you are not -// authorized to use this sample source code. For the terms of the license, -// please see the license agreement between you and Microsoft or, if applicable, -// see the LICENSE.RTF on your install media or the root of your tools -// installation. -// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR -// INDEMNITIES. 
-// -#pragma once - -#include - -struct uint24_t { - uint8_t m[3]; - - explicit uint24_t(uint32_t value) { - m[0] = (value >> 0) & 0xff; - m[1] = (value >> 8) & 0xff; - m[2] = (value >> 16) & 0xff; - } - - explicit uint24_t(uint8_t x, uint8_t y, uint8_t z) { - m[0] = x; - m[1] = y; - m[2] = z; - } - - operator uint32_t() const { - return (m[2] << 16) | (m[1] << 8) | m[0]; - } -}; diff --git a/tests/regression/tex/kernel.c b/tests/regression/tex/kernel.c index bd0cebb4..9a36d8cb 100644 --- a/tests/regression/tex/kernel.c +++ b/tests/regression/tex/kernel.c @@ -1,42 +1,56 @@ #include #include #include -#include "common.h" +#include #include "texsw.h" -#define ENABLE_SW - typedef struct { kernel_arg_t* state; uint32_t tile_width; uint32_t tile_height; float deltaX; float deltaY; + float minification; } tile_arg_t; +template +struct static_for_t { + template + inline void operator()(const Fn& callback) const { + callback(Start); + static_for_t()(callback); + } +}; + +template +struct static_for_t { + template + inline void operator()(const Fn& callback) const {} +}; + void kernel_body(int task_id, tile_arg_t* arg) { kernel_arg_t* state = arg->state; uint32_t xoffset = 0; - uint32_t yoffset = task_id * arg->tile_height; - uint8_t* dst_ptr = (uint8_t*)(state->dst_ptr + xoffset * state->dst_stride + yoffset * state->dst_pitch); + uint32_t yoffset = task_id * arg->tile_height; - float fv = yoffset * arg->deltaY; + uint8_t* dst_ptr = (uint8_t*)(state->dst_addr + xoffset * state->dst_stride + yoffset * state->dst_pitch); + + Fixed<16> xj(arg->minification); + + /*vx_printf("task_id=%d, tile_width=%d, tile_height=%d, deltaX=%f, deltaY=%f, minification=%f\n", + task_id, arg->tile_width, arg->tile_height, arg->deltaX, arg->deltaY, arg->minification);*/ + + float fv = (yoffset + 0.5f) * arg->deltaY; for (uint32_t y = 0; y < arg->tile_height; ++y) { uint32_t* dst_row = (uint32_t*)dst_ptr; - float fu = xoffset * arg->deltaX; + float fu = (xoffset + 0.5f) * arg->deltaX; for 
(uint32_t x = 0; x < arg->tile_width; ++x) { - int32_t u = (int32_t)(fu * (1<<20)); - int32_t v = (int32_t)(fv * (1<<20)); - #ifdef ENABLE_SW - if (state->use_sw) { - dst_row[x] = (state->filter == 2) ? tex3_sw(state, 0, u, v, state->lod) : tex_sw(state, 0, u, v, state->lod); - } else { - #endif - dst_row[x] = (state->filter == 2) ? vx_tex3(0, u, v, state->lod) : vx_tex(0, u, v, state->lod); - #ifdef ENABLE_SW - } - #endif + Fixed xu(fu); + Fixed xv(fv); + uint32_t color = tex_load(state, xu, xv, xj); + //vx_printf("task_id=%d, x=%d, y=%d, fu=%f, fv=%f, xu=0x%x, xv=0x%x, color=0x%x\n", task_id, x, y, fu, fv, xu.data(), xv.data(), color); + dst_row[x] = color; fu += arg->deltaX; } dst_ptr += state->dst_pitch; @@ -48,13 +62,17 @@ int main() { kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; // configure texture unit - vx_csr_write(CSR_TEX_ADDR(0), arg->src_ptr); - vx_csr_write(CSR_TEX_MIPOFF(0), 0); - vx_csr_write(CSR_TEX_WIDTH(0), arg->src_logWidth); - vx_csr_write(CSR_TEX_HEIGHT(0), arg->src_logHeight); - vx_csr_write(CSR_TEX_FORMAT(0), arg->format); - vx_csr_write(CSR_TEX_WRAP(0), (arg->wrap << 2) | arg->wrap); - vx_csr_write(CSR_TEX_FILTER(0), (arg->filter ? 1 : 0)); + csr_write(CSR_TEX_UNIT, 0); + csr_write(CSR_TEX_WIDTH, arg->src_logwidth); + csr_write(CSR_TEX_HEIGHT, arg->src_logheight); + csr_write(CSR_TEX_FORMAT, arg->format); + csr_write(CSR_TEX_WRAPU, arg->wrapu); + csr_write(CSR_TEX_WRAPV, arg->wrapv); + csr_write(CSR_TEX_FILTER, (arg->filter ? 
1 : 0)); + csr_write(CSR_TEX_ADDR, arg->src_addr); + static_for_t()([&](int i) { + csr_write(CSR_TEX_MIPOFF(i), arg->mip_offs[i]); + }); tile_arg_t targ; targ.state = arg; @@ -62,6 +80,19 @@ int main() { targ.tile_height = (arg->dst_height + arg->num_tasks - 1) / arg->num_tasks; targ.deltaX = 1.0f / arg->dst_width; targ.deltaY = 1.0f / arg->dst_height; + + { + uint32_t src_width = (1 << arg->src_logwidth); + uint32_t src_height = (1 << arg->src_logheight); + float width_ratio = float(src_width) / arg->dst_width; + float height_ratio = float(src_height) / arg->dst_height; + targ.minification = std::max(width_ratio, height_ratio); + } vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, &targ); + /*for (uint32_t t=0; t < arg->num_tasks; ++t) { + kernel_body(t, &targ); + }*/ + + return 0; } \ No newline at end of file diff --git a/tests/regression/tex/lupng.c b/tests/regression/tex/lupng.c deleted file mode 100644 index f612fbc9..00000000 --- a/tests/regression/tex/lupng.c +++ /dev/null @@ -1,1313 +0,0 @@ -/* - * The MIT License (MIT) - * - * Copyright (c) 2014 Jan Solanti - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include - -#ifndef LUPNG_USE_ZLIB -#include -#else -#include -#endif - -#include "lupng.h" - -#define PNG_NONE 0 -#define PNG_IHDR 0x01 -#define PNG_PLTE 0x02 -#define PNG_IDAT 0x04 -#define PNG_IEND 0x08 - -#define PNG_GRAYSCALE 0 -#define PNG_TRUECOLOR 2 -/* 24bpp RGB palette */ -#define PNG_PALETTED 3 -#define PNG_GRAYSCALE_ALPHA 4 -#define PNG_TRUECOLOR_ALPHA 6 - -#define PNG_FILTER_NONE 0 -#define PNG_FILTER_SUB 1 -#define PNG_FILTER_UP 2 -#define PNG_FILTER_AVERAGE 3 -#define PNG_FILTER_PAETH 4 - -#define PNG_SIG_SIZE 8 - -#define PNG_DONE 1 -#define PNG_OK 0 -#define PNG_ERROR -1 - -#define BUF_SIZE 8192 -#define MAX(x, y) (x > y ? x : y) - -#if defined(_MSC_VER) -#define LU_INLINE __inline /* MS-specific inline */ -#else -#define LU_INLINE inline /* rest of the world... */ -#endif - -#define SIZE_T_MAX_POSITIVE ( ((size_t)-1) >> 1 ) - -/******************************************************** - * CRC computation as per PNG spec - ********************************************************/ - -/* Precomputed table of CRCs of all 8-bit messages - using the polynomial from the PNG spec, 0xEDB88320L. 
*/ -static const uint32_t crcTable[] = -{ - 0x0, 0x77073096, 0xEE0E612C, 0x990951BA, 0x76DC419, 0x706AF48F, - 0xE963A535, 0x9E6495A3, 0xEDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, - 0x9B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2, - 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, - 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, - 0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, - 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C, - 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, - 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, - 0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, - 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x1DB7106, - 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x6B6B51F, 0x9FBFE4A5, 0xE8B8D433, - 0x7807C9A2, 0xF00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x86D3D2D, - 0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, - 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950, - 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, - 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, - 0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, - 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA, - 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, - 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, - 0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x3B6E20C, 0x74B1D29A, - 0xEAD54739, 0x9DD277AF, 0x4DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84, - 0xD6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0xA00AE27, 0x7D079EB1, - 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, - 0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, - 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 
0xD6D6A3E8, 0xA1D1937E, - 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, - 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, - 0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, - 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28, - 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, - 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x26D930A, 0x9C0906A9, 0xEB0E363F, - 0x72076785, 0x5005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0xCB61B38, - 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0xBDBDF21, 0x86D3D2D4, 0xF1D4E242, - 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, - 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, - 0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, - 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC, - 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, - 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, - 0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, - 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D -}; - -/* Update a running CRC with the bytes buf[0..len-1]--the CRC - should be initialized to all 1's, and the transmitted value - is the 1's complement of the final running CRC (see the - crc() routine below)). */ -static uint32_t updateCrc(uint32_t crc, unsigned char *buf, - size_t len) -{ - uint32_t c = crc; - size_t n; - - for (n = 0; n < len; n++) - c = crcTable[(c ^ buf[n]) & 0xFF] ^ (c >> 8); - - return c; -} - -/* Return the CRC of the bytes buf[0..len-1]. 
*/ -static uint32_t crc(unsigned char *buf, size_t len) -{ - return updateCrc(0xFFFFFFFFL, buf, len) ^ 0xFFFFFFFFL; -} - - - -/******************************************************** - * Helper structs - ********************************************************/ - -typedef struct -{ - uint32_t length; - uint8_t *type; - uint8_t *data; - uint32_t crc; -} PngChunk; - -typedef struct { - const LuUserContext *userCtx; - int8_t chunksFound; - - /* IHDR info */ - int32_t width; - int32_t height; - uint8_t depth; - uint8_t colorType; - uint8_t channels; - uint8_t compression; - uint8_t filter; - uint8_t interlace; - - /* PLTE info */ - uint32_t paletteItems; - uint8_t *palette; - - /* fields used for (de)compression & (de-)filtering */ - z_stream stream; - size_t scanlineBytes; - int32_t currentCol; - int32_t currentRow; - uint32_t currentElem; - size_t currentByte; - int bytesPerPixel; - uint8_t *currentScanline; - uint8_t *previousScanline; - uint8_t currentFilter; - uint8_t interlacePass; - size_t compressedBytes; - - /* used for constructing 16 bit deep pixels */ - int tmpCount; - uint8_t tmpBytes[2]; - - /* the output image */ - LuImage *img; - const LuImage *cimg; /* constant pointer version */ -} PngInfoStruct; - -/* helper macro to output warning via user context of the info struct */ -#define LUPNG_WARN_UC(uc,...) do { if ((uc)->warnProc) { (uc)->warnProc((uc)->warnProcUserPtr, __VA_ARGS__); }} while(0) -#define LUPNG_WARN(info,...) 
LUPNG_WARN_UC((info)->userCtx, __VA_ARGS__) - -/* PNG header: */ -static const uint8_t PNG_SIG[] = -/* P N G \r \n SUB \n */ -{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}; - -static const int startingRow[] = { 0, 0, 0, 4, 0, 2, 0, 1 }; -static const int startingCol[] = { 0, 0, 4, 0, 2, 0, 1, 0 }; -static const int rowIncrement[] = { 1, 8, 8, 8, 4, 4, 2, 2 }; -static const int colIncrement[] = { 1, 8, 8, 4, 4, 2, 2, 1 }; - - - -/******************************************************** - * Helper functions - ********************************************************/ - -static LU_INLINE void releaseChunk(PngChunk *chunk, const LuUserContext *userCtx) -{ - /* Only release chunk->type since chunk->data points to the same memory. */ - userCtx->freeProc(chunk->type, userCtx->freeProcUserPtr); - userCtx->freeProc(chunk, userCtx->freeProcUserPtr); -} - -static LU_INLINE uint32_t swap32(uint32_t n) -{ - union { - unsigned char np[4]; - uint32_t i; - } u; - u.i = n; - - return ((uint32_t)u.np[0] << 24) | - ((uint32_t)u.np[1] << 16) | - ((uint32_t)u.np[2] << 8) | - (uint32_t)u.np[3]; -} - -static LU_INLINE uint16_t swap16(uint16_t n) -{ - union { - unsigned char np[2]; - uint16_t i; - } u; - u.i = n; - - return ((uint16_t)u.np[0] << 8) | (uint16_t)u.np[1]; -} - -static int bytesEqual(const uint8_t *a, const uint8_t *b, size_t count) -{ - size_t i; - for (i = 0; i < count; ++i) - { - if (*(a+i) != *(b+i)) - return 0; - } - - return 1; -} - -static void* internalMalloc(size_t size, void *userPtr) -{ - (void)userPtr; /* not used */ - return malloc(size); -} - -static void internalFree(void *ptr, void *userPtr) -{ - (void)userPtr; /* not used */ - free(ptr); -} - -static void internalPrintf(void *userPtr, const char *fmt, ...) 
-{ - FILE *outStream = (FILE*)userPtr; - va_list args; - - va_start(args, fmt); - vfprintf(outStream, fmt, args); - va_end(args); - fputc('\n', outStream); -} - -static size_t internalFread(void *ptr, size_t size, size_t count, void *userPtr) -{ - return fread(ptr, size, count, (FILE *)userPtr); -} - -static size_t internalFwrite(const void *ptr, size_t size, size_t count, void *userPtr) -{ - return fwrite(ptr, size, count, (FILE *)userPtr); -} - -/******************************************************** - * Png filter functions - ********************************************************/ -static LU_INLINE int absi(int val) -{ - return val > 0 ? val : -val; -} - -static LU_INLINE uint8_t raw(PngInfoStruct *info, size_t col) -{ - if (col > SIZE_T_MAX_POSITIVE) - return 0; - return info->currentScanline[col]; -} - -static LU_INLINE uint8_t prior(PngInfoStruct *info, size_t col) -{ - if (info->currentRow <= startingRow[info->interlacePass] || col > SIZE_T_MAX_POSITIVE) - return 0; - return info->previousScanline[col]; -} - - -static LU_INLINE uint8_t paethPredictor(uint8_t a, uint8_t b, uint8_t c) -{ - unsigned int A = a, B = b, C = c; - int p = (int)A + (int)B - (int)C; - int pa = absi(p - (int)A); - int pb = absi(p - (int)B); - int pc = absi(p - (int)C); - - if (pa <= pb && pa <= pc) - return a; - if (pb <= pc) - return b; - return c; -} - -static LU_INLINE uint8_t deSub(PngInfoStruct *info, uint8_t filtered) -{ - return filtered + raw(info, info->currentByte-info->bytesPerPixel); -} - -static LU_INLINE uint8_t deUp(PngInfoStruct *info, uint8_t filtered) -{ - return filtered + prior(info, info->currentByte); -} - -static LU_INLINE uint8_t deAverage(PngInfoStruct *info, uint8_t filtered) -{ - uint16_t avg = (uint16_t)(raw(info, info->currentByte-info->bytesPerPixel) - + prior(info, info->currentByte)); - avg >>= 1; - return filtered + avg; -} - -static LU_INLINE uint8_t dePaeth(PngInfoStruct *info, uint8_t filtered) -{ - return filtered + paethPredictor( - raw(info, 
info->currentByte-info->bytesPerPixel), - prior(info, info->currentByte), - prior(info, info->currentByte-info->bytesPerPixel)); -} - -static LU_INLINE uint8_t none(PngInfoStruct *info) -{ - return raw(info, info->currentByte); -} - -static LU_INLINE uint8_t sub(PngInfoStruct *info) -{ - return raw(info, info->currentByte) - raw(info, info->currentByte-info->bytesPerPixel); -} - -static LU_INLINE uint8_t up(PngInfoStruct *info) -{ - return raw(info, info->currentByte) - prior(info, info->currentByte); -} - -static LU_INLINE uint8_t average(PngInfoStruct *info) -{ - uint16_t avg = (uint16_t)(raw(info, info->currentByte-info->bytesPerPixel) - + prior(info, info->currentByte)); - avg >>= 1; - return raw(info, info->currentByte) - avg; -} - -static LU_INLINE uint8_t paeth(PngInfoStruct *info) -{ - return raw(info, info->currentByte) - paethPredictor( - raw(info, info->currentByte-info->bytesPerPixel), - prior(info, info->currentByte), - prior(info, info->currentByte-info->bytesPerPixel)); -} - - - -/******************************************************** - * Actual implementation - ********************************************************/ -static LU_INLINE int parseIhdr(PngInfoStruct *info, PngChunk *chunk) -{ - if (info->chunksFound) - { - LUPNG_WARN(info,"PNG: malformed PNG file!"); - return PNG_ERROR; - } - - info->chunksFound |= PNG_IHDR; - info->width = swap32(*(uint32_t *)chunk->data); - info->height = swap32(*((uint32_t *)chunk->data + 1)); - info->depth = *(chunk->data + 8); - info->colorType = *(chunk->data + 9); - info->compression = *(chunk->data + 10); - info->filter = *(chunk->data + 11); - info->interlace = *(chunk->data + 12); - - switch (info->colorType) - { - case PNG_GRAYSCALE: - info->channels = 1; - break; - case PNG_TRUECOLOR: - info->channels = 3; - break; - case PNG_PALETTED: - info->channels = 3; - break; - case PNG_GRAYSCALE_ALPHA: - info->channels = 2; - break; - case PNG_TRUECOLOR_ALPHA: - info->channels = 4; - break; - default: - 
LUPNG_WARN(info,"PNG: illegal color type: %u", - (unsigned int)info->colorType); - return PNG_ERROR; - break; - } - - if (info->width <= 0 || info->height <= 0) - { - LUPNG_WARN(info, "PNG: illegal dimensions"); - return PNG_ERROR; - } - - if ((info->colorType != PNG_GRAYSCALE && info->colorType != PNG_PALETTED && - info->depth < 8) || - (info->colorType == PNG_PALETTED && info->depth == 16) || - info->depth > 16) - { - LUPNG_WARN(info, "PNG: illegal bit depth for color type"); - return PNG_ERROR; - } - - if (info->compression) - { - LUPNG_WARN(info,"PNG: unknown compression method: %u", - (unsigned int)info->compression); - return PNG_ERROR; - } - - if (info->filter) - { - LUPNG_WARN(info,"PNG: unknown filter scheme: %u", - (unsigned int)info->filter); - return PNG_ERROR; - } - - memset(&(info->stream), 0, sizeof(info->stream)); - if(inflateInit(&(info->stream)) != Z_OK) - { - LUPNG_WARN(info, "PNG: inflateInit failed!"); - return PNG_ERROR; - } - info->img = luImageCreate(info->width, info->height, - info->channels, info->depth < 16 ? 8 : 16, NULL, info->userCtx); - info->cimg = info->img; - info->scanlineBytes = MAX((info->width * info->channels * info->depth) >> 3, 1); - info->currentScanline = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes, info->userCtx->allocProcUserPtr); - info->previousScanline = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes, info->userCtx->allocProcUserPtr); - info->currentCol = -1; - info->interlacePass = info->interlace ? 
1 : 0; - info->bytesPerPixel = MAX((info->channels * info->depth) >> 3, 1); - if (!info->img || !info->currentScanline || !info->previousScanline) - { - LUPNG_WARN(info, "PNG: memory allocation failed!"); - return PNG_ERROR; - } - - return PNG_OK; -} - -static LU_INLINE int parsePlte(PngInfoStruct *info, PngChunk *chunk) -{ - if (info->chunksFound & PNG_PLTE) - { - LUPNG_WARN(info, "PNG: too many palette chunks in file!"); - return PNG_ERROR; - } - info->chunksFound |= PNG_PLTE; - - if (info->chunksFound & PNG_IDAT || !(info->chunksFound & PNG_IHDR)) - { - LUPNG_WARN(info, "PNG: malformed PNG file!"); - return PNG_ERROR; - } - - if (info->colorType == PNG_GRAYSCALE || info->colorType == PNG_GRAYSCALE_ALPHA) - { - LUPNG_WARN(info, "PNG: palettes are not allowed in grayscale images!"); - return PNG_ERROR; - } - - if (chunk->length % 3 != 0) - { - LUPNG_WARN(info, "PNG: invalid palette size!"); - return PNG_ERROR; - } - - info->paletteItems = chunk->length/3; - info->palette = (uint8_t *)info->userCtx->allocProc(chunk->length,info->userCtx->allocProcUserPtr); - if (!info->palette) - { - LUPNG_WARN(info, "PNG: memory allocation failed!"); - return PNG_ERROR; - } - memcpy(info->palette, chunk->data, chunk->length); - - return PNG_OK; -} - -static LU_INLINE void stretchBits(uint8_t inByte, uint8_t outBytes[8], int depth) -{ - int i; - switch (depth) { - case 1: - for (i = 0; i < 8; ++i) - outBytes[i] = (inByte >> (7-i)) & 0x01; - break; - - case 2: - outBytes[0] = (inByte >> 6) & 0x03; - outBytes[1] = (inByte >> 4) & 0x03; - outBytes[2] = (inByte >> 2) & 0x03; - outBytes[3] = inByte & 0x03; - break; - - case 4: - outBytes[0] = (inByte >> 4) & 0x0F; - outBytes[1] = inByte & 0x0F; - break; - - default: - break; - } -} - -/* returns: 1 if at end of scanline, 0 otherwise */ -static LU_INLINE int insertByte(PngInfoStruct *info, uint8_t byte) -{ - int advance = 0; - const uint8_t scale[] = {0x00, 0xFF, 0x55, 0x00, 0x11, 0x00, 0x00, 0x00}; - - /* for paletted images currentElem 
will always be 0 */ - size_t idx = info->currentRow * info->width * info->channels - + info->currentCol * info->channels - + info->currentElem; - - if (info->colorType != PNG_PALETTED) - { - if (info->depth == 8) - info->cimg->data[idx] = byte; - - else if (info->depth < 8) - info->cimg->data[idx] = byte * scale[info->depth]; - - else /* depth == 16 */ - { - info->tmpBytes[info->tmpCount] = byte; - if (info->tmpCount) /* just inserted 2nd byte */ - { - uint16_t val = *(uint16_t *)info->tmpBytes; - val = swap16(val); - info->tmpCount = 0; - - ((uint16_t *)(info->cimg->data))[idx] = val; - } - else - { - ++info->tmpCount; - return 0; - } - } - - ++info->currentElem; - if (info->currentElem >= info->channels) - { - advance = 1; - info->currentElem = 0; - } - } - else - { - /* The spec limits palette size to 256 entries */ - if (byte < info->paletteItems) - { - info->cimg->data[idx ] = info->palette[3*byte ]; - info->cimg->data[idx+1] = info->palette[3*byte+1]; - info->cimg->data[idx+2] = info->palette[3*byte+2]; - } - else - { - LUPNG_WARN(info,"PNG: invalid palette index encountered!"); - } - advance = 1; - } - - if (advance) - { - /* advance to next pixel */ - info->currentCol += colIncrement[info->interlacePass]; - - if (info->currentCol >= info->width) - { - uint8_t *tmp = info->currentScanline; - info->currentScanline = info->previousScanline; - info->previousScanline = tmp; - - info->currentCol = -1; - info->currentByte = 0; - - info->currentRow += rowIncrement[info->interlacePass]; - if (info->currentRow >= info->height && info->interlace) - { - ++info->interlacePass; - while (startingCol[info->interlacePass] >= info->width || - startingRow[info->interlacePass] >= info->height) - ++info->interlacePass; - info->currentRow = startingRow[info->interlacePass]; - } - return 1; - } - } - - return 0; -} - -static LU_INLINE int parseIdat(PngInfoStruct *info, PngChunk *chunk) -{ - unsigned char filtered[BUF_SIZE]; - int status = Z_OK; - - if (!(info->chunksFound & 
PNG_IHDR)) - { - LUPNG_WARN(info,"PNG: malformed PNG file!"); - return PNG_ERROR; - } - - if (info->colorType == PNG_PALETTED && !(info->chunksFound & PNG_PLTE)) - { - LUPNG_WARN(info,"PNG: palette required but missing!"); - return PNG_ERROR; - } - - info->chunksFound |= PNG_IDAT; - info->stream.next_in = (unsigned char *)chunk->data; - info->stream.avail_in = chunk->length; - do - { - size_t decompressed; - size_t i; - - info->stream.next_out = filtered; - info->stream.avail_out = BUF_SIZE; - status = inflate(&(info->stream), Z_NO_FLUSH); - decompressed = BUF_SIZE - info->stream.avail_out; - - if (status != Z_OK && - status != Z_STREAM_END && - status != Z_BUF_ERROR && - status != Z_NEED_DICT) - { - LUPNG_WARN(info, "PNG: inflate error!"); - return PNG_ERROR; - } - - for (i = 0; - i < decompressed && info->currentCol < info->width && info->currentRow < info->height; - ++i) - { - if (info->currentCol < 0) - { - info->currentCol = startingCol[info->interlacePass]; - info->currentFilter = filtered[i]; - } - else - { - uint8_t rawByte = 0; - uint8_t fullBytes[8] = {0}; - switch (info->currentFilter) - { - case PNG_FILTER_NONE: - rawByte = filtered[i]; - break; - case PNG_FILTER_SUB: - rawByte = deSub(info, filtered[i]); - break; - case PNG_FILTER_UP: - rawByte = deUp(info, filtered[i]); - break; - case PNG_FILTER_AVERAGE: - rawByte = deAverage(info, filtered[i]); - break; - case PNG_FILTER_PAETH: - rawByte = dePaeth(info, filtered[i]); - break; - default: - break; - } - - info->currentScanline[info->currentByte] = rawByte; - ++info->currentByte; - - if (info->depth < 8) - { - int j; - stretchBits(rawByte, fullBytes, info->depth); - for (j = 0; j < 8/info->depth; ++j) - if(insertByte(info, fullBytes[j])) - break; - } - else - insertByte(info, rawByte); - } - } - } while ((info->stream.avail_in > 0 || info->stream.avail_out == 0) - && info->currentCol < info->width && info->currentRow < info->height); - - return PNG_OK; -} - -static LU_INLINE PngChunk 
*readChunk(PngInfoStruct *info) -{ - PngChunk *chunk = (PngChunk *)info->userCtx->allocProc(sizeof(PngChunk),info->userCtx->allocProcUserPtr); - size_t read = 0; - if (!chunk) - { - LUPNG_WARN(info,"PNG: memory allocation failed!"); - return NULL; - } - - info->userCtx->readProc((void *)&chunk->length, 4, 1, info->userCtx->readProcUserPtr); - chunk->length = swap32(chunk->length); - if (chunk->length+4 < chunk->length) - { - LUPNG_WARN(info, "PNG: chunk claims to be absurdly large"); - info->userCtx->freeProc(chunk, info->userCtx->freeProcUserPtr); - return NULL; - } - - // Store chunk type and contents in the same buffer for convenience - chunk->type = (uint8_t *)info->userCtx->allocProc(chunk->length + 4, info->userCtx->allocProcUserPtr); - if (!chunk->type) - { - LUPNG_WARN(info,"PNG: memory allocation failed!"); - info->userCtx->freeProc(chunk, info->userCtx->freeProcUserPtr); - return NULL; - } - chunk->data = chunk->type + 4; - info->userCtx->readProc((void *)chunk->type, 1, chunk->length + 4, info->userCtx->readProcUserPtr); - read = info->userCtx->readProc((void *)&chunk->crc, 4, 1, info->userCtx->readProcUserPtr); - chunk->crc = swap32(chunk->crc); - - for (int i = 0; i < 4; ++i) - { - char byte = chunk->type[i]; - if ((byte < 'a' || byte > 'z') && (byte < 'A' || byte > 'Z')) - { - LUPNG_WARN(info, "PNG: invalid chunk name, possibly unprintable"); - releaseChunk(chunk, info->userCtx); - return NULL; - } - } - if (read != 1) - { - LUPNG_WARN(info, "PNG: read error"); - releaseChunk(chunk, info->userCtx); - return NULL; - } - - if (crc(chunk->type, chunk->length+4) != chunk->crc) - { - LUPNG_WARN(info, "PNG: CRC mismatch in \'%.4s\' chunk", (char *)chunk->type); - releaseChunk(chunk, info->userCtx); - return NULL; - } - - return chunk; -} - -static LU_INLINE int handleChunk(PngInfoStruct *info, PngChunk *chunk) -{ - /* critical chunk */ - if (!(chunk->type[0] & 0x20)) - { - if (bytesEqual(chunk->type, (const uint8_t *)"IHDR", 4)) - return parseIhdr(info, 
chunk); - if (bytesEqual(chunk->type, (const uint8_t *)"PLTE", 4)) - return parsePlte(info, chunk); - if (bytesEqual(chunk->type, (const uint8_t *)"IDAT", 4)) - return parseIdat(info, chunk); - if (bytesEqual(chunk->type, (const uint8_t *)"IEND", 4)) - { - info->chunksFound |= PNG_IEND; - if (!(info->chunksFound & PNG_IDAT)) - { - LUPNG_WARN(info, "PNG: no IDAT chunk found"); - return PNG_ERROR; - } - return PNG_DONE; - } - } - /* ignore ancillary chunks for now */ - - return PNG_OK; -} - -LuImage *luPngReadUC(const LuUserContext *userCtx) -{ - - uint8_t signature[PNG_SIG_SIZE]; - int status = PNG_ERROR; - - PngInfoStruct info; - memset(&info, 0, sizeof(PngInfoStruct)); - info.userCtx = userCtx; - - if (!userCtx->skipSig) - { - info.userCtx->readProc((void *)signature, 1, PNG_SIG_SIZE, info.userCtx->readProcUserPtr); - status = bytesEqual(signature, PNG_SIG, PNG_SIG_SIZE) ? PNG_OK : PNG_ERROR; - } - - if (status == PNG_OK) - { - PngChunk *chunk; - while ((chunk = readChunk(&info))) - { - status = handleChunk(&info, chunk); - releaseChunk(chunk, info.userCtx); - - if (status != PNG_OK) - break; - } - } - else - LUPNG_WARN(&info, "PNG: invalid header"); - - userCtx->freeProc(info.currentScanline, userCtx->freeProcUserPtr); - userCtx->freeProc(info.previousScanline, userCtx->freeProcUserPtr); - userCtx->freeProc(info.palette, userCtx->freeProcUserPtr); - inflateEnd(&info.stream); - - if (status == PNG_DONE) - return info.img; - else - if (info.img) - luImageRelease(info.img, info.userCtx); - - return NULL; -} - -LuImage *luPngRead(PngReadProc readProc, void *userPtr, int skipSig) -{ - LuUserContext userCtx; - - luUserContextInitDefault(&userCtx); - userCtx.readProc = readProc; - userCtx.readProcUserPtr = userPtr; - userCtx.skipSig = skipSig; - return luPngReadUC(&userCtx); -} - -LuImage *luPngReadFile(const char *filename, LuUserContext *userCtx) -{ - LuUserContext tmp_userCtx; - if (userCtx == NULL) { - luUserContextInitDefault(&tmp_userCtx); - userCtx = 
&tmp_userCtx; - } - - LuImage *img; - FILE *f = fopen(filename,"rb"); - - if (f) { - userCtx->readProc = internalFread; - userCtx->readProcUserPtr = f; - img = luPngReadUC(userCtx); - fclose(f); - } else { - LUPNG_WARN_UC(userCtx, "PNG: failed to open '%s'", filename); - img = NULL; - } - - return img; -} - -static LU_INLINE int writeIhdr(PngInfoStruct *info) -{ - static uint8_t buf[17]; - static const uint8_t colorType[] = { - PNG_GRAYSCALE, - PNG_GRAYSCALE_ALPHA, - PNG_TRUECOLOR, - PNG_TRUECOLOR_ALPHA - }; - size_t written = 0; - PngChunk c; - - if (info->cimg->channels > 4) - { - LUPNG_WARN(info, "PNG: too many channels in image"); - return PNG_ERROR; - } - - c.length = swap32(13); - c.type = buf; /* 4 (type) + 4 + 4 + 5x1 */ - c.data = c.type + 4; - - memcpy((void *)c.type, (void *)"IHDR", 4); - *(uint32_t *)(c.data) = swap32((uint32_t)info->cimg->width); - *(uint32_t *)(c.data + 4) = swap32((uint32_t)info->cimg->height); - *(c.data + 8) = info->cimg->depth; - *(c.data + 9) = colorType[info->cimg->channels-1]; - *(c.data + 10) = 0; /* compression method */ - *(c.data + 11) = 0; /* filter method */ - *(c.data + 12) = 0; /* interlace method: none */ - - c.crc = swap32(crc(c.type, 17)); - - written += info->userCtx->writeProc((void *)&c.length, 4, 1, info->userCtx->writeProcUserPtr) * 4; - written += info->userCtx->writeProc((void *)c.type, 1, 4, info->userCtx->writeProcUserPtr); - written += info->userCtx->writeProc((void *)c.data, 1, 13, info->userCtx->writeProcUserPtr); - written += info->userCtx->writeProc((void *)&c.crc, 4, 1, info->userCtx->writeProcUserPtr) * 4; - - if (written != 25) - { - LUPNG_WARN(info, "PNG: write error"); - return PNG_ERROR; - } - - return PNG_OK; -} - -static LU_INLINE int writeIdat(PngInfoStruct *info, uint8_t *buf, size_t buflen) -{ - size_t written = 0; - PngChunk c; - - c.length = swap32((uint32_t)(buflen-4)); - c.crc = swap32(crc(buf, buflen)); - - written += info->userCtx->writeProc((void *)&c.length, 4, 1, 
info->userCtx->writeProcUserPtr) * 4; - written += info->userCtx->writeProc((void *)buf, 1, buflen, info->userCtx->writeProcUserPtr); - written += info->userCtx->writeProc((void *)&c.crc, 4, 1, info->userCtx->writeProcUserPtr) * 4; - - if (written != buflen+8) - { - LUPNG_WARN(info, "PNG: write error"); - return PNG_ERROR; - } - - return PNG_OK; -} - -static LU_INLINE void advanceBytep(PngInfoStruct *info, int is16bit) -{ - if (is16bit) - { - if (info->currentByte%2) - --info->currentByte; - else - info->currentByte+=3; - } - else - ++info->currentByte; -} - -static LU_INLINE size_t filterScanline(PngInfoStruct *info, - uint8_t(*f)(PngInfoStruct *info), - uint8_t filter, - uint8_t *filterCandidate, - int is16bit) -{ - size_t curSum = 0; - size_t fc; - - filterCandidate[0] = filter; - for (info->currentByte = is16bit ? 1 : 0, fc = 1; - info->currentByte < info->scanlineBytes; ++fc, advanceBytep(info, is16bit) ) - { - uint8_t val = f(info); - filterCandidate[fc] = val; - curSum += val; - } - - return curSum; -} - -/* - * Processes the input image and calls writeIdat for every BUF_SIZE compressed - * bytes. 
- */ -static LU_INLINE int processPixels(PngInfoStruct *info) -{ - uint8_t idatBuf[BUF_SIZE+4] = {'I', 'D', 'A', 'T'}; - uint8_t *compressed = idatBuf+4; - uint8_t *filterCandidate = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes+1, info->userCtx->allocProcUserPtr); - uint8_t *bestCandidate = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes+1, info->userCtx->allocProcUserPtr); - size_t minSum = (size_t)-1, curSum = 0; - int status = Z_OK; - int is16bit = info->cimg->depth == 16; - - if (!filterCandidate || !bestCandidate) - { - LUPNG_WARN(info, "PNG: memory allocation failed!"); - } - - memset(&(info->stream), 0, sizeof(info->stream)); - if(deflateInit(&(info->stream), info->userCtx->compressionLevel) != Z_OK) - { - LUPNG_WARN(info, "PNG: deflateInit failed!"); - info->userCtx->freeProc(filterCandidate, info->userCtx->freeProcUserPtr); - info->userCtx->freeProc(bestCandidate, info->userCtx->freeProcUserPtr); - return PNG_ERROR; - } - - info->stream.avail_out = BUF_SIZE; - info->stream.next_out = compressed; - - for (info->currentRow = 0; info->currentRow < info->cimg->height; - ++info->currentRow) - { - int flush = (info->currentRow < info->cimg->height-1) ? - Z_NO_FLUSH : Z_FINISH; - minSum = (size_t)-1; - - /* - * 1st time it doesn't matter, the filters never look at the previous - * scanline when processing row 0. And next time it'll be valid. - */ - info->previousScanline = info->currentScanline; - info->currentScanline = info->cimg->data + (info->currentRow*info->scanlineBytes); - - /* - * Try to choose the best filter for each scanline. - * Breaks in case of overflow, but hey it's just a heuristic. 
- */ - for (info->currentFilter = PNG_FILTER_NONE; info->currentFilter <= PNG_FILTER_PAETH; ++info->currentFilter) - { - - switch (info->currentFilter) - { - case PNG_FILTER_NONE: - curSum = filterScanline(info, none, PNG_FILTER_NONE, filterCandidate, is16bit); - break; - - case PNG_FILTER_SUB: - curSum = filterScanline(info, sub, PNG_FILTER_SUB, filterCandidate, is16bit); - break; - - case PNG_FILTER_UP: - curSum = filterScanline(info, up, PNG_FILTER_UP, filterCandidate, is16bit); - break; - - case PNG_FILTER_AVERAGE: - curSum = filterScanline(info, average, PNG_FILTER_AVERAGE, filterCandidate, is16bit); - break; - - case PNG_FILTER_PAETH: - curSum = filterScanline(info, paeth, PNG_FILTER_PAETH, filterCandidate, is16bit); - break; - - default: - break; - } - - if (curSum < minSum || !info->currentFilter) - { - uint8_t *tmp = bestCandidate; - bestCandidate = filterCandidate; - filterCandidate = tmp; - minSum = curSum; - } - } - - info->stream.avail_in = (unsigned int)info->scanlineBytes+1; - info->stream.next_in = bestCandidate; - - /* compress bestCandidate */ - do - { - status = deflate(&info->stream, flush); - - if (info->stream.avail_out < BUF_SIZE) - { - writeIdat(info, idatBuf, BUF_SIZE-info->stream.avail_out+4); - info->stream.next_out = compressed; - info->stream.avail_out = BUF_SIZE; - } - } while ((flush == Z_FINISH && status != Z_STREAM_END) - || (flush == Z_NO_FLUSH && info->stream.avail_in)); - } - - info->userCtx->freeProc(filterCandidate, info->userCtx->freeProcUserPtr); - info->userCtx->freeProc(bestCandidate, info->userCtx->freeProcUserPtr); - - return PNG_OK; -} - -static LU_INLINE int writeIend(PngInfoStruct *info) -{ - PngChunk c = { 0, (uint8_t *)"IEND", 0, 0 }; - size_t written = 0; - c.crc = swap32(crc(c.type, 4)); - - written += info->userCtx->writeProc((void *)&c.length, 4, 1, info->userCtx->writeProcUserPtr) * 4; - written += info->userCtx->writeProc((void *)c.type, 1, 4, info->userCtx->writeProcUserPtr); - written += 
info->userCtx->writeProc((void *)&c.crc, 4, 1, info->userCtx->writeProcUserPtr) * 4; - - if (written != 12) - { - LUPNG_WARN(info, "PNG: write error"); - return PNG_ERROR; - } - - return PNG_OK; -} - -int luPngWriteUC(const LuUserContext *userCtx, const LuImage *img) -{ - PngInfoStruct info; - memset(&info, 0, sizeof(PngInfoStruct)); - info.userCtx = userCtx; - info.cimg = img; - info.bytesPerPixel = (info.cimg->channels * info.cimg->depth) >> 3; - - if (info.userCtx->writeProc((void *)PNG_SIG, 1, PNG_SIG_SIZE, info.userCtx->writeProcUserPtr) != PNG_SIG_SIZE) - { - LUPNG_WARN(&info, "PNG: write error"); - return PNG_ERROR; - } - - if (writeIhdr(&info) != PNG_OK) - return PNG_ERROR; - - info.scanlineBytes = (info.cimg->depth >> 3) * info.cimg->channels * info.cimg->width; - if (processPixels(&info) != PNG_OK) - { - deflateEnd(&(info.stream)); - return PNG_ERROR; - } - - deflateEnd(&(info.stream)); - return writeIend(&info); -} - -int luPngWrite(PngWriteProc writeProc, void *userPtr, const LuImage *img) -{ - LuUserContext userCtx; - - luUserContextInitDefault(&userCtx); - userCtx.writeProc = writeProc; - userCtx.writeProcUserPtr = userPtr; - return luPngWriteUC(&userCtx, img); -} - -int luPngWriteFile(const char *filename, const LuImage *img) -{ - LuUserContext userCtx; - FILE *f; - - if (!img) - { - return PNG_ERROR; - } - - f = fopen(filename,"wb"); - luUserContextInitDefault(&userCtx); - if (f) - { - userCtx.writeProc = internalFwrite; - userCtx.writeProcUserPtr = f; - luPngWriteUC(&userCtx, img); - fclose(f); - } - else - { - LUPNG_WARN_UC(&userCtx, "PNG: failed to open '%s'", filename); - return PNG_ERROR; - } - - return PNG_OK; -} - -void luImageRelease(LuImage *img, const LuUserContext *userCtx) -{ - LuUserContext ucDefault; - - if (userCtx == NULL) - { - luUserContextInitDefault(&ucDefault); - userCtx = &ucDefault; - } - - userCtx->freeProc(img->data, userCtx->freeProcUserPtr); - if (userCtx->overrideImage != img) - userCtx->freeProc(img, 
userCtx->freeProcUserPtr); -} - -LuImage *luImageCreate(size_t width, size_t height, uint8_t channels, uint8_t depth, - uint8_t *buffer, const LuUserContext *userCtx) -{ - LuImage *img; - LuUserContext ucDefault; - - if (userCtx == NULL) { - luUserContextInitDefault(&ucDefault); - userCtx = &ucDefault; - } - - if (depth != 8 && depth != 16) - { - LUPNG_WARN_UC(userCtx,"Image: only bit depths 8 and 16 are supported!"); - return NULL; - } - if (width > 0x7FFFFFFF || height > 0x7FFFFFFF) { - LUPNG_WARN_UC(userCtx, "Image: only 32 bit signed image dimensions are supported!"); - return NULL; - } - - if (userCtx->overrideImage) - img = userCtx->overrideImage; - else - img = (LuImage *)userCtx->allocProc(sizeof(LuImage), userCtx->allocProcUserPtr); - if (!img) - return NULL; - - img->width = (int32_t)width; - img->height = (int32_t)height; - img->channels = channels; - img->depth = depth; - img->dataSize = (size_t)((depth >> 3) * width * height * channels); - if (buffer) - img->data = buffer; - else - img->data = (uint8_t *)userCtx->allocProc(img->dataSize, userCtx->allocProcUserPtr); - - if (img->data == NULL) - { - luImageRelease(img, userCtx); - return NULL; - } - - return img; -} - -uint8_t *luImageExtractBufAndRelease(LuImage *img, const LuUserContext *userCtx) -{ - uint8_t *data; - LuUserContext ucDefault; - - if (userCtx == NULL) { - luUserContextInitDefault(&ucDefault); - userCtx = &ucDefault; - } - - if (img) - { - data = img->data; - img->data = NULL; - luImageRelease(img, userCtx); - } - else - { - data = NULL; - } - - return data; -} - -void luUserContextInitDefault(LuUserContext *userCtx) -{ - userCtx->readProc=NULL; - userCtx->readProcUserPtr=NULL; - userCtx->skipSig = 0; - - userCtx->writeProc=NULL; - userCtx->writeProcUserPtr=NULL; - userCtx->compressionLevel=Z_DEFAULT_COMPRESSION; - - userCtx->allocProc=internalMalloc; - userCtx->allocProcUserPtr=NULL; - userCtx->freeProc=internalFree; - userCtx->freeProcUserPtr=NULL; - - userCtx->warnProc=internalPrintf; 
- userCtx->warnProcUserPtr=(void*)stderr; - - userCtx->overrideImage=NULL; -} \ No newline at end of file diff --git a/tests/regression/tex/lupng.h b/tests/regression/tex/lupng.h deleted file mode 100644 index 5c3f8465..00000000 --- a/tests/regression/tex/lupng.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - * The MIT License (MIT) - * - * Copyright (c) 2014 Jan Solanti - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifdef __cplusplus -extern "C" { -#endif - -#pragma once - -#if defined(_MSC_VER) && (_MSC_VER < 1600) -typedef __int8 int8_t; -typedef __int16 int16_t; -typedef __int32 int32_t; -typedef unsigned __int8 uint8_t; -typedef unsigned __int16 uint16_t; -typedef unsigned __int32 uint32_t; -#else -#include -#include -#endif - -typedef struct { - int32_t width; - int32_t height; - uint8_t channels; - uint8_t depth; /* must be 8 or 16 */ - size_t dataSize; - uint8_t *data; -} LuImage; - -typedef size_t (*PngReadProc)(void *outPtr, size_t size, size_t count, void *userPtr); -typedef size_t (*PngWriteProc)(const void *inPtr, size_t size, size_t count, void *userPtr); -typedef void* (*PngAllocProc)(size_t size, void *userPtr); -typedef void (*PngFreeProc)(void *ptr, void *userPtr); -typedef void (*PngWarnProc)(void *userPtr, const char *fmt, ...); - -typedef struct { - /* loader */ - PngReadProc readProc; - void *readProcUserPtr; - int skipSig; - - /* writer */ - PngWriteProc writeProc; - void *writeProcUserPtr; - int compressionLevel; - - /* memory allocation */ - PngAllocProc allocProc; - void *allocProcUserPtr; - PngFreeProc freeProc; - void *freeProcUserPtr; - - /* warnings/error output */ - PngWarnProc warnProc; /* set to NULL to disable output altogether */ - void *warnProcUserPtr; - - /* special case: avoid allocating a LuImage when loading or creating - * an image, just use this one */ - LuImage *overrideImage; -} LuUserContext; - -/** - * Initializes a LuUserContext to use the defaul malloc implementation. - * - * @param userCtx the LuUserContext to initialize - */ -void luUserContextInitDefault(LuUserContext *userCtx); - -/** - * Creates a new Image object with the specified attributes. - * The data store of the Image is allocated but its contents are undefined. - * Only 8 and 16 bits deep images with 1-4 channels are supported. 
- * - * @param buffer pointer to an existing buffer (which may already contain the - * image data), or NULL to internally allocate a new buffer - * @param userCtx the user context (with the memory allocator function - * pointers to use), or NULL to use the default allocator - * (malloc). - */ -LuImage *luImageCreate(size_t width, size_t height, uint8_t channels, uint8_t depth, - uint8_t *buffer, const LuUserContext *usrCtx); - -/** - * Releases the memory associated with the given Image object. - * - * @param userCtx the user context (with the memory deallocator function - * pointers to use), or NULL to use the default deallocator - * (free). The deallocator should match the ones used for - * allocation. - */ -void luImageRelease(LuImage *img, const LuUserContext *usrCtx); - -/** - * Extracts the raw image buffer form a LuImage and releases the - * then-orphaned LuImage object. This can be used if you want to use - * the image data in your own structures. - * - * @param userCtx the user context (with the memory deallocator function - * pointers to use), or NULL to use the default deallocator - * (free). The deallocator should match the ones used for - * allocation. - */ -uint8_t *luImageExtractBufAndRelease(LuImage *img, const LuUserContext *userCtx); - -/** - * Decodes a PNG image from a file - * - * @param filename the file name (optionally with full path) to read from. - * @param userCtx the user context (with the memory allocator function - * pointers to use), or NULL to use the default allocator - * (malloc). - */ -LuImage *luPngReadFile(const char *filename, LuUserContext *userCtx); - -/** - * Decodes a PNG image with the provided read function into a LuImage struct - * - * @param readProc a function pointer to a user-defined function to use for - * reading the PNG data. 
- * @param userPtr an opaque pointer provided as an argument to readProc - * @param skipSig don't verify PNG signature - the bytes have already been - * removed from the input stream - */ -LuImage *luPngRead(PngReadProc readProc, void *userPtr, int skipSig); - -/** - * Decodes a PNG image with the provided user context into a LuImage struct - * - * @param userCtx the LuUserContext to use - */ -LuImage *luPngReadUC(const LuUserContext *userCtx); - -/** - * Encodes a LuImage struct to PNG and writes it out to a file. - * - * @param filename the file name (optionally with full path) to write to. - * Existing files will be overwritten! - * @param img the LuImage to encode - */ -int luPngWriteFile(const char *filename, const LuImage *img); - -/** - * Encodes a LuImage struct to PNG and writes it out using a user-defined write - * function. - * - * @param writeProc a function pointer to a user-defined function that will be - * used for writing the final PNG data. - * @param userPtr an opaque pointer provided as an argument to writeProc - * @param img the LuImage to encode - */ -int luPngWrite(PngWriteProc writeProc, void *userPtr, const LuImage *img); - -/** - * Encodes a LuImage struct to PNG and writes it out with the provided user - * context. 
- * - * @param userCtx the LuUserContext to use - * @param img the LuImage to encode - */ -int luPngWriteUC(const LuUserContext *userCtx, const LuImage *img); - -#ifdef __cplusplus -} -#endif \ No newline at end of file diff --git a/tests/regression/tex/main.cpp b/tests/regression/tex/main.cpp index 39ffea0c..5ea47cc0 100644 --- a/tests/regression/tex/main.cpp +++ b/tests/regression/tex/main.cpp @@ -9,6 +9,8 @@ #include "common.h" #include "utils.h" +using namespace cocogfx; + #define RT_CHECK(_expr) \ do { \ int _ret = _expr; \ @@ -25,7 +27,7 @@ const char* kernel_file = "kernel.bin"; const char* input_file = "palette64.png"; const char* output_file = "output.png"; int wrap = 0; -int filter = 0; +int filter = 0; // 0-> point, 1->bilinear, 2->trilinear float scale = 1.0f; int format = 0; bool use_sw = false; @@ -41,13 +43,13 @@ static void show_usage() { static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "zi:o:k:w:f:g:h?")) != -1) { + while ((c = getopt(argc, argv, "zi:o:k:w:f:g:s:h?")) != -1) { switch (c) { case 'i': - input_file = optarg; + input_file = optarg; break; case 'o': - output_file = optarg; + output_file = optarg; break; case 's': scale = std::stof(optarg, NULL); @@ -63,9 +65,11 @@ static void parse_args(int argc, char **argv) { switch (format) { case 0: eformat = FORMAT_A8R8G8B8; break; case 1: eformat = FORMAT_R5G6B5; break; - case 2: eformat = FORMAT_R4G4B4A4; break; - case 3: eformat = FORMAT_L8; break; - case 4: eformat = FORMAT_A8; break; + case 2: eformat = FORMAT_A1R5G5B5; break; + case 3: eformat = FORMAT_A4R4G4B4; break; + case 4: eformat = FORMAT_A8L8; break; + case 5: eformat = FORMAT_L8; break; + case 6: eformat = FORMAT_A8; break; default: std::cout << "Error: invalid format: " << format << std::endl; exit(1); @@ -101,7 +105,9 @@ void cleanup() { int run_test(const kernel_arg_t& kernel_arg, uint32_t buf_size, uint32_t width, - uint32_t height) { + uint32_t height, + uint32_t bpp) { + (void)bpp; auto 
time_start = std::chrono::high_resolution_clock::now(); // start device @@ -110,7 +116,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); auto time_end = std::chrono::high_resolution_clock::now(); double elapsed = std::chrono::duration_cast(time_end - time_start).count(); @@ -118,7 +124,7 @@ int run_test(const kernel_arg_t& kernel_arg, // download destination buffer std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); + RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_addr, buf_size, 0)); std::vector dst_pixels(buf_size); auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); @@ -128,7 +134,7 @@ int run_test(const kernel_arg_t& kernel_arg, // save output image std::cout << "save output image" << std::endl; - //dump_image(dst_pixels, width, height, bpp); + //dump_image(dst_pixels, width, height, bpp); RT_CHECK(SaveImage(output_file, FORMAT_A8R8G8B8, dst_pixels, width, height)); return 0; @@ -137,25 +143,31 @@ int run_test(const kernel_arg_t& kernel_arg, int main(int argc, char *argv[]) { kernel_arg_t kernel_arg; std::vector src_pixels; + std::vector mip_offsets; uint32_t src_width; uint32_t src_height; // parse command arguments parse_args(argc, argv); - RT_CHECK(LoadImage(input_file, eformat, src_pixels, &src_width, &src_height)); + { + std::vector staging; + RT_CHECK(LoadImage(input_file, eformat, staging, &src_width, &src_height)); + uint32_t src_bpp = GetInfo(eformat).BytePerPixel; + //dump_image(staging, src_width, src_height, src_bpp); + RT_CHECK(GenerateMipmaps(src_pixels, mip_offsets, staging, eformat, src_width, src_height, src_width * src_bpp)); + } // check power of two support - if (!ISPOW2(src_width) || !ISPOW2(src_height)) { + if (!ispow2(src_width) || !ispow2(src_height)) { std::cout << "Error: only power of two textures supported: 
width=" << src_width << ", heigth=" << src_height << std::endl; return -1; } - uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel; - - //dump_image(src_pixels, src_width, src_height, src_bpp); + uint32_t src_logwidth = log2ceil(src_width); + uint32_t src_logheight = log2ceil(src_height); - uint32_t src_bufsize = src_bpp * src_width * src_height; + uint32_t src_bufsize = src_pixels.size(); uint32_t dst_width = (uint32_t)(src_width * scale); uint32_t dst_height = (uint32_t)(src_height * scale); @@ -166,7 +178,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); @@ -183,7 +195,7 @@ int main(int argc, char *argv[]) { // allocate device memory std::cout << "allocate device memory" << std::endl; - size_t src_addr, dst_addr; + uint64_t src_addr, dst_addr; RT_CHECK(vx_alloc_dev_mem(device, src_bufsize, &src_addr)); RT_CHECK(vx_alloc_dev_mem(device, dst_bufsize, &dst_addr)); @@ -192,32 +204,36 @@ int main(int argc, char *argv[]) { // allocate staging shared memory std::cout << "allocate shared memory" << std::endl; - uint32_t alloc_size = std::max(sizeof(kernel_arg_t), std::max(src_bufsize, dst_bufsize)); + uint32_t alloc_size = std::max(sizeof(kernel_arg_t), + std::max(src_bufsize, dst_bufsize)); RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer)); // upload kernel argument std::cout << "upload kernel argument" << std::endl; { + kernel_arg.use_sw = use_sw; kernel_arg.num_tasks = std::min(num_tasks, dst_height); kernel_arg.format = format; kernel_arg.filter = filter; - kernel_arg.wrap = wrap; - kernel_arg.use_sw = use_sw; - kernel_arg.lod = 0x0; + kernel_arg.wrapu = wrap; + kernel_arg.wrapv = wrap; - 
kernel_arg.src_logWidth = (uint32_t)std::log2(src_width); - kernel_arg.src_logHeight = (uint32_t)std::log2(src_height); - kernel_arg.src_stride = src_bpp; - kernel_arg.src_pitch = src_bpp * src_width; - kernel_arg.src_ptr = src_addr; + kernel_arg.src_logwidth = src_logwidth; + kernel_arg.src_logheight = src_logheight; + kernel_arg.src_addr = src_addr; + + for (uint32_t i = 0; i < mip_offsets.size(); ++i) { + assert(i < TEX_LOD_MAX); + kernel_arg.mip_offs[i] = mip_offsets.at(i); + } kernel_arg.dst_width = dst_width; kernel_arg.dst_height = dst_height; kernel_arg.dst_stride = dst_bpp; kernel_arg.dst_pitch = dst_bpp * dst_width; - kernel_arg.dst_ptr = dst_addr; + kernel_arg.dst_addr = dst_addr; - auto buf_ptr = (int*)vx_host_ptr(buffer); + auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); } @@ -225,26 +241,26 @@ int main(int argc, char *argv[]) { // upload source buffer std::cout << "upload source buffer" << std::endl; { - auto buf_ptr = (int8_t*)vx_host_ptr(buffer); + auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); for (uint32_t i = 0; i < src_bufsize; ++i) { buf_ptr[i] = src_pixels[i]; } - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, src_bufsize, 0)); + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_addr, src_bufsize, 0)); } // clear destination buffer std::cout << "clear destination buffer" << std::endl; { - auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + auto buf_ptr = (uint32_t*)vx_host_ptr(buffer); for (uint32_t i = 0; i < (dst_bufsize/4); ++i) { buf_ptr[i] = 0xdeadbeef; } - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, dst_bufsize, 0)); + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_addr, dst_bufsize, 0)); } // run tests std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, dst_bufsize, dst_width, dst_height)); + RT_CHECK(run_test(kernel_arg, dst_bufsize, dst_width, dst_height, dst_bpp)); // cleanup 
std::cout << "cleanup" << std::endl; diff --git a/tests/regression/tex/surfacedesc.h b/tests/regression/tex/surfacedesc.h deleted file mode 100644 index cf303584..00000000 --- a/tests/regression/tex/surfacedesc.h +++ /dev/null @@ -1,25 +0,0 @@ -// -// Copyright (c) Blaise Tine. All rights reserved. -// -// -// Use of this sample source code is subject to the terms of the Microsoft -// license agreement under which you licensed this sample source code. If -// you did not accept the terms of the license agreement, you are not -// authorized to use this sample source code. For the terms of the license, -// please see the license agreement between you and Microsoft or, if applicable, -// see the LICENSE.RTF on your install media or the root of your tools -// installation. -// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR -// INDEMNITIES. -// -#pragma once - -#include "format.h" - -struct SurfaceDesc { - ePixelFormat Format; - uint8_t *pBits; - uint32_t Width; - uint32_t Height; - uint32_t Pitch; -}; \ No newline at end of file diff --git a/tests/regression/tex/texsw.h b/tests/regression/tex/texsw.h index 96b9a19e..2eecb079 100644 --- a/tests/regression/tex/texsw.h +++ b/tests/regression/tex/texsw.h @@ -1,167 +1,125 @@ -#ifndef _TEXSW_H_ +#pragma once +#include +#include #include "common.h" -#define TEX_LOD_MAX 11 +using namespace cocogfx; -#define MIN(x, y) ((x < y) ? (x) : (y)) - -#define MAX(x, y) ((x > y) ? 
(x) : (y)) - -inline int address(int wrap, int value) { - switch (wrap) { - case 1: return value & 0xfffff; - default: - case 0: return MIN(MAX(value, 0), 0xfffff); +inline void texel_read(uint32_t* texels, + uint8_t** addresses, + uint32_t count, + uint32_t stride) { + switch (stride) { + case 1: + for (uint32_t i = 0; i < count; ++i) { + texels[i] = *(uint8_t*)addresses[i]; + } + break; + case 2: + for (uint32_t i = 0; i < count; ++i) { + texels[i] = *(uint16_t*)addresses[i]; + } + break; + case 4: + for (uint32_t i = 0; i < count; ++i) { + texels[i] = *(uint32_t*)addresses[i]; + } + break; + default: + std::abort(); } } -inline void unpack(int format, int value, int* l, int* h) { - switch (format) { - case 1: - case 2: - *l = value; - *h = 0; - break; - case 3: - *l = (value | (value << 8)) & 0x00ff00ff; - *h = 0; - break; - case 4: - *l = (value | (value << 16)) & 0x07e0f81f; - *h = 0; - break; - case 5: - *l = (value | (value << 12)) & 0x0f0f0f0f; - *h = 0; - break; - default: - case 0: - *l = value & 0x00ff00ff; - *h = (value >> 8) & 0x00ff00ff; - break; - } -} +inline uint32_t vx_tex_sw(kernel_arg_t* state, + Fixed xu, + Fixed xv, + uint32_t lod) { + uint8_t* base_addr = ((uint8_t*)state->src_addr) + state->mip_offs[lod]; + uint32_t log_width = std::max(state->src_logwidth - lod, 0); + uint32_t log_height = std::max(state->src_logheight - lod, 0); + auto format = (TexFormat)state->format; + auto wrapu = (WrapMode)state->wrapu; + auto wrapv = (WrapMode)state->wrapv; + auto filter = state->filter; + auto stride = Stride(format); -inline void lerp(int al, int ah, int bl, int bh, int frac, int* l, int* h) { - *l = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; - *h = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; -} - -inline int pack(int format, int l, int h) { - switch (format) { - case 1: - case 2: - return l; - case 3: - return (l | (l >> 8)) & 0xffff; - case 4: - return (l | (l >> 16)) & 0xffff; - case 5: - return (l | (l >> 12)) & 0xffff; - default: - case 
0: - return (h << 8) | l; - } -} - -inline int tex_sw(kernel_arg_t* state, int stage, int u, int v, int lod) { - int base_addr = state->src_ptr; - int mip_offset = 0; - int log_width = state->src_logWidth; - int log_height = state->src_logHeight; - int format = state->format; - int wrap = state->wrap; - int filter = state->filter; - - int32_t* pBits = ((uint32_t*)base_addr) + mip_offset; + uint32_t color; if (filter) { - int u0 = address(wrap, u - (0x80000 >> log_width)); - int v0 = address(wrap, v - (0x80000 >> log_height)); - int u1 = address(wrap, u + (0x80000 >> log_width)); - int v1 = address(wrap, v + (0x80000 >> log_height)); + // addressing + uint32_t offset00, offset01, offset10, offset11; + uint32_t alpha, beta; + uint8_t* addr[4]; + uint32_t texel[4]; - int x0 = u0 >> (20 - log_width); - int y0 = v0 >> (20 - log_height); - int x1 = u1 >> (20 - log_width); - int y1 = v1 >> (20 - log_height); + TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, + &offset00, &offset01, &offset10, &offset11, &alpha, &beta); - // memory lookup + addr[0] = base_addr + offset00 * stride; + addr[1] = base_addr + offset01 * stride; + addr[2] = base_addr + offset10 * stride; + addr[3] = base_addr + offset11 * stride; - int c0 = pBits[x0 + (y0 << log_width)]; - int c1 = pBits[x1 + (y0 << log_width)]; - int c2 = pBits[x0 + (y1 << log_width)]; - int c3 = pBits[x1 + (y1 << log_width)]; + // memory fetch + texel_read(texel, addr, 4, stride); // filtering - - int alpha = x0 & 0xff; - int beta = y0 & 0xff; - - int c0a, c0b; - int c1a, c1b; - int c01a, c01b; - - unpack(format, c0, &c0a, &c0b); - unpack(format, c1, &c1a, &c1b); - lerp(c0a, c0b, c1a, c1b, alpha, &c01a, &c01b); - - int c2a, c2b; - int c3a, c3b; - int c23a, c23b; - - unpack(format, c2, &c2a, &c2b); - unpack(format, c3, &c3a, &c3b); - lerp(c2a, c2b, c3a, c3b, alpha, &c23a, &c23b); - - int c4a, c4b; - lerp(c01a, c01b, c23a, c23b, beta, &c4a, &c4b); - return pack(format, c4a, c4b); + color = TexFilterLinear( + format, 
texel[0], texel[1], texel[2], texel[3], alpha, beta); } else { - int u0 = address(wrap, u); - int v0 = address(wrap, v); + // addressing + uint32_t offset; + uint8_t* addr; + uint32_t texel; - int x0 = u0 >> (20 - log_width); - int y0 = v0 >> (20 - log_height); + TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset); + + addr = base_addr + offset * stride; + + // memory fetch + texel_read(&texel, &addr, 1, stride); - int c0 = pBits[x0 + (y0 <> 8) & 0x00ff00ff; - int bl = b & 0x00ff00ff; - int bh = (b >> 8) & 0x00ff00ff; - int frac = (lod >> 12) & 0xff; - int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; - int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; - int c = al | (ah << 8); - return c; -} - -inline int tex3_sw(kernel_arg_t* state, int stage, int u, int v, int lod) { - int lodn = MIN(lod + 0x10000, TEX_LOD_MAX); - int a = tex_sw(state, 0, u, v, lod); - int b = tex_sw(state, 0, u, v, lodn); - int al = a & 0x00ff00ff; - int ah = (a >> 8) & 0x00ff00ff; - - int bl = b & 0x00ff00ff; - int bh = (b >> 8) & 0x00ff00ff; - int frac = (lod >> 12) & 0xff; - int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; - int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; - int c = al | (ah << 8); - return c; -} - -#endif \ No newline at end of file +inline uint32_t tex_load(kernel_arg_t* state, + Fixed xu, + Fixed xv, + Fixed<16> xj) { + uint32_t color; + uint32_t j = std::max(xj.data(), Fixed<16>::ONE); + uint32_t l = std::min(log2floor(j) - 16, TEX_LOD_MAX); + if (state->filter == 2) { + uint32_t ln = std::min(l + 1, TEX_LOD_MAX); + uint32_t f = (j - (1 << (l + 16))) >> (l + 16 - 8); + uint32_t texel0, texel1; + if (state->use_sw) { + texel0 = vx_tex_sw(state, xu, xv, l); + texel1 = vx_tex_sw(state, xu, xv, ln); + } else { + texel0 = vx_tex(0, xu.data(), xv.data(), l); + texel1 = vx_tex(0, xu.data(), xv.data(), ln); + } + uint32_t cl, ch; + { + uint32_t c0l, c0h, c1l, c1h; + Unpack8888(texel0, &c0l, &c0h); + Unpack8888(texel1, &c1l, &c1h); + cl 
= Lerp8888(c0l, c1l, f); + ch = Lerp8888(c0h, c1h, f); + } + color = Pack8888(cl, ch); + //vx_printf("j=0x%x, l=%d, ln=%d, f=%d, texel0=0x%x, texel1=0x%x, color=0x%x\n", j, l, ln, f, texel0, texel1, color); + } else { + if (state->use_sw) { + color = vx_tex_sw(state, xu, xv, l); + } else { + color = vx_tex(0, xu.data(), xv.data(), l); + } + } + return color; +} \ No newline at end of file diff --git a/tests/regression/tex/tga.cpp b/tests/regression/tex/tga.cpp deleted file mode 100644 index 62641587..00000000 --- a/tests/regression/tex/tga.cpp +++ /dev/null @@ -1,122 +0,0 @@ -#include "tga.h" -#include -#include -#include "format.h" - -struct __attribute__((__packed__)) tga_header_t { - int8_t idlength; - int8_t colormaptype; - int8_t imagetype; - int16_t colormaporigin; - int16_t colormaplength; - int8_t colormapdepth; - int16_t xoffset; - int16_t yoffset; - int16_t width; - int16_t height; - int8_t bitsperpixel; - int8_t imagedescriptor; -}; - -int LoadTGA(const char *filename, - std::vector &pixels, - uint32_t *width, - uint32_t *height, - uint32_t *bpp) { - std::ifstream ifs(filename, std::ios::in | std::ios::binary); - if (!ifs.is_open()) { - std::cerr << "couldn't open file: " << filename << "!" << std::endl; - return -1; - } - - tga_header_t header; - ifs.read(reinterpret_cast(&header), sizeof(tga_header_t)); - if (ifs.fail()) { - std::cerr << "invalid TGA file header!" << std::endl; - return -1; - } - - if (header.imagetype != 2) { - std::cerr << "unsupported TGA encoding format!" << std::endl; - return -1; - } - - ifs.seekg(header.idlength, std::ios::cur); // skip string - if (ifs.fail()) { - std::cerr << "invalid TGA file!" << std::endl; - return -1; - } - - switch (header.bitsperpixel) { - case 16: - case 24: - case 32: { - // Read pixels data - auto stride = header.bitsperpixel / 8; - pixels.resize(stride * header.width * header.height); - ifs.read((char*)pixels.data(), pixels.size()); - if (ifs.fail()) { - std::cerr << "invalid TGA file!" 
<< std::endl; - return -1; - } - *bpp = stride; - break; - } - default: - std::cerr << "unsupported TGA bitsperpixel!" << std::endl; - return -1; - } - - *width = header.width; - *height = header.height; - - return 0; -} - -int SaveTGA(const char *filename, - const std::vector &pixels, - uint32_t width, - uint32_t height, - uint32_t bpp) { - std::ofstream ofs(filename, std::ios::out | std::ios::binary); - if (!ofs.is_open()) { - std::cerr << "couldn't create file: " << filename << "!" << std::endl; - return -1; - } - - if (bpp < 2 || bpp > 4) { - std::cerr << "unsupported pixel stride: " << bpp << "!" << std::endl; - return -1; - } - - tga_header_t header; - header.idlength = 0; - header.colormaptype = 0; // no palette - header.imagetype = 2; // color mapped data - header.colormaporigin = 0; - header.colormaplength = 0; - header.colormapdepth = 0; - header.xoffset = 0; - header.yoffset = 0; - header.width = width; - header.height = height; - header.bitsperpixel = bpp * 8; - header.imagedescriptor = 0; - - // write header - ofs.write(reinterpret_cast(&header), sizeof(tga_header_t)); - - // write pixel data - uint32_t pitch = bpp * width; - const uint8_t* pixel_bytes = pixels.data() + (height - 1) * pitch; - for (uint32_t y = 0; y < height; ++y) { - const uint8_t* pixel_row = pixel_bytes; - for (uint32_t x = 0; x < width; ++x) { - ofs.write((const char*)pixel_row, bpp); - pixel_row += bpp; - } - pixel_bytes -= pitch; - } - - return 0; -} \ No newline at end of file diff --git a/tests/regression/tex/tga.h b/tests/regression/tex/tga.h deleted file mode 100644 index 24b92a75..00000000 --- a/tests/regression/tex/tga.h +++ /dev/null @@ -1,14 +0,0 @@ -#include -#include - -int LoadTGA(const char *filename, - std::vector &pixels, - uint32_t *width, - uint32_t *height, - uint32_t *bpp); - -int SaveTGA(const char *filename, - const std::vector &pixels, - uint32_t width, - uint32_t height, - uint32_t bpp); \ No newline at end of file diff --git a/tests/regression/tex/utils.cpp 
b/tests/regression/tex/utils.cpp index 8a2ff760..aee98293 100644 --- a/tests/regression/tex/utils.cpp +++ b/tests/regression/tex/utils.cpp @@ -1,10 +1,12 @@ #include "utils.h" #include -#include -#include "blitter.h" -#include "format.h" -#include "tga.h" -#include "lupng.h" +#include +#include +#include +#include +#include + +using namespace cocogfx; std::string getFileExt(const std::string& str) { auto i = str.rfind('.'); @@ -41,22 +43,9 @@ int LoadImage(const char *filename, return ret; } else if (iequals(ext, "png")) { - auto image = luPngReadFile(filename, NULL); - if (image == NULL) - return -1; - if (image->depth != 8 - || (image->channels != 3 - && image->channels != 4)) { - luImageRelease(image, NULL); - std::cerr << "invalid png file format!" << std::endl; - return -1; - } - pixels.resize(image->channels * image->width * image->height); - memcpy(pixels.data(), image->data, pixels.size()); - img_width = image->width; - img_height = image->height; - img_bpp = image->channels; - luImageRelease(image, NULL); + int ret = LoadPNG(filename, pixels, &img_width, &img_height, &img_bpp); + if (ret) + return ret; } else { std::cerr << "invalid file extension: " << ext << "!" 
<< std::endl; return -1; @@ -83,7 +72,7 @@ int LoadImage(const char *filename, if (img_format != format) { // format conversion to RGBA std::vector staging; - int ret = ConvertImage(staging, pixels, img_width, img_height, img_format, format); + int ret = ConvertImage(staging, format, pixels, img_format, img_width, img_height, img_width * img_bpp); if (ret) return ret; pixels.swap(staging); @@ -100,19 +89,13 @@ int SaveImage(const char *filename, const std::vector &pixels, uint32_t width, uint32_t height) { - uint32_t bpp = Format::GetInfo(format).BytePerPixel; + uint32_t bpp = GetInfo(format).BytePerPixel; auto ext = getFileExt(filename); if (iequals(ext, "tga")) { return SaveTGA(filename, pixels, width, height, bpp); } else if (iequals(ext, "png")) { - LuImage image; - image.width = width; - image.height = height; - image.depth = 8; - image.channels = bpp; - image.data = (uint8_t*)pixels.data(); - return luPngWriteFile(filename, &image); + return SavePNG(filename, pixels, width, height, bpp); } else { std::cerr << "invalid file extension: " << ext << "!" 
<< std::endl; return -1; @@ -132,63 +115,8 @@ void dump_image(const std::vector& pixels, uint32_t width, uint32_t hei pixel32 |= pixel8 << (b * 8); } if (x) std::cout << ", "; - std::cout << std::hex << pixel32; + std::cout << std::hex << std::setw(bpp * 2) << std::setfill('0') << pixel32; } std::cout << std::endl; } -} - -int CopyBuffers(SurfaceDesc &dstDesc, - int32_t dstOffsetX, - int32_t dstOffsetY, - uint32_t copyWidth, - uint32_t copyHeight, - const SurfaceDesc &srcDesc, - int32_t srcOffsetX, - int32_t srcOffsetY) { - - static const BlitTable s_blitTable; - - if ((srcOffsetX >= (int32_t)srcDesc.Width) || (srcOffsetY >= (int32_t)srcDesc.Height) || - (dstOffsetX >= (int32_t)dstDesc.Width) || (dstOffsetY >= (int32_t)dstDesc.Height)) { - return -1; - } - - if (copyWidth > dstDesc.Width) { - copyWidth = dstDesc.Width; - } - - if (copyWidth > srcDesc.Width) { - copyWidth = srcDesc.Width; - } - - if (copyHeight > dstDesc.Height) { - copyHeight = dstDesc.Height; - } - - if (copyHeight > srcDesc.Height) { - copyHeight = srcDesc.Height; - } - - return s_blitTable.get(srcDesc.Format, dstDesc.Format)( - dstDesc, dstOffsetX, dstOffsetY, copyWidth, copyHeight, srcDesc, - srcOffsetX, srcOffsetY); -} - -int ConvertImage(std::vector& dst_pixels, - const std::vector& src_pixels, - uint32_t width, - uint32_t height, - ePixelFormat src_format, - ePixelFormat dst_format) { - - uint32_t src_pitch = Format::GetInfo(src_format).BytePerPixel * width; - uint32_t dst_pitch = Format::GetInfo(dst_format).BytePerPixel * width; - - dst_pixels.resize(dst_pitch * height); - - SurfaceDesc srcDesc{src_format, (uint8_t*)src_pixels.data(), width, height, src_pitch}; - SurfaceDesc dstDesc{dst_format, dst_pixels.data(), width, height, dst_pitch}; - - return CopyBuffers(dstDesc, 0, 0, width, height, srcDesc, 0, 0); } \ No newline at end of file diff --git a/tests/regression/tex/utils.h b/tests/regression/tex/utils.h index 48b1ad55..25c4e3ad 100644 --- a/tests/regression/tex/utils.h +++ 
b/tests/regression/tex/utils.h @@ -1,43 +1,22 @@ #include #include -#include -#include "surfacedesc.h" - -#define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1)))) - -inline uint32_t ilog2 (uint32_t value) { - return (uint32_t)(sizeof(uint32_t) * 8UL) - (uint32_t)__builtin_clzl((value << 1) - 1UL) - 1; -} +#include +#include +#include int LoadImage(const char *filename, - ePixelFormat format, + cocogfx::ePixelFormat format, std::vector &pixels, uint32_t *width, uint32_t *height); int SaveImage(const char *filename, - ePixelFormat format, + cocogfx::ePixelFormat format, const std::vector &pixels, uint32_t width, uint32_t height); -int CopyBuffers(SurfaceDesc &dstDesc, - int32_t dstOffsetX, - int32_t dstOffsetY, - uint32_t copyWidth, - uint32_t copyHeight, - const SurfaceDesc &srcDesc, - int32_t srcOffsetX, - int32_t srcOffsetY); - -int ConvertImage(std::vector& dst_pixels, - const std::vector& src_pixels, - uint32_t width, - uint32_t height, - ePixelFormat src_format, - ePixelFormat dst_format); - void dump_image(const std::vector& pixels, uint32_t width, uint32_t height, - uint32_t bpp); + uint32_t bpp); \ No newline at end of file diff --git a/tests/riscv/isa/Makefile b/tests/riscv/isa/Makefile index fba3bdd1..de35c0d0 100644 --- a/tests/riscv/isa/Makefile +++ b/tests/riscv/isa/Makefile @@ -10,7 +10,7 @@ TESTS := $(filter-out $(EXCLUDED_TESTS), $(ALL_TESTS)) all: run-simx: - $(foreach test, $(TESTS), ../../../sim/simX/simX -r -a rv32i -c 1 -i $(test) || exit;) + $(foreach test, $(TESTS), ../../../sim/simx/simx -r -a rv32i -c 1 -i $(test) || exit;) run-rtlsim: $(foreach test, $(TESTS), ../../../sim/rtlsim/rtlsim -r $(test) || exit;) diff --git a/tests/runtime/fibonacci/Makefile b/tests/runtime/fibonacci/Makefile index cd5195e0..1ea96718 100644 --- a/tests/runtime/fibonacci/Makefile +++ b/tests/runtime/fibonacci/Makefile @@ -30,7 +30,7 @@ run-rtlsim: $(PROJECT).bin ../../../sim/rtlsim/rtlsim $(PROJECT).bin run-simx: $(PROJECT).bin - ../../../sim/simX/simX -a rv32i 
-c 1 -i $(PROJECT).bin + ../../../sim/simx/simx -a rv32i -c 1 -i $(PROJECT).bin .depend: $(SRCS) $(CC) $(CFLAGS) -MM $^ > .depend; diff --git a/tests/runtime/fibonacci/main.cpp b/tests/runtime/fibonacci/main.cpp index f6612c29..c6fc036a 100644 --- a/tests/runtime/fibonacci/main.cpp +++ b/tests/runtime/fibonacci/main.cpp @@ -1,4 +1,5 @@ #include +#include const int Num = 9; const int Ans = 34; @@ -14,12 +15,12 @@ int main() { int fib = fibonacci(Num); - printf("fibonacci(%d) = %d\n", Num, fib); + vx_printf("fibonacci(%d) = %d\n", Num, fib); if (fib == Ans) { - printf("Passed!\n"); + vx_printf("Passed!\n"); } else { - printf("Failed! value=%d, expected=%d\n", fib, Ans); + vx_printf("Failed! value=%d, expected=%d\n", fib, Ans); errors = 1; } diff --git a/tests/runtime/hello/Makefile b/tests/runtime/hello/Makefile index 43e768b6..9c83df0c 100644 --- a/tests/runtime/hello/Makefile +++ b/tests/runtime/hello/Makefile @@ -30,7 +30,7 @@ run-rtlsim: $(PROJECT).bin ../../../sim/rtlsim/rtlsim $(PROJECT).bin run-simx: $(PROJECT).bin - ../../../sim/simX/simX -a rv32i -c 1 -i $(PROJECT).bin + ../../../sim/simx/simx -a rv32i -c 1 -i $(PROJECT).bin .depend: $(SRCS) $(CC) $(CFLAGS) -MM $^ > .depend; diff --git a/tests/runtime/hello/main.cpp b/tests/runtime/hello/main.cpp index 69904cfd..94aff07e 100644 --- a/tests/runtime/hello/main.cpp +++ b/tests/runtime/hello/main.cpp @@ -1,8 +1,9 @@ #include +#include int main() { - printf("Hello World!\n"); + vx_printf("Hello World!\n"); return 0; } \ No newline at end of file diff --git a/tests/runtime/simple/Makefile b/tests/runtime/simple/Makefile index dabb4cc0..79e2a2e5 100644 --- a/tests/runtime/simple/Makefile +++ b/tests/runtime/simple/Makefile @@ -30,7 +30,7 @@ run-rtlsim: $(PROJECT).bin ../../../sim/rtlsim/rtlsim $(PROJECT).bin run-simx: $(PROJECT).bin - ../../../sim/simX/simX -a rv32i -c 1 -i $(PROJECT).bin + ../../../sim/simx/simx -a rv32i -c 1 -i $(PROJECT).bin .depend: $(SRCS) $(CC) $(CFLAGS) -MM $^ > .depend; diff --git 
a/third_party/Makefile b/third_party/Makefile new file mode 100644 index 00000000..4cf70b2d --- /dev/null +++ b/third_party/Makefile @@ -0,0 +1,20 @@ +all: fpnew cocogfx softfloat ramulator + +fpnew: + +cocogfx: + $(MAKE) -C cocogfx + +softfloat: + SPECIALIZE_TYPE=RISCV SOFTFLOAT_OPTS="-fPIC -DSOFTFLOAT_ROUND_ODD -DINLINE_LEVEL=5 -DSOFTFLOAT_FAST_DIV32TO16 -DSOFTFLOAT_FAST_DIV64TO32" $(MAKE) -C softfloat/build/Linux-x86_64-GCC + +ramulator: + cd ramulator && git apply ../../miscs/patch/ramulator.patch 2> /dev/null; true + $(MAKE) -C ramulator libramulator.a + +clean: + $(MAKE) -C cocogfx clean + $(MAKE) -C softfloat/build/Linux-x86_64-GCC clean + $(MAKE) -C ramulator clean + +.PHONY: all fpnew cocogfx softfloat ramulator \ No newline at end of file diff --git a/third_party/cocogfx b/third_party/cocogfx new file mode 160000 index 00000000..04b10969 --- /dev/null +++ b/third_party/cocogfx @@ -0,0 +1 @@ +Subproject commit 04b109692cf6d0128f5ae89cbb4a7d77bfbc9f6a diff --git a/third_party/fpnew b/third_party/fpnew new file mode 160000 index 00000000..0bfbeede --- /dev/null +++ b/third_party/fpnew @@ -0,0 +1 @@ +Subproject commit 0bfbeede0e01b2e44e41bb14c70a80efeffa1bbd diff --git a/third_party/ramulator b/third_party/ramulator new file mode 160000 index 00000000..4edcb0d0 --- /dev/null +++ b/third_party/ramulator @@ -0,0 +1 @@ +Subproject commit 4edcb0d05aac9ec46b032a7bf59595c0418287f7 diff --git a/sim/common/softfloat b/third_party/softfloat similarity index 100% rename from sim/common/softfloat rename to third_party/softfloat