From 27a65fdee78febacf5e51deee36ed3260855c9f3 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 14 Nov 2021 09:05:15 -0500 Subject: [PATCH] driver refactoring --- driver/common/opae.cpp | 47 +++++------- driver/common/vx_scope.h | 2 +- driver/common/vx_utils.cpp | 22 ++++-- driver/common/vx_utils.h | 11 +++ driver/include/vortex.h | 19 +++-- driver/rtlsim/Makefile | 2 +- driver/rtlsim/vortex.cpp | 56 +++++++------- driver/simx/Makefile | 2 +- driver/simx/vortex.cpp | 109 ++++++++++------------------ driver/stub/vortex.cpp | 12 +-- sim/common/mem.cpp | 53 ++++++++------ sim/common/mem.h | 12 +-- sim/common/simobject.h | 4 + sim/common/util.h | 5 -- sim/rtlsim/main.cpp | 4 +- sim/rtlsim/simulator.cpp | 2 +- tests/regression/basic/main.cpp | 4 +- tests/regression/demo/main.cpp | 4 +- tests/regression/diverge/main.cpp | 2 +- tests/regression/dogfood/main.cpp | 4 +- tests/regression/fence/main.cpp | 4 +- tests/regression/io_addr/main.cpp | 2 +- tests/regression/mstress/main.cpp | 4 +- tests/regression/no_mf_ext/main.cpp | 2 +- tests/regression/no_smem/main.cpp | 2 +- tests/regression/printf/main.cpp | 4 +- tests/regression/tex/main.cpp | 4 +- 27 files changed, 200 insertions(+), 198 deletions(-) create mode 100644 driver/common/vx_utils.h diff --git a/driver/common/opae.cpp b/driver/common/opae.cpp index aa4bf933..e0f9ad09 100755 --- a/driver/common/opae.cpp +++ b/driver/common/opae.cpp @@ -17,6 +17,7 @@ #include #endif +#include "vx_utils.h" #include #include #include "vortex_afu.h" @@ -52,7 +53,7 @@ typedef struct vx_device_ { fpga_handle fpga; - size_t mem_allocation; + uint64_t mem_allocation; unsigned version; unsigned num_cores; unsigned num_warps; @@ -64,19 +65,9 @@ typedef struct vx_buffer_ { void* host_ptr; uint64_t io_addr; vx_device_h hdevice; - size_t size; + uint64_t size; } vx_buffer_t; -inline size_t aligned_size(size_t size, size_t alignment) { - assert(0 == (alignment & (alignment - 1))); - return (size + alignment - 1) & ~(alignment - 1); -} - -inline bool is_aligned(size_t addr, size_t alignment) { - assert(0 == (alignment & (alignment - 1))); - return 0 == (addr & (alignment - 1)); -} - /////////////////////////////////////////////////////////////////////////////// #ifdef DUMP_PERF_STATS @@ -107,7 +98,7 @@ AutoPerfDump gAutoPerfDump; /////////////////////////////////////////////////////////////////////////////// -extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { +extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; @@ -279,7 +270,7 @@ extern int vx_dev_close(vx_device_h hdevice) { return 0; } -extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { +extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr || 0 >= size) @@ -299,7 +290,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) return 0; } -extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) { +extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) { fpga_result res; void* host_ptr; uint64_t wsid; @@ -367,7 +358,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) { return 0; } -extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { +extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) return -1; @@ -386,7 +377,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { #endif // to milliseconds - long long sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000); + uint64_t sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000); for (;;) { uint64_t status; @@ -430,7 +421,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { return 0; } -extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) { +extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -438,8 +429,8 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer); vx_device_t *device = ((vx_device_t*)buffer->hdevice); - size_t dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); // check alignment if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE)) @@ -454,7 +445,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si return -1; // Ensure ready for new command - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE); @@ -465,13 +456,13 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE)); // Wait for the write operation to finish - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; return 0; } -extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) { +extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -479,8 +470,8 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer); vx_device_t *device = ((vx_device_t*)buffer->hdevice); - size_t dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); // check alignment if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE)) @@ -495,7 +486,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, return -1; // Ensure ready for new command - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE); @@ -506,7 +497,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ)); // Wait for the write operation to finish - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; return 0; @@ -519,7 +510,7 @@ extern int vx_start(vx_device_h hdevice) { vx_device_t *device = ((vx_device_t*)hdevice); // Ensure ready for new command - if (vx_ready_wait(hdevice, -1) != 0) + if (vx_ready_wait(hdevice, MAX_TIMEOUT) != 0) return -1; // start execution diff --git a/driver/common/vx_scope.h b/driver/common/vx_scope.h index dfc53520..0e2ae081 100644 --- a/driver/common/vx_scope.h +++ b/driver/common/vx_scope.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #ifdef USE_VLSIM #include diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index 79853aa1..5b70e09b 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -1,17 +1,29 @@ +#include "vx_utils.h" #include #include #include #include #include +#include -extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size) { +uint64_t aligned_size(uint64_t size, uint64_t alignment) { + assert(0 == (alignment & (alignment - 1))); + return (size + alignment - 1) & ~(alignment - 1); +} + +bool is_aligned(uint64_t addr, uint64_t alignment) { + assert(0 == (alignment & (alignment - 1))); + return 0 == (addr & (alignment - 1)); +} + +extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t size) { int err = 0; if (NULL == content || 0 == size) return -1; uint32_t buffer_transfer_size = 65536; - unsigned kernel_base_addr; + uint64_t kernel_base_addr; err = vx_dev_caps(device, VX_CAPS_KERNEL_BASE_ADDR, &kernel_base_addr); if (err != 0) return -1; @@ -29,9 +41,9 @@ extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_ // upload content // - size_t offset = 0; + uint64_t offset = 0; while (offset < size) { - auto chunk_size = std::min(buffer_transfer_size, size - offset); + auto chunk_size = std::min(buffer_transfer_size, size - offset); std::memcpy(buf_ptr, (uint8_t*)content + offset, chunk_size); /*printf("*** Upload Kernel to 0x%0x: data=", kernel_base_addr + offset); @@ -127,7 +139,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t mem_lat = 0; #endif - unsigned num_cores; + uint64_t num_cores; ret = vx_dev_caps(device, VX_CAPS_MAX_CORES, &num_cores); if (ret != 0) return ret; diff --git a/driver/common/vx_utils.h b/driver/common/vx_utils.h new file mode 100644 index 00000000..b86c75af --- /dev/null +++ b/driver/common/vx_utils.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +uint64_t aligned_size(uint64_t size, uint64_t alignment); + +bool is_aligned(uint64_t addr, uint64_t alignment); + +#define CACHE_BLOCK_SIZE 64 +#define ALLOC_BASE_ADDR 0x00000000 +#define LOCAL_MEM_SIZE 4294967296 // 4 GB \ No newline at end of file diff --git a/driver/include/vortex.h b/driver/include/vortex.h index 05648671..0fc9c5ce 100644 --- a/driver/include/vortex.h +++ b/driver/include/vortex.h @@ -2,6 +2,7 @@ #define __VX_DRIVER_H__ #include +#include #include #ifdef __cplusplus @@ -22,9 +23,7 @@ typedef void* vx_buffer_h; #define VX_CAPS_ALLOC_BASE_ADDR 0x6 #define VX_CAPS_KERNEL_BASE_ADDR 0x7 -#define CACHE_BLOCK_SIZE 64 -#define ALLOC_BASE_ADDR 0x00000000 -#define LOCAL_MEM_SIZE 0xffffffff +#define MAX_TIMEOUT (60*60*1000) // 1hr // open the device and connect to it int vx_dev_open(vx_device_h* hdevice); @@ -33,10 +32,10 @@ int vx_dev_open(vx_device_h* hdevice); int vx_dev_close(vx_device_h hdevice); // return device configurations -int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value); +int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value); // Allocate shared buffer with device -int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer); +int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer); // Get host pointer address void* vx_host_ptr(vx_buffer_h hbuffer); @@ -45,24 +44,24 @@ void* vx_host_ptr(vx_buffer_h hbuffer); int vx_buf_release(vx_buffer_h hbuffer); // allocate device memory and return address -int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr); +int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr); // Copy bytes from buffer to device local memory -int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset); +int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset); // Copy bytes from device local memory to buffer -int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dst_offset); +int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dst_offset); // Start device execution int vx_start(vx_device_h hdevice); // Wait for device ready with milliseconds timeout -int vx_ready_wait(vx_device_h hdevice, long long timeout); +int vx_ready_wait(vx_device_h hdevice, uint64_t timeout); ////////////////////////////// UTILITY FUNCIONS /////////////////////////////// // upload kernel bytes to device -int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size); +int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t size); // upload kernel file to device int vx_upload_kernel_file(vx_device_h device, const char* filename); diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index cf0a184d..4626eeb3 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -3,7 +3,7 @@ RTLSIM_DIR = ../../sim/rtlsim CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors #CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors -CXXFLAGS += -I../include -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common +CXXFLAGS += -I../include -I../common -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common LDFLAGS += $(RTLSIM_DIR)/librtlsim.a diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index cfed5a97..bed5c807 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -7,11 +7,14 @@ #include #include +#include #include #include #include #include +#define RAM_PAGE_SIZE 4096 + using namespace vortex; /////////////////////////////////////////////////////////////////////////////// @@ -19,7 +22,7 @@ using namespace vortex; class vx_device; class vx_buffer { public: - vx_buffer(size_t size, vx_device* device) + vx_buffer(uint64_t size, vx_device* device) : size_(size) , device_(device) { auto aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); @@ -36,7 +39,7 @@ public: return data_; } - size_t size() const { + uint64_t size() const { return size_; } @@ -45,7 +48,7 @@ public: } private: - size_t size_; + uint64_t size_; vx_device* device_; void* data_; }; @@ -54,9 +57,10 @@ private: class vx_device { public: - vx_device() : ram_((1<<12), (1<<20)) { - mem_allocation_ = ALLOC_BASE_ADDR; - } + vx_device() + : ram_(RAM_PAGE_SIZE) + , mem_allocation_(ALLOC_BASE_ADDR) + {} ~vx_device() { if (future_.valid()) { @@ -64,9 +68,9 @@ public: } } - int alloc_local_mem(size_t size, size_t* dev_maddr) { - auto dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) { + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = mem_allocation_; @@ -74,9 +78,9 @@ public: return 0; } - int upload(const void* src, size_t dest_addr, size_t size, size_t src_offset) { - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); - if (dest_addr + asize > ram_.size()) + int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (dest_addr + asize > LOCAL_MEM_SIZE) return -1; /*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src + src_offset)); @@ -92,9 +96,9 @@ public: return 0; } - int download(void* dest, size_t src_addr, size_t size, size_t dest_offset) { - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); - if (src_addr + asize > ram_.size()) + int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (src_addr + asize > LOCAL_MEM_SIZE) return -1; ram_.read((uint8_t*)dest + dest_offset, src_addr, asize); @@ -125,10 +129,10 @@ public: return 0; } - int wait(long long timeout) { + int wait(uint64_t timeout) { if (!future_.valid()) return 0; - auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000); + uint64_t timeout_sec = timeout / 1000; std::chrono::seconds wait_time(1); for (;;) { auto status = future_.wait_for(wait_time); // wait for 1 sec and check status @@ -141,9 +145,9 @@ public: private: - size_t mem_allocation_; RAM ram_; Simulator simulator_; + uint64_t mem_allocation_; std::future future_; }; @@ -177,7 +181,7 @@ AutoPerfDump gAutoPerfDump; /////////////////////////////////////////////////////////////////////////////// -extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { +extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; @@ -198,10 +202,10 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { *value = CACHE_BLOCK_SIZE; break; case VX_CAPS_LOCAL_MEM_SIZE: - *value = 0xffffffff; + *value = LOCAL_MEM_SIZE; break; case VX_CAPS_ALLOC_BASE_ADDR: - *value = 0x10000000; + *value = ALLOC_BASE_ADDR; break; case VX_CAPS_KERNEL_BASE_ADDR: *value = STARTUP_ADDR; @@ -244,7 +248,7 @@ extern int vx_dev_close(vx_device_h hdevice) { return 0; } -extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { +extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr || 0 >= size) @@ -255,7 +259,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) } -extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) { +extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) { if (nullptr == hdevice || 0 >= size || nullptr == hbuffer) @@ -294,7 +298,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) { return 0; } -extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) { +extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -307,7 +311,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset); } -extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) { +extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -329,7 +333,7 @@ extern int vx_start(vx_device_h hdevice) { return device->start(); } -extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { +extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) return -1; diff --git a/driver/simx/Makefile b/driver/simx/Makefile index 82bf6e32..dea65c35 100644 --- a/driver/simx/Makefile +++ b/driver/simx/Makefile @@ -4,7 +4,7 @@ CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors CXXFLAGS += -fPIC -Wno-maybe-uninitialized -CXXFLAGS += -I../include -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common +CXXFLAGS += -I../include -I../common -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common CXXFLAGS += $(CONFIGS) CXXFLAGS += -DDUMP_PERF_STATS diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 5c31cb87..1bd15e07 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -8,11 +8,12 @@ #include #include -#include +#include +#include #include #include -#define PAGE_SIZE 4096 +#define RAM_PAGE_SIZE 4096 using namespace vortex; @@ -22,10 +23,10 @@ class vx_device; class vx_buffer { public: - vx_buffer(size_t size, vx_device* device) + vx_buffer(uint64_t size, vx_device* device) : size_(size) , device_(device) { - auto aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); + uint64_t aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); data_ = malloc(aligned_asize); } @@ -39,7 +40,7 @@ public: return data_; } - size_t size() const { + uint64_t size() const { return size_; } @@ -48,7 +49,7 @@ public: } private: - size_t size_; + uint64_t size_; vx_device* device_; void* data_; }; @@ -59,32 +60,23 @@ class vx_device { public: vx_device() : arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS) - , decoder_(arch_) - , mmu_(PAGE_SIZE, arch_.wsize(), true) - , cores_(arch_.num_cores()) , is_done_(false) , is_running_(false) + , mem_allocation_(ALLOC_BASE_ADDR) , thread_(__thread_proc__, this) - , ram_((1<<12), (1<<20)) { - - mem_allocation_ = ALLOC_BASE_ADDR; - mmu_.attach(ram_, 0, 0xffffffff); - for (int i = 0; i < arch_.num_cores(); ++i) { - cores_.at(i) = std::make_shared(arch_, decoder_, mmu_, i); - } - } + , ram_(RAM_PAGE_SIZE) + {} ~vx_device() { mutex_.lock(); is_done_ = true; - mutex_.unlock(); - + mutex_.unlock(); thread_.join(); } - int alloc_local_mem(size_t size, size_t* dev_maddr) { - auto dev_mem_size = LOCAL_MEM_SIZE; - auto asize = aligned_size(size, CACHE_BLOCK_SIZE); + int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) { + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = mem_allocation_; @@ -92,9 +84,9 @@ public: return 0; } - int upload(const void* src, size_t dest_addr, size_t size, size_t src_offset) { - auto asize = aligned_size(size, CACHE_BLOCK_SIZE); - if (dest_addr + asize > ram_.size()) + int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (dest_addr + asize > LOCAL_MEM_SIZE) return -1; ram_.write((const uint8_t*)src + src_offset, dest_addr, asize); @@ -107,9 +99,9 @@ public: return 0; } - int download(void* dest, size_t src_addr, size_t size, size_t dest_offset) { - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); - if (src_addr + asize > ram_.size()) + int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (src_addr + asize > LOCAL_MEM_SIZE) return -1; ram_.read((uint8_t*)dest + dest_offset, src_addr, asize); @@ -123,19 +115,17 @@ public: } int start() { - mutex_.lock(); - for (int i = 0; i < arch_.num_cores(); ++i) { - cores_.at(i)->clear(); - } + SimPlatform::instance().flush(); + processor_ = std::make_shared(arch_); + processor_->attach_ram(&ram_); is_running_ = true; mutex_.unlock(); - return 0; } - int wait(long long timeout) { - auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000); + int wait(uint64_t timeout) { + uint64_t timeout_sec = timeout / 1000; for (;;) { mutex_.lock(); bool is_running = is_running_; @@ -147,32 +137,10 @@ public: std::this_thread::sleep_for(std::chrono::seconds(1)); } return 0; - } - - int get_csr(int core_id, int addr, unsigned *value) { - *value = cores_.at(core_id)->get_csr(addr, 0, 0); - return 0; - } - - int set_csr(int core_id, int addr, unsigned value) { - cores_.at(core_id)->set_csr(addr, value, 0, 0); - return 0; - } + } private: - void run() { - bool running; - do { - running = false; - for (auto& core : cores_) { - core->step(); - if (core->running()) - running = true; - } - } while (running); - } - void thread_proc() { std::cout << "Device ready..." << std::flush << std::endl; @@ -188,7 +156,7 @@ private: if (is_running) { std::cout << "Device running..." << std::flush << std::endl; - this->run(); + processor_->run(); mutex_.lock(); is_running_ = false; @@ -206,12 +174,10 @@ private: } ArchDef arch_; - Decoder decoder_; - MemoryUnit mmu_; - std::vector> cores_; + Processor::Ptr processor_; bool is_done_; bool is_running_; - size_t mem_allocation_; + uint64_t mem_allocation_; std::thread thread_; RAM ram_; std::mutex mutex_; @@ -251,6 +217,9 @@ extern int vx_dev_open(vx_device_h* hdevice) { if (nullptr == hdevice) return -1; + if (!SimPlatform::instance().initialize()) + return -1; + *hdevice = new vx_device(); #ifdef DUMP_PERF_STATS @@ -273,10 +242,12 @@ extern int vx_dev_close(vx_device_h hdevice) { delete device; + SimPlatform::instance().finalize(); + return 0; } -extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { +extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; @@ -314,7 +285,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { return 0; } -extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { +extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr || 0 >= size) @@ -324,7 +295,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) return device->alloc_local_mem(size, dev_maddr); } -extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) { +extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) { if (nullptr == hdevice || 0 >= size || nullptr == hbuffer) @@ -363,7 +334,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) { return 0; } -extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) { +extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -376,7 +347,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset); } -extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) { +extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -398,7 +369,7 @@ extern int vx_start(vx_device_h hdevice) { return device->start(); } -extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { +extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) return -1; diff --git a/driver/stub/vortex.cpp b/driver/stub/vortex.cpp index f5079500..95777257 100644 --- a/driver/stub/vortex.cpp +++ b/driver/stub/vortex.cpp @@ -8,15 +8,15 @@ extern int vx_dev_close(vx_device_h /*hdevice*/) { return -1; } -extern int vx_dev_caps(vx_device_h /*hdevice*/, unsigned /*caps_id*/, unsigned* /*value*/) { +extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t* /*value*/) { return -1; } -extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, size_t /*size*/, size_t* /*dev_maddr*/) { +extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, uint64_t /*size*/, uint64_t* /*dev_maddr*/) { return -1; } -extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, size_t /*size*/, vx_buffer_h* /*hbuffer*/) { +extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, uint64_t /*size*/, vx_buffer_h* /*hbuffer*/) { return -1; } @@ -28,11 +28,11 @@ extern int vx_buf_release(vx_buffer_h /*hbuffer*/) { return -1; } -extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*src_offset*/) { +extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, uint64_t /*dev_maddr*/, uint64_t /*size*/, uint64_t /*src_offset*/) { return -1; } -extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*dest_offset*/) { +extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, uint64_t /*dev_maddr*/, uint64_t /*size*/, uint64_t /*dest_offset*/) { return -1; } @@ -40,6 +40,6 @@ extern int vx_start(vx_device_h /*hdevice*/) { return -1; } -extern int vx_ready_wait(vx_device_h /*hdevice*/, long long /*timeout*/) { +extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) { return -1; } \ No newline at end of file diff --git a/sim/common/mem.cpp b/sim/common/mem.cpp index 6c4b94de..ff67489d 100644 --- a/sim/common/mem.cpp +++ b/sim/common/mem.cpp @@ -168,11 +168,12 @@ void MemoryUnit::tlbRm(uint64_t va) { /////////////////////////////////////////////////////////////////////////////// -RAM::RAM(uint32_t num_pages, uint32_t page_size) - : page_bits_(log2ceil(page_size)) { - assert(ispow2(page_size)); - mem_.resize(num_pages, NULL); - size_ = uint64_t(mem_.size()) << page_bits_; +RAM::RAM(uint32_t page_size) + : size_(0) + , page_bits_(log2ceil(page_size)) + , last_page_(nullptr) + , last_page_index_(0) { + assert(ispow2(page_size)); } RAM::~RAM() { @@ -180,31 +181,41 @@ RAM::~RAM() { } void RAM::clear() { - for (auto& page : mem_) { - delete[] page; - page = NULL; + for (auto& page : pages_) { + delete[] page.second; } } uint64_t RAM::size() const { - return size_; + return uint64_t(pages_.size()) << page_bits_; } -uint8_t *RAM::get(uint32_t address) const { - uint32_t page_size = 1 << page_bits_; - uint32_t page_index = address >> page_bits_; - uint32_t byte_offset = address & ((1 << page_bits_) - 1); +uint8_t *RAM::get(uint64_t address) const { + uint32_t page_size = 1 << page_bits_; + uint32_t page_offset = address & (page_size - 1); + uint64_t page_index = address >> page_bits_; - auto &page = mem_.at(page_index); - if (page == NULL) { - uint8_t *ptr = new uint8_t[page_size]; - // set uninitialized data to "baadf00d" - for (uint32_t i = 0; i < page_size; ++i) { - ptr[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff; + uint8_t* page; + if (last_page_ && last_page_index_ == page_index) { + page = last_page_; + } else { + auto it = pages_.find(page_index); + if (it != pages_.end()) { + page = it->second; + } else { + uint8_t *ptr = new uint8_t[page_size]; + // set uninitialized data to "baadf00d" + for (uint32_t i = 0; i < page_size; ++i) { + ptr[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff; + } + pages_.emplace(page_index, ptr); + page = ptr; } - page = ptr; + last_page_ = page; + last_page_index_ = page_index; } - return page + byte_offset; + + return page + page_offset; } void RAM::read(void *data, uint64_t addr, uint64_t size) { diff --git a/sim/common/mem.h b/sim/common/mem.h index 8929b4d9..d404602d 100644 --- a/sim/common/mem.h +++ b/sim/common/mem.h @@ -130,13 +130,13 @@ private: class RAM : public MemDevice { public: - RAM(uint32_t num_pages, uint32_t page_size); - + RAM(uint32_t page_size); ~RAM(); void clear(); uint64_t size() const override; + void read(void *data, uint64_t addr, uint64_t size) override; void write(const void *data, uint64_t addr, uint64_t size) override; @@ -153,11 +153,13 @@ public: private: - uint8_t *get(uint32_t address) const; + uint8_t *get(uint64_t address) const; - mutable std::vector mem_; - uint32_t page_bits_; uint64_t size_; + uint32_t page_bits_; + mutable std::unordered_map pages_; + mutable uint8_t* last_page_; + mutable uint64_t last_page_index_; }; } // namespace vortex \ No newline at end of file diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 487d385c..52c74643 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -282,6 +282,10 @@ public: return true; } + void flush() { + instance().clear(); + } + void finalize() { instance().clear(); } diff --git a/sim/common/util.h b/sim/common/util.h index 668f3e26..b6137199 100644 --- a/sim/common/util.h +++ b/sim/common/util.h @@ -75,11 +75,6 @@ inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) { return (bits << shift) >> (shift + start); } -inline uint64_t aligned_size(uint64_t size, uint32_t alignment) { - assert(0 == (alignment & (alignment - 1))); - return (size + alignment - 1) & ~(alignment - 1); -} - // Apply integer sign extension inline uint32_t sext32(uint32_t word, uint32_t width) { assert(width > 1); diff --git a/sim/rtlsim/main.cpp b/sim/rtlsim/main.cpp index 0f0575f5..652e550f 100644 --- a/sim/rtlsim/main.cpp +++ b/sim/rtlsim/main.cpp @@ -7,6 +7,8 @@ #include #include "simulator.h" +#define RAM_PAGE_SIZE 4096 + using namespace vortex; static void show_usage() { @@ -49,7 +51,7 @@ int main(int argc, char **argv) { for (auto program : programs) { std::cout << "Running " << program << "..." << std::endl; - vortex::RAM ram((1<<12), (1<<20)); + vortex::RAM ram(RAM_PAGE_SIZE); vortex::Simulator simulator; simulator.attach_ram(&ram); diff --git a/sim/rtlsim/simulator.cpp b/sim/rtlsim/simulator.cpp index 8d3f9acf..0f6df7d7 100644 --- a/sim/rtlsim/simulator.cpp +++ b/sim/rtlsim/simulator.cpp @@ -477,7 +477,7 @@ void Simulator::eval_mem_bus(bool clk) { uint8_t* data = (uint8_t*)(vl_obj_->device->mem_req_data); if (base_addr >= IO_COUT_ADDR && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + for (int i = 0; i < IO_COUT_SIZE; i++) { if ((byteen >> i) & 0x1) { auto& ss_buf = print_bufs_[i]; char c = data[i]; diff --git a/tests/regression/basic/main.cpp b/tests/regression/basic/main.cpp index c92bae8d..fcea1fda 100755 --- a/tests/regression/basic/main.cpp +++ b/tests/regression/basic/main.cpp @@ -169,7 +169,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg, std::cout << "start execution" << std::endl; auto t2 = std::chrono::high_resolution_clock::now(); RT_CHECK(vx_start(device)); - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); auto t3 = std::chrono::high_resolution_clock::now(); // read destination buffer from local memory @@ -228,7 +228,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores; + uint64_t max_cores; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); uint32_t num_points = count; uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64; diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp index 2961b517..29cc7d85 100644 --- a/tests/regression/demo/main.cpp +++ b/tests/regression/demo/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/diverge/main.cpp b/tests/regression/diverge/main.cpp index 7b27760c..778d118f 100644 --- a/tests/regression/diverge/main.cpp +++ b/tests/regression/diverge/main.cpp @@ -121,7 +121,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/dogfood/main.cpp b/tests/regression/dogfood/main.cpp index 804609ae..71ae6624 100644 --- a/tests/regression/dogfood/main.cpp +++ b/tests/regression/dogfood/main.cpp @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); @@ -245,7 +245,7 @@ int main(int argc, char *argv[]) { // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/fence/main.cpp b/tests/regression/fence/main.cpp index 2961b517..29cc7d85 100644 --- a/tests/regression/fence/main.cpp +++ b/tests/regression/fence/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/io_addr/main.cpp b/tests/regression/io_addr/main.cpp index 7899aa2a..42dcd7c0 100644 --- a/tests/regression/io_addr/main.cpp +++ b/tests/regression/io_addr/main.cpp @@ -101,7 +101,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/mstress/main.cpp b/tests/regression/mstress/main.cpp index bbb4660f..c2354edc 100644 --- a/tests/regression/mstress/main.cpp +++ b/tests/regression/mstress/main.cpp @@ -136,7 +136,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; @@ -194,7 +194,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/no_mf_ext/main.cpp b/tests/regression/no_mf_ext/main.cpp index 01bcfb90..01ae744c 100644 --- a/tests/regression/no_mf_ext/main.cpp +++ b/tests/regression/no_mf_ext/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/no_smem/main.cpp b/tests/regression/no_smem/main.cpp index 01bcfb90..01ae744c 100644 --- a/tests/regression/no_smem/main.cpp +++ b/tests/regression/no_smem/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/printf/main.cpp b/tests/regression/printf/main.cpp index 11b9fc50..b9d4db38 100644 --- a/tests/regression/printf/main.cpp +++ b/tests/regression/printf/main.cpp @@ -65,7 +65,7 @@ int run_test() { // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); return 0; } @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/tex/main.cpp b/tests/regression/tex/main.cpp index 39ffea0c..a83651ee 100644 --- a/tests/regression/tex/main.cpp +++ b/tests/regression/tex/main.cpp @@ -110,7 +110,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); auto time_end = std::chrono::high_resolution_clock::now(); double elapsed = std::chrono::duration_cast(time_end - time_start).count(); @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));