Vortex 2.0 changes:

+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions
--- a/runtime/.gitignore
+++ b/runtime/.gitignore
--- a/runtime/Makefile
+++ b/runtime/Makefile
@@ -1,49 +1,25 @@
-XLEN ?= 32
+all: stub rtlsim simx opae

-ifeq ($(XLEN),32)
-RISCV_TOOLCHAIN_PATH = /opt/riscv-gnu-toolchain
-else
-RISCV_TOOLCHAIN_PATH = /opt/riscv64-gnu-toolchain
-endif
+stub:
+	$(MAKE) -C stub

-RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
+simx:
+	$(MAKE) -C simx

-CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc
-AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc-ar
-DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objdump
-CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objcopy
+rtlsim:
+	$(MAKE) -C rtlsim

-ifeq ($(XLEN),32)
-CFLAGS += -march=rv32imf -mabi=ilp32f
-else
-CFLAGS += -march=rv64imfd -mabi=lp64d
-endif
+opae:
+	$(MAKE) -C opae

-CFLAGS += -O3 -mcmodel=medany -Wstack-usage=1024 -fno-exceptions -fdata-sections -ffunction-sections
-CFLAGS += -I./include -I../hw
-
-PROJECT = libvortexrt
-
-SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c
-
-OBJS := $(addsuffix .o, $(notdir $(SRCS)))
-
-all: $(PROJECT).a $(PROJECT).dump
-
-$(PROJECT).dump: $(PROJECT).a
-	$(DP) -D $(PROJECT).a > $(PROJECT).dump
-
-%.S.o: src/%.S
-	$(CC) $(CFLAGS) -c $< -o $@
-
-%.c.o: src/%.c
-	$(CC) $(CFLAGS) -c $< -o $@
-
-$(PROJECT).a: $(OBJS)
-	$(AR) rcs $@ $^
-
-.depend: $(SRCS)
-	$(CC) $(CFLAGS) -MM $^ > .depend;
+xrt:
+	$(MAKE) -C xrt

 clean:
-	rm -rf *.a *.o *.dump .depend 
+	$(MAKE) clean -C stub
+	$(MAKE) clean -C simx
+	$(MAKE) clean -C rtlsim
+	$(MAKE) clean -C opae
+	$(MAKE) clean -C xrt
+
+.PHONY: all stub simx rtlsim opae xrt clean
--- a/runtime/common/malloc.h
+++ b/runtime/common/malloc.h
@@ -0,0 +1,455 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <assert.h>
+#include <stdio.h>
+
+namespace vortex {
+
+// Allocator for a linear address range [baseAddress, baseAddress + capacity).
+//
+// Address space is reserved in pages taken bump-style from the low end of the
+// range (tracked by nextAddress_); each page is divided into blocks that are
+// linked on three intrusive lists:
+//   - used list   : blocks currently handed out
+//   - free S-list : free blocks sorted by decreasing size (allocation lookup)
+//   - free M-list : free blocks sorted by increasing address (merging)
+// Only the bookkeeping objects are heap-allocated; the managed addresses are
+// treated as plain integers and are never dereferenced.
+// pageAlign and blockAlign must be powers of two (asserted in AlignSize).
+class MemoryAllocator {
+public:
+    MemoryAllocator(
+        uint64_t baseAddress,
+        uint64_t capacity,
+        uint32_t pageAlign, 
+        uint32_t blockAlign) 
+        : baseAddress_(baseAddress)
+        , capacity_(capacity)
+        , pageAlign_(pageAlign)
+        , blockAlign_(blockAlign)
+        , pages_(nullptr)
+        , nextAddress_(0)
+        , allocated_(0)
+    {}
+
+    ~MemoryAllocator() {
+        // Release all bookkeeping unconditionally. Going through DeletePage()
+        // here would assert on (and leak) pages that still hold used blocks
+        // and would skip every page that is not the top-most one.
+        page_t* currPage = pages_;
+        while (currPage) {
+            auto nextPage = currPage->next;
+            delete currPage; // ~page_t() frees the page's blocks
+            currPage = nextPage;
+        }
+        pages_ = nullptr;
+    }
+    
+    // NOTE(review): the 32-bit return type truncates 64-bit base addresses;
+    // kept as-is for interface compatibility with existing callers.
+    uint32_t baseAddress() const {
+        return baseAddress_;
+    }
+
+    // NOTE(review): the 32-bit return type truncates capacities >= 4 GiB;
+    // kept as-is for interface compatibility with existing callers.
+    uint32_t capacity() const {
+        return capacity_;
+    }
+
+    // Number of bytes not currently handed out (capacity - allocated).
+    uint64_t free() const {
+        return (capacity_ - allocated_);
+    }
+
+    // Number of bytes currently handed out (sum of block-aligned sizes).
+    uint64_t allocated() const {
+        return allocated_;
+    }
+
+    // Reserve `size` bytes (rounded up to blockAlign) and store the absolute
+    // address (baseAddress + offset) in *addr.
+    // Returns 0 on success, -1 on invalid arguments or when out of memory.
+    int allocate(uint64_t size, uint64_t* addr) {
+        if (size == 0 || addr == nullptr) {
+            printf("error: invalid argurments\n");
+            return -1;
+        }
+
+        // Align allocation size. A result smaller than the request means the
+        // rounding wrapped around, i.e. the request can never be satisfied.
+        uint64_t alignedSize = AlignSize(size, blockAlign_);
+        if (alignedSize < size) {
+            printf("error: out of memory\n");
+            return -1;
+        }
+        size = alignedSize;
+
+        // Walk thru all pages to find a free block
+        block_t* freeBlock = nullptr;
+        auto currPage = pages_;
+        while (currPage) {
+            auto currBlock = currPage->freeSList;
+            if (currBlock) {
+                // The free S-list is already sorted with the largest block first
+                // Quick check if the head block has enough space.
+                if (currBlock->size >= size) {
+                    // Find the smallest matching block in the S-list
+                    while (currBlock->nextFreeS 
+                        && (currBlock->nextFreeS->size >= size)) {
+                        currBlock = currBlock->nextFreeS;
+                    }
+                    // Return the free block
+                    freeBlock = currBlock;
+                    break;
+                }
+            }
+            currPage = currPage->next;
+        }
+
+        if (nullptr == freeBlock) {
+            // Allocate a new page for this request
+            currPage = this->NewPage(size);
+            if (nullptr == currPage) {
+                printf("error: out of memory\n");
+                return -1;
+            }
+            freeBlock = currPage->freeSList;
+        }   
+
+        // Remove the block from the free lists
+        assert(freeBlock->size >= size);
+        currPage->RemoveFreeMList(freeBlock);
+        currPage->RemoveFreeSList(freeBlock);
+
+        // If the free block we have found is larger than what we are looking for,
+        // we may be able to split our free block in two.
+        uint64_t extraBytes = freeBlock->size - size;
+        if (extraBytes >= blockAlign_) {
+            // Reduce the free block size to the requested value
+            freeBlock->size = size;
+
+            // Allocate a new block to contain the extra buffer
+            auto nextAddr = freeBlock->addr + size;
+            auto newBlock = new block_t(nextAddr, extraBytes);
+
+            // Add the new block to the free lists
+            currPage->InsertFreeMList(newBlock);
+            currPage->InsertFreeSList(newBlock);
+        }
+
+        // Insert the free block into the used list
+        currPage->InsertUsedList(freeBlock);
+
+        // Return the free block address
+        *addr = baseAddress_ + freeBlock->addr;
+
+        // Update allocated size
+        allocated_ += size;
+
+        return 0;
+    }
+
+    // Return the block that starts at absolute address `addr` to its page,
+    // merging it with adjacent free blocks and dropping trailing empty pages.
+    // Returns 0 on success, -1 if `addr` is not the start of a used block.
+    int release(uint64_t addr) {
+        // Walk all pages to find the pointer
+        uint64_t local_addr = addr - baseAddress_;
+        block_t* usedBlock = nullptr;
+        auto currPage = pages_;
+        while (currPage) {
+            if (local_addr >= currPage->addr
+            &&  local_addr < (currPage->addr + currPage->size)) {
+                auto currBlock = currPage->usedList;
+                while (currBlock) {
+                    if (currBlock->addr == local_addr) {
+                        usedBlock = currBlock;
+                        break;
+                    }
+                    currBlock = currBlock->nextUsed;
+                }
+                break;
+            }
+            currPage = currPage->next;
+        }
+
+        // found the corresponding block?
+        if (nullptr == usedBlock) {
+            // Cast so the format matches on targets where uint64_t is not 'unsigned long'
+            printf("error: invalid address to release: 0x%llx\n", static_cast<unsigned long long>(addr));
+            return -1;
+        }
+
+        // Remember the size now: merging below changes the block size
+        auto size = usedBlock->size;
+
+        // Remove the block from the used list
+        currPage->RemoveUsedList(usedBlock);
+
+        // Insert the block into the free M-list.
+        currPage->InsertFreeMList(usedBlock);
+
+        // Check if we can merge adjacent free blocks from the left.        
+        if (usedBlock->prevFreeM) {
+            // Calculate the previous address
+            auto prevAddr = usedBlock->prevFreeM->addr + usedBlock->prevFreeM->size;
+            if (usedBlock->addr == prevAddr) {
+                auto prevBlock = usedBlock->prevFreeM;
+
+                // Merge the blocks to the left
+                prevBlock->size += usedBlock->size;
+                prevBlock->nextFreeM = usedBlock->nextFreeM;
+                if (prevBlock->nextFreeM) {
+                    prevBlock->nextFreeM->prevFreeM = prevBlock;
+                }
+
+                // Detach previous block from the free S-list since size increased
+                currPage->RemoveFreeSList(prevBlock);
+
+                // reset usedBlock
+                delete usedBlock;
+                usedBlock = prevBlock;
+            }
+        }
+
+        // Check if we can merge adjacent free blocks from the right.
+        if (usedBlock->nextFreeM) {
+            // Calculate the next allocation start address
+            auto nextAddr = usedBlock->addr + usedBlock->size;
+            if (usedBlock->nextFreeM->addr == nextAddr) {
+                auto nextBlock = usedBlock->nextFreeM;
+
+                // Merge the blocks to the right
+                usedBlock->size += nextBlock->size;
+                usedBlock->nextFreeM = nextBlock->nextFreeM;
+                if (usedBlock->nextFreeM) {
+                    usedBlock->nextFreeM->prevFreeM = usedBlock;
+                }
+
+                // Delete next block
+                currPage->RemoveFreeSList(nextBlock);
+                delete nextBlock;
+            }
+        }
+
+        // Insert the block into the free S-list.
+        currPage->InsertFreeSList(usedBlock);
+
+        // Check if we can free empty pages
+        if (nullptr == currPage->usedList) {
+            // Try to delete the page; deleting the top-most page may expose
+            // another empty page that has become the new top.
+            while (currPage && this->DeletePage(currPage)) {
+                currPage = this->FindNextEmptyPage();
+            }
+
+        }
+
+        // update allocated size
+        allocated_ -= size;
+
+        return 0;
+    }
+
+private:
+
+    // Bookkeeping record for one contiguous range inside a page.
+    // addr is an offset relative to baseAddress_.
+    struct block_t {
+        block_t* nextFreeS;
+        block_t* prevFreeS;
+        
+        block_t* nextFreeM;
+        block_t* prevFreeM;
+        
+        block_t* nextUsed;
+        block_t* prevUsed;
+
+        uint64_t addr;
+        uint64_t size;
+
+        block_t(uint64_t addr, uint64_t size) 
+            : nextFreeS(nullptr)
+            , prevFreeS(nullptr)
+            , nextFreeM(nullptr)
+            , prevFreeM(nullptr)
+            , nextUsed(nullptr)
+            , prevUsed(nullptr)
+            , addr(addr)
+            , size(size)
+        {}
+    };
+
+    // A page-aligned slice of the managed range together with its block lists.
+    struct page_t {
+        page_t*  next;        
+        
+        // List of used blocks
+        block_t* usedList;
+        
+        // List with blocks sorted by decreasing sizes
+        // Used for block lookup during memory allocation.
+        block_t* freeSList;
+        
+        // List with blocks sorted by increasing memory addresses
+        // Used for block merging during memory release.
+        block_t* freeMList;
+        
+        uint64_t addr;
+        uint64_t size;
+
+        // A new page starts out as a single free block spanning the whole page.
+        page_t(uint64_t addr, uint64_t size) : 
+            next(nullptr),            
+            usedList(nullptr),
+            addr(addr),
+            size(size) {
+            freeSList = freeMList = new block_t(addr, size);
+        }
+
+        // The page owns its blocks; copying would lead to double deletion.
+        page_t(const page_t&) = delete;
+        page_t& operator=(const page_t&) = delete;
+
+        ~page_t() {
+            // Every block is linked on exactly one of the used list or the
+            // free M-list (the S-list only re-links the free blocks), so
+            // walking these two lists deletes each block exactly once.
+            for (block_t* block = usedList; block != nullptr;) {
+                block_t* nextBlock = block->nextUsed;
+                delete block;
+                block = nextBlock;
+            }
+            for (block_t* block = freeMList; block != nullptr;) {
+                block_t* nextBlock = block->nextFreeM;
+                delete block;
+                block = nextBlock;
+            }
+        }
+
+        // Push the block at the head of the used list.
+        void InsertUsedList(block_t* block) {
+            block->nextUsed = usedList;
+            if (usedList) {
+                usedList->prevUsed = block;
+            }
+            usedList = block;
+        }
+
+        // Unlink the block from the used list.
+        void RemoveUsedList(block_t* block) {
+            if (block->prevUsed) {
+                block->prevUsed->nextUsed = block->nextUsed;
+            } else {
+                usedList = block->nextUsed;
+            }
+            if (block->nextUsed) {
+                block->nextUsed->prevUsed = block->prevUsed;
+            }
+            block->nextUsed = nullptr;
+            block->prevUsed = nullptr;
+        }
+
+        // Insert the block into the free M-list, keeping increasing address order.
+        void InsertFreeMList(block_t* block) {
+            block_t* currBlock = freeMList;
+            block_t* prevBlock = nullptr;
+            while (currBlock && (currBlock->addr < block->addr)) {
+                prevBlock = currBlock;
+                currBlock = currBlock->nextFreeM;
+            }
+            block->nextFreeM = currBlock;
+            block->prevFreeM = prevBlock;
+            if (prevBlock) {
+                prevBlock->nextFreeM = block;
+            } else {
+                freeMList = block;
+            }
+            if (currBlock) {
+                currBlock->prevFreeM = block;
+            }    
+        }
+
+        // Unlink the block from the free M-list.
+        void RemoveFreeMList(block_t* block) {
+            if (block->prevFreeM) {
+                block->prevFreeM->nextFreeM = block->nextFreeM;
+            } else {
+                freeMList = block->nextFreeM;
+            }
+            if (block->nextFreeM) {
+                block->nextFreeM->prevFreeM = block->prevFreeM;
+            }
+            block->nextFreeM = nullptr;
+            block->prevFreeM = nullptr;
+        }
+
+        // Insert the block into the free S-list, keeping decreasing size order.
+        void InsertFreeSList(block_t* block) {
+            block_t* currBlock = this->freeSList;
+            block_t* prevBlock = nullptr;
+            while (currBlock && (currBlock->size > block->size)) {
+                prevBlock = currBlock;
+                currBlock = currBlock->nextFreeS;
+            }
+            block->nextFreeS = currBlock;
+            block->prevFreeS = prevBlock;
+            if (prevBlock) {
+                prevBlock->nextFreeS = block;
+            } else {
+                this->freeSList = block;
+            }
+            if (currBlock) {
+                currBlock->prevFreeS = block;
+            }
+        }
+
+        // Unlink the block from the free S-list.
+        void RemoveFreeSList(block_t* block) {
+            if (block->prevFreeS) {
+                block->prevFreeS->nextFreeS = block->nextFreeS;
+            } else {
+                freeSList = block->nextFreeS;
+            }
+            if (block->nextFreeS) {
+                block->nextFreeS->prevFreeS = block->prevFreeS;
+            }
+            block->nextFreeS = nullptr;
+            block->prevFreeS = nullptr;    
+        }
+    };
+
+    // Reserve a new page large enough for `size` bytes at the current top of
+    // the range. Returns nullptr (leaving the allocator untouched) when the
+    // page does not fit in the remaining capacity.
+    page_t* NewPage(uint64_t size) {
+        // Round the page size up to the page alignment
+        uint64_t pageSize = AlignSize(size, pageAlign_);
+        if (pageSize < size)
+            return nullptr; // rounding wrapped around
+
+        // Capacity check, done before committing so that a failed request
+        // does not advance nextAddress_ and written to avoid overflow.
+        if (nextAddress_ > capacity_ 
+         || pageSize > (capacity_ - nextAddress_))
+            return nullptr;
+
+        // Allocate object
+        auto newPage = new page_t(nextAddress_, pageSize);
+
+        // Commit the address range
+        nextAddress_ += pageSize;
+
+        // Insert the new page into the list
+        newPage->next = pages_;
+        pages_ = newPage;
+
+        return newPage;
+    }
+
+    // Delete an empty page if it is the top-most one (its end coincides with
+    // nextAddress_). Returns true when the page was deleted.
+    bool DeletePage(page_t* page) {
+        // The page should be empty
+        assert(nullptr == page->usedList);
+        assert(page->freeMList && (nullptr == page->freeMList->nextFreeM));
+
+        // Only delete top-level pages
+        auto nextAddr = page->addr + page->size;
+        if (nextAddr != nextAddress_)
+            return false;
+
+        // Remove the page from the list
+        page_t* prevPage = nullptr;
+        auto currPage = pages_;
+        while (currPage) {
+            if (currPage == page) {
+                if (prevPage) {
+                    prevPage->next = currPage->next;
+                } else {
+                    pages_ = currPage->next;
+                }
+                break;
+            }
+            prevPage = currPage;
+            currPage = currPage->next;
+        }
+
+        // Update next allocation address
+        nextAddress_ = page->addr;
+        
+        // free object (~page_t() releases the remaining free block)
+        delete page;
+
+        return true;
+    }
+
+    // Return the first page in the list that has no used blocks, or nullptr.
+    page_t* FindNextEmptyPage() {
+        auto currPage = pages_;
+        while (currPage) {
+            if (nullptr == currPage->usedList)
+                return currPage;
+            currPage = currPage->next;
+        } 
+        return nullptr;
+    }
+
+    // Round `size` up to a multiple of `alignment` (must be a power of two).
+    // The result wraps around when size + alignment - 1 exceeds UINT64_MAX;
+    // callers detect this by checking for a result smaller than `size`.
+    static uint64_t AlignSize(uint64_t size, uint64_t alignment) {
+        assert(0 == (alignment & (alignment - 1)));
+        return (size + alignment - 1) & ~(alignment - 1);
+    }
+
+    uint64_t baseAddress_;
+    uint64_t capacity_;
+    uint32_t pageAlign_;    
+    uint32_t blockAlign_;    
+    page_t*  pages_;
+    uint64_t nextAddress_;   // offset of the first byte not yet assigned to a page
+    uint64_t allocated_;
+};
+
+} // namespace vortex
--- a/runtime/common/nlohmann_json.hpp
+++ b/runtime/common/nlohmann_json.hpp
--- a/runtime/common/scope.cpp
+++ b/runtime/common/scope.cpp
@@ -0,0 +1,359 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "scope.h"
+#include <VX_config.h>
+#include <nlohmann_json.hpp>
+#include <iostream>
+#include <fstream>
+#include <thread>
+#include <chrono>
+#include <vector>
+#include <list>
+#include <assert.h>
+#include <chrono>
+#include <thread>
+#include <condition_variable>
+#include <mutex>
+#include <unordered_set>
+#include <sstream>
+
+#define FRAME_FLUSH_SIZE 100 // dump_tap flushes the output and logs progress every this many frames
+
+#define MMIO_SCOPE_READ  (AFU_IMAGE_MMIO_SCOPE_READ * 4) // NOTE(review): presumably a word index scaled to a byte offset; not referenced in this file's visible code
+#define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4) // NOTE(review): see MMIO_SCOPE_READ
+
+#define CMD_GET_WIDTH   0 // command codes: OR-ed into the low bits of a command word as (tap id << 3) | CMD_x
+#define CMD_GET_COUNT   1 // read back into tap_t::frames
+#define CMD_GET_START   2 // read back as the base of tap_t::cycle_time
+#define CMD_GET_DATA    3 // reads the next 64-bit word (frame data or inter-frame delta)
+#define CMD_SET_START   4 // carries start_time shifted left by 11
+#define CMD_SET_STOP    5 // carries stop_time shifted left by 11
+
+#define CHECK_ERR(_expr)    \
+    do {                    \
+        int err = _expr;    \
+        if (err == 0)       \
+            break;          \
+        printf("[SCOPE] error: '%s' returned %d!\n", #_expr, err); \
+        return err;         \
+    } while (false)
+
+struct tap_signal_t {
+    uint32_t id;  // identifier written in the VCD "$var" line and in each value change
+    std::string name;    // signal name taken from the manifest
+    uint32_t width;    // signal width in bits
+};
+
+struct tap_t {
+    uint32_t id;    // tap id from the manifest; placed at bit 3 of command words
+    uint32_t width;    // number of bits in one frame
+    uint32_t frames;    // frame count reported by the device (CMD_GET_COUNT)
+    uint32_t cur_frame; // number of frames dumped so far
+    uint64_t cycle_time; // cycle at which the next frame of this tap is emitted
+    std::string path; // '.'-separated module path from the manifest
+    std::vector<tap_signal_t> signals; // signals making up a frame
+};
+
+static scope_callback_t g_callback; // register accessors installed by vx_scope_start
+
+using json = nlohmann::json;
+
+// Break `s` into the pieces separated by `delimiter`, in order of appearance.
+// Follows std::getline semantics: an empty input yields no pieces and a
+// trailing delimiter does not produce a trailing empty piece.
+static std::vector<std::string> split(const std::string &s, char delimiter) {
+    std::vector<std::string> pieces;
+    std::istringstream stream(s);
+    for (std::string piece; std::getline(stream, piece, delimiter);) {
+        pieces.push_back(piece);
+    }
+    return pieces;
+}
+
+// Write the VCD scope for module `name`: its "$scope" line, one "$var" line
+// per signal if a tap is registered under that name in `tails`, then the
+// scopes of its children from `hierarchy` (recursively, indented one more
+// space), and finally the closing "$upscope" line.
+static void dump_module(std::ofstream& ofs, 
+                        const std::string& name,
+                        std::unordered_map<std::string, std::unordered_set<std::string>>& hierarchy,
+                        std::unordered_map<std::string, tap_t*>& tails,
+                        int indentation) {
+    const std::string pad(indentation, ' ');
+
+    ofs << pad << "$scope module " << name << " $end" << std::endl;
+
+    // Signals of the tap attached to this module, if any
+    const auto tap_entry = tails.find(name);
+    if (tap_entry != tails.end()) {
+        const tap_t* tap = tap_entry->second;
+        for (const auto& sig : tap->signals) {
+            ofs << pad << " $var reg " << sig.width << " " << sig.id << " " << sig.name << " $end" << std::endl;
+        }
+    }
+
+    // Nested modules
+    const auto children = hierarchy.find(name);
+    if (children != hierarchy.end()) {
+        for (const auto& child_name : children->second) {
+            dump_module(ofs, child_name, hierarchy, tails, indentation + 1);
+        }
+    }
+
+    ofs << pad << "$upscope $end" << std::endl;
+}
+
+// Write the VCD header: version, timescale, the TOP scope with the clock
+// variable (identifier 0), and the module hierarchy derived from the taps'
+// '.'-separated paths. Taps with an empty path are skipped because they have
+// no module to attach to.
+// NOTE(review): taps are keyed by the last path component only, so two taps
+// whose paths end in the same module name overwrite each other in `tails`.
+static void dump_header(std::ofstream& ofs, std::vector<tap_t>& taps) {
+    ofs << "$version Generated by Vortex Scope Analyzer $end" << std::endl;
+    ofs << "$timescale 1 ns $end" << std::endl; 
+    ofs << "$scope module TOP $end" << std::endl;
+    ofs << " $var reg 1 0 clk $end" << std::endl;
+
+    std::unordered_map<std::string, std::unordered_set<std::string>> hierarchy;
+    std::unordered_set<std::string> heads;
+    std::unordered_map<std::string, tap_t*> tails;
+
+    // Build hierarchy
+    for (auto& tap : taps) {
+        std::vector<std::string> tokens = split(tap.path, '.');
+        if (tokens.empty()) {
+            // An empty path yields no tokens; indexing them would be out of bounds
+            continue;
+        }
+        for (size_t i = 1; i < tokens.size(); ++i) {
+            hierarchy[tokens[i-1]].insert(tokens[i]);
+        }
+        heads.insert(tokens.front());
+        tails[tokens.back()] = &tap;
+    }
+
+    // Dump module hierarchy
+    for (auto& head : heads) {
+        dump_module(ofs, head, hierarchy, tails, 1);
+    }
+
+    ofs << "$upscope $end" << std::endl;    
+    ofs << "enddefinitions $end" << std::endl;
+}
+
+// Among the taps that still have frames left to dump, return the one with the
+// smallest cycle_time (the first such tap wins ties), or nullptr when every
+// tap has been fully dumped.
+static tap_t* find_nearest_tap(std::vector<tap_t>& taps) {
+    tap_t* best = nullptr;
+    for (auto& candidate : taps) {
+        const bool exhausted = (candidate.cur_frame == candidate.frames);
+        if (exhausted)
+            continue;
+        if (best == nullptr || candidate.cycle_time < best->cycle_time) {
+            best = &candidate;
+        }
+    }
+    return best;
+}
+
+// Emit clock toggles for every cycle in [cur_time, next_time): each cycle
+// produces timestamp 2*cycle with the clock (identifier 0) low, followed by
+// timestamp 2*cycle+1 with the clock high. Returns the cycle reached, which
+// is next_time, or cur_time unchanged when it is already past next_time.
+static uint64_t advance_time(std::ofstream& ofs, uint64_t next_time, uint64_t cur_time) {
+    for (; cur_time < next_time; ++cur_time) {
+        const uint64_t low_phase = cur_time * 2;
+        const uint64_t high_phase = low_phase + 1;
+        ofs << '#' << low_phase << std::endl;
+        ofs << "b0 0" << std::endl;
+        ofs << '#' << high_phase << std::endl;
+        ofs << "b1 0" << std::endl;
+    }
+    return cur_time;
+}
+
+// Read one frame of `tap` from the device, 64 bits at a time, and write the
+// value of each of its signals to the VCD stream. Bits are consumed starting
+// with the last signal in tap->signals. After the frame, the next delta is
+// read (if frames remain) and added to tap->cycle_time.
+// Returns 0 on success, -1 if the tap description is inconsistent, or the
+// error code of a failing register access.
+static int dump_tap(std::ofstream& ofs, tap_t* tap, vx_device_h hdevice) {
+    // The decoding loop below relies on the signal widths being non-zero and
+    // adding up to exactly the tap width; otherwise it would index outside
+    // the bit buffer or run past the signal list.
+    uint64_t total_width = 0;
+    for (auto& signal : tap->signals) {
+        if (0 == signal.width) {
+            total_width = 0;
+            break;
+        }
+        total_width += signal.width;
+    }
+    if (0 == total_width || total_width != tap->width) {
+        std::cerr << "[SCOPE] error: invalid tap #" << tap->id << " signals description" << std::endl;
+        return -1;
+    }
+
+    uint32_t signal_offset = 0;   
+    uint32_t frame_offset = 0;
+    uint64_t word = 0;
+
+    // One extra element for the string terminator: a signal may be as wide as the tap
+    std::vector<char> signal_data(tap->width + 1);
+    auto signal_it = tap->signals.rbegin();
+    uint32_t signal_width = signal_it->width;
+
+    do {
+        // read data
+        uint64_t cmd_data = (tap->id << 3) | CMD_GET_DATA;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));        
+        CHECK_ERR(g_callback.registerRead(hdevice, &word));        
+        do {            
+            uint32_t word_offset = frame_offset % 64;
+            // Store bits most-significant first so the buffer reads as a binary literal
+            signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
+            ++signal_offset;
+            ++frame_offset;
+            if (signal_offset == signal_width) {
+                signal_data[signal_width] = 0; // string null termination
+                ofs << 'b' << signal_data.data() << ' ' << signal_it->id << std::endl;
+                if (frame_offset == tap->width) {
+                    // end-of-frame
+                    ++tap->cur_frame;
+                    if (tap->cur_frame != tap->frames) {
+                        // read next delta
+                        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));      
+                        CHECK_ERR(g_callback.registerRead(hdevice, &word));
+                        tap->cycle_time += 1 + word;
+                        if (0 == (tap->cur_frame % FRAME_FLUSH_SIZE)) {
+                            ofs << std::flush;
+                            std::cout << std::dec << "[SCOPE] flush tap #" << tap->id << ": "<< tap->cur_frame << "/" << tap->frames << " frames, next_time=" << tap->cycle_time << std::endl;
+                        }
+                    }
+                    break; 
+                }
+                signal_offset = 0;
+                ++signal_it;
+                signal_width = signal_it->width;
+            }
+        } while ((frame_offset % 64) != 0);
+    } while (frame_offset != tap->width);
+
+    return 0;
+}
+
+// Install the register callbacks, check the manifest named by the
+// SCOPE_JSON_PATH environment variable against the device (tap widths must
+// match), then program the stop and start times of every tap.
+// A time of uint64_t(-1) leaves the corresponding setting untouched.
+// Returns 0 on success, -1 on invalid arguments or an unusable manifest,
+// 1 on a tap width mismatch, or the error code of a failing register access.
+int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time) {    
+    if (nullptr == hdevice || nullptr == callback)
+        return -1;
+
+    // getenv returns null when the variable is unset; it must not reach
+    // std::ifstream or operator<<
+    const char* json_path = getenv("SCOPE_JSON_PATH");
+    if (nullptr == json_path) {
+        std::cerr << "[SCOPE] error: SCOPE_JSON_PATH environment variable is not set" << std::endl;
+        return -1;
+    }
+    std::ifstream ifs(json_path);
+    if (!ifs) {
+        std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
+        return -1;
+    }
+    // Parse without exceptions: a malformed file yields a discarded value
+    auto json_obj = json::parse(ifs, nullptr, false);
+    if (json_obj.is_discarded() || json_obj.is_null()) {
+        std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << std::endl;
+        return -1;
+    }
+
+    g_callback = *callback;   
+
+    // Field accesses throw on missing or mistyped entries; this function has
+    // C linkage, so exceptions must not propagate to the caller.
+    try {
+        // validate scope manifest
+        for (auto& tap : json_obj["taps"]) {
+            auto id = tap["id"].get<uint32_t>();
+            auto width = tap["width"].get<uint32_t>();
+            
+            uint64_t cmd_width = (uint64_t(id) << 3) | CMD_GET_WIDTH;
+            CHECK_ERR(g_callback.registerWrite(hdevice, cmd_width));
+            uint64_t dev_width;
+            CHECK_ERR(g_callback.registerRead(hdevice, &dev_width));
+            if (width != dev_width) {
+                std::cerr << "[SCOPE] error: invalid tap #" << id << " width, actual=" << dev_width << ", expected=" << width << std::endl;
+                return 1;
+            }
+        }
+
+        // set stop time
+        if (stop_time != uint64_t(-1)) {
+            std::cout << "[SCOPE] stop time: " << std::dec << stop_time << "s" << std::endl;
+            for (auto& tap : json_obj["taps"]) {
+                auto id = tap["id"].get<uint32_t>();
+                uint64_t cmd_stop = (stop_time << 11) | (uint64_t(id) << 3) | CMD_SET_STOP;
+                CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
+            }        
+        }
+
+        // start recording
+        if (start_time != uint64_t(-1)) {  
+            std::cout << "[SCOPE] start time: " << std::dec << start_time << "s" << std::endl;
+            for (auto& tap : json_obj["taps"]) {
+                auto id = tap["id"].get<uint32_t>();
+                uint64_t cmd_start = (start_time << 11) | (uint64_t(id) << 3) | CMD_SET_START;
+                CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
+            }        
+        }
+    } catch (const json::exception& e) {
+        std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << " (" << e.what() << ")" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
+
+// Stop recording on every tap listed in the manifest named by
+// SCOPE_JSON_PATH, then read the captured frames back from the device and
+// write them, merged in time order, to "scope.vcd" in the current directory.
+// Requires a prior successful vx_scope_start() (it installs the callbacks).
+// Returns 0 on success (or when the manifest is a JSON null), -1 on invalid
+// arguments, an unusable manifest or output file, or the error code of a
+// failing register access.
+int vx_scope_stop(vx_device_h hdevice) {
+    if (nullptr == hdevice)
+        return -1;
+
+    // The callbacks are zero-initialized until vx_scope_start() installs them
+    if (nullptr == g_callback.registerWrite || nullptr == g_callback.registerRead) {
+        std::cerr << "[SCOPE] error: scope was not started" << std::endl;
+        return -1;
+    }
+
+    std::vector<tap_t> taps;
+
+    {
+        // getenv returns null when the variable is unset; it must not reach std::ifstream
+        const char* json_path = getenv("SCOPE_JSON_PATH");
+        if (nullptr == json_path) {
+            std::cerr << "[SCOPE] error: SCOPE_JSON_PATH environment variable is not set" << std::endl;
+            return -1;
+        }
+        std::ifstream ifs(json_path);
+        if (!ifs) {
+            std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
+            return -1;
+        }
+        // Parse without exceptions: a malformed file yields a discarded value
+        auto json_obj = json::parse(ifs, nullptr, false);
+        if (json_obj.is_discarded()) {
+            std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << std::endl;
+            return -1;
+        }
+        if (json_obj.is_null())
+            return 0;
+
+        uint32_t signal_id = 1; // identifier 0 is reserved for the clock
+
+        // Field accesses throw on missing or mistyped entries; this function
+        // has C linkage, so exceptions must not propagate to the caller.
+        try {
+            for (auto& tap : json_obj["taps"]) {
+                tap_t _tap;
+                _tap.id    = tap["id"].get<uint32_t>();
+                _tap.width = tap["width"].get<uint32_t>();
+                _tap.path  = tap["path"].get<std::string>();
+                _tap.cycle_time = 0;
+                _tap.frames = 0;
+                _tap.cur_frame = 0;            
+
+                for (auto& signal : tap["signals"]) {
+                    auto name  = signal[0].get<std::string>();
+                    auto width = signal[1].get<uint32_t>();
+                    _tap.signals.push_back({signal_id, name, width});
+                    ++signal_id;
+                }
+
+                taps.emplace_back(std::move(_tap));
+            }
+        } catch (const json::exception& e) {
+            std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << " (" << e.what() << ")" << std::endl;
+            return -1;
+        }
+    }
+
+    // stop recording
+    for (auto& tap : taps) {
+        uint64_t cmd_stop = (0 << 11) | (tap.id << 3) | CMD_SET_STOP;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
+    }
+
+    std::cout << "[SCOPE] trace dump begin..." << std::endl;
+
+    std::ofstream ofs("scope.vcd");
+    if (!ofs) {
+        std::cerr << "[SCOPE] error: cannot create trace file: scope.vcd" << std::endl;
+        return -1;
+    }
+
+    dump_header(ofs, taps);
+
+    // load trace info
+    for (auto& tap : taps) {
+        uint64_t count, start, delta;
+
+        // get count
+        uint64_t cmd_count = (tap.id << 3) | CMD_GET_COUNT;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_count));
+        CHECK_ERR(g_callback.registerRead(hdevice, &count));   
+
+        // get start    
+        uint64_t cmd_start = (tap.id << 3) | CMD_GET_START;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
+        CHECK_ERR(g_callback.registerRead(hdevice, &start));
+
+        // get data (the first word returned is the delta of the first frame)
+        uint64_t cmd_data = (tap.id << 3) | CMD_GET_DATA;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
+        CHECK_ERR(g_callback.registerRead(hdevice, &delta));
+
+        tap.frames = count;
+        tap.cycle_time = 1 + start + delta;
+
+        std::cout << std::dec << "[SCOPE] tap #" << tap.id 
+                              << ": width=" << tap.width 
+                              << ", num_frames=" << tap.frames 
+                              << ", start_time=" << tap.cycle_time 
+                              << ", path=" << tap.path << std::endl;
+    }  
+
+    uint64_t cur_time = 0;
+
+    while (true) {
+        // find the nearest tap
+        auto tap = find_nearest_tap(taps);
+        if (tap == nullptr)
+            break;
+        // advance clock
+        cur_time = advance_time(ofs, tap->cycle_time, cur_time);        
+        // dump tap
+        CHECK_ERR(dump_tap(ofs, tap, hdevice));
+    }
+
+    // NOTE(review): advance_time counts cur_time in cycles (two VCD timestamps
+    // per cycle), so dividing by 2 here looks like it under-reports - confirm.
+    std::cout << "[SCOPE] trace dump done! - " << (cur_time/2) << " cycles" << std::endl;
+
+    return 0;
+}
--- a/runtime/common/scope.h
+++ b/runtime/common/scope.h
@@ -0,0 +1,35 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vortex.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int (*pfn_registerWrite)(vx_device_h hdevice, uint64_t value);
+typedef int (*pfn_registerRead)(vx_device_h hdevice, uint64_t *value);
+
+struct scope_callback_t {
+	pfn_registerWrite registerWrite;
+	pfn_registerRead  registerRead;
+};
+
+int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time);
+int vx_scope_stop(vx_device_h hdevice);
+
+#ifdef __cplusplus
+}
+#endif
--- a/runtime/common/utils.cpp
+++ b/runtime/common/utils.cpp
@@ -0,0 +1,463 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "utils.h"
+#include <iostream>
+#include <fstream>
+#include <list>
+#include <cstring>
+#include <vector>
+#include <vortex.h>
+#include <assert.h>
+
+// Evaluates _expr once; when the result is non-zero, prints the failing
+// expression text with its return code and then runs the caller-supplied
+// _cleanup statements.
+#define RT_CHECK(_expr, _cleanup)                               \
+   do {                                                         \
+     int _ret = (_expr);                                        \
+     if (_ret != 0) {                                           \
+       printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
+       _cleanup                                                 \
+     }                                                          \
+   } while (false)
+
+// Rounds `size` up to the next multiple of `alignment`.
+// `alignment` is asserted to have at most one bit set (a power of two).
+uint64_t aligned_size(uint64_t size, uint64_t alignment) {
+    assert(0 == (alignment & (alignment - 1)));
+    const uint64_t low_bits = alignment - 1;
+    const uint64_t bumped = size + low_bits;
+    return bumped & ~low_bits;
+}
+
+// Tells whether `addr` is a multiple of `alignment`.
+// `alignment` is asserted to have at most one bit set (a power of two).
+bool is_aligned(uint64_t addr, uint64_t alignment) {
+    assert(0 == (alignment & (alignment - 1)));
+    const uint64_t low_bits = alignment - 1;
+    const uint64_t remainder = addr & low_bits;
+    return remainder == 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Keeps a list of device handles and dumps their performance counters to
+// stdout, either when a device is removed or when this object is destroyed.
+class AutoPerfDump {
+public:
+    AutoPerfDump() : perf_class_(0) {}
+
+    // Dump the counters of every device that is still registered.
+    ~AutoPerfDump() {
+      for (auto it = hdevices_.begin(); it != hdevices_.end(); ++it) {
+        vx_dump_perf(*it, stdout);
+      }
+    }
+
+    // Register a device. When the PERF_CLASS environment variable is set, its
+    // integer value is remembered and written to VX_DCR_BASE_MPM_CLASS.
+    void add_device(vx_device_h hdevice) {
+      const char* env_value = getenv("PERF_CLASS");
+      if (env_value != nullptr) {
+        perf_class_ = std::atoi(env_value);
+        vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, perf_class_);
+      }
+      hdevices_.push_back(hdevice);
+    }
+
+    // Unregister a device, then dump its counters.
+    void remove_device(vx_device_h hdevice) {
+      hdevices_.remove(hdevice);
+      vx_dump_perf(hdevice, stdout);
+    }
+
+    // Last perf class parsed from PERF_CLASS (0 when it was never set).
+    int get_perf_class() const {
+      return perf_class_;
+    }
+
+private:
+    std::list<vx_device_h> hdevices_;
+    int perf_class_;
+};
+
+#ifdef DUMP_PERF_STATS
+AutoPerfDump gAutoPerfDump; // global instance; its destructor dumps counters for devices still registered
+#endif
+
+// Registers `hdevice` with the global perf dumper.
+// Does nothing unless DUMP_PERF_STATS is defined.
+void perf_add_device(vx_device_h hdevice) {
+#ifndef DUMP_PERF_STATS
+  (void)hdevice;
+#else
+  gAutoPerfDump.add_device(hdevice);
+#endif
+}
+
+// Unregisters `hdevice` from the global perf dumper.
+// Does nothing unless DUMP_PERF_STATS is defined.
+void perf_remove_device(vx_device_h hdevice) {
+#ifndef DUMP_PERF_STATS
+  (void)hdevice;
+#else
+  gAutoPerfDump.remove_device(hdevice);
+#endif
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Copies `size` bytes from `content` to the device address reported by the
+// VX_CAPS_KERNEL_BASE_ADDR capability.
+// Returns -1 for a null or empty payload, the error code of vx_dev_caps()
+// when the capability query fails, otherwise the result of vx_copy_to_dev().
+extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size) {
+  if (content == NULL || size == 0) {
+    return -1;
+  }
+
+  uint64_t base_addr;
+  const int status = vx_dev_caps(hdevice, VX_CAPS_KERNEL_BASE_ADDR, &base_addr);
+  if (status != 0) {
+    return status;
+  }
+
+  return vx_copy_to_dev(hdevice, base_addr, content, size);
+}
+
+// Reads the whole file `filename` and uploads it with vx_upload_kernel_bytes().
+// Returns -1 when the name is null or the file cannot be opened, sized or
+// read; otherwise returns the result of vx_upload_kernel_bytes() (which
+// itself rejects an empty payload).
+extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename) {
+  if (filename == nullptr)
+    return -1;
+
+  // the kernel image is raw bytes: open in binary mode so that no newline
+  // translation can alter the content or the size
+  std::ifstream ifs(filename, std::ios::binary);
+  if (!ifs) {
+    std::cout << "error: " << filename << " not found" << std::endl;
+    return -1;
+  }
+
+  // determine the file size; tellg() reports -1 on failure, which must not
+  // be used as an allocation size
+  ifs.seekg(0, ifs.end);
+  const std::streamoff size = ifs.tellg();
+  if (size < 0) {
+    std::cout << "error: failed to get the size of " << filename << std::endl;
+    return -1;
+  }
+  ifs.seekg(0, ifs.beg);
+
+  // read file content (the vector releases the buffer on every return path)
+  std::vector<char> content(static_cast<size_t>(size));
+  if (!ifs.read(content.data(), size)) {
+    std::cout << "error: failed to read " << filename << std::endl;
+    return -1;
+  }
+
+  // upload
+  return vx_upload_kernel_bytes(hdevice, content.data(), content.size());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Stores `value` under `addr`, creating the entry or replacing the old value.
+void DeviceConfig::write(uint32_t addr, uint32_t value) {
+  auto& slot = data_[addr];
+  slot = value;
+}
+
+// Returns the value previously stored under `addr`.
+// When no value was written for `addr`, an error is printed and the lookup
+// throws std::out_of_range (via unordered_map::at), as before.
+uint32_t DeviceConfig::read(uint32_t addr) const {
+  auto it = data_.find(addr);
+  if (it == data_.end()) {
+    // addr is unsigned: print it with %u rather than %d
+    printf("Error: DeviceConfig::read(%u) failed\n", (unsigned)addr);
+    return data_.at(addr); // throws std::out_of_range for the missing key
+  }
+  return it->second;
+}
+
+int dcr_initialize(vx_device_h hdevice) { // writes the initial DCR values; returns 0, or -1 on the first failed write
+  const uint64_t startup_addr(STARTUP_ADDR);
+  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), { // low 32 bits of the startup address
+    return -1;
+  });
+
+  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), { // high 32 bits of the startup address
+    return -1;
+  });
+
+  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), { // MPM class register is written with 0
+    return -1;
+  });
+  
+  return 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Assembles a 64-bit counter from a staging buffer laid out as 32-bit words.
+// The low word is at word index (addr - VX_CSR_MPM_BASE) and the high word
+// 32 words after it. The caller must pass a buffer large enough for both.
+static uint64_t get_csr_64(const void* ptr, int addr) {
+  const int lo_index = addr - VX_CSR_MPM_BASE;
+  const int hi_index = lo_index + 32;
+  auto bytes = static_cast<const uint8_t*>(ptr);
+  // the buffer is a byte array: copy the words out with memcpy instead of
+  // reading it through a uint32_t pointer, which would break strict aliasing
+  uint32_t value_lo = 0;
+  uint32_t value_hi = 0;
+  std::memcpy(&value_lo, bytes + lo_index * int(sizeof(uint32_t)), sizeof(value_lo));
+  std::memcpy(&value_hi, bytes + hi_index * int(sizeof(uint32_t)), sizeof(value_hi));
+  return (uint64_t(value_hi) << 32) | value_lo;
+}
+
+// Prints the device performance counters to `stream`.
+// For every core, a 64-word (32-bit) staging area is copied from device
+// address IO_CSR_ADDR + core_id * <area size> and decoded with get_csr_64().
+// Instruction counts are summed over the cores, the cycle count is the
+// maximum over the cores, and IPC is their ratio. With PERF_ENABLE defined,
+// the counters of the perf class held by gAutoPerfDump are printed as well
+// (memory-class counters are taken from core 0 only).
+// Returns 0 on success, or the error code of the failing vx_dev_caps() /
+// vx_copy_from_dev() call.
+extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
+  int ret = 0;
+
+  uint64_t instrs = 0;
+  uint64_t cycles = 0;
+
+#ifdef PERF_ENABLE
+  auto perf_class = gAutoPerfDump.get_perf_class();
+
+  // PERF: pipeline stalls
+  uint64_t ibuffer_stalls = 0;
+  uint64_t scoreboard_stalls = 0;
+  uint64_t lsu_stalls = 0;
+  uint64_t fpu_stalls = 0;
+  uint64_t alu_stalls = 0;
+  uint64_t sfu_stalls = 0;
+  uint64_t ifetches = 0;
+  uint64_t loads = 0;
+  uint64_t stores = 0;
+  uint64_t ifetch_lat = 0;
+  uint64_t load_lat   = 0;
+  // PERF: Icache
+  uint64_t icache_reads = 0;
+  uint64_t icache_read_misses = 0;
+  // PERF: Dcache
+  uint64_t dcache_reads = 0;
+  uint64_t dcache_writes = 0;
+  uint64_t dcache_read_misses = 0;
+  uint64_t dcache_write_misses = 0;
+  uint64_t dcache_bank_stalls = 0;
+  uint64_t dcache_mshr_stalls = 0;
+  // PERF: shared memory
+  uint64_t smem_reads = 0;
+  uint64_t smem_writes = 0;
+  uint64_t smem_bank_stalls = 0;
+  // PERF: l2cache
+  uint64_t l2cache_reads = 0;
+  uint64_t l2cache_writes = 0;
+  uint64_t l2cache_read_misses = 0;
+  uint64_t l2cache_write_misses = 0;
+  uint64_t l2cache_bank_stalls = 0;
+  uint64_t l2cache_mshr_stalls = 0;
+  // PERF: l3cache
+  uint64_t l3cache_reads = 0;
+  uint64_t l3cache_writes = 0;
+  uint64_t l3cache_read_misses = 0;
+  uint64_t l3cache_write_misses = 0;
+  uint64_t l3cache_bank_stalls = 0;
+  uint64_t l3cache_mshr_stalls = 0;
+  // PERF: memory
+  uint64_t mem_reads = 0;
+  uint64_t mem_writes = 0;
+  uint64_t mem_lat = 0;
+#endif
+
+  uint64_t num_cores;
+  ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
+  if (ret != 0)
+    return ret;
+
+  std::vector<uint8_t> staging_buf(64* sizeof(uint32_t));
+
+  for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
+    uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size();
+    ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
+    if (ret != 0)
+      return ret;
+
+    uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
+    uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
+    float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
+    if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
+    instrs += instrs_per_core;
+    cycles = std::max<uint64_t>(cycles_per_core, cycles);
+
+  #ifdef PERF_ENABLE
+    switch (perf_class) {
+    case VX_DCR_MPM_CLASS_CORE: {
+      // PERF: pipeline
+      // ibuffer_stall
+      uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core);
+      ibuffer_stalls += ibuffer_stalls_per_core;
+      // scoreboard_stall
+      uint64_t scoreboard_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core);
+      scoreboard_stalls += scoreboard_stalls_per_core;
+      // alu_stall
+      uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
+      alu_stalls += alu_stalls_per_core;
+      // lsu_stall
+      uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
+      lsu_stalls += lsu_stalls_per_core;
+      // fpu_stall
+      uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
+      fpu_stalls += fpu_stalls_per_core;
+      // sfu_stall
+      uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core);
+      sfu_stalls += sfu_stalls_per_core;
+      // PERF: memory
+      // ifetches: read the instruction-fetch counter (not the loads counter),
+      // it is also the divisor of the ifetch latency average below
+      uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCHES);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
+      ifetches += ifetches_per_core;
+      // loads
+      uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
+      loads += loads_per_core;
+      // stores
+      uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
+      stores += stores_per_core;
+      // ifetch latency
+      uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
+      if (num_cores > 1) {
+        int mem_avg_lat = (int)(double(ifetch_lat_per_core) / double(ifetches_per_core));
+        fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
+      }
+      ifetch_lat += ifetch_lat_per_core;
+      // load latency
+      uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
+      if (num_cores > 1) {
+        int mem_avg_lat = (int)(double(load_lat_per_core) / double(loads_per_core));
+        fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
+      }
+      load_lat += load_lat_per_core;
+    } break;
+    case VX_DCR_MPM_CLASS_MEM: {
+      // memory-class counters are only sampled from core 0's staging area
+      if (0 == core_id) {
+        // PERF: Icache
+        icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS);
+        icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R);
+
+        // PERF: Dcache
+        dcache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_READS);
+        dcache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_WRITES);
+        dcache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_R);
+        dcache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_W);
+        dcache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_BANK_ST);
+        dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST);
+
+        // PERF: smem
+        smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS);
+        smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES);
+        smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST);
+
+        // PERF: L2cache
+        l2cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_READS);
+        l2cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_WRITES);
+        l2cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_R);
+        l2cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_W);
+        l2cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_BANK_ST);
+        l2cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MSHR_ST);
+
+        // PERF: L3cache
+        l3cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_READS);
+        l3cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_WRITES);
+        l3cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_R);
+        l3cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_W);
+        l3cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_BANK_ST);
+        l3cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MSHR_ST);
+
+        // PERF: memory
+        mem_reads  = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
+        mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES);
+        mem_lat    = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LAT);
+      }
+    } break;
+    default:
+      break;
+    }
+  #endif
+  }
+
+  float IPC = (float)(double(instrs) / double(cycles));
+  fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
+
+#ifdef PERF_ENABLE
+  switch (perf_class) {
+  case VX_DCR_MPM_CLASS_CORE: {
+    int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
+    int load_avg_lat = (int)(double(load_lat) / double(loads));
+    fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
+    fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
+    fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
+    fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls);
+    fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
+    fprintf(stream, "PERF: sfu unit stalls=%ld\n", sfu_stalls);
+    fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
+    fprintf(stream, "PERF: loads=%ld\n", loads);
+    fprintf(stream, "PERF: stores=%ld\n", stores);
+    fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
+    fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
+
+  } break;
+  case VX_DCR_MPM_CLASS_MEM: {
+    int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100);
+    int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100);
+    int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
+    int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
+    int l2cache_read_hit_ratio = (int)((1.0 - (double(l2cache_read_misses) / double(l2cache_reads))) * 100);
+    int l2cache_write_hit_ratio = (int)((1.0 - (double(l2cache_write_misses) / double(l2cache_writes))) * 100);
+    int l2cache_bank_utilization = (int)((double(l2cache_reads + l2cache_writes) / double(l2cache_reads + l2cache_writes + l2cache_bank_stalls)) * 100);
+    int l3cache_read_hit_ratio = (int)((1.0 - (double(l3cache_read_misses) / double(l3cache_reads))) * 100);
+    int l3cache_write_hit_ratio = (int)((1.0 - (double(l3cache_write_misses) / double(l3cache_writes))) * 100);
+    int l3cache_bank_utilization = (int)((double(l3cache_reads + l3cache_writes) / double(l3cache_reads + l3cache_writes + l3cache_bank_stalls)) * 100);
+    int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
+    int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
+    fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
+    fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
+    fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
+    fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
+    fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio);
+    fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio);
+    fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization);
+    fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
+    fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
+    fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
+    fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
+    fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
+    fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
+    fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
+    fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);
+    fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
+    fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
+    fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
+    fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
+    fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
+    fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);
+    fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
+    fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
+    fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
+    fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
+  } break;
+  default:
+    break;
+  }
+#endif
+
+  fflush(stream);
+
+  return 0;
+}
+
+// Reads the 64-bit performance counter `counter` into `*value`.
+// core_id == -1 aggregates over all cores: VX_CSR_MCYCLE takes the maximum,
+// every other counter is summed. Any other core_id reads that core only.
+// Returns 0 on success, -1 when `value` is null or core_id is out of range,
+// or the error code of the failing vx_dev_caps() / vx_copy_from_dev() call.
+extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value) {
+  if (value == nullptr)
+    return -1;
+
+  uint64_t num_cores;
+  int ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
+  if (ret != 0)
+    return ret;
+
+  // -1 selects all cores; anything else must name an existing core
+  if (core_id < -1 || core_id >= (int)num_cores) {
+    std::cout << "error: core_id out of range" << std::endl;
+    return -1;
+  }
+
+  std::vector<uint8_t> staging_buf(64 * sizeof(uint32_t));
+
+  // half-open core range [first_core, last_core) to visit
+  unsigned first_core = 0;
+  unsigned last_core = (unsigned)num_cores;
+  if (core_id != -1) {
+    first_core = (unsigned)core_id;
+    last_core = first_core + 1;
+  }
+
+  uint64_t _value = 0;
+
+  // start at first_core: restarting from 0 would fold the counters of the
+  // lower-numbered cores into a single-core query
+  for (unsigned i = first_core; i < last_core; ++i) {
+    uint64_t mpm_mem_addr = IO_CSR_ADDR + i * staging_buf.size();
+    ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
+    if (ret != 0)
+      return ret;
+
+    auto per_core_value = get_csr_64(staging_buf.data(), counter);
+    if (counter == VX_CSR_MCYCLE) {
+      _value = std::max<uint64_t>(per_core_value, _value);
+    } else {
+      _value += per_core_value;
+    }
+  }
+
+  // output
+  *value = _value;
+
+  return 0;
+}
--- a/runtime/common/utils.h
+++ b/runtime/common/utils.h
@@ -0,0 +1,47 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vortex.h>
+#include <cstdint>
+#include <unordered_map>
+#include <VX_config.h>
+#include <VX_types.h>
+
+class DeviceConfig { // in-memory table of 32-bit values keyed by 32-bit address
+public:    
+    void write(uint32_t addr, uint32_t value); // store value under addr
+    uint32_t read(uint32_t addr) const; // fetch the value stored under addr
+private:
+     std::unordered_map<uint32_t, uint32_t> data_; // addr -> value
+};
+
+int dcr_initialize(vx_device_h device); // write the startup-address and MPM-class DCRs; 0 on success, -1 on failure
+
+uint64_t aligned_size(uint64_t size, uint64_t alignment); // size rounded up to a multiple of alignment (power of two)
+
+bool is_aligned(uint64_t addr, uint64_t alignment); // true when addr is a multiple of alignment (power of two)
+
+void perf_add_device(vx_device_h device); // register device for perf dumping (no-op without DUMP_PERF_STATS)
+
+void perf_remove_device(vx_device_h device); // unregister device and dump its counters (no-op without DUMP_PERF_STATS)
+
+#define CACHE_BLOCK_SIZE    64
+#define ALLOC_BASE_ADDR     CACHE_BLOCK_SIZE
+#define ALLOC_MAX_ADDR      STARTUP_ADDR
+#if (XLEN == 64)
+#define GLOBAL_MEM_SIZE      0x200000000  // 8 GB
+#else
+#define GLOBAL_MEM_SIZE      0x100000000  // 4 GB
+#endif
--- a/runtime/include/vortex.h
+++ b/runtime/include/vortex.h
@@ -0,0 +1,111 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __VX_VORTEX_H__
+#define __VX_VORTEX_H__
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* vx_device_h;
+
+// device caps ids
+#define VX_CAPS_VERSION             0x0 
+#define VX_CAPS_NUM_THREADS         0x1
+#define VX_CAPS_NUM_WARPS           0x2
+#define VX_CAPS_NUM_CORES           0x3
+#define VX_CAPS_CACHE_LINE_SIZE     0x4
+#define VX_CAPS_GLOBAL_MEM_SIZE     0x5
+#define VX_CAPS_LOCAL_MEM_SIZE      0x6
+#define VX_CAPS_KERNEL_BASE_ADDR    0x7
+#define VX_CAPS_ISA_FLAGS           0x8
+
+// device isa flags
+#define VX_ISA_STD_A                (1ull << 0)
+#define VX_ISA_STD_C                (1ull << 2)
+#define VX_ISA_STD_D                (1ull << 3)
+#define VX_ISA_STD_E                (1ull << 4)
+#define VX_ISA_STD_F                (1ull << 5)
+#define VX_ISA_STD_H                (1ull << 7)
+#define VX_ISA_STD_I                (1ull << 8)
+#define VX_ISA_STD_N                (1ull << 13)
+#define VX_ISA_STD_Q                (1ull << 16)
+#define VX_ISA_STD_S                (1ull << 18)
+#define VX_ISA_STD_U                (1ull << 20)
+// Evaluates to 1 << (field + 4), where field is bits [31:30] of `flags`.
+// The argument is parenthesized so that an expression such as `a | b` is
+// shifted as a whole.
+// NOTE(review): presumably yields the base ISA width (32/64/128) - confirm.
+#define VX_ISA_BASE(flags)          (1 << ((((flags) >> 30) & 0x3) + 4))
+#define VX_ISA_EXT_TEX              (1ull << 32)
+#define VX_ISA_EXT_RASTER           (1ull << 33)
+#define VX_ISA_EXT_ROP              (1ull << 34)
+
+// device memory types
+#define VX_MEM_TYPE_GLOBAL          0
+#define VX_MEM_TYPE_LOCAL           1
+
+// ready wait timeout
+#define VX_MAX_TIMEOUT              (24*60*60*1000)   // 24 Hr
+
+// open the device and connect to it
+int vx_dev_open(vx_device_h* hdevice);
+
+// Close the device when all the operations are done
+int vx_dev_close(vx_device_h hdevice);
+
+// return device configurations
+int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
+
+// allocate device memory and return address
+int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr);
+
+// release device memory
+int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr);
+
+// get device memory info
+int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used);
+
+// Copy bytes from host to device memory
+int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size);
+
+// Copy bytes from device memory to host
+int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size);
+
+// Start device execution
+int vx_start(vx_device_h hdevice);
+
+// Wait for device ready with milliseconds timeout
+int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
+
+// write device configuration registers
+int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value);
+
+////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
+
+// upload kernel bytes to device
+int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size);
+
+// upload kernel file to device
+int vx_upload_kernel_file(vx_device_h hdevice, const char* filename);
+
+// performance counters
+int vx_dump_perf(vx_device_h hdevice, FILE* stream);
+int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __VX_VORTEX_H__
--- a/runtime/include/vx_intrinsics.h
+++ b/runtime/include/vx_intrinsics.h
@@ -1,214 +0,0 @@
-#ifndef VX_INTRINSICS_H
-#define VX_INTRINSICS_H
-
-#include <VX_config.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef __ASSEMBLY__
-#define __ASM_STR(x)	x
-#else
-#define __ASM_STR(x)	#x
-#endif
-
-#define csr_read(csr) ({                        \
-	unsigned __r;	               		        \
-	__asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \
-	__r;							            \
-})
-
-#define csr_write(csr, val)	({                  \
-	unsigned __v = (unsigned)(val);             \
-	if (__builtin_constant_p(val) && __v < 32)  \
-        __asm__ __volatile__ ("csrw %0, %1"	:: "i" (csr), "i" (__v));  \
-    else                                        \
-        __asm__ __volatile__ ("csrw %0, %1"	:: "i" (csr), "r" (__v));  \
-})
-
-#define csr_swap(csr, val) ({                   \
-    unsigned __r;                               \
-	unsigned __v = (unsigned)(val);	            \
-	if (__builtin_constant_p(val) && __v < 32)  \
-        __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
-    else                                        \
-        __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
-	__r;						                \
-})
-
-#define csr_read_set(csr, val) ({               \
-	unsigned __r;                               \
-	unsigned __v = (unsigned)(val);	            \
-    if (__builtin_constant_p(val) && __v < 32)  \
-	    __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
-    else                                        \
-        __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
-	__r;							            \
-})
-
-#define csr_set(csr, val) ({                    \
-	unsigned __v = (unsigned)(val);	            \
-    if (__builtin_constant_p(val) && __v < 32)  \
-	    __asm__ __volatile__ ("csrs %0, %1"	:: "i" (csr), "i" (__v));  \
-    else                                        \
-        __asm__ __volatile__ ("csrs %0, %1"	:: "i" (csr), "r" (__v));  \
-})
-
-#define csr_read_clear(csr, val) ({             \
-	unsigned __r;                               \
-	unsigned __v = (unsigned)(val);	            \
-    if (__builtin_constant_p(val) && __v < 32)  \
-	    __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
-    else                                        \
-        __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
-	__r;							            \
-})
-
-#define csr_clear(csr, val)	({                  \
-	unsigned __v = (unsigned)(val);             \
-	if (__builtin_constant_p(val) && __v < 32)  \
-        __asm__ __volatile__ ("csrc %0, %1"	:: "i" (csr), "i" (__v)); \
-    else                                        \
-        __asm__ __volatile__ ("csrc %0, %1"	:: "i" (csr), "r" (__v)); \
-})
-
-// Texture load
-#define vx_tex(unit, u, v, lod) ({              \
-	unsigned __r;                               \
-    __asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(unit), "r"(u), "r"(v), "r"(lod)); \
-	__r;							            \
-})
-
-// Conditional move
-#define vx_cmov(c, t, f) ({                     \
-	unsigned __r;		                        \
-    __asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r : "r"(c), "r"(t), "r"(f)); \
-	__r;							            \
-})
-
-// Set thread mask
-inline void vx_tmc(unsigned thread_mask) {
-    asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
-}
-
-// Set thread predicate
-inline void vx_pred(unsigned condition) {
-    asm volatile (".insn s 0x6b, 0, x1, 0(%0)" :: "r"(condition));
-}
-
-typedef void (*vx_wspawn_pfn)();
-
-// Spawn warps
-inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
-    asm volatile (".insn s 0x6b, 1, %1, 0(%0)" :: "r"(num_warps), "r"(func_ptr));
-}
-
-// Split on a predicate
-inline void vx_split(int predicate) {
-    asm volatile (".insn s 0x6b, 2, x0, 0(%0)" :: "r"(predicate));
-}
-
-// Join
-inline void vx_join() {
-  asm volatile (".insn s 0x6b, 3, x0, 0(x0)");
-}
-
-// Warp Barrier
-inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
-    asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
-}
-
-// Prefetch
-inline void vx_prefetch(unsigned addr) {
-    asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
-}
-
-// Return active warp's thread id 
-inline int vx_thread_id() {
-    int result;
-    asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_WTID));
-    return result;   
-}
-
-// Return active core's local thread id
-inline int vx_thread_lid() {
-    int result;
-    asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LTID));
-    return result;   
-}
-
-// Return processsor global thread id
-inline int vx_thread_gid() {
-    int result;
-    asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GTID));
-    return result;   
-}
-
-// Return active core's local warp id
-inline int vx_warp_id() {
-    int result;
-    asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LWID));
-    return result;   
-}
-
-// Return processsor's global warp id
-inline int vx_warp_gid() {
-    int result;
-    asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GWID));
-    return result;   
-}
-
-// Return processsor core id
-inline int vx_core_id() {
-    int result;
-    asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GCID));
-    return result; 
-}
-
-// Return current threadk mask
-inline int vx_thread_mask() {
-    int result;
-    asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_TMASK));
-    return result; 
-}
-
-// Return the number of threads in a warp
-inline int vx_num_threads() {
-    int result;
-    asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NT));
-    return result; 
-}
-
-// Return the number of warps in a core
-inline int vx_num_warps() {
-    int result;
-    asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NW));
-    return result;   
-}
-
-// Return the number of cores in the processsor
-inline int vx_num_cores() {
-    int result;
-    asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NC));
-    return result;   
-}
-
-inline void vx_fence() {
-    asm volatile ("fence iorw, iorw");
-}
-
-#define __if(b) vx_split(b); \
-                if (b) 
-
-#define __else else
-
-#define __endif vx_join();
-
-#define __DIVERGENT__ __attribute__((annotate("divergent")))
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/runtime/include/vx_print.h
+++ b/runtime/include/vx_print.h
@@ -1,21 +0,0 @@
-#ifndef VX_PRINT_H
-#define VX_PRINT_H
-
-#include <stdarg.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int vx_vprintf(const char* format, va_list va);
-int vx_printf(const char * format, ...);
-
-void vx_putchar(int c);
-void vx_putint(int value, int base);
-void vx_putfloat(float value, int precision);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/runtime/include/vx_spawn.h
+++ b/runtime/include/vx_spawn.h
@@ -1,43 +0,0 @@
-#ifndef VX_API_H
-#define VX_API_H
-
-#include <stdint.h>
-#include <stdio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {
-  uint32_t num_groups[3];
-  uint32_t global_offset[3];
-  uint32_t local_size[3];
-  char * printf_buffer;
-  uint32_t *printf_buffer_position;
-  uint32_t printf_buffer_capacity;
-  uint32_t work_dim;
-} context_t;
-
-typedef void (*vx_spawn_kernel_cb) (
-  const void * /* arg */,
-	const context_t * /* context */,
-	uint32_t /* group_x */,
-	uint32_t /* group_y */,
-	uint32_t /* group_z */
-);
-
-typedef void (*vx_spawn_tasks_cb)(int task_id, void *arg);
-
-typedef void (*vx_serial_cb)(void *arg);
-
-void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);
-
-void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
-
-void vx_serial(vx_serial_cb callback, void * arg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/runtime/linker/vx_link32.ld
+++ b/runtime/linker/vx_link32.ld
@@ -1,264 +0,0 @@
-/* ---- Original Script: /opt/riscv32i/riscv32-unknown-elf/lib/ldscripts/elf32lriscv.x ---- */
-/* Default linker script, for normal executables */
-/* Copyright (C) 2014-2017 Free Software Foundation, Inc.
-   Copying and distribution of this script, with or without modification,
-   are permitted in any medium without royalty provided the copyright
-   notice and this notice are preserved.  */
-OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv", "elf32-littleriscv")
-OUTPUT_ARCH(riscv)
-ENTRY(_start)
-SECTIONS
-{
-  . = 0x80000000;
-  .interp         : { *(.interp) }
-  .note.gnu.build-id  : { *(.note.gnu.build-id) }
-  .hash           : { *(.hash) }
-  .gnu.hash       : { *(.gnu.hash) }
-  .dynsym         : { *(.dynsym) }
-  .dynstr         : { *(.dynstr) }
-  .gnu.version    : { *(.gnu.version) }
-  .gnu.version_d  : { *(.gnu.version_d) }
-  .gnu.version_r  : { *(.gnu.version_r) }
-  .rela.dyn       :
-    {
-      *(.rela.init)
-      *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*)
-      *(.rela.fini)
-      *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*)
-      *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*)
-      *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*)
-      *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*)
-      *(.rela.ctors)
-      *(.rela.dtors)
-      *(.rela.got)
-      *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*)
-      *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*)
-      *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*)
-      *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*)
-      *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
-      PROVIDE_HIDDEN (__rela_iplt_start = .);
-      *(.rela.iplt)
-      PROVIDE_HIDDEN (__rela_iplt_end = .);
-    }
-  .rela.plt       :
-    {
-      *(.rela.plt)
-    }
-  .init           :
-  {
-    KEEP (*(SORT_NONE(.init)))
-  }
-  .plt            : { *(.plt) }
-  .iplt           : { *(.iplt) }
-  .text           :
-  {
-    *(.text.unlikely .text.*_unlikely .text.unlikely.*)
-    *(.text.exit .text.exit.*)
-    *(.text.startup .text.startup.*)
-    *(.text.hot .text.hot.*)
-    *(.text .stub .text.* .gnu.linkonce.t.*)
-    /* .gnu.warning sections are handled specially by elf32.em.  */
-    *(.gnu.warning)
-  }
-  .fini           :
-  {
-    KEEP (*(SORT_NONE(.fini)))
-  }
-  PROVIDE (__etext = .);
-  PROVIDE (_etext = .);
-  PROVIDE (etext = .);
-  .rodata         : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
-  .rodata1        : { *(.rodata1) }
-  .sdata2         :
-  {
-    *(.sdata2 .sdata2.* .gnu.linkonce.s2.*)
-  }
-  .sbss2          : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) }
-  .eh_frame_hdr   : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) }
-  .eh_frame       : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) }
-  .gcc_except_table   : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) }
-  .gnu_extab   : ONLY_IF_RO { *(.gnu_extab*) }
-  /* These sections are generated by the Sun/Oracle C++ compiler.  */
-  .exception_ranges   : ONLY_IF_RO { *(.exception_ranges*) }
-  /* Adjust the address for the data segment.  We want to adjust up to
-     the same address within the page on the next page up.  */
-  . = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
-  /* Exception handling  */
-  .eh_frame       : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) }
-  .gnu_extab      : ONLY_IF_RW { *(.gnu_extab) }
-  .gcc_except_table   : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }
-  .exception_ranges   : ONLY_IF_RW { *(.exception_ranges*) }
-  /* Thread Local Storage sections  */
-  .tdata	  :
-   {
-     PROVIDE_HIDDEN (__tdata_start = .);
-     *(.tdata .tdata.* .gnu.linkonce.td.*)
-     PROVIDE_HIDDEN (__tdata_end = .);
-   }
-   PROVIDE (__tdata_size = SIZEOF (.tdata));
-  .tbss		  : 
-  {
-    PROVIDE_HIDDEN (__tbss_start = .);
-    PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start)); 
-    *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
-    PROVIDE_HIDDEN (__tbss_end = .);
-  }
-  PROVIDE (__tbss_size = SIZEOF (.tbss));
-  PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
-  .preinit_array    :
-  {
-    PROVIDE_HIDDEN (__preinit_array_start = .);
-    KEEP (*(.preinit_array))
-    PROVIDE_HIDDEN (__preinit_array_end = .);
-  }
-  .init_array    :
-  {
-    PROVIDE_HIDDEN (__init_array_start = .);
-    KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
-    KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
-    PROVIDE_HIDDEN (__init_array_end = .);
-  }
-  .fini_array    :
-  {
-    PROVIDE_HIDDEN (__fini_array_start = .);
-    KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
-    KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
-    PROVIDE_HIDDEN (__fini_array_end = .);
-  }
-  .ctors          :
-  {
-    /* gcc uses crtbegin.o to find the start of
-       the constructors, so we make sure it is
-       first.  Because this is a wildcard, it
-       doesn't matter if the user does not
-       actually link against crtbegin.o; the
-       linker won't look for a file to match a
-       wildcard.  The wildcard also means that it
-       doesn't matter which directory crtbegin.o
-       is in.  */
-    KEEP (*crtbegin.o(.ctors))
-    KEEP (*crtbegin?.o(.ctors))
-    /* We don't want to include the .ctor section from
-       the crtend.o file until after the sorted ctors.
-       The .ctor section from the crtend file contains the
-       end of ctors marker and it must be last */
-    KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
-    KEEP (*(SORT(.ctors.*)))
-    KEEP (*(.ctors))
-  }
-  .dtors          :
-  {
-    KEEP (*crtbegin.o(.dtors))
-    KEEP (*crtbegin?.o(.dtors))
-    KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
-    KEEP (*(SORT(.dtors.*)))
-    KEEP (*(.dtors))
-  }
-  .jcr            : { KEEP (*(.jcr)) }
-  .data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) }
-  .dynamic        : { *(.dynamic) }
-  . = DATA_SEGMENT_RELRO_END (0, .);
-  .data           :
-  {
-    __DATA_BEGIN__ = .;
-    *(.data .data.* .gnu.linkonce.d.*)
-    SORT(CONSTRUCTORS)
-  }
-  .data1          : { *(.data1) }
-  .got            : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) }
-  /* We want the small data sections together, so single-instruction offsets
-     can access them all, and initialized data all before uninitialized, so
-     we can shorten the on-disk segment size.  */
-  .sdata          :
-  {
-    __SDATA_BEGIN__ = .;
-    *(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*)
-    *(.sdata .sdata.* .gnu.linkonce.s.*)
-  }
-  _edata = .; PROVIDE (edata = .);
-  . = .;
-  __bss_start = .;
-  .sbss           :
-  {
-    *(.dynsbss)
-    *(.sbss .sbss.* .gnu.linkonce.sb.*)
-    *(.scommon)
-  }
-  .bss            :
-  {
-   *(.dynbss)
-   *(.bss .bss.* .gnu.linkonce.b.*)
-   *(COMMON)
-   /* Align here to ensure that the .bss section occupies space up to
-      _end.  Align after .bss to ensure correct alignment even if the
-      .bss section disappears because there are no input sections.
-      FIXME: Why do we need it? When there is no .bss section, we do not
-      pad the .data section.  */
-   . = ALIGN(. != 0 ? 32 / 8 : 1);
-  }
-  . = ALIGN(32 / 8);
-  . = SEGMENT_START("ldata-segment", .);
-  . = ALIGN(32 / 8);
-  __BSS_END__ = .;
-    __global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
-                MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
-  _end = .; PROVIDE (end = .);
-  . = DATA_SEGMENT_END (.);  
-  
-  /* .stack_dummy section doesn't contains any symbols. It is only
-	 * used for linker to calculate size of stack sections, and assign
-	 * values to stack symbols later */
-	.stack_dummy (COPY):
-	{
-		KEEP(*(.stack*))
-	}
-  __stack_usage = SIZEOF(.stack_dummy);
-  PROVIDE(__stack_top  = 0xFF000000);
-  PROVIDE(__stack_size = 0x400);
-	PROVIDE(__stack = __stack_top);
-	ASSERT(__stack_usage <= __stack_size, "stack overflow")
-
-  /* Stabs debugging sections.  */
-  .stab          0 : { *(.stab) }
-  .stabstr       0 : { *(.stabstr) }
-  .stab.excl     0 : { *(.stab.excl) }
-  .stab.exclstr  0 : { *(.stab.exclstr) }
-  .stab.index    0 : { *(.stab.index) }
-  .stab.indexstr 0 : { *(.stab.indexstr) }
-  .comment       0 : { *(.comment) }
-  .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) }
-  /* DWARF debug sections.
-     Symbols in the DWARF debugging sections are relative to the beginning
-     of the section so we begin them at 0.  */
-  /* DWARF 1 */
-  .debug          0 : { *(.debug) }
-  .line           0 : { *(.line) }
-  /* GNU DWARF 1 extensions */
-  .debug_srcinfo  0 : { *(.debug_srcinfo) }
-  .debug_sfnames  0 : { *(.debug_sfnames) }
-  /* DWARF 1.1 and DWARF 2 */
-  .debug_aranges  0 : { *(.debug_aranges) }
-  .debug_pubnames 0 : { *(.debug_pubnames) }
-  /* DWARF 2 */
-  .debug_info     0 : { *(.debug_info .gnu.linkonce.wi.*) }
-  .debug_abbrev   0 : { *(.debug_abbrev) }
-  .debug_line     0 : { *(.debug_line .debug_line.* .debug_line_end) }
-  .debug_frame    0 : { *(.debug_frame) }
-  .debug_str      0 : { *(.debug_str) }
-  .debug_loc      0 : { *(.debug_loc) }
-  .debug_macinfo  0 : { *(.debug_macinfo) }
-  /* SGI/MIPS DWARF 2 extensions */
-  .debug_weaknames 0 : { *(.debug_weaknames) }
-  .debug_funcnames 0 : { *(.debug_funcnames) }
-  .debug_typenames 0 : { *(.debug_typenames) }
-  .debug_varnames  0 : { *(.debug_varnames) }
-  /* DWARF 3 */
-  .debug_pubtypes 0 : { *(.debug_pubtypes) }
-  .debug_ranges   0 : { *(.debug_ranges) }
-  /* DWARF Extension.  */
-  .debug_macro    0 : { *(.debug_macro) }
-  .debug_addr     0 : { *(.debug_addr) }
-  .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
-  /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
-
-}
--- a/runtime/linker/vx_link64.ld
+++ b/runtime/linker/vx_link64.ld
@@ -1,264 +0,0 @@
-/* ---- Original Script: /opt/riscv32i/riscv32-unknown-elf/lib/ldscripts/elf64lriscv.x ---- */
-/* Default linker script, for normal executables */
-/* Copyright (C) 2014-2017 Free Software Foundation, Inc.
-   Copying and distribution of this script, with or without modification,
-   are permitted in any medium without royalty provided the copyright
-   notice and this notice are preserved.  */
-OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv", "elf64-littleriscv")
-OUTPUT_ARCH(riscv)
-ENTRY(_start)
-SECTIONS
-{
-  . = 0x80000000;
-  .interp         : { *(.interp) }
-  .note.gnu.build-id  : { *(.note.gnu.build-id) }
-  .hash           : { *(.hash) }
-  .gnu.hash       : { *(.gnu.hash) }
-  .dynsym         : { *(.dynsym) }
-  .dynstr         : { *(.dynstr) }
-  .gnu.version    : { *(.gnu.version) }
-  .gnu.version_d  : { *(.gnu.version_d) }
-  .gnu.version_r  : { *(.gnu.version_r) }
-  .rela.dyn       :
-    {
-      *(.rela.init)
-      *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*)
-      *(.rela.fini)
-      *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*)
-      *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*)
-      *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*)
-      *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*)
-      *(.rela.ctors)
-      *(.rela.dtors)
-      *(.rela.got)
-      *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*)
-      *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*)
-      *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*)
-      *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*)
-      *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
-      PROVIDE_HIDDEN (__rela_iplt_start = .);
-      *(.rela.iplt)
-      PROVIDE_HIDDEN (__rela_iplt_end = .);
-    }
-  .rela.plt       :
-    {
-      *(.rela.plt)
-    }
-  .init           :
-  {
-    KEEP (*(SORT_NONE(.init)))
-  }
-  .plt            : { *(.plt) }
-  .iplt           : { *(.iplt) }
-  .text           :
-  {
-    *(.text.unlikely .text.*_unlikely .text.unlikely.*)
-    *(.text.exit .text.exit.*)
-    *(.text.startup .text.startup.*)
-    *(.text.hot .text.hot.*)
-    *(.text .stub .text.* .gnu.linkonce.t.*)
-    /* .gnu.warning sections are handled specially by elf32.em.  */
-    *(.gnu.warning)
-  }
-  .fini           :
-  {
-    KEEP (*(SORT_NONE(.fini)))
-  }
-  PROVIDE (__etext = .);
-  PROVIDE (_etext = .);
-  PROVIDE (etext = .);
-  .rodata         : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
-  .rodata1        : { *(.rodata1) }
-  .sdata2         :
-  {
-    *(.sdata2 .sdata2.* .gnu.linkonce.s2.*)
-  }
-  .sbss2          : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) }
-  .eh_frame_hdr   : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) }
-  .eh_frame       : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) }
-  .gcc_except_table   : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) }
-  .gnu_extab   : ONLY_IF_RO { *(.gnu_extab*) }
-  /* These sections are generated by the Sun/Oracle C++ compiler.  */
-  .exception_ranges   : ONLY_IF_RO { *(.exception_ranges*) }
-  /* Adjust the address for the data segment.  We want to adjust up to
-     the same address within the page on the next page up.  */
-  . = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
-  /* Exception handling  */
-  .eh_frame       : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) }
-  .gnu_extab      : ONLY_IF_RW { *(.gnu_extab) }
-  .gcc_except_table   : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }
-  .exception_ranges   : ONLY_IF_RW { *(.exception_ranges*) }
-  /* Thread Local Storage sections  */
-  .tdata	  :
-   {
-     PROVIDE_HIDDEN (__tdata_start = .);
-     *(.tdata .tdata.* .gnu.linkonce.td.*)
-     PROVIDE_HIDDEN (__tdata_end = .);
-   }
-   PROVIDE (__tdata_size = SIZEOF (.tdata));
-  .tbss		  : 
-  {
-    PROVIDE_HIDDEN (__tbss_start = .);
-    PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start)); 
-    *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
-    PROVIDE_HIDDEN (__tbss_end = .);
-  }
-  PROVIDE (__tbss_size = SIZEOF (.tbss));
-  PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
-  .preinit_array    :
-  {
-    PROVIDE_HIDDEN (__preinit_array_start = .);
-    KEEP (*(.preinit_array))
-    PROVIDE_HIDDEN (__preinit_array_end = .);
-  }
-  .init_array    :
-  {
-    PROVIDE_HIDDEN (__init_array_start = .);
-    KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
-    KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
-    PROVIDE_HIDDEN (__init_array_end = .);
-  }
-  .fini_array    :
-  {
-    PROVIDE_HIDDEN (__fini_array_start = .);
-    KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
-    KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
-    PROVIDE_HIDDEN (__fini_array_end = .);
-  }
-  .ctors          :
-  {
-    /* gcc uses crtbegin.o to find the start of
-       the constructors, so we make sure it is
-       first.  Because this is a wildcard, it
-       doesn't matter if the user does not
-       actually link against crtbegin.o; the
-       linker won't look for a file to match a
-       wildcard.  The wildcard also means that it
-       doesn't matter which directory crtbegin.o
-       is in.  */
-    KEEP (*crtbegin.o(.ctors))
-    KEEP (*crtbegin?.o(.ctors))
-    /* We don't want to include the .ctor section from
-       the crtend.o file until after the sorted ctors.
-       The .ctor section from the crtend file contains the
-       end of ctors marker and it must be last */
-    KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
-    KEEP (*(SORT(.ctors.*)))
-    KEEP (*(.ctors))
-  }
-  .dtors          :
-  {
-    KEEP (*crtbegin.o(.dtors))
-    KEEP (*crtbegin?.o(.dtors))
-    KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
-    KEEP (*(SORT(.dtors.*)))
-    KEEP (*(.dtors))
-  }
-  .jcr            : { KEEP (*(.jcr)) }
-  .data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) }
-  .dynamic        : { *(.dynamic) }
-  . = DATA_SEGMENT_RELRO_END (0, .);
-  .data           :
-  {
-    __DATA_BEGIN__ = .;
-    *(.data .data.* .gnu.linkonce.d.*)
-    SORT(CONSTRUCTORS)
-  }
-  .data1          : { *(.data1) }
-  .got            : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) }
-  /* We want the small data sections together, so single-instruction offsets
-     can access them all, and initialized data all before uninitialized, so
-     we can shorten the on-disk segment size.  */
-  .sdata          :
-  {
-    __SDATA_BEGIN__ = .;
-    *(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*)
-    *(.sdata .sdata.* .gnu.linkonce.s.*)
-  }
-  _edata = .; PROVIDE (edata = .);
-  . = .;
-  __bss_start = .;
-  .sbss           :
-  {
-    *(.dynsbss)
-    *(.sbss .sbss.* .gnu.linkonce.sb.*)
-    *(.scommon)
-  }
-  .bss            :
-  {
-   *(.dynbss)
-   *(.bss .bss.* .gnu.linkonce.b.*)
-   *(COMMON)
-   /* Align here to ensure that the .bss section occupies space up to
-      _end.  Align after .bss to ensure correct alignment even if the
-      .bss section disappears because there are no input sections.
-      FIXME: Why do we need it? When there is no .bss section, we do not
-      pad the .data section.  */
-   . = ALIGN(. != 0 ? 64 / 8 : 1);
-  }
-  . = ALIGN(64 / 8);
-  . = SEGMENT_START("ldata-segment", .);
-  . = ALIGN(64 / 8);
-  __BSS_END__ = .;
-    __global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
-                MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
-  _end = .; PROVIDE (end = .);
-  . = DATA_SEGMENT_END (.);  
-  
-  /* .stack_dummy section doesn't contains any symbols. It is only
-	 * used for linker to calculate size of stack sections, and assign
-	 * values to stack symbols later */
-	.stack_dummy (COPY):
-	{
-		KEEP(*(.stack*))
-	}
-  __stack_usage = SIZEOF(.stack_dummy);
-  PROVIDE(__stack_top  = 0xFF000000);
-  PROVIDE(__stack_size = 0x400);
-	PROVIDE(__stack = __stack_top);
-	ASSERT(__stack_usage <= __stack_size, "stack overflow")
-
-  /* Stabs debugging sections.  */
-  .stab          0 : { *(.stab) }
-  .stabstr       0 : { *(.stabstr) }
-  .stab.excl     0 : { *(.stab.excl) }
-  .stab.exclstr  0 : { *(.stab.exclstr) }
-  .stab.index    0 : { *(.stab.index) }
-  .stab.indexstr 0 : { *(.stab.indexstr) }
-  .comment       0 : { *(.comment) }
-  .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) }
-  /* DWARF debug sections.
-     Symbols in the DWARF debugging sections are relative to the beginning
-     of the section so we begin them at 0.  */
-  /* DWARF 1 */
-  .debug          0 : { *(.debug) }
-  .line           0 : { *(.line) }
-  /* GNU DWARF 1 extensions */
-  .debug_srcinfo  0 : { *(.debug_srcinfo) }
-  .debug_sfnames  0 : { *(.debug_sfnames) }
-  /* DWARF 1.1 and DWARF 2 */
-  .debug_aranges  0 : { *(.debug_aranges) }
-  .debug_pubnames 0 : { *(.debug_pubnames) }
-  /* DWARF 2 */
-  .debug_info     0 : { *(.debug_info .gnu.linkonce.wi.*) }
-  .debug_abbrev   0 : { *(.debug_abbrev) }
-  .debug_line     0 : { *(.debug_line .debug_line.* .debug_line_end) }
-  .debug_frame    0 : { *(.debug_frame) }
-  .debug_str      0 : { *(.debug_str) }
-  .debug_loc      0 : { *(.debug_loc) }
-  .debug_macinfo  0 : { *(.debug_macinfo) }
-  /* SGI/MIPS DWARF 2 extensions */
-  .debug_weaknames 0 : { *(.debug_weaknames) }
-  .debug_funcnames 0 : { *(.debug_funcnames) }
-  .debug_typenames 0 : { *(.debug_typenames) }
-  .debug_varnames  0 : { *(.debug_varnames) }
-  /* DWARF 3 */
-  .debug_pubtypes 0 : { *(.debug_pubtypes) }
-  .debug_ranges   0 : { *(.debug_ranges) }
-  /* DWARF Extension.  */
-  .debug_macro    0 : { *(.debug_macro) }
-  .debug_addr     0 : { *(.debug_addr) }
-  .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
-  /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
-
-}
--- a/runtime/opae/Makefile
+++ b/runtime/opae/Makefile
@@ -0,0 +1,83 @@
+# Builds libvortex.so from vortex.cpp, driver.cpp and ../common/utils.cpp.
+# TARGET picks the backend define: opaesim (default) -> OPAESIM, asesim -> ASESIM, anything else -> FPGA.
+
+XLEN ?= 32
+
+TARGET ?= opaesim
+
+OPAESIM_DIR = ../../sim/opaesim
+
+RTL_DIR=../../hw/rtl
+
+SYN_DIR=../../hw/syn/altera/opae
+
+SCRIPT_DIR=../../hw/scripts
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
+CXXFLAGS += -I. -I../include -I../common/ -I../../hw
+CXXFLAGS += -DXLEN_$(XLEN)
+
+ifeq ($(TARGET), opaesim)
+	CXXFLAGS += -I$(OPAESIM_DIR)
+else
+	CXXFLAGS += -I$(SYN_DIR)
+endif
+
+# Position independent code
+CXXFLAGS += -fPIC
+
+# Add external configuration
+CXXFLAGS += $(CONFIGS)
+
+# Dump perf stats
+CXXFLAGS += -DDUMP_PERF_STATS
+
+LDFLAGS += -shared -luuid -ldl -pthread
+
+SRCS = vortex.cpp driver.cpp ../common/utils.cpp
+
+# set up target types
+ifeq ($(TARGET), opaesim)
+	CXXFLAGS += -DOPAESIM
+	OPAESIM = libopae-c-sim.so
+else
+	ifeq ($(TARGET), asesim)
+		CXXFLAGS += -DASESIM
+	else
+		CXXFLAGS += -DFPGA
+	endif
+endif
+
+# Debugging
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else
+	CXXFLAGS += -O2 -DNDEBUG
+endif
+
+# Enable scope logic analyzer
+ifdef SCOPE
+	CXXFLAGS += -DSCOPE
+	SRCS += ../common/scope.cpp
+endif
+
+# Enable perf counters
+ifdef PERF
+	CXXFLAGS += -DPERF_ENABLE
+endif
+
+PROJECT = libvortex.so
+
+all: $(PROJECT)
+
+libopae-c-sim.so:
+	DESTDIR=../../runtime/opae $(MAKE) -C $(OPAESIM_DIR) ../../runtime/opae/libopae-c-sim.so
+
+$(PROJECT): $(SRCS) $(OPAESIM)
+	$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT)
+
+clean:
+	DESTDIR=../../runtime/opae $(MAKE) -C $(OPAESIM_DIR) clean
+	rm -rf $(PROJECT)
+
+.PHONY: all clean
--- a/runtime/opae/driver.cpp
+++ b/runtime/opae/driver.cpp
@@ -0,0 +1,118 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "driver.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/limits.h>
+#include <dlfcn.h>
+#include <string>
+#include <vector>
+#include <sstream>
+
+#ifdef OPAESIM
+#define DEFAULT_OPAE_DRV_PATHS "libopae-c-sim.so"
+#elif ASESIM
+#define DEFAULT_OPAE_DRV_PATHS "libopae-c-ase.so"
+#else
+#define DEFAULT_OPAE_DRV_PATHS "libopae-c.so"
+#endif
+
+// Resolves the OPAE entry point `func` with dlsym() and stores it in
+// opae_drv_funcs. If the symbol is missing, the library is unloaded,
+// dl_handle is cleared so a later drv_close() cannot close it a second
+// time, and the enclosing function returns -1.
+#define SET_API(func) \
+    do { \
+        opae_drv_funcs->func = (pfn_##func)dlsym(dl_handle, #func); \
+        if (opae_drv_funcs->func == nullptr) { \
+            printf("dlsym failed: %s\n", dlerror()); \
+            dlclose(dl_handle); \
+            dl_handle = nullptr; \
+            return -1; \
+        } \
+    } while (0)
+
+// Handle of the driver library loaded by drv_init(); nullptr when no
+// library is loaded.
+void* dl_handle = nullptr;
+
+// Loads the OPAE driver library and fills opae_drv_funcs with its entry
+// points. Candidate library names come from the comma-separated
+// OPAE_DRV_PATHS environment variable (DEFAULT_OPAE_DRV_PATHS when it is
+// unset or empty); the first one dlopen() can load is used.
+// Returns 0 on success, or -1 if opae_drv_funcs is null, no library could
+// be loaded, or a required symbol is missing.
+int drv_init(opae_drv_api_t* opae_drv_funcs) {
+    if (opae_drv_funcs == nullptr)
+        return -1;
+
+    const char* api_path_s = getenv("OPAE_DRV_PATHS");
+    if (api_path_s == nullptr || api_path_s[0] == '\0') {
+        api_path_s = DEFAULT_OPAE_DRV_PATHS;
+    }
+
+    std::vector<std::string> api_paths;
+    {
+        std::stringstream ss(api_path_s);
+        while (ss.good()) {
+            std::string path;
+            getline(ss, path, ',');
+            // Skip empty entries (doubled or trailing comma): they name no library.
+            if (!path.empty())
+                api_paths.push_back(path);
+        }
+    }
+
+    for (auto& api_path : api_paths) {
+        dl_handle = dlopen(api_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
+        if (dl_handle)
+            break;
+    }
+    if (dl_handle == nullptr) {
+        // dlerror() returns null when no dlopen() was attempted at all.
+        const char* err = dlerror();
+        printf("dlopen failed: %s\n", err ? err : "no library path given");
+        return -1;
+    }
+
+    SET_API (fpgaGetProperties);
+    SET_API (fpgaPropertiesSetObjectType);
+    SET_API (fpgaPropertiesSetGUID);
+    SET_API (fpgaDestroyProperties);
+    SET_API (fpgaDestroyToken);
+    SET_API (fpgaPropertiesGetLocalMemorySize);
+    SET_API (fpgaEnumerate);
+    SET_API (fpgaOpen);
+    SET_API (fpgaClose);
+    SET_API (fpgaPrepareBuffer);
+    SET_API (fpgaReleaseBuffer);
+    SET_API (fpgaGetIOAddress);
+    SET_API (fpgaWriteMMIO64);
+    SET_API (fpgaReadMMIO64);
+    SET_API (fpgaErrStr);
+
+    return 0;
+}
+
+// Unloads the driver library loaded by drv_init(). Does nothing when no
+// library is loaded, so it is safe after a failed drv_init() or a
+// previous drv_close().
+void drv_close() {
+    if (dl_handle == nullptr)
+        return;
+    dlclose(dl_handle);
+    dl_handle = nullptr;
+}
--- a/runtime/opae/driver.h
+++ b/runtime/opae/driver.h
@@ -0,0 +1,61 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifndef OPAESIM
+#include <opae/fpga.h>
+#include <uuid/uuid.h>
+#else
+#include <fpga.h>
+#endif
+
+typedef fpga_result (*pfn_fpgaGetProperties)(fpga_token token, fpga_properties *prop); // each pfn_<name> is the type SET_API in driver.cpp casts the dlsym() result for symbol <name> to
+typedef fpga_result (*pfn_fpgaPropertiesSetObjectType)(fpga_properties prop, fpga_objtype objtype);
+typedef fpga_result (*pfn_fpgaPropertiesSetGUID)(fpga_properties prop, fpga_guid guid);
+typedef fpga_result (*pfn_fpgaDestroyProperties)(fpga_properties *prop);
+typedef fpga_result (*pfn_fpgaEnumerate)(const fpga_properties *filters, uint32_t num_filters, fpga_token *tokens, uint32_t max_tokens, uint32_t *num_matches);
+typedef fpga_result (*pfn_fpgaDestroyToken)(fpga_token *token);
+typedef fpga_result (*pfn_fpgaPropertiesGetLocalMemorySize)(fpga_properties prop, uint64_t *lms);
+
+typedef fpga_result (*pfn_fpgaOpen)(fpga_token token, fpga_handle *handle, int flags);
+typedef fpga_result (*pfn_fpgaClose)(fpga_handle handle);
+typedef fpga_result (*pfn_fpgaPrepareBuffer)(fpga_handle handle, uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
+typedef fpga_result (*pfn_fpgaReleaseBuffer)(fpga_handle handle, uint64_t wsid);
+typedef fpga_result (*pfn_fpgaGetIOAddress)(fpga_handle handle, uint64_t wsid, uint64_t *ioaddr);
+typedef fpga_result (*pfn_fpgaWriteMMIO64)(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t value);
+typedef fpga_result (*pfn_fpgaReadMMIO64)(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t *value);
+typedef const char *(*pfn_fpgaErrStr)(fpga_result e); // the only entry point here that returns a string instead of fpga_result
+
+struct opae_drv_api_t { // table of OPAE entry points; drv_init() fills every member via dlsym() or fails
+	pfn_fpgaGetProperties 	fpgaGetProperties;
+	pfn_fpgaPropertiesSetObjectType fpgaPropertiesSetObjectType;
+	pfn_fpgaPropertiesSetGUID fpgaPropertiesSetGUID;
+	pfn_fpgaDestroyProperties fpgaDestroyProperties;
+	pfn_fpgaEnumerate 		fpgaEnumerate;
+	pfn_fpgaDestroyToken 	fpgaDestroyToken;
+	pfn_fpgaPropertiesGetLocalMemorySize fpgaPropertiesGetLocalMemorySize;
+
+	pfn_fpgaOpen 			fpgaOpen;
+	pfn_fpgaClose 			fpgaClose;
+	pfn_fpgaPrepareBuffer 	fpgaPrepareBuffer;
+	pfn_fpgaReleaseBuffer 	fpgaReleaseBuffer;
+	pfn_fpgaGetIOAddress 	fpgaGetIOAddress;
+	pfn_fpgaWriteMMIO64  	fpgaWriteMMIO64;
+	pfn_fpgaReadMMIO64     	fpgaReadMMIO64;
+	pfn_fpgaErrStr        	fpgaErrStr;
+};
+
+int drv_init(opae_drv_api_t* opae_drv_funcs); // dlopen()s the OPAE library and fills the table; returns 0 on success, -1 on failure
+
+void drv_close(); // dlclose()s the library handle opened by drv_init()
--- a/runtime/opae/vortex.cpp
+++ b/runtime/opae/vortex.cpp
@@ -0,0 +1,610 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vortex.h>
+#include <utils.h>
+#include <malloc.h>
+#include "driver.h"
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <cstdlib>
+#include <cstring>
+#include <uuid/uuid.h>
+#include <unistd.h>
+#include <assert.h>
+#include <cmath>
+#include <sstream>
+#include <unordered_map>
+#include <algorithm>
+#include <memory>
+#include <list>
+
+#include <VX_config.h>
+#include <VX_types.h>
+#include <vortex_afu.h>
+
+#ifdef SCOPE
+#include "scope.h"
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Command codes written to the MMIO_CMD_TYPE register to trigger an operation.
+#define CMD_MEM_READ        AFU_IMAGE_CMD_MEM_READ
+#define CMD_MEM_WRITE       AFU_IMAGE_CMD_MEM_WRITE
+#define CMD_RUN             AFU_IMAGE_CMD_RUN
+#define CMD_DCR_WRITE       AFU_IMAGE_CMD_DCR_WRITE
+
+// MMIO register offsets passed to fpgaReadMMIO64/fpgaWriteMMIO64.
+// NOTE(review): the "* 4" presumably converts an AFU register index into a
+// byte offset - confirm against vortex_afu.h.
+#define MMIO_CMD_TYPE       (AFU_IMAGE_MMIO_CMD_TYPE * 4)
+#define MMIO_CMD_ARG0       (AFU_IMAGE_MMIO_CMD_ARG0 * 4)
+#define MMIO_CMD_ARG1       (AFU_IMAGE_MMIO_CMD_ARG1 * 4)
+#define MMIO_CMD_ARG2       (AFU_IMAGE_MMIO_CMD_ARG2 * 4)
+#define MMIO_STATUS         (AFU_IMAGE_MMIO_STATUS   * 4)
+#define MMIO_DEV_CAPS       (AFU_IMAGE_MMIO_DEV_CAPS * 4)
+#define MMIO_ISA_CAPS       (AFU_IMAGE_MMIO_ISA_CAPS * 4)
+#define MMIO_SCOPE_READ     (AFU_IMAGE_MMIO_SCOPE_READ * 4)
+#define MMIO_SCOPE_WRITE    (AFU_IMAGE_MMIO_SCOPE_WRITE * 4)
+
+// Number of low-order bits of the status register that hold the device state;
+// the bits above them carry console output (see vx_ready_wait).
+#define STATUS_STATE_BITS   8
+
+// Page size handed to the device memory allocators.
+#define RAM_PAGE_SIZE       4096
+
+// Declares `handle` (with `auto`) initialized from `_expr`; if the result is
+// nullptr, prints the stringified expression and then runs `_cleanup`.
+// The expansion is not wrapped in a block, so `handle` stays visible to the
+// code following the macro.
+#define CHECK_HANDLE(handle, _expr, _cleanup)   \
+    auto handle = _expr;                        \
+    if (handle == nullptr) {                    \
+        printf("[VXDRV] Error: '%s' returned NULL!\n", #_expr); \
+        _cleanup                                \
+    }
+
+// Evaluates `_expr` once; a result of 0 means success and nothing else runs.
+// Otherwise prints the stringified expression, the numeric code and the text
+// from api.fpgaErrStr(), then executes `_cleanup`.
+// Requires a variable named `api` (an opae_drv_api_t) in scope at the
+// expansion site. `_cleanup` is pasted as-is, so it must not contain commas
+// outside parentheses.
+#define CHECK_ERR(_expr, _cleanup)              \
+    do {                                        \
+        auto err = _expr;                       \
+        if (err == 0)                           \
+            break;                              \
+        printf("[VXDRV] Error: '%s' returned %d, %s!\n", #_expr, (int)err, api.fpgaErrStr(err)); \
+        _cleanup                                \
+    } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Per-device state of the OPAE backend: the driver entry-point table, the
+// opened FPGA handle, the device memory allocators, cached capability words
+// and a host/FPGA shared staging buffer used for memory transfers.
+class vx_device {
+public:
+    // Value-initializes every member so that no field is ever read while
+    // holding an indeterminate value (the allocators start out empty).
+    vx_device()
+        : api()
+        , fpga()
+        , dev_caps(0)
+        , isa_caps(0)
+        , global_mem_size(0)
+        , staging_wsid(0)
+        , staging_ioaddr(0)
+        , staging_ptr(nullptr)
+        , staging_size(0)
+    {}
+
+    // The FPGA handle and the staging buffer are released by vx_dev_close().
+    ~vx_device() {}
+
+    // Makes sure the staging buffer can hold at least `size` bytes (rounded up
+    // to CACHE_BLOCK_SIZE). An existing buffer that is too small is released
+    // and replaced by a larger one.
+    // Returns 0 on success, -1 if the buffer could not be (re)allocated; in
+    // that case the device is left without a staging buffer.
+    int ensure_staging(uint64_t size) {
+        uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+        if (staging_size >= asize)
+            return 0;
+
+        if (staging_size != 0) {
+            // release existing buffer
+            api.fpgaReleaseBuffer(fpga, staging_wsid);
+            reset_staging();
+        }
+
+        // allocate new buffer
+        CHECK_ERR(api.fpgaPrepareBuffer(fpga, asize, (void**)&staging_ptr, &staging_wsid, 0), {
+            reset_staging();
+            return -1;
+        });
+
+        // get the physical address of the buffer in the accelerator
+        CHECK_ERR(api.fpgaGetIOAddress(fpga, staging_wsid, &staging_ioaddr), {
+            api.fpgaReleaseBuffer(fpga, staging_wsid);
+            reset_staging();
+            return -1;
+        });
+
+        staging_size = asize;
+
+        return 0;
+    }
+
+    opae_drv_api_t api;                                  // driver entry points
+    fpga_handle fpga;                                    // opened accelerator
+    std::shared_ptr<vortex::MemoryAllocator> global_mem; // global memory allocator
+    std::shared_ptr<vortex::MemoryAllocator> local_mem;  // local memory allocator (may be empty)
+    DeviceConfig dcrs;                                   // shadow copy of written DCR values
+    uint64_t dev_caps;                                   // value read from MMIO_DEV_CAPS
+    uint64_t isa_caps;                                   // value read from MMIO_ISA_CAPS
+    uint64_t global_mem_size;                            // upper bound used by transfer checks
+    uint64_t staging_wsid;                               // staging buffer workspace id
+    uint64_t staging_ioaddr;                             // staging buffer address seen by the FPGA
+    uint8_t* staging_ptr;                                // staging buffer address seen by the host
+    uint64_t staging_size;                               // staging buffer size in bytes (0 = none)
+
+private:
+    // Forgets the staging buffer so that no stale pointer/id survives a
+    // release or a failed allocation.
+    void reset_staging() {
+        staging_wsid = 0;
+        staging_ioaddr = 0;
+        staging_ptr = nullptr;
+        staging_size = 0;
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Reports one device capability.
+//   hdevice : device returned by vx_dev_open()
+//   caps_id : one of the VX_CAPS_* identifiers
+//   value   : receives the capability value
+// Returns 0 on success, -1 if hdevice or value is null. An unknown caps_id
+// is treated as a programming error and aborts the process.
+//
+// Layout of the cached device caps word as decoded below:
+//   [7:0] version, [15:8] threads, [23:16] warps, [39:24] cores,
+//   [47:40] log2 of the local memory size.
+extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
+    if (nullptr == hdevice || nullptr == value)
+        return -1;
+
+    auto device = ((vx_device*)hdevice);
+
+    switch (caps_id) {
+    case VX_CAPS_VERSION:
+        *value = (device->dev_caps >> 0) & 0xff;
+        break;
+    case VX_CAPS_NUM_THREADS:
+        *value = (device->dev_caps >> 8) & 0xff;
+        break;
+    case VX_CAPS_NUM_WARPS:
+        *value = (device->dev_caps >> 16) & 0xff;
+        break;
+    case VX_CAPS_NUM_CORES:
+        *value = (device->dev_caps >> 24) & 0xffff;
+        break;
+    case VX_CAPS_CACHE_LINE_SIZE:
+        *value = CACHE_BLOCK_SIZE;
+        break;
+    case VX_CAPS_GLOBAL_MEM_SIZE:
+        *value = device->global_mem_size;
+        break;
+    case VX_CAPS_LOCAL_MEM_SIZE:
+        *value = 1ull << ((device->dev_caps >> 40) & 0xff);
+        break;
+    case VX_CAPS_KERNEL_BASE_ADDR:
+        // 64-bit address assembled from the two startup-address DCRs
+        *value = (uint64_t(device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) |
+                           device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
+        break;
+    case VX_CAPS_ISA_FLAGS:
+        *value = device->isa_caps;
+        break;
+    default:
+        fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id);
+        std::abort();
+        return -1;
+    }
+
+    return 0;
+}
+
+// Finds and opens the accelerator whose GUID is AFU_ACCEL_UUID, reads its
+// capability registers, creates the device memory allocators and initializes
+// the DCRs.
+//   hdevice : receives the new device handle on success
+// Returns 0 on success, non-zero on failure. On failure every resource
+// acquired so far (properties, token, FPGA handle, device object) is released
+// and *hdevice is left untouched.
+// NOTE(review): drv_close() is not called on failure paths because its
+// semantics with respect to other open devices are not visible here.
+extern int vx_dev_open(vx_device_h* hdevice) {
+    if (nullptr == hdevice)
+        return -1;
+
+    fpga_handle accel_handle;
+    fpga_token accel_token;
+    fpga_properties filter;
+    fpga_guid guid;
+
+    uint32_t num_matches = 0;
+    uint64_t global_mem_size = 0;
+
+    opae_drv_api_t api;
+    memset(&api, 0, sizeof(opae_drv_api_t));
+    if (drv_init(&api) != 0) {
+        return -1;
+    }
+
+    // Set up a filter that will search for an accelerator
+    CHECK_ERR(api.fpgaGetProperties(nullptr, &filter), {
+        return -1;
+    });
+
+    CHECK_ERR(api.fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR), {
+        api.fpgaDestroyProperties(&filter);
+        return -1;
+    });
+
+    // Add the desired UUID to the filter
+    std::string s_uuid(AFU_ACCEL_UUID);
+    std::replace(s_uuid.begin(), s_uuid.end(), '_', '-');
+    if (uuid_parse(s_uuid.c_str(), guid) != 0) {
+        fprintf(stderr, "[VXDRV] Error: invalid accelerator UUID %s!\n", AFU_ACCEL_UUID);
+        api.fpgaDestroyProperties(&filter);
+        return -1;
+    }
+    CHECK_ERR(api.fpgaPropertiesSetGUID(filter, guid), {
+        api.fpgaDestroyProperties(&filter);
+        return -1;
+    });
+
+    // Do the search across the available FPGA contexts
+    CHECK_ERR(api.fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches), {
+        api.fpgaDestroyProperties(&filter);
+        return -1;
+    });
+
+    // Retrieve the FPGA global memory size while the properties object is
+    // still alive (it must not be used once destroyed); fall back to the
+    // configured default when the query fails.
+    CHECK_ERR(api.fpgaPropertiesGetLocalMemorySize(filter, &global_mem_size), {
+        global_mem_size = GLOBAL_MEM_SIZE;
+    });
+
+    // Not needed anymore
+    CHECK_ERR(api.fpgaDestroyProperties(&filter), {
+        if (num_matches >= 1) {
+            api.fpgaDestroyToken(&accel_token);
+        }
+        return -1;
+    });
+
+    if (num_matches < 1) {
+        // no token was returned, so there is nothing to destroy
+        fprintf(stderr, "[VXDRV] Error: accelerator %s not found!\n", AFU_ACCEL_UUID);
+        return -1;
+    }
+
+    // Open accelerator
+    CHECK_ERR(api.fpgaOpen(accel_token, &accel_handle, 0), {
+        api.fpgaDestroyToken(&accel_token);
+        return -1;
+    });
+
+    // Done with token
+    CHECK_ERR(api.fpgaDestroyToken(&accel_token), {
+        api.fpgaClose(accel_handle);
+        return -1;
+    });
+
+    // allocate device object
+    vx_device* device = new vx_device();
+    if (nullptr == device) {
+        api.fpgaClose(accel_handle);
+        return -1;
+    }
+
+    device->api = api;
+    device->fpga = accel_handle;
+    device->global_mem_size = global_mem_size;
+
+    // common failure path once the device object exists: close the FPGA
+    // handle and free the device object
+    auto fail = [&](int code)->int {
+        api.fpgaClose(accel_handle);
+        delete device;
+        return code;
+    };
+
+    // Load ISA CAPS
+    CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_ISA_CAPS, &device->isa_caps), {
+        return fail(-1);
+    });
+
+    // Load device CAPS
+    CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_DEV_CAPS, &device->dev_caps), {
+        return fail(-1);
+    });
+
+    device->global_mem = std::make_shared<vortex::MemoryAllocator>(
+        ALLOC_BASE_ADDR, ALLOC_MAX_ADDR - ALLOC_BASE_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE);
+
+    // The reported size is (1 << n); a value of 1 means the size field is
+    // zero, i.e. there is no local memory to manage.
+    uint64_t local_mem_size = 0;
+    vx_dev_caps(device, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size);
+    if (local_mem_size > 1) {
+        device->local_mem = std::make_shared<vortex::MemoryAllocator>(
+            SMEM_BASE_ADDR, local_mem_size, RAM_PAGE_SIZE, 1);
+    }
+
+#ifdef SCOPE
+    {
+        scope_callback_t callback;
+        callback.registerWrite = [](vx_device_h hdevice, uint64_t value)->int {
+            auto device = (vx_device*)hdevice;
+            return device->api.fpgaWriteMMIO64(device->fpga, 0, MMIO_SCOPE_WRITE, value);
+        };
+        callback.registerRead = [](vx_device_h hdevice, uint64_t* value)->int {
+            auto device = (vx_device*)hdevice;
+            return device->api.fpgaReadMMIO64(device->fpga, 0, MMIO_SCOPE_READ, value);
+        };
+        int ret = vx_scope_start(&callback, device, 0, -1);
+        if (ret != 0) {
+            return fail(ret);
+        }
+    }
+#endif
+
+    int err = dcr_initialize(device);
+    if (err != 0) {
+        return fail(err);
+    }
+
+#ifdef DUMP_PERF_STATS
+    perf_add_device(device);
+#endif
+
+    *hdevice = device;
+
+    return 0;
+}
+
+// Shuts a device down: stops scope/perf tracking when those features are
+// compiled in, frees the staging buffer, closes the FPGA handle, destroys the
+// device object and finally closes the driver layer.
+// Returns 0 on success, -1 when hdevice is null.
+extern int vx_dev_close(vx_device_h hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+
+#ifdef SCOPE
+    vx_scope_stop(hdevice);
+#endif
+
+#ifdef DUMP_PERF_STATS
+    perf_remove_device(hdevice);
+#endif
+
+    // give the staging buffer back, if one was ever allocated
+    if (dev->staging_size != 0) {
+        dev->api.fpgaReleaseBuffer(dev->fpga, dev->staging_wsid);
+        dev->staging_size = 0;
+    }
+
+    // close the accelerator, then drop the device object
+    dev->api.fpgaClose(dev->fpga);
+    delete dev;
+
+    drv_close();
+
+    return 0;
+}
+
+// Allocates `size` bytes of device memory of the requested type.
+//   type     : VX_MEM_TYPE_GLOBAL or VX_MEM_TYPE_LOCAL
+//   dev_addr : receives the device address of the allocation
+// Returns the allocator's result, or -1 for a null/zero argument, an unknown
+// type, or a memory type for which the device has no allocator.
+extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
+    if (nullptr == hdevice
+     || nullptr == dev_addr
+     || 0 == size)
+        return -1;
+
+    auto device = ((vx_device*)hdevice);
+
+    // pick the allocator that serves the requested memory type
+    vortex::MemoryAllocator* allocator = nullptr;
+    if (type == VX_MEM_TYPE_GLOBAL) {
+        allocator = device->global_mem.get();
+    } else if (type == VX_MEM_TYPE_LOCAL) {
+        allocator = device->local_mem.get();
+    }
+
+    // unknown type, or the allocator was never created for this device
+    if (nullptr == allocator)
+        return -1;
+
+    return allocator->allocate(size, dev_addr);
+}
+
+// Releases a device allocation. Addresses at or above SMEM_BASE_ADDR are
+// handed to the local allocator, all others to the global allocator.
+// Returns the allocator's result, 0 for a zero address (nothing to free), or
+// -1 when hdevice is null or the matching allocator does not exist.
+extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
+    if (nullptr == hdevice)
+        return -1;
+
+    if (0 == dev_addr)
+        return 0;
+
+    auto device = ((vx_device*)hdevice);
+    auto& allocator = (dev_addr >= SMEM_BASE_ADDR) ? device->local_mem
+                                                   : device->global_mem;
+    // the allocator may be missing (e.g. a device without local memory)
+    if (!allocator)
+        return -1;
+
+    return allocator->release(dev_addr);
+}
+
+// Reports the free and allocated byte counts of one device memory type.
+//   type     : VX_MEM_TYPE_GLOBAL or VX_MEM_TYPE_LOCAL
+//   mem_free : optional, receives the free amount
+//   mem_used : optional, receives the allocated amount
+// Returns 0 on success, -1 for a null device, an unknown type, or a memory
+// type for which the device has no allocator.
+extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
+    if (nullptr == hdevice)
+        return -1;
+
+    auto device = ((vx_device*)hdevice);
+
+    vortex::MemoryAllocator* allocator = nullptr;
+    if (type == VX_MEM_TYPE_GLOBAL) {
+        allocator = device->global_mem.get();
+    } else if (type == VX_MEM_TYPE_LOCAL) {
+        allocator = device->local_mem.get();
+    }
+    if (nullptr == allocator)
+        return -1;
+
+    // each output is written only through its own (optional) pointer
+    if (mem_free)
+        *mem_free = allocator->free();
+    if (mem_used)
+        *mem_used = allocator->allocated();
+
+    return 0;
+}
+
+// Copies `size` bytes from host memory to device memory through the staging
+// buffer.
+//   dev_addr : destination device address, must be CACHE_BLOCK_SIZE aligned
+//   host_ptr : source buffer (may be null only when size is 0)
+// The transfer length is rounded up to a multiple of CACHE_BLOCK_SIZE and the
+// rounded range must lie inside the device's global memory.
+// Returns 0 on success, -1 on invalid arguments or on any device error.
+extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
+    if (nullptr == hdevice)
+        return -1;
+
+    if (nullptr == host_ptr && size != 0)
+        return -1;
+
+    auto device = (vx_device*)hdevice;
+    auto& api = device->api;
+
+    uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+
+    // rounding up must not wrap around
+    if (asize < size)
+        return -1;
+
+    // check alignment
+    if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
+        return -1;
+
+    // bound checking (written so that dev_addr + asize cannot overflow)
+    if (asize > device->global_mem_size
+     || dev_addr > device->global_mem_size - asize)
+        return -1;
+
+    // grow the staging buffer only once the request is known to be valid
+    if (device->ensure_staging(size) != 0)
+        return -1;
+
+    // ensure ready for new command
+    if (vx_ready_wait(hdevice, VX_MAX_TIMEOUT) != 0)
+        return -1;
+
+    // update staging buffer
+    if (size != 0) {
+        memcpy(device->staging_ptr, host_ptr, size);
+    }
+
+    // addresses and length are passed to the device in cache-block units
+    auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
+
+    CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), {
+        return -1;
+    });
+    CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), {
+        return -1;
+    });
+    CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), {
+        return -1;
+    });
+    CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE), {
+        return -1;
+    });
+
+    // Wait for the write operation to finish
+    if (vx_ready_wait(hdevice, VX_MAX_TIMEOUT) != 0)
+        return -1;
+
+    return 0;
+}
+
+// Copies `size` bytes from device memory to host memory through the staging
+// buffer.
+//   host_ptr : destination buffer (may be null only when size is 0)
+//   dev_addr : source device address, must be CACHE_BLOCK_SIZE aligned
+// The transfer length is rounded up to a multiple of CACHE_BLOCK_SIZE and the
+// rounded range must lie inside the device's global memory.
+// Returns 0 on success, -1 on invalid arguments or on any device error.
+extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
+    if (nullptr == hdevice)
+        return -1;
+
+    if (nullptr == host_ptr && size != 0)
+        return -1;
+
+    auto device = (vx_device*)hdevice;
+    auto& api = device->api;
+
+    uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+
+    // rounding up must not wrap around
+    if (asize < size)
+        return -1;
+
+    // check alignment
+    if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
+        return -1;
+
+    // bound checking (written so that dev_addr + asize cannot overflow)
+    if (asize > device->global_mem_size
+     || dev_addr > device->global_mem_size - asize)
+        return -1;
+
+    // grow the staging buffer only once the request is known to be valid
+    if (device->ensure_staging(size) != 0)
+        return -1;
+
+    // Ensure ready for new command
+    if (vx_ready_wait(hdevice, VX_MAX_TIMEOUT) != 0)
+        return -1;
+
+    // addresses and length are passed to the device in cache-block units
+    auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
+
+    CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), {
+        return -1;
+    });
+    CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), {
+        return -1;
+    });
+    CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), {
+        return -1;
+    });
+    CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ), {
+        return -1;
+    });
+
+    // wait for the read operation to finish
+    if (vx_ready_wait(hdevice, VX_MAX_TIMEOUT) != 0)
+        return -1;
+
+    // read staging buffer
+    if (size != 0) {
+        memcpy(host_ptr, device->staging_ptr, size);
+    }
+
+    return 0;
+}
+
+// Starts execution on the device: waits until the device accepts a new
+// command, then writes CMD_RUN to the command register.
+// Returns 0 on success, -1 on a null handle or a device error.
+extern int vx_start(vx_device_h hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+    auto& api = dev->api; // CHECK_ERR refers to 'api'
+
+    // the device must be ready before a new command is issued
+    int ready = vx_ready_wait(hdevice, VX_MAX_TIMEOUT);
+    if (ready != 0) {
+        return -1;
+    }
+
+    // kick off the run
+    CHECK_ERR(api.fpgaWriteMMIO64(dev->fpga, 0, MMIO_CMD_TYPE, CMD_RUN), {
+        return -1;
+    });
+
+    return 0;
+}
+
+// Polls the status register once per millisecond until the device state is 0
+// or `timeout` (decremented by the polling interval in milliseconds) reaches
+// 0. Console characters found in the status word are collected per thread id
+// and printed line by line while waiting.
+// Returns 0 when polling ends (a time-out is only reported with a message),
+// -1 on a null handle or when a status read fails.
+extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+    auto& api = dev->api; // CHECK_ERR refers to 'api'
+
+    // pending console text, one buffer per thread id
+    std::unordered_map<uint32_t, std::stringstream> line_bufs;
+
+    // polling interval: 1 ms
+    struct timespec poll_interval;
+    poll_interval.tv_sec = 0;
+    poll_interval.tv_nsec = 1000000;
+
+    // same interval expressed in milliseconds
+    uint64_t poll_interval_ms = (poll_interval.tv_sec * 1000) + (poll_interval.tv_nsec / 1000000);
+
+    while (true) {
+        uint64_t status;
+        CHECK_ERR(api.fpgaReadMMIO64(dev->fpga, 0, MMIO_STATUS, &status), {
+            return -1;
+        });
+
+        // drain console data: bit 0 flags a valid character, bits [8:1] hold
+        // the character and bits [16:9] the thread id
+        uint32_t console = status >> STATUS_STATE_BITS;
+        while (console & 0x1) {
+            char ch = (console >> 1) & 0xff;
+            uint32_t tid = (console >> 9) & 0xff;
+            auto& line = line_bufs[tid];
+            line << ch;
+            if (ch == '\n') {
+                std::cout << std::dec << "#" << tid << ": " << line.str() << std::flush;
+                line.str("");
+            }
+            CHECK_ERR(api.fpgaReadMMIO64(dev->fpga, 0, MMIO_STATUS, &status), {
+                return -1;
+            });
+            console = status >> STATUS_STATE_BITS;
+        }
+
+        uint32_t state = status & ((1 << STATUS_STATE_BITS)-1);
+
+        bool done = (0 == state);
+        if (done || 0 == timeout) {
+            // print whatever partial lines are still buffered
+            for (auto& entry : line_bufs) {
+                auto text = entry.second.str();
+                if (text.empty())
+                    continue;
+                std::cout << "#" << entry.first << ": " << text << std::endl;
+            }
+            if (!done) {
+                fprintf(stdout, "[VXDRV] ready-wait timed out: state=%d\n", state);
+            }
+            break;
+        }
+
+        nanosleep(&poll_interval, nullptr);
+        timeout -= poll_interval_ms;
+    }
+
+    return 0;
+}
+
+// Writes one device configuration register: waits for the device to accept a
+// command, sends the address/value pair with CMD_DCR_WRITE and records the
+// value in the host-side DCR shadow.
+// Returns 0 on success, -1 on a null handle or a device error.
+extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+    auto& api = dev->api; // CHECK_ERR refers to 'api'
+
+    // the device must be ready before a new command is issued
+    // (-1 converts to the largest uint64_t timeout)
+    int ready = vx_ready_wait(hdevice, -1);
+    if (ready != 0) {
+        return -1;
+    }
+
+    // send the register address, the value and finally the command
+    CHECK_ERR(api.fpgaWriteMMIO64(dev->fpga, 0, MMIO_CMD_ARG0, addr), {
+        return -1;
+    });
+    CHECK_ERR(api.fpgaWriteMMIO64(dev->fpga, 0, MMIO_CMD_ARG1, value), {
+        return -1;
+    });
+    CHECK_ERR(api.fpgaWriteMMIO64(dev->fpga, 0, MMIO_CMD_TYPE, CMD_DCR_WRITE), {
+        return -1;
+    });
+
+    // remember what was written
+    dev->dcrs.write(addr, value);
+
+    return 0;
+}
--- a/runtime/rtlsim/.gitignore
+++ b/runtime/rtlsim/.gitignore
@@ -0,0 +1,2 @@
+obj_dir
+*.so
--- a/runtime/rtlsim/Makefile
+++ b/runtime/rtlsim/Makefile
@@ -0,0 +1,45 @@
+XLEN ?= 32
+
+RTLSIM_DIR = ../../sim/rtlsim
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
+CXXFLAGS += -I../include -I../common -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common
+CXXFLAGS += -DXLEN_$(XLEN)
+
+# Position independent code
+CXXFLAGS += -fPIC
+
+# Add external configuration
+CXXFLAGS += $(CONFIGS)
+
+# Dump perf stats
+CXXFLAGS += -DDUMP_PERF_STATS
+
+LDFLAGS += -shared -pthread
+LDFLAGS += -L. -lrtlsim
+
+SRCS = vortex.cpp ../common/utils.cpp
+
+# Debugging
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else    
+	CXXFLAGS += -O2 -DNDEBUG
+endif
+
+# Enable perf counters
+ifdef PERF
+	CXXFLAGS += -DPERF_ENABLE
+endif
+
+PROJECT = libvortex.so
+
+all: $(PROJECT)
+	
+$(PROJECT): $(SRCS)
+	DESTDIR=../../runtime/rtlsim $(MAKE) -C $(RTLSIM_DIR) ../../runtime/rtlsim/librtlsim.so
+	$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT)
+
+clean:
+	DESTDIR=../../runtime/rtlsim $(MAKE) -C $(RTLSIM_DIR) clean
+	rm -rf $(PROJECT) *.o
--- a/runtime/rtlsim/vortex.cpp
+++ b/runtime/rtlsim/vortex.cpp
@@ -0,0 +1,336 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <iostream>
+#include <future>
+#include <list>
+#include <chrono>
+
+#include <vortex.h>
+#include <malloc.h>
+#include <utils.h>
+#include <VX_config.h>
+#include <VX_types.h>
+
+#include <mem.h>
+#include <util.h>
+#include <processor.h>
+
+// Page size handed to the simulated RAM and to both device memory allocators.
+#define RAM_PAGE_SIZE 4096
+
+using namespace vortex;
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Device object of the RTL simulation backend. It owns the simulated RAM, the
+// processor model attached to it, the global and local device memory
+// allocators, a shadow copy of the DCR values and the future of the
+// asynchronous run started by start().
+class vx_device {
+public:
+    vx_device()
+        : ram_(RAM_PAGE_SIZE)
+        , global_mem_(
+            ALLOC_BASE_ADDR,
+            ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
+            RAM_PAGE_SIZE,
+            CACHE_BLOCK_SIZE)
+        , local_mem_(
+            SMEM_BASE_ADDR,
+            (1ull << SMEM_LOG_SIZE),
+            RAM_PAGE_SIZE,
+            1)
+    {
+        processor_.attach_ram(&ram_);
+    }
+
+    // Blocks until a run that is still in flight has finished.
+    ~vx_device() {
+        if (future_.valid()) {
+            future_.wait();
+        }
+    }
+
+    // Allocates `size` bytes from the allocator selected by `type`
+    // (VX_MEM_TYPE_GLOBAL or VX_MEM_TYPE_LOCAL). Returns the allocator's
+    // result, or -1 for an unknown type.
+    int mem_alloc(uint64_t size, int type, uint64_t* dev_addr) {
+        if (type == VX_MEM_TYPE_GLOBAL) {
+            return global_mem_.allocate(size, dev_addr);
+        } else if (type == VX_MEM_TYPE_LOCAL) {
+            return local_mem_.allocate(size, dev_addr);
+        }
+        return -1;
+    }
+
+    // Releases an allocation; addresses at or above SMEM_BASE_ADDR belong to
+    // the local allocator, all others to the global one.
+    int mem_free(uint64_t dev_addr) {
+        if (dev_addr >= SMEM_BASE_ADDR) {
+            return local_mem_.release(dev_addr);
+        } else {
+            return global_mem_.release(dev_addr);
+        }
+    }
+
+    // Reports free/allocated amounts of the selected memory type through the
+    // optional output pointers. Returns 0 on success, -1 for an unknown type.
+    int mem_info(int type, uint64_t* mem_free, uint64_t* mem_used) const {
+        const MemoryAllocator* allocator = nullptr;
+        if (type == VX_MEM_TYPE_GLOBAL) {
+            allocator = &global_mem_;
+        } else if (type == VX_MEM_TYPE_LOCAL) {
+            allocator = &local_mem_;
+        } else {
+            return -1;
+        }
+        // each output is written only through its own pointer
+        if (mem_free)
+            *mem_free = allocator->free();
+        if (mem_used)
+            *mem_used = allocator->allocated();
+        return 0;
+    }
+
+    // Writes `size` bytes from `src` into simulated RAM at `dest_addr`.
+    // Returns -1 if the cache-block-aligned range does not fit below
+    // GLOBAL_MEM_SIZE, 0 otherwise.
+    int upload(uint64_t dest_addr, const void* src, uint64_t size) {
+        uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+        if (!in_global_range(dest_addr, size, asize))
+            return -1;
+
+        /*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src));
+        for (int i = 0;  i < (asize / CACHE_BLOCK_SIZE); ++i) {
+            printf("\n0x%08lx=", dest_addr + i * CACHE_BLOCK_SIZE);
+            for (int j = 0;  j < CACHE_BLOCK_SIZE; ++j) {
+                printf("%02x", *((uint8_t*)src + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
+            }
+        }
+        printf("\n");*/
+
+        ram_.write((const uint8_t*)src, dest_addr, size);
+        return 0;
+    }
+
+    // Reads `size` bytes from simulated RAM at `src_addr` into `dest`.
+    // Returns -1 if the cache-block-aligned range does not fit below
+    // GLOBAL_MEM_SIZE, 0 otherwise.
+    int download(void* dest, uint64_t src_addr, uint64_t size) {
+        uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+        if (!in_global_range(src_addr, size, asize))
+            return -1;
+
+        ram_.read((uint8_t*)dest, src_addr, size);
+
+        /*printf("VXDRV: download %ld bytes to 0x%lx:", size, uintptr_t((uint8_t*)dest));
+        for (int i = 0;  i < (asize / CACHE_BLOCK_SIZE); ++i) {
+            printf("\n0x%08lx=", src_addr + i * CACHE_BLOCK_SIZE);
+            for (int j = 0;  j < CACHE_BLOCK_SIZE; ++j) {
+                printf("%02x", *((uint8_t*)dest + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
+            }
+        }
+        printf("\n");*/
+
+        return 0;
+    }
+
+    // Launches processor_.run() on a separate thread after making sure any
+    // previous run has completed.
+    int start() {
+        // ensure prior run completed
+        if (future_.valid()) {
+            future_.wait();
+        }
+        // start new run
+        future_ = std::async(std::launch::async, [&]{
+            processor_.run();
+        });
+        return 0;
+    }
+
+    // Waits for the current run in one-second steps; `timeout` is divided by
+    // 1000 to obtain the number of additional steps allowed. Always returns 0.
+    int wait(uint64_t timeout) {
+        if (!future_.valid())
+            return 0;
+        uint64_t timeout_sec = timeout / 1000;
+        std::chrono::seconds wait_time(1);
+        for (;;) {
+            // wait for 1 sec and check status
+            auto status = future_.wait_for(wait_time);
+            if (status == std::future_status::ready
+             || 0 == timeout_sec--)
+                break;
+        }
+        return 0;
+    }
+
+    // Writes a DCR to the processor model and records it in the shadow copy,
+    // after any run in flight has finished.
+    int write_dcr(uint32_t addr, uint32_t value) {
+        if (future_.valid()) {
+            future_.wait(); // ensure prior run completed
+        }
+        processor_.write_dcr(addr, value);
+        dcrs_.write(addr, value);
+        return 0;
+    }
+
+    // Returns the last value recorded for a DCR.
+    uint64_t read_dcr(uint32_t addr) const {
+        return dcrs_.read(addr);
+    }
+
+private:
+
+    // True when [addr, addr + asize) lies below GLOBAL_MEM_SIZE; written so
+    // that neither the rounding of `size` nor the addition can wrap around.
+    static bool in_global_range(uint64_t addr, uint64_t size, uint64_t asize) {
+        const uint64_t mem_size = GLOBAL_MEM_SIZE;
+        if (asize < size)
+            return false;
+        return (asize <= mem_size) && (addr <= mem_size - asize);
+    }
+
+    RAM                 ram_;
+    Processor           processor_;
+    MemoryAllocator     global_mem_;
+    MemoryAllocator     local_mem_;
+    DeviceConfig        dcrs_;
+    std::future<void>   future_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Reports one device capability of the RTL simulation backend. Most values
+// come from build-time configuration macros; the kernel base address is
+// assembled from the recorded DCR values.
+//   caps_id : one of the VX_CAPS_* identifiers
+//   value   : receives the capability value
+// Returns 0 on success, -1 if hdevice or value is null. An unknown caps_id
+// is treated as a programming error and aborts the process.
+extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
+    if (nullptr == hdevice || nullptr == value)
+        return -1;
+
+    vx_device *device = ((vx_device*)hdevice);
+
+    switch (caps_id) {
+    case VX_CAPS_VERSION:
+        *value = IMPLEMENTATION_ID;
+        break;
+    case VX_CAPS_NUM_THREADS:
+        *value = NUM_THREADS;
+        break;
+    case VX_CAPS_NUM_WARPS:
+        *value = NUM_WARPS;
+        break;
+    case VX_CAPS_NUM_CORES:
+        *value = NUM_CORES * NUM_CLUSTERS;
+        break;
+    case VX_CAPS_CACHE_LINE_SIZE:
+        *value = CACHE_BLOCK_SIZE;
+        break;
+    case VX_CAPS_GLOBAL_MEM_SIZE:
+        *value = GLOBAL_MEM_SIZE;
+        break;
+    case VX_CAPS_LOCAL_MEM_SIZE:
+        // same size the device's local allocator is created with
+        *value = (1ull << SMEM_LOG_SIZE);
+        break;
+    case VX_CAPS_KERNEL_BASE_ADDR:
+        *value = (uint64_t(device->read_dcr(VX_DCR_BASE_STARTUP_ADDR1)) << 32)
+                         | device->read_dcr(VX_DCR_BASE_STARTUP_ADDR0);
+        break;
+    case VX_CAPS_ISA_FLAGS:
+        *value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
+        break;
+    default:
+        std::cout << "invalid caps id: " << caps_id << std::endl;
+        std::abort();
+        return -1;
+    }
+
+    return 0;
+}
+
+// Creates a simulated device, initializes its DCRs and hands it back through
+// *hdevice. Returns 0 on success; on failure the device is destroyed and a
+// non-zero code is returned.
+extern int vx_dev_open(vx_device_h* hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = new vx_device();
+    if (nullptr == dev) {
+        return -1;
+    }
+
+    // set up the device configuration registers
+    int status = dcr_initialize(dev);
+    if (0 != status) {
+        delete dev;
+        return status;
+    }
+
+#ifdef DUMP_PERF_STATS
+    perf_add_device(dev);
+#endif
+
+    *hdevice = dev;
+
+    return 0;
+}
+
+// Destroys a simulated device, first removing it from perf tracking when that
+// feature is compiled in. Returns 0 on success, -1 on a null handle.
+extern int vx_dev_close(vx_device_h hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+
+#ifdef DUMP_PERF_STATS
+    perf_remove_device(hdevice);
+#endif
+
+    delete dev;
+
+    return 0;
+}
+
+// Validates the arguments and forwards the allocation request to the device.
+// Returns -1 for a null handle, a null dev_addr or a zero size; otherwise the
+// device's result.
+extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
+    bool valid = (hdevice != nullptr) && (dev_addr != nullptr) && (size != 0);
+    if (!valid) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+    return dev->mem_alloc(size, type, dev_addr);
+}
+
+// Releases a device allocation. A zero address is accepted and ignored.
+// Returns -1 on a null handle, otherwise 0 or the device's result.
+extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    // nothing to release
+    if (dev_addr == 0) {
+        return 0;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+    return dev->mem_free(dev_addr);
+}
+
+// Forwards a memory usage query for the given memory type to the device.
+// Returns -1 on a null handle, otherwise the device's result.
+extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+    return dev->mem_info(type, mem_free, mem_used);
+}
+
+// Copies `size` bytes from host memory into device memory at `dev_addr`.
+// Returns -1 on a null handle, otherwise the result of the device upload.
+extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+    return dev->upload(dev_addr, host_ptr, size);
+}
+
+// Copies `size` bytes from device memory at `dev_addr` into host memory.
+// Returns -1 on a null handle, otherwise the result of the device download.
+extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+    return dev->download(host_ptr, dev_addr, size);
+}
+
+// Starts a run on the simulated device.
+// Returns -1 on a null handle, otherwise the device's result.
+extern int vx_start(vx_device_h hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+    return dev->start();
+}
+
+// Waits for the current run on the simulated device, bounded by `timeout`.
+// Returns -1 on a null handle, otherwise the device's result.
+extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+    return dev->wait(timeout);
+}
+
+// Writes one device configuration register once the device is ready.
+// Returns -1 on a null handle or when waiting fails, otherwise the device's
+// result.
+extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* dev = (vx_device*)hdevice;
+
+    // the device must be ready before a new command is issued
+    // (-1 converts to the largest uint64_t timeout)
+    int ready = vx_ready_wait(hdevice, -1);
+    if (ready != 0) {
+        return -1;
+    }
+
+    return dev->write_dcr(addr, value);
+}
--- a/runtime/simx/.gitignore
+++ b/runtime/simx/.gitignore
@@ -0,0 +1,2 @@
+obj_dir
+libvortex.so
--- a/runtime/simx/Makefile
+++ b/runtime/simx/Makefile
@@ -0,0 +1,34 @@
+XLEN ?= 32
+
+SIMX_DIR = ../../sim/simx
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
+CXXFLAGS += -fPIC -Wno-maybe-uninitialized
+CXXFLAGS += -I../include -I../common -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common
+CXXFLAGS += $(CONFIGS)
+CXXFLAGS += -DDUMP_PERF_STATS
+CXXFLAGS += -DXLEN_$(XLEN)
+
+LDFLAGS += -shared -pthread
+LDFLAGS += -L. -lsimx
+
+SRCS = vortex.cpp ../common/utils.cpp
+
+# Debugging
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else    
+	CXXFLAGS += -O2 -DNDEBUG
+endif
+
+PROJECT = libvortex.so
+
+all: $(PROJECT)
+
+$(PROJECT): $(SRCS)
+	DESTDIR=../../runtime/simx $(MAKE) -C $(SIMX_DIR) ../../runtime/simx/libsimx.so
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+clean:
+	DESTDIR=../../runtime/simx $(MAKE) -C $(SIMX_DIR) clean
+	rm -rf libsimx.so $(PROJECT) *.o
--- a/runtime/simx/vortex.cpp
+++ b/runtime/simx/vortex.cpp
@@ -0,0 +1,397 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <iostream>
+#include <future>
+#include <chrono>
+
+#include <vortex.h>
+#include <utils.h>
+#include <malloc.h>
+
+#include <VX_config.h>
+#include <VX_types.h>
+
+#include <util.h>
+
+#include <processor.h>
+#include <arch.h>
+#include <mem.h>
+#include <constants.h>
+
+// Debug trace helper. Without NDEBUG it printf()s the message to stdout with
+// a "[VXDRV] " prefix; with NDEBUG it expands to a no-op expression, so its
+// arguments are not evaluated. `format` must be a string literal because it
+// is concatenated with the prefix at compile time.
+#ifndef NDEBUG
+#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
+#else
+#define DBGPRINT(format, ...) ((void)0)
+#endif
+
+using namespace vortex;
+
+///////////////////////////////////////////////////////////////////////////////
+
+class vx_device;
+
+// Host-side buffer whose storage is obtained from aligned_malloc() with
+// CACHE_BLOCK_SIZE alignment and released with aligned_free().
+// NOTE(review): the class is implicitly copyable, and a copy would free the
+// same storage twice; confirm no caller copies it.
+class vx_buffer {
+public:
+    // Allocates aligned_size(size, CACHE_BLOCK_SIZE) bytes and fills them with
+    // the repeating bytes of 0xbaadf00d, least-significant byte first.
+    //   size:   requested size in bytes; reported unchanged by size().
+    //   device: stored as-is and returned by device().
+    vx_buffer(uint64_t size, vx_device* device)
+        : size_(size)
+        , device_(device) {
+        uint64_t aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE);
+        data_ = aligned_malloc(aligned_asize, CACHE_BLOCK_SIZE);
+        // Presumably aligned_malloc() returns nullptr on failure (the
+        // destructor already allows for that); never write through it.
+        if (data_ == nullptr)
+            return;
+        // set uninitialized data to "baadf00d"
+        // The index is 64-bit like the bound: a 32-bit counter would wrap
+        // and loop forever for sizes of 4 GiB or more.
+        uint8_t* bytes = (uint8_t*)data_;
+        for (uint64_t i = 0; i < aligned_asize; ++i) {
+            bytes[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff;
+        }
+    }
+
+    ~vx_buffer() {
+        if (data_) {
+            aligned_free(data_);
+        }
+    }
+
+    // Start of the storage, or nullptr if the allocation failed.
+    void* data() const {
+        return data_;
+    }
+
+    // Size in bytes as requested at construction (not the aligned size).
+    uint64_t size() const {
+        return size_;
+    }
+
+    // Device pointer given at construction.
+    vx_device* device() const {
+        return device_;
+    }
+
+private:
+    uint64_t   size_;
+    vx_device* device_;
+    void*      data_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Device object behind a vx_device_h handle in this backend. It holds the
+// RAM object attached to the processor, the processor itself, allocators for
+// the global and local address ranges, a record of DCR values written via
+// write_dcr(), and the future of the most recent asynchronous run.
+class vx_device {
+public:
+    vx_device()
+        : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, NUM_CLUSTERS)
+        , ram_(RAM_PAGE_SIZE)
+        , processor_(arch_)
+        , global_mem_(
+            ALLOC_BASE_ADDR,
+            ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
+            RAM_PAGE_SIZE,
+            CACHE_BLOCK_SIZE)
+        , local_mem_(
+            SMEM_BASE_ADDR,
+            (1ull << SMEM_LOG_SIZE),
+            RAM_PAGE_SIZE,
+            1)
+    {
+        // attach memory module
+        processor_.attach_ram(&ram_);
+    }
+
+    ~vx_device() {
+        // Let a run that is still in flight finish before members go away.
+        if (future_.valid()) {
+            future_.wait();
+        }
+    }
+
+    // Allocates `size` bytes from the allocator selected by `type` and stores
+    // the address in *dev_addr. Returns the allocator's result, or -1 for an
+    // unknown type.
+    int mem_alloc(uint64_t size, int type, uint64_t* dev_addr) {
+        if (type == VX_MEM_TYPE_GLOBAL) {
+            return global_mem_.allocate(size, dev_addr);
+        } else if (type == VX_MEM_TYPE_LOCAL) {
+            return local_mem_.allocate(size, dev_addr);
+        }
+        return -1;
+    }
+
+    // Releases an allocation; addresses at or above SMEM_BASE_ADDR are routed
+    // to the local allocator, all others to the global one.
+    int mem_free(uint64_t dev_addr) {
+        if (dev_addr >= SMEM_BASE_ADDR) {
+            return local_mem_.release(dev_addr);
+        } else {
+            return global_mem_.release(dev_addr);
+        }
+    }
+
+    // Reports free/allocated amounts for the allocator selected by `type`.
+    // Either output pointer may be null to skip that value.
+    // Returns 0 on success, -1 for an unknown type.
+    int mem_info(int type, uint64_t* mem_free, uint64_t* mem_used) const {
+        if (type == VX_MEM_TYPE_GLOBAL) {
+            if (mem_free)
+                *mem_free = global_mem_.free();
+            if (mem_used)
+                *mem_used = global_mem_.allocated();
+        } else if (type == VX_MEM_TYPE_LOCAL) {
+            if (mem_free)
+                *mem_free = local_mem_.free();
+            // The used amount goes to *mem_used; it previously overwrote
+            // *mem_free and left *mem_used unset.
+            if (mem_used)
+                *mem_used = local_mem_.allocated();
+        } else {
+            return -1;
+        }
+        return 0;
+    }
+
+    // Writes `size` bytes from `src` into RAM at `dest_addr`.
+    // Returns 0 on success, -1 if the cache-block-aligned range does not fit
+    // below GLOBAL_MEM_SIZE.
+    int upload(uint64_t dest_addr, const void* src, uint64_t size) {
+        if (!in_global_range(dest_addr, size))
+            return -1;
+
+        ram_.write((const uint8_t*)src, dest_addr, size);
+        return 0;
+    }
+
+    // Reads `size` bytes from RAM at `src_addr` into `dest`.
+    // Returns 0 on success, -1 if the cache-block-aligned range does not fit
+    // below GLOBAL_MEM_SIZE.
+    int download(void* dest, uint64_t src_addr, uint64_t size) {
+        if (!in_global_range(src_addr, size))
+            return -1;
+
+        ram_.read((uint8_t*)dest, src_addr, size);
+        return 0;
+    }
+
+    // Launches processor_.run(false) on another thread after waiting for any
+    // previous run to finish. Always returns 0.
+    int start() {
+        // ensure prior run completed
+        if (future_.valid()) {
+            future_.wait();
+        }
+
+        // start new run
+        future_ = std::async(std::launch::async, [&]{
+            processor_.run(false);
+        });
+
+        return 0;
+    }
+
+    // Waits for the current run, polling in one-second steps.
+    // `timeout` is divided by 1000 to get the number of extra one-second
+    // polls, so at least one second is always waited.
+    // Returns 0 when no run is pending or the run finished, and -1 when the
+    // timeout expired first (this used to return 0 as well, which made a
+    // timeout indistinguishable from completion).
+    int wait(uint64_t timeout) {
+        if (!future_.valid())
+            return 0;
+        uint64_t timeout_sec = timeout / 1000;
+        std::chrono::seconds wait_time(1);
+        for (;;) {
+            // wait for 1 sec and check status
+            auto status = future_.wait_for(wait_time);
+            if (status == std::future_status::ready)
+                return 0;
+            if (0 == timeout_sec--)
+                return -1;
+        }
+    }
+
+    // Waits for any pending run, then forwards the write to the processor and
+    // records it in dcrs_ so read_dcr() can return it. Always returns 0.
+    int write_dcr(uint32_t addr, uint32_t value) {
+        if (future_.valid()) {
+            future_.wait(); // ensure prior run completed
+        }
+        processor_.write_dcr(addr, value);
+        dcrs_.write(addr, value);
+        return 0;
+    }
+
+    // Returns the value recorded in dcrs_ for `addr`.
+    uint64_t read_dcr(uint32_t addr) const {
+        return dcrs_.read(addr);
+    }
+
+private:
+    // True when [addr, addr + aligned_size(size)) lies within GLOBAL_MEM_SIZE.
+    // Written with a subtraction so that a huge addr or size cannot wrap
+    // around and slip past the bound.
+    static bool in_global_range(uint64_t addr, uint64_t size) {
+        uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+        if (asize < size)
+            return false; // rounding up to the block size wrapped around
+        const uint64_t limit = GLOBAL_MEM_SIZE;
+        return asize <= limit && addr <= limit - asize;
+    }
+
+    Arch                arch_;
+    RAM                 ram_;
+    Processor           processor_;
+    MemoryAllocator     global_mem_;
+    MemoryAllocator     local_mem_;
+    DeviceConfig        dcrs_;
+    std::future<void>   future_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates a device, initializes its DCRs through dcr_initialize() and
+// returns the handle in *hdevice.
+// Returns 0 on success, -1 if hdevice is null or the device could not be
+// constructed, or the non-zero error code from dcr_initialize().
+extern int vx_dev_open(vx_device_h* hdevice) {
+    if (nullptr == hdevice)
+        return  -1;
+
+    // A plain new-expression reports failure by throwing, never by returning
+    // nullptr, so catch here to keep exceptions from escaping this
+    // int-returning API.
+    vx_device* device = nullptr;
+    try {
+        device = new vx_device();
+    } catch (...) {
+        return -1;
+    }
+
+    int err = dcr_initialize(device);
+    if (err != 0) {
+        delete device;
+        return err;
+    }
+
+#ifdef DUMP_PERF_STATS
+    perf_add_device(device);
+#endif
+
+    *hdevice = device;
+
+    DBGPRINT("device creation complete!\n");
+
+    return 0;
+}
+
+// Destroys the device behind `hdevice`; when DUMP_PERF_STATS is defined the
+// handle is first passed to perf_remove_device().
+// Returns -1 for a null handle, 0 otherwise.
+extern int vx_dev_close(vx_device_h hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+#ifdef DUMP_PERF_STATS
+    perf_remove_device(hdevice);
+#endif
+
+    delete static_cast<vx_device*>(hdevice);
+
+    DBGPRINT("device destroyed!\n");
+    return 0;
+}
+
+// Stores the capability selected by `caps_id` in *value.
+// Returns 0 on success and -1 if hdevice or value is null. An unknown
+// caps_id prints a message and aborts the process.
+extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
+    // Every case below writes through `value`, so reject a null pointer too.
+    if (nullptr == hdevice || nullptr == value)
+        return  -1;
+
+    vx_device *device = ((vx_device*)hdevice);
+
+    switch (caps_id) {
+    case VX_CAPS_VERSION:
+        *value = IMPLEMENTATION_ID;
+        break;
+    case VX_CAPS_NUM_THREADS:
+        *value = NUM_THREADS;
+        break;
+    case VX_CAPS_NUM_WARPS:
+        *value = NUM_WARPS;
+        break;
+    case VX_CAPS_NUM_CORES:
+        *value = NUM_CORES * NUM_CLUSTERS;
+        break;
+    case VX_CAPS_CACHE_LINE_SIZE:
+        *value = CACHE_BLOCK_SIZE;
+        break;
+    case VX_CAPS_GLOBAL_MEM_SIZE:
+        *value = GLOBAL_MEM_SIZE;
+        break;
+    case VX_CAPS_KERNEL_BASE_ADDR:
+        // The address is assembled from two DCRs: ADDR1 is the high half.
+        *value = (uint64_t(device->read_dcr(VX_DCR_BASE_STARTUP_ADDR1)) << 32)
+                         | device->read_dcr(VX_DCR_BASE_STARTUP_ADDR0);
+        break;
+    case VX_CAPS_ISA_FLAGS:
+        // Widen the XLEN field before shifting: if log2floor() yields a
+        // 32-bit signed value, 2 << 30 (the XLEN=64 case) would not fit and
+        // its sign extension would clobber the MISA_EXT bits above it.
+        *value = ((uint64_t(MISA_EXT))<<32) | (uint64_t(log2floor(XLEN)-4) << 30) | MISA_STD;
+        break;
+    default:
+        std::cout << "invalid caps id: " << caps_id << std::endl;
+        std::abort();
+        return -1;
+    }
+
+    return 0;
+}
+
+// Allocates `size` bytes of device memory of the given `type` and stores the
+// resulting address in *dev_addr.
+// Returns -1 for a null handle, a null dev_addr or a zero size; otherwise
+// the result of the device's mem_alloc().
+extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
+    const bool bad_args = (hdevice == nullptr)
+                       || (dev_addr == nullptr)
+                       || (size == 0);
+    if (bad_args) {
+        return -1;
+    }
+
+    auto dev = static_cast<vx_device*>(hdevice);
+    return dev->mem_alloc(size, type, dev_addr);
+}
+
+// Releases the device allocation at `dev_addr`.
+// Returns -1 for a null handle and 0 for a zero address (nothing to free);
+// otherwise the result of the device's mem_free().
+extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+    if (dev_addr == 0) {
+        return 0;
+    }
+    return static_cast<vx_device*>(hdevice)->mem_free(dev_addr);
+}
+
+// Queries free/used memory for the given memory `type` by forwarding to the
+// device's mem_info().
+// Returns -1 for a null handle, otherwise mem_info()'s result.
+extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+    return static_cast<vx_device*>(hdevice)->mem_info(type, mem_free, mem_used);
+}
+
+// Copies `size` bytes from `host_ptr` to device address `dev_addr` by
+// forwarding to the device's upload().
+// Returns -1 for a null handle, otherwise upload()'s result.
+extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
+    if (nullptr == hdevice)
+        return -1;
+
+    auto device = ((vx_device*)hdevice);
+
+    // The pointer is printed with %p and the unsigned size with %lu; passing
+    // them to %lx / %ld mismatched the printf argument types.
+    DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=%p, size=%lu\n", dev_addr, host_ptr, size);
+
+    return device->upload(dev_addr, host_ptr, size);
+}
+
+// Copies `size` bytes from device address `dev_addr` into `host_ptr` by
+// forwarding to the device's download().
+// Returns -1 for a null handle, otherwise download()'s result.
+extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
+    if (nullptr == hdevice)
+        return -1;
+
+    auto device = ((vx_device*)hdevice);
+
+    // The pointer is printed with %p and the unsigned size with %lu; passing
+    // them to %lx / %ld mismatched the printf argument types.
+    DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=%p, size=%lu\n", dev_addr, host_ptr, size);
+
+    return device->download(host_ptr, dev_addr, size);
+}
+
+// Starts a run on the device by forwarding to its start() method.
+// Returns -1 for a null handle, otherwise start()'s result.
+extern int vx_start(vx_device_h hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    DBGPRINT("START\n");
+
+    return static_cast<vx_device*>(hdevice)->start();
+}
+
+// Waits for the device's current run by forwarding `timeout` to its wait()
+// method.
+// Returns -1 for a null handle, otherwise wait()'s result.
+extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+    return static_cast<vx_device*>(hdevice)->wait(timeout);
+}
+
+// Writes `value` to the device configuration register at `addr`.
+// Returns -1 for a null handle or when the preceding ready-wait fails,
+// otherwise the result of the device's write_dcr().
+// NOTE(review): write_dcr() takes a 32-bit value, so the upper half of
+// `value` is dropped at the call; confirm that is intended.
+extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    // Make sure the device can accept a new command first; -1 converts to
+    // the largest uint64_t timeout.
+    const int ready = vx_ready_wait(hdevice, -1);
+    if (ready != 0) {
+        return -1;
+    }
+
+    DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%lx\n", addr, value);
+
+    return static_cast<vx_device*>(hdevice)->write_dcr(addr, value);
+}
--- a/runtime/src/tinyprintf.c
+++ b/runtime/src/tinyprintf.c
@@ -1,890 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-// \author (c) Marco Paland (info@paland.com)
-//             2014-2019, PALANDesign Hannover, Germany
-//
-// \license The MIT License (MIT)
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-//
-// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on
-//        embedded systems with a very limited resources. These routines are thread
-//        safe and reentrant!
-//        Use this instead of the bloated standard/newlib printf cause these use
-//        malloc for printf (and may not be thread safe).
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#include <stdbool.h>
-#include <stdint.h>
-#include "tinyprintf.h"
-#include "vx_print.h"
-
-
-// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...) to include the
-// printf_config.h header file
-// default: undefined
-#ifdef PRINTF_INCLUDE_CONFIG_H
-#include "printf_config.h"
-#endif
-
-
-// 'ntoa' conversion buffer size, this must be big enough to hold one converted
-// numeric number including padded zeros (dynamically created on stack)
-// default: 32 byte
-#ifndef PRINTF_NTOA_BUFFER_SIZE
-#define PRINTF_NTOA_BUFFER_SIZE    32U
-#endif
-
-// 'ftoa' conversion buffer size, this must be big enough to hold one converted
-// float number including padded zeros (dynamically created on stack)
-// default: 32 byte
-#ifndef PRINTF_FTOA_BUFFER_SIZE
-#define PRINTF_FTOA_BUFFER_SIZE    32U
-#endif
-
-// support for the floating point type (%f)
-// default: activated
-#ifndef PRINTF_DISABLE_SUPPORT_FLOAT
-#define PRINTF_SUPPORT_FLOAT
-#endif
-
-// support for exponential floating point notation (%e/%g)
-// default: activated
-#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL
-#define PRINTF_SUPPORT_EXPONENTIAL
-#endif
-
-// define the default floating point precision
-// default: 6 digits
-#ifndef PRINTF_DEFAULT_FLOAT_PRECISION
-#define PRINTF_DEFAULT_FLOAT_PRECISION  6U
-#endif
-
-// define the largest float suitable to print with %f
-// default: 1e9
-#ifndef PRINTF_MAX_FLOAT
-#define PRINTF_MAX_FLOAT  1e9
-#endif
-
-// support for the long long types (%llu or %p)
-// default: activated
-#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG
-#define PRINTF_SUPPORT_LONG_LONG
-#endif
-
-// support for the ptrdiff_t type (%t)
-// ptrdiff_t is normally defined in <stddef.h> as long or long long type
-// default: activated
-#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T
-#define PRINTF_SUPPORT_PTRDIFF_T
-#endif
-
-///////////////////////////////////////////////////////////////////////////////
-
-// internal flag definitions
-#define FLAGS_ZEROPAD   (1U <<  0U)
-#define FLAGS_LEFT      (1U <<  1U)
-#define FLAGS_PLUS      (1U <<  2U)
-#define FLAGS_SPACE     (1U <<  3U)
-#define FLAGS_HASH      (1U <<  4U)
-#define FLAGS_UPPERCASE (1U <<  5U)
-#define FLAGS_CHAR      (1U <<  6U)
-#define FLAGS_SHORT     (1U <<  7U)
-#define FLAGS_LONG      (1U <<  8U)
-#define FLAGS_LONG_LONG (1U <<  9U)
-#define FLAGS_PRECISION (1U << 10U)
-#define FLAGS_ADAPT_EXP (1U << 11U)
-
-
-// import float.h for DBL_MAX
-#if defined(PRINTF_SUPPORT_FLOAT)
-#include <float.h>
-#endif
-
-
-// output function type
-typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen);
-
-
-// wrapper (used as buffer) for output function type
-typedef struct {
-  void  (*fct)(char character, void* arg);
-  void* arg;
-} out_fct_wrap_type;
-
-
-// internal buffer output
-static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen)
-{
-  if (idx < maxlen) {
-    ((char*)buffer)[idx] = character;
-  }
-}
-
-
-// internal null output
-static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen)
-{
-  (void)character; (void)buffer; (void)idx; (void)maxlen;
-}
-
-
-// internal _putchar wrapper
-static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen)
-{
-  (void)buffer; (void)idx; (void)maxlen;
-  if (character) {
-    vx_putchar(character);
-  }
-}
-
-
-// internal output function wrapper
-static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen)
-{
-  (void)idx; (void)maxlen;
-  if (character) {
-    // buffer is the output fct pointer
-    ((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg);
-  }
-}
-
-
-// internal secure strlen
-// \return The length of the string (excluding the terminating 0) limited by 'maxsize'
-static inline unsigned int _strnlen_s(const char* str, size_t maxsize)
-{
-  const char* s;
-  for (s = str; *s && maxsize--; ++s);
-  return (unsigned int)(s - str);
-}
-
-
-// internal test if char is a digit (0-9)
-// \return true if char is a digit
-static inline bool _is_digit(char ch)
-{
-  return (ch >= '0') && (ch <= '9');
-}
-
-
-// internal ASCII string to unsigned int conversion
-static unsigned int _atoi(const char** str)
-{
-  unsigned int i = 0U;
-  while (_is_digit(**str)) {
-    i = i * 10U + (unsigned int)(*((*str)++) - '0');
-  }
-  return i;
-}
-
-
-// output the specified string in reverse, taking care of any zero-padding
-static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags)
-{
-  const size_t start_idx = idx;
-
-  // pad spaces up to given width
-  if (!(flags & FLAGS_LEFT) && !(flags & FLAGS_ZEROPAD)) {
-    for (size_t i = len; i < width; i++) {
-      out(' ', buffer, idx++, maxlen);
-    }
-  }
-
-  // reverse string
-  while (len) {
-    out(buf[--len], buffer, idx++, maxlen);
-  }
-
-  // append pad spaces up to given width
-  if (flags & FLAGS_LEFT) {
-    while (idx - start_idx < width) {
-      out(' ', buffer, idx++, maxlen);
-    }
-  }
-
-  return idx;
-}
-
-
-// internal itoa format
-static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags)
-{
-  // pad leading zeros
-  if (!(flags & FLAGS_LEFT)) {
-    if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
-      width--;
-    }
-    while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
-      buf[len++] = '0';
-    }
-    while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
-      buf[len++] = '0';
-    }
-  }
-
-  // handle hash
-  if (flags & FLAGS_HASH) {
-    if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) {
-      len--;
-      if (len && (base == 16U)) {
-        len--;
-      }
-    }
-    if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
-      buf[len++] = 'x';
-    }
-    else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
-      buf[len++] = 'X';
-    }
-    else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
-      buf[len++] = 'b';
-    }
-    if (len < PRINTF_NTOA_BUFFER_SIZE) {
-      buf[len++] = '0';
-    }
-  }
-
-  if (len < PRINTF_NTOA_BUFFER_SIZE) {
-    if (negative) {
-      buf[len++] = '-';
-    }
-    else if (flags & FLAGS_PLUS) {
-      buf[len++] = '+';  // ignore the space if the '+' exists
-    }
-    else if (flags & FLAGS_SPACE) {
-      buf[len++] = ' ';
-    }
-  }
-
-  return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
-}
-
-
-// internal itoa for 'long' type
-static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags)
-{
-  char buf[PRINTF_NTOA_BUFFER_SIZE];
-  size_t len = 0U;
-
-  // no hash for 0 values
-  if (!value) {
-    flags &= ~FLAGS_HASH;
-  }
-
-  // write if precision != 0 and value is != 0
-  if (!(flags & FLAGS_PRECISION) || value) {
-    do {
-      const char digit = (char)(value % base);
-      buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10;
-      value /= base;
-    } while (value && (len < PRINTF_NTOA_BUFFER_SIZE));
-  }
-
-  return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags);
-}
-
-
-// internal itoa for 'long long' type
-#if defined(PRINTF_SUPPORT_LONG_LONG)
-static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags)
-{
-  char buf[PRINTF_NTOA_BUFFER_SIZE];
-  size_t len = 0U;
-
-  // no hash for 0 values
-  if (!value) {
-    flags &= ~FLAGS_HASH;
-  }
-
-  // write if precision != 0 and value is != 0
-  if (!(flags & FLAGS_PRECISION) || value) {
-    do {
-      const char digit = (char)(value % base);
-      buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10;
-      value /= base;
-    } while (value && (len < PRINTF_NTOA_BUFFER_SIZE));
-  }
-
-  return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags);
-}
-#endif  // PRINTF_SUPPORT_LONG_LONG
-
-
-#if defined(PRINTF_SUPPORT_FLOAT)
-
-#if defined(PRINTF_SUPPORT_EXPONENTIAL)
-// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT
-static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags);
-#endif
-
-
-// internal ftoa for fixed decimal floating point
-static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
-{
-  char buf[PRINTF_FTOA_BUFFER_SIZE];
-  size_t len  = 0U;
-  double diff = 0.0;
-
-  // powers of 10
-  static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };
-
-  // test for special values
-  if (value != value)
-    return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags);
-  if (value < -DBL_MAX)
-    return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags);
-  if (value > DBL_MAX)
-    return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 4U : 3U, width, flags);
-
-  // test for very large values
-  // standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad
-  if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) {
-#if defined(PRINTF_SUPPORT_EXPONENTIAL)
-    return _etoa(out, buffer, idx, maxlen, value, prec, width, flags);
-#else
-    return 0U;
-#endif
-  }
-
-  // test for negative
-  bool negative = false;
-  if (value < 0) {
-    negative = true;
-    value = 0 - value;
-  }
-
-  // set default precision, if not set explicitly
-  if (!(flags & FLAGS_PRECISION)) {
-    prec = PRINTF_DEFAULT_FLOAT_PRECISION;
-  }
-  // limit precision to 9, cause a prec >= 10 can lead to overflow errors
-  while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) {
-    buf[len++] = '0';
-    prec--;
-  }
-
-  int whole = (int)value;
-  double tmp = (value - whole) * pow10[prec];
-  unsigned long frac = (unsigned long)tmp;
-  diff = tmp - frac;
-
-  if (diff > 0.5) {
-    ++frac;
-    // handle rollover, e.g. case 0.99 with prec 1 is 1.0
-    if (frac >= pow10[prec]) {
-      frac = 0;
-      ++whole;
-    }
-  }
-  else if (diff < 0.5) {
-  }
-  else if ((frac == 0U) || (frac & 1U)) {
-    // if halfway, round up if odd OR if last digit is 0
-    ++frac;
-  }
-
-  if (prec == 0U) {
-    diff = value - (double)whole;
-    if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) {
-      // exactly 0.5 and ODD, then round up
-      // 1.5 -> 2, but 2.5 -> 2
-      ++whole;
-    }
-  }
-  else {
-    unsigned int count = prec;
-    // now do fractional part, as an unsigned number
-    while (len < PRINTF_FTOA_BUFFER_SIZE) {
-      --count;
-      buf[len++] = (char)(48U + (frac % 10U));
-      if (!(frac /= 10U)) {
-        break;
-      }
-    }
-    // add extra 0s
-    while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) {
-      buf[len++] = '0';
-    }
-    if (len < PRINTF_FTOA_BUFFER_SIZE) {
-      // add decimal
-      buf[len++] = '.';
-    }
-  }
-
-  // do whole part, number is reversed
-  while (len < PRINTF_FTOA_BUFFER_SIZE) {
-    buf[len++] = (char)(48 + (whole % 10));
-    if (!(whole /= 10)) {
-      break;
-    }
-  }
-
-  // pad leading zeros
-  if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) {
-    if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
-      width--;
-    }
-    while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) {
-      buf[len++] = '0';
-    }
-  }
-
-  if (len < PRINTF_FTOA_BUFFER_SIZE) {
-    if (negative) {
-      buf[len++] = '-';
-    }
-    else if (flags & FLAGS_PLUS) {
-      buf[len++] = '+';  // ignore the space if the '+' exists
-    }
-    else if (flags & FLAGS_SPACE) {
-      buf[len++] = ' ';
-    }
-  }
-
-  return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
-}
-
-
-#if defined(PRINTF_SUPPORT_EXPONENTIAL)
-// internal ftoa variant for exponential floating-point type, contributed by Martijn Jasperse <m.jasperse@gmail.com>
-static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
-{
-  // check for NaN and special values
-  if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) {
-    return _ftoa(out, buffer, idx, maxlen, value, prec, width, flags);
-  }
-
-  // determine the sign
-  const bool negative = value < 0;
-  if (negative) {
-    value = -value;
-  }
-
-  // default precision
-  if (!(flags & FLAGS_PRECISION)) {
-    prec = PRINTF_DEFAULT_FLOAT_PRECISION;
-  }
-
-  // determine the decimal exponent
-  // based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c)
-  union {
-    uint64_t U;
-    double   F;
-  } conv;
-
-  conv.F = value;
-  int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023;           // effectively log2
-  conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U);  // drop the exponent so conv.F is now in [1,2)
-  // now approximate log10 from the log2 integer part and an expansion of ln around 1.5
-  int expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168);
-  // now we want to compute 10^expval but we want to be sure it won't overflow
-  exp2 = (int)(expval * 3.321928094887362 + 0.5);
-  const double z  = expval * 2.302585092994046 - exp2 * 0.6931471805599453;
-  const double z2 = z * z;
-  conv.U = (uint64_t)(exp2 + 1023) << 52U;
-  // compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex
-  conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14)))));
-  // correct for rounding errors
-  if (value < conv.F) {
-    expval--;
-    conv.F /= 10;
-  }
-
-  // the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters
-  unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U;
-
-  // in "%g" mode, "prec" is the number of *significant figures* not decimals
-  if (flags & FLAGS_ADAPT_EXP) {
-    // do we want to fall-back to "%f" mode?
-    if ((value >= 1e-4) && (value < 1e6)) {
-      if ((int)prec > expval) {
-        prec = (unsigned)((int)prec - expval - 1);
-      }
-      else {
-        prec = 0;
-      }
-      flags |= FLAGS_PRECISION;   // make sure _ftoa respects precision
-      // no characters in exponent
-      minwidth = 0U;
-      expval   = 0;
-    }
-    else {
-      // we use one sigfig for the whole part
-      if ((prec > 0) && (flags & FLAGS_PRECISION)) {
-        --prec;
-      }
-    }
-  }
-
-  // will everything fit?
-  unsigned int fwidth = width;
-  if (width > minwidth) {
-    // we didn't fall-back so subtract the characters required for the exponent
-    fwidth -= minwidth;
-  } else {
-    // not enough characters, so go back to default sizing
-    fwidth = 0U;
-  }
-  if ((flags & FLAGS_LEFT) && minwidth) {
-    // if we're padding on the right, DON'T pad the floating part
-    fwidth = 0U;
-  }
-
-  // rescale the float value
-  if (expval) {
-    value /= conv.F;
-  }
-
-  // output the floating part
-  const size_t start_idx = idx;
-  idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP);
-
-  // output the exponent part
-  if (minwidth) {
-    // output the exponential symbol
-    out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen);
-    // output the exponent value
-    idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS);
-    // might need to right-pad spaces
-    if (flags & FLAGS_LEFT) {
-      while (idx - start_idx < width) out(' ', buffer, idx++, maxlen);
-    }
-  }
-  return idx;
-}
-#endif  // PRINTF_SUPPORT_EXPONENTIAL
-#endif  // PRINTF_SUPPORT_FLOAT
-
-
-// internal vsnprintf
-static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va) {
-  unsigned int flags, width, precision, n;
-  size_t idx = 0U;
-
-  if (!buffer) {
-    // use null output function
-    out = _out_null;
-  }
-
-  while (*format)
-  {
-    // format specifier?  %[flags][width][.precision][length]
-    if (*format != '%') {
-      // no
-      out(*format, buffer, idx++, maxlen);
-      format++;
-      continue;
-    }
-    else {
-      // yes, evaluate it
-      format++;
-    }
-
-    // evaluate flags
-    flags = 0U;
-    do {
-      switch (*format) {
-        case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break;
-        case '-': flags |= FLAGS_LEFT;    format++; n = 1U; break;
-        case '+': flags |= FLAGS_PLUS;    format++; n = 1U; break;
-        case ' ': flags |= FLAGS_SPACE;   format++; n = 1U; break;
-        case '#': flags |= FLAGS_HASH;    format++; n = 1U; break;
-        default :                                   n = 0U; break;
-      }
-    } while (n);
-
-    // evaluate width field
-    width = 0U;
-    if (_is_digit(*format)) {
-      width = _atoi(&format);
-    }
-    else if (*format == '*') {
-      const int w = va_arg(va, int);
-      if (w < 0) {
-        flags |= FLAGS_LEFT;    // reverse padding
-        width = (unsigned int)-w;
-      }
-      else {
-        width = (unsigned int)w;
-      }
-      format++;
-    }
-
-    // evaluate precision field
-    precision = 0U;
-    if (*format == '.') {
-      flags |= FLAGS_PRECISION;
-      format++;
-      if (_is_digit(*format)) {
-        precision = _atoi(&format);
-      }
-      else if (*format == '*') {
-        const int prec = (int)va_arg(va, int);
-        precision = prec > 0 ? (unsigned int)prec : 0U;
-        format++;
-      }
-    }
-
-    // evaluate length field
-    switch (*format) {
-      case 'l' :
-        flags |= FLAGS_LONG;
-        format++;
-        if (*format == 'l') {
-          flags |= FLAGS_LONG_LONG;
-          format++;
-        }
-        break;
-      case 'h' :
-        flags |= FLAGS_SHORT;
-        format++;
-        if (*format == 'h') {
-          flags |= FLAGS_CHAR;
-          format++;
-        }
-        break;
-#if defined(PRINTF_SUPPORT_PTRDIFF_T)
-      case 't' :
-        flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
-        format++;
-        break;
-#endif
-      case 'j' :
-        flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
-        format++;
-        break;
-      case 'z' :
-        flags |= (sizeof(size_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
-        format++;
-        break;
-      default :
-        break;
-    }
-
-    // evaluate specifier
-    switch (*format) {
-      case 'd' :
-      case 'i' :
-      case 'u' :
-      case 'x' :
-      case 'X' :
-      case 'o' :
-      case 'b' : {
-        // set the base
-        unsigned int base;
-        if (*format == 'x' || *format == 'X') {
-          base = 16U;
-        }
-        else if (*format == 'o') {
-          base =  8U;
-        }
-        else if (*format == 'b') {
-          base =  2U;
-        }
-        else {
-          base = 10U;
-          flags &= ~FLAGS_HASH;   // no hash for dec format
-        }
-        // uppercase
-        if (*format == 'X') {
-          flags |= FLAGS_UPPERCASE;
-        }
-
-        // no plus or space flag for u, x, X, o, b
-        if ((*format != 'i') && (*format != 'd')) {
-          flags &= ~(FLAGS_PLUS | FLAGS_SPACE);
-        }
-
-        // ignore '0' flag when precision is given
-        if (flags & FLAGS_PRECISION) {
-          flags &= ~FLAGS_ZEROPAD;
-        }
-
-        // convert the integer
-        if ((*format == 'i') || (*format == 'd')) {
-          // signed
-          if (flags & FLAGS_LONG_LONG) {
-#if defined(PRINTF_SUPPORT_LONG_LONG)
-            const long long value = va_arg(va, long long);
-            idx = _ntoa_long_long(out, buffer, idx, maxlen, (unsigned long long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
-#endif
-          }
-          else if (flags & FLAGS_LONG) {
-            const long value = va_arg(va, long);
-            idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
-          }
-          else {
-            const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int);
-            idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned int)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
-          }
-        }
-        else {
-          // unsigned
-          if (flags & FLAGS_LONG_LONG) {
-#if defined(PRINTF_SUPPORT_LONG_LONG)
-            idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags);
-#endif
-          }
-          else if (flags & FLAGS_LONG) {
-            idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags);
-          }
-          else {
-            const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int);
-            idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags);
-          }
-        }
-        format++;
-        break;
-      }
-#if defined(PRINTF_SUPPORT_FLOAT)
-      case 'f' :
-      case 'F' :
-        if (*format == 'F') flags |= FLAGS_UPPERCASE;
-        idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
-        format++;
-        break;
-#if defined(PRINTF_SUPPORT_EXPONENTIAL)
-      case 'e':
-      case 'E':
-      case 'g':
-      case 'G':
-        if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP;
-        if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE;
-        idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
-        format++;
-        break;
-#endif  // PRINTF_SUPPORT_EXPONENTIAL
-#endif  // PRINTF_SUPPORT_FLOAT
-      case 'c' : {
-        unsigned int l = 1U;
-        // pre padding
-        if (!(flags & FLAGS_LEFT)) {
-          while (l++ < width) {
-            out(' ', buffer, idx++, maxlen);
-          }
-        }
-        // char output
-        out((char)va_arg(va, int), buffer, idx++, maxlen);
-        // post padding
-        if (flags & FLAGS_LEFT) {
-          while (l++ < width) {
-            out(' ', buffer, idx++, maxlen);
-          }
-        }
-        format++;
-        break;
-      }
-
-      case 's' : {
-        const char* p = va_arg(va, char*);
-        unsigned int l = _strnlen_s(p, precision ? precision : (size_t)-1);
-        // pre padding
-        if (flags & FLAGS_PRECISION) {
-          l = (l < precision ? l : precision);
-        }
-        if (!(flags & FLAGS_LEFT)) {
-          while (l++ < width) {
-            out(' ', buffer, idx++, maxlen);
-          }
-        }
-        // string output
-        while ((*p != 0) && (!(flags & FLAGS_PRECISION) || precision--)) {
-          out(*(p++), buffer, idx++, maxlen);
-        }
-        // post padding
-        if (flags & FLAGS_LEFT) {
-          while (l++ < width) {
-            out(' ', buffer, idx++, maxlen);
-          }
-        }
-        format++;
-        break;
-      }
-
-      case 'p' : {
-        width = sizeof(void*) * 2U;
-        flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE;
-#if defined(PRINTF_SUPPORT_LONG_LONG)
-        const bool is_ll = sizeof(uintptr_t) == sizeof(long long);
-        if (is_ll) {
-          idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags);
-        }
-        else {
-#endif
-          idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags);
-#if defined(PRINTF_SUPPORT_LONG_LONG)
-        }
-#endif
-        format++;
-        break;
-      }
-
-      case '%' :
-        out('%', buffer, idx++, maxlen);
-        format++;
-        break;
-
-      default :
-        out(*format, buffer, idx++, maxlen);
-        format++;
-        break;
-    }
-  }
-
-  // termination
-  out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen);
-
-  // return written chars without terminating \0
-  return (int)idx;
-}
-
-int tiny_printf(const char* format, ...) {
-  va_list va;
-  va_start(va, format);
-  char buffer[1];
-  const int ret = _vsnprintf(_out_char, buffer, (size_t)-1, format, va);
-  va_end(va);
-  return ret;
-}
-
-int tiny_sprintf(char* buffer, const char* format, ...) {
-  va_list va;
-  va_start(va, format);
-  const int ret = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, va);
-  va_end(va);
-  return ret;
-}
-
-int tiny_snprintf(char* buffer, size_t count, const char* format, ...) {
-  va_list va;
-  va_start(va, format);
-  const int ret = _vsnprintf(_out_buffer, buffer, count, format, va);
-  va_end(va);
-  return ret;
-}
-
-int tiny_vprintf(const char* format, va_list va) {
-  char buffer[1];
-  return _vsnprintf(_out_char, buffer, (size_t)-1, format, va);
-}
-
-int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va) {
-  return _vsnprintf(_out_buffer, buffer, count, format, va);
-}
--- a/runtime/src/tinyprintf.h
+++ b/runtime/src/tinyprintf.h
@@ -1,86 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////
-// \author (c) Marco Paland (info@paland.com)
-//             2014-2019, PALANDesign Hannover, Germany
-//
-// \license The MIT License (MIT)
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-// 
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-//
-// \brief Tiny printf, sprintf and snprintf implementation, optimized for speed on
-//        embedded systems with a very limited resources.
-//        Use this instead of bloated standard/newlib printf.
-//        These routines are thread safe and reentrant.
-//
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef _TINYPRINTF_H_
-#define _TINYPRINTF_H_
-
-#include <stdarg.h>
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Tiny printf implementation
- * You have to implement _putchar if you use printf()
- * To avoid conflicts with the regular printf() API it is overridden by macro defines
- * and internal underscore-appended functions like printf_() are used
- * \param format A string that specifies the format of the output
- * \return The number of characters that are written into the array, not counting the terminating null character
- */
-int tiny_printf(const char* format, ...);
-
-/**
- * Tiny sprintf implementation
- * Due to security reasons (buffer overflow) YOU SHOULD CONSIDER USING (V)SNPRINTF INSTEAD!
- * \param buffer A pointer to the buffer where to store the formatted string. MUST be big enough to store the output!
- * \param format A string that specifies the format of the output
- * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
- */
-int tiny_sprintf(char* buffer, const char* format, ...);
-
-/**
- * Tiny snprintf/vsnprintf implementation
- * \param buffer A pointer to the buffer where to store the formatted string
- * \param count The maximum number of characters to store in the buffer, including a terminating null character
- * \param format A string that specifies the format of the output
- * \param va A value identifying a variable arguments list
- * \return The number of characters that COULD have been written into the buffer, not counting the terminating
- *         null character. A value equal or larger than count indicates truncation. Only when the returned value
- *         is non-negative and less than count, the string has been completely written.
- */
-int tiny_snprintf(char* buffer, size_t count, const char* format, ...);
-int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va);
-
-/**
- * Tiny vprintf implementation
- * \param format A string that specifies the format of the output
- * \param va A value identifying a variable arguments list
- * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
- */
-int tiny_vprintf(const char* format, va_list va);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // _TINYPRINTF_H_
--- a/runtime/src/vx_perf.c
+++ b/runtime/src/vx_perf.c
@@ -1,27 +0,0 @@
-
-#include <VX_config.h>
-#include <vx_intrinsics.h>
-#include <stdint.h>
-
-#define DUMP_CSR_4(d, s) \
-    csr_mem[d + 0] = csr_read(s + 0); \
-    csr_mem[d + 1] = csr_read(s + 1); \
-    csr_mem[d + 2] = csr_read(s + 2); \
-    csr_mem[d + 3] = csr_read(s + 3);
-
-#define DUMP_CSR_32(d, s) \
-    DUMP_CSR_4(d + 0,  s + 0)  \
-    DUMP_CSR_4(d + 4,  s + 4)  \
-    DUMP_CSR_4(d + 8,  s + 8)  \
-    DUMP_CSR_4(d + 12, s + 12) \
-    DUMP_CSR_4(d + 16, s + 16) \
-    DUMP_CSR_4(d + 20, s + 20) \
-    DUMP_CSR_4(d + 24, s + 24) \
-    DUMP_CSR_4(d + 28, s + 28)
-
-void vx_perf_dump() {
-    int core_id = vx_core_id();
-    uint32_t* const csr_mem = (uint32_t*)(IO_CSR_ADDR + 64 * sizeof(uint32_t) * core_id);
-    DUMP_CSR_32(0,  CSR_MPM_BASE)
-    DUMP_CSR_32(32, CSR_MPM_BASE_H)
-}
--- a/runtime/src/vx_print.S
+++ b/runtime/src/vx_print.S
@@ -1,11 +0,0 @@
-#include <VX_config.h>
-
-.type vx_putchar, @function
-.global vx_putchar
-vx_putchar:
-    csrr t0, CSR_GTID
-    andi t0, t0, %lo(IO_COUT_SIZE-1)
-    li t1, IO_COUT_ADDR
-    add t0, t0, t1    
-    sb a0, 0(t0)
-    ret
--- a/runtime/src/vx_print.c
+++ b/runtime/src/vx_print.c
@@ -1,94 +0,0 @@
-#include <vx_print.h>
-#include <vx_spawn.h>
-#include <vx_intrinsics.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include "tinyprintf.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {
-	const char* format;
-	va_list*    va;
-	int         ret;
-} printf_arg_t;
-
-typedef struct {
-	int value;
-	int base;
-} putint_arg_t;
-
-typedef struct {
-	float value;
-	int precision;
-} putfloat_arg_t;
-
-static void __putint_cb(const putint_arg_t* arg) {	
-	char tmp[33];
-	float value = arg->value;
-	int base = arg->base;
-	itoa(value, tmp, base);
-	for (int i = 0; i < 33; ++i) {
-		int c = tmp[i];
-		if (!c) 
-			break;
-		vx_putchar(c);
-	}
-}
-
-static void __putfloat_cb(const putfloat_arg_t* arg) {
-	float value = arg->value;
-	int precision = arg->precision;
-	int ipart = (int)value;
-    vx_putint(ipart, 10);
-    if (precision != 0) {
-        vx_putchar('.');
-		float frac = value - (float)ipart;
-        float fscaled = frac * pow(10, precision);  
-        vx_putint((int)fscaled, 10);
-    }
-}
-
-static void __vprintf_cb(printf_arg_t* arg) {
-	arg->ret = tiny_vprintf(arg->format, *arg->va);
-}
-
-void vx_putint(int value, int base) {
-	putint_arg_t arg;
-	arg.value = value;
-	arg.base = base;
-	vx_serial((vx_serial_cb)__putint_cb, &arg);
-}
-
-void vx_putfloat(float value, int precision) {
-	putfloat_arg_t arg;
-	arg.value = value;
-	arg.precision = precision;
-	vx_serial((vx_serial_cb)__putfloat_cb, &arg);
-}
-
-int vx_vprintf(const char* format, va_list va) {
-	printf_arg_t arg;
-	arg.format = format;
-	arg.va = &va;
-	vx_serial((vx_serial_cb)__vprintf_cb, &arg);
-  	return arg.ret;
-}
-
-int vx_printf(const char * format, ...) {
-	int ret;
-	va_list va;
-	va_start(va, format);
-	ret = vx_vprintf(format, va);
-	va_end(va);		
-  	return ret;
-}
-
-#ifdef __cplusplus
-}
-#endif
--- a/runtime/src/vx_spawn.S
+++ b/runtime/src/vx_spawn.S
@@ -1,37 +0,0 @@
-#include <VX_config.h>
-
-.type vx_serial, @function
-.global vx_serial
-vx_serial: 
-    addi sp, sp, -24
-    sw   ra, 20(sp)
-    sw   s4, 16(sp)
-    sw   s3, 12(sp)
-    sw   s2, 8(sp)
-    sw   s1, 4(sp)
-    sw   s0, 0(sp)
-    mv   s4, a0                 # s4 <- callback
-    mv   s3, a1                 # s3 <- arg
-    csrr s2, CSR_NT             # s2 <- NT
-    csrr s1, CSR_WTID           # s1 <- tid    
-    li	 s0, 0                  # s0 <- index
-label_loop:
-    sub	 t0, s0, s1
-    seqz t1, t0                 # (index != tid)
-    .insn s 0x6b, 2, x0, 0(t1)  # split t0
-    bnez t0, label_join
-    mv   a0, s3                 # a0 <- arg
-    jalr s4                     # callback(arg)
-label_join:
-    .insn s 0x6b, 3, x0, 0(x0)  # join
-    addi s0, s0, 1              # index++
-    blt	 s0, s2, label_loop     # loop back
-    lw   ra, 20(sp)
-    lw   s4, 16(sp)
-    lw   s3, 12(sp)
-    lw   s2, 8(sp)
-    lw   s1, 4(sp)
-    lw   s0, 0(sp)
-    addi sp, sp, 24
-
-    ret
--- a/runtime/src/vx_spawn.c
+++ b/runtime/src/vx_spawn.c
@@ -1,307 +0,0 @@
-#include <vx_spawn.h>
-#include <vx_intrinsics.h>
-#include <inttypes.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define NUM_CORES_MAX 32
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-typedef struct {
-	vx_spawn_tasks_cb callback;
-	void * arg;
-	int offset;
-	int N;
-	int R;
-  int NW;
-} wspawn_tasks_args_t;
-
-typedef struct {
-  context_t * ctx;
-  vx_spawn_kernel_cb callback;
-  void * arg;
-  int  offset; 
-  int  N;
-  int  R;  
-  int  NW;
-  char isXYpow2;
-  char isXpow2;
-  char log2XY;
-  char log2X;
-} wspawn_kernel_args_t;
-
-void* g_wspawn_args[NUM_CORES_MAX];
-
-inline char is_log2(int x) {
-  return ((x & (x-1)) == 0);
-}
-
-inline int fast_log2(int x) {
-  float f = x;
-  return (*(int*)(&f)>>23) - 127;
-}
-
-static void __attribute__ ((noinline)) spawn_tasks_all_stub() { 
-  int core_id = vx_core_id();
-  int wid     = vx_warp_id();
-  int tid     = vx_thread_id(); 
-  int NT      = vx_num_threads();
-  
-  wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[core_id];
-
-  int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
-  int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
-  int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
-
-  for (int task_id = offset, N = task_id + tK; task_id < N; ++task_id) {
-    (p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
-  }
-
-  // wait for all warps to complete
-  vx_barrier(0, p_wspawn_args->NW);
-}
-
-static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {  
-  int core_id = vx_core_id(); 
-  int tid = vx_thread_gid();
-
-  wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[core_id];
-
-  int task_id = p_wspawn_args->offset + tid;
-  (p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
-}
-
-static void spawn_tasks_all_cb() {  
-  // activate all threads
-  vx_tmc(-1);
-
-  // call stub routine
-  spawn_tasks_all_stub();
-  
-  // set warp0 to single-threaded and stop other warps
-  int wid = vx_warp_id();
-  vx_tmc(0 == wid);
-}
-
-static void spawn_tasks_rem_cb(int thread_mask) {  
-  // activate threads  
-  vx_tmc(thread_mask);
-
-  // call stub routine
-  spawn_tasks_rem_stub();
-
-  // back to single-threaded
-  vx_tmc(1);
-}
-
-void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
-	// device specs
-  int NC = vx_num_cores();
-  int NW = vx_num_warps();
-  int NT = vx_num_threads();
-
-  // current core id
-  int core_id = vx_core_id();  
-  if (core_id >= NUM_CORES_MAX)
-    return;
-
-  // calculate necessary active cores
-  int WT = NW * NT;
-  int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
-  int nc = MIN(nC, NC);
-  if (core_id >= nc)
-    return; // terminate extra cores
-
-  // number of tasks per core
-  int tasks_per_core = num_tasks / nc;
-  int tasks_per_core0 = tasks_per_core;  
-  if (core_id == (NC-1)) {    
-    int QC_r = num_tasks - (nc * tasks_per_core0); 
-    tasks_per_core0 += QC_r; // last core executes remaining tasks
-  }
-
-  // number of tasks per warp
-  int nW = tasks_per_core0 / NT;        		// total warps per core
-  int rT = tasks_per_core0 - (nW * NT); 		// remaining threads
-  int fW  = (nW >= NW) ? (nW / NW) : 0;			// full warps iterations
-  int rW  = (fW != 0) ? (nW - fW * NW) : 0; // remaining warps
-  if (0 == fW)
-    fW = 1;
-
-  //--
-  wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW, 0 };
-  g_wspawn_args[core_id] = &wspawn_args;
-
-  //--
-	if (nW >= 1)	{ 
-    int nw = MIN(nW, NW);    
-    wspawn_args.NW = nw;
-	  vx_wspawn(nw, spawn_tasks_all_cb);
-    spawn_tasks_all_cb();
-	}  
-
-  //--    
-  if (rT != 0) {
-    wspawn_args.offset = tasks_per_core0 - rT;
-    int tmask = (1 << rT) - 1;
-    spawn_tasks_rem_cb(tmask);
-  }
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-static void __attribute__ ((noinline)) spawn_kernel_all_stub() {
-  int core_id = vx_core_id();
-  int wid     = vx_warp_id();
-  int tid     = vx_thread_id(); 
-  int NT      = vx_num_threads();
-  
-  wspawn_kernel_args_t* p_wspawn_args = (wspawn_kernel_args_t*)g_wspawn_args[core_id];
-
-  int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
-  int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
-  int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
-
-  int X = p_wspawn_args->ctx->num_groups[0];
-  int Y = p_wspawn_args->ctx->num_groups[1];
-  int XY = X * Y;
-
-  for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) {    
-    int k = p_wspawn_args->isXYpow2 ? (wg_id >> p_wspawn_args->log2XY) : (wg_id / XY);
-    int wg_2d = wg_id - k * XY;
-    int j = p_wspawn_args->isXpow2 ? (wg_2d >> p_wspawn_args->log2X) : (wg_2d / X);
-    int i = wg_2d - j * X;
-
-    int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
-    int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
-    int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
-
-    (p_wspawn_args->callback)(p_wspawn_args->arg, p_wspawn_args->ctx, gid0, gid1, gid2);
-  }
-
-  // wait for all warps to complete
-  vx_barrier(0, p_wspawn_args->NW);
-}
-
-static void __attribute__ ((noinline)) spawn_kernel_rem_stub() {
-  int core_id = vx_core_id(); 
-  int tid = vx_thread_gid();
-
-  wspawn_kernel_args_t* p_wspawn_args = (wspawn_kernel_args_t*)g_wspawn_args[core_id];
-
-  int wg_id = p_wspawn_args->offset + tid;
-
-  int X = p_wspawn_args->ctx->num_groups[0];
-  int Y = p_wspawn_args->ctx->num_groups[1];
-  int XY = X * Y;
-  
-  int k = p_wspawn_args->isXYpow2 ? (wg_id >> p_wspawn_args->log2XY) : (wg_id / XY);
-  int wg_2d = wg_id - k * XY;
-  int j = p_wspawn_args->isXpow2 ? (wg_2d >> p_wspawn_args->log2X) : (wg_2d / X);
-  int i = wg_2d - j * X;
-
-  int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
-  int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
-  int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
-
-  (p_wspawn_args->callback)(p_wspawn_args->arg, p_wspawn_args->ctx, gid0, gid1, gid2);
-}
-
-static void spawn_kernel_all_cb() {  
-  // activate all threads
-  vx_tmc(-1);
-
-  // call stub routine
-  spawn_kernel_all_stub();
-
-  // set warp0 to single-threaded and stop other warps
-  int wid = vx_warp_id();
-  vx_tmc(0 == wid);
-}
-
-static void spawn_kernel_rem_cb(int thread_mask) {    
-  // activate threads
-  vx_tmc(thread_mask);
-
-  // call stub routine
-  spawn_kernel_rem_stub();
-
-  // back to single-threaded
-  vx_tmc(1);
-}
-
-void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {  
-  // total number of WGs
-  int X  = ctx->num_groups[0];
-  int Y  = ctx->num_groups[1];
-  int Z  = ctx->num_groups[2];
-  int XY = X * Y;
-  int Q  = XY * Z;
-  
-  // device specs
-  int NC = vx_num_cores();
-  int NW = vx_num_warps();
-  int NT = vx_num_threads();
-
-  // current core id
-  int core_id = vx_core_id();  
-  if (core_id >= NUM_CORES_MAX)
-    return;
-
-  // calculate necessary active cores
-  int WT = NW * NT;
-  int nC = (Q > WT) ? (Q / WT) : 1;
-  int nc = MIN(nC, NC);
-  if (core_id >= nc)
-    return; // terminate extra cores
-
-  // number of workgroups per core
-  int wgs_per_core = Q / nc;
-  int wgs_per_core0 = wgs_per_core;  
-  if (core_id == (NC-1)) {    
-    int QC_r = Q - (nc * wgs_per_core0); 
-    wgs_per_core0 += QC_r; // last core executes remaining WGs
-  }
-
-  // number of workgroups per warp
-  int nW = wgs_per_core0 / NT;              // total warps per core
-  int rT = wgs_per_core0 - (nW * NT);       // remaining threads
-  int fW = (nW >= NW) ? (nW / NW) : 0;      // full warps iterations
-  int rW = (fW != 0) ? (nW - fW * NW) : 0;  // reamining full warps
-  if (0 == fW)
-    fW = 1;
-
-  // fast path handling
-  char isXYpow2 = is_log2(XY);
-  char isXpow2  = is_log2(X);
-  char log2XY   = fast_log2(XY);
-  char log2X    = fast_log2(X);
-
-  //--
-  wspawn_kernel_args_t wspawn_args = { 
-    ctx, callback, arg, core_id * wgs_per_core, fW, rW, 0, isXYpow2, isXpow2, log2XY, log2X 
-  };
-  g_wspawn_args[core_id] = &wspawn_args;
-
-  //--
-	if (nW >= 1)	{ 
-    int nw = MIN(nW, NW);    
-    wspawn_args.NW = nw;
-	  vx_wspawn(nw, spawn_kernel_all_cb);
-    spawn_kernel_all_cb();
-	}  
-
-  //--    
-  if (rT != 0) {
-    wspawn_args.offset = wgs_per_core0 - rT;
-    int tmask = (1 << rT) - 1;
-    spawn_kernel_rem_cb(tmask);
-  }
-}
-
-#ifdef __cplusplus
-}
-#endif
--- a/runtime/src/vx_start.S
+++ b/runtime/src/vx_start.S
@@ -1,110 +0,0 @@
-#include <VX_config.h>
-
-.section .init, "ax"
-.global _start
-.type   _start, @function
-_start:
-
-  # initialize per-thread registers
-  csrr a0, CSR_NW  # get num warps
-  la a1, init_regs
-  .insn s 0x6b, 1, a1, 0(a0)  # wspawn a0, a1
-  jal init_regs
-  # return back to single thread execution
-  li a0, 1
-  .insn s 0x6b, 0, x0, 0(a0)  # tmc a0
-
-  # initialize TLS for all warps
-  csrr a0, CSR_NW  # get num warps
-  la a1, __init_tls
-  .insn s 0x6b, 1, a1, 0(a0)  # wspawn a0, a1
-  call __init_tls
-  # return back to single thread execution
-  li a0, 1
-  .insn s 0x6b, 0, x0, 0(a0)  # tmc a0
-  
-  # clear BSS segment
-  la      a0, _edata
-  la      a2, _end
-  sub     a2, a2, a0
-  li      a1, 0
-  call    memset
-
-  # Initialize trap vector
-  # a t0, trap_entry
-  # csrw mtvec, t0
-
-  # Register global termination functions
-  la      a0, __libc_fini_array
-
-  # to be called upon exit
-  call    atexit
-
-  # Run global initialization functions
-  call    __libc_init_array
-
-  # call main program routine
-  call    main
-
-  # call exit routine
-  tail    exit
-.size  _start, .-_start
-
-.section .text
-.type _exit, @function
-.global _exit
-_exit:
-  mv s0, a0
-  call vx_perf_dump 
-  mv gp, s0
-  li a0, 0
-  .insn s 0x6b, 0, x0, 0(a0)  # tmc a0
-
-.section .text
-.type init_regs, @function
-.global init_regs
-init_regs:
-  # activate all threads
-  li a0, -1
-  .insn s 0x6b, 0, x0, 0(a0)  # tmc a0
-  
-  # set global pointer register
-  .option push
-  .option norelax
-  la gp, __global_pointer
-  .option pop
-
-  # allocate stack region for a threads on the processor 
-  # set stack pointer
-  li sp, SMEM_BASE_ADDR # load stack base address
-  #if SM_ENABLE
-  csrr a0, CSR_LTID    # get local thread id
-  #else
-  csrr a0, CSR_GTID    # get global thread id
-  #endif
-  sll  a1, a0, STACK_LOG2_SIZE
-  sub  sp, sp, a1
-
-  # set thread pointer register
-  # use address space after BSS region
-  # ensure cacheline alignment
-  la      a1, __tcb_aligned_size
-  mul     a0, a0, a1
-  la      tp, _end + 63
-  add     tp, tp, a0
-  and     tp, tp, -64
-
-  # disable active warps except warp0
-  csrr a3, CSR_LWID    # get local wid
-  beqz a3, RETURN
-  li a0, 0
-  .insn s 0x6b, 0, x0, 0(a0)  # tmc a0
-RETURN:
-  ret
-
-.section .data
-	.global __dso_handle
-	.weak __dso_handle
-__dso_handle:
-	.long	0
-  
--- a/runtime/src/vx_syscalls.c
+++ b/runtime/src/vx_syscalls.c
@@ -1,109 +0,0 @@
-#include <sys/stat.h>
-#include <newlib.h>
-#include <unistd.h>
-#include <vx_intrinsics.h>
-#include <vx_print.h>
-#include <string.h>
- 
-int _close(int file) { return -1; }
- 
-int _fstat(int file, struct stat *st) { return -1; }
- 
-int _isatty(int file) { return 0; }
- 
-int _lseek(int file, int ptr, int dir) { return 0; }
- 
-int _open(const char *name, int flags, int mode) { return -1; }
- 
-int _read(int file, char *ptr, int len) { return -1; }
- 
-caddr_t _sbrk(int incr) { 
-  __asm__ __volatile__("ebreak");
-  return 0; 
-}
- 
-int _write(int file, char *ptr, int len) {
-  int i; 
-  for (i = 0; i < len; ++i) {
-    vx_putchar(*ptr++);
-  }
-  return len;
-}
-
-int _kill(int pid, int sig) { return -1; }
-
-int _getpid() {
-  return vx_warp_gid();
-}
-
-void __init_tls(void) {  
-  extern char __tdata_start[];
-  extern char __tbss_offset[];
-  extern char __tdata_size[];
-  extern char __tbss_size[];
-
-  // activate all threads
-  vx_tmc(-1);
-
-  // TLS memory initialization
-  register char *__thread_self __asm__ ("tp");
-  memcpy(__thread_self, __tdata_start, (size_t)__tdata_size);
-  memset(__thread_self + (size_t)__tbss_offset, 0, (size_t)__tbss_size);
-
-  // back to single thread execution
-  vx_tmc(0 == vx_warp_id());
-}
-
- #ifdef HAVE_INITFINI_ARRAY
-
-/* These magic symbols are provided by the linker.  */
-extern void (*__preinit_array_start []) (void) __attribute__((weak));
-extern void (*__preinit_array_end []) (void) __attribute__((weak));
-extern void (*__init_array_start []) (void) __attribute__((weak));
-extern void (*__init_array_end []) (void) __attribute__((weak));
-
-#ifdef HAVE_INIT_FINI
-extern void _init (void);
-#endif
-
-/* Iterate over all the init routines.  */
-void __libc_init_array (void) {
-  size_t count;
-  size_t i;
-
-  count = __preinit_array_end - __preinit_array_start;
-  for (i = 0; i < count; i++)
-    __preinit_array_start[i] ();
-
-#ifdef HAVE_INIT_FINI
-  _init ();
-#endif
-
-  count = __init_array_end - __init_array_start;
-  for (i = 0; i < count; i++)
-    __init_array_start[i] ();
-}
-#endif
-
-#ifdef HAVE_INITFINI_ARRAY
-extern void (*__fini_array_start []) (void) __attribute__((weak));
-extern void (*__fini_array_end []) (void) __attribute__((weak));
-
-#ifdef HAVE_INIT_FINI
-extern void _fini (void);
-#endif
-
-/* Run all the cleanup routines.  */
-void __libc_fini_array (void) {
-  size_t count;
-  size_t i;
-  
-  count = __fini_array_end - __fini_array_start;
-  for (i = count; i > 0; i--)
-    __fini_array_start[i-1] ();
-
-#ifdef HAVE_INIT_FINI
-  _fini ();
-#endif
-}
-#endif
--- a/runtime/stub/Makefile
+++ b/runtime/stub/Makefile
@@ -0,0 +1,19 @@
+CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I../include -I../../runtime -I../../hw -I../../sim/common
+
+CXXFLAGS += -fPIC
+
+LDFLAGS += -shared -pthread
+
+SRCS = vortex.cpp ../common/utils.cpp
+
+PROJECT = libvortex.so
+
+all: $(PROJECT)
+
+$(PROJECT): $(SRCS) 
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+clean:
+	rm -rf $(PROJECT) obj_dir
--- a/runtime/stub/vortex.cpp
+++ b/runtime/stub/vortex.cpp
@@ -0,0 +1,58 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vortex.h>
+
+extern int vx_dev_open(vx_device_h* /*hdevice*/) {
+    return -1;
+}
+
+extern int vx_dev_close(vx_device_h /*hdevice*/) {
+    return -1;
+}
+
+extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t* /*value*/) {
+    return -1;
+}
+
+extern int vx_mem_alloc(vx_device_h /*hdevice*/, uint64_t /*size*/, int /*type*/, uint64_t* /*dev_addr*/) {
+    return -1;
+}
+
+extern int vx_mem_free(vx_device_h /*hdevice*/, uint64_t /*dev_addr*/) {
+    return -1;
+}
+
+extern int vx_mem_info(vx_device_h /*hdevice*/, int /*type*/, uint64_t* /*mem_free*/, uint64_t* /*mem_used*/) {
+    return 0;
+}
+
+extern int vx_copy_to_dev(vx_device_h /*hdevice*/, uint64_t /*dev_addr*/, const void* /*host_ptr*/, uint64_t /*size*/) {
+    return -1;
+}
+
+extern int vx_copy_from_dev(vx_device_h /*hdevice*/, void* /*host_ptr*/, uint64_t /*dev_addr*/, uint64_t /*size*/) {
+     return -1;
+}
+
+extern int vx_start(vx_device_h /*hdevice*/) {
+    return -1;
+}
+
+extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) {
+    return -1;
+}
+
+extern int vx_dcr_write(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint64_t /*value*/) {
+    return -1;
+}
--- a/runtime/xrt/Makefile
+++ b/runtime/xrt/Makefile
@@ -0,0 +1,33 @@
+CXXFLAGS += -std=c++14 -Wall -Wextra -Wfatal-errors
+
+CXXFLAGS += -I../include -I../common -I../../hw -I$(XILINX_XRT)/include -I../../sim/common
+
+CXXFLAGS += -fPIC
+
+LDFLAGS += -shared -pthread
+LDFLAGS += -L$(XILINX_XRT)/lib -luuid -lxrt_coreutil
+
+SRCS = vortex.cpp ../common/utils.cpp ../../sim/common/util.cpp
+
+PROJECT = libvortex.so
+
+# Debugging
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else    
+	CXXFLAGS += -O2 -DNDEBUG
+endif
+
+# Enable scope logic analyzer
+ifdef SCOPE
+	CXXFLAGS += -DSCOPE	
+	SRCS += ../common/scope.cpp
+endif
+
+all: $(PROJECT)
+
+$(PROJECT): $(SRCS) $(SCOPE_JSON)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+clean:
+	rm -rf $(PROJECT) obj_dir
--- a/runtime/xrt/vortex.cpp
+++ b/runtime/xrt/vortex.cpp
@@ -0,0 +1,915 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vortex.h>
+#include <malloc.h>
+#include <utils.h>
+#include <VX_config.h>
+#include <VX_types.h>
+#include <stdarg.h>
+#include <util.h>
+#include <cstring>
+#include <limits>
+#include <unordered_map>
+#include <vector>
+
+#ifdef SCOPE
+#include "scope.h"
+#endif
+
+// XRT includes
+#include "experimental/xrt_bo.h"
+#include "experimental/xrt_ip.h"
+#include "experimental/xrt_device.h"
+#include "experimental/xrt_kernel.h"
+#include "experimental/xrt_xclbin.h"
+#include "experimental/xrt_error.h"
+
+#define CPP_API
+//#define BANK_INTERLEAVE
+
+#define MMIO_CTL_ADDR   0x00
+#define MMIO_DEV_ADDR   0x10
+#define MMIO_ISA_ADDR   0x1C
+#define MMIO_DCR_ADDR   0x28
+#define MMIO_SCP_ADDR   0x34
+#define MMIO_MEM_ADDR   0x40
+
+#define CTL_AP_START    (1<<0)
+#define CTL_AP_DONE     (1<<1)
+#define CTL_AP_IDLE     (1<<2)
+#define CTL_AP_READY    (1<<3)
+#define CTL_AP_RESET    (1<<4)
+#define CTL_AP_RESTART  (1<<7)
+
+// Memory layout of one supported board, looked up by device name.
+struct platform_info_t {
+    const char* prefix_name;    // a device whose name starts with this string selects the entry
+    uint8_t     lg2_num_banks;  // log2 of the number of memory banks (used as 1 << lg2_num_banks)
+    uint8_t     lg2_bank_size;  // log2 of the size of one bank in bytes (used as 1ull << lg2_bank_size)
+    uint64_t    mem_base;       // address of bank 0; bank i is programmed as mem_base + i * bank_size
+};
+
+// Table of supported boards, searched in order by get_platform_info().
+// Columns: name prefix, log2(bank count), log2(bank size), base address.
+static const platform_info_t g_platforms [] = {
+    {"xilinx_u50",     4, 0x1C, 0x0},           // 16 banks of 2^28 bytes (256 MiB)
+    {"xilinx_u200",    4, 0x1C, 0x0},           // 16 banks of 2^28 bytes (256 MiB)
+    {"xilinx_u280",    4, 0x1C, 0x0},           // 16 banks of 2^28 bytes (256 MiB)
+    {"xilinx_vck5000", 0, 0x21, 0xC000000000},  // 1 bank of 2^33 bytes (8 GiB)
+};
+
+#ifdef CPP_API
+
+    typedef xrt::device xrt_device_t;
+    typedef xrt::ip xrt_kernel_t;
+    typedef xrt::bo xrt_buffer_t;
+
+#else
+
+    typedef xrtDeviceHandle xrt_device_t;
+    typedef xrtKernelHandle xrt_kernel_t;
+    typedef xrtBufferHandle xrt_buffer_t;
+    
+#endif
+
+#define RAM_PAGE_SIZE 4096
+
+#define DEFAULT_DEVICE_INDEX 0
+
+#define DEFAULT_XCLBIN_PATH "vortex_afu.xclbin"
+
+#define KERNEL_NAME "vortex_afu"
+
+#ifndef NDEBUG
+#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
+#else
+#define DBGPRINT(format, ...) ((void)0)
+#endif
+
+// Declares `handle`, initializes it from `_expr` and, when the result
+// compares equal to nullptr, prints the failing expression and runs
+// `_cleanup`. The expansion is not wrapped in do { } while, so `handle`
+// stays in scope for the statements that follow the macro.
+#define CHECK_HANDLE(handle, _expr, _cleanup)   \
+    auto handle = _expr;                        \
+    if (handle == nullptr) {                    \
+        printf("[VXDRV] Error: '%s' returned NULL!\n", #_expr); \
+        _cleanup                                \
+    }
+
+// Evaluates `_expr` once and stores the result in a local named `err`.
+// Zero means success. Any other value is printed together with the
+// stringified expression, then `_cleanup` runs (it may refer to `err`).
+#define CHECK_ERR(_expr, _cleanup)              \
+    do {                                        \
+        auto err = _expr;                       \
+        if (err == 0)                           \
+            break;                              \
+        printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
+        _cleanup                                \
+    } while (false)
+
+using namespace vortex;
+
+#ifndef CPP_API
+
+// Prints the textual description XRT associates with `err` on `xrtDevice`.
+// The description is fetched in two steps: first its length, then the text.
+// If either step fails or the length is zero, a placeholder is printed
+// instead of passing an empty (possibly null) buffer to printf.
+static void dump_xrt_error(xrtDeviceHandle xrtDevice, xrtErrorCode err) {
+    size_t len = 0;
+    int rc = xrtErrorGetString(xrtDevice, err, nullptr, 0, &len);
+    // one spare byte keeps the buffer terminated whatever XRT writes
+    std::vector<char> buf(len + 1, '\0');
+    if (rc == 0 && len != 0) {
+        rc = xrtErrorGetString(xrtDevice, err, buf.data(), len, nullptr);
+    }
+    if (rc != 0 || len == 0) {
+        printf("[VXDRV] detail: %s!\n", "<no description available>");
+        return;
+    }
+    printf("[VXDRV] detail: %s!\n", buf.data());
+}
+
+#endif
+
+// Finds the first g_platforms entry whose prefix_name is a prefix of
+// `device_name` and copies it into *platform_info.
+// Returns 0 on a match, -1 when no entry matches.
+static int get_platform_info(const std::string& device_name, platform_info_t* platform_info) {
+    for (const auto& candidate : g_platforms) {
+        // rfind anchored at position 0 can only match at the very start
+        bool has_prefix = (device_name.rfind(candidate.prefix_name, 0) == 0);
+        if (!has_prefix)
+            continue;
+        *platform_info = candidate;
+        return 0;
+    }
+    return -1;
+}
+
+/*static void wait_for_enter(const std::string &msg) {
+    std::cout << msg << std::endl;
+    std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+}*/
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Driver-side state for one Vortex device reached through XRT: the XRT
+// device and kernel handles, the capability words read back from the AFU,
+// the device-memory allocators and the XRT buffer objects that back device
+// memory. Every method returns 0 on success and -1 on failure.
+class vx_device {
+public:
+
+    vx_device(xrt_device_t& device, xrt_kernel_t& kernel, const platform_info_t& platform)
+        : dev_caps(0)
+        , isa_caps(0)
+        , global_mem_size(0)
+        , xrtDevice_(device)
+        , xrtKernel_(kernel)
+        , platform_(platform)
+    {}
+
+#ifndef CPP_API
+
+    // The C API hands out raw handles: free every buffer object, then close
+    // the kernel and the device.
+    ~vx_device() {
+        for (auto& entry : xrtBuffers_) {
+        #ifdef BANK_INTERLEAVE
+            xrtBOFree(entry);
+        #else
+            xrtBOFree(entry.second.xrtBuffer);
+        #endif
+        }
+        if (xrtKernel_) {
+            xrtKernelClose(xrtKernel_);
+        }
+        if (xrtDevice_) {
+            xrtDeviceClose(xrtDevice_);
+        }
+    }
+
+#endif
+
+    // Resets the AFU, programs the memory bank base address(es), reads the
+    // device/ISA capability words and creates the memory allocators.
+    int init() {
+        CHECK_ERR(this->write_register(MMIO_CTL_ADDR, CTL_AP_RESET), {
+            return -1;
+        });
+
+        uint32_t num_banks = 1 << platform_.lg2_num_banks;
+        uint64_t bank_size = 1ull << platform_.lg2_bank_size;
+
+        // Each bank owns a 12-byte register slot; the first two words hold
+        // the low and high halves of the bank base address.
+        for (uint32_t i = 0; i < num_banks; ++i) {
+            uint32_t reg_addr = MMIO_MEM_ADDR + (i * 12);
+            uint64_t reg_value = platform_.mem_base + i * bank_size;
+            CHECK_ERR(this->write_register(reg_addr, reg_value & 0xffffffff), {
+                return -1;
+            });
+
+            CHECK_ERR(this->write_register(reg_addr + 4, (reg_value >> 32) & 0xffffffff), {
+                return -1;
+            });
+        #ifndef BANK_INTERLEAVE
+            // without interleaving only the first bank base is programmed
+            break;
+        #endif
+        }
+
+        // Assemble the 64-bit capability words from two 32-bit reads rather
+        // than writing through a punned uint32_t*, which would depend on the
+        // host byte order.
+        CHECK_ERR(this->read_register64(MMIO_DEV_ADDR, &this->dev_caps), {
+            return -1;
+        });
+
+        CHECK_ERR(this->read_register64(MMIO_ISA_ADDR, &this->isa_caps), {
+            return -1;
+        });
+
+        this->global_mem_size = num_banks * bank_size;
+
+        this->global_mem_ = std::make_shared<vortex::MemoryAllocator>(
+            ALLOC_BASE_ADDR, ALLOC_MAX_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE);
+
+        // VX_CAPS_LOCAL_MEM_SIZE is reported as 1 << n, so a value of 1
+        // (n == 0) means the device has no local memory. Create the local
+        // allocator only when there is something to allocate from.
+        uint64_t local_mem_size = 0;
+        vx_dev_caps(this, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size);
+        if (local_mem_size > 1) {
+            this->local_mem_ = std::make_shared<vortex::MemoryAllocator>(
+                SMEM_BASE_ADDR, local_mem_size, RAM_PAGE_SIZE, 1);
+        }
+
+    #ifdef BANK_INTERLEAVE
+        xrtBuffers_.reserve(num_banks);
+        for (uint32_t i = 0; i < num_banks; ++i) {
+        #ifdef CPP_API
+            xrtBuffers_.emplace_back(xrtDevice_, bank_size, xrt::bo::flags::normal, i);
+        #else
+            CHECK_HANDLE(xrtBuffer, xrtBOAlloc(xrtDevice_, bank_size, XRT_BO_FLAGS_NONE, i), {
+                return -1;
+            });
+            xrtBuffers_.push_back(xrtBuffer);
+        #endif
+            printf("*** allocated bank%u/%u, size=%lu\n", i, num_banks, bank_size);
+        }
+    #endif
+
+        return 0;
+    }
+
+    // Allocates `size` bytes (rounded up to CACHE_BLOCK_SIZE) of global or
+    // local device memory and stores the device address in *dev_addr.
+    // Fails for an unknown `type`, and for VX_MEM_TYPE_LOCAL when the device
+    // has no local memory.
+    int mem_alloc(uint64_t size, int type, uint64_t* dev_addr) {
+        uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+
+        uint64_t addr = 0;
+
+        if (type == VX_MEM_TYPE_GLOBAL) {
+            CHECK_ERR(global_mem_->allocate(asize, &addr), {
+                return -1;
+            });
+        #ifndef BANK_INTERLEAVE
+            // Take a reference on the bank holding the allocation. On
+            // failure hand the range back so it is not leaked.
+            uint32_t bank_id;
+            CHECK_ERR(this->get_bank_info(addr, &bank_id, nullptr), {
+                global_mem_->release(addr);
+                return -1;
+            });
+            CHECK_ERR(get_buffer(bank_id, nullptr), {
+                global_mem_->release(addr);
+                return -1;
+            });
+        #endif
+        } else if (type == VX_MEM_TYPE_LOCAL) {
+            if (!local_mem_) {
+                return -1;
+            }
+            CHECK_ERR(local_mem_->allocate(asize, &addr), {
+                return -1;
+            });
+        } else {
+            return -1;
+        }
+        *dev_addr = addr;
+        return 0;
+    }
+
+    // Releases an allocation made by mem_alloc(). Addresses at or above
+    // SMEM_BASE_ADDR are treated as local memory, everything else as global.
+    int mem_free(uint64_t dev_addr) {
+        if (dev_addr >= SMEM_BASE_ADDR) {
+            if (!local_mem_) {
+                return -1;
+            }
+            CHECK_ERR(local_mem_->release(dev_addr), {
+                return -1;
+            });
+        } else {
+            CHECK_ERR(global_mem_->release(dev_addr), {
+                return -1;
+            });
+        #ifdef BANK_INTERLEAVE
+            if (0 == global_mem_->allocated()) {
+            #ifndef CPP_API
+                for (auto& entry : xrtBuffers_) {
+                    xrtBOFree(entry);
+                }
+            #endif
+                xrtBuffers_.clear();
+            }
+        #else
+            // drop the bank reference taken in mem_alloc(); the bank buffer
+            // goes away together with its last allocation
+            uint32_t bank_id;
+            CHECK_ERR(this->get_bank_info(dev_addr, &bank_id, nullptr), {
+                return -1;
+            });
+            auto it = xrtBuffers_.find(bank_id);
+            if (it != xrtBuffers_.end()) {
+                auto count = --it->second.count;
+                if (0 == count) {
+                    printf("freeing bank%d...\n", bank_id);
+                #ifndef CPP_API
+                    xrtBOFree(it->second.xrtBuffer);
+                #endif
+                    xrtBuffers_.erase(it);
+                }
+            } else {
+                fprintf(stderr, "[VXDRV] Error: invalid device memory address: 0x%lx\n", dev_addr);
+                return -1;
+            }
+        #endif
+        }
+        return 0;
+    }
+
+    // Reports free and used bytes for the given memory type. Either output
+    // pointer may be null. A device without local memory reports 0 / 0.
+    int mem_info(int type, uint64_t* mem_free, uint64_t* mem_used) const {
+        if (type == VX_MEM_TYPE_GLOBAL) {
+            if (mem_free)
+                *mem_free = global_mem_->free();
+            if (mem_used)
+                *mem_used = global_mem_->allocated();
+        } else if (type == VX_MEM_TYPE_LOCAL) {
+            if (mem_free) {
+                if (local_mem_)
+                    *mem_free = local_mem_->free();
+                else
+                    *mem_free = 0;
+            }
+            if (mem_used) {
+                if (local_mem_)
+                    *mem_used = local_mem_->allocated();
+                else
+                    *mem_used = 0;
+            }
+        } else {
+            return -1;
+        }
+        return 0;
+    }
+
+    // Writes one 32-bit AFU register.
+    int write_register(uint32_t addr, uint32_t value) {
+    #ifdef CPP_API
+        xrtKernel_.write_register(addr, value);
+    #else
+        CHECK_ERR(xrtKernelWriteRegister(xrtKernel_, addr, value), {
+            dump_xrt_error(xrtDevice_, err);
+            return -1;
+        });
+    #endif
+        DBGPRINT("*** write_register: addr=0x%x, value=0x%x\n", addr, value);
+        return 0;
+    }
+
+    // Reads one 32-bit AFU register into *value.
+    int read_register(uint32_t addr, uint32_t* value) {
+    #ifdef CPP_API
+        *value = xrtKernel_.read_register(addr);
+    #else
+        CHECK_ERR(xrtKernelReadRegister(xrtKernel_, addr, value), {
+            dump_xrt_error(xrtDevice_, err);
+            return -1;
+        });
+    #endif
+        DBGPRINT("*** read_register: addr=0x%x, value=0x%x\n", addr, *value);
+        return 0;
+    }
+
+    // Copies `asize` bytes from host memory to device address `dev_addr`.
+    // `asize` is expected to be a multiple of CACHE_BLOCK_SIZE and exactly
+    // `asize` bytes are read from `host_ptr`.
+    int upload(uint64_t dev_addr, const void* host_ptr, uint64_t asize) {
+        auto src = static_cast<const uint8_t*>(host_ptr);
+        for (uint64_t end = dev_addr + asize; dev_addr < end;
+            dev_addr += CACHE_BLOCK_SIZE,
+            src += CACHE_BLOCK_SIZE) {
+        #ifdef BANK_INTERLEAVE
+            // interleaved banks: move one cache block per iteration
+            asize = CACHE_BLOCK_SIZE;
+        #else
+            // single transfer: clearing `end` makes this the only iteration
+            end = 0;
+        #endif
+            uint32_t bo_index;
+            uint64_t bo_offset;
+            xrt_buffer_t xrtBuffer;
+            CHECK_ERR(this->get_bank_info(dev_addr, &bo_index, &bo_offset), {
+                return -1;
+            });
+            CHECK_ERR(this->get_buffer(bo_index, &xrtBuffer), {
+                return -1;
+            });
+        #ifdef CPP_API
+            xrtBuffer.write(src, asize, bo_offset);
+            xrtBuffer.sync(XCL_BO_SYNC_BO_TO_DEVICE, asize, bo_offset);
+        #else
+            CHECK_ERR(xrtBOWrite(xrtBuffer, src, asize, bo_offset), {
+                dump_xrt_error(xrtDevice_, err);
+                return -1;
+            });
+            CHECK_ERR(xrtBOSync(xrtBuffer, XCL_BO_SYNC_BO_TO_DEVICE, asize, bo_offset), {
+                dump_xrt_error(xrtDevice_, err);
+                return -1;
+            });
+        #endif
+        }
+        return 0;
+    }
+
+    // Copies `asize` bytes from device address `dev_addr` to host memory.
+    // `asize` is expected to be a multiple of CACHE_BLOCK_SIZE and exactly
+    // `asize` bytes are written to `host_ptr`.
+    int download(void* host_ptr, uint64_t dev_addr, uint64_t asize) {
+        auto dst = static_cast<uint8_t*>(host_ptr);
+        for (uint64_t end = dev_addr + asize; dev_addr < end;
+            dev_addr += CACHE_BLOCK_SIZE,
+            dst += CACHE_BLOCK_SIZE) {
+        #ifdef BANK_INTERLEAVE
+            // interleaved banks: move one cache block per iteration
+            asize = CACHE_BLOCK_SIZE;
+        #else
+            // single transfer: clearing `end` makes this the only iteration
+            end = 0;
+        #endif
+            uint32_t bo_index;
+            uint64_t bo_offset;
+            xrt_buffer_t xrtBuffer;
+            CHECK_ERR(this->get_bank_info(dev_addr, &bo_index, &bo_offset), {
+                return -1;
+            });
+            CHECK_ERR(this->get_buffer(bo_index, &xrtBuffer), {
+                return -1;
+            });
+        #ifdef CPP_API
+            xrtBuffer.sync(XCL_BO_SYNC_BO_FROM_DEVICE, asize, bo_offset);
+            xrtBuffer.read(dst, asize, bo_offset);
+        #else
+            CHECK_ERR(xrtBOSync(xrtBuffer, XCL_BO_SYNC_BO_FROM_DEVICE, asize, bo_offset), {
+                dump_xrt_error(xrtDevice_, err);
+                return -1;
+            });
+            CHECK_ERR(xrtBORead(xrtBuffer, dst, asize, bo_offset), {
+                dump_xrt_error(xrtDevice_, err);
+                return -1;
+            });
+        #endif
+        }
+        return 0;
+    }
+
+    DeviceConfig dcrs;          // shadow copy of the DCR values written to the device
+    uint64_t dev_caps;          // raw device capability word read from MMIO_DEV_ADDR
+    uint64_t isa_caps;          // raw ISA capability word read from MMIO_ISA_ADDR
+    uint64_t global_mem_size;   // num_banks * bank_size, set by init()
+
+private:
+
+    xrt_device_t xrtDevice_;
+    xrt_kernel_t xrtKernel_;
+    const platform_info_t platform_;
+    std::shared_ptr<vortex::MemoryAllocator> global_mem_;
+    std::shared_ptr<vortex::MemoryAllocator> local_mem_;   // null when the device has no local memory
+
+    // Reads the register pair at `addr` (low word) and `addr + 4` (high
+    // word) and combines them into one 64-bit value.
+    int read_register64(uint32_t addr, uint64_t* value) {
+        uint32_t lo = 0;
+        uint32_t hi = 0;
+        CHECK_ERR(this->read_register(addr, &lo), {
+            return -1;
+        });
+        CHECK_ERR(this->read_register(addr + 4, &hi), {
+            return -1;
+        });
+        *value = (uint64_t(hi) << 32) | lo;
+        return 0;
+    }
+
+#ifdef BANK_INTERLEAVE
+
+    std::vector<xrt_buffer_t> xrtBuffers_;
+
+    // Interleaved layout: consecutive cache blocks rotate across the banks.
+    int get_bank_info(uint64_t addr, uint32_t* pIdx, uint64_t* pOff) {
+        uint32_t num_banks = 1 << platform_.lg2_num_banks;
+        uint64_t block_addr = addr / CACHE_BLOCK_SIZE;
+        uint32_t index = block_addr & (num_banks-1);
+        uint64_t offset = (block_addr >> platform_.lg2_num_banks) * CACHE_BLOCK_SIZE;
+        if (pIdx) {
+            *pIdx = index;
+        }
+        if (pOff) {
+            *pOff = offset;
+        }
+        printf("get_bank_info(addr=0x%lx, bank=%d, offset=0x%lx\n", addr, index, offset);
+        return 0;
+    }
+
+    int get_buffer(uint32_t bank_id, xrt_buffer_t* pBuf) {
+        if (pBuf) {
+            *pBuf = xrtBuffers_.at(bank_id);
+        }
+        return 0;
+    }
+
+#else
+
+    // One lazily created buffer object per bank plus the number of live
+    // allocations that reference it.
+    struct buf_cnt_t {
+        xrt_buffer_t xrtBuffer;
+        uint32_t count;
+    };
+
+    std::unordered_map<uint32_t, buf_cnt_t> xrtBuffers_;
+
+    // Linear layout: the bank index is the address divided by the bank size.
+    // Valid indices are 0 .. num_banks-1, so index == num_banks is already
+    // out of range.
+    int get_bank_info(uint64_t addr, uint32_t* pIdx, uint64_t* pOff) {
+        uint32_t num_banks = 1 << platform_.lg2_num_banks;
+        uint64_t bank_size = 1ull << platform_.lg2_bank_size;
+        uint32_t index = addr >> platform_.lg2_bank_size;
+        uint64_t offset = addr & (bank_size-1);
+        if (index >= num_banks) {
+            fprintf(stderr, "[VXDRV] Error: address out of range: 0x%lx\n", addr);
+            return -1;
+        }
+        if (pIdx) {
+            *pIdx = index;
+        }
+        if (pOff) {
+            *pOff = offset;
+        }
+        printf("get_bank_info(addr=0x%lx, bank=%d, offset=0x%lx\n", addr, index, offset);
+        return 0;
+    }
+
+    // With pBuf == nullptr: takes a reference on the bank buffer, creating
+    // it on first use. With pBuf != nullptr: returns the bank buffer without
+    // changing the count of an existing entry (a missing entry is still
+    // created with count 1).
+    int get_buffer(uint32_t bank_id, xrt_buffer_t* pBuf) {
+        auto it = xrtBuffers_.find(bank_id);
+        if (it != xrtBuffers_.end()) {
+            if (pBuf) {
+                *pBuf = it->second.xrtBuffer;
+            } else {
+                printf("reusing bank%d...\n", bank_id);
+                ++it->second.count;
+            }
+        } else {
+            printf("allocating bank%d...\n", bank_id);
+            uint64_t bank_size = 1ull << platform_.lg2_bank_size;
+        #ifdef CPP_API
+            xrt::bo xrtBuffer(xrtDevice_, bank_size, xrt::bo::flags::normal, bank_id);
+        #else
+            CHECK_HANDLE(xrtBuffer, xrtBOAlloc(xrtDevice_, bank_size, XRT_BO_FLAGS_NONE, bank_id), {
+                return -1;
+            });
+        #endif
+            xrtBuffers_.insert({bank_id, {xrtBuffer, 1}});
+            if (pBuf) {
+                *pBuf = xrtBuffer;
+            }
+        }
+        return 0;
+    }
+
+#endif
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Stores the capability selected by `caps_id` in *value.
+// Most capabilities are bit fields of the device capability word read at
+// init time; the rest come from build-time constants, the DCR shadow or the
+// ISA capability word.
+// Returns 0 on success and -1 when `hdevice` or `value` is null. An unknown
+// `caps_id` is treated as a programming error and aborts the process.
+extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
+    if (nullptr == hdevice)
+        return -1;
+
+    // every case below writes through `value`
+    if (nullptr == value)
+        return -1;
+
+    vx_device *device = ((vx_device*)hdevice);
+
+    switch (caps_id) {
+    case VX_CAPS_VERSION:
+        *value = (device->dev_caps >> 0) & 0xff;
+        break;
+    case VX_CAPS_NUM_THREADS:
+        *value = (device->dev_caps >> 8) & 0xff;
+        break;
+    case VX_CAPS_NUM_WARPS:
+        *value = (device->dev_caps >> 16) & 0xff;
+        break;
+    case VX_CAPS_NUM_CORES:
+        *value = (device->dev_caps >> 24) & 0xffff;
+        break;
+    case VX_CAPS_CACHE_LINE_SIZE:
+        *value = CACHE_BLOCK_SIZE;
+        break;
+    case VX_CAPS_GLOBAL_MEM_SIZE:
+        *value = device->global_mem_size;
+        break;
+    case VX_CAPS_LOCAL_MEM_SIZE:
+        // the field holds log2 of the size, so the smallest reported value is 1
+        *value = 1ull << ((device->dev_caps >> 40) & 0xff);
+        break;
+    case VX_CAPS_KERNEL_BASE_ADDR:
+        *value = (uint64_t(device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) |
+                           device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
+        break;
+    case VX_CAPS_ISA_FLAGS:
+        *value = device->isa_caps;
+        break;
+    default:
+        fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id);
+        std::abort();
+        return -1;
+    }
+
+    return 0;
+}
+
+// Opens the XRT device, loads the xclbin, opens the Vortex kernel, builds
+// and initializes the driver object and returns it through *hdevice.
+//
+// Environment:
+//   XRT_DEVICE_INDEX  device index (default DEFAULT_DEVICE_INDEX)
+//   XRT_XCLBIN_PATH   xclbin file  (default DEFAULT_XCLBIN_PATH)
+//
+// Returns 0 on success. On failure returns non-zero, leaves *hdevice
+// untouched and releases everything acquired so far.
+extern int vx_dev_open(vx_device_h* hdevice) {
+    if (nullptr == hdevice)
+        return -1;
+
+    int device_index = DEFAULT_DEVICE_INDEX;
+    const char* device_index_s = getenv("XRT_DEVICE_INDEX");
+    if (device_index_s != nullptr) {
+        device_index = atoi(device_index_s);
+    }
+
+    const char* xlbin_path_s = getenv("XRT_XCLBIN_PATH");
+    if (xlbin_path_s == nullptr) {
+        xlbin_path_s = DEFAULT_XCLBIN_PATH;
+    }
+
+#ifdef CPP_API
+
+    auto xrtDevice = xrt::device(device_index);
+    auto uuid = xrtDevice.load_xclbin(xlbin_path_s);
+    auto xrtKernel = xrt::ip(xrtDevice, uuid, KERNEL_NAME);
+    auto xclbin = xrt::xclbin(xlbin_path_s);
+
+    auto device_name = xrtDevice.get_info<xrt::info::device::name>();
+
+    /*{
+        uint32_t num_banks = 0;
+        uint64_t bank_size = 0;
+        uint64_t mem_base  = 0;
+
+        auto mem_json = nlohmann::json::parse(xrtDevice.get_info<xrt::info::device::memory>());
+        if (!mem_json.is_null()) {
+            uint32_t index = 0;
+            for (auto& mem : mem_json["board"]["memory"]["memories"]) {
+                auto enabled = mem["enabled"].get<std::string>();
+                if (enabled == "true") {
+                    if (index == 0) {
+                        mem_base = std::stoull(mem["base_address"].get<std::string>(), nullptr, 16);
+                        bank_size = std::stoull(mem["range_bytes"].get<std::string>(), nullptr, 16);
+                    }
+                    ++index;
+                }
+            }
+            num_banks = index;
+        }
+
+        fprintf(stderr, "[VXDRV] memory description: base=0x%lx, size=0x%lx, count=%d\n", mem_base, bank_size, num_banks);
+    }*/
+
+    /*{
+        std::cout << "Device" << device_index << " : " << xrtDevice.get_info<xrt::info::device::name>() << std::endl;
+        std::cout << "  bdf      : " << xrtDevice.get_info<xrt::info::device::bdf>() << std::endl;
+        std::cout << "  kdma     : " << xrtDevice.get_info<xrt::info::device::kdma>() << std::endl;
+        std::cout << "  max_freq : " << xrtDevice.get_info<xrt::info::device::max_clock_frequency_mhz>() << std::endl;
+        std::cout << "  memory   : " << xrtDevice.get_info<xrt::info::device::memory>() << std::endl;
+        std::cout << "  thermal  : " << xrtDevice.get_info<xrt::info::device::thermal>() << std::endl;
+        std::cout << "  m2m      : " << std::boolalpha << xrtDevice.get_info<xrt::info::device::m2m>() << std::dec << std::endl;
+        std::cout << "  nodma    : " << std::boolalpha << xrtDevice.get_info<xrt::info::device::nodma>() << std::dec << std::endl;
+
+        std::cout << "Memory info :" << std::endl;
+        for (const auto& mem_bank : xclbin.get_mems()) {
+            std::cout << "  index : " << mem_bank.get_index() << std::endl;
+            std::cout << "  tag : " << mem_bank.get_tag() << std::endl;
+            std::cout << "  type : " << (int)mem_bank.get_type() << std::endl;
+            std::cout << "  base_address : 0x" << std::hex << mem_bank.get_base_address() << std::endl;
+            std::cout << "  size : 0x" << (mem_bank.get_size_kb() * 1000) << std::dec << std::endl;
+            std::cout << "  used :" << mem_bank.get_used() << std::endl;
+        }
+    }*/
+
+    // get platform info
+    platform_info_t platform_info;
+    CHECK_ERR(get_platform_info(device_name, &platform_info), {
+        fprintf(stderr, "[VXDRV] Error: platform not supported: %s\n", device_name.c_str());
+        return -1;
+    });
+
+    CHECK_HANDLE(device, new vx_device(xrtDevice, xrtKernel, platform_info), {
+        return -1;
+    });
+
+#else
+
+    CHECK_HANDLE(xrtDevice, xrtDeviceOpen(device_index), {
+        return -1;
+    });
+
+    CHECK_ERR(xrtDeviceLoadXclbinFile(xrtDevice, xlbin_path_s), {
+        dump_xrt_error(xrtDevice, err);
+        xrtDeviceClose(xrtDevice);
+        return -1;
+    });
+
+    xuid_t uuid;
+    CHECK_ERR(xrtDeviceGetXclbinUUID(xrtDevice, uuid), {
+        dump_xrt_error(xrtDevice, err);
+        xrtDeviceClose(xrtDevice);
+        return -1;
+    });
+
+    CHECK_HANDLE(xrtKernel, xrtPLKernelOpenExclusive(xrtDevice, uuid, KERNEL_NAME), {
+        xrtDeviceClose(xrtDevice);
+        return -1;
+    });
+
+    int device_name_size;
+    xrtXclbinGetXSAName(xrtDevice, nullptr, 0, &device_name_size);
+    std::vector<char> device_name(device_name_size);
+    xrtXclbinGetXSAName(xrtDevice, device_name.data(), device_name_size, nullptr);
+
+    // get platform info
+    platform_info_t platform_info;
+    CHECK_ERR(get_platform_info(device_name.data(), &platform_info), {
+        fprintf(stderr, "[VXDRV] Error: platform not supported: %s\n", device_name.data());
+        // no vx_device owns the raw handles yet, so close them here
+        xrtKernelClose(xrtKernel);
+        xrtDeviceClose(xrtDevice);
+        return -1;
+    });
+
+    CHECK_HANDLE(device, new vx_device(xrtDevice, xrtKernel, platform_info), {
+        xrtKernelClose(xrtKernel);
+        xrtDeviceClose(xrtDevice);
+        return -1;
+    });
+
+#endif
+
+    // initialize device
+    // From here on `device` owns the XRT handles: deleting it is the only
+    // cleanup needed on any later failure.
+    CHECK_ERR(device->init(), {
+        delete device;
+        return -1;
+    });
+
+#ifdef SCOPE
+    {
+        scope_callback_t callback;
+        callback.registerWrite = [](vx_device_h hdevice, uint64_t value)->int {
+            auto device = (vx_device*)hdevice;
+            uint32_t value_lo = (uint32_t)(value);
+            uint32_t value_hi = (uint32_t)(value >> 32);
+            CHECK_ERR(device->write_register(MMIO_SCP_ADDR, value_lo), {
+                return -1;
+            });
+            CHECK_ERR(device->write_register(MMIO_SCP_ADDR + 4, value_hi), {
+                return -1;
+            });
+            return 0;
+        };
+        callback.registerRead = [](vx_device_h hdevice, uint64_t* value)->int {
+            auto device = (vx_device*)hdevice;
+            uint32_t value_lo, value_hi;
+            CHECK_ERR(device->read_register(MMIO_SCP_ADDR, &value_lo), {
+                return -1;
+            });
+            CHECK_ERR(device->read_register(MMIO_SCP_ADDR + 4, &value_hi), {
+                return -1;
+            });
+            *value = (((uint64_t)value_hi) << 32) | value_lo;
+            return 0;
+        };
+        int ret = vx_scope_start(&callback, device, 0, -1);
+        if (ret != 0) {
+            delete device;
+            return ret;
+        }
+    }
+#endif
+
+    CHECK_ERR(dcr_initialize(device), {
+        delete device;
+        return -1;
+    });
+
+#ifdef DUMP_PERF_STATS
+    perf_add_device(device);
+#endif
+
+    *hdevice = device;
+
+    DBGPRINT("device creation complete!\n");
+
+    return 0;
+}
+
+// Destroys a device created by vx_dev_open(). When built with SCOPE the
+// scope analyzer is stopped first.
+// Returns -1 for a null handle, 0 otherwise.
+extern int vx_dev_close(vx_device_h hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+#ifdef SCOPE
+    vx_scope_stop(hdevice);
+#endif
+
+    delete (vx_device*)hdevice;
+
+    DBGPRINT("device destroyed!\n");
+
+    return 0;
+}
+
+// Validates the arguments and forwards to vx_device::mem_alloc().
+// Returns -1 for a null handle, a null `dev_addr` or a zero `size`.
+extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+    if (dev_addr == nullptr) {
+        return -1;
+    }
+    if (size == 0) {
+        return -1;
+    }
+
+    vx_device* target = (vx_device*)hdevice;
+    return target->mem_alloc(size, type, dev_addr);
+}
+
+// Forwards to vx_device::mem_free().
+// Returns -1 for a null handle; freeing address 0 is accepted as a no-op.
+extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    if (dev_addr == 0) {
+        return 0;
+    }
+
+    vx_device* target = (vx_device*)hdevice;
+    return target->mem_free(dev_addr);
+}
+
+// Forwards to vx_device::mem_info(); returns -1 for a null handle.
+extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* target = (vx_device*)hdevice;
+    return target->mem_info(type, mem_free, mem_used);
+}
+
+// Copies `size` bytes from host memory at `host_ptr` to device address
+// `dev_addr`, which must be aligned to CACHE_BLOCK_SIZE.
+//
+// The device is written in whole cache blocks. Only `size` bytes are ever
+// read from `host_ptr`: when `size` is not a multiple of CACHE_BLOCK_SIZE
+// the final partial block is staged in a zero-padded temporary instead of
+// reading past the end of the caller's buffer.
+//
+// Returns 0 on success, -1 on a null handle, a null `host_ptr` with a
+// non-zero `size`, a misaligned or out-of-range address, or a transfer
+// failure.
+extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
+    if (nullptr == hdevice)
+        return -1;
+
+    if (nullptr == host_ptr && 0 != size)
+        return -1;
+
+    auto device = (vx_device*)hdevice;
+
+    // check alignment
+    if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
+        return -1;
+
+    uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+
+    // bound checking, written so that neither the round-up nor the
+    // address + size sum can wrap around
+    if (asize < size
+     || asize > device->global_mem_size
+     || dev_addr > device->global_mem_size - asize)
+        return -1;
+
+    // The cast drops const only to match upload()'s byte-pointer parameter;
+    // the source data is never modified.
+    auto src = (uint8_t*)host_ptr;
+    uint64_t body_size = size - (size % CACHE_BLOCK_SIZE);  // whole blocks
+    uint64_t tail_size = size - body_size;                   // trailing partial block
+
+    if (0 != body_size) {
+        CHECK_ERR(device->upload(dev_addr, src, body_size), {
+            return -1;
+        });
+    }
+
+    if (0 != tail_size) {
+        std::vector<uint8_t> block(CACHE_BLOCK_SIZE);  // value-initialized to zero
+        memcpy(block.data(), src + body_size, tail_size);
+        CHECK_ERR(device->upload(dev_addr + body_size, block.data(), CACHE_BLOCK_SIZE), {
+            return -1;
+        });
+    }
+
+    DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size);
+
+    return 0;
+}
+
+// Copies `size` bytes from device address `dev_addr`, which must be aligned
+// to CACHE_BLOCK_SIZE, to host memory at `host_ptr`.
+//
+// The device is read in whole cache blocks. Only `size` bytes are ever
+// written to `host_ptr`: when `size` is not a multiple of CACHE_BLOCK_SIZE
+// the final block is read into a temporary and only its leading bytes are
+// copied out, so the caller's buffer is never overrun.
+//
+// Returns 0 on success, -1 on a null handle, a null `host_ptr` with a
+// non-zero `size`, a misaligned or out-of-range address, or a transfer
+// failure.
+extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
+    if (nullptr == hdevice)
+        return -1;
+
+    if (nullptr == host_ptr && 0 != size)
+        return -1;
+
+    auto device = (vx_device*)hdevice;
+
+    // check alignment
+    if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
+        return -1;
+
+    uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+
+    // bound checking, written so that neither the round-up nor the
+    // address + size sum can wrap around
+    if (asize < size
+     || asize > device->global_mem_size
+     || dev_addr > device->global_mem_size - asize)
+        return -1;
+
+    auto dst = (uint8_t*)host_ptr;
+    uint64_t body_size = size - (size % CACHE_BLOCK_SIZE);  // whole blocks
+    uint64_t tail_size = size - body_size;                   // trailing partial block
+
+    if (0 != body_size) {
+        CHECK_ERR(device->download(dst, dev_addr, body_size), {
+            return -1;
+        });
+    }
+
+    if (0 != tail_size) {
+        std::vector<uint8_t> block(CACHE_BLOCK_SIZE);
+        CHECK_ERR(device->download(block.data(), dev_addr + body_size, CACHE_BLOCK_SIZE), {
+            return -1;
+        });
+        memcpy(dst + body_size, block.data(), tail_size);
+    }
+
+    DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, asize);
+
+    return 0;
+}
+
+// Starts execution by writing CTL_AP_START to the control register.
+// Returns -1 for a null handle or a failed register write, 0 otherwise.
+extern int vx_start(vx_device_h hdevice) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* device = (vx_device*)hdevice;
+
+    //wait_for_enter("\nPress ENTER to continue after setting up ILA trigger...");
+
+    CHECK_ERR(device->write_register(MMIO_CTL_ADDR, CTL_AP_START), {
+        return -1;
+    });
+
+    DBGPRINT("START\n");
+
+    return 0;
+}
+
+// Polls the control register until CTL_AP_DONE is set or `timeout`
+// (milliseconds) has been used up. The poll interval is 1 s in debug builds
+// and 1 ms when NDEBUG is defined.
+//
+// Returns -1 for a null handle or a failed register read, 0 otherwise.
+// NOTE(review): 0 is also returned when the timeout expires before the
+// device reports done, so callers cannot tell the two apart - confirm
+// whether that is intended.
+extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
+    if (nullptr == hdevice)
+        return -1;
+
+    auto device = (vx_device*)hdevice;
+
+    struct timespec sleep_time;
+
+#ifndef NDEBUG
+    sleep_time.tv_sec = 1;
+    sleep_time.tv_nsec = 0;
+#else
+    sleep_time.tv_sec = 0;
+    sleep_time.tv_nsec = 1000000;
+#endif
+
+    // to milliseconds
+    uint64_t sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);
+
+    for (;;) {
+        uint32_t status = 0;
+        CHECK_ERR(device->read_register(MMIO_CTL_ADDR, &status), {
+            return -1;
+        });
+        bool is_done = (status & CTL_AP_DONE) == CTL_AP_DONE;
+        if (is_done || 0 == timeout) {
+            break;
+        }
+        nanosleep(&sleep_time, nullptr);
+        // Saturate at zero: `timeout` is unsigned, so subtracting past zero
+        // would wrap to a huge value and turn a short timeout into a
+        // practically endless wait.
+        timeout = (timeout > sleep_time_ms) ? (timeout - sleep_time_ms) : 0;
+    }
+
+    return 0;
+}
+
+// Writes one device configuration register: the DCR address goes to
+// MMIO_DCR_ADDR and the value to MMIO_DCR_ADDR + 4. After both writes
+// succeed the value is also recorded in the host-side `dcrs` shadow.
+// write_register() takes a 32-bit value, so only the low 32 bits of `value`
+// reach the device.
+// Returns -1 for a null handle or a failed register write, 0 otherwise.
+extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
+    if (hdevice == nullptr) {
+        return -1;
+    }
+
+    vx_device* device = (vx_device*)hdevice;
+
+    CHECK_ERR(device->write_register(MMIO_DCR_ADDR, addr), {
+        return -1;
+    });
+
+    CHECK_ERR(device->write_register(MMIO_DCR_ADDR + 4, value), {
+        return -1;
+    });
+
+    // save the value
+    DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%lx\n", addr, value);
+    device->dcrs.write(addr, value);
+
+    return 0;
+}