Vortex 2.0 changes:

+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions
--- a/runtime/common/malloc.h
+++ b/runtime/common/malloc.h
@@ -0,0 +1,455 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <assert.h>
+#include <stdio.h>
+
+namespace vortex {
+
+// Bookkeeping allocator for a window of `capacity` bytes whose first byte has
+// the address `baseAddress`. The allocator never dereferences the addresses it
+// hands out; it only tracks which ranges are in use.
+//
+// The window is carved into pages (sized in multiples of `pageAlign`) that are
+// stacked back to back starting at offset 0. Each page is split into blocks
+// (sized in multiples of `blockAlign`) that are tracked in three lists:
+//   - usedList:  blocks currently handed out,
+//   - freeSList: free blocks sorted by decreasing size (allocation lookup),
+//   - freeMList: free blocks sorted by increasing address (merging on release).
+//
+// All addresses stored in pages and blocks are offsets relative to
+// `baseAddress`; only allocate()/release() translate to absolute addresses.
+class MemoryAllocator {
+public:
+    // baseAddress: value added to every offset returned by allocate().
+    // capacity:    total number of bytes the pages may cover.
+    // pageAlign:   page sizes are rounded up to this power of two.
+    // blockAlign:  allocation sizes are rounded up to this power of two.
+    MemoryAllocator(
+        uint64_t baseAddress,
+        uint64_t capacity,
+        uint32_t pageAlign, 
+        uint32_t blockAlign) 
+        : baseAddress_(baseAddress)
+        , capacity_(capacity)
+        , pageAlign_(pageAlign)
+        , blockAlign_(blockAlign)
+        , pages_(nullptr)
+        , nextAddress_(0)
+        , allocated_(0)
+    {}
+
+    // The allocator owns its page/block objects through raw pointers, so a
+    // member-wise copy would end in a double delete.
+    MemoryAllocator(const MemoryAllocator&) = delete;
+    MemoryAllocator& operator=(const MemoryAllocator&) = delete;
+
+    ~MemoryAllocator() {
+        // Free every page and every block it still tracks, including pages
+        // that hold outstanding allocations or are not at the top of the stack.
+        while (pages_) {
+            auto nextPage = pages_->next;
+            DestroyPage(pages_);
+            pages_ = nextPage;
+        }
+    }
+    
+    // Returns the base address passed to the constructor (full 64 bits).
+    uint64_t baseAddress() const {
+        return baseAddress_;
+    }
+
+    // Returns the capacity passed to the constructor (full 64 bits).
+    uint64_t capacity() const {
+        return capacity_;
+    }
+
+    // Returns the number of bytes not currently accounted to an allocation.
+    uint64_t free() const {
+        return (capacity_ - allocated_);
+    }
+
+    // Returns the number of bytes currently accounted to allocations.
+    uint64_t allocated() const {
+        return allocated_;
+    }
+
+    // Reserves at least `size` bytes and stores the absolute address in *addr.
+    // Returns 0 on success, -1 on invalid arguments or when no page can be
+    // created for the request.
+    int allocate(uint64_t size, uint64_t* addr) {
+        if (size == 0 || addr == nullptr) {
+            printf("error: invalid argurments\n");
+            return -1;
+        }
+
+        // Align allocation size; a result smaller than the input means the
+        // rounding wrapped around 64 bits, i.e. the request cannot be served.
+        const uint64_t alignedSize = AlignSize(size, blockAlign_);
+        if (alignedSize < size) {
+            printf("error: out of memory\n");
+            return -1;
+        }
+        size = alignedSize;
+
+        // Walk through all pages to find a free block
+        block_t* freeBlock = nullptr;
+        auto currPage = pages_;
+        while (currPage) {
+            auto currBlock = currPage->freeSList;
+            if (currBlock) {
+                // The free S-list is already sorted with the largest block first
+                // Quick check if the head block has enough space.
+                if (currBlock->size >= size) {
+                    // Find the smallest matching block in the S-list
+                    while (currBlock->nextFreeS 
+                        && (currBlock->nextFreeS->size >= size)) {
+                        currBlock = currBlock->nextFreeS;
+                    }
+                    // Return the free block
+                    freeBlock = currBlock;
+                    break;
+                }
+            }
+            currPage = currPage->next;
+        }
+
+        if (nullptr == freeBlock) {
+            // Allocate a new page for this request
+            currPage = this->NewPage(size);
+            if (nullptr == currPage) {
+                printf("error: out of memory\n");
+                return -1;
+            }
+            freeBlock = currPage->freeSList;
+        }   
+
+        // Remove the block from the free lists
+        assert(freeBlock->size >= size);
+        currPage->RemoveFreeMList(freeBlock);
+        currPage->RemoveFreeSList(freeBlock);
+
+        // If the free block we have found is larger than what we are looking for,
+        // we may be able to split our free block in two.
+        uint64_t extraBytes = freeBlock->size - size;
+        if (extraBytes >= blockAlign_) {
+            // Reduce the free block size to the requested value
+            freeBlock->size = size;
+
+            // Allocate a new block to contain the extra buffer
+            auto nextAddr = freeBlock->addr + size;
+            auto newBlock = new block_t(nextAddr, extraBytes);
+
+            // Add the new block to the free lists
+            currPage->InsertFreeMList(newBlock);
+            currPage->InsertFreeSList(newBlock);
+        }
+
+        // Insert the free block into the used list
+        currPage->InsertUsedList(freeBlock);
+
+        // Return the free block address
+        *addr = baseAddress_ + freeBlock->addr;
+
+        // Account for the block's real size (it can exceed `size` when the
+        // remainder was too small to split off) so that release(), which
+        // subtracts the block size, keeps the counter balanced.
+        allocated_ += freeBlock->size;
+
+        return 0;
+    }
+
+    // Returns the block that starts at absolute address `addr` to the free
+    // lists, merges it with adjacent free blocks and drops pages that became
+    // empty at the top of the page stack.
+    // Returns 0 on success, -1 when `addr` is not the start of a used block.
+    int release(uint64_t addr) {
+        // Walk all pages to find the pointer
+        uint64_t local_addr = addr - baseAddress_;
+        block_t* usedBlock = nullptr;
+        auto currPage = pages_;
+        while (currPage) {
+            if (local_addr >= currPage->addr
+            &&  local_addr < (currPage->addr + currPage->size)) {
+                auto currBlock = currPage->usedList;
+                while (currBlock) {
+                    if (currBlock->addr == local_addr) {
+                        usedBlock = currBlock;
+                        break;
+                    }
+                    currBlock = currBlock->nextUsed;
+                }
+                break;
+            }
+            currPage = currPage->next;
+        }
+
+        // found the corresponding block?
+        if (nullptr == usedBlock) {
+            // uint64_t is not `unsigned long` on every platform, so convert
+            // explicitly to the type the format specifier names.
+            printf("error: invalid address to release: 0x%llx\n", static_cast<unsigned long long>(addr));
+            return -1;
+        }
+
+        // Remember the size now: merging below changes usedBlock->size.
+        auto size = usedBlock->size;
+
+        // Remove the block from the used list
+        currPage->RemoveUsedList(usedBlock);
+
+        // Insert the block into the free M-list.
+        currPage->InsertFreeMList(usedBlock);
+
+        // Check if we can merge adjacent free blocks from the left.        
+        if (usedBlock->prevFreeM) {
+            // Calculate the previous address
+            auto prevAddr = usedBlock->prevFreeM->addr + usedBlock->prevFreeM->size;
+            if (usedBlock->addr == prevAddr) {
+                auto prevBlock = usedBlock->prevFreeM;
+
+                // Merge the blocks to the left
+                prevBlock->size += usedBlock->size;
+                prevBlock->nextFreeM = usedBlock->nextFreeM;
+                if (prevBlock->nextFreeM) {
+                    prevBlock->nextFreeM->prevFreeM = prevBlock;
+                }
+
+                // Detach previous block from the free S-list since size increased
+                currPage->RemoveFreeSList(prevBlock);
+
+                // reset usedBlock
+                delete usedBlock;
+                usedBlock = prevBlock;
+            }
+        }
+
+        // Check if we can merge adjacent free blocks from the right.
+        if (usedBlock->nextFreeM) {
+            // Calculate the next allocation start address
+            auto nextAddr = usedBlock->addr + usedBlock->size;
+            if (usedBlock->nextFreeM->addr == nextAddr) {
+                auto nextBlock = usedBlock->nextFreeM;
+
+                // Merge the blocks to the right
+                usedBlock->size += nextBlock->size;
+                usedBlock->nextFreeM = nextBlock->nextFreeM;
+                if (usedBlock->nextFreeM) {
+                    usedBlock->nextFreeM->prevFreeM = usedBlock;
+                }
+
+                // Delete next block
+                currPage->RemoveFreeSList(nextBlock);
+                delete nextBlock;
+            }
+        }
+
+        // Insert the block into the free S-list.
+        currPage->InsertFreeSList(usedBlock);
+
+        // Check if we can free empty pages
+        if (nullptr == currPage->usedList) {
+            // Try to delete the page; deleting the top page may expose another
+            // empty page as the new top, so keep going while that succeeds.
+            while (currPage && this->DeletePage(currPage)) {
+                currPage = this->FindNextEmptyPage();
+            }
+        }
+
+        // update allocated size
+        allocated_ -= size;
+
+        return 0;
+    }
+
+private:
+
+    // One contiguous range inside a page. A block is linked into either the
+    // used list or both free lists of its page.
+    struct block_t {
+        block_t* nextFreeS;
+        block_t* prevFreeS;
+        
+        block_t* nextFreeM;
+        block_t* prevFreeM;
+        
+        block_t* nextUsed;
+        block_t* prevUsed;
+
+        uint64_t addr;  // offset relative to baseAddress_
+        uint64_t size;  // length in bytes
+
+        block_t(uint64_t addr, uint64_t size) 
+            : nextFreeS(nullptr)
+            , prevFreeS(nullptr)
+            , nextFreeM(nullptr)
+            , prevFreeM(nullptr)
+            , nextUsed(nullptr)
+            , prevUsed(nullptr)
+            , addr(addr)
+            , size(size)
+        {}
+    };
+
+    // One page of the window; starts out as a single free block that covers
+    // the whole page.
+    struct page_t {
+        page_t*  next;        
+        
+        // List of used blocks
+        block_t* usedList;
+        
+        // List with blocks sorted by decreasing sizes
+        // Used for block lookup during memory allocation.
+        block_t* freeSList;
+        
+        // List with blocks sorted by increasing memory addresses
+        // Used for block merging during memory release.
+        block_t* freeMList;
+        
+        uint64_t addr;  // offset relative to baseAddress_
+        uint64_t size;  // length in bytes
+
+        page_t(uint64_t addr, uint64_t size) : 
+            next(nullptr),            
+            usedList(nullptr),
+            addr(addr),
+            size(size) {
+            freeSList = freeMList = new block_t(addr, size);
+        }
+
+        // Pushes `block` at the head of the used list.
+        void InsertUsedList(block_t* block) {
+            block->nextUsed = usedList;
+            if (usedList) {
+                usedList->prevUsed = block;
+            }
+            usedList = block;
+        }
+
+        // Unlinks `block` from the used list.
+        void RemoveUsedList(block_t* block) {
+            if (block->prevUsed) {
+                block->prevUsed->nextUsed = block->nextUsed;
+            } else {
+                usedList = block->nextUsed;
+            }
+            if (block->nextUsed) {
+                block->nextUsed->prevUsed = block->prevUsed;
+            }
+            block->nextUsed = nullptr;
+            block->prevUsed = nullptr;
+        }
+
+        // Inserts `block` into the M-list, keeping it sorted by address.
+        void InsertFreeMList(block_t* block) {
+            block_t* currBlock = freeMList;
+            block_t* prevBlock = nullptr;
+            while (currBlock && (currBlock->addr < block->addr)) {
+                prevBlock = currBlock;
+                currBlock = currBlock->nextFreeM;
+            }
+            block->nextFreeM = currBlock;
+            block->prevFreeM = prevBlock;
+            if (prevBlock) {
+                prevBlock->nextFreeM = block;
+            } else {
+                freeMList = block;
+            }
+            if (currBlock) {
+                currBlock->prevFreeM = block;
+            }    
+        }
+
+        // Unlinks `block` from the M-list.
+        void RemoveFreeMList(block_t* block) {
+            if (block->prevFreeM) {
+                block->prevFreeM->nextFreeM = block->nextFreeM;
+            } else {
+                freeMList = block->nextFreeM;
+            }
+            if (block->nextFreeM) {
+                block->nextFreeM->prevFreeM = block->prevFreeM;
+            }
+            block->nextFreeM = nullptr;
+            block->prevFreeM = nullptr;
+        }
+
+        // Inserts `block` into the S-list, keeping it sorted by decreasing size.
+        void InsertFreeSList(block_t* block) {
+            block_t* currBlock = this->freeSList;
+            block_t* prevBlock = nullptr;
+            while (currBlock && (currBlock->size > block->size)) {
+                prevBlock = currBlock;
+                currBlock = currBlock->nextFreeS;
+            }
+            block->nextFreeS = currBlock;
+            block->prevFreeS = prevBlock;
+            if (prevBlock) {
+                prevBlock->nextFreeS = block;
+            } else {
+                this->freeSList = block;
+            }
+            if (currBlock) {
+                currBlock->prevFreeS = block;
+            }
+        }
+
+        // Unlinks `block` from the S-list.
+        void RemoveFreeSList(block_t* block) {
+            if (block->prevFreeS) {
+                block->prevFreeS->nextFreeS = block->nextFreeS;
+            } else {
+                freeSList = block->nextFreeS;
+            }
+            if (block->nextFreeS) {
+                block->nextFreeS->prevFreeS = block->prevFreeS;
+            }
+            block->nextFreeS = nullptr;
+            block->prevFreeS = nullptr;    
+        }
+    };
+
+    // Creates a page large enough for `size` bytes on top of the page stack.
+    // Returns nullptr, leaving the allocator untouched, when the page would
+    // not fit into the remaining capacity.
+    page_t* NewPage(uint64_t size) {
+        // Round the request up to the page granularity; a smaller result means
+        // the rounding wrapped around 64 bits.
+        const uint64_t pageSize = AlignSize(size, pageAlign_);
+        if (pageSize < size)
+            return nullptr;
+
+        // Capacity check, done before nextAddress_ is modified so that a
+        // failed request does not consume address space, and written as a
+        // subtraction so that it cannot overflow.
+        if (nextAddress_ > capacity_ || pageSize > (capacity_ - nextAddress_))
+            return nullptr;
+
+        // Allocate object
+        auto newPage = new page_t(nextAddress_, pageSize);
+        nextAddress_ += pageSize;
+
+        // Insert the new page into the list
+        newPage->next = pages_;
+        pages_ = newPage;
+
+        return newPage;
+    }
+
+    // Deletes an empty page if it is the topmost one (the page ending at
+    // nextAddress_) and gives its range back. Returns false, without touching
+    // anything, for any other page.
+    bool DeletePage(page_t* page) {
+        // The page should be empty
+        assert(nullptr == page->usedList);
+        assert(page->freeMList && (nullptr == page->freeMList->nextFreeM));
+
+        // Only delete top-level pages
+        auto nextAddr = page->addr + page->size;
+        if (nextAddr != nextAddress_)
+            return false;
+
+        // Remove the page from the list
+        page_t* prevPage = nullptr;
+        auto currPage = pages_;
+        while (currPage) {
+            if (currPage == page) {
+                if (prevPage) {
+                    prevPage->next = currPage->next;
+                } else {
+                    pages_ = currPage->next;
+                }
+                break;
+            }
+            prevPage = currPage;
+            currPage = currPage->next;
+        }
+
+        // Update next allocation address
+        nextAddress_ = page->addr;
+        
+        // free object
+        delete page->freeMList;
+        delete page;
+
+        return true;
+    }
+
+    // Unconditionally frees `page` together with all of its blocks. Every
+    // block is in exactly one of the used list and the M-list, so walking
+    // these two lists visits each block once.
+    static void DestroyPage(page_t* page) {
+        auto usedBlock = page->usedList;
+        while (usedBlock) {
+            auto nextBlock = usedBlock->nextUsed;
+            delete usedBlock;
+            usedBlock = nextBlock;
+        }
+        auto freeBlock = page->freeMList;
+        while (freeBlock) {
+            auto nextBlock = freeBlock->nextFreeM;
+            delete freeBlock;
+            freeBlock = nextBlock;
+        }
+        delete page;
+    }
+
+    // Returns the first page in the list without used blocks, or nullptr.
+    page_t* FindNextEmptyPage() {
+        auto currPage = pages_;
+        while (currPage) {
+            if (nullptr == currPage->usedList)
+                return currPage;
+            currPage = currPage->next;
+        } 
+        return nullptr;
+    }
+
+    // Rounds `size` up to a multiple of `alignment` (a power of two).
+    static uint64_t AlignSize(uint64_t size, uint64_t alignment) {
+        assert(0 == (alignment & (alignment - 1)));
+        return (size + alignment - 1) & ~(alignment - 1);
+    }
+
+    uint64_t baseAddress_;
+    uint64_t capacity_;
+    uint32_t pageAlign_;    
+    uint32_t blockAlign_;    
+    page_t*  pages_;
+    uint64_t nextAddress_;  // offset where the next page starts; must be as wide as capacity_
+    uint64_t allocated_;
+};
+
+} // namespace vortex
--- a/runtime/common/nlohmann_json.hpp
+++ b/runtime/common/nlohmann_json.hpp
--- a/runtime/common/scope.cpp
+++ b/runtime/common/scope.cpp
@@ -0,0 +1,359 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "scope.h"
+#include <VX_config.h>
+#include <nlohmann_json.hpp>
+#include <iostream>
+#include <fstream>
+#include <thread>
+#include <chrono>
+#include <vector>
+#include <list>
+#include <assert.h>
+#include <chrono>
+#include <thread>
+#include <condition_variable>
+#include <mutex>
+#include <unordered_set>
+#include <sstream>
+
+#define FRAME_FLUSH_SIZE 100
+
+#define MMIO_SCOPE_READ  (AFU_IMAGE_MMIO_SCOPE_READ * 4)
+#define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4)
+
+#define CMD_GET_WIDTH   0
+#define CMD_GET_COUNT   1
+#define CMD_GET_START   2
+#define CMD_GET_DATA    3
+#define CMD_SET_START   4
+#define CMD_SET_STOP    5
+
+// Evaluates `_expr` once as an int. A zero result falls through; any other
+// value is logged with the expression text and then returned from the
+// function that uses the macro, so it is only usable inside functions whose
+// return type accepts an int.
+#define CHECK_ERR(_expr)    \
+    do {                    \
+        int err = _expr;    \
+        if (err == 0)       \
+            break;          \
+        printf("[SCOPE] error: '%s' returned %d!\n", #_expr, err); \
+        return err;         \
+    } while (false)
+
+// One signal recorded by a tap.
+struct tap_signal_t {
+    // Identifier written into the VCD `$var` declaration and after every value
+    // dump; vx_scope_stop() numbers signals sequentially starting at 1.
+    uint32_t id;  
+    // Signal name, taken from the first element of the manifest entry.
+    std::string name;    
+    // Number of bits, taken from the second element of the manifest entry.
+    uint32_t width;    
+};
+
+// State of one scope tap while its trace is being dumped.
+struct tap_t {
+    // Tap id from the manifest; placed at bit 3 and up of every register command.
+    uint32_t id;    
+    // Number of bits in one frame (manifest "width").
+    uint32_t width;    
+    // Number of frames reported by the device (CMD_GET_COUNT).
+    uint32_t frames;    
+    // Number of frames dumped so far; the tap is done when it equals `frames`.
+    uint32_t cur_frame;
+    // Time at which the next frame of this tap is emitted.
+    uint64_t cycle_time;
+    // Dot-separated module path from the manifest.
+    std::string path;
+    // Signals that make up one frame.
+    std::vector<tap_signal_t> signals;
+};
+
+static scope_callback_t g_callback;
+
+using json = nlohmann::json;
+
+// Splits `s` at every occurrence of `delimiter`.
+// Empty fields at the start or in the middle are kept; a delimiter at the very
+// end of the string does not produce a trailing empty field, and an empty
+// string yields no fields at all.
+static std::vector<std::string> split(const std::string &s, char delimiter) {
+    std::vector<std::string> fields;
+    std::string::size_type begin = 0;
+    while (begin < s.size()) {
+        const std::string::size_type end = s.find(delimiter, begin);
+        if (end == std::string::npos) {
+            // last field: everything up to the end of the string
+            fields.push_back(s.substr(begin));
+            break;
+        }
+        fields.push_back(s.substr(begin, end - begin));
+        begin = end + 1;
+    }
+    return fields;
+}
+
+// Writes the VCD scope for module `name`: the `$scope` line, one `$var` line
+// per signal of the tap registered under that name (if any), the scopes of all
+// child modules listed in `hierarchy` (recursively, indented one more column),
+// and the closing `$upscope` line.
+static void dump_module(std::ofstream& ofs, 
+                        const std::string& name,
+                        std::unordered_map<std::string, std::unordered_set<std::string>>& hierarchy,
+                        std::unordered_map<std::string, tap_t*>& tails,
+                        int indentation) {
+    const std::string pad(indentation, ' ');
+    ofs << pad << "$scope module " << name << " $end" << std::endl;
+
+    // Signals of the tap attached to this module, if there is one
+    const auto tap_entry = tails.find(name);
+    if (tap_entry != tails.end()) {
+        const tap_t* tap = tap_entry->second;
+        for (const auto& sig : tap->signals) {
+            ofs << pad << " $var reg " << sig.width << " " << sig.id << " " << sig.name << " $end" << std::endl;
+        }
+    }
+
+    // Nested modules
+    const auto children = hierarchy.find(name);
+    if (children != hierarchy.end()) {
+        for (const auto& child_name : children->second) {
+            dump_module(ofs, child_name, hierarchy, tails, indentation + 1);
+        }
+    }
+
+    ofs << pad << "$upscope $end" << std::endl;
+}
+
+// Writes the VCD header: version, timescale, the TOP scope with the clock
+// variable, and the module hierarchy derived from the dot-separated tap paths.
+// The last path component of a tap is the module its signals are declared in.
+// Taps with an empty path contribute nothing to the hierarchy.
+static void dump_header(std::ofstream& ofs, std::vector<tap_t>& taps) {
+    ofs << "$version Generated by Vortex Scope Analyzer $end" << std::endl;
+    ofs << "$timescale 1 ns $end" << std::endl; 
+    ofs << "$scope module TOP $end" << std::endl;
+    ofs << " $var reg 1 0 clk $end" << std::endl;
+
+    std::unordered_map<std::string, std::unordered_set<std::string>> hierarchy;
+    std::unordered_set<std::string> heads;
+    std::unordered_map<std::string, tap_t*> tails;
+
+    // Build hierarchy
+    for (auto& tap : taps) {
+        std::vector<std::string> tokens = split(tap.path, '.');
+        // An empty path yields no tokens; front()/back() would be out of range.
+        if (tokens.empty())
+            continue;
+        for (size_t i = 1; i < tokens.size(); ++i) {
+            hierarchy[tokens[i-1]].insert(tokens[i]);
+        }
+        heads.insert(tokens.front());
+        tails[tokens.back()] = &tap;
+    }
+
+    // Dump module hierarchy
+    for (auto& head : heads) {
+        dump_module(ofs, head, hierarchy, tails, 1);
+    }
+
+    ofs << "$upscope $end" << std::endl;    
+    ofs << "enddefinitions $end" << std::endl;
+}
+
+// Returns the tap with the smallest cycle_time among the taps that still have
+// frames left (cur_frame != frames); the first such tap wins on ties.
+// Returns nullptr when every tap has been fully dumped.
+static tap_t* find_nearest_tap(std::vector<tap_t>& taps) {
+    tap_t* best = nullptr;
+    for (auto& candidate : taps) {
+        const bool exhausted = (candidate.cur_frame == candidate.frames);
+        if (exhausted)
+            continue;
+        if (best == nullptr || candidate.cycle_time < best->cycle_time)
+            best = &candidate;
+    }
+    return best;
+}
+
+// Emits one clock period (value 0 at timestamp 2*t, value 1 at 2*t + 1, for
+// VCD variable `0`) for every t from cur_time up to, but not including,
+// next_time. Returns the resulting time: next_time if anything was emitted,
+// otherwise cur_time unchanged.
+static uint64_t advance_time(std::ofstream& ofs, uint64_t next_time, uint64_t cur_time) {
+    for (; cur_time < next_time; ++cur_time) {
+        const uint64_t low_stamp = cur_time * 2;
+        ofs << '#' << low_stamp << std::endl;
+        ofs << "b0 0" << std::endl;
+        ofs << '#' << (low_stamp + 1) << std::endl;
+        ofs << "b1 0" << std::endl;
+    }
+    return cur_time;
+}
+
+// Reads one frame of `tap` from the device, 64 bits per register read, and
+// writes the value of each signal to the VCD stream. Bits arrive starting with
+// the last signal in tap->signals, least significant bit first. After a frame
+// that is not the tap's last one, an extra read fetches the delta to the next
+// frame and advances tap->cycle_time by 1 + delta.
+// Returns 0 on success, -1 when the signal list does not describe the frame,
+// or the error code of a failed register access.
+static int dump_tap(std::ofstream& ofs, tap_t* tap, vx_device_h hdevice) {
+    // The unpacking loop below relies on non-zero signal widths that add up to
+    // exactly one frame; anything else would index outside the bit buffer or
+    // walk past the end of the signal list.
+    uint64_t total_width = 0;
+    for (auto& signal : tap->signals) {
+        if (0 == signal.width) {
+            std::cerr << "[SCOPE] error: tap #" << tap->id << " has a zero-width signal: " << signal.name << std::endl;
+            return -1;
+        }
+        total_width += signal.width;
+    }
+    if (tap->signals.empty() || total_width != tap->width) {
+        std::cerr << "[SCOPE] error: tap #" << tap->id << " signals width=" << total_width << " does not match frame width=" << tap->width << std::endl;
+        return -1;
+    }
+
+    uint32_t signal_offset = 0;   
+    uint32_t frame_offset = 0;
+    uint64_t word;
+
+    // One character per bit plus the null terminator written at index
+    // signal_width, which equals tap->width when a signal spans the frame.
+    std::vector<char> signal_data(tap->width + 1);
+    auto signal_it = tap->signals.rbegin();
+    uint32_t signal_width = signal_it->width;
+
+    do {
+        // read data
+        uint64_t cmd_data = (uint64_t(tap->id) << 3) | CMD_GET_DATA;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));        
+        CHECK_ERR(g_callback.registerRead(hdevice, &word));        
+        do {            
+            uint32_t word_offset = frame_offset % 64;
+            // Fill the string from the right so that it reads MSB first
+            signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
+            ++signal_offset;
+            ++frame_offset;
+            if (signal_offset == signal_width) {
+                signal_data[signal_width] = 0; // string null termination
+                ofs << 'b' << signal_data.data() << ' ' << signal_it->id << std::endl;
+                if (frame_offset == tap->width) {
+                    // end-of-frame
+                    ++tap->cur_frame;
+                    if (tap->cur_frame != tap->frames) {
+                        // read next delta
+                        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));      
+                        CHECK_ERR(g_callback.registerRead(hdevice, &word));
+                        tap->cycle_time += 1 + word;
+                        if (0 == (tap->cur_frame % FRAME_FLUSH_SIZE)) {
+                            ofs << std::flush;
+                            std::cout << std::dec << "[SCOPE] flush tap #" << tap->id << ": "<< tap->cur_frame << "/" << tap->frames << " frames, next_time=" << tap->cycle_time << std::endl;
+                        }
+                    }
+                    break; 
+                }
+                // Move on to the next signal; the width check at the top
+                // guarantees there is one left whenever the frame is not done.
+                signal_offset = 0;
+                ++signal_it;
+                signal_width = signal_it->width;
+            }
+        } while ((frame_offset % 64) != 0);
+    } while (frame_offset != tap->width);
+
+    return 0;
+}
+
+// Installs the register callbacks, checks every tap width listed in the
+// manifest named by the SCOPE_JSON_PATH environment variable against the width
+// reported by the device, and programs the stop and start times of every tap.
+// A time equal to uint64_t(-1) leaves the corresponding setting untouched.
+// Returns 0 on success, -1 for invalid arguments or an unusable manifest,
+// 1 on a tap width mismatch, or the error code of a failed register access.
+int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time) {    
+    if (nullptr == hdevice || nullptr == callback)
+        return -1;
+
+    // Both callbacks are invoked below and again from vx_scope_stop()
+    if (nullptr == callback->registerWrite || nullptr == callback->registerRead)
+        return -1;
+
+    // getenv() returns null when the variable is not set; neither
+    // std::ifstream nor operator<< accepts a null C string.
+    const char* json_path = getenv("SCOPE_JSON_PATH");
+    if (nullptr == json_path) {
+        std::cerr << "[SCOPE] error: SCOPE_JSON_PATH is not set" << std::endl;
+        return -1;
+    }
+    std::ifstream ifs(json_path);
+    if (!ifs) {
+        std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
+        return -1;
+    }
+    auto json_obj = json::parse(ifs);
+    if (json_obj.is_null()) {
+        std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << std::endl;
+        return -1;
+    }
+
+    g_callback = *callback;   
+
+    // validate scope manifest
+    for (auto& tap : json_obj["taps"]) {
+        auto id = tap["id"].get<uint32_t>();
+        auto width = tap["width"].get<uint32_t>();
+        
+        // Widen before shifting so that the id is not truncated to 32 bits
+        uint64_t cmd_width = (uint64_t(id) << 3) | CMD_GET_WIDTH;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_width));
+        uint64_t dev_width;
+        CHECK_ERR(g_callback.registerRead(hdevice, &dev_width));
+        if (width != dev_width) {
+            std::cerr << "[SCOPE] error: invalid tap #" << id << " width, actual=" << dev_width << ", expected=" << width << std::endl;
+            return 1;
+        }
+    }
+
+    // set stop time
+    if (stop_time != uint64_t(-1)) {
+        std::cout << "[SCOPE] stop time: " << std::dec << stop_time << "s" << std::endl;
+        for (auto& tap : json_obj["taps"]) {
+            auto id = tap["id"].get<uint32_t>();
+            uint64_t cmd_stop = (stop_time << 11) | (uint64_t(id) << 3) | CMD_SET_STOP;
+            CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
+        }        
+    }
+
+    // start recording
+    if (start_time != uint64_t(-1)) {  
+        std::cout << "[SCOPE] start time: " << std::dec << start_time << "s" << std::endl;
+        for (auto& tap : json_obj["taps"]) {
+            auto id = tap["id"].get<uint32_t>();
+            uint64_t cmd_start = (start_time << 11) | (uint64_t(id) << 3) | CMD_SET_START;
+            CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
+        }        
+    }
+
+    return 0;
+}
+
+// Stops recording on every tap listed in the manifest named by the
+// SCOPE_JSON_PATH environment variable, reads the recorded frames back through
+// the callbacks installed by vx_scope_start(), and writes them, merged in time
+// order, to "scope.vcd" in the current directory.
+// Returns 0 on success (or when the manifest is a JSON null), -1 for invalid
+// arguments, missing callbacks, an unreadable manifest or an unwritable output
+// file, or the error code of a failed register access.
+int vx_scope_stop(vx_device_h hdevice) {
+    if (nullptr == hdevice)
+        return -1;
+
+    // g_callback is only filled in by vx_scope_start(); calling through the
+    // zero-initialized function pointers would crash.
+    if (nullptr == g_callback.registerWrite || nullptr == g_callback.registerRead) {
+        std::cerr << "[SCOPE] error: scope was not started" << std::endl;
+        return -1;
+    }
+
+    std::vector<tap_t> taps;
+
+    {
+        // getenv() returns null when the variable is not set
+        const char* json_path = getenv("SCOPE_JSON_PATH");
+        if (nullptr == json_path) {
+            std::cerr << "[SCOPE] error: SCOPE_JSON_PATH is not set" << std::endl;
+            return -1;
+        }
+        std::ifstream ifs(json_path);
+        if (!ifs) {
+            std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
+            return -1;
+        }
+        auto json_obj = json::parse(ifs);
+        if (json_obj.is_null())
+            return 0;
+
+        // Signal ids are unique across all taps; id 0 is the clock declared
+        // in the VCD header.
+        uint32_t signal_id = 1;
+
+        for (auto& tap : json_obj["taps"]) {
+            tap_t _tap;
+            _tap.id    = tap["id"].get<uint32_t>();
+            _tap.width = tap["width"].get<uint32_t>();
+            _tap.path  = tap["path"].get<std::string>();
+            _tap.cycle_time = 0;
+            _tap.frames = 0;
+            _tap.cur_frame = 0;            
+
+            for (auto& signal : tap["signals"]) {
+                auto name  = signal[0].get<std::string>();
+                auto width = signal[1].get<uint32_t>();
+                _tap.signals.push_back({signal_id, name, width});
+                ++signal_id;
+            }
+
+            taps.emplace_back(std::move(_tap));
+        }
+    }
+
+    // stop recording
+    for (auto& tap : taps) {
+        uint64_t cmd_stop = (uint64_t(tap.id) << 3) | CMD_SET_STOP;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
+    }
+
+    std::cout << "[SCOPE] trace dump begin..." << std::endl;
+
+    std::ofstream ofs("scope.vcd");
+    if (!ofs) {
+        std::cerr << "[SCOPE] error: cannot create trace file: scope.vcd" << std::endl;
+        return -1;
+    }
+
+    dump_header(ofs, taps);
+
+    // load trace info
+    for (auto& tap : taps) {
+        uint64_t count, start, delta;
+
+        // get count
+        uint64_t cmd_count = (uint64_t(tap.id) << 3) | CMD_GET_COUNT;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_count));
+        CHECK_ERR(g_callback.registerRead(hdevice, &count));   
+
+        // get start    
+        uint64_t cmd_start = (uint64_t(tap.id) << 3) | CMD_GET_START;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
+        CHECK_ERR(g_callback.registerRead(hdevice, &start));
+
+        // get data: the first data read returns the delta to the first frame
+        uint64_t cmd_data = (uint64_t(tap.id) << 3) | CMD_GET_DATA;
+        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
+        CHECK_ERR(g_callback.registerRead(hdevice, &delta));
+
+        tap.frames = count;
+        tap.cycle_time = 1 + start + delta;
+
+        std::cout << std::dec << "[SCOPE] tap #" << tap.id 
+                              << ": width=" << tap.width 
+                              << ", num_frames=" << tap.frames 
+                              << ", start_time=" << tap.cycle_time 
+                              << ", path=" << tap.path << std::endl;
+    }  
+
+    uint64_t cur_time = 0;
+
+    while (true) {
+        // find the nearest tap
+        auto tap = find_nearest_tap(taps);
+        if (tap == nullptr)
+            break;
+        // advance clock
+        cur_time = advance_time(ofs, tap->cycle_time, cur_time);        
+        // dump tap
+        CHECK_ERR(dump_tap(ofs, tap, hdevice));
+    }
+
+    std::cout << "[SCOPE] trace dump done! - " << (cur_time/2) << " cycles" << std::endl;
+
+    return 0;
+}
--- a/runtime/common/scope.h
+++ b/runtime/common/scope.h
@@ -0,0 +1,35 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vortex.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int (*pfn_registerWrite)(vx_device_h hdevice, uint64_t value);
+typedef int (*pfn_registerRead)(vx_device_h hdevice, uint64_t *value);
+
+struct scope_callback_t {
+	pfn_registerWrite registerWrite;
+	pfn_registerRead  registerRead;
+};
+
+// Installs the callbacks, validates the tap widths listed in the manifest
+// named by the SCOPE_JSON_PATH environment variable, and programs the stop and
+// start times of every tap; a time of (uint64_t)-1 skips that setting.
+// Returns 0 on success and a non-zero value on failure.
+int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time);
+// Stops recording on every tap and dumps the recorded trace to "scope.vcd".
+// Returns 0 on success and a non-zero value on failure.
+int vx_scope_stop(vx_device_h hdevice);
+
+#ifdef __cplusplus
+}
+#endif
--- a/runtime/common/utils.cpp
+++ b/runtime/common/utils.cpp
@@ -0,0 +1,463 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "utils.h"
+#include <iostream>
+#include <fstream>
+#include <list>
+#include <cstring>
+#include <vector>
+#include <vortex.h>
+#include <assert.h>
+
+// Evaluates `_expr` once and stores the result in an int. A zero result is
+// treated as success and nothing else happens. Any other result prints an
+// error line that names the expression text and the returned value, then
+// runs the caller-supplied `_cleanup` statement(s). The do/while(false)
+// wrapper makes the macro expand to a single statement.
+#define RT_CHECK(_expr, _cleanup)                                 \
+   do {                                                           \
+     int _ret = _expr;                                            \
+     if (0 != _ret) {                                             \
+       printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+       _cleanup                                                   \
+     }                                                            \
+   } while (false)
+
+// Rounds `size` up to the next multiple of `alignment`.
+// `alignment` must be a power of two (checked with assert in debug builds).
+uint64_t aligned_size(uint64_t size, uint64_t alignment) {
+    const uint64_t low_bits = alignment - 1;
+    assert((alignment & low_bits) == 0);
+    // Adding the mask carries into the next multiple unless already aligned;
+    // clearing the low bits then snaps down to that multiple.
+    const uint64_t bumped = size + low_bits;
+    return bumped & ~low_bits;
+}
+
+// Returns true when `addr` is a multiple of `alignment`.
+// `alignment` must be a power of two (checked with assert in debug builds).
+bool is_aligned(uint64_t addr, uint64_t alignment) {
+    const uint64_t low_bits = alignment - 1;
+    assert((alignment & low_bits) == 0);
+    // An aligned address has none of the bits below the alignment set.
+    const uint64_t remainder = addr & low_bits;
+    return remainder == 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Keeps a list of device handles and dumps their performance counters to
+// stdout, either when a device is removed or when this object is destroyed.
+class AutoPerfDump {
+public:
+    AutoPerfDump() : perf_class_(0) {}
+
+    // Dumps counters for every device that is still registered.
+    ~AutoPerfDump() {
+      for (auto it = hdevices_.begin(); it != hdevices_.end(); ++it) {
+        vx_dump_perf(*it, stdout);
+      }
+    }
+
+    // Registers a device. When the PERF_CLASS environment variable is set,
+    // its integer value is remembered and written to the device's
+    // VX_DCR_BASE_MPM_CLASS register first.
+    void add_device(vx_device_h hdevice) {
+      const char* env_value = getenv("PERF_CLASS");
+      if (env_value != nullptr) {
+        perf_class_ = std::atoi(env_value);
+        vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, perf_class_);
+      }
+      hdevices_.push_back(hdevice);
+    }
+
+    // Unregisters a device and dumps its counters immediately.
+    void remove_device(vx_device_h hdevice) {
+      hdevices_.remove(hdevice);
+      vx_dump_perf(hdevice, stdout);
+    }
+
+    // Last performance class read from PERF_CLASS (0 if it was never set).
+    int get_perf_class() const {
+      return perf_class_;
+    }
+    
+private:
+    std::list<vx_device_h> hdevices_;  // devices awaiting a final dump
+    int perf_class_;                   // value parsed from PERF_CLASS
+};
+
+// Global tracker instance, only built when DUMP_PERF_STATS is defined.
+// Its destructor dumps counters for any device still registered.
+#ifdef DUMP_PERF_STATS
+AutoPerfDump gAutoPerfDump;
+#endif
+
+// Registers `hdevice` with the global perf tracker.
+// Does nothing when DUMP_PERF_STATS is not defined.
+void perf_add_device(vx_device_h hdevice) {
+#ifndef DUMP_PERF_STATS
+  (void)hdevice;  // silence the unused-parameter warning
+#else
+  gAutoPerfDump.add_device(hdevice);
+#endif
+}
+
+// Unregisters `hdevice` from the global perf tracker, which also dumps its
+// counters. Does nothing when DUMP_PERF_STATS is not defined.
+void perf_remove_device(vx_device_h hdevice) {
+#ifndef DUMP_PERF_STATS
+  (void)hdevice;  // silence the unused-parameter warning
+#else
+  gAutoPerfDump.remove_device(hdevice);
+#endif
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Copies a kernel image held in host memory to the device's kernel base
+// address.
+// Returns -1 when `content` is null or `size` is zero, the error code from
+// vx_dev_caps() if the base address cannot be queried, and otherwise the
+// result of vx_copy_to_dev().
+extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size) {
+  if (!content || !size)
+    return -1;
+
+  // Ask the device where kernels are expected to be loaded.
+  uint64_t base_addr;
+  const int caps_status = vx_dev_caps(hdevice, VX_CAPS_KERNEL_BASE_ADDR, &base_addr);
+  if (0 != caps_status)
+    return caps_status;
+
+  return vx_copy_to_dev(hdevice, base_addr, content, size);
+}
+
+// Loads a kernel image from `filename` and uploads it to the device through
+// vx_upload_kernel_bytes().
+// Returns -1 when the name is null, the file cannot be opened, its size
+// cannot be determined, or it cannot be read completely; otherwise returns
+// the result of vx_upload_kernel_bytes() (which itself rejects empty images).
+extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename) {
+  if (nullptr == filename) {
+    std::cout << "error: invalid kernel file name" << std::endl;
+    return -1;
+  }
+
+  // Binary mode: the image must reach the device byte-for-byte, without any
+  // text-mode newline translation.
+  std::ifstream ifs(filename, std::ios::binary);
+  if (!ifs) {
+    std::cout << "error: " << filename << " not found" << std::endl;
+    return -1;
+  }
+
+  // Determine the file size; tellg() reports -1 on failure.
+  ifs.seekg(0, ifs.end);
+  const std::streamoff file_size = ifs.tellg();
+  if (file_size < 0) {
+    std::cout << "error: failed to get the size of " << filename << std::endl;
+    return -1;
+  }
+  ifs.seekg(0, ifs.beg);
+
+  // The vector owns the buffer, so every return path releases it.
+  std::vector<char> content(static_cast<size_t>(file_size));
+  if (!content.empty()
+   && !ifs.read(content.data(), static_cast<std::streamsize>(file_size))) {
+    std::cout << "error: failed to read " << filename << std::endl;
+    return -1;
+  }
+
+  // upload
+  return vx_upload_kernel_bytes(hdevice, content.data(), content.size());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Stores `value` under `addr`, replacing any value written there before.
+void DeviceConfig::write(uint32_t addr, uint32_t value) {
+  // operator[] creates the entry on first use and returns a reference to it.
+  uint32_t& slot = data_[addr];
+  slot = value;
+}
+
+// Returns the value previously written to `addr`.
+// When nothing was written there, an error line is printed and
+// std::out_of_range is thrown (raised by unordered_map::at).
+uint32_t DeviceConfig::read(uint32_t addr) const {
+  // Single lookup on the hit path.
+  auto it = data_.find(addr);
+  if (it != data_.end())
+    return it->second;
+
+  // addr is unsigned, so print it with %u rather than %d.
+  printf("Error: DeviceConfig::read(%u) failed\n", addr);
+  return data_.at(addr);  // key is absent: throws std::out_of_range
+}
+
+// Writes the initial device configuration registers (DCRs):
+//   - the 64-bit STARTUP_ADDR, split into its low and high 32-bit halves,
+//   - the MPM (performance counter) class, set to 0.
+// Returns 0 when all writes succeed and -1 as soon as one fails; RT_CHECK
+// prints the failing expression before the early return.
+int dcr_initialize(vx_device_h hdevice) {
+  const uint64_t startup_addr(STARTUP_ADDR);
+  // low 32 bits of the startup address
+  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), {
+    return -1;
+  });
+
+  // high 32 bits of the startup address
+  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), {
+    return -1;
+  });
+
+  // default performance-counter class
+  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
+    return -1;
+  });
+  
+  return 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Assembles a 64-bit counter from a buffer of 32-bit words.
+// `ptr` points to the word buffer and `addr` is a CSR address; the low word
+// is at index (addr - VX_CSR_MPM_BASE) and the high word 32 entries later.
+// The caller must supply a buffer large enough for both indices (the callers
+// in this file use 64 words). Words are read in host byte order.
+static uint64_t get_csr_64(const void* ptr, int addr) {
+  const auto bytes = static_cast<const uint8_t*>(ptr);
+  const int lo_offset = (addr - VX_CSR_MPM_BASE) * (int)sizeof(uint32_t);
+  const int hi_offset = lo_offset + 32 * (int)sizeof(uint32_t);
+
+  // memcpy rather than a uint32_t* cast: the source is a byte buffer, so
+  // this avoids strict-aliasing and alignment assumptions.
+  uint32_t value_lo;
+  uint32_t value_hi;
+  std::memcpy(&value_lo, bytes + lo_offset, sizeof(value_lo));
+  std::memcpy(&value_hi, bytes + hi_offset, sizeof(value_hi));
+
+  return (uint64_t(value_hi) << 32) | value_lo;
+}
+
+// Reads the performance counters of every core from device memory and prints
+// a report to `stream`.
+//
+// For each core, 64 32-bit words are copied from
+// IO_CSR_ADDR + core_id * (buffer size) and decoded with get_csr_64().
+// Instruction counts are summed over cores; the cycle count is the maximum
+// over cores. Per-core lines are only printed when there is more than one
+// core. When PERF_ENABLE is defined, additional counters are reported
+// according to the performance class held by gAutoPerfDump:
+//   - VX_DCR_MPM_CLASS_CORE: pipeline stalls, ifetch/load/store counts and
+//     average latencies (summed over cores);
+//   - VX_DCR_MPM_CLASS_MEM: cache, shared-memory and memory counters, taken
+//     from core 0 only.
+//
+// Returns 0 on success, or the error code of the failing vx_dev_caps() /
+// vx_copy_from_dev() call.
+extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
+  int ret = 0;
+
+  uint64_t instrs = 0;
+  uint64_t cycles = 0;
+
+#ifdef PERF_ENABLE   
+  auto perf_class = gAutoPerfDump.get_perf_class();
+
+  // Integer ratio helpers. They report 0 when the denominator is zero
+  // instead of converting NaN/Inf to int, which is undefined behavior.
+  auto safe_avg = [](uint64_t total, uint64_t count) -> int {
+    return (0 == count) ? 0 : (int)(double(total) / double(count));
+  };
+  auto hit_ratio = [](uint64_t misses, uint64_t accesses) -> int {
+    return (0 == accesses) ? 0 : (int)((1.0 - (double(misses) / double(accesses))) * 100);
+  };
+  auto utilization = [](uint64_t requests, uint64_t stalls) -> int {
+    return (0 == (requests + stalls)) ? 0 : (int)((double(requests) / double(requests + stalls)) * 100);
+  };
+
+  // PERF: pipeline stalls
+  uint64_t ibuffer_stalls = 0;
+  uint64_t scoreboard_stalls = 0;
+  uint64_t lsu_stalls = 0;
+  uint64_t fpu_stalls = 0;
+  uint64_t alu_stalls = 0;
+  uint64_t sfu_stalls = 0;  
+  uint64_t ifetches = 0;
+  uint64_t loads = 0;
+  uint64_t stores = 0;
+  uint64_t ifetch_lat = 0;
+  uint64_t load_lat   = 0;
+  // PERF: Icache 
+  uint64_t icache_reads = 0;
+  uint64_t icache_read_misses = 0;
+  // PERF: Dcache 
+  uint64_t dcache_reads = 0;
+  uint64_t dcache_writes = 0;
+  uint64_t dcache_read_misses = 0;
+  uint64_t dcache_write_misses = 0;
+  uint64_t dcache_bank_stalls = 0; 
+  uint64_t dcache_mshr_stalls = 0;
+  // PERF: shared memory
+  uint64_t smem_reads = 0;
+  uint64_t smem_writes = 0;
+  uint64_t smem_bank_stalls = 0;
+  // PERF: l2cache 
+  uint64_t l2cache_reads = 0;
+  uint64_t l2cache_writes = 0;
+  uint64_t l2cache_read_misses = 0;
+  uint64_t l2cache_write_misses = 0;
+  uint64_t l2cache_bank_stalls = 0; 
+  uint64_t l2cache_mshr_stalls = 0;
+  // PERF: l3cache 
+  uint64_t l3cache_reads = 0;
+  uint64_t l3cache_writes = 0;
+  uint64_t l3cache_read_misses = 0;
+  uint64_t l3cache_write_misses = 0;
+  uint64_t l3cache_bank_stalls = 0; 
+  uint64_t l3cache_mshr_stalls = 0;
+  // PERF: memory
+  uint64_t mem_reads = 0;
+  uint64_t mem_writes = 0;
+  uint64_t mem_lat = 0;
+#endif
+
+  uint64_t num_cores;
+  ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
+  if (ret != 0)
+    return ret;
+
+  // One core's counter block: 64 32-bit words (32 low halves, 32 high halves).
+  std::vector<uint8_t> staging_buf(64* sizeof(uint32_t));
+      
+  for (unsigned core_id = 0; core_id < num_cores; ++core_id) {    
+    uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size();    
+    ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
+    if (ret != 0)
+      return ret;
+
+    uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
+    uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
+    float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
+    if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);            
+    instrs += instrs_per_core;
+    // cores run concurrently, so the total is the longest-running core
+    cycles = std::max<uint64_t>(cycles_per_core, cycles);
+
+  #ifdef PERF_ENABLE
+    switch (perf_class) {
+    case VX_DCR_MPM_CLASS_CORE: {
+      // PERF: pipeline    
+      // ibuffer_stall
+      uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core);
+      ibuffer_stalls += ibuffer_stalls_per_core;
+      // scoreboard_stall
+      uint64_t scoreboard_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core);
+      scoreboard_stalls += scoreboard_stalls_per_core;
+      // alu_stall
+      uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
+      alu_stalls += alu_stalls_per_core;      
+      // lsu_stall
+      uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
+      lsu_stalls += lsu_stalls_per_core;
+      // fpu_stall
+      uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
+      fpu_stalls += fpu_stalls_per_core;      
+      // sfu_stall
+      uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core);
+      sfu_stalls += sfu_stalls_per_core;
+      // PERF: memory
+      // ifetches (read from the ifetch counter, not the load counter)
+      uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCHES);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
+      ifetches += ifetches_per_core;
+      // loads
+      uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
+      loads += loads_per_core;
+      // stores
+      uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES);
+      if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
+      stores += stores_per_core;
+      // ifetch latency
+      uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
+      if (num_cores > 1) {
+        int mem_avg_lat = safe_avg(ifetch_lat_per_core, ifetches_per_core);
+        fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
+      }
+      ifetch_lat += ifetch_lat_per_core;
+      // load latency
+      uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
+      if (num_cores > 1) {
+        int mem_avg_lat = safe_avg(load_lat_per_core, loads_per_core);
+        fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
+      }
+      load_lat += load_lat_per_core;      
+    } break;
+    case VX_DCR_MPM_CLASS_MEM: {      
+      // memory-class counters are taken from core 0's block only
+      if (0 == core_id) {
+        // PERF: Icache
+        icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS);
+        icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R);
+      
+        // PERF: Dcache
+        dcache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_READS);
+        dcache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_WRITES);
+        dcache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_R);
+        dcache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_W);
+        dcache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_BANK_ST);
+        dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST);
+      
+        // PERF: smem
+        smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS);
+        smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES);
+        smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST);
+      
+        // PERF: L2cache
+        l2cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_READS);
+        l2cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_WRITES);
+        l2cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_R);
+        l2cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_W);
+        l2cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_BANK_ST);
+        l2cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MSHR_ST);
+      
+        // PERF: L3cache
+        l3cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_READS);
+        l3cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_WRITES);
+        l3cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_R);
+        l3cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_W);
+        l3cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_BANK_ST);
+        l3cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MSHR_ST);
+      
+        // PERF: memory
+        mem_reads  = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
+        mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES);
+        mem_lat    = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LAT);
+      }
+    } break;
+    default:
+      break;
+    }
+  #endif
+  }  
+  
+  float IPC = (float)(double(instrs) / double(cycles));
+  fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);    
+      
+#ifdef PERF_ENABLE
+  switch (perf_class) {
+  case VX_DCR_MPM_CLASS_CORE: {    
+    int ifetch_avg_lat = safe_avg(ifetch_lat, ifetches);
+    int load_avg_lat = safe_avg(load_lat, loads);
+    fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
+    fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
+    fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
+    fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls);
+    fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
+    fprintf(stream, "PERF: sfu unit stalls=%ld\n", sfu_stalls);
+    fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
+    fprintf(stream, "PERF: loads=%ld\n", loads);
+    fprintf(stream, "PERF: stores=%ld\n", stores);    
+    fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
+    fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
+    
+  } break;  
+  case VX_DCR_MPM_CLASS_MEM: {
+    int icache_read_hit_ratio = hit_ratio(icache_read_misses, icache_reads);
+    int dcache_read_hit_ratio = hit_ratio(dcache_read_misses, dcache_reads);
+    int dcache_write_hit_ratio = hit_ratio(dcache_write_misses, dcache_writes);
+    int dcache_bank_utilization = utilization(dcache_reads + dcache_writes, dcache_bank_stalls);
+    int l2cache_read_hit_ratio = hit_ratio(l2cache_read_misses, l2cache_reads);
+    int l2cache_write_hit_ratio = hit_ratio(l2cache_write_misses, l2cache_writes);
+    int l2cache_bank_utilization = utilization(l2cache_reads + l2cache_writes, l2cache_bank_stalls);
+    int l3cache_read_hit_ratio = hit_ratio(l3cache_read_misses, l3cache_reads);
+    int l3cache_write_hit_ratio = hit_ratio(l3cache_write_misses, l3cache_writes);
+    int l3cache_bank_utilization = utilization(l3cache_reads + l3cache_writes, l3cache_bank_stalls);
+    int smem_bank_utilization = utilization(smem_reads + smem_writes, smem_bank_stalls);
+    int mem_avg_lat = safe_avg(mem_lat, mem_reads);
+    fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
+    fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
+    fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
+    fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
+    fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio);
+    fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio);  
+    fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization);
+    fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
+    fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
+    fprintf(stream, "PERF: smem writes=%ld\n", smem_writes); 
+    fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
+    fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
+    fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
+    fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
+    fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);  
+    fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
+    fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
+    fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
+    fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
+    fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
+    fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);  
+    fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
+    fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
+    fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
+    fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
+  } break;
+  default:
+    break;
+  }
+#endif
+
+  fflush(stream);
+
+  return 0;
+}
+
+// Reads one performance counter and stores the result in `*value`.
+//
+// `counter` is a CSR address decoded by get_csr_64(). `core_id` selects a
+// single core, or -1 to aggregate over all cores: VX_CSR_MCYCLE is
+// aggregated as the maximum over cores, every other counter as the sum.
+//
+// Returns 0 on success; -1 when `value` is null, `core_id` is out of range
+// or `counter` does not fit the per-core counter block; otherwise the error
+// code of the failing vx_dev_caps() / vx_copy_from_dev() call.
+extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value) {
+  if (nullptr == value)
+    return -1;
+
+  uint64_t num_cores;
+  int ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
+  if (ret != 0)
+    return ret;
+
+  // -1 means "all cores"; anything else must name an existing core.
+  if (core_id < -1 || core_id >= (int)num_cores) {
+    std::cout << "error: core_id out of range" << std::endl;
+    return -1;
+  }
+
+  // One core's counter block: 64 32-bit words (32 low halves, 32 high halves).
+  std::vector<uint8_t> staging_buf(64 * sizeof(uint32_t));
+
+  // get_csr_64() reads word [index] and word [index + 32], so the index must
+  // stay inside the low half of the block.
+  const int counter_index = counter - VX_CSR_MPM_BASE;
+  if (counter_index < 0 || counter_index >= 32) {
+    std::cout << "error: counter out of range" << std::endl;
+    return -1;
+  }
+
+  // Select either the single requested core or the full range.
+  unsigned first_core = 0;
+  unsigned last_core = (unsigned)num_cores;
+  if (core_id != -1) {
+    first_core = (unsigned)core_id;
+    last_core = first_core + 1;
+  }
+
+  uint64_t _value = 0;
+
+  for (unsigned i = first_core; i < last_core; ++i) {
+    uint64_t mpm_mem_addr = IO_CSR_ADDR + i * staging_buf.size();
+    ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
+    if (ret != 0)
+      return ret;
+
+    auto per_core_value = get_csr_64(staging_buf.data(), counter);
+    if (counter == VX_CSR_MCYCLE) {
+      // cores run concurrently: report the longest-running one
+      _value = std::max<uint64_t>(per_core_value, _value);
+    } else {
+      _value += per_core_value;
+    }
+  }
+
+  // output
+  *value = _value;
+
+  return 0;
+}
--- a/runtime/common/utils.h
+++ b/runtime/common/utils.h
@@ -0,0 +1,47 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vortex.h>
+#include <cstdint>
+#include <unordered_map>
+#include <VX_config.h>
+#include <VX_types.h>
+
+// In-memory table of 32-bit configuration values keyed by 32-bit address.
+class DeviceConfig {
+public:    
+    // Stores `value` under `addr`, replacing any earlier value.
+    void write(uint32_t addr, uint32_t value);
+    // Returns the value stored under `addr`; reading an address that was
+    // never written prints an error and throws std::out_of_range.
+    uint32_t read(uint32_t addr) const;
+private:
+     // address -> value
+     std::unordered_map<uint32_t, uint32_t> data_;
+};
+
+// Writes the startup address and the default performance-counter class to
+// the device configuration registers. Returns 0 on success, -1 on failure.
+int dcr_initialize(vx_device_h device);
+
+// Rounds `size` up to the next multiple of `alignment` (a power of two).
+uint64_t aligned_size(uint64_t size, uint64_t alignment);
+
+// Returns true when `addr` is a multiple of `alignment` (a power of two).
+bool is_aligned(uint64_t addr, uint64_t alignment);
+
+// Registers a device for performance dumping; a no-op unless the runtime is
+// built with DUMP_PERF_STATS.
+void perf_add_device(vx_device_h device);
+
+// Unregisters a device and dumps its counters; a no-op unless the runtime is
+// built with DUMP_PERF_STATS.
+void perf_remove_device(vx_device_h device);
+
+// Memory-layout constants.
+// NOTE(review): CACHE_BLOCK_SIZE is presumably in bytes - confirm against
+// the hardware configuration.
+#define CACHE_BLOCK_SIZE    64
+// Allocation range: starts one cache block above address 0 and ends at
+// STARTUP_ADDR (defined outside this header).
+#define ALLOC_BASE_ADDR     CACHE_BLOCK_SIZE
+#define ALLOC_MAX_ADDR      STARTUP_ADDR
+// Global memory size depends on the configured register width (XLEN).
+#if (XLEN == 64)
+#define GLOBAL_MEM_SIZE      0x200000000  // 8 GB
+#else
+#define GLOBAL_MEM_SIZE      0x100000000  // 4 GB
+#endif