Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

455
runtime/common/malloc.h Normal file
View File

@@ -0,0 +1,455 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <assert.h>
#include <stdio.h>
namespace vortex {
// Book-keeping allocator for a linear address range [baseAddress, baseAddress + capacity).
//
// The range is carved into pages that grow upward from offset 0; each page is
// split into blocks. Per page, three intrusive lists are maintained:
//   - usedList : blocks currently handed out,
//   - freeSList: free blocks sorted by decreasing size (best-fit lookup),
//   - freeMList: free blocks sorted by increasing address (neighbor merging).
// The allocator never touches the managed memory itself; it only tracks offsets.
// The class performs no locking of its own.
class MemoryAllocator {
public:
  // baseAddress: address added to every returned offset.
  // capacity   : number of bytes that can be managed.
  // pageAlign  : page size granularity (must be a power of two).
  // blockAlign : block size granularity (must be a power of two).
  MemoryAllocator(
    uint64_t baseAddress,
    uint64_t capacity,
    uint32_t pageAlign,
    uint32_t blockAlign)
    : baseAddress_(baseAddress)
    , capacity_(capacity)
    , pageAlign_(pageAlign)
    , blockAlign_(blockAlign)
    , pages_(nullptr)
    , nextAddress_(0)
    , allocated_(0)
  {}

  // The allocator owns raw page/block nodes; copying would double-free them.
  MemoryAllocator(const MemoryAllocator&) = delete;
  MemoryAllocator& operator=(const MemoryAllocator&) = delete;

  // Releases all book-keeping nodes, including those of blocks that were
  // never released by the caller.
  ~MemoryAllocator() {
    page_t* currPage = pages_;
    while (currPage) {
      auto nextPage = currPage->next;
      delete currPage; // ~page_t frees every block of the page
      currPage = nextPage;
    }
    pages_ = nullptr;
  }

  // Returns the full 64-bit base address (no truncation).
  uint64_t baseAddress() const {
    return baseAddress_;
  }

  // Returns the full 64-bit capacity (no truncation).
  uint64_t capacity() const {
    return capacity_;
  }

  // Bytes not currently handed out.
  uint64_t free() const {
    return (capacity_ - allocated_);
  }

  // Bytes currently handed out (after block alignment).
  uint64_t allocated() const {
    return allocated_;
  }

  // Reserves `size` bytes (rounded up to blockAlign) and stores the resulting
  // address in *addr. Returns 0 on success, -1 on invalid arguments or when
  // the request does not fit in the remaining capacity.
  int allocate(uint64_t size, uint64_t* addr) {
    if (size == 0 || addr == nullptr) {
      printf("error: invalid argurments\n");
      return -1;
    }

    // Align allocation size; a wrapped result means the request is too large
    const uint64_t requested = size;
    size = AlignSize(size, blockAlign_);
    if (size < requested) {
      printf("error: out of memory\n");
      return -1;
    }

    // Walk thru all pages to find a free block
    block_t* freeBlock = nullptr;
    auto currPage = pages_;
    while (currPage) {
      auto currBlock = currPage->freeSList;
      if (currBlock) {
        // The free S-list is sorted with the largest block first,
        // so the head tells whether this page can serve the request.
        if (currBlock->size >= size) {
          // Find the smallest matching block in the S-list
          while (currBlock->nextFreeS
              && (currBlock->nextFreeS->size >= size)) {
            currBlock = currBlock->nextFreeS;
          }
          freeBlock = currBlock;
          break;
        }
      }
      currPage = currPage->next;
    }

    if (nullptr == freeBlock) {
      // Allocate a new page for this request
      currPage = this->NewPage(size);
      if (nullptr == currPage) {
        printf("error: out of memory\n");
        return -1;
      }
      freeBlock = currPage->freeSList;
    }

    // Remove the block from the free lists
    assert(freeBlock->size >= size);
    currPage->RemoveFreeMList(freeBlock);
    currPage->RemoveFreeSList(freeBlock);

    // If the free block is larger than requested, split off the remainder
    // and keep it available as a new free block.
    uint64_t extraBytes = freeBlock->size - size;
    if (extraBytes >= blockAlign_) {
      freeBlock->size = size;
      auto nextAddr = freeBlock->addr + size;
      auto newBlock = new block_t(nextAddr, extraBytes);
      currPage->InsertFreeMList(newBlock);
      currPage->InsertFreeSList(newBlock);
    }

    // Insert the block into the used list
    currPage->InsertUsedList(freeBlock);

    // Return the block address
    *addr = baseAddress_ + freeBlock->addr;

    // Update allocated size
    allocated_ += size;

    return 0;
  }

  // Releases the block that starts exactly at `addr` (a value previously
  // returned by allocate). Returns 0 on success, -1 if no such block exists.
  int release(uint64_t addr) {
    // Walk all pages to find the pointer
    uint64_t local_addr = addr - baseAddress_;
    block_t* usedBlock = nullptr;
    auto currPage = pages_;
    while (currPage) {
      if (local_addr >= currPage->addr
       && local_addr < (currPage->addr + currPage->size)) {
        auto currBlock = currPage->usedList;
        while (currBlock) {
          if (currBlock->addr == local_addr) {
            usedBlock = currBlock;
            break;
          }
          currBlock = currBlock->nextUsed;
        }
        break;
      }
      currPage = currPage->next;
    }

    // found the corresponding block?
    if (nullptr == usedBlock) {
      printf("error: invalid address to release: 0x%llx\n", static_cast<unsigned long long>(addr));
      return -1;
    }

    // Remember the size now: the block may be merged or deleted below
    auto size = usedBlock->size;

    // Remove the block from the used list
    currPage->RemoveUsedList(usedBlock);

    // Insert the block into the free M-list.
    currPage->InsertFreeMList(usedBlock);

    // Check if we can merge adjacent free blocks from the left.
    if (usedBlock->prevFreeM) {
      auto prevAddr = usedBlock->prevFreeM->addr + usedBlock->prevFreeM->size;
      if (usedBlock->addr == prevAddr) {
        auto prevBlock = usedBlock->prevFreeM;

        // Merge the blocks to the left
        prevBlock->size += usedBlock->size;
        prevBlock->nextFreeM = usedBlock->nextFreeM;
        if (prevBlock->nextFreeM) {
          prevBlock->nextFreeM->prevFreeM = prevBlock;
        }

        // Detach previous block from the free S-list since its size increased;
        // it is re-inserted at the right position below.
        currPage->RemoveFreeSList(prevBlock);

        // reset usedBlock
        delete usedBlock;
        usedBlock = prevBlock;
      }
    }

    // Check if we can merge adjacent free blocks from the right.
    if (usedBlock->nextFreeM) {
      auto nextAddr = usedBlock->addr + usedBlock->size;
      if (usedBlock->nextFreeM->addr == nextAddr) {
        auto nextBlock = usedBlock->nextFreeM;

        // Merge the blocks to the right
        usedBlock->size += nextBlock->size;
        usedBlock->nextFreeM = nextBlock->nextFreeM;
        if (usedBlock->nextFreeM) {
          usedBlock->nextFreeM->prevFreeM = usedBlock;
        }

        // Delete next block
        currPage->RemoveFreeSList(nextBlock);
        delete nextBlock;
      }
    }

    // Insert the block into the free S-list.
    currPage->InsertFreeSList(usedBlock);

    // Check if we can free empty pages
    if (nullptr == currPage->usedList) {
      // Deleting the top-most page may expose another empty page below it
      while (currPage && this->DeletePage(currPage)) {
        currPage = this->FindNextEmptyPage();
      }
    }

    // update allocated size
    allocated_ -= size;

    return 0;
  }

private:

  // A contiguous range inside a page; linked into either the used list or
  // both free lists of its page.
  struct block_t {
    block_t* nextFreeS;
    block_t* prevFreeS;

    block_t* nextFreeM;
    block_t* prevFreeM;

    block_t* nextUsed;
    block_t* prevUsed;

    uint64_t addr;
    uint64_t size;

    block_t(uint64_t addr, uint64_t size)
      : nextFreeS(nullptr)
      , prevFreeS(nullptr)
      , nextFreeM(nullptr)
      , prevFreeM(nullptr)
      , nextUsed(nullptr)
      , prevUsed(nullptr)
      , addr(addr)
      , size(size)
    {}
  };

  struct page_t {
    page_t* next;

    // List of used blocks
    block_t* usedList;

    // List with blocks sorted by decreasing sizes
    // Used for block lookup during memory allocation.
    block_t* freeSList;

    // List with blocks sorted by increasing memory addresses
    // Used for block merging during memory release.
    block_t* freeMList;

    uint64_t addr;
    uint64_t size;

    // A new page starts with a single free block covering the whole page.
    page_t(uint64_t addr, uint64_t size) :
      next(nullptr),
      usedList(nullptr),
      addr(addr),
      size(size) {
      freeSList = freeMList = new block_t(addr, size);
    }

    page_t(const page_t&) = delete;
    page_t& operator=(const page_t&) = delete;

    // Frees every block of the page. A block is linked either into the used
    // list or into the free M-list (and S-list), never both, so walking these
    // two lists visits each block exactly once.
    ~page_t() {
      for (block_t* block = usedList; block != nullptr;) {
        block_t* nextBlock = block->nextUsed;
        delete block;
        block = nextBlock;
      }
      for (block_t* block = freeMList; block != nullptr;) {
        block_t* nextBlock = block->nextFreeM;
        delete block;
        block = nextBlock;
      }
    }

    void InsertUsedList(block_t* block) {
      block->nextUsed = usedList;
      if (usedList) {
        usedList->prevUsed = block;
      }
      usedList = block;
    }

    void RemoveUsedList(block_t* block) {
      if (block->prevUsed) {
        block->prevUsed->nextUsed = block->nextUsed;
      } else {
        usedList = block->nextUsed;
      }
      if (block->nextUsed) {
        block->nextUsed->prevUsed = block->prevUsed;
      }
      block->nextUsed = nullptr;
      block->prevUsed = nullptr;
    }

    // Inserts keeping the list ordered by increasing address.
    void InsertFreeMList(block_t* block) {
      block_t* currBlock = freeMList;
      block_t* prevBlock = nullptr;
      while (currBlock && (currBlock->addr < block->addr)) {
        prevBlock = currBlock;
        currBlock = currBlock->nextFreeM;
      }
      block->nextFreeM = currBlock;
      block->prevFreeM = prevBlock;
      if (prevBlock) {
        prevBlock->nextFreeM = block;
      } else {
        freeMList = block;
      }
      if (currBlock) {
        currBlock->prevFreeM = block;
      }
    }

    void RemoveFreeMList(block_t* block) {
      if (block->prevFreeM) {
        block->prevFreeM->nextFreeM = block->nextFreeM;
      } else {
        freeMList = block->nextFreeM;
      }
      if (block->nextFreeM) {
        block->nextFreeM->prevFreeM = block->prevFreeM;
      }
      block->nextFreeM = nullptr;
      block->prevFreeM = nullptr;
    }

    // Inserts keeping the list ordered by decreasing size.
    void InsertFreeSList(block_t* block) {
      block_t* currBlock = this->freeSList;
      block_t* prevBlock = nullptr;
      while (currBlock && (currBlock->size > block->size)) {
        prevBlock = currBlock;
        currBlock = currBlock->nextFreeS;
      }
      block->nextFreeS = currBlock;
      block->prevFreeS = prevBlock;
      if (prevBlock) {
        prevBlock->nextFreeS = block;
      } else {
        this->freeSList = block;
      }
      if (currBlock) {
        currBlock->prevFreeS = block;
      }
    }

    void RemoveFreeSList(block_t* block) {
      if (block->prevFreeS) {
        block->prevFreeS->nextFreeS = block->nextFreeS;
      } else {
        freeSList = block->nextFreeS;
      }
      if (block->nextFreeS) {
        block->nextFreeS->prevFreeS = block->prevFreeS;
      }
      block->nextFreeS = nullptr;
      block->prevFreeS = nullptr;
    }
  };

  // Creates a page large enough for `size` bytes at the current top of the
  // managed range. Returns nullptr, leaving the allocator state untouched,
  // when the page does not fit in the remaining capacity.
  page_t* NewPage(uint64_t size) {
    // Pad the size to ensure page alignment; a wrapped result cannot fit
    const uint64_t requested = size;
    size = AlignSize(size, pageAlign_);
    if (size < requested)
      return nullptr;

    // Capacity check done before committing, written so it cannot overflow
    if (nextAddress_ > capacity_ || size > (capacity_ - nextAddress_))
      return nullptr;

    // Allocate page memory
    auto addr = nextAddress_;
    auto newPage = new page_t(addr, size);
    nextAddress_ += size;

    // Insert the new page at the head of the list
    newPage->next = pages_;
    pages_ = newPage;

    return newPage;
  }

  // Deletes an empty page if it is the top-most one (its end coincides with
  // the next allocation address). Returns true if the page was deleted.
  bool DeletePage(page_t* page) {
    // The page should be empty
    assert(nullptr == page->usedList);
    assert(page->freeMList && (nullptr == page->freeMList->nextFreeM));

    // Only delete top-level pages
    auto nextAddr = page->addr + page->size;
    if (nextAddr != nextAddress_)
      return false;

    // Remove the page from the list
    page_t* prevPage = nullptr;
    auto currPage = pages_;
    while (currPage) {
      if (currPage == page) {
        if (prevPage) {
          prevPage->next = currPage->next;
        } else {
          pages_ = currPage->next;
        }
        break;
      }
      prevPage = currPage;
      currPage = currPage->next;
    }

    // Update next allocation address
    nextAddress_ = page->addr;

    // free object (the page destructor releases its remaining free block)
    delete page;

    return true;
  }

  // Returns the first page without used blocks, or nullptr.
  page_t* FindNextEmptyPage() {
    auto currPage = pages_;
    while (currPage) {
      if (nullptr == currPage->usedList)
        return currPage;
      currPage = currPage->next;
    }
    return nullptr;
  }

  // Rounds `size` up to a multiple of `alignment` (power of two).
  static uint64_t AlignSize(uint64_t size, uint64_t alignment) {
    assert(0 == (alignment & (alignment - 1)));
    return (size + alignment - 1) & ~(alignment - 1);
  }

  uint64_t baseAddress_;
  uint64_t capacity_;
  uint32_t pageAlign_;
  uint32_t blockAlign_;
  page_t*  pages_;
  uint64_t nextAddress_; // offset where the next page will start
  uint64_t allocated_;
};
} // namespace vortex

File diff suppressed because it is too large Load Diff

359
runtime/common/scope.cpp Normal file
View File

@@ -0,0 +1,359 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "scope.h"
#include <VX_config.h>
#include <nlohmann_json.hpp>
#include <iostream>
#include <fstream>
#include <thread>
#include <chrono>
#include <vector>
#include <list>
#include <assert.h>
#include <chrono>
#include <thread>
#include <condition_variable>
#include <mutex>
#include <unordered_set>
#include <sstream>
// Number of dumped frames between explicit flushes of the trace stream
// (dump_tap flushes whenever cur_frame is a multiple of this value).
#define FRAME_FLUSH_SIZE 100
// MMIO offsets computed as the AFU image register constants times 4.
// They are not referenced in the code visible here.
// NOTE(review): the x4 presumably converts a 32-bit register index into a byte offset — confirm.
#define MMIO_SCOPE_READ (AFU_IMAGE_MMIO_SCOPE_READ * 4)
#define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4)
// Scope command opcodes. Callers build a command word as (tap_id << 3) | CMD_xxx;
// the SET commands additionally carry a time value shifted left by 11.
#define CMD_GET_WIDTH 0
#define CMD_GET_COUNT 1
#define CMD_GET_START 2
#define CMD_GET_DATA 3
#define CMD_SET_START 4
#define CMD_SET_STOP 5
// Evaluates _expr once as an int. If the result is non-zero, prints the
// stringified expression together with the error code and makes the
// *enclosing function* return that code; otherwise execution continues.
// Only usable inside functions returning a type convertible from int.
#define CHECK_ERR(_expr) \
do { \
int err = _expr; \
if (err == 0) \
break; \
printf("[SCOPE] error: '%s' returned %d!\n", #_expr, err); \
return err; \
} while (false)
// One signal inside a tap. Instances are aggregate-initialized as
// {id, name, width} in vx_scope_stop, so the member order matters.
struct tap_signal_t {
uint32_t id; // identifier written in the VCD "$var" and value-change lines
std::string name; // signal name read from the manifest
uint32_t width; // signal width in bits
};
// Host-side state of one scope tap while its trace is being dumped.
struct tap_t {
uint32_t id; // tap id from the manifest; shifted left by 3 in command words
uint32_t width; // bits per frame, from the manifest
uint32_t frames; // number of recorded frames (result of CMD_GET_COUNT)
uint32_t cur_frame; // frames dumped so far; the tap is done when it equals frames
uint64_t cycle_time; // time of the next frame to dump, advanced by 1 + delta per frame
std::string path; // dot-separated module path from the manifest
std::vector<tap_signal_t> signals; // signals of the tap, in manifest order
};
// Register access callbacks; copied from the caller in vx_scope_start and
// used by every later device access in this file.
static scope_callback_t g_callback;
// Shorthand for the JSON type used to parse the scope manifest.
using json = nlohmann::json;
// Splits `s` at every occurrence of `delimiter`.
// Empty fields between consecutive delimiters are kept; a trailing delimiter
// does not produce a trailing empty field, and an empty input yields no tokens.
static std::vector<std::string> split(const std::string &s, char delimiter) {
  std::vector<std::string> parts;
  std::string::size_type begin = 0;
  while (begin < s.size()) {
    const std::string::size_type end = s.find(delimiter, begin);
    if (end == std::string::npos) {
      // last field: everything up to the end of the string
      parts.push_back(s.substr(begin));
      break;
    }
    parts.push_back(s.substr(begin, end - begin));
    begin = end + 1;
  }
  return parts;
}
// Writes the "$scope module" section for `name`, then recurses into its children.
//   hierarchy: module name -> names of its child modules
//   tails    : module name -> tap whose signals are declared inside that module
//   indentation: number of leading spaces for this nesting level
static void dump_module(std::ofstream& ofs,
                        const std::string& name,
                        std::unordered_map<std::string, std::unordered_set<std::string>>& hierarchy,
                        std::unordered_map<std::string, tap_t*>& tails,
                        int indentation) {
  const std::string pad(indentation, ' ');

  ofs << pad << "$scope module " << name << " $end" << std::endl;

  // Declare the signals of the tap attached to this module, if any
  const auto tail_pos = tails.find(name);
  if (tail_pos != tails.end()) {
    const std::vector<tap_signal_t>& sigs = tail_pos->second->signals;
    for (size_t i = 0; i < sigs.size(); ++i) {
      ofs << pad << " $var reg " << sigs[i].width << " " << sigs[i].id << " " << sigs[i].name << " $end" << std::endl;
    }
  }

  // Recurse into child modules one level deeper
  const auto children_pos = hierarchy.find(name);
  if (children_pos != hierarchy.end()) {
    const int child_indentation = indentation + 1;
    for (auto child_it = children_pos->second.begin(); child_it != children_pos->second.end(); ++child_it) {
      dump_module(ofs, *child_it, hierarchy, tails, child_indentation);
    }
  }

  ofs << pad << "$upscope $end" << std::endl;
}
static void dump_header(std::ofstream& ofs, std::vector<tap_t>& taps) {
ofs << "$version Generated by Vortex Scope Analyzer $end" << std::endl;
ofs << "$timescale 1 ns $end" << std::endl;
ofs << "$scope module TOP $end" << std::endl;
ofs << " $var reg 1 0 clk $end" << std::endl;
std::unordered_map<std::string, std::unordered_set<std::string>> hierarchy;
std::unordered_set<std::string> heads;
std::unordered_map<std::string, tap_t*> tails;
// Build hierarchy
for (auto& tap : taps) {
std::vector<std::string> tokens = split(tap.path, '.');
for (size_t i = 1; i < tokens.size(); ++i) {
hierarchy[tokens[i-1]].insert(tokens[i]);
}
auto h = tokens[0];
auto t = tokens[tokens.size()-1];
heads.insert(h);
tails[t] = &tap;
}
// Dump module huierarchy
for (auto& head : heads) {
dump_module(ofs, head, hierarchy, tails, 1);
}
ofs << "$upscope $end" << std::endl;
ofs << "enddefinitions $end" << std::endl;
}
// Returns the tap with the smallest cycle_time among those that still have
// frames left to dump (the first one wins on ties), or nullptr when every
// tap is finished.
static tap_t* find_nearest_tap(std::vector<tap_t>& taps) {
  tap_t* best = nullptr;
  for (size_t i = 0; i < taps.size(); ++i) {
    tap_t* candidate = &taps[i];
    const bool finished = (candidate->cur_frame == candidate->frames);
    if (finished)
      continue;
    if (best == nullptr || candidate->cycle_time < best->cycle_time) {
      best = candidate;
    }
  }
  return best;
}
// Emits one clock period (signal id 0 low, then high) for every time step in
// [cur_time, next_time). Each step occupies two consecutive timestamps.
// Returns the resulting time: next_time, or cur_time if it was already past it.
static uint64_t advance_time(std::ofstream& ofs, uint64_t next_time, uint64_t cur_time) {
  for (; cur_time < next_time; ++cur_time) {
    const uint64_t low_stamp = cur_time * 2;
    const uint64_t high_stamp = low_stamp + 1;
    ofs << '#' << low_stamp << std::endl;
    ofs << "b0 0" << std::endl;
    ofs << '#' << high_stamp << std::endl;
    ofs << "b1 0" << std::endl;
  }
  return cur_time;
}
static int dump_tap(std::ofstream& ofs, tap_t* tap, vx_device_h hdevice) {
uint32_t signal_offset = 0;
uint32_t frame_offset = 0;
uint64_t word;
std::vector<char> signal_data(tap->width);
auto signal_it = tap->signals.rbegin();
uint32_t signal_width = signal_it->width;
do {
// read data
uint64_t cmd_data = (tap->id << 3) | CMD_GET_DATA;
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
CHECK_ERR(g_callback.registerRead(hdevice, &word));
do {
uint32_t word_offset = frame_offset % 64;
signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
++signal_offset;
++frame_offset;
if (signal_offset == signal_width) {
signal_data[signal_width] = 0; // string null termination
ofs << 'b' << signal_data.data() << ' ' << signal_it->id << std::endl;
if (frame_offset == tap->width) {
// end-of-frame
++tap->cur_frame;
if (tap->cur_frame != tap->frames) {
// read next delta
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
CHECK_ERR(g_callback.registerRead(hdevice, &word));
tap->cycle_time += 1 + word;
if (0 == (tap->cur_frame % FRAME_FLUSH_SIZE)) {
ofs << std::flush;
std::cout << std::dec << "[SCOPE] flush tap #" << tap->id << ": "<< tap->cur_frame << "/" << tap->frames << " frames, next_time=" << tap->cycle_time << std::endl;
}
}
break;
}
signal_offset = 0;
++signal_it;
signal_width = signal_it->width;
}
} while ((frame_offset % 64) != 0);
} while (frame_offset != tap->width);
return 0;
}
// Validates the scope manifest against the device and programs the taps.
//
//   callback  : register read/write functions; copied for later use by vx_scope_stop
//   hdevice   : device handle passed through to the callbacks
//   start_time: start value sent to every tap, skipped when equal to uint64_t(-1)
//   stop_time : stop value sent to every tap, skipped when equal to uint64_t(-1)
//
// The manifest path is taken from the SCOPE_JSON_PATH environment variable.
// Returns 0 on success, -1 on invalid arguments or an unusable manifest,
// 1 when a tap width differs from the device, or a callback's error code.
int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time) {
  if (nullptr == hdevice || nullptr == callback)
    return -1;

  if (nullptr == callback->registerWrite || nullptr == callback->registerRead)
    return -1;

  // getenv returns null when the variable is not set
  const char* json_path = getenv("SCOPE_JSON_PATH");
  if (nullptr == json_path) {
    std::cerr << "[SCOPE] error: SCOPE_JSON_PATH environment variable is not set" << std::endl;
    return -1;
  }

  std::ifstream ifs(json_path);
  if (!ifs) {
    std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
    return -1;
  }

  // Parse without exceptions: this function has C linkage, so a malformed
  // manifest must be reported through the return value.
  auto json_obj = json::parse(ifs, nullptr, false);
  if (json_obj.is_discarded() || !json_obj.is_object()) {
    std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << std::endl;
    return -1;
  }

  g_callback = *callback;

  // validate scope manifest
  for (auto& tap : json_obj["taps"]) {
    auto id = tap["id"].get<uint32_t>();
    auto width = tap["width"].get<uint32_t>();

    uint64_t cmd_width = (id << 3) | CMD_GET_WIDTH;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_width));
    uint64_t dev_width;
    CHECK_ERR(g_callback.registerRead(hdevice, &dev_width));
    if (width != dev_width) {
      std::cerr << "[SCOPE] error: invalid tap #" << id << " width, actual=" << dev_width << ", expected=" << width << std::endl;
      return 1;
    }
  }

  // set stop time
  if (stop_time != uint64_t(-1)) {
    std::cout << "[SCOPE] stop time: " << std::dec << stop_time << "s" << std::endl;
    for (auto& tap : json_obj["taps"]) {
      auto id = tap["id"].get<uint32_t>();
      uint64_t cmd_stop = (stop_time << 11) | (id << 3) | CMD_SET_STOP;
      CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
    }
  }

  // start recording
  if (start_time != uint64_t(-1)) {
    std::cout << "[SCOPE] start time: " << std::dec << start_time << "s" << std::endl;
    for (auto& tap : json_obj["taps"]) {
      auto id = tap["id"].get<uint32_t>();
      uint64_t cmd_start = (start_time << 11) | (id << 3) | CMD_SET_START;
      CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
    }
  }

  return 0;
}
// Stops recording on every tap listed in the manifest and dumps the captured
// frames, merged in time order, into "scope.vcd" in the current directory.
//
// Uses the callbacks stored by vx_scope_start and the manifest named by the
// SCOPE_JSON_PATH environment variable.
// Returns 0 on success (or when the manifest is JSON null), -1 when the scope
// was not started, the manifest or output file is unusable, or a tap layout
// is invalid; otherwise the error code of a failing callback.
int vx_scope_stop(vx_device_h hdevice) {
  if (nullptr == hdevice)
    return -1;

  // g_callback is only populated by vx_scope_start
  if (nullptr == g_callback.registerWrite || nullptr == g_callback.registerRead) {
    std::cerr << "[SCOPE] error: scope was not started" << std::endl;
    return -1;
  }

  std::vector<tap_t> taps;

  {
    // getenv returns null when the variable is not set
    const char* json_path = getenv("SCOPE_JSON_PATH");
    if (nullptr == json_path) {
      std::cerr << "[SCOPE] error: SCOPE_JSON_PATH environment variable is not set" << std::endl;
      return -1;
    }

    std::ifstream ifs(json_path);
    if (!ifs) {
      std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
      return -1;
    }

    // Parse without exceptions: this function has C linkage
    auto json_obj = json::parse(ifs, nullptr, false);
    if (json_obj.is_discarded()) {
      std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << std::endl;
      return -1;
    }
    if (json_obj.is_null())
      return 0;
    if (!json_obj.is_object()) {
      std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << std::endl;
      return -1;
    }

    // Signal ids are assigned sequentially across all taps; id 0 is the clock
    uint32_t signal_id = 1;

    for (auto& tap : json_obj["taps"]) {
      tap_t _tap;
      _tap.id = tap["id"].get<uint32_t>();
      _tap.width = tap["width"].get<uint32_t>();
      _tap.path = tap["path"].get<std::string>();
      _tap.cycle_time = 0;
      _tap.frames = 0;
      _tap.cur_frame = 0;

      for (auto& signal : tap["signals"]) {
        auto name = signal[0].get<std::string>();
        auto width = signal[1].get<uint32_t>();
        _tap.signals.push_back({signal_id, name, width});
        ++signal_id;
      }

      taps.emplace_back(std::move(_tap));
    }
  }

  // stop recording
  for (auto& tap : taps) {
    uint64_t cmd_stop = (0 << 11) | (tap.id << 3) | CMD_SET_STOP;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
  }

  std::cout << "[SCOPE] trace dump begin..." << std::endl;

  std::ofstream ofs("scope.vcd");
  if (!ofs) {
    std::cerr << "[SCOPE] error: cannot create trace file: scope.vcd" << std::endl;
    return -1;
  }

  // `taps` must not be resized from here on: dump_header keeps pointers to its elements
  dump_header(ofs, taps);

  // load trace info
  for (auto& tap : taps) {
    uint64_t count, start, delta;

    // get count
    uint64_t cmd_count = (tap.id << 3) | CMD_GET_COUNT;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_count));
    CHECK_ERR(g_callback.registerRead(hdevice, &count));

    // get start
    uint64_t cmd_start = (tap.id << 3) | CMD_GET_START;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
    CHECK_ERR(g_callback.registerRead(hdevice, &start));

    // get data (the first word read is the delta of the first frame)
    uint64_t cmd_data = (tap.id << 3) | CMD_GET_DATA;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
    CHECK_ERR(g_callback.registerRead(hdevice, &delta));

    tap.frames = count;
    tap.cycle_time = 1 + start + delta;

    std::cout << std::dec << "[SCOPE] tap #" << tap.id
              << ": width=" << tap.width
              << ", num_frames=" << tap.frames
              << ", start_time=" << tap.cycle_time
              << ", path=" << tap.path << std::endl;
  }

  uint64_t cur_time = 0;

  while (true) {
    // find the nearest tap
    auto tap = find_nearest_tap(taps);
    if (tap == nullptr)
      break;
    // advance clock
    cur_time = advance_time(ofs, tap->cycle_time, cur_time);
    // dump tap
    CHECK_ERR(dump_tap(ofs, tap, hdevice));
  }

  // NOTE(review): advance_time increments cur_time once per emitted clock
  // period, so halving it here may under-report the cycle count — confirm intent.
  std::cout << "[SCOPE] trace dump done! - " << (cur_time/2) << " cycles" << std::endl;

  return 0;
}

35
runtime/common/scope.h Normal file
View File

@@ -0,0 +1,35 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vortex.h>
#ifdef __cplusplus
extern "C" {
#endif

// Writes a 64-bit scope command word to the device; returns 0 on success.
typedef int (*pfn_registerWrite)(vx_device_h hdevice, uint64_t value);

// Reads the 64-bit response of the last scope command; returns 0 on success.
typedef int (*pfn_registerRead)(vx_device_h hdevice, uint64_t *value);

// Register access callbacks supplied by the caller of vx_scope_start.
// Declared with a typedef so that the unqualified name is also usable when
// this header is compiled as C.
typedef struct scope_callback_t {
  pfn_registerWrite registerWrite;
  pfn_registerRead registerRead;
} scope_callback_t;

// Validates the scope manifest against the device and programs the taps.
// A start_time / stop_time equal to (uint64_t)-1 leaves that setting untouched.
// Returns 0 on success.
int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time);

// Stops recording and dumps the captured trace. Returns 0 on success.
int vx_scope_stop(vx_device_h hdevice);

#ifdef __cplusplus
}
#endif

463
runtime/common/utils.cpp Normal file
View File

@@ -0,0 +1,463 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "utils.h"
#include <iostream>
#include <fstream>
#include <list>
#include <cstring>
#include <vector>
#include <vortex.h>
#include <assert.h>
// Evaluates _expr once as an int. If the result is non-zero, prints the
// stringified expression with the error code and then executes the
// statements passed as _cleanup (typically a block ending in `return`).
// On success nothing else happens.
#define RT_CHECK(_expr, _cleanup) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
_cleanup \
} while (false)
// Rounds `size` up to the next multiple of `alignment`.
// `alignment` is asserted to have at most one bit set (a power of two).
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
  const uint64_t mask = alignment - 1;
  assert((alignment & mask) == 0);
  const uint64_t bumped = size + mask;
  return bumped & ~mask;
}
// Tells whether `addr` is a multiple of `alignment`.
// `alignment` is asserted to have at most one bit set (a power of two).
bool is_aligned(uint64_t addr, uint64_t alignment) {
  const uint64_t mask = alignment - 1;
  assert((alignment & mask) == 0);
  const uint64_t remainder = addr & mask;
  return remainder == 0;
}
///////////////////////////////////////////////////////////////////////////////
class AutoPerfDump {
public:
AutoPerfDump() : perf_class_(0) {}
~AutoPerfDump() {
for (auto hdevice : hdevices_) {
vx_dump_perf(hdevice, stdout);
}
}
void add_device(vx_device_h hdevice) {
auto perf_class_s = getenv("PERF_CLASS");
if (perf_class_s) {
perf_class_ = std::atoi(perf_class_s);
vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, perf_class_);
}
hdevices_.push_back(hdevice);
}
void remove_device(vx_device_h hdevice) {
hdevices_.remove(hdevice);
vx_dump_perf(hdevice, stdout);
}
int get_perf_class() const {
return perf_class_;
}
private:
std::list<vx_device_h> hdevices_;
int perf_class_;
};
// Global tracker instance; only defined when DUMP_PERF_STATS is enabled.
// Its destructor dumps the counters of any device still registered.
#ifdef DUMP_PERF_STATS
AutoPerfDump gAutoPerfDump;
#endif
// Registers `hdevice` with the global perf tracker.
// Does nothing when DUMP_PERF_STATS is not defined.
void perf_add_device(vx_device_h hdevice) {
#ifndef DUMP_PERF_STATS
  (void)hdevice;
#else
  gAutoPerfDump.add_device(hdevice);
#endif
}
// Unregisters `hdevice` from the global perf tracker, which dumps its counters.
// Does nothing when DUMP_PERF_STATS is not defined.
void perf_remove_device(vx_device_h hdevice) {
#ifndef DUMP_PERF_STATS
  (void)hdevice;
#else
  gAutoPerfDump.remove_device(hdevice);
#endif
}
///////////////////////////////////////////////////////////////////////////////
// Copies a kernel image held in host memory to the device, at the address
// reported by the VX_CAPS_KERNEL_BASE_ADDR capability.
// Returns -1 for a null or empty image, otherwise the result of the
// capability query (if it failed) or of the copy.
extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size) {
  const bool has_payload = (content != NULL) && (size != 0);
  if (!has_payload)
    return -1;

  uint64_t kernel_base_addr;
  const int caps_status = vx_dev_caps(hdevice, VX_CAPS_KERNEL_BASE_ADDR, &kernel_base_addr);
  if (caps_status != 0)
    return caps_status;

  return vx_copy_to_dev(hdevice, kernel_base_addr, content, size);
}
// Reads the whole file `filename` and uploads it as a kernel image through
// vx_upload_kernel_bytes.
// Returns -1 when the file name is null, the file cannot be opened or read,
// or is empty; otherwise the result of the upload.
extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename) {
  if (nullptr == filename) {
    std::cout << "error: invalid kernel file name" << std::endl;
    return -1;
  }

  // Kernel images are raw bytes: open in binary mode so that no newline
  // translation can alter them.
  std::ifstream ifs(filename, std::ios::binary);
  if (!ifs) {
    std::cout << "error: " << filename << " not found" << std::endl;
    return -1;
  }

  // determine the file size; tellg() yields -1 on failure
  ifs.seekg(0, ifs.end);
  const std::streamoff size = ifs.tellg();
  if (size <= 0) {
    std::cout << "error: " << filename << " is empty or unreadable" << std::endl;
    return -1;
  }

  // read file content (the vector releases the buffer on every exit path)
  std::vector<char> content(static_cast<size_t>(size));
  ifs.seekg(0, ifs.beg);
  if (!ifs.read(content.data(), size)) {
    std::cout << "error: failed to read " << filename << std::endl;
    return -1;
  }

  // upload
  return vx_upload_kernel_bytes(hdevice, content.data(), content.size());
}
///////////////////////////////////////////////////////////////////////////////
// Stores `value` under `addr` in data_, replacing any value stored there before.
void DeviceConfig::write(uint32_t addr, uint32_t value) {
data_[addr] = value;
}
// Returns the value stored under `addr`.
// When nothing is stored there, an error is printed before the lookup via
// data_.at(addr) is attempted.
// NOTE(review): data_ is declared outside this file section; if it is a
// standard associative container, at() throws std::out_of_range on a miss — confirm.
uint32_t DeviceConfig::read(uint32_t addr) const {
  if (0 == data_.count(addr)) {
    // addr is unsigned: print it with %u so large addresses are not shown as negative
    printf("Error: DeviceConfig::read(%u) failed\n", addr);
  }
  return data_.at(addr);
}
// Writes the initial device configuration registers:
// the low and high 32-bit halves of STARTUP_ADDR, and perf class 0.
// Returns 0 on success, -1 as soon as one of the writes fails
// (RT_CHECK prints the failing expression first).
int dcr_initialize(vx_device_h hdevice) {
const uint64_t startup_addr(STARTUP_ADDR);
// low 32 bits of the startup address
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), {
return -1;
});
// high 32 bits of the startup address
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), {
return -1;
});
// reset the perf counter class to 0
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
return -1;
});
return 0;
}
///////////////////////////////////////////////////////////////////////////////
// Assembles a 64-bit counter from a buffer of 32-bit words.
// The low half is the word at index (addr - VX_CSR_MPM_BASE); the high half
// is stored 32 words after it.
static uint64_t get_csr_64(const void* ptr, int addr) {
  const uint32_t* words = reinterpret_cast<const uint32_t*>(ptr);
  const uint32_t low_half = words[addr - VX_CSR_MPM_BASE];
  const uint32_t high_half = words[addr - VX_CSR_MPM_BASE + 32];
  uint64_t combined = uint64_t(high_half) << 32;
  combined |= low_half;
  return combined;
}
extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int ret = 0;
uint64_t instrs = 0;
uint64_t cycles = 0;
#ifdef PERF_ENABLE
auto perf_class = gAutoPerfDump.get_perf_class();
// PERF: pipeline stalls
uint64_t ibuffer_stalls = 0;
uint64_t scoreboard_stalls = 0;
uint64_t lsu_stalls = 0;
uint64_t fpu_stalls = 0;
uint64_t alu_stalls = 0;
uint64_t sfu_stalls = 0;
uint64_t ifetches = 0;
uint64_t loads = 0;
uint64_t stores = 0;
uint64_t ifetch_lat = 0;
uint64_t load_lat = 0;
// PERF: Icache
uint64_t icache_reads = 0;
uint64_t icache_read_misses = 0;
// PERF: Dcache
uint64_t dcache_reads = 0;
uint64_t dcache_writes = 0;
uint64_t dcache_read_misses = 0;
uint64_t dcache_write_misses = 0;
uint64_t dcache_bank_stalls = 0;
uint64_t dcache_mshr_stalls = 0;
// PERF: shared memory
uint64_t smem_reads = 0;
uint64_t smem_writes = 0;
uint64_t smem_bank_stalls = 0;
// PERF: l2cache
uint64_t l2cache_reads = 0;
uint64_t l2cache_writes = 0;
uint64_t l2cache_read_misses = 0;
uint64_t l2cache_write_misses = 0;
uint64_t l2cache_bank_stalls = 0;
uint64_t l2cache_mshr_stalls = 0;
// PERF: l3cache
uint64_t l3cache_reads = 0;
uint64_t l3cache_writes = 0;
uint64_t l3cache_read_misses = 0;
uint64_t l3cache_write_misses = 0;
uint64_t l3cache_bank_stalls = 0;
uint64_t l3cache_mshr_stalls = 0;
// PERF: memory
uint64_t mem_reads = 0;
uint64_t mem_writes = 0;
uint64_t mem_lat = 0;
#endif
uint64_t num_cores;
ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
if (ret != 0)
return ret;
std::vector<uint8_t> staging_buf(64* sizeof(uint32_t));
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size();
ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
if (ret != 0)
return ret;
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
instrs += instrs_per_core;
cycles = std::max<uint64_t>(cycles_per_core, cycles);
#ifdef PERF_ENABLE
switch (perf_class) {
case VX_DCR_MPM_CLASS_CORE: {
// PERF: pipeline
// ibuffer_stall
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core);
ibuffer_stalls += ibuffer_stalls_per_core;
// scoreboard_stall
uint64_t scoreboard_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core);
scoreboard_stalls += scoreboard_stalls_per_core;
// alu_stall
uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
alu_stalls += alu_stalls_per_core;
// lsu_stall
uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
lsu_stalls += lsu_stalls_per_core;
// fpu_stall
uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
fpu_stalls += fpu_stalls_per_core;
// sfu_stall
uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core);
sfu_stalls += sfu_stalls_per_core;
// PERF: memory
// ifetches
uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
ifetches += ifetches_per_core;
// loads
uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
loads += loads_per_core;
// stores
uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES);
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
stores += stores_per_core;
// ifetch latency
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
if (num_cores > 1) {
int mem_avg_lat = (int)(double(ifetch_lat_per_core) / double(ifetches_per_core));
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
}
ifetch_lat += ifetch_lat_per_core;
// load latency
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
if (num_cores > 1) {
int mem_avg_lat = (int)(double(load_lat_per_core) / double(loads_per_core));
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
}
load_lat += load_lat_per_core;
} break;
case VX_DCR_MPM_CLASS_MEM: {
if (0 == core_id) {
// PERF: Icache
icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS);
icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R);
// PERF: Dcache
dcache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_READS);
dcache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_WRITES);
dcache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_R);
dcache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_W);
dcache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_BANK_ST);
dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST);
// PERF: smem
smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS);
smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES);
smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST);
// PERF: L2cache
l2cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_READS);
l2cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_WRITES);
l2cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_R);
l2cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_W);
l2cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_BANK_ST);
l2cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MSHR_ST);
// PERF: L3cache
l3cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_READS);
l3cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_WRITES);
l3cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_R);
l3cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_W);
l3cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_BANK_ST);
l3cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MSHR_ST);
// PERF: memory
mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES);
mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LAT);
}
} break;
default:
break;
}
#endif
}
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
#ifdef PERF_ENABLE
switch (perf_class) {
case VX_DCR_MPM_CLASS_CORE: {
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
int load_avg_lat = (int)(double(load_lat) / double(loads));
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls);
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
fprintf(stream, "PERF: sfu unit stalls=%ld\n", sfu_stalls);
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
fprintf(stream, "PERF: loads=%ld\n", loads);
fprintf(stream, "PERF: stores=%ld\n", stores);
fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
} break;
case VX_DCR_MPM_CLASS_MEM: {
int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100);
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100);
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
int l2cache_read_hit_ratio = (int)((1.0 - (double(l2cache_read_misses) / double(l2cache_reads))) * 100);
int l2cache_write_hit_ratio = (int)((1.0 - (double(l2cache_write_misses) / double(l2cache_writes))) * 100);
int l2cache_bank_utilization = (int)((double(l2cache_reads + l2cache_writes) / double(l2cache_reads + l2cache_writes + l2cache_bank_stalls)) * 100);
int l3cache_read_hit_ratio = (int)((1.0 - (double(l3cache_read_misses) / double(l3cache_reads))) * 100);
int l3cache_write_hit_ratio = (int)((1.0 - (double(l3cache_write_misses) / double(l3cache_writes))) * 100);
int l3cache_bank_utilization = (int)((double(l3cache_reads + l3cache_writes) / double(l3cache_reads + l3cache_writes + l3cache_bank_stalls)) * 100);
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio);
fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio);
fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization);
fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
} break;
default:
break;
}
#endif
fflush(stream);
return 0;
}
/**
 * Read one performance counter, either for a single core or aggregated over
 * all cores of the device.
 *
 * @param hdevice  device handle, forwarded to vx_dev_caps() and vx_copy_from_dev().
 * @param counter  CSR index handed to get_csr_64() to pick the counter out of
 *                 each core's staging buffer.
 * @param core_id  index of the core to query, or -1 to aggregate over every core.
 * @param value    receives the result; must not be null.
 * @return 0 on success, -1 on an invalid argument, or the non-zero code
 *         returned by a failing vx_dev_caps() / vx_copy_from_dev() call.
 *
 * Aggregation rule: for VX_CSR_MCYCLE the result is the maximum over the
 * visited cores; for every other counter it is the sum.
 */
extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value) {
  if (value == nullptr) {
    std::cout << "error: invalid value pointer" << std::endl;
    return -1;
  }

  uint64_t num_cores;
  int ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
  if (ret != 0)
    return ret;

  // -1 is the only accepted negative id ("all cores"). Any other negative
  // value would wrap to a huge unsigned core count below, so reject it here.
  if (core_id < -1 || core_id >= static_cast<int>(num_cores)) {
    std::cout << "error: core_id out of range" << std::endl;
    return -1;
  }

  std::vector<uint8_t> staging_buf(64 * sizeof(uint32_t));

  // Visit [first_core, num_cores): the whole device by default, or exactly
  // one core when a specific core_id was requested.
  unsigned first_core = 0;
  if (core_id != -1) {
    first_core = static_cast<unsigned>(core_id);
    num_cores = first_core + 1u;
  }

  uint64_t _value = 0;

  // The loop must start at first_core; restarting from 0 would fold cores
  // 0..core_id into a result that is meant to describe a single core.
  for (unsigned i = first_core; i < num_cores; ++i) {
    // Each core's counter block is read from IO_CSR_ADDR at a stride of one
    // staging buffer.
    uint64_t mpm_mem_addr = IO_CSR_ADDR + i * staging_buf.size();
    ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
    if (ret != 0)
      return ret;

    auto per_core_value = get_csr_64(staging_buf.data(), counter);
    if (counter == VX_CSR_MCYCLE) {
      _value = std::max<uint64_t>(per_core_value, _value);
    } else {
      _value += per_core_value;
    }
  }

  // output
  *value = _value;
  return 0;
}

47
runtime/common/utils.h Normal file
View File

@@ -0,0 +1,47 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vortex.h>
#include <cstdint>
#include <unordered_map>
#include <VX_config.h>
#include <VX_types.h>
// Holds a uint32_t -> uint32_t map (data_) behind a write(addr, value) /
// read(addr) interface. Only the declaration is visible in this header; the
// member functions are defined elsewhere.
class DeviceConfig {
public:
// Presumably records `value` under key `addr` in data_ — confirm against the
// out-of-line definition.
void write(uint32_t addr, uint32_t value);
// Presumably returns the value previously recorded for `addr`. What happens
// for an address that was never written is not visible here — TODO confirm.
uint32_t read(uint32_t addr) const;
private:
// Backing storage for the two accessors above.
std::unordered_map<uint32_t, uint32_t> data_;
};
// Runtime helper prototypes. Only the declarations are visible in this header,
// so the notes below are assumptions to verify against the definitions.

// NOTE(review): presumably performs initial DCR setup on `device` and returns
// 0 on success like the other runtime calls — confirm.
int dcr_initialize(vx_device_h device);
// NOTE(review): presumably rounds `size` up to a multiple of `alignment` — confirm,
// including whether `alignment` must be a power of two.
uint64_t aligned_size(uint64_t size, uint64_t alignment);
// NOTE(review): presumably true when `addr` is a multiple of `alignment` — confirm.
bool is_aligned(uint64_t addr, uint64_t alignment);
// NOTE(review): perf_add_device / perf_remove_device look like a matched pair
// that registers / unregisters `device` for performance reporting — confirm.
void perf_add_device(vx_device_h device);
void perf_remove_device(vx_device_h device);
// Block size constant; also reused below as the allocator base address.
// NOTE(review): unit is presumably bytes — confirm.
#define CACHE_BLOCK_SIZE 64
// Lowest allocatable address: equal to CACHE_BLOCK_SIZE, i.e. non-zero.
#define ALLOC_BASE_ADDR CACHE_BLOCK_SIZE
// Upper allocation bound: STARTUP_ADDR, which is not defined in this header
// (presumably supplied by VX_config.h — confirm).
#define ALLOC_MAX_ADDR STARTUP_ADDR
// Global memory size selected by XLEN. Both values exceed 32 bits, so they
// must be held in a 64-bit type.
#if (XLEN == 64)
#define GLOBAL_MEM_SIZE 0x200000000 // 8 GB
#else
#define GLOBAL_MEM_SIZE 0x100000000 // 4 GB
#endif