Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

0
runtime/.gitignore vendored
View File

View File

@@ -1,49 +1,25 @@
XLEN ?= 32
all: stub rtlsim simx opae
ifeq ($(XLEN),32)
RISCV_TOOLCHAIN_PATH = /opt/riscv-gnu-toolchain
else
RISCV_TOOLCHAIN_PATH = /opt/riscv64-gnu-toolchain
endif
stub:
$(MAKE) -C stub
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
simx:
$(MAKE) -C simx
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objcopy
rtlsim:
$(MAKE) -C rtlsim
ifeq ($(XLEN),32)
CFLAGS += -march=rv32imf -mabi=ilp32f
else
CFLAGS += -march=rv64imfd -mabi=lp64d
endif
opae:
$(MAKE) -C opae
CFLAGS += -O3 -mcmodel=medany -Wstack-usage=1024 -fno-exceptions -fdata-sections -ffunction-sections
CFLAGS += -I./include -I../hw
PROJECT = libvortexrt
SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
all: $(PROJECT).a $(PROJECT).dump
$(PROJECT).dump: $(PROJECT).a
$(DP) -D $(PROJECT).a > $(PROJECT).dump
%.S.o: src/%.S
$(CC) $(CFLAGS) -c $< -o $@
%.c.o: src/%.c
$(CC) $(CFLAGS) -c $< -o $@
$(PROJECT).a: $(OBJS)
$(AR) rcs $@ $^
.depend: $(SRCS)
$(CC) $(CFLAGS) -MM $^ > .depend;
xrt:
$(MAKE) -C xrt
clean:
rm -rf *.a *.o *.dump .depend
$(MAKE) clean -C stub
$(MAKE) clean -C simx
$(MAKE) clean -C rtlsim
$(MAKE) clean -C opae
$(MAKE) clean -C xrt
.PHONY: all stub simx rtlsim opae xrt clean

455
runtime/common/malloc.h Normal file
View File

@@ -0,0 +1,455 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <assert.h>
#include <stdio.h>
namespace vortex {
// Bookkeeping-only allocator for a linear address range.
//
// The range [baseAddress, baseAddress + capacity) is carved into pages that
// are handed out sequentially (nextAddress_ is a bump pointer, relative to
// the base address). Each page is split into blocks tracked by three
// intrusive lists:
//   - usedList : blocks currently handed out to callers,
//   - freeSList: free blocks sorted by decreasing size (allocation lookup),
//   - freeMList: free blocks sorted by increasing address (merge on release).
// The managed range itself is never dereferenced by this class.
class MemoryAllocator {
public:
  // baseAddress: first address of the managed range.
  // capacity   : size of the managed range in bytes.
  // pageAlign  : page size granularity; must be a power of two.
  // blockAlign : allocation size granularity; must be a power of two.
  MemoryAllocator(
    uint64_t baseAddress,
    uint64_t capacity,
    uint32_t pageAlign,
    uint32_t blockAlign)
    : baseAddress_(baseAddress)
    , capacity_(capacity)
    , pageAlign_(pageAlign)
    , blockAlign_(blockAlign)
    , pages_(nullptr)
    , nextAddress_(0)
    , allocated_(0)
  {}

  ~MemoryAllocator() {
    // Release every page unconditionally. Going through DeletePage() here
    // would assert on pages that still hold live allocations and would skip
    // pages that are not at the top of the range, leaking both.
    page_t* currPage = pages_;
    while (currPage) {
      auto nextPage = currPage->next;
      delete currPage;
      currPage = nextPage;
    }
    pages_ = nullptr;
  }

  // NOTE: the return type is 32-bit, so a base address above 4 GiB is
  // truncated. The signature is kept as-is for existing callers.
  uint32_t baseAddress() const {
    return baseAddress_;
  }

  // NOTE: the return type is 32-bit, so a capacity above 4 GiB is truncated.
  // The signature is kept as-is for existing callers.
  uint32_t capacity() const {
    return capacity_;
  }

  // Bytes not currently handed out (capacity minus allocated).
  uint64_t free() const {
    return (capacity_ - allocated_);
  }

  // Bytes currently handed out, after block alignment.
  uint64_t allocated() const {
    return allocated_;
  }

  // Reserves `size` bytes (rounded up to blockAlign) and stores the absolute
  // address in `*addr`. Returns 0 on success, -1 on invalid arguments or when
  // the range is exhausted.
  int allocate(uint64_t size, uint64_t* addr) {
    if (size == 0 || addr == nullptr) {
      printf("error: invalid argurments\n");
      return -1;
    }

    // Align allocation size; a wrapped result means the request cannot fit
    uint64_t alignedSize = AlignSize(size, blockAlign_);
    if (alignedSize < size) {
      printf("error: out of memory\n");
      return -1;
    }
    size = alignedSize;

    // Walk thru all pages to find a free block
    block_t* freeBlock = nullptr;
    auto currPage = pages_;
    while (currPage) {
      auto currBlock = currPage->freeSList;
      if (currBlock) {
        // The free S-list is already sorted with the largest block first
        // Quick check if the head block has enough space.
        if (currBlock->size >= size) {
          // Find the smallest matching block in the S-list
          while (currBlock->nextFreeS
              && (currBlock->nextFreeS->size >= size)) {
            currBlock = currBlock->nextFreeS;
          }
          // Return the free block
          freeBlock = currBlock;
          break;
        }
      }
      currPage = currPage->next;
    }

    if (nullptr == freeBlock) {
      // Allocate a new page for this request
      currPage = this->NewPage(size);
      if (nullptr == currPage) {
        printf("error: out of memory\n");
        return -1;
      }
      freeBlock = currPage->freeSList;
    }

    // Remove the block from the free lists
    assert(freeBlock->size >= size);
    currPage->RemoveFreeMList(freeBlock);
    currPage->RemoveFreeSList(freeBlock);

    // If the free block we have found is larger than what we are looking for,
    // we may be able to split our free block in two.
    uint64_t extraBytes = freeBlock->size - size;
    if (extraBytes >= blockAlign_) {
      // Reduce the free block size to the requested value
      freeBlock->size = size;

      // Allocate a new block to contain the extra buffer
      auto nextAddr = freeBlock->addr + size;
      auto newBlock = new block_t(nextAddr, extraBytes);

      // Add the new block to the free lists
      currPage->InsertFreeMList(newBlock);
      currPage->InsertFreeSList(newBlock);
    }

    // Insert the free block into the used list
    currPage->InsertUsedList(freeBlock);

    // Return the free block address
    *addr = baseAddress_ + freeBlock->addr;

    // Update allocated size
    allocated_ += size;

    return 0;
  }

  // Returns the block starting at absolute address `addr` to the free lists,
  // merging it with adjacent free blocks and dropping empty top-level pages.
  // Returns 0 on success, -1 when `addr` is not the start of a used block.
  int release(uint64_t addr) {
    // Walk all pages to find the pointer
    uint64_t local_addr = addr - baseAddress_;
    block_t* usedBlock = nullptr;
    auto currPage = pages_;
    while (currPage) {
      if (local_addr >= currPage->addr
       && local_addr < (currPage->addr + currPage->size)) {
        auto currBlock = currPage->usedList;
        while (currBlock) {
          if (currBlock->addr == local_addr) {
            usedBlock = currBlock;
            break;
          }
          currBlock = currBlock->nextUsed;
        }
        break;
      }
      currPage = currPage->next;
    }

    // found the corresponding block?
    if (nullptr == usedBlock) {
      // uint64_t is not `unsigned long` on every platform; print portably
      printf("error: invalid address to release: 0x%llx\n", (unsigned long long)addr);
      return -1;
    }

    // Capture the size now: merging below changes usedBlock->size
    auto size = usedBlock->size;

    // Remove the block from the used list
    currPage->RemoveUsedList(usedBlock);

    // Insert the block into the free M-list.
    currPage->InsertFreeMList(usedBlock);

    // Check if we can merge adjacent free blocks from the left.
    if (usedBlock->prevFreeM) {
      // Calculate the previous address
      auto prevAddr = usedBlock->prevFreeM->addr + usedBlock->prevFreeM->size;
      if (usedBlock->addr == prevAddr) {
        auto prevBlock = usedBlock->prevFreeM;

        // Merge the blocks to the left
        prevBlock->size += usedBlock->size;
        prevBlock->nextFreeM = usedBlock->nextFreeM;
        if (prevBlock->nextFreeM) {
          prevBlock->nextFreeM->prevFreeM = prevBlock;
        }

        // Detach previous block from the free S-list since size increased
        currPage->RemoveFreeSList(prevBlock);

        // reset usedBlock
        delete usedBlock;
        usedBlock = prevBlock;
      }
    }

    // Check if we can merge adjacent free blocks from the right.
    if (usedBlock->nextFreeM) {
      // Calculate the next allocation start address
      auto nextAddr = usedBlock->addr + usedBlock->size;
      if (usedBlock->nextFreeM->addr == nextAddr) {
        auto nextBlock = usedBlock->nextFreeM;

        // Merge the blocks to the right
        usedBlock->size += nextBlock->size;
        usedBlock->nextFreeM = nextBlock->nextFreeM;
        if (usedBlock->nextFreeM) {
          usedBlock->nextFreeM->prevFreeM = usedBlock;
        }

        // Delete next block
        currPage->RemoveFreeSList(nextBlock);
        delete nextBlock;
      }
    }

    // Insert the block into the free S-list.
    currPage->InsertFreeSList(usedBlock);

    // Check if we can free empty pages
    if (nullptr == currPage->usedList) {
      // Try to delete the page
      while (currPage && this->DeletePage(currPage)) {
        currPage = this->FindNextEmptyPage();
      }
    }

    // update allocated size
    allocated_ -= size;

    return 0;
  }

private:

  // A contiguous sub-range of a page. A block is linked either into the
  // page's used list, or into both free lists at the same time.
  struct block_t {
    block_t* nextFreeS;
    block_t* prevFreeS;

    block_t* nextFreeM;
    block_t* prevFreeM;

    block_t* nextUsed;
    block_t* prevUsed;

    uint64_t addr; // offset relative to the allocator base address
    uint64_t size; // size in bytes

    block_t(uint64_t addr, uint64_t size)
      : nextFreeS(nullptr)
      , prevFreeS(nullptr)
      , nextFreeM(nullptr)
      , prevFreeM(nullptr)
      , nextUsed(nullptr)
      , prevUsed(nullptr)
      , addr(addr)
      , size(size)
    {}
  };

  struct page_t {
    page_t* next;

    // List of used blocks
    block_t* usedList;

    // List with blocks sorted by decreasing sizes
    // Used for block lookup during memory allocation.
    block_t* freeSList;

    // List with blocks sorted by increasing memory addresses
    // Used for block merging during memory release.
    block_t* freeMList;

    uint64_t addr;
    uint64_t size;

    // A new page starts out as one free block covering the whole page.
    page_t(uint64_t addr, uint64_t size) :
      next(nullptr),
      usedList(nullptr),
      addr(addr),
      size(size) {
      freeSList = freeMList = new block_t(addr, size);
    }

    // The page owns its blocks. Every free block is on the M-list (the S-list
    // links the same objects), so walking usedList and freeMList frees each
    // block exactly once.
    ~page_t() {
      while (usedList) {
        auto nextBlock = usedList->nextUsed;
        delete usedList;
        usedList = nextBlock;
      }
      while (freeMList) {
        auto nextBlock = freeMList->nextFreeM;
        delete freeMList;
        freeMList = nextBlock;
      }
      freeSList = nullptr;
    }

    void InsertUsedList(block_t* block) {
      block->nextUsed = usedList;
      if (usedList) {
        usedList->prevUsed = block;
      }
      usedList = block;
    }

    void RemoveUsedList(block_t* block) {
      if (block->prevUsed) {
        block->prevUsed->nextUsed = block->nextUsed;
      } else {
        usedList = block->nextUsed;
      }
      if (block->nextUsed) {
        block->nextUsed->prevUsed = block->prevUsed;
      }
      block->nextUsed = nullptr;
      block->prevUsed = nullptr;
    }

    // Inserts keeping the M-list ordered by increasing address.
    void InsertFreeMList(block_t* block) {
      block_t* currBlock = freeMList;
      block_t* prevBlock = nullptr;
      while (currBlock && (currBlock->addr < block->addr)) {
        prevBlock = currBlock;
        currBlock = currBlock->nextFreeM;
      }
      block->nextFreeM = currBlock;
      block->prevFreeM = prevBlock;
      if (prevBlock) {
        prevBlock->nextFreeM = block;
      } else {
        freeMList = block;
      }
      if (currBlock) {
        currBlock->prevFreeM = block;
      }
    }

    void RemoveFreeMList(block_t* block) {
      if (block->prevFreeM) {
        block->prevFreeM->nextFreeM = block->nextFreeM;
      } else {
        freeMList = block->nextFreeM;
      }
      if (block->nextFreeM) {
        block->nextFreeM->prevFreeM = block->prevFreeM;
      }
      block->nextFreeM = nullptr;
      block->prevFreeM = nullptr;
    }

    // Inserts keeping the S-list ordered by decreasing size.
    void InsertFreeSList(block_t* block) {
      block_t* currBlock = this->freeSList;
      block_t* prevBlock = nullptr;
      while (currBlock && (currBlock->size > block->size)) {
        prevBlock = currBlock;
        currBlock = currBlock->nextFreeS;
      }
      block->nextFreeS = currBlock;
      block->prevFreeS = prevBlock;
      if (prevBlock) {
        prevBlock->nextFreeS = block;
      } else {
        this->freeSList = block;
      }
      if (currBlock) {
        currBlock->prevFreeS = block;
      }
    }

    void RemoveFreeSList(block_t* block) {
      if (block->prevFreeS) {
        block->prevFreeS->nextFreeS = block->nextFreeS;
      } else {
        freeSList = block->nextFreeS;
      }
      if (block->nextFreeS) {
        block->nextFreeS->prevFreeS = block->prevFreeS;
      }
      block->nextFreeS = nullptr;
      block->prevFreeS = nullptr;
    }
  };

  // Creates a page large enough for `size` bytes at the current top of the
  // range. Returns nullptr, leaving the allocator untouched, when it does
  // not fit.
  page_t* NewPage(uint64_t size) {
    // Round the page size up to the page alignment
    uint64_t pageSize = AlignSize(size, pageAlign_);
    if (pageSize < size)
      return nullptr; // alignment wrapped around

    // Capacity check, done BEFORE advancing nextAddress_ so that a failed
    // request does not consume address space. Written as a subtraction to
    // avoid overflowing nextAddress_ + pageSize.
    if (nextAddress_ > capacity_ || pageSize > (capacity_ - nextAddress_))
      return nullptr;

    // Allocate page memory
    auto addr = nextAddress_;
    nextAddress_ += pageSize;

    // Allocate object
    auto newPage = new page_t(addr, pageSize);

    // Insert the new page into the list
    newPage->next = pages_;
    pages_ = newPage;

    return newPage;
  }

  // Deletes an empty page, but only when it is the topmost page so that
  // nextAddress_ can be rolled back. Returns false when the page was kept.
  bool DeletePage(page_t* page) {
    // The page should be empty
    assert(nullptr == page->usedList);
    assert(page->freeMList && (nullptr == page->freeMList->nextFreeM));

    // Only delete top-level pages
    auto nextAddr = page->addr + page->size;
    if (nextAddr != nextAddress_)
      return false;

    // Remove the page from the list
    page_t* prevPage = nullptr;
    auto currPage = pages_;
    while (currPage) {
      if (currPage == page) {
        if (prevPage) {
          prevPage->next = currPage->next;
        } else {
          pages_ = currPage->next;
        }
        break;
      }
      prevPage = currPage;
      currPage = currPage->next;
    }

    // Update next allocation address
    nextAddress_ = page->addr;

    // free object (the page destructor releases its remaining free block)
    delete page;

    return true;
  }

  // Returns the first page in the list without used blocks, or nullptr.
  page_t* FindNextEmptyPage() {
    auto currPage = pages_;
    while (currPage) {
      if (nullptr == currPage->usedList)
        return currPage;
      currPage = currPage->next;
    }
    return nullptr;
  }

  // Rounds `size` up to a multiple of `alignment` (a power of two).
  static uint64_t AlignSize(uint64_t size, uint64_t alignment) {
    assert(0 == (alignment & (alignment - 1)));
    return (size + alignment - 1) & ~(alignment - 1);
  }

  uint64_t baseAddress_;
  uint64_t capacity_;
  uint32_t pageAlign_;
  uint32_t blockAlign_;
  page_t*  pages_;
  // Offset of the next page, relative to baseAddress_. Must be as wide as
  // capacity_: a narrower type wraps around and hands out overlapping pages.
  uint64_t nextAddress_;
  uint64_t allocated_;
};
} // namespace vortex

File diff suppressed because it is too large Load Diff

359
runtime/common/scope.cpp Normal file
View File

@@ -0,0 +1,359 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "scope.h"
#include <VX_config.h>
#include <nlohmann_json.hpp>
#include <iostream>
#include <fstream>
#include <thread>
#include <chrono>
#include <vector>
#include <list>
#include <assert.h>
#include <chrono>
#include <thread>
#include <condition_variable>
#include <mutex>
#include <unordered_set>
#include <sstream>
#define FRAME_FLUSH_SIZE 100
#define MMIO_SCOPE_READ (AFU_IMAGE_MMIO_SCOPE_READ * 4)
#define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4)
#define CMD_GET_WIDTH 0
#define CMD_GET_COUNT 1
#define CMD_GET_START 2
#define CMD_GET_DATA 3
#define CMD_SET_START 4
#define CMD_SET_STOP 5
// Evaluates `_expr` once as an int. When the result is non-zero, prints the
// stringified expression together with the value and returns that value from
// the ENCLOSING function, so it may only be used inside functions returning
// int. Because `#_expr` is part of the message, rewording a call site changes
// the emitted text.
#define CHECK_ERR(_expr) \
do { \
int err = _expr; \
if (err == 0) \
break; \
printf("[SCOPE] error: '%s' returned %d!\n", #_expr, err); \
return err; \
} while (false)
// One signal inside a tap frame.
struct tap_signal_t {
// Identifier written in the VCD `$var` declarations and value changes;
// assigned sequentially (starting at 1) by vx_scope_stop().
uint32_t id;
// Signal name, taken from the manifest entry.
std::string name;
// Signal width in bits, taken from the manifest entry.
uint32_t width;
};
// Host-side state for one scope tap while its trace is being dumped.
struct tap_t {
// Tap id from the manifest; encoded as `(id << 3) | CMD_*` in command words.
uint32_t id;
// Frame width in bits, from the manifest.
uint32_t width;
// Number of recorded frames, read from the device with CMD_GET_COUNT.
uint32_t frames;
// Number of frames dumped so far; the tap is done when it equals `frames`.
uint32_t cur_frame;
// Time of the next frame to dump: start + delta + 1 initially, then advanced
// by each per-frame delta read from the device.
uint64_t cycle_time;
// Dot-separated module path from the manifest.
std::string path;
// Signals making up one frame; dump_tap() walks them in reverse order.
std::vector<tap_signal_t> signals;
};
static scope_callback_t g_callback;
using json = nlohmann::json;
// Breaks `s` into the pieces separated by `delimiter`, using std::getline
// semantics: adjacent delimiters yield an empty piece, while an empty input
// or a trailing delimiter yields no extra piece.
static std::vector<std::string> split(const std::string &s, char delimiter) {
  std::vector<std::string> parts;
  std::istringstream input(s);
  for (std::string piece; std::getline(input, piece, delimiter); ) {
    parts.push_back(piece);
  }
  return parts;
}
// Writes the VCD scope for module `name`: its `$scope` line, one `$var` line
// per signal when a tap is registered under that name in `tails`, then the
// scopes of its children from `hierarchy` (recursively, one more space of
// indentation per level), and finally the matching `$upscope`.
static void dump_module(std::ofstream& ofs,
                        const std::string& name,
                        std::unordered_map<std::string, std::unordered_set<std::string>>& hierarchy,
                        std::unordered_map<std::string, tap_t*>& tails,
                        int indentation) {
  const std::string pad(indentation, ' ');

  ofs << pad << "$scope module " << name << " $end" << std::endl;

  // Signals of the tap attached to this module, if any
  const auto tap_entry = tails.find(name);
  if (tap_entry != tails.end()) {
    const tap_t* tap = tap_entry->second;
    for (const auto& sig : tap->signals) {
      ofs << pad << " $var reg " << sig.width << " " << sig.id << " " << sig.name << " $end" << std::endl;
    }
  }

  // Child modules
  const auto children = hierarchy.find(name);
  if (children != hierarchy.end()) {
    for (const auto& child_name : children->second) {
      dump_module(ofs, child_name, hierarchy, tails, indentation + 1);
    }
  }

  ofs << pad << "$upscope $end" << std::endl;
}
static void dump_header(std::ofstream& ofs, std::vector<tap_t>& taps) {
ofs << "$version Generated by Vortex Scope Analyzer $end" << std::endl;
ofs << "$timescale 1 ns $end" << std::endl;
ofs << "$scope module TOP $end" << std::endl;
ofs << " $var reg 1 0 clk $end" << std::endl;
std::unordered_map<std::string, std::unordered_set<std::string>> hierarchy;
std::unordered_set<std::string> heads;
std::unordered_map<std::string, tap_t*> tails;
// Build hierarchy
for (auto& tap : taps) {
std::vector<std::string> tokens = split(tap.path, '.');
for (size_t i = 1; i < tokens.size(); ++i) {
hierarchy[tokens[i-1]].insert(tokens[i]);
}
auto h = tokens[0];
auto t = tokens[tokens.size()-1];
heads.insert(h);
tails[t] = &tap;
}
// Dump module huierarchy
for (auto& head : heads) {
dump_module(ofs, head, hierarchy, tails, 1);
}
ofs << "$upscope $end" << std::endl;
ofs << "enddefinitions $end" << std::endl;
}
// Returns the tap with the smallest cycle_time among those that still have
// frames to dump (cur_frame != frames), or nullptr when every tap is done.
// On equal times the tap that comes first in `taps` wins.
static tap_t* find_nearest_tap(std::vector<tap_t>& taps) {
  tap_t* best = nullptr;
  for (auto& candidate : taps) {
    const bool exhausted = (candidate.cur_frame == candidate.frames);
    if (exhausted)
      continue;
    if (best == nullptr || candidate.cycle_time < best->cycle_time) {
      best = &candidate;
    }
  }
  return best;
}
// Emits one full clock period (value 0 at timestamp 2*t, value 1 at 2*t+1,
// on VCD identifier 0) for every t in [cur_time, next_time) and returns the
// new current time. When cur_time is already >= next_time nothing is written
// and cur_time is returned unchanged.
static uint64_t advance_time(std::ofstream& ofs, uint64_t next_time, uint64_t cur_time) {
  for (; cur_time < next_time; ++cur_time) {
    const uint64_t low_stamp = cur_time * 2;
    ofs << '#' << (low_stamp + 0) << std::endl;
    ofs << "b0 0" << std::endl;
    ofs << '#' << (low_stamp + 1) << std::endl;
    ofs << "b1 0" << std::endl;
  }
  return cur_time;
}
static int dump_tap(std::ofstream& ofs, tap_t* tap, vx_device_h hdevice) {
uint32_t signal_offset = 0;
uint32_t frame_offset = 0;
uint64_t word;
std::vector<char> signal_data(tap->width);
auto signal_it = tap->signals.rbegin();
uint32_t signal_width = signal_it->width;
do {
// read data
uint64_t cmd_data = (tap->id << 3) | CMD_GET_DATA;
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
CHECK_ERR(g_callback.registerRead(hdevice, &word));
do {
uint32_t word_offset = frame_offset % 64;
signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
++signal_offset;
++frame_offset;
if (signal_offset == signal_width) {
signal_data[signal_width] = 0; // string null termination
ofs << 'b' << signal_data.data() << ' ' << signal_it->id << std::endl;
if (frame_offset == tap->width) {
// end-of-frame
++tap->cur_frame;
if (tap->cur_frame != tap->frames) {
// read next delta
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
CHECK_ERR(g_callback.registerRead(hdevice, &word));
tap->cycle_time += 1 + word;
if (0 == (tap->cur_frame % FRAME_FLUSH_SIZE)) {
ofs << std::flush;
std::cout << std::dec << "[SCOPE] flush tap #" << tap->id << ": "<< tap->cur_frame << "/" << tap->frames << " frames, next_time=" << tap->cycle_time << std::endl;
}
}
break;
}
signal_offset = 0;
++signal_it;
signal_width = signal_it->width;
}
} while ((frame_offset % 64) != 0);
} while (frame_offset != tap->width);
return 0;
}
// Validates the scope manifest against the device and programs the taps.
//
// The manifest is the JSON file named by the SCOPE_JSON_PATH environment
// variable. For every tap listed under "taps" the width reported by the
// device must match the manifest. A stop and/or start time is then written
// to every tap unless the corresponding argument is uint64_t(-1).
//
// callback  : register accessors; copied into the file-level g_callback.
// hdevice   : device handle passed through to the callbacks.
// Returns 0 on success, -1 on invalid arguments or an unusable manifest,
// 1 on a tap width mismatch, or the callback's error code.
int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time) {
  if (nullptr == hdevice || nullptr == callback)
    return -1;

  // getenv() returns null when the variable is unset; constructing the
  // stream from, or printing, a null C string is undefined behavior.
  const char* json_path = getenv("SCOPE_JSON_PATH");
  if (nullptr == json_path) {
    std::cerr << "[SCOPE] error: SCOPE_JSON_PATH environment variable is not set" << std::endl;
    return -1;
  }

  std::ifstream ifs(json_path);
  if (!ifs) {
    std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
    return -1;
  }
  auto json_obj = json::parse(ifs);
  if (json_obj.is_null()) {
    std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << std::endl;
    return -1;
  }

  g_callback = *callback;

  // validate scope manifest
  for (auto& tap : json_obj["taps"]) {
    auto id = tap["id"].get<uint32_t>();
    auto width = tap["width"].get<uint32_t>();

    uint64_t cmd_width = (id << 3) | CMD_GET_WIDTH;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_width));
    uint64_t dev_width;
    CHECK_ERR(g_callback.registerRead(hdevice, &dev_width));
    if (width != dev_width) {
      std::cerr << "[SCOPE] error: invalid tap #" << id << " width, actual=" << dev_width << ", expected=" << width << std::endl;
      return 1;
    }
  }

  // set stop time
  if (stop_time != uint64_t(-1)) {
    std::cout << "[SCOPE] stop time: " << std::dec << stop_time << "s" << std::endl;
    for (auto& tap : json_obj["taps"]) {
      auto id = tap["id"].get<uint32_t>();
      uint64_t cmd_stop = (stop_time << 11) | (id << 3) | CMD_SET_STOP;
      CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
    }
  }

  // start recording
  if (start_time != uint64_t(-1)) {
    std::cout << "[SCOPE] start time: " << std::dec << start_time << "s" << std::endl;
    for (auto& tap : json_obj["taps"]) {
      auto id = tap["id"].get<uint32_t>();
      uint64_t cmd_start = (start_time << 11) | (id << 3) | CMD_SET_START;
      CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
    }
  }

  return 0;
}
// Stops recording on every tap listed in the manifest and dumps the captured
// frames to "scope.vcd" in the current directory.
//
// The manifest is re-read from the file named by SCOPE_JSON_PATH. Frames are
// written in time order: the tap with the smallest pending cycle_time is
// dumped next, with clock toggles emitted in between.
//
// Returns 0 on success (or when the manifest parses to null), -1 on an
// invalid handle, missing callbacks, or an unusable manifest/output file, or
// the error code of a failing register access.
int vx_scope_stop(vx_device_h hdevice) {
  if (nullptr == hdevice)
    return -1;

  // The callbacks are installed by vx_scope_start(); the file-level object
  // is zero-initialized until then.
  if (nullptr == g_callback.registerWrite || nullptr == g_callback.registerRead) {
    std::cerr << "[SCOPE] error: scope was not started" << std::endl;
    return -1;
  }

  std::vector<tap_t> taps;

  {
    // getenv() returns null when the variable is unset
    const char* json_path = getenv("SCOPE_JSON_PATH");
    if (nullptr == json_path) {
      std::cerr << "[SCOPE] error: SCOPE_JSON_PATH environment variable is not set" << std::endl;
      return -1;
    }
    std::ifstream ifs(json_path);
    if (!ifs) {
      std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
      return -1;
    }
    auto json_obj = json::parse(ifs);
    if (json_obj.is_null())
      return 0;

    // Signal ids are global across taps; 0 is reserved for the clock
    uint32_t signal_id = 1;

    for (auto& tap : json_obj["taps"]) {
      tap_t _tap;
      _tap.id    = tap["id"].get<uint32_t>();
      _tap.width = tap["width"].get<uint32_t>();
      _tap.path  = tap["path"].get<std::string>();
      _tap.cycle_time = 0;
      _tap.frames = 0;
      _tap.cur_frame = 0;

      for (auto& signal : tap["signals"]) {
        auto name  = signal[0].get<std::string>();
        auto width = signal[1].get<uint32_t>();
        _tap.signals.push_back({signal_id, name, width});
        ++signal_id;
      }

      taps.emplace_back(std::move(_tap));
    }
  }

  // stop recording
  for (auto& tap : taps) {
    uint64_t cmd_stop = (0 << 11) | (tap.id << 3) | CMD_SET_STOP;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
  }

  std::cout << "[SCOPE] trace dump begin..." << std::endl;

  std::ofstream ofs("scope.vcd");
  if (!ofs) {
    std::cerr << "[SCOPE] error: cannot create trace file: scope.vcd" << std::endl;
    return -1;
  }

  dump_header(ofs, taps);

  // load trace info
  for (auto& tap : taps) {
    uint64_t count, start, delta;

    // get count
    uint64_t cmd_count = (tap.id << 3) | CMD_GET_COUNT;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_count));
    CHECK_ERR(g_callback.registerRead(hdevice, &count));

    // get start
    uint64_t cmd_start = (tap.id << 3) | CMD_GET_START;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
    CHECK_ERR(g_callback.registerRead(hdevice, &start));

    // get data
    uint64_t cmd_data = (tap.id << 3) | CMD_GET_DATA;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
    CHECK_ERR(g_callback.registerRead(hdevice, &delta));

    tap.frames = count;
    tap.cycle_time = 1 + start + delta;

    std::cout << std::dec << "[SCOPE] tap #" << tap.id
              << ": width=" << tap.width
              << ", num_frames=" << tap.frames
              << ", start_time=" << tap.cycle_time
              << ", path=" << tap.path << std::endl;
  }

  uint64_t cur_time = 0;

  while (true) {
    // find the nearest tap
    auto tap = find_nearest_tap(taps);
    if (tap == nullptr)
      break;
    // advance clock
    cur_time = advance_time(ofs, tap->cycle_time, cur_time);
    // dump tap
    CHECK_ERR(dump_tap(ofs, tap, hdevice));
  }

  std::cout << "[SCOPE] trace dump done! - " << (cur_time/2) << " cycles" << std::endl;

  return 0;
}

35
runtime/common/scope.h Normal file
View File

@@ -0,0 +1,35 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vortex.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef int (*pfn_registerWrite)(vx_device_h hdevice, uint64_t value);
typedef int (*pfn_registerRead)(vx_device_h hdevice, uint64_t *value);
// Register accessors supplied by the caller of vx_scope_start(). Both are
// expected to return 0 on success; any other value is reported and
// propagated as an error by the scope code.
struct scope_callback_t {
// Writes one 64-bit command word to the scope register.
pfn_registerWrite registerWrite;
// Reads one 64-bit word (the reply to the last command) from the scope register.
pfn_registerRead registerRead;
};
int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time);
int vx_scope_stop(vx_device_h hdevice);
#ifdef __cplusplus
}
#endif

463
runtime/common/utils.cpp Normal file
View File

@@ -0,0 +1,463 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "utils.h"
#include <iostream>
#include <fstream>
#include <list>
#include <cstring>
#include <vector>
#include <vortex.h>
#include <assert.h>
// Evaluates `_expr` once as an int. When the result is non-zero, prints the
// stringified expression with the value and then runs `_cleanup`, which is
// pasted verbatim (typically a braced block ending in `return`). Because
// `#_expr` is part of the message, rewording a call site changes the emitted
// text.
#define RT_CHECK(_expr, _cleanup) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
_cleanup \
} while (false)
// Rounds `size` up to the next multiple of `alignment`, which must be a
// power of two (checked with an assertion).
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
  const uint64_t mask = alignment - 1;
  assert(0 == (alignment & mask));
  return (size + mask) & ~mask;
}
// Tells whether `addr` is a multiple of `alignment`, which must be a power
// of two (checked with an assertion).
bool is_aligned(uint64_t addr, uint64_t alignment) {
  const uint64_t mask = alignment - 1;
  assert(0 == (alignment & mask));
  const uint64_t remainder = addr & mask;
  return remainder == 0;
}
///////////////////////////////////////////////////////////////////////////////
class AutoPerfDump {
public:
AutoPerfDump() : perf_class_(0) {}
~AutoPerfDump() {
for (auto hdevice : hdevices_) {
vx_dump_perf(hdevice, stdout);
}
}
void add_device(vx_device_h hdevice) {
auto perf_class_s = getenv("PERF_CLASS");
if (perf_class_s) {
perf_class_ = std::atoi(perf_class_s);
vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, perf_class_);
}
hdevices_.push_back(hdevice);
}
void remove_device(vx_device_h hdevice) {
hdevices_.remove(hdevice);
vx_dump_perf(hdevice, stdout);
}
int get_perf_class() const {
return perf_class_;
}
private:
std::list<vx_device_h> hdevices_;
int perf_class_;
};
#ifdef DUMP_PERF_STATS
AutoPerfDump gAutoPerfDump;
#endif
// Registers `hdevice` with the global AutoPerfDump when the runtime is built
// with DUMP_PERF_STATS; otherwise does nothing.
void perf_add_device(vx_device_h hdevice) {
#ifdef DUMP_PERF_STATS
gAutoPerfDump.add_device(hdevice);
#else
(void)hdevice; // silence the unused-parameter warning
#endif
}
// Unregisters `hdevice` from the global AutoPerfDump (which dumps its
// counters) when the runtime is built with DUMP_PERF_STATS; otherwise does
// nothing.
void perf_remove_device(vx_device_h hdevice) {
#ifdef DUMP_PERF_STATS
gAutoPerfDump.remove_device(hdevice);
#else
(void)hdevice; // silence the unused-parameter warning
#endif
}
///////////////////////////////////////////////////////////////////////////////
// Copies `size` bytes starting at `content` to the device at the address
// reported by the VX_CAPS_KERNEL_BASE_ADDR capability.
// Returns -1 for a null or empty buffer, otherwise the result of the
// capability query when it fails, or of the device copy.
extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size) {
  const bool has_payload = (NULL != content) && (0 != size);
  if (!has_payload)
    return -1;

  uint64_t kernel_base_addr;
  const int caps_status = vx_dev_caps(hdevice, VX_CAPS_KERNEL_BASE_ADDR, &kernel_base_addr);
  if (caps_status != 0)
    return caps_status;

  return vx_copy_to_dev(hdevice, kernel_base_addr, content, size);
}
// Reads the whole file `filename` into memory and uploads it with
// vx_upload_kernel_bytes().
// Returns -1 when the name is null or the file cannot be opened, is empty,
// or cannot be read completely; otherwise the result of the upload.
extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename) {
  if (nullptr == filename) {
    std::cout << "error: invalid kernel file name" << std::endl;
    return -1;
  }

  // The kernel image is raw bytes: open in binary mode so that no newline
  // translation alters the content or the size computed below.
  std::ifstream ifs(filename, std::ios::binary);
  if (!ifs) {
    std::cout << "error: " << filename << " not found" << std::endl;
    return -1;
  }

  // read file content
  ifs.seekg(0, ifs.end);
  const std::streamoff size = ifs.tellg(); // -1 when the position is unavailable
  if (size <= 0) {
    std::cout << "error: " << filename << " is empty or unreadable" << std::endl;
    return -1;
  }
  std::vector<char> content(static_cast<size_t>(size));
  ifs.seekg(0, ifs.beg);
  if (!ifs.read(content.data(), size)) {
    std::cout << "error: failed to read " << filename << std::endl;
    return -1;
  }

  // upload; the buffer is released automatically on return
  return vx_upload_kernel_bytes(hdevice, content.data(), content.size());
}
///////////////////////////////////////////////////////////////////////////////
// Stores `value` under `addr`, replacing any value previously written there.
void DeviceConfig::write(uint32_t addr, uint32_t value) {
data_[addr] = value;
}
// Returns the value last written at `addr`. When nothing was written there
// an error is printed before the lookup is attempted.
// NOTE(review): data_ is declared outside this view; if it is a standard map,
// at() throws std::out_of_range for a missing key — confirm.
uint32_t DeviceConfig::read(uint32_t addr) const {
  if (0 == data_.count(addr)) {
    // addr is unsigned: print it with %u so large addresses are not shown
    // as negative numbers
    printf("Error: DeviceConfig::read(%u) failed\n", addr);
  }
  return data_.at(addr);
}
// Writes the default device configuration registers: the 64-bit startup
// address split into its low and high 32-bit halves, and MPM class 0.
// Returns 0 on success, -1 as soon as one write fails (RT_CHECK prints the
// failing expression first).
int dcr_initialize(vx_device_h hdevice) {
const uint64_t startup_addr(STARTUP_ADDR);
// low 32 bits of the startup address
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), {
return -1;
});
// high 32 bits of the startup address
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), {
return -1;
});
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
return -1;
});
return 0;
}
///////////////////////////////////////////////////////////////////////////////
// Assembles a 64-bit counter from a buffer of 32-bit words: the low half is
// the word at index (addr - VX_CSR_MPM_BASE) and the high half is the word
// 32 entries further.
static uint64_t get_csr_64(const void* ptr, int addr) {
  const uint32_t* words = reinterpret_cast<const uint32_t*>(ptr);
  const auto lo_index = addr - VX_CSR_MPM_BASE;
  const uint64_t lo = words[lo_index];
  const uint64_t hi = words[lo_index + 32];
  return (hi << 32) | lo;
}
extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int ret = 0;
uint64_t instrs = 0;
uint64_t cycles = 0;
#ifdef PERF_ENABLE
auto perf_class = gAutoPerfDump.get_perf_class();
// PERF: pipeline stalls
uint64_t ibuffer_stalls = 0;
uint64_t scoreboard_stalls = 0;
uint64_t lsu_stalls = 0;
uint64_t fpu_stalls = 0;
uint64_t alu_stalls = 0;
uint64_t sfu_stalls = 0;
uint64_t ifetches = 0;
uint64_t loads = 0;
uint64_t stores = 0;
uint64_t ifetch_lat = 0;
uint64_t load_lat = 0;
// PERF: Icache
uint64_t icache_reads = 0;
uint64_t icache_read_misses = 0;
// PERF: Dcache
uint64_t dcache_reads = 0;
uint64_t dcache_writes = 0;
uint64_t dcache_read_misses = 0;
uint64_t dcache_write_misses = 0;
uint64_t dcache_bank_stalls = 0;
uint64_t dcache_mshr_stalls = 0;
// PERF: shared memory
uint64_t smem_reads = 0;
uint64_t smem_writes = 0;
uint64_t smem_bank_stalls = 0;
// PERF: l2cache
uint64_t l2cache_reads = 0;
uint64_t l2cache_writes = 0;
uint64_t l2cache_read_misses = 0;
uint64_t l2cache_write_misses = 0;
uint64_t l2cache_bank_stalls = 0;
uint64_t l2cache_mshr_stalls = 0;
// PERF: l3cache
uint64_t l3cache_reads = 0;
uint64_t l3cache_writes = 0;
uint64_t l3cache_read_misses = 0;
uint64_t l3cache_write_misses = 0;
uint64_t l3cache_bank_stalls = 0;
uint64_t l3cache_mshr_stalls = 0;
// PERF: memory
uint64_t mem_reads = 0;
uint64_t mem_writes = 0;
uint64_t mem_lat = 0;
#endif
uint64_t num_cores;
ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
if (ret != 0)
return ret;
std::vector<uint8_t> staging_buf(64* sizeof(uint32_t));
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size();
ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
if (ret != 0)
return ret;
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
instrs += instrs_per_core;
cycles = std::max<uint64_t>(cycles_per_core, cycles);
#ifdef PERF_ENABLE
switch (perf_class) {
case VX_DCR_MPM_CLASS_CORE: {
// PERF: pipeline
// ibuffer_stall
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core);
ibuffer_stalls += ibuffer_stalls_per_core;
// scoreboard_stall
uint64_t scoreboard_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core);
scoreboard_stalls += scoreboard_stalls_per_core;
// alu_stall
uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
alu_stalls += alu_stalls_per_core;
// lsu_stall
uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
lsu_stalls += lsu_stalls_per_core;
// fpu_stall
uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
fpu_stalls += fpu_stalls_per_core;
// sfu_stall
uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core);
sfu_stalls += sfu_stalls_per_core;
// PERF: memory
// ifetches
uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
ifetches += ifetches_per_core;
// loads
uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
loads += loads_per_core;
// stores
uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES);
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
stores += stores_per_core;
// ifetch latency
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
if (num_cores > 1) {
int mem_avg_lat = (int)(double(ifetch_lat_per_core) / double(ifetches_per_core));
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
}
ifetch_lat += ifetch_lat_per_core;
// load latency
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
if (num_cores > 1) {
int mem_avg_lat = (int)(double(load_lat_per_core) / double(loads_per_core));
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
}
load_lat += load_lat_per_core;
} break;
case VX_DCR_MPM_CLASS_MEM: {
if (0 == core_id) {
// PERF: Icache
icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS);
icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R);
// PERF: Dcache
dcache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_READS);
dcache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_WRITES);
dcache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_R);
dcache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_W);
dcache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_BANK_ST);
dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST);
// PERF: smem
smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS);
smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES);
smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST);
// PERF: L2cache
l2cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_READS);
l2cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_WRITES);
l2cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_R);
l2cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_W);
l2cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_BANK_ST);
l2cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MSHR_ST);
// PERF: L3cache
l3cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_READS);
l3cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_WRITES);
l3cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_R);
l3cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_W);
l3cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_BANK_ST);
l3cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MSHR_ST);
// PERF: memory
mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES);
mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LAT);
}
} break;
default:
break;
}
#endif
}
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
#ifdef PERF_ENABLE
switch (perf_class) {
case VX_DCR_MPM_CLASS_CORE: {
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
int load_avg_lat = (int)(double(load_lat) / double(loads));
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls);
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
fprintf(stream, "PERF: sfu unit stalls=%ld\n", sfu_stalls);
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
fprintf(stream, "PERF: loads=%ld\n", loads);
fprintf(stream, "PERF: stores=%ld\n", stores);
fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
} break;
case VX_DCR_MPM_CLASS_MEM: {
int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100);
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100);
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
int l2cache_read_hit_ratio = (int)((1.0 - (double(l2cache_read_misses) / double(l2cache_reads))) * 100);
int l2cache_write_hit_ratio = (int)((1.0 - (double(l2cache_write_misses) / double(l2cache_writes))) * 100);
int l2cache_bank_utilization = (int)((double(l2cache_reads + l2cache_writes) / double(l2cache_reads + l2cache_writes + l2cache_bank_stalls)) * 100);
int l3cache_read_hit_ratio = (int)((1.0 - (double(l3cache_read_misses) / double(l3cache_reads))) * 100);
int l3cache_write_hit_ratio = (int)((1.0 - (double(l3cache_write_misses) / double(l3cache_writes))) * 100);
int l3cache_bank_utilization = (int)((double(l3cache_reads + l3cache_writes) / double(l3cache_reads + l3cache_writes + l3cache_bank_stalls)) * 100);
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio);
fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio);
fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization);
fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
} break;
default:
break;
}
#endif
fflush(stream);
return 0;
}
// Read one performance counter from the device's per-core CSR dump.
//
// hdevice : device handle forwarded to vx_dev_caps() and vx_copy_from_dev().
// counter : CSR id forwarded to get_csr_64() to select the counter inside a
//           core's staging buffer.
// core_id : index of the core to read, or -1 to combine all cores.
// value   : receives the result.
//
// When several cores are combined, VX_CSR_MCYCLE takes the maximum across
// cores; every other counter is summed.
//
// Returns 0 on success, -1 when core_id is out of range, otherwise the
// non-zero status of the device call that failed.
extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value) {
  uint64_t num_cores;
  int ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
  if (ret != 0)
    return ret;

  // -1 is the only valid negative id (it means "all cores").
  if (core_id < -1 || core_id >= (int)num_cores) {
    std::cout << "error: core_id out of range" << std::endl;
    return -1;
  }

  std::vector<uint8_t> staging_buf(64 * sizeof(uint32_t));

  // Half-open range [first_core, end_core) of cores to visit.
  uint64_t first_core = 0;
  uint64_t end_core = num_cores;
  if (core_id != -1) {
    first_core = (uint64_t)core_id;
    end_core = first_core + 1;
  }

  uint64_t _value = 0;

  // The loop has to start at first_core: restarting from 0 would fold
  // cores 0..core_id-1 into what is meant to be a single-core query.
  for (uint64_t i = first_core; i < end_core; ++i) {
    // Each core's counters occupy one staging_buf-sized slot starting at IO_CSR_ADDR.
    uint64_t mpm_mem_addr = IO_CSR_ADDR + i * staging_buf.size();
    ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
    if (ret != 0)
      return ret;

    auto per_core_value = get_csr_64(staging_buf.data(), counter);
    if (counter == VX_CSR_MCYCLE) {
      _value = std::max<uint64_t>(per_core_value, _value);
    } else {
      _value += per_core_value;
    }
  }

  // output
  *value = _value;
  return 0;
}

47
runtime/common/utils.h Normal file
View File

@@ -0,0 +1,47 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vortex.h>
#include <cstdint>
#include <unordered_map>
#include <VX_config.h>
#include <VX_types.h>
// Holds 32-bit values keyed by 32-bit address (see data_).
// NOTE(review): write()/read() are only declared here; presumably they store
// into and look up data_ -- confirm against the definitions.
class DeviceConfig {
public:
// Presumably associates `value` with `addr` -- definition not shown here.
void write(uint32_t addr, uint32_t value);
// Presumably returns the value associated with `addr`; declared const, so it
// cannot modify the object. Behavior for an unknown addr is not visible here.
uint32_t read(uint32_t addr) const;
private:
// addr -> value storage.
std::unordered_map<uint32_t, uint32_t> data_;
};
// Helper prototypes; the definitions are not part of this header.
// NOTE(review): the descriptions below are inferred from names and
// signatures only -- confirm against the implementation.

// Presumably writes the initial device configuration registers (DCRs) for
// `device`; returns an int status.
int dcr_initialize(vx_device_h device);
// Presumably rounds `size` up to a multiple of `alignment`.
uint64_t aligned_size(uint64_t size, uint64_t alignment);
// Presumably reports whether `addr` is a multiple of `alignment`.
bool is_aligned(uint64_t addr, uint64_t alignment);
// Presumably register / unregister `device` with the perf-reporting code.
void perf_add_device(vx_device_h device);
void perf_remove_device(vx_device_h device);
// Cache block size (presumably in bytes -- TODO confirm).
#define CACHE_BLOCK_SIZE 64
// Allocation bounds: the base is one cache block above address 0, the limit
// is STARTUP_ADDR, which is defined outside this header.
#define ALLOC_BASE_ADDR CACHE_BLOCK_SIZE
#define ALLOC_MAX_ADDR STARTUP_ADDR
// Global memory size, selected by XLEN: 2^33 bytes for XLEN == 64,
// 2^32 bytes otherwise.
#if (XLEN == 64)
#define GLOBAL_MEM_SIZE 0x200000000 // 8 GB
#else
#define GLOBAL_MEM_SIZE 0x100000000 // 4 GB
#endif

111
runtime/include/vortex.h Normal file
View File

@@ -0,0 +1,111 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __VX_VORTEX_H__
#define __VX_VORTEX_H__
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
// Device handle: an untyped pointer passed to every vx_* call declared in
// this header.
typedef void* vx_device_h;
// device caps ids: values for the caps_id argument of vx_dev_caps()
#define VX_CAPS_VERSION 0x0
#define VX_CAPS_NUM_THREADS 0x1
#define VX_CAPS_NUM_WARPS 0x2
#define VX_CAPS_NUM_CORES 0x3
#define VX_CAPS_CACHE_LINE_SIZE 0x4
#define VX_CAPS_GLOBAL_MEM_SIZE 0x5
#define VX_CAPS_LOCAL_MEM_SIZE 0x6
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
#define VX_CAPS_ISA_FLAGS 0x8
// device isa flags
// Single-bit masks. For the standard extensions the bit index is the
// extension letter's position in the alphabet (A = 0, C = 2, ... U = 20).
// NOTE(review): presumably tested against the value reported for
// VX_CAPS_ISA_FLAGS -- confirm with the callers.
#define VX_ISA_STD_A (1ull << 0)
#define VX_ISA_STD_C (1ull << 2)
#define VX_ISA_STD_D (1ull << 3)
#define VX_ISA_STD_E (1ull << 4)
#define VX_ISA_STD_F (1ull << 5)
#define VX_ISA_STD_H (1ull << 7)
#define VX_ISA_STD_I (1ull << 8)
#define VX_ISA_STD_N (1ull << 13)
#define VX_ISA_STD_Q (1ull << 16)
#define VX_ISA_STD_S (1ull << 18)
#define VX_ISA_STD_U (1ull << 20)
// Decodes bits 31:30 of `flags` into a base width: a field value v yields
// 1 << (v + 4), i.e. 1 -> 32 and 2 -> 64. The argument is parenthesized so
// that compound expressions such as (a | b) are shifted as a whole.
#define VX_ISA_BASE(flags) (1 << ((((flags) >> 30) & 0x3) + 4))
// Vortex-specific extension bits, placed above the low 32 bits.
#define VX_ISA_EXT_TEX (1ull << 32)
#define VX_ISA_EXT_RASTER (1ull << 33)
#define VX_ISA_EXT_ROP (1ull << 34)
// device memory types
// NOTE(review): presumably the values accepted by the `type` argument of
// vx_mem_alloc() and vx_mem_info() -- confirm.
#define VX_MEM_TYPE_GLOBAL 0
#define VX_MEM_TYPE_LOCAL 1
// ready wait timeout: 24 hours expressed in milliseconds, the unit
// vx_ready_wait() is documented to take
#define VX_MAX_TIMEOUT (24*60*60*1000) // 24 Hr
// All functions below return an int status. The runtime's own callers treat
// a non-zero result from vx_dev_caps() / vx_copy_from_dev() as failure.

// Open the device and connect to it (hdevice is an out-parameter for the handle)
int vx_dev_open(vx_device_h* hdevice);
// Close the device when all the operations are done
int vx_dev_close(vx_device_h hdevice);
// Return a device configuration value: caps_id is one of the VX_CAPS_* ids,
// the result is written to *value
int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
// Allocate `size` of device memory and return its address through dev_addr
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr);
// Release device memory
int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr);
// Get device memory info (free and used amounts through the two out-pointers)
int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used);
// Copy `size` bytes from host (host_ptr) to device memory (dev_addr)
int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size);
// Copy `size` bytes from device memory (dev_addr) to host (host_ptr)
int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size);
// Start device execution
int vx_start(vx_device_h hdevice);
// Wait for device ready; timeout is in milliseconds (see VX_MAX_TIMEOUT)
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
// Write a device configuration register
int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value);
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
// Upload kernel bytes to device
int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size);
// Upload kernel file to device
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename);
// Performance counters
// vx_dump_perf prints the "PERF: ..." report to `stream`.
int vx_dump_perf(vx_device_h hdevice, FILE* stream);
// vx_perf_counter reads one counter into *value; core_id must be below the
// device's core count, and -1 combines all cores.
int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value);
#ifdef __cplusplus
}
#endif
#endif // __VX_VORTEX_H__

View File

@@ -1,214 +0,0 @@
#ifndef VX_INTRINSICS_H
#define VX_INTRINSICS_H
#include <VX_config.h>
#ifdef __cplusplus
extern "C" {
#endif
// __ASM_STR(x): expands to x unchanged when __ASSEMBLY__ is defined,
// otherwise to the string literal "x" (preprocessor stringification).
#ifdef __ASSEMBLY__
#define __ASM_STR(x) x
#else
#define __ASM_STR(x) #x
#endif
// CSR access helpers, written as GNU statement expressions ({ ... }).
// `csr` is always passed with an "i" constraint, so it must be a
// compile-time constant. Values are handled as `unsigned`: `val` is cast to
// unsigned and results are returned as unsigned.
// The write-style helpers pass `val` as an immediate ("i") when it is a
// compile-time constant below 32, and in a register ("r") otherwise.

// csr_read: `csrr` -- evaluates to the current CSR value.
#define csr_read(csr) ({ \
unsigned __r; \
__asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \
__r; \
})
// csr_write: `csrw` -- writes val to the CSR; no result is produced.
#define csr_write(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \
})
// csr_swap: `csrrw` -- writes val and evaluates to the value the instruction
// returns in its destination register.
#define csr_swap(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
// csr_read_set: `csrrs` -- sets the bits given in val and evaluates to the
// value the instruction returns in its destination register.
#define csr_read_set(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
// csr_set: `csrs` -- sets the bits given in val; no result is produced.
#define csr_set(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \
})
// csr_read_clear: `csrrc` -- clears the bits given in val and evaluates to
// the value the instruction returns in its destination register.
#define csr_read_clear(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
// csr_clear: `csrc` -- clears the bits given in val; no result is produced.
#define csr_clear(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \
})
// Texture load
// Emits a custom R4-type instruction (opcode 0x5b, funct3 = 0). `unit` is
// passed with an "i" constraint, so it must be a compile-time constant;
// u, v and lod are passed in registers. Evaluates to the unsigned value the
// instruction writes to its destination register.
#define vx_tex(unit, u, v, lod) ({ \
unsigned __r; \
__asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(unit), "r"(u), "r"(v), "r"(lod)); \
__r; \
})
// Conditional move
// Emits a custom R4-type instruction (opcode 0x5b, funct3 = 1, funct2 = 0)
// with c, t and f in the three source registers, and evaluates to the
// unsigned value the instruction writes to its destination register.
// NOTE(review): the selection itself (presumably c ? t : f) is performed by
// the hardware -- confirm against the ISA extension documentation.
#define vx_cmov(c, t, f) ({ \
    unsigned __r; \
    __asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r) : "r"(c), "r"(t), "r"(f)); \
    __r; \
})
// Thread/warp control intrinsics. Each one emits a single custom S-type
// instruction with opcode 0x6b; the second `.insn` operand (funct3) selects
// the operation and the register operands carry the arguments.

// Set thread mask (funct3 = 0, second register operand fixed to x0)
inline void vx_tmc(unsigned thread_mask) {
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
}
// Set thread predicate (funct3 = 0, second register operand fixed to x1)
inline void vx_pred(unsigned condition) {
asm volatile (".insn s 0x6b, 0, x1, 0(%0)" :: "r"(condition));
}
// Entry point type accepted by vx_wspawn
typedef void (*vx_wspawn_pfn)();
// Spawn warps (funct3 = 1): num_warps and the entry point are both passed in registers
inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
asm volatile (".insn s 0x6b, 1, %1, 0(%0)" :: "r"(num_warps), "r"(func_ptr));
}
// Split on a predicate (funct3 = 2)
inline void vx_split(int predicate) {
asm volatile (".insn s 0x6b, 2, x0, 0(%0)" :: "r"(predicate));
}
// Join (funct3 = 3, no register arguments)
inline void vx_join() {
asm volatile (".insn s 0x6b, 3, x0, 0(x0)");
}
// Warp Barrier (funct3 = 4): barrier id and warp count are both passed in registers
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
}
// Prefetch (funct3 = 5): the address is taken as an `unsigned`
inline void vx_prefetch(unsigned addr) {
asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
}
// Identification getters. Each one reads a single CSR with `csrr` and
// returns it as an int; the CSR_* numbers are not defined in this header.

// Return active warp's thread id
inline int vx_thread_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_WTID));
return result;
}
// Return active core's local thread id
inline int vx_thread_lid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LTID));
return result;
}
// Return processor global thread id
inline int vx_thread_gid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GTID));
return result;
}
// Return active core's local warp id
inline int vx_warp_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LWID));
return result;
}
// Return processor's global warp id
inline int vx_warp_gid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GWID));
return result;
}
// Return processor core id
inline int vx_core_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GCID));
return result;
}
// Return current thread mask
inline int vx_thread_mask() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_TMASK));
return result;
}
// Return the number of threads in a warp
inline int vx_num_threads() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NT));
return result;
}
// Return the number of warps in a core
inline int vx_num_warps() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NW));
return result;
}
// Return the number of cores in the processor
inline int vx_num_cores() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NC));
return result;
}
// Issue a RISC-V `fence iorw, iorw` instruction, i.e. a fence whose
// predecessor and successor sets both name device I/O and memory reads/writes.
inline void vx_fence() {
asm volatile ("fence iorw, iorw");
}
// Divergent-branch helpers: __if(b) issues vx_split(b) and then opens a
// plain `if (b)`; __endif issues the matching vx_join().
// Note that __if expands to two statements and evaluates `b` twice, so `b`
// should be free of side effects and __if should not be used as the
// unbraced body of another statement.
#define __if(b) vx_split(b); \
if (b)
#define __else else
#define __endif vx_join();
// Attaches the "divergent" annotation attribute to a declaration.
#define __DIVERGENT__ __attribute__((annotate("divergent")))
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,21 +0,0 @@
#ifndef VX_PRINT_H
#define VX_PRINT_H
#include <stdarg.h>
#ifdef __cplusplus
extern "C" {
#endif
// Console output helpers; the definitions are not part of this header.
// NOTE(review): the descriptions below are inferred from names and
// signatures only -- confirm against the implementation.

// printf-style formatting taking an already-started va_list.
int vx_vprintf(const char* format, va_list va);
// printf-style formatting taking variadic arguments.
int vx_printf(const char * format, ...);
// Presumably emits the single character c.
void vx_putchar(int c);
// Presumably prints `value` in the given numeric base.
void vx_putint(int value, int base);
// Presumably prints `value` with `precision` fractional digits.
void vx_putfloat(float value, int precision);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,43 +0,0 @@
#ifndef VX_API_H
#define VX_API_H
#include <stdint.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
// Launch context handed to vx_spawn_kernel() and forwarded (as const) to
// the kernel callback.
// NOTE(review): field meanings below are inferred from the names; they look
// like an OpenCL-style NDRange description -- confirm with the producer.
typedef struct {
// One entry per dimension (three slots, matching the callback's group_x/y/z).
uint32_t num_groups[3];
uint32_t global_offset[3];
uint32_t local_size[3];
// printf buffer: storage, a pointer to the current write position, and capacity.
char * printf_buffer;
uint32_t *printf_buffer_position;
uint32_t printf_buffer_capacity;
// Presumably the number of dimensions actually in use (1..3).
uint32_t work_dim;
} context_t;
// Kernel callback for vx_spawn_kernel(): receives the user argument, the
// launch context and a three-component group index.
typedef void (*vx_spawn_kernel_cb) (
const void * /* arg */,
const context_t * /* context */,
uint32_t /* group_x */,
uint32_t /* group_y */,
uint32_t /* group_z */
);
// Task callback for vx_spawn_tasks(): receives a task id and the user argument.
typedef void (*vx_spawn_tasks_cb)(int task_id, void *arg);
// Callback for vx_serial(): receives only the user argument.
typedef void (*vx_serial_cb)(void *arg);
// Spawn entry points; the definitions are not part of this header.
// NOTE(review): the descriptions below are inferred from names and
// signatures only -- confirm against the implementation.

// Presumably invokes `callback` for the work-groups described by `ctx`,
// passing `arg` through.
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);
// Presumably invokes `callback` once per task id for `num_tasks` tasks,
// passing `arg` through.
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
// Presumably runs `callback(arg)` serially rather than in parallel.
void vx_serial(vx_serial_cb callback, void * arg);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,264 +0,0 @@
/* ---- Original Script: /opt/riscv32i/riscv32-unknown-elf/lib/ldscripts/elf32lriscv.x ---- */
/* Default linker script, for normal executables */
/* Copyright (C) 2014-2017 Free Software Foundation, Inc.
Copying and distribution of this script, with or without modification,
are permitted in any medium without royalty provided the copyright
notice and this notice are preserved. */
OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv", "elf32-littleriscv")
OUTPUT_ARCH(riscv)
ENTRY(_start)
SECTIONS
{
. = 0x80000000;
.interp : { *(.interp) }
.note.gnu.build-id : { *(.note.gnu.build-id) }
.hash : { *(.hash) }
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
.rela.dyn :
{
*(.rela.init)
*(.rela.text .rela.text.* .rela.gnu.linkonce.t.*)
*(.rela.fini)
*(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*)
*(.rela.data .rela.data.* .rela.gnu.linkonce.d.*)
*(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*)
*(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*)
*(.rela.ctors)
*(.rela.dtors)
*(.rela.got)
*(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*)
*(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*)
*(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*)
*(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*)
*(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
PROVIDE_HIDDEN (__rela_iplt_start = .);
*(.rela.iplt)
PROVIDE_HIDDEN (__rela_iplt_end = .);
}
.rela.plt :
{
*(.rela.plt)
}
.init :
{
KEEP (*(SORT_NONE(.init)))
}
.plt : { *(.plt) }
.iplt : { *(.iplt) }
.text :
{
*(.text.unlikely .text.*_unlikely .text.unlikely.*)
*(.text.exit .text.exit.*)
*(.text.startup .text.startup.*)
*(.text.hot .text.hot.*)
*(.text .stub .text.* .gnu.linkonce.t.*)
/* .gnu.warning sections are handled specially by elf32.em. */
*(.gnu.warning)
}
.fini :
{
KEEP (*(SORT_NONE(.fini)))
}
PROVIDE (__etext = .);
PROVIDE (_etext = .);
PROVIDE (etext = .);
.rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
.rodata1 : { *(.rodata1) }
.sdata2 :
{
*(.sdata2 .sdata2.* .gnu.linkonce.s2.*)
}
.sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) }
.eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) }
.eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) }
.gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) }
.gnu_extab : ONLY_IF_RO { *(.gnu_extab*) }
/* These sections are generated by the Sun/Oracle C++ compiler. */
.exception_ranges : ONLY_IF_RO { *(.exception_ranges*) }
/* Adjust the address for the data segment. We want to adjust up to
the same address within the page on the next page up. */
. = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
/* Exception handling */
.eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) }
.gnu_extab : ONLY_IF_RW { *(.gnu_extab) }
.gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }
.exception_ranges : ONLY_IF_RW { *(.exception_ranges*) }
/* Thread Local Storage sections */
.tdata :
{
PROVIDE_HIDDEN (__tdata_start = .);
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE_HIDDEN (__tdata_end = .);
}
PROVIDE (__tdata_size = SIZEOF (.tdata));
.tbss :
{
PROVIDE_HIDDEN (__tbss_start = .);
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
PROVIDE_HIDDEN (__tbss_end = .);
}
PROVIDE (__tbss_size = SIZEOF (.tbss));
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
.preinit_array :
{
PROVIDE_HIDDEN (__preinit_array_start = .);
KEEP (*(.preinit_array))
PROVIDE_HIDDEN (__preinit_array_end = .);
}
.init_array :
{
PROVIDE_HIDDEN (__init_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
PROVIDE_HIDDEN (__init_array_end = .);
}
.fini_array :
{
PROVIDE_HIDDEN (__fini_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
PROVIDE_HIDDEN (__fini_array_end = .);
}
.ctors :
{
/* gcc uses crtbegin.o to find the start of
the constructors, so we make sure it is
first. Because this is a wildcard, it
doesn't matter if the user does not
actually link against crtbegin.o; the
linker won't look for a file to match a
wildcard. The wildcard also means that it
doesn't matter which directory crtbegin.o
is in. */
KEEP (*crtbegin.o(.ctors))
KEEP (*crtbegin?.o(.ctors))
/* We don't want to include the .ctor section from
the crtend.o file until after the sorted ctors.
The .ctor section from the crtend file contains the
end of ctors marker and it must be last */
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
KEEP (*(SORT(.ctors.*)))
KEEP (*(.ctors))
}
.dtors :
{
KEEP (*crtbegin.o(.dtors))
KEEP (*crtbegin?.o(.dtors))
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
KEEP (*(SORT(.dtors.*)))
KEEP (*(.dtors))
}
.jcr : { KEEP (*(.jcr)) }
.data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) }
.dynamic : { *(.dynamic) }
. = DATA_SEGMENT_RELRO_END (0, .);
.data :
{
__DATA_BEGIN__ = .;
*(.data .data.* .gnu.linkonce.d.*)
SORT(CONSTRUCTORS)
}
.data1 : { *(.data1) }
.got : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) }
/* We want the small data sections together, so single-instruction offsets
can access them all, and initialized data all before uninitialized, so
we can shorten the on-disk segment size. */
.sdata :
{
__SDATA_BEGIN__ = .;
*(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*)
*(.sdata .sdata.* .gnu.linkonce.s.*)
}
_edata = .; PROVIDE (edata = .);
. = .;
__bss_start = .;
.sbss :
{
*(.dynsbss)
*(.sbss .sbss.* .gnu.linkonce.sb.*)
*(.scommon)
}
.bss :
{
*(.dynbss)
*(.bss .bss.* .gnu.linkonce.b.*)
*(COMMON)
/* Align here to ensure that the .bss section occupies space up to
_end. Align after .bss to ensure correct alignment even if the
.bss section disappears because there are no input sections.
FIXME: Why do we need it? When there is no .bss section, we do not
pad the .data section. */
. = ALIGN(. != 0 ? 32 / 8 : 1);
}
. = ALIGN(32 / 8);
. = SEGMENT_START("ldata-segment", .);
. = ALIGN(32 / 8);
__BSS_END__ = .;
__global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
_end = .; PROVIDE (end = .);
. = DATA_SEGMENT_END (.);
/* The .stack_dummy section doesn't contain any symbols. It is only
* used by the linker to calculate the size of the stack sections and to
* assign values to the stack symbols later. */
.stack_dummy (COPY):
{
KEEP(*(.stack*))
}
__stack_usage = SIZEOF(.stack_dummy);
PROVIDE(__stack_top = 0xFF000000);
PROVIDE(__stack_size = 0x400);
PROVIDE(__stack = __stack_top);
ASSERT(__stack_usage <= __stack_size, "stack overflow")
/* Stabs debugging sections. */
.stab 0 : { *(.stab) }
.stabstr 0 : { *(.stabstr) }
.stab.excl 0 : { *(.stab.excl) }
.stab.exclstr 0 : { *(.stab.exclstr) }
.stab.index 0 : { *(.stab.index) }
.stab.indexstr 0 : { *(.stab.indexstr) }
.comment 0 : { *(.comment) }
.gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) }
/* DWARF debug sections.
Symbols in the DWARF debugging sections are relative to the beginning
of the section so we begin them at 0. */
/* DWARF 1 */
.debug 0 : { *(.debug) }
.line 0 : { *(.line) }
/* GNU DWARF 1 extensions */
.debug_srcinfo 0 : { *(.debug_srcinfo) }
.debug_sfnames 0 : { *(.debug_sfnames) }
/* DWARF 1.1 and DWARF 2 */
.debug_aranges 0 : { *(.debug_aranges) }
.debug_pubnames 0 : { *(.debug_pubnames) }
/* DWARF 2 */
.debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
.debug_abbrev 0 : { *(.debug_abbrev) }
.debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) }
.debug_frame 0 : { *(.debug_frame) }
.debug_str 0 : { *(.debug_str) }
.debug_loc 0 : { *(.debug_loc) }
.debug_macinfo 0 : { *(.debug_macinfo) }
/* SGI/MIPS DWARF 2 extensions */
.debug_weaknames 0 : { *(.debug_weaknames) }
.debug_funcnames 0 : { *(.debug_funcnames) }
.debug_typenames 0 : { *(.debug_typenames) }
.debug_varnames 0 : { *(.debug_varnames) }
/* DWARF 3 */
.debug_pubtypes 0 : { *(.debug_pubtypes) }
.debug_ranges 0 : { *(.debug_ranges) }
/* DWARF Extension. */
.debug_macro 0 : { *(.debug_macro) }
.debug_addr 0 : { *(.debug_addr) }
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
}

View File

@@ -1,264 +0,0 @@
/* ---- Original Script: /opt/riscv32i/riscv32-unknown-elf/lib/ldscripts/elf64lriscv.x ---- */
/* Default linker script, for normal executables */
/* Copyright (C) 2014-2017 Free Software Foundation, Inc.
Copying and distribution of this script, with or without modification,
are permitted in any medium without royalty provided the copyright
notice and this notice are preserved. */
OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv", "elf64-littleriscv")
OUTPUT_ARCH(riscv)
ENTRY(_start)
SECTIONS
{
. = 0x80000000;
.interp : { *(.interp) }
.note.gnu.build-id : { *(.note.gnu.build-id) }
.hash : { *(.hash) }
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
.rela.dyn :
{
*(.rela.init)
*(.rela.text .rela.text.* .rela.gnu.linkonce.t.*)
*(.rela.fini)
*(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*)
*(.rela.data .rela.data.* .rela.gnu.linkonce.d.*)
*(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*)
*(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*)
*(.rela.ctors)
*(.rela.dtors)
*(.rela.got)
*(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*)
*(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*)
*(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*)
*(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*)
*(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
PROVIDE_HIDDEN (__rela_iplt_start = .);
*(.rela.iplt)
PROVIDE_HIDDEN (__rela_iplt_end = .);
}
.rela.plt :
{
*(.rela.plt)
}
.init :
{
KEEP (*(SORT_NONE(.init)))
}
.plt : { *(.plt) }
.iplt : { *(.iplt) }
.text :
{
*(.text.unlikely .text.*_unlikely .text.unlikely.*)
*(.text.exit .text.exit.*)
*(.text.startup .text.startup.*)
*(.text.hot .text.hot.*)
*(.text .stub .text.* .gnu.linkonce.t.*)
/* .gnu.warning sections are handled specially by elf32.em. */
*(.gnu.warning)
}
.fini :
{
KEEP (*(SORT_NONE(.fini)))
}
PROVIDE (__etext = .);
PROVIDE (_etext = .);
PROVIDE (etext = .);
.rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
.rodata1 : { *(.rodata1) }
.sdata2 :
{
*(.sdata2 .sdata2.* .gnu.linkonce.s2.*)
}
.sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) }
.eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) }
.eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) }
.gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) }
.gnu_extab : ONLY_IF_RO { *(.gnu_extab*) }
/* These sections are generated by the Sun/Oracle C++ compiler. */
.exception_ranges : ONLY_IF_RO { *(.exception_ranges*) }
/* Adjust the address for the data segment. We want to adjust up to
the same address within the page on the next page up. */
. = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
/* Exception handling */
.eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) }
.gnu_extab : ONLY_IF_RW { *(.gnu_extab) }
.gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }
.exception_ranges : ONLY_IF_RW { *(.exception_ranges*) }
/* Thread Local Storage sections */
.tdata :
{
PROVIDE_HIDDEN (__tdata_start = .);
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE_HIDDEN (__tdata_end = .);
}
PROVIDE (__tdata_size = SIZEOF (.tdata));
.tbss :
{
PROVIDE_HIDDEN (__tbss_start = .);
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
PROVIDE_HIDDEN (__tbss_end = .);
}
PROVIDE (__tbss_size = SIZEOF (.tbss));
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
.preinit_array :
{
PROVIDE_HIDDEN (__preinit_array_start = .);
KEEP (*(.preinit_array))
PROVIDE_HIDDEN (__preinit_array_end = .);
}
.init_array :
{
PROVIDE_HIDDEN (__init_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
PROVIDE_HIDDEN (__init_array_end = .);
}
.fini_array :
{
PROVIDE_HIDDEN (__fini_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
PROVIDE_HIDDEN (__fini_array_end = .);
}
.ctors :
{
/* gcc uses crtbegin.o to find the start of
the constructors, so we make sure it is
first. Because this is a wildcard, it
doesn't matter if the user does not
actually link against crtbegin.o; the
linker won't look for a file to match a
wildcard. The wildcard also means that it
doesn't matter which directory crtbegin.o
is in. */
KEEP (*crtbegin.o(.ctors))
KEEP (*crtbegin?.o(.ctors))
/* We don't want to include the .ctor section from
the crtend.o file until after the sorted ctors.
The .ctor section from the crtend file contains the
end of ctors marker and it must be last */
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
KEEP (*(SORT(.ctors.*)))
KEEP (*(.ctors))
}
.dtors :
{
KEEP (*crtbegin.o(.dtors))
KEEP (*crtbegin?.o(.dtors))
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
KEEP (*(SORT(.dtors.*)))
KEEP (*(.dtors))
}
.jcr : { KEEP (*(.jcr)) }
.data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) }
.dynamic : { *(.dynamic) }
. = DATA_SEGMENT_RELRO_END (0, .);
.data :
{
__DATA_BEGIN__ = .;
*(.data .data.* .gnu.linkonce.d.*)
SORT(CONSTRUCTORS)
}
.data1 : { *(.data1) }
.got : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) }
/* We want the small data sections together, so single-instruction offsets
can access them all, and initialized data all before uninitialized, so
we can shorten the on-disk segment size. */
.sdata :
{
__SDATA_BEGIN__ = .;
*(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*)
*(.sdata .sdata.* .gnu.linkonce.s.*)
}
_edata = .; PROVIDE (edata = .);
. = .;
__bss_start = .;
.sbss :
{
*(.dynsbss)
*(.sbss .sbss.* .gnu.linkonce.sb.*)
*(.scommon)
}
.bss :
{
*(.dynbss)
*(.bss .bss.* .gnu.linkonce.b.*)
*(COMMON)
/* Align here to ensure that the .bss section occupies space up to
_end. Align after .bss to ensure correct alignment even if the
.bss section disappears because there are no input sections.
FIXME: Why do we need it? When there is no .bss section, we do not
pad the .data section. */
. = ALIGN(. != 0 ? 64 / 8 : 1);
}
. = ALIGN(64 / 8);
. = SEGMENT_START("ldata-segment", .);
. = ALIGN(64 / 8);
__BSS_END__ = .;
__global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
_end = .; PROVIDE (end = .);
. = DATA_SEGMENT_END (.);
/* The .stack_dummy section doesn't contain any symbols. It is only
* used by the linker to calculate the size of the stack sections and to
* assign values to the stack symbols later. */
.stack_dummy (COPY):
{
KEEP(*(.stack*))
}
__stack_usage = SIZEOF(.stack_dummy);
PROVIDE(__stack_top = 0xFF000000);
PROVIDE(__stack_size = 0x400);
PROVIDE(__stack = __stack_top);
ASSERT(__stack_usage <= __stack_size, "stack overflow")
/* Stabs debugging sections. */
.stab 0 : { *(.stab) }
.stabstr 0 : { *(.stabstr) }
.stab.excl 0 : { *(.stab.excl) }
.stab.exclstr 0 : { *(.stab.exclstr) }
.stab.index 0 : { *(.stab.index) }
.stab.indexstr 0 : { *(.stab.indexstr) }
.comment 0 : { *(.comment) }
.gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) }
/* DWARF debug sections.
Symbols in the DWARF debugging sections are relative to the beginning
of the section so we begin them at 0. */
/* DWARF 1 */
.debug 0 : { *(.debug) }
.line 0 : { *(.line) }
/* GNU DWARF 1 extensions */
.debug_srcinfo 0 : { *(.debug_srcinfo) }
.debug_sfnames 0 : { *(.debug_sfnames) }
/* DWARF 1.1 and DWARF 2 */
.debug_aranges 0 : { *(.debug_aranges) }
.debug_pubnames 0 : { *(.debug_pubnames) }
/* DWARF 2 */
.debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
.debug_abbrev 0 : { *(.debug_abbrev) }
.debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) }
.debug_frame 0 : { *(.debug_frame) }
.debug_str 0 : { *(.debug_str) }
.debug_loc 0 : { *(.debug_loc) }
.debug_macinfo 0 : { *(.debug_macinfo) }
/* SGI/MIPS DWARF 2 extensions */
.debug_weaknames 0 : { *(.debug_weaknames) }
.debug_funcnames 0 : { *(.debug_funcnames) }
.debug_typenames 0 : { *(.debug_typenames) }
.debug_varnames 0 : { *(.debug_varnames) }
/* DWARF 3 */
.debug_pubtypes 0 : { *(.debug_pubtypes) }
.debug_ranges 0 : { *(.debug_ranges) }
/* DWARF Extension. */
.debug_macro 0 : { *(.debug_macro) }
.debug_addr 0 : { *(.debug_addr) }
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
}

78
runtime/opae/Makefile Normal file
View File

@@ -0,0 +1,78 @@
XLEN ?= 32
TARGET ?= opaesim
OPAESIM_DIR = ../../sim/opaesim
RTL_DIR=../../hw/rtl
SYN_DIR=../../hw/syn/altera/opae
SCRIPT_DIR=../../hw/scripts
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I. -I../include -I../common/ -I../../hw
CXXFLAGS += -DXLEN_$(XLEN)
ifeq ($(TARGET), opaesim)
CXXFLAGS += -I$(OPAESIM_DIR)
else
CXXFLAGS += -I$(SYN_DIR)
endif
# Position independent code
CXXFLAGS += -fPIC
# Add external configuration
CXXFLAGS += $(CONFIGS)
# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -luuid -ldl -pthread
SRCS = vortex.cpp driver.cpp ../common/utils.cpp
# set up target types
ifeq ($(TARGET), opaesim)
CXXFLAGS += -DOPAESIM
OPAESIM = libopae-c-sim.so
else
ifeq ($(TARGET), asesim)
CXXFLAGS += -DASESIM
else
CXXFLAGS += -DFPGA
endif
endif
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
# Enable scope logic analyzer
ifdef SCOPE
CXXFLAGS += -DSCOPE
SRCS += ../common/scope.cpp
endif
# Enable perf counters
ifdef PERF
CXXFLAGS += -DPERF_ENABLE
endif
PROJECT = libvortex.so
all: $(PROJECT)
libopae-c-sim.so:
DESTDIR=../../runtime/opae $(MAKE) -C $(OPAESIM_DIR) ../../runtime/opae/libopae-c-sim.so
$(PROJECT): $(SRCS) $(OPAESIM)
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT)
clean:
DESTDIR=../../runtime/opae $(MAKE) -C $(OPAESIM_DIR) clean
rm -rf $(PROJECT)

93
runtime/opae/driver.cpp Normal file
View File

@@ -0,0 +1,93 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "driver.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/limits.h>
#include <dlfcn.h>
#include <string>
#include <vector>
#include <sstream>
#ifdef OPAESIM
#define DEFAULT_OPAE_DRV_PATHS "libopae-c-sim.so"
#elif ASESIM
#define DEFAULT_OPAE_DRV_PATHS "libopae-c-ase.so"
#else
#define DEFAULT_OPAE_DRV_PATHS "libopae-c.so"
#endif
// Resolves the OPAE entry point named `func` from the library referenced by
// the global `dl_handle` and stores it in `opae_drv_funcs->func`.
// On failure it prints the dlerror() text, closes the library, clears
// `dl_handle` (so a later drv_close() cannot close the same handle twice)
// and makes the *enclosing function* return -1.
// Wrapped in do/while(0) so that `SET_API(x);` behaves as a single statement.
#define SET_API(func) \
  do { \
    opae_drv_funcs->func = (pfn_##func)dlsym(dl_handle, #func); \
    if (opae_drv_funcs->func == nullptr) { \
      printf("dlsym failed: %s\n", dlerror()); \
      dlclose(dl_handle); \
      dl_handle = nullptr; \
      return -1; \
    } \
  } while (0)
void* dl_handle = nullptr;
// Loads the OPAE driver library and fills `opae_drv_funcs` with its entry points.
//
// The library is searched in the comma-separated list held by the
// OPAE_DRV_PATHS environment variable; when the variable is unset or empty,
// DEFAULT_OPAE_DRV_PATHS is used. The first entry that dlopen() accepts wins.
//
// Returns 0 on success, -1 if `opae_drv_funcs` is null, no library could be
// loaded, or a required symbol is missing (see SET_API).
int drv_init(opae_drv_api_t* opae_drv_funcs) {
  if (opae_drv_funcs == nullptr)
    return -1;

  const char* api_path_s = getenv("OPAE_DRV_PATHS");
  if (api_path_s == nullptr || api_path_s[0] == '\0') {
    api_path_s = DEFAULT_OPAE_DRV_PATHS;
  }

  // Split the list on ','. Empty entries (e.g. a trailing comma) are dropped:
  // passing "" to dlopen() can resolve to the running program itself, which
  // would then fail later with confusing "dlsym failed" errors.
  std::vector<std::string> api_paths;
  {
    std::stringstream ss(api_path_s);
    std::string path;
    while (std::getline(ss, path, ',')) {
      if (!path.empty())
        api_paths.push_back(path);
    }
  }

  // Try the candidates in order; keep the result local until one succeeds so
  // a failed call does not clobber a handle owned by an earlier drv_init().
  void* handle = nullptr;
  for (auto& api_path : api_paths) {
    handle = dlopen(api_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
    if (handle)
      break;
  }
  if (handle == nullptr) {
    // dlerror() returns NULL when no dlopen() was attempted (empty list).
    const char* err = dlerror();
    printf("dlopen failed: %s\n", err ? err : "no driver path specified");
    return -1;
  }
  dl_handle = handle;

  SET_API (fpgaGetProperties);
  SET_API (fpgaPropertiesSetObjectType);
  SET_API (fpgaPropertiesSetGUID);
  SET_API (fpgaDestroyProperties);
  SET_API (fpgaDestroyToken);
  SET_API (fpgaPropertiesGetLocalMemorySize);
  SET_API (fpgaEnumerate);
  SET_API (fpgaOpen);
  SET_API (fpgaClose);
  SET_API (fpgaPrepareBuffer);
  SET_API (fpgaReleaseBuffer);
  SET_API (fpgaGetIOAddress);
  SET_API (fpgaWriteMMIO64);
  SET_API (fpgaReadMMIO64);
  SET_API (fpgaErrStr);

  return 0;
}
// Releases one reference to the driver library loaded by drv_init().
// Does nothing when no library is currently loaded, so it is safe to call
// after a failed drv_init().
// The handle is intentionally not cleared here: every successful drv_init()
// takes its own dlopen() reference, and each must be matched by one dlclose().
void drv_close() {
  if (dl_handle == nullptr)
    return;
  dlclose(dl_handle);
}

61
runtime/opae/driver.h Normal file
View File

@@ -0,0 +1,61 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef OPAESIM
#include <opae/fpga.h>
#include <uuid/uuid.h>
#else
#include <fpga.h>
#endif
typedef fpga_result (*pfn_fpgaGetProperties)(fpga_token token, fpga_properties *prop);
typedef fpga_result (*pfn_fpgaPropertiesSetObjectType)(fpga_properties prop, fpga_objtype objtype);
typedef fpga_result (*pfn_fpgaPropertiesSetGUID)(fpga_properties prop, fpga_guid guid);
typedef fpga_result (*pfn_fpgaDestroyProperties)(fpga_properties *prop);
typedef fpga_result (*pfn_fpgaEnumerate)(const fpga_properties *filters, uint32_t num_filters, fpga_token *tokens, uint32_t max_tokens, uint32_t *num_matches);
typedef fpga_result (*pfn_fpgaDestroyToken)(fpga_token *token);
typedef fpga_result (*pfn_fpgaPropertiesGetLocalMemorySize)(fpga_properties prop, uint64_t *lms);
typedef fpga_result (*pfn_fpgaOpen)(fpga_token token, fpga_handle *handle, int flags);
typedef fpga_result (*pfn_fpgaClose)(fpga_handle handle);
typedef fpga_result (*pfn_fpgaPrepareBuffer)(fpga_handle handle, uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
typedef fpga_result (*pfn_fpgaReleaseBuffer)(fpga_handle handle, uint64_t wsid);
typedef fpga_result (*pfn_fpgaGetIOAddress)(fpga_handle handle, uint64_t wsid, uint64_t *ioaddr);
typedef fpga_result (*pfn_fpgaWriteMMIO64)(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t value);
typedef fpga_result (*pfn_fpgaReadMMIO64)(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t *value);
typedef const char *(*pfn_fpgaErrStr)(fpga_result e);
// Table of dynamically resolved OPAE entry points.
// drv_init() fills every member with the address returned by dlsym() for the
// OPAE function of the same name; callers then invoke OPAE through this table.
struct opae_drv_api_t {
pfn_fpgaGetProperties fpgaGetProperties;
pfn_fpgaPropertiesSetObjectType fpgaPropertiesSetObjectType;
pfn_fpgaPropertiesSetGUID fpgaPropertiesSetGUID;
pfn_fpgaDestroyProperties fpgaDestroyProperties;
pfn_fpgaEnumerate fpgaEnumerate;
pfn_fpgaDestroyToken fpgaDestroyToken;
pfn_fpgaPropertiesGetLocalMemorySize fpgaPropertiesGetLocalMemorySize;
pfn_fpgaOpen fpgaOpen;
pfn_fpgaClose fpgaClose;
pfn_fpgaPrepareBuffer fpgaPrepareBuffer;
pfn_fpgaReleaseBuffer fpgaReleaseBuffer;
pfn_fpgaGetIOAddress fpgaGetIOAddress;
pfn_fpgaWriteMMIO64 fpgaWriteMMIO64;
pfn_fpgaReadMMIO64 fpgaReadMMIO64;
pfn_fpgaErrStr fpgaErrStr;
};
// Loads the OPAE driver library and fills `opae_drv_funcs` with its entry
// points. Returns 0 on success and -1 on failure.
int drv_init(opae_drv_api_t* opae_drv_funcs);
// Closes the driver library that drv_init() loaded.
void drv_close();

610
runtime/opae/vortex.cpp Executable file
View File

@@ -0,0 +1,610 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vortex.h>
#include <utils.h>
#include <malloc.h>
#include "driver.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cstdlib>
#include <cstring>
#include <uuid/uuid.h>
#include <unistd.h>
#include <assert.h>
#include <cmath>
#include <sstream>
#include <unordered_map>
#include <algorithm>
#include <memory>
#include <list>
#include <VX_config.h>
#include <VX_types.h>
#include <vortex_afu.h>
#ifdef SCOPE
#include "scope.h"
#endif
///////////////////////////////////////////////////////////////////////////////
#define CMD_MEM_READ AFU_IMAGE_CMD_MEM_READ
#define CMD_MEM_WRITE AFU_IMAGE_CMD_MEM_WRITE
#define CMD_RUN AFU_IMAGE_CMD_RUN
#define CMD_DCR_WRITE AFU_IMAGE_CMD_DCR_WRITE
#define MMIO_CMD_TYPE (AFU_IMAGE_MMIO_CMD_TYPE * 4)
#define MMIO_CMD_ARG0 (AFU_IMAGE_MMIO_CMD_ARG0 * 4)
#define MMIO_CMD_ARG1 (AFU_IMAGE_MMIO_CMD_ARG1 * 4)
#define MMIO_CMD_ARG2 (AFU_IMAGE_MMIO_CMD_ARG2 * 4)
#define MMIO_STATUS (AFU_IMAGE_MMIO_STATUS * 4)
#define MMIO_DEV_CAPS (AFU_IMAGE_MMIO_DEV_CAPS * 4)
#define MMIO_ISA_CAPS (AFU_IMAGE_MMIO_ISA_CAPS * 4)
#define MMIO_SCOPE_READ (AFU_IMAGE_MMIO_SCOPE_READ * 4)
#define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4)
#define STATUS_STATE_BITS 8
#define RAM_PAGE_SIZE 4096
// Declares a local variable `handle` initialized from `_expr`; when the result
// is null, prints an error naming the expression and then runs `_cleanup`.
// NOTE: expands to several statements (it is not wrapped in do/while), and the
// declared variable stays in scope after the macro.
#define CHECK_HANDLE(handle, _expr, _cleanup) \
auto handle = _expr; \
if (handle == nullptr) { \
printf("[VXDRV] Error: '%s' returned NULL!\n", #_expr); \
_cleanup \
}
// Evaluates `_expr` once; a result of 0 means success and nothing else happens.
// Otherwise prints the failing expression, its numeric result and the text
// from api.fpgaErrStr(), then runs `_cleanup`.
// Requires a variable named `api` (an opae_drv_api_t) in the calling scope.
// If `_cleanup` does not return, execution continues after the macro.
#define CHECK_ERR(_expr, _cleanup) \
do { \
auto err = _expr; \
if (err == 0) \
break; \
printf("[VXDRV] Error: '%s' returned %d, %s!\n", #_expr, (int)err, api.fpgaErrStr(err)); \
_cleanup \
} while (false)
///////////////////////////////////////////////////////////////////////////////
class vx_device {
public:
vx_device() :
staging_wsid(0),
staging_ioaddr(0),
staging_ptr(nullptr),
staging_size(0)
{}
~vx_device() {}
int ensure_staging(uint64_t size) {
size_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
if (staging_size >= asize)
return 0;
if (staging_size != 0) {
// release existing buffer
api.fpgaReleaseBuffer(fpga, staging_wsid);
staging_size = 0;
}
// allocate new buffer
CHECK_ERR(api.fpgaPrepareBuffer(fpga, asize, (void**)&staging_ptr, &staging_wsid, 0), {
return -1;
});
// get the physical address of the buffer in the accelerator
CHECK_ERR(api.fpgaGetIOAddress(fpga, staging_wsid, &staging_ioaddr), {
api.fpgaReleaseBuffer(fpga, staging_wsid);
return -1;
});
staging_size = asize;
return 0;
}
opae_drv_api_t api;
fpga_handle fpga;
std::shared_ptr<vortex::MemoryAllocator> global_mem;
std::shared_ptr<vortex::MemoryAllocator> local_mem;
DeviceConfig dcrs;
uint64_t dev_caps;
uint64_t isa_caps;
uint64_t global_mem_size;
uint64_t staging_wsid;
uint64_t staging_ioaddr;
uint8_t* staging_ptr;
uint64_t staging_size;
};
///////////////////////////////////////////////////////////////////////////////
// Reads one device capability.
//   hdevice : device handle returned by vx_dev_open()
//   caps_id : one of the VX_CAPS_* identifiers
//   value   : receives the capability value
// Returns 0 on success, -1 if `hdevice` or `value` is null.
// An unknown `caps_id` is treated as a programming error and aborts.
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  if (nullptr == hdevice || nullptr == value)
    return -1;

  auto device = ((vx_device*)hdevice);

  // Most capabilities are bit fields of the dev_caps word read from the
  // device at open time.
  switch (caps_id) {
  case VX_CAPS_VERSION:
    *value = (device->dev_caps >> 0) & 0xff;
    break;
  case VX_CAPS_NUM_THREADS:
    *value = (device->dev_caps >> 8) & 0xff;
    break;
  case VX_CAPS_NUM_WARPS:
    *value = (device->dev_caps >> 16) & 0xff;
    break;
  case VX_CAPS_NUM_CORES:
    *value = (device->dev_caps >> 24) & 0xffff;
    break;
  case VX_CAPS_CACHE_LINE_SIZE:
    *value = CACHE_BLOCK_SIZE;
    break;
  case VX_CAPS_GLOBAL_MEM_SIZE:
    *value = device->global_mem_size;
    break;
  case VX_CAPS_LOCAL_MEM_SIZE:
    // the field holds log2 of the size
    *value = 1ull << ((device->dev_caps >> 40) & 0xff);
    break;
  case VX_CAPS_KERNEL_BASE_ADDR:
    // 64-bit address assembled from the two startup-address DCRs
    *value = (uint64_t(device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) |
             device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
    break;
  case VX_CAPS_ISA_FLAGS:
    *value = device->isa_caps;
    break;
  default:
    fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id);
    std::abort();
    return -1;
  }

  return 0;
}
// Opens the Vortex accelerator through OPAE and returns a device handle.
//
// Steps: load the OPAE driver library, enumerate an accelerator matching
// AFU_ACCEL_UUID, open it, read its capability words, create the device
// memory allocators and initialize the DCRs.
//
// Returns 0 and stores the handle in `*hdevice` on success. On any failure
// every resource acquired so far (properties, token, accelerator handle,
// device object, driver library) is released and a non-zero value is returned.
extern int vx_dev_open(vx_device_h* hdevice) {
  if (nullptr == hdevice)
    return -1;

  vx_device* device = nullptr;

  fpga_handle accel_handle;
  fpga_token accel_token;
  fpga_properties filter;
  fpga_guid guid;
  uint32_t num_matches;
  uint64_t global_mem_size = GLOBAL_MEM_SIZE;

  opae_drv_api_t api;
  memset(&api, 0, sizeof(opae_drv_api_t));
  if (drv_init(&api) != 0) {
    return -1;
  }

  // Set up a filter that will search for an accelerator
  CHECK_ERR(api.fpgaGetProperties(nullptr, &filter), {
    drv_close();
    return -1;
  });

  CHECK_ERR(api.fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR), {
    api.fpgaDestroyProperties(&filter);
    drv_close();
    return -1;
  });

  // Add the desired UUID to the filter
  // (the configured UUID uses '_' separators; uuid_parse expects '-')
  std::string s_uuid(AFU_ACCEL_UUID);
  std::replace(s_uuid.begin(), s_uuid.end(), '_', '-');
  if (uuid_parse(s_uuid.c_str(), guid) != 0) {
    fprintf(stderr, "[VXDRV] Error: invalid accelerator UUID: %s\n", AFU_ACCEL_UUID);
    api.fpgaDestroyProperties(&filter);
    drv_close();
    return -1;
  }

  CHECK_ERR(api.fpgaPropertiesSetGUID(filter, guid), {
    api.fpgaDestroyProperties(&filter);
    drv_close();
    return -1;
  });

  // Do the search across the available FPGA contexts
  CHECK_ERR(api.fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches), {
    api.fpgaDestroyProperties(&filter);
    drv_close();
    return -1;
  });

  // retrieve FPGA global memory size; this must happen while the properties
  // object is still alive (it is destroyed right below).
  // NOTE(review): the object queried is the enumeration filter, as in the
  // original code - confirm it is the intended source of the memory size.
  CHECK_ERR(api.fpgaPropertiesGetLocalMemorySize(filter, &global_mem_size), {
    // fall back to the configured default
    global_mem_size = GLOBAL_MEM_SIZE;
  });

  // Not needed anymore
  CHECK_ERR(api.fpgaDestroyProperties(&filter), {
    api.fpgaDestroyToken(&accel_token);
    drv_close();
    return -1;
  });

  if (num_matches < 1) {
    fprintf(stderr, "[VXDRV] Error: accelerator %s not found!\n", AFU_ACCEL_UUID);
    api.fpgaDestroyToken(&accel_token);
    drv_close();
    return -1;
  }

  // Open accelerator
  CHECK_ERR(api.fpgaOpen(accel_token, &accel_handle, 0), {
    api.fpgaDestroyToken(&accel_token);
    drv_close();
    return -1;
  });

  // Common teardown for every failure after the accelerator has been opened.
  auto fail_open = [&]() {
    api.fpgaClose(accel_handle);
    delete device; // may still be null; deleting null is a no-op
    drv_close();
  };

  // Done with token
  CHECK_ERR(api.fpgaDestroyToken(&accel_token), {
    fail_open();
    return -1;
  });

  // allocate device object
  device = new vx_device();
  if (nullptr == device) {
    fail_open();
    return -1;
  }

  device->api = api;
  device->fpga = accel_handle;
  device->global_mem_size = global_mem_size;

  {
    // Load ISA CAPS
    CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_ISA_CAPS, &device->isa_caps), {
      fail_open();
      return -1;
    });

    // Load device CAPS
    CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_DEV_CAPS, &device->dev_caps), {
      fail_open();
      return -1;
    });
  }

  device->global_mem = std::make_shared<vortex::MemoryAllocator>(
    ALLOC_BASE_ADDR, ALLOC_MAX_ADDR - ALLOC_BASE_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE);

  // Create the local-memory allocator only when the device reports a usable
  // local memory (the reported size is 1 << log_size, i.e. never below 1).
  uint64_t local_mem_size = 0;
  vx_dev_caps(device, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size);
  if (local_mem_size > 1) {
    device->local_mem = std::make_shared<vortex::MemoryAllocator>(
      SMEM_BASE_ADDR, local_mem_size, RAM_PAGE_SIZE, 1);
  }

#ifdef SCOPE
  {
    scope_callback_t callback;
    callback.registerWrite = [](vx_device_h hdevice, uint64_t value)->int {
      auto device = (vx_device*)hdevice;
      return device->api.fpgaWriteMMIO64(device->fpga, 0, MMIO_SCOPE_WRITE, value);
    };
    callback.registerRead = [](vx_device_h hdevice, uint64_t* value)->int {
      auto device = (vx_device*)hdevice;
      return device->api.fpgaReadMMIO64(device->fpga, 0, MMIO_SCOPE_READ, value);
    };
    int ret = vx_scope_start(&callback, device, 0, -1);
    if (ret != 0) {
      fail_open();
      return ret;
    }
  }
#endif

  int err = dcr_initialize(device);
  if (err != 0) {
    fail_open();
    return err;
  }

#ifdef DUMP_PERF_STATS
  perf_add_device(device);
#endif

  *hdevice = device;

  return 0;
}
// Shuts a device down: stops optional instrumentation, frees the staging
// buffer, closes the accelerator, destroys the device object and finally
// releases the driver library. Returns -1 for a null handle, 0 otherwise.
extern int vx_dev_close(vx_device_h hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* const dev = (vx_device*)hdevice;
  opae_drv_api_t& api = dev->api;

#ifdef SCOPE
  vx_scope_stop(hdevice);
#endif

#ifdef DUMP_PERF_STATS
  perf_remove_device(hdevice);
#endif

  // give the staging buffer back to the driver, if one was ever allocated
  const bool has_staging = (dev->staging_size != 0);
  if (has_staging) {
    api.fpgaReleaseBuffer(dev->fpga, dev->staging_wsid);
    dev->staging_size = 0;
  }

  // `api` lives inside the device object, so close before deleting it
  api.fpgaClose(dev->fpga);
  delete dev;

  drv_close();

  return 0;
}
// Allocates `size` bytes of device memory of the given VX_MEM_TYPE_* kind and
// stores the device address in `*dev_addr`.
// Returns the allocator's result, or -1 for a null handle/pointer, a zero
// size, an unknown type, or a memory kind this device does not provide.
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
  if (nullptr == hdevice
   || nullptr == dev_addr
   || 0 == size)
    return -1;

  auto device = ((vx_device*)hdevice);

  // The allocators are optional (the local one is only created when the
  // device reports local memory), so never dereference them unchecked.
  if (type == VX_MEM_TYPE_GLOBAL) {
    if (!device->global_mem)
      return -1;
    return device->global_mem->allocate(size, dev_addr);
  } else if (type == VX_MEM_TYPE_LOCAL) {
    if (!device->local_mem)
      return -1;
    return device->local_mem->allocate(size, dev_addr);
  }

  return -1;
}
// Releases a device allocation made by vx_mem_alloc().
// Addresses at or above SMEM_BASE_ADDR belong to the local allocator, all
// others to the global one. A zero address is accepted and ignored.
// Returns the allocator's result, or -1 for a null handle or when the
// matching allocator does not exist on this device.
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
  if (nullptr == hdevice)
    return -1;

  if (0 == dev_addr)
    return 0;

  auto device = ((vx_device*)hdevice);

  if (dev_addr >= SMEM_BASE_ADDR) {
    if (!device->local_mem)
      return -1;
    return device->local_mem->release(dev_addr);
  } else {
    if (!device->global_mem)
      return -1;
    return device->global_mem->release(dev_addr);
  }
}
// Reports the free and allocated byte counts of one device memory kind.
// Either output pointer may be null when the caller does not need that value.
// A device without local memory reports 0 free / 0 used for
// VX_MEM_TYPE_LOCAL. Returns 0 on success, -1 for a null handle or an
// unknown memory type.
extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
  if (nullptr == hdevice)
    return -1;

  auto device = ((vx_device*)hdevice);

  // pick the allocator that backs the requested memory kind
  vortex::MemoryAllocator* allocator = nullptr;
  if (type == VX_MEM_TYPE_GLOBAL) {
    allocator = device->global_mem.get();
  } else if (type == VX_MEM_TYPE_LOCAL) {
    allocator = device->local_mem.get();
  } else {
    return -1;
  }

  // each statistic goes to its own output (the used count used to be
  // written through mem_free by mistake)
  if (mem_free)
    *mem_free = allocator ? allocator->free() : 0;
  if (mem_used)
    *mem_used = allocator ? allocator->allocated() : 0;

  return 0;
}
// Copies `size` bytes from host memory at `host_ptr` to device memory at
// `dev_addr`, going through the staging buffer.
// `dev_addr` must be aligned to CACHE_BLOCK_SIZE and the transfer, rounded up
// to whole cache blocks, must fit inside the device's global memory.
// Returns 0 on success, -1 on invalid arguments or any driver failure.
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
  if (nullptr == hdevice)
    return -1;

  if (nullptr == host_ptr && size != 0)
    return -1;

  auto device = (vx_device*)hdevice;
  auto& api = device->api;

  uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
  if (asize < size)
    return -1; // rounding up wrapped around

  // check alignment
  if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
    return -1;

  // bound checking, written so that dev_addr + asize cannot wrap around
  if (asize > device->global_mem_size
   || dev_addr > device->global_mem_size - asize)
    return -1;

  // validate first, allocate afterwards: no buffer for a rejected request
  if (device->ensure_staging(size) != 0)
    return -1;

  // ensure ready for new command
  if (vx_ready_wait(hdevice, VX_MAX_TIMEOUT) != 0)
    return -1;

  // update staging buffer
  if (size != 0) {
    memcpy(device->staging_ptr, host_ptr, size);
  }

  // addresses and sizes are passed to the device in cache-block units
  auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);

  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), {
    return -1;
  });
  // writing the command type is the last step and triggers the transfer
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE), {
    return -1;
  });

  // Wait for the write operation to finish
  if (vx_ready_wait(hdevice, VX_MAX_TIMEOUT) != 0)
    return -1;

  return 0;
}
// Copies `size` bytes from device memory at `dev_addr` to host memory at
// `host_ptr`, going through the staging buffer.
// `dev_addr` must be aligned to CACHE_BLOCK_SIZE and the transfer, rounded up
// to whole cache blocks, must fit inside the device's global memory.
// Returns 0 on success, -1 on invalid arguments or any driver failure.
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
  if (nullptr == hdevice)
    return -1;

  if (nullptr == host_ptr && size != 0)
    return -1;

  auto device = (vx_device*)hdevice;
  auto& api = device->api;

  uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
  if (asize < size)
    return -1; // rounding up wrapped around

  // check alignment
  if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
    return -1;

  // bound checking, written so that dev_addr + asize cannot wrap around
  if (asize > device->global_mem_size
   || dev_addr > device->global_mem_size - asize)
    return -1;

  // validate first, allocate afterwards: no buffer for a rejected request
  if (device->ensure_staging(size) != 0)
    return -1;

  // Ensure ready for new command
  if (vx_ready_wait(hdevice, VX_MAX_TIMEOUT) != 0)
    return -1;

  // addresses and sizes are passed to the device in cache-block units
  auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);

  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), {
    return -1;
  });
  // writing the command type is the last step and triggers the transfer
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ), {
    return -1;
  });

  // wait for the read operation to finish
  if (vx_ready_wait(hdevice, VX_MAX_TIMEOUT) != 0)
    return -1;

  // read staging buffer
  if (size != 0) {
    memcpy(host_ptr, device->staging_ptr, size);
  }

  return 0;
}
// Launches execution on the device once it has finished any previous command.
// Returns 0 when the run command was issued, -1 for a null handle, when the
// readiness wait fails, or when the MMIO write fails.
extern int vx_start(vx_device_h hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* const dev = (vx_device*)hdevice;
  opae_drv_api_t& api = dev->api;

  // the device must be idle before a new command is written
  const int wait_status = vx_ready_wait(hdevice, VX_MAX_TIMEOUT);
  if (wait_status != 0) {
    return -1;
  }

  // writing the command type kicks off the run
  CHECK_ERR(api.fpgaWriteMMIO64(dev->fpga, 0, MMIO_CMD_TYPE, CMD_RUN), {
    return -1;
  });

  return 0;
}
// Polls the device status register until the device reports state 0 or the
// timeout (decremented by the sleep interval on every poll) reaches zero.
// While polling, console characters carried in the status word are collected
// per thread id and printed line by line; partial lines are flushed on exit.
// Returns -1 for a null handle or a failed status read, 0 otherwise - a
// timeout is only reported through a message on stdout.
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
  if (hdevice == nullptr) {
    return -1;
  }

  // one pending output line per thread id
  std::unordered_map<uint32_t, std::stringstream> print_bufs;

  vx_device* const dev = (vx_device*)hdevice;
  auto& api = dev->api;

  // polling interval
  struct timespec sleep_time;
  sleep_time.tv_sec = 0;
  sleep_time.tv_nsec = 1000000;

  // to milliseconds
  const uint64_t sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);

  while (true) {
    uint64_t status;
    CHECK_ERR(api.fpgaReadMMIO64(dev->fpga, 0, MMIO_STATUS, &status), {
      return -1;
    });

    // the bits above the state field carry console data; bit 0 flags that
    // a character is available. Drain characters until none is pending.
    uint32_t cout_data = status >> STATUS_STATE_BITS;
    while (cout_data & 0x1) {
      const char cout_char = (cout_data >> 1) & 0xff;
      const uint32_t cout_tid = (cout_data >> 9) & 0xff;
      std::stringstream& line = print_bufs[cout_tid];
      line << cout_char;
      if (cout_char == '\n') {
        std::cout << std::dec << "#" << cout_tid << ": " << line.str() << std::flush;
        line.str("");
      }
      // fetch the next status word (and with it the next character, if any)
      CHECK_ERR(api.fpgaReadMMIO64(dev->fpga, 0, MMIO_STATUS, &status), {
        return -1;
      });
      cout_data = status >> STATUS_STATE_BITS;
    }

    const uint32_t state = status & ((1 << STATUS_STATE_BITS)-1);

    // still busy and time left: sleep one interval and poll again
    if (state != 0 && timeout != 0) {
      nanosleep(&sleep_time, nullptr);
      timeout -= sleep_time_ms;
      continue;
    }

    // done (or out of time): flush whatever partial lines remain
    for (auto& entry : print_bufs) {
      const std::string pending = entry.second.str();
      if (pending.empty()) {
        continue;
      }
      std::cout << "#" << entry.first << ": " << pending << std::endl;
    }
    if (state != 0) {
      fprintf(stdout, "[VXDRV] ready-wait timed out: state=%d\n", state);
    }
    break;
  }

  return 0;
}
// Writes `value` to the device configuration register `addr` and mirrors the
// value in the host-side DCR copy.
// Returns 0 on success, -1 for a null handle, a failed readiness wait or a
// failed MMIO write.
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* const dev = (vx_device*)hdevice;
  opae_drv_api_t& api = dev->api;

  // Ensure ready for new command (-1 converts to the largest timeout value)
  const int wait_status = vx_ready_wait(hdevice, -1);
  if (wait_status != 0) {
    return -1;
  }

  // arguments first, command type last
  CHECK_ERR(api.fpgaWriteMMIO64(dev->fpga, 0, MMIO_CMD_ARG0, addr), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(dev->fpga, 0, MMIO_CMD_ARG1, value), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(dev->fpga, 0, MMIO_CMD_TYPE, CMD_DCR_WRITE), {
    return -1;
  });

  // remember what was written so it can be read back on the host
  dev->dcrs.write(addr, value);

  return 0;
}

2
runtime/rtlsim/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
obj_dir
*.so

45
runtime/rtlsim/Makefile Normal file
View File

@@ -0,0 +1,45 @@
XLEN ?= 32
RTLSIM_DIR = ../../sim/rtlsim
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I../include -I../common -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common
CXXFLAGS += -DXLEN_$(XLEN)
# Position independent code
CXXFLAGS += -fPIC
# Add external configuration
CXXFLAGS += $(CONFIGS)
# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
LDFLAGS += -L. -lrtlsim
SRCS = vortex.cpp ../common/utils.cpp
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
# Enable perf counters
ifdef PERF
CXXFLAGS += -DPERF_ENABLE
endif
PROJECT = libvortex.so
all: $(PROJECT)
$(PROJECT): $(SRCS)
DESTDIR=../../runtime/rtlsim $(MAKE) -C $(RTLSIM_DIR) ../../runtime/rtlsim/librtlsim.so
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT)
clean:
DESTDIR=../../runtime/rtlsim $(MAKE) -C $(RTLSIM_DIR) clean
rm -rf $(PROJECT) *.o

336
runtime/rtlsim/vortex.cpp Normal file
View File

@@ -0,0 +1,336 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include <future>
#include <list>
#include <chrono>
#include <vortex.h>
#include <malloc.h>
#include <utils.h>
#include <VX_config.h>
#include <VX_types.h>
#include <mem.h>
#include <util.h>
#include <processor.h>
#define RAM_PAGE_SIZE 4096
using namespace vortex;
///////////////////////////////////////////////////////////////////////////////
// Device object of the rtlsim backend: owns the simulated RAM, the RTL
// processor model, the global/local device-memory allocators, a host-side
// copy of the DCR values and the future of the currently running simulation.
class vx_device {
public:
  vx_device()
    : ram_(RAM_PAGE_SIZE)
    , global_mem_(
        ALLOC_BASE_ADDR,
        ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
        RAM_PAGE_SIZE,
        CACHE_BLOCK_SIZE)
    , local_mem_(
        SMEM_BASE_ADDR,
        (1ull << SMEM_LOG_SIZE),
        RAM_PAGE_SIZE,
        1)
  {
    processor_.attach_ram(&ram_);
  }

  // Waits for a still-running simulation before the members are destroyed.
  ~vx_device() {
    if (future_.valid()) {
      future_.wait();
    }
  }

  // Allocates `size` bytes from the allocator selected by `type`.
  // Returns the allocator's result, or -1 for an unknown type.
  int mem_alloc(uint64_t size, int type, uint64_t* dev_addr) {
    if (type == VX_MEM_TYPE_GLOBAL) {
      return global_mem_.allocate(size, dev_addr);
    } else if (type == VX_MEM_TYPE_LOCAL) {
      return local_mem_.allocate(size, dev_addr);
    }
    return -1;
  }

  // Releases an allocation; addresses at or above SMEM_BASE_ADDR belong to
  // the local allocator, all others to the global one.
  int mem_free(uint64_t dev_addr) {
    if (dev_addr >= SMEM_BASE_ADDR) {
      return local_mem_.release(dev_addr);
    } else {
      return global_mem_.release(dev_addr);
    }
  }

  // Reports free/allocated bytes of the selected memory. Either output may
  // be null. Returns 0 on success, -1 for an unknown type.
  int mem_info(int type, uint64_t* mem_free, uint64_t* mem_used) const {
    if (type == VX_MEM_TYPE_GLOBAL) {
      if (mem_free)
        *mem_free = global_mem_.free();
      if (mem_used)
        *mem_used = global_mem_.allocated();
    } else if (type == VX_MEM_TYPE_LOCAL) {
      if (mem_free)
        *mem_free = local_mem_.free();
      // the used count belongs in mem_used (it was written to mem_free)
      if (mem_used)
        *mem_used = local_mem_.allocated();
    } else {
      return -1;
    }
    return 0;
  }

  // Writes `size` bytes from `src` into simulated RAM at `dest_addr`.
  // Returns -1 when the cache-block-aligned range exceeds GLOBAL_MEM_SIZE.
  int upload(uint64_t dest_addr, const void* src, uint64_t size) {
    uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
    if (dest_addr + asize > GLOBAL_MEM_SIZE)
      return -1;

    /*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src));
    for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
      printf("\n0x%08lx=", dest_addr + i * CACHE_BLOCK_SIZE);
      for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
        printf("%02x", *((uint8_t*)src + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
      }
    }
    printf("\n");*/

    ram_.write((const uint8_t*)src, dest_addr, size);
    return 0;
  }

  // Reads `size` bytes from simulated RAM at `src_addr` into `dest`.
  // Returns -1 when the cache-block-aligned range exceeds GLOBAL_MEM_SIZE.
  int download(void* dest, uint64_t src_addr, uint64_t size) {
    uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
    if (src_addr + asize > GLOBAL_MEM_SIZE)
      return -1;

    ram_.read((uint8_t*)dest, src_addr, size);

    /*printf("VXDRV: download %ld bytes to 0x%lx:", size, uintptr_t((uint8_t*)dest));
    for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
      printf("\n0x%08lx=", src_addr + i * CACHE_BLOCK_SIZE);
      for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
        printf("%02x", *((uint8_t*)dest + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
      }
    }
    printf("\n");*/

    return 0;
  }

  // Starts a simulation run on a background thread, after waiting for any
  // previous run to complete.
  int start() {
    // ensure prior run completed
    if (future_.valid()) {
      future_.wait();
    }
    // start new run
    future_ = std::async(std::launch::async, [this]{
      processor_.run();
    });
    return 0;
  }

  // Waits for the current run in one-second steps. `timeout` is given in
  // milliseconds and rounded down to whole seconds; at least one step is
  // always waited. Always returns 0.
  int wait(uint64_t timeout) {
    if (!future_.valid())
      return 0;
    uint64_t timeout_sec = timeout / 1000;
    std::chrono::seconds wait_time(1);
    for (;;) {
      // wait for 1 sec and check status
      auto status = future_.wait_for(wait_time);
      if (status == std::future_status::ready
       || 0 == timeout_sec--)
        break;
    }
    return 0;
  }

  // Writes a DCR in the processor model and records it in the host copy.
  int write_dcr(uint32_t addr, uint32_t value) {
    if (future_.valid()) {
      future_.wait(); // ensure prior run completed
    }
    processor_.write_dcr(addr, value);
    dcrs_.write(addr, value);
    return 0;
  }

  // Returns the last value recorded for DCR `addr`.
  uint64_t read_dcr(uint32_t addr) const {
    return dcrs_.read(addr);
  }

private:
  RAM ram_;
  Processor processor_;
  MemoryAllocator global_mem_;
  MemoryAllocator local_mem_;
  DeviceConfig dcrs_;
  std::future<void> future_;
};
///////////////////////////////////////////////////////////////////////////////
// Reads one device capability of the rtlsim backend; values come from the
// build-time configuration and from the device's recorded DCRs.
// Returns 0 on success, -1 if `hdevice` or `value` is null.
// An unknown `caps_id` is treated as a programming error and aborts.
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  if (nullptr == hdevice || nullptr == value)
    return -1;

  vx_device *device = ((vx_device*)hdevice);

  switch (caps_id) {
  case VX_CAPS_VERSION:
    *value = IMPLEMENTATION_ID;
    break;
  case VX_CAPS_NUM_THREADS:
    *value = NUM_THREADS;
    break;
  case VX_CAPS_NUM_WARPS:
    *value = NUM_WARPS;
    break;
  case VX_CAPS_NUM_CORES:
    *value = NUM_CORES * NUM_CLUSTERS;
    break;
  case VX_CAPS_CACHE_LINE_SIZE:
    *value = CACHE_BLOCK_SIZE;
    break;
  case VX_CAPS_GLOBAL_MEM_SIZE:
    *value = GLOBAL_MEM_SIZE;
    break;
  case VX_CAPS_LOCAL_MEM_SIZE:
    // same size the device's local-memory allocator is created with;
    // without this case a local-memory size query aborted the process
    *value = (1ull << SMEM_LOG_SIZE);
    break;
  case VX_CAPS_KERNEL_BASE_ADDR:
    // 64-bit address assembled from the two startup-address DCRs
    *value = (uint64_t(device->read_dcr(VX_DCR_BASE_STARTUP_ADDR1)) << 32)
           | device->read_dcr(VX_DCR_BASE_STARTUP_ADDR0);
    break;
  case VX_CAPS_ISA_FLAGS:
    *value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
    break;
  default:
    std::cout << "invalid caps id: " << caps_id << std::endl;
    std::abort();
    return -1;
  }

  return 0;
}
// Creates a simulated device, initializes its DCRs and returns it through
// `*hdevice`. Returns 0 on success; -1 for a null output pointer, or the
// error reported by dcr_initialize() (the device is destroyed in that case).
extern int vx_dev_open(vx_device_h* hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* const dev = new vx_device();
  if (dev == nullptr) {
    return -1;
  }

  const int status = dcr_initialize(dev);
  if (status != 0) {
    delete dev;
    return status;
  }

#ifdef DUMP_PERF_STATS
  perf_add_device(dev);
#endif

  *hdevice = dev;
  return 0;
}
// Destroys a device created by vx_dev_open().
// Returns -1 for a null handle, 0 otherwise.
extern int vx_dev_close(vx_device_h hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* const dev = (vx_device*)hdevice;

#ifdef DUMP_PERF_STATS
  perf_remove_device(hdevice);
#endif

  delete dev;
  return 0;
}
// Allocates device memory; rejects a null handle, a null output pointer and
// a zero size with -1, otherwise forwards to the device object.
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
  const bool bad_args = (hdevice == nullptr) || (dev_addr == nullptr) || (size == 0);
  if (bad_args) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->mem_alloc(size, type, dev_addr);
}
// Frees device memory. A null handle yields -1; a zero address is accepted
// and ignored; anything else is forwarded to the device object.
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
  if (hdevice == nullptr) {
    return -1;
  }
  if (dev_addr == 0) {
    return 0;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->mem_free(dev_addr);
}
// Queries free/used device memory; -1 for a null handle, otherwise the
// result of the device object's mem_info().
extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
  if (hdevice == nullptr) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->mem_info(type, mem_free, mem_used);
}
// Host-to-device copy; -1 for a null handle, otherwise the result of the
// device object's upload().
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
  if (hdevice == nullptr) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->upload(dev_addr, host_ptr, size);
}
// Device-to-host copy; -1 for a null handle, otherwise the result of the
// device object's download().
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
  if (hdevice == nullptr) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->download(host_ptr, dev_addr, size);
}
// Starts execution; -1 for a null handle, otherwise the result of the
// device object's start().
extern int vx_start(vx_device_h hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->start();
}
// Waits for the running simulation; -1 for a null handle, otherwise the
// result of the device object's wait().
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
  if (hdevice == nullptr) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->wait(timeout);
}
// Writes a device configuration register once the device is ready.
// Returns -1 for a null handle or a failed wait, otherwise the result of
// the device object's write_dcr().
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* const dev = (vx_device*)hdevice;

  // Ensure ready for new command (-1 converts to the largest timeout value)
  const int wait_status = vx_ready_wait(hdevice, -1);
  if (wait_status != 0) {
    return -1;
  }

  return dev->write_dcr(addr, value);
}

2
runtime/simx/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
obj_dir
libvortex.so

34
runtime/simx/Makefile Normal file
View File

@@ -0,0 +1,34 @@
XLEN ?= 32
SIMX_DIR = ../../sim/simx
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../include -I../common -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common
CXXFLAGS += $(CONFIGS)
CXXFLAGS += -DDUMP_PERF_STATS
CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += -shared -pthread
LDFLAGS += -L. -lsimx
SRCS = vortex.cpp ../common/utils.cpp
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = libvortex.so
all: $(PROJECT)
$(PROJECT): $(SRCS)
DESTDIR=../../runtime/simx $(MAKE) -C $(SIMX_DIR) ../../runtime/simx/libsimx.so
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
clean:
DESTDIR=../../runtime/simx $(MAKE) -C $(SIMX_DIR) clean
rm -rf libsimx.so $(PROJECT) *.o

397
runtime/simx/vortex.cpp Normal file
View File

@@ -0,0 +1,397 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include <future>
#include <chrono>
#include <vortex.h>
#include <utils.h>
#include <malloc.h>
#include <VX_config.h>
#include <VX_types.h>
#include <util.h>
#include <processor.h>
#include <arch.h>
#include <mem.h>
#include <constants.h>
#ifndef NDEBUG
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
#else
#define DBGPRINT(format, ...) ((void)0)
#endif
using namespace vortex;
///////////////////////////////////////////////////////////////////////////////
class vx_device;
// Host-side buffer associated with a device. The storage is allocated with
// a size rounded up to a multiple of CACHE_BLOCK_SIZE and aligned to
// CACHE_BLOCK_SIZE, and is released when the object is destroyed.
class vx_buffer {
public:
  // Allocates the storage and fills it with the repeating byte pattern of
  // 0xbaadf00d so that reads of never-written data are easy to recognize.
  // If the allocation fails, data() returns null.
  vx_buffer(uint64_t size, vx_device* device)
    : size_(size)
    , device_(device)
    , data_(nullptr) {
    uint64_t aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE);
    data_ = aligned_malloc(aligned_asize, CACHE_BLOCK_SIZE);
    if (data_ == nullptr)
      return;
    // set uninitialized data to "baadf00d"
    // (64-bit index: a 32-bit one could never reach a size of 4 GiB or more)
    uint8_t* bytes = (uint8_t*)data_;
    for (uint64_t i = 0; i < aligned_asize; ++i) {
      bytes[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff;
    }
  }

  ~vx_buffer() {
    if (data_) {
      aligned_free(data_);
    }
  }

  // Start of the storage, or null if the allocation failed.
  void* data() const {
    return data_;
  }

  // Size requested at construction (not the rounded-up allocation size).
  uint64_t size() const {
    return size_;
  }

  // Device passed at construction.
  vx_device* device() const {
    return device_;
  }

private:
  uint64_t size_;
  vx_device* device_;
  void* data_;
};
///////////////////////////////////////////////////////////////////////////////
// Simulated Vortex device used by the SimX runtime driver.
// Bundles the processor model with its RAM, the global and local
// device-memory allocators, a host-side copy of the DCR values, and the
// future that tracks a run started asynchronously by start().
class vx_device {
public:
  vx_device()
    : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, NUM_CLUSTERS)
    , ram_(RAM_PAGE_SIZE)
    , processor_(arch_)
    , global_mem_(
        ALLOC_BASE_ADDR,
        ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
        RAM_PAGE_SIZE,
        CACHE_BLOCK_SIZE)
    , local_mem_(
        SMEM_BASE_ADDR,
        (1ull << SMEM_LOG_SIZE),
        RAM_PAGE_SIZE,
        1)
  {
    // attach memory module
    processor_.attach_ram(&ram_);
  }

  // Blocks until an in-flight run (if any) has finished, so the worker never
  // touches members that are being destroyed.
  ~vx_device() {
    if (future_.valid()) {
      future_.wait();
    }
  }

  // Allocates 'size' bytes from the allocator selected by 'type'
  // (VX_MEM_TYPE_GLOBAL or VX_MEM_TYPE_LOCAL) and stores the resulting address
  // in *dev_addr. Returns the allocator's status, or -1 for an unknown type.
  int mem_alloc(uint64_t size, int type, uint64_t* dev_addr) {
    if (type == VX_MEM_TYPE_GLOBAL) {
      return global_mem_.allocate(size, dev_addr);
    } else if (type == VX_MEM_TYPE_LOCAL) {
      return local_mem_.allocate(size, dev_addr);
    }
    return -1;
  }

  // Releases an allocation. Addresses at or above SMEM_BASE_ADDR are handed to
  // the local allocator, all others to the global allocator.
  int mem_free(uint64_t dev_addr) {
    if (dev_addr >= SMEM_BASE_ADDR) {
      return local_mem_.release(dev_addr);
    } else {
      return global_mem_.release(dev_addr);
    }
  }

  // Reports free/allocated byte counts of the allocator selected by 'type'.
  // Either output pointer may be null, in which case that figure is skipped.
  // Returns 0 on success, -1 for an unknown type.
  int mem_info(int type, uint64_t* mem_free, uint64_t* mem_used) const {
    if (type == VX_MEM_TYPE_GLOBAL) {
      if (mem_free)
        *mem_free = global_mem_.free();
      if (mem_used)
        *mem_used = global_mem_.allocated();
    } else if (type == VX_MEM_TYPE_LOCAL) {
      if (mem_free)
        *mem_free = local_mem_.free();
      // The "used" figure goes to mem_used; writing it through mem_free would
      // clobber the free count and dereference null when only mem_used is set.
      if (mem_used)
        *mem_used = local_mem_.allocated();
    } else {
      return -1;
    }
    return 0;
  }

  // Copies 'size' bytes from host memory at 'src' into device RAM at
  // 'dest_addr'. Returns -1 if the cache-block-aligned range does not fit
  // inside GLOBAL_MEM_SIZE, 0 otherwise.
  int upload(uint64_t dest_addr, const void* src, uint64_t size) {
    if (!is_valid_range(dest_addr, size))
      return -1;

    ram_.write((const uint8_t*)src, dest_addr, size);

    /*DBGPRINT("upload %ld bytes to 0x%lx\n", size, dest_addr);
    for (uint64_t i = 0; i < size && i < 1024; i += 4) {
      DBGPRINT("  0x%lx <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + i));
    }*/

    return 0;
  }

  // Copies 'size' bytes from device RAM at 'src_addr' into host memory at
  // 'dest'. Returns -1 if the cache-block-aligned range does not fit inside
  // GLOBAL_MEM_SIZE, 0 otherwise.
  int download(void* dest, uint64_t src_addr, uint64_t size) {
    if (!is_valid_range(src_addr, size))
      return -1;

    ram_.read((uint8_t*)dest, src_addr, size);

    /*DBGPRINT("download %ld bytes from 0x%lx\n", size, src_addr);
    for (uint64_t i = 0; i < size && i < 1024; i += 4) {
      DBGPRINT("  0x%lx -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + i));
    }*/

    return 0;
  }

  // Launches processor_.run(false) on a worker via std::async after waiting
  // for any previous run to finish. Always returns 0.
  int start() {
    // ensure prior run completed
    if (future_.valid()) {
      future_.wait();
    }
    // start new run
    future_ = std::async(std::launch::async, [&]{
      processor_.run(false);
    });
    return 0;
  }

  // Waits for the current run in one-second slices. 'timeout' is divided by
  // 1000 to obtain the number of additional slices to wait after the first.
  // NOTE: returns 0 both when the run completed and when the timeout elapsed;
  // callers cannot tell the two apart from the return value.
  int wait(uint64_t timeout) {
    if (!future_.valid())
      return 0;
    uint64_t timeout_sec = timeout / 1000;
    std::chrono::seconds wait_time(1);
    for (;;) {
      // wait for 1 sec and check status
      auto status = future_.wait_for(wait_time);
      if (status == std::future_status::ready
       || 0 == timeout_sec--)
        break;
    }
    return 0;
  }

  // Writes a DCR on the processor model and mirrors the value in dcrs_ so it
  // can be read back with read_dcr(). Waits for any in-flight run first.
  int write_dcr(uint32_t addr, uint32_t value) {
    if (future_.valid()) {
      future_.wait(); // ensure prior run completed
    }
    processor_.write_dcr(addr, value);
    dcrs_.write(addr, value);
    return 0;
  }

  // Returns the mirrored DCR value held in dcrs_ (the processor is not queried).
  uint64_t read_dcr(uint32_t addr) const {
    return dcrs_.read(addr);
  }

private:
  // True when 'size' bytes starting at 'addr', rounded up to a whole number of
  // cache blocks, lie inside GLOBAL_MEM_SIZE. Written with subtractions so
  // that neither the rounding nor addr + size can wrap around 64 bits.
  static bool is_valid_range(uint64_t addr, uint64_t size) {
    const uint64_t mem_size = GLOBAL_MEM_SIZE;
    if (size > mem_size)
      return false;
    const uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
    return (asize <= mem_size) && (addr <= mem_size - asize);
  }

  Arch arch_;
  RAM ram_;
  Processor processor_;
  MemoryAllocator global_mem_;
  MemoryAllocator local_mem_;
  DeviceConfig dcrs_;
  std::future<void> future_;
};
///////////////////////////////////////////////////////////////////////////////
// Creates a simulated device, initialises its DCRs and returns it through
// *hdevice. Returns 0 on success, -1 for a null output pointer or failed
// allocation, or the error reported by dcr_initialize() (the device is
// destroyed in that case and *hdevice is left untouched).
extern int vx_dev_open(vx_device_h* hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* new_dev = new vx_device();
  if (new_dev == nullptr) {
    return -1;
  }

  const int status = dcr_initialize(new_dev);
  if (status == 0) {
#ifdef DUMP_PERF_STATS
    perf_add_device(new_dev);
#endif
    *hdevice = new_dev;
    DBGPRINT("device creation complete!\n");
    return 0;
  }

  // initialisation failed: do not hand out a half-configured device
  delete new_dev;
  return status;
}
// Destroys a device previously returned by vx_dev_open().
// Returns -1 for a null handle, 0 otherwise.
extern int vx_dev_close(vx_device_h hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* const dev = (vx_device*)hdevice;

#ifdef DUMP_PERF_STATS
  perf_remove_device(hdevice);
#endif

  delete dev;

  DBGPRINT("device destroyed!\n");
  return 0;
}
// Queries a device capability.
// hdevice: device handle; caps_id: one of the VX_CAPS_* ids;
// value:   receives the capability value.
// Returns 0 on success, -1 if hdevice or value is null.
// An unknown caps_id prints a message and aborts the process.
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  // every case below writes through 'value', so reject a null pointer up front
  if (nullptr == hdevice
   || nullptr == value)
    return -1;

  vx_device *device = ((vx_device*)hdevice);

  switch (caps_id) {
  case VX_CAPS_VERSION:
    *value = IMPLEMENTATION_ID;
    break;
  case VX_CAPS_NUM_THREADS:
    *value = NUM_THREADS;
    break;
  case VX_CAPS_NUM_WARPS:
    *value = NUM_WARPS;
    break;
  case VX_CAPS_NUM_CORES:
    *value = NUM_CORES * NUM_CLUSTERS;
    break;
  case VX_CAPS_CACHE_LINE_SIZE:
    *value = CACHE_BLOCK_SIZE;
    break;
  case VX_CAPS_GLOBAL_MEM_SIZE:
    *value = GLOBAL_MEM_SIZE;
    break;
  case VX_CAPS_KERNEL_BASE_ADDR:
    // the startup address is kept in two DCRs: ADDR1 is the upper half, ADDR0 the lower
    *value = (uint64_t(device->read_dcr(VX_DCR_BASE_STARTUP_ADDR1)) << 32)
           | device->read_dcr(VX_DCR_BASE_STARTUP_ADDR0);
    break;
  case VX_CAPS_ISA_FLAGS:
    // Widen the XLEN field to 64 bits before shifting: for XLEN=64 the field is
    // 2, and 2 << 30 does not fit a signed 32-bit int (it would overflow and
    // sign-extend into the MISA_EXT half when OR-ed into the result).
    *value = ((uint64_t(MISA_EXT))<<32) | (uint64_t(log2floor(XLEN)-4) << 30) | MISA_STD;
    break;
  default:
    std::cout << "invalid caps id: " << caps_id << std::endl;
    std::abort();
    return -1; // not reached: std::abort() does not return
  }
  return 0;
}
// Allocates 'size' bytes of device memory of the given 'type' and stores the
// address in *dev_addr. Returns -1 when the handle or output pointer is null
// or the size is zero; otherwise the device's mem_alloc() result.
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
  const bool args_ok = (hdevice != nullptr)
                    && (dev_addr != nullptr)
                    && (size != 0);
  if (!args_ok) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->mem_alloc(size, type, dev_addr);
}
// Frees a device allocation. A null handle yields -1; a zero address is
// accepted as a no-op and yields 0; anything else is forwarded to the device.
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
  if (hdevice == nullptr) {
    return -1;
  }
  if (dev_addr == 0) {
    return 0;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->mem_free(dev_addr);
}
// Reports free/used device memory for the given memory 'type' by forwarding
// to the device's mem_info(). Returns -1 for a null handle.
extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
  if (hdevice == nullptr) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->mem_info(type, mem_free, mem_used);
}
// Copies 'size' bytes from host memory at 'host_ptr' to device address
// 'dev_addr'. Returns -1 for a null handle, otherwise the result of the
// device's upload().
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
  if (nullptr == hdevice)
    return -1;

  auto device = ((vx_device*)hdevice);

  // The pointer is printed with %p and the 64-bit values are cast to match
  // their %ll conversions; passing a pointer to %lx or a uint64_t to %ld is a
  // format/argument mismatch.
  DBGPRINT("COPY_TO_DEV: dev_addr=0x%llx, host_addr=%p, size=%llu\n",
           (unsigned long long)dev_addr, host_ptr, (unsigned long long)size);

  return device->upload(dev_addr, host_ptr, size);
}
// Copies 'size' bytes from device address 'dev_addr' to host memory at
// 'host_ptr'. Returns -1 for a null handle, otherwise the result of the
// device's download().
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
  if (nullptr == hdevice)
    return -1;

  auto device = ((vx_device*)hdevice);

  // The pointer is printed with %p and the 64-bit values are cast to match
  // their %ll conversions; passing a pointer to %lx or a uint64_t to %ld is a
  // format/argument mismatch.
  DBGPRINT("COPY_FROM_DEV: dev_addr=0x%llx, host_addr=%p, size=%llu\n",
           (unsigned long long)dev_addr, host_ptr, (unsigned long long)size);

  return device->download(host_ptr, dev_addr, size);
}
// Starts execution on the device. Returns -1 for a null handle, otherwise the
// result of the device's start().
extern int vx_start(vx_device_h hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }

  DBGPRINT("START\n");

  vx_device* const dev = (vx_device*)hdevice;
  return dev->start();
}
// Waits for the device's current run, forwarding 'timeout' to the device's
// wait(). Returns -1 for a null handle.
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
  if (hdevice == nullptr) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->wait(timeout);
}
// Writes 'value' to the device configuration register 'addr' once the device
// is ready for a new command. Returns -1 for a null handle or when the ready
// wait reports failure; otherwise the result of the device's write_dcr().
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* const dev = (vx_device*)hdevice;

  // Ensure ready for new command
  const int ready = vx_ready_wait(hdevice, -1);
  if (ready != 0) {
    return -1;
  }

  DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%lx\n", addr, value);

  return dev->write_dcr(addr, value);
}

View File

@@ -1,890 +0,0 @@
///////////////////////////////////////////////////////////////////////////////
// \author (c) Marco Paland (info@paland.com)
// 2014-2019, PALANDesign Hannover, Germany
//
// \license The MIT License (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on
// embedded systems with a very limited resources. These routines are thread
// safe and reentrant!
// Use this instead of the bloated standard/newlib printf cause these use
// malloc for printf (and may not be thread safe).
//
///////////////////////////////////////////////////////////////////////////////
#include <stdbool.h>
#include <stdint.h>
#include "tinyprintf.h"
#include "vx_print.h"
// Compile-time configuration. Every setting below has a default that applies
// unless the corresponding macro is already defined (e.g. on the compiler
// command line or in printf_config.h).
// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...) to include the
// printf_config.h header file
// default: undefined
#ifdef PRINTF_INCLUDE_CONFIG_H
#include "printf_config.h"
#endif
// 'ntoa' conversion buffer size, this must be big enough to hold one converted
// numeric number including padded zeros (dynamically created on stack)
// The integer conversion loops stop appending once this many characters have
// been produced, so anything that does not fit is dropped.
// default: 32 byte
#ifndef PRINTF_NTOA_BUFFER_SIZE
#define PRINTF_NTOA_BUFFER_SIZE 32U
#endif
// 'ftoa' conversion buffer size, this must be big enough to hold one converted
// float number including padded zeros (dynamically created on stack)
// default: 32 byte
#ifndef PRINTF_FTOA_BUFFER_SIZE
#define PRINTF_FTOA_BUFFER_SIZE 32U
#endif
// support for the floating point type (%f)
// define PRINTF_DISABLE_SUPPORT_FLOAT to compile the float conversions out
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_FLOAT
#define PRINTF_SUPPORT_FLOAT
#endif
// support for exponential floating point notation (%e/%g)
// define PRINTF_DISABLE_SUPPORT_EXPONENTIAL to compile it out
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL
#define PRINTF_SUPPORT_EXPONENTIAL
#endif
// define the default floating point precision
// used by the float conversions when the format carries no ".precision"
// default: 6 digits
#ifndef PRINTF_DEFAULT_FLOAT_PRECISION
#define PRINTF_DEFAULT_FLOAT_PRECISION 6U
#endif
// define the largest float suitable to print with %f
// values whose magnitude exceeds this are handed to the exponential
// conversion when it is compiled in
// default: 1e9
#ifndef PRINTF_MAX_FLOAT
#define PRINTF_MAX_FLOAT 1e9
#endif
// support for the long long types (%llu or %p)
// define PRINTF_DISABLE_SUPPORT_LONG_LONG to compile it out
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG
#define PRINTF_SUPPORT_LONG_LONG
#endif
// support for the ptrdiff_t type (%t)
// ptrdiff_t is normally defined in <stddef.h> as long or long long type
// define PRINTF_DISABLE_SUPPORT_PTRDIFF_T to compile it out
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T
#define PRINTF_SUPPORT_PTRDIFF_T
#endif
///////////////////////////////////////////////////////////////////////////////
// internal flag definitions
// One bit per parsed format property; the bits are OR-ed into the 'flags'
// word that the format parser passes to the conversion helpers.
#define FLAGS_ZEROPAD (1U << 0U)   // '0' flag: pad with zeros
#define FLAGS_LEFT (1U << 1U)      // '-' flag (or negative '*' width): pad on the right
#define FLAGS_PLUS (1U << 2U)      // '+' flag: print '+' for non-negative values
#define FLAGS_SPACE (1U << 3U)     // ' ' flag: print ' ' for non-negative values
#define FLAGS_HASH (1U << 4U)      // '#' flag: base prefix
#define FLAGS_UPPERCASE (1U << 5U) // set for %X, %F, %E, %G and %p
#define FLAGS_CHAR (1U << 6U)      // "hh" length modifier
#define FLAGS_SHORT (1U << 7U)     // "h" length modifier
#define FLAGS_LONG (1U << 8U)      // "l" length modifier
#define FLAGS_LONG_LONG (1U << 9U) // "ll" length modifier
#define FLAGS_PRECISION (1U << 10U) // a ".precision" field was present
#define FLAGS_ADAPT_EXP (1U << 11U) // %g / %G: choose between fixed and exponential form
// import float.h for DBL_MAX
#if defined(PRINTF_SUPPORT_FLOAT)
#include <float.h>
#endif
// output function type
// Common signature of all output sinks: 'character' is the char to emit,
// 'buffer' is the sink-specific destination, 'idx' is the position of the
// character in the output and 'maxlen' is the size limit of the destination.
typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen);
// wrapper (used as buffer) for output function type
// _out_fct receives a pointer to this struct as its 'buffer' argument and
// calls fct(character, arg) for every non-zero character.
typedef struct {
void (*fct)(char character, void* arg);
void* arg;
} out_fct_wrap_type;
// Buffer sink: stores 'character' at position 'idx' of the char array behind
// 'buffer'. Positions at or beyond 'maxlen' are silently ignored.
static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen)
{
  if (idx >= maxlen) {
    return;  // outside the writable window: drop the character
  }
  char* const dst = (char*)buffer;
  dst[idx] = character;
}
// Null sink: discards every character. Used when only the length of the
// formatted output is of interest.
static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen)
{
  // all parameters are intentionally unused
  (void)maxlen;
  (void)idx;
  (void)buffer;
  (void)character;
}
// Character sink: forwards every non-zero character to vx_putchar().
// The buffer/index/limit parameters are not used by this sink.
static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen)
{
  (void)buffer;
  (void)idx;
  (void)maxlen;
  if (character == 0) {
    return;  // the terminating 0 is never sent to the console
  }
  vx_putchar(character);
}
// internal output function wrapper
static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen)
{
(void)idx; (void)maxlen;
if (character) {
// buffer is the output fct pointer
((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg);
}
}
// internal secure strlen
// Counts the characters of 'str' up to, but not including, the terminating 0,
// looking at no more than 'maxsize' characters.
// \param str     String to measure (must not be NULL)
// \param maxsize Maximum number of characters to examine
// \return The length of the string (excluding the terminating 0) limited by 'maxsize'
static inline unsigned int _strnlen_s(const char* str, size_t maxsize)
{
  const char* s = str;
  // Check the remaining budget BEFORE dereferencing: a buffer of exactly
  // 'maxsize' bytes without a terminator must not be read past its end.
  while (maxsize && *s) {
    ++s;
    --maxsize;
  }
  return (unsigned int)(s - str);
}
// Tells whether 'ch' is one of the decimal digits '0'..'9'.
// \return true for a digit, false for anything else
static inline bool _is_digit(char ch)
{
  if (ch < '0') {
    return false;
  }
  return ch <= '9';
}
// Parses the run of decimal digits at **str into an unsigned value and
// advances *str to the first character after that run. Yields 0 (and leaves
// *str where it was) when **str is not a digit.
static unsigned int _atoi(const char** str)
{
  unsigned int result = 0U;
  const char* cursor = *str;
  for (; _is_digit(*cursor); ++cursor) {
    result = result * 10U + (unsigned int)(*cursor - '0');
  }
  *str = cursor;
  return result;
}
// Emits the 'len' characters of 'buf' in reverse order through 'out',
// starting at output position 'idx', and pads the field with spaces up to
// 'width': on the left unless FLAGS_LEFT or FLAGS_ZEROPAD is set, on the right
// when FLAGS_LEFT is set.
// \return The output position following the last character written
static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags)
{
  const size_t first = idx;

  // leading spaces: only for right-aligned output that is not zero-padded
  const bool space_pad_left = (flags & (FLAGS_LEFT | FLAGS_ZEROPAD)) == 0U;
  if (space_pad_left && (len < width)) {
    size_t pad = width - len;
    while (pad--) {
      out(' ', buffer, idx++, maxlen);
    }
  }

  // the conversion buffer holds the text back to front
  for (size_t remaining = len; remaining > 0U; --remaining) {
    out(buf[remaining - 1U], buffer, idx++, maxlen);
  }

  // trailing spaces for left-aligned output
  if (flags & FLAGS_LEFT) {
    for (; (idx - first) < width; ++idx) {
      out(' ', buffer, idx, maxlen);
    }
  }

  return idx;
}
// internal itoa format
// Finishes an integer conversion. On entry 'buf' holds 'len' digits in
// reverse order (least significant first). This routine appends, still in
// reverse order, the zero padding, the '#' prefix and the sign character, and
// then passes the buffer to _out_rev for output.
// Every append is bounded by PRINTF_NTOA_BUFFER_SIZE; characters that would
// not fit are dropped.
// \return The output position returned by _out_rev
static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags)
{
// pad leading zeros
if (!(flags & FLAGS_LEFT)) {
// with zero padding, one column of the field is taken by the sign character
if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
width--;
}
// precision acts as a minimum number of digits
while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = '0';
}
// '0' flag: fill the rest of the field with zeros
while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = '0';
}
}
// handle hash
if (flags & FLAGS_HASH) {
// the padding above filled the field exactly: take back one character
// (two for hex), presumably so that the prefix still fits in the field
if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) {
len--;
if (len && (base == 16U)) {
len--;
}
}
// the buffer is reversed, so the base letter goes in before the leading '0'
if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'x';
}
else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'X';
}
else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'b';
}
if (len < PRINTF_NTOA_BUFFER_SIZE) {
buf[len++] = '0';
}
}
// sign character goes last in the reversed buffer, i.e. first in the output
if (len < PRINTF_NTOA_BUFFER_SIZE) {
if (negative) {
buf[len++] = '-';
}
else if (flags & FLAGS_PLUS) {
buf[len++] = '+'; // ignore the space if the '+' exists
}
else if (flags & FLAGS_SPACE) {
buf[len++] = ' ';
}
}
return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
}
// Converts an unsigned long magnitude to text in the given base and hands
// the (reversed) digits to _ntoa_format for padding, prefix and sign.
// 'negative' tells whether a '-' has to be printed for the value.
// \return The output position following the converted number
static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags)
{
  char digits[PRINTF_NTOA_BUFFER_SIZE];
  size_t count = 0U;

  // a zero value never gets a base prefix
  if (value == 0UL) {
    flags &= ~FLAGS_HASH;
  }

  // an explicit precision combined with a zero value produces no digits here
  const bool emit_digits = (value != 0UL) || !(flags & FLAGS_PRECISION);
  if (emit_digits) {
    const char alpha = (flags & FLAGS_UPPERCASE) ? 'A' : 'a';
    for (;;) {
      const char d = (char)(value % base);
      digits[count] = (char)((d < 10) ? ('0' + d) : (alpha + d - 10));
      ++count;
      value /= base;
      if ((value == 0UL) || (count >= PRINTF_NTOA_BUFFER_SIZE)) {
        break;
      }
    }
  }

  return _ntoa_format(out, buffer, idx, maxlen, digits, count, negative, (unsigned int)base, prec, width, flags);
}
// 'long long' counterpart of _ntoa_long, compiled only when long long
// support is enabled.
#if defined(PRINTF_SUPPORT_LONG_LONG)
// Converts an unsigned long long magnitude to text in the given base and
// hands the (reversed) digits to _ntoa_format for padding, prefix and sign.
// \return The output position following the converted number
static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags)
{
  char digits[PRINTF_NTOA_BUFFER_SIZE];
  size_t count = 0U;

  // a zero value never gets a base prefix
  if (value == 0ULL) {
    flags &= ~FLAGS_HASH;
  }

  // an explicit precision combined with a zero value produces no digits here
  const bool emit_digits = (value != 0ULL) || !(flags & FLAGS_PRECISION);
  if (emit_digits) {
    const char alpha = (flags & FLAGS_UPPERCASE) ? 'A' : 'a';
    for (;;) {
      const char d = (char)(value % base);
      digits[count] = (char)((d < 10) ? ('0' + d) : (alpha + d - 10));
      ++count;
      value /= base;
      if ((value == 0ULL) || (count >= PRINTF_NTOA_BUFFER_SIZE)) {
        break;
      }
    }
  }

  return _ntoa_format(out, buffer, idx, maxlen, digits, count, negative, (unsigned int)base, prec, width, flags);
}
#endif // PRINTF_SUPPORT_LONG_LONG
#if defined(PRINTF_SUPPORT_FLOAT)
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags);
#endif
// internal ftoa for fixed decimal floating point
// Formats 'value' in fixed notation ("%f"). The text is assembled back to
// front in a local buffer (fraction digits, '.', whole digits, padding, sign)
// and emitted through _out_rev.
// \param prec  Number of fraction digits; replaced by
//              PRINTF_DEFAULT_FLOAT_PRECISION unless FLAGS_PRECISION is set
// \param width Minimum field width
// \return The output position following the formatted number
static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
{
char buf[PRINTF_FTOA_BUFFER_SIZE];
size_t len = 0U;
double diff = 0.0;
// powers of 10
static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };
// test for special values
// (the literals are spelled backwards because _out_rev prints in reverse)
if (value != value)
return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags);
if (value < -DBL_MAX)
return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags);
if (value > DBL_MAX)
return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 4U : 3U, width, flags);
// test for very large values
// standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad
if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) {
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
return _etoa(out, buffer, idx, maxlen, value, prec, width, flags);
#else
// NOTE(review): this returns 0 rather than the incoming 'idx', which resets
// the caller's output position -- confirm this is intended
return 0U;
#endif
}
// test for negative
bool negative = false;
if (value < 0) {
negative = true;
value = 0 - value;
}
// set default precision, if not set explicitly
if (!(flags & FLAGS_PRECISION)) {
prec = PRINTF_DEFAULT_FLOAT_PRECISION;
}
// limit precision to 9, cause a prec >= 10 can lead to overflow errors
// (each digit of precision beyond 9 is emitted as a trailing '0')
while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) {
buf[len++] = '0';
prec--;
}
// split into whole part and fraction scaled to 'prec' decimal digits
int whole = (int)value;
double tmp = (value - whole) * pow10[prec];
unsigned long frac = (unsigned long)tmp;
diff = tmp - frac;
if (diff > 0.5) {
++frac;
// handle rollover, e.g. case 0.99 with prec 1 is 1.0
if (frac >= pow10[prec]) {
frac = 0;
++whole;
}
}
else if (diff < 0.5) {
// below the halfway point: truncate, nothing to do
}
else if ((frac == 0U) || (frac & 1U)) {
// if halfway, round up if odd OR if last digit is 0
++frac;
}
if (prec == 0U) {
diff = value - (double)whole;
if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) {
// exactly 0.5 and ODD, then round up
// 1.5 -> 2, but 2.5 -> 2
++whole;
}
}
else {
unsigned int count = prec;
// now do fractional part, as an unsigned number
while (len < PRINTF_FTOA_BUFFER_SIZE) {
--count;
buf[len++] = (char)(48U + (frac % 10U));
if (!(frac /= 10U)) {
break;
}
}
// add extra 0s
while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) {
buf[len++] = '0';
}
if (len < PRINTF_FTOA_BUFFER_SIZE) {
// add decimal
buf[len++] = '.';
}
}
// do whole part, number is reversed
while (len < PRINTF_FTOA_BUFFER_SIZE) {
buf[len++] = (char)(48 + (whole % 10));
if (!(whole /= 10)) {
break;
}
}
// pad leading zeros
if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) {
// one column of the field is taken by the sign character
if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
width--;
}
while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) {
buf[len++] = '0';
}
}
// sign character goes last in the reversed buffer, i.e. first in the output
if (len < PRINTF_FTOA_BUFFER_SIZE) {
if (negative) {
buf[len++] = '-';
}
else if (flags & FLAGS_PLUS) {
buf[len++] = '+'; // ignore the space if the '+' exists
}
else if (flags & FLAGS_SPACE) {
buf[len++] = ' ';
}
}
return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
}
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
// internal ftoa variant for exponential floating-point type, contributed by Martijn Jasperse <m.jasperse@gmail.com>
// Formats 'value' in exponential notation ("%e"), or adaptively ("%g") when
// FLAGS_ADAPT_EXP is set. The decimal exponent is estimated from the binary
// exponent, the value is rescaled by the matching power of ten, the mantissa
// is printed through _ftoa and the exponent through _ntoa_long.
// NaN and infinities are delegated to _ftoa.
// \return The output position following the formatted number
// NOTE(review): for value == 0.0 the exponent computed below appears to come
// out as -308 rather than 0 -- confirm and special-case zero if so.
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
{
// check for NaN and special values
if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) {
return _ftoa(out, buffer, idx, maxlen, value, prec, width, flags);
}
// determine the sign
const bool negative = value < 0;
if (negative) {
value = -value;
}
// default precision
if (!(flags & FLAGS_PRECISION)) {
prec = PRINTF_DEFAULT_FLOAT_PRECISION;
}
// determine the decimal exponent
// based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c)
// the union gives access to the bit pattern of the double
union {
uint64_t U;
double F;
} conv;
conv.F = value;
int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023; // effectively log2
conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U); // drop the exponent so conv.F is now in [1,2)
// now approximate log10 from the log2 integer part and an expansion of ln around 1.5
int expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168);
// now we want to compute 10^expval but we want to be sure it won't overflow
exp2 = (int)(expval * 3.321928094887362 + 0.5);
const double z = expval * 2.302585092994046 - exp2 * 0.6931471805599453;
const double z2 = z * z;
conv.U = (uint64_t)(exp2 + 1023) << 52U;
// compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex
conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14)))));
// correct for rounding errors
if (value < conv.F) {
expval--;
conv.F /= 10;
}
// the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters
unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U;
// in "%g" mode, "prec" is the number of *significant figures* not decimals
if (flags & FLAGS_ADAPT_EXP) {
// do we want to fall-back to "%f" mode?
if ((value >= 1e-4) && (value < 1e6)) {
if ((int)prec > expval) {
prec = (unsigned)((int)prec - expval - 1);
}
else {
prec = 0;
}
flags |= FLAGS_PRECISION; // make sure _ftoa respects precision
// no characters in exponent
minwidth = 0U;
expval = 0;
}
else {
// we use one sigfig for the whole part
if ((prec > 0) && (flags & FLAGS_PRECISION)) {
--prec;
}
}
}
// will everything fit?
unsigned int fwidth = width;
if (width > minwidth) {
// we didn't fall-back so subtract the characters required for the exponent
fwidth -= minwidth;
} else {
// not enough characters, so go back to default sizing
fwidth = 0U;
}
if ((flags & FLAGS_LEFT) && minwidth) {
// if we're padding on the right, DON'T pad the floating part
fwidth = 0U;
}
// rescale the float value
if (expval) {
value /= conv.F;
}
// output the floating part
const size_t start_idx = idx;
idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP);
// output the exponent part
// (minwidth is 0 only when "%g" fell back to fixed notation above)
if (minwidth) {
// output the exponential symbol
out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen);
// output the exponent value
idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS);
// might need to right-pad spaces
if (flags & FLAGS_LEFT) {
while (idx - start_idx < width) out(' ', buffer, idx++, maxlen);
}
}
return idx;
}
#endif // PRINTF_SUPPORT_EXPONENTIAL
#endif // PRINTF_SUPPORT_FLOAT
// internal vsnprintf
// Core formatter shared by all tiny_* entry points. Walks 'format', copies
// ordinary characters to the sink 'out' and expands each
// %[flags][width][.precision][length]specifier using the next argument(s)
// from 'va'.
// \param out    Output sink called once per produced character
// \param buffer Sink-specific destination passed through to 'out'; when NULL,
//               the null sink is used instead and nothing is stored
// \param maxlen Size limit passed through to 'out'
// \param format Format string
// \param va     Argument list
// \return The number of characters produced, not counting the terminating 0
//         (this can exceed maxlen, since the count keeps running after the
//         sink starts dropping characters)
static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va) {
unsigned int flags, width, precision, n;
size_t idx = 0U;
if (!buffer) {
// use null output function
out = _out_null;
}
while (*format)
{
// format specifier? %[flags][width][.precision][length]
if (*format != '%') {
// no
out(*format, buffer, idx++, maxlen);
format++;
continue;
}
else {
// yes, evaluate it
format++;
}
// evaluate flags
// (any number of flag characters in any order; 'n' says whether one was consumed)
flags = 0U;
do {
switch (*format) {
case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break;
case '-': flags |= FLAGS_LEFT; format++; n = 1U; break;
case '+': flags |= FLAGS_PLUS; format++; n = 1U; break;
case ' ': flags |= FLAGS_SPACE; format++; n = 1U; break;
case '#': flags |= FLAGS_HASH; format++; n = 1U; break;
default : n = 0U; break;
}
} while (n);
// evaluate width field
width = 0U;
if (_is_digit(*format)) {
width = _atoi(&format);
}
else if (*format == '*') {
// width supplied as an int argument
const int w = va_arg(va, int);
if (w < 0) {
flags |= FLAGS_LEFT; // reverse padding
width = (unsigned int)-w;
}
else {
width = (unsigned int)w;
}
format++;
}
// evaluate precision field
precision = 0U;
if (*format == '.') {
flags |= FLAGS_PRECISION;
format++;
if (_is_digit(*format)) {
precision = _atoi(&format);
}
else if (*format == '*') {
// precision supplied as an int argument; negative values count as 0
const int prec = (int)va_arg(va, int);
precision = prec > 0 ? (unsigned int)prec : 0U;
format++;
}
}
// evaluate length field
switch (*format) {
case 'l' :
flags |= FLAGS_LONG;
format++;
if (*format == 'l') {
flags |= FLAGS_LONG_LONG;
format++;
}
break;
case 'h' :
flags |= FLAGS_SHORT;
format++;
if (*format == 'h') {
flags |= FLAGS_CHAR;
format++;
}
break;
#if defined(PRINTF_SUPPORT_PTRDIFF_T)
case 't' :
flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
format++;
break;
#endif
case 'j' :
flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
format++;
break;
case 'z' :
flags |= (sizeof(size_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
format++;
break;
default :
break;
}
// evaluate specifier
switch (*format) {
case 'd' :
case 'i' :
case 'u' :
case 'x' :
case 'X' :
case 'o' :
case 'b' : {
// set the base
unsigned int base;
if (*format == 'x' || *format == 'X') {
base = 16U;
}
else if (*format == 'o') {
base = 8U;
}
else if (*format == 'b') {
base = 2U;
}
else {
base = 10U;
flags &= ~FLAGS_HASH; // no hash for dec format
}
// uppercase
if (*format == 'X') {
flags |= FLAGS_UPPERCASE;
}
// no plus or space flag for u, x, X, o, b
if ((*format != 'i') && (*format != 'd')) {
flags &= ~(FLAGS_PLUS | FLAGS_SPACE);
}
// ignore '0' flag when precision is given
if (flags & FLAGS_PRECISION) {
flags &= ~FLAGS_ZEROPAD;
}
// convert the integer
if ((*format == 'i') || (*format == 'd')) {
// signed
// the magnitude and the sign are passed separately to the ntoa helpers
// NOTE(review): '0 - value' overflows for the most negative value of each
// signed type -- confirm whether that input needs to be supported
if (flags & FLAGS_LONG_LONG) {
#if defined(PRINTF_SUPPORT_LONG_LONG)
const long long value = va_arg(va, long long);
idx = _ntoa_long_long(out, buffer, idx, maxlen, (unsigned long long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
#endif
}
else if (flags & FLAGS_LONG) {
const long value = va_arg(va, long);
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
}
else {
// char and short arguments arrive promoted to int and are narrowed back here
const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int);
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned int)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
}
}
else {
// unsigned
if (flags & FLAGS_LONG_LONG) {
#if defined(PRINTF_SUPPORT_LONG_LONG)
idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags);
#endif
}
else if (flags & FLAGS_LONG) {
idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags);
}
else {
const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int);
idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags);
}
}
format++;
break;
}
#if defined(PRINTF_SUPPORT_FLOAT)
case 'f' :
case 'F' :
if (*format == 'F') flags |= FLAGS_UPPERCASE;
idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
format++;
break;
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
case 'e':
case 'E':
case 'g':
case 'G':
if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP;
if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE;
idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
format++;
break;
#endif // PRINTF_SUPPORT_EXPONENTIAL
#endif // PRINTF_SUPPORT_FLOAT
case 'c' : {
// 'l' counts the columns used so far; the character itself takes one
unsigned int l = 1U;
// pre padding
if (!(flags & FLAGS_LEFT)) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
// char output
out((char)va_arg(va, int), buffer, idx++, maxlen);
// post padding
if (flags & FLAGS_LEFT) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
format++;
break;
}
case 's' : {
// NOTE(review): 'p' is used without a NULL check -- a NULL string argument
// is dereferenced below
const char* p = va_arg(va, char*);
unsigned int l = _strnlen_s(p, precision ? precision : (size_t)-1);
// pre padding
if (flags & FLAGS_PRECISION) {
l = (l < precision ? l : precision);
}
if (!(flags & FLAGS_LEFT)) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
// string output
// (with an explicit precision, at most 'precision' characters are copied)
while ((*p != 0) && (!(flags & FLAGS_PRECISION) || precision--)) {
out(*(p++), buffer, idx++, maxlen);
}
// post padding
if (flags & FLAGS_LEFT) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
format++;
break;
}
case 'p' : {
// pointers print as zero-padded uppercase hex, two digits per byte
width = sizeof(void*) * 2U;
flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE;
#if defined(PRINTF_SUPPORT_LONG_LONG)
const bool is_ll = sizeof(uintptr_t) == sizeof(long long);
if (is_ll) {
idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags);
}
else {
#endif
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags);
#if defined(PRINTF_SUPPORT_LONG_LONG)
}
#endif
format++;
break;
}
case '%' :
out('%', buffer, idx++, maxlen);
format++;
break;
default :
// unknown specifier: the character is emitted as-is
out(*format, buffer, idx++, maxlen);
format++;
break;
}
}
// termination
// when the output was truncated, the terminator goes into the last slot
out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen);
// return written chars without terminating \0
return (int)idx;
}
// printf replacement: formats the arguments and sends every character to the
// character sink (_out_char). Returns the value reported by _vsnprintf.
int tiny_printf(const char* format, ...) {
  // dummy non-NULL buffer, so that _vsnprintf keeps the character sink
  // instead of switching to the null sink
  char sink[1];
  va_list args;
  int written;

  va_start(args, format);
  written = _vsnprintf(_out_char, sink, (size_t)-1, format, args);
  va_end(args);

  return written;
}
// sprintf replacement: formats into 'buffer' with no length limit
// ((size_t)-1 is passed as the maximum). Returns the value reported by
// _vsnprintf.
int tiny_sprintf(char* buffer, const char* format, ...) {
  va_list args;
  int written;

  va_start(args, format);
  written = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, args);
  va_end(args);

  return written;
}
// snprintf replacement: formats into 'buffer', passing 'count' as the size
// limit. Returns the value reported by _vsnprintf.
int tiny_snprintf(char* buffer, size_t count, const char* format, ...) {
  va_list args;
  int written;

  va_start(args, format);
  written = _vsnprintf(_out_buffer, buffer, count, format, args);
  va_end(args);

  return written;
}
// vprintf replacement: like tiny_printf, but takes an already started
// argument list.
int tiny_vprintf(const char* format, va_list va) {
  // dummy non-NULL buffer, so that _vsnprintf keeps the character sink
  char sink[1];
  const int written = _vsnprintf(_out_char, sink, (size_t)-1, format, va);
  return written;
}
// vsnprintf replacement: like tiny_snprintf, but takes an already started
// argument list.
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va) {
  const int written = _vsnprintf(_out_buffer, buffer, count, format, va);
  return written;
}

View File

@@ -1,86 +0,0 @@
///////////////////////////////////////////////////////////////////////////////
// \author (c) Marco Paland (info@paland.com)
// 2014-2019, PALANDesign Hannover, Germany
//
// \license The MIT License (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// \brief Tiny printf, sprintf and snprintf implementation, optimized for speed on
// embedded systems with a very limited resources.
// Use this instead of bloated standard/newlib printf.
// These routines are thread safe and reentrant.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef _TINYPRINTF_H_
#define _TINYPRINTF_H_
#include <stdarg.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* Tiny printf implementation
* You have to implement _putchar if you use printf()
* To avoid conflicts with the regular printf() API it is overridden by macro defines
* and internal underscore-appended functions like printf_() are used
* \param format A string that specifies the format of the output
* \return The number of characters that are written into the array, not counting the terminating null character
*/
int tiny_printf(const char* format, ...);
/**
* Tiny sprintf implementation
* Due to security reasons (buffer overflow) YOU SHOULD CONSIDER USING (V)SNPRINTF INSTEAD!
* \param buffer A pointer to the buffer where to store the formatted string. MUST be big enough to store the output!
* \param format A string that specifies the format of the output
* \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
*/
int tiny_sprintf(char* buffer, const char* format, ...);
/**
* Tiny snprintf/vsnprintf implementation
* \param buffer A pointer to the buffer where to store the formatted string
* \param count The maximum number of characters to store in the buffer, including a terminating null character
* \param format A string that specifies the format of the output
* \param va A value identifying a variable arguments list
* \return The number of characters that COULD have been written into the buffer, not counting the terminating
* null character. A value equal or larger than count indicates truncation. Only when the returned value
* is non-negative and less than count, the string has been completely written.
*/
int tiny_snprintf(char* buffer, size_t count, const char* format, ...);
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va);
/**
* Tiny vprintf implementation
* \param format A string that specifies the format of the output
* \param va A value identifying a variable arguments list
* \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
*/
int tiny_vprintf(const char* format, va_list va);
#ifdef __cplusplus
}
#endif
#endif // _TINYPRINTF_H_

View File

@@ -1,27 +0,0 @@
#include <VX_config.h>
#include <vx_intrinsics.h>
#include <stdint.h>
// Copies four consecutive CSRs, starting at CSR number `s`, into
// csr_mem[d] .. csr_mem[d + 3]. Expands to four statements and relies on a
// variable named `csr_mem` being in scope at the expansion site.
#define DUMP_CSR_4(d, s) \
csr_mem[d + 0] = csr_read(s + 0); \
csr_mem[d + 1] = csr_read(s + 1); \
csr_mem[d + 2] = csr_read(s + 2); \
csr_mem[d + 3] = csr_read(s + 3);
// Copies 32 consecutive CSRs, starting at CSR number `s`, into
// csr_mem[d] .. csr_mem[d + 31] by expanding DUMP_CSR_4 eight times.
#define DUMP_CSR_32(d, s) \
DUMP_CSR_4(d + 0, s + 0) \
DUMP_CSR_4(d + 4, s + 4) \
DUMP_CSR_4(d + 8, s + 8) \
DUMP_CSR_4(d + 12, s + 12) \
DUMP_CSR_4(d + 16, s + 16) \
DUMP_CSR_4(d + 20, s + 20) \
DUMP_CSR_4(d + 24, s + 24) \
DUMP_CSR_4(d + 28, s + 28)
// Writes this core's counters to its 64-word slot at
// IO_CSR_ADDR + 64 * sizeof(uint32_t) * core_id:
// words 0..31 come from the 32 CSRs starting at CSR_MPM_BASE,
// words 32..63 from the 32 CSRs starting at CSR_MPM_BASE_H.
void vx_perf_dump() {
int core_id = vx_core_id();
// Base of the per-core slot; each slot holds 64 uint32_t words.
uint32_t* const csr_mem = (uint32_t*)(IO_CSR_ADDR + 64 * sizeof(uint32_t) * core_id);
DUMP_CSR_32(0, CSR_MPM_BASE)
DUMP_CSR_32(32, CSR_MPM_BASE_H)
}

View File

@@ -1,11 +0,0 @@
#include <VX_config.h>
# vx_putchar: store the low byte of a0 at
# IO_COUT_ADDR + (CSR_GTID & (IO_COUT_SIZE-1)).
# Clobbers t0 and t1. NOTE(review): presumably IO_COUT_ADDR is the console
# output region and IO_COUT_SIZE is a power of two; confirm in VX_config.h.
.type vx_putchar, @function
.global vx_putchar
vx_putchar:
csrr t0, CSR_GTID # t0 <- value of CSR_GTID
andi t0, t0, %lo(IO_COUT_SIZE-1) # keep only the bits selected by IO_COUT_SIZE-1
li t1, IO_COUT_ADDR # t1 <- IO_COUT_ADDR
add t0, t0, t1 # t0 <- IO_COUT_ADDR + masked id
sb a0, 0(t0) # write the character byte
ret

View File

@@ -1,94 +0,0 @@
#include <vx_print.h>
#include <vx_spawn.h>
#include <vx_intrinsics.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "tinyprintf.h"
#ifdef __cplusplus
extern "C" {
#endif
// Argument bundle handed to __vprintf_cb through vx_serial.
typedef struct {
const char* format; // format string forwarded to tiny_vprintf
va_list* va; // pointer to the argument list forwarded to tiny_vprintf
int ret; // receives the tiny_vprintf return value
} printf_arg_t;
// Argument bundle handed to __putint_cb through vx_serial.
typedef struct {
int value; // integer to print
int base; // numeric base passed to itoa
} putint_arg_t;
// Argument bundle handed to __putfloat_cb through vx_serial.
typedef struct {
float value; // number to print
int precision; // number of fractional digits requested
} putfloat_arg_t;
// vx_serial callback: prints arg->value in base arg->base, one character at a
// time through vx_putchar.
//
// The value is passed to itoa as an int. It was previously routed through a
// float, which silently rounded any magnitude above 2^24.
static void __putint_cb(const putint_arg_t* arg) {
  // Sized for a 32-bit value rendered in base 2 plus the terminating NUL.
  char tmp[33];
  itoa(arg->value, tmp, arg->base);
  for (int i = 0; i < 33 && tmp[i] != '\0'; ++i) {
    vx_putchar(tmp[i]);
  }
}
// vx_serial callback: prints arg->value as "<integer>.<fraction>" with
// arg->precision fractional digits (truncated, not rounded).
//
// Fixes over the previous version:
//  - fractional digits are emitted one by one, so leading zeros are kept
//    (1.05 with precision 2 prints "1.05", not "1.5");
//  - the sign is printed once up front, so -0.5 prints "-0.5" and -1.5 no
//    longer prints a second '-' inside the fraction.
// A precision of zero or less prints only the integer part.
static void __putfloat_cb(const putfloat_arg_t* arg) {
  float value = arg->value;
  int precision = arg->precision;

  if (value < 0.0f) {
    vx_putchar('-');
    value = -value;
  }

  int ipart = (int)value;
  vx_putint(ipart, 10);

  if (precision > 0) {
    vx_putchar('.');
    float frac = value - (float)ipart;
    for (int i = 0; i < precision; ++i) {
      // Shift the next decimal digit into the integer position and peel it off.
      frac *= 10.0f;
      int digit = (int)frac;
      vx_putchar('0' + digit);
      frac -= (float)digit;
    }
  }
}
// vx_serial callback: runs tiny_vprintf on the bundled format/argument list
// and stores its return value in arg->ret.
static void __vprintf_cb(printf_arg_t* arg) {
  const int written = tiny_vprintf(arg->format, *arg->va);
  arg->ret = written;
}
// Prints `value` in the given `base` by dispatching __putint_cb via vx_serial.
void vx_putint(int value, int base) {
  putint_arg_t request = { value, base };
  vx_serial((vx_serial_cb)__putint_cb, &request);
}
// Prints `value` with `precision` fractional digits by dispatching
// __putfloat_cb via vx_serial.
void vx_putfloat(float value, int precision) {
  putfloat_arg_t request = { value, precision };
  vx_serial((vx_serial_cb)__putfloat_cb, &request);
}
// Formats `format` with the arguments in `va` by dispatching __vprintf_cb via
// vx_serial. Returns the value tiny_vprintf stored in the argument bundle, or
// 0 if the callback never ran.
//
// The callback receives a pointer to a va_copy of `va`, not `&va`. When
// va_list is an array type, a va_list parameter decays to a pointer and `&va`
// does not have type `va_list*`; copying into a local is the portable way to
// obtain one.
int vx_vprintf(const char* format, va_list va) {
  va_list va_local;
  va_copy(va_local, va);

  printf_arg_t arg;
  arg.format = format;
  arg.va = &va_local;
  arg.ret = 0;
  vx_serial((vx_serial_cb)__vprintf_cb, &arg);

  va_end(va_local);
  return arg.ret;
}
// printf-style front end: gathers the variadic arguments and forwards them to
// vx_vprintf, returning its result.
int vx_printf(const char * format, ...) {
  va_list args;
  va_start(args, format);
  const int count = vx_vprintf(format, args);
  va_end(args);
  return count;
}
#ifdef __cplusplus
}
#endif

View File

@@ -1,37 +0,0 @@
#include <VX_config.h>
# vx_serial(callback, arg): for index = 0 .. NT-1, run callback(arg) only on
# the thread whose CSR_WTID equals index.
#
# The frame is 32 bytes (was 24) so that sp stays 16-byte aligned, as the
# RISC-V calling convention requires, while the C callback runs.
# Slot offsets are unchanged; the extra bytes are padding.
.type vx_serial, @function
.global vx_serial
vx_serial:
addi sp, sp, -32
sw ra, 20(sp)
sw s4, 16(sp)
sw s3, 12(sp)
sw s2, 8(sp)
sw s1, 4(sp)
sw s0, 0(sp)
mv s4, a0 # s4 <- callback
mv s3, a1 # s3 <- arg
csrr s2, CSR_NT # s2 <- NT
csrr s1, CSR_WTID # s1 <- tid
li s0, 0 # s0 <- index
label_loop:
sub t0, s0, s1 # t0 <- index - tid (zero on the selected thread)
seqz t1, t0 # t1 <- (index == tid)
.insn s 0x6b, 2, x0, 0(t1) # split on t1
bnez t0, label_join # threads with index != tid skip the call
mv a0, s3 # a0 <- arg
jalr s4 # callback(arg)
label_join:
.insn s 0x6b, 3, x0, 0(x0) # join
addi s0, s0, 1 # index++
blt s0, s2, label_loop # loop back
lw ra, 20(sp)
lw s4, 16(sp)
lw s3, 12(sp)
lw s2, 8(sp)
lw s1, 4(sp)
lw s0, 0(sp)
addi sp, sp, 32
ret

View File

@@ -1,307 +0,0 @@
#include <vx_spawn.h>
#include <vx_intrinsics.h>
#include <inttypes.h>
#ifdef __cplusplus
extern "C" {
#endif
#define NUM_CORES_MAX 32
#define MIN(a, b) ((a) < (b) ? (a) : (b))
// Per-core launch descriptor read by the spawn_tasks_* stubs.
typedef struct {
vx_spawn_tasks_cb callback; // invoked as callback(task_id, arg)
void * arg; // opaque pointer forwarded to callback
int offset; // first task id handled by the current launch phase
int N; // iterations every warp performs (per thread)
int R; // warps with wid < R perform one extra iteration
int NW; // warp count passed to vx_barrier at the end of the stub
} wspawn_tasks_args_t;
// Per-core launch descriptor read by the spawn_kernel_* stubs.
typedef struct {
context_t * ctx; // supplies num_groups[] and global_offset[]
vx_spawn_kernel_cb callback; // invoked as callback(arg, ctx, gid0, gid1, gid2)
void * arg; // opaque pointer forwarded to callback
int offset; // first workgroup id handled by the current launch phase
int N; // iterations every warp performs (per thread)
int R; // warps with wid < R perform one extra iteration
int NW; // warp count passed to vx_barrier at the end of the stub
char isXYpow2; // non-zero: divide by X*Y with a shift by log2XY
char isXpow2; // non-zero: divide by X with a shift by log2X
char log2XY; // shift amount used when isXYpow2 is set
char log2X; // shift amount used when isXpow2 is set
} wspawn_kernel_args_t;
// One descriptor pointer per core, indexed by vx_core_id(). The spawn entry
// points store the address of a stack-local descriptor here, so an entry is
// only meaningful while that call is still running.
void* g_wspawn_args[NUM_CORES_MAX];
// Returns 1 when x is a positive power of two, 0 otherwise.
//
// Declared `static inline`: a plain C99 `inline` definition emits no external
// symbol, so any call the compiler does not inline fails to link.
// Non-positive values are rejected up front; previously 0 was reported as a
// power of two and INT_MIN overflowed in `x - 1`.
static inline char is_log2(int x) {
  if (x <= 0)
    return 0;
  return ((x & (x-1)) == 0);
}
// Returns floor(log2(x)) for positive x by reading the biased exponent of x
// converted to float (bits 30..23, bias 127).
//
// The bits are read through a union instead of a pointer cast, which violated
// strict aliasing. Declared `static inline` so that a non-inlined call still
// links. Results for x <= 0 are unchanged from the previous implementation
// and are not meaningful.
static inline int fast_log2(int x) {
  union {
    float   f;
    int32_t i;
  } bits;
  bits.f = x;
  return (bits.i >> 23) - 127;
}
// Full-warp phase of vx_spawn_tasks: each thread runs a contiguous range of
// task ids derived from its warp id and thread id, then all participating
// warps meet at barrier 0.
static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
  int core_id = vx_core_id();
  int warp    = vx_warp_id();
  int lane    = vx_thread_id();
  int threads = vx_num_threads();

  wspawn_tasks_args_t* args = (wspawn_tasks_args_t*)g_wspawn_args[core_id];

  // Warp-iterations assigned to the warps that precede this one.
  int prior_iters = (args->N * warp) + MIN(args->R, warp);
  // Iterations for this thread: N, plus one more when wid < R.
  int per_thread  = args->N + (warp < args->R);

  int first = args->offset + (prior_iters * threads) + (lane * per_thread);
  int last  = first + per_thread;

  int task_id = first;
  while (task_id < last) {
    (args->callback)(task_id, args->arg);
    ++task_id;
  }

  // wait for all warps to complete
  vx_barrier(0, args->NW);
}
// Remainder phase of vx_spawn_tasks: each active thread runs exactly one task,
// whose id is the descriptor offset plus vx_thread_gid().
static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
  int core_id = vx_core_id();
  int gid     = vx_thread_gid();

  wspawn_tasks_args_t* args = (wspawn_tasks_args_t*)g_wspawn_args[core_id];

  (args->callback)(args->offset + gid, args->arg);
}
// Warp entry point for the full-warp phase of vx_spawn_tasks. It is both
// passed to vx_wspawn and called directly by the spawning warp.
static void spawn_tasks_all_cb() {
// activate all threads
vx_tmc(-1);
// call stub routine
spawn_tasks_all_stub();
// set warp0 to single-threaded and stop other warps
// (the mask passed to vx_tmc is 1 when wid == 0 and 0 otherwise)
int wid = vx_warp_id();
vx_tmc(0 == wid);
}
// Remainder phase of vx_spawn_tasks: runs the stub with the thread mask set
// to `thread_mask`, then restores a mask of 1.
static void spawn_tasks_rem_cb(int thread_mask) {
// activate threads
vx_tmc(thread_mask);
// call stub routine
spawn_tasks_rem_stub();
// back to single-threaded
vx_tmc(1);
}
// Runs callback(task_id, arg) for task ids spread across the cores, warps and
// threads of the device. Every core calls this; each core derives its own
// slice from vx_core_id().
//
// Work split on one core: `nW` full warps' worth of tasks run in the
// "all threads" phase (spawn_tasks_all_cb), then the remaining `rT` (< NT)
// tasks run in a single partial-warp phase (spawn_tasks_rem_cb).
//
// Fixes over the previous version:
//  - the division remainder goes to the last *active* core (nc-1). It used
//    to go to core NC-1, which has already returned whenever nc < NC, so
//    those tasks never ran;
//  - the remainder phase keeps the per-core base in its offset. It used to
//    restart from 0, giving wrong task ids on every core but core 0;
//  - a non-positive num_tasks returns immediately instead of reaching a
//    negative shift count.
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
  if (num_tasks <= 0)
    return;

  // device specs
  int NC = vx_num_cores();
  int NW = vx_num_warps();
  int NT = vx_num_threads();

  // current core id
  int core_id = vx_core_id();
  if (core_id >= NUM_CORES_MAX)
    return;

  // calculate necessary active cores
  int WT = NW * NT;
  int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
  int nc = MIN(nC, NC);
  if (core_id >= nc)
    return; // terminate extra cores

  // number of tasks per core
  int tasks_per_core = num_tasks / nc;
  int tasks_per_core0 = tasks_per_core;
  if (core_id == (nc-1)) {
    int QC_r = num_tasks - (nc * tasks_per_core0);
    tasks_per_core0 += QC_r; // last active core executes remaining tasks
  }

  // number of tasks per warp
  int nW = tasks_per_core0 / NT;            // total warps per core
  int rT = tasks_per_core0 - (nW * NT);     // remaining threads
  int fW = (nW >= NW) ? (nW / NW) : 0;      // full warps iterations
  int rW = (fW != 0) ? (nW - fW * NW) : 0;  // remaining warps
  if (0 == fW)
    fW = 1;

  // first task id owned by this core
  int core_offset = core_id * tasks_per_core;

  // The stubs reach this descriptor through g_wspawn_args; it lives on this
  // stack frame, so it is only valid until this function returns.
  wspawn_tasks_args_t wspawn_args = { callback, arg, core_offset, fW, rW, 0 };
  g_wspawn_args[core_id] = &wspawn_args;

  // full-warp phase
  if (nW >= 1) {
    int nw = MIN(nW, NW);
    wspawn_args.NW = nw;
    vx_wspawn(nw, spawn_tasks_all_cb);
    spawn_tasks_all_cb();
  }

  // partial-warp phase: the last rT tasks of this core
  if (rT != 0) {
    wspawn_args.offset = core_offset + (tasks_per_core0 - rT);
    int tmask = (1 << rT) - 1;
    spawn_tasks_rem_cb(tmask);
  }
}
///////////////////////////////////////////////////////////////////////////////
// Full-warp phase of vx_spawn_kernel: each thread walks a contiguous range of
// linear workgroup ids, converts each into (i, j, k) using the X and X*Y group
// counts, adds the context global offsets and invokes the kernel callback.
// All participating warps then meet at barrier 0.
static void __attribute__ ((noinline)) spawn_kernel_all_stub() {
  int core_id = vx_core_id();
  int warp    = vx_warp_id();
  int lane    = vx_thread_id();
  int threads = vx_num_threads();

  wspawn_kernel_args_t* args = (wspawn_kernel_args_t*)g_wspawn_args[core_id];

  // Warp-iterations assigned to the warps that precede this one.
  int prior_iters = (args->N * warp) + MIN(args->R, warp);
  // Iterations for this thread: N, plus one more when wid < R.
  int per_thread  = args->N + (warp < args->R);

  int first = args->offset + (prior_iters * threads) + (lane * per_thread);
  int last  = first + per_thread;

  int X  = args->ctx->num_groups[0];
  int Y  = args->ctx->num_groups[1];
  int XY = X * Y;

  int wg_id = first;
  while (wg_id < last) {
    // k = wg_id / XY, using a shift when XY is flagged as a power of two
    int k;
    if (args->isXYpow2) {
      k = wg_id >> args->log2XY;
    } else {
      k = wg_id / XY;
    }
    int wg_2d = wg_id - k * XY;

    // j = wg_2d / X, using a shift when X is flagged as a power of two
    int j;
    if (args->isXpow2) {
      j = wg_2d >> args->log2X;
    } else {
      j = wg_2d / X;
    }
    int i = wg_2d - j * X;

    int gid0 = args->ctx->global_offset[0] + i;
    int gid1 = args->ctx->global_offset[1] + j;
    int gid2 = args->ctx->global_offset[2] + k;

    (args->callback)(args->arg, args->ctx, gid0, gid1, gid2);
    ++wg_id;
  }

  // wait for all warps to complete
  vx_barrier(0, args->NW);
}
// Remainder phase of vx_spawn_kernel: each active thread handles the single
// workgroup whose linear id is the descriptor offset plus vx_thread_gid().
static void __attribute__ ((noinline)) spawn_kernel_rem_stub() {
  int core_id = vx_core_id();
  int gid     = vx_thread_gid();

  wspawn_kernel_args_t* args = (wspawn_kernel_args_t*)g_wspawn_args[core_id];

  int wg_id = args->offset + gid;

  int X  = args->ctx->num_groups[0];
  int Y  = args->ctx->num_groups[1];
  int XY = X * Y;

  // k = wg_id / XY, using a shift when XY is flagged as a power of two
  int k;
  if (args->isXYpow2) {
    k = wg_id >> args->log2XY;
  } else {
    k = wg_id / XY;
  }
  int wg_2d = wg_id - k * XY;

  // j = wg_2d / X, using a shift when X is flagged as a power of two
  int j;
  if (args->isXpow2) {
    j = wg_2d >> args->log2X;
  } else {
    j = wg_2d / X;
  }
  int i = wg_2d - j * X;

  int gid0 = args->ctx->global_offset[0] + i;
  int gid1 = args->ctx->global_offset[1] + j;
  int gid2 = args->ctx->global_offset[2] + k;

  (args->callback)(args->arg, args->ctx, gid0, gid1, gid2);
}
// Warp entry point for the full-warp phase of vx_spawn_kernel. It is both
// passed to vx_wspawn and called directly by the spawning warp.
static void spawn_kernel_all_cb() {
// activate all threads
vx_tmc(-1);
// call stub routine
spawn_kernel_all_stub();
// set warp0 to single-threaded and stop other warps
// (the mask passed to vx_tmc is 1 when wid == 0 and 0 otherwise)
int wid = vx_warp_id();
vx_tmc(0 == wid);
}
// Remainder phase of vx_spawn_kernel: runs the stub with the thread mask set
// to `thread_mask`, then restores a mask of 1.
static void spawn_kernel_rem_cb(int thread_mask) {
// activate threads
vx_tmc(thread_mask);
// call stub routine
spawn_kernel_rem_stub();
// back to single-threaded
vx_tmc(1);
}
// Runs callback(arg, ctx, gid0, gid1, gid2) once for every workgroup in the
// ctx->num_groups[0..2] grid, spread across the cores, warps and threads of
// the device. Every core calls this; each core derives its own slice of the
// linear workgroup range from vx_core_id().
//
// Work split on one core: `nW` full warps' worth of workgroups run in the
// "all threads" phase (spawn_kernel_all_cb), then the remaining `rT` (< NT)
// workgroups run in a single partial-warp phase (spawn_kernel_rem_cb).
//
// Fixes over the previous version:
//  - the division remainder goes to the last *active* core (nc-1). It used
//    to go to core NC-1, which has already returned whenever nc < NC, so
//    those workgroups never ran;
//  - the remainder phase keeps the per-core base in its offset. It used to
//    restart from 0, giving wrong workgroup ids on every core but core 0;
//  - an empty or invalid grid (Q <= 0) returns immediately.
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
  // total number of WGs
  int X = ctx->num_groups[0];
  int Y = ctx->num_groups[1];
  int Z = ctx->num_groups[2];
  int XY = X * Y;
  int Q = XY * Z;
  if (Q <= 0)
    return;

  // device specs
  int NC = vx_num_cores();
  int NW = vx_num_warps();
  int NT = vx_num_threads();

  // current core id
  int core_id = vx_core_id();
  if (core_id >= NUM_CORES_MAX)
    return;

  // calculate necessary active cores
  int WT = NW * NT;
  int nC = (Q > WT) ? (Q / WT) : 1;
  int nc = MIN(nC, NC);
  if (core_id >= nc)
    return; // terminate extra cores

  // number of workgroups per core
  int wgs_per_core = Q / nc;
  int wgs_per_core0 = wgs_per_core;
  if (core_id == (nc-1)) {
    int QC_r = Q - (nc * wgs_per_core0);
    wgs_per_core0 += QC_r; // last active core executes remaining WGs
  }

  // number of workgroups per warp
  int nW = wgs_per_core0 / NT;              // total warps per core
  int rT = wgs_per_core0 - (nW * NT);       // remaining threads
  int fW = (nW >= NW) ? (nW / NW) : 0;      // full warps iterations
  int rW = (fW != 0) ? (nW - fW * NW) : 0;  // remaining full warps
  if (0 == fW)
    fW = 1;

  // fast path handling: replace divisions by shifts for power-of-two sizes
  char isXYpow2 = is_log2(XY);
  char isXpow2  = is_log2(X);
  char log2XY   = fast_log2(XY);
  char log2X    = fast_log2(X);

  // first workgroup id owned by this core
  int core_offset = core_id * wgs_per_core;

  // The stubs reach this descriptor through g_wspawn_args; it lives on this
  // stack frame, so it is only valid until this function returns.
  wspawn_kernel_args_t wspawn_args = {
    ctx, callback, arg, core_offset, fW, rW, 0, isXYpow2, isXpow2, log2XY, log2X
  };
  g_wspawn_args[core_id] = &wspawn_args;

  // full-warp phase
  if (nW >= 1) {
    int nw = MIN(nW, NW);
    wspawn_args.NW = nw;
    vx_wspawn(nw, spawn_kernel_all_cb);
    spawn_kernel_all_cb();
  }

  // partial-warp phase: the last rT workgroups of this core
  if (rT != 0) {
    wspawn_args.offset = core_offset + (wgs_per_core0 - rT);
    int tmask = (1 << rT) - 1;
    spawn_kernel_rem_cb(tmask);
  }
}
#ifdef __cplusplus
}
#endif

View File

@@ -1,110 +0,0 @@
#include <VX_config.h>
# Program entry point, placed in .init.
# Sequence: run init_regs on all warps, run __init_tls on all warps,
# zero the range [_edata, _end), register __libc_fini_array with atexit,
# run __libc_init_array, call main, then tail-call exit.
.section .init, "ax"
.global _start
.type _start, @function
_start:
# initialize per-thread registers
csrr a0, CSR_NW # get num warps
la a1, init_regs
.insn s 0x6b, 1, a1, 0(a0) # wspawn a0, a1
jal init_regs
# return back to single thread execution
li a0, 1
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
# initialize TLS for all warps
csrr a0, CSR_NW # get num warps
la a1, __init_tls
.insn s 0x6b, 1, a1, 0(a0) # wspawn a0, a1
call __init_tls
# return back to single thread execution
li a0, 1
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
# clear BSS segment: memset(_edata, 0, _end - _edata)
la a0, _edata
la a2, _end
sub a2, a2, a0
li a1, 0
call memset
# Initialize trap vector
# a t0, trap_entry
# csrw mtvec, t0
# Register global termination functions
la a0, __libc_fini_array
# to be called upon exit
call atexit
# Run global initialization functions
call __libc_init_array
# call main program routine
call main
# call exit routine (a0 still holds the return value of main)
tail exit
.size _start, .-_start
# _exit: dump the performance counters, copy the exit code into gp and
# deactivate all threads of the calling warp (tmc with a zero mask).
# NOTE(review): presumably gp is where the host or simulator reads the exit
# status; confirm against the runtime that consumes it.
.section .text
.type _exit, @function
.global _exit
_exit:
mv s0, a0 # keep the exit code across the call below
call vx_perf_dump
mv gp, s0 # gp <- exit code
li a0, 0
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
# init_regs: per-thread register setup, run on every warp by _start.
# Activates all threads, then sets gp, sp and tp for each thread and finally
# deactivates every warp whose local warp id is non-zero.
.section .text
.type init_regs, @function
.global init_regs
init_regs:
# activate all threads
li a0, -1
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
# set global pointer register
# (relaxation is disabled so this load is not itself rewritten relative to gp)
.option push
.option norelax
la gp, __global_pointer
.option pop
# give each thread its own stack region below SMEM_BASE_ADDR
# set stack pointer: sp = SMEM_BASE_ADDR - (thread id << STACK_LOG2_SIZE)
li sp, SMEM_BASE_ADDR # load stack base address
#if SM_ENABLE
csrr a0, CSR_LTID # get local thread id
#else
csrr a0, CSR_GTID # get global thread id
#endif
sll a1, a0, STACK_LOG2_SIZE
sub sp, sp, a1
# set thread pointer register
# use address space after BSS region
# ensure cacheline alignment
# tp = (_end + 63 + thread id * __tcb_aligned_size) & -64
la a1, __tcb_aligned_size
mul a0, a0, a1
la tp, _end + 63
add tp, tp, a0
and tp, tp, -64
# disable active warps except warp0
csrr a3, CSR_LWID # get local wid
beqz a3, RETURN
li a0, 0
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
RETURN:
ret
# Weak, zero-initialized definition of __dso_handle, so the symbol resolves
# when nothing else in the link provides it.
.section .data
.global __dso_handle
.weak __dso_handle
__dso_handle:
.long 0

View File

@@ -1,109 +0,0 @@
#include <sys/stat.h>
#include <newlib.h>
#include <unistd.h>
#include <vx_intrinsics.h>
#include <vx_print.h>
#include <string.h>
// Minimal file-related system call stubs. None of them touches its arguments;
// each just returns a fixed status.

// Always fails.
int _close(int file) {
  (void)file;
  return -1;
}

// Always fails; *st is left untouched.
int _fstat(int file, struct stat *st) {
  (void)file;
  (void)st;
  return -1;
}

// Reports that no descriptor is a terminal.
int _isatty(int file) {
  (void)file;
  return 0;
}

// Always reports position 0.
int _lseek(int file, int ptr, int dir) {
  (void)file;
  (void)ptr;
  (void)dir;
  return 0;
}

// Always fails.
int _open(const char *name, int flags, int mode) {
  (void)name;
  (void)flags;
  (void)mode;
  return -1;
}

// Always fails; the buffer is left untouched.
int _read(int file, char *ptr, int len) {
  (void)file;
  (void)ptr;
  (void)len;
  return -1;
}
// Heap-growth stub: executes an `ebreak` instruction instead of extending the
// heap, and returns 0 if execution continues past it. `incr` is ignored.
caddr_t _sbrk(int incr) {
__asm__ __volatile__("ebreak");
return 0;
}
// Sends `len` bytes starting at `ptr` to vx_putchar, one at a time, and
// returns `len`. The descriptor `file` is ignored.
int _write(int file, char *ptr, int len) {
  char *cursor = ptr;
  int remaining = len;
  while (remaining > 0) {
    vx_putchar(*cursor++);
    --remaining;
  }
  return len;
}
// Signal delivery stub: always fails, both arguments are ignored.
int _kill(int pid, int sig) {
  (void)pid;
  (void)sig;
  return -1;
}
// Reports the value of vx_warp_gid() as the process id.
int _getpid() {
  const int warp_gid = vx_warp_gid();
  return warp_gid;
}
// Initializes the thread-local storage block of every thread in the calling
// warp: copies the initialized TLS image to the address held in `tp` and
// zeroes the TLS BSS part that follows it.
void __init_tls(void) {
// Linker-provided symbols. Apart from __tdata_start, the symbol *addresses*
// are used below as integer values (sizes and an offset), not as pointers.
extern char __tdata_start[];
extern char __tbss_offset[];
extern char __tdata_size[];
extern char __tbss_size[];
// activate all threads
vx_tmc(-1);
// TLS memory initialization
// (__thread_self is bound to the tp register)
register char *__thread_self __asm__ ("tp");
memcpy(__thread_self, __tdata_start, (size_t)__tdata_size);
memset(__thread_self + (size_t)__tbss_offset, 0, (size_t)__tbss_size);
// back to single thread execution
// (the mask passed to vx_tmc is 1 on warp 0 and 0 on every other warp)
vx_tmc(0 == vx_warp_id());
}
#ifdef HAVE_INITFINI_ARRAY
/* These magic symbols are provided by the linker. */
extern void (*__preinit_array_start []) (void) __attribute__((weak));
extern void (*__preinit_array_end []) (void) __attribute__((weak));
extern void (*__init_array_start []) (void) __attribute__((weak));
extern void (*__init_array_end []) (void) __attribute__((weak));
#ifdef HAVE_INIT_FINI
extern void _init (void);
#endif
/* Iterate over all the init routines. */
/* Run every pre-init routine, then _init (when HAVE_INIT_FINI is defined),
   then every init routine, each list in ascending order. */
void __libc_init_array (void) {
  const size_t preinit_total = __preinit_array_end - __preinit_array_start;
  size_t idx = 0;
  while (idx < preinit_total) {
    __preinit_array_start[idx] ();
    ++idx;
  }
#ifdef HAVE_INIT_FINI
  _init ();
#endif
  const size_t init_total = __init_array_end - __init_array_start;
  idx = 0;
  while (idx < init_total) {
    __init_array_start[idx] ();
    ++idx;
  }
}
#endif
#ifdef HAVE_INITFINI_ARRAY
extern void (*__fini_array_start []) (void) __attribute__((weak));
extern void (*__fini_array_end []) (void) __attribute__((weak));
#ifdef HAVE_INIT_FINI
extern void _fini (void);
#endif
/* Run all the cleanup routines. */
/* Run every cleanup routine in descending order, then _fini (when
   HAVE_INIT_FINI is defined). */
void __libc_fini_array (void) {
  size_t remaining = __fini_array_end - __fini_array_start;
  while (remaining != 0) {
    --remaining;
    __fini_array_start[remaining] ();
  }
#ifdef HAVE_INIT_FINI
  _fini ();
#endif
}
#endif

19
runtime/stub/Makefile Normal file
View File

@@ -0,0 +1,19 @@
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I../include -I../../runtime -I../../hw -I../../sim/common
CXXFLAGS += -fPIC
LDFLAGS += -shared -pthread
SRCS = vortex.cpp ../common/utils.cpp
PROJECT = libvortex.so
all: $(PROJECT)
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
clean:
rm -rf $(PROJECT) obj_dir

58
runtime/stub/vortex.cpp Normal file
View File

@@ -0,0 +1,58 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vortex.h>
// Stub driver: every entry point ignores its arguments and reports failure
// (-1). No output parameter is ever written.

// Opening a device always fails.
extern int vx_dev_open(vx_device_h* /*hdevice*/) {
  return -1;
}

// Closing a device always fails.
extern int vx_dev_close(vx_device_h /*hdevice*/) {
  return -1;
}

// Capability queries always fail.
extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t* /*value*/) {
  return -1;
}

// Device memory allocation always fails.
extern int vx_mem_alloc(vx_device_h /*hdevice*/, uint64_t /*size*/, int /*type*/, uint64_t* /*dev_addr*/) {
  return -1;
}

// Device memory release always fails.
extern int vx_mem_free(vx_device_h /*hdevice*/, uint64_t /*dev_addr*/) {
  return -1;
}

// Memory statistics always fail. This used to return 0 (success) without
// writing *mem_free or *mem_used, so a caller trusting the status would read
// uninitialized values.
extern int vx_mem_info(vx_device_h /*hdevice*/, int /*type*/, uint64_t* /*mem_free*/, uint64_t* /*mem_used*/) {
  return -1;
}

// Host-to-device copies always fail.
extern int vx_copy_to_dev(vx_device_h /*hdevice*/, uint64_t /*dev_addr*/, const void* /*host_ptr*/, uint64_t /*size*/) {
  return -1;
}

// Device-to-host copies always fail.
extern int vx_copy_from_dev(vx_device_h /*hdevice*/, void* /*host_ptr*/, uint64_t /*dev_addr*/, uint64_t /*size*/) {
  return -1;
}

// Starting execution always fails.
extern int vx_start(vx_device_h /*hdevice*/) {
  return -1;
}

// Waiting for completion always fails.
extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) {
  return -1;
}

// DCR writes always fail.
extern int vx_dcr_write(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint64_t /*value*/) {
  return -1;
}

33
runtime/xrt/Makefile Normal file
View File

@@ -0,0 +1,33 @@
CXXFLAGS += -std=c++14 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -I../include -I../common -I../../hw -I$(XILINX_XRT)/include -I../../sim/common
CXXFLAGS += -fPIC
LDFLAGS += -shared -pthread
LDFLAGS += -L$(XILINX_XRT)/lib -luuid -lxrt_coreutil
SRCS = vortex.cpp ../common/utils.cpp ../../sim/common/util.cpp
PROJECT = libvortex.so
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
# Enable scope logic analyzer
ifdef SCOPE
CXXFLAGS += -DSCOPE
SRCS += ../common/scope.cpp
endif
all: $(PROJECT)
$(PROJECT): $(SRCS) $(SCOPE_JSON)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
clean:
rm -rf $(PROJECT) obj_dir

915
runtime/xrt/vortex.cpp Normal file
View File

@@ -0,0 +1,915 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vortex.h>
#include <malloc.h>
#include <utils.h>
#include <VX_config.h>
#include <VX_types.h>
#include <stdarg.h>
#include <util.h>
#include <limits>
#include <unordered_map>
#ifdef SCOPE
#include "scope.h"
#endif
// XRT includes
#include "experimental/xrt_bo.h"
#include "experimental/xrt_ip.h"
#include "experimental/xrt_device.h"
#include "experimental/xrt_kernel.h"
#include "experimental/xrt_xclbin.h"
#include "experimental/xrt_error.h"
#define CPP_API
//#define BANK_INTERLEAVE
#define MMIO_CTL_ADDR 0x00
#define MMIO_DEV_ADDR 0x10
#define MMIO_ISA_ADDR 0x1C
#define MMIO_DCR_ADDR 0x28
#define MMIO_SCP_ADDR 0x34
#define MMIO_MEM_ADDR 0x40
#define CTL_AP_START (1<<0)
#define CTL_AP_DONE (1<<1)
#define CTL_AP_IDLE (1<<2)
#define CTL_AP_READY (1<<3)
#define CTL_AP_RESET (1<<4)
#define CTL_AP_RESTART (1<<7)
// Memory layout of one supported board family.
struct platform_info_t {
const char* prefix_name; // device names starting with this prefix match the entry
uint8_t lg2_num_banks; // log2 of the number of memory banks
uint8_t lg2_bank_size; // log2 of the size of one bank, in bytes
uint64_t mem_base; // device address of bank 0
};
// Known platforms, searched in order by get_platform_info().
static const platform_info_t g_platforms [] = {
{"xilinx_u50", 4, 0x1C, 0x0},
{"xilinx_u200", 4, 0x1C, 0x0},
{"xilinx_u280", 4, 0x1C, 0x0},
{"xilinx_vck5000", 0, 0x21, 0xC000000000},
};
#ifdef CPP_API
typedef xrt::device xrt_device_t;
typedef xrt::ip xrt_kernel_t;
typedef xrt::bo xrt_buffer_t;
#else
typedef xrtDeviceHandle xrt_device_t;
typedef xrtKernelHandle xrt_kernel_t;
typedef xrtBufferHandle xrt_buffer_t;
#endif
#define RAM_PAGE_SIZE 4096
#define DEFAULT_DEVICE_INDEX 0
#define DEFAULT_XCLBIN_PATH "vortex_afu.xclbin"
#define KERNEL_NAME "vortex_afu"
#ifndef NDEBUG
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
#else
#define DBGPRINT(format, ...) ((void)0)
#endif
// Declares `handle` in the enclosing scope, initialized from `_expr`. If the
// result is null, prints an error and runs `_cleanup`. Deliberately not
// wrapped in do/while so that `handle` stays visible after the macro.
#define CHECK_HANDLE(handle, _expr, _cleanup) \
auto handle = _expr; \
if (handle == nullptr) { \
printf("[VXDRV] Error: '%s' returned NULL!\n", #_expr); \
_cleanup \
}
// Evaluates `_expr` once. A zero result means success; any other value is
// printed as an error and `_cleanup` runs with the value available as `err`.
#define CHECK_ERR(_expr, _cleanup) \
do { \
auto err = _expr; \
if (err == 0) \
break; \
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
_cleanup \
} while (false)
using namespace vortex;
#ifndef CPP_API
// Prints the XRT description of error `err` on device `xrtDevice`.
//
// The message length is queried first, then the text is fetched into a
// buffer with one extra zero byte, so the string is always terminated
// whether or not the reported length includes the terminator. When no text
// is available nothing is printed (previously a null pointer could reach
// printf's %s).
static void dump_xrt_error(xrtDeviceHandle xrtDevice, xrtErrorCode err) {
  size_t len = 0;
  xrtErrorGetString(xrtDevice, err, nullptr, 0, &len);
  if (len == 0)
    return;
  std::vector<char> buf(len + 1, '\0');
  xrtErrorGetString(xrtDevice, err, buf.data(), len, nullptr);
  printf("[VXDRV] detail: %s!\n", buf.data());
}
#endif
// Looks up the first g_platforms entry whose prefix_name is a prefix of
// `device_name` and copies it to *platform_info.
// Returns 0 on a match, -1 when no entry matches.
static int get_platform_info(const std::string& device_name, platform_info_t* platform_info) {
  for (const auto& candidate : g_platforms) {
    // rfind(prefix, 0) == 0 holds exactly when device_name starts with prefix
    const bool has_prefix = (device_name.rfind(candidate.prefix_name, 0) == 0);
    if (!has_prefix)
      continue;
    *platform_info = candidate;
    return 0;
  }
  return -1;
}
/*static void wait_for_enter(const std::string &msg) {
std::cout << msg << std::endl;
std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
}*/
///////////////////////////////////////////////////////////////////////////////
class vx_device {
public:
vx_device(xrt_device_t& device, xrt_kernel_t& kernel, const platform_info_t& platform)
: xrtDevice_(device)
, xrtKernel_(kernel)
, platform_(platform)
{}
#ifndef CPP_API
~vx_device() {
for (auto& entry : xrtBuffers_) {
#ifdef BANK_INTERLEAVE
xrtBOFree(entry);
#else
xrtBOFree(entry.second.xrtBuffer);
#endif
}
if (xrtKernel_) {
xrtKernelClose(xrtKernel_);
}
if (xrtDevice_) {
xrtDeviceClose(xrtDevice_);
}
}
#endif
int init() {
CHECK_ERR(this->write_register(MMIO_CTL_ADDR, CTL_AP_RESET), {
return -1;
});
uint32_t num_banks = 1 << platform_.lg2_num_banks;
uint64_t bank_size = 1ull << platform_.lg2_bank_size;
for (uint32_t i = 0; i < num_banks; ++i) {
uint32_t reg_addr = MMIO_MEM_ADDR + (i * 12);
uint64_t reg_value = platform_.mem_base + i * bank_size;
CHECK_ERR(this->write_register(reg_addr, reg_value & 0xffffffff), {
return -1;
});
CHECK_ERR(this->write_register(reg_addr + 4, (reg_value >> 32) & 0xffffffff), {
return -1;
});
#ifndef BANK_INTERLEAVE
break;
#endif
}
CHECK_ERR(this->read_register(MMIO_DEV_ADDR, (uint32_t*)&this->dev_caps), {
return -1;
});
CHECK_ERR(this->read_register(MMIO_DEV_ADDR + 4, (uint32_t*)&this->dev_caps + 1), {
return -1;
});
CHECK_ERR(this->read_register(MMIO_ISA_ADDR, (uint32_t*)&this->isa_caps), {
return -1;
});
CHECK_ERR(this->read_register(MMIO_ISA_ADDR + 4, (uint32_t*)&this->isa_caps + 1), {
return -1;
});
this->global_mem_size = num_banks * bank_size;
this->global_mem_ = std::make_shared<vortex::MemoryAllocator>(
ALLOC_BASE_ADDR, ALLOC_MAX_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE);
uint64_t local_mem_size = 0;
vx_dev_caps(this, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size);
if (local_mem_size <= 1) {
this->local_mem_ = std::make_shared<vortex::MemoryAllocator>(
SMEM_BASE_ADDR, local_mem_size, RAM_PAGE_SIZE, 1);
}
#ifdef BANK_INTERLEAVE
xrtBuffers_.reserve(num_banks);
for (uint32_t i = 0; i < num_banks; ++i) {
#ifdef CPP_API
xrtBuffers_.emplace_back(xrtDevice_, bank_size, xrt::bo::flags::normal, i);
#else
CHECK_HANDLE(xrtBuffer, xrtBOAlloc(xrtDevice_, bank_size, XRT_BO_FLAGS_NONE, i), {
return -1;
});
xrtBuffers_.push_back(xrtBuffer);
#endif
printf("*** allocated bank%u/%u, size=%lu\n", i, num_banks, bank_size);
}
#endif
return 0;
}
int mem_alloc(uint64_t size, int type, uint64_t* dev_addr) {
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
uint64_t addr;
if (type == VX_MEM_TYPE_GLOBAL) {
CHECK_ERR(global_mem_->allocate(asize, &addr), {
return -1;
});
#ifndef BANK_INTERLEAVE
uint32_t bank_id;
CHECK_ERR(this->get_bank_info(addr, &bank_id, nullptr), {
return -1;
});
CHECK_ERR(get_buffer(bank_id, nullptr), {
return -1;
});
#endif
} else if (type == VX_MEM_TYPE_LOCAL) {
if CHECK_ERR(local_mem_->allocate(asize, &addr), {
return -1;
});
} else {
return -1;
}
*dev_addr = addr;
return 0;
}
int mem_free(uint64_t dev_addr) {
if (dev_addr >= SMEM_BASE_ADDR) {
CHECK_ERR(local_mem_->release(dev_addr), {
return -1;
});
} else {
CHECK_ERR(global_mem_->release(dev_addr), {
return -1;
});
#ifdef BANK_INTERLEAVE
if (0 == global_mem_->allocated()) {
#ifndef CPP_API
for (auto& entry : xrtBuffers_) {
xrtBOFree(entry);
}
#endif
xrtBuffers_.clear();
}
#else
uint32_t bank_id;
CHECK_ERR(this->get_bank_info(dev_addr, &bank_id, nullptr), {
return -1;
});
auto it = xrtBuffers_.find(bank_id);
if (it != xrtBuffers_.end()) {
auto count = --it->second.count;
if (0 == count) {
printf("freeing bank%d...\n", bank_id);
#ifndef CPP_API
xrtBOFree(it->second.xrtBuffer);
#endif
xrtBuffers_.erase(it);
}
} else {
fprintf(stderr, "[VXDRV] Error: invalid device memory address: 0x%lx\n", dev_addr);
return -1;
}
#endif
}
return 0;
}
int mem_info(int type, uint64_t* mem_free, uint64_t* mem_used) const {
if (type == VX_MEM_TYPE_GLOBAL) {
if (mem_free)
*mem_free = global_mem_->free();
if (mem_used)
*mem_used = global_mem_->allocated();
} else if (type == VX_MEM_TYPE_LOCAL) {
if (mem_free)
*mem_free = local_mem_->free();
if (mem_used)
*mem_free = local_mem_->allocated();
} else {
return -1;
}
return 0;
}
int write_register(uint32_t addr, uint32_t value) {
#ifdef CPP_API
xrtKernel_.write_register(addr, value);
#else
CHECK_ERR(xrtKernelWriteRegister(xrtKernel_, addr, value), {
dump_xrt_error(xrtDevice_, err);
return -1;
});
#endif
DBGPRINT("*** write_register: addr=0x%x, value=0x%x\n", addr, value);
return 0;
}
int read_register(uint32_t addr, uint32_t* value) {
#ifdef CPP_API
*value = xrtKernel_.read_register(addr);
#else
CHECK_ERR(xrtKernelReadRegister(xrtKernel_, addr, value), {
dump_xrt_error(xrtDevice_, err);
return -1;
});
#endif
DBGPRINT("*** read_register: addr=0x%x, value=0x%x\n", addr, *value);
return 0;
}
int upload(uint64_t dev_addr, uint8_t* host_ptr, uint64_t asize) {
for (uint64_t end = dev_addr + asize; dev_addr < end;
dev_addr += CACHE_BLOCK_SIZE,
host_ptr += CACHE_BLOCK_SIZE) {
#ifdef BANK_INTERLEAVE
asize = CACHE_BLOCK_SIZE;
#else
end = 0;
#endif
uint32_t bo_index;
uint64_t bo_offset;
xrt_buffer_t xrtBuffer;
CHECK_ERR(this->get_bank_info(dev_addr, &bo_index, &bo_offset), {
return -1;
});
CHECK_ERR(this->get_buffer(bo_index, &xrtBuffer), {
return -1;
});
#ifdef CPP_API
xrtBuffer.write(host_ptr, asize, bo_offset);
xrtBuffer.sync(XCL_BO_SYNC_BO_TO_DEVICE, asize, bo_offset);
#else
CHECK_ERR(xrtBOWrite(xrtBuffer, host_ptr, asize, bo_offset), {
dump_xrt_error(xrtDevice_, err);
return -1;
});
CHECK_ERR(xrtBOSync(xrtBuffer, XCL_BO_SYNC_BO_TO_DEVICE, asize, bo_offset), {
dump_xrt_error(xrtDevice_, err);
return -1;
});
#endif
}
return 0;
}
int download(uint8_t* host_ptr, uint64_t dev_addr, uint64_t asize) {
for (uint64_t end = dev_addr + asize; dev_addr < end;
dev_addr += CACHE_BLOCK_SIZE,
host_ptr += CACHE_BLOCK_SIZE) {
#ifdef BANK_INTERLEAVE
asize = CACHE_BLOCK_SIZE;
#else
end = 0;
#endif
uint32_t bo_index;
uint64_t bo_offset;
xrt_buffer_t xrtBuffer;
CHECK_ERR(this->get_bank_info(dev_addr, &bo_index, &bo_offset), {
return -1;
});
CHECK_ERR(this->get_buffer(bo_index, &xrtBuffer), {
return -1;
});
#ifdef CPP_API
xrtBuffer.sync(XCL_BO_SYNC_BO_FROM_DEVICE, asize, bo_offset);
xrtBuffer.read(host_ptr, asize, bo_offset);
#else
CHECK_ERR(xrtBOSync(xrtBuffer, XCL_BO_SYNC_BO_FROM_DEVICE, asize, bo_offset), {
dump_xrt_error(xrtDevice_, err);
return -1;
});
CHECK_ERR(xrtBORead(xrtBuffer, host_ptr, asize, bo_offset), {
dump_xrt_error(xrtDevice_, err);
return -1;
});
#endif
}
return 0;
}
DeviceConfig dcrs;
uint64_t dev_caps;
uint64_t isa_caps;
uint64_t global_mem_size;
private:
xrt_device_t xrtDevice_;
xrt_kernel_t xrtKernel_;
const platform_info_t platform_;
std::shared_ptr<vortex::MemoryAllocator> global_mem_;
std::shared_ptr<vortex::MemoryAllocator> local_mem_;
#ifdef BANK_INTERLEAVE
std::vector<xrt_buffer_t> xrtBuffers_;
int get_bank_info(uint64_t addr, uint32_t* pIdx, uint64_t* pOff) {
uint32_t num_banks = 1 << platform_.lg2_num_banks;
uint64_t block_addr = addr / CACHE_BLOCK_SIZE;
uint32_t index = block_addr & (num_banks-1);
uint64_t offset = (block_addr >> platform_.lg2_num_banks) * CACHE_BLOCK_SIZE;
if (pIdx) {
*pIdx = index;
}
if (pOff) {
*pOff = offset;
}
printf("get_bank_info(addr=0x%lx, bank=%d, offset=0x%lx\n", addr, index, offset);
return 0;
}
int get_buffer(uint32_t bank_id, xrt_buffer_t* pBuf) {
if (pBuf) {
*pBuf = xrtBuffers_.at(bank_id);
}
return 0;
}
#else
struct buf_cnt_t {
xrt_buffer_t xrtBuffer;
uint32_t count;
};
std::unordered_map<uint32_t, buf_cnt_t> xrtBuffers_;
int get_bank_info(uint64_t addr, uint32_t* pIdx, uint64_t* pOff) {
uint32_t num_banks = 1 << platform_.lg2_num_banks;
uint64_t bank_size = 1ull << platform_.lg2_bank_size;
uint32_t index = addr >> platform_.lg2_bank_size;
uint64_t offset = addr & (bank_size-1);
if (index > num_banks) {
fprintf(stderr, "[VXDRV] Error: address out of range: 0x%lx\n", addr);
return -1;
}
if (pIdx) {
*pIdx = index;
}
if (pOff) {
*pOff = offset;
}
printf("get_bank_info(addr=0x%lx, bank=%d, offset=0x%lx\n", addr, index, offset);
return 0;
}
int get_buffer(uint32_t bank_id, xrt_buffer_t* pBuf) {
auto it = xrtBuffers_.find(bank_id);
if (it != xrtBuffers_.end()) {
if (pBuf) {
*pBuf = it->second.xrtBuffer;
} else {
printf("reusing bank%d...\n", bank_id);
++it->second.count;
}
} else {
printf("allocating bank%d...\n", bank_id);
uint64_t bank_size = 1ull << platform_.lg2_bank_size;
#ifdef CPP_API
xrt::bo xrtBuffer(xrtDevice_, bank_size, xrt::bo::flags::normal, bank_id);
#else
CHECK_HANDLE(xrtBuffer, xrtBOAlloc(xrtDevice_, bank_size, XRT_BO_FLAGS_NONE, bank_id), {
return -1;
});
#endif
xrtBuffers_.insert({bank_id, {xrtBuffer, 1}});
if (pBuf) {
*pBuf = xrtBuffer;
}
}
return 0;
}
#endif
};
///////////////////////////////////////////////////////////////////////////////
// Queries a device capability.
//   hdevice : device handle
//   caps_id : one of the VX_CAPS_* identifiers handled below
//   value   : receives the capability value
// Returns 0 on success and -1 if hdevice or value is null. An unknown caps_id
// is reported on stderr and aborts the process.
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  if (nullptr == hdevice || nullptr == value)
    return -1;

  vx_device *device = ((vx_device*)hdevice);

  // dev_caps is a packed word: [7:0] version, [15:8] threads, [23:16] warps,
  // [39:24] cores, [47:40] log2 of the local memory size.
  switch (caps_id) {
  case VX_CAPS_VERSION:
    *value = (device->dev_caps >> 0) & 0xff;
    break;
  case VX_CAPS_NUM_THREADS:
    *value = (device->dev_caps >> 8) & 0xff;
    break;
  case VX_CAPS_NUM_WARPS:
    *value = (device->dev_caps >> 16) & 0xff;
    break;
  case VX_CAPS_NUM_CORES:
    *value = (device->dev_caps >> 24) & 0xffff;
    break;
  case VX_CAPS_CACHE_LINE_SIZE:
    *value = CACHE_BLOCK_SIZE;
    break;
  case VX_CAPS_GLOBAL_MEM_SIZE:
    *value = device->global_mem_size;
    break;
  case VX_CAPS_LOCAL_MEM_SIZE:
    *value = 1ull << ((device->dev_caps >> 40) & 0xff);
    break;
  case VX_CAPS_KERNEL_BASE_ADDR:
    // The startup address is held in two 32-bit DCRs (high word in ADDR1).
    *value = (uint64_t(device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) |
                       device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
    break;
  case VX_CAPS_ISA_FLAGS:
    *value = device->isa_caps;
    break;
  default:
    fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id);
    std::abort();
    return -1;
  }

  return 0;
}
// Opens the XRT device, loads the xclbin, opens the kernel and creates the
// driver-side device object.
//   hdevice : receives the new device handle on success
// The device index is taken from the XRT_DEVICE_INDEX environment variable
// (default DEFAULT_DEVICE_INDEX) and the bitstream path from XRT_XCLBIN_PATH
// (default DEFAULT_XCLBIN_PATH).
// Returns 0 on success and a non-zero value on failure; on failure every
// resource acquired so far is released and *hdevice is left untouched.
extern int vx_dev_open(vx_device_h* hdevice) {
  if (nullptr == hdevice)
    return -1;

  int device_index = DEFAULT_DEVICE_INDEX;
  const char* device_index_s = getenv("XRT_DEVICE_INDEX");
  if (device_index_s != nullptr) {
    device_index = atoi(device_index_s);
  }

  const char* xlbin_path_s = getenv("XRT_XCLBIN_PATH");
  if (xlbin_path_s == nullptr) {
    xlbin_path_s = DEFAULT_XCLBIN_PATH;
  }

#ifdef CPP_API

  auto xrtDevice = xrt::device(device_index);
  auto uuid = xrtDevice.load_xclbin(xlbin_path_s);
  auto xrtKernel = xrt::ip(xrtDevice, uuid, KERNEL_NAME);
  auto xclbin = xrt::xclbin(xlbin_path_s);

  auto device_name = xrtDevice.get_info<xrt::info::device::name>();

  /*{
  uint32_t num_banks = 0;
  uint64_t bank_size = 0;
  uint64_t mem_base = 0;
  auto mem_json = nlohmann::json::parse(xrtDevice.get_info<xrt::info::device::memory>());
  if (!mem_json.is_null()) {
  uint32_t index = 0;
  for (auto& mem : mem_json["board"]["memory"]["memories"]) {
  auto enabled = mem["enabled"].get<std::string>();
  if (enabled == "true") {
  if (index == 0) {
  mem_base = std::stoull(mem["base_address"].get<std::string>(), nullptr, 16);
  bank_size = std::stoull(mem["range_bytes"].get<std::string>(), nullptr, 16);
  }
  ++index;
  }
  }
  num_banks = index;
  }
  fprintf(stderr, "[VXDRV] memory description: base=0x%lx, size=0x%lx, count=%d\n", mem_base, bank_size, num_banks);
  }*/

  /*{
  std::cout << "Device" << device_index << " : " << xrtDevice.get_info<xrt::info::device::name>() << std::endl;
  std::cout << " bdf : " << xrtDevice.get_info<xrt::info::device::bdf>() << std::endl;
  std::cout << " kdma : " << xrtDevice.get_info<xrt::info::device::kdma>() << std::endl;
  std::cout << " max_freq : " << xrtDevice.get_info<xrt::info::device::max_clock_frequency_mhz>() << std::endl;
  std::cout << " memory : " << xrtDevice.get_info<xrt::info::device::memory>() << std::endl;
  std::cout << " thermal : " << xrtDevice.get_info<xrt::info::device::thermal>() << std::endl;
  std::cout << " m2m : " << std::boolalpha << xrtDevice.get_info<xrt::info::device::m2m>() << std::dec << std::endl;
  std::cout << " nodma : " << std::boolalpha << xrtDevice.get_info<xrt::info::device::nodma>() << std::dec << std::endl;
  std::cout << "Memory info :" << std::endl;
  for (const auto& mem_bank : xclbin.get_mems()) {
  std::cout << " index : " << mem_bank.get_index() << std::endl;
  std::cout << " tag : " << mem_bank.get_tag() << std::endl;
  std::cout << " type : " << (int)mem_bank.get_type() << std::endl;
  std::cout << " base_address : 0x" << std::hex << mem_bank.get_base_address() << std::endl;
  std::cout << " size : 0x" << (mem_bank.get_size_kb() * 1000) << std::dec << std::endl;
  std::cout << " used :" << mem_bank.get_used() << std::endl;
  }
  }*/

  // get platform info
  platform_info_t platform_info;
  CHECK_ERR(get_platform_info(device_name, &platform_info), {
    fprintf(stderr, "[VXDRV] Error: platform not supported: %s\n", device_name.c_str());
    return -1;
  });

  CHECK_HANDLE(device, new vx_device(xrtDevice, xrtKernel, platform_info), {
    return -1;
  });

#else

  CHECK_HANDLE(xrtDevice, xrtDeviceOpen(device_index), {
    return -1;
  });

  CHECK_ERR(xrtDeviceLoadXclbinFile(xrtDevice, xlbin_path_s), {
    dump_xrt_error(xrtDevice, err);
    xrtDeviceClose(xrtDevice);
    return -1;
  });

  xuid_t uuid;
  CHECK_ERR(xrtDeviceGetXclbinUUID(xrtDevice, uuid), {
    dump_xrt_error(xrtDevice, err);
    xrtDeviceClose(xrtDevice);
    return -1;
  });

  CHECK_HANDLE(xrtKernel, xrtPLKernelOpenExclusive(xrtDevice, uuid, KERNEL_NAME), {
    xrtDeviceClose(xrtDevice);
    return -1;
  });

  // NOTE(review): the results of the two xrtXclbinGetXSAName() calls are not
  // checked, so device_name_size is used uninitialized if the first one
  // fails - confirm whether that can happen on supported platforms.
  int device_name_size;
  xrtXclbinGetXSAName(xrtDevice, nullptr, 0, &device_name_size);
  std::vector<char> device_name(device_name_size);
  xrtXclbinGetXSAName(xrtDevice, device_name.data(), device_name_size, nullptr);

  // get platform info
  platform_info_t platform_info;
  CHECK_ERR(get_platform_info(device_name.data(), &platform_info), {
    fprintf(stderr, "[VXDRV] Error: platform not supported: %s\n", device_name.data());
    // release the handles opened above; they are not owned by a device yet
    xrtKernelClose(xrtKernel);
    xrtDeviceClose(xrtDevice);
    return -1;
  });

  CHECK_HANDLE(device, new vx_device(xrtDevice, xrtKernel, platform_info), {
    xrtKernelClose(xrtKernel);
    xrtDeviceClose(xrtDevice);
    return -1;
  });

#endif

  // initialize device
  CHECK_ERR(device->init(), {
    // same cleanup as the later failure paths: do not leak the device object
    delete device;
    return -1;
  });

#ifdef SCOPE
  {
    // The scope tap is accessed through a 64-bit MMIO register that is read
    // and written as two 32-bit halves.
    scope_callback_t callback;
    callback.registerWrite = [](vx_device_h hdevice, uint64_t value)->int {
      auto device = (vx_device*)hdevice;
      uint32_t value_lo = (uint32_t)(value);
      uint32_t value_hi = (uint32_t)(value >> 32);
      CHECK_ERR(device->write_register(MMIO_SCP_ADDR, value_lo), {
        return -1;
      });
      CHECK_ERR(device->write_register(MMIO_SCP_ADDR + 4, value_hi), {
        return -1;
      });
      return 0;
    };
    callback.registerRead = [](vx_device_h hdevice, uint64_t* value)->int {
      auto device = (vx_device*)hdevice;
      uint32_t value_lo, value_hi;
      CHECK_ERR(device->read_register(MMIO_SCP_ADDR, &value_lo), {
        return -1;
      });
      CHECK_ERR(device->read_register(MMIO_SCP_ADDR + 4, &value_hi), {
        return -1;
      });
      *value = (((uint64_t)value_hi) << 32) | value_lo;
      return 0;
    };
    int ret = vx_scope_start(&callback, device, 0, -1);
    if (ret != 0) {
      delete device;
      return ret;
    }
  }
#endif

  CHECK_ERR(dcr_initialize(device), {
    delete device;
    return -1;
  });

#ifdef DUMP_PERF_STATS
  perf_add_device(device);
#endif

  *hdevice = device;

  DBGPRINT("device creation complete!\n");

  return 0;
}
// Destroys a device created by vx_dev_open().
// Returns 0 on success, -1 if hdevice is null.
extern int vx_dev_close(vx_device_h hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }

#ifdef SCOPE
  // stop the scope before the device object goes away
  vx_scope_stop(hdevice);
#endif

  vx_device* const dev = (vx_device*)hdevice;
  delete dev;

  DBGPRINT("device destroyed!\n");
  return 0;
}
// Allocates device memory.
//   hdevice  : device handle
//   size     : requested size, must be non-zero
//   type     : memory type, forwarded unchanged to the device
//   dev_addr : receives the allocated device address
// Returns -1 on invalid arguments, otherwise the result of the device's
// mem_alloc().
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
  const bool bad_args = (hdevice == nullptr)
                     || (dev_addr == nullptr)
                     || (size == 0);
  if (bad_args) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->mem_alloc(size, type, dev_addr);
}
// Releases device memory.
//   hdevice  : device handle
//   dev_addr : address to free; 0 is accepted and ignored
// Returns -1 if hdevice is null, 0 if dev_addr is 0, otherwise the result of
// the device's mem_free().
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
  if (hdevice == nullptr) {
    return -1;
  }
  if (dev_addr == 0) {
    // nothing to release
    return 0;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->mem_free(dev_addr);
}
// Reports memory usage for one memory type.
//   hdevice  : device handle
//   type     : memory type, forwarded unchanged to the device
//   mem_free : forwarded to the device's mem_info()
//   mem_used : forwarded to the device's mem_info()
// Returns -1 if hdevice is null, otherwise the result of the device's
// mem_info().
extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
  if (hdevice == nullptr) {
    return -1;
  }
  vx_device* const dev = (vx_device*)hdevice;
  return dev->mem_info(type, mem_free, mem_used);
}
// Copies a host buffer to device memory.
//   hdevice  : device handle
//   dev_addr : destination device address, must be CACHE_BLOCK_SIZE aligned
//   host_ptr : source host buffer
//   size     : number of bytes; rounded up to a multiple of CACHE_BLOCK_SIZE
// Returns 0 on success, -1 on a null handle, a misaligned address, a range
// outside global memory, or an upload failure.
// NOTE(review): the rounded-up size is what is passed to upload(), so when
// size is not block-aligned the host buffer may be read past size bytes -
// confirm against upload()'s contract.
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
  if (nullptr == hdevice)
    return -1;

  auto device = (vx_device*)hdevice;

  // check alignment
  if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
    return -1;

  auto asize = aligned_size(size, CACHE_BLOCK_SIZE);

  // bound checking, written so that neither the rounding above nor
  // dev_addr + asize can wrap around and slip past the check
  if (asize < size
   || dev_addr > device->global_mem_size
   || asize > device->global_mem_size - dev_addr)
    return -1;

  CHECK_ERR(device->upload(dev_addr, host_ptr, asize), {
    return -1;
  });

  DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size);

  return 0;
}
// Copies device memory to a host buffer.
//   hdevice  : device handle
//   host_ptr : destination host buffer
//   dev_addr : source device address, must be CACHE_BLOCK_SIZE aligned
//   size     : number of bytes; rounded up to a multiple of CACHE_BLOCK_SIZE
// Returns 0 on success, -1 on a null handle, a misaligned address, a range
// outside global memory, or a download failure.
// NOTE(review): the rounded-up size is what is passed to download(), so when
// size is not block-aligned the host buffer may be written past size bytes -
// confirm against download()'s contract.
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
  if (nullptr == hdevice)
    return -1;

  auto device = (vx_device*)hdevice;

  // check alignment
  if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
    return -1;

  auto asize = aligned_size(size, CACHE_BLOCK_SIZE);

  // bound checking, written so that neither the rounding above nor
  // dev_addr + asize can wrap around and slip past the check
  if (asize < size
   || dev_addr > device->global_mem_size
   || asize > device->global_mem_size - dev_addr)
    return -1;

  CHECK_ERR(device->download(host_ptr, dev_addr, asize), {
    return -1;
  });

  DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, asize);

  return 0;
}
// Starts execution by writing CTL_AP_START to the control register.
// Returns 0 on success, -1 if hdevice is null or the register write fails.
extern int vx_start(vx_device_h hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* const dev = (vx_device*)hdevice;

  //wait_for_enter("\nPress ENTER to continue after setting up ILA trigger...");

  CHECK_ERR(dev->write_register(MMIO_CTL_ADDR, CTL_AP_START), {
    return -1;
  });

  DBGPRINT("START\n");
  return 0;
}
// Polls the control register until CTL_AP_DONE is set or the timeout budget
// is used up.
//   hdevice : device handle
//   timeout : wait budget; it is reduced by the polling interval expressed in
//             milliseconds after every sleep
// Returns 0 when the device reports done and also when the budget runs out;
// returns -1 on a null handle or a register read failure.
// NOTE(review): callers cannot tell completion from timeout by the return
// value; kept as is for compatibility.
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
  if (nullptr == hdevice)
    return -1;

  auto device = (vx_device*)hdevice;

  // polling interval: 1 s in debug builds, 1 ms otherwise
  struct timespec sleep_time;

#ifndef NDEBUG
  sleep_time.tv_sec = 1;
  sleep_time.tv_nsec = 0;
#else
  sleep_time.tv_sec = 0;
  sleep_time.tv_nsec = 1000000;
#endif

  // to milliseconds
  uint64_t sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);

  for (;;) {
    uint32_t status = 0;
    CHECK_ERR(device->read_register(MMIO_CTL_ADDR, &status), {
      return -1;
    });
    bool is_done = (status & CTL_AP_DONE) == CTL_AP_DONE;
    if (is_done || 0 == timeout) {
      break;
    }
    nanosleep(&sleep_time, nullptr);
    // Saturate at zero: timeout is unsigned, so a plain subtraction would wrap
    // to a huge value whenever less than one polling interval remains and the
    // loop would then never time out.
    timeout = (timeout > sleep_time_ms) ? (timeout - sleep_time_ms) : 0;
  }

  return 0;
}
// Writes a device configuration register (DCR).
//   hdevice : device handle
//   addr    : DCR address
//   value   : value to write
// The address goes to MMIO_DCR_ADDR and the value to MMIO_DCR_ADDR + 4; the
// value is then recorded in the device's dcrs copy.
// Returns 0 on success, -1 if hdevice is null or a register write fails.
// NOTE(review): value is 64-bit here but other register accesses in this file
// use 32-bit values - confirm whether the upper half is meant to be dropped.
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
  if (hdevice == nullptr) {
    return -1;
  }

  vx_device* const dev = (vx_device*)hdevice;

  CHECK_ERR(dev->write_register(MMIO_DCR_ADDR, addr), {
    return -1;
  });

  CHECK_ERR(dev->write_register(MMIO_DCR_ADDR + 4, value), {
    return -1;
  });

  DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%lx\n", addr, value);

  // save the value
  dev->dcrs.write(addr, value);

  return 0;
}