Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes
This commit is contained in:
0
runtime/.gitignore
vendored
0
runtime/.gitignore
vendored
@@ -1,49 +1,25 @@
|
||||
XLEN ?= 32
|
||||
all: stub rtlsim simx opae
|
||||
|
||||
ifeq ($(XLEN),32)
|
||||
RISCV_TOOLCHAIN_PATH = /opt/riscv-gnu-toolchain
|
||||
else
|
||||
RISCV_TOOLCHAIN_PATH = /opt/riscv64-gnu-toolchain
|
||||
endif
|
||||
stub:
|
||||
$(MAKE) -C stub
|
||||
|
||||
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
|
||||
simx:
|
||||
$(MAKE) -C simx
|
||||
|
||||
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc
|
||||
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc-ar
|
||||
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objdump
|
||||
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objcopy
|
||||
rtlsim:
|
||||
$(MAKE) -C rtlsim
|
||||
|
||||
ifeq ($(XLEN),32)
|
||||
CFLAGS += -march=rv32imf -mabi=ilp32f
|
||||
else
|
||||
CFLAGS += -march=rv64imfd -mabi=lp64d
|
||||
endif
|
||||
opae:
|
||||
$(MAKE) -C opae
|
||||
|
||||
CFLAGS += -O3 -mcmodel=medany -Wstack-usage=1024 -fno-exceptions -fdata-sections -ffunction-sections
|
||||
CFLAGS += -I./include -I../hw
|
||||
|
||||
PROJECT = libvortexrt
|
||||
|
||||
SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c
|
||||
|
||||
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
|
||||
|
||||
all: $(PROJECT).a $(PROJECT).dump
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).a
|
||||
$(DP) -D $(PROJECT).a > $(PROJECT).dump
|
||||
|
||||
%.S.o: src/%.S
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
%.c.o: src/%.c
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
$(PROJECT).a: $(OBJS)
|
||||
$(AR) rcs $@ $^
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CC) $(CFLAGS) -MM $^ > .depend;
|
||||
xrt:
|
||||
$(MAKE) -C xrt
|
||||
|
||||
clean:
|
||||
rm -rf *.a *.o *.dump .depend
|
||||
$(MAKE) clean -C stub
|
||||
$(MAKE) clean -C simx
|
||||
$(MAKE) clean -C rtlsim
|
||||
$(MAKE) clean -C opae
|
||||
$(MAKE) clean -C xrt
|
||||
|
||||
.PHONY: all stub simx rtlsim opae xrt clean
|
||||
455
runtime/common/malloc.h
Normal file
455
runtime/common/malloc.h
Normal file
@@ -0,0 +1,455 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
// Bookkeeping-only allocator for a contiguous address range.
//
// The allocator never touches the managed memory itself; it only tracks which
// [addr, addr + size) ranges are in use. The range is carved into pages
// (multiples of pageAlign) that are created on demand at nextAddress_, and
// each page is carved into blocks (multiples of blockAlign). Addresses stored
// in pages/blocks are offsets from baseAddress_; callers see absolute
// addresses (baseAddress_ + offset).
class MemoryAllocator {
public:
  // baseAddress: absolute address of the first managed byte.
  // capacity:    number of managed bytes.
  // pageAlign:   page granularity, must be a power of two.
  // blockAlign:  block granularity, must be a power of two.
  MemoryAllocator(
    uint64_t baseAddress,
    uint64_t capacity,
    uint32_t pageAlign,
    uint32_t blockAlign)
    : baseAddress_(baseAddress)
    , capacity_(capacity)
    , pageAlign_(pageAlign)
    , blockAlign_(blockAlign)
    , pages_(nullptr)
    , nextAddress_(0)
    , allocated_(0)
  {}

  // Releases all bookkeeping objects, including those describing blocks that
  // were never passed to release(). Every free block is linked in the M-list
  // and every used block in the used list, so walking those two lists visits
  // each block_t exactly once.
  ~MemoryAllocator() {
    page_t* currPage = pages_;
    while (currPage) {
      auto nextPage = currPage->next;
      FreeBlockObjects(currPage);
      delete currPage;
      currPage = nextPage;
    }
    pages_ = nullptr;
  }

  // Absolute address of the first managed byte.
  uint64_t baseAddress() const {
    return baseAddress_;
  }

  // Total number of managed bytes.
  uint64_t capacity() const {
    return capacity_;
  }

  // Bytes not currently handed out (capacity minus allocated).
  uint64_t free() const {
    return (capacity_ - allocated_);
  }

  // Bytes currently handed out, after block alignment.
  uint64_t allocated() const {
    return allocated_;
  }

  // Reserves `size` bytes (rounded up to blockAlign) and stores the absolute
  // address in *addr. Returns 0 on success, -1 on invalid arguments or when
  // no page can satisfy the request.
  int allocate(uint64_t size, uint64_t* addr) {
    if (size == 0 || addr == nullptr) {
      printf("error: invalid argurments\n");
      return -1;
    }

    // Align allocation size
    size = AlignSize(size, blockAlign_);

    // Walk thru all pages to find a free block
    block_t* freeBlock = nullptr;
    auto currPage = pages_;
    while (currPage) {
      auto currBlock = currPage->freeSList;
      if (currBlock) {
        // The free S-list is sorted with the largest block first, so the
        // head tells us whether this page can satisfy the request at all.
        if (currBlock->size >= size) {
          // Find the smallest matching block in the S-list
          while (currBlock->nextFreeS
              && (currBlock->nextFreeS->size >= size)) {
            currBlock = currBlock->nextFreeS;
          }
          // Return the free block
          freeBlock = currBlock;
          break;
        }
      }
      currPage = currPage->next;
    }

    if (nullptr == freeBlock) {
      // Allocate a new page for this request
      currPage = this->NewPage(size);
      if (nullptr == currPage) {
        printf("error: out of memory\n");
        return -1;
      }
      freeBlock = currPage->freeSList;
    }

    // Remove the block from the free lists
    assert(freeBlock->size >= size);
    currPage->RemoveFreeMList(freeBlock);
    currPage->RemoveFreeSList(freeBlock);

    // If the free block we have found is larger than what we are looking for,
    // we may be able to split our free block in two.
    uint64_t extraBytes = freeBlock->size - size;
    if (extraBytes >= blockAlign_) {
      // Reduce the free block size to the requested value
      freeBlock->size = size;

      // Allocate a new block to contain the extra buffer
      auto nextAddr = freeBlock->addr + size;
      auto newBlock = new block_t(nextAddr, extraBytes);

      // Add the new block to the free lists
      currPage->InsertFreeMList(newBlock);
      currPage->InsertFreeSList(newBlock);
    }

    // Insert the free block into the used list
    currPage->InsertUsedList(freeBlock);

    // Return the free block address
    *addr = baseAddress_ + freeBlock->addr;

    // Update allocated size
    allocated_ += size;

    return 0;
  }

  // Releases the block that starts exactly at absolute address `addr`.
  // Adjacent free blocks are merged and empty top-most pages are returned.
  // Returns 0 on success, -1 if no used block starts at `addr`.
  int release(uint64_t addr) {
    // Walk all pages to find the pointer
    uint64_t local_addr = addr - baseAddress_;
    block_t* usedBlock = nullptr;
    auto currPage = pages_;
    while (currPage) {
      if (local_addr >= currPage->addr
       && local_addr < (currPage->addr + currPage->size)) {
        auto currBlock = currPage->usedList;
        while (currBlock) {
          if (currBlock->addr == local_addr) {
            usedBlock = currBlock;
            break;
          }
          currBlock = currBlock->nextUsed;
        }
        break;
      }
      currPage = currPage->next;
    }

    // found the corresponding block?
    if (nullptr == usedBlock) {
      // uint64_t is not 'unsigned long' on every platform; cast explicitly.
      printf("error: invalid address to release: 0x%llx\n", (unsigned long long)addr);
      return -1;
    }

    // Remember the size now: usedBlock may be merged and deleted below.
    auto size = usedBlock->size;

    // Remove the block from the used list
    currPage->RemoveUsedList(usedBlock);

    // Insert the block into the free M-list.
    currPage->InsertFreeMList(usedBlock);

    // Check if we can merge adjacent free blocks from the left.
    if (usedBlock->prevFreeM) {
      // Calculate the previous address
      auto prevAddr = usedBlock->prevFreeM->addr + usedBlock->prevFreeM->size;
      if (usedBlock->addr == prevAddr) {
        auto prevBlock = usedBlock->prevFreeM;

        // Merge the blocks to the left
        prevBlock->size += usedBlock->size;
        prevBlock->nextFreeM = usedBlock->nextFreeM;
        if (prevBlock->nextFreeM) {
          prevBlock->nextFreeM->prevFreeM = prevBlock;
        }

        // Detach previous block from the free S-list since size increased
        currPage->RemoveFreeSList(prevBlock);

        // reset usedBlock
        delete usedBlock;
        usedBlock = prevBlock;
      }
    }

    // Check if we can merge adjacent free blocks from the right.
    if (usedBlock->nextFreeM) {
      // Calculate the next allocation start address
      auto nextAddr = usedBlock->addr + usedBlock->size;
      if (usedBlock->nextFreeM->addr == nextAddr) {
        auto nextBlock = usedBlock->nextFreeM;

        // Merge the blocks to the right
        usedBlock->size += nextBlock->size;
        usedBlock->nextFreeM = nextBlock->nextFreeM;
        if (usedBlock->nextFreeM) {
          usedBlock->nextFreeM->prevFreeM = usedBlock;
        }

        // Delete next block
        currPage->RemoveFreeSList(nextBlock);
        delete nextBlock;
      }
    }

    // Insert the block into the free S-list.
    currPage->InsertFreeSList(usedBlock);

    // Check if we can free empty pages
    if (nullptr == currPage->usedList) {
      // Try to delete the page, then keep going while another empty page
      // has become the top-most one.
      while (currPage && this->DeletePage(currPage)) {
        currPage = this->FindNextEmptyPage();
      }
    }

    // update allocated size
    allocated_ -= size;

    return 0;
  }

private:

  // Describes one [addr, addr + size) range inside a page. A block is either
  // in the used list, or in both free lists (S and M) of its page.
  struct block_t {
    block_t* nextFreeS;
    block_t* prevFreeS;

    block_t* nextFreeM;
    block_t* prevFreeM;

    block_t* nextUsed;
    block_t* prevUsed;

    uint64_t addr;
    uint64_t size;

    block_t(uint64_t addr, uint64_t size)
      : nextFreeS(nullptr)
      , prevFreeS(nullptr)
      , nextFreeM(nullptr)
      , prevFreeM(nullptr)
      , nextUsed(nullptr)
      , prevUsed(nullptr)
      , addr(addr)
      , size(size)
    {}
  };

  // Describes one page and owns the three block lists carved out of it.
  struct page_t {
    page_t* next;

    // List of used blocks
    block_t* usedList;

    // List with blocks sorted by decreasing sizes
    // Used for block lookup during memory allocation.
    block_t* freeSList;

    // List with blocks sorted by increasing memory addresses
    // Used for block merging during memory release.
    block_t* freeMList;

    uint64_t addr;
    uint64_t size;

    // A new page starts with a single free block covering the whole page.
    page_t(uint64_t addr, uint64_t size) :
      next(nullptr),
      usedList(nullptr),
      addr(addr),
      size(size) {
      freeSList = freeMList = new block_t(addr, size);
    }

    // Pushes the block at the head of the used list.
    void InsertUsedList(block_t* block) {
      block->nextUsed = usedList;
      if (usedList) {
        usedList->prevUsed = block;
      }
      usedList = block;
    }

    // Unlinks the block from the used list and clears its used links.
    void RemoveUsedList(block_t* block) {
      if (block->prevUsed) {
        block->prevUsed->nextUsed = block->nextUsed;
      } else {
        usedList = block->nextUsed;
      }
      if (block->nextUsed) {
        block->nextUsed->prevUsed = block->prevUsed;
      }
      block->nextUsed = nullptr;
      block->prevUsed = nullptr;
    }

    // Inserts the block into the M-list, keeping it sorted by address.
    void InsertFreeMList(block_t* block) {
      block_t* currBlock = freeMList;
      block_t* prevBlock = nullptr;
      while (currBlock && (currBlock->addr < block->addr)) {
        prevBlock = currBlock;
        currBlock = currBlock->nextFreeM;
      }
      block->nextFreeM = currBlock;
      block->prevFreeM = prevBlock;
      if (prevBlock) {
        prevBlock->nextFreeM = block;
      } else {
        freeMList = block;
      }
      if (currBlock) {
        currBlock->prevFreeM = block;
      }
    }

    // Unlinks the block from the M-list and clears its M links.
    void RemoveFreeMList(block_t* block) {
      if (block->prevFreeM) {
        block->prevFreeM->nextFreeM = block->nextFreeM;
      } else {
        freeMList = block->nextFreeM;
      }
      if (block->nextFreeM) {
        block->nextFreeM->prevFreeM = block->prevFreeM;
      }
      block->nextFreeM = nullptr;
      block->prevFreeM = nullptr;
    }

    // Inserts the block into the S-list, keeping it sorted by decreasing size.
    void InsertFreeSList(block_t* block) {
      block_t* currBlock = this->freeSList;
      block_t* prevBlock = nullptr;
      while (currBlock && (currBlock->size > block->size)) {
        prevBlock = currBlock;
        currBlock = currBlock->nextFreeS;
      }
      block->nextFreeS = currBlock;
      block->prevFreeS = prevBlock;
      if (prevBlock) {
        prevBlock->nextFreeS = block;
      } else {
        this->freeSList = block;
      }
      if (currBlock) {
        currBlock->prevFreeS = block;
      }
    }

    // Unlinks the block from the S-list and clears its S links.
    void RemoveFreeSList(block_t* block) {
      if (block->prevFreeS) {
        block->prevFreeS->nextFreeS = block->nextFreeS;
      } else {
        freeSList = block->nextFreeS;
      }
      if (block->nextFreeS) {
        block->nextFreeS->prevFreeS = block->prevFreeS;
      }
      block->nextFreeS = nullptr;
      block->prevFreeS = nullptr;
    }
  };

  // Deletes every block_t owned by the page (used and free).
  static void FreeBlockObjects(page_t* page) {
    block_t* block = page->usedList;
    while (block) {
      auto nextBlock = block->nextUsed;
      delete block;
      block = nextBlock;
    }
    block = page->freeMList;
    while (block) {
      auto nextBlock = block->nextFreeM;
      delete block;
      block = nextBlock;
    }
    page->usedList = nullptr;
    page->freeMList = nullptr;
    page->freeSList = nullptr;
  }

  // Creates a page of at least `size` bytes (rounded up to pageAlign) at
  // nextAddress_ and links it at the head of the page list. Returns nullptr,
  // leaving the allocator state untouched, when the page does not fit.
  page_t* NewPage(uint64_t size) {
    // Add padding to ensure page alignment
    uint64_t pageSize = AlignSize(size, pageAlign_);

    // Reject requests whose aligned size wrapped around or that exceed the
    // remaining capacity. nextAddress_ never exceeds capacity_, so the
    // subtraction cannot underflow.
    if (pageSize < size || pageSize > (capacity_ - nextAddress_))
      return nullptr;

    // Allocate page memory
    auto addr = nextAddress_;
    nextAddress_ += pageSize;

    // Allocate object
    auto newPage = new page_t(addr, pageSize);

    // Insert the new page into the list
    newPage->next = pages_;
    pages_ = newPage;

    return newPage;
  }

  // Deletes an empty page if it is the top-most one (ends at nextAddress_),
  // giving its range back. Returns false, without changes, otherwise.
  bool DeletePage(page_t* page) {
    // The page should be empty
    assert(nullptr == page->usedList);
    assert(page->freeMList && (nullptr == page->freeMList->nextFreeM));

    // Only delete top-level pages
    auto nextAddr = page->addr + page->size;
    if (nextAddr != nextAddress_)
      return false;

    // Remove the page from the list
    page_t* prevPage = nullptr;
    auto currPage = pages_;
    while (currPage) {
      if (currPage == page) {
        if (prevPage) {
          prevPage->next = currPage->next;
        } else {
          pages_ = currPage->next;
        }
        break;
      }
      prevPage = currPage;
      currPage = currPage->next;
    }

    // Update next allocation address
    nextAddress_ = page->addr;

    // free object
    delete page->freeMList;
    delete page;

    return true;
  }

  // Returns the first page in the list without used blocks, or nullptr.
  page_t* FindNextEmptyPage() {
    auto currPage = pages_;
    while (currPage) {
      if (nullptr == currPage->usedList)
        return currPage;
      currPage = currPage->next;
    }
    return nullptr;
  }

  // Rounds `size` up to a multiple of `alignment` (a power of two).
  static uint64_t AlignSize(uint64_t size, uint64_t alignment) {
    assert(0 == (alignment & (alignment - 1)));
    return (size + alignment - 1) & ~(alignment - 1);
  }

  uint64_t baseAddress_;
  uint64_t capacity_;
  uint32_t pageAlign_;
  uint32_t blockAlign_;
  page_t*  pages_;
  uint64_t nextAddress_;  // offset where the next page will be created
  uint64_t allocated_;
};
|
||||
|
||||
} // namespace vortex
|
||||
24674
runtime/common/nlohmann_json.hpp
Normal file
24674
runtime/common/nlohmann_json.hpp
Normal file
File diff suppressed because it is too large
Load Diff
359
runtime/common/scope.cpp
Normal file
359
runtime/common/scope.cpp
Normal file
@@ -0,0 +1,359 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "scope.h"
|
||||
#include <VX_config.h>
|
||||
#include <nlohmann_json.hpp>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <assert.h>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <condition_variable>
|
||||
#include <mutex>
|
||||
#include <unordered_set>
|
||||
#include <sstream>
|
||||
|
||||
// dump_tap flushes the output file and prints progress every this many frames.
#define FRAME_FLUSH_SIZE 100

// Scope MMIO offsets derived from the AFU image constants.
// NOTE(review): the factor 4 looks like a word-to-byte conversion - confirm.
#define MMIO_SCOPE_READ   (AFU_IMAGE_MMIO_SCOPE_READ * 4)
#define MMIO_SCOPE_WRITE  (AFU_IMAGE_MMIO_SCOPE_WRITE * 4)

// Command opcodes. Callers in this file build a command word as
// (tap_id << 3) | opcode, with an optional time value at bit 11 and above.
#define CMD_GET_WIDTH 0
#define CMD_GET_COUNT 1
#define CMD_GET_START 2
#define CMD_GET_DATA 3
#define CMD_SET_START 4
#define CMD_SET_STOP 5
|
||||
|
||||
// Evaluates _expr exactly once. When the result is non-zero, logs the failing
// expression text and returns the result from the enclosing function.
// The argument is parenthesized and the temporary has a name unlikely to
// collide with identifiers used inside _expr (a local called 'err' would
// otherwise be shadowed by the macro's own variable).
#define CHECK_ERR(_expr) \
  do { \
    int scope_check_err_ = (_expr); \
    if (scope_check_err_ == 0) \
      break; \
    printf("[SCOPE] error: '%s' returned %d!\n", #_expr, scope_check_err_); \
    return scope_check_err_; \
  } while (false)
|
||||
|
||||
// One named signal inside a tap.
struct tap_signal_t {
  uint32_t    id;     // identifier written next to the signal in the VCD output
  std::string name;   // signal name taken from the manifest
  uint32_t    width;  // signal width in bits
};
|
||||
|
||||
struct tap_t {
|
||||
uint32_t id;
|
||||
uint32_t width;
|
||||
uint32_t frames;
|
||||
uint32_t cur_frame;
|
||||
uint64_t cycle_time;
|
||||
std::string path;
|
||||
std::vector<tap_signal_t> signals;
|
||||
};
|
||||
|
||||
// Register access callbacks copied from the caller in vx_scope_start and used by all later device accesses in this file.
static scope_callback_t g_callback;
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
// Splits `s` at every `delimiter` using std::getline semantics: empty fields
// between two delimiters are kept, a trailing delimiter does not produce an
// extra empty field, and an empty input yields an empty vector.
static std::vector<std::string> split(const std::string &s, char delimiter) {
  std::vector<std::string> parts;
  std::istringstream input(s);
  for (std::string piece; std::getline(input, piece, delimiter);) {
    parts.push_back(piece);
  }
  return parts;
}
|
||||
|
||||
static void dump_module(std::ofstream& ofs,
|
||||
const std::string& name,
|
||||
std::unordered_map<std::string, std::unordered_set<std::string>>& hierarchy,
|
||||
std::unordered_map<std::string, tap_t*>& tails,
|
||||
int indentation) {
|
||||
std::string indent(indentation, ' ');
|
||||
ofs << indent << "$scope module " << name << " $end" << std::endl;
|
||||
|
||||
auto itt = tails.find(name);
|
||||
if (itt != tails.end()) {
|
||||
for (auto& signal : itt->second->signals) {
|
||||
ofs << indent << " $var reg " << signal.width << " " << signal.id << " " << signal.name << " $end" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
auto ith = hierarchy.find(name);
|
||||
if (ith != hierarchy.end()) {
|
||||
for (auto& child : ith->second) {
|
||||
dump_module(ofs, child, hierarchy, tails, indentation + 1);
|
||||
}
|
||||
}
|
||||
|
||||
ofs << indent << "$upscope $end" << std::endl;
|
||||
}
|
||||
|
||||
static void dump_header(std::ofstream& ofs, std::vector<tap_t>& taps) {
|
||||
ofs << "$version Generated by Vortex Scope Analyzer $end" << std::endl;
|
||||
ofs << "$timescale 1 ns $end" << std::endl;
|
||||
ofs << "$scope module TOP $end" << std::endl;
|
||||
ofs << " $var reg 1 0 clk $end" << std::endl;
|
||||
|
||||
std::unordered_map<std::string, std::unordered_set<std::string>> hierarchy;
|
||||
std::unordered_set<std::string> heads;
|
||||
std::unordered_map<std::string, tap_t*> tails;
|
||||
|
||||
// Build hierarchy
|
||||
for (auto& tap : taps) {
|
||||
std::vector<std::string> tokens = split(tap.path, '.');
|
||||
for (size_t i = 1; i < tokens.size(); ++i) {
|
||||
hierarchy[tokens[i-1]].insert(tokens[i]);
|
||||
}
|
||||
auto h = tokens[0];
|
||||
auto t = tokens[tokens.size()-1];
|
||||
heads.insert(h);
|
||||
tails[t] = &tap;
|
||||
}
|
||||
|
||||
// Dump module huierarchy
|
||||
for (auto& head : heads) {
|
||||
dump_module(ofs, head, hierarchy, tails, 1);
|
||||
}
|
||||
|
||||
ofs << "$upscope $end" << std::endl;
|
||||
ofs << "enddefinitions $end" << std::endl;
|
||||
}
|
||||
|
||||
// Returns the tap with the smallest cycle_time among those that still have
// frames left to dump (cur_frame != frames), or nullptr when none is left.
// On equal cycle_time the earliest tap in the vector wins.
static tap_t* find_nearest_tap(std::vector<tap_t>& taps) {
  tap_t* best = nullptr;
  for (auto& candidate : taps) {
    const bool exhausted = (candidate.cur_frame == candidate.frames);
    if (exhausted)
      continue;
    if (best == nullptr || candidate.cycle_time < best->cycle_time) {
      best = &candidate;
    }
  }
  return best;
}
|
||||
|
||||
// Emits one full clock period (low then high, on VCD identifier 0) for every
// cycle from cur_time up to, but not including, next_time. Each cycle uses
// two VCD timestamps: 2*cycle and 2*cycle + 1.
// Returns the resulting current time: next_time when anything was emitted,
// otherwise cur_time unchanged.
static uint64_t advance_time(std::ofstream& ofs, uint64_t next_time, uint64_t cur_time) {
  for (; cur_time < next_time; ++cur_time) {
    const uint64_t low_phase = cur_time * 2;
    ofs << '#' << low_phase << std::endl;
    ofs << "b0 0" << std::endl;
    ofs << '#' << (low_phase + 1) << std::endl;
    ofs << "b1 0" << std::endl;
  }
  return cur_time;
}
|
||||
|
||||
static int dump_tap(std::ofstream& ofs, tap_t* tap, vx_device_h hdevice) {
|
||||
uint32_t signal_offset = 0;
|
||||
uint32_t frame_offset = 0;
|
||||
uint64_t word;
|
||||
|
||||
std::vector<char> signal_data(tap->width);
|
||||
auto signal_it = tap->signals.rbegin();
|
||||
uint32_t signal_width = signal_it->width;
|
||||
|
||||
do {
|
||||
// read data
|
||||
uint64_t cmd_data = (tap->id << 3) | CMD_GET_DATA;
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &word));
|
||||
do {
|
||||
uint32_t word_offset = frame_offset % 64;
|
||||
signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
|
||||
++signal_offset;
|
||||
++frame_offset;
|
||||
if (signal_offset == signal_width) {
|
||||
signal_data[signal_width] = 0; // string null termination
|
||||
ofs << 'b' << signal_data.data() << ' ' << signal_it->id << std::endl;
|
||||
if (frame_offset == tap->width) {
|
||||
// end-of-frame
|
||||
++tap->cur_frame;
|
||||
if (tap->cur_frame != tap->frames) {
|
||||
// read next delta
|
||||
CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
|
||||
CHECK_ERR(g_callback.registerRead(hdevice, &word));
|
||||
tap->cycle_time += 1 + word;
|
||||
if (0 == (tap->cur_frame % FRAME_FLUSH_SIZE)) {
|
||||
ofs << std::flush;
|
||||
std::cout << std::dec << "[SCOPE] flush tap #" << tap->id << ": "<< tap->cur_frame << "/" << tap->frames << " frames, next_time=" << tap->cycle_time << std::endl;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
signal_offset = 0;
|
||||
++signal_it;
|
||||
signal_width = signal_it->width;
|
||||
}
|
||||
} while ((frame_offset % 64) != 0);
|
||||
} while (frame_offset != tap->width);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Validates the scope manifest against the device and programs the capture
// window.
//
// The manifest path is read from the SCOPE_JSON_PATH environment variable.
// For every tap in the manifest, the width reported by the device is compared
// with the manifest width. Then, unless the value is uint64_t(-1), the stop
// time and the start time are written to every tap.
//
// Returns 0 on success, -1 for invalid arguments or an unusable manifest,
// 1 on a tap width mismatch, or the non-zero code of a failing callback.
int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time) {
  if (nullptr == hdevice || nullptr == callback)
    return -1;

  // getenv returns a null pointer when the variable is not defined; a null
  // path must reach neither std::ifstream nor operator<<.
  const char* json_path = getenv("SCOPE_JSON_PATH");
  if (nullptr == json_path) {
    std::cerr << "[SCOPE] error: SCOPE_JSON_PATH is not set" << std::endl;
    return -1;
  }

  std::ifstream ifs(json_path);
  if (!ifs) {
    std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
    return -1;
  }

  // The JSON library reports malformed input and type mismatches through
  // exceptions; this function is declared extern "C", so none may escape.
  try {
    auto json_obj = json::parse(ifs);
    if (json_obj.is_null()) {
      std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << std::endl;
      return -1;
    }

    g_callback = *callback;

    // validate scope manifest
    for (auto& tap : json_obj["taps"]) {
      auto id = tap["id"].get<uint32_t>();
      auto width = tap["width"].get<uint32_t>();

      uint64_t cmd_width = (id << 3) | CMD_GET_WIDTH;
      CHECK_ERR(g_callback.registerWrite(hdevice, cmd_width));
      uint64_t dev_width;
      CHECK_ERR(g_callback.registerRead(hdevice, &dev_width));
      if (width != dev_width) {
        std::cerr << "[SCOPE] error: invalid tap #" << id << " width, actual=" << dev_width << ", expected=" << width << std::endl;
        return 1;
      }
    }

    // set stop time
    if (stop_time != uint64_t(-1)) {
      std::cout << "[SCOPE] stop time: " << std::dec << stop_time << "s" << std::endl;
      for (auto& tap : json_obj["taps"]) {
        auto id = tap["id"].get<uint32_t>();
        uint64_t cmd_stop = (stop_time << 11) | (id << 3) | CMD_SET_STOP;
        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
      }
    }

    // start recording
    if (start_time != uint64_t(-1)) {
      std::cout << "[SCOPE] start time: " << std::dec << start_time << "s" << std::endl;
      for (auto& tap : json_obj["taps"]) {
        auto id = tap["id"].get<uint32_t>();
        uint64_t cmd_start = (start_time << 11) | (id << 3) | CMD_SET_START;
        CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
      }
    }
  } catch (const std::exception& e) {
    std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << " (" << e.what() << ")" << std::endl;
    return -1;
  }

  return 0;
}
|
||||
|
||||
// Stops recording on every tap listed in the manifest and writes the captured
// frames to "scope.vcd" in the current directory.
//
// The manifest path is read from the SCOPE_JSON_PATH environment variable.
// Frames from all taps are merged in time order: the tap with the smallest
// pending cycle_time is dumped next, after the clock has been advanced to it.
//
// Returns 0 on success (or when the manifest is a JSON null), -1 for invalid
// arguments, a missing/unreadable/malformed manifest, an output file that
// cannot be created, or when vx_scope_start has not installed the callbacks;
// otherwise the non-zero code of a failing callback.
int vx_scope_stop(vx_device_h hdevice) {
  if (nullptr == hdevice)
    return -1;

  // g_callback is only filled in by vx_scope_start; calling through it before
  // that would dereference null function pointers.
  if (nullptr == g_callback.registerWrite || nullptr == g_callback.registerRead) {
    std::cerr << "[SCOPE] error: scope callbacks are not set" << std::endl;
    return -1;
  }

  std::vector<tap_t> taps;

  {
    const char* json_path = getenv("SCOPE_JSON_PATH");
    if (nullptr == json_path) {
      std::cerr << "[SCOPE] error: SCOPE_JSON_PATH is not set" << std::endl;
      return -1;
    }

    std::ifstream ifs(json_path);
    if (!ifs) {
      std::cerr << "[SCOPE] error: cannot open scope manifest file: " << json_path << std::endl;
      return -1;
    }

    // The JSON library throws on malformed input and type mismatches; this
    // function is declared extern "C", so no exception may escape.
    try {
      auto json_obj = json::parse(ifs);
      if (json_obj.is_null())
        return 0;

      // Signal identifiers start at 1 because 0 is used for the clock.
      uint32_t signal_id = 1;

      for (auto& tap : json_obj["taps"]) {
        tap_t _tap;
        _tap.id = tap["id"].get<uint32_t>();
        _tap.width = tap["width"].get<uint32_t>();
        _tap.path = tap["path"].get<std::string>();
        _tap.cycle_time = 0;
        _tap.frames = 0;
        _tap.cur_frame = 0;

        for (auto& signal : tap["signals"]) {
          auto name = signal[0].get<std::string>();
          auto width = signal[1].get<uint32_t>();
          _tap.signals.push_back({signal_id, name, width});
          ++signal_id;
        }

        taps.emplace_back(std::move(_tap));
      }
    } catch (const std::exception& e) {
      std::cerr << "[SCOPE] error: invalid scope manifest file: " << json_path << " (" << e.what() << ")" << std::endl;
      return -1;
    }
  }

  // stop recording
  for (auto& tap : taps) {
    uint64_t cmd_stop = (0 << 11) | (tap.id << 3) | CMD_SET_STOP;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop));
  }

  std::cout << "[SCOPE] trace dump begin..." << std::endl;

  std::ofstream ofs("scope.vcd");
  if (!ofs) {
    std::cerr << "[SCOPE] error: cannot create trace file: scope.vcd" << std::endl;
    return -1;
  }

  dump_header(ofs, taps);

  // load trace info
  for (auto& tap : taps) {
    uint64_t count, start, delta;

    // get count
    uint64_t cmd_count = (tap.id << 3) | CMD_GET_COUNT;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_count));
    CHECK_ERR(g_callback.registerRead(hdevice, &count));

    // get start
    uint64_t cmd_start = (tap.id << 3) | CMD_GET_START;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start));
    CHECK_ERR(g_callback.registerRead(hdevice, &start));

    // get data: the first word read is the delta preceding the first frame
    uint64_t cmd_data = (tap.id << 3) | CMD_GET_DATA;
    CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data));
    CHECK_ERR(g_callback.registerRead(hdevice, &delta));

    tap.frames = count;
    tap.cycle_time = 1 + start + delta;

    std::cout << std::dec << "[SCOPE] tap #" << tap.id
              << ": width=" << tap.width
              << ", num_frames=" << tap.frames
              << ", start_time=" << tap.cycle_time
              << ", path=" << tap.path << std::endl;
  }

  uint64_t cur_time = 0;

  while (true) {
    // find the nearest tap
    auto tap = find_nearest_tap(taps);
    if (tap == nullptr)
      break;
    // advance clock
    cur_time = advance_time(ofs, tap->cycle_time, cur_time);
    // dump tap
    CHECK_ERR(dump_tap(ofs, tap, hdevice));
  }

  std::cout << "[SCOPE] trace dump done! - " << (cur_time/2) << " cycles" << std::endl;

  return 0;
}
|
||||
35
runtime/common/scope.h
Normal file
35
runtime/common/scope.h
Normal file
@@ -0,0 +1,35 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vortex.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Writes a 64-bit command word to the scope register of `hdevice`.
 * Callers treat a return value of 0 as success. */
typedef int (*pfn_registerWrite)(vx_device_h hdevice, uint64_t value);

/* Reads a 64-bit word from the scope register of `hdevice` into *value.
 * Callers treat a return value of 0 as success. */
typedef int (*pfn_registerRead)(vx_device_h hdevice, uint64_t *value);
|
||||
|
||||
struct scope_callback_t {
|
||||
pfn_registerWrite registerWrite;
|
||||
pfn_registerRead registerRead;
|
||||
};
|
||||
|
||||
/* Checks the tap widths listed in the manifest named by the SCOPE_JSON_PATH
 * environment variable against the device, then programs the stop and start
 * times on every tap. A time value of (uint64_t)-1 skips the corresponding
 * step. Returns 0 on success and non-zero on failure. */
int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t start_time, uint64_t stop_time);

/* Stops recording on every tap and writes the captured trace to "scope.vcd".
 * Returns 0 on success and non-zero on failure. */
int vx_scope_stop(vx_device_h hdevice);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
463
runtime/common/utils.cpp
Normal file
463
runtime/common/utils.cpp
Normal file
@@ -0,0 +1,463 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "utils.h"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <list>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <vortex.h>
|
||||
#include <assert.h>
|
||||
|
||||
// Evaluates _expr exactly once. When the result is non-zero, prints the
// failing expression text and its value, then executes the _cleanup
// statement block supplied by the caller. The argument is parenthesized so
// any expression can be passed safely.
#define RT_CHECK(_expr, _cleanup) \
  do { \
    int _ret = (_expr); \
    if (0 == _ret) \
      break; \
    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
    _cleanup \
  } while (false)
|
||||
|
||||
// Rounds `size` up to the next multiple of `alignment`.
// `alignment` must be a power of two (checked with assert).
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
  const uint64_t mask = alignment - 1;
  assert(0 == (alignment & mask));
  return (size + mask) & ~mask;
}
|
||||
|
||||
// Tells whether `addr` is a multiple of `alignment`.
// `alignment` must be a power of two (checked with assert).
bool is_aligned(uint64_t addr, uint64_t alignment) {
  const uint64_t mask = alignment - 1;
  assert(0 == (alignment & mask));
  return (addr & mask) == 0;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class AutoPerfDump {
|
||||
public:
|
||||
AutoPerfDump() : perf_class_(0) {}
|
||||
|
||||
~AutoPerfDump() {
|
||||
for (auto hdevice : hdevices_) {
|
||||
vx_dump_perf(hdevice, stdout);
|
||||
}
|
||||
}
|
||||
|
||||
void add_device(vx_device_h hdevice) {
|
||||
auto perf_class_s = getenv("PERF_CLASS");
|
||||
if (perf_class_s) {
|
||||
perf_class_ = std::atoi(perf_class_s);
|
||||
vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, perf_class_);
|
||||
}
|
||||
hdevices_.push_back(hdevice);
|
||||
}
|
||||
|
||||
void remove_device(vx_device_h hdevice) {
|
||||
hdevices_.remove(hdevice);
|
||||
vx_dump_perf(hdevice, stdout);
|
||||
}
|
||||
|
||||
int get_perf_class() const {
|
||||
return perf_class_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::list<vx_device_h> hdevices_;
|
||||
int perf_class_;
|
||||
};
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
// Global instance, only defined when DUMP_PERF_STATS is set; used by perf_add_device / perf_remove_device.
AutoPerfDump gAutoPerfDump;
|
||||
#endif
|
||||
|
||||
// Registers the device with the global AutoPerfDump instance.
// Does nothing when DUMP_PERF_STATS is not defined.
void perf_add_device(vx_device_h hdevice) {
#ifndef DUMP_PERF_STATS
  (void)hdevice;
#else
  gAutoPerfDump.add_device(hdevice);
#endif
}
|
||||
|
||||
// Unregisters the device from the global AutoPerfDump instance.
// Does nothing when DUMP_PERF_STATS is not defined.
void perf_remove_device(vx_device_h hdevice) {
#ifndef DUMP_PERF_STATS
  (void)hdevice;
#else
  gAutoPerfDump.remove_device(hdevice);
#endif
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Copies `size` bytes from `content` to the device address reported by the
// VX_CAPS_KERNEL_BASE_ADDR capability.
// Returns -1 for a NULL or empty buffer, otherwise the status of the failing
// call or of the final copy.
extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size) {
  if (NULL == content || 0 == size) {
    return -1;
  }

  uint64_t kernel_base_addr;
  const int caps_status = vx_dev_caps(hdevice, VX_CAPS_KERNEL_BASE_ADDR, &kernel_base_addr);
  if (caps_status != 0) {
    return caps_status;
  }

  return vx_copy_to_dev(hdevice, kernel_base_addr, content, size);
}
|
||||
|
||||
// Loads the whole file `filename` into memory and uploads it with
// vx_upload_kernel_bytes().
// Returns -1 when the file cannot be opened, is empty or cannot be read in
// full; otherwise the status of the upload.
extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename) {
  if (NULL == filename) {
    std::cout << "error: invalid kernel file name" << std::endl;
    return -1;
  }

  // kernel images are raw bytes: open in binary mode so no translation happens
  std::ifstream ifs(filename, std::ios::binary);
  if (!ifs) {
    std::cout << "error: " << filename << " not found" << std::endl;
    return -1;
  }

  // determine the file size; tellg() reports failure as -1
  ifs.seekg(0, ifs.end);
  const std::streamoff size = ifs.tellg();
  if (size <= 0) {
    std::cout << "error: " << filename << " is empty or unreadable" << std::endl;
    return -1;
  }

  // read file content (the vector releases the buffer on every exit path)
  std::vector<char> content(static_cast<size_t>(size));
  ifs.seekg(0, ifs.beg);
  if (!ifs.read(content.data(), size)) {
    std::cout << "error: failed to read " << filename << std::endl;
    return -1;
  }

  // upload
  return vx_upload_kernel_bytes(hdevice, content.data(), content.size());
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Stores `value` under `addr`, creating the entry when it does not exist yet
// and overwriting it otherwise.
void DeviceConfig::write(uint32_t addr, uint32_t value) {
  uint32_t& slot = data_[addr];
  slot = value;
}
|
||||
|
||||
uint32_t DeviceConfig::read(uint32_t addr) const {
|
||||
if (0 == data_.count(addr)) {
|
||||
printf("Error: DeviceConfig::read(%d) failed\n", addr);
|
||||
}
|
||||
return data_.at(addr);
|
||||
}
|
||||
|
||||
// Programs the initial device configuration registers: the 64-bit startup
// address (low word, then high word) and a performance class of 0.
// Returns 0 on success, or -1 as soon as one register write fails.
int dcr_initialize(vx_device_h hdevice) {
  const uint64_t startup_addr = STARTUP_ADDR;

  // low 32 bits of the startup address
  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), { return -1; });

  // high 32 bits of the startup address
  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), { return -1; });

  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), { return -1; });

  return 0;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static uint64_t get_csr_64(const void* ptr, int addr) {
|
||||
auto w_ptr = reinterpret_cast<const uint32_t*>(ptr);
|
||||
uint32_t value_lo = w_ptr[addr - VX_CSR_MPM_BASE];
|
||||
uint32_t value_hi = w_ptr[addr - VX_CSR_MPM_BASE + 32];
|
||||
return (uint64_t(value_hi) << 32) | value_lo;
|
||||
}
|
||||
|
||||
// Reads the performance-counter block of every core from device memory and
// prints a report to `stream`.
//
// Instruction and cycle counts are always reported (instructions summed over
// the cores, cycles taken as the maximum). When PERF_ENABLE is defined, the
// extra counters selected by the perf class of gAutoPerfDump are reported too:
// per-core pipeline/memory counters for VX_DCR_MPM_CLASS_CORE, or the cache
// and memory counters read from core 0 for VX_DCR_MPM_CLASS_MEM.
//
// Returns 0 on success, or the error code of the failing vx_dev_caps() /
// vx_copy_from_dev() call.
extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
  int ret = 0;

  uint64_t instrs = 0;
  uint64_t cycles = 0;

#ifdef PERF_ENABLE
  auto perf_class = gAutoPerfDump.get_perf_class();

  // Derived statistics. Converting NaN or infinity to int is undefined
  // behaviour, so a statistic whose denominator is zero is reported as 0.
  auto average = [](uint64_t total, uint64_t count) -> int {
    return (0 == count) ? 0 : (int)(double(total) / double(count));
  };
  auto hit_ratio = [](uint64_t misses, uint64_t accesses) -> int {
    return (0 == accesses) ? 0 : (int)((1.0 - (double(misses) / double(accesses))) * 100);
  };
  auto utilization = [](uint64_t accesses, uint64_t stalls) -> int {
    const uint64_t total = accesses + stalls;
    return (0 == total) ? 0 : (int)((double(accesses) / double(total)) * 100);
  };

  // PERF: pipeline stalls
  uint64_t ibuffer_stalls = 0;
  uint64_t scoreboard_stalls = 0;
  uint64_t lsu_stalls = 0;
  uint64_t fpu_stalls = 0;
  uint64_t alu_stalls = 0;
  uint64_t sfu_stalls = 0;
  uint64_t ifetches = 0;
  uint64_t loads = 0;
  uint64_t stores = 0;
  uint64_t ifetch_lat = 0;
  uint64_t load_lat = 0;
  // PERF: Icache
  uint64_t icache_reads = 0;
  uint64_t icache_read_misses = 0;
  // PERF: Dcache
  uint64_t dcache_reads = 0;
  uint64_t dcache_writes = 0;
  uint64_t dcache_read_misses = 0;
  uint64_t dcache_write_misses = 0;
  uint64_t dcache_bank_stalls = 0;
  uint64_t dcache_mshr_stalls = 0;
  // PERF: shared memory
  uint64_t smem_reads = 0;
  uint64_t smem_writes = 0;
  uint64_t smem_bank_stalls = 0;
  // PERF: l2cache
  uint64_t l2cache_reads = 0;
  uint64_t l2cache_writes = 0;
  uint64_t l2cache_read_misses = 0;
  uint64_t l2cache_write_misses = 0;
  uint64_t l2cache_bank_stalls = 0;
  uint64_t l2cache_mshr_stalls = 0;
  // PERF: l3cache
  uint64_t l3cache_reads = 0;
  uint64_t l3cache_writes = 0;
  uint64_t l3cache_read_misses = 0;
  uint64_t l3cache_write_misses = 0;
  uint64_t l3cache_bank_stalls = 0;
  uint64_t l3cache_mshr_stalls = 0;
  // PERF: memory
  uint64_t mem_reads = 0;
  uint64_t mem_writes = 0;
  uint64_t mem_lat = 0;
#endif

  uint64_t num_cores;
  ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
  if (ret != 0)
    return ret;

  // one counter block: 32 low words followed by 32 high words
  std::vector<uint8_t> staging_buf(64 * sizeof(uint32_t));

  for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
    uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size();
    ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
    if (ret != 0)
      return ret;

    uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
    uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
    float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
    if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
    instrs += instrs_per_core;
    cycles = std::max<uint64_t>(cycles_per_core, cycles);

#ifdef PERF_ENABLE
    // reads one 64-bit counter of the current core from the staging buffer
    auto read_counter = [&](int csr) -> uint64_t {
      return get_csr_64(staging_buf.data(), csr);
    };
    // reads a counter, prints it per core (multi-core only), adds it to `total`
    auto tally = [&](const char* label, int csr, uint64_t& total) -> uint64_t {
      uint64_t per_core = read_counter(csr);
      if (num_cores > 1) fprintf(stream, "PERF: core%d: %s=%ld\n", core_id, label, per_core);
      total += per_core;
      return per_core;
    };

    switch (perf_class) {
    case VX_DCR_MPM_CLASS_CORE: {
      // PERF: pipeline
      tally("ibuffer stalls", VX_CSR_MPM_IBUF_ST, ibuffer_stalls);
      tally("scoreboard stalls", VX_CSR_MPM_SCRB_ST, scoreboard_stalls);
      tally("alu unit stalls", VX_CSR_MPM_ALU_ST, alu_stalls);
      tally("lsu unit stalls", VX_CSR_MPM_LSU_ST, lsu_stalls);
      tally("fpu unit stalls", VX_CSR_MPM_FPU_ST, fpu_stalls);
      tally("sfu unit stalls", VX_CSR_MPM_SFU_ST, sfu_stalls);
      // PERF: memory
      // The fetch count must come from the ifetch counter, not from the load
      // counter. NOTE(review): VX_CSR_MPM_IFETCHES is assumed to be defined
      // next to VX_CSR_MPM_IFETCH_LAT in VX_types.h - confirm.
      uint64_t ifetches_per_core = tally("ifetches", VX_CSR_MPM_IFETCHES, ifetches);
      uint64_t loads_per_core = tally("loads", VX_CSR_MPM_LOADS, loads);
      tally("stores", VX_CSR_MPM_STORES, stores);
      // ifetch latency
      uint64_t ifetch_lat_per_core = read_counter(VX_CSR_MPM_IFETCH_LAT);
      if (num_cores > 1) {
        int mem_avg_lat = average(ifetch_lat_per_core, ifetches_per_core);
        fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
      }
      ifetch_lat += ifetch_lat_per_core;
      // load latency
      uint64_t load_lat_per_core = read_counter(VX_CSR_MPM_LOAD_LAT);
      if (num_cores > 1) {
        int mem_avg_lat = average(load_lat_per_core, loads_per_core);
        fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
      }
      load_lat += load_lat_per_core;
    } break;
    case VX_DCR_MPM_CLASS_MEM: {
      // these counters are only sampled from the first core's block
      if (0 == core_id) {
        // PERF: Icache
        icache_reads = read_counter(VX_CSR_MPM_ICACHE_READS);
        icache_read_misses = read_counter(VX_CSR_MPM_ICACHE_MISS_R);

        // PERF: Dcache
        dcache_reads = read_counter(VX_CSR_MPM_DCACHE_READS);
        dcache_writes = read_counter(VX_CSR_MPM_DCACHE_WRITES);
        dcache_read_misses = read_counter(VX_CSR_MPM_DCACHE_MISS_R);
        dcache_write_misses = read_counter(VX_CSR_MPM_DCACHE_MISS_W);
        dcache_bank_stalls = read_counter(VX_CSR_MPM_DCACHE_BANK_ST);
        dcache_mshr_stalls = read_counter(VX_CSR_MPM_DCACHE_MSHR_ST);

        // PERF: smem
        smem_reads = read_counter(VX_CSR_MPM_SMEM_READS);
        smem_writes = read_counter(VX_CSR_MPM_SMEM_WRITES);
        smem_bank_stalls = read_counter(VX_CSR_MPM_SMEM_BANK_ST);

        // PERF: L2cache
        l2cache_reads = read_counter(VX_CSR_MPM_L2CACHE_READS);
        l2cache_writes = read_counter(VX_CSR_MPM_L2CACHE_WRITES);
        l2cache_read_misses = read_counter(VX_CSR_MPM_L2CACHE_MISS_R);
        l2cache_write_misses = read_counter(VX_CSR_MPM_L2CACHE_MISS_W);
        l2cache_bank_stalls = read_counter(VX_CSR_MPM_L2CACHE_BANK_ST);
        l2cache_mshr_stalls = read_counter(VX_CSR_MPM_L2CACHE_MSHR_ST);

        // PERF: L3cache
        l3cache_reads = read_counter(VX_CSR_MPM_L3CACHE_READS);
        l3cache_writes = read_counter(VX_CSR_MPM_L3CACHE_WRITES);
        l3cache_read_misses = read_counter(VX_CSR_MPM_L3CACHE_MISS_R);
        l3cache_write_misses = read_counter(VX_CSR_MPM_L3CACHE_MISS_W);
        l3cache_bank_stalls = read_counter(VX_CSR_MPM_L3CACHE_BANK_ST);
        l3cache_mshr_stalls = read_counter(VX_CSR_MPM_L3CACHE_MSHR_ST);

        // PERF: memory
        mem_reads = read_counter(VX_CSR_MPM_MEM_READS);
        mem_writes = read_counter(VX_CSR_MPM_MEM_WRITES);
        mem_lat = read_counter(VX_CSR_MPM_MEM_LAT);
      }
    } break;
    default:
      break;
    }
#endif
  }

  float IPC = (float)(double(instrs) / double(cycles));
  fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);

#ifdef PERF_ENABLE
  switch (perf_class) {
  case VX_DCR_MPM_CLASS_CORE: {
    int ifetch_avg_lat = average(ifetch_lat, ifetches);
    int load_avg_lat = average(load_lat, loads);
    fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
    fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
    fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
    fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls);
    fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
    fprintf(stream, "PERF: sfu unit stalls=%ld\n", sfu_stalls);
    fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
    fprintf(stream, "PERF: loads=%ld\n", loads);
    fprintf(stream, "PERF: stores=%ld\n", stores);
    fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
    fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
  } break;
  case VX_DCR_MPM_CLASS_MEM: {
    int icache_read_hit_ratio = hit_ratio(icache_read_misses, icache_reads);
    int dcache_read_hit_ratio = hit_ratio(dcache_read_misses, dcache_reads);
    int dcache_write_hit_ratio = hit_ratio(dcache_write_misses, dcache_writes);
    int dcache_bank_utilization = utilization(dcache_reads + dcache_writes, dcache_bank_stalls);
    int l2cache_read_hit_ratio = hit_ratio(l2cache_read_misses, l2cache_reads);
    int l2cache_write_hit_ratio = hit_ratio(l2cache_write_misses, l2cache_writes);
    int l2cache_bank_utilization = utilization(l2cache_reads + l2cache_writes, l2cache_bank_stalls);
    int l3cache_read_hit_ratio = hit_ratio(l3cache_read_misses, l3cache_reads);
    int l3cache_write_hit_ratio = hit_ratio(l3cache_write_misses, l3cache_writes);
    int l3cache_bank_utilization = utilization(l3cache_reads + l3cache_writes, l3cache_bank_stalls);
    int smem_bank_utilization = utilization(smem_reads + smem_writes, smem_bank_stalls);
    int mem_avg_lat = average(mem_lat, mem_reads);
    fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
    fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
    fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
    fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
    fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio);
    fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio);
    fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization);
    fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
    fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
    fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
    fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
    fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
    fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
    fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
    fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);
    fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
    fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
    fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
    fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
    fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
    fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);
    fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
    fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
    fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
    fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
  } break;
  default:
    break;
  }
#endif

  fflush(stream);

  return 0;
}
|
||||
|
||||
// Reads one performance counter from the device.
//
//   counter - CSR address of the counter; it must lie within the 32-entry
//             window starting at VX_CSR_MPM_BASE that get_csr_64() can index
//   core_id - index of the core to query, or -1 to combine all cores
//   value   - receives the result
//
// With core_id == -1 the per-core values are summed, except for
// VX_CSR_MCYCLE where the maximum over the cores is returned.
// Returns 0 on success, -1 on an invalid argument, or the error code of the
// failing vx_dev_caps() / vx_copy_from_dev() call.
extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value) {
  if (NULL == value)
    return -1;

  uint64_t num_cores;
  int ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
  if (ret != 0)
    return ret;

  // -1 selects every core; any other negative index is invalid
  if (core_id < -1 || core_id >= (int)num_cores) {
    std::cout << "error: core_id out of range" << std::endl;
    return -1;
  }

  // the staging buffer holds 32 low words followed by 32 high words
  const int counter_index = counter - VX_CSR_MPM_BASE;
  if (counter_index < 0 || counter_index >= 32) {
    std::cout << "error: counter out of range" << std::endl;
    return -1;
  }

  std::vector<uint8_t> staging_buf(64 * sizeof(uint32_t));

  uint64_t _value = 0;

  // restrict the scan to [core_id, core_id] when a single core is requested
  unsigned first_core = 0;
  if (core_id != -1) {
    first_core = core_id;
    num_cores = core_id + 1;
  }

  // start at first_core: restarting from 0 would also add cores 0..core_id-1
  for (unsigned i = first_core; i < num_cores; ++i) {
    uint64_t mpm_mem_addr = IO_CSR_ADDR + i * staging_buf.size();
    ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
    if (ret != 0)
      return ret;

    auto per_core_value = get_csr_64(staging_buf.data(), counter);
    if (counter == VX_CSR_MCYCLE) {
      _value = std::max<uint64_t>(per_core_value, _value);
    } else {
      _value += per_core_value;
    }
  }

  // output
  *value = _value;

  return 0;
}
|
||||
47
runtime/common/utils.h
Normal file
47
runtime/common/utils.h
Normal file
@@ -0,0 +1,47 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vortex.h>
|
||||
#include <cstdint>
|
||||
#include <unordered_map>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
class DeviceConfig {
|
||||
public:
|
||||
void write(uint32_t addr, uint32_t value);
|
||||
uint32_t read(uint32_t addr) const;
|
||||
private:
|
||||
std::unordered_map<uint32_t, uint32_t> data_;
|
||||
};
|
||||
|
||||
int dcr_initialize(vx_device_h device);
|
||||
|
||||
uint64_t aligned_size(uint64_t size, uint64_t alignment);
|
||||
|
||||
bool is_aligned(uint64_t addr, uint64_t alignment);
|
||||
|
||||
void perf_add_device(vx_device_h device);
|
||||
|
||||
void perf_remove_device(vx_device_h device);
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
#define ALLOC_BASE_ADDR CACHE_BLOCK_SIZE
|
||||
#define ALLOC_MAX_ADDR STARTUP_ADDR
|
||||
#if (XLEN == 64)
|
||||
#define GLOBAL_MEM_SIZE 0x200000000 // 8 GB
|
||||
#else
|
||||
#define GLOBAL_MEM_SIZE 0x100000000 // 4 GB
|
||||
#endif
|
||||
111
runtime/include/vortex.h
Normal file
111
runtime/include/vortex.h
Normal file
@@ -0,0 +1,111 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef __VX_VORTEX_H__
|
||||
#define __VX_VORTEX_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef void* vx_device_h;
|
||||
|
||||
// device caps ids
|
||||
#define VX_CAPS_VERSION 0x0
|
||||
#define VX_CAPS_NUM_THREADS 0x1
|
||||
#define VX_CAPS_NUM_WARPS 0x2
|
||||
#define VX_CAPS_NUM_CORES 0x3
|
||||
#define VX_CAPS_CACHE_LINE_SIZE 0x4
|
||||
#define VX_CAPS_GLOBAL_MEM_SIZE 0x5
|
||||
#define VX_CAPS_LOCAL_MEM_SIZE 0x6
|
||||
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
|
||||
#define VX_CAPS_ISA_FLAGS 0x8
|
||||
|
||||
// device isa flags
|
||||
#define VX_ISA_STD_A (1ull << 0)
|
||||
#define VX_ISA_STD_C (1ull << 2)
|
||||
#define VX_ISA_STD_D (1ull << 3)
|
||||
#define VX_ISA_STD_E (1ull << 4)
|
||||
#define VX_ISA_STD_F (1ull << 5)
|
||||
#define VX_ISA_STD_H (1ull << 7)
|
||||
#define VX_ISA_STD_I (1ull << 8)
|
||||
#define VX_ISA_STD_N (1ull << 13)
|
||||
#define VX_ISA_STD_Q (1ull << 16)
|
||||
#define VX_ISA_STD_S (1ull << 18)
|
||||
#define VX_ISA_STD_U (1ull << 20)
|
||||
// Maps the 2-bit field at bits 31:30 of `flags` to 16 << field
// (field 1 -> 32, 2 -> 64, 3 -> 128). The argument is parenthesized so that
// compound expressions such as `a | b` are shifted as a whole.
#define VX_ISA_BASE(flags) (1 << ((((flags) >> 30) & 0x3) + 4))
|
||||
#define VX_ISA_EXT_TEX (1ull << 32)
|
||||
#define VX_ISA_EXT_RASTER (1ull << 33)
|
||||
#define VX_ISA_EXT_ROP (1ull << 34)
|
||||
|
||||
// device memory types
|
||||
#define VX_MEM_TYPE_GLOBAL 0
|
||||
#define VX_MEM_TYPE_LOCAL 1
|
||||
|
||||
// ready wait timeout
|
||||
#define VX_MAX_TIMEOUT (24*60*60*1000) // 24 Hr
|
||||
|
||||
// open the device and connect to it
|
||||
int vx_dev_open(vx_device_h* hdevice);
|
||||
|
||||
// Close the device when all the operations are done
|
||||
int vx_dev_close(vx_device_h hdevice);
|
||||
|
||||
// return device configurations
|
||||
int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
|
||||
|
||||
// allocate device memory and return address
|
||||
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr);
|
||||
|
||||
// release device memory
|
||||
int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr);
|
||||
|
||||
// get device memory info
|
||||
int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used);
|
||||
|
||||
// Copy bytes from host to device memory
|
||||
int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size);
|
||||
|
||||
// Copy bytes from device memory to host
|
||||
int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size);
|
||||
|
||||
// Start device execution
|
||||
int vx_start(vx_device_h hdevice);
|
||||
|
||||
// Wait for device ready with milliseconds timeout
|
||||
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
|
||||
|
||||
// write device configuration registers
|
||||
int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value);
|
||||
|
||||
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
|
||||
|
||||
// upload kernel bytes to device
|
||||
int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size);
|
||||
|
||||
// upload kernel file to device
|
||||
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename);
|
||||
|
||||
// performance counters
|
||||
int vx_dump_perf(vx_device_h hdevice, FILE* stream);
|
||||
int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // __VX_VORTEX_H__
|
||||
@@ -1,214 +0,0 @@
|
||||
#ifndef VX_INTRINSICS_H
|
||||
#define VX_INTRINSICS_H
|
||||
|
||||
#include <VX_config.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef __ASSEMBLY__
|
||||
#define __ASM_STR(x) x
|
||||
#else
|
||||
#define __ASM_STR(x) #x
|
||||
#endif
|
||||
|
||||
#define csr_read(csr) ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define csr_write(csr, val) ({ \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \
|
||||
})
|
||||
|
||||
#define csr_swap(csr, val) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define csr_read_set(csr, val) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define csr_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \
|
||||
})
|
||||
|
||||
#define csr_read_clear(csr, val) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define csr_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \
|
||||
})
|
||||
|
||||
// Texture load
|
||||
#define vx_tex(unit, u, v, lod) ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(unit), "r"(u), "r"(v), "r"(lod)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
// Conditional move
|
||||
// Emits the custom r4-type instruction (opcode 0x5b, func3 1) with `c`, `t`
// and `f` as register inputs and yields the value of its destination register.
// The output operand list must be closed before the ':' that starts the
// input operands.
#define vx_cmov(c, t, f) ({                                         \
	unsigned __r;                                                   \
	__asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r) : "r"(c), "r"(t), "r"(f)); \
	__r;                                                            \
})
|
||||
|
||||
// Set thread mask
|
||||
inline void vx_tmc(unsigned thread_mask) {
|
||||
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
|
||||
}
|
||||
|
||||
// Set thread predicate
|
||||
inline void vx_pred(unsigned condition) {
|
||||
asm volatile (".insn s 0x6b, 0, x1, 0(%0)" :: "r"(condition));
|
||||
}
|
||||
|
||||
typedef void (*vx_wspawn_pfn)();
|
||||
|
||||
// Spawn warps
|
||||
inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
|
||||
asm volatile (".insn s 0x6b, 1, %1, 0(%0)" :: "r"(num_warps), "r"(func_ptr));
|
||||
}
|
||||
|
||||
// Split on a predicate
|
||||
inline void vx_split(int predicate) {
|
||||
asm volatile (".insn s 0x6b, 2, x0, 0(%0)" :: "r"(predicate));
|
||||
}
|
||||
|
||||
// Join
|
||||
inline void vx_join() {
|
||||
asm volatile (".insn s 0x6b, 3, x0, 0(x0)");
|
||||
}
|
||||
|
||||
// Warp Barrier
|
||||
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
||||
asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
|
||||
}
|
||||
|
||||
// Prefetch
|
||||
inline void vx_prefetch(unsigned addr) {
|
||||
asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
|
||||
}
|
||||
|
||||
// Return active warp's thread id
|
||||
inline int vx_thread_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_WTID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return active core's local thread id
|
||||
inline int vx_thread_lid() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LTID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return processor global thread id
|
||||
inline int vx_thread_gid() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GTID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return active core's local warp id
|
||||
inline int vx_warp_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LWID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return processor's global warp id
|
||||
inline int vx_warp_gid() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GWID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return processor core id
|
||||
inline int vx_core_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GCID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return current thread mask
|
||||
inline int vx_thread_mask() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_TMASK));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return the number of threads in a warp
|
||||
inline int vx_num_threads() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NT));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return the number of warps in a core
|
||||
inline int vx_num_warps() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NW));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return the number of cores in the processor
|
||||
inline int vx_num_cores() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NC));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline void vx_fence() {
|
||||
asm volatile ("fence iorw, iorw");
|
||||
}
|
||||
|
||||
#define __if(b) vx_split(b); \
|
||||
if (b)
|
||||
|
||||
#define __else else
|
||||
|
||||
#define __endif vx_join();
|
||||
|
||||
#define __DIVERGENT__ __attribute__((annotate("divergent")))
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,21 +0,0 @@
|
||||
#ifndef VX_PRINT_H
|
||||
#define VX_PRINT_H
|
||||
|
||||
#include <stdarg.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
int vx_vprintf(const char* format, va_list va);
|
||||
int vx_printf(const char * format, ...);
|
||||
|
||||
void vx_putchar(int c);
|
||||
void vx_putint(int value, int base);
|
||||
void vx_putfloat(float value, int precision);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,43 +0,0 @@
|
||||
#ifndef VX_API_H
|
||||
#define VX_API_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_groups[3];
|
||||
uint32_t global_offset[3];
|
||||
uint32_t local_size[3];
|
||||
char * printf_buffer;
|
||||
uint32_t *printf_buffer_position;
|
||||
uint32_t printf_buffer_capacity;
|
||||
uint32_t work_dim;
|
||||
} context_t;
|
||||
|
||||
typedef void (*vx_spawn_kernel_cb) (
|
||||
const void * /* arg */,
|
||||
const context_t * /* context */,
|
||||
uint32_t /* group_x */,
|
||||
uint32_t /* group_y */,
|
||||
uint32_t /* group_z */
|
||||
);
|
||||
|
||||
typedef void (*vx_spawn_tasks_cb)(int task_id, void *arg);
|
||||
|
||||
typedef void (*vx_serial_cb)(void *arg);
|
||||
|
||||
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);
|
||||
|
||||
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
|
||||
|
||||
void vx_serial(vx_serial_cb callback, void * arg);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,264 +0,0 @@
|
||||
/* ---- Original Script: /opt/riscv32i/riscv32-unknown-elf/lib/ldscripts/elf32lriscv.x ---- */
|
||||
/* Default linker script, for normal executables */
|
||||
/* Copyright (C) 2014-2017 Free Software Foundation, Inc.
|
||||
Copying and distribution of this script, with or without modification,
|
||||
are permitted in any medium without royalty provided the copyright
|
||||
notice and this notice are preserved. */
|
||||
OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv", "elf32-littleriscv")
|
||||
OUTPUT_ARCH(riscv)
|
||||
ENTRY(_start)
|
||||
SECTIONS
|
||||
{
|
||||
. = 0x80000000;
|
||||
.interp : { *(.interp) }
|
||||
.note.gnu.build-id : { *(.note.gnu.build-id) }
|
||||
.hash : { *(.hash) }
|
||||
.gnu.hash : { *(.gnu.hash) }
|
||||
.dynsym : { *(.dynsym) }
|
||||
.dynstr : { *(.dynstr) }
|
||||
.gnu.version : { *(.gnu.version) }
|
||||
.gnu.version_d : { *(.gnu.version_d) }
|
||||
.gnu.version_r : { *(.gnu.version_r) }
|
||||
.rela.dyn :
|
||||
{
|
||||
*(.rela.init)
|
||||
*(.rela.text .rela.text.* .rela.gnu.linkonce.t.*)
|
||||
*(.rela.fini)
|
||||
*(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*)
|
||||
*(.rela.data .rela.data.* .rela.gnu.linkonce.d.*)
|
||||
*(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*)
|
||||
*(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*)
|
||||
*(.rela.ctors)
|
||||
*(.rela.dtors)
|
||||
*(.rela.got)
|
||||
*(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*)
|
||||
*(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*)
|
||||
*(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*)
|
||||
*(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*)
|
||||
*(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
|
||||
PROVIDE_HIDDEN (__rela_iplt_start = .);
|
||||
*(.rela.iplt)
|
||||
PROVIDE_HIDDEN (__rela_iplt_end = .);
|
||||
}
|
||||
.rela.plt :
|
||||
{
|
||||
*(.rela.plt)
|
||||
}
|
||||
.init :
|
||||
{
|
||||
KEEP (*(SORT_NONE(.init)))
|
||||
}
|
||||
.plt : { *(.plt) }
|
||||
.iplt : { *(.iplt) }
|
||||
.text :
|
||||
{
|
||||
*(.text.unlikely .text.*_unlikely .text.unlikely.*)
|
||||
*(.text.exit .text.exit.*)
|
||||
*(.text.startup .text.startup.*)
|
||||
*(.text.hot .text.hot.*)
|
||||
*(.text .stub .text.* .gnu.linkonce.t.*)
|
||||
/* .gnu.warning sections are handled specially by elf32.em. */
|
||||
*(.gnu.warning)
|
||||
}
|
||||
.fini :
|
||||
{
|
||||
KEEP (*(SORT_NONE(.fini)))
|
||||
}
|
||||
PROVIDE (__etext = .);
|
||||
PROVIDE (_etext = .);
|
||||
PROVIDE (etext = .);
|
||||
.rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
|
||||
.rodata1 : { *(.rodata1) }
|
||||
.sdata2 :
|
||||
{
|
||||
*(.sdata2 .sdata2.* .gnu.linkonce.s2.*)
|
||||
}
|
||||
.sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) }
|
||||
.eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) }
|
||||
.eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) }
|
||||
.gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) }
|
||||
.gnu_extab : ONLY_IF_RO { *(.gnu_extab*) }
|
||||
/* These sections are generated by the Sun/Oracle C++ compiler. */
|
||||
.exception_ranges : ONLY_IF_RO { *(.exception_ranges*) }
|
||||
/* Adjust the address for the data segment. We want to adjust up to
|
||||
the same address within the page on the next page up. */
|
||||
. = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
|
||||
/* Exception handling */
|
||||
.eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) }
|
||||
.gnu_extab : ONLY_IF_RW { *(.gnu_extab) }
|
||||
.gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }
|
||||
.exception_ranges : ONLY_IF_RW { *(.exception_ranges*) }
|
||||
/* Thread Local Storage sections */
|
||||
.tdata :
|
||||
{
|
||||
PROVIDE_HIDDEN (__tdata_start = .);
|
||||
*(.tdata .tdata.* .gnu.linkonce.td.*)
|
||||
PROVIDE_HIDDEN (__tdata_end = .);
|
||||
}
|
||||
PROVIDE (__tdata_size = SIZEOF (.tdata));
|
||||
.tbss :
|
||||
{
|
||||
PROVIDE_HIDDEN (__tbss_start = .);
|
||||
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
|
||||
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
|
||||
PROVIDE_HIDDEN (__tbss_end = .);
|
||||
}
|
||||
PROVIDE (__tbss_size = SIZEOF (.tbss));
|
||||
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
|
||||
.preinit_array :
|
||||
{
|
||||
PROVIDE_HIDDEN (__preinit_array_start = .);
|
||||
KEEP (*(.preinit_array))
|
||||
PROVIDE_HIDDEN (__preinit_array_end = .);
|
||||
}
|
||||
.init_array :
|
||||
{
|
||||
PROVIDE_HIDDEN (__init_array_start = .);
|
||||
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
|
||||
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
|
||||
PROVIDE_HIDDEN (__init_array_end = .);
|
||||
}
|
||||
.fini_array :
|
||||
{
|
||||
PROVIDE_HIDDEN (__fini_array_start = .);
|
||||
KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
|
||||
KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
|
||||
PROVIDE_HIDDEN (__fini_array_end = .);
|
||||
}
|
||||
.ctors :
|
||||
{
|
||||
/* gcc uses crtbegin.o to find the start of
|
||||
the constructors, so we make sure it is
|
||||
first. Because this is a wildcard, it
|
||||
doesn't matter if the user does not
|
||||
actually link against crtbegin.o; the
|
||||
linker won't look for a file to match a
|
||||
wildcard. The wildcard also means that it
|
||||
doesn't matter which directory crtbegin.o
|
||||
is in. */
|
||||
KEEP (*crtbegin.o(.ctors))
|
||||
KEEP (*crtbegin?.o(.ctors))
|
||||
/* We don't want to include the .ctor section from
|
||||
the crtend.o file until after the sorted ctors.
|
||||
The .ctor section from the crtend file contains the
|
||||
end of ctors marker and it must be last */
|
||||
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
|
||||
KEEP (*(SORT(.ctors.*)))
|
||||
KEEP (*(.ctors))
|
||||
}
|
||||
.dtors :
|
||||
{
|
||||
KEEP (*crtbegin.o(.dtors))
|
||||
KEEP (*crtbegin?.o(.dtors))
|
||||
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
|
||||
KEEP (*(SORT(.dtors.*)))
|
||||
KEEP (*(.dtors))
|
||||
}
|
||||
.jcr : { KEEP (*(.jcr)) }
|
||||
.data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) }
|
||||
.dynamic : { *(.dynamic) }
|
||||
. = DATA_SEGMENT_RELRO_END (0, .);
|
||||
.data :
|
||||
{
|
||||
__DATA_BEGIN__ = .;
|
||||
*(.data .data.* .gnu.linkonce.d.*)
|
||||
SORT(CONSTRUCTORS)
|
||||
}
|
||||
.data1 : { *(.data1) }
|
||||
.got : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) }
|
||||
/* We want the small data sections together, so single-instruction offsets
|
||||
can access them all, and initialized data all before uninitialized, so
|
||||
we can shorten the on-disk segment size. */
|
||||
.sdata :
|
||||
{
|
||||
__SDATA_BEGIN__ = .;
|
||||
*(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*)
|
||||
*(.sdata .sdata.* .gnu.linkonce.s.*)
|
||||
}
|
||||
_edata = .; PROVIDE (edata = .);
|
||||
. = .;
|
||||
__bss_start = .;
|
||||
.sbss :
|
||||
{
|
||||
*(.dynsbss)
|
||||
*(.sbss .sbss.* .gnu.linkonce.sb.*)
|
||||
*(.scommon)
|
||||
}
|
||||
.bss :
|
||||
{
|
||||
*(.dynbss)
|
||||
*(.bss .bss.* .gnu.linkonce.b.*)
|
||||
*(COMMON)
|
||||
/* Align here to ensure that the .bss section occupies space up to
|
||||
_end. Align after .bss to ensure correct alignment even if the
|
||||
.bss section disappears because there are no input sections.
|
||||
FIXME: Why do we need it? When there is no .bss section, we do not
|
||||
pad the .data section. */
|
||||
. = ALIGN(. != 0 ? 32 / 8 : 1);
|
||||
}
|
||||
. = ALIGN(32 / 8);
|
||||
. = SEGMENT_START("ldata-segment", .);
|
||||
. = ALIGN(32 / 8);
|
||||
__BSS_END__ = .;
|
||||
__global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
|
||||
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
|
||||
_end = .; PROVIDE (end = .);
|
||||
. = DATA_SEGMENT_END (.);
|
||||
|
||||
/* .stack_dummy section doesn't contains any symbols. It is only
|
||||
* used for linker to calculate size of stack sections, and assign
|
||||
* values to stack symbols later */
|
||||
.stack_dummy (COPY):
|
||||
{
|
||||
KEEP(*(.stack*))
|
||||
}
|
||||
__stack_usage = SIZEOF(.stack_dummy);
|
||||
PROVIDE(__stack_top = 0xFF000000);
|
||||
PROVIDE(__stack_size = 0x400);
|
||||
PROVIDE(__stack = __stack_top);
|
||||
ASSERT(__stack_usage <= __stack_size, "stack overflow")
|
||||
|
||||
/* Stabs debugging sections. */
|
||||
.stab 0 : { *(.stab) }
|
||||
.stabstr 0 : { *(.stabstr) }
|
||||
.stab.excl 0 : { *(.stab.excl) }
|
||||
.stab.exclstr 0 : { *(.stab.exclstr) }
|
||||
.stab.index 0 : { *(.stab.index) }
|
||||
.stab.indexstr 0 : { *(.stab.indexstr) }
|
||||
.comment 0 : { *(.comment) }
|
||||
.gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) }
|
||||
/* DWARF debug sections.
|
||||
Symbols in the DWARF debugging sections are relative to the beginning
|
||||
of the section so we begin them at 0. */
|
||||
/* DWARF 1 */
|
||||
.debug 0 : { *(.debug) }
|
||||
.line 0 : { *(.line) }
|
||||
/* GNU DWARF 1 extensions */
|
||||
.debug_srcinfo 0 : { *(.debug_srcinfo) }
|
||||
.debug_sfnames 0 : { *(.debug_sfnames) }
|
||||
/* DWARF 1.1 and DWARF 2 */
|
||||
.debug_aranges 0 : { *(.debug_aranges) }
|
||||
.debug_pubnames 0 : { *(.debug_pubnames) }
|
||||
/* DWARF 2 */
|
||||
.debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
|
||||
.debug_abbrev 0 : { *(.debug_abbrev) }
|
||||
.debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) }
|
||||
.debug_frame 0 : { *(.debug_frame) }
|
||||
.debug_str 0 : { *(.debug_str) }
|
||||
.debug_loc 0 : { *(.debug_loc) }
|
||||
.debug_macinfo 0 : { *(.debug_macinfo) }
|
||||
/* SGI/MIPS DWARF 2 extensions */
|
||||
.debug_weaknames 0 : { *(.debug_weaknames) }
|
||||
.debug_funcnames 0 : { *(.debug_funcnames) }
|
||||
.debug_typenames 0 : { *(.debug_typenames) }
|
||||
.debug_varnames 0 : { *(.debug_varnames) }
|
||||
/* DWARF 3 */
|
||||
.debug_pubtypes 0 : { *(.debug_pubtypes) }
|
||||
.debug_ranges 0 : { *(.debug_ranges) }
|
||||
/* DWARF Extension. */
|
||||
.debug_macro 0 : { *(.debug_macro) }
|
||||
.debug_addr 0 : { *(.debug_addr) }
|
||||
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
|
||||
/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
|
||||
|
||||
}
|
||||
@@ -1,264 +0,0 @@
|
||||
/* ---- Original Script: /opt/riscv32i/riscv32-unknown-elf/lib/ldscripts/elf64lriscv.x ---- */
|
||||
/* Default linker script, for normal executables */
|
||||
/* Copyright (C) 2014-2017 Free Software Foundation, Inc.
|
||||
Copying and distribution of this script, with or without modification,
|
||||
are permitted in any medium without royalty provided the copyright
|
||||
notice and this notice are preserved. */
|
||||
OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv", "elf64-littleriscv")
|
||||
OUTPUT_ARCH(riscv)
|
||||
ENTRY(_start)
|
||||
SECTIONS
|
||||
{
|
||||
. = 0x80000000;
|
||||
.interp : { *(.interp) }
|
||||
.note.gnu.build-id : { *(.note.gnu.build-id) }
|
||||
.hash : { *(.hash) }
|
||||
.gnu.hash : { *(.gnu.hash) }
|
||||
.dynsym : { *(.dynsym) }
|
||||
.dynstr : { *(.dynstr) }
|
||||
.gnu.version : { *(.gnu.version) }
|
||||
.gnu.version_d : { *(.gnu.version_d) }
|
||||
.gnu.version_r : { *(.gnu.version_r) }
|
||||
.rela.dyn :
|
||||
{
|
||||
*(.rela.init)
|
||||
*(.rela.text .rela.text.* .rela.gnu.linkonce.t.*)
|
||||
*(.rela.fini)
|
||||
*(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*)
|
||||
*(.rela.data .rela.data.* .rela.gnu.linkonce.d.*)
|
||||
*(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*)
|
||||
*(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*)
|
||||
*(.rela.ctors)
|
||||
*(.rela.dtors)
|
||||
*(.rela.got)
|
||||
*(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*)
|
||||
*(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*)
|
||||
*(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*)
|
||||
*(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*)
|
||||
*(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*)
|
||||
PROVIDE_HIDDEN (__rela_iplt_start = .);
|
||||
*(.rela.iplt)
|
||||
PROVIDE_HIDDEN (__rela_iplt_end = .);
|
||||
}
|
||||
.rela.plt :
|
||||
{
|
||||
*(.rela.plt)
|
||||
}
|
||||
.init :
|
||||
{
|
||||
KEEP (*(SORT_NONE(.init)))
|
||||
}
|
||||
.plt : { *(.plt) }
|
||||
.iplt : { *(.iplt) }
|
||||
.text :
|
||||
{
|
||||
*(.text.unlikely .text.*_unlikely .text.unlikely.*)
|
||||
*(.text.exit .text.exit.*)
|
||||
*(.text.startup .text.startup.*)
|
||||
*(.text.hot .text.hot.*)
|
||||
*(.text .stub .text.* .gnu.linkonce.t.*)
|
||||
/* .gnu.warning sections are handled specially by elf32.em. */
|
||||
*(.gnu.warning)
|
||||
}
|
||||
.fini :
|
||||
{
|
||||
KEEP (*(SORT_NONE(.fini)))
|
||||
}
|
||||
PROVIDE (__etext = .);
|
||||
PROVIDE (_etext = .);
|
||||
PROVIDE (etext = .);
|
||||
.rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
|
||||
.rodata1 : { *(.rodata1) }
|
||||
.sdata2 :
|
||||
{
|
||||
*(.sdata2 .sdata2.* .gnu.linkonce.s2.*)
|
||||
}
|
||||
.sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) }
|
||||
.eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) }
|
||||
.eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) }
|
||||
.gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) }
|
||||
.gnu_extab : ONLY_IF_RO { *(.gnu_extab*) }
|
||||
/* These sections are generated by the Sun/Oracle C++ compiler. */
|
||||
.exception_ranges : ONLY_IF_RO { *(.exception_ranges*) }
|
||||
/* Adjust the address for the data segment. We want to adjust up to
|
||||
the same address within the page on the next page up. */
|
||||
. = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
|
||||
/* Exception handling */
|
||||
.eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) }
|
||||
.gnu_extab : ONLY_IF_RW { *(.gnu_extab) }
|
||||
.gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }
|
||||
.exception_ranges : ONLY_IF_RW { *(.exception_ranges*) }
|
||||
/* Thread Local Storage sections */
|
||||
.tdata :
|
||||
{
|
||||
PROVIDE_HIDDEN (__tdata_start = .);
|
||||
*(.tdata .tdata.* .gnu.linkonce.td.*)
|
||||
PROVIDE_HIDDEN (__tdata_end = .);
|
||||
}
|
||||
PROVIDE (__tdata_size = SIZEOF (.tdata));
|
||||
.tbss :
|
||||
{
|
||||
PROVIDE_HIDDEN (__tbss_start = .);
|
||||
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
|
||||
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
|
||||
PROVIDE_HIDDEN (__tbss_end = .);
|
||||
}
|
||||
PROVIDE (__tbss_size = SIZEOF (.tbss));
|
||||
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
|
||||
.preinit_array :
|
||||
{
|
||||
PROVIDE_HIDDEN (__preinit_array_start = .);
|
||||
KEEP (*(.preinit_array))
|
||||
PROVIDE_HIDDEN (__preinit_array_end = .);
|
||||
}
|
||||
.init_array :
|
||||
{
|
||||
PROVIDE_HIDDEN (__init_array_start = .);
|
||||
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
|
||||
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
|
||||
PROVIDE_HIDDEN (__init_array_end = .);
|
||||
}
|
||||
.fini_array :
|
||||
{
|
||||
PROVIDE_HIDDEN (__fini_array_start = .);
|
||||
KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
|
||||
KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
|
||||
PROVIDE_HIDDEN (__fini_array_end = .);
|
||||
}
|
||||
.ctors :
|
||||
{
|
||||
/* gcc uses crtbegin.o to find the start of
|
||||
the constructors, so we make sure it is
|
||||
first. Because this is a wildcard, it
|
||||
doesn't matter if the user does not
|
||||
actually link against crtbegin.o; the
|
||||
linker won't look for a file to match a
|
||||
wildcard. The wildcard also means that it
|
||||
doesn't matter which directory crtbegin.o
|
||||
is in. */
|
||||
KEEP (*crtbegin.o(.ctors))
|
||||
KEEP (*crtbegin?.o(.ctors))
|
||||
/* We don't want to include the .ctor section from
|
||||
the crtend.o file until after the sorted ctors.
|
||||
The .ctor section from the crtend file contains the
|
||||
end of ctors marker and it must be last */
|
||||
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
|
||||
KEEP (*(SORT(.ctors.*)))
|
||||
KEEP (*(.ctors))
|
||||
}
|
||||
.dtors :
|
||||
{
|
||||
KEEP (*crtbegin.o(.dtors))
|
||||
KEEP (*crtbegin?.o(.dtors))
|
||||
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
|
||||
KEEP (*(SORT(.dtors.*)))
|
||||
KEEP (*(.dtors))
|
||||
}
|
||||
.jcr : { KEEP (*(.jcr)) }
|
||||
.data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) }
|
||||
.dynamic : { *(.dynamic) }
|
||||
. = DATA_SEGMENT_RELRO_END (0, .);
|
||||
.data :
|
||||
{
|
||||
__DATA_BEGIN__ = .;
|
||||
*(.data .data.* .gnu.linkonce.d.*)
|
||||
SORT(CONSTRUCTORS)
|
||||
}
|
||||
.data1 : { *(.data1) }
|
||||
.got : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) }
|
||||
/* We want the small data sections together, so single-instruction offsets
|
||||
can access them all, and initialized data all before uninitialized, so
|
||||
we can shorten the on-disk segment size. */
|
||||
.sdata :
|
||||
{
|
||||
__SDATA_BEGIN__ = .;
|
||||
*(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*)
|
||||
*(.sdata .sdata.* .gnu.linkonce.s.*)
|
||||
}
|
||||
_edata = .; PROVIDE (edata = .);
|
||||
. = .;
|
||||
__bss_start = .;
|
||||
.sbss :
|
||||
{
|
||||
*(.dynsbss)
|
||||
*(.sbss .sbss.* .gnu.linkonce.sb.*)
|
||||
*(.scommon)
|
||||
}
|
||||
.bss :
|
||||
{
|
||||
*(.dynbss)
|
||||
*(.bss .bss.* .gnu.linkonce.b.*)
|
||||
*(COMMON)
|
||||
/* Align here to ensure that the .bss section occupies space up to
|
||||
_end. Align after .bss to ensure correct alignment even if the
|
||||
.bss section disappears because there are no input sections.
|
||||
FIXME: Why do we need it? When there is no .bss section, we do not
|
||||
pad the .data section. */
|
||||
. = ALIGN(. != 0 ? 64 / 8 : 1);
|
||||
}
|
||||
. = ALIGN(64 / 8);
|
||||
. = SEGMENT_START("ldata-segment", .);
|
||||
. = ALIGN(64 / 8);
|
||||
__BSS_END__ = .;
|
||||
__global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
|
||||
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
|
||||
_end = .; PROVIDE (end = .);
|
||||
. = DATA_SEGMENT_END (.);
|
||||
|
||||
/* .stack_dummy section doesn't contains any symbols. It is only
|
||||
* used for linker to calculate size of stack sections, and assign
|
||||
* values to stack symbols later */
|
||||
.stack_dummy (COPY):
|
||||
{
|
||||
KEEP(*(.stack*))
|
||||
}
|
||||
__stack_usage = SIZEOF(.stack_dummy);
|
||||
PROVIDE(__stack_top = 0xFF000000);
|
||||
PROVIDE(__stack_size = 0x400);
|
||||
PROVIDE(__stack = __stack_top);
|
||||
ASSERT(__stack_usage <= __stack_size, "stack overflow")
|
||||
|
||||
/* Stabs debugging sections. */
|
||||
.stab 0 : { *(.stab) }
|
||||
.stabstr 0 : { *(.stabstr) }
|
||||
.stab.excl 0 : { *(.stab.excl) }
|
||||
.stab.exclstr 0 : { *(.stab.exclstr) }
|
||||
.stab.index 0 : { *(.stab.index) }
|
||||
.stab.indexstr 0 : { *(.stab.indexstr) }
|
||||
.comment 0 : { *(.comment) }
|
||||
.gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) }
|
||||
/* DWARF debug sections.
|
||||
Symbols in the DWARF debugging sections are relative to the beginning
|
||||
of the section so we begin them at 0. */
|
||||
/* DWARF 1 */
|
||||
.debug 0 : { *(.debug) }
|
||||
.line 0 : { *(.line) }
|
||||
/* GNU DWARF 1 extensions */
|
||||
.debug_srcinfo 0 : { *(.debug_srcinfo) }
|
||||
.debug_sfnames 0 : { *(.debug_sfnames) }
|
||||
/* DWARF 1.1 and DWARF 2 */
|
||||
.debug_aranges 0 : { *(.debug_aranges) }
|
||||
.debug_pubnames 0 : { *(.debug_pubnames) }
|
||||
/* DWARF 2 */
|
||||
.debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
|
||||
.debug_abbrev 0 : { *(.debug_abbrev) }
|
||||
.debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) }
|
||||
.debug_frame 0 : { *(.debug_frame) }
|
||||
.debug_str 0 : { *(.debug_str) }
|
||||
.debug_loc 0 : { *(.debug_loc) }
|
||||
.debug_macinfo 0 : { *(.debug_macinfo) }
|
||||
/* SGI/MIPS DWARF 2 extensions */
|
||||
.debug_weaknames 0 : { *(.debug_weaknames) }
|
||||
.debug_funcnames 0 : { *(.debug_funcnames) }
|
||||
.debug_typenames 0 : { *(.debug_typenames) }
|
||||
.debug_varnames 0 : { *(.debug_varnames) }
|
||||
/* DWARF 3 */
|
||||
.debug_pubtypes 0 : { *(.debug_pubtypes) }
|
||||
.debug_ranges 0 : { *(.debug_ranges) }
|
||||
/* DWARF Extension. */
|
||||
.debug_macro 0 : { *(.debug_macro) }
|
||||
.debug_addr 0 : { *(.debug_addr) }
|
||||
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
|
||||
/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
|
||||
|
||||
}
|
||||
78
runtime/opae/Makefile
Normal file
78
runtime/opae/Makefile
Normal file
@@ -0,0 +1,78 @@
|
||||
XLEN ?= 32
|
||||
|
||||
TARGET ?= opaesim
|
||||
|
||||
OPAESIM_DIR = ../../sim/opaesim
|
||||
|
||||
RTL_DIR=../../hw/rtl
|
||||
|
||||
SYN_DIR=../../hw/syn/altera/opae
|
||||
|
||||
SCRIPT_DIR=../../hw/scripts
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
CXXFLAGS += -I. -I../include -I../common/ -I../../hw
|
||||
CXXFLAGS += -DXLEN_$(XLEN)
|
||||
|
||||
ifeq ($(TARGET), opaesim)
|
||||
CXXFLAGS += -I$(OPAESIM_DIR)
|
||||
else
|
||||
CXXFLAGS += -I$(SYN_DIR)
|
||||
endif
|
||||
|
||||
# Position independent code
|
||||
CXXFLAGS += -fPIC
|
||||
|
||||
# Add external configuration
|
||||
CXXFLAGS += $(CONFIGS)
|
||||
|
||||
# Dump perf stats
|
||||
CXXFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
LDFLAGS += -shared -luuid -ldl -pthread
|
||||
|
||||
SRCS = vortex.cpp driver.cpp ../common/utils.cpp
|
||||
|
||||
# set up target types
|
||||
ifeq ($(TARGET), opaesim)
|
||||
CXXFLAGS += -DOPAESIM
|
||||
OPAESIM = libopae-c-sim.so
|
||||
else
|
||||
ifeq ($(TARGET), asesim)
|
||||
CXXFLAGS += -DASESIM
|
||||
else
|
||||
CXXFLAGS += -DFPGA
|
||||
endif
|
||||
endif
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
# Enable scope logic analyzer
|
||||
ifdef SCOPE
|
||||
CXXFLAGS += -DSCOPE
|
||||
SRCS += ../common/scope.cpp
|
||||
endif
|
||||
|
||||
# Enable perf counters
|
||||
ifdef PERF
|
||||
CXXFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
PROJECT = libvortex.so
|
||||
|
||||
all: $(PROJECT)
|
||||
|
||||
libopae-c-sim.so:
|
||||
DESTDIR=../../runtime/opae $(MAKE) -C $(OPAESIM_DIR) ../../runtime/opae/libopae-c-sim.so
|
||||
|
||||
$(PROJECT): $(SRCS) $(OPAESIM)
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT)
|
||||
|
||||
clean:
|
||||
DESTDIR=../../runtime/opae $(MAKE) -C $(OPAESIM_DIR) clean
|
||||
rm -rf $(PROJECT)
|
||||
93
runtime/opae/driver.cpp
Normal file
93
runtime/opae/driver.cpp
Normal file
@@ -0,0 +1,93 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "driver.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <linux/limits.h>
|
||||
#include <dlfcn.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
|
||||
#ifdef OPAESIM
|
||||
#define DEFAULT_OPAE_DRV_PATHS "libopae-c-sim.so"
|
||||
#elif ASESIM
|
||||
#define DEFAULT_OPAE_DRV_PATHS "libopae-c-ase.so"
|
||||
#else
|
||||
#define DEFAULT_OPAE_DRV_PATHS "libopae-c.so"
|
||||
#endif
|
||||
|
||||
#define SET_API(func) \
|
||||
opae_drv_funcs->func = (pfn_##func)dlsym(dl_handle, #func); \
|
||||
if (opae_drv_funcs->func == nullptr) { \
|
||||
printf("dlsym failed: %s\n", dlerror()); \
|
||||
dlclose(dl_handle); \
|
||||
return -1; \
|
||||
}
|
||||
|
||||
void* dl_handle = nullptr;
|
||||
|
||||
int drv_init(opae_drv_api_t* opae_drv_funcs) {
|
||||
if (opae_drv_funcs == nullptr)
|
||||
return -1;
|
||||
|
||||
const char* api_path_s = getenv("OPAE_DRV_PATHS");
|
||||
if (api_path_s == nullptr || api_path_s[0] == '\0') {
|
||||
api_path_s = DEFAULT_OPAE_DRV_PATHS;
|
||||
}
|
||||
|
||||
std::vector<std::string> api_paths;
|
||||
{
|
||||
std::stringstream ss(api_path_s);
|
||||
while (ss.good()) {
|
||||
std::string path;
|
||||
getline(ss, path, ',');
|
||||
api_paths.push_back(path);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& api_path : api_paths) {
|
||||
dl_handle = dlopen(api_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
|
||||
if (dl_handle)
|
||||
break;
|
||||
}
|
||||
if (dl_handle == nullptr) {
|
||||
printf("dlopen failed: %s\n", dlerror());
|
||||
return -1;
|
||||
}
|
||||
|
||||
SET_API (fpgaGetProperties);
|
||||
SET_API (fpgaPropertiesSetObjectType);
|
||||
SET_API (fpgaPropertiesSetGUID);
|
||||
SET_API (fpgaDestroyProperties);
|
||||
SET_API (fpgaDestroyToken);
|
||||
SET_API (fpgaPropertiesGetLocalMemorySize);
|
||||
SET_API (fpgaEnumerate);
|
||||
SET_API (fpgaOpen);
|
||||
SET_API (fpgaClose);
|
||||
SET_API (fpgaPrepareBuffer);
|
||||
SET_API (fpgaReleaseBuffer);
|
||||
SET_API (fpgaGetIOAddress);
|
||||
SET_API (fpgaWriteMMIO64);
|
||||
SET_API (fpgaReadMMIO64);
|
||||
SET_API (fpgaErrStr);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void drv_close() {
|
||||
dlclose(dl_handle);
|
||||
}
|
||||
61
runtime/opae/driver.h
Normal file
61
runtime/opae/driver.h
Normal file
@@ -0,0 +1,61 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef OPAESIM
|
||||
#include <opae/fpga.h>
|
||||
#include <uuid/uuid.h>
|
||||
#else
|
||||
#include <fpga.h>
|
||||
#endif
|
||||
|
||||
typedef fpga_result (*pfn_fpgaGetProperties)(fpga_token token, fpga_properties *prop);
|
||||
typedef fpga_result (*pfn_fpgaPropertiesSetObjectType)(fpga_properties prop, fpga_objtype objtype);
|
||||
typedef fpga_result (*pfn_fpgaPropertiesSetGUID)(fpga_properties prop, fpga_guid guid);
|
||||
typedef fpga_result (*pfn_fpgaDestroyProperties)(fpga_properties *prop);
|
||||
typedef fpga_result (*pfn_fpgaEnumerate)(const fpga_properties *filters, uint32_t num_filters, fpga_token *tokens, uint32_t max_tokens, uint32_t *num_matches);
|
||||
typedef fpga_result (*pfn_fpgaDestroyToken)(fpga_token *token);
|
||||
typedef fpga_result (*pfn_fpgaPropertiesGetLocalMemorySize)(fpga_properties prop, uint64_t *lms);
|
||||
|
||||
typedef fpga_result (*pfn_fpgaOpen)(fpga_token token, fpga_handle *handle, int flags);
|
||||
typedef fpga_result (*pfn_fpgaClose)(fpga_handle handle);
|
||||
typedef fpga_result (*pfn_fpgaPrepareBuffer)(fpga_handle handle, uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
|
||||
typedef fpga_result (*pfn_fpgaReleaseBuffer)(fpga_handle handle, uint64_t wsid);
|
||||
typedef fpga_result (*pfn_fpgaGetIOAddress)(fpga_handle handle, uint64_t wsid, uint64_t *ioaddr);
|
||||
typedef fpga_result (*pfn_fpgaWriteMMIO64)(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t value);
|
||||
typedef fpga_result (*pfn_fpgaReadMMIO64)(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t *value);
|
||||
typedef const char *(*pfn_fpgaErrStr)(fpga_result e);
|
||||
|
||||
struct opae_drv_api_t {
|
||||
pfn_fpgaGetProperties fpgaGetProperties;
|
||||
pfn_fpgaPropertiesSetObjectType fpgaPropertiesSetObjectType;
|
||||
pfn_fpgaPropertiesSetGUID fpgaPropertiesSetGUID;
|
||||
pfn_fpgaDestroyProperties fpgaDestroyProperties;
|
||||
pfn_fpgaEnumerate fpgaEnumerate;
|
||||
pfn_fpgaDestroyToken fpgaDestroyToken;
|
||||
pfn_fpgaPropertiesGetLocalMemorySize fpgaPropertiesGetLocalMemorySize;
|
||||
|
||||
pfn_fpgaOpen fpgaOpen;
|
||||
pfn_fpgaClose fpgaClose;
|
||||
pfn_fpgaPrepareBuffer fpgaPrepareBuffer;
|
||||
pfn_fpgaReleaseBuffer fpgaReleaseBuffer;
|
||||
pfn_fpgaGetIOAddress fpgaGetIOAddress;
|
||||
pfn_fpgaWriteMMIO64 fpgaWriteMMIO64;
|
||||
pfn_fpgaReadMMIO64 fpgaReadMMIO64;
|
||||
pfn_fpgaErrStr fpgaErrStr;
|
||||
};
|
||||
|
||||
// Loads the OPAE driver library (paths taken from the OPAE_DRV_PATHS
// environment variable, else a build-time default) and fills opae_drv_funcs
// with its entry points. Returns 0 on success, -1 on failure.
int drv_init(opae_drv_api_t* opae_drv_funcs);
|
||||
|
||||
// Closes the driver library handle opened by drv_init().
void drv_close();
|
||||
610
runtime/opae/vortex.cpp
Executable file
610
runtime/opae/vortex.cpp
Executable file
@@ -0,0 +1,610 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <vortex.h>
|
||||
#include <utils.h>
|
||||
#include <malloc.h>
|
||||
#include "driver.h"
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <uuid/uuid.h>
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <list>
|
||||
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
#include <vortex_afu.h>
|
||||
|
||||
#ifdef SCOPE
|
||||
#include "scope.h"
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define CMD_MEM_READ AFU_IMAGE_CMD_MEM_READ
|
||||
#define CMD_MEM_WRITE AFU_IMAGE_CMD_MEM_WRITE
|
||||
#define CMD_RUN AFU_IMAGE_CMD_RUN
|
||||
#define CMD_DCR_WRITE AFU_IMAGE_CMD_DCR_WRITE
|
||||
|
||||
#define MMIO_CMD_TYPE (AFU_IMAGE_MMIO_CMD_TYPE * 4)
|
||||
#define MMIO_CMD_ARG0 (AFU_IMAGE_MMIO_CMD_ARG0 * 4)
|
||||
#define MMIO_CMD_ARG1 (AFU_IMAGE_MMIO_CMD_ARG1 * 4)
|
||||
#define MMIO_CMD_ARG2 (AFU_IMAGE_MMIO_CMD_ARG2 * 4)
|
||||
#define MMIO_STATUS (AFU_IMAGE_MMIO_STATUS * 4)
|
||||
#define MMIO_DEV_CAPS (AFU_IMAGE_MMIO_DEV_CAPS * 4)
|
||||
#define MMIO_ISA_CAPS (AFU_IMAGE_MMIO_ISA_CAPS * 4)
|
||||
#define MMIO_SCOPE_READ (AFU_IMAGE_MMIO_SCOPE_READ * 4)
|
||||
#define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4)
|
||||
|
||||
#define STATUS_STATE_BITS 8
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
#define CHECK_HANDLE(handle, _expr, _cleanup) \
|
||||
auto handle = _expr; \
|
||||
if (handle == nullptr) { \
|
||||
printf("[VXDRV] Error: '%s' returned NULL!\n", #_expr); \
|
||||
_cleanup \
|
||||
}
|
||||
|
||||
#define CHECK_ERR(_expr, _cleanup) \
|
||||
do { \
|
||||
auto err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[VXDRV] Error: '%s' returned %d, %s!\n", #_expr, (int)err, api.fpgaErrStr(err)); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device() :
|
||||
staging_wsid(0),
|
||||
staging_ioaddr(0),
|
||||
staging_ptr(nullptr),
|
||||
staging_size(0)
|
||||
{}
|
||||
|
||||
~vx_device() {}
|
||||
|
||||
int ensure_staging(uint64_t size) {
|
||||
size_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (staging_size >= asize)
|
||||
return 0;
|
||||
|
||||
if (staging_size != 0) {
|
||||
// release existing buffer
|
||||
api.fpgaReleaseBuffer(fpga, staging_wsid);
|
||||
staging_size = 0;
|
||||
}
|
||||
|
||||
// allocate new buffer
|
||||
CHECK_ERR(api.fpgaPrepareBuffer(fpga, asize, (void**)&staging_ptr, &staging_wsid, 0), {
|
||||
return -1;
|
||||
});
|
||||
|
||||
// get the physical address of the buffer in the accelerator
|
||||
CHECK_ERR(api.fpgaGetIOAddress(fpga, staging_wsid, &staging_ioaddr), {
|
||||
api.fpgaReleaseBuffer(fpga, staging_wsid);
|
||||
return -1;
|
||||
});
|
||||
|
||||
staging_size = asize;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
opae_drv_api_t api;
|
||||
fpga_handle fpga;
|
||||
std::shared_ptr<vortex::MemoryAllocator> global_mem;
|
||||
std::shared_ptr<vortex::MemoryAllocator> local_mem;
|
||||
DeviceConfig dcrs;
|
||||
uint64_t dev_caps;
|
||||
uint64_t isa_caps;
|
||||
uint64_t global_mem_size;
|
||||
uint64_t staging_wsid;
|
||||
uint64_t staging_ioaddr;
|
||||
uint8_t* staging_ptr;
|
||||
uint64_t staging_size;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Reports one device capability selected by `caps_id` through `*value`.
// Most entries are decoded from the cached MMIO_DEV_CAPS word; the kernel
// base address is rebuilt from the shadowed startup-address DCRs.
// Returns 0 on success and -1 for a null handle; an unknown id aborts.
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  if (hdevice == nullptr)
    return -1;

  auto device = (vx_device*)hdevice;
  const uint64_t caps = device->dev_caps;

  switch (caps_id) {
  case VX_CAPS_VERSION:
    *value = (caps >> 0) & 0xff;
    break;
  case VX_CAPS_NUM_THREADS:
    *value = (caps >> 8) & 0xff;
    break;
  case VX_CAPS_NUM_WARPS:
    *value = (caps >> 16) & 0xff;
    break;
  case VX_CAPS_NUM_CORES:
    *value = (caps >> 24) & 0xffff;
    break;
  case VX_CAPS_CACHE_LINE_SIZE:
    *value = CACHE_BLOCK_SIZE;
    break;
  case VX_CAPS_GLOBAL_MEM_SIZE:
    *value = device->global_mem_size;
    break;
  case VX_CAPS_LOCAL_MEM_SIZE:
    // bits [47:40] hold the log2 of the local memory size
    *value = 1ull << ((caps >> 40) & 0xff);
    break;
  case VX_CAPS_KERNEL_BASE_ADDR: {
    // upper half comes from ADDR1, lower half from ADDR0
    uint64_t upper = uint64_t(device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32;
    *value = upper | device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
    break;
  }
  case VX_CAPS_ISA_FLAGS:
    *value = device->isa_caps;
    break;
  default:
    fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id);
    std::abort();
    return -1;
  }

  return 0;
}
|
||||
|
||||
// Opens the first accelerator whose GUID matches AFU_ACCEL_UUID, reads its
// capability registers, creates the device memory allocators and initializes
// the DCRs. On success stores the new device in `*hdevice` and returns 0.
// On any failure every resource acquired so far (properties, token, FPGA
// handle, device object) is released and a non-zero code is returned.
extern int vx_dev_open(vx_device_h* hdevice) {
  if (nullptr == hdevice)
    return -1;

  vx_device* device = nullptr;

  fpga_handle accel_handle;
  fpga_token accel_token;
  fpga_properties filter;
  fpga_guid guid;

  uint32_t num_matches = 0;

  opae_drv_api_t api;
  memset(&api, 0, sizeof(opae_drv_api_t));
  if (drv_init(&api) != 0) {
    return -1;
  }

  // Set up a filter that will search for an accelerator
  CHECK_ERR(api.fpgaGetProperties(nullptr, &filter), {
    return -1;
  });

  CHECK_ERR(api.fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR), {
    api.fpgaDestroyProperties(&filter);
    return -1;
  });

  // Add the desired UUID to the filter
  std::string s_uuid(AFU_ACCEL_UUID);
  std::replace(s_uuid.begin(), s_uuid.end(), '_', '-');
  if (uuid_parse(s_uuid.c_str(), guid) != 0) {
    // a malformed UUID would otherwise leave `guid` uninitialized
    fprintf(stderr, "[VXDRV] Error: invalid accelerator UUID %s!\n", AFU_ACCEL_UUID);
    api.fpgaDestroyProperties(&filter);
    return -1;
  }
  CHECK_ERR(api.fpgaPropertiesSetGUID(filter, guid), {
    api.fpgaDestroyProperties(&filter);
    return -1;
  });

  // Do the search across the available FPGA contexts
  CHECK_ERR(api.fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches), {
    api.fpgaDestroyProperties(&filter);
    return -1;
  });

  // Not needed anymore
  CHECK_ERR(api.fpgaDestroyProperties(&filter), {
    api.fpgaDestroyToken(&accel_token);
    return -1;
  });

  if (num_matches < 1) {
    fprintf(stderr, "[VXDRV] Error: accelerator %s not found!\n", AFU_ACCEL_UUID);
    api.fpgaDestroyToken(&accel_token);
    return -1;
  }

  // Open accelerator
  CHECK_ERR(api.fpgaOpen(accel_token, &accel_handle, 0), {
    api.fpgaDestroyToken(&accel_token);
    return -1;
  });

  // Done with token
  CHECK_ERR(api.fpgaDestroyToken(&accel_token), {
    api.fpgaClose(accel_handle);
    return -1;
  });

  // allocate device object
  device = new vx_device();
  if (nullptr == device) {
    api.fpgaClose(accel_handle);
    return -1;
  }

  device->api = api;
  device->fpga = accel_handle;

  // Single failure path used once the device object exists: both the FPGA
  // handle and the device object must be released.
  auto fail = [&](int code)->int {
    api.fpgaClose(accel_handle);
    delete device;
    return code;
  };

  {
    // retrieve FPGA global memory size
    // NOTE(review): `filter` was already passed to fpgaDestroyProperties
    // above, so this query presumably always fails and the default below is
    // what gets used - confirm, and query the device properties instead if
    // the real size is wanted.
    CHECK_ERR(api.fpgaPropertiesGetLocalMemorySize(filter, &device->global_mem_size), {
      // assume 8GB as default
      device->global_mem_size = GLOBAL_MEM_SIZE;
    });

    // Load ISA CAPS
    CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_ISA_CAPS, &device->isa_caps), {
      return fail(-1);
    });

    // Load device CAPS
    CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_DEV_CAPS, &device->dev_caps), {
      return fail(-1);
    });
  }

  device->global_mem = std::make_shared<vortex::MemoryAllocator>(
    ALLOC_BASE_ADDR, ALLOC_MAX_ADDR - ALLOC_BASE_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE);

  // The reported size is (1 << log2_size), so a value of 1 means the log2
  // field is zero; only create the allocator when the size is larger.
  uint64_t local_mem_size = 0;
  vx_dev_caps(device, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size);
  if (local_mem_size > 1) {
    device->local_mem = std::make_shared<vortex::MemoryAllocator>(
      SMEM_BASE_ADDR, local_mem_size, RAM_PAGE_SIZE, 1);
  }

#ifdef SCOPE
  {
    scope_callback_t callback;
    callback.registerWrite = [](vx_device_h hdevice, uint64_t value)->int {
      auto device = (vx_device*)hdevice;
      return device->api.fpgaWriteMMIO64(device->fpga, 0, MMIO_SCOPE_WRITE, value);
    };
    callback.registerRead = [](vx_device_h hdevice, uint64_t* value)->int {
      auto device = (vx_device*)hdevice;
      return device->api.fpgaReadMMIO64(device->fpga, 0, MMIO_SCOPE_READ, value);
    };
    int ret = vx_scope_start(&callback, device, 0, -1);
    if (ret != 0) {
      return fail(ret);
    }
  }
#endif

  int err = dcr_initialize(device);
  if (err != 0) {
    return fail(err);
  }

#ifdef DUMP_PERF_STATS
  perf_add_device(device);
#endif

  *hdevice = device;

  return 0;
}
|
||||
|
||||
// Tears down a device created by vx_dev_open(): stops the scope and perf
// tracking when those features are compiled in, releases the staging
// buffer, closes the FPGA handle, destroys the device object and finally
// calls drv_close(). Returns -1 for a null handle, 0 otherwise.
extern int vx_dev_close(vx_device_h hdevice) {
  if (hdevice == nullptr)
    return -1;

  vx_device* dev = (vx_device*)hdevice;

#ifdef SCOPE
  vx_scope_stop(hdevice);
#endif

#ifdef DUMP_PERF_STATS
  perf_remove_device(hdevice);
#endif

  // drop the staging buffer if one was ever prepared
  if (0 != dev->staging_size) {
    dev->api.fpgaReleaseBuffer(dev->fpga, dev->staging_wsid);
    dev->staging_size = 0;
  }

  // close the accelerator before destroying the object that owns the handle
  dev->api.fpgaClose(dev->fpga);
  delete dev;

  drv_close();

  return 0;
}
|
||||
|
||||
// Allocates `size` bytes of device memory of the requested `type` and
// stores the device address in `*dev_addr`.
// Returns the allocator's result, or -1 when an argument is invalid, the
// type is unknown, or the device has no allocator for that memory type
// (local_mem is only created for devices that report local memory).
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
  if (nullptr == hdevice
   || nullptr == dev_addr
   || 0 == size)
    return -1;

  auto device = ((vx_device*)hdevice);

  vortex::MemoryAllocator* allocator = nullptr;
  if (type == VX_MEM_TYPE_GLOBAL) {
    allocator = device->global_mem.get();
  } else if (type == VX_MEM_TYPE_LOCAL) {
    allocator = device->local_mem.get();
  }

  // unknown type, or the allocator was never created
  if (nullptr == allocator)
    return -1;

  return allocator->allocate(size, dev_addr);
}
|
||||
|
||||
// Releases a device allocation. Addresses at or above SMEM_BASE_ADDR are
// returned to the local allocator, all others to the global allocator.
// A zero address is accepted as a no-op. Returns the allocator's result, or
// -1 for a null handle or when the selected allocator does not exist.
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
  if (nullptr == hdevice)
    return -1;

  if (0 == dev_addr)
    return 0;

  auto device = ((vx_device*)hdevice);

  vortex::MemoryAllocator* allocator = (dev_addr >= SMEM_BASE_ADDR)
                                     ? device->local_mem.get()
                                     : device->global_mem.get();

  // local_mem is optional; never dereference a missing allocator
  if (nullptr == allocator)
    return -1;

  return allocator->release(dev_addr);
}
|
||||
|
||||
// Reports the free and allocated byte counts of the allocator selected by
// `type`. Either output pointer may be null to skip that value.
// Returns 0 on success, -1 for a null handle, an unknown type, or a memory
// type for which the device has no allocator.
extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
  if (nullptr == hdevice)
    return -1;

  auto device = ((vx_device*)hdevice);

  vortex::MemoryAllocator* allocator = nullptr;
  if (type == VX_MEM_TYPE_GLOBAL) {
    allocator = device->global_mem.get();
  } else if (type == VX_MEM_TYPE_LOCAL) {
    allocator = device->local_mem.get();
  }

  if (nullptr == allocator)
    return -1;

  if (mem_free)
    *mem_free = allocator->free();
  // the used amount goes to mem_used (it was previously written to mem_free
  // for the local memory type)
  if (mem_used)
    *mem_used = allocator->allocated();

  return 0;
}
|
||||
|
||||
// Copies `size` bytes from `host_ptr` to device address `dev_addr` by way of
// the staging buffer: the data is placed in the staging buffer, then a
// CMD_MEM_WRITE command is issued with the source, destination and length
// expressed in cache-block units. `dev_addr` must be CACHE_BLOCK_SIZE
// aligned and the block-rounded range must fit in global memory.
// Returns 0 on success, -1 on any failure.
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
  if (hdevice == nullptr)
    return -1;

  auto device = (vx_device*)hdevice;
  auto& api = device->api;

  // the staging buffer must be able to hold the whole transfer
  if (0 != device->ensure_staging(size))
    return -1;

  const uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);

  // reject misaligned or out-of-range destinations
  if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE)
   || dev_addr + asize > device->global_mem_size)
    return -1;

  // the device must be idle before a new command is issued
  if (0 != vx_ready_wait(hdevice, VX_MAX_TIMEOUT))
    return -1;

  // fill the staging buffer
  memcpy(device->staging_ptr, host_ptr, size);

  // addresses and length are passed in units of cache blocks
  const int ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);

  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE), {
    return -1;
  });

  // block until the write command has completed
  return (0 != vx_ready_wait(hdevice, VX_MAX_TIMEOUT)) ? -1 : 0;
}
|
||||
|
||||
// Copies `size` bytes from device address `dev_addr` into `host_ptr` by way
// of the staging buffer: a CMD_MEM_READ command is issued with the staging
// address, device address and length expressed in cache-block units, and the
// staging buffer is then copied out. `dev_addr` must be CACHE_BLOCK_SIZE
// aligned and the block-rounded range must fit in global memory.
// Returns 0 on success, -1 on any failure.
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
  if (hdevice == nullptr)
    return -1;

  auto device = (vx_device*)hdevice;
  auto& api = device->api;

  // the staging buffer must be able to hold the whole transfer
  if (0 != device->ensure_staging(size))
    return -1;

  const uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);

  // reject misaligned or out-of-range sources
  if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE)
   || dev_addr + asize > device->global_mem_size)
    return -1;

  // the device must be idle before a new command is issued
  if (0 != vx_ready_wait(hdevice, VX_MAX_TIMEOUT))
    return -1;

  // addresses and length are passed in units of cache blocks
  const int ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);

  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ), {
    return -1;
  });

  // block until the read command has completed
  if (0 != vx_ready_wait(hdevice, VX_MAX_TIMEOUT))
    return -1;

  // hand the data back to the caller
  memcpy(host_ptr, device->staging_ptr, size);

  return 0;
}
|
||||
|
||||
// Waits for the device to become ready, then issues CMD_RUN.
// Returns 0 once the command has been written; -1 for a null handle, a
// failed ready-wait or a failed MMIO write. It does not wait for the run to
// finish.
extern int vx_start(vx_device_h hdevice) {
  if (hdevice == nullptr)
    return -1;

  auto device = (vx_device*)hdevice;
  auto& api = device->api;

  // the device must be idle before a new command is issued
  if (0 != vx_ready_wait(hdevice, VX_MAX_TIMEOUT))
    return -1;

  // kick off execution
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN), {
    return -1;
  });

  return 0;
}
|
||||
|
||||
// Polls MMIO_STATUS once per millisecond until the state field becomes zero
// or `timeout` (decremented by the sleep interval on every iteration)
// reaches zero. While polling, console characters carried in the status word
// are collected per id and printed a line at a time.
// Returns -1 for a null handle or a failed MMIO read and 0 otherwise; note
// that a timeout is only logged, it still returns 0.
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
  if (hdevice == nullptr)
    return -1;

  auto device = (vx_device*)hdevice;
  auto& api = device->api;

  // pending (not yet newline-terminated) console text, keyed by cout_tid
  std::unordered_map<uint32_t, std::stringstream> print_bufs;

  // polling interval
  struct timespec sleep_time;
  sleep_time.tv_sec  = 0;
  sleep_time.tv_nsec = 1000000;

  // same interval expressed in milliseconds
  const uint64_t sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);

  while (true) {
    uint64_t status;
    CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status), {
      return -1;
    });

    // drain console data: bit 0 = valid, bits 1..8 = character, bits 9..16 = id
    uint32_t cout_data = status >> STATUS_STATE_BITS;
    while (cout_data & 0x1) {
      char cout_char = (cout_data >> 1) & 0xff;
      uint32_t cout_tid = (cout_data >> 9) & 0xff;
      auto& ss_buf = print_bufs[cout_tid];
      ss_buf << cout_char;
      if ('\n' == cout_char) {
        std::cout << std::dec << "#" << cout_tid << ": " << ss_buf.str() << std::flush;
        ss_buf.str("");
      }
      CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status), {
        return -1;
      });
      cout_data = status >> STATUS_STATE_BITS;
    }

    uint32_t state = status & ((1 << STATUS_STATE_BITS)-1);

    // still busy and time left: sleep and poll again
    if (0 != state && 0 != timeout) {
      nanosleep(&sleep_time, nullptr);
      timeout -= sleep_time_ms;
      continue;
    }

    // finished or timed out: flush any partial lines
    for (auto& buf : print_bufs) {
      auto str = buf.second.str();
      if (str.empty())
        continue;
      std::cout << "#" << buf.first << ": " << str << std::endl;
    }
    if (0 != state) {
      fprintf(stdout, "[VXDRV] ready-wait timed out: state=%d\n", state);
    }
    return 0;
  }
}
|
||||
|
||||
// Writes `value` to the device configuration register `addr`: waits for the
// device to be ready, issues CMD_DCR_WRITE with the address and value as
// arguments, then records the value in the host-side DCR shadow.
// Returns 0 on success, -1 for a null handle or any failed step.
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
  if (hdevice == nullptr)
    return -1;

  auto device = (vx_device*)hdevice;
  auto& api = device->api;

  // the device must be idle before a new command is issued
  if (0 != vx_ready_wait(hdevice, -1))
    return -1;

  // send the register address, the value, then the command itself
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, addr), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, value), {
    return -1;
  });
  CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_DCR_WRITE), {
    return -1;
  });

  // keep the shadow copy in sync so later reads see the new value
  device->dcrs.write(addr, value);

  return 0;
}
|
||||
2
runtime/rtlsim/.gitignore
vendored
Normal file
2
runtime/rtlsim/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
obj_dir
|
||||
*.so
|
||||
45
runtime/rtlsim/Makefile
Normal file
45
runtime/rtlsim/Makefile
Normal file
@@ -0,0 +1,45 @@
|
||||
XLEN ?= 32

RTLSIM_DIR = ../../sim/rtlsim

CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I../include -I../common -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common
CXXFLAGS += -DXLEN_$(XLEN)

# Position independent code
CXXFLAGS += -fPIC

# Add external configuration
CXXFLAGS += $(CONFIGS)

# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS

LDFLAGS += -shared -pthread
LDFLAGS += -L. -lrtlsim

SRCS = vortex.cpp ../common/utils.cpp

# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif

# Enable perf counters
ifdef PERF
CXXFLAGS += -DPERF_ENABLE
endif

PROJECT = libvortex.so

# 'all' and 'clean' are commands, not files: keep them working even if a
# file with the same name appears in this directory.
.PHONY: all clean

all: $(PROJECT)

# Build the simulator library into this directory first, then link the
# runtime against it.
$(PROJECT): $(SRCS)
	DESTDIR=../../runtime/rtlsim $(MAKE) -C $(RTLSIM_DIR) ../../runtime/rtlsim/librtlsim.so
	$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT)

clean:
	DESTDIR=../../runtime/rtlsim $(MAKE) -C $(RTLSIM_DIR) clean
	rm -rf $(PROJECT) *.o
|
||||
336
runtime/rtlsim/vortex.cpp
Normal file
336
runtime/rtlsim/vortex.cpp
Normal file
@@ -0,0 +1,336 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <future>
|
||||
#include <list>
|
||||
#include <chrono>
|
||||
|
||||
#include <vortex.h>
|
||||
#include <malloc.h>
|
||||
#include <utils.h>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
#include <mem.h>
|
||||
#include <util.h>
|
||||
#include <processor.h>
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
: ram_(RAM_PAGE_SIZE)
|
||||
, global_mem_(
|
||||
ALLOC_BASE_ADDR,
|
||||
ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
|
||||
RAM_PAGE_SIZE,
|
||||
CACHE_BLOCK_SIZE)
|
||||
, local_mem_(
|
||||
SMEM_BASE_ADDR,
|
||||
(1ull << SMEM_LOG_SIZE),
|
||||
RAM_PAGE_SIZE,
|
||||
1)
|
||||
{
|
||||
processor_.attach_ram(&ram_);
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
}
|
||||
|
||||
int mem_alloc(uint64_t size, int type, uint64_t* dev_addr) {
|
||||
if (type == VX_MEM_TYPE_GLOBAL) {
|
||||
return global_mem_.allocate(size, dev_addr);
|
||||
} else if (type == VX_MEM_TYPE_LOCAL) {
|
||||
return local_mem_.allocate(size, dev_addr);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int mem_free(uint64_t dev_addr) {
|
||||
if (dev_addr >= SMEM_BASE_ADDR) {
|
||||
return local_mem_.release(dev_addr);
|
||||
} else {
|
||||
return global_mem_.release(dev_addr);
|
||||
}
|
||||
}
|
||||
|
||||
int mem_info(int type, uint64_t* mem_free, uint64_t* mem_used) const {
|
||||
if (type == VX_MEM_TYPE_GLOBAL) {
|
||||
if (mem_free)
|
||||
*mem_free = global_mem_.free();
|
||||
if (mem_used)
|
||||
*mem_used = global_mem_.allocated();
|
||||
} else if (type == VX_MEM_TYPE_LOCAL) {
|
||||
if (mem_free)
|
||||
*mem_free = local_mem_.free();
|
||||
if (mem_used)
|
||||
*mem_free = local_mem_.allocated();
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int upload(uint64_t dest_addr, const void* src, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
/*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src));
|
||||
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
|
||||
printf("\n0x%08lx=", dest_addr + i * CACHE_BLOCK_SIZE);
|
||||
for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
|
||||
printf("%02x", *((uint8_t*)src + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
|
||||
}
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
ram_.write((const uint8_t*)src, dest_addr, size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int download(void* dest, uint64_t src_addr, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.read((uint8_t*)dest, src_addr, size);
|
||||
|
||||
/*printf("VXDRV: download %ld bytes to 0x%lx:", size, uintptr_t((uint8_t*)dest));
|
||||
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
|
||||
printf("\n0x%08lx=", src_addr + i * CACHE_BLOCK_SIZE);
|
||||
for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
|
||||
printf("%02x", *((uint8_t*)dest + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
|
||||
}
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int start() {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
return 0;
|
||||
}
|
||||
|
||||
int wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
std::chrono::seconds wait_time(1);
|
||||
for (;;) {
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready
|
||||
|| 0 == timeout_sec--)
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_dcr(uint32_t addr, uint32_t value) {
|
||||
if (future_.valid()) {
|
||||
future_.wait(); // ensure prior run completed
|
||||
}
|
||||
processor_.write_dcr(addr, value);
|
||||
dcrs_.write(addr, value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t read_dcr(uint32_t addr) const {
|
||||
return dcrs_.read(addr);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
RAM ram_;
|
||||
Processor processor_;
|
||||
MemoryAllocator global_mem_;
|
||||
MemoryAllocator local_mem_;
|
||||
DeviceConfig dcrs_;
|
||||
std::future<void> future_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Reports one device capability selected by `caps_id` through `*value`.
// All values come from compile-time configuration except the kernel base
// address, which is rebuilt from the shadowed startup-address DCRs.
// Returns 0 on success and -1 for a null handle or a null `value`; an
// unknown id aborts.
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  if (nullptr == hdevice || nullptr == value)
    return -1;

  vx_device *device = ((vx_device*)hdevice);

  switch (caps_id) {
  case VX_CAPS_VERSION:
    *value = IMPLEMENTATION_ID;
    break;
  case VX_CAPS_NUM_THREADS:
    *value = NUM_THREADS;
    break;
  case VX_CAPS_NUM_WARPS:
    *value = NUM_WARPS;
    break;
  case VX_CAPS_NUM_CORES:
    *value = NUM_CORES * NUM_CLUSTERS;
    break;
  case VX_CAPS_CACHE_LINE_SIZE:
    *value = CACHE_BLOCK_SIZE;
    break;
  case VX_CAPS_GLOBAL_MEM_SIZE:
    *value = GLOBAL_MEM_SIZE;
    break;
  case VX_CAPS_LOCAL_MEM_SIZE:
    // same size the device's local allocator is constructed with
    *value = (1ull << SMEM_LOG_SIZE);
    break;
  case VX_CAPS_KERNEL_BASE_ADDR:
    *value = (uint64_t(device->read_dcr(VX_DCR_BASE_STARTUP_ADDR1)) << 32)
           | device->read_dcr(VX_DCR_BASE_STARTUP_ADDR0);
    break;
  case VX_CAPS_ISA_FLAGS:
    // widen before shifting: for XLEN=64 the field value is 2 and
    // (2 << 30) does not fit in a signed 32-bit int
    *value = ((uint64_t(MISA_EXT))<<32) | (uint64_t(log2floor(XLEN)-4) << 30) | MISA_STD;
    break;
  default:
    std::cout << "invalid caps id: " << caps_id << std::endl;
    std::abort();
    return -1;
  }

  return 0;
}
|
||||
|
||||
// Creates a simulated device, initializes its DCRs and hands it back through
// `*hdevice`. Returns 0 on success, -1 for a null output pointer or a failed
// allocation, or the dcr_initialize() error code (the device is destroyed
// in that case).
extern int vx_dev_open(vx_device_h* hdevice) {
  if (hdevice == nullptr)
    return -1;

  vx_device* dev = new vx_device();
  if (nullptr == dev)
    return -1;

  const int status = dcr_initialize(dev);
  if (0 != status) {
    delete dev;
    return status;
  }

#ifdef DUMP_PERF_STATS
  perf_add_device(dev);
#endif

  *hdevice = dev;

  return 0;
}
|
||||
|
||||
// Destroys a device created by vx_dev_open(), unregistering it from perf
// tracking first when that feature is compiled in.
// Returns -1 for a null handle, 0 otherwise.
extern int vx_dev_close(vx_device_h hdevice) {
  if (hdevice == nullptr)
    return -1;

  auto dev = (vx_device*)hdevice;

#ifdef DUMP_PERF_STATS
  perf_remove_device(hdevice);
#endif

  delete dev;

  return 0;
}
|
||||
|
||||
// Validates the arguments and forwards the allocation request to the device.
// Returns -1 for a null handle, a null output pointer or a zero size;
// otherwise the result of vx_device::mem_alloc().
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
  const bool valid = (hdevice != nullptr) && (dev_addr != nullptr) && (size != 0);
  if (!valid)
    return -1;

  auto dev = (vx_device*)hdevice;
  return dev->mem_alloc(size, type, dev_addr);
}
|
||||
|
||||
// Releases a device allocation. A zero address is accepted as a no-op.
// Returns -1 for a null handle; otherwise the result of
// vx_device::mem_free().
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
  if (hdevice == nullptr)
    return -1;

  // nothing to release
  if (dev_addr == 0)
    return 0;

  auto dev = (vx_device*)hdevice;
  return dev->mem_free(dev_addr);
}
|
||||
|
||||
// Forwards a memory usage query to the device.
// Returns -1 for a null handle; otherwise the result of
// vx_device::mem_info().
extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
  if (hdevice == nullptr)
    return -1;

  vx_device* dev = (vx_device*)hdevice;
  return dev->mem_info(type, mem_free, mem_used);
}
|
||||
|
||||
// Copies `size` bytes from host memory to device address `dev_addr`.
// Returns -1 for a null handle; otherwise the result of
// vx_device::upload().
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
  if (hdevice == nullptr)
    return -1;

  vx_device* dev = (vx_device*)hdevice;
  return dev->upload(dev_addr, host_ptr, size);
}
|
||||
|
||||
// Copies `size` bytes from device address `dev_addr` into host memory.
// Returns -1 for a null handle; otherwise the result of
// vx_device::download().
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
  if (hdevice == nullptr)
    return -1;

  vx_device* dev = (vx_device*)hdevice;
  return dev->download(host_ptr, dev_addr, size);
}
|
||||
|
||||
// Starts execution on the device.
// Returns -1 for a null handle; otherwise the result of vx_device::start().
extern int vx_start(vx_device_h hdevice) {
  if (hdevice == nullptr)
    return -1;

  auto dev = (vx_device*)hdevice;
  return dev->start();
}
|
||||
|
||||
// Waits for the device to finish its current run, bounded by `timeout`.
// Returns -1 for a null handle; otherwise the result of vx_device::wait().
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
  if (hdevice == nullptr)
    return -1;

  auto dev = (vx_device*)hdevice;
  return dev->wait(timeout);
}
|
||||
|
||||
// Writes `value` to the device configuration register `addr` once the
// device is ready for a new command.
// Returns -1 for a null handle or a failed ready-wait; otherwise the result
// of vx_device::write_dcr().
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
  if (hdevice == nullptr)
    return -1;

  auto dev = (vx_device*)hdevice;

  // the device must be idle before its configuration is changed
  const int ready = vx_ready_wait(hdevice, -1);
  if (0 != ready)
    return -1;

  return dev->write_dcr(addr, value);
}
|
||||
2
runtime/simx/.gitignore
vendored
Normal file
2
runtime/simx/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
obj_dir
|
||||
libvortex.so
|
||||
34
runtime/simx/Makefile
Normal file
34
runtime/simx/Makefile
Normal file
@@ -0,0 +1,34 @@
|
||||
XLEN ?= 32

SIMX_DIR = ../../sim/simx

CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../include -I../common -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common
CXXFLAGS += $(CONFIGS)
CXXFLAGS += -DDUMP_PERF_STATS
CXXFLAGS += -DXLEN_$(XLEN)

LDFLAGS += -shared -pthread
LDFLAGS += -L. -lsimx

SRCS = vortex.cpp ../common/utils.cpp

# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif

PROJECT = libvortex.so

# 'all' and 'clean' are commands, not files: keep them working even if a
# file with the same name appears in this directory.
.PHONY: all clean

all: $(PROJECT)

# Build the simulator library into this directory first, then link the
# runtime against it.
$(PROJECT): $(SRCS)
	DESTDIR=../../runtime/simx $(MAKE) -C $(SIMX_DIR) ../../runtime/simx/libsimx.so
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

clean:
	DESTDIR=../../runtime/simx $(MAKE) -C $(SIMX_DIR) clean
	rm -rf libsimx.so $(PROJECT) *.o
|
||||
397
runtime/simx/vortex.cpp
Normal file
397
runtime/simx/vortex.cpp
Normal file
@@ -0,0 +1,397 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <future>
|
||||
#include <chrono>
|
||||
|
||||
#include <vortex.h>
|
||||
#include <utils.h>
|
||||
#include <malloc.h>
|
||||
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include <processor.h>
|
||||
#include <arch.h>
|
||||
#include <mem.h>
|
||||
#include <constants.h>
|
||||
|
||||
// Debug trace helper: prints the message prefixed with "[VXDRV] " in debug
// builds and expands to a no-op expression when NDEBUG is defined.
#ifdef NDEBUG
#define DBGPRINT(format, ...) ((void)0)
#else
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format, ##__VA_ARGS__); } while (0)
#endif
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device;
|
||||
|
||||
class vx_buffer {
|
||||
public:
|
||||
vx_buffer(uint64_t size, vx_device* device)
|
||||
: size_(size)
|
||||
, device_(device) {
|
||||
uint64_t aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
data_ = aligned_malloc(aligned_asize, CACHE_BLOCK_SIZE);
|
||||
// set uninitialized data to "baadf00d"
|
||||
for (uint32_t i = 0; i < aligned_asize; ++i) {
|
||||
((uint8_t*)data_)[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff;
|
||||
}
|
||||
}
|
||||
|
||||
~vx_buffer() {
|
||||
if (data_) {
|
||||
aligned_free(data_);
|
||||
}
|
||||
}
|
||||
|
||||
void* data() const {
|
||||
return data_;
|
||||
}
|
||||
|
||||
uint64_t size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
vx_device* device() const {
|
||||
return device_;
|
||||
}
|
||||
|
||||
private:
|
||||
uint64_t size_;
|
||||
vx_device* device_;
|
||||
void* data_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, NUM_CLUSTERS)
|
||||
, ram_(RAM_PAGE_SIZE)
|
||||
, processor_(arch_)
|
||||
, global_mem_(
|
||||
ALLOC_BASE_ADDR,
|
||||
ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
|
||||
RAM_PAGE_SIZE,
|
||||
CACHE_BLOCK_SIZE)
|
||||
, local_mem_(
|
||||
SMEM_BASE_ADDR,
|
||||
(1ull << SMEM_LOG_SIZE),
|
||||
RAM_PAGE_SIZE,
|
||||
1)
|
||||
{
|
||||
// attach memory module
|
||||
processor_.attach_ram(&ram_);
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
}
|
||||
|
||||
int mem_alloc(uint64_t size, int type, uint64_t* dev_addr) {
|
||||
if (type == VX_MEM_TYPE_GLOBAL) {
|
||||
return global_mem_.allocate(size, dev_addr);
|
||||
} else if (type == VX_MEM_TYPE_LOCAL) {
|
||||
return local_mem_.allocate(size, dev_addr);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int mem_free(uint64_t dev_addr) {
|
||||
if (dev_addr >= SMEM_BASE_ADDR) {
|
||||
return local_mem_.release(dev_addr);
|
||||
} else {
|
||||
return global_mem_.release(dev_addr);
|
||||
}
|
||||
}
|
||||
|
||||
int mem_info(int type, uint64_t* mem_free, uint64_t* mem_used) const {
|
||||
if (type == VX_MEM_TYPE_GLOBAL) {
|
||||
if (mem_free)
|
||||
*mem_free = global_mem_.free();
|
||||
if (mem_used)
|
||||
*mem_used = global_mem_.allocated();
|
||||
} else if (type == VX_MEM_TYPE_LOCAL) {
|
||||
if (mem_free)
|
||||
*mem_free = local_mem_.free();
|
||||
if (mem_used)
|
||||
*mem_free = local_mem_.allocated();
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int upload(uint64_t dest_addr, const void* src, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.write((const uint8_t*)src, dest_addr, size);
|
||||
|
||||
/*DBGPRINT("upload %ld bytes to 0x%lx\n", size, dest_addr);
|
||||
for (uint64_t i = 0; i < size && i < 1024; i += 4) {
|
||||
DBGPRINT(" 0x%lx <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + i));
|
||||
}*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int download(void* dest, uint64_t src_addr, uint64_t size) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.read((uint8_t*)dest, src_addr, size);
|
||||
|
||||
/*DBGPRINT("download %ld bytes from 0x%lx\n", size, src_addr);
|
||||
for (uint64_t i = 0; i < size && i < 1024; i += 4) {
|
||||
DBGPRINT(" 0x%lx -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + i));
|
||||
}*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int start() {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run(false);
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
std::chrono::seconds wait_time(1);
|
||||
for (;;) {
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready
|
||||
|| 0 == timeout_sec--)
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_dcr(uint32_t addr, uint32_t value) {
|
||||
if (future_.valid()) {
|
||||
future_.wait(); // ensure prior run completed
|
||||
}
|
||||
processor_.write_dcr(addr, value);
|
||||
dcrs_.write(addr, value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t read_dcr(uint32_t addr) const {
|
||||
return dcrs_.read(addr);
|
||||
}
|
||||
|
||||
private:
|
||||
Arch arch_;
|
||||
RAM ram_;
|
||||
Processor processor_;
|
||||
MemoryAllocator global_mem_;
|
||||
MemoryAllocator local_mem_;
|
||||
DeviceConfig dcrs_;
|
||||
std::future<void> future_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Creates a simulated device, runs dcr_initialize() on it and hands the
// handle back through `hdevice`. Returns 0 on success; -1 for a null output
// pointer, or the dcr_initialize() error code (the device is destroyed again
// in that case).
extern int vx_dev_open(vx_device_h* hdevice) {
  if (hdevice == nullptr)
    return -1;

  vx_device* dev = new vx_device();
  if (!dev)
    return -1;

  const int status = dcr_initialize(dev);
  if (status != 0) {
    delete dev;
    return status;
  }

#ifdef DUMP_PERF_STATS
  perf_add_device(dev);
#endif

  *hdevice = dev;

  DBGPRINT("device creation complete!\n");

  return 0;
}
|
||||
|
||||
// Destroys a device previously returned by vx_dev_open().
// Returns -1 for a null handle, 0 otherwise.
extern int vx_dev_close(vx_device_h hdevice) {
  if (hdevice == nullptr)
    return -1;

  vx_device* const dev = (vx_device*)hdevice;

#ifdef DUMP_PERF_STATS
  // unregister before the object goes away
  perf_remove_device(hdevice);
#endif

  delete dev;

  DBGPRINT("device destroyed!\n");

  return 0;
}
|
||||
|
||||
// Queries one device capability.
//   hdevice : device handle from vx_dev_open()
//   caps_id : one of the VX_CAPS_* identifiers
//   value   : receives the capability value
// Returns 0 on success and -1 if `hdevice` or `value` is null.
// An unknown caps_id prints a message and aborts the process.
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  // `value` is written unconditionally below, so reject null up front.
  if (nullptr == hdevice || nullptr == value)
    return -1;

  vx_device *device = ((vx_device*)hdevice);

  switch (caps_id) {
  case VX_CAPS_VERSION:
    *value = IMPLEMENTATION_ID;
    break;
  case VX_CAPS_NUM_THREADS:
    *value = NUM_THREADS;
    break;
  case VX_CAPS_NUM_WARPS:
    *value = NUM_WARPS;
    break;
  case VX_CAPS_NUM_CORES:
    *value = NUM_CORES * NUM_CLUSTERS;
    break;
  case VX_CAPS_CACHE_LINE_SIZE:
    *value = CACHE_BLOCK_SIZE;
    break;
  case VX_CAPS_GLOBAL_MEM_SIZE:
    *value = GLOBAL_MEM_SIZE;
    break;
  case VX_CAPS_KERNEL_BASE_ADDR:
    // the startup address is kept as two DCRs: ADDR1 is the high word
    *value = (uint64_t(device->read_dcr(VX_DCR_BASE_STARTUP_ADDR1)) << 32)
           | device->read_dcr(VX_DCR_BASE_STARTUP_ADDR0);
    break;
  case VX_CAPS_ISA_FLAGS:
    // Layout: MISA_EXT in bits 63..32, (log2(XLEN) - 4) at bit 30, MISA_STD
    // in the low bits. The XLEN field is widened before shifting so that the
    // XLEN=64 encoding (2 << 30) cannot overflow a signed 32-bit intermediate.
    *value = ((uint64_t(MISA_EXT)) << 32)
           | (uint64_t(log2floor(XLEN) - 4) << 30)
           | MISA_STD;
    break;
  default:
    std::cout << "invalid caps id: " << caps_id << std::endl;
    std::abort();
    return -1;
  }

  return 0;
}
|
||||
|
||||
// Allocates `size` bytes of device memory of the given `type` and returns
// the address through `dev_addr`. A null handle, a null output pointer or a
// zero size yields -1; otherwise the device's mem_alloc() result is returned.
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
  const bool bad_args = (hdevice == nullptr)
                     || (dev_addr == nullptr)
                     || (size == 0);
  if (bad_args)
    return -1;

  auto dev = (vx_device*)hdevice;
  return dev->mem_alloc(size, type, dev_addr);
}
|
||||
|
||||
// Frees a device allocation. A null handle is an error (-1); a zero address
// is accepted as a no-op (0); anything else is forwarded to the device.
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
  if (hdevice == nullptr)
    return -1;

  if (dev_addr == 0)
    return 0;

  auto dev = (vx_device*)hdevice;
  return dev->mem_free(dev_addr);
}
|
||||
|
||||
// Reports free/used device memory for the given memory `type` by forwarding
// to the device's mem_info(). Returns -1 for a null handle.
extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
  if (hdevice == nullptr)
    return -1;

  vx_device* const dev = (vx_device*)hdevice;
  return dev->mem_info(type, mem_free, mem_used);
}
|
||||
|
||||
// Copies `size` bytes from host memory `host_ptr` to device address
// `dev_addr` through the device's upload(). Returns -1 for a null handle,
// otherwise upload()'s status.
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
  if (nullptr == hdevice)
    return -1;

  auto device = ((vx_device*)hdevice);

  // host_ptr is converted to an integer for logging: passing a raw pointer
  // for a %lx conversion is a format/argument type mismatch.
  DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size);

  return device->upload(dev_addr, host_ptr, size);
}
|
||||
|
||||
// Copies `size` bytes from device address `dev_addr` into host memory
// `host_ptr` through the device's download(). Returns -1 for a null handle,
// otherwise download()'s status.
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
  if (nullptr == hdevice)
    return -1;

  auto device = ((vx_device*)hdevice);

  // host_ptr is converted to an integer for logging: passing a raw pointer
  // for a %lx conversion is a format/argument type mismatch.
  DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size);

  return device->download(host_ptr, dev_addr, size);
}
|
||||
|
||||
// Starts execution on the device by forwarding to its start().
// Returns -1 for a null handle.
extern int vx_start(vx_device_h hdevice) {
  if (hdevice == nullptr)
    return -1;

  DBGPRINT("START\n");

  auto dev = (vx_device*)hdevice;
  return dev->start();
}
|
||||
|
||||
// Waits for the device to finish its current run by forwarding `timeout`
// to the device's wait(). Returns -1 for a null handle.
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
  if (hdevice == nullptr)
    return -1;

  auto dev = (vx_device*)hdevice;
  return dev->wait(timeout);
}
|
||||
|
||||
// Writes a device configuration register after making sure the device is
// idle. Returns -1 for a null handle or if the readiness wait fails,
// otherwise the device's write_dcr() status.
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
  if (hdevice == nullptr)
    return -1;

  auto dev = (vx_device*)hdevice;

  // Ensure ready for new command (-1 converts to the largest possible timeout)
  const int ready = vx_ready_wait(hdevice, -1);
  if (ready != 0)
    return -1;

  DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%lx\n", addr, value);

  // write_dcr() takes a 32-bit value, so the argument is narrowed here
  return dev->write_dcr(addr, value);
}
|
||||
@@ -1,890 +0,0 @@
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// \author (c) Marco Paland (info@paland.com)
|
||||
// 2014-2019, PALANDesign Hannover, Germany
|
||||
//
|
||||
// \license The MIT License (MIT)
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
//
|
||||
// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on
|
||||
//        embedded systems with very limited resources. These routines are thread
|
||||
// safe and reentrant!
|
||||
//        Use this instead of the bloated standard/newlib printf because these use
|
||||
// malloc for printf (and may not be thread safe).
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include "tinyprintf.h"
|
||||
#include "vx_print.h"
|
||||
|
||||
|
||||
// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...) to include the
|
||||
// printf_config.h header file
|
||||
// default: undefined
|
||||
#ifdef PRINTF_INCLUDE_CONFIG_H
|
||||
#include "printf_config.h"
|
||||
#endif
|
||||
|
||||
|
||||
// 'ntoa' conversion buffer size, this must be big enough to hold one converted
|
||||
// numeric number including padded zeros (dynamically created on stack)
|
||||
// default: 32 byte
|
||||
#ifndef PRINTF_NTOA_BUFFER_SIZE
|
||||
#define PRINTF_NTOA_BUFFER_SIZE 32U
|
||||
#endif
|
||||
|
||||
// 'ftoa' conversion buffer size, this must be big enough to hold one converted
|
||||
// float number including padded zeros (dynamically created on stack)
|
||||
// default: 32 byte
|
||||
#ifndef PRINTF_FTOA_BUFFER_SIZE
|
||||
#define PRINTF_FTOA_BUFFER_SIZE 32U
|
||||
#endif
|
||||
|
||||
// support for the floating point type (%f)
|
||||
// default: activated
|
||||
#ifndef PRINTF_DISABLE_SUPPORT_FLOAT
|
||||
#define PRINTF_SUPPORT_FLOAT
|
||||
#endif
|
||||
|
||||
// support for exponential floating point notation (%e/%g)
|
||||
// default: activated
|
||||
#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL
|
||||
#define PRINTF_SUPPORT_EXPONENTIAL
|
||||
#endif
|
||||
|
||||
// define the default floating point precision
|
||||
// default: 6 digits
|
||||
#ifndef PRINTF_DEFAULT_FLOAT_PRECISION
|
||||
#define PRINTF_DEFAULT_FLOAT_PRECISION 6U
|
||||
#endif
|
||||
|
||||
// define the largest float suitable to print with %f
|
||||
// default: 1e9
|
||||
#ifndef PRINTF_MAX_FLOAT
|
||||
#define PRINTF_MAX_FLOAT 1e9
|
||||
#endif
|
||||
|
||||
// support for the long long types (%llu or %p)
|
||||
// default: activated
|
||||
#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG
|
||||
#define PRINTF_SUPPORT_LONG_LONG
|
||||
#endif
|
||||
|
||||
// support for the ptrdiff_t type (%t)
|
||||
// ptrdiff_t is normally defined in <stddef.h> as long or long long type
|
||||
// default: activated
|
||||
#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T
|
||||
#define PRINTF_SUPPORT_PTRDIFF_T
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// internal flag definitions
|
||||
#define FLAGS_ZEROPAD (1U << 0U)
|
||||
#define FLAGS_LEFT (1U << 1U)
|
||||
#define FLAGS_PLUS (1U << 2U)
|
||||
#define FLAGS_SPACE (1U << 3U)
|
||||
#define FLAGS_HASH (1U << 4U)
|
||||
#define FLAGS_UPPERCASE (1U << 5U)
|
||||
#define FLAGS_CHAR (1U << 6U)
|
||||
#define FLAGS_SHORT (1U << 7U)
|
||||
#define FLAGS_LONG (1U << 8U)
|
||||
#define FLAGS_LONG_LONG (1U << 9U)
|
||||
#define FLAGS_PRECISION (1U << 10U)
|
||||
#define FLAGS_ADAPT_EXP (1U << 11U)
|
||||
|
||||
|
||||
// import float.h for DBL_MAX
|
||||
#if defined(PRINTF_SUPPORT_FLOAT)
|
||||
#include <float.h>
|
||||
#endif
|
||||
|
||||
|
||||
// output function type
|
||||
typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen);
|
||||
|
||||
|
||||
// wrapper (used as buffer) for output function type
|
||||
typedef struct {
|
||||
void (*fct)(char character, void* arg);
|
||||
void* arg;
|
||||
} out_fct_wrap_type;
|
||||
|
||||
|
||||
// Output sink that stores into a caller-supplied character buffer.
// Writes `character` at buffer[idx]; positions at or beyond `maxlen` are
// silently dropped so the buffer is never overrun.
static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen)
{
  if (idx >= maxlen) {
    return;
  }
  char* const dst = (char*)buffer;
  dst[idx] = character;
}
|
||||
|
||||
|
||||
// Output sink that discards everything; every argument is ignored.
static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen)
{
  (void)character;
  (void)buffer;
  (void)idx;
  (void)maxlen;
}
|
||||
|
||||
|
||||
// Output sink that forwards each character to vx_putchar().
// The NUL character is never forwarded; buffer/idx/maxlen are unused.
static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen)
{
  (void)buffer;
  (void)idx;
  (void)maxlen;
  if (character == '\0') {
    return;
  }
  vx_putchar(character);
}
|
||||
|
||||
|
||||
// internal output function wrapper
|
||||
static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen)
|
||||
{
|
||||
(void)idx; (void)maxlen;
|
||||
if (character) {
|
||||
// buffer is the output fct pointer
|
||||
((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Bounded string length.
// \return The number of characters before the terminating 0, capped at 'maxsize'
static inline unsigned int _strnlen_s(const char* str, size_t maxsize)
{
  size_t count = 0U;
  // the character is tested before the bound, exactly one element at a time
  while ((str[count] != '\0') && (count < maxsize)) {
    ++count;
  }
  return (unsigned int)count;
}
|
||||
|
||||
|
||||
// Tells whether 'ch' is one of the ASCII decimal digits '0'..'9'.
// \return true if char is a digit
static inline bool _is_digit(char ch)
{
  if (ch < '0') {
    return false;
  }
  return ch <= '9';
}
|
||||
|
||||
|
||||
// Parses a run of decimal digits starting at *str into an unsigned int.
// On return *str points at the first non-digit character. No overflow
// detection is performed; the arithmetic wraps as unsigned.
static unsigned int _atoi(const char** str)
{
  const char* cursor = *str;
  unsigned int result = 0U;
  for (; _is_digit(*cursor); ++cursor) {
    result = result * 10U + (unsigned int)(*cursor - '0');
  }
  *str = cursor;
  return result;
}
|
||||
|
||||
|
||||
// output the specified string in reverse, taking care of any zero-padding
|
||||
static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags)
|
||||
{
|
||||
const size_t start_idx = idx;
|
||||
|
||||
// pad spaces up to given width
|
||||
if (!(flags & FLAGS_LEFT) && !(flags & FLAGS_ZEROPAD)) {
|
||||
for (size_t i = len; i < width; i++) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
|
||||
// reverse string
|
||||
while (len) {
|
||||
out(buf[--len], buffer, idx++, maxlen);
|
||||
}
|
||||
|
||||
// append pad spaces up to given width
|
||||
if (flags & FLAGS_LEFT) {
|
||||
while (idx - start_idx < width) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
|
||||
// internal itoa format
|
||||
static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags)
|
||||
{
|
||||
// pad leading zeros
|
||||
if (!(flags & FLAGS_LEFT)) {
|
||||
if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
|
||||
width--;
|
||||
}
|
||||
while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = '0';
|
||||
}
|
||||
while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = '0';
|
||||
}
|
||||
}
|
||||
|
||||
// handle hash
|
||||
if (flags & FLAGS_HASH) {
|
||||
if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) {
|
||||
len--;
|
||||
if (len && (base == 16U)) {
|
||||
len--;
|
||||
}
|
||||
}
|
||||
if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = 'x';
|
||||
}
|
||||
else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = 'X';
|
||||
}
|
||||
else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = 'b';
|
||||
}
|
||||
if (len < PRINTF_NTOA_BUFFER_SIZE) {
|
||||
buf[len++] = '0';
|
||||
}
|
||||
}
|
||||
|
||||
if (len < PRINTF_NTOA_BUFFER_SIZE) {
|
||||
if (negative) {
|
||||
buf[len++] = '-';
|
||||
}
|
||||
else if (flags & FLAGS_PLUS) {
|
||||
buf[len++] = '+'; // ignore the space if the '+' exists
|
||||
}
|
||||
else if (flags & FLAGS_SPACE) {
|
||||
buf[len++] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
|
||||
}
|
||||
|
||||
|
||||
// internal itoa for 'long' type
|
||||
static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags)
|
||||
{
|
||||
char buf[PRINTF_NTOA_BUFFER_SIZE];
|
||||
size_t len = 0U;
|
||||
|
||||
// no hash for 0 values
|
||||
if (!value) {
|
||||
flags &= ~FLAGS_HASH;
|
||||
}
|
||||
|
||||
// write if precision != 0 and value is != 0
|
||||
if (!(flags & FLAGS_PRECISION) || value) {
|
||||
do {
|
||||
const char digit = (char)(value % base);
|
||||
buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10;
|
||||
value /= base;
|
||||
} while (value && (len < PRINTF_NTOA_BUFFER_SIZE));
|
||||
}
|
||||
|
||||
return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags);
|
||||
}
|
||||
|
||||
|
||||
// internal itoa for 'long long' type
|
||||
#if defined(PRINTF_SUPPORT_LONG_LONG)
|
||||
// 'unsigned long long' counterpart of _ntoa_long: produces reversed digits
// in 'base' and passes them to _ntoa_format.
static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags)
{
  char digits[PRINTF_NTOA_BUFFER_SIZE];
  size_t count = 0U;

  // no hash for 0 values
  if (value == 0ULL) {
    flags &= ~FLAGS_HASH;
  }

  // an explicit precision together with a zero value produces no digits
  if (value || !(flags & FLAGS_PRECISION)) {
    const char alpha = (flags & FLAGS_UPPERCASE) ? 'A' : 'a';
    do {
      const char digit = (char)(value % base);
      if (digit < 10) {
        digits[count++] = (char)('0' + digit);
      }
      else {
        digits[count++] = (char)(alpha + digit - 10);
      }
      value /= base;
    } while (value && (count < PRINTF_NTOA_BUFFER_SIZE));
  }

  return _ntoa_format(out, buffer, idx, maxlen, digits, count, negative, (unsigned int)base, prec, width, flags);
}
|
||||
#endif // PRINTF_SUPPORT_LONG_LONG
|
||||
|
||||
|
||||
#if defined(PRINTF_SUPPORT_FLOAT)
|
||||
|
||||
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
|
||||
// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT
|
||||
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags);
|
||||
#endif
|
||||
|
||||
|
||||
// internal ftoa for fixed decimal floating point
|
||||
static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
|
||||
{
|
||||
char buf[PRINTF_FTOA_BUFFER_SIZE];
|
||||
size_t len = 0U;
|
||||
double diff = 0.0;
|
||||
|
||||
// powers of 10
|
||||
static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };
|
||||
|
||||
// test for special values
|
||||
if (value != value)
|
||||
return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags);
|
||||
if (value < -DBL_MAX)
|
||||
return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags);
|
||||
if (value > DBL_MAX)
|
||||
return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 4U : 3U, width, flags);
|
||||
|
||||
// test for very large values
|
||||
// standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad
|
||||
if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) {
|
||||
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
|
||||
return _etoa(out, buffer, idx, maxlen, value, prec, width, flags);
|
||||
#else
|
||||
return 0U;
|
||||
#endif
|
||||
}
|
||||
|
||||
// test for negative
|
||||
bool negative = false;
|
||||
if (value < 0) {
|
||||
negative = true;
|
||||
value = 0 - value;
|
||||
}
|
||||
|
||||
// set default precision, if not set explicitly
|
||||
if (!(flags & FLAGS_PRECISION)) {
|
||||
prec = PRINTF_DEFAULT_FLOAT_PRECISION;
|
||||
}
|
||||
// limit precision to 9, cause a prec >= 10 can lead to overflow errors
|
||||
while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) {
|
||||
buf[len++] = '0';
|
||||
prec--;
|
||||
}
|
||||
|
||||
int whole = (int)value;
|
||||
double tmp = (value - whole) * pow10[prec];
|
||||
unsigned long frac = (unsigned long)tmp;
|
||||
diff = tmp - frac;
|
||||
|
||||
if (diff > 0.5) {
|
||||
++frac;
|
||||
// handle rollover, e.g. case 0.99 with prec 1 is 1.0
|
||||
if (frac >= pow10[prec]) {
|
||||
frac = 0;
|
||||
++whole;
|
||||
}
|
||||
}
|
||||
else if (diff < 0.5) {
|
||||
}
|
||||
else if ((frac == 0U) || (frac & 1U)) {
|
||||
// if halfway, round up if odd OR if last digit is 0
|
||||
++frac;
|
||||
}
|
||||
|
||||
if (prec == 0U) {
|
||||
diff = value - (double)whole;
|
||||
if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) {
|
||||
// exactly 0.5 and ODD, then round up
|
||||
// 1.5 -> 2, but 2.5 -> 2
|
||||
++whole;
|
||||
}
|
||||
}
|
||||
else {
|
||||
unsigned int count = prec;
|
||||
// now do fractional part, as an unsigned number
|
||||
while (len < PRINTF_FTOA_BUFFER_SIZE) {
|
||||
--count;
|
||||
buf[len++] = (char)(48U + (frac % 10U));
|
||||
if (!(frac /= 10U)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// add extra 0s
|
||||
while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) {
|
||||
buf[len++] = '0';
|
||||
}
|
||||
if (len < PRINTF_FTOA_BUFFER_SIZE) {
|
||||
// add decimal
|
||||
buf[len++] = '.';
|
||||
}
|
||||
}
|
||||
|
||||
// do whole part, number is reversed
|
||||
while (len < PRINTF_FTOA_BUFFER_SIZE) {
|
||||
buf[len++] = (char)(48 + (whole % 10));
|
||||
if (!(whole /= 10)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// pad leading zeros
|
||||
if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) {
|
||||
if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
|
||||
width--;
|
||||
}
|
||||
while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = '0';
|
||||
}
|
||||
}
|
||||
|
||||
if (len < PRINTF_FTOA_BUFFER_SIZE) {
|
||||
if (negative) {
|
||||
buf[len++] = '-';
|
||||
}
|
||||
else if (flags & FLAGS_PLUS) {
|
||||
buf[len++] = '+'; // ignore the space if the '+' exists
|
||||
}
|
||||
else if (flags & FLAGS_SPACE) {
|
||||
buf[len++] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
|
||||
}
|
||||
|
||||
|
||||
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
|
||||
// internal ftoa variant for exponential floating-point type, contributed by Martijn Jasperse <m.jasperse@gmail.com>
|
||||
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
|
||||
{
|
||||
// check for NaN and special values
|
||||
if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) {
|
||||
return _ftoa(out, buffer, idx, maxlen, value, prec, width, flags);
|
||||
}
|
||||
|
||||
// determine the sign
|
||||
const bool negative = value < 0;
|
||||
if (negative) {
|
||||
value = -value;
|
||||
}
|
||||
|
||||
// default precision
|
||||
if (!(flags & FLAGS_PRECISION)) {
|
||||
prec = PRINTF_DEFAULT_FLOAT_PRECISION;
|
||||
}
|
||||
|
||||
// determine the decimal exponent
|
||||
// based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c)
|
||||
union {
|
||||
uint64_t U;
|
||||
double F;
|
||||
} conv;
|
||||
|
||||
conv.F = value;
|
||||
int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023; // effectively log2
|
||||
conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U); // drop the exponent so conv.F is now in [1,2)
|
||||
// now approximate log10 from the log2 integer part and an expansion of ln around 1.5
|
||||
int expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168);
|
||||
// now we want to compute 10^expval but we want to be sure it won't overflow
|
||||
exp2 = (int)(expval * 3.321928094887362 + 0.5);
|
||||
const double z = expval * 2.302585092994046 - exp2 * 0.6931471805599453;
|
||||
const double z2 = z * z;
|
||||
conv.U = (uint64_t)(exp2 + 1023) << 52U;
|
||||
// compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex
|
||||
conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14)))));
|
||||
// correct for rounding errors
|
||||
if (value < conv.F) {
|
||||
expval--;
|
||||
conv.F /= 10;
|
||||
}
|
||||
|
||||
// the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters
|
||||
unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U;
|
||||
|
||||
// in "%g" mode, "prec" is the number of *significant figures* not decimals
|
||||
if (flags & FLAGS_ADAPT_EXP) {
|
||||
// do we want to fall-back to "%f" mode?
|
||||
if ((value >= 1e-4) && (value < 1e6)) {
|
||||
if ((int)prec > expval) {
|
||||
prec = (unsigned)((int)prec - expval - 1);
|
||||
}
|
||||
else {
|
||||
prec = 0;
|
||||
}
|
||||
flags |= FLAGS_PRECISION; // make sure _ftoa respects precision
|
||||
// no characters in exponent
|
||||
minwidth = 0U;
|
||||
expval = 0;
|
||||
}
|
||||
else {
|
||||
// we use one sigfig for the whole part
|
||||
if ((prec > 0) && (flags & FLAGS_PRECISION)) {
|
||||
--prec;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// will everything fit?
|
||||
unsigned int fwidth = width;
|
||||
if (width > minwidth) {
|
||||
// we didn't fall-back so subtract the characters required for the exponent
|
||||
fwidth -= minwidth;
|
||||
} else {
|
||||
// not enough characters, so go back to default sizing
|
||||
fwidth = 0U;
|
||||
}
|
||||
if ((flags & FLAGS_LEFT) && minwidth) {
|
||||
// if we're padding on the right, DON'T pad the floating part
|
||||
fwidth = 0U;
|
||||
}
|
||||
|
||||
// rescale the float value
|
||||
if (expval) {
|
||||
value /= conv.F;
|
||||
}
|
||||
|
||||
// output the floating part
|
||||
const size_t start_idx = idx;
|
||||
idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP);
|
||||
|
||||
// output the exponent part
|
||||
if (minwidth) {
|
||||
// output the exponential symbol
|
||||
out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen);
|
||||
// output the exponent value
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS);
|
||||
// might need to right-pad spaces
|
||||
if (flags & FLAGS_LEFT) {
|
||||
while (idx - start_idx < width) out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
#endif // PRINTF_SUPPORT_EXPONENTIAL
|
||||
#endif // PRINTF_SUPPORT_FLOAT
|
||||
|
||||
|
||||
// internal vsnprintf
|
||||
static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va) {
|
||||
unsigned int flags, width, precision, n;
|
||||
size_t idx = 0U;
|
||||
|
||||
if (!buffer) {
|
||||
// use null output function
|
||||
out = _out_null;
|
||||
}
|
||||
|
||||
while (*format)
|
||||
{
|
||||
// format specifier? %[flags][width][.precision][length]
|
||||
if (*format != '%') {
|
||||
// no
|
||||
out(*format, buffer, idx++, maxlen);
|
||||
format++;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
// yes, evaluate it
|
||||
format++;
|
||||
}
|
||||
|
||||
// evaluate flags
|
||||
flags = 0U;
|
||||
do {
|
||||
switch (*format) {
|
||||
case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break;
|
||||
case '-': flags |= FLAGS_LEFT; format++; n = 1U; break;
|
||||
case '+': flags |= FLAGS_PLUS; format++; n = 1U; break;
|
||||
case ' ': flags |= FLAGS_SPACE; format++; n = 1U; break;
|
||||
case '#': flags |= FLAGS_HASH; format++; n = 1U; break;
|
||||
default : n = 0U; break;
|
||||
}
|
||||
} while (n);
|
||||
|
||||
// evaluate width field
|
||||
width = 0U;
|
||||
if (_is_digit(*format)) {
|
||||
width = _atoi(&format);
|
||||
}
|
||||
else if (*format == '*') {
|
||||
const int w = va_arg(va, int);
|
||||
if (w < 0) {
|
||||
flags |= FLAGS_LEFT; // reverse padding
|
||||
width = (unsigned int)-w;
|
||||
}
|
||||
else {
|
||||
width = (unsigned int)w;
|
||||
}
|
||||
format++;
|
||||
}
|
||||
|
||||
// evaluate precision field
|
||||
precision = 0U;
|
||||
if (*format == '.') {
|
||||
flags |= FLAGS_PRECISION;
|
||||
format++;
|
||||
if (_is_digit(*format)) {
|
||||
precision = _atoi(&format);
|
||||
}
|
||||
else if (*format == '*') {
|
||||
const int prec = (int)va_arg(va, int);
|
||||
precision = prec > 0 ? (unsigned int)prec : 0U;
|
||||
format++;
|
||||
}
|
||||
}
|
||||
|
||||
// evaluate length field
|
||||
switch (*format) {
|
||||
case 'l' :
|
||||
flags |= FLAGS_LONG;
|
||||
format++;
|
||||
if (*format == 'l') {
|
||||
flags |= FLAGS_LONG_LONG;
|
||||
format++;
|
||||
}
|
||||
break;
|
||||
case 'h' :
|
||||
flags |= FLAGS_SHORT;
|
||||
format++;
|
||||
if (*format == 'h') {
|
||||
flags |= FLAGS_CHAR;
|
||||
format++;
|
||||
}
|
||||
break;
|
||||
#if defined(PRINTF_SUPPORT_PTRDIFF_T)
|
||||
case 't' :
|
||||
flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
|
||||
format++;
|
||||
break;
|
||||
#endif
|
||||
case 'j' :
|
||||
flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
|
||||
format++;
|
||||
break;
|
||||
case 'z' :
|
||||
flags |= (sizeof(size_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
|
||||
format++;
|
||||
break;
|
||||
default :
|
||||
break;
|
||||
}
|
||||
|
||||
// evaluate specifier
|
||||
switch (*format) {
|
||||
case 'd' :
|
||||
case 'i' :
|
||||
case 'u' :
|
||||
case 'x' :
|
||||
case 'X' :
|
||||
case 'o' :
|
||||
case 'b' : {
|
||||
// set the base
|
||||
unsigned int base;
|
||||
if (*format == 'x' || *format == 'X') {
|
||||
base = 16U;
|
||||
}
|
||||
else if (*format == 'o') {
|
||||
base = 8U;
|
||||
}
|
||||
else if (*format == 'b') {
|
||||
base = 2U;
|
||||
}
|
||||
else {
|
||||
base = 10U;
|
||||
flags &= ~FLAGS_HASH; // no hash for dec format
|
||||
}
|
||||
// uppercase
|
||||
if (*format == 'X') {
|
||||
flags |= FLAGS_UPPERCASE;
|
||||
}
|
||||
|
||||
// no plus or space flag for u, x, X, o, b
|
||||
if ((*format != 'i') && (*format != 'd')) {
|
||||
flags &= ~(FLAGS_PLUS | FLAGS_SPACE);
|
||||
}
|
||||
|
||||
// ignore '0' flag when precision is given
|
||||
if (flags & FLAGS_PRECISION) {
|
||||
flags &= ~FLAGS_ZEROPAD;
|
||||
}
|
||||
|
||||
// convert the integer
|
||||
if ((*format == 'i') || (*format == 'd')) {
|
||||
// signed
|
||||
if (flags & FLAGS_LONG_LONG) {
|
||||
#if defined(PRINTF_SUPPORT_LONG_LONG)
|
||||
const long long value = va_arg(va, long long);
|
||||
idx = _ntoa_long_long(out, buffer, idx, maxlen, (unsigned long long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
|
||||
#endif
|
||||
}
|
||||
else if (flags & FLAGS_LONG) {
|
||||
const long value = va_arg(va, long);
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
|
||||
}
|
||||
else {
|
||||
const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int);
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned int)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// unsigned
|
||||
if (flags & FLAGS_LONG_LONG) {
|
||||
#if defined(PRINTF_SUPPORT_LONG_LONG)
|
||||
idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags);
|
||||
#endif
|
||||
}
|
||||
else if (flags & FLAGS_LONG) {
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags);
|
||||
}
|
||||
else {
|
||||
const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int);
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags);
|
||||
}
|
||||
}
|
||||
format++;
|
||||
break;
|
||||
}
|
||||
#if defined(PRINTF_SUPPORT_FLOAT)
|
||||
case 'f' :
|
||||
case 'F' :
|
||||
if (*format == 'F') flags |= FLAGS_UPPERCASE;
|
||||
idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
|
||||
format++;
|
||||
break;
|
||||
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
|
||||
case 'e':
|
||||
case 'E':
|
||||
case 'g':
|
||||
case 'G':
|
||||
if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP;
|
||||
if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE;
|
||||
idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
|
||||
format++;
|
||||
break;
|
||||
#endif // PRINTF_SUPPORT_EXPONENTIAL
|
||||
#endif // PRINTF_SUPPORT_FLOAT
|
||||
case 'c' : {
|
||||
unsigned int l = 1U;
|
||||
// pre padding
|
||||
if (!(flags & FLAGS_LEFT)) {
|
||||
while (l++ < width) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
// char output
|
||||
out((char)va_arg(va, int), buffer, idx++, maxlen);
|
||||
// post padding
|
||||
if (flags & FLAGS_LEFT) {
|
||||
while (l++ < width) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
format++;
|
||||
break;
|
||||
}
|
||||
|
||||
case 's' : {
|
||||
const char* p = va_arg(va, char*);
|
||||
unsigned int l = _strnlen_s(p, precision ? precision : (size_t)-1);
|
||||
// pre padding
|
||||
if (flags & FLAGS_PRECISION) {
|
||||
l = (l < precision ? l : precision);
|
||||
}
|
||||
if (!(flags & FLAGS_LEFT)) {
|
||||
while (l++ < width) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
// string output
|
||||
while ((*p != 0) && (!(flags & FLAGS_PRECISION) || precision--)) {
|
||||
out(*(p++), buffer, idx++, maxlen);
|
||||
}
|
||||
// post padding
|
||||
if (flags & FLAGS_LEFT) {
|
||||
while (l++ < width) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
format++;
|
||||
break;
|
||||
}
|
||||
|
||||
case 'p' : {
|
||||
width = sizeof(void*) * 2U;
|
||||
flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE;
|
||||
#if defined(PRINTF_SUPPORT_LONG_LONG)
|
||||
const bool is_ll = sizeof(uintptr_t) == sizeof(long long);
|
||||
if (is_ll) {
|
||||
idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags);
|
||||
}
|
||||
else {
|
||||
#endif
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags);
|
||||
#if defined(PRINTF_SUPPORT_LONG_LONG)
|
||||
}
|
||||
#endif
|
||||
format++;
|
||||
break;
|
||||
}
|
||||
|
||||
case '%' :
|
||||
out('%', buffer, idx++, maxlen);
|
||||
format++;
|
||||
break;
|
||||
|
||||
default :
|
||||
out(*format, buffer, idx++, maxlen);
|
||||
format++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// termination
|
||||
out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen);
|
||||
|
||||
// return written chars without terminating \0
|
||||
return (int)idx;
|
||||
}
|
||||
|
||||
int tiny_printf(const char* format, ...) {
|
||||
va_list va;
|
||||
va_start(va, format);
|
||||
char buffer[1];
|
||||
const int ret = _vsnprintf(_out_char, buffer, (size_t)-1, format, va);
|
||||
va_end(va);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int tiny_sprintf(char* buffer, const char* format, ...) {
|
||||
va_list va;
|
||||
va_start(va, format);
|
||||
const int ret = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, va);
|
||||
va_end(va);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int tiny_snprintf(char* buffer, size_t count, const char* format, ...) {
|
||||
va_list va;
|
||||
va_start(va, format);
|
||||
const int ret = _vsnprintf(_out_buffer, buffer, count, format, va);
|
||||
va_end(va);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int tiny_vprintf(const char* format, va_list va) {
|
||||
char buffer[1];
|
||||
return _vsnprintf(_out_char, buffer, (size_t)-1, format, va);
|
||||
}
|
||||
|
||||
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va) {
|
||||
return _vsnprintf(_out_buffer, buffer, count, format, va);
|
||||
}
|
||||
@@ -1,86 +0,0 @@
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// \author (c) Marco Paland (info@paland.com)
|
||||
// 2014-2019, PALANDesign Hannover, Germany
|
||||
//
|
||||
// \license The MIT License (MIT)
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
//
|
||||
// \brief Tiny printf, sprintf and snprintf implementation, optimized for speed on
|
||||
//        embedded systems with very limited resources.
|
||||
// Use this instead of bloated standard/newlib printf.
|
||||
// These routines are thread safe and reentrant.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef _TINYPRINTF_H_
|
||||
#define _TINYPRINTF_H_
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Tiny printf implementation
|
||||
* You have to implement _putchar if you use printf()
|
||||
* To avoid conflicts with the regular printf() API it is overridden by macro defines
|
||||
* and internal underscore-appended functions like printf_() are used
|
||||
* \param format A string that specifies the format of the output
|
||||
* \return The number of characters that are written into the array, not counting the terminating null character
|
||||
*/
|
||||
int tiny_printf(const char* format, ...);
|
||||
|
||||
/**
|
||||
* Tiny sprintf implementation
|
||||
* Due to security reasons (buffer overflow) YOU SHOULD CONSIDER USING (V)SNPRINTF INSTEAD!
|
||||
* \param buffer A pointer to the buffer where to store the formatted string. MUST be big enough to store the output!
|
||||
* \param format A string that specifies the format of the output
|
||||
* \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
|
||||
*/
|
||||
int tiny_sprintf(char* buffer, const char* format, ...);
|
||||
|
||||
/**
|
||||
* Tiny snprintf/vsnprintf implementation
|
||||
* \param buffer A pointer to the buffer where to store the formatted string
|
||||
* \param count The maximum number of characters to store in the buffer, including a terminating null character
|
||||
* \param format A string that specifies the format of the output
|
||||
* \param va A value identifying a variable arguments list
|
||||
* \return The number of characters that COULD have been written into the buffer, not counting the terminating
|
||||
* null character. A value equal or larger than count indicates truncation. Only when the returned value
|
||||
* is non-negative and less than count, the string has been completely written.
|
||||
*/
|
||||
int tiny_snprintf(char* buffer, size_t count, const char* format, ...);
|
||||
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va);
|
||||
|
||||
/**
|
||||
* Tiny vprintf implementation
|
||||
* \param format A string that specifies the format of the output
|
||||
* \param va A value identifying a variable arguments list
|
||||
* \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
|
||||
*/
|
||||
int tiny_vprintf(const char* format, va_list va);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // _TINYPRINTF_H_
|
||||
@@ -1,27 +0,0 @@
|
||||
|
||||
#include <VX_config.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#define DUMP_CSR_4(d, s) \
|
||||
csr_mem[d + 0] = csr_read(s + 0); \
|
||||
csr_mem[d + 1] = csr_read(s + 1); \
|
||||
csr_mem[d + 2] = csr_read(s + 2); \
|
||||
csr_mem[d + 3] = csr_read(s + 3);
|
||||
|
||||
#define DUMP_CSR_32(d, s) \
|
||||
DUMP_CSR_4(d + 0, s + 0) \
|
||||
DUMP_CSR_4(d + 4, s + 4) \
|
||||
DUMP_CSR_4(d + 8, s + 8) \
|
||||
DUMP_CSR_4(d + 12, s + 12) \
|
||||
DUMP_CSR_4(d + 16, s + 16) \
|
||||
DUMP_CSR_4(d + 20, s + 20) \
|
||||
DUMP_CSR_4(d + 24, s + 24) \
|
||||
DUMP_CSR_4(d + 28, s + 28)
|
||||
|
||||
void vx_perf_dump() {
|
||||
int core_id = vx_core_id();
|
||||
uint32_t* const csr_mem = (uint32_t*)(IO_CSR_ADDR + 64 * sizeof(uint32_t) * core_id);
|
||||
DUMP_CSR_32(0, CSR_MPM_BASE)
|
||||
DUMP_CSR_32(32, CSR_MPM_BASE_H)
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
#include <VX_config.h>
|
||||
|
||||
# vx_putchar: stores the low byte of a0 into the console output window.
# The slot inside the window is picked from the GTID CSR value.
    .type   vx_putchar, @function
    .global vx_putchar
vx_putchar:
    li    t1, IO_COUT_ADDR              # t1 <- base address of the window
    csrr  t0, CSR_GTID                  # t0 <- GTID CSR value
    andi  t0, t0, %lo(IO_COUT_SIZE-1)   # t0 <- offset inside the window
    add   t0, t0, t1                    # t0 <- base + offset
    sb    a0, 0(t0)                     # write the character byte
    ret
|
||||
@@ -1,94 +0,0 @@
|
||||
#include <vx_print.h>
|
||||
#include <vx_spawn.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include "tinyprintf.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Argument bundle passed to __vprintf_cb through vx_serial.
typedef struct {
  const char* format;   // printf-style format string
  va_list*    va;       // pointer to the caller's argument list
  int         ret;      // filled in by the callback with tiny_vprintf's result
} printf_arg_t;

// Argument bundle passed to __putint_cb through vx_serial.
typedef struct {
  int value;            // integer to print
  int base;             // radix handed to itoa
} putint_arg_t;

// Argument bundle passed to __putfloat_cb through vx_serial.
typedef struct {
  float value;          // number to print
  int   precision;      // number of fractional digits requested
} putfloat_arg_t;
|
||||
|
||||
static void __putint_cb(const putint_arg_t* arg) {
|
||||
char tmp[33];
|
||||
float value = arg->value;
|
||||
int base = arg->base;
|
||||
itoa(value, tmp, base);
|
||||
for (int i = 0; i < 33; ++i) {
|
||||
int c = tmp[i];
|
||||
if (!c)
|
||||
break;
|
||||
vx_putchar(c);
|
||||
}
|
||||
}
|
||||
|
||||
static void __putfloat_cb(const putfloat_arg_t* arg) {
|
||||
float value = arg->value;
|
||||
int precision = arg->precision;
|
||||
int ipart = (int)value;
|
||||
vx_putint(ipart, 10);
|
||||
if (precision != 0) {
|
||||
vx_putchar('.');
|
||||
float frac = value - (float)ipart;
|
||||
float fscaled = frac * pow(10, precision);
|
||||
vx_putint((int)fscaled, 10);
|
||||
}
|
||||
}
|
||||
|
||||
// vx_serial callback for vx_vprintf: runs tiny_vprintf on the bundled format
// string and argument list and records the result in arg->ret.
static void __vprintf_cb(printf_arg_t* arg) {
  const int written = tiny_vprintf(arg->format, *arg->va);
  arg->ret = written;
}
|
||||
|
||||
void vx_putint(int value, int base) {
|
||||
putint_arg_t arg;
|
||||
arg.value = value;
|
||||
arg.base = base;
|
||||
vx_serial((vx_serial_cb)__putint_cb, &arg);
|
||||
}
|
||||
|
||||
void vx_putfloat(float value, int precision) {
|
||||
putfloat_arg_t arg;
|
||||
arg.value = value;
|
||||
arg.precision = precision;
|
||||
vx_serial((vx_serial_cb)__putfloat_cb, &arg);
|
||||
}
|
||||
|
||||
int vx_vprintf(const char* format, va_list va) {
|
||||
printf_arg_t arg;
|
||||
arg.format = format;
|
||||
arg.va = &va;
|
||||
vx_serial((vx_serial_cb)__vprintf_cb, &arg);
|
||||
return arg.ret;
|
||||
}
|
||||
|
||||
// Variadic front end for vx_vprintf.
// Returns whatever vx_vprintf returns for the formatted text.
int vx_printf(const char * format, ...) {
  va_list args;
  va_start(args, format);
  const int written = vx_vprintf(format, args);
  va_end(args);
  return written;
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,37 +0,0 @@
|
||||
#include <VX_config.h>
|
||||
|
||||
# vx_serial(callback, arg)
# Walks index = 0 .. NT-1 and, on each step, calls callback(arg) only on the
# thread whose WTID equals index, using the split/join instructions around
# the call.
    .type   vx_serial, @function
    .global vx_serial
vx_serial:
    # 32-byte frame: the RISC-V calling convention wants sp to stay 16-byte
    # aligned, and the callback is called with this frame active
    addi  sp, sp, -32
    sw    ra, 20(sp)
    sw    s4, 16(sp)
    sw    s3, 12(sp)
    sw    s2, 8(sp)
    sw    s1, 4(sp)
    sw    s0, 0(sp)
    mv    s4, a0                # s4 <- callback
    mv    s3, a1                # s3 <- arg
    csrr  s2, CSR_NT            # s2 <- NT
    csrr  s1, CSR_WTID          # s1 <- tid
    li    s0, 0                 # s0 <- index
label_loop:
    sub   t0, s0, s1            # t0 <- index - tid
    seqz  t1, t0                # t1 <- (index == tid)
    .insn s 0x6b, 2, x0, 0(t1)  # split t1
    bnez  t0, label_join        # skip the call when index != tid
    mv    a0, s3                # a0 <- arg
    jalr  s4                    # callback(arg)
label_join:
    .insn s 0x6b, 3, x0, 0(x0)  # join
    addi  s0, s0, 1             # index++
    blt   s0, s2, label_loop    # loop back
    lw    ra, 20(sp)
    lw    s4, 16(sp)
    lw    s3, 12(sp)
    lw    s2, 8(sp)
    lw    s1, 4(sp)
    lw    s0, 0(sp)
    addi  sp, sp, 32

    ret
|
||||
@@ -1,307 +0,0 @@
|
||||
#include <vx_spawn.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define NUM_CORES_MAX 32
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
typedef struct {
|
||||
vx_spawn_tasks_cb callback;
|
||||
void * arg;
|
||||
int offset;
|
||||
int N;
|
||||
int R;
|
||||
int NW;
|
||||
} wspawn_tasks_args_t;
|
||||
|
||||
typedef struct {
|
||||
context_t * ctx;
|
||||
vx_spawn_kernel_cb callback;
|
||||
void * arg;
|
||||
int offset;
|
||||
int N;
|
||||
int R;
|
||||
int NW;
|
||||
char isXYpow2;
|
||||
char isXpow2;
|
||||
char log2XY;
|
||||
char log2X;
|
||||
} wspawn_kernel_args_t;
|
||||
|
||||
void* g_wspawn_args[NUM_CORES_MAX];
|
||||
|
||||
// Returns 1 when x is a power of two (positive, exactly one bit set) and 0
// otherwise. Zero and negative values are rejected, which also avoids
// evaluating x - 1 for INT_MIN.
//
// `static` gives the helper internal linkage: a plain C99 `inline` definition
// provides no external definition, so a call that is not inlined would fail
// to link.
static inline char is_log2(int x) {
  return (x > 0) && ((x & (x - 1)) == 0);
}
|
||||
|
||||
// Returns the unbiased binary exponent of (float)x, which equals log2(x)
// when x is a power of two. For x == 0 the result is -127.
//
// The float bits are read through a union instead of a pointer cast, which
// keeps the access within the aliasing rules. `static` gives the helper
// internal linkage so a call that is not inlined still links.
static inline int fast_log2(int x) {
  union {
    float f;
    int   i;
  } bits;
  bits.f = (float)x;
  // bits 23..30 hold the biased exponent; 127 is the IEEE-754 single bias
  return (bits.i >> 23) - 127;
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
|
||||
int core_id = vx_core_id();
|
||||
int wid = vx_warp_id();
|
||||
int tid = vx_thread_id();
|
||||
int NT = vx_num_threads();
|
||||
|
||||
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[core_id];
|
||||
|
||||
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
|
||||
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
|
||||
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
|
||||
|
||||
for (int task_id = offset, N = task_id + tK; task_id < N; ++task_id) {
|
||||
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
|
||||
}
|
||||
|
||||
// wait for all warps to complete
|
||||
vx_barrier(0, p_wspawn_args->NW);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
|
||||
int core_id = vx_core_id();
|
||||
int tid = vx_thread_gid();
|
||||
|
||||
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[core_id];
|
||||
|
||||
int task_id = p_wspawn_args->offset + tid;
|
||||
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
|
||||
}
|
||||
|
||||
// Warp entry point for the full-warp part of vx_spawn_tasks.
static void spawn_tasks_all_cb() {
  // switch every thread of the warp on
  vx_tmc(-1);

  // run the per-thread work
  spawn_tasks_all_stub();

  // warp 0 continues with a single thread; every other warp gets an empty
  // thread mask and stops
  const int is_warp0 = (vx_warp_id() == 0);
  vx_tmc(is_warp0);
}
|
||||
|
||||
// Runs the leftover tasks of vx_spawn_tasks on the calling warp.
// thread_mask selects the threads that take part (one task per thread).
static void spawn_tasks_rem_cb(int thread_mask) {
  vx_tmc(thread_mask);     // enable the selected threads
  spawn_tasks_rem_stub();  // one task per enabled thread
  vx_tmc(1);               // back to a single active thread
}
|
||||
|
||||
// Runs callback(task_id, arg) for task_id in [0, num_tasks), spreading the
// ids over cores, warps and threads.
//
// num_tasks: number of tasks to run; nothing is done when it is <= 0
// callback:  function invoked once per task id
// arg:       opaque pointer forwarded to every callback invocation
//
// Every core is expected to call this function; cores that are not needed
// return immediately.
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
  // nothing to run; also keeps the shift used for the thread mask below
  // from ever seeing a negative count
  if (num_tasks <= 0)
    return;

  // device specs
  int NC = vx_num_cores();
  int NW = vx_num_warps();
  int NT = vx_num_threads();

  // current core id
  int core_id = vx_core_id();
  if (core_id >= NUM_CORES_MAX)
    return;

  // calculate necessary active cores
  int WT = NW * NT;
  int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
  int nc = MIN(nC, NC);
  if (core_id >= nc)
    return; // terminate extra cores

  // number of tasks per core
  int tasks_per_core = num_tasks / nc;
  int tasks_per_core0 = tasks_per_core;
  // The remainder goes to the last *active* core (nc - 1). Comparing against
  // NC - 1 would drop it whenever fewer than NC cores take part, because
  // those extra cores have already returned above.
  if (core_id == (nc-1)) {
    int QC_r = num_tasks - (nc * tasks_per_core0);
    tasks_per_core0 += QC_r; // last core executes remaining tasks
  }

  // number of tasks per warp
  int nW = tasks_per_core0 / NT;            // total warps per core
  int rT = tasks_per_core0 - (nW * NT);     // remaining threads
  int fW = (nW >= NW) ? (nW / NW) : 0;      // full warps iterations
  int rW = (fW != 0) ? (nW - fW * NW) : 0;  // remaining warps
  if (0 == fW)
    fW = 1;

  //--
  wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW, 0 };
  g_wspawn_args[core_id] = &wspawn_args;

  //--
  if (nW >= 1) {
    int nw = MIN(nW, NW);
    wspawn_args.NW = nw;
    vx_wspawn(nw, spawn_tasks_all_cb);
    spawn_tasks_all_cb();
  }

  //--
  if (rT != 0) {
    // NOTE(review): this offset drops the core_id * tasks_per_core base set
    // above; whether that is right depends on what vx_thread_gid() returns
    // in spawn_tasks_rem_stub - confirm for multi-core launches.
    wspawn_args.offset = tasks_per_core0 - rT;
    int tmask = (1 << rT) - 1;
    spawn_tasks_rem_cb(tmask);
  }
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_kernel_all_stub() {
|
||||
int core_id = vx_core_id();
|
||||
int wid = vx_warp_id();
|
||||
int tid = vx_thread_id();
|
||||
int NT = vx_num_threads();
|
||||
|
||||
wspawn_kernel_args_t* p_wspawn_args = (wspawn_kernel_args_t*)g_wspawn_args[core_id];
|
||||
|
||||
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
|
||||
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
|
||||
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
|
||||
|
||||
int X = p_wspawn_args->ctx->num_groups[0];
|
||||
int Y = p_wspawn_args->ctx->num_groups[1];
|
||||
int XY = X * Y;
|
||||
|
||||
for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) {
|
||||
int k = p_wspawn_args->isXYpow2 ? (wg_id >> p_wspawn_args->log2XY) : (wg_id / XY);
|
||||
int wg_2d = wg_id - k * XY;
|
||||
int j = p_wspawn_args->isXpow2 ? (wg_2d >> p_wspawn_args->log2X) : (wg_2d / X);
|
||||
int i = wg_2d - j * X;
|
||||
|
||||
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
|
||||
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
|
||||
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
|
||||
|
||||
(p_wspawn_args->callback)(p_wspawn_args->arg, p_wspawn_args->ctx, gid0, gid1, gid2);
|
||||
}
|
||||
|
||||
// wait for all warps to complete
|
||||
vx_barrier(0, p_wspawn_args->NW);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_kernel_rem_stub() {
|
||||
int core_id = vx_core_id();
|
||||
int tid = vx_thread_gid();
|
||||
|
||||
wspawn_kernel_args_t* p_wspawn_args = (wspawn_kernel_args_t*)g_wspawn_args[core_id];
|
||||
|
||||
int wg_id = p_wspawn_args->offset + tid;
|
||||
|
||||
int X = p_wspawn_args->ctx->num_groups[0];
|
||||
int Y = p_wspawn_args->ctx->num_groups[1];
|
||||
int XY = X * Y;
|
||||
|
||||
int k = p_wspawn_args->isXYpow2 ? (wg_id >> p_wspawn_args->log2XY) : (wg_id / XY);
|
||||
int wg_2d = wg_id - k * XY;
|
||||
int j = p_wspawn_args->isXpow2 ? (wg_2d >> p_wspawn_args->log2X) : (wg_2d / X);
|
||||
int i = wg_2d - j * X;
|
||||
|
||||
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
|
||||
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
|
||||
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
|
||||
|
||||
(p_wspawn_args->callback)(p_wspawn_args->arg, p_wspawn_args->ctx, gid0, gid1, gid2);
|
||||
}
|
||||
|
||||
// Warp entry point for the full-warp part of vx_spawn_kernel.
static void spawn_kernel_all_cb() {
  // switch every thread of the warp on
  vx_tmc(-1);

  // run the per-thread work
  spawn_kernel_all_stub();

  // warp 0 continues with a single thread; every other warp gets an empty
  // thread mask and stops
  const int is_warp0 = (vx_warp_id() == 0);
  vx_tmc(is_warp0);
}
|
||||
|
||||
// Runs the leftover workgroups of vx_spawn_kernel on the calling warp.
// thread_mask selects the threads that take part (one workgroup per thread).
static void spawn_kernel_rem_cb(int thread_mask) {
  vx_tmc(thread_mask);      // enable the selected threads
  spawn_kernel_rem_stub();  // one workgroup per enabled thread
  vx_tmc(1);                // back to a single active thread
}
|
||||
|
||||
// Runs callback(arg, ctx, gid0, gid1, gid2) once for every workgroup of the
// grid described by ctx->num_groups, spreading the workgroups over cores,
// warps and threads.
//
// ctx:      kernel context; num_groups gives the grid size and global_offset
//           is added to the coordinates passed to the callback
// callback: kernel function invoked once per workgroup
// arg:      opaque pointer forwarded to every callback invocation
//
// Every core is expected to call this function; cores that are not needed
// return immediately.
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
  // total number of WGs
  int X = ctx->num_groups[0];
  int Y = ctx->num_groups[1];
  int Z = ctx->num_groups[2];
  int XY = X * Y;
  int Q = XY * Z;

  // empty grid: nothing to run; also keeps the shift used for the thread
  // mask below from ever seeing a negative count
  if (Q <= 0)
    return;

  // device specs
  int NC = vx_num_cores();
  int NW = vx_num_warps();
  int NT = vx_num_threads();

  // current core id
  int core_id = vx_core_id();
  if (core_id >= NUM_CORES_MAX)
    return;

  // calculate necessary active cores
  int WT = NW * NT;
  int nC = (Q > WT) ? (Q / WT) : 1;
  int nc = MIN(nC, NC);
  if (core_id >= nc)
    return; // terminate extra cores

  // number of workgroups per core
  int wgs_per_core = Q / nc;
  int wgs_per_core0 = wgs_per_core;
  // The remainder goes to the last *active* core (nc - 1). Comparing against
  // NC - 1 would drop it whenever fewer than NC cores take part, because
  // those extra cores have already returned above.
  if (core_id == (nc-1)) {
    int QC_r = Q - (nc * wgs_per_core0);
    wgs_per_core0 += QC_r; // last core executes remaining WGs
  }

  // number of workgroups per warp
  int nW = wgs_per_core0 / NT;              // total warps per core
  int rT = wgs_per_core0 - (nW * NT);       // remaining threads
  int fW = (nW >= NW) ? (nW / NW) : 0;      // full warps iterations
  int rW = (fW != 0) ? (nW - fW * NW) : 0;  // remaining full warps
  if (0 == fW)
    fW = 1;

  // fast path handling: divisions by X and X*Y become shifts when those
  // values are powers of two
  char isXYpow2 = is_log2(XY);
  char isXpow2  = is_log2(X);
  char log2XY   = fast_log2(XY);
  char log2X    = fast_log2(X);

  //--
  wspawn_kernel_args_t wspawn_args = {
    ctx, callback, arg, core_id * wgs_per_core, fW, rW, 0, isXYpow2, isXpow2, log2XY, log2X
  };
  g_wspawn_args[core_id] = &wspawn_args;

  //--
  if (nW >= 1) {
    int nw = MIN(nW, NW);
    wspawn_args.NW = nw;
    vx_wspawn(nw, spawn_kernel_all_cb);
    spawn_kernel_all_cb();
  }

  //--
  if (rT != 0) {
    // NOTE(review): this offset drops the core_id * wgs_per_core base set
    // above; whether that is right depends on what vx_thread_gid() returns
    // in spawn_kernel_rem_stub - confirm for multi-core launches.
    wspawn_args.offset = wgs_per_core0 - rT;
    int tmask = (1 << rT) - 1;
    spawn_kernel_rem_cb(tmask);
  }
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,110 +0,0 @@
|
||||
#include <VX_config.h>
|
||||
|
||||
# Program entry point: sets up the registers and TLS of every warp, clears
# BSS, registers the finalizers, runs the initializers, then calls main and
# exit.
    .section .init, "ax"
    .global _start
    .type   _start, @function
_start:

    # initialize per-thread registers
    csrr  a0, CSR_NW                 # get num warps
    la    a1, init_regs
    .insn s 0x6b, 1, a1, 0(a0)       # wspawn a0, a1
    jal   init_regs                  # warp 0 runs the same routine itself
    # return back to single thread execution
    li    a0, 1
    .insn s 0x6b, 0, x0, 0(a0)       # tmc a0

    # initialize TLS for all warps
    csrr  a0, CSR_NW                 # get num warps
    la    a1, __init_tls
    .insn s 0x6b, 1, a1, 0(a0)       # wspawn a0, a1
    call  __init_tls                 # warp 0 runs the same routine itself
    # return back to single thread execution
    li    a0, 1
    .insn s 0x6b, 0, x0, 0(a0)       # tmc a0

    # clear BSS segment: memset(_edata, 0, _end - _edata)
    la    a0, _edata
    la    a2, _end
    sub   a2, a2, a0
    li    a1, 0
    call  memset

    # Initialize trap vector
    # a t0, trap_entry
    # csrw mtvec, t0

    # Register global termination functions
    la    a0, __libc_fini_array

    # to be called upon exit
    call  atexit

    # Run global initialization functions
    call  __libc_init_array

    # call main program routine
    call  main

    # call exit routine
    tail  exit
    .size _start, .-_start
|
||||
|
||||
# _exit: dumps the performance counters, leaves the exit code in gp and
# switches all threads of the calling warp off.
    .section .text
    .type   _exit, @function
    .global _exit
_exit:
    mv    s0, a0                     # keep the exit code across the call
    call  vx_perf_dump
    mv    gp, s0                     # gp <- exit code
    li    a0, 0
    .insn s 0x6b, 0, x0, 0(a0)       # tmc a0 (empty thread mask)
|
||||
|
||||
# init_regs: run by every warp at start-up. Activates all threads and gives
# each of them its gp, sp and tp; warps other than warp 0 are then disabled.
    .section .text
    .type   init_regs, @function
    .global init_regs
init_regs:
    # activate all threads
    li    a0, -1
    .insn s 0x6b, 0, x0, 0(a0)       # tmc a0

    # set global pointer register
    .option push
    .option norelax
    la    gp, __global_pointer
    .option pop

    # allocate a stack region for each thread on the processor
    # set stack pointer
    li    sp, SMEM_BASE_ADDR         # load stack base address
#if SM_ENABLE
    csrr  a0, CSR_LTID               # get local thread id
#else
    csrr  a0, CSR_GTID               # get global thread id
#endif
    sll   a1, a0, STACK_LOG2_SIZE    # a1 <- thread id * stack size
    sub   sp, sp, a1                 # stacks are carved downwards from the base

    # set thread pointer register
    # use address space after BSS region
    # ensure cacheline alignment
    la    a1, __tcb_aligned_size
    mul   a0, a0, a1                 # a0 <- thread id * TCB size
    la    tp, _end + 63
    add   tp, tp, a0
    and   tp, tp, -64                # round down to a multiple of 64

    # disable active warps except warp0
    csrr  a3, CSR_LWID               # get local wid
    beqz  a3, RETURN
    li    a0, 0
    .insn s 0x6b, 0, x0, 0(a0)       # tmc a0
RETURN:
    ret
|
||||
|
||||
# Weak, zero-initialized definition of __dso_handle, so that a definition
# supplied elsewhere takes precedence over this one.
    .section .data
    .global __dso_handle
    .weak   __dso_handle
__dso_handle:
    .long   0
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
#include <sys/stat.h>
|
||||
#include <newlib.h>
|
||||
#include <unistd.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_print.h>
|
||||
#include <string.h>
|
||||
|
||||
// Minimal newlib system-call stubs: there is no file system behind them.

// Closing always fails.
int _close(int file) {
  return -1;
}

// No file status is available.
int _fstat(int file, struct stat *st) {
  return -1;
}

// No descriptor is reported as a terminal.
int _isatty(int file) {
  return 0;
}

// Seeking is a no-op that reports position 0.
int _lseek(int file, int ptr, int dir) {
  return 0;
}

// Opening always fails.
int _open(const char *name, int flags, int mode) {
  return -1;
}

// Reading always fails.
int _read(int file, char *ptr, int len) {
  return -1;
}
|
||||
|
||||
// Heap growth is not supported: executes an `ebreak` instruction and, if
// execution continues past it, returns a null break address.
caddr_t _sbrk(int incr) {
  __asm__ __volatile__("ebreak");
  return 0;
}
|
||||
|
||||
// Sends `len` bytes starting at `ptr` to vx_putchar, one at a time. The
// descriptor is ignored. Always reports `len` as the number of bytes written.
int _write(int file, char *ptr, int len) {
  const char *cursor = ptr;
  int remaining = len;
  while (remaining > 0) {
    vx_putchar(*cursor++);
    --remaining;
  }
  return len;
}
|
||||
|
||||
// Signals are not supported: always fails.
int _kill(int pid, int sig) {
  return -1;
}

// Reports the value of vx_warp_gid() as the process id.
int _getpid() {
  const int warp_gid = vx_warp_gid();
  return warp_gid;
}
|
||||
|
||||
// Initializes the thread-local storage block of every thread of the calling
// warp: copies the .tdata image to the address held in tp and zero-fills the
// .tbss part. Afterwards warp 0 keeps one active thread and the other warps
// are switched off.
void __init_tls(void) {
  // linker-provided symbols; sizes and offsets are encoded in the symbol
  // addresses, hence the casts to size_t below
  extern char __tdata_start[];
  extern char __tbss_offset[];
  extern char __tdata_size[];
  extern char __tbss_size[];

  // activate all threads
  vx_tmc(-1);

  // TLS memory initialization
  register char *__thread_self __asm__ ("tp");
  const size_t tdata_bytes = (size_t)__tdata_size;
  const size_t tbss_start  = (size_t)__tbss_offset;
  const size_t tbss_bytes  = (size_t)__tbss_size;
  memcpy(__thread_self, __tdata_start, tdata_bytes);
  memset(__thread_self + tbss_start, 0, tbss_bytes);

  // back to single thread execution
  vx_tmc(0 == vx_warp_id());
}
|
||||
|
||||
#ifdef HAVE_INITFINI_ARRAY

/* Start/end markers of the initializer tables, provided by the linker. */
extern void (*__preinit_array_start []) (void) __attribute__((weak));
extern void (*__preinit_array_end []) (void) __attribute__((weak));
extern void (*__init_array_start []) (void) __attribute__((weak));
extern void (*__init_array_end []) (void) __attribute__((weak));

#ifdef HAVE_INIT_FINI
extern void _init (void);
#endif

/* Calls every pre-initializer in table order, then _init() when available,
   then every initializer in table order. */
void __libc_init_array (void) {
  size_t total;
  size_t idx;

  total = __preinit_array_end - __preinit_array_start;
  idx = 0;
  while (idx < total) {
    __preinit_array_start[idx] ();
    ++idx;
  }

#ifdef HAVE_INIT_FINI
  _init ();
#endif

  total = __init_array_end - __init_array_start;
  idx = 0;
  while (idx < total) {
    __init_array_start[idx] ();
    ++idx;
  }
}
#endif
|
||||
|
||||
#ifdef HAVE_INITFINI_ARRAY
/* Start/end markers of the finalizer table, provided by the linker. */
extern void (*__fini_array_start []) (void) __attribute__((weak));
extern void (*__fini_array_end []) (void) __attribute__((weak));

#ifdef HAVE_INIT_FINI
extern void _fini (void);
#endif

/* Calls every finalizer in reverse table order, then _fini() when available. */
void __libc_fini_array (void) {
  size_t remaining = __fini_array_end - __fini_array_start;

  /* walk from the last entry down to entry 0 */
  while (remaining > 0) {
    --remaining;
    __fini_array_start[remaining] ();
  }

#ifdef HAVE_INIT_FINI
  _fini ();
#endif
}
#endif
|
||||
19
runtime/stub/Makefile
Normal file
19
runtime/stub/Makefile
Normal file
@@ -0,0 +1,19 @@
|
||||
# Builds the stub flavour of libvortex.so from vortex.cpp and the shared
# utility sources.

CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors

CXXFLAGS += -I../include -I../../runtime -I../../hw -I../../sim/common

CXXFLAGS += -fPIC

LDFLAGS += -shared -pthread

SRCS = vortex.cpp ../common/utils.cpp

PROJECT = libvortex.so

# `all` and `clean` name actions, not files: declaring them phony keeps a
# stray file with one of those names from silently disabling the target
.PHONY: all clean

all: $(PROJECT)

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

clean:
	rm -rf $(PROJECT) obj_dir
|
||||
58
runtime/stub/vortex.cpp
Normal file
58
runtime/stub/vortex.cpp
Normal file
@@ -0,0 +1,58 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <vortex.h>
|
||||
|
||||
extern int vx_dev_open(vx_device_h* /*hdevice*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_dev_close(vx_device_h /*hdevice*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t* /*value*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_mem_alloc(vx_device_h /*hdevice*/, uint64_t /*size*/, int /*type*/, uint64_t* /*dev_addr*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_mem_free(vx_device_h /*hdevice*/, uint64_t /*dev_addr*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_mem_info(vx_device_h /*hdevice*/, int /*type*/, uint64_t* /*mem_free*/, uint64_t* /*mem_used*/) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_device_h /*hdevice*/, uint64_t /*dev_addr*/, const void* /*host_ptr*/, uint64_t /*size*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_device_h /*hdevice*/, void* /*host_ptr*/, uint64_t /*dev_addr*/, uint64_t /*size*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_start(vx_device_h /*hdevice*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_dcr_write(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint64_t /*value*/) {
|
||||
return -1;
|
||||
}
|
||||
33
runtime/xrt/Makefile
Normal file
33
runtime/xrt/Makefile
Normal file
@@ -0,0 +1,33 @@
|
||||
# Builds libvortex.so from vortex.cpp plus the shared utility sources.
# XILINX_XRT must point at the XRT installation (headers and libraries).

CXXFLAGS += -std=c++14 -Wall -Wextra -Wfatal-errors

CXXFLAGS += -I../include -I../common -I../../hw -I$(XILINX_XRT)/include -I../../sim/common

CXXFLAGS += -fPIC

LDFLAGS += -shared -pthread
LDFLAGS += -L$(XILINX_XRT)/lib -luuid -lxrt_coreutil

SRCS = vortex.cpp ../common/utils.cpp ../../sim/common/util.cpp

PROJECT = libvortex.so

# Debugging: define DEBUG for an unoptimized build with symbols; otherwise
# build optimized with NDEBUG set.
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif

# Enable scope logic analyzer
ifdef SCOPE
CXXFLAGS += -DSCOPE
SRCS += ../common/scope.cpp
endif

# Neither target produces a file of its own name; declaring them phony keeps
# them working even if a file called "all" or "clean" appears here.
.PHONY: all clean

all: $(PROJECT)

$(PROJECT): $(SRCS) $(SCOPE_JSON)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

clean:
	rm -rf $(PROJECT) obj_dir
|
||||
915
runtime/xrt/vortex.cpp
Normal file
915
runtime/xrt/vortex.cpp
Normal file
@@ -0,0 +1,915 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <vortex.h>
|
||||
#include <malloc.h>
|
||||
#include <utils.h>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
#include <stdarg.h>
|
||||
#include <util.h>
|
||||
#include <limits>
|
||||
#include <unordered_map>
|
||||
|
||||
#ifdef SCOPE
|
||||
#include "scope.h"
|
||||
#endif
|
||||
|
||||
// XRT includes
|
||||
#include "experimental/xrt_bo.h"
|
||||
#include "experimental/xrt_ip.h"
|
||||
#include "experimental/xrt_device.h"
|
||||
#include "experimental/xrt_kernel.h"
|
||||
#include "experimental/xrt_xclbin.h"
|
||||
#include "experimental/xrt_error.h"
|
||||
|
||||
// Build-time switches.
// CPP_API selects the XRT C++ classes (xrt::device / xrt::ip / xrt::bo);
// when it is not defined the C handle API is used instead.
#define CPP_API
// When defined, one buffer per bank is allocated up front and consecutive
// cache blocks are spread across the banks.
//#define BANK_INTERLEAVE

// Byte offsets of the kernel registers. 64-bit quantities are accessed as
// two 32-bit words at +0 (low) and +4 (high).
#define MMIO_CTL_ADDR   0x00  // control / status word (CTL_AP_* bits)
#define MMIO_DEV_ADDR   0x10  // device capabilities
#define MMIO_ISA_ADDR   0x1C  // ISA capabilities
#define MMIO_DCR_ADDR   0x28  // DCR address at +0, DCR value at +4
#define MMIO_SCP_ADDR   0x34  // scope analyzer data
#define MMIO_MEM_ADDR   0x40  // first memory bank base address (12 bytes per bank)

// Bit masks of the control / status word.
#define CTL_AP_START    (1 << 0)
#define CTL_AP_DONE     (1 << 1)
#define CTL_AP_IDLE     (1 << 2)
#define CTL_AP_READY    (1 << 3)
#define CTL_AP_RESET    (1 << 4)
#define CTL_AP_RESTART  (1 << 7)
|
||||
|
||||
// Memory layout of one supported board family.
struct platform_info_t {
  const char* prefix_name;   // matched against the start of the device name
  uint8_t     lg2_num_banks; // log2 of the number of memory banks
  uint8_t     lg2_bank_size; // log2 of the size of one bank, in bytes
  uint64_t    mem_base;      // address of the first bank
};
|
||||
|
||||
// Known boards, looked up by device-name prefix.
// A bank size exponent of 0x1C (28) means 256 MiB banks, 0x21 (33) means a
// single 8 GiB bank.
static const platform_info_t g_platforms[] = {
  // prefix            lg2(banks)  lg2(bank bytes)  base address
  {"xilinx_u50",       4,          0x1C,            0x0},
  {"xilinx_u200",      4,          0x1C,            0x0},
  {"xilinx_u280",      4,          0x1C,            0x0},
  {"xilinx_vck5000",   0,          0x21,            0xC000000000},
};
|
||||
|
||||
#ifdef CPP_API
|
||||
|
||||
typedef xrt::device xrt_device_t;
|
||||
typedef xrt::ip xrt_kernel_t;
|
||||
typedef xrt::bo xrt_buffer_t;
|
||||
|
||||
#else
|
||||
|
||||
typedef xrtDeviceHandle xrt_device_t;
|
||||
typedef xrtKernelHandle xrt_kernel_t;
|
||||
typedef xrtBufferHandle xrt_buffer_t;
|
||||
|
||||
#endif
|
||||
|
||||
// Page alignment handed to the device memory allocators.
#define RAM_PAGE_SIZE        4096

// Device index used when XRT_DEVICE_INDEX is not set in the environment.
#define DEFAULT_DEVICE_INDEX 0

// Bitstream path used when XRT_XCLBIN_PATH is not set in the environment.
#define DEFAULT_XCLBIN_PATH  "vortex_afu.xclbin"

// Name of the kernel opened inside the loaded xclbin.
#define KERNEL_NAME          "vortex_afu"
|
||||
|
||||
// printf-style trace macro. Release builds (NDEBUG) compile it away;
// otherwise every message goes to stdout with a "[VXDRV] " prefix.
#ifdef NDEBUG
#define DBGPRINT(format, ...) ((void)0)
#else
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format, ##__VA_ARGS__); } while (0)
#endif
|
||||
|
||||
// Declares `handle` in the enclosing scope, initialized from `_expr`.
// If the result is null, prints the failing expression and then runs
// `_cleanup`, which is expected to leave the enclosing function.
// This expands to two statements, so it must not be used as the unbraced
// body of an if/else/loop.
#define CHECK_HANDLE(handle, _expr, _cleanup)                     \
  auto handle = _expr;                                            \
  if (nullptr == handle) {                                        \
    printf("[VXDRV] Error: '%s' returned NULL!\n", #_expr);       \
    _cleanup                                                      \
  }
|
||||
|
||||
// Evaluates `_expr` once. A result of 0 means success and nothing else
// happens; any other value is printed together with the expression text and
// then `_cleanup` runs. The result is available to `_cleanup` as `err`.
#define CHECK_ERR(_expr, _cleanup)                                      \
  do {                                                                  \
    auto err = _expr;                                                   \
    if (err != 0) {                                                     \
      printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err);   \
      _cleanup                                                          \
    }                                                                   \
  } while (false)
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
#ifndef CPP_API
|
||||
|
||||
// Prints the XRT description of error code `err` for `xrtDevice`.
// The message length is queried first, then the text is fetched into a
// buffer of exactly that size. Return codes of both queries are ignored.
static void dump_xrt_error(xrtDeviceHandle xrtDevice, xrtErrorCode err) {
  size_t msg_len = 0;
  xrtErrorGetString(xrtDevice, err, nullptr, 0, &msg_len);

  std::vector<char> msg(msg_len);
  xrtErrorGetString(xrtDevice, err, msg.data(), msg.size(), nullptr);

  printf("[VXDRV] detail: %s!\n", msg.data());
}
|
||||
|
||||
#endif
|
||||
|
||||
// Looks up the platform table entry whose prefix_name is a prefix of
// `device_name` and copies it to *platform_info.
// Returns 0 on the first match, -1 if no entry matches.
static int get_platform_info(const std::string& device_name, platform_info_t* platform_info) {
  for (const auto& candidate : g_platforms) {
    // rfind(prefix, 0) can only match at position 0, so this is a
    // "starts with" test.
    const bool has_prefix = (0 == device_name.rfind(candidate.prefix_name, 0));
    if (!has_prefix)
      continue;
    *platform_info = candidate;
    return 0;
  }
  return -1;
}
|
||||
|
||||
/*static void wait_for_enter(const std::string &msg) {
|
||||
std::cout << msg << std::endl;
|
||||
std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
|
||||
}*/
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
|
||||
// Stores copies of the given XRT device and kernel objects/handles and of
// the platform description. No register access happens here; init() does
// the hardware setup.
vx_device(xrt_device_t& device, xrt_kernel_t& kernel, const platform_info_t& platform)
  : xrtDevice_(device), xrtKernel_(kernel), platform_(platform) {
}
|
||||
|
||||
#ifndef CPP_API
|
||||
|
||||
// C-API build only: frees every buffer object still tracked, then closes
// the kernel and device handles if they are set.
~vx_device() {
  for (auto& held : xrtBuffers_) {
#ifdef BANK_INTERLEAVE
    xrtBOFree(held);
#else
    xrtBOFree(held.second.xrtBuffer);
#endif
  }
  if (xrtKernel_)
    xrtKernelClose(xrtKernel_);
  if (xrtDevice_)
    xrtDeviceClose(xrtDevice_);
}
|
||||
|
||||
#endif
|
||||
|
||||
int init() {
|
||||
CHECK_ERR(this->write_register(MMIO_CTL_ADDR, CTL_AP_RESET), {
|
||||
return -1;
|
||||
});
|
||||
|
||||
uint32_t num_banks = 1 << platform_.lg2_num_banks;
|
||||
uint64_t bank_size = 1ull << platform_.lg2_bank_size;
|
||||
|
||||
for (uint32_t i = 0; i < num_banks; ++i) {
|
||||
uint32_t reg_addr = MMIO_MEM_ADDR + (i * 12);
|
||||
uint64_t reg_value = platform_.mem_base + i * bank_size;
|
||||
CHECK_ERR(this->write_register(reg_addr, reg_value & 0xffffffff), {
|
||||
return -1;
|
||||
});
|
||||
|
||||
CHECK_ERR(this->write_register(reg_addr + 4, (reg_value >> 32) & 0xffffffff), {
|
||||
return -1;
|
||||
});
|
||||
#ifndef BANK_INTERLEAVE
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
|
||||
CHECK_ERR(this->read_register(MMIO_DEV_ADDR, (uint32_t*)&this->dev_caps), {
|
||||
return -1;
|
||||
});
|
||||
|
||||
CHECK_ERR(this->read_register(MMIO_DEV_ADDR + 4, (uint32_t*)&this->dev_caps + 1), {
|
||||
return -1;
|
||||
});
|
||||
|
||||
CHECK_ERR(this->read_register(MMIO_ISA_ADDR, (uint32_t*)&this->isa_caps), {
|
||||
return -1;
|
||||
});
|
||||
|
||||
CHECK_ERR(this->read_register(MMIO_ISA_ADDR + 4, (uint32_t*)&this->isa_caps + 1), {
|
||||
return -1;
|
||||
});
|
||||
|
||||
this->global_mem_size = num_banks * bank_size;
|
||||
|
||||
this->global_mem_ = std::make_shared<vortex::MemoryAllocator>(
|
||||
ALLOC_BASE_ADDR, ALLOC_MAX_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE);
|
||||
|
||||
uint64_t local_mem_size = 0;
|
||||
vx_dev_caps(this, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size);
|
||||
if (local_mem_size <= 1) {
|
||||
this->local_mem_ = std::make_shared<vortex::MemoryAllocator>(
|
||||
SMEM_BASE_ADDR, local_mem_size, RAM_PAGE_SIZE, 1);
|
||||
}
|
||||
|
||||
#ifdef BANK_INTERLEAVE
|
||||
xrtBuffers_.reserve(num_banks);
|
||||
for (uint32_t i = 0; i < num_banks; ++i) {
|
||||
#ifdef CPP_API
|
||||
xrtBuffers_.emplace_back(xrtDevice_, bank_size, xrt::bo::flags::normal, i);
|
||||
#else
|
||||
CHECK_HANDLE(xrtBuffer, xrtBOAlloc(xrtDevice_, bank_size, XRT_BO_FLAGS_NONE, i), {
|
||||
return -1;
|
||||
});
|
||||
xrtBuffers_.push_back(xrtBuffer);
|
||||
#endif
|
||||
printf("*** allocated bank%u/%u, size=%lu\n", i, num_banks, bank_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_alloc(uint64_t size, int type, uint64_t* dev_addr) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
|
||||
uint64_t addr;
|
||||
|
||||
if (type == VX_MEM_TYPE_GLOBAL) {
|
||||
CHECK_ERR(global_mem_->allocate(asize, &addr), {
|
||||
return -1;
|
||||
});
|
||||
#ifndef BANK_INTERLEAVE
|
||||
uint32_t bank_id;
|
||||
CHECK_ERR(this->get_bank_info(addr, &bank_id, nullptr), {
|
||||
return -1;
|
||||
});
|
||||
CHECK_ERR(get_buffer(bank_id, nullptr), {
|
||||
return -1;
|
||||
});
|
||||
#endif
|
||||
} else if (type == VX_MEM_TYPE_LOCAL) {
|
||||
if CHECK_ERR(local_mem_->allocate(asize, &addr), {
|
||||
return -1;
|
||||
});
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
*dev_addr = addr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_free(uint64_t dev_addr) {
|
||||
if (dev_addr >= SMEM_BASE_ADDR) {
|
||||
CHECK_ERR(local_mem_->release(dev_addr), {
|
||||
return -1;
|
||||
});
|
||||
} else {
|
||||
CHECK_ERR(global_mem_->release(dev_addr), {
|
||||
return -1;
|
||||
});
|
||||
#ifdef BANK_INTERLEAVE
|
||||
if (0 == global_mem_->allocated()) {
|
||||
#ifndef CPP_API
|
||||
for (auto& entry : xrtBuffers_) {
|
||||
xrtBOFree(entry);
|
||||
}
|
||||
#endif
|
||||
xrtBuffers_.clear();
|
||||
}
|
||||
#else
|
||||
uint32_t bank_id;
|
||||
CHECK_ERR(this->get_bank_info(dev_addr, &bank_id, nullptr), {
|
||||
return -1;
|
||||
});
|
||||
auto it = xrtBuffers_.find(bank_id);
|
||||
if (it != xrtBuffers_.end()) {
|
||||
auto count = --it->second.count;
|
||||
if (0 == count) {
|
||||
printf("freeing bank%d...\n", bank_id);
|
||||
#ifndef CPP_API
|
||||
xrtBOFree(it->second.xrtBuffer);
|
||||
#endif
|
||||
xrtBuffers_.erase(it);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[VXDRV] Error: invalid device memory address: 0x%lx\n", dev_addr);
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_info(int type, uint64_t* mem_free, uint64_t* mem_used) const {
|
||||
if (type == VX_MEM_TYPE_GLOBAL) {
|
||||
if (mem_free)
|
||||
*mem_free = global_mem_->free();
|
||||
if (mem_used)
|
||||
*mem_used = global_mem_->allocated();
|
||||
} else if (type == VX_MEM_TYPE_LOCAL) {
|
||||
if (mem_free)
|
||||
*mem_free = local_mem_->free();
|
||||
if (mem_used)
|
||||
*mem_free = local_mem_->allocated();
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_register(uint32_t addr, uint32_t value) {
|
||||
#ifdef CPP_API
|
||||
xrtKernel_.write_register(addr, value);
|
||||
#else
|
||||
CHECK_ERR(xrtKernelWriteRegister(xrtKernel_, addr, value), {
|
||||
dump_xrt_error(xrtDevice_, err);
|
||||
return -1;
|
||||
});
|
||||
#endif
|
||||
DBGPRINT("*** write_register: addr=0x%x, value=0x%x\n", addr, value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int read_register(uint32_t addr, uint32_t* value) {
|
||||
#ifdef CPP_API
|
||||
*value = xrtKernel_.read_register(addr);
|
||||
#else
|
||||
CHECK_ERR(xrtKernelReadRegister(xrtKernel_, addr, value), {
|
||||
dump_xrt_error(xrtDevice_, err);
|
||||
return -1;
|
||||
});
|
||||
#endif
|
||||
DBGPRINT("*** read_register: addr=0x%x, value=0x%x\n", addr, *value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int upload(uint64_t dev_addr, uint8_t* host_ptr, uint64_t asize) {
|
||||
for (uint64_t end = dev_addr + asize; dev_addr < end;
|
||||
dev_addr += CACHE_BLOCK_SIZE,
|
||||
host_ptr += CACHE_BLOCK_SIZE) {
|
||||
#ifdef BANK_INTERLEAVE
|
||||
asize = CACHE_BLOCK_SIZE;
|
||||
#else
|
||||
end = 0;
|
||||
#endif
|
||||
uint32_t bo_index;
|
||||
uint64_t bo_offset;
|
||||
xrt_buffer_t xrtBuffer;
|
||||
CHECK_ERR(this->get_bank_info(dev_addr, &bo_index, &bo_offset), {
|
||||
return -1;
|
||||
});
|
||||
CHECK_ERR(this->get_buffer(bo_index, &xrtBuffer), {
|
||||
return -1;
|
||||
});
|
||||
#ifdef CPP_API
|
||||
xrtBuffer.write(host_ptr, asize, bo_offset);
|
||||
xrtBuffer.sync(XCL_BO_SYNC_BO_TO_DEVICE, asize, bo_offset);
|
||||
#else
|
||||
CHECK_ERR(xrtBOWrite(xrtBuffer, host_ptr, asize, bo_offset), {
|
||||
dump_xrt_error(xrtDevice_, err);
|
||||
return -1;
|
||||
});
|
||||
CHECK_ERR(xrtBOSync(xrtBuffer, XCL_BO_SYNC_BO_TO_DEVICE, asize, bo_offset), {
|
||||
dump_xrt_error(xrtDevice_, err);
|
||||
return -1;
|
||||
});
|
||||
#endif
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int download(uint8_t* host_ptr, uint64_t dev_addr, uint64_t asize) {
|
||||
for (uint64_t end = dev_addr + asize; dev_addr < end;
|
||||
dev_addr += CACHE_BLOCK_SIZE,
|
||||
host_ptr += CACHE_BLOCK_SIZE) {
|
||||
#ifdef BANK_INTERLEAVE
|
||||
asize = CACHE_BLOCK_SIZE;
|
||||
#else
|
||||
end = 0;
|
||||
#endif
|
||||
uint32_t bo_index;
|
||||
uint64_t bo_offset;
|
||||
xrt_buffer_t xrtBuffer;
|
||||
CHECK_ERR(this->get_bank_info(dev_addr, &bo_index, &bo_offset), {
|
||||
return -1;
|
||||
});
|
||||
CHECK_ERR(this->get_buffer(bo_index, &xrtBuffer), {
|
||||
return -1;
|
||||
});
|
||||
#ifdef CPP_API
|
||||
xrtBuffer.sync(XCL_BO_SYNC_BO_FROM_DEVICE, asize, bo_offset);
|
||||
xrtBuffer.read(host_ptr, asize, bo_offset);
|
||||
#else
|
||||
CHECK_ERR(xrtBOSync(xrtBuffer, XCL_BO_SYNC_BO_FROM_DEVICE, asize, bo_offset), {
|
||||
dump_xrt_error(xrtDevice_, err);
|
||||
return -1;
|
||||
});
|
||||
CHECK_ERR(xrtBORead(xrtBuffer, host_ptr, asize, bo_offset), {
|
||||
dump_xrt_error(xrtDevice_, err);
|
||||
return -1;
|
||||
});
|
||||
#endif
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
DeviceConfig dcrs;
|
||||
uint64_t dev_caps;
|
||||
uint64_t isa_caps;
|
||||
uint64_t global_mem_size;
|
||||
|
||||
private:
|
||||
|
||||
xrt_device_t xrtDevice_;
|
||||
xrt_kernel_t xrtKernel_;
|
||||
const platform_info_t platform_;
|
||||
std::shared_ptr<vortex::MemoryAllocator> global_mem_;
|
||||
std::shared_ptr<vortex::MemoryAllocator> local_mem_;
|
||||
|
||||
#ifdef BANK_INTERLEAVE
|
||||
|
||||
std::vector<xrt_buffer_t> xrtBuffers_;
|
||||
|
||||
int get_bank_info(uint64_t addr, uint32_t* pIdx, uint64_t* pOff) {
|
||||
uint32_t num_banks = 1 << platform_.lg2_num_banks;
|
||||
uint64_t block_addr = addr / CACHE_BLOCK_SIZE;
|
||||
uint32_t index = block_addr & (num_banks-1);
|
||||
uint64_t offset = (block_addr >> platform_.lg2_num_banks) * CACHE_BLOCK_SIZE;
|
||||
if (pIdx) {
|
||||
*pIdx = index;
|
||||
}
|
||||
if (pOff) {
|
||||
*pOff = offset;
|
||||
}
|
||||
printf("get_bank_info(addr=0x%lx, bank=%d, offset=0x%lx\n", addr, index, offset);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_buffer(uint32_t bank_id, xrt_buffer_t* pBuf) {
|
||||
if (pBuf) {
|
||||
*pBuf = xrtBuffers_.at(bank_id);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
struct buf_cnt_t {
|
||||
xrt_buffer_t xrtBuffer;
|
||||
uint32_t count;
|
||||
};
|
||||
|
||||
std::unordered_map<uint32_t, buf_cnt_t> xrtBuffers_;
|
||||
|
||||
int get_bank_info(uint64_t addr, uint32_t* pIdx, uint64_t* pOff) {
|
||||
uint32_t num_banks = 1 << platform_.lg2_num_banks;
|
||||
uint64_t bank_size = 1ull << platform_.lg2_bank_size;
|
||||
uint32_t index = addr >> platform_.lg2_bank_size;
|
||||
uint64_t offset = addr & (bank_size-1);
|
||||
if (index > num_banks) {
|
||||
fprintf(stderr, "[VXDRV] Error: address out of range: 0x%lx\n", addr);
|
||||
return -1;
|
||||
}
|
||||
if (pIdx) {
|
||||
*pIdx = index;
|
||||
}
|
||||
if (pOff) {
|
||||
*pOff = offset;
|
||||
}
|
||||
printf("get_bank_info(addr=0x%lx, bank=%d, offset=0x%lx\n", addr, index, offset);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_buffer(uint32_t bank_id, xrt_buffer_t* pBuf) {
|
||||
auto it = xrtBuffers_.find(bank_id);
|
||||
if (it != xrtBuffers_.end()) {
|
||||
if (pBuf) {
|
||||
*pBuf = it->second.xrtBuffer;
|
||||
} else {
|
||||
printf("reusing bank%d...\n", bank_id);
|
||||
++it->second.count;
|
||||
}
|
||||
} else {
|
||||
printf("allocating bank%d...\n", bank_id);
|
||||
uint64_t bank_size = 1ull << platform_.lg2_bank_size;
|
||||
#ifdef CPP_API
|
||||
xrt::bo xrtBuffer(xrtDevice_, bank_size, xrt::bo::flags::normal, bank_id);
|
||||
#else
|
||||
CHECK_HANDLE(xrtBuffer, xrtBOAlloc(xrtDevice_, bank_size, XRT_BO_FLAGS_NONE, bank_id), {
|
||||
return -1;
|
||||
});
|
||||
#endif
|
||||
xrtBuffers_.insert({bank_id, {xrtBuffer, 1}});
|
||||
if (pBuf) {
|
||||
*pBuf = xrtBuffer;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Queries one device capability and stores it in *value.
// Fields decoded from the device capability word: version (bits 0-7),
// threads (8-15), warps (16-23), cores (24-39) and the log2 of the local
// memory size (40-47).
// Returns 0 on success, -1 if `hdevice` or `value` is null. An unknown
// `caps_id` is reported on stderr and aborts the process.
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  if (nullptr == hdevice || nullptr == value)
    return -1;

  vx_device *device = ((vx_device*)hdevice);

  switch (caps_id) {
  case VX_CAPS_VERSION:
    *value = (device->dev_caps >> 0) & 0xff;
    break;
  case VX_CAPS_NUM_THREADS:
    *value = (device->dev_caps >> 8) & 0xff;
    break;
  case VX_CAPS_NUM_WARPS:
    *value = (device->dev_caps >> 16) & 0xff;
    break;
  case VX_CAPS_NUM_CORES:
    *value = (device->dev_caps >> 24) & 0xffff;
    break;
  case VX_CAPS_CACHE_LINE_SIZE:
    *value = CACHE_BLOCK_SIZE;
    break;
  case VX_CAPS_GLOBAL_MEM_SIZE:
    *value = device->global_mem_size;
    break;
  case VX_CAPS_LOCAL_MEM_SIZE:
    // stored as an exponent, so the smallest reportable size is 1
    *value = 1ull << ((device->dev_caps >> 40) & 0xff);
    break;
  case VX_CAPS_KERNEL_BASE_ADDR:
    // assembled from the two cached startup-address DCR values
    *value = (uint64_t(device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) |
                       device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
    break;
  case VX_CAPS_ISA_FLAGS:
    *value = device->isa_caps;
    break;
  default:
    fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id);
    std::abort();
    return -1;
  }

  return 0;
}
|
||||
|
||||
// Opens the device, loads the bitstream, opens the kernel, and creates and
// initializes the driver object; on success its handle is stored in
// *hdevice.
//
// Environment:
//   XRT_DEVICE_INDEX  device to open (default DEFAULT_DEVICE_INDEX)
//   XRT_XCLBIN_PATH   bitstream to load (default DEFAULT_XCLBIN_PATH)
//
// Returns 0 on success and -1 (or the scope start error code) on failure.
// Every failure after the driver object exists destroys it again, and the
// C-API path closes any handles it opened before returning.
// NOTE(review): XRT documents its C++ API as reporting failures through
// exceptions; none are caught here, so with CPP_API they would propagate to
// the caller.
extern int vx_dev_open(vx_device_h* hdevice) {
  if (nullptr == hdevice)
    return -1;

  int device_index = DEFAULT_DEVICE_INDEX;
  const char* device_index_s = getenv("XRT_DEVICE_INDEX");
  if (device_index_s != nullptr) {
    device_index = atoi(device_index_s);
  }

  const char* xlbin_path_s = getenv("XRT_XCLBIN_PATH");
  if (xlbin_path_s == nullptr) {
    xlbin_path_s = DEFAULT_XCLBIN_PATH;
  }

#ifdef CPP_API

  auto xrtDevice = xrt::device(device_index);
  auto uuid = xrtDevice.load_xclbin(xlbin_path_s);
  auto xrtKernel = xrt::ip(xrtDevice, uuid, KERNEL_NAME);

  auto device_name = xrtDevice.get_info<xrt::info::device::name>();

  // get platform info
  platform_info_t platform_info;
  CHECK_ERR(get_platform_info(device_name, &platform_info), {
    fprintf(stderr, "[VXDRV] Error: platform not supported: %s\n", device_name.c_str());
    return -1;
  });

  CHECK_HANDLE(device, new vx_device(xrtDevice, xrtKernel, platform_info), {
    return -1;
  });

#else

  CHECK_HANDLE(xrtDevice, xrtDeviceOpen(device_index), {
    return -1;
  });

  CHECK_ERR(xrtDeviceLoadXclbinFile(xrtDevice, xlbin_path_s), {
    dump_xrt_error(xrtDevice, err);
    xrtDeviceClose(xrtDevice);
    return -1;
  });

  xuid_t uuid;
  CHECK_ERR(xrtDeviceGetXclbinUUID(xrtDevice, uuid), {
    dump_xrt_error(xrtDevice, err);
    xrtDeviceClose(xrtDevice);
    return -1;
  });

  CHECK_HANDLE(xrtKernel, xrtPLKernelOpenExclusive(xrtDevice, uuid, KERNEL_NAME), {
    xrtDeviceClose(xrtDevice);
    return -1;
  });

  int device_name_size;
  xrtXclbinGetXSAName(xrtDevice, nullptr, 0, &device_name_size);
  std::vector<char> device_name(device_name_size);
  xrtXclbinGetXSAName(xrtDevice, device_name.data(), device_name_size, nullptr);

  // get platform info
  platform_info_t platform_info;
  CHECK_ERR(get_platform_info(device_name.data(), &platform_info), {
    fprintf(stderr, "[VXDRV] Error: platform not supported: %s\n", device_name.data());
    // nothing owns the handles yet, so close them here
    xrtKernelClose(xrtKernel);
    xrtDeviceClose(xrtDevice);
    return -1;
  });

  CHECK_HANDLE(device, new vx_device(xrtDevice, xrtKernel, platform_info), {
    xrtKernelClose(xrtKernel);
    xrtDeviceClose(xrtDevice);
    return -1;
  });

#endif

  // initialize device; the driver object now owns the XRT resources, so
  // destroying it is the complete cleanup
  CHECK_ERR(device->init(), {
    delete device;
    return -1;
  });

#ifdef SCOPE
  {
    // The scope analyzer talks to the device through a 64-bit register that
    // is accessed as two 32-bit words at MMIO_SCP_ADDR and MMIO_SCP_ADDR + 4.
    scope_callback_t callback;
    callback.registerWrite = [](vx_device_h hdevice, uint64_t value)->int {
      auto device = (vx_device*)hdevice;
      uint32_t value_lo = (uint32_t)(value);
      uint32_t value_hi = (uint32_t)(value >> 32);
      CHECK_ERR(device->write_register(MMIO_SCP_ADDR, value_lo), {
        return -1;
      });
      CHECK_ERR(device->write_register(MMIO_SCP_ADDR + 4, value_hi), {
        return -1;
      });
      return 0;
    };
    callback.registerRead = [](vx_device_h hdevice, uint64_t* value)->int {
      auto device = (vx_device*)hdevice;
      uint32_t value_lo, value_hi;
      CHECK_ERR(device->read_register(MMIO_SCP_ADDR, &value_lo), {
        return -1;
      });
      CHECK_ERR(device->read_register(MMIO_SCP_ADDR + 4, &value_hi), {
        return -1;
      });
      *value = (((uint64_t)value_hi) << 32) | value_lo;
      return 0;
    };
    int ret = vx_scope_start(&callback, device, 0, -1);
    if (ret != 0) {
      delete device;
      return ret;
    }
  }
#endif

  CHECK_ERR(dcr_initialize(device), {
    delete device;
    return -1;
  });

#ifdef DUMP_PERF_STATS
  perf_add_device(device);
#endif

  *hdevice = device;

  DBGPRINT("device creation complete!\n");

  return 0;
}
|
||||
|
||||
// Stops the scope analyzer (SCOPE builds) and destroys the driver object
// behind `hdevice`. Returns 0 on success, -1 for a null handle.
extern int vx_dev_close(vx_device_h hdevice) {
  if (hdevice == nullptr) {
    return -1;
  }

#ifdef SCOPE
  vx_scope_stop(hdevice);
#endif

  delete (vx_device*)hdevice;

  DBGPRINT("device destroyed!\n");

  return 0;
}
|
||||
|
||||
// Validates the arguments and forwards to vx_device::mem_alloc.
// Returns -1 for a null handle, a null output pointer or a zero size;
// otherwise whatever the device object returns.
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr) {
  const bool bad_args = (hdevice == nullptr)
                     || (dev_addr == nullptr)
                     || (size == 0);
  if (bad_args) {
    return -1;
  }
  return ((vx_device*)hdevice)->mem_alloc(size, type, dev_addr);
}
|
||||
|
||||
// Forwards to vx_device::mem_free. A null handle yields -1; freeing
// address 0 is accepted as a no-op and yields 0.
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
  if (hdevice == nullptr) {
    return -1;
  }
  if (dev_addr == 0) {
    return 0;
  }
  return ((vx_device*)hdevice)->mem_free(dev_addr);
}
|
||||
|
||||
// Forwards to vx_device::mem_info; a null handle yields -1.
extern int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used) {
  auto device = (vx_device*)hdevice;
  return (device != nullptr) ? device->mem_info(type, mem_free, mem_used) : -1;
}
|
||||
|
||||
// Copies `size` bytes from `host_ptr` to device address `dev_addr`.
// `dev_addr` must be aligned to CACHE_BLOCK_SIZE; the transfer length is
// rounded up to a multiple of CACHE_BLOCK_SIZE and must fit inside global
// memory.
// Returns 0 on success, -1 on a null handle, a null host pointer with a
// non-zero size, a misaligned or out-of-range address, or a failed upload.
// NOTE(review): because the rounded-up length is what gets uploaded, the
// host buffer is read up to the next cache-block boundary past `size`;
// confirm callers always provide that much readable memory.
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
  if (nullptr == hdevice)
    return -1;

  if (nullptr == host_ptr && 0 != size)
    return -1;

  auto device = (vx_device*)hdevice;

  // check alignment
  if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
    return -1;

  auto asize = aligned_size(size, CACHE_BLOCK_SIZE);

  // bound checking, arranged so that dev_addr + asize is never computed
  // and therefore cannot wrap around
  if (asize > device->global_mem_size
   || dev_addr > device->global_mem_size - asize)
    return -1;

  // upload() only reads from the buffer; the cast bridges to its byte
  // pointer parameter.
  auto src = const_cast<uint8_t*>(static_cast<const uint8_t*>(host_ptr));

  CHECK_ERR(device->upload(dev_addr, src, asize), {
    return -1;
  });

  DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size);

  return 0;
}
|
||||
|
||||
// Copies `size` bytes from device address `dev_addr` to `host_ptr`.
// `dev_addr` must be aligned to CACHE_BLOCK_SIZE; the transfer length is
// rounded up to a multiple of CACHE_BLOCK_SIZE and must fit inside global
// memory.
// Returns 0 on success, -1 on a null handle, a null host pointer with a
// non-zero size, a misaligned or out-of-range address, or a failed
// download.
// NOTE(review): because the rounded-up length is what gets downloaded, the
// host buffer is written up to the next cache-block boundary past `size`;
// confirm callers always provide that much writable memory.
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
  if (nullptr == hdevice)
    return -1;

  if (nullptr == host_ptr && 0 != size)
    return -1;

  auto device = (vx_device*)hdevice;

  // check alignment
  if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
    return -1;

  auto asize = aligned_size(size, CACHE_BLOCK_SIZE);

  // bound checking, arranged so that dev_addr + asize is never computed
  // and therefore cannot wrap around
  if (asize > device->global_mem_size
   || dev_addr > device->global_mem_size - asize)
    return -1;

  // download() works on a byte pointer; void* does not convert implicitly
  auto dst = static_cast<uint8_t*>(host_ptr);

  CHECK_ERR(device->download(dst, dev_addr, asize), {
    return -1;
  });

  DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, asize);

  return 0;
}
|
||||
|
||||
// Starts execution by writing CTL_AP_START to the control register.
// Returns 0 on success, -1 for a null handle or a failed register write.
extern int vx_start(vx_device_h hdevice) {
  auto device = (vx_device*)hdevice;
  if (device == nullptr) {
    return -1;
  }

  //wait_for_enter("\nPress ENTER to continue after setting up ILA trigger...");

  CHECK_ERR(device->write_register(MMIO_CTL_ADDR, CTL_AP_START), {
    return -1;
  });

  DBGPRINT("START\n");

  return 0;
}
|
||||
|
||||
// Polls the control register until CTL_AP_DONE is set or `timeout`
// (milliseconds) has elapsed. The poll interval is 1 s in debug builds and
// 1 ms when NDEBUG is defined.
// Returns -1 for a null handle or a failed register read, 0 otherwise.
// NOTE(review): 0 is also returned when the timeout expires before the
// device reports done; callers cannot tell the two apart from the result.
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
  if (nullptr == hdevice)
    return -1;

  auto device = (vx_device*)hdevice;

  struct timespec sleep_time;

#ifndef NDEBUG
  sleep_time.tv_sec = 1;
  sleep_time.tv_nsec = 0;
#else
  sleep_time.tv_sec = 0;
  sleep_time.tv_nsec = 1000000;
#endif

  // to milliseconds
  uint64_t sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);

  for (;;) {
    uint32_t status = 0;
    CHECK_ERR(device->read_register(MMIO_CTL_ADDR, &status), {
      return -1;
    });
    bool is_done = (status & CTL_AP_DONE) == CTL_AP_DONE;
    if (is_done || 0 == timeout) {
      break;
    }
    nanosleep(&sleep_time, nullptr);
    // Clamp at zero: `timeout` is unsigned, so subtracting a step larger
    // than what is left would wrap to a huge value and the loop would keep
    // polling far beyond the requested time.
    timeout = (timeout > sleep_time_ms) ? (timeout - sleep_time_ms) : 0;
  }

  return 0;
}
|
||||
|
||||
// Writes a device configuration register: the DCR address goes to
// MMIO_DCR_ADDR and the value to MMIO_DCR_ADDR + 4 (the 64-bit `value` is
// narrowed to the 32-bit register width by write_register's parameter).
// The address/value pair is then recorded in the device's DCR cache.
// Returns 0 on success, -1 for a null handle or a failed register write.
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
  auto device = (vx_device*)hdevice;
  if (device == nullptr) {
    return -1;
  }

  CHECK_ERR(device->write_register(MMIO_DCR_ADDR, addr), {
    return -1;
  });

  CHECK_ERR(device->write_register(MMIO_DCR_ADDR + 4, value), {
    return -1;
  });

  // save the value
  DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%lx\n", addr, value);
  device->dcrs.write(addr, value);

  return 0;
}
|
||||
Reference in New Issue
Block a user