Conflicts:
	benchmarks/opencl/bfs/Makefile
	benchmarks/opencl/convolution/Makefile
	benchmarks/opencl/guassian/Makefile
	benchmarks/opencl/kmeans/Makefile
	benchmarks/opencl/nearn/Makefile
	benchmarks/opencl/saxpy/Makefile
	benchmarks/opencl/sfilter/Makefile
	benchmarks/opencl/sgemm/Makefile
	benchmarks/opencl/transpose/Makefile
	benchmarks/opencl/vecadd/Makefile
	driver/tests/basic/Makefile
	driver/tests/demo/Makefile
	driver/tests/dogfood/Makefile
	miscs/rvvector/vector_test/Makefile
	runtime/Makefile
	runtime/tests/dev/Makefile
	runtime/tests/hello/Makefile
	runtime/tests/nlTest/Makefile
	runtime/tests/simple/Makefile
	runtime/tests/vecadd/Makefile
This commit is contained in:
Blaise Tine
2020-10-12 20:33:32 -07:00
262 changed files with 405303 additions and 2855367 deletions

View File

@@ -1,4 +1,4 @@
all: stub rtlsim simx
all: stub rtlsim simx opae
stub:
$(MAKE) -C stub

View File

@@ -34,7 +34,7 @@ int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value);
int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer);
// Get host pointer address
volatile void* vx_host_ptr(vx_buffer_h hbuffer);
void* vx_host_ptr(vx_buffer_h hbuffer);
// release buffer
int vx_buf_release(vx_buffer_h hbuffer);

View File

@@ -1,9 +1,13 @@
OPAE_HOME ?= /tools/opae/1.4.0
#CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I../include -I/tools/opae/1.4.0/include -I../../hw
CXXFLAGS += -I../include -I$(OPAE_HOME)/include -I../../hw
LDFLAGS += -L/tools/opae/1.4.0/lib
LDFLAGS += -L$(OPAE_HOME)/lib
SCOPE=1
# stack execution protection
LDFLAGS +=-z noexecstack
@@ -21,51 +25,66 @@ CXXFLAGS += -fPIC
CXXFLAGS += -DDUMP_PERF_STATS
# Enable scope analyzer
#CXXFLAGS += -DSCOPE
LDFLAGS += -luuid
# Enable scope analyzer
ifdef SCOPE
CXXFLAGS += -DSCOPE
SET_SCOPE = SCOPE=1
endif
LDFLAGS += -shared
FPGA_LIBS += -lopae-c
FPGA_LIBS += -luuid -lopae-c
ASE_LIBS += -lopae-c-ase
ASE_LIBS += -luuid -lopae-c-ase
VLSIM_LIBS += -lopae-c-vlsim
LIB_DIR=../lib
ASE_DIR = ase
VLSIM_DIR = vlsim
PROJECT = libvortex.so
PROJECT_ASE = $(ASE_DIR)/libvortex.so
PROJECT_VLSIM = $(VLSIM_DIR)/libvortex.so
AFU_JSON_INFO = vortex_afu.h
SRCS = vortex.cpp scope.cpp ../common/vx_utils.cpp
SRCS = vortex.cpp vx_scope.cpp ../common/vx_utils.cpp
all: $(PROJECT) $(PROJECT_ASE)
all: vlsim
# AFU info from JSON file, including AFU UUID
$(AFU_JSON_INFO): ../../hw/opae/vortex_afu.json
json: ../../hw/opae/vortex_afu.json
afu_json_mgr json-info --afu-json=$^ --c-hdr=$@
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) $(FPGA_LIBS) -o $@
fpga: $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) $(FPGA_LIBS) -o $(PROJECT)
$(PROJECT_ASE): $(SRCS) $(ASE_DIR)
$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) $(ASE_LIBS) -o $@
ase: $(SRCS) $(ASE_DIR)
$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) $(ASE_LIBS) -o $(PROJECT_ASE)
vortex.o: vortex.cpp $(AFU_JSON_INFO)
vlsim: $(SRCS) opae-vlsim
$(CXX) $(CXXFLAGS) -L./vlsim -DUSE_VLSIM $(SRCS) $(LDFLAGS) $(VLSIM_LIBS) -o $(PROJECT_VLSIM)
opae-vlsim:
$(SET_SCOPE) $(MAKE) -C vlsim
vortex.o: vortex.cpp
$(CXX) $(CXXFLAGS) -c vortex.cpp -o $@
$(ASE_DIR):
mkdir -p ase
.depend: $(SRCS) $(AFU_JSON_INFO)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean:
rm -rf $(PROJECT) $(PROJECT_ASE) $(AFU_JSON_INFO) *.o .depend
rm -rf $(PROJECT) $(PROJECT_ASE) $(PROJECT_VLSIM) *.o .depend
$(MAKE) -C vlsim clean
ifneq ($(MAKECMDGOALS),clean)
-include .depend

View File

@@ -1,303 +0,0 @@
#include <iostream>
#include <fstream>
#include <thread>
#include <chrono>
#include <vector>
#include <assert.h>
#include <VX_config.h>
#include "scope.h"
#include "vortex_afu.h"
#define CHECK_RES(_expr) \
do { \
fpga_result res = _expr; \
if (res == FPGA_OK) \
break; \
printf("OPAE Error: '%s' returned %d, %s!\n", \
#_expr, (int)res, fpgaErrStr(res)); \
return -1; \
} while (false)
template<int N>
constexpr bool static_print() {
return (0 < N < 100);
}
#define MMIO_SCOPE_READ (AFU_IMAGE_MMIO_SCOPE_READ * 4)
#define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4)
struct scope_signal_t {
int width;
const char* name;
};
constexpr int ilog2(int n) {
return (n > 1) ? 1 +
ilog2(n >> 1) : 0;
}
static constexpr int NW_BITS = ilog2(NUM_WARPS);
#ifdef EXT_F_ENABLE
static constexpr int NR_BITS = ilog2(64);
#else
static constexpr int NR_BITS = ilog2(32);
#endif
static constexpr int EX_BITS = 3;
static constexpr int OP_BITS = 4;
static constexpr int MOD_BITS = 3;
static constexpr int ICORE_TAG_WIDTH = NW_BITS;
static constexpr int DCORE_TAG_WIDTH = ilog2(LSUQ_SIZE);
static constexpr scope_signal_t scope_signals[] = {
{ 32, "dram_req_addr" },
{ 1, "dram_req_rw" },
{ 16, "dram_req_byteen" },
{ 128, "dram_req_data" },
{ 29, "dram_req_tag" },
{ 128, "dram_rsp_data" },
{ 29, "dram_rsp_tag" },
{ 32, "snp_req_addr" },
{ 1, "snp_req_invalidate" },
{ 16, "snp_req_tag" },
{ 16, "snp_rsp_tag" },
{ NW_BITS, "icache_req_wid" },
{ 32, "icache_req_addr" },
{ ICORE_TAG_WIDTH, "icache_req_tag" },
{ 32, "icache_rsp_data" },
{ ICORE_TAG_WIDTH, "icache_rsp_tag" },
{ NW_BITS, "dcache_req_wid" },
{ 32, "dcache_req_pc" },
{ NUM_THREADS * 32, "dcache_req_addr" },
{ 1, "dcache_req_rw" },
{ NUM_THREADS * 4, "dcache_req_byteen" },
{ NUM_THREADS * 32, "dcache_req_data" },
{ DCORE_TAG_WIDTH, "dcache_req_tag" },
{ NUM_THREADS * 32, "dcache_rsp_data" },
{ DCORE_TAG_WIDTH, "dcache_rsp_tag" },
{ NW_BITS, "issue_wid" },
{ NUM_THREADS, "issue_tmask" },
{ 32, "issue_pc" },
{ EX_BITS, "issue_ex_type" },
{ OP_BITS, "issue_op_type" },
{ MOD_BITS, "issue_op_mod" },
{ 1, "issue_wb" },
{ NR_BITS, "issue_rd" },
{ NR_BITS, "issue_rs1" },
{ NR_BITS, "issue_rs2" },
{ NR_BITS, "issue_rs3" },
{ 32, "issue_imm" },
{ 1, "issue_rs1_is_pc" },
{ 1, "issue_rs2_is_imm" },
{ NW_BITS, "gpr_rsp_wid" },
{ 32, "gpr_rsp_pc" },
{ NUM_THREADS * 32, "gpr_rsp_a" },
{ NUM_THREADS * 32, "gpr_rsp_b" },
{ NUM_THREADS * 32, "gpr_rsp_c" },
{ NW_BITS, "writeback_wid" },
{ 32, "writeback_pc" },
{ NR_BITS, "writeback_rd" },
{ NUM_THREADS * 32, "writeback_data" },
{ 32, "bank_addr_st0" },
{ 32, "bank_addr_st1" },
{ 32, "bank_addr_st2" },
{ 1, "scope_bank_is_mrvq_st1" },
{ 1, "scope_bank_miss_st1" },
{ 1, "scope_bank_dirty_st1" },
{ 1, "scope_bank_force_miss_st1" },
///////////////////////////////////////////////////////////////////////////
{ 1, "dram_req_valid" },
{ 1, "dram_req_ready" },
{ 1, "dram_rsp_valid" },
{ 1, "dram_rsp_ready" },
{ 1, "snp_req_valid" },
{ 1, "snp_req_ready" },
{ 1, "snp_rsp_valid" },
{ 1, "snp_rsp_ready" },
{ 1, "icache_req_valid" },
{ 1, "icache_req_ready" },
{ 1, "icache_rsp_valid" },
{ 1, "icache_rsp_ready" },
{ NUM_THREADS, "dcache_req_valid" },
{ 1, "dcache_req_ready" },
{ NUM_THREADS, "dcache_rsp_valid" },
{ 1, "dcache_rsp_ready" },
{ 1, "bank_valid_st0" },
{ 1, "bank_valid_st1" },
{ 1, "bank_valid_st2" },
{ 1, "bank_stall_pipe" },
{ 1, "issue_valid" },
{ 1, "issue_ready" },
{ 1, "gpr_rsp_valid" },
{ 1, "writeback_valid" },
{ 1, "scoreboard_delay" },
{ 1, "gpr_delay" },
{ 1, "execute_delay" },
{ 1, "busy" },
};
static constexpr int num_signals = sizeof(scope_signals) / sizeof(scope_signal_t);
constexpr int calcFrameWidth(int index = 0) {
return (index < num_signals) ? (scope_signals[index].width + calcFrameWidth(index + 1)) : 0;
}
static constexpr int fwidth = calcFrameWidth();
static_assert(fwidth == 1766, "invalid size");
int vx_scope_start(fpga_handle hfpga, uint64_t delay) {
if (nullptr == hfpga)
return -1;
if (delay != uint64_t(-1)) {
// set start delay
uint64_t cmd_delay = ((delay << 3) | 4);
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_delay));
std::cout << "scope start delay: " << delay << std::endl;
}
return 0;
}
int vx_scope_stop(fpga_handle hfpga, uint64_t delay) {
if (nullptr == hfpga)
return -1;
if (delay != uint64_t(-1)) {
// stop recording
uint64_t cmd_stop = ((delay << 3) | 5);
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_stop));
std::cout << "scope stop delay: " << delay << std::endl;
}
std::ofstream ofs("vx_scope.vcd");
ofs << "$timescale 1 ns $end" << std::endl;
ofs << "$var reg 1 0 clk $end" << std::endl;
ofs << "enddefinitions $end" << std::endl;
uint64_t frame_width, max_frames, data_valid;
// wait for recording to terminate
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 0));
do {
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid));
if (data_valid)
break;
std::this_thread::sleep_for(std::chrono::seconds(1));
} while (true);
std::cout << "scope trace dump begin..." << std::endl;
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 2));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &frame_width));
std::cout << "scope::frame_width=" << std::dec << frame_width << std::endl;
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 3));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &max_frames));
std::cout << "scope::max_frames=" << std::dec << max_frames << std::endl;
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 1));
if (fwidth != (int)frame_width) {
std::cerr << "invalid frame_width: expecting " << std::dec << fwidth << "!" << std::endl;
std::abort();
}
std::vector<char> signal_data(frame_width+1);
uint64_t frame_offset = 0;
uint64_t frame_no = 0;
uint64_t timestamp = 0;
int signal_id = 0;
int signal_offset = 0;
auto print_header = [&] () {
ofs << '#' << timestamp++ << std::endl;
ofs << "b0 0" << std::endl;
ofs << '#' << timestamp++ << std::endl;
ofs << "b1 0" << std::endl;
uint64_t delta;
fpga_result res = fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &delta);
assert(res == FPGA_OK);
while (delta != 0) {
ofs << '#' << timestamp++ << std::endl;
ofs << "b0 0" << std::endl;
ofs << '#' << timestamp++ << std::endl;
ofs << "b1 0" << std::endl;
--delta;
}
signal_id = num_signals;
};
print_header();
do {
if (frame_no == (max_frames-1)) {
// verify last frame is valid
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 0));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid));
assert(data_valid == 1);
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 1));
}
uint64_t word;
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &word));
do {
int signal_width = scope_signals[signal_id-1].width;
int word_offset = frame_offset % 64;
signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
++signal_offset;
++frame_offset;
if (signal_offset == signal_width) {
signal_data[signal_width] = 0; // string null termination
ofs << 'b' << signal_data.data() << ' ' << signal_id << std::endl;
signal_offset = 0;
--signal_id;
}
if (frame_offset == frame_width) {
assert(0 == signal_offset);
frame_offset = 0;
++frame_no;
if (frame_no != max_frames) {
print_header();
}
}
} while ((frame_offset % 64) != 0);
} while (frame_no != max_frames);
std::cout << "scope trace dump done! - " << (timestamp/2) << " cycles" << std::endl;
// verify data not valid
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 0));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid));
assert(data_valid == 0);
return 0;
}

1
driver/opae/vlsim/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/obj_dir/*

View File

@@ -0,0 +1,91 @@
#CFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -Wfatal-errors
CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
CFLAGS += -I../../../../hw
# control RTL debug print states
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_SNP
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSRQ
DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_CORE_REQ_INFO
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CORE_REQ_INFO
#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=1
DEBUG=1
SCOPE=1
CFLAGS += -fPIC
CFLAGS += -DUSE_RTLSIM $(CONFIGS)
CFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
# LDFLAGS += -dynamiclib -pthread
TOP = vortex_afu_shim
RTL_DIR=../../../hw/rtl
SRCS = fpga.cpp opae_sim.cpp
SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
VL_FLAGS += -Wno-DECLFILENAME
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
# Enable Verilator multithreaded simulation
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
#VL_FLAGS += --threads $(THREADS)
# Debugigng
ifdef DEBUG
VL_FLAGS += -DVCD_OUTPUT --assert --trace $(DBG_FLAGS)
CFLAGS += -DVCD_OUTPUT $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CFLAGS += -DNDEBUG
endif
# Enable scope analyzer
ifdef SCOPE
VL_FLAGS += -DSCOPE
CFLAGS += -DSCOPE
SCOPE_CFG = scope
endif
VL_FLAGS += -DNOPAE
CFLAGS += -DNOPAE
RTL_INCLUDE += -I../../../hw/opae -I../../../hw/opae/ccip
PROJECT = libopae-c-vlsim.so
all: $(PROJECT)
# generate scope data
scope: ../../../hw/scripts/scope.json
../../../hw/scripts/scope.py $(RTL_INCLUDE) $(CONFIGS) -cc ../scope-defs.h -vl ../../../hw/rtl/scope-defs.vh ../../../hw/scripts/scope.json
$(PROJECT): $(SRCS) $(SCOPE_CFG)
verilator --exe --cc $(TOP) --top-module $(TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
OPT_FAST="-O0 -g" make -j -C obj_dir -f V$(TOP).mk
clean:
rm -rf $(PROJECT) obj_dir ../scope-defs.h ../../../hw/rtl/scope-defs.vh

View File

@@ -0,0 +1,94 @@
#include <stdint.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cstdlib>
#include <unistd.h>
#include <assert.h>
#include "fpga.h"
#include "opae_sim.h"
#include <VX_config.h>
extern fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, int flags) {
if (NULL == handle || flags != 0)
return FPGA_INVALID_PARAM;
auto sim = new opae_sim();
*handle = reinterpret_cast<fpga_handle>(sim);
return FPGA_OK;
}
extern fpga_result fpgaClose(fpga_handle handle) {
if (NULL == handle)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
delete sim;
return FPGA_OK;
}
extern fpga_result fpgaPrepareBuffer(fpga_handle handle, uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
if (NULL == handle || len == 0 || buf_addr == NULL || wsid == NULL)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
int ret = sim->prepare_buffer(len, buf_addr, wsid, flags);
if (ret != 0)
return FPGA_NO_MEMORY;
return FPGA_OK;
}
extern fpga_result fpgaReleaseBuffer(fpga_handle handle, uint64_t wsid) {
if (NULL == handle)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
sim->release_buffer(wsid);
return FPGA_OK;
}
extern fpga_result fpgaGetIOAddress(fpga_handle handle, uint64_t wsid, uint64_t *ioaddr) {
if (NULL == handle || ioaddr == NULL)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
sim->get_io_address(wsid, ioaddr);
return FPGA_OK;
}
extern fpga_result fpgaWriteMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t value) {
if (NULL == handle || mmio_num != 0)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
sim->write_mmio64(mmio_num, offset, value);
return FPGA_OK;
}
extern fpga_result fpgaReadMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t *value) {
if (NULL == handle || mmio_num != 0 || value == NULL)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
sim->read_mmio64(mmio_num, offset, value);
return FPGA_OK;
}
extern fpga_result fpgaFlush(fpga_handle handle) {
if (NULL == handle)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
sim->flush();
return FPGA_OK;
}
extern const char *fpgaErrStr(fpga_result e) {
return "";
}

50
driver/opae/vlsim/fpga.h Normal file
View File

@@ -0,0 +1,50 @@
#ifndef __FPGA_H__
#define __FPGA_H__
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef enum {
FPGA_OK = 0, /**< Operation completed successfully */
FPGA_INVALID_PARAM, /**< Invalid parameter supplied */
FPGA_BUSY, /**< Resource is busy */
FPGA_EXCEPTION, /**< An exception occurred */
FPGA_NOT_FOUND, /**< A required resource was not found */
FPGA_NO_MEMORY, /**< Not enough memory to complete operation */
FPGA_NOT_SUPPORTED, /**< Requested operation is not supported */
FPGA_NO_DRIVER, /**< Driver is not loaded */
FPGA_NO_DAEMON, /**< FPGA Daemon (fpgad) is not running */
FPGA_NO_ACCESS, /**< Insufficient privileges or permissions */
FPGA_RECONF_ERROR /**< Error while reconfiguring FPGA */
} fpga_result;
typedef void *fpga_handle;
typedef void *fpga_token;
fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, int flags);
fpga_result fpgaClose(fpga_handle handle);
fpga_result fpgaPrepareBuffer(fpga_handle handle, uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
fpga_result fpgaReleaseBuffer(fpga_handle handle, uint64_t wsid);
fpga_result fpgaGetIOAddress(fpga_handle handle, uint64_t wsid, uint64_t *ioaddr);
fpga_result fpgaWriteMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t value);
fpga_result fpgaReadMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t *value);
fpga_result fpgaFlush(fpga_handle handle);
const char *fpgaErrStr(fpga_result e);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // __FPGA_H__

View File

@@ -0,0 +1,283 @@
#include "opae_sim.h"
#include <iostream>
#include <fstream>
#include <iomanip>
#define CCI_LATENCY 8
#define CCI_RAND_MOD 8
#define CCI_RQ_SIZE 16
#define CCI_WQ_SIZE 16
#define ENABLE_DRAM_STALLS
#define DRAM_LATENCY 4
#define DRAM_RQ_SIZE 16
#define DRAM_STALLS_MODULO 16
uint64_t timestamp = 0;
double sc_time_stamp() {
return timestamp;
}
opae_sim::opae_sim() {
// force random values for unitialized signals
Verilated::randReset(2);
Verilated::randSeed(50);
// Turn off assertion before reset
Verilated::assertOn(false);
vortex_afu_ = new Vvortex_afu_shim();
#ifdef VCD_OUTPUT
Verilated::traceEverOn(true);
trace_ = new VerilatedVcdC();
vortex_afu_->trace(trace_, 99);
trace_->open("trace.vcd");
#endif
this->reset();
stop_ = false;
future_ = std::async(std::launch::async, [&]{
while (!stop_) {
std::lock_guard<std::mutex> guard(mutex_);
this->step();
}
});
}
opae_sim::~opae_sim() {
stop_ = true;
if (future_.valid()) {
future_.wait();
}
#ifdef VCD_OUTPUT
trace_->close();
#endif
delete vortex_afu_;
}
int opae_sim::prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
auto alloc = aligned_alloc(CACHE_BLOCK_SIZE, len);
if (alloc == NULL)
return -1;
host_buffer_t buffer;
buffer.data = (uint64_t*)alloc;
buffer.size = len;
buffer.ioaddr = uintptr_t(alloc);
auto index = host_buffers_.size();
host_buffers_.emplace(index, buffer);
*buf_addr = alloc;
*wsid = index;
return 0;
}
void opae_sim::release_buffer(uint64_t wsid) {
auto it = host_buffers_.find(wsid);
if (it != host_buffers_.end()) {
free(it->second.data);
host_buffers_.erase(it);
}
}
void opae_sim::get_io_address(uint64_t wsid, uint64_t *ioaddr) {
*ioaddr = host_buffers_[wsid].ioaddr;
}
void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) {
std::lock_guard<std::mutex> guard(mutex_);
vortex_afu_->vcp2af_sRxPort_c0_mmioWrValid = 1;
vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, &value, 8);
this->step();
assert(!vortex_afu_->vcp2af_sRxPort_c0_mmioWrValid);
}
void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
std::lock_guard<std::mutex> guard(mutex_);
vortex_afu_->vcp2af_sRxPort_c0_mmioRdValid = 1;
vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
vortex_afu_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
this->step();
assert(!vortex_afu_->vcp2af_sRxPort_c0_mmioRdValid);
assert(vortex_afu_->af2cp_sTxPort_c2_mmioRdValid);
*value = vortex_afu_->af2cp_sTxPort_c2_data;
}
void opae_sim::flush() {
// flush pending CCI requests
}
///////////////////////////////////////////////////////////////////////////////
void opae_sim::reset() {
vortex_afu_->reset = 1;
this->step();
vortex_afu_->reset = 0;
// Turn on assertion after reset
Verilated::assertOn(true);
}
void opae_sim::step() {
vortex_afu_->clk = 0;
this->eval();
vortex_afu_->clk = 1;
this->eval();
this->sRxPort_bus();
this->sTxPort_bus();
this->avs_bus();
#ifndef NDEBUG
fflush(stdout);
#endif
}
void opae_sim::eval() {
vortex_afu_->eval();
#ifdef VCD_OUTPUT
trace_->dump(timestamp);
#endif
++timestamp;
}
void opae_sim::sRxPort_bus() {
// schedule CCI read responses
int cci_rd_index = -1;
for (int i = 0; i < cci_reads_.size(); i++) {
if (cci_reads_[i].cycles_left > 0) {
cci_reads_[i].cycles_left -= 1;
}
if ((cci_rd_index == -1)
&& (cci_reads_[i].cycles_left == 0)) {
cci_rd_index = i;
}
}
// schedule CCI write responses
int cci_wr_index = -1;
for (int i = 0; i < cci_writes_.size(); i++) {
if (cci_writes_[i].cycles_left > 0) {
cci_writes_[i].cycles_left -= 1;
}
if ((cci_wr_index == -1)
&& (cci_writes_[i].cycles_left == 0)) {
cci_wr_index = i;
}
}
// send CCI read response
vortex_afu_->vcp2af_sRxPort_c0_rspValid = 0;
if (cci_rd_index != -1) {
vortex_afu_->vcp2af_sRxPort_c0_rspValid = 1;
memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_reads_[cci_rd_index].block.data(), CACHE_BLOCK_SIZE);
vortex_afu_->vcp2af_sRxPort_c0_hdr_mdata = cci_reads_[cci_rd_index].mdata;
cci_reads_.erase(cci_reads_.begin() + cci_rd_index);
}
// send CCI write response
vortex_afu_->vcp2af_sRxPort_c1_rspValid = 0;
if (cci_wr_index != -1) {
vortex_afu_->vcp2af_sRxPort_c1_rspValid = 1;
vortex_afu_->vcp2af_sRxPort_c1_hdr_mdata = cci_writes_[cci_wr_index].mdata;
cci_writes_.erase(cci_writes_.begin() + cci_wr_index);
}
// mmio
vortex_afu_->vcp2af_sRxPort_c0_mmioWrValid = 0;
vortex_afu_->vcp2af_sRxPort_c0_mmioRdValid = 0;
}
void opae_sim::sTxPort_bus() {
// check read queue size
vortex_afu_->vcp2af_sRxPort_c0_TxAlmFull = (cci_reads_.size() >= CCI_RQ_SIZE);
// check write queue size
vortex_afu_->vcp2af_sRxPort_c1_TxAlmFull = (cci_writes_.size() >= CCI_WQ_SIZE);
// process read requests
if (vortex_afu_->af2cp_sTxPort_c0_valid && !vortex_afu_->vcp2af_sRxPort_c0_TxAlmFull) {
cci_rd_req_t cci_req;
cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD);
cci_req.mdata = vortex_afu_->af2cp_sTxPort_c0_hdr_mdata;
auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE);
memcpy(cci_req.block.data(), host_ptr, CACHE_BLOCK_SIZE);
cci_reads_.push_back(cci_req);
}
// process write requests
if (vortex_afu_->af2cp_sTxPort_c1_valid && !vortex_afu_->vcp2af_sRxPort_c1_TxAlmFull) {
cci_wr_req_t cci_req;
cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD);
cci_req.mdata = vortex_afu_->af2cp_sTxPort_c1_hdr_mdata;
auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c1_hdr_address * CACHE_BLOCK_SIZE);
memcpy(host_ptr, vortex_afu_->af2cp_sTxPort_c1_data, CACHE_BLOCK_SIZE);
cci_writes_.push_back(cci_req);
}
}
void opae_sim::avs_bus() {
// schedule DRAM read responses
int dram_rd_index = -1;
for (int i = 0; i < dram_reads_.size(); i++) {
if (dram_reads_[i].cycles_left > 0) {
dram_reads_[i].cycles_left -= 1;
}
if ((dram_rd_index == -1)
&& (dram_reads_[i].cycles_left == 0)) {
dram_rd_index = i;
}
}
// send DRAM response
vortex_afu_->avs_readdatavalid = 0;
if (dram_rd_index != -1) {
vortex_afu_->avs_readdatavalid = 1;
memcpy(vortex_afu_->avs_readdata, dram_reads_[dram_rd_index].block.data(), CACHE_BLOCK_SIZE);
dram_reads_.erase(dram_reads_.begin() + dram_rd_index);
}
// handle DRAM stalls
bool dram_stalled = false;
#ifdef ENABLE_DRAM_STALLS
if (0 == ((timestamp/2) % DRAM_STALLS_MODULO)) {
dram_stalled = true;
} else
if (dram_reads_.size() >= DRAM_RQ_SIZE) {
dram_stalled = true;
}
#endif
// process DRAM requests
if (!dram_stalled) {
if (vortex_afu_->avs_write) {
assert(0 == vortex_afu_->mem_bank_select);
uint64_t byteen = vortex_afu_->avs_byteenable;
unsigned base_addr = (vortex_afu_->avs_address * CACHE_BLOCK_SIZE);
uint8_t* data = (uint8_t*)(vortex_afu_->avs_writedata);
for (int i = 0; i < CACHE_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
ram_[base_addr + i] = data[i];
}
}
}
if (vortex_afu_->avs_read) {
assert(0 == vortex_afu_->mem_bank_select);
dram_rd_req_t dram_req;
dram_req.cycles_left = DRAM_LATENCY;
unsigned base_addr = (vortex_afu_->avs_address * CACHE_BLOCK_SIZE);
ram_.read(base_addr, CACHE_BLOCK_SIZE, dram_req.block.data());
dram_reads_.push_back(dram_req);
}
}
vortex_afu_->avs_waitrequest = dram_stalled;
}

View File

@@ -0,0 +1,92 @@
#pragma once
#include "Vvortex_afu_shim.h"
#include "Vvortex_afu_shim__Syms.h"
#include "verilated.h"
#ifdef VCD_OUTPUT
#include <verilated_vcd_c.h>
#endif
#include <VX_config.h>
#include "ram.h"
#include <ostream>
#include <future>
#include <vector>
#include <unordered_map>
#define CACHE_BLOCK_SIZE 64
class opae_sim {
public:
opae_sim();
virtual ~opae_sim();
int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
void release_buffer(uint64_t wsid);
void get_io_address(uint64_t wsid, uint64_t *ioaddr);
void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value);
void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value);
void flush();
private:
typedef struct {
int cycles_left;
std::array<uint8_t, CACHE_BLOCK_SIZE> block;
unsigned tag;
} dram_rd_req_t;
typedef struct {
int cycles_left;
std::array<uint8_t, CACHE_BLOCK_SIZE> block;
unsigned mdata;
} cci_rd_req_t;
typedef struct {
int cycles_left;
unsigned mdata;
} cci_wr_req_t;
typedef struct {
uint64_t* data;
size_t size;
uint64_t ioaddr;
} host_buffer_t;
void reset();
void eval();
void step();
void sRxPort_bus();
void sTxPort_bus();
void avs_bus();
std::future<void> future_;
bool stop_;
std::unordered_map<int64_t, host_buffer_t> host_buffers_;
std::vector<dram_rd_req_t> dram_reads_;
std::vector<cci_rd_req_t> cci_reads_;
std::vector<cci_wr_req_t> cci_writes_;
std::mutex mutex_;
RAM ram_;
Vvortex_afu_shim *vortex_afu_;
#ifdef VCD_OUTPUT
VerilatedVcdC *trace_;
#endif
};

64
driver/opae/vlsim/ram.h Normal file
View File

@@ -0,0 +1,64 @@
#pragma once
#include <stdio.h>
#include <stdint.h>
class RAM {
private:
mutable uint8_t *mem_[(1 << 12)];
uint8_t *get(uint32_t address) const {
uint32_t block_addr = address >> 20;
uint32_t block_offset = address & 0x000FFFFF;
if (mem_[block_addr] == NULL) {
mem_[block_addr] = new uint8_t[(1 << 20)];
}
return mem_[block_addr] + block_offset;
}
public:
RAM() {
for (uint32_t i = 0; i < (1 << 12); i++) {
mem_[i] = NULL;
}
}
~RAM() {
this->clear();
}
size_t size() const {
return (1ull << 32);
}
void clear() {
for (uint32_t i = 0; i < (1 << 12); i++) {
if (mem_[i]) {
delete mem_[i];
mem_[i] = NULL;
}
}
}
void read(uint32_t address, uint32_t length, uint8_t *data) const {
for (unsigned i = 0; i < length; i++) {
data[i] = *this->get(address + i);
}
}
void write(uint32_t address, uint32_t length, const uint8_t *data) {
for (unsigned i = 0; i < length; i++) {
*this->get(address + i) = data[i];
}
}
uint8_t& operator[](uint32_t address) {
return *get(address);
}
const uint8_t& operator[](uint32_t address) const {
return *get(address);
}
};

View File

@@ -0,0 +1,9 @@
`verilator_config
lint_off -rule BLKANDNBLK -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule UNOPTFLAT -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule WIDTH -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule UNUSED -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule LITENDIAN -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule IMPORTSTAR -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule PINCONNECTEMPTY -file "../../../hw/rtl/fp_cores/fpnew/*"

View File

@@ -0,0 +1,170 @@
`include "vortex_afu.vh"
`include "VX_define.vh"
/* verilator lint_off IMPORTSTAR */
import ccip_if_pkg::*;
import local_mem_cfg_pkg::*;
/* verilator lint_on IMPORTSTAR */
module vortex_afu_shim #(
parameter NUM_LOCAL_MEM_BANKS = 2
) (
// global signals
input clk,
input reset,
// IF signals between CCI and AFU
input logic vcp2af_sRxPort_c0_TxAlmFull,
input logic vcp2af_sRxPort_c1_TxAlmFull,
input t_ccip_vc vcp2af_sRxPort_c0_hdr_vc_used,
input logic vcp2af_sRxPort_c0_hdr_rsvd1,
input logic vcp2af_sRxPort_c0_hdr_hit_miss,
input logic [1:0] vcp2af_sRxPort_c0_hdr_rsvd0,
input t_ccip_clNum vcp2af_sRxPort_c0_hdr_cl_num,
input t_ccip_c0_rsp vcp2af_sRxPort_c0_hdr_resp_type,
input t_ccip_mdata vcp2af_sRxPort_c0_hdr_mdata,
input t_ccip_clData vcp2af_sRxPort_c0_data,
input logic vcp2af_sRxPort_c0_rspValid,
input logic vcp2af_sRxPort_c0_mmioRdValid,
input logic vcp2af_sRxPort_c0_mmioWrValid,
input t_ccip_mmioAddr vcp2af_sRxPort_c0_ReqMmioHdr_address,
input logic [1:0] vcp2af_sRxPort_c0_ReqMmioHdr_length,
input logic vcp2af_sRxPort_c0_ReqMmioHdr_rsvd,
input t_ccip_tid vcp2af_sRxPort_c0_ReqMmioHdr_tid,
input t_ccip_vc vcp2af_sRxPort_c1_hdr_vc_used,
input logic vcp2af_sRxPort_c1_hdr_rsvd1,
input logic vcp2af_sRxPort_c1_hdr_hit_miss,
input logic vcp2af_sRxPort_c1_hdr_format,
input logic vcp2af_sRxPort_c1_hdr_rsvd0,
input t_ccip_clNum vcp2af_sRxPort_c1_hdr_cl_num,
input t_ccip_c1_rsp vcp2af_sRxPort_c1_hdr_resp_type,
input t_ccip_mdata vcp2af_sRxPort_c1_hdr_mdata,
input logic vcp2af_sRxPort_c1_rspValid,
output t_ccip_vc af2cp_sTxPort_c0_hdr_vc_sel,
output logic [1:0] af2cp_sTxPort_c0_hdr_rsvd1,
output t_ccip_clLen af2cp_sTxPort_c0_hdr_cl_len,
output t_ccip_c0_req af2cp_sTxPort_c0_hdr_req_type,
output logic [5:0] af2cp_sTxPort_c0_hdr_rsvd0,
output t_ccip_clAddr af2cp_sTxPort_c0_hdr_address,
output t_ccip_mdata af2cp_sTxPort_c0_hdr_mdata,
output logic af2cp_sTxPort_c0_valid,
output logic [5:0] af2cp_sTxPort_c1_hdr_rsvd2,
output t_ccip_vc af2cp_sTxPort_c1_hdr_vc_sel,
output logic af2cp_sTxPort_c1_hdr_sop,
output logic af2cp_sTxPort_c1_hdr_rsvd1,
output t_ccip_clLen af2cp_sTxPort_c1_hdr_cl_len,
output t_ccip_c1_req af2cp_sTxPort_c1_hdr_req_type,
output logic [5:0] af2cp_sTxPort_c1_hdr_rsvd0,
output t_ccip_clAddr af2cp_sTxPort_c1_hdr_address,
output t_ccip_mdata af2cp_sTxPort_c1_hdr_mdata,
output t_ccip_clData af2cp_sTxPort_c1_data,
output logic af2cp_sTxPort_c1_valid,
output t_ccip_tid af2cp_sTxPort_c2_hdr_tid,
output logic af2cp_sTxPort_c2_mmioRdValid,
output t_ccip_mmioData af2cp_sTxPort_c2_data,
// Avalon signals for local memory access
output t_local_mem_data avs_writedata,
input t_local_mem_data avs_readdata,
output t_local_mem_addr avs_address,
input logic avs_waitrequest,
output logic avs_write,
output logic avs_read,
output t_local_mem_byte_mask avs_byteenable,
output t_local_mem_burst_cnt avs_burstcount,
input avs_readdatavalid,
output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select
);
t_if_ccip_Rx cp2af_sRxPort;
t_if_ccip_Tx af2cp_sTxPort;
vortex_afu #(
.NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS)
) vortex_afu (
.clk(clk),
.reset(reset),
.cp2af_sRxPort(cp2af_sRxPort),
.af2cp_sTxPort(af2cp_sTxPort),
.avs_writedata(avs_writedata),
.avs_readdata(avs_readdata),
.avs_address(avs_address),
.avs_waitrequest(avs_waitrequest),
.avs_write(avs_write),
.avs_read(avs_read),
.avs_byteenable(avs_byteenable),
.avs_burstcount(avs_burstcount),
.avs_readdatavalid(avs_readdatavalid),
.mem_bank_select(mem_bank_select)
);
t_if_ccip_c0_RxHdr c0_RxHdr;
always @ (*) begin
c0_RxHdr = 'x;
if (vcp2af_sRxPort_c0_mmioWrValid || vcp2af_sRxPort_c0_mmioRdValid) begin
c0_RxHdr.reqMmioHdr.address = vcp2af_sRxPort_c0_ReqMmioHdr_address;
c0_RxHdr.reqMmioHdr.length = vcp2af_sRxPort_c0_ReqMmioHdr_length;
c0_RxHdr.reqMmioHdr.rsvd = vcp2af_sRxPort_c0_ReqMmioHdr_rsvd;
c0_RxHdr.reqMmioHdr.tid = vcp2af_sRxPort_c0_ReqMmioHdr_tid;
end else begin
c0_RxHdr.rspMemHdr.vc_used = vcp2af_sRxPort_c0_hdr_vc_used;
c0_RxHdr.rspMemHdr.rsvd1 = vcp2af_sRxPort_c0_hdr_rsvd1;
c0_RxHdr.rspMemHdr.hit_miss = vcp2af_sRxPort_c0_hdr_hit_miss;
c0_RxHdr.rspMemHdr.rsvd0 = vcp2af_sRxPort_c0_hdr_rsvd0;
c0_RxHdr.rspMemHdr.cl_num = vcp2af_sRxPort_c0_hdr_cl_num;
c0_RxHdr.rspMemHdr.resp_type = vcp2af_sRxPort_c0_hdr_resp_type;
c0_RxHdr.rspMemHdr.mdata = vcp2af_sRxPort_c0_hdr_mdata;
end
end
assign cp2af_sRxPort.c0TxAlmFull = vcp2af_sRxPort_c0_TxAlmFull;
assign cp2af_sRxPort.c1TxAlmFull = vcp2af_sRxPort_c1_TxAlmFull;
assign cp2af_sRxPort.c0.hdr = c0_RxHdr;
assign cp2af_sRxPort.c0.data = vcp2af_sRxPort_c0_data;
assign cp2af_sRxPort.c0.rspValid = vcp2af_sRxPort_c0_rspValid;
assign cp2af_sRxPort.c0.mmioRdValid = vcp2af_sRxPort_c0_mmioRdValid;
assign cp2af_sRxPort.c0.mmioWrValid = vcp2af_sRxPort_c0_mmioWrValid;
assign cp2af_sRxPort.c1.hdr.vc_used = vcp2af_sRxPort_c1_hdr_vc_used;
assign cp2af_sRxPort.c1.hdr.rsvd1 = vcp2af_sRxPort_c1_hdr_rsvd1;
assign cp2af_sRxPort.c1.hdr.hit_miss = vcp2af_sRxPort_c1_hdr_hit_miss;
assign cp2af_sRxPort.c1.hdr.format = vcp2af_sRxPort_c1_hdr_format;
assign cp2af_sRxPort.c1.hdr.rsvd0 = vcp2af_sRxPort_c1_hdr_rsvd0;
assign cp2af_sRxPort.c1.hdr.cl_num = vcp2af_sRxPort_c1_hdr_cl_num;
assign cp2af_sRxPort.c1.hdr.resp_type = vcp2af_sRxPort_c1_hdr_resp_type;
assign cp2af_sRxPort.c1.hdr.mdata = vcp2af_sRxPort_c1_hdr_mdata;
assign cp2af_sRxPort.c1.rspValid = vcp2af_sRxPort_c1_rspValid;
assign af2cp_sTxPort_c0_hdr_vc_sel = af2cp_sTxPort.c0.hdr.vc_sel;
assign af2cp_sTxPort_c0_hdr_rsvd1 = af2cp_sTxPort.c0.hdr.rsvd1;
assign af2cp_sTxPort_c0_hdr_cl_len = af2cp_sTxPort.c0.hdr.cl_len;
assign af2cp_sTxPort_c0_hdr_req_type = af2cp_sTxPort.c0.hdr.req_type;
assign af2cp_sTxPort_c0_hdr_rsvd0 = af2cp_sTxPort.c0.hdr.rsvd0;
assign af2cp_sTxPort_c0_hdr_address = af2cp_sTxPort.c0.hdr.address;
assign af2cp_sTxPort_c0_hdr_mdata = af2cp_sTxPort.c0.hdr.mdata;
assign af2cp_sTxPort_c0_valid = af2cp_sTxPort.c0.valid;
assign af2cp_sTxPort_c1_hdr_rsvd2 = af2cp_sTxPort.c1.hdr.rsvd2;
assign af2cp_sTxPort_c1_hdr_vc_sel = af2cp_sTxPort.c1.hdr.vc_sel;
assign af2cp_sTxPort_c1_hdr_sop = af2cp_sTxPort.c1.hdr.sop;
assign af2cp_sTxPort_c1_hdr_rsvd1 = af2cp_sTxPort.c1.hdr.rsvd1;
assign af2cp_sTxPort_c1_hdr_cl_len = af2cp_sTxPort.c1.hdr.cl_len;
assign af2cp_sTxPort_c1_hdr_req_type = af2cp_sTxPort.c1.hdr.req_type;
assign af2cp_sTxPort_c1_hdr_rsvd0 = af2cp_sTxPort.c1.hdr.rsvd0;
assign af2cp_sTxPort_c1_hdr_address = af2cp_sTxPort.c1.hdr.address;
assign af2cp_sTxPort_c1_hdr_mdata = af2cp_sTxPort.c1.hdr.mdata;
assign af2cp_sTxPort_c1_data = af2cp_sTxPort.c1.data;
assign af2cp_sTxPort_c1_valid = af2cp_sTxPort.c1.valid;
assign af2cp_sTxPort_c2_hdr_tid = af2cp_sTxPort.c2.hdr.tid;
assign af2cp_sTxPort_c2_mmioRdValid = af2cp_sTxPort.c2.mmioRdValid;
assign af2cp_sTxPort_c2_data = af2cp_sTxPort.c2.data;
endmodule

View File

@@ -6,18 +6,25 @@
#include <unistd.h>
#include <assert.h>
#include <cmath>
#include <uuid/uuid.h>
#ifdef USE_VLSIM
#include "vlsim/fpga.h"
#else
#include <opae/fpga.h>
#include <uuid/uuid.h>
#endif
#include <vortex.h>
#include <VX_config.h>
#include "vortex_afu.h"
#ifdef SCOPE
#include "scope.h"
#include "vx_scope.h"
#endif
#define CACHE_LINESIZE 64
#define ALLOC_BASE_ADDR 0x10000000
#define LOCAL_MEM_SIZE 0xffffffff
#define CACHE_BLOCK_SIZE 64
#define ALLOC_BASE_ADDR 0x10000000
#define LOCAL_MEM_SIZE 0xffffffff
#define CHECK_RES(_expr) \
do { \
@@ -61,7 +68,7 @@ typedef struct vx_device_ {
typedef struct vx_buffer_ {
uint64_t wsid;
volatile void* host_ptr;
void* host_ptr;
uint64_t io_addr;
vx_device_h hdevice;
size_t size;
@@ -99,7 +106,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
*value = device->num_threads;
break;
case VX_CAPS_CACHE_LINESIZE:
*value = CACHE_LINESIZE;
*value = CACHE_BLOCK_SIZE;
break;
case VX_CAPS_LOCAL_MEM_SIZE:
*value = LOCAL_MEM_SIZE;
@@ -122,14 +129,16 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
extern int vx_dev_open(vx_device_h* hdevice) {
if (nullptr == hdevice)
return -1;
fpga_properties filter = nullptr;
fpga_result res;
fpga_guid guid;
fpga_token accel_token;
uint32_t num_matches;
fpga_handle accel_handle;
fpga_result res;
fpga_handle accel_handle;
vx_device_t* device;
#ifndef USE_VLSIM
fpga_token accel_token;
fpga_properties filter = nullptr;
fpga_guid guid;
uint32_t num_matches;
// Set up a filter that will search for an accelerator
fpgaGetProperties(nullptr, &filter);
@@ -159,6 +168,13 @@ extern int vx_dev_open(vx_device_h* hdevice) {
// Done with token
fpgaDestroyToken(&accel_token);
#else
// Open accelerator
res = fpgaOpen(NULL, &accel_handle, 0);
if (FPGA_OK != res) {
return -1;
}
#endif
// allocate device object
device = (vx_device_t*)malloc(sizeof(vx_device_t));
@@ -174,9 +190,9 @@ extern int vx_dev_open(vx_device_h* hdevice) {
// Load device CAPS
int ret = 0;
ret |= vx_csr_get(device, 0, CSR_MIMPID, &device->implementation_id);
ret |= vx_csr_get(device, 0, CSR_NC, &device->num_cores);
ret |= vx_csr_get(device, 0, CSR_NW, &device->num_warps);
ret |= vx_csr_get(device, 0, CSR_NT, &device->num_threads);
ret |= vx_csr_get(device, 0, CSR_NC, &device->num_cores);
ret |= vx_csr_get(device, 0, CSR_NW, &device->num_warps);
ret |= vx_csr_get(device, 0, CSR_NT, &device->num_threads);
if (ret != 0) {
fpgaClose(accel_handle);
return ret;
@@ -237,8 +253,6 @@ extern int vx_dev_close(vx_device_h hdevice) {
fpgaClose(device->fpga);
free(device);
return 0;
}
@@ -251,7 +265,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr)
vx_device_t *device = ((vx_device_t*)hdevice);
size_t dev_mem_size = LOCAL_MEM_SIZE;
size_t asize = align_size(size, CACHE_LINESIZE);
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
if (device->mem_allocation + asize > dev_mem_size)
return -1;
@@ -276,7 +290,7 @@ extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hb
vx_device_t *device = ((vx_device_t*)hdevice);
size_t asize = align_size(size, CACHE_LINESIZE);
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
res = fpgaPrepareBuffer(device->fpga, asize, &host_ptr, &wsid, 0);
if (FPGA_OK != res) {
@@ -308,11 +322,15 @@ extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hb
return 0;
}
extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
extern void* vx_host_ptr(vx_buffer_h hbuffer) {
if (nullptr == hbuffer)
return nullptr;
vx_buffer_t* buffer = ((vx_buffer_t*)hbuffer);
#ifdef USE_VLSIM
vx_device_t *device = ((vx_device_t*)buffer->hdevice);
fpgaFlush(device);
#endif
return buffer->host_ptr;
}
@@ -337,7 +355,6 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
vx_device_t *device = ((vx_device_t*)hdevice);
uint64_t data = 0;
struct timespec sleep_time;
#if defined(USE_ASE)
@@ -352,6 +369,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
long long sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);
for (;;) {
uint64_t data;
CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &data));
if (0 == data || 0 == timeout) {
if (data != 0) {
@@ -375,12 +393,12 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si
vx_device_t *device = ((vx_device_t*)buffer->hdevice);
size_t dev_mem_size = LOCAL_MEM_SIZE;
size_t asize = align_size(size, CACHE_LINESIZE);
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
// check alignment
if (!is_aligned(dev_maddr, CACHE_LINESIZE))
if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE))
return -1;
if (!is_aligned(buffer->io_addr + src_offset, CACHE_LINESIZE))
if (!is_aligned(buffer->io_addr + src_offset, CACHE_BLOCK_SIZE))
return -1;
// bound checking
@@ -393,7 +411,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si
if (vx_ready_wait(buffer->hdevice, -1) != 0)
return -1;
auto ls_shift = (int)std::log2(CACHE_LINESIZE);
auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_IO_ADDR, (buffer->io_addr + src_offset) >> ls_shift));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_MEM_ADDR, dev_maddr >> ls_shift));
@@ -416,12 +434,12 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size,
vx_device_t *device = ((vx_device_t*)buffer->hdevice);
size_t dev_mem_size = LOCAL_MEM_SIZE;
size_t asize = align_size(size, CACHE_LINESIZE);
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
// check alignment
if (!is_aligned(dev_maddr, CACHE_LINESIZE))
if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE))
return -1;
if (!is_aligned(buffer->io_addr + dest_offset, CACHE_LINESIZE))
if (!is_aligned(buffer->io_addr + dest_offset, CACHE_BLOCK_SIZE))
return -1;
// bound checking
@@ -434,7 +452,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size,
if (vx_ready_wait(buffer->hdevice, -1) != 0)
return -1;
auto ls_shift = (int)std::log2(CACHE_LINESIZE);
auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_IO_ADDR, (buffer->io_addr + dest_offset) >> ls_shift));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_MEM_ADDR, dev_maddr >> ls_shift));
@@ -455,17 +473,17 @@ extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) {
vx_device_t* device = ((vx_device_t*)hdevice);
size_t asize = align_size(size, CACHE_LINESIZE);
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
// check alignment
if (!is_aligned(dev_maddr, CACHE_LINESIZE))
if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE))
return -1;
// Ensure ready for new command
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
auto ls_shift = (int)std::log2(CACHE_LINESIZE);
auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_MEM_ADDR, dev_maddr >> ls_shift));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_DATA_SIZE, asize >> ls_shift));
@@ -491,11 +509,11 @@ extern int vx_start(vx_device_h hdevice) {
// start execution
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN));
#ifdef SCOPE
/*#ifdef SCOPE
sleep(15);
vx_scope_stop(device->fpga, 0);
exit(0);
#endif
#endif*/
return 0;
}
@@ -529,9 +547,8 @@ extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* valu
// Ensure ready for new command
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
return -1;
// write CSR value
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core_id));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, addr));

30
driver/opae/vortex_afu.h Normal file
View File

@@ -0,0 +1,30 @@
//
// Generated by afu_json_mgr from ../../hw/opae/vortex_afu.json
//
#ifndef __AFU_JSON_INFO__
#define __AFU_JSON_INFO__
#define AFU_ACCEL_NAME "vortex_afu"
#define AFU_ACCEL_UUID "35F9452B-25C2-434C-93D5-6F8C60DB361C"
#define AFU_IMAGE_CMD_CLFLUSH 4
#define AFU_IMAGE_CMD_CSR_READ 5
#define AFU_IMAGE_CMD_CSR_WRITE 6
#define AFU_IMAGE_CMD_MEM_READ 1
#define AFU_IMAGE_CMD_MEM_WRITE 2
#define AFU_IMAGE_CMD_RUN 3
#define AFU_IMAGE_MMIO_CMD_TYPE 10
#define AFU_IMAGE_MMIO_CSR_ADDR 26
#define AFU_IMAGE_MMIO_CSR_CORE 24
#define AFU_IMAGE_MMIO_CSR_DATA 28
#define AFU_IMAGE_MMIO_CSR_READ 30
#define AFU_IMAGE_MMIO_DATA_SIZE 16
#define AFU_IMAGE_MMIO_IO_ADDR 12
#define AFU_IMAGE_MMIO_MEM_ADDR 14
#define AFU_IMAGE_MMIO_SCOPE_READ 20
#define AFU_IMAGE_MMIO_SCOPE_WRITE 22
#define AFU_IMAGE_MMIO_STATUS 18
#define AFU_IMAGE_POWER 0
#define AFU_TOP_IFC "ccip_std_afu_avalon_mm"
#endif // __AFU_JSON_INFO__

202
driver/opae/vx_scope.cpp Normal file
View File

@@ -0,0 +1,202 @@
#include <iostream>
#include <fstream>
#include <thread>
#include <chrono>
#include <vector>
#include <assert.h>
#ifdef USE_VLSIM
#include "vlsim/fpga.h"
#else
#include <opae/fpga.h>
#endif
#include <VX_config.h>
#include "vx_scope.h"
#include "vortex_afu.h"
#include "scope-defs.h"
#define SCOPE_FRAME_WIDTH 1768
#define CHECK_RES(_expr) \
do { \
fpga_result res = _expr; \
if (res == FPGA_OK) \
break; \
printf("OPAE Error: '%s' returned %d, %s!\n", \
#_expr, (int)res, fpgaErrStr(res)); \
return -1; \
} while (false)
#define MMIO_SCOPE_READ (AFU_IMAGE_MMIO_SCOPE_READ * 4)
#define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4)
#define CMD_GET_VALID 0
#define CMD_GET_DATA 1
#define CMD_GET_WIDTH 2
#define CMD_GET_COUNT 3
#define CMD_SET_DELAY 4
#define CMD_SET_STOP 5
#define CMD_GET_OFFSET 6
static constexpr int num_signals = sizeof(scope_signals) / sizeof(scope_signal_t);
constexpr int calcFrameWidth(int index = 0) {
return (index < num_signals) ? (scope_signals[index].width + calcFrameWidth(index + 1)) : 0;
}
static constexpr int fwidth = calcFrameWidth();
uint64_t print_clock(std::ofstream& ofs, uint64_t delta, uint64_t timestamp) {
while (delta != 0) {
ofs << '#' << timestamp++ << std::endl;
ofs << "b0 0" << std::endl;
ofs << '#' << timestamp++ << std::endl;
ofs << "b1 0" << std::endl;
--delta;
}
return timestamp;
}
int vx_scope_start(fpga_handle hfpga, uint64_t delay) {
if (nullptr == hfpga)
return -1;
if (delay != uint64_t(-1)) {
// set start delay
uint64_t cmd_delay = ((delay << 3) | CMD_SET_DELAY);
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_delay));
std::cout << "scope start delay: " << delay << std::endl;
}
return 0;
}
int vx_scope_stop(fpga_handle hfpga, uint64_t delay) {
if (nullptr == hfpga)
return -1;
if (delay != uint64_t(-1)) {
// stop recording
uint64_t cmd_stop = ((delay << 3) | CMD_SET_STOP);
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_stop));
std::cout << "scope stop delay: " << delay << std::endl;
}
std::ofstream ofs("vx_scope.vcd");
ofs << "$version Generated by Vortex Scope $end" << std::endl;
ofs << "$timescale 1 ns $end" << std::endl;
ofs << "$scope module TOP $end" << std::endl;
ofs << "$var reg 1 0 clk $end" << std::endl;
for (int i = 0; i < num_signals; ++i) {
ofs << "$var reg " << scope_signals[i].width << " " << (i+1) << " " << scope_signals[i].name << " $end" << std::endl;
}
ofs << "$upscope $end" << std::endl;
ofs << "enddefinitions $end" << std::endl;
uint64_t frame_width, max_frames, data_valid, offset, delta;
uint64_t timestamp = 0;
uint64_t frame_offset = 0;
uint64_t frame_no = 0;
int signal_id = 0;
int signal_offset = 0;
// wait for recording to terminate
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_VALID));
do {
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid));
if (data_valid)
break;
std::this_thread::sleep_for(std::chrono::seconds(1));
} while (true);
std::cout << "scope trace dump begin..." << std::endl;
// get frame width
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_WIDTH));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &frame_width));
std::cout << "scope::frame_width=" << std::dec << frame_width << std::endl;
if (fwidth != (int)frame_width) {
std::cerr << "invalid frame_width: expecting " << std::dec << fwidth << "!" << std::endl;
std::abort();
}
// get max frames
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_COUNT));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &max_frames));
std::cout << "scope::max_frames=" << std::dec << max_frames << std::endl;
// get offset
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_OFFSET));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &offset));
// get data
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_DATA));
// print clock header
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &delta));
timestamp = print_clock(ofs, offset + delta + 2, timestamp);
signal_id = num_signals;
std::vector<char> signal_data(frame_width+1);
do {
if (frame_no == (max_frames-1)) {
// verify last frame is valid
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_VALID));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid));
assert(data_valid == 1);
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_DATA));
}
// read next data words
uint64_t word;
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &word));
do {
int signal_width = scope_signals[signal_id-1].width;
int word_offset = frame_offset % 64;
signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
++signal_offset;
++frame_offset;
if (signal_offset == signal_width) {
signal_data[signal_width] = 0; // string null termination
ofs << 'b' << signal_data.data() << ' ' << signal_id << std::endl;
signal_offset = 0;
--signal_id;
}
if (frame_offset == frame_width) {
assert(0 == signal_offset);
frame_offset = 0;
++frame_no;
if (frame_no != max_frames) {
// print clock header
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &delta));
timestamp = print_clock(ofs, delta + 1, timestamp);
signal_id = num_signals;
//std::cout << "*** " << frame_no << " frames, timestamp=" << timestamp << std::endl;
}
}
} while ((frame_offset % 64) != 0);
} while (frame_no != max_frames);
std::cout << "scope trace dump done! - " << (timestamp/2) << " cycles" << std::endl;
// verify data not valid
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_VALID));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid));
assert(data_valid == 0);
return 0;
}

View File

@@ -1,7 +1,5 @@
#pragma once
#include <opae/fpga.h>
int vx_scope_start(fpga_handle hfpga, uint64_t delay = -1);
int vx_scope_stop(fpga_handle hfpga, uint64_t delay = -1);

View File

@@ -12,17 +12,18 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSRQ
DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_CORE_REQ_INFO
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CORE_REQ_INFO
#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=1
CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=1
#DEBUG=1
#AFU=1
CFLAGS += -fPIC
@@ -35,11 +36,13 @@ LDFLAGS += -shared -pthread
TOP = Vortex
SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
SRCS += ../../hw/rtl/fp_cores/svdpi/float_dpi.cpp
RTL_DIR = ../../hw/rtl
FPU_INCLUDE = -I../../hw/rtl/fp_cores -I../../hw/rtl/fp_cores/svdpi -I../../hw/rtl/fp_cores/fpnew/src/common_cells/include -I../../hw/rtl/fp_cores/fpnew/src/common_cells/src -I../../hw/rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../hw/rtl/fp_cores/fpnew/src
RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache $(FPU_INCLUDE)
SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
VL_FLAGS += -Wno-DECLFILENAME
@@ -59,16 +62,6 @@ else
CFLAGS += -DNDEBUG
endif
# AFU
ifdef AFU
TOP = vortex_afu_sim
VL_FLAGS += -DNOPAE
CFLAGS += -DNOPAE
VL_FLAGS += -DSCOPE
CFLAGS += -DSCOPE
RTL_INCLUDE += -I../../hw/opae -I../../hw/opae/ccip
endif
PROJECT = libvortex.so
# PROJECT = libvortex.dylib
@@ -76,7 +69,7 @@ all: $(PROJECT)
$(PROJECT): $(SRCS)
verilator --exe --cc $(TOP) --top-module $(TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
make -j -C obj_dir -f V$(TOP).mk
OPT_FAST="-O0 -g" make -j -C obj_dir -f V$(TOP).mk
clean:
rm -rf $(PROJECT) obj_dir

View File

@@ -64,8 +64,7 @@ private:
class vx_device {
public:
vx_device() {
mem_allocation_ = ALLOC_BASE_ADDR;
simulator_.attach_ram(&ram_);
mem_allocation_ = ALLOC_BASE_ADDR;
}
~vx_device() {
@@ -117,6 +116,7 @@ public:
if (future_.valid()) {
future_.wait(); // ensure prior run completed
}
simulator_.attach_ram(&ram_);
future_ = std::async(std::launch::async, [&]{
simulator_.reset();
while (simulator_.is_busy()) {
@@ -144,10 +144,12 @@ public:
if (future_.valid()) {
future_.wait(); // ensure prior run completed
}
simulator_.attach_ram(&ram_);
simulator_.flush_caches(dev_maddr, size);
while (simulator_.is_busy()) {
while (simulator_.snp_req_active()) {
simulator_.step();
};
simulator_.attach_ram(NULL);
return 0;
}
@@ -156,7 +158,7 @@ public:
future_.wait(); // ensure prior run completed
}
simulator_.set_csr(core_id, addr, value);
while (simulator_.is_busy()) {
while (simulator_.csr_req_active()) {
simulator_.step();
};
return 0;
@@ -167,7 +169,7 @@ public:
future_.wait(); // ensure prior run completed
}
simulator_.get_csr(core_id, addr, value);
while (simulator_.is_busy()) {
while (simulator_.csr_req_active()) {
simulator_.step();
};
return 0;
@@ -304,7 +306,7 @@ extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hb
return 0;
}
extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
extern void* vx_host_ptr(vx_buffer_h hbuffer) {
if (nullptr == hbuffer)
return nullptr;

View File

@@ -294,7 +294,7 @@ extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hb
return 0;
}
extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
extern void* vx_host_ptr(vx_buffer_h hbuffer) {
if (nullptr == hbuffer)
return nullptr;

View File

@@ -24,7 +24,7 @@ extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, size_t /*size*/, vx_buff
return -1;
}
extern volatile void* vx_host_ptr(vx_buffer_h /*hbuffer*/) {
extern void* vx_host_ptr(vx_buffer_h /*hbuffer*/) {
return nullptr;
}

View File

@@ -1,4 +1,4 @@
RISCV_TOOLCHAIN_PATH ?= $(wildcard /opt/riscv-gnu-toolchain/drops)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
OPTS ?= -n256
@@ -8,7 +8,7 @@ VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32im -mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include
VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a
@@ -45,6 +45,9 @@ run-fpga: $(PROJECT)
run-ase: $(PROJECT)
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT)
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT)
LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

View File

@@ -68,18 +68,27 @@ uint64_t shuffle(int i, uint64_t value) {
int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
int errors = 0;
int num_blocks_8 = (64 * num_blocks) / 8;
// update source buffer
for (int i = 0; i < (64 * num_blocks) / 8; ++i) {
for (int i = 0; i < num_blocks_8; ++i) {
((uint64_t*)vx_host_ptr(buffer))[i] = shuffle(i, value);
}
/*for (int i = 0; i < num_blocks; ++i) {
std::cout << "data[" << i << "]=0x";
for (int j = 7; j >= 0; --j) {
std::cout << std::hex << ((uint64_t*)vx_host_ptr(buffer))[i * 8 +j];
}
std::cout << std::endl;
}*/
// write buffer to local memory
std::cout << "write buffer to local memory" << std::endl;
RT_CHECK(vx_copy_to_dev(buffer, dev_addr, 64 * num_blocks, 0));
// clear destination buffer
for (int i = 0; i < (64 * num_blocks) / 8; ++i) {
for (int i = 0; i < num_blocks_8; ++i) {
((uint64_t*)vx_host_ptr(buffer))[i] = 0;
}
@@ -89,7 +98,7 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
// verify result
std::cout << "verify result" << std::endl;
for (int i = 0; i < (64 * num_blocks) / 8; ++i) {
for (int i = 0; i < num_blocks_8; ++i) {
auto curr = ((uint64_t*)vx_host_ptr(buffer))[i];
auto ref = shuffle(i, value);
if (curr != ref) {
@@ -127,7 +136,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
{
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xffffffff;
buf_ptr[i] = 0xdeadbeef;
}
}
std::cout << "clear destination buffer" << std::endl;
@@ -214,7 +223,8 @@ int main(int argc, char *argv[]) {
if (0 == test || -1 == test) {
std::cout << "run memcopy test" << std::endl;
RT_CHECK(run_memcopy_test(kernel_arg.src_ptr, 0x0badf00d00ff00ff, 1));
RT_CHECK(run_memcopy_test(kernel_arg.src_ptr, 0x0badf00d40ff40ff, num_blocks));
if (num_blocks >= 4) RT_CHECK(run_memcopy_test(kernel_arg.src_ptr, 0x0badf00d00ff00ff, num_blocks/2));
if (num_blocks >= 2) RT_CHECK(run_memcopy_test(kernel_arg.src_ptr, 0x0badf00d40ff40ff, num_blocks));
}
if (1 == test || -1 == test) {

Binary file not shown.

View File

@@ -74,7 +74,7 @@ Disassembly of section .text:
800000e0: 0005006b 0x5006b
800000e4: 00002197 auipc gp,0x2
800000e8: c8418193 addi gp,gp,-892 # 80001d68 <__global_pointer$>
800000ec: f14025f3 csrr a1,mhartid
800000ec: 022025f3 csrr a1,0x22
800000f0: 00a59593 slli a1,a1,0xa
800000f4: 02002673 csrr a2,0x20
800000f8: 00261613 slli a2,a2,0x2
@@ -122,7 +122,7 @@ Disassembly of section .text:
80000158: 00008067 ret
8000015c <vx_thread_gid>:
8000015c: f1402573 csrr a0,mhartid
8000015c: 02202573 csrr a0,0x22
80000160: 00008067 ret
80000164 <vx_core_id>:
@@ -458,13 +458,12 @@ Disassembly of section .comment:
Disassembly of section .riscv.attributes:
00000000 <.riscv.attributes>:
0: 2041 jal 80 <_start-0x7fffff80>
0: 2541 jal 680 <_start-0x7ffff980>
2: 0000 unimp
4: 7200 flw fs0,32(a2)
6: 7369 lui t1,0xffffa
8: 01007663 bgeu zero,a6,14 <_start-0x7fffffec>
c: 0016 c.slli zero,0x5
e: 0000 unimp
c: 0000001b 0x1b
10: 1004 addi s1,sp,32
12: 7205 lui tp,0xfffe1
14: 3376 fld ft6,376(sp)
@@ -473,4 +472,4 @@ Disassembly of section .riscv.attributes:
1a: 5f30 lw a2,120(a4)
1c: 326d jal fffff9c6 <__global_pointer$+0x7fffdc5e>
1e: 3070 fld fa2,224(s0)
...
20: 665f 7032 0030 0x307032665f

Binary file not shown.

View File

@@ -1,4 +1,4 @@
RISCV_TOOLCHAIN_PATH ?= $(wildcard /opt/riscv-gnu-toolchain/drops)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
OPTS ?= -n64
@@ -8,7 +8,7 @@ VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32im -mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include
VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a
@@ -43,6 +43,9 @@ run-fpga: $(PROJECT)
run-ase: $(PROJECT)
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT)
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT)
LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

View File

@@ -184,7 +184,7 @@ int main(int argc, char *argv[]) {
{
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xffffffff;
buf_ptr[i] = 0xdeadbeef;
}
}
std::cout << "clear destination buffer" << std::endl;

View File

@@ -1,4 +1,4 @@
RISCV_TOOLCHAIN_PATH ?= $(wildcard /opt/riscv-gnu-toolchain/drops)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
OPTS ?= -n64
@@ -8,7 +8,7 @@ VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include
VX_LDFLAGS += $(VORTEX_RT_PATH)/libvortexrt.a
@@ -44,6 +44,9 @@ run-fpga: $(PROJECT)
run-ase: $(PROJECT)
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT)
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT)
LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

BIN
driver/tests/dogfood/kernel.bin Normal file → Executable file

Binary file not shown.

File diff suppressed because it is too large Load Diff

BIN
driver/tests/dogfood/kernel.elf Normal file → Executable file

Binary file not shown.