From c2721fd545a74e7e3b29f1a79c8e0bfd2c8fa8f3 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 13 Nov 2021 01:41:12 -0500 Subject: [PATCH 01/27] SimX timing simulation --- sim/common/simobject.h | 427 +++++++ sim/common/util.h | 76 +- sim/simX/Makefile | 2 +- sim/simX/archdef.h | 45 +- sim/simX/cache.cpp | 497 ++++++++ sim/simX/cache.h | 40 + sim/simX/constants.h | 21 + sim/simX/core.cpp | 392 ++++--- sim/simX/core.h | 96 +- sim/simX/decode.cpp | 12 +- sim/simX/decode.h | 2 +- sim/simX/execute.cpp | 2517 ++++++++++++++++++++++------------------ sim/simX/exeunit.cpp | 152 +++ sim/simX/exeunit.h | 103 ++ sim/simX/ibuffer.h | 39 + sim/simX/instr.h | 7 +- sim/simX/main.cpp | 76 +- sim/simX/memsim.cpp | 58 + sim/simX/memsim.h | 35 + sim/simX/pipeline.cpp | 63 - sim/simX/pipeline.h | 86 +- sim/simX/processor.h | 189 +++ sim/simX/scoreboard.h | 71 ++ sim/simX/types.h | 240 +++- sim/simX/warp.cpp | 69 +- sim/simX/warp.h | 14 +- 26 files changed, 3690 insertions(+), 1639 deletions(-) create mode 100644 sim/common/simobject.h create mode 100644 sim/simX/cache.cpp create mode 100644 sim/simX/cache.h create mode 100644 sim/simX/constants.h create mode 100644 sim/simX/exeunit.cpp create mode 100644 sim/simX/exeunit.h create mode 100644 sim/simX/ibuffer.h create mode 100644 sim/simX/memsim.cpp create mode 100644 sim/simX/memsim.h delete mode 100644 sim/simX/pipeline.cpp create mode 100644 sim/simX/processor.h create mode 100644 sim/simX/scoreboard.h diff --git a/sim/common/simobject.h b/sim/common/simobject.h new file mode 100644 index 00000000..68bccc87 --- /dev/null +++ b/sim/common/simobject.h @@ -0,0 +1,427 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace vortex { + +class SimObjectBase; + +class SimEventBase { +public: + typedef std::shared_ptr Ptr; + + virtual ~SimEventBase() {} + + virtual void fire() const = 0; + + bool step() { + return (0 == --delay_); + } + +protected: + SimEventBase(uint64_t delay) : delay_(delay) {} + 
+ uint64_t delay_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimSimpleEvent : public SimEventBase { +public: + typedef std::function Func; + + template + static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) { + return std::make_shared(func, pkt, delay); + } + + SimSimpleEvent(const Func& func, const Pkt& pkt, uint64_t delay) + : SimEventBase(delay) + , func_(func) + , pkt_(pkt) + {} + + void fire() const override { + func_(pkt_); + } + +protected: + Func func_; + Pkt pkt_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimPortEvent : public SimEventBase { +public: + typedef std::function Func; + + template + static Ptr Create(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) { + return std::make_shared(func, pkt, port_id, delay); + } + + SimPortEvent(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) + : SimEventBase(delay) + , func_(func) + , pkt_(pkt) + , port_id_(port_id) + {} + + void fire() const override { + func_(pkt_, port_id_); + } + +private: + Func func_; + Pkt pkt_; + uint32_t port_id_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +class SimPortBase { +public: + typedef std::shared_ptr Ptr; + + virtual ~SimPortBase() {} + + SimObjectBase* module() const { + return module_; + } + + uint32_t port_id() const { + return port_id_; + } + + SimPortBase* peer() const { + return peer_; + } + + bool connected() const { + return (peer_ != nullptr); + } + + bool is_slave() const { + return is_slave_; + } + +protected: + + SimPortBase(SimObjectBase* module, bool is_slave); + + void connect(SimPortBase* peer) { + assert(peer_ == nullptr); + peer_ = peer; + } + + void disconnect() { + assert(peer_ == nullptr); + peer_ = nullptr; + } + + SimObjectBase* module_; + uint32_t port_id_; + bool is_slave_; + SimPortBase* peer_; + + template friend class 
MasterPort; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SlavePort : public SimPortBase { +public: + typedef std::shared_ptr> Ptr; + typedef std::function Func; + + static Ptr Create(SimObjectBase* module, const Func& func) { + return std::make_shared>(module, func); + } + + template + static Ptr Create(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) { + return std::make_shared>(module, obj, entry); + } + + SlavePort(SimObjectBase* module, const Func& func) + : SimPortBase(module, true) + , func_(func) + {} + + template + SlavePort(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) + : SimPortBase(module, true) + , func_(std::bind(entry, obj, std::placeholders::_1, std::placeholders::_2)) + {} + + SlavePort(SimObjectBase* module, SlavePort* peer) + : SimPortBase(module, false) + { + this->connect(peer); + } + + void send(const Pkt& pkt, uint64_t delay) const; + + const Func& func() const { + return func_; + } + +protected: + SlavePort& operator=(const SlavePort&); + Func func_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class MasterPort : public SimPortBase { +public: + typedef std::shared_ptr> Ptr; + typedef std::function Func; + + static Ptr Create() { + return std::make_shared>(module); + } + + MasterPort(SimObjectBase* module) : SimPortBase(module, false) {} + + MasterPort(SimObjectBase* module, MasterPort* peer) + : SimPortBase(module, false) + { + peer->connect(this); + } + + void bind(SlavePort* peer) { + this->connect(peer); + } + + void unbind() { + peer_->disconnect(); + this->disconnect(); + } + + void send(const Pkt& pkt, uint64_t delay) const { + assert(peer_ != nullptr); + if (peer_->is_slave()) { + auto slave = reinterpret_cast*>(peer_); + slave->send(pkt, delay); + } else { + auto master = reinterpret_cast*>(peer_); + master->send(pkt, delay); + } + } + +private: + MasterPort& 
operator=(const MasterPort&); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class SimContext; + +class SimObjectBase { +public: + typedef std::shared_ptr Ptr; + + virtual ~SimObjectBase() {} + + template + void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay); + + virtual void step(uint64_t cycle) = 0; + + const std::string& name() const { + return name_; + } + +protected: + + SimObjectBase(const SimContext& ctx, const char* name); + + uint32_t allocate_port(SimPortBase* port) { + uint32_t id = ports_.size(); + ports_.push_back(port); + return id; + } + +private: + std::string name_; + std::vector ports_; + + friend class SimPlatform; + friend class SimPortBase; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimObject : public SimObjectBase { +public: + typedef std::shared_ptr Ptr; + + template + static Ptr Create(Args&&... args); + +protected: + + SimObject(const SimContext& ctx, const char* name) : SimObjectBase(ctx, name) {} + + void step(uint64_t cycle) override { + this->impl().step(cycle); + } + +private: + + const Impl& impl() const { + return static_cast(*this); + } + + Impl& impl() { + return static_cast(*this); + } +}; + +class SimContext { +private: + SimContext() {} + template template + friend typename SimObject::Ptr SimObject::Create(Args&&... 
args); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class SimPlatform { +public: + static SimPlatform& instance() { + static SimPlatform s_inst; + return s_inst; + } + + bool initialize() { + //-- + return true; + } + + void finalize() { + instance().clear(); + } + + void register_object(const SimObjectBase::Ptr& obj) { + objects_.push_back(obj); + } + + template + void schedule(const typename SimSimpleEvent::Func& callback, + const Pkt& pkt, + uint64_t delay) { + auto evt = SimSimpleEvent::Create(callback, pkt, delay); + assert(delay != 0); + events_.emplace_back(evt); + } + + template + void schedule(const typename SimPortEvent::Func& callback, + const Pkt& pkt, + uint32_t port_id, + uint64_t delay) { + auto evt = SimPortEvent::Create(callback, pkt, port_id, delay); + assert(delay != 0); + events_.emplace_back(evt); + } + + void step() { + // evaluate events + auto evt_it = events_.begin(); + auto evt_it_end = events_.end(); + while (evt_it != evt_it_end) { + auto& event = *evt_it; + if (event->step()) { + event->fire(); + evt_it = events_.erase(evt_it); + } else { + ++evt_it; + } + } + // evaluate components + for (auto& object : objects_) { + object->step(cycles_); + } + // advance clock + ++cycles_; + } + + uint64_t cycles() const { + return cycles_; + } + +private: + + SimPlatform() : cycles_(0) {} + + virtual ~SimPlatform() { + this->clear(); + } + + void clear() { + objects_.clear(); + events_.clear(); + } + + std::vector objects_; + std::list events_; + uint64_t cycles_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +inline SimPortBase::SimPortBase(SimObjectBase* module, bool is_slave) + : module_(module) + , port_id_(module->allocate_port(this)) + , is_slave_(is_slave) + , peer_(nullptr) +{} + +inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) + : name_(name) +{} + +template +template +typename SimObject::Ptr SimObject::Create(Args&&... 
args) { + auto obj = std::make_shared(SimContext{}, std::forward(args)...); + SimPlatform::instance().register_object(obj); + return obj; +} + +template +void SlavePort::send(const Pkt& pkt, uint64_t delay) const { + if (func_) { + SimPlatform::instance().schedule(func_, pkt, port_id_, delay); + } else { + assert(peer_ != nullptr); + if (peer_->is_slave()) { + auto slave = reinterpret_cast*>(peer_); + slave->send(pkt, delay); + } else { + auto master = reinterpret_cast*>(peer_); + master->send(pkt, delay); + } + } +} + +template +void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) { + auto callback = std::bind(entry, obj, std::placeholders::_1); + SimPlatform::instance().schedule(callback, pkt, delay); +} + +} \ No newline at end of file diff --git a/sim/common/util.h b/sim/common/util.h index dbaeb5fa..668f3e26 100644 --- a/sim/common/util.h +++ b/sim/common/util.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include template @@ -8,24 +9,83 @@ void unused(Args&&...) {} #define __unused(...) unused(__VA_ARGS__) -constexpr bool ispow2(uint64_t value) { +constexpr uint32_t count_leading_zeros(uint32_t value) { + return value ? __builtin_clz(value) : 32; +} + +constexpr uint32_t count_trailing_zeros(uint32_t value) { + return value ? 
__builtin_ctz(value) : 32; +} + +constexpr bool ispow2(uint32_t value) { return value && !(value & (value - 1)); } -constexpr unsigned log2ceil(uint32_t value) { - return 32 - __builtin_clz(value - 1); +constexpr uint32_t log2ceil(uint32_t value) { + return 32 - count_leading_zeros(value - 1); } -inline uint64_t align_size(uint64_t size, uint64_t alignment) { +inline unsigned log2up(uint32_t value) { + return std::max(1, log2ceil(value)); +} + +constexpr unsigned log2floor(uint32_t value) { + return 31 - count_leading_zeros(value); +} + +constexpr unsigned ceil2(uint32_t value) { + return 32 - count_leading_zeros(value); +} + +inline uint64_t bit_clr(uint64_t bits, uint32_t index) { + assert(index <= 63); + return bits & ~(1ull << index); +} + +inline uint64_t bit_set(uint64_t bits, uint32_t index) { + assert(index <= 63); + return bits | (1ull << index); +} + +inline bool bit_get(uint64_t bits, uint32_t index) { + assert(index <= 63); + return (bits >> index) & 0x1; +} + +inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + uint64_t mask = (0xffffffffffffffff << (shift + start)) >> shift; + return bits & ~mask; +} + +inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + uint64_t dirty = (value << (shift + start)) >> shift; + return bit_clrw(bits, start, end) | dirty; +} + +inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + return (bits << shift) >> (shift + start); +} + +inline uint64_t aligned_size(uint64_t size, uint64_t alignment) { assert(0 == (alignment & (alignment - 1))); return (size + alignment - 1) & ~(alignment - 1); } // Apply integer sign extension -inline uint32_t signExt(uint32_t w, uint32_t bit, uint32_t mask) { - if (w >> (bit - 1)) - w |= ~mask; -
return w; +inline uint32_t sext32(uint32_t word, uint32_t width) { + assert(width > 1); + assert(width <= 32); + uint32_t mask = (uint32_t)((1ull << width) - 1); // 64-bit shift: width 31/32 must not overflow int + return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : word; } // return file extension diff --git a/sim/simX/Makefile b/sim/simX/Makefile index 29b53fc3..e42464c6 100644 --- a/sim/simX/Makefile +++ b/sim/simX/Makefile @@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a TOP = vx_cache_sim SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp -SRCS += args.cpp pipeline.cpp warp.cpp core.cpp decode.cpp execute.cpp main.cpp +SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp main.cpp OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS))) VPATH := $(sort $(dir $(SRCS))) diff --git a/sim/simX/archdef.h b/sim/simX/archdef.h index 75248c1a..c6728831 100644 --- a/sim/simX/archdef.h +++ b/sim/simX/archdef.h @@ -9,21 +9,31 @@ namespace vortex { -class ArchDef { +class ArchDef { +private: + int num_cores_; + int num_warps_; + int num_threads_; + int wsize_; + int vsize_; + int num_regs_; + int num_csrs_; + int num_barriers_; + public: - ArchDef(const std::string &/*arch*/, + ArchDef(const std::string& /*arch*/, int num_cores, int num_warps, - int num_threads) { - wsize_ = 4; - vsize_ = 16; - num_regs_ = 32; - num_csrs_ = 4096; - num_barriers_= NUM_BARRIERS; - num_cores_ = num_cores; - num_warps_ = num_warps; - num_threads_ = num_threads; - } + int num_threads) + : num_cores_(num_cores) + , num_warps_(num_warps) + , num_threads_(num_threads) + , wsize_(4) + , vsize_(16) + , num_regs_(32) + , num_csrs_(4096) + , num_barriers_(NUM_BARRIERS) + {} int wsize() const { return wsize_; @@ -56,17 +66,6 @@ public: int num_cores() const { return num_cores_; } - -private: - - int wsize_; - int vsize_; - int num_regs_; - int num_csrs_; - int num_barriers_; - int num_threads_; - int num_warps_; - int num_cores_; }; } \ No newline at end of file diff --git
a/sim/simX/cache.cpp b/sim/simX/cache.cpp new file mode 100644 index 00000000..f139cb43 --- /dev/null +++ b/sim/simX/cache.cpp @@ -0,0 +1,497 @@ +#include "cache.h" +#include "debug.h" +#include +#include +#include +#include +#include + +using namespace vortex; + +struct params_t { + uint32_t sets_per_bank; + uint32_t blocks_per_set; + uint32_t words_per_block; + + uint32_t word_select_addr_start; + uint32_t word_select_addr_end; + + uint32_t bank_select_addr_start; + uint32_t bank_select_addr_end; + + uint32_t set_select_addr_start; + uint32_t set_select_addr_end; + + uint32_t tag_select_addr_start; + uint32_t tag_select_addr_end; + + params_t(const CacheConfig& config) { + uint32_t bank_bits = log2ceil(config.num_banks); + uint32_t offset_bits = config.B - config.W; + uint32_t log2_bank_size = config.C - bank_bits; + uint32_t index_bits = log2_bank_size - (config.B + config.A); // B and A are log2 values: sets = bank_size / (block_size * ways) + assert(log2_bank_size >= config.B); + + + this->words_per_block = 1 << offset_bits; + this->blocks_per_set = 1 << config.A; + this->sets_per_bank = 1 << index_bits; + + assert(config.ports_per_bank <= this->words_per_block); + + // Word select + this->word_select_addr_start = config.W; + this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1); + + // Bank select + this->bank_select_addr_start = (1+this->word_select_addr_end); + this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1); + + // Set select + this->set_select_addr_start = (1+this->bank_select_addr_end); + this->set_select_addr_end = (this->set_select_addr_start+index_bits-1); + + // Tag select + this->tag_select_addr_start = (1+this->set_select_addr_end); + this->tag_select_addr_end = (config.addr_width-1); + } + + uint32_t addr_bank_id(uint64_t word_addr) const { + if (bank_select_addr_end >= bank_select_addr_start) + return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end); + else + return 0; + } + + uint32_t addr_set_id(uint64_t word_addr) const { + if
(set_select_addr_end >= set_select_addr_start) + return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end); + else + return 0; + } + + uint64_t addr_tag(uint64_t word_addr) const { + if (tag_select_addr_end >= tag_select_addr_start) + return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end); + else + return 0; + } + + uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const { + uint64_t addr(0); + if (bank_select_addr_end >= bank_select_addr_start) + addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id); + if (set_select_addr_end >= set_select_addr_start) + addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id); + if (tag_select_addr_end >= tag_select_addr_start) + addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag); + return addr; + } +}; + +struct block_t { + bool valid; + bool dirty; + uint64_t tag; + uint32_t lru_ctr; +}; + +struct set_t { + std::vector blocks; + set_t(uint32_t size) : blocks(size) {} +}; + +struct bank_req_info_t { + bool valid; + uint32_t req_id; + uint32_t req_tag; +}; + +struct bank_req_t { + bool valid; + bool write; + bool mshr_replay; + uint64_t tag; + uint32_t set_id; + std::vector infos; + + bank_req_t(uint32_t size) + : valid(false) + , write(false) + , mshr_replay(false) + , tag(0) + , set_id(0) + , infos(size) + {} +}; + +struct mshr_entry_t : public bank_req_t { + uint32_t block_id; + + mshr_entry_t(uint32_t size = 0) + : bank_req_t(size) + , block_id(0) + {} +}; + +class MSHR { +private: + std::vector entries_; + uint32_t capacity_; + +public: + MSHR(uint32_t size) + : entries_(size) + , capacity_(0) + {} + + bool empty() const { + return (0 == capacity_); + } + + bool full() const { + return (capacity_ == entries_.size()); + } + + int lookup(const bank_req_t& bank_req) { + for (uint32_t i = 0, n = entries_.size(); i < n; ++i) { + auto& entry = entries_.at(i); + if (entry.valid + && entry.set_id == bank_req.set_id + && 
entry.tag == bank_req.tag) { + return i; + } + } + return -1; + } + + int allocate(const bank_req_t& bank_req, uint32_t block_id) { + for (uint32_t i = 0, n = entries_.size(); i < n; ++i) { + auto& entry = entries_.at(i); + if (!entry.valid) { + *(bank_req_t*)&entry = bank_req; + entry.valid = true; + entry.mshr_replay = false; + entry.block_id = block_id; + ++capacity_; + return i; + } + } + return -1; + } + + mshr_entry_t& replay(uint32_t id) { + auto& root_entry = entries_.at(id); + assert(root_entry.valid); + // make all related mshr entries for replay + for (auto& entry : entries_) { + if (entry.valid + && entry.set_id == root_entry.set_id + && entry.tag == root_entry.tag) { + entry.mshr_replay = true; + } + } + return root_entry; + } + + bool try_pop(bank_req_t* out) { + for (auto& entry : entries_) { + if (entry.valid && entry.mshr_replay) { + *out = entry; + entry.valid = false; + --capacity_; + return true; + } + } + return false; + } +}; + +struct bank_t { + std::vector sets; + MSHR mshr; + std::queue stall_buffer; + bank_req_t active_req; + + bank_t(const CacheConfig& config, + const params_t& params) + : sets(params.sets_per_bank, params.blocks_per_set) + , mshr(config.mshr_size) + , active_req(config.ports_per_bank) + {} +}; + +/////////////////////////////////////////////////////////////////////////////// + +class Cache::Impl { +private: + Cache* const simobject_; + CacheConfig config_; + params_t params_; + std::vector banks_; + std::vector> core_reqs_; + std::pair mem_rsp_; + std::vector> core_rsps_; + +public: + Impl(Cache* simobject, const CacheConfig& config) + : simobject_(simobject) + , config_(config) + , params_(config) + , banks_(config.num_banks, {config, params_}) + , core_reqs_(config.num_inputs) + , core_rsps_(config.num_inputs) + {} + + void handleMemResponse(const MemRsp& response, uint32_t) { + mem_rsp_ = {true, response}; + } + + void handleCoreRequest(const MemReq& request, uint32_t port_id) { + core_reqs_.at(port_id) = {true, 
request}; + } + + void step(uint64_t /*cycle*/) { + // process core response + for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) { + auto& core_rsp = core_rsps_.at(req_id); + if (!core_rsp.empty()) { + simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_rsp.front()}, config_.latency); + core_rsp.pop(); + } + } + + for (auto& bank : banks_) { + auto& active_req = bank.active_req; + + // try chedule mshr replay + if (!active_req.valid) { + bank.mshr.try_pop(&active_req); + } + + // try schedule stall replay + if (!active_req.valid + && !bank.stall_buffer.empty()) { + active_req = bank.stall_buffer.front(); + bank.stall_buffer.pop(); + } + } + + // handle memory fills + if (mem_rsp_.first) { + mem_rsp_.first = false; + auto bank_id = bit_getw(mem_rsp_.second.tag, 0, 15); + auto mshr_id = bit_getw(mem_rsp_.second.tag, 16, 31); + this->processMemoryFill(bank_id, mshr_id); + } + + // handle incoming core requests + for (uint32_t i = 0, n = core_reqs_.size(); i < n; ++i) { + auto& entry = core_reqs_.at(i); + if (!entry.first) + continue; + + entry.first = false; + + auto& core_req = entry.second; + auto bank_id = params_.addr_bank_id(core_req.addr); + auto set_id = params_.addr_set_id(core_req.addr); + auto tag = params_.addr_tag(core_req.addr); + auto port_id = i % config_.ports_per_bank; + + // create abnk request + bank_req_t bank_req(config_.ports_per_bank); + bank_req.valid = true; + bank_req.write = core_req.write; + bank_req.mshr_replay = false; + bank_req.tag = tag; + bank_req.set_id = set_id; + bank_req.infos.at(port_id) = {true, i, core_req.tag}; + + auto& bank = banks_.at(bank_id); + + // check MSHR capacity + if (bank.mshr.full()) { + // add to stall buffer + bank.stall_buffer.emplace(bank_req); + continue; + } + + auto& active_req = bank.active_req; + + // check pending MSHR request + if (active_req.valid + && active_req.mshr_replay) { + // add to stall buffer + bank.stall_buffer.emplace(bank_req); + continue; + } + + // check bank 
conflicts + if (active_req.valid) { + // check port conflict + if (active_req.write != core_req.write + || active_req.set_id != set_id + || active_req.tag != tag + || active_req.infos[port_id].valid) { + // add to stall buffer + bank.stall_buffer.emplace(bank_req); + continue; + } + // update pending request infos + active_req.infos[port_id] = bank_req.infos[port_id]; + } else { + // schedule new request + active_req = bank_req; + } + } + + // process active request + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + this->processBankRequest(bank_id); + } + } + + void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) { + // update block + auto& bank = banks_.at(bank_id); + auto& root_entry = bank.mshr.replay(mshr_id); + auto& set = bank.sets.at(root_entry.set_id); + auto& block = set.blocks.at(root_entry.block_id); + block.valid = true; + block.tag = root_entry.tag; + } + + void processBankRequest(uint32_t bank_id) { + auto& bank = banks_.at(bank_id); + auto& active_req = bank.active_req; + if (!active_req.valid) + return; + + active_req.valid = false; + + auto& set = bank.sets.at(active_req.set_id); + + if (active_req.mshr_replay) { + // send core response (only for ports that actually carried a request) + for (auto& info : active_req.infos) { + if (info.valid) core_rsps_.at(info.req_id).emplace(info.req_tag); + } + } else { + bool hit = false; + bool found_free_block = false; + int hit_block_id = 0; + int repl_block_id = 0; + uint32_t max_cnt = 0; + + for (int i = 0, n = set.blocks.size(); i < n; ++i) { + auto& block = set.blocks.at(i); + if (block.valid) { + if (block.tag == active_req.tag) { + block.lru_ctr = 0; + hit_block_id = i; + hit = true; + } else { + ++block.lru_ctr; + } + if (!found_free_block && max_cnt < block.lru_ctr) { // a free block always wins over evicting a valid one + max_cnt = block.lru_ctr; + repl_block_id = i; + } + } else { + found_free_block = true; + repl_block_id = i; + } + } + + if (hit) { + // + // HIT handling + // + if (active_req.write) { + // handle write hit + auto& hit_block = set.blocks.at(hit_block_id); + if
(config_.write_through) { + // forward write request to memory + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag); + mem_req.write = true; + mem_req.tag = 0; + simobject_->MemReqPort.send(mem_req, 1); + } else { + // mark block as dirty + hit_block.dirty = true; + } + } + // send core response (only for ports that actually carried a request) + for (auto& info : active_req.infos) { + if (info.valid) core_rsps_.at(info.req_id).emplace(info.req_tag); + } + } else { + // + // MISS handling + // + if (!found_free_block && !config_.write_through) { + // write back dirty block + auto& repl_block = set.blocks.at(repl_block_id); + if (repl_block.dirty) { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag); + mem_req.write = true; + simobject_->MemReqPort.send(mem_req, 1); + } + } + + if (active_req.write && config_.write_through) { + // forward write request to memory + { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); + mem_req.write = true; + mem_req.tag = 0; + simobject_->MemReqPort.send(mem_req, 1); + } + // send core response (only for ports that actually carried a request) + for (auto& info : active_req.infos) { + if (info.valid) core_rsps_.at(info.req_id).emplace(info.req_tag); + } + } else { + // lookup + int pending = bank.mshr.lookup(active_req); + + // allocate MSHR + int mshr_id = bank.mshr.allocate(active_req, repl_block_id); + + // send fill request + if (pending == -1) { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); + mem_req.write = active_req.write; + mem_req.tag = bit_setw(0, 0, 15, bank_id); + mem_req.tag = bit_setw(mem_req.tag, 16, 31, mshr_id); + simobject_->MemReqPort.send(mem_req, 1); + } + } + } + } + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config) + : SimObject(ctx, name) + , impl_(new Impl(this, config)) + , CoreReqPorts(config.num_inputs, {this, impl_,
&Cache::Impl::handleCoreRequest}) + , CoreRspPorts(config.num_inputs, this) + , MemReqPort(this) + , MemRspPort(this, impl_, &Impl::handleMemResponse) +{} + +Cache::~Cache() { + delete impl_; +} + +void Cache::step(uint64_t cycle) { + impl_->step(cycle); +} \ No newline at end of file diff --git a/sim/simX/cache.h b/sim/simX/cache.h new file mode 100644 index 00000000..1c0c82f6 --- /dev/null +++ b/sim/simX/cache.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include "memsim.h" + +namespace vortex { + +struct CacheConfig { + uint8_t C; // log2 cache size + uint8_t B; // log2 block size + uint8_t W; // log2 word size + uint8_t A; // log2 associativity + uint8_t addr_width; // word address bits + uint8_t num_banks; // number of banks + uint8_t ports_per_bank; // number of ports per bank + uint8_t num_inputs; // number of inputs + bool write_through; // is write-through cache + uint16_t victim_size; // victim cache size + uint16_t mshr_size; // MSHR buffer size + uint8_t latency; // pipeline latency +}; + +class Cache : public SimObject { +private: + class Impl; + Impl* impl_; + +public: + Cache(const SimContext& ctx, const char* name, const CacheConfig& config); + ~Cache(); + + void step(uint64_t cycle); + + std::vector> CoreReqPorts; + std::vector> CoreRspPorts; + MasterPort MemReqPort; + SlavePort MemRspPort; +}; + +} \ No newline at end of file diff --git a/sim/simX/constants.h b/sim/simX/constants.h new file mode 100644 index 00000000..d9171b8d --- /dev/null +++ b/sim/simX/constants.h @@ -0,0 +1,21 @@ +#pragma once + +#include "types.h" + +#ifndef MEM_LATENCY +#define MEM_LATENCY 18 +#endif + +namespace vortex { + +struct Constants { + +static constexpr uint32_t CORE_TO_DCACHE_DELAY = 1 + SM_ENABLE; +static constexpr uint32_t CORE_TO_ICACHE_DELAY = 1; + +static constexpr uint32_t ICACHE_TO_MEM_DELAY = 2; +static constexpr uint32_t DCACHE_TO_MEM_DELAY = 2; + +}; + +} \ No newline at end of file diff --git a/sim/simX/core.cpp b/sim/simX/core.cpp index 
c68ac854..af0a4441 100644 --- a/sim/simX/core.cpp +++ b/sim/simX/core.cpp @@ -12,34 +12,92 @@ using namespace vortex; -Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id) - : id_(id) +Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id) + : SimObject(ctx, "Core") + , id_(id) , arch_(arch) , decoder_(decoder) , mem_(mem) , shared_mem_(1, SMEM_SIZE) - , inst_in_schedule_("schedule") - , inst_in_fetch_("fetch") - , inst_in_decode_("decode") - , inst_in_issue_("issue") - , inst_in_execute_("execute") - , inst_in_writeback_("writeback") { - in_use_iregs_.resize(arch.num_warps(), 0); - in_use_fregs_.resize(arch.num_warps(), 0); - in_use_vregs_.reset(); - - csrs_.resize(arch_.num_csrs(), 0); - - fcsrs_.resize(arch_.num_warps(), 0); - - barriers_.resize(arch_.num_barriers(), 0); - - warps_.resize(arch_.num_warps()); + , warps_(arch.num_warps()) + , barriers_(arch.num_barriers(), 0) + , csrs_(arch.num_csrs(), 0) + , fcsrs_(arch.num_warps(), 0) + , ibuffers_(arch.num_warps(), IBUF_SIZE) + , scoreboard_(arch_) + , exe_units_((int)ExeType::MAX) + , icache_(Cache::Create("Icache", CacheConfig{ + log2ceil(ICACHE_SIZE), // C + log2ceil(L1_BLOCK_SIZE),// B + 2, // W + 0, // A + 32, // address bits + 1, // number of banks + 1, // number of ports + 1, // request size + true, // write-throught + 0, // victim size + NUM_WARPS, // mshr + 2, // pipeline latency + })) + , dcache_(Cache::Create("Dcache", CacheConfig{ + log2ceil(DCACHE_SIZE), // C + log2ceil(L1_BLOCK_SIZE),// B + 2, // W + 0, // A + 32, // address bits + DCACHE_NUM_BANKS, // number of banks + DCACHE_NUM_PORTS, // number of ports + (uint8_t)arch.num_threads(), // request size + true, // write-throught + 0, // victim size + DCACHE_MSHR_SIZE, // mshr + 2, // pipeline latency + })) + , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) + , icache_rsp_port_(this, this, &Core::icache_handleCacheReponse) + , dcache_rsp_port_(arch.num_threads(), 
{this, reinterpret_cast(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse}) + , fetch_stage_("fetch") + , decode_stage_("decode") + , issue_stage_("issue") + , execute_stage_("execute") + , commit_stage_("writeback") + , pending_icache_(arch_.num_warps()) + , stalled_warps_(0) + , last_schedule_wid_(0) + , pending_instrs_(0) + , ebreak_(false) + , stats_insts_(0) + , stats_loads_(0) + , stats_stores_(0) + , MemRspPort(this, &l1_mem_switch_->RspIn) + , MemReqPort(this, &l1_mem_switch_->ReqOut) +{ for (int i = 0; i < arch_.num_warps(); ++i) { - warps_[i] = std::make_shared(this, i); + warps_.at(i) = std::make_shared(this, i); } - this->clear(); + // register execute units + exe_units_.at((int)ExeType::ALU) = std::make_shared(this); + exe_units_.at((int)ExeType::LSU) = std::make_shared(this); + exe_units_.at((int)ExeType::CSR) = std::make_shared(this); + exe_units_.at((int)ExeType::FPU) = std::make_shared(this); + exe_units_.at((int)ExeType::GPU) = std::make_shared(this); + + // connect l1 caches + icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_); + for (int i = 0; i < arch_.num_threads(); ++i) { + dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i)); + } + + // connect l1 switch + icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]); + dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]); + l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort); + l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort); + + // activate warp0 + warps_.at(0)->setTmask(0, true); } Core::~Core() { @@ -51,194 +109,164 @@ Core::~Core() { } } -void Core::clear() { - for (int w = 0; w < arch_.num_warps(); ++w) { - in_use_iregs_[w].reset(); - in_use_fregs_[w].reset(); - } - stalled_warps_.reset(); - - in_use_vregs_.reset(); - - for (auto& csr : csrs_) { - csr = 0; - } - - for (auto& fcsr : fcsrs_) { - fcsr = 0; - } - - for (auto& barrier : barriers_) { - barrier.reset(); - } - - for (auto warp : warps_) { - warp->clear(); - } - - inst_in_schedule_.clear(); - 
inst_in_fetch_.clear(); - inst_in_decode_.clear(); - inst_in_issue_.clear(); - inst_in_execute_.clear(); - inst_in_writeback_.clear(); - print_bufs_.clear(); - - steps_ = 0; - insts_ = 0; - loads_ = 0; - stores_ = 0; - - inst_in_schedule_.valid = true; - warps_[0]->setTmask(0, true); - - ebreak_ = false; +void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) { + // advance to decode stage + uint32_t wid = response.tag; + pipeline_state_t state; + pending_icache_.remove(wid, &state); + auto latency = (SimPlatform::instance().cycles() - state.icache_latency); + state.icache_latency = latency; + decode_stage_.push(state); } -void Core::step() { +void Core::step(uint64_t cycle) { + __unused (cycle); D(2, "###########################################################"); + D(2, std::dec << "Core" << id_ << ": cycle: " << cycle); - steps_++; - D(2, std::dec << "Core" << id_ << ": cycle: " << steps_); - - this->writeback(); + this->commit(); this->execute(); this->issue(); this->decode(); this->fetch(); - this->schedule(); DPN(2, std::flush); } -void Core::schedule() { - if (!inst_in_schedule_.enter(&inst_in_fetch_)) - return; - +void Core::warp_scheduler() { bool foundSchedule = false; - int scheduled_warp = inst_in_schedule_.wid; + int scheduled_warp = last_schedule_wid_; - for (size_t wid = 0; wid < warps_.size(); ++wid) { - // round robin scheduling + // round robin scheduling + for (size_t wid = 0; wid < warps_.size(); ++wid) { scheduled_warp = (scheduled_warp + 1) % warps_.size(); - bool is_active = warps_[scheduled_warp]->active(); - bool stalled = stalled_warps_[scheduled_warp]; - if (is_active && !stalled) { + bool warp_active = warps_.at(scheduled_warp)->active(); + bool warp_stalled = stalled_warps_.test(scheduled_warp); + if (warp_active && !warp_stalled) { + last_schedule_wid_ = scheduled_warp; foundSchedule = true; break; } } if (!foundSchedule) - return; + return; - D(2, "Schedule: wid=" << scheduled_warp); - inst_in_schedule_.wid 
= scheduled_warp; + // suspend warp until decode + stalled_warps_.set(scheduled_warp); - // advance pipeline - inst_in_schedule_.next(&inst_in_fetch_); + auto& warp = warps_.at(scheduled_warp); + stats_insts_ += warp->getActiveThreads(); + + pipeline_state_t state; + warp->eval(&state); + + D(4, state); + + // advance to fetch stage + ++pending_instrs_; + fetch_stage_.push(state); } void Core::fetch() { - if (!inst_in_fetch_.enter(&inst_in_issue_)) - return; + // schedule icache request + pipeline_state_t state; + if (fetch_stage_.try_pop(&state)) { + state.icache_latency = SimPlatform::instance().cycles(); + MemReq mem_req; + mem_req.addr = state.PC; + mem_req.write = false; + mem_req.tag = pending_icache_.allocate(state); + icache_->CoreReqPorts.at(0).send(mem_req, 1); + } - int wid = inst_in_fetch_.wid; - - auto active_threads_b = warps_[wid]->getActiveThreads(); - warps_[wid]->step(&inst_in_fetch_); - auto active_threads_a = warps_[wid]->getActiveThreads(); - - insts_ += active_threads_b; - if (active_threads_b != active_threads_a) { - D(3, "*** warp#" << wid << " active threads changed to " << active_threads_a); - } - - if (inst_in_fetch_.stall_warp) { - D(3, "*** warp#" << wid << " fetch stalled"); - stalled_warps_[wid] = true; - } - - D(4, inst_in_fetch_); - - // advance pipeline - inst_in_fetch_.next(&inst_in_issue_); + // schedule next warp + this->warp_scheduler(); } void Core::decode() { - if (!inst_in_decode_.enter(&inst_in_issue_)) - return; + pipeline_state_t state; + if (!decode_stage_.try_pop(&state)) + return; - // advance pipeline - inst_in_decode_.next(&inst_in_issue_); + if (state.stall_warp) { + D(3, "*** warp#" << state.wid << " fetch stalled"); + } else { + // release warp + stalled_warps_.reset(state.wid); + } + + // advance to issue stage + issue_stage_.push(state); } void Core::issue() { - if (!inst_in_issue_.enter(&inst_in_execute_)) - return; + if (!issue_stage_.empty()) { + // insert to ibuffer + auto& state = issue_stage_.top(); + 
auto& ibuffer = ibuffers_.at(state.wid); + if (!ibuffer.full()) { + ibuffer.push(state); + issue_stage_.pop(); + } + } + + // issue ibuffer instructions + for (auto& ibuffer : ibuffers_) { + if (ibuffer.empty()) + continue; - bool in_use_regs = (inst_in_issue_.used_iregs & in_use_iregs_[inst_in_issue_.wid]) != 0 - || (inst_in_issue_.used_fregs & in_use_fregs_[inst_in_issue_.wid]) != 0 - || (inst_in_issue_.used_vregs & in_use_vregs_) != 0; + auto& state = ibuffer.top(); + + // check scoreboard + if (scoreboard_.in_use(state)) + continue; + + // update scoreboard + scoreboard_.reserve(state); + + // advance to execute stage + execute_stage_.push(state); + + ibuffer.pop(); + break; + } +} + +void Core::execute() { + // process stage inputs + if (!execute_stage_.empty()) { + auto& state = execute_stage_.top(); + auto& exe_unit = exe_units_.at((int)state.exe_type); + exe_unit->push_input(state); + execute_stage_.pop(); + } + + // advance execute units + for (auto& exe_unit : exe_units_) { + exe_unit->step(); + } - if (in_use_regs) { - D(3, "*** Issue: registers not ready!"); - inst_in_issue_.stalled = true; - return; - } - - switch (inst_in_issue_.rdest_type) { - case 1: - if (inst_in_issue_.rdest) - in_use_iregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1; - break; - case 2: - in_use_fregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1; - break; - case 3: - in_use_vregs_[inst_in_issue_.rdest] = 1; - break; - default: - break; + // commit completed instructions + for (auto& exe_unit : exe_units_) { + pipeline_state_t state; + if (exe_unit->pop_output(&state)) { + if (state.stall_warp) { + stalled_warps_.reset(state.wid); + } + // advance to commit stage + commit_stage_.push(state); + } } - - // advance pipeline - inst_in_issue_.next(&inst_in_execute_); } -void Core::execute() { - if (!inst_in_execute_.enter(&inst_in_writeback_)) +void Core::commit() { + pipeline_state_t state; + if (!commit_stage_.try_pop(&state)) return; - // advance pipeline - 
inst_in_execute_.next(&inst_in_writeback_); -} - -void Core::writeback() { - if (!inst_in_writeback_.enter(NULL)) - return; - - switch (inst_in_writeback_.rdest_type) { - case 1: - in_use_iregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0; - break; - case 2: - in_use_fregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0; - break; - case 3: - in_use_vregs_[inst_in_writeback_.rdest] = 0; - break; - default: - break; - } - - if (inst_in_writeback_.stall_warp) { - stalled_warps_[inst_in_writeback_.wid] = false; - D(3, "*** warp#" << inst_in_writeback_.wid << " fetch released"); - } - - // advance pipeline - inst_in_writeback_.next(NULL); + // update scoreboard + scoreboard_.release(state); } Word Core::get_csr(Addr addr, int tid, int wid) { @@ -281,16 +309,16 @@ Word Core::get_csr(Addr addr, int tid, int wid) { return arch_.num_cores(); } else if (addr == CSR_MINSTRET) { // NumInsts - return insts_; + return stats_insts_; } else if (addr == CSR_MINSTRET_H) { // NumInsts - return (Word)(insts_ >> 32); + return (Word)(stats_insts_ >> 32); } else if (addr == CSR_MCYCLE) { // NumCycles - return (Word)steps_; + return (Word)SimPlatform::instance().cycles(); } else if (addr == CSR_MCYCLE_H) { // NumCycles - return (Word)(steps_ >> 32); + return (Word)(SimPlatform::instance().cycles() >> 32); } else { return csrs_.at(addr); } @@ -328,7 +356,7 @@ Word Core::icache_fetch(Addr addr) { } Word Core::dcache_read(Addr addr, Size size) { - ++loads_; + ++stats_loads_; Word data = 0; #ifdef SM_ENABLE if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) @@ -342,7 +370,7 @@ Word Core::dcache_read(Addr addr, Size size) { } void Core::dcache_write(Addr addr, Word data, Size size) { - ++stores_; + ++stats_stores_; #ifdef SM_ENABLE if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) && ((addr + 3) < SMEM_BASE_ADDR)) { @@ -359,23 +387,19 @@ void Core::dcache_write(Addr addr, Word data, Size size) { } bool Core::running() const { - return inst_in_fetch_.valid - || inst_in_decode_.valid - || 
inst_in_issue_.valid - || inst_in_execute_.valid - || inst_in_writeback_.valid; + return pending_instrs_; } void Core::printStats() const { - std::cout << "Steps : " << steps_ << std::endl - << "Insts : " << insts_ << std::endl - << "Loads : " << loads_ << std::endl - << "Stores: " << stores_ << std::endl; + std::cout << "Cycles: " << SimPlatform::instance().cycles() << std::endl + << "Insts : " << stats_insts_ << std::endl + << "Loads : " << stats_loads_ << std::endl + << "Stores: " << stats_stores_ << std::endl; } void Core::writeToStdOut(Addr addr, Word data) { uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1); - auto& ss_buf = print_bufs_[tid]; + auto& ss_buf = print_bufs_.at(tid); char c = (char)data; ss_buf << c; if (c == '\n') { diff --git a/sim/simX/core.h b/sim/simX/core.h index 29de3ec6..913db4a6 100644 --- a/sim/simX/core.h +++ b/sim/simX/core.h @@ -4,10 +4,11 @@ #include #include #include +#include #include #include #include - +#include #include "debug.h" #include "types.h" #include "archdef.h" @@ -15,20 +16,21 @@ #include "mem.h" #include "warp.h" #include "pipeline.h" +#include "cache.h" +#include "ibuffer.h" +#include "scoreboard.h" +#include "exeunit.h" namespace vortex { -class Core { +class Core : public SimObject { public: - Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id); - + Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id); ~Core(); - void clear(); - bool running() const; - void step(); + void step(uint64_t cycle); void printStats() const; @@ -40,7 +42,7 @@ public: return *warps_.at(i); } - Decoder& decoder() { + const Decoder& decoder() { return decoder_; } @@ -48,16 +50,12 @@ public: return arch_; } - unsigned long num_insts() const { - return insts_; - } - - unsigned long num_steps() const { - return steps_; + unsigned long stats_insts() const { + return stats_insts_; } Word getIRegValue(int reg) const { - return warps_[0]->getIRegValue(reg); + return 
warps_.at(0)->getIRegValue(reg); } Word get_csr(Addr addr, int tid, int wid); @@ -73,50 +71,66 @@ public: void dcache_write(Addr, Word, Size); void trigger_ebreak(); + bool check_ebreak() const; -private: +private: - void schedule(); void fetch(); void decode(); void issue(); void execute(); - void writeback(); + void commit(); + + void warp_scheduler(); + + void icache_handleCacheReponse(const MemRsp& response, uint32_t port_id); void writeToStdOut(Addr addr, Word data); - - std::vector in_use_iregs_; - std::vector in_use_fregs_; - RegMask in_use_vregs_; - WarpMask stalled_warps_; - std::vector> warps_; - std::vector barriers_; - std::vector csrs_; - std::vector fcsrs_; - std::unordered_map print_bufs_; Word id_; - const ArchDef &arch_; - Decoder &decoder_; - MemoryUnit &mem_; + const ArchDef& arch_; + const Decoder& decoder_; + MemoryUnit& mem_; #ifdef SM_ENABLE RAM shared_mem_; #endif + std::vector> warps_; + std::vector barriers_; + std::vector csrs_; + std::vector fcsrs_; + std::vector ibuffers_; + Scoreboard scoreboard_; + std::vector exe_units_; + Cache::Ptr icache_; + Cache::Ptr dcache_; + Switch::Ptr l1_mem_switch_; + SlavePort icache_rsp_port_; + std::vector> dcache_rsp_port_; + + PipelineStage fetch_stage_; + PipelineStage decode_stage_; + PipelineStage issue_stage_; + PipelineStage execute_stage_; + PipelineStage commit_stage_; + + HashTable pending_icache_; + WarpMask stalled_warps_; + uint32_t last_schedule_wid_; + uint32_t pending_instrs_; bool ebreak_; - Pipeline inst_in_schedule_; - Pipeline inst_in_fetch_; - Pipeline inst_in_decode_; - Pipeline inst_in_issue_; - Pipeline inst_in_execute_; - Pipeline inst_in_writeback_; + std::unordered_map print_bufs_; + uint64_t stats_insts_; + uint64_t stats_loads_; + uint64_t stats_stores_; - uint64_t steps_; - uint64_t insts_; - uint64_t loads_; - uint64_t stores_; + friend class LsuUnit; + +public: + SlavePort MemRspPort; + MasterPort MemReqPort; }; } // namespace vortex \ No newline at end of file diff --git 
a/sim/simX/decode.cpp b/sim/simX/decode.cpp index dbc7115a..3c76231f 100644 --- a/sim/simX/decode.cpp +++ b/sim/simX/decode.cpp @@ -281,7 +281,7 @@ Decoder::Decoder(const ArchDef &arch) { v_imm_mask_ = 0x7ff; } -std::shared_ptr Decoder::decode(Word code, Word PC) { +std::shared_ptr Decoder::decode(Word code, Word PC) const { auto instr = std::make_shared(); Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_); instr->setOpcode(op); @@ -351,9 +351,9 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { instr->setFunc3(func3); instr->setFunc7(func7); if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) { - instr->setImm(signExt(rs2, 5, reg_mask_)); + instr->setImm(sext32(rs2, 5)); } else { - instr->setImm(signExt(code >> shift_rs2_, 12, i_imm_mask_)); + instr->setImm(sext32(code >> shift_rs2_, 12)); } } break; @@ -366,7 +366,7 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { } instr->setFunc3(func3); Word imeed = (func7 << reg_s_) | rd; - instr->setImm(signExt(imeed, 12, s_imm_mask_)); + instr->setImm(sext32(imeed, 12)); } break; case InstType::B_TYPE: { @@ -378,12 +378,12 @@ std::shared_ptr Decoder::decode(Word code, Word PC) { Word bit_10_5 = func7 & 0x3f; Word bit_12 = func7 >> 6; Word imeed = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12); - instr->setImm(signExt(imeed, 13, b_imm_mask_)); + instr->setImm(sext32(imeed, 13)); } break; case InstType::U_TYPE: instr->setDestReg(rd); - instr->setImm(signExt(code >> shift_func3_, 20, u_imm_mask_)); + instr->setImm(sext32(code >> shift_func3_, 20)); break; case InstType::J_TYPE: { diff --git a/sim/simX/decode.h b/sim/simX/decode.h index f8f3909c..d4f9f976 100644 --- a/sim/simX/decode.h +++ b/sim/simX/decode.h @@ -13,7 +13,7 @@ class Decoder { public: Decoder(const ArchDef &); - std::shared_ptr decode(Word code, Word PC); + std::shared_ptr decode(Word code, Word PC) const; private: diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp index 47bf4e04..602f7f3a 100644 --- 
a/sim/simX/execute.cpp +++ b/sim/simX/execute.cpp @@ -49,346 +49,445 @@ inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid) } } -void Warp::execute(const Instr &instr, Pipeline *pipeline) { +void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { assert(tmask_.any()); Word nextPC = PC_ + core_->arch().wsize(); - bool runOnce = false; - - Word func3 = instr.getFunc3(); - Word func6 = instr.getFunc6(); - Word func7 = instr.getFunc7(); + + Word func3 = instr.getFunc3(); + Word func6 = instr.getFunc6(); + Word func7 = instr.getFunc7(); auto opcode = instr.getOpcode(); - int rdest = instr.getRDest(); - int rsrc0 = instr.getRSrc(0); - int rsrc1 = instr.getRSrc(1); - Word immsrc= instr.getImm(); - Word vmask = instr.getVmask(); + int rdest = instr.getRDest(); + int rsrc0 = instr.getRSrc(0); + int rsrc1 = instr.getRSrc(1); + int rsrc2 = instr.getRSrc(2); + Word immsrc = instr.getImm(); + Word vmask = instr.getVmask(); int num_threads = core_->arch().num_threads(); - for (int t = 0; t < num_threads; t++) { - if (!tmask_.test(t) || runOnce) - continue; - - auto &iregs = iRegFile_.at(t); - auto &fregs = fRegFile_.at(t); - Word rsdata[3]; - Word rddata; - - int num_rsrcs = instr.getNRSrc(); - if (num_rsrcs) { - DPH(2, "[" << std::dec << t << "] Src Regs: "); - for (int i = 0; i < num_rsrcs; ++i) { - int rst = instr.getRSType(i); - int rs = instr.getRSrc(i); - if (i) DPN(2, ", "); - switch (rst) { - case 1: - rsdata[i] = iregs[rs]; - DPN(2, "r" << std::dec << rs << "=0x" << std::hex << rsdata[i]); - break; - case 2: - rsdata[i] = fregs[rs]; - DPN(2, "fr" << std::dec << rs << "=0x" << std::hex << rsdata[i]); - break; - default: break; - } - } - DPN(2, std::endl); - } - - bool rd_write = false; + std::vector rsdata(num_threads); + std::vector rddata(num_threads); - switch (opcode) { - case NOP: - break; - case LUI_INST: - rddata = (immsrc << 12) & 0xfffff000; - rd_write = true; - break; - case AUIPC_INST: - rddata = ((immsrc << 12) 
& 0xfffff000) + PC_; - rd_write = true; - break; - case R_INST: { + int num_rsrcs = instr.getNRSrc(); + if (num_rsrcs) { + for (int i = 0; i < num_rsrcs; ++i) { + DPH(2, "Src Reg [" << std::dec << i << "]: "); + int type = instr.getRSType(i); + int reg = instr.getRSrc(i); + switch (type) { + case 1: + DPH(2, "r" << std::dec << reg << "={"); + for (int t = 0; t < num_threads; ++t) { + if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + rsdata[t][i] = iRegFile_.at(t)[reg]; + DPN(2, std::hex << rsdata[t][i]); + } + DPN(2, "}" << std::endl); + break; + case 2: + DPH(2, "fr" << std::dec << reg << "={"); + for (int t = 0; t < num_threads; ++t) { + if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + rsdata[t][i] = fRegFile_.at(t)[reg]; + DPN(2, std::hex << rsdata[t][i]); + } + DPN(2, "}" << std::endl); + break; + default: + break; + } + } + } + + bool rd_write = false; + + switch (opcode) { + case NOP: + break; + case LUI_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::ARITH; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = (immsrc << 12) & 0xfffff000; + } + rd_write = true; + break; + case AUIPC_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::ARITH; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = ((immsrc << 12) & 0xfffff000) + PC_; + } + rd_write = true; + break; + case R_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::ARITH; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; if (func7 & 0x1) { switch (func3) { case 0: // MUL - rddata = ((WordI)rsdata[0]) * ((WordI)rsdata[1]); + rddata[t] = ((WordI)rsdata[t][0]) * ((WordI)rsdata[t][1]); + pipeline_state->alu.type = AluType::IMUL; break; case 1: { // MULH - 
int64_t first = (int64_t)rsdata[0]; - if (rsdata[0] & 0x80000000) { + int64_t first = (int64_t)rsdata[t][0]; + if (rsdata[t][0] & 0x80000000) { first = first | 0xFFFFFFFF00000000; } - int64_t second = (int64_t)rsdata[1]; - if (rsdata[1] & 0x80000000) { + int64_t second = (int64_t)rsdata[t][1]; + if (rsdata[t][1] & 0x80000000) { second = second | 0xFFFFFFFF00000000; } uint64_t result = first * second; - rddata = (result >> 32) & 0xFFFFFFFF; + rddata[t] = (result >> 32) & 0xFFFFFFFF; + pipeline_state->alu.type = AluType::IMUL; } break; case 2: { // MULHSU - int64_t first = (int64_t)rsdata[0]; - if (rsdata[0] & 0x80000000) { + int64_t first = (int64_t)rsdata[t][0]; + if (rsdata[t][0] & 0x80000000) { first = first | 0xFFFFFFFF00000000; } - int64_t second = (int64_t)rsdata[1]; - rddata = ((first * second) >> 32) & 0xFFFFFFFF; + int64_t second = (int64_t)rsdata[t][1]; + rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; + pipeline_state->alu.type = AluType::IMUL; } break; case 3: { // MULHU - uint64_t first = (uint64_t)rsdata[0]; - uint64_t second = (uint64_t)rsdata[1]; - rddata = ((first * second) >> 32) & 0xFFFFFFFF; + uint64_t first = (uint64_t)rsdata[t][0]; + uint64_t second = (uint64_t)rsdata[t][1]; + rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; + pipeline_state->alu.type = AluType::IMUL; } break; case 4: { // DIV - WordI dividen = rsdata[0]; - WordI divisor = rsdata[1]; + WordI dividen = rsdata[t][0]; + WordI divisor = rsdata[t][1]; if (divisor == 0) { - rddata = -1; + rddata[t] = -1; } else if (dividen == WordI(0x80000000) && divisor == WordI(0xffffffff)) { - rddata = dividen; + rddata[t] = dividen; } else { - rddata = dividen / divisor; + rddata[t] = dividen / divisor; } + pipeline_state->alu.type = AluType::IDIV; } break; case 5: { // DIVU - Word dividen = rsdata[0]; - Word divisor = rsdata[1]; + Word dividen = rsdata[t][0]; + Word divisor = rsdata[t][1]; if (divisor == 0) { - rddata = -1; + rddata[t] = -1; } else { - rddata = dividen / divisor; + 
rddata[t] = dividen / divisor; } + pipeline_state->alu.type = AluType::IDIV; } break; case 6: { // REM - WordI dividen = rsdata[0]; - WordI divisor = rsdata[1]; - if (rsdata[1] == 0) { - rddata = dividen; + WordI dividen = rsdata[t][0]; + WordI divisor = rsdata[t][1]; + if (rsdata[t][1] == 0) { + rddata[t] = dividen; } else if (dividen == WordI(0x80000000) && divisor == WordI(0xffffffff)) { - rddata = 0; + rddata[t] = 0; } else { - rddata = dividen % divisor; + rddata[t] = dividen % divisor; } + pipeline_state->alu.type = AluType::IDIV; } break; case 7: { // REMU - Word dividen = rsdata[0]; - Word divisor = rsdata[1]; - if (rsdata[1] == 0) { - rddata = dividen; + Word dividen = rsdata[t][0]; + Word divisor = rsdata[t][1]; + if (rsdata[t][1] == 0) { + rddata[t] = dividen; } else { - rddata = dividen % divisor; + rddata[t] = dividen % divisor; } + pipeline_state->alu.type = AluType::IDIV; } break; default: - std::cout << "unsupported MUL/DIV instr\n"; std::abort(); } } else { switch (func3) { case 0: if (func7) { - rddata = rsdata[0] - rsdata[1]; + // SUB + rddata[t] = rsdata[t][0] - rsdata[t][1]; } else { - rddata = rsdata[0] + rsdata[1]; + // ADD + rddata[t] = rsdata[t][0] + rsdata[t][1]; } break; case 1: - rddata = rsdata[0] << rsdata[1]; + // SHL + rddata[t] = rsdata[t][0] << rsdata[t][1]; break; case 2: - rddata = (WordI(rsdata[0]) < WordI(rsdata[1])); + // LT + rddata[t] = (WordI(rsdata[t][0]) < WordI(rsdata[t][1])); break; case 3: - rddata = (Word(rsdata[0]) < Word(rsdata[1])); + // LTU + rddata[t] = (Word(rsdata[t][0]) < Word(rsdata[t][1])); break; case 4: - rddata = rsdata[0] ^ rsdata[1]; + // XOR + rddata[t] = rsdata[t][0] ^ rsdata[t][1]; break; case 5: if (func7) { - rddata = WordI(rsdata[0]) >> WordI(rsdata[1]); + // SRA + rddata[t] = WordI(rsdata[t][0]) >> WordI(rsdata[t][1]); } else { - rddata = Word(rsdata[0]) >> Word(rsdata[1]); + // SHR + rddata[t] = Word(rsdata[t][0]) >> Word(rsdata[t][1]); } break; case 6: - rddata = rsdata[0] | rsdata[1]; + // OR 
+ rddata[t] = rsdata[t][0] | rsdata[t][1]; break; case 7: - rddata = rsdata[0] & rsdata[1]; + // AND + rddata[t] = rsdata[t][0] & rsdata[t][1]; break; default: std::abort(); } } - rd_write = true; - } break; - case I_INST: + } + rd_write = true; + break; + case I_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::ARITH; + pipeline_state->used_iregs[rsrc0] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; switch (func3) { case 0: // ADDI - rddata = rsdata[0] + immsrc; + rddata[t] = rsdata[t][0] + immsrc; break; case 1: // SLLI - rddata = rsdata[0] << immsrc; + rddata[t] = rsdata[t][0] << immsrc; break; case 2: // SLTI - rddata = (WordI(rsdata[0]) < WordI(immsrc)); + rddata[t] = (WordI(rsdata[t][0]) < WordI(immsrc)); break; case 3: { // SLTIU - rddata = (Word(rsdata[0]) < Word(immsrc)); + rddata[t] = (Word(rsdata[t][0]) < Word(immsrc)); } break; case 4: // XORI - rddata = rsdata[0] ^ immsrc; + rddata[t] = rsdata[t][0] ^ immsrc; break; case 5: if (func7) { // SRAI - Word result = WordI(rsdata[0]) >> immsrc; - rddata = result; + Word result = WordI(rsdata[t][0]) >> immsrc; + rddata[t] = result; } else { // SRLI - Word result = Word(rsdata[0]) >> immsrc; - rddata = result; + Word result = Word(rsdata[t][0]) >> immsrc; + rddata[t] = result; } break; case 6: // ORI - rddata = rsdata[0] | immsrc; + rddata[t] = rsdata[t][0] | immsrc; break; case 7: // ANDI - rddata = rsdata[0] & immsrc; + rddata[t] = rsdata[t][0] & immsrc; break; - default: - std::abort(); } - rd_write = true; - break; - case B_INST: + } + rd_write = true; + break; + case B_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::BRANCH; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; switch (func3) { case 0: // BEQ - if (rsdata[0] == rsdata[1]) { + if (rsdata[t][0] == rsdata[t][1]) { nextPC = PC_ + 
immsrc; } break; case 1: // BNE - if (rsdata[0] != rsdata[1]) { + if (rsdata[t][0] != rsdata[t][1]) { nextPC = PC_ + immsrc; } break; case 4: // BLT - if (WordI(rsdata[0]) < WordI(rsdata[1])) { + if (WordI(rsdata[t][0]) < WordI(rsdata[t][1])) { nextPC = PC_ + immsrc; } break; case 5: // BGE - if (WordI(rsdata[0]) >= WordI(rsdata[1])) { + if (WordI(rsdata[t][0]) >= WordI(rsdata[t][1])) { nextPC = PC_ + immsrc; } break; case 6: // BLTU - if (Word(rsdata[0]) < Word(rsdata[1])) { + if (Word(rsdata[t][0]) < Word(rsdata[t][1])) { nextPC = PC_ + immsrc; } break; case 7: // BGEU - if (Word(rsdata[0]) >= Word(rsdata[1])) { + if (Word(rsdata[t][0]) >= Word(rsdata[t][1])) { nextPC = PC_ + immsrc; } break; - } - pipeline->stall_warp = true; - runOnce = true; - break; - case JAL_INST: - rddata = nextPC; - nextPC = PC_ + immsrc; - pipeline->stall_warp = true; - runOnce = true; - rd_write = true; - break; - case JALR_INST: - rddata = nextPC; - nextPC = rsdata[0] + immsrc; - pipeline->stall_warp = true; - runOnce = true; - rd_write = true; - break; - case L_INST: { - Word memAddr = ((rsdata[0] + immsrc) & 0xFFFFFFFC); // word aligned - Word shift_by = ((rsdata[0] + immsrc) & 0x00000003) * 8; - Word data_read = core_->dcache_read(memAddr, 4); - D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); - switch (func3) { - case 0: - // LBI - rddata = signExt((data_read >> shift_by) & 0xFF, 8, 0xFF); - break; - case 1: - // LHI - rddata = signExt((data_read >> shift_by) & 0xFFFF, 16, 0xFFFF); - break; - case 2: - // LW - rddata = data_read; - break; - case 4: - // LBU - rddata = Word((data_read >> shift_by) & 0xFF); - break; - case 5: - // LHU - rddata = Word((data_read >> shift_by) & 0xFFFF); - break; - default: - std::abort(); - } - rd_write = true; - } break; - case S_INST: { - Word memAddr = rsdata[0] + immsrc; - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - switch (func3) { - case 0: - // SB - core_->dcache_write(memAddr, rsdata[1] & 0x000000FF, 
1); - break; - case 1: - // SH - core_->dcache_write(memAddr, rsdata[1], 2); - break; - case 2: - // SW - core_->dcache_write(memAddr, rsdata[1], 4); - break; default: std::abort(); } - } break; - case SYS_INST: { + break; // runonce + } + pipeline_state->stall_warp = true; + break; + case JAL_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::BRANCH; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = nextPC; + nextPC = PC_ + immsrc; + pipeline_state->stall_warp = true; + break; // runonce + } + rd_write = true; + break; + case JALR_INST: + pipeline_state->exe_type = ExeType::ALU; + pipeline_state->alu.type = AluType::BRANCH; + pipeline_state->used_iregs[rsrc0] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = nextPC; + nextPC = rsdata[t][0] + immsrc; + pipeline_state->stall_warp = true; + break; // runOnce + } + rd_write = true; + break; + case L_INST: + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.load = 0; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->mem_addrs.resize(num_threads); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned + Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; + Word data_read = core_->dcache_read(memAddr, 4); + pipeline_state->mem_addrs.at(t) = memAddr; + D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); + switch (func3) { + case 0: + // LBI + rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8); + break; + case 1: + // LHI + rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16); + break; + case 2: + // LW + rddata[t] = data_read; + break; + case 4: + // LBU + rddata[t] = Word((data_read >> shift_by) & 0xFF); + break; + case 5: + // LHU + rddata[t] = Word((data_read >> shift_by) & 0xFFFF); + break; + default: + std::abort(); + } + } + rd_write = 
true; + break; + case S_INST: + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.store = 1; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + pipeline_state->mem_addrs.resize(num_threads); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = rsdata[t][0] + immsrc; + pipeline_state->mem_addrs.at(t) = memAddr; + D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + switch (func3) { + case 0: + // SB + core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1); + break; + case 1: + // SH + core_->dcache_write(memAddr, rsdata[t][1], 2); + break; + case 2: + // SW + core_->dcache_write(memAddr, rsdata[t][1], 4); + break; + default: + std::abort(); + } + } + break; + case SYS_INST: + pipeline_state->exe_type = ExeType::CSR; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; Word csr_addr = immsrc & 0x00000FFF; Word csr_value = core_->get_csr(csr_addr, t, id_); switch (func3) { @@ -400,229 +499,306 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) { break; case 1: // CSRRW - rddata = csr_value; - core_->set_csr(csr_addr, rsdata[0], t, id_); + rddata[t] = csr_value; + core_->set_csr(csr_addr, rsdata[t][0], t, id_); + pipeline_state->used_iregs[rsrc0] = 1; rd_write = true; break; case 2: // CSRRS - rddata = csr_value; - core_->set_csr(csr_addr, csr_value | rsdata[0], t, id_); + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_); + pipeline_state->used_iregs[rsrc0] = 1; rd_write = true; break; case 3: // CSRRC - rddata = csr_value; - core_->set_csr(csr_addr, csr_value & ~rsdata[0], t, id_); + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_); + pipeline_state->used_iregs[rsrc0] = 1; rd_write = true; break; case 5: // CSRRWI - rddata = csr_value; - core_->set_csr(csr_addr, rsrc0, t, id_); + rddata[t] = csr_value; + core_->set_csr(csr_addr, rsrc0, t, id_); rd_write = true; break; 
case 6: // CSRRSI - rddata = csr_value; + rddata[t] = csr_value; core_->set_csr(csr_addr, csr_value | rsrc0, t, id_); rd_write = true; break; case 7: // CSRRCI - rddata = csr_value; + rddata[t] = csr_value; core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_); rd_write = true; break; default: break; } - } break; - case FENCE: - pipeline->stall_warp = true; - runOnce = true; - break; - case (FL | VL): - if (func3 == 0x2) { - Word memAddr = rsdata[0] + immsrc; + } + break; + case FENCE: + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.fence = 1; + pipeline_state->stall_warp = true; + break; + case (FL | VL): + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.load = 1; + pipeline_state->used_iregs[rsrc0] = 1; + if (func3 == 0x2) { + pipeline_state->mem_addrs.resize(num_threads); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = rsdata[t][0] + immsrc; + pipeline_state->mem_addrs.at(t) = memAddr; Word data_read = core_->dcache_read(memAddr, 4); D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); - rddata = data_read; - } else { - D(3, "Executing vector load"); - D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); - D(3, "src: " << rsrc0 << " " << rsdata[0]); - D(3, "dest" << rdest); - D(3, "width" << instr.getVlsWidth()); - - auto &vd = vRegFile_[rdest]; - + rddata[t] = data_read; + } + } else { + D(3, "Executing vector load"); + D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); + D(3, "dest: v" << rdest); + D(3, "width" << instr.getVlsWidth()); + pipeline_state->mem_addrs.resize(vl_); + auto &vd = vRegFile_.at(rdest); + switch (instr.getVlsWidth()) { + case 6: { + // load word and unit strided (not checking for unit stride) + for (int i = 0; i < vl_; i++) { + Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); + pipeline_state->mem_addrs.at(i) = 
memAddr; + D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + Word data_read = core_->dcache_read(memAddr, 4); + D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); + int *result_ptr = (int *)(vd.data() + i); + *result_ptr = data_read; + } + } break; + default: + std::abort(); + } + break; + } + rd_write = true; + break; + case (FS | VS): + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.store = 1; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + if (func3 == 0x2) { + pipeline_state->mem_addrs.resize(num_threads); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = rsdata[t][0] + immsrc; + pipeline_state->mem_addrs.at(t) = memAddr; + core_->dcache_write(memAddr, rsdata[t][1], 4); + D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + } + } else { + pipeline_state->mem_addrs.resize(vl_); + for (int i = 0; i < vl_; i++) { + Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8); + pipeline_state->mem_addrs.at(i) = memAddr; + D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); switch (instr.getVlsWidth()) { - case 6: { - //load word and unit strided (not checking for unit stride) - for (int i = 0; i < vl_; i++) { - Word memAddr = ((rsdata[0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - Word data_read = core_->dcache_read(memAddr, 4); - D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); - int *result_ptr = (int *)(vd.data() + i); - *result_ptr = data_read; - } + case 6: { + //store word and unit strided (not checking for unit stride) + uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i); + core_->dcache_write(memAddr, value, 4); + D(3, "store: " << memAddr << " value:" << value); } break; default: std::abort(); - } - break; - } - rd_write = true; - break; - case (FS | VS): - if (func3 == 0x2) { - Word memAddr = rsdata[0] + immsrc; - 
core_->dcache_write(memAddr, rsdata[1], 4); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - } else { - for (int i = 0; i < vl_; i++) { - Word memAddr = rsdata[0] + (i * vtype_.vsew / 8); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - switch (instr.getVlsWidth()) { - case 6: { - //store word and unit strided (not checking for unit stride) - uint32_t value = *(uint32_t *)(vRegFile_[instr.getVs3()].data() + i); - core_->dcache_write(memAddr, value, 4); - D(3, "store: " << memAddr << " value:" << value); - } break; - default: - std::abort(); - } - } + } } - break; - case FCI: { + } + break; + case FCI: + pipeline_state->exe_type = ExeType::FPU; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; uint32_t frm = get_fpu_rm(func3, core_, t, id_); uint32_t fflags = 0; switch (func7) { case 0x00: //FADD - rddata = rv_fadd(rsdata[0], rsdata[1], frm, &fflags); + rddata[t] = rv_fadd(rsdata[t][0], rsdata[t][1], frm, &fflags); + pipeline_state->fpu.type = FpuType::FMA; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; case 0x04: //FSUB - rddata = rv_fsub(rsdata[0], rsdata[1], frm, &fflags); + rddata[t] = rv_fsub(rsdata[t][0], rsdata[t][1], frm, &fflags); + pipeline_state->fpu.type = FpuType::FMA; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; case 0x08: //FMUL - rddata = rv_fmul(rsdata[0], rsdata[1], frm, &fflags); + rddata[t] = rv_fmul(rsdata[t][0], rsdata[t][1], frm, &fflags); + pipeline_state->fpu.type = FpuType::FMA; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; case 0x0c: //FDIV - rddata = rv_fdiv(rsdata[0], rsdata[1], frm, &fflags); + rddata[t] = rv_fdiv(rsdata[t][0], rsdata[t][1], frm, &fflags); + pipeline_state->fpu.type = FpuType::FDIV; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; case 0x2c: //FSQRT - rddata = rv_fsqrt(rsdata[0], frm, &fflags); + rddata[t] = 
rv_fsqrt(rsdata[t][0], frm, &fflags); + pipeline_state->fpu.type = FpuType::FSQRT; + pipeline_state->used_fregs[rsrc0] = 1; break; case 0x10: switch (func3) { case 0: // FSGNJ.S - rddata = rv_fsgnj(rsdata[0], rsdata[1]); + rddata[t] = rv_fsgnj(rsdata[t][0], rsdata[t][1]); break; case 1: // FSGNJN.S - rddata = rv_fsgnjn(rsdata[0], rsdata[1]); + rddata[t] = rv_fsgnjn(rsdata[t][0], rsdata[t][1]); break; case 2: // FSGNJX.S - rddata = rv_fsgnjx(rsdata[0], rsdata[1]); + rddata[t] = rv_fsgnjx(rsdata[t][0], rsdata[t][1]); break; } + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; - case 0x14: + case 0x14: if (func3) { // FMAX.S - rddata = rv_fmax(rsdata[0], rsdata[1], &fflags); + rddata[t] = rv_fmax(rsdata[t][0], rsdata[t][1], &fflags); } else { // FMIN.S - rddata = rv_fmin(rsdata[0], rsdata[1], &fflags); + rddata[t] = rv_fmin(rsdata[t][0], rsdata[t][1], &fflags); } + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; break; case 0x60: if (rsrc1 == 0) { // FCVT.W.S - rddata = rv_ftoi(rsdata[0], frm, &fflags); + rddata[t] = rv_ftoi(rsdata[t][0], frm, &fflags); } else { // FCVT.WU.S - rddata = rv_ftou(rsdata[0], frm, &fflags); + rddata[t] = rv_ftou(rsdata[t][0], frm, &fflags); } + pipeline_state->fpu.type = FpuType::FCVT; + pipeline_state->used_fregs[rsrc0] = 1; break; case 0x70: if (func3) { // FCLASS.S - rddata = rv_fclss(rsdata[0]); + rddata[t] = rv_fclss(rsdata[t][0]); } else { // FMV.X.W - rddata = rsdata[0]; + rddata[t] = rsdata[t][0]; + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_fregs[rsrc0] = 1; } break; - case 0x50: + case 0x50: switch(func3) { case 0: // FLE.S - rddata = rv_fle(rsdata[0], rsdata[1], &fflags); + rddata[t] = rv_fle(rsdata[t][0], rsdata[t][1], &fflags); break; case 1: // FLT.S - rddata = rv_flt(rsdata[0], rsdata[1], &fflags); + rddata[t] = rv_flt(rsdata[t][0], rsdata[t][1], 
&fflags); break; case 2: // FEQ.S - rddata = rv_feq(rsdata[0], rsdata[1], &fflags); + rddata[t] = rv_feq(rsdata[t][0], rsdata[t][1], &fflags); break; - } break; + } + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; + break; case 0x68: if (rsrc1) { // FCVT.S.WU: - rddata = rv_utof(rsdata[0], frm, &fflags); + rddata[t] = rv_utof(rsdata[t][0], frm, &fflags); } else { // FCVT.S.W: - rddata = rv_itof(rsdata[0], frm, &fflags); + rddata[t] = rv_itof(rsdata[t][0], frm, &fflags); } + pipeline_state->fpu.type = FpuType::FCVT; + pipeline_state->used_iregs[rsrc0] = 1; break; case 0x78: // FMV.W.X - rddata = rsdata[0]; + rddata[t] = rsdata[t][0]; + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_iregs[rsrc0] = 1; break; } update_fcrs(fflags, core_, t, id_); - rd_write = true; - } break; - case FMADD: - case FMSUB: - case FMNMADD: - case FMNMSUB: { + } + rd_write = true; + break; + case FMADD: + case FMSUB: + case FMNMADD: + case FMNMSUB: + pipeline_state->fpu.type = FpuType::FMA; + pipeline_state->used_fregs[rsrc0] = 1; + pipeline_state->used_fregs[rsrc1] = 1; + pipeline_state->used_fregs[rsrc2] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; int frm = get_fpu_rm(func3, core_, t, id_); Word fflags = 0; switch (opcode) { case FMADD: - rddata = rv_fmadd(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); + rddata[t] = rv_fmadd(rsdata[t][0], rsdata[t][1], rsdata[t][2], frm, &fflags); break; case FMSUB: - rddata = rv_fmsub(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); + rddata[t] = rv_fmsub(rsdata[t][0], rsdata[t][1], rsdata[t][2], frm, &fflags); break; case FMNMADD: - rddata = rv_fnmadd(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); + rddata[t] = rv_fnmadd(rsdata[t][0], rsdata[t][1], rsdata[t][2], frm, &fflags); break; case FMNMSUB: - rddata = rv_fnmsub(rsdata[0], rsdata[1], rsdata[2], frm, &fflags); + rddata[t] = rv_fnmsub(rsdata[t][0], rsdata[t][1], 
rsdata[t][2], frm, &fflags); break; default: break; } update_fcrs(fflags, core_, t, id_); - rd_write = true; - } break; - case GPGPU: + } + rd_write = true; + break; + case GPGPU: + pipeline_state->exe_type = ExeType::GPU; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; switch (func3) { case 0: { // TMC + pipeline_state->gpu.type = GpuType::TMC; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->stall_warp = true; if (rsrc1) { // predicate mode ThreadMask pred; for (int i = 0; i < num_threads; ++i) { - pred[i] = tmask_[i] ? (iRegFile_[i][rsrc0] != 0) : 0; + pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0; } if (pred.any()) { tmask_ &= pred; @@ -630,58 +806,64 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) { } else { tmask_.reset(); for (int i = 0; i < num_threads; ++i) { - tmask_[i] = rsdata[0] & (1 << i); + tmask_.set(i, rsdata.at(t)[0] & (1 << i)); } } D(3, "*** TMC " << tmask_); active_ = tmask_.any(); - pipeline->stall_warp = true; - runOnce = true; + break; // runOnce } break; case 1: { // WSPAWN - int active_warps = std::min(rsdata[0], core_->arch().num_warps()); - D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata[1]); + pipeline_state->gpu.type = GpuType::WSPAWN; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + pipeline_state->stall_warp = true; + int active_warps = std::min(rsdata.at(t)[0], core_->arch().num_warps()); + D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(t)[1]); for (int i = 1; i < active_warps; ++i) { Warp &newWarp = core_->warp(i); - newWarp.setPC(rsdata[1]); + newWarp.setPC(rsdata[t][1]); newWarp.setTmask(0, true); } - pipeline->stall_warp = true; - runOnce = true; + break; // runOnce } break; case 2: { // SPLIT + pipeline_state->gpu.type = GpuType::SPLIT; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->stall_warp = true; if (HasDivergentThreads(tmask_, 
iRegFile_, rsrc0)) { ThreadMask tmask; for (int i = 0; i < num_threads; ++i) { - tmask[i] = tmask_[i] && !iRegFile_[i][rsrc0]; + tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0); } DomStackEntry e(tmask, nextPC); domStack_.push(tmask_); domStack_.push(e); for (size_t i = 0; i < e.tmask.size(); ++i) { - tmask_[i] = !e.tmask[i] && tmask_[i]; + tmask_.set(i, !e.tmask.test(i) && tmask_.test(i)); } active_ = tmask_.any(); DPH(3, "*** Split: New TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, tmask_[num_threads-i-1]); + for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); DPN(3, ", Pushed TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask[num_threads-i-1]); + for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1)); DPN(3, ", PC=0x" << std::hex << e.PC << "\n"); } else { D(3, "*** Unanimous pred"); DomStackEntry e(tmask_); e.unanimous = true; domStack_.push(e); - } - pipeline->stall_warp = true; - runOnce = true; + } + break; // runOnce } break; case 3: { // JOIN + pipeline_state->gpu.type = GpuType::JOIN; + pipeline_state->stall_warp = true; if (!domStack_.empty() && domStack_.top().unanimous) { D(3, "*** Uninimous branch at join"); tmask_ = domStack_.top().tmask; @@ -697,898 +879,923 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) { active_ = tmask_.any(); DPH(3, "*** Join: New TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, tmask_[num_threads-i-1]); + for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); DPN(3, "\n"); domStack_.pop(); - } - pipeline->stall_warp = true; - runOnce = true; + } + break; // runOnce } break; case 4: { // BAR + pipeline_state->gpu.type = GpuType::BAR; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + pipeline_state->stall_warp = true; active_ = false; - core_->barrier(rsdata[0], rsdata[1], id_); - pipeline->stall_warp = true; - runOnce = true; + core_->barrier(rsdata[t][0], rsdata[t][1], id_); + break; 
// runOnce } break; case 6: { // PREFETCH - int addr = rsdata[0]; + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.prefetch = 1; + pipeline_state->used_iregs[rsrc0] = 1; + int addr = rsdata[t][0]; printf("*** PREFETCHED %d ***\n", addr); } break; default: std::abort(); } - break; - case VSET: { - int VLEN = core_->arch().vsize() * 8; - int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); - switch (func3) { - case 0: // vector-vector - switch (func6) { - case 0: { - auto& vr1 = vRegFile_[rsrc0]; - auto& vr2 = vRegFile_[rsrc1]; - auto& vd = vRegFile_[rdest]; - auto& mask = vRegFile_[0]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t emask = *(uint8_t *)(mask.data() + i); - uint8_t value = emask & 0x1; - if (vmask || (!vmask && value)) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t emask = *(uint16_t *)(mask.data() + i); - uint16_t value = emask & 0x1; - if (vmask || (!vmask && value)) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t emask = *(uint32_t *)(mask.data() + i); - uint32_t value = emask & 0x1; - if (vmask || (!vmask && value)) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } - } break; - case 24: { - //vmseq - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = 
vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { + } + break; + case VSET: { + int VLEN = core_->arch().vsize() * 8; + int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); + switch (func3) { + case 0: // vector-vector + switch (func6) { + case 0: { + auto& vr1 = vRegFile_.at(rsrc0); + auto& vr2 = vRegFile_.at(rsrc1); + auto& vd = vRegFile_.at(rdest); + auto& mask = vRegFile_.at(0); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t emask = *(uint8_t *)(mask.data() + i); + uint8_t value = emask & 0x1; + if (vmask || (!vmask && value)) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + uint8_t result = first + second; + D(3, "Adding " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t emask = *(uint16_t *)(mask.data() + i); + uint16_t value = emask & 0x1; + if (vmask || (!vmask && value)) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + uint16_t result = first + second; + D(3, "Adding " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t emask = *(uint32_t *)(mask.data() + i); + uint32_t value = emask & 0x1; + if (vmask || (!vmask && value)) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first == second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + uint32_t result = first + second; + D(3, "Adding " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } - } break; - case 25: { - //vmsne - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } + } + } break; + case 24: { + //vmseq + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first == second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } - } break; - case 26: { - //vmsltu - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first == second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } - } break; - case 27: { - //vmslt - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - int8_t first = *(int8_t *)(vr1.data() + i); - int8_t second = *(int8_t *)(vr2.data() + i); - int8_t result = (first < second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - int16_t first = *(int16_t *)(vr1.data() + i); - int16_t second = *(int16_t *)(vr2.data() + i); - int16_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - int32_t first = *(int32_t *)(vr1.data() + i); - int32_t second = *(int32_t *)(vr2.data() + i); - int32_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int32_t *)(vd.data() + i) = result; - } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first == second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } - } break; - case 28: { - //vmsleu - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first <= second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } break; - case 29: { - //vmsle - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - int8_t first = *(int8_t *)(vr1.data() + i); - int8_t second = *(int8_t *)(vr2.data() + i); - int8_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - int16_t first = *(int16_t *)(vr1.data() + i); - int16_t second = *(int16_t *)(vr2.data() + i); - int16_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - int32_t first = *(int32_t *)(vr1.data() + i); - int32_t second = *(int32_t *)(vr2.data() + i); - int32_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int32_t *)(vd.data() + i) = result; - } - } - } break; - case 30: { - //vmsgtu - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first > second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - } - } break; - case 31: { - //vmsgt - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - int8_t first = *(int8_t *)(vr1.data() + i); - int8_t second = *(int8_t *)(vr2.data() + i); - int8_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - int16_t first = *(int16_t *)(vr1.data() + i); - int16_t second = *(int16_t *)(vr2.data() + i); - int16_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int16_t *)(vd.data() + i) = result; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - int32_t first = *(int32_t *)(vr1.data() + i); - int32_t second = *(int32_t *)(vr2.data() + i); - int32_t result = (first > second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); - *(int32_t *)(vd.data() + i) = result; - } - } - } break; - } - break; - case 2: { - switch (func6) { - case 24: { - // vmandnot - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 25: { - // vmand - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t 
*)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 26: { - // vmor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if 
(vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 27: { - //vmxor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - 
for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 28: { - //vmornot - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = (first_value | !second_value); - D(3, "Comparing " << 
first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 29: { - //vmnand - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 30: { - //vmnor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t 
*)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 31: { - //vmxnor - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t first_value = (first & 0x1); - uint8_t second_value = (second & 0x1); - uint8_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if 
(vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t first_value = (first & 0x1); - uint16_t second_value = (second & 0x1); - uint16_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t first_value = (first & 0x1); - uint32_t second_value = (second & 0x1); - uint32_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 37: { - //vmul - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t 
*)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; - case 45: { - // vmacc - auto &vr1 = vRegFile_[rsrc0]; - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t first = *(uint8_t *)(vr1.data() + i); - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) += result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t first = *(uint16_t *)(vr1.data() + i); - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) += result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t first = *(uint32_t *)(vr1.data() + i); - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) += result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } - } - } break; } } break; - case 6: { - switch (func6) { - case 0: { - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (rsdata[0] + second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint8_t *)(vd.data() 
+ i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (rsdata[0] + second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (rsdata[0] + second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } + case 25: { + //vmsne + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first != second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } - } break; - case 37: { - // vmul.vx - auto &vr2 = vRegFile_[rsrc1]; - auto &vd = vRegFile_[rdest]; - if (vtype_.vsew == 8) { - for (int i = 0; i < vl_; i++) { - uint8_t second = *(uint8_t *)(vr2.data() + i); - uint8_t result = (rsdata[0] * second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint8_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint8_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 16) { - for (int i = 0; i < vl_; i++) { - uint16_t second = *(uint16_t *)(vr2.data() + i); - uint16_t result = (rsdata[0] * second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint16_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint16_t *)(vd.data() + i) = 0; - } - } else if (vtype_.vsew == 32) { - for (int i = 0; i < vl_; i++) { - uint32_t second = *(uint32_t *)(vr2.data() + i); - uint32_t result = (rsdata[0] * second); - D(3, "Comparing " << rsdata[0] << " + " << second << " = " << result); - *(uint32_t *)(vd.data() + i) = result; - } - for (int i = vl_; i < VLMAX; i++) { - *(uint32_t *)(vd.data() + i) = 0; - } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first != second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first != second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } - } break; } } break; - case 7: { - vtype_.vill = 0; - vtype_.vediv = instr.getVediv(); - vtype_.vsew = instr.getVsew(); - vtype_.vlmul = instr.getVlmul(); - - D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0] << "VLMAX" << VLMAX); - - int s0 = rsdata[0]; - if (s0 <= VLMAX) { - vl_ = s0; - } else if (s0 < (2 * VLMAX)) { - vl_ = (int)ceil((s0 * 1.0) / 2.0); - } else if (s0 >= (2 * VLMAX)) { - vl_ = VLMAX; - } - rddata = vl_; + case 26: { + //vmsltu + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first < second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first < second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first < second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 27: { + //vmslt + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first < second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first < second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first < second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; + } + } + } break; + case 28: { + //vmsleu + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first <= second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first <= second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first <= second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 29: { + //vmsle + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first <= second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first <= second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first <= second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; + } + } + } break; + case 30: { + //vmsgtu + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first > second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first > second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first > second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + } + } break; + case 31: { + //vmsgt + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first > second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first > second) ? 1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first > second) ? 
1 : 0; + D(3, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; + } + } } break; - default: - std::abort(); } - } break; + break; + case 2: { + switch (func6) { + case 24: { + // vmandnot + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value & !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value & !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value & !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 25: { + // vmand + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < 
vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 26: { + // vmor + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + 
*(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 27: { + //vmxor + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " 
<< result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = (first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 28: { + //vmornot + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = (first_value | !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = (first_value | !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + 
uint32_t result = (first_value | !second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 29: { + //vmnand + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = !(first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = !(first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = !(first_value & second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 30: { + //vmnor + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if 
(vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = !(first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = !(first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = !(first_value | second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 31: { + //vmxnor + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); + uint8_t result = !(first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; 
+ } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); + uint16_t result = !(first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); + uint32_t result = !(first_value ^ second_value); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 37: { + //vmul + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } 
else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 45: { + // vmacc + auto &vr1 = vRegFile_.at(rsrc0); + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) += result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) += result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first * second); + D(3, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) += result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + } + } break; + case 6: { + switch (func6) { + case 0: { + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = 
(rsdata[i][0] + second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (rsdata[i][0] + second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (rsdata[i][0] + second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + case 37: { + // vmul.vx + auto &vr2 = vRegFile_.at(rsrc1); + auto &vd = vRegFile_.at(rdest); + if (vtype_.vsew == 8) { + for (int i = 0; i < vl_; i++) { + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (rsdata[i][0] * second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint8_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 16) { + for (int i = 0; i < vl_; i++) { + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (rsdata[i][0] * second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint16_t *)(vd.data() + i) = 0; + } + } else if (vtype_.vsew == 32) { + for (int i = 0; i < vl_; i++) { + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (rsdata[i][0] * second); + D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " 
<< result); + *(uint32_t *)(vd.data() + i) = result; + } + for (int i = vl_; i < VLMAX; i++) { + *(uint32_t *)(vd.data() + i) = 0; + } + } + } break; + } + } break; + case 7: { + vtype_.vill = 0; + vtype_.vediv = instr.getVediv(); + vtype_.vsew = instr.getVsew(); + vtype_.vlmul = instr.getVlmul(); + + D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX); + + int s0 = rsdata[0][0]; + if (s0 <= VLMAX) { + vl_ = s0; + } else if (s0 < (2 * VLMAX)) { + vl_ = (int)ceil((s0 * 1.0) / 2.0); + } else if (s0 >= (2 * VLMAX)) { + vl_ = VLMAX; + } + rddata[0] = vl_; + } break; default: std::abort(); } + } break; + default: + std::abort(); + } - if (rd_write) { - int rdt = instr.getRDType(); - switch (rdt) { - case 1: - if (rdest) { - D(2, "[" << std::dec << t << "] Dest Regs: r" << rdest << "=0x" << std::hex << std::hex << rddata); - iregs[rdest] = rddata; + if (rd_write) { + DPH(2, "Dest Reg: "); + int rdt = instr.getRDType(); + switch (rdt) { + case 1: + if (rdest) { + DPH(2, "r" << std::dec << rdest << "={"); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + iRegFile_.at(t)[rdest] = rddata[t]; + if (t) DPN(2, ", "); + DPN(2, "0x" << std::hex << rddata[t]); } - break; - case 2: - D(2, "[" << std::dec << t << "] Dest Regs: fr" << rdest << "=0x" << std::hex << std::hex << rddata); - fregs[rdest] = rddata; - break; - default: - break; + DPN(2, "}" << std::endl); + pipeline_state->used_iregs[rdest] = 1; } + break; + case 2: + DPH(2, "fr" << std::dec << rdest << "={"); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + fRegFile_.at(t)[rdest] = rddata[t]; + if (t) DPN(2, ", "); + DPN(2, "0x" << std::hex << rddata[t]); + } + DPN(2, "}" << std::endl); + pipeline_state->used_fregs[rdest] = 1; + break; + case 3: + pipeline_state->used_vregs[rdest] = 1; + break; + default: + break; } } diff --git a/sim/simX/exeunit.cpp b/sim/simX/exeunit.cpp new file 
mode 100644 index 00000000..5cdf22f3 --- /dev/null +++ b/sim/simX/exeunit.cpp @@ -0,0 +1,152 @@ +#include "exeunit.h" +#include +#include +#include +#include +#include +#include "debug.h" +#include "core.h" + +using namespace vortex; + +LsuUnit::LsuUnit(Core* core) + : ExeUnit("LSU") + , core_(core) + , num_threads_(core->arch().num_threads()) + , pending_dcache_(LSUQ_SIZE) + , fence_lock_(false) +{} + +void LsuUnit::handleCacheReponse(const MemRsp& response, uint32_t port_id) { + auto entry = pending_dcache_.at(response.tag); + entry.second.reset(port_id); // track remaining blocks + if (!entry.second.any()) { + auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); + entry.first.dcache_latency = latency; + this->schedule_output(entry.first, 1); + pending_dcache_.release(response.tag); + } +} + +void LsuUnit::step() { + if (fence_lock_) { + // wait for all pending memory operations to complete + if (!pending_dcache_.empty()) + return; + this->schedule_output(fence_state_, 1); + fence_lock_ = false; + } + + if (inputs_.empty()) + return; + + auto state = inputs_.top(); + + if (state.lsu.fence) { + // schedule fence lock + fence_state_ = state; + fence_lock_ = true; + inputs_.pop(); + return; + } + + // send dcache requests + if (!pending_dcache_.full()) { + state.dcache_latency = SimPlatform::instance().cycles(); + auto tag = pending_dcache_.allocate({state, state.tmask}); + for (uint32_t t = 0; t < num_threads_; ++t) { + if (!state.tmask.test(t)) + continue; + MemReq mem_req; + mem_req.addr = state.mem_addrs.at(t); + mem_req.write = state.lsu.store; + mem_req.tag = tag; + core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1); + } + inputs_.pop(); + } +} + +/////////////////////////////////////////////////////////////////////////////// + +AluUnit::AluUnit(Core*) : ExeUnit("ALU") {} + +void AluUnit::step() { + pipeline_state_t state; + if (!inputs_.try_pop(&state)) + return; + switch (state.alu.type) { + case AluType::ARITH: + 
this->schedule_output(state, 1); + break; + case AluType::BRANCH: + this->schedule_output(state, 1); + break; + case AluType::IMUL: + this->schedule_output(state, LATENCY_IMUL); + break; + case AluType::IDIV: + this->schedule_output(state, XLEN); + break; + } +} + +/////////////////////////////////////////////////////////////////////////////// + +CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {} + +void CsrUnit::step() { + pipeline_state_t state; + if (!inputs_.try_pop(&state)) + return; + this->schedule_output(state, 1); +} + +/////////////////////////////////////////////////////////////////////////////// + +FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {} + +void FpuUnit::step() { + pipeline_state_t state; + if (!inputs_.try_pop(&state)) + return; + switch (state.fpu.type) { + case FpuType::FNCP: + this->schedule_output(state, 1); + break; + case FpuType::FMA: + this->schedule_output(state, LATENCY_FMA); + break; + case FpuType::FDIV: + this->schedule_output(state, LATENCY_FDIV); + break; + case FpuType::FSQRT: + this->schedule_output(state, LATENCY_FSQRT); + break; + case FpuType::FCVT: + this->schedule_output(state, LATENCY_FCVT); + break; + } +} + +/////////////////////////////////////////////////////////////////////////////// + +GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {} + +void GpuUnit::step() { + pipeline_state_t state; + if (!inputs_.try_pop(&state)) + return; + switch (state.gpu.type) { + case GpuType::TMC: + case GpuType::WSPAWN: + case GpuType::SPLIT: + case GpuType::JOIN: + case GpuType::BAR: + this->schedule_output(state, 1); + break; + case GpuType::TEX: + /* TODO */ + break; + } +} \ No newline at end of file diff --git a/sim/simX/exeunit.h b/sim/simX/exeunit.h new file mode 100644 index 00000000..915089d3 --- /dev/null +++ b/sim/simX/exeunit.h @@ -0,0 +1,103 @@ +#pragma once + +#include +#include "pipeline.h" +#include "cache.h" + +namespace vortex { + +class Core; + +class ExeUnit { +protected: + const char* name_; + Queue inputs_; + Queue outputs_; + + 
void schedule_output(const pipeline_state_t& state, uint32_t delay) { + if (delay > 1) { + SimPlatform::instance().schedule( + [&](const pipeline_state_t& req) { + outputs_.push(req); + }, + state, + (delay - 1) + ); + } else { + outputs_.push(state); + } + } + +public: + typedef std::shared_ptr Ptr; + + ExeUnit(const char* name) : name_(name) {} + + virtual ~ExeUnit() {} + + void push_input(const pipeline_state_t& state) { + inputs_.push(state); + } + + bool pop_output(pipeline_state_t* state) { + return outputs_.try_pop(state); + } + + virtual void step() = 0; +}; + +/////////////////////////////////////////////////////////////////////////////// + +class LsuUnit : public ExeUnit { +private: + Core* core_; + uint32_t num_threads_; + HashTable> pending_dcache_; + pipeline_state_t fence_state_; + bool fence_lock_; + +public: + LsuUnit(Core*); + + void handleCacheReponse(const MemRsp& response, uint32_t port_id); + + void step(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class AluUnit : public ExeUnit { +public: + AluUnit(Core*); + + void step(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class CsrUnit : public ExeUnit { +public: + CsrUnit(Core*); + + void step(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class FpuUnit : public ExeUnit { +public: + FpuUnit(Core*); + + void step(); +}; + +/////////////////////////////////////////////////////////////////////////////// + +class GpuUnit : public ExeUnit { +public: + GpuUnit(Core*); + + void step(); +}; + +} \ No newline at end of file diff --git a/sim/simX/ibuffer.h b/sim/simX/ibuffer.h new file mode 100644 index 00000000..86bdeed7 --- /dev/null +++ b/sim/simX/ibuffer.h @@ -0,0 +1,39 @@ +#pragma once + +#include "pipeline.h" +#include + +namespace vortex { + +class IBuffer { +private: + std::queue entries_; + uint32_t capacity_; + +public: + IBuffer(uint32_t size) + : 
capacity_(size) + {} + + bool empty() const { + return entries_.empty(); + } + + bool full() const { + return (entries_.size() == capacity_); + } + + const pipeline_state_t& top() const { + return entries_.front(); + } + + void push(const pipeline_state_t& state) { + entries_.emplace(state); + } + + void pop() { + return entries_.pop(); + } +}; + +} \ No newline at end of file diff --git a/sim/simX/instr.h b/sim/simX/instr.h index a93dd61b..1a205478 100644 --- a/sim/simX/instr.h +++ b/sim/simX/instr.h @@ -113,15 +113,12 @@ private: int num_rsrcs_; bool has_imm_; int rdest_type_; - int isrc_mask_; - int fsrc_mask_; - int vsrc_mask_; Word imm_; int rsrc_type_[MAX_REG_SOURCES]; int rsrc_[MAX_REG_SOURCES]; int rdest_; Word func3_; - Word func7_; + Word func6_; //Vector Word vmask_; @@ -132,7 +129,7 @@ private: Word vlmul_; Word vsew_; Word vediv_; - Word func6_; + Word func7_; friend std::ostream &operator<<(std::ostream &, const Instr&); }; diff --git a/sim/simX/main.cpp b/sim/simX/main.cpp index 9af8ff02..a34ada0e 100644 --- a/sim/simX/main.cpp +++ b/sim/simX/main.cpp @@ -5,28 +5,26 @@ #include #include #include - -#include "debug.h" -#include "types.h" -#include "core.h" +#include "processor.h" #include "args.h" using namespace vortex; int main(int argc, char **argv) { + int ret; - std::string archString("rv32imf"); + std::string archStr("rv32imf"); + std::string imgFileName; int num_cores(NUM_CORES * NUM_CLUSTERS); int num_warps(NUM_WARPS); - int num_threads(NUM_THREADS); - std::string imgFileName; + int num_threads(NUM_THREADS); bool showHelp(false); bool showStats(false); bool riscv_test(false); /* Read the command line arguments. 
*/ CommandLineArgFlag fh("-h", "--help", "", showHelp); - CommandLineArgSetter fa("-a", "--arch", "", archString); + CommandLineArgSetter fa("-a", "--arch", "", archStr); CommandLineArgSetter fi("-i", "--image", "", imgFileName); CommandLineArgSetter fc("-c", "--cores", "", num_cores); CommandLineArgSetter fw("-w", "--warps", "", num_warps); @@ -48,62 +46,18 @@ int main(int argc, char **argv) { return 0; } - ArchDef arch(archString, num_cores, num_warps, num_threads); - - Decoder decoder(arch); - MemoryUnit mu(0, arch.wsize(), true); + std::cout << "Running " << imgFileName << "..." << std::endl; - RAM ram((1<<12), (1<<20)); - - std::string program_ext(fileExtension(imgFileName.c_str())); - if (program_ext == "bin") { - ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR); - } else if (program_ext == "hex") { - ram.loadHexImage(imgFileName.c_str()); - } else { - std::cout << "*** error: only *.bin or *.hex images supported." << std::endl; + if (!SimPlatform::instance().initialize()) return -1; - } - mu.attach(ram, 0, 0xFFFFFFFF); + { + ArchDef arch(archStr, num_cores, num_warps, num_threads); + Processor processor(arch); + ret = processor.run(imgFileName, riscv_test, showStats); + } - struct stat hello; - fstat(0, &hello); + SimPlatform::instance().finalize(); - std::vector> cores(num_cores); - for (int i = 0; i < num_cores; ++i) { - cores[i] = std::make_shared(arch, decoder, mu, i); - } - - bool running; - int exitcode = 0; - do { - running = false; - for (auto& core : cores) { - core->step(); - if (core->running()) { - running = true; - } - if (core->check_ebreak()) { - exitcode = core->getIRegValue(3); - running = false; - break; - } - } - } while (running); - - if (riscv_test) { - if (1 == exitcode) { - std::cout << "Passed." << std::endl; - exitcode = 0; - } else { - std::cout << "Failed." 
<< std::endl; - } - } else { - if (exitcode != 0) { - std::cout << "*** error: exitcode=" << exitcode << std::endl; - } - } - - return exitcode; + return ret; } diff --git a/sim/simX/memsim.cpp b/sim/simX/memsim.cpp new file mode 100644 index 00000000..c377972d --- /dev/null +++ b/sim/simX/memsim.cpp @@ -0,0 +1,58 @@ +#include "memsim.h" +#include +#include +#include "constants.h" + +using namespace vortex; + +class MemSim::Impl { +private: + MemSim* simobject_; + std::vector> inputs_; + uint32_t latency_; + +public: + Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency) + : simobject_(simobject) + , inputs_(num_banks) + , latency_(latency) + {} + + void handleMemRequest(const MemReq& mem_req, uint32_t port_id) { + inputs_.at(port_id).push(mem_req); + } + + void step(uint64_t /*cycle*/) { + for (uint32_t i = 0, n = inputs_.size(); i < n; ++i) { + auto& queue = inputs_.at(i); + if (queue.empty()) + continue; + auto& entry = queue.front(); + if (!entry.write) { + MemRsp mem_rsp; + mem_rsp.tag = entry.tag; + simobject_->MemRspPorts.at(i).send(mem_rsp, latency_); + } + queue.pop(); + } + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +MemSim::MemSim(const SimContext& ctx, + uint32_t num_banks, + uint32_t latency) + : SimObject(ctx, "MemSim") + , impl_(new Impl(this, num_banks, latency)) + , MemReqPorts(num_banks, {this, impl_, &Impl::handleMemRequest}) + , MemRspPorts(num_banks, this) +{} + +MemSim::~MemSim() { + delete impl_; +} + +void MemSim::step(uint64_t cycle) { + impl_->step(cycle); +} \ No newline at end of file diff --git a/sim/simX/memsim.h b/sim/simX/memsim.h new file mode 100644 index 00000000..24d8e6ca --- /dev/null +++ b/sim/simX/memsim.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +namespace vortex { + +struct MemReq { + uint64_t addr; + uint32_t tag; + bool write; +}; + +struct MemRsp { + uint32_t tag; +}; + +class MemSim : public SimObject{ +private: + class Impl; + Impl* impl_; 
+ +public: + + MemSim(const SimContext& ctx, uint32_t num_inputs, uint32_t latency); + ~MemSim(); + + void step(uint64_t cycle); + + std::vector> MemReqPorts; + std::vector> MemRspPorts; +}; + +}; \ No newline at end of file diff --git a/sim/simX/pipeline.cpp b/sim/simX/pipeline.cpp deleted file mode 100644 index c54977a0..00000000 --- a/sim/simX/pipeline.cpp +++ /dev/null @@ -1,63 +0,0 @@ -#include -#include "pipeline.h" - -using namespace vortex; - -namespace vortex { -std::ostream &operator<<(std::ostream &os, const Pipeline& pipeline) { - os << pipeline.name_ << ": valid=" << pipeline.valid << std::endl; - os << pipeline.name_ << ": stalled=" << pipeline.stalled << std::endl; - os << pipeline.name_ << ": stall_warp=" << pipeline.stall_warp << std::endl; - os << pipeline.name_ << ": wid=" << pipeline.wid << std::endl; - os << pipeline.name_ << ": PC=" << std::hex << pipeline.PC << std::endl; - os << pipeline.name_ << ": used_iregs=" << pipeline.used_iregs << std::endl; - os << pipeline.name_ << ": used_fregs=" << pipeline.used_fregs << std::endl; - os << pipeline.name_ << ": used_vregs=" << pipeline.used_vregs << std::endl; - return os; -} -} - -Pipeline::Pipeline(const char* name) -: name_(name) { - this->clear(); -} - -void Pipeline::clear() { - valid = false; - stalled = false; - stall_warp = false; - wid = 0; - PC = 0; - used_iregs.reset(); - used_fregs.reset(); - used_vregs.reset(); -} - -bool Pipeline::enter(Pipeline *drain) { - if (drain) { - if (drain->stalled) { - this->stalled = true; - return false; - } - drain->valid = false; - } - this->stalled = false; - if (!this->valid) - return false; - return true; -} - -void Pipeline::next(Pipeline *drain) { - if (drain) { - drain->valid = this->valid; - drain->stalled = this->stalled; - drain->stall_warp = this->stall_warp; - drain->wid = this->wid; - drain->PC = this->PC; - drain->rdest = this->rdest; - drain->rdest_type = this->rdest_type; - drain->used_iregs = this->used_iregs; - drain->used_fregs = 
this->used_fregs; - drain->used_vregs = this->used_vregs; - } -} \ No newline at end of file diff --git a/sim/simX/pipeline.h b/sim/simX/pipeline.h index f8899a63..82735c2a 100644 --- a/sim/simX/pipeline.h +++ b/sim/simX/pipeline.h @@ -2,47 +2,75 @@ #pragma once #include +#include #include #include "types.h" #include "debug.h" namespace vortex { -class Instr; - -class Pipeline { -public: - Pipeline(const char* name); - - void clear(); - - bool enter(Pipeline* drain); - - void next(Pipeline* drain); - - //-- - bool valid; - - //-- - bool stalled; - bool stall_warp; - +struct pipeline_state_t { //-- - int wid; - Word PC; + int wid; + ThreadMask tmask; + Word PC; //-- - int rdest_type; - int rdest; - RegMask used_iregs; - RegMask used_fregs; - RegMask used_vregs; + bool stall_warp; + int rdest_type; + int rdest; + RegMask used_iregs; + RegMask used_fregs; + RegMask used_vregs; -private: + //- + ExeType exe_type; + std::vector mem_addrs; + + //-- + union { + struct { + uint8_t load : 1; + uint8_t store: 1; + uint8_t fence : 1; + uint8_t prefetch: 1; + } lsu; + struct { + AluType type; + } alu; + struct { + FpuType type; + } fpu; + struct { + GpuType type; + } gpu; + }; + // stats + uint64_t icache_latency; + uint64_t dcache_latency; +}; + +class PipelineStage : public Queue { +protected: const char* name_; + friend std::ostream &operator<<(std::ostream &, const pipeline_state_t&); - friend std::ostream &operator<<(std::ostream &, const Pipeline&); -}; +public: + PipelineStage(const char* name = nullptr) + : name_(name) + {} +}; + +inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) { + os << "stall_warp=" << state.stall_warp; + os << ", wid=" << state.wid; + os << ", PC=" << std::hex << state.PC; + os << ", used_iregs=" << state.used_iregs; + os << ", used_fregs=" << state.used_fregs; + os << ", used_vregs=" << state.used_vregs; + os << std::endl; + return os; +} } \ No newline at end of file diff --git a/sim/simX/processor.h 
b/sim/simX/processor.h new file mode 100644 index 00000000..50671953 --- /dev/null +++ b/sim/simX/processor.h @@ -0,0 +1,189 @@ +#pragma once + +#include "constants.h" +#include "debug.h" +#include "types.h" +#include "core.h" + +namespace vortex { + +class Processor { +private: + ArchDef arch_; + Decoder decoder_; + MemoryUnit mu_; + RAM ram_; + std::vector cores_; + std::vector l2caches_; + std::vector::Ptr> l2_mem_switches_; + Cache::Ptr l3cache_; + Switch::Ptr l3_mem_switch_; + MemSim::Ptr memsim_; + +public: + Processor(const ArchDef& arch) + : arch_(arch) + , decoder_(arch) + , mu_(0, arch.wsize(), true) + , ram_((1<<12), (1<<20)) + , cores_(arch.num_cores()) + , l2caches_(NUM_CLUSTERS) + , l2_mem_switches_(NUM_CLUSTERS) + { + uint32_t num_cores = arch.num_cores(); + uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; + + // bind RAM to memory unit + mu_.attach(ram_, 0, 0xFFFFFFFF); + + // create cores + for (uint32_t i = 0; i < num_cores; ++i) { + cores_.at(i) = Core::Create(arch, decoder_, mu_, i); + } + + // connect memory sub-systen + memsim_ = MemSim::Create(1, MEM_LATENCY); + std::vector*> mem_req_ports(1); + std::vector*> mem_rsp_ports(1); + mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); + mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); + + if (L3_ENABLE) { + l3cache_ = Cache::Create("l3cache", CacheConfig{ + log2ceil(L3_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L3_NUM_BANKS, // number of banks + L3_NUM_PORTS, // number of ports + NUM_CLUSTERS, // request size + true, // write-throught + 0, // victim size + L3_MSHR_SIZE, // mshr + 2, // pipeline latency + }); + mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort); + l3cache_->MemReqPort.bind(mem_req_ports.at(0)); + + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); + mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); 
+ } + } else if (NUM_CLUSTERS > 1) { + l3_mem_switch_ = Switch::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS); + mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn); + l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0)); + + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); + mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); + } + } + + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + if (L2_ENABLE) { + auto& l2cache = l2caches_.at(i); + l2cache = Cache::Create("l2cache", CacheConfig{ + log2ceil(L2_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L2_NUM_BANKS, // number of banks + L2_NUM_PORTS, // number of ports + NUM_CORES, // request size + true, // write-throught + 0, // victim size + L2_MSHR_SIZE, // mshr + 2, // pipeline latency + }); + mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); + l2cache->MemReqPort.bind(mem_req_ports.at(i)); + + mem_req_ports.resize(cores_per_cluster); + mem_rsp_ports.resize(cores_per_cluster); + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); + mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); + } + } else if (cores_per_cluster > 1) { + auto& l2_mem_switch = l2_mem_switches_.at(i); + l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES); + mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); + l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); + + mem_req_ports.resize(cores_per_cluster); + mem_rsp_ports.resize(cores_per_cluster); + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); + mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); + } + } + + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + auto& core = cores_.at((i * NUM_CLUSTERS) + j); + mem_rsp_ports.at(i)->bind(&core->MemRspPort); + 
core->MemReqPort.bind(mem_req_ports.at(j)); + } + } + } + + ~Processor() {} + + int run(const std::string& program, bool riscv_test, bool /*showStats*/) { + { + std::string program_ext(fileExtension(program.c_str())); + if (program_ext == "bin") { + ram_.loadBinImage(program.c_str(), STARTUP_ADDR); + } else if (program_ext == "hex") { + ram_.loadHexImage(program.c_str()); + } else { + std::cout << "*** error: only *.bin or *.hex images supported." << std::endl; + return -1; + } + } + + bool running; + int exitcode = 0; + do { + SimPlatform::instance().step(); + + running = false; + for (auto& core : cores_) { + if (core->running()) { + running = true; + } + if (core->check_ebreak()) { + exitcode = core->getIRegValue(3); + running = false; + break; + } + } + } while (running); + + // get error status + + if (riscv_test) { + if (1 == exitcode) { + std::cout << "Passed." << std::endl; + exitcode = 0; + } else { + std::cout << "Failed." << std::endl; + } + } else { + if (exitcode != 0) { + std::cout << "*** error: exitcode=" << exitcode << std::endl; + } + } + + return exitcode; + } + +}; + +} \ No newline at end of file diff --git a/sim/simX/scoreboard.h b/sim/simX/scoreboard.h new file mode 100644 index 00000000..0e0e0577 --- /dev/null +++ b/sim/simX/scoreboard.h @@ -0,0 +1,71 @@ +#pragma once + +#include "pipeline.h" +#include + +namespace vortex { + +class Scoreboard { +private: + std::vector in_use_iregs_; + std::vector in_use_fregs_; + std::vector in_use_vregs_; + +public: + Scoreboard(const ArchDef &arch) + : in_use_iregs_(arch.num_warps()) + , in_use_fregs_(arch.num_warps()) + , in_use_vregs_(arch.num_warps()) + { + for (int w = 0; w < arch.num_warps(); ++w) { + in_use_iregs_.at(w).reset(); + in_use_fregs_.at(w).reset(); + in_use_vregs_.at(w).reset(); + } + } + + bool in_use(const pipeline_state_t& state) const { + return (state.used_iregs & in_use_iregs_.at(state.wid)) != 0 + || (state.used_fregs & in_use_fregs_.at(state.wid)) != 0 + || (state.used_vregs & 
in_use_vregs_.at(state.wid)) != 0; + } + + void reserve(const pipeline_state_t& state) { + if (!state.rdest) + return; + + switch (state.rdest_type) { + case 1: + in_use_iregs_.at(state.wid).set(state.rdest); + break; + case 2: + in_use_fregs_.at(state.wid).set(state.rdest); + break; + case 3: + in_use_vregs_.at(state.wid).set(state.rdest); + break; + default: + break; + } + } + + void release(const pipeline_state_t& state) { + if (!state.rdest) + return; + switch (state.rdest_type) { + case 1: + in_use_iregs_.at(state.wid).reset(state.rdest); + break; + case 2: + in_use_fregs_.at(state.wid).reset(state.rdest); + break; + case 3: + in_use_vregs_.at(state.wid).reset(state.rdest); + break; + default: + break; + } + } +}; + +} \ No newline at end of file diff --git a/sim/simX/types.h b/sim/simX/types.h index ca732040..3dabfe3e 100644 --- a/sim/simX/types.h +++ b/sim/simX/types.h @@ -2,7 +2,10 @@ #include #include +#include +#include #include +#include namespace vortex { @@ -14,9 +17,242 @@ typedef uint32_t Addr; typedef uint32_t Size; typedef std::bitset<32> RegMask; - typedef std::bitset<32> ThreadMask; - typedef std::bitset<32> WarpMask; +enum class ExeType { + ALU, + LSU, + CSR, + FPU, + GPU, + MAX, +}; + +enum class AluType { + ARITH, + BRANCH, + IMUL, + IDIV, +}; + +enum class FpuType { + FNCP, + FMA, + FDIV, + FSQRT, + FCVT, +}; + +enum class GpuType { + TMC, + WSPAWN, + SPLIT, + JOIN, + BAR, + TEX, +}; + +enum class ArbiterType { + Priority, + RoundRobin +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class Queue { +protected: + std::queue queue_; + +public: + Queue() {} + + bool empty() const { + return queue_.empty(); + } + + const T& top() const { + return queue_.front(); + } + + void push(const T& value) { + queue_.push(value); + } + + void pop() { + queue_.pop(); + } + + bool try_pop(T* value) { + if (queue_.empty()) + return false; + *value = queue_.front(); + queue_.pop(); + return true; + } +}; + 
// Fixed-size slot table that hands out small integer tags.
//
// Despite the name this is not a hashing container: allocate() stores a
// value in the first free slot and returns that slot's index, which the
// caller later uses with at()/release()/remove(). It is used to track
// in-flight requests by tag.
//
// T must be default-constructible and copy-assignable.
template <typename T>
class HashTable {
private:
  std::vector<std::pair<bool, T>> entries_; // (occupied flag, payload) per slot
  uint32_t count_;                          // number of occupied slots

public:
  // Creates a table with `size` free slots.
  HashTable(uint32_t size)
    : entries_(size)
    , count_(0)
  {}

  // True when no slot is occupied.
  bool empty() const {
    return (0 == count_);
  }

  // True when every slot is occupied.
  bool full() const {
    return (count_ == entries_.size());
  }

  // True when slot `index` is occupied. Throws std::out_of_range for an
  // index outside the table.
  bool contains(uint32_t index) const {
    return entries_.at(index).first;
  }

  // Returns the payload of an occupied slot (asserts occupancy).
  const T& at(uint32_t index) const {
    auto& entry = entries_.at(index);
    assert(entry.first);
    return entry.second;
  }

  // Returns the payload of an occupied slot (asserts occupancy).
  T& at(uint32_t index) {
    auto& entry = entries_.at(index);
    assert(entry.first);
    return entry.second;
  }

  // Stores `value` in the lowest free slot and returns its index.
  // Returns uint32_t(-1) when the table is full; callers are expected to
  // check full() first.
  uint32_t allocate(const T& value) {
    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
      auto& entry = entries_.at(i);
      if (!entry.first) {
        entry.first = true;
        entry.second = value;
        ++count_;
        return i;
      }
    }
    return -1;
  }

  // Frees an occupied slot.
  void release(uint32_t index) {
    auto& entry = entries_.at(index);
    assert(entry.first);
    entry.first = false;
    // Keep the occupancy counter in sync; without this empty() never
    // becomes true again and full() eventually sticks at true.
    --count_;
  }

  // Copies the payload of an occupied slot into `*value`, then frees it.
  void remove(uint32_t index, T* value) {
    auto& entry = entries_.at(index);
    assert(entry.first);
    *value = entry.second;
    entry.first = false;
    --count_;
  }
};
SimObject>(ctx, name) + , type_(type) + , cur_req_(num_inputs) + , delay_(delay) + , cursor_(0) + , ReqIn(num_inputs, {this, this, &Switch::handleIncomingRequest}) + , ReqOut(this) + , RspIn(this, this, &Switch::handleIncomingResponse) + , RspOut(num_inputs, this) + { + assert(delay_ != 0); + assert(num_inputs <= MaxInputs); + } + + void step(uint64_t /*cycle*/) { + if (cur_req_.valid.any()) { + reqs_.push(cur_req_); + cur_req_.valid.reset(); + } + + while (!reqs_.empty()) { + auto& entry = reqs_.front(); + bool found = false; + for (uint32_t i = 0, n = entry.data.size(); i < n; ++i) { + auto j = (cursor_ + i) % n; + if (entry.valid.test(j)) { + auto& req = entry.data.at(j); + addr_table_[req.tag] = j; + ReqOut.send(req, delay_); + entry.valid.reset(j); + this->update_cursor(j); + found = true; + break; + } + } + if (found) + break; + reqs_.pop(); + } + + if (!rsps_.empty()) { + auto& rsp = rsps_.front(); + auto port_id = addr_table_.at(rsp.tag); + RspOut.at(port_id).send(rsp, 1); + rsps_.pop(); + } + } + + void update_cursor(uint32_t grant) { + if (type_ == ArbiterType::RoundRobin) { + cursor_ = grant + 1; + } + } + + std::vector> ReqIn; + MasterPort ReqOut; + SlavePort RspIn; + std::vector> RspOut; +}; + } \ No newline at end of file diff --git a/sim/simX/warp.cpp b/sim/simX/warp.cpp index a505fe5c..0c989d0c 100644 --- a/sim/simX/warp.cpp +++ b/sim/simX/warp.cpp @@ -12,25 +12,21 @@ using namespace vortex; Warp::Warp(Core *core, Word id) : id_(id) - , core_(core) { + , core_(core) + , active_(false) + , PC_(STARTUP_ADDR) + , tmask_(0) { iRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); fRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); vRegFile_.resize(core_->arch().num_regs(), std::vector(core_->arch().vsize(), 0)); - this->clear(); } -void Warp::clear() { - PC_ = STARTUP_ADDR; - tmask_.reset(); - active_ = false; -} - -void Warp::step(Pipeline *pipeline) { +void 
Warp::eval(pipeline_state_t *pipeline_state) { assert(tmask_.any()); DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask="); for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) - DPN(2, tmask_[n-i-1]); + DPN(2, tmask_.test(n-i-1)); DPN(2, "\n"); /* Fetch and decode. */ @@ -38,55 +34,24 @@ void Warp::step(Pipeline *pipeline) { Word fetched = core_->icache_fetch(PC_); auto instr = core_->decoder().decode(fetched, PC_); - // Update pipeline - pipeline->valid = true; - pipeline->PC = PC_; - pipeline->rdest = instr->getRDest(); - pipeline->rdest_type = instr->getRDType(); - pipeline->used_iregs.reset(); - pipeline->used_fregs.reset(); - pipeline->used_vregs.reset(); - - switch (pipeline->rdest_type) { - case 1: - pipeline->used_iregs[pipeline->rdest] = 1; - break; - case 2: - pipeline->used_fregs[pipeline->rdest] = 1; - break; - case 3: - pipeline->used_vregs[pipeline->rdest] = 1; - break; - default: - break; - } - - for (int i = 0; i < instr->getNRSrc(); ++i) { - int type = instr->getRSType(i); - int reg = instr->getRSrc(i); - switch (type) { - case 1: - pipeline->used_iregs[reg] = 1; - break; - case 2: - pipeline->used_fregs[reg] = 1; - break; - case 3: - pipeline->used_vregs[reg] = 1; - break; - default: - break; - } - } + // Update state + pipeline_state->wid = id_; + pipeline_state->PC = PC_; + pipeline_state->tmask = tmask_; + pipeline_state->rdest = instr->getRDest(); + pipeline_state->rdest_type = instr->getRDType(); + pipeline_state->used_iregs.reset(); + pipeline_state->used_fregs.reset(); + pipeline_state->used_vregs.reset(); // Execute - this->execute(*instr, pipeline); + this->execute(*instr, pipeline_state); D(4, "Register state:"); for (int i = 0; i < core_->arch().num_regs(); ++i) { DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); for (int j = 0; j < core_->arch().num_threads(); ++j) { - DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_[j][i] << std::setfill(' ') << ' '); + 
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_.at(j).at(i) << std::setfill(' ') << ' '); } DPN(4, std::endl); } diff --git a/sim/simX/warp.h b/sim/simX/warp.h index 7473d858..99b372ca 100644 --- a/sim/simX/warp.h +++ b/sim/simX/warp.h @@ -9,7 +9,7 @@ namespace vortex { class Core; class Instr; -class Pipeline; +class pipeline_state_t; struct DomStackEntry { DomStackEntry(const ThreadMask &tmask, Word PC) : tmask(tmask) @@ -41,8 +41,6 @@ struct vtype { class Warp { public: Warp(Core *core, Word id); - - void clear(); bool active() const { return active_; @@ -71,7 +69,7 @@ public: } void setTmask(size_t index, bool value) { - tmask_[index] = value; + tmask_.set(index, value); active_ = tmask_.any(); } @@ -82,18 +80,18 @@ public: } Word getIRegValue(int reg) const { - return iRegFile_[0][reg]; + return iRegFile_.at(0).at(reg); } - void step(Pipeline *); + void eval(pipeline_state_t *); private: - void execute(const Instr &instr, Pipeline *); + void execute(const Instr &instr, pipeline_state_t *pipeline_state); Word id_; - bool active_; Core *core_; + bool active_; Word PC_; ThreadMask tmask_; From bd70afa6883e1e5427ea3755681ee78357003eef Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 14 Nov 2021 04:44:25 -0500 Subject: [PATCH 02/27] cache multi-porting fix - ensure per-bank uniform rw --- hw/rtl/cache/VX_cache_define.vh | 9 ++++----- hw/rtl/cache/VX_core_req_bank_sel.sv | 11 +++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index c0709cce..8af2921b 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -24,7 +24,7 @@ `define WORD_ADDR_WIDTH (32-`CLOG2(WORD_SIZE)) `define MEM_ADDR_WIDTH (32-`CLOG2(CACHE_LINE_SIZE)) -`define LINE_ADDR_WIDTH (`MEM_ADDR_WIDTH-`BANK_SELECT_BITS) +`define LINE_ADDR_WIDTH (`MEM_ADDR_WIDTH-`CLOG2(NUM_BANKS)) // Word select `define WORD_SELECT_BITS `CLOG2(`WORDS_PER_LINE) @@ -46,10 +46,9 @@ 
`define TAG_SELECT_ADDR_START (1+`LINE_SELECT_ADDR_END) `define TAG_SELECT_ADDR_END (`WORD_ADDR_WIDTH-1) -`define BANK_SELECT_ADDR(x) x[`BANK_SELECT_ADDR_END : `BANK_SELECT_ADDR_START] - -`define LINE_SELECT_ADDR0(x) x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START] -`define LINE_SELECT_ADDRX(x) {x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START], x[`BANK_SELECT_ADDR_START-1 : 1+`WORD_SELECT_ADDR_END]} +`define SELECT_BANK_ID(x) x[`BANK_SELECT_ADDR_END : `BANK_SELECT_ADDR_START] +`define SELECT_LINE_ADDR0(x) x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START] +`define SELECT_LINE_ADDRX(x) {x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START], x[`BANK_SELECT_ADDR_START-1 : 1+`WORD_SELECT_ADDR_END]} `define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS] diff --git a/hw/rtl/cache/VX_core_req_bank_sel.sv b/hw/rtl/cache/VX_core_req_bank_sel.sv index 01c9f12b..1197edfb 100644 --- a/hw/rtl/cache/VX_core_req_bank_sel.sv +++ b/hw/rtl/cache/VX_core_req_bank_sel.sv @@ -57,16 +57,16 @@ module VX_core_req_bank_sel #( for (genvar i = 0; i < NUM_REQS; i++) begin if (BANK_ADDR_OFFSET == 0) begin - assign core_req_line_addr[i] = `LINE_SELECT_ADDR0(core_req_addr[i]); + assign core_req_line_addr[i] = `SELECT_LINE_ADDR0(core_req_addr[i]); end else begin - assign core_req_line_addr[i] = `LINE_SELECT_ADDRX(core_req_addr[i]); + assign core_req_line_addr[i] = `SELECT_LINE_ADDRX(core_req_addr[i]); end assign core_req_wsel[i] = core_req_addr[i][`UP(`WORD_SELECT_BITS)-1:0]; end for (genvar i = 0; i < NUM_REQS; ++i) begin if (NUM_BANKS > 1) begin - assign core_req_bid[i] = `BANK_SELECT_ADDR(core_req_addr[i]); + assign core_req_bid[i] = `SELECT_BANK_ID(core_req_addr[i]); end else begin assign core_req_bid[i] = 0; end @@ -88,6 +88,7 @@ module VX_core_req_bank_sel #( if (NUM_PORTS > 1) begin reg [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_line_addr_r; + reg [NUM_BANKS-1:0] per_bank_rw_r; wire [NUM_REQS-1:0] core_req_line_match; always @(*) begin @@ -95,12 +96,14 @@ module VX_core_req_bank_sel 
#( for (integer i = NUM_REQS-1; i >= 0; --i) begin if (core_req_valid[i]) begin per_bank_line_addr_r[core_req_bid[i]] = core_req_line_addr[i]; + per_bank_rw_r[core_req_bid[i]] = core_req_rw[i]; end end end for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_req_line_match[i] = (core_req_line_addr[i] == per_bank_line_addr_r[core_req_bid[i]]); + assign core_req_line_match[i] = (core_req_line_addr[i] == per_bank_line_addr_r[core_req_bid[i]]) + && (core_req_rw[i] == per_bank_rw_r[core_req_bid[i]]); end if (NUM_PORTS < NUM_REQS) begin From 9656779d48b2e5d20a4390d892329ff8c5773f3f Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 14 Nov 2021 04:45:06 -0500 Subject: [PATCH 03/27] minor update --- driver/common/opae.cpp | 10 +++++----- driver/rtlsim/vortex.cpp | 8 ++++---- driver/simx/vortex.cpp | 12 ++++++------ hw/rtl/VX_config.vh | 8 ++++++-- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/driver/common/opae.cpp b/driver/common/opae.cpp index de6e0fbb..aa4bf933 100755 --- a/driver/common/opae.cpp +++ b/driver/common/opae.cpp @@ -67,7 +67,7 @@ typedef struct vx_buffer_ { size_t size; } vx_buffer_t; -inline size_t align_size(size_t size, size_t alignment) { +inline size_t aligned_size(size_t size, size_t alignment) { assert(0 == (alignment & (alignment - 1))); return (size + alignment - 1) & ~(alignment - 1); } @@ -288,7 +288,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) vx_device_t *device = ((vx_device_t*)hdevice); size_t dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (device->mem_allocation + asize > dev_mem_size) return -1; @@ -313,7 +313,7 @@ extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hb vx_device_t *device = ((vx_device_t*)hdevice); - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); res = 
fpgaPrepareBuffer(device->fpga, asize, &host_ptr, &wsid, 0); if (FPGA_OK != res) { @@ -439,7 +439,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si vx_device_t *device = ((vx_device_t*)buffer->hdevice); size_t dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); // check alignment if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE)) @@ -480,7 +480,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, vx_device_t *device = ((vx_device_t*)buffer->hdevice); size_t dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); // check alignment if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE)) diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index 64fcd72e..cfed5a97 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -22,7 +22,7 @@ public: vx_buffer(size_t size, vx_device* device) : size_(size) , device_(device) { - auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE); + auto aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); data_ = malloc(aligned_asize); } @@ -66,7 +66,7 @@ public: int alloc_local_mem(size_t size, size_t* dev_maddr) { auto dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = mem_allocation_; @@ -75,7 +75,7 @@ public: } int upload(const void* src, size_t dest_addr, size_t size, size_t src_offset) { - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (dest_addr + asize > ram_.size()) return -1; @@ -93,7 +93,7 @@ public: } int download(void* dest, size_t src_addr, size_t size, size_t dest_offset) { - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + size_t asize = aligned_size(size, 
CACHE_BLOCK_SIZE); if (src_addr + asize > ram_.size()) return -1; diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 5ad4242b..5c31cb87 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -25,7 +25,7 @@ public: vx_buffer(size_t size, vx_device* device) : size_(size) , device_(device) { - auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE); + auto aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); data_ = malloc(aligned_asize); } @@ -70,7 +70,7 @@ public: mem_allocation_ = ALLOC_BASE_ADDR; mmu_.attach(ram_, 0, 0xffffffff); for (int i = 0; i < arch_.num_cores(); ++i) { - cores_[i] = std::make_shared(arch_, decoder_, mmu_, i); + cores_.at(i) = std::make_shared(arch_, decoder_, mmu_, i); } } @@ -84,7 +84,7 @@ public: int alloc_local_mem(size_t size, size_t* dev_maddr) { auto dev_mem_size = LOCAL_MEM_SIZE; - auto asize = align_size(size, CACHE_BLOCK_SIZE); + auto asize = aligned_size(size, CACHE_BLOCK_SIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = mem_allocation_; @@ -93,7 +93,7 @@ public: } int upload(const void* src, size_t dest_addr, size_t size, size_t src_offset) { - auto asize = align_size(size, CACHE_BLOCK_SIZE); + auto asize = aligned_size(size, CACHE_BLOCK_SIZE); if (dest_addr + asize > ram_.size()) return -1; @@ -108,7 +108,7 @@ public: } int download(void* dest, size_t src_addr, size_t size, size_t dest_offset) { - size_t asize = align_size(size, CACHE_BLOCK_SIZE); + size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (src_addr + asize > ram_.size()) return -1; @@ -126,7 +126,7 @@ public: mutex_.lock(); for (int i = 0; i < arch_.num_cores(); ++i) { - cores_[i]->clear(); + cores_.at(i)->clear(); } is_running_ = true; mutex_.unlock(); diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index b52a1ab2..de58d9ee 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -1,6 +1,10 @@ `ifndef VX_CONFIG `define VX_CONFIG +`ifndef XLEN +`define XLEN 32 +`endif + `ifndef NUM_CLUSTERS `define 
NUM_CLUSTERS 1 `endif @@ -373,7 +377,7 @@ // Number of banks `ifndef L2_NUM_BANKS -`define L2_NUM_BANKS `MIN(`NUM_CORES, 4) +`define L2_NUM_BANKS ((`NUM_CORES < 4) ? `NUM_CORES : 4) `endif // Number of ports per bank @@ -415,7 +419,7 @@ // Number of banks `ifndef L3_NUM_BANKS -`define L3_NUM_BANKS `MIN(`NUM_CLUSTERS, 4) +`define L3_NUM_BANKS ((`NUM_CLUSTERS < 4) ? `NUM_CORES : 4) `endif // Number of ports per bank From 808bddb586124bbc99b482cc6ad0d37fafce5cdd Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 14 Nov 2021 08:52:34 -0500 Subject: [PATCH 04/27] simx timing simulation refactoring --- sim/common/simobject.h | 328 +++++++++++++---------------- sim/simX/Makefile | 2 +- sim/simX/cache.cpp | 77 +++---- sim/simX/cache.h | 10 +- sim/simX/core.cpp | 169 +++++++++------ sim/simX/core.h | 33 +-- sim/simX/debug.h | 34 ++- sim/simX/decode.cpp | 43 +--- sim/simX/decode.h | 2 +- sim/simX/execute.cpp | 464 +++++++++++++++++++++-------------------- sim/simX/exeunit.cpp | 87 +++++--- sim/simX/exeunit.h | 23 +- sim/simX/instr.h | 25 +-- sim/simX/main.cpp | 40 +++- sim/simX/memsim.cpp | 22 +- sim/simX/pipeline.h | 54 +++-- sim/simX/processor.cpp | 141 +++++++++++++ sim/simX/processor.h | 182 +--------------- sim/simX/scoreboard.h | 70 +++++-- sim/simX/types.h | 194 +++++++++++++---- sim/simX/warp.cpp | 22 +- sim/vlsim/opae_sim.cpp | 4 +- 22 files changed, 1123 insertions(+), 903 deletions(-) create mode 100644 sim/simX/processor.cpp diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 68bccc87..487d385c 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -11,6 +11,128 @@ namespace vortex { class SimObjectBase; +/////////////////////////////////////////////////////////////////////////////// + +class SimPortBase { +public: + virtual ~SimPortBase() {} + + SimObjectBase* module() const { + return module_; + } + + SimPortBase* peer() const { + return peer_; + } + + bool connected() const { + return (peer_ != nullptr); + } + +protected: + 
SimPortBase(SimObjectBase* module) + : module_(module) + , peer_(nullptr) + {} + + void connect(SimPortBase* peer) { + assert(peer_ == nullptr); + peer_ = peer; + } + + void disconnect() { + assert(peer_ == nullptr); + peer_ = nullptr; + } + + SimPortBase& operator=(const SimPortBase&) = delete; + + SimObjectBase* module_; + SimPortBase* peer_; + + template friend class SlavePort; + template friend class MasterPort; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SimPort : public SimPortBase { +public: + void send(const Pkt& pkt, uint64_t delay) const; + + bool read(Pkt* out) { + if (!valid_) + return false; + *out = data_; + valid_ = false; + return true; + } + +protected: + SimPort(SimObjectBase* module) + : SimPortBase(module) + , valid_(false) + {} + + void write(const Pkt& data) { + assert(!valid_); + data_ = data; + valid_ = true; + } + + SimPort& operator=(const SimPort&) = delete; + + Pkt data_; + bool valid_; + + template friend class SimPortEvent; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class SlavePort : public SimPort { +public: + SlavePort(SimObjectBase* module) : SimPort(module) {} + + void bind(SlavePort* peer) { + this->connect(peer); + } + + void unbind() { + this->disconnect(); + } + +protected: + SlavePort& operator=(const SlavePort&) = delete; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class MasterPort : public SimPort { +public: + MasterPort(SimObjectBase* module) : SimPort(module) {} + + void bind(SlavePort* peer) { + this->connect(peer); + } + + void bind(MasterPort* peer) { + this->connect(peer); + } + + void unbind() { + this->disconnect(); + } + +protected: + MasterPort& operator=(const MasterPort&) = delete; +}; + +/////////////////////////////////////////////////////////////////////////////// + class SimEventBase { public: typedef std::shared_ptr Ptr; @@ -32,16 
+154,16 @@ protected: /////////////////////////////////////////////////////////////////////////////// template -class SimSimpleEvent : public SimEventBase { +class SimCallEvent : public SimEventBase { public: typedef std::function Func; template static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) { - return std::make_shared(func, pkt, delay); + return std::make_shared(func, pkt, delay); } - SimSimpleEvent(const Func& func, const Pkt& pkt, uint64_t delay) + SimCallEvent(const Func& func, const Pkt& pkt, uint64_t delay) : SimEventBase(delay) , func_(func) , pkt_(pkt) @@ -61,167 +183,23 @@ protected: template class SimPortEvent : public SimEventBase { public: - typedef std::function Func; - - template - static Ptr Create(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) { - return std::make_shared(func, pkt, port_id, delay); + static Ptr Create(const SimPort* port, const Pkt& pkt, uint64_t delay) { + return std::make_shared(port, pkt, delay); } - SimPortEvent(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) + SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t delay) : SimEventBase(delay) - , func_(func) + , port_(port) , pkt_(pkt) - , port_id_(port_id) {} void fire() const override { - func_(pkt_, port_id_); + const_cast*>(port_)->write(pkt_); } private: - Func func_; - Pkt pkt_; - uint32_t port_id_; -}; - -/////////////////////////////////////////////////////////////////////////////// - -class SimPortBase { -public: - typedef std::shared_ptr Ptr; - - virtual ~SimPortBase() {} - - SimObjectBase* module() const { - return module_; - } - - uint32_t port_id() const { - return port_id_; - } - - SimPortBase* peer() const { - return peer_; - } - - bool connected() const { - return (peer_ != nullptr); - } - - bool is_slave() const { - return is_slave_; - } - -protected: - - SimPortBase(SimObjectBase* module, bool is_slave); - - void connect(SimPortBase* peer) { - assert(peer_ == nullptr); - peer_ = peer; - } - - 
void disconnect() { - assert(peer_ == nullptr); - peer_ = nullptr; - } - - SimObjectBase* module_; - uint32_t port_id_; - bool is_slave_; - SimPortBase* peer_; - - template friend class MasterPort; -}; - -/////////////////////////////////////////////////////////////////////////////// - -template -class SlavePort : public SimPortBase { -public: - typedef std::shared_ptr> Ptr; - typedef std::function Func; - - static Ptr Create(SimObjectBase* module, const Func& func) { - return std::make_shared>(module, func); - } - - template - static Ptr Create(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) { - return std::make_shared>(module, obj, entry); - } - - SlavePort(SimObjectBase* module, const Func& func) - : SimPortBase(module, true) - , func_(func) - {} - - template - SlavePort(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) - : SimPortBase(module, true) - , func_(std::bind(entry, obj, std::placeholders::_1, std::placeholders::_2)) - {} - - SlavePort(SimObjectBase* module, SlavePort* peer) - : SimPortBase(module, false) - { - this->connect(peer); - } - - void send(const Pkt& pkt, uint64_t delay) const; - - const Func& func() const { - return func_; - } - -protected: - SlavePort& operator=(const SlavePort&); - Func func_; -}; - -/////////////////////////////////////////////////////////////////////////////// - -template -class MasterPort : public SimPortBase { -public: - typedef std::shared_ptr> Ptr; - typedef std::function Func; - - static Ptr Create() { - return std::make_shared>(module); - } - - MasterPort(SimObjectBase* module) : SimPortBase(module, false) {} - - MasterPort(SimObjectBase* module, MasterPort* peer) - : SimPortBase(module, false) - { - peer->connect(this); - } - - void bind(SlavePort* peer) { - this->connect(peer); - } - - void unbind() { - peer_->disconnect(); - this->disconnect(); - } - - void send(const Pkt& pkt, uint64_t delay) const { - assert(peer_ != nullptr); - if (peer_->is_slave()) { - auto slave 
= reinterpret_cast*>(peer_); - slave->send(pkt, delay); - } else { - auto master = reinterpret_cast*>(peer_); - master->send(pkt, delay); - } - } - -private: - MasterPort& operator=(const MasterPort&); + const SimPort* port_; + Pkt pkt_; }; /////////////////////////////////////////////////////////////////////////////// @@ -237,25 +215,18 @@ public: template void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay); - virtual void step(uint64_t cycle) = 0; - const std::string& name() const { return name_; } protected: - SimObjectBase(const SimContext& ctx, const char* name); + virtual void step(uint64_t cycle) = 0; - uint32_t allocate_port(SimPortBase* port) { - uint32_t id = ports_.size(); - ports_.push_back(port); - return id; - } + SimObjectBase(const SimContext& ctx, const char* name); private: std::string name_; - std::vector ports_; friend class SimPlatform; friend class SimPortBase; @@ -320,20 +291,19 @@ public: } template - void schedule(const typename SimSimpleEvent::Func& callback, + void schedule(const typename SimCallEvent::Func& callback, const Pkt& pkt, uint64_t delay) { - auto evt = SimSimpleEvent::Create(callback, pkt, delay); + auto evt = SimCallEvent::Create(callback, pkt, delay); assert(delay != 0); events_.emplace_back(evt); } template - void schedule(const typename SimPortEvent::Func& callback, + void schedule(const SimPort* port, const Pkt& pkt, - uint32_t port_id, uint64_t delay) { - auto evt = SimPortEvent::Create(callback, pkt, port_id, delay); + auto evt = SimPortEvent::Create(port, pkt, delay); assert(delay != 0); events_.emplace_back(evt); } @@ -383,13 +353,6 @@ private: /////////////////////////////////////////////////////////////////////////////// -inline SimPortBase::SimPortBase(SimObjectBase* module, bool is_slave) - : module_(module) - , port_id_(module->allocate_port(this)) - , is_slave_(is_slave) - , peer_(nullptr) -{} - inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) : name_(name) 
{} @@ -403,18 +366,11 @@ typename SimObject::Ptr SimObject::Create(Args&&... args) { } template -void SlavePort::send(const Pkt& pkt, uint64_t delay) const { - if (func_) { - SimPlatform::instance().schedule(func_, pkt, port_id_, delay); +void SimPort::send(const Pkt& pkt, uint64_t delay) const { + if (peer_) { + reinterpret_cast*>(peer_)->send(pkt, delay); } else { - assert(peer_ != nullptr); - if (peer_->is_slave()) { - auto slave = reinterpret_cast*>(peer_); - slave->send(pkt, delay); - } else { - auto master = reinterpret_cast*>(peer_); - master->send(pkt, delay); - } + SimPlatform::instance().schedule(this, pkt, delay); } } diff --git a/sim/simX/Makefile b/sim/simX/Makefile index e42464c6..75a4a495 100644 --- a/sim/simX/Makefile +++ b/sim/simX/Makefile @@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a TOP = vx_cache_sim SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp -SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp main.cpp +SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS))) VPATH := $(sort $(dir $(SRCS))) diff --git a/sim/simX/cache.cpp b/sim/simX/cache.cpp index f139cb43..503d32c5 100644 --- a/sim/simX/cache.cpp +++ b/sim/simX/cache.cpp @@ -1,5 +1,6 @@ #include "cache.h" #include "debug.h" +#include "types.h" #include #include #include @@ -30,8 +31,7 @@ struct params_t { uint32_t offset_bits = config.B - config.W; uint32_t log2_bank_size = config.C - bank_bits; uint32_t index_bits = log2_bank_size - (config.B << config.A); - assert(log2_bank_size >= config.B); - + assert(log2_bank_size >= config.B); this->words_per_block = 1 << offset_bits; this->blocks_per_set = 1 << config.A; @@ -229,9 +229,10 @@ private: CacheConfig config_; params_t params_; std::vector banks_; - std::vector> core_reqs_; - std::pair mem_rsp_; std::vector> core_rsps_; + 
Switch::Ptr mem_switch_; + std::vector> mem_req_ports_; + std::vector> mem_rsp_ports_; public: Impl(Cache* simobject, const CacheConfig& config) @@ -239,16 +240,22 @@ public: , config_(config) , params_(config) , banks_(config.num_banks, {config, params_}) - , core_reqs_(config.num_inputs) , core_rsps_(config.num_inputs) - {} - - void handleMemResponse(const MemRsp& response, uint32_t) { - mem_rsp_ = {true, response}; - } - - void handleCoreRequest(const MemReq& request, uint32_t port_id) { - core_reqs_.at(port_id) = {true, request}; + , mem_req_ports_(config.num_banks, simobject) + , mem_rsp_ports_(config.num_banks, simobject) + { + if (config.num_banks > 1) { + mem_switch_ = Switch::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks); + for (uint32_t i = 0, n = config.num_banks; i < n; ++i) { + mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i)); + mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i)); + } + mem_switch_->ReqOut.bind(&simobject->MemReqPort); + simobject->MemRspPort.bind(&mem_switch_->RspIn); + } else { + mem_req_ports_.at(0).bind(&simobject->MemReqPort); + simobject->MemRspPort.bind(&mem_rsp_ports_.at(0)); + } } void step(uint64_t /*cycle*/) { @@ -269,31 +276,29 @@ public: bank.mshr.try_pop(&active_req); } - // try schedule stall replay + // try schedule stall queue if MSHR has space if (!active_req.valid - && !bank.stall_buffer.empty()) { + && !bank.stall_buffer.empty() + && !bank.mshr.full()) { active_req = bank.stall_buffer.front(); bank.stall_buffer.pop(); } } // handle memory fills - if (mem_rsp_.first) { - mem_rsp_.first = false; - auto bank_id = bit_getw(mem_rsp_.second.tag, 0, 15); - auto mshr_id = bit_getw(mem_rsp_.second.tag, 16, 31); - this->processMemoryFill(bank_id, mshr_id); + for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) { + MemRsp mem_rsp; + if (mem_rsp_ports_.at(i).read(&mem_rsp)) { + this->processMemoryFill(i, mem_rsp.tag); + } } // handle incoming core requests - for (uint32_t i = 0, n = core_reqs_.size(); i 
< n; ++i) { - auto& entry = core_reqs_.at(i); - if (!entry.first) + for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) { + MemReq core_req; + if (!simobject_->CoreReqPorts.at(i).read(&core_req)) continue; - - entry.first = false; - auto& core_req = entry.second; auto bank_id = params_.addr_bank_id(core_req.addr); auto set_id = params_.addr_set_id(core_req.addr); auto tag = params_.addr_tag(core_req.addr); @@ -417,7 +422,7 @@ public: mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag); mem_req.write = true; mem_req.tag = 0; - simobject_->MemReqPort.send(mem_req, 1); + mem_req_ports_.at(bank_id).send(mem_req, 1); } else { // mark block as dirty hit_block.dirty = true; @@ -438,7 +443,8 @@ public: MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag); mem_req.write = true; - simobject_->MemReqPort.send(mem_req, 1); + mem_req.tag = 0; + mem_req_ports_.at(bank_id).send(mem_req, 1); } } @@ -449,7 +455,7 @@ public: mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); mem_req.write = true; mem_req.tag = 0; - simobject_->MemReqPort.send(mem_req, 1); + mem_req_ports_.at(bank_id).send(mem_req, 1); } // send core response for (auto& info : active_req.infos) { @@ -467,9 +473,8 @@ public: MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); mem_req.write = active_req.write; - mem_req.tag = bit_setw(0, 0, 15, bank_id); - mem_req.tag = bit_setw(mem_req.tag, 16, 31, mshr_id); - simobject_->MemReqPort.send(mem_req, 1); + mem_req.tag = mshr_id; + mem_req_ports_.at(bank_id).send(mem_req, 1); } } } @@ -480,12 +485,12 @@ public: /////////////////////////////////////////////////////////////////////////////// Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config) - : SimObject(ctx, name) - , impl_(new Impl(this, config)) - , CoreReqPorts(config.num_inputs, {this, impl_, &Cache::Impl::handleCoreRequest}) + : SimObject(ctx, name) + , 
CoreReqPorts(config.num_inputs, this) , CoreRspPorts(config.num_inputs, this) , MemReqPort(this) - , MemRspPort(this, impl_, &Impl::handleMemResponse) + , MemRspPort(this) + , impl_(new Impl(this, config)) {} Cache::~Cache() { diff --git a/sim/simX/cache.h b/sim/simX/cache.h index 1c0c82f6..58767d9f 100644 --- a/sim/simX/cache.h +++ b/sim/simX/cache.h @@ -20,11 +20,7 @@ struct CacheConfig { uint8_t latency; // pipeline latency }; -class Cache : public SimObject { -private: - class Impl; - Impl* impl_; - +class Cache : public SimObject { public: Cache(const SimContext& ctx, const char* name, const CacheConfig& config); ~Cache(); @@ -35,6 +31,10 @@ public: std::vector> CoreRspPorts; MasterPort MemReqPort; SlavePort MemRspPort; + +private: + class Impl; + Impl* impl_; }; } \ No newline at end of file diff --git a/sim/simX/core.cpp b/sim/simX/core.cpp index af0a4441..e1333dac 100644 --- a/sim/simX/core.cpp +++ b/sim/simX/core.cpp @@ -12,13 +12,13 @@ using namespace vortex; -Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id) +Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) : SimObject(ctx, "Core") , id_(id) , arch_(arch) - , decoder_(decoder) - , mem_(mem) - , shared_mem_(1, SMEM_SIZE) + , decoder_(arch) + , mmu_(0, arch.wsize(), true) + , shared_mem_(4096) , warps_(arch.num_warps()) , barriers_(arch.num_barriers(), 0) , csrs_(arch.num_csrs(), 0) @@ -54,9 +54,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU DCACHE_MSHR_SIZE, // mshr 2, // pipeline latency })) - , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) - , icache_rsp_port_(this, this, &Core::icache_handleCacheReponse) - , dcache_rsp_port_(arch.num_threads(), {this, reinterpret_cast(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse}) + , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) , fetch_stage_("fetch") , decode_stage_("decode") , 
issue_stage_("issue") @@ -65,36 +63,34 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU , pending_icache_(arch_.num_warps()) , stalled_warps_(0) , last_schedule_wid_(0) - , pending_instrs_(0) + , issued_instrs_(0) + , committed_instrs_(0) , ebreak_(false) , stats_insts_(0) , stats_loads_(0) , stats_stores_(0) - , MemRspPort(this, &l1_mem_switch_->RspIn) - , MemReqPort(this, &l1_mem_switch_->ReqOut) + , MemRspPort(this) + , MemReqPort(this) { for (int i = 0; i < arch_.num_warps(); ++i) { warps_.at(i) = std::make_shared(this, i); } // register execute units + exe_units_.at((int)ExeType::NOP) = std::make_shared(this); exe_units_.at((int)ExeType::ALU) = std::make_shared(this); exe_units_.at((int)ExeType::LSU) = std::make_shared(this); exe_units_.at((int)ExeType::CSR) = std::make_shared(this); exe_units_.at((int)ExeType::FPU) = std::make_shared(this); exe_units_.at((int)ExeType::GPU) = std::make_shared(this); - // connect l1 caches - icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_); - for (int i = 0; i < arch_.num_threads(); ++i) { - dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i)); - } - // connect l1 switch icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]); dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]); l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort); l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort); + this->MemRspPort.bind(&l1_mem_switch_->RspIn); + l1_mem_switch_->ReqOut.bind(&this->MemReqPort); // activate warp0 warps_.at(0)->setTmask(0, true); @@ -109,31 +105,24 @@ Core::~Core() { } } -void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) { - // advance to decode stage - uint32_t wid = response.tag; - pipeline_state_t state; - pending_icache_.remove(wid, &state); - auto latency = (SimPlatform::instance().cycles() - state.icache_latency); - state.icache_latency = latency; - decode_stage_.push(state); +void Core::attach_ram(RAM* ram) { + // bind RAM to memory unit + 
mmu_.attach(*ram, 0, 0xFFFFFFFF); } void Core::step(uint64_t cycle) { - __unused (cycle); - D(2, "###########################################################"); - D(2, std::dec << "Core" << id_ << ": cycle: " << cycle); - - this->commit(); - this->execute(); - this->issue(); - this->decode(); - this->fetch(); + this->commit(cycle); + this->execute(cycle); + this->issue(cycle); + this->decode(cycle); + this->fetch(cycle); DPN(2, std::flush); } -void Core::warp_scheduler() { +void Core::warp_scheduler(uint64_t cycle) { + __unused (cycle); + bool foundSchedule = false; int scheduled_warp = last_schedule_wid_; @@ -159,53 +148,77 @@ void Core::warp_scheduler() { stats_insts_ += warp->getActiveThreads(); pipeline_state_t state; + state.clear(); + state.id = (issued_instrs_++ * arch_.num_cores()) + id_; + warp->eval(&state); - D(4, state); + DT(3, cycle, "pipeline-schedule: " << state); - // advance to fetch stage - ++pending_instrs_; + // advance to fetch stage fetch_stage_.push(state); } -void Core::fetch() { - // schedule icache request - pipeline_state_t state; - if (fetch_stage_.try_pop(&state)) { - state.icache_latency = SimPlatform::instance().cycles(); - MemReq mem_req; - mem_req.addr = state.PC; - mem_req.write = false; - mem_req.tag = pending_icache_.allocate(state); - icache_->CoreReqPorts.at(0).send(mem_req, 1); +void Core::fetch(uint64_t cycle) { + // handle icache reponse + { + MemRsp mem_rsp; + if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){ + pipeline_state_t state; + pending_icache_.remove(mem_rsp.tag, &state); + auto latency = (SimPlatform::instance().cycles() - state.icache_latency); + state.icache_latency = latency; + decode_stage_.push(state); + DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state); + } + } + + // send icache request + { + pipeline_state_t state; + if (fetch_stage_.try_pop(&state)) { + state.icache_latency = SimPlatform::instance().cycles(); + MemReq mem_req; + mem_req.addr = state.PC; 
+ mem_req.write = false; + mem_req.tag = pending_icache_.allocate(state); + icache_->CoreReqPorts.at(0).send(mem_req, 1); + DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state); + } } // schedule next warp - this->warp_scheduler(); + this->warp_scheduler(cycle); } -void Core::decode() { +void Core::decode(uint64_t cycle) { + __unused (cycle); + pipeline_state_t state; if (!decode_stage_.try_pop(&state)) return; - if (state.stall_warp) { - D(3, "*** warp#" << state.wid << " fetch stalled"); - } else { - // release warp + // release warp + if (!state.stall_warp) { stalled_warps_.reset(state.wid); } + + DT(3, cycle, "pipeline-decode: " << state); // advance to issue stage issue_stage_.push(state); } -void Core::issue() { +void Core::issue(uint64_t cycle) { + __unused (cycle); + if (!issue_stage_.empty()) { // insert to ibuffer auto& state = issue_stage_.top(); auto& ibuffer = ibuffers_.at(state.wid); - if (!ibuffer.full()) { + if (ibuffer.full()) { + DT(3, cycle, "*** ibuffer-stall: " << state); + } else { ibuffer.push(state); issue_stage_.pop(); } @@ -219,8 +232,18 @@ void Core::issue() { auto& state = ibuffer.top(); // check scoreboard - if (scoreboard_.in_use(state)) + if (scoreboard_.in_use(state)) { + DTH(3, cycle, "*** scoreboard-stall: dependents={"); + auto owners = scoreboard_.owners(state); + for (uint32_t i = 0, n = owners.size(); i < n; ++i) { + if (i) DTN(3, ", "); + DTN(3, "#" << owners.at(i)); + } + DTN(3, "}, " << state << std::endl); continue; + } + + DT(3, cycle, "pipeline-issue: " << state); // update scoreboard scoreboard_.reserve(state); @@ -233,18 +256,19 @@ void Core::issue() { } } -void Core::execute() { +void Core::execute(uint64_t cycle) { // process stage inputs if (!execute_stage_.empty()) { auto& state = execute_stage_.top(); auto& exe_unit = exe_units_.at((int)state.exe_type); exe_unit->push_input(state); execute_stage_.pop(); + DT(3, cycle, "pipeline-execute: " << state); } // advance 
execute units for (auto& exe_unit : exe_units_) { - exe_unit->step(); + exe_unit->step(cycle); } // commit completed instructions @@ -255,18 +279,29 @@ void Core::execute() { stalled_warps_.reset(state.wid); } // advance to commit stage - commit_stage_.push(state); + commit_stage_.push(state); } } } -void Core::commit() { +void Core::commit(uint64_t cycle) { + __unused (cycle); + pipeline_state_t state; if (!commit_stage_.try_pop(&state)) return; + DT(3, cycle, "pipeline-commit: " << state); + // update scoreboard scoreboard_.release(state); + + assert(committed_instrs_ <= issued_instrs_); + ++committed_instrs_; +} + +bool Core::running() const { + return (committed_instrs_ != issued_instrs_); } Word Core::get_csr(Addr addr, int tid, int wid) { @@ -349,9 +384,9 @@ void Core::barrier(int bar_id, int count, int warp_id) { barrier.reset(); } -Word Core::icache_fetch(Addr addr) { +Word Core::icache_read(Addr addr, Size size) { Word data; - mem_.read(&data, addr, sizeof(Word), 0); + mmu_.read(&data, addr, size, 0); return data; } @@ -365,7 +400,7 @@ Word Core::dcache_read(Addr addr, Size size) { return data; } #endif - mem_.read(&data, addr, size, 0); + mmu_.read(&data, addr, size, 0); return data; } @@ -383,11 +418,7 @@ void Core::dcache_write(Addr addr, Word data, Size size) { this->writeToStdOut(addr, data); return; } - mem_.write(&data, addr, size, 0); -} - -bool Core::running() const { - return pending_instrs_; + mmu_.write(&data, addr, size, 0); } void Core::printStats() const { @@ -399,7 +430,7 @@ void Core::printStats() const { void Core::writeToStdOut(Addr addr, Word data) { uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1); - auto& ss_buf = print_bufs_.at(tid); + auto& ss_buf = print_bufs_[tid]; char c = (char)data; ss_buf << c; if (c == '\n') { diff --git a/sim/simX/core.h b/sim/simX/core.h index 913db4a6..ea1a6582 100644 --- a/sim/simX/core.h +++ b/sim/simX/core.h @@ -25,9 +25,11 @@ namespace vortex { class Core : public SimObject { public: - 
Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id); + Core(const SimContext& ctx, const ArchDef &arch, Word id); ~Core(); + void attach_ram(RAM* ram); + bool running() const; void step(uint64_t cycle); @@ -64,7 +66,7 @@ public: void barrier(int bar_id, int count, int warp_id); - Word icache_fetch(Addr); + Word icache_read(Addr, Size); Word dcache_read(Addr, Size); @@ -76,22 +78,21 @@ public: private: - void fetch(); - void decode(); - void issue(); - void execute(); - void commit(); + void fetch(uint64_t cycle); + void decode(uint64_t cycle); + void issue(uint64_t cycle); + void execute(uint64_t cycle); + void commit(uint64_t cycle); - void warp_scheduler(); - - void icache_handleCacheReponse(const MemRsp& response, uint32_t port_id); + void warp_scheduler(uint64_t cycle); void writeToStdOut(Addr addr, Word data); Word id_; - const ArchDef& arch_; - const Decoder& decoder_; - MemoryUnit& mem_; + const ArchDef arch_; + const Decoder decoder_; + MemoryUnit mmu_; + #ifdef SM_ENABLE RAM shared_mem_; #endif @@ -106,8 +107,6 @@ private: Cache::Ptr icache_; Cache::Ptr dcache_; Switch::Ptr l1_mem_switch_; - SlavePort icache_rsp_port_; - std::vector> dcache_rsp_port_; PipelineStage fetch_stage_; PipelineStage decode_stage_; @@ -118,10 +117,12 @@ private: HashTable pending_icache_; WarpMask stalled_warps_; uint32_t last_schedule_wid_; - uint32_t pending_instrs_; + uint32_t issued_instrs_; + uint32_t committed_instrs_; bool ebreak_; std::unordered_map print_bufs_; + uint64_t stats_insts_; uint64_t stats_loads_; uint64_t stats_stores_; diff --git a/sim/simX/debug.h b/sim/simX/debug.h index ad7fd16f..53d2d62a 100644 --- a/sim/simX/debug.h +++ b/sim/simX/debug.h @@ -7,14 +7,15 @@ #define DEBUG_HEADER << "DEBUG " //#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " +#define TRACE_HEADER << "TRACE " +//#define TRACE_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " + #ifndef NDEBUG 
#include #include -#define DX(x) x - -#define D(lvl, x) do { \ +#define DP(lvl, x) do { \ if ((lvl) <= DEBUG_LEVEL) { \ std::cout DEBUG_HEADER << x << std::endl; \ } \ @@ -32,12 +33,33 @@ } \ } while(0) +#define DT(lvl, t, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x << std::endl; \ + } \ +} while(0) + +#define DTH(lvl, t, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x; \ + } \ +} while(0) + +#define DTN(lvl, x) do { \ + if ((lvl) <= DEBUG_LEVEL) { \ + std::cout << x; \ + } \ +} while(0) + + #else -#define DX(x) -#define D(lvl, x) do {} while(0) +#define DP(lvl, x) do {} while(0) #define DPH(lvl, x) do {} while(0) #define DPN(lvl, x) do {} while(0) -#define D_RAW(x) do {} while(0) + +#define DT(lvl, t, x) do {} while(0) +#define DTH(lvl, t, x) do {} while(0) +#define DTN(lvl, x) do {} while(0) #endif \ No newline at end of file diff --git a/sim/simX/decode.cpp b/sim/simX/decode.cpp index 3c76231f..6530d223 100644 --- a/sim/simX/decode.cpp +++ b/sim/simX/decode.cpp @@ -194,47 +194,26 @@ static const char* op_string(const Instr &instr) { namespace vortex { std::ostream &operator<<(std::ostream &os, const Instr &instr) { os << op_string(instr) << ": "; - auto opcode = instr.getOpcode(); - - auto rd_to_string = [&]() { - int rdt = instr.getRDType(); - int rd = instr.getRDest(); - switch (rdt) { - case 1: os << "r" << std::dec << rd << " <- "; break; - case 2: os << "fr" << std::dec << rd << " <- "; break; - case 3: os << "vr" << std::dec << rd << " <- "; break; - default: break; - } - }; - - auto rs_to_string = [&](int i) { - int rst = instr.getRSType(i); - int rs = instr.getRSrc(i); - switch (rst) { - case 1: os << "r" << std::dec << rs; break; - case 2: os << "fr" << std::dec << rs; break; - case 3: os << "vr" << std::dec << rs; break; - default: break; - } - }; - + auto opcode = instr.getOpcode(); if 
(opcode == S_INST || opcode == FS || opcode == VS) { os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- "; - rs_to_string(1); + os << instr.getRSType(1) << std::dec << instr.getRSrc(1); } else if (opcode == L_INST || opcode == FL || opcode == VL) { - rd_to_string(); + os << instr.getRDType() << std::dec << instr.getRDest() << " <- "; os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]"; } else { - rd_to_string(); + if (instr.getRDType() != RegType::None) { + os << instr.getRDType() << std::dec << instr.getRDest() << " <- "; + } int i = 0; for (; i < instr.getNRSrc(); ++i) { if (i) os << ", "; - rs_to_string(i); + os << instr.getRSType(i) << std::dec << instr.getRSrc(i); } if (instr.hasImm()) { if (i) os << ", "; @@ -281,7 +260,7 @@ Decoder::Decoder(const ArchDef &arch) { v_imm_mask_ = 0x7ff; } -std::shared_ptr Decoder::decode(Word code, Word PC) const { +std::shared_ptr Decoder::decode(Word code) const { auto instr = std::make_shared(); Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_); instr->setOpcode(op); @@ -297,8 +276,8 @@ std::shared_ptr Decoder::decode(Word code, Word PC) const { auto op_it = sc_instTable.find(op); if (op_it == sc_instTable.end()) { - std::cout << std::hex << "invalid opcode: 0x" << op << ", instruction=0x" << code << ", PC=" << PC << std::endl; - std::abort(); + std::cout << std::hex << "Error: invalid opcode: 0x" << op << std::endl; + return nullptr; } auto iType = op_it->second.iType; @@ -459,7 +438,5 @@ std::shared_ptr Decoder::decode(Word code, Word PC) const { std::abort(); } - D(2, "Instr 0x" << std::hex << code << ": " << *instr << std::flush); - return instr; } diff --git a/sim/simX/decode.h b/sim/simX/decode.h index d4f9f976..e481cb28 100644 --- a/sim/simX/decode.h +++ b/sim/simX/decode.h @@ -13,7 +13,7 @@ class Decoder { public: Decoder(const ArchDef &); - std::shared_ptr decode(Word code, Word PC) const; + std::shared_ptr decode(Word 
code) const; private: diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp index 602f7f3a..ff705d82 100644 --- a/sim/simX/execute.cpp +++ b/sim/simX/execute.cpp @@ -75,11 +75,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { if (num_rsrcs) { for (int i = 0; i < num_rsrcs; ++i) { DPH(2, "Src Reg [" << std::dec << i << "]: "); - int type = instr.getRSType(i); + auto type = instr.getRSType(i); int reg = instr.getRSrc(i); switch (type) { - case 1: - DPH(2, "r" << std::dec << reg << "={"); + case RegType::Integer: + DPN(2, "r" << std::dec << reg << "={"); for (int t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!tmask_.test(t)) { @@ -91,8 +91,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } DPN(2, "}" << std::endl); break; - case 2: - DPH(2, "fr" << std::dec << reg << "={"); + case RegType::Float: + DPN(2, "fr" << std::dec << reg << "={"); for (int t = 0; t < num_threads; ++t) { if (t) DPN(2, ", "); if (!tmask_.test(t)) { @@ -105,6 +105,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { DPN(2, "}" << std::endl); break; default: + std::abort(); break; } } @@ -415,7 +416,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { break; case L_INST: pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.load = 0; + pipeline_state->lsu.type = LsuType::LOAD; pipeline_state->used_iregs[rsrc0] = 1; pipeline_state->mem_addrs.resize(num_threads); for (int t = 0; t < num_threads; ++t) { @@ -425,7 +426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; Word data_read = core_->dcache_read(memAddr, 4); pipeline_state->mem_addrs.at(t) = memAddr; - D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); + DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); switch (func3) { case 0: // LBI @@ -455,7 +456,7 @@ void 
Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { break; case S_INST: pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.store = 1; + pipeline_state->lsu.type = LsuType::STORE; pipeline_state->used_iregs[rsrc0] = 1; pipeline_state->used_iregs[rsrc1] = 1; pipeline_state->mem_addrs.resize(num_threads); @@ -464,7 +465,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { continue; Word memAddr = rsdata[t][0] + immsrc; pipeline_state->mem_addrs.at(t) = memAddr; - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); switch (func3) { case 0: // SB @@ -543,12 +544,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { break; case FENCE: pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.fence = 1; + pipeline_state->lsu.type = LsuType::FENCE; pipeline_state->stall_warp = true; break; case (FL | VL): pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.load = 1; + pipeline_state->lsu.type = LsuType::LOAD; pipeline_state->used_iregs[rsrc0] = 1; if (func3 == 0x2) { pipeline_state->mem_addrs.resize(num_threads); @@ -558,14 +559,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { Word memAddr = rsdata[t][0] + immsrc; pipeline_state->mem_addrs.at(t) = memAddr; Word data_read = core_->dcache_read(memAddr, 4); - D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); + DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); rddata[t] = data_read; } } else { - D(3, "Executing vector load"); - D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); - D(3, "dest: v" << rdest); - D(3, "width" << instr.getVlsWidth()); + DP(3, "Executing vector load"); + DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); + DP(3, "dest: v" << rdest); + DP(3, "width" << 
instr.getVlsWidth()); pipeline_state->mem_addrs.resize(vl_); auto &vd = vRegFile_.at(rdest); switch (instr.getVlsWidth()) { @@ -574,9 +575,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); pipeline_state->mem_addrs.at(i) = memAddr; - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); Word data_read = core_->dcache_read(memAddr, 4); - D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); + DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); int *result_ptr = (int *)(vd.data() + i); *result_ptr = data_read; } @@ -590,7 +591,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { break; case (FS | VS): pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.store = 1; + pipeline_state->lsu.type = LsuType::STORE; pipeline_state->used_iregs[rsrc0] = 1; pipeline_state->used_iregs[rsrc1] = 1; if (func3 == 0x2) { @@ -601,20 +602,20 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { Word memAddr = rsdata[t][0] + immsrc; pipeline_state->mem_addrs.at(t) = memAddr; core_->dcache_write(memAddr, rsdata[t][1], 4); - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); } } else { pipeline_state->mem_addrs.resize(vl_); for (int i = 0; i < vl_; i++) { Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8); pipeline_state->mem_addrs.at(i) = memAddr; - D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); switch (instr.getVlsWidth()) { case 6: { //store word and unit strided (not checking for unit stride) uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i); core_->dcache_write(memAddr, value, 4); - D(3, "store: " << memAddr << " value:" << value); + DP(3, "store: " << 
memAddr << " value:" << value); } break; default: std::abort(); @@ -705,9 +706,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } else { // FMV.X.W rddata[t] = rsdata[t][0]; - pipeline_state->fpu.type = FpuType::FNCP; - pipeline_state->used_fregs[rsrc0] = 1; - } + } + pipeline_state->fpu.type = FpuType::FNCP; + pipeline_state->used_fregs[rsrc0] = 1; break; case 0x50: switch(func3) { @@ -783,132 +784,138 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } rd_write = true; break; - case GPGPU: - pipeline_state->exe_type = ExeType::GPU; + case GPGPU: { + pipeline_state->exe_type = ExeType::GPU; + int ts = 0; for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - switch (func3) { - case 0: { - // TMC - pipeline_state->gpu.type = GpuType::TMC; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->stall_warp = true; - if (rsrc1) { - // predicate mode - ThreadMask pred; - for (int i = 0; i < num_threads; ++i) { - pred[i] = tmask_.test(i) ? 
(iRegFile_.at(i).at(rsrc0) != 0) : 0; - } - if (pred.any()) { - tmask_ &= pred; - } - } else { - tmask_.reset(); - for (int i = 0; i < num_threads; ++i) { - tmask_.set(i, rsdata.at(t)[0] & (1 << i)); - } - } - D(3, "*** TMC " << tmask_); - active_ = tmask_.any(); - break; // runOnce - } break; - case 1: { - // WSPAWN - pipeline_state->gpu.type = GpuType::WSPAWN; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - pipeline_state->stall_warp = true; - int active_warps = std::min(rsdata.at(t)[0], core_->arch().num_warps()); - D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(t)[1]); - for (int i = 1; i < active_warps; ++i) { - Warp &newWarp = core_->warp(i); - newWarp.setPC(rsdata[t][1]); - newWarp.setTmask(0, true); - } - break; // runOnce - } break; - case 2: { - // SPLIT - pipeline_state->gpu.type = GpuType::SPLIT; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->stall_warp = true; - if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) { - ThreadMask tmask; - for (int i = 0; i < num_threads; ++i) { - tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0); - } - - DomStackEntry e(tmask, nextPC); - domStack_.push(tmask_); - domStack_.push(e); - for (size_t i = 0; i < e.tmask.size(); ++i) { - tmask_.set(i, !e.tmask.test(i) && tmask_.test(i)); - } - active_ = tmask_.any(); - - DPH(3, "*** Split: New TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); - DPN(3, ", Pushed TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1)); - DPN(3, ", PC=0x" << std::hex << e.PC << "\n"); - } else { - D(3, "*** Unanimous pred"); - DomStackEntry e(tmask_); - e.unanimous = true; - domStack_.push(e); - } - break; // runOnce - } break; - case 3: { - // JOIN - pipeline_state->gpu.type = GpuType::JOIN; - pipeline_state->stall_warp = true; - if (!domStack_.empty() && domStack_.top().unanimous) { - D(3, "*** Uninimous branch at join"); - tmask_ = 
domStack_.top().tmask; - active_ = tmask_.any(); - domStack_.pop(); - } else { - if (!domStack_.top().fallThrough) { - nextPC = domStack_.top().PC; - D(3, "*** Join: next PC: " << std::hex << nextPC << std::dec); - } - - tmask_ = domStack_.top().tmask; - active_ = tmask_.any(); - - DPH(3, "*** Join: New TM="); - for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); - DPN(3, "\n"); - - domStack_.pop(); - } - break; // runOnce - } break; - case 4: { - // BAR - pipeline_state->gpu.type = GpuType::BAR; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - pipeline_state->stall_warp = true; - active_ = false; - core_->barrier(rsdata[t][0], rsdata[t][1], id_); - break; // runOnce - } break; - case 6: { - // PREFETCH - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.prefetch = 1; - pipeline_state->used_iregs[rsrc0] = 1; - int addr = rsdata[t][0]; - printf("*** PREFETCHED %d ***\n", addr); - } break; - default: - std::abort(); + if (tmask_.test(t)) { + ts = t; + break; } } - break; + switch (func3) { + case 0: { + // TMC + pipeline_state->gpu.type = GpuType::TMC; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->stall_warp = true; + if (rsrc1) { + // predicate mode + ThreadMask pred; + for (int i = 0; i < num_threads; ++i) { + pred[i] = tmask_.test(i) ? 
(iRegFile_.at(i).at(rsrc0) != 0) : 0; + } + if (pred.any()) { + tmask_ &= pred; + } + } else { + tmask_.reset(); + for (int i = 0; i < num_threads; ++i) { + tmask_.set(i, rsdata.at(ts)[0] & (1 << i)); + } + } + DPH(3, "*** New TMC: "); + for (int i = 0; i < num_threads; ++i) + DPN(3, tmask_.test(num_threads-i-1)); + DPN(3, std::endl); + + active_ = tmask_.any(); + } break; + case 1: { + // WSPAWN + pipeline_state->gpu.type = GpuType::WSPAWN; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + pipeline_state->stall_warp = true; + int active_warps = std::min(rsdata.at(ts)[0], core_->arch().num_warps()); + DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]); + for (int i = 1; i < active_warps; ++i) { + Warp &newWarp = core_->warp(i); + newWarp.setPC(rsdata[ts][1]); + newWarp.setTmask(0, true); + } + } break; + case 2: { + // SPLIT + pipeline_state->gpu.type = GpuType::SPLIT; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->stall_warp = true; + if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) { + ThreadMask tmask; + for (int i = 0; i < num_threads; ++i) { + tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0); + } + + DomStackEntry e(tmask, nextPC); + domStack_.push(tmask_); + domStack_.push(e); + for (size_t i = 0; i < e.tmask.size(); ++i) { + tmask_.set(i, !e.tmask.test(i) && tmask_.test(i)); + } + active_ = tmask_.any(); + + DPH(3, "*** Split: New TM="); + for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); + DPN(3, ", Pushed TM="); + for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1)); + DPN(3, ", PC=0x" << std::hex << e.PC << "\n"); + } else { + DP(3, "*** Unanimous pred"); + DomStackEntry e(tmask_); + e.unanimous = true; + domStack_.push(e); + } + } break; + case 3: { + // JOIN + pipeline_state->gpu.type = GpuType::JOIN; + pipeline_state->stall_warp = true; + if (!domStack_.empty() && domStack_.top().unanimous) { + DP(3, "*** 
Uninimous branch at join"); + tmask_ = domStack_.top().tmask; + active_ = tmask_.any(); + domStack_.pop(); + } else { + if (!domStack_.top().fallThrough) { + nextPC = domStack_.top().PC; + DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec); + } + + tmask_ = domStack_.top().tmask; + active_ = tmask_.any(); + + DPH(3, "*** Join: New TM="); + for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); + DPN(3, "\n"); + + domStack_.pop(); + } + } break; + case 4: { + // BAR + pipeline_state->gpu.type = GpuType::BAR; + pipeline_state->used_iregs[rsrc0] = 1; + pipeline_state->used_iregs[rsrc1] = 1; + pipeline_state->stall_warp = true; + active_ = false; + core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); + } break; + case 6: { + // PREFETCH + pipeline_state->exe_type = ExeType::LSU; + pipeline_state->lsu.type = LsuType::PREFETCH; + pipeline_state->used_iregs[rsrc0] = 1; + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + int addr = rsdata[t][0]; + printf("*** PREFETCHED %d ***\n", addr); + } + } break; + default: + std::abort(); + } + } break; case VSET: { int VLEN = core_->arch().vsize() * 8; int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); @@ -928,7 +935,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); + DP(3, "Adding " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } @@ -940,7 +947,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); + DP(3, "Adding " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + 
i) = result; } } @@ -952,7 +959,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = first + second; - D(3, "Adding " << first << " + " << second << " = " << result); + DP(3, "Adding " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -968,7 +975,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -976,7 +983,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -984,7 +991,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first == second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -999,7 +1006,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first != second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1007,7 +1014,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1015,7 +1022,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first != second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -1030,7 +1037,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1038,7 +1045,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first < second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1046,7 +1053,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -1061,7 +1068,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int8_t first = *(int8_t *)(vr1.data() + i); int8_t second = *(int8_t *)(vr2.data() + i); int8_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1069,7 +1076,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int16_t first = *(int16_t *)(vr1.data() + i); int16_t second = *(int16_t *)(vr2.data() + i); int16_t result = (first < second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1077,7 +1084,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int32_t first = *(int32_t *)(vr1.data() + i); int32_t second = *(int32_t *)(vr2.data() + i); int32_t result = (first < second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int32_t *)(vd.data() + i) = result; } } @@ -1092,7 +1099,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1100,7 +1107,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1108,7 +1115,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -1123,7 +1130,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int8_t first = *(int8_t *)(vr1.data() + i); int8_t second = *(int8_t *)(vr2.data() + i); int8_t result = (first <= second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1131,7 +1138,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int16_t first = *(int16_t *)(vr1.data() + i); int16_t second = *(int16_t *)(vr2.data() + i); int16_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1139,7 +1146,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int32_t first = *(int32_t *)(vr1.data() + i); int32_t second = *(int32_t *)(vr2.data() + i); int32_t result = (first <= second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int32_t *)(vd.data() + i) = result; } } @@ -1154,7 +1161,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1162,7 +1169,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first > second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1170,7 +1177,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } } @@ -1185,7 +1192,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int8_t first = *(int8_t *)(vr1.data() + i); int8_t second = *(int8_t *)(vr2.data() + i); int8_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { @@ -1193,7 +1200,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int16_t first = *(int16_t *)(vr1.data() + i); int16_t second = *(int16_t *)(vr2.data() + i); int16_t result = (first > second) ? 1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { @@ -1201,7 +1208,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { int32_t first = *(int32_t *)(vr1.data() + i); int32_t second = *(int32_t *)(vr2.data() + i); int32_t result = (first > second) ? 
1 : 0; - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(int32_t *)(vd.data() + i) = result; } } @@ -1222,7 +1229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1235,7 +1242,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1248,7 +1255,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = (first_value & !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1268,7 +1275,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1281,7 +1288,7 @@ void Warp::execute(const Instr &instr, 
pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1294,7 +1301,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = (first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1314,7 +1321,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1327,7 +1334,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1340,7 +1347,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = (first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, 
"Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1360,7 +1367,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1373,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1386,7 +1393,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = (first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1406,7 +1413,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1419,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t 
first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1432,7 +1439,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = (first_value | !second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1452,7 +1459,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1465,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1478,7 +1485,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = !(first_value & second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = 
" << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1498,7 +1505,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1511,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value = (second & 0x1); uint16_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1524,7 +1531,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = !(first_value | second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1544,7 +1551,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first_value = (first & 0x1); uint8_t second_value = (second & 0x1); uint8_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1557,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first_value = (first & 0x1); uint16_t second_value 
= (second & 0x1); uint16_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1570,7 +1577,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first_value = (first & 0x1); uint32_t second_value = (second & 0x1); uint32_t result = !(first_value ^ second_value); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1588,7 +1595,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1599,7 +1606,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1610,7 +1617,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t 
*)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1628,7 +1635,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint8_t first = *(uint8_t *)(vr1.data() + i); uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) += result; } for (int i = vl_; i < VLMAX; i++) { @@ -1639,7 +1646,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint16_t first = *(uint16_t *)(vr1.data() + i); uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) += result; } for (int i = vl_; i < VLMAX; i++) { @@ -1650,7 +1657,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { uint32_t first = *(uint32_t *)(vr1.data() + i); uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (first * second); - D(3, "Comparing " << first << " + " << second << " = " << result); + DP(3, "Comparing " << first << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) += result; } for (int i = vl_; i < VLMAX; i++) { @@ -1669,7 +1676,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (rsdata[i][0] + second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1679,7 +1686,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint16_t second = 
*(uint16_t *)(vr2.data() + i); uint16_t result = (rsdata[i][0] + second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1689,7 +1696,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (rsdata[i][0] + second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1705,7 +1712,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint8_t second = *(uint8_t *)(vr2.data() + i); uint8_t result = (rsdata[i][0] * second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1715,7 +1722,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint16_t second = *(uint16_t *)(vr2.data() + i); uint16_t result = (rsdata[i][0] * second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1725,7 +1732,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { for (int i = 0; i < vl_; i++) { uint32_t second = *(uint32_t *)(vr2.data() + i); uint32_t result = (rsdata[i][0] * second); - D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result); + DP(3, "Comparing " << rsdata[i][0] << " + " << 
second << " = " << result); *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { @@ -1741,7 +1748,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { vtype_.vsew = instr.getVsew(); vtype_.vlmul = instr.getVlmul(); - D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX); + DP(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX); int s0 = rsdata[0][0]; if (s0 <= VLMAX) { @@ -1762,46 +1769,49 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } if (rd_write) { + pipeline_state->wb = true; DPH(2, "Dest Reg: "); - int rdt = instr.getRDType(); + auto rdt = instr.getRDType(); switch (rdt) { - case 1: + case RegType::Integer: if (rdest) { - DPH(2, "r" << std::dec << rdest << "={"); + DPN(2, "r" << std::dec << rdest << "={"); for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - iRegFile_.at(t)[rdest] = rddata[t]; if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + iRegFile_.at(t)[rdest] = rddata[t]; DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); pipeline_state->used_iregs[rdest] = 1; } break; - case 2: - DPH(2, "fr" << std::dec << rdest << "={"); + case RegType::Float: + DPN(2, "fr" << std::dec << rdest << "={"); for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - fRegFile_.at(t)[rdest] = rddata[t]; if (t) DPN(2, ", "); + if (!tmask_.test(t)) { + DPN(2, "-"); + continue; + } + fRegFile_.at(t)[rdest] = rddata[t]; DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); pipeline_state->used_fregs[rdest] = 1; break; - case 3: - pipeline_state->used_vregs[rdest] = 1; - break; default: + std::abort(); break; } } PC_ += core_->arch().wsize(); if (PC_ != nextPC) { - D(3, "*** Next PC: " << std::hex << nextPC << std::dec); + DP(3, "*** Next PC: " 
<< std::hex << nextPC << std::dec); PC_ = nextPC; } } diff --git a/sim/simX/exeunit.cpp b/sim/simX/exeunit.cpp index 5cdf22f3..ba280812 100644 --- a/sim/simX/exeunit.cpp +++ b/sim/simX/exeunit.cpp @@ -9,6 +9,17 @@ using namespace vortex; +NopUnit::NopUnit(Core*) : ExeUnit("NOP") {} + +void NopUnit::step(uint64_t /*cycle*/) { + pipeline_state_t state; + if (!inputs_.try_pop(&state)) + return; + this->schedule_output(state, 1); +} + +/////////////////////////////////////////////////////////////////////////////// + LsuUnit::LsuUnit(Core* core) : ExeUnit("LSU") , core_(core) @@ -17,61 +28,77 @@ LsuUnit::LsuUnit(Core* core) , fence_lock_(false) {} -void LsuUnit::handleCacheReponse(const MemRsp& response, uint32_t port_id) { - auto entry = pending_dcache_.at(response.tag); - entry.second.reset(port_id); // track remaining blocks - if (!entry.second.any()) { - auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); - entry.first.dcache_latency = latency; - this->schedule_output(entry.first, 1); - pending_dcache_.release(response.tag); - } -} +void LsuUnit::step(uint64_t cycle) { + __unused (cycle); + + // handle dcache response + for (uint32_t t = 0; t < num_threads_; ++t) { + MemRsp mem_rsp; + if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp)) + continue; + auto& entry = pending_dcache_.at(mem_rsp.tag); + DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first); + assert(entry.second.test(t)); + entry.second.reset(t); // track remaining blocks + if (!entry.second.any()) { + auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); + entry.first.dcache_latency = latency; + this->schedule_output(entry.first, 1); + pending_dcache_.release(mem_rsp.tag); + } + } -void LsuUnit::step() { if (fence_lock_) { // wait for all pending memory operations to complete if (!pending_dcache_.empty()) return; 
this->schedule_output(fence_state_, 1); fence_lock_ = false; + DT(3, cycle, "fence-unlock: " << fence_state_); } + // check input queue if (inputs_.empty()) return; auto state = inputs_.top(); - if (state.lsu.fence) { + if (state.lsu.type == LsuType::FENCE) { // schedule fence lock fence_state_ = state; fence_lock_ = true; inputs_.pop(); + DT(3, cycle, "fence-lock: " << state); return; } - // send dcache requests - if (!pending_dcache_.full()) { - state.dcache_latency = SimPlatform::instance().cycles(); - auto tag = pending_dcache_.allocate({state, state.tmask}); - for (uint32_t t = 0; t < num_threads_; ++t) { - if (!state.tmask.test(t)) - continue; - MemReq mem_req; - mem_req.addr = state.mem_addrs.at(t); - mem_req.write = state.lsu.store; - mem_req.tag = tag; - core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1); - } - inputs_.pop(); + // check pending queue capacity + if (pending_dcache_.full()) { + DT(3, cycle, "*** lsu-queue-stall: " << state); + return; } + + // send dcache request + state.dcache_latency = SimPlatform::instance().cycles(); + auto tag = pending_dcache_.allocate({state, state.tmask}); + for (uint32_t t = 0; t < num_threads_; ++t) { + if (!state.tmask.test(t)) + continue; + MemReq mem_req; + mem_req.addr = state.mem_addrs.at(t); + mem_req.write = (state.lsu.type == LsuType::STORE); + mem_req.tag = tag; + core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1); + DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state); + } + inputs_.pop(); } /////////////////////////////////////////////////////////////////////////////// AluUnit::AluUnit(Core*) : ExeUnit("ALU") {} -void AluUnit::step() { +void AluUnit::step(uint64_t /*cycle*/) { pipeline_state_t state; if (!inputs_.try_pop(&state)) return; @@ -95,7 +122,7 @@ void AluUnit::step() { CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {} -void CsrUnit::step() { +void CsrUnit::step(uint64_t /*cycle*/) { pipeline_state_t 
state; if (!inputs_.try_pop(&state)) return; @@ -106,7 +133,7 @@ void CsrUnit::step() { FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {} -void FpuUnit::step() { +void FpuUnit::step(uint64_t /*cycle*/) { pipeline_state_t state; if (!inputs_.try_pop(&state)) return; @@ -133,7 +160,7 @@ void FpuUnit::step() { GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {} -void GpuUnit::step() { +void GpuUnit::step(uint64_t /*cycle*/) { pipeline_state_t state; if (!inputs_.try_pop(&state)) return; diff --git a/sim/simX/exeunit.h b/sim/simX/exeunit.h index 915089d3..3b2bbf91 100644 --- a/sim/simX/exeunit.h +++ b/sim/simX/exeunit.h @@ -43,7 +43,16 @@ public: return outputs_.try_pop(state); } - virtual void step() = 0; + virtual void step(uint64_t cycle) = 0; +}; + +/////////////////////////////////////////////////////////////////////////////// + +class NopUnit : public ExeUnit { +public: + NopUnit(Core*); + + void step(uint64_t cycle); }; /////////////////////////////////////////////////////////////////////////////// @@ -59,9 +68,7 @@ private: public: LsuUnit(Core*); - void handleCacheReponse(const MemRsp& response, uint32_t port_id); - - void step(); + void step(uint64_t cycle); }; /////////////////////////////////////////////////////////////////////////////// @@ -70,7 +77,7 @@ class AluUnit : public ExeUnit { public: AluUnit(Core*); - void step(); + void step(uint64_t cycle); }; /////////////////////////////////////////////////////////////////////////////// @@ -79,7 +86,7 @@ class CsrUnit : public ExeUnit { public: CsrUnit(Core*); - void step(); + void step(uint64_t cycle); }; /////////////////////////////////////////////////////////////////////////////// @@ -88,7 +95,7 @@ class FpuUnit : public ExeUnit { public: FpuUnit(Core*); - void step(); + void step(uint64_t cycle); }; /////////////////////////////////////////////////////////////////////////////// @@ -97,7 +104,7 @@ class GpuUnit : public ExeUnit { public: GpuUnit(Core*); - void step(); + void step(uint64_t cycle); }; } \ No newline at 
end of file diff --git a/sim/simX/instr.h b/sim/simX/instr.h index 1a205478..5deace6c 100644 --- a/sim/simX/instr.h +++ b/sim/simX/instr.h @@ -53,22 +53,23 @@ public: : opcode_(Opcode::NOP) , num_rsrcs_(0) , has_imm_(false) + , rdest_type_(RegType::None) , rdest_(0) , func3_(0) , func7_(0) { for (int i = 0; i < MAX_REG_SOURCES; ++i) { - rsrc_type_[i] = 0; + rsrc_type_[i] = RegType::None; } } /* Setters used to "craft" the instruction. */ void setOpcode(Opcode opcode) { opcode_ = opcode; } - void setDestReg(int destReg) { rdest_type_ = 1; rdest_ = destReg; } - void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = 1; rsrc_[num_rsrcs_++] = srcReg; } - void setDestFReg(int destReg) { rdest_type_ = 2; rdest_ = destReg; } - void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = 2; rsrc_[num_rsrcs_++] = srcReg; } - void setDestVReg(int destReg) { rdest_type_ = 3; rdest_ = destReg; } - void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = 3; rsrc_[num_rsrcs_++] = srcReg; } + void setDestReg(int destReg) { rdest_type_ = RegType::Integer; rdest_ = destReg; } + void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Integer; rsrc_[num_rsrcs_++] = srcReg; } + void setDestFReg(int destReg) { rdest_type_ = RegType::Float; rdest_ = destReg; } + void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; } + void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; } + void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; } void setFunc3(Word func3) { func3_ = func3; } void setFunc7(Word func7) { func7_ = func7; } void setImm(Word imm) { has_imm_ = true; imm_ = imm; } @@ -89,9 +90,9 @@ public: Word getFunc7() const { return func7_; } int getNRSrc() const { return num_rsrcs_; } int getRSrc(int i) const { return rsrc_[i]; } - int getRSType(int i) const { return rsrc_type_[i]; } + RegType getRSType(int i) const { return rsrc_type_[i]; } int getRDest() const { return 
rdest_; } - int getRDType() const { return rdest_type_; } + RegType getRDType() const { return rdest_type_; } bool hasImm() const { return has_imm_; } Word getImm() const { return imm_; } Word getVlsWidth() const { return vlsWidth_; } @@ -112,15 +113,15 @@ private: Opcode opcode_; int num_rsrcs_; bool has_imm_; - int rdest_type_; + RegType rdest_type_; Word imm_; - int rsrc_type_[MAX_REG_SOURCES]; + RegType rsrc_type_[MAX_REG_SOURCES]; int rsrc_[MAX_REG_SOURCES]; int rdest_; Word func3_; Word func6_; - //Vector + // Vector Word vmask_; Word vlsWidth_; Word vMop_; diff --git a/sim/simX/main.cpp b/sim/simX/main.cpp index a34ada0e..a0e07faf 100644 --- a/sim/simX/main.cpp +++ b/sim/simX/main.cpp @@ -6,12 +6,15 @@ #include #include #include "processor.h" +#include #include "args.h" +#define RAM_PAGE_SIZE 4096 + using namespace vortex; int main(int argc, char **argv) { - int ret; + int exitcode; std::string archStr("rv32imf"); std::string imgFileName; @@ -53,11 +56,42 @@ int main(int argc, char **argv) { { ArchDef arch(archStr, num_cores, num_warps, num_threads); + Processor processor(arch); - ret = processor.run(imgFileName, riscv_test, showStats); + + RAM ram(RAM_PAGE_SIZE); + + { + std::string program_ext(fileExtension(imgFileName.c_str())); + if (program_ext == "bin") { + ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR); + } else if (program_ext == "hex") { + ram.loadHexImage(imgFileName.c_str()); + } else { + std::cout << "*** error: only *.bin or *.hex images supported." << std::endl; + return -1; + } + } + + processor.attach_ram(&ram); + + exitcode = processor.run(); + + if (riscv_test) { + if (1 == exitcode) { + std::cout << "Passed." << std::endl; + exitcode = 0; + } else { + std::cout << "Failed." 
<< std::endl; + } + } else { + if (exitcode != 0) { + std::cout << "*** error: exitcode=" << exitcode << std::endl; + } + } } SimPlatform::instance().finalize(); - return ret; + return exitcode; } diff --git a/sim/simX/memsim.cpp b/sim/simX/memsim.cpp index c377972d..63ba571a 100644 --- a/sim/simX/memsim.cpp +++ b/sim/simX/memsim.cpp @@ -8,32 +8,26 @@ using namespace vortex; class MemSim::Impl { private: MemSim* simobject_; - std::vector> inputs_; + uint32_t num_banks_; uint32_t latency_; public: Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency) : simobject_(simobject) - , inputs_(num_banks) + , num_banks_(num_banks) , latency_(latency) {} - void handleMemRequest(const MemReq& mem_req, uint32_t port_id) { - inputs_.at(port_id).push(mem_req); - } - void step(uint64_t /*cycle*/) { - for (uint32_t i = 0, n = inputs_.size(); i < n; ++i) { - auto& queue = inputs_.at(i); - if (queue.empty()) + for (uint32_t i = 0, n = num_banks_; i < n; ++i) { + MemReq mem_req; + if (!simobject_->MemReqPorts.at(i).read(&mem_req)) continue; - auto& entry = queue.front(); - if (!entry.write) { + if (!mem_req.write) { MemRsp mem_rsp; - mem_rsp.tag = entry.tag; + mem_rsp.tag = mem_req.tag; simobject_->MemRspPorts.at(i).send(mem_rsp, latency_); } - queue.pop(); } } }; @@ -45,7 +39,7 @@ MemSim::MemSim(const SimContext& ctx, uint32_t latency) : SimObject(ctx, "MemSim") , impl_(new Impl(this, num_banks, latency)) - , MemReqPorts(num_banks, {this, impl_, &Impl::handleMemRequest}) + , MemReqPorts(num_banks, this) , MemRspPorts(num_banks, this) {} diff --git a/sim/simX/pipeline.h b/sim/simX/pipeline.h index 82735c2a..b5937b29 100644 --- a/sim/simX/pipeline.h +++ b/sim/simX/pipeline.h @@ -10,14 +10,19 @@ namespace vortex { struct pipeline_state_t { - //-- + //-- + uint64_t id; + + //-- + int cid; int wid; ThreadMask tmask; Word PC; //-- bool stall_warp; - int rdest_type; + bool wb; + RegType rdest_type; int rdest; RegMask used_iregs; RegMask used_fregs; @@ -30,10 +35,7 @@ struct 
pipeline_state_t { //-- union { struct { - uint8_t load : 1; - uint8_t store: 1; - uint8_t fence : 1; - uint8_t prefetch: 1; + LsuType type; } lsu; struct { AluType type; @@ -49,8 +51,37 @@ struct pipeline_state_t { // stats uint64_t icache_latency; uint64_t dcache_latency; + + void clear() { + cid = 0; + wid = 0; + tmask.reset(); + PC = 0; + stall_warp = false; + wb = false; + rdest = 0; + rdest_type = RegType::None; + used_iregs.reset(); + used_fregs.reset(); + used_vregs.reset(); + exe_type = ExeType::NOP; + mem_addrs.clear(); + icache_latency = 0; + dcache_latency = 0; + } }; +inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) { + os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC; + os << ", wb=" << state.wb; + if (state.wb) { + os << ", rd=" << state.rdest_type << std::dec << state.rdest; + } + os << ", ex=" << state.exe_type; + os << " (#" << std::dec << state.id << ")"; + return os; +} + class PipelineStage : public Queue { protected: const char* name_; @@ -62,15 +93,4 @@ public: {} }; -inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) { - os << "stall_warp=" << state.stall_warp; - os << ", wid=" << state.wid; - os << ", PC=" << std::hex << state.PC; - os << ", used_iregs=" << state.used_iregs; - os << ", used_fregs=" << state.used_fregs; - os << ", used_vregs=" << state.used_vregs; - os << std::endl; - return os; -} - } \ No newline at end of file diff --git a/sim/simX/processor.cpp b/sim/simX/processor.cpp new file mode 100644 index 00000000..be5cd4f4 --- /dev/null +++ b/sim/simX/processor.cpp @@ -0,0 +1,141 @@ +#include "processor.h" +#include "constants.h" + +using namespace vortex; + +Processor::Processor(const ArchDef& arch) + : cores_(arch.num_cores()) + , l2caches_(NUM_CLUSTERS) + , l2_mem_switches_(NUM_CLUSTERS) +{ + uint32_t num_cores = arch.num_cores(); + uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; + + // create cores + for (uint32_t i = 
0; i < num_cores; ++i) { + cores_.at(i) = Core::Create(arch, i); + } + + // connect memory sub-systen + memsim_ = MemSim::Create(1, MEM_LATENCY); + std::vector*> mem_req_ports(1); + std::vector*> mem_rsp_ports(1); + mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); + mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); + + if (L3_ENABLE) { + l3cache_ = Cache::Create("l3cache", CacheConfig{ + log2ceil(L3_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L3_NUM_BANKS, // number of banks + L3_NUM_PORTS, // number of ports + NUM_CLUSTERS, // request size + true, // write-throught + 0, // victim size + L3_MSHR_SIZE, // mshr + 2, // pipeline latency + } + ); + + mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort); + l3cache_->MemReqPort.bind(mem_req_ports.at(0)); + + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); + mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); + } + } else if (NUM_CLUSTERS > 1) { + l3_mem_switch_ = Switch::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS); + mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn); + l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0)); + + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); + mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); + } + } + + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + if (L2_ENABLE) { + auto& l2cache = l2caches_.at(i); + l2cache = Cache::Create("l2cache", CacheConfig{ + log2ceil(L2_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L2_NUM_BANKS, // number of banks + L2_NUM_PORTS, // number of ports + NUM_CORES, // request size + true, // write-throught + 0, // victim size + L2_MSHR_SIZE, // mshr + 2, // pipeline latency + }); + 
mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); + l2cache->MemReqPort.bind(mem_req_ports.at(i)); + + mem_req_ports.resize(cores_per_cluster); + mem_rsp_ports.resize(cores_per_cluster); + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); + mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); + } + } else if (cores_per_cluster > 1) { + auto& l2_mem_switch = l2_mem_switches_.at(i); + l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES); + mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); + l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); + + mem_req_ports.resize(cores_per_cluster); + mem_rsp_ports.resize(cores_per_cluster); + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); + mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); + } + } + + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + auto& core = cores_.at((i * NUM_CLUSTERS) + j); + mem_rsp_ports.at(i)->bind(&core->MemRspPort); + core->MemReqPort.bind(mem_req_ports.at(j)); + } + } +} + +void Processor::attach_ram(RAM* ram) { + for (auto core : cores_) { + core->attach_ram(ram); + } +} + +Processor::~Processor() {} + +int Processor::run() { + bool running; + int exitcode = 0; + do { + SimPlatform::instance().step(); + + running = false; + for (auto& core : cores_) { + if (core->running()) { + running = true; + } + if (core->check_ebreak()) { + exitcode = core->getIRegValue(3); + running = false; + break; + } + } + } while (running); + + return exitcode; +} \ No newline at end of file diff --git a/sim/simX/processor.h b/sim/simX/processor.h index 50671953..e41fd740 100644 --- a/sim/simX/processor.h +++ b/sim/simX/processor.h @@ -1,189 +1,27 @@ #pragma once -#include "constants.h" -#include "debug.h" -#include "types.h" #include "core.h" namespace vortex { class Processor { +public: + typedef std::shared_ptr Ptr; + + Processor(const ArchDef& arch); + ~Processor(); + + void 
attach_ram(RAM* mem); + + int run(); + private: - ArchDef arch_; - Decoder decoder_; - MemoryUnit mu_; - RAM ram_; std::vector cores_; std::vector l2caches_; std::vector::Ptr> l2_mem_switches_; Cache::Ptr l3cache_; Switch::Ptr l3_mem_switch_; MemSim::Ptr memsim_; - -public: - Processor(const ArchDef& arch) - : arch_(arch) - , decoder_(arch) - , mu_(0, arch.wsize(), true) - , ram_((1<<12), (1<<20)) - , cores_(arch.num_cores()) - , l2caches_(NUM_CLUSTERS) - , l2_mem_switches_(NUM_CLUSTERS) - { - uint32_t num_cores = arch.num_cores(); - uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; - - // bind RAM to memory unit - mu_.attach(ram_, 0, 0xFFFFFFFF); - - // create cores - for (uint32_t i = 0; i < num_cores; ++i) { - cores_.at(i) = Core::Create(arch, decoder_, mu_, i); - } - - // connect memory sub-systen - memsim_ = MemSim::Create(1, MEM_LATENCY); - std::vector*> mem_req_ports(1); - std::vector*> mem_rsp_ports(1); - mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); - mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); - - if (L3_ENABLE) { - l3cache_ = Cache::Create("l3cache", CacheConfig{ - log2ceil(L3_CACHE_SIZE), // C - log2ceil(MEM_BLOCK_SIZE), // B - 2, // W - 0, // A - 32, // address bits - L3_NUM_BANKS, // number of banks - L3_NUM_PORTS, // number of ports - NUM_CLUSTERS, // request size - true, // write-throught - 0, // victim size - L3_MSHR_SIZE, // mshr - 2, // pipeline latency - }); - mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort); - l3cache_->MemReqPort.bind(mem_req_ports.at(0)); - - mem_req_ports.resize(NUM_CLUSTERS); - mem_rsp_ports.resize(NUM_CLUSTERS); - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { - mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); - mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); - } - } else if (NUM_CLUSTERS > 1) { - l3_mem_switch_ = Switch::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS); - mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn); - l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0)); - - 
mem_req_ports.resize(NUM_CLUSTERS); - mem_rsp_ports.resize(NUM_CLUSTERS); - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { - mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); - mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); - } - } - - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { - if (L2_ENABLE) { - auto& l2cache = l2caches_.at(i); - l2cache = Cache::Create("l2cache", CacheConfig{ - log2ceil(L2_CACHE_SIZE), // C - log2ceil(MEM_BLOCK_SIZE), // B - 2, // W - 0, // A - 32, // address bits - L2_NUM_BANKS, // number of banks - L2_NUM_PORTS, // number of ports - NUM_CORES, // request size - true, // write-throught - 0, // victim size - L2_MSHR_SIZE, // mshr - 2, // pipeline latency - }); - mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); - l2cache->MemReqPort.bind(mem_req_ports.at(i)); - - mem_req_ports.resize(cores_per_cluster); - mem_rsp_ports.resize(cores_per_cluster); - for (uint32_t j = 0; j < cores_per_cluster; ++j) { - mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); - mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); - } - } else if (cores_per_cluster > 1) { - auto& l2_mem_switch = l2_mem_switches_.at(i); - l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES); - mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); - l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); - - mem_req_ports.resize(cores_per_cluster); - mem_rsp_ports.resize(cores_per_cluster); - for (uint32_t j = 0; j < cores_per_cluster; ++j) { - mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); - mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); - } - } - - for (uint32_t j = 0; j < cores_per_cluster; ++j) { - auto& core = cores_.at((i * NUM_CLUSTERS) + j); - mem_rsp_ports.at(i)->bind(&core->MemRspPort); - core->MemReqPort.bind(mem_req_ports.at(j)); - } - } - } - - ~Processor() {} - - int run(const std::string& program, bool riscv_test, bool /*showStats*/) { - { - std::string program_ext(fileExtension(program.c_str())); - if (program_ext == "bin") { - 
ram_.loadBinImage(program.c_str(), STARTUP_ADDR); - } else if (program_ext == "hex") { - ram_.loadHexImage(program.c_str()); - } else { - std::cout << "*** error: only *.bin or *.hex images supported." << std::endl; - return -1; - } - } - - bool running; - int exitcode = 0; - do { - SimPlatform::instance().step(); - - running = false; - for (auto& core : cores_) { - if (core->running()) { - running = true; - } - if (core->check_ebreak()) { - exitcode = core->getIRegValue(3); - running = false; - break; - } - } - } while (running); - - // get error status - - if (riscv_test) { - if (1 == exitcode) { - std::cout << "Passed." << std::endl; - exitcode = 0; - } else { - std::cout << "Failed." << std::endl; - } - } else { - if (exitcode != 0) { - std::cout << "*** error: exitcode=" << exitcode << std::endl; - } - } - - return exitcode; - } - }; } \ No newline at end of file diff --git a/sim/simX/scoreboard.h b/sim/simX/scoreboard.h index 0e0e0577..46bf3bdc 100644 --- a/sim/simX/scoreboard.h +++ b/sim/simX/scoreboard.h @@ -10,6 +10,7 @@ private: std::vector in_use_iregs_; std::vector in_use_fregs_; std::vector in_use_vregs_; + std::unordered_map owners_; public: Scoreboard(const ArchDef &arch) @@ -29,42 +30,87 @@ public: || (state.used_fregs & in_use_fregs_.at(state.wid)) != 0 || (state.used_vregs & in_use_vregs_.at(state.wid)) != 0; } + + std::vector owners(const pipeline_state_t& state) const { + std::vector out; + { + uint32_t r = 0; + auto used_iregs = state.used_iregs & in_use_iregs_.at(state.wid); + while (used_iregs.any()) { + if (used_iregs.test(0)) { + uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Integer; + out.push_back(owners_.at(tag)); + } + used_iregs >>= 1; + ++r; + } + } + { + uint32_t r = 0; + auto used_fregs = state.used_fregs & in_use_fregs_.at(state.wid); + while (used_fregs.any()) { + if (used_fregs.test(0)) { + uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Float; + out.push_back(owners_.at(tag)); + } + used_fregs >>= 1; + 
++r; + } + } + { + uint32_t r = 0; + auto used_vregs = state.used_vregs & in_use_vregs_.at(state.wid); + while (used_vregs.any()) { + if (used_vregs.test(0)) { + uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Vector; + out.push_back(owners_.at(tag)); + } + used_vregs >>= 1; + ++r; + } + } + return std::move(out); + } void reserve(const pipeline_state_t& state) { - if (!state.rdest) - return; - + if (!state.wb) + return; switch (state.rdest_type) { - case 1: + case RegType::Integer: in_use_iregs_.at(state.wid).set(state.rdest); break; - case 2: + case RegType::Float: in_use_fregs_.at(state.wid).set(state.rdest); break; - case 3: + case RegType::Vector: in_use_vregs_.at(state.wid).set(state.rdest); break; default: break; - } + } + uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type; + assert(owners_.count(tag) == 0); + owners_[tag] = state.id; } void release(const pipeline_state_t& state) { - if (!state.rdest) - return; + if (!state.wb) + return; switch (state.rdest_type) { - case 1: + case RegType::Integer: in_use_iregs_.at(state.wid).reset(state.rdest); break; - case 2: + case RegType::Float: in_use_fregs_.at(state.wid).reset(state.rdest); break; - case 3: + case RegType::Vector: in_use_vregs_.at(state.wid).reset(state.rdest); break; default: break; } + uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type; + owners_.erase(tag); } }; diff --git a/sim/simX/types.h b/sim/simX/types.h index 3dabfe3e..f53c3754 100644 --- a/sim/simX/types.h +++ b/sim/simX/types.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -20,7 +21,25 @@ typedef std::bitset<32> RegMask; typedef std::bitset<32> ThreadMask; typedef std::bitset<32> WarpMask; +enum class RegType { + None, + Integer, + Float, + Vector +}; + +inline std::ostream &operator<<(std::ostream &os, const RegType& type) { + switch (type) { + case RegType::None: break; + case RegType::Integer: os << "r"; break; + case RegType::Float: os << 
"fr"; break; + case RegType::Vector: os << "vr"; break; + } + return os; +} + enum class ExeType { + NOP, ALU, LSU, CSR, @@ -29,6 +48,19 @@ enum class ExeType { MAX, }; +inline std::ostream &operator<<(std::ostream &os, const ExeType& type) { + switch (type) { + case ExeType::NOP: os << "NOP"; break; + case ExeType::ALU: os << "ALU"; break; + case ExeType::LSU: os << "LSU"; break; + case ExeType::CSR: os << "CSR"; break; + case ExeType::FPU: os << "FPU"; break; + case ExeType::GPU: os << "GPU"; break; + case ExeType::MAX: break; + } + return os; +} + enum class AluType { ARITH, BRANCH, @@ -36,6 +68,33 @@ enum class AluType { IDIV, }; +inline std::ostream &operator<<(std::ostream &os, const AluType& type) { + switch (type) { + case AluType::ARITH: os << "ARITH"; break; + case AluType::BRANCH: os << "BRANCH"; break; + case AluType::IMUL: os << "IMUL"; break; + case AluType::IDIV: os << "IDIV"; break; + } + return os; +} + +enum class LsuType { + LOAD, + STORE, + FENCE, + PREFETCH, +}; + +inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { + switch (type) { + case LsuType::LOAD: os << "LOAD"; break; + case LsuType::STORE: os << "STORE"; break; + case LsuType::FENCE: os << "FENCE"; break; + case LsuType::PREFETCH: os << "PREFETCH"; break; + } + return os; +} + enum class FpuType { FNCP, FMA, @@ -44,6 +103,17 @@ enum class FpuType { FCVT, }; +inline std::ostream &operator<<(std::ostream &os, const FpuType& type) { + switch (type) { + case FpuType::FNCP: os << "FNCP"; break; + case FpuType::FMA: os << "FMA"; break; + case FpuType::FDIV: os << "FDIV"; break; + case FpuType::FSQRT: os << "FSQRT"; break; + case FpuType::FCVT: os << "FCVT"; break; + } + return os; +} + enum class GpuType { TMC, WSPAWN, @@ -53,11 +123,31 @@ enum class GpuType { TEX, }; +inline std::ostream &operator<<(std::ostream &os, const GpuType& type) { + switch (type) { + case GpuType::TMC: os << "TMC"; break; + case GpuType::WSPAWN: os << "WSPAWN"; break; + case GpuType::SPLIT: os 
<< "SPLIT"; break; + case GpuType::JOIN: os << "JOIN"; break; + case GpuType::BAR: os << "BAR"; break; + case GpuType::TEX: os << "TEX"; break; + } + return os; +} + enum class ArbiterType { Priority, RoundRobin }; +inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) { + switch (type) { + case ArbiterType::Priority: os << "Priority"; break; + case ArbiterType::RoundRobin: os << "RoundRobin"; break; + } + return os; +} + /////////////////////////////////////////////////////////////////////////////// template @@ -65,6 +155,8 @@ class Queue { protected: std::queue queue_; + uint32_t count; + public: Queue() {} @@ -77,6 +169,7 @@ public: } void push(const T& value) { + ++count; queue_.push(value); } @@ -141,6 +234,7 @@ public: return i; } } + assert(false); return -1; } @@ -148,6 +242,7 @@ public: auto& entry = entries_.at(index); assert(entry.first); entry.first = false; + --capacity_; } void remove(uint32_t index, T* value) { @@ -155,6 +250,7 @@ public: assert(entry.first); *value = entry.second; entry.first = false; + --capacity_; } }; @@ -163,29 +259,21 @@ public: template class Switch : public SimObject> { private: - struct req_t { + struct req_batch_t { std::vector data; std::bitset valid; - req_t() {} - req_t(uint32_t size) : data(size) {} + req_batch_t() {} + req_batch_t(uint32_t size) + : data(size) + , valid(0) + {} }; - void handleIncomingRequest(const Req& req, uint32_t port_id) { - cur_req_.data.at(port_id) = req; - cur_req_.valid.set(port_id); - } - - void handleIncomingResponse(const Rsp& rsp, uint32_t) { - rsps_.push(rsp); - } - ArbiterType type_; - std::queue reqs_; - std::queue rsps_; - req_t cur_req_; + std::queue reqq_; uint32_t delay_; uint32_t cursor_; - std::unordered_map addr_table_; + uint32_t tag_shift_; public: Switch( @@ -197,12 +285,12 @@ public: ) : SimObject>(ctx, name) , type_(type) - , cur_req_(num_inputs) , delay_(delay) , cursor_(0) - , ReqIn(num_inputs, {this, this, &Switch::handleIncomingRequest}) + , 
tag_shift_(log2ceil(num_inputs)) + , ReqIn(num_inputs, this) , ReqOut(this) - , RspIn(this, this, &Switch::handleIncomingResponse) + , RspIn(this) , RspOut(num_inputs, this) { assert(delay_ != 0); @@ -210,36 +298,52 @@ public: } void step(uint64_t /*cycle*/) { - if (cur_req_.valid.any()) { - reqs_.push(cur_req_); - cur_req_.valid.reset(); - } - - while (!reqs_.empty()) { - auto& entry = reqs_.front(); - bool found = false; - for (uint32_t i = 0, n = entry.data.size(); i < n; ++i) { - auto j = (cursor_ + i) % n; - if (entry.valid.test(j)) { - auto& req = entry.data.at(j); - addr_table_[req.tag] = j; - ReqOut.send(req, delay_); - entry.valid.reset(j); - this->update_cursor(j); - found = true; - break; + // process incomming requests + { + req_batch_t req_batch(ReqIn.size()); + for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) { + Req req; + if (ReqIn.at(i).read(&req)) { + req_batch.data.at(i) = req; + req_batch.valid.set(i); } } - if (found) - break; - reqs_.pop(); + if (req_batch.valid.any()) { + reqq_.push(req_batch); + } + } + + // apply arbitration + if (!reqq_.empty()) { + auto& req_batch = reqq_.front(); + for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) { + auto j = (cursor_ + i) % n; + if (req_batch.valid.test(j)) { + auto& req = req_batch.data.at(j); + if (tag_shift_) { + req.tag = (req.tag << tag_shift_) | j; + } + ReqOut.send(req, delay_); + req_batch.valid.reset(j); + this->update_cursor(j); + if (!req_batch.valid.any()) + reqq_.pop(); // pop when empty + break; + } + } } - if (!rsps_.empty()) { - auto& rsp = rsps_.front(); - auto port_id = addr_table_.at(rsp.tag); - RspOut.at(port_id).send(rsp, 1); - rsps_.pop(); + // process incoming reponses + { + Rsp rsp; + if (RspIn.read(&rsp)) { + uint32_t port_id = 0; + if (tag_shift_) { + port_id = rsp.tag & ((1 << tag_shift_)-1); + rsp.tag >>= tag_shift_; + } + RspOut.at(port_id).send(rsp, 1); + } } } diff --git a/sim/simX/warp.cpp b/sim/simX/warp.cpp index 0c989d0c..89b9cc39 100644 --- 
a/sim/simX/warp.cpp +++ b/sim/simX/warp.cpp @@ -24,30 +24,34 @@ Warp::Warp(Core *core, Word id) void Warp::eval(pipeline_state_t *pipeline_state) { assert(tmask_.any()); - DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask="); + DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask="); for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) DPN(2, tmask_.test(n-i-1)); - DPN(2, "\n"); + DPN(2, ", PC=0x" << std::hex << PC_ << std::endl); /* Fetch and decode. */ - Word fetched = core_->icache_fetch(PC_); - auto instr = core_->decoder().decode(fetched, PC_); + Word instr_code = core_->icache_read(PC_, sizeof(Word)); + auto instr = core_->decoder().decode(instr_code); + if (!instr) { + std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl; + std::abort(); + } + + DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr); // Update state + pipeline_state->cid = core_->id(); pipeline_state->wid = id_; pipeline_state->PC = PC_; pipeline_state->tmask = tmask_; pipeline_state->rdest = instr->getRDest(); pipeline_state->rdest_type = instr->getRDType(); - pipeline_state->used_iregs.reset(); - pipeline_state->used_fregs.reset(); - pipeline_state->used_vregs.reset(); - + // Execute this->execute(*instr, pipeline_state); - D(4, "Register state:"); + DP(4, "Register state:"); for (int i = 0; i < core_->arch().num_regs(); ++i) { DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); for (int j = 0; j < core_->arch().num_threads(); ++j) { diff --git a/sim/vlsim/opae_sim.cpp b/sim/vlsim/opae_sim.cpp index ced1e233..5da617b5 100644 --- a/sim/vlsim/opae_sim.cpp +++ b/sim/vlsim/opae_sim.cpp @@ -44,6 +44,8 @@ #define VERILATOR_RESET_VALUE 2 #endif +#define RAM_PAGE_SIZE 4096 + using namespace vortex; static uint64_t timestamp = 0; @@ -136,7 +138,7 @@ opae_sim::opae_sim() : stop_(false) , host_buffer_ids_(0) { vl_obj_ = new VL_OBJ(); - ram_ = new RAM((1<<12), (1<<20)); + 
ram_ = new RAM(RAM_PAGE_SIZE); // reset the device this->reset(); From 27a65fdee78febacf5e51deee36ed3260855c9f3 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 14 Nov 2021 09:05:15 -0500 Subject: [PATCH 05/27] driver refactoring --- driver/common/opae.cpp | 47 +++++------- driver/common/vx_scope.h | 2 +- driver/common/vx_utils.cpp | 22 ++++-- driver/common/vx_utils.h | 11 +++ driver/include/vortex.h | 19 +++-- driver/rtlsim/Makefile | 2 +- driver/rtlsim/vortex.cpp | 56 +++++++------- driver/simx/Makefile | 2 +- driver/simx/vortex.cpp | 109 ++++++++++------------------ driver/stub/vortex.cpp | 12 +-- sim/common/mem.cpp | 53 ++++++++------ sim/common/mem.h | 12 +-- sim/common/simobject.h | 4 + sim/common/util.h | 5 -- sim/rtlsim/main.cpp | 4 +- sim/rtlsim/simulator.cpp | 2 +- tests/regression/basic/main.cpp | 4 +- tests/regression/demo/main.cpp | 4 +- tests/regression/diverge/main.cpp | 2 +- tests/regression/dogfood/main.cpp | 4 +- tests/regression/fence/main.cpp | 4 +- tests/regression/io_addr/main.cpp | 2 +- tests/regression/mstress/main.cpp | 4 +- tests/regression/no_mf_ext/main.cpp | 2 +- tests/regression/no_smem/main.cpp | 2 +- tests/regression/printf/main.cpp | 4 +- tests/regression/tex/main.cpp | 4 +- 27 files changed, 200 insertions(+), 198 deletions(-) create mode 100644 driver/common/vx_utils.h diff --git a/driver/common/opae.cpp b/driver/common/opae.cpp index aa4bf933..e0f9ad09 100755 --- a/driver/common/opae.cpp +++ b/driver/common/opae.cpp @@ -17,6 +17,7 @@ #include #endif +#include "vx_utils.h" #include #include #include "vortex_afu.h" @@ -52,7 +53,7 @@ typedef struct vx_device_ { fpga_handle fpga; - size_t mem_allocation; + uint64_t mem_allocation; unsigned version; unsigned num_cores; unsigned num_warps; @@ -64,19 +65,9 @@ typedef struct vx_buffer_ { void* host_ptr; uint64_t io_addr; vx_device_h hdevice; - size_t size; + uint64_t size; } vx_buffer_t; -inline size_t aligned_size(size_t size, size_t alignment) { - assert(0 == (alignment & 
(alignment - 1))); - return (size + alignment - 1) & ~(alignment - 1); -} - -inline bool is_aligned(size_t addr, size_t alignment) { - assert(0 == (alignment & (alignment - 1))); - return 0 == (addr & (alignment - 1)); -} - /////////////////////////////////////////////////////////////////////////////// #ifdef DUMP_PERF_STATS @@ -107,7 +98,7 @@ AutoPerfDump gAutoPerfDump; /////////////////////////////////////////////////////////////////////////////// -extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { +extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; @@ -279,7 +270,7 @@ extern int vx_dev_close(vx_device_h hdevice) { return 0; } -extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { +extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr || 0 >= size) @@ -299,7 +290,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) return 0; } -extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) { +extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) { fpga_result res; void* host_ptr; uint64_t wsid; @@ -367,7 +358,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) { return 0; } -extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { +extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) return -1; @@ -386,7 +377,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { #endif // to milliseconds - long long sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000); + uint64_t sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000); for (;;) { uint64_t status; @@ -430,7 +421,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { return 0; } -extern int 
vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) { +extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -438,8 +429,8 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer); vx_device_t *device = ((vx_device_t*)buffer->hdevice); - size_t dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); // check alignment if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE)) @@ -454,7 +445,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si return -1; // Ensure ready for new command - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE); @@ -465,13 +456,13 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE)); // Wait for the write operation to finish - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; return 0; } -extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) { +extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -479,8 +470,8 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer); vx_device_t *device = ((vx_device_t*)buffer->hdevice); - size_t dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, 
CACHE_BLOCK_SIZE); // check alignment if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE)) @@ -495,7 +486,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, return -1; // Ensure ready for new command - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE); @@ -506,7 +497,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ)); // Wait for the write operation to finish - if (vx_ready_wait(buffer->hdevice, -1) != 0) + if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0) return -1; return 0; @@ -519,7 +510,7 @@ extern int vx_start(vx_device_h hdevice) { vx_device_t *device = ((vx_device_t*)hdevice); // Ensure ready for new command - if (vx_ready_wait(hdevice, -1) != 0) + if (vx_ready_wait(hdevice, MAX_TIMEOUT) != 0) return -1; // start execution diff --git a/driver/common/vx_scope.h b/driver/common/vx_scope.h index dfc53520..0e2ae081 100644 --- a/driver/common/vx_scope.h +++ b/driver/common/vx_scope.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #ifdef USE_VLSIM #include diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index 79853aa1..5b70e09b 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -1,17 +1,29 @@ +#include "vx_utils.h" #include #include #include #include #include +#include -extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size) { +uint64_t aligned_size(uint64_t size, uint64_t alignment) { + assert(0 == (alignment & (alignment - 1))); + return (size + alignment - 1) & ~(alignment - 1); +} + +bool is_aligned(uint64_t addr, uint64_t alignment) { + assert(0 == (alignment & (alignment - 1))); + return 0 == (addr & (alignment - 1)); +} + +extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t size) { int err = 0; if 
(NULL == content || 0 == size) return -1; uint32_t buffer_transfer_size = 65536; - unsigned kernel_base_addr; + uint64_t kernel_base_addr; err = vx_dev_caps(device, VX_CAPS_KERNEL_BASE_ADDR, &kernel_base_addr); if (err != 0) return -1; @@ -29,9 +41,9 @@ extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_ // upload content // - size_t offset = 0; + uint64_t offset = 0; while (offset < size) { - auto chunk_size = std::min(buffer_transfer_size, size - offset); + auto chunk_size = std::min(buffer_transfer_size, size - offset); std::memcpy(buf_ptr, (uint8_t*)content + offset, chunk_size); /*printf("*** Upload Kernel to 0x%0x: data=", kernel_base_addr + offset); @@ -127,7 +139,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t mem_lat = 0; #endif - unsigned num_cores; + uint64_t num_cores; ret = vx_dev_caps(device, VX_CAPS_MAX_CORES, &num_cores); if (ret != 0) return ret; diff --git a/driver/common/vx_utils.h b/driver/common/vx_utils.h new file mode 100644 index 00000000..b86c75af --- /dev/null +++ b/driver/common/vx_utils.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +uint64_t aligned_size(uint64_t size, uint64_t alignment); + +bool is_aligned(uint64_t addr, uint64_t alignment); + +#define CACHE_BLOCK_SIZE 64 +#define ALLOC_BASE_ADDR 0x00000000 +#define LOCAL_MEM_SIZE 4294967296 // 4 GB \ No newline at end of file diff --git a/driver/include/vortex.h b/driver/include/vortex.h index 05648671..0fc9c5ce 100644 --- a/driver/include/vortex.h +++ b/driver/include/vortex.h @@ -2,6 +2,7 @@ #define __VX_DRIVER_H__ #include +#include #include #ifdef __cplusplus @@ -22,9 +23,7 @@ typedef void* vx_buffer_h; #define VX_CAPS_ALLOC_BASE_ADDR 0x6 #define VX_CAPS_KERNEL_BASE_ADDR 0x7 -#define CACHE_BLOCK_SIZE 64 -#define ALLOC_BASE_ADDR 0x00000000 -#define LOCAL_MEM_SIZE 0xffffffff +#define MAX_TIMEOUT (60*60*1000) // 1hr // open the device and connect to it int vx_dev_open(vx_device_h* hdevice); @@ -33,10 +32,10 @@ int 
vx_dev_open(vx_device_h* hdevice); int vx_dev_close(vx_device_h hdevice); // return device configurations -int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value); +int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value); // Allocate shared buffer with device -int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer); +int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer); // Get host pointer address void* vx_host_ptr(vx_buffer_h hbuffer); @@ -45,24 +44,24 @@ void* vx_host_ptr(vx_buffer_h hbuffer); int vx_buf_release(vx_buffer_h hbuffer); // allocate device memory and return address -int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr); +int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr); // Copy bytes from buffer to device local memory -int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset); +int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset); // Copy bytes from device local memory to buffer -int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dst_offset); +int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dst_offset); // Start device execution int vx_start(vx_device_h hdevice); // Wait for device ready with milliseconds timeout -int vx_ready_wait(vx_device_h hdevice, long long timeout); +int vx_ready_wait(vx_device_h hdevice, uint64_t timeout); ////////////////////////////// UTILITY FUNCIONS /////////////////////////////// // upload kernel bytes to device -int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size); +int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t size); // upload kernel file to device int vx_upload_kernel_file(vx_device_h device, const char* filename); diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 
cf0a184d..4626eeb3 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -3,7 +3,7 @@ RTLSIM_DIR = ../../sim/rtlsim CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors #CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors -CXXFLAGS += -I../include -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common +CXXFLAGS += -I../include -I../common -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common LDFLAGS += $(RTLSIM_DIR)/librtlsim.a diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index cfed5a97..bed5c807 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -7,11 +7,14 @@ #include #include +#include #include #include #include #include +#define RAM_PAGE_SIZE 4096 + using namespace vortex; /////////////////////////////////////////////////////////////////////////////// @@ -19,7 +22,7 @@ using namespace vortex; class vx_device; class vx_buffer { public: - vx_buffer(size_t size, vx_device* device) + vx_buffer(uint64_t size, vx_device* device) : size_(size) , device_(device) { auto aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); @@ -36,7 +39,7 @@ public: return data_; } - size_t size() const { + uint64_t size() const { return size_; } @@ -45,7 +48,7 @@ public: } private: - size_t size_; + uint64_t size_; vx_device* device_; void* data_; }; @@ -54,9 +57,10 @@ private: class vx_device { public: - vx_device() : ram_((1<<12), (1<<20)) { - mem_allocation_ = ALLOC_BASE_ADDR; - } + vx_device() + : ram_(RAM_PAGE_SIZE) + , mem_allocation_(ALLOC_BASE_ADDR) + {} ~vx_device() { if (future_.valid()) { @@ -64,9 +68,9 @@ public: } } - int alloc_local_mem(size_t size, size_t* dev_maddr) { - auto dev_mem_size = LOCAL_MEM_SIZE; - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) { + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = 
mem_allocation_; @@ -74,9 +78,9 @@ public: return 0; } - int upload(const void* src, size_t dest_addr, size_t size, size_t src_offset) { - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); - if (dest_addr + asize > ram_.size()) + int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (dest_addr + asize > LOCAL_MEM_SIZE) return -1; /*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src + src_offset)); @@ -92,9 +96,9 @@ public: return 0; } - int download(void* dest, size_t src_addr, size_t size, size_t dest_offset) { - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); - if (src_addr + asize > ram_.size()) + int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (src_addr + asize > LOCAL_MEM_SIZE) return -1; ram_.read((uint8_t*)dest + dest_offset, src_addr, asize); @@ -125,10 +129,10 @@ public: return 0; } - int wait(long long timeout) { + int wait(uint64_t timeout) { if (!future_.valid()) return 0; - auto timeout_sec = (timeout < 0) ? 
timeout : (timeout / 1000); + uint64_t timeout_sec = timeout / 1000; std::chrono::seconds wait_time(1); for (;;) { auto status = future_.wait_for(wait_time); // wait for 1 sec and check status @@ -141,9 +145,9 @@ public: private: - size_t mem_allocation_; RAM ram_; Simulator simulator_; + uint64_t mem_allocation_; std::future future_; }; @@ -177,7 +181,7 @@ AutoPerfDump gAutoPerfDump; /////////////////////////////////////////////////////////////////////////////// -extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { +extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; @@ -198,10 +202,10 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { *value = CACHE_BLOCK_SIZE; break; case VX_CAPS_LOCAL_MEM_SIZE: - *value = 0xffffffff; + *value = LOCAL_MEM_SIZE; break; case VX_CAPS_ALLOC_BASE_ADDR: - *value = 0x10000000; + *value = ALLOC_BASE_ADDR; break; case VX_CAPS_KERNEL_BASE_ADDR: *value = STARTUP_ADDR; @@ -244,7 +248,7 @@ extern int vx_dev_close(vx_device_h hdevice) { return 0; } -extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { +extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr || 0 >= size) @@ -255,7 +259,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) } -extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) { +extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) { if (nullptr == hdevice || 0 >= size || nullptr == hbuffer) @@ -294,7 +298,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) { return 0; } -extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) { +extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) { if (nullptr == hbuffer 
|| 0 >= size) return -1; @@ -307,7 +311,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset); } -extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) { +extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -329,7 +333,7 @@ extern int vx_start(vx_device_h hdevice) { return device->start(); } -extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { +extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) return -1; diff --git a/driver/simx/Makefile b/driver/simx/Makefile index 82bf6e32..dea65c35 100644 --- a/driver/simx/Makefile +++ b/driver/simx/Makefile @@ -4,7 +4,7 @@ CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors CXXFLAGS += -fPIC -Wno-maybe-uninitialized -CXXFLAGS += -I../include -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common +CXXFLAGS += -I../include -I../common -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common CXXFLAGS += $(CONFIGS) CXXFLAGS += -DDUMP_PERF_STATS diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 5c31cb87..1bd15e07 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -8,11 +8,12 @@ #include #include -#include +#include +#include #include #include -#define PAGE_SIZE 4096 +#define RAM_PAGE_SIZE 4096 using namespace vortex; @@ -22,10 +23,10 @@ class vx_device; class vx_buffer { public: - vx_buffer(size_t size, vx_device* device) + vx_buffer(uint64_t size, vx_device* device) : size_(size) , device_(device) { - auto aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); + uint64_t aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); data_ = malloc(aligned_asize); } @@ -39,7 +40,7 @@ public: return data_; } - size_t size() const { + uint64_t size() 
const { return size_; } @@ -48,7 +49,7 @@ public: } private: - size_t size_; + uint64_t size_; vx_device* device_; void* data_; }; @@ -59,32 +60,23 @@ class vx_device { public: vx_device() : arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS) - , decoder_(arch_) - , mmu_(PAGE_SIZE, arch_.wsize(), true) - , cores_(arch_.num_cores()) , is_done_(false) , is_running_(false) + , mem_allocation_(ALLOC_BASE_ADDR) , thread_(__thread_proc__, this) - , ram_((1<<12), (1<<20)) { - - mem_allocation_ = ALLOC_BASE_ADDR; - mmu_.attach(ram_, 0, 0xffffffff); - for (int i = 0; i < arch_.num_cores(); ++i) { - cores_.at(i) = std::make_shared(arch_, decoder_, mmu_, i); - } - } + , ram_(RAM_PAGE_SIZE) + {} ~vx_device() { mutex_.lock(); is_done_ = true; - mutex_.unlock(); - + mutex_.unlock(); thread_.join(); } - int alloc_local_mem(size_t size, size_t* dev_maddr) { - auto dev_mem_size = LOCAL_MEM_SIZE; - auto asize = aligned_size(size, CACHE_BLOCK_SIZE); + int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) { + uint64_t dev_mem_size = LOCAL_MEM_SIZE; + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = mem_allocation_; @@ -92,9 +84,9 @@ public: return 0; } - int upload(const void* src, size_t dest_addr, size_t size, size_t src_offset) { - auto asize = aligned_size(size, CACHE_BLOCK_SIZE); - if (dest_addr + asize > ram_.size()) + int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + if (dest_addr + asize > LOCAL_MEM_SIZE) return -1; ram_.write((const uint8_t*)src + src_offset, dest_addr, asize); @@ -107,9 +99,9 @@ public: return 0; } - int download(void* dest, size_t src_addr, size_t size, size_t dest_offset) { - size_t asize = aligned_size(size, CACHE_BLOCK_SIZE); - if (src_addr + asize > ram_.size()) + int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) { + uint64_t asize = aligned_size(size, 
CACHE_BLOCK_SIZE); + if (src_addr + asize > LOCAL_MEM_SIZE) return -1; ram_.read((uint8_t*)dest + dest_offset, src_addr, asize); @@ -123,19 +115,17 @@ public: } int start() { - mutex_.lock(); - for (int i = 0; i < arch_.num_cores(); ++i) { - cores_.at(i)->clear(); - } + SimPlatform::instance().flush(); + processor_ = std::make_shared(arch_); + processor_->attach_ram(&ram_); is_running_ = true; mutex_.unlock(); - return 0; } - int wait(long long timeout) { - auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000); + int wait(uint64_t timeout) { + uint64_t timeout_sec = timeout / 1000; for (;;) { mutex_.lock(); bool is_running = is_running_; @@ -147,32 +137,10 @@ public: std::this_thread::sleep_for(std::chrono::seconds(1)); } return 0; - } - - int get_csr(int core_id, int addr, unsigned *value) { - *value = cores_.at(core_id)->get_csr(addr, 0, 0); - return 0; - } - - int set_csr(int core_id, int addr, unsigned value) { - cores_.at(core_id)->set_csr(addr, value, 0, 0); - return 0; - } + } private: - void run() { - bool running; - do { - running = false; - for (auto& core : cores_) { - core->step(); - if (core->running()) - running = true; - } - } while (running); - } - void thread_proc() { std::cout << "Device ready..." << std::flush << std::endl; @@ -188,7 +156,7 @@ private: if (is_running) { std::cout << "Device running..." 
<< std::flush << std::endl; - this->run(); + processor_->run(); mutex_.lock(); is_running_ = false; @@ -206,12 +174,10 @@ private: } ArchDef arch_; - Decoder decoder_; - MemoryUnit mmu_; - std::vector> cores_; + Processor::Ptr processor_; bool is_done_; bool is_running_; - size_t mem_allocation_; + uint64_t mem_allocation_; std::thread thread_; RAM ram_; std::mutex mutex_; @@ -251,6 +217,9 @@ extern int vx_dev_open(vx_device_h* hdevice) { if (nullptr == hdevice) return -1; + if (!SimPlatform::instance().initialize()) + return -1; + *hdevice = new vx_device(); #ifdef DUMP_PERF_STATS @@ -273,10 +242,12 @@ extern int vx_dev_close(vx_device_h hdevice) { delete device; + SimPlatform::instance().finalize(); + return 0; } -extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { +extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; @@ -314,7 +285,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { return 0; } -extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { +extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr || 0 >= size) @@ -324,7 +295,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) return device->alloc_local_mem(size, dev_maddr); } -extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) { +extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) { if (nullptr == hdevice || 0 >= size || nullptr == hbuffer) @@ -363,7 +334,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) { return 0; } -extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) { +extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) { if (nullptr == hbuffer || 0 >= size) return -1; 
@@ -376,7 +347,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset); } -extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) { +extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) { if (nullptr == hbuffer || 0 >= size) return -1; @@ -398,7 +369,7 @@ extern int vx_start(vx_device_h hdevice) { return device->start(); } -extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { +extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) return -1; diff --git a/driver/stub/vortex.cpp b/driver/stub/vortex.cpp index f5079500..95777257 100644 --- a/driver/stub/vortex.cpp +++ b/driver/stub/vortex.cpp @@ -8,15 +8,15 @@ extern int vx_dev_close(vx_device_h /*hdevice*/) { return -1; } -extern int vx_dev_caps(vx_device_h /*hdevice*/, unsigned /*caps_id*/, unsigned* /*value*/) { +extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t* /*value*/) { return -1; } -extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, size_t /*size*/, size_t* /*dev_maddr*/) { +extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, uint64_t /*size*/, uint64_t* /*dev_maddr*/) { return -1; } -extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, size_t /*size*/, vx_buffer_h* /*hbuffer*/) { +extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, uint64_t /*size*/, vx_buffer_h* /*hbuffer*/) { return -1; } @@ -28,11 +28,11 @@ extern int vx_buf_release(vx_buffer_h /*hbuffer*/) { return -1; } -extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*src_offset*/) { +extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, uint64_t /*dev_maddr*/, uint64_t /*size*/, uint64_t /*src_offset*/) { return -1; } -extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t 
/*dest_offset*/) { +extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, uint64_t /*dev_maddr*/, uint64_t /*size*/, uint64_t /*dest_offset*/) { return -1; } @@ -40,6 +40,6 @@ extern int vx_start(vx_device_h /*hdevice*/) { return -1; } -extern int vx_ready_wait(vx_device_h /*hdevice*/, long long /*timeout*/) { +extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) { return -1; } \ No newline at end of file diff --git a/sim/common/mem.cpp b/sim/common/mem.cpp index 6c4b94de..ff67489d 100644 --- a/sim/common/mem.cpp +++ b/sim/common/mem.cpp @@ -168,11 +168,12 @@ void MemoryUnit::tlbRm(uint64_t va) { /////////////////////////////////////////////////////////////////////////////// -RAM::RAM(uint32_t num_pages, uint32_t page_size) - : page_bits_(log2ceil(page_size)) { - assert(ispow2(page_size)); - mem_.resize(num_pages, NULL); - size_ = uint64_t(mem_.size()) << page_bits_; +RAM::RAM(uint32_t page_size) + : size_(0) + , page_bits_(log2ceil(page_size)) + , last_page_(nullptr) + , last_page_index_(0) { + assert(ispow2(page_size)); } RAM::~RAM() { @@ -180,31 +181,41 @@ RAM::~RAM() { } void RAM::clear() { - for (auto& page : mem_) { - delete[] page; - page = NULL; + for (auto& page : pages_) { + delete[] page.second; } } uint64_t RAM::size() const { - return size_; + return uint64_t(pages_.size()) << page_bits_; } -uint8_t *RAM::get(uint32_t address) const { - uint32_t page_size = 1 << page_bits_; - uint32_t page_index = address >> page_bits_; - uint32_t byte_offset = address & ((1 << page_bits_) - 1); +uint8_t *RAM::get(uint64_t address) const { + uint32_t page_size = 1 << page_bits_; + uint32_t page_offset = address & (page_size - 1); + uint64_t page_index = address >> page_bits_; - auto &page = mem_.at(page_index); - if (page == NULL) { - uint8_t *ptr = new uint8_t[page_size]; - // set uninitialized data to "baadf00d" - for (uint32_t i = 0; i < page_size; ++i) { - ptr[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff; + uint8_t* page; + if (last_page_ && 
last_page_index_ == page_index) { + page = last_page_; + } else { + auto it = pages_.find(page_index); + if (it != pages_.end()) { + page = it->second; + } else { + uint8_t *ptr = new uint8_t[page_size]; + // set uninitialized data to "baadf00d" + for (uint32_t i = 0; i < page_size; ++i) { + ptr[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff; + } + pages_.emplace(page_index, ptr); + page = ptr; } - page = ptr; + last_page_ = page; + last_page_index_ = page_index; } - return page + byte_offset; + + return page + page_offset; } void RAM::read(void *data, uint64_t addr, uint64_t size) { diff --git a/sim/common/mem.h b/sim/common/mem.h index 8929b4d9..d404602d 100644 --- a/sim/common/mem.h +++ b/sim/common/mem.h @@ -130,13 +130,13 @@ private: class RAM : public MemDevice { public: - RAM(uint32_t num_pages, uint32_t page_size); - + RAM(uint32_t page_size); ~RAM(); void clear(); uint64_t size() const override; + void read(void *data, uint64_t addr, uint64_t size) override; void write(const void *data, uint64_t addr, uint64_t size) override; @@ -153,11 +153,13 @@ public: private: - uint8_t *get(uint32_t address) const; + uint8_t *get(uint64_t address) const; - mutable std::vector mem_; - uint32_t page_bits_; uint64_t size_; + uint32_t page_bits_; + mutable std::unordered_map pages_; + mutable uint8_t* last_page_; + mutable uint64_t last_page_index_; }; } // namespace vortex \ No newline at end of file diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 487d385c..52c74643 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -282,6 +282,10 @@ public: return true; } + void flush() { + instance().clear(); + } + void finalize() { instance().clear(); } diff --git a/sim/common/util.h b/sim/common/util.h index 668f3e26..b6137199 100644 --- a/sim/common/util.h +++ b/sim/common/util.h @@ -75,11 +75,6 @@ inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) { return (bits << shift) >> (shift + start); } -inline uint64_t 
aligned_size(uint64_t size, uint32_t alignment) { - assert(0 == (alignment & (alignment - 1))); - return (size + alignment - 1) & ~(alignment - 1); -} - // Apply integer sign extension inline uint32_t sext32(uint32_t word, uint32_t width) { assert(width > 1); diff --git a/sim/rtlsim/main.cpp b/sim/rtlsim/main.cpp index 0f0575f5..652e550f 100644 --- a/sim/rtlsim/main.cpp +++ b/sim/rtlsim/main.cpp @@ -7,6 +7,8 @@ #include #include "simulator.h" +#define RAM_PAGE_SIZE 4096 + using namespace vortex; static void show_usage() { @@ -49,7 +51,7 @@ int main(int argc, char **argv) { for (auto program : programs) { std::cout << "Running " << program << "..." << std::endl; - vortex::RAM ram((1<<12), (1<<20)); + vortex::RAM ram(RAM_PAGE_SIZE); vortex::Simulator simulator; simulator.attach_ram(&ram); diff --git a/sim/rtlsim/simulator.cpp b/sim/rtlsim/simulator.cpp index 8d3f9acf..0f6df7d7 100644 --- a/sim/rtlsim/simulator.cpp +++ b/sim/rtlsim/simulator.cpp @@ -477,7 +477,7 @@ void Simulator::eval_mem_bus(bool clk) { uint8_t* data = (uint8_t*)(vl_obj_->device->mem_req_data); if (base_addr >= IO_COUT_ADDR && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + for (int i = 0; i < IO_COUT_SIZE; i++) { if ((byteen >> i) & 0x1) { auto& ss_buf = print_bufs_[i]; char c = data[i]; diff --git a/tests/regression/basic/main.cpp b/tests/regression/basic/main.cpp index c92bae8d..fcea1fda 100755 --- a/tests/regression/basic/main.cpp +++ b/tests/regression/basic/main.cpp @@ -169,7 +169,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg, std::cout << "start execution" << std::endl; auto t2 = std::chrono::high_resolution_clock::now(); RT_CHECK(vx_start(device)); - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); auto t3 = std::chrono::high_resolution_clock::now(); // read destination buffer from local memory @@ -228,7 +228,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << 
std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores; + uint64_t max_cores; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); uint32_t num_points = count; uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64; diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp index 2961b517..29cc7d85 100644 --- a/tests/regression/demo/main.cpp +++ b/tests/regression/demo/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/diverge/main.cpp b/tests/regression/diverge/main.cpp index 7b27760c..778d118f 100644 --- a/tests/regression/diverge/main.cpp +++ b/tests/regression/diverge/main.cpp @@ -121,7 +121,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/dogfood/main.cpp b/tests/regression/dogfood/main.cpp index 804609ae..71ae6624 100644 --- a/tests/regression/dogfood/main.cpp +++ b/tests/regression/dogfood/main.cpp @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - 
unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); @@ -245,7 +245,7 @@ int main(int argc, char *argv[]) { // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/fence/main.cpp b/tests/regression/fence/main.cpp index 2961b517..29cc7d85 100644 --- a/tests/regression/fence/main.cpp +++ b/tests/regression/fence/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/io_addr/main.cpp b/tests/regression/io_addr/main.cpp index 7899aa2a..42dcd7c0 100644 --- a/tests/regression/io_addr/main.cpp +++ b/tests/regression/io_addr/main.cpp @@ -101,7 +101,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << 
std::endl; diff --git a/tests/regression/mstress/main.cpp b/tests/regression/mstress/main.cpp index bbb4660f..c2354edc 100644 --- a/tests/regression/mstress/main.cpp +++ b/tests/regression/mstress/main.cpp @@ -136,7 +136,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; @@ -194,7 +194,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/no_mf_ext/main.cpp b/tests/regression/no_mf_ext/main.cpp index 01bcfb90..01ae744c 100644 --- a/tests/regression/no_mf_ext/main.cpp +++ b/tests/regression/no_mf_ext/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git a/tests/regression/no_smem/main.cpp b/tests/regression/no_smem/main.cpp index 01bcfb90..01ae744c 100644 --- a/tests/regression/no_smem/main.cpp +++ b/tests/regression/no_smem/main.cpp @@ -67,7 +67,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; diff --git 
a/tests/regression/printf/main.cpp b/tests/regression/printf/main.cpp index 11b9fc50..b9d4db38 100644 --- a/tests/regression/printf/main.cpp +++ b/tests/regression/printf/main.cpp @@ -65,7 +65,7 @@ int run_test() { // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); return 0; } @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); diff --git a/tests/regression/tex/main.cpp b/tests/regression/tex/main.cpp index 39ffea0c..a83651ee 100644 --- a/tests/regression/tex/main.cpp +++ b/tests/regression/tex/main.cpp @@ -110,7 +110,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); auto time_end = std::chrono::high_resolution_clock::now(); double elapsed = std::chrono::duration_cast(time_end - time_start).count(); @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); From 1501360f4bcc6b8e855120d5ba8aaa45981bd9d7 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 14 Nov 2021 09:06:13 -0500 Subject: [PATCH 06/27] minor update --- runtime/include/vx_intrinsics.h | 20 ++++++++++++++++++++ 
tests/regression/sort/main.cpp | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/runtime/include/vx_intrinsics.h b/runtime/include/vx_intrinsics.h index 9c3149d7..abbca493 100644 --- a/runtime/include/vx_intrinsics.h +++ b/runtime/include/vx_intrinsics.h @@ -117,6 +117,26 @@ extern "C" { __r; \ }) +// Lerp instruction +#define vx_lerp(a, b, s) ({ \ + unsigned __r; \ + unsigned __a = a; \ + unsigned __b = b; \ + unsigned __s = s; \ + __asm__ __volatile__ (".insn r4 0x6b, 7, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r : "r"(__a), "r"(__b), "r"(__s)); \ + __r; \ +}) + +// Conditional move +#define vx_cmov(c, t, f) ({ \ + unsigned __r; \ + unsigned __c = c; \ + unsigned __t = t; \ + unsigned __f = f; \ + __asm__ __volatile__ (".insn r4 0x6b, 6, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r : "r"(__c), "r"(__t), "r"(__f)); \ + __r; \ +}) + // Set thread mask inline void vx_tmc(unsigned thread_mask) { asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask)); diff --git a/tests/regression/sort/main.cpp b/tests/regression/sort/main.cpp index c5f23141..96032a91 100644 --- a/tests/regression/sort/main.cpp +++ b/tests/regression/sort/main.cpp @@ -98,7 +98,7 @@ int run_test(const kernel_arg_t& kernel_arg, // wait for completion std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, -1)); + RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT)); // download destination buffer std::cout << "download destination buffer" << std::endl; From 18762dffce0fd57a09f7d4cad7571d6f2e652e5f Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 24 Nov 2021 00:00:17 -0500 Subject: [PATCH 07/27] fixes: texture unit mem access sometimes going to smem, bilinear texture filtering; new: cache req_id, --- ci/regression.sh | 2 +- driver/rtlsim/vortex.cpp | 7 +- driver/simx/vortex.cpp | 82 +-- hw/rtl/VX_config.vh | 37 +- hw/rtl/VX_csr_data.sv | 72 +-- hw/rtl/VX_decode.sv | 28 +- hw/rtl/VX_define.vh | 51 +- hw/rtl/VX_execute.sv | 34 +- 
hw/rtl/VX_icache_stage.sv | 34 +- hw/rtl/VX_lsu_unit.sv | 48 +- hw/rtl/cache/VX_bank.sv | 71 +-- hw/rtl/cache/VX_cache.sv | 3 +- hw/rtl/cache/VX_cache_define.vh | 7 +- hw/rtl/cache/VX_data_access.sv | 9 +- hw/rtl/cache/VX_miss_resrv.sv | 26 +- hw/rtl/cache/VX_shared_mem.sv | 29 +- hw/rtl/cache/VX_tag_access.sv | 9 +- hw/rtl/tex_unit/VX_tex_addr.sv | 133 +++-- hw/rtl/tex_unit/VX_tex_define.vh | 23 +- hw/rtl/tex_unit/VX_tex_mem.sv | 58 +- hw/rtl/tex_unit/VX_tex_sampler.sv | 10 +- hw/rtl/tex_unit/VX_tex_stride.sv | 4 +- hw/rtl/tex_unit/VX_tex_unit.sv | 78 +-- hw/rtl/tex_unit/VX_tex_wrap.sv | 16 +- hw/syn/opae/Makefile | 1 - runtime/Makefile | 2 +- runtime/include/vx_intrinsics.h | 178 +++--- runtime/src/tinyprintf.c | 890 ++++++++++++++++++++++++++++++ runtime/src/tinyprintf.h | 86 +++ runtime/src/vx_perf.c | 8 +- runtime/src/vx_print.c | 63 ++- runtime/src/vx_syscalls.c | 5 +- sim/common/bitmanip.h | 79 +++ sim/common/fixed.h | 419 ++++++++++++++ sim/common/simobject.h | 69 +-- sim/common/texturing.h | 221 ++++++++ sim/common/util.h | 75 +-- sim/rtlsim/Makefile | 13 +- sim/simX/Makefile | 2 +- sim/simX/cache.cpp | 391 +++++++------ sim/simX/cache.h | 3 +- sim/simX/constants.h | 6 +- sim/simX/core.cpp | 224 ++++---- sim/simX/core.h | 20 +- sim/simX/decode.cpp | 89 ++- sim/simX/execute.cpp | 542 +++++++++--------- sim/simX/exeunit.cpp | 269 ++++++--- sim/simX/exeunit.h | 42 +- sim/simX/ibuffer.h | 8 +- sim/simX/instr.h | 6 +- sim/simX/memsim.cpp | 6 +- sim/simX/memsim.h | 14 +- sim/simX/pipeline.h | 39 +- sim/simX/processor.cpp | 10 +- sim/simX/scoreboard.h | 66 ++- sim/simX/tex_unit.cpp | 91 +++ sim/simX/tex_unit.h | 26 + sim/simX/types.h | 108 ++-- sim/simX/warp.cpp | 20 +- sim/simX/warp.h | 6 +- sim/vlsim/Makefile | 8 +- tests/regression/tex/Makefile | 8 +- tests/regression/tex/common.h | 34 +- tests/regression/tex/kernel.c | 73 ++- tests/regression/tex/main.cpp | 73 ++- tests/regression/tex/texsw.h | 247 ++++----- tests/regression/tex/utils.cpp | 108 ++++ 
tests/regression/tex/utils.h | 16 +- tests/runtime/fibonacci/main.cpp | 7 +- tests/runtime/hello/main.cpp | 3 +- 70 files changed, 3818 insertions(+), 1727 deletions(-) create mode 100644 runtime/src/tinyprintf.c create mode 100644 runtime/src/tinyprintf.h create mode 100644 sim/common/bitmanip.h create mode 100644 sim/common/fixed.h create mode 100644 sim/common/texturing.h create mode 100644 sim/simX/tex_unit.cpp create mode 100644 sim/simX/tex_unit.h diff --git a/ci/regression.sh b/ci/regression.sh index 073c0ed1..936ca13b 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -28,7 +28,7 @@ echo "begin texture tests..." CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0" CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" -CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-irainbow.png -orainbow_result.png -g1" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-irainbow.png -orainbow_result.png -g2" echo "coverage texture done!" 
} diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index bed5c807..cc16f0d3 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -116,9 +116,11 @@ public: } int start() { + // ensure prior run completed if (future_.valid()) { - future_.wait(); // ensure prior run completed + future_.wait(); } + // start new run simulator_.attach_ram(&ram_); future_ = std::async(std::launch::async, [&]{ simulator_.reset(); @@ -135,7 +137,8 @@ public: uint64_t timeout_sec = timeout / 1000; std::chrono::seconds wait_time(1); for (;;) { - auto status = future_.wait_for(wait_time); // wait for 1 sec and check status + // wait for 1 sec and check status + auto status = future_.wait_for(wait_time); if (status == std::future_status::ready || 0 == timeout_sec--) break; diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 1bd15e07..d63005d6 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -3,8 +3,7 @@ #include #include #include -#include -#include +#include #include #include @@ -60,18 +59,14 @@ class vx_device { public: vx_device() : arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS) - , is_done_(false) - , is_running_(false) - , mem_allocation_(ALLOC_BASE_ADDR) - , thread_(__thread_proc__, this) , ram_(RAM_PAGE_SIZE) + , mem_allocation_(ALLOC_BASE_ADDR) {} ~vx_device() { - mutex_.lock(); - is_done_ = true; - mutex_.unlock(); - thread_.join(); + if (future_.valid()) { + future_.wait(); + } } int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) { @@ -115,72 +110,41 @@ public: } int start() { - mutex_.lock(); + // ensure prior run completed + if (future_.valid()) { + future_.wait(); + } + // start new run SimPlatform::instance().flush(); processor_ = std::make_shared(arch_); processor_->attach_ram(&ram_); - is_running_ = true; - mutex_.unlock(); + future_ = std::async(std::launch::async, [&]{ + processor_->run(); + }); return 0; } int wait(uint64_t timeout) { + if (!future_.valid()) + return 0; uint64_t timeout_sec = timeout 
/ 1000; + std::chrono::seconds wait_time(1); for (;;) { - mutex_.lock(); - bool is_running = is_running_; - mutex_.unlock(); - - if (!is_running || 0 == timeout_sec--) + // wait for 1 sec and check status + auto status = future_.wait_for(wait_time); + if (status == std::future_status::ready + || 0 == timeout_sec--) break; - - std::this_thread::sleep_for(std::chrono::seconds(1)); } return 0; } private: - - void thread_proc() { - std::cout << "Device ready..." << std::flush << std::endl; - - for (;;) { - mutex_.lock(); - bool is_done = is_done_; - bool is_running = is_running_; - mutex_.unlock(); - - if (is_done) - break; - - if (is_running) { - std::cout << "Device running..." << std::flush << std::endl; - - processor_->run(); - - mutex_.lock(); - is_running_ = false; - mutex_.unlock(); - - std::cout << "Device ready..." << std::flush << std::endl; - } - } - - std::cout << "Device shutdown..." << std::flush << std::endl; - } - - static void __thread_proc__(vx_device* device) { - device->thread_proc(); - } - ArchDef arch_; - Processor::Ptr processor_; - bool is_done_; - bool is_running_; - uint64_t mem_allocation_; - std::thread thread_; RAM ram_; - std::mutex mutex_; + Processor::Ptr processor_; + uint64_t mem_allocation_; + std::future future_; }; /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index de58d9ee..82da10c2 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -236,18 +236,30 @@ ////////// Texture Units ////////////////////////////////////////////////////// -`define NUM_TEX_UNITS 2 +`define NUM_TEX_UNITS 2 +`define TEX_SUBPIXEL_BITS 8 -`define CSR_TEX_STATES 7 -`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES) +`define TEX_DIM_BITS 15 +`define TEX_LOD_MAX `TEX_DIM_BITS +`define TEX_LOD_BITS 4 -`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00) -`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01) -`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 
12'h02) -`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03) -`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04) -`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05) -`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06) +`define TEX_FXD_BITS 32 +`define TEX_FXD_FRAC (`TEX_DIM_BITS+`TEX_SUBPIXEL_BITS) + +`define TEX_STATE_ADDR 0 +`define TEX_STATE_WIDTH 1 +`define TEX_STATE_HEIGHT 2 +`define TEX_STATE_FORMAT 3 +`define TEX_STATE_FILTER 4 +`define TEX_STATE_WRAPU 5 +`define TEX_STATE_WRAPV 6 +`define TEX_STATE_MIPOFF(lod) (7+(lod)) + +`define NUM_TEX_STATES (7+`TEX_LOD_MAX) + +`define CSR_TEX(unit,state) (12'hFD0 + ((unit) * `NUM_TEX_STATES) + (state)) +`define CSR_TEX_UNIT(csr) (((csr) - 12'hFD0) / `NUM_TEX_STATES) +`define CSR_TEX_STATE(csr) (((csr) - 12'hFD0) % `NUM_TEX_STATES) // Pipeline Queues //////////////////////////////////////////////////////////// @@ -266,6 +278,11 @@ `define FPUQ_SIZE 8 `endif +// Texture Unit Request Queue +`ifndef TEXQ_SIZE +`define TEXQ_SIZE (`NUM_WARPS * 2) +`endif + // Icache Configurable Knobs ////////////////////////////////////////////////// // Size of cache in bytes diff --git a/hw/rtl/VX_csr_data.sv b/hw/rtl/VX_csr_data.sv index b071a347..396358d1 100644 --- a/hw/rtl/VX_csr_data.sv +++ b/hw/rtl/VX_csr_data.sv @@ -50,35 +50,40 @@ module VX_csr_data #( reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr; always @(posedge clk) begin - `ifdef EXT_F_ENABLE if (reset) begin fcsr <= '0; - end - if (fpu_to_csr_if.write_enable) begin - fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] - | fpu_to_csr_if.write_fflags; - end - `endif - if (write_enable) begin - case (write_addr) - `CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0]; - `CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0]; - `CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0]; - `CSR_SATP: csr_satp <= 
write_data[`CSR_WIDTH-1:0]; - `CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0]; - `CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0]; - `CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0]; - `CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0]; - `CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0]; - `CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0]; - `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0]; - `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; - default: begin - `ASSERT(write_addr >= `CSR_TEX_BEGIN(0) - && write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES), - ("%t: invalid CSR write address: %0h", $time, write_addr)); - end - endcase + end else begin + `ifdef EXT_F_ENABLE + if (fpu_to_csr_if.write_enable) begin + fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] + | fpu_to_csr_if.write_fflags; + end + `endif + if (write_enable) begin + case (write_addr) + `CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0]; + `CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0]; + `CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0]; + `CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0]; + `CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0]; + `CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0]; + `CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0]; + `CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0]; + `CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0]; + `CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0]; + `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0]; + `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; + default: begin + `ifdef EXT_TEX_ENABLE + `ASSERT(write_addr >= `CSR_TEX(0,0) + && write_addr < `CSR_TEX(`NUM_TEX_UNITS, 0), + ("%t: invalid CSR write address: %0h", $time, write_addr)); + `else + `ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, 
write_addr)); + `endif + end + endcase + end end end @@ -217,11 +222,16 @@ module VX_csr_data #( `CSR_MIMPID : read_data_r = `IMPLEMENTATION_ID; default: begin - if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32)) - || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32) - || (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))))) begin + if ((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32)) + || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32))) begin + read_addr_valid_r = 1; + end else + `ifdef EXT_TEX_ENABLE + if (read_addr >= `CSR_TEX(0,0) && read_addr < `CSR_TEX(`NUM_TEX_UNITS,0)) begin + read_addr_valid_r = 1; + end else + `endif read_addr_valid_r = 0; - end end endcase end diff --git a/hw/rtl/VX_decode.sv b/hw/rtl/VX_decode.sv index 89d70d7a..2c6f09fb 100644 --- a/hw/rtl/VX_decode.sv +++ b/hw/rtl/VX_decode.sv @@ -214,9 +214,9 @@ module VX_decode #( case (u_12) 12'h000: op_type = `INST_OP_BITS'(`INST_BR_ECALL); 12'h001: op_type = `INST_OP_BITS'(`INST_BR_EBREAK); + 12'h002: op_type = `INST_OP_BITS'(`INST_BR_URET); + 12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET); 12'h302: op_type = `INST_OP_BITS'(`INST_BR_MRET); - 12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET); - 12'h7B2: op_type = `INST_OP_BITS'(`INST_BR_DRET); default:; endcase op_mod = 1; @@ -347,7 +347,7 @@ module VX_decode #( endcase end `endif - `INST_GPU: begin + `INST_GPGPU: begin ex_type = `EX_GPU; case (func3) 3'h0: begin @@ -374,9 +374,21 @@ module VX_decode #( is_wstall = 1; `USED_IREG (rs1); `USED_IREG (rs2); - end - `ifdef EXT_TEX_ENABLE + end 3'h5: begin + ex_type = `EX_LSU; + op_type = `INST_OP_BITS'(`INST_LSU_LW); + op_mod = `INST_MOD_BITS'(2); + `USED_IREG (rs1); + end + default:; + endcase + end + `INST_GPU: begin + case (func3) + `ifdef EXT_TEX_ENABLE + 3'h0: begin + ex_type = `EX_GPU; op_type = `INST_OP_BITS'(`INST_GPU_TEX); op_mod = `INST_MOD_BITS'(func2); use_rd = 1; @@ -386,12 +398,6 @@ 
module VX_decode #( `USED_IREG (rs3); end `endif - 3'h6: begin - ex_type = `EX_LSU; - op_type = `INST_OP_BITS'(`INST_LSU_LW); - op_mod = `INST_MOD_BITS'(2); - `USED_IREG (rs1); - end default:; endcase end diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index c3706000..696b6eaa 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -66,7 +66,8 @@ `define INST_FNMADD 7'b1001111 `define INST_FCI 7'b1010011 // float common instructions -`define INST_GPU 7'b1101011 +`define INST_GPGPU 7'b1101011 +`define INST_GPU 7'b1011011 `define INST_TEX 7'b0101011 @@ -117,9 +118,9 @@ `define INST_BR_JALR 4'b1001 `define INST_BR_ECALL 4'b1010 `define INST_BR_EBREAK 4'b1011 -`define INST_BR_MRET 4'b1100 +`define INST_BR_URET 4'b1100 `define INST_BR_SRET 4'b1101 -`define INST_BR_DRET 4'b1110 +`define INST_BR_MRET 4'b1110 `define INST_BR_OTHER 4'b1111 `define INST_BR_BITS 4 `define INST_BR_NEG(x) x[1] @@ -185,14 +186,14 @@ `define INST_FPU_NMADD 4'hF `define INST_FPU_BITS 4 -`define INST_GPU_TMC 3'h0 -`define INST_GPU_WSPAWN 3'h1 -`define INST_GPU_SPLIT 3'h2 -`define INST_GPU_JOIN 3'h3 -`define INST_GPU_BAR 3'h4 -`define INST_GPU_PRED 3'h5 -`define INST_GPU_TEX 3'h6 -`define INST_GPU_BITS 3 +`define INST_GPU_TMC 4'h0 +`define INST_GPU_WSPAWN 4'h1 +`define INST_GPU_SPLIT 4'h2 +`define INST_GPU_JOIN 4'h3 +`define INST_GPU_BAR 4'h4 +`define INST_GPU_PRED 4'h5 +`define INST_GPU_TEX 4'h6 +`define INST_GPU_BITS 4 /////////////////////////////////////////////////////////////////////////////// @@ -237,11 +238,9 @@ /////////////////////////////////////////////////////////////////////////////// -`ifdef DBG_CACHE_REQ_INFO // wid PC -`define DBG_CACHE_REQ_MDATAW (`NW_BITS + 32) -`else -`define DBG_CACHE_REQ_MDATAW 0 -`endif +// cache request identifier +`define DBG_CACHE_REQ_IDW 48 +`define DBG_CACHE_REQ_ID(type, ctr) {4'(type), {`DBG_CACHE_REQ_IDW-4{1'b0}}} + ctr // non-cacheable tag bits `define NC_TAG_BIT 1 @@ -249,6 +248,9 @@ // texture tag bits `define TEX_TAG_BIT 1 +// cache 
address type bits +`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BIT + `SM_ENABLE) + ////////////////////////// Icache Configurable Knobs ////////////////////////// // Cache ID @@ -264,7 +266,7 @@ `define ICACHE_CORE_TAG_ID_BITS `NW_BITS // Core request tag bits -`define ICACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICACHE_CORE_TAG_ID_BITS) +`define ICACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_IDW + `ICACHE_CORE_TAG_ID_BITS) // Memory request data bits `define ICACHE_MEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8) @@ -289,17 +291,14 @@ // Core request tag bits `define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) `ifdef EXT_TEX_ENABLE -`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE) -`define TEX_TAG_ID_BITS (2) -`define LSU_TEX_TAG_ID_BITS `MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS) -`define DCACHE_CORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + `TEX_TAG_BIT) -`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS) -`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS) -`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TEX_TAG_ID_BITS) +`define LSU_TAG_ID_BITS `MAX(`LSUQ_ADDR_BITS, 2) +`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_IDW + `LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS) +`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS + `TEX_TAG_BIT) `else -`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE) +`define LSU_TAG_ID_BITS `LSUQ_ADDR_BITS +`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS) `endif -`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS) +`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_IDW + `DCACHE_CORE_TAG_ID_BITS) // Memory request data bits `define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8) diff --git a/hw/rtl/VX_execute.sv b/hw/rtl/VX_execute.sv index f0cdd37e..029d58ab 100644 --- a/hw/rtl/VX_execute.sv +++ b/hw/rtl/VX_execute.sv @@ -52,51 +52,29 @@ module VX_execute #( VX_dcache_req_if #( .NUM_REQS (`NUM_THREADS), 
.WORD_SIZE (4), - .TAG_WIDTH (`LSU_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) lsu_dcache_req_if(); VX_dcache_rsp_if #( .NUM_REQS (`NUM_THREADS), .WORD_SIZE (4), - .TAG_WIDTH (`LSU_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) lsu_dcache_rsp_if(); VX_dcache_req_if #( .NUM_REQS (`NUM_THREADS), .WORD_SIZE (4), - .TAG_WIDTH (`TEX_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) tex_dcache_req_if(); VX_dcache_rsp_if #( .NUM_REQS (`NUM_THREADS), .WORD_SIZE (4), - .TAG_WIDTH (`TEX_DCACHE_TAG_BITS) + .TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS) ) tex_dcache_rsp_if(); VX_tex_csr_if tex_csr_if(); - wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in, lsu_tag_in; - wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_out, lsu_tag_out; - - `UNUSED_VAR (tex_tag_out) - `UNUSED_VAR (lsu_tag_out) - - for (genvar i = 0; i < `NUM_THREADS; ++i) begin - assign tex_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(tex_dcache_req_if.tag[i][`TEX_TAG_ID_BITS-1:0]); - assign lsu_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(lsu_dcache_req_if.tag[i][`LSU_TAG_ID_BITS-1:0]); - `ifdef DBG_CACHE_REQ_INFO - assign tex_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = tex_dcache_req_if.tag[i][`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS]; - assign lsu_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = lsu_dcache_req_if.tag[i][`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS]; - `endif - end - - assign tex_dcache_rsp_if.tag[`TEX_TAG_ID_BITS-1:0] = tex_tag_out[`TEX_TAG_ID_BITS-1:0]; - assign lsu_dcache_rsp_if.tag[`LSU_TAG_ID_BITS-1:0] = lsu_tag_out[`LSU_TAG_ID_BITS-1:0]; -`ifdef DBG_CACHE_REQ_INFO - assign tex_dcache_rsp_if.tag[`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS] = tex_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS]; - assign lsu_dcache_rsp_if.tag[`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS] = lsu_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS]; -`endif - VX_cache_arb #( .NUM_REQS (2), .LANES (`NUM_THREADS), @@ 
-113,7 +91,7 @@ module VX_execute #( .req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}), .req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}), .req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}), - .req_tag_in ({tex_tag_in, lsu_tag_in}), + .req_tag_in ({tex_dcache_req_if.tag, lsu_dcache_req_if.tag}), .req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}), // Dcache request @@ -136,7 +114,7 @@ module VX_execute #( .rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}), .rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}), .rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}), - .rsp_tag_out ({tex_tag_out, lsu_tag_out}), + .rsp_tag_out ({tex_dcache_rsp_if.tag, lsu_dcache_rsp_if.tag}), .rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready}) ); diff --git a/hw/rtl/VX_icache_stage.sv b/hw/rtl/VX_icache_stage.sv index cb33b82d..ad296649 100644 --- a/hw/rtl/VX_icache_stage.sv +++ b/hw/rtl/VX_icache_stage.sv @@ -24,10 +24,17 @@ module VX_icache_stage #( localparam OUT_REG = 0; + reg [`DBG_CACHE_REQ_IDW-1:0] req_id; + wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id; + wire [`NW_BITS-1:0] req_tag, rsp_tag; + + `UNUSED_VAR (rsp_req_id) + wire icache_req_fire = icache_req_if.valid && icache_req_if.ready; - wire [`NW_BITS-1:0] req_tag = ifetch_req_if.wid; - wire [`NW_BITS-1:0] rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; + assign req_tag = ifetch_req_if.wid; + assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; + assign rsp_req_id = icache_rsp_if.tag[`NW_BITS +: `DBG_CACHE_REQ_IDW]; wire [31:0] rsp_PC; wire [`NUM_THREADS-1:0] rsp_tmask; @@ -51,16 +58,21 @@ module VX_icache_stage #( // Icache Request assign icache_req_if.valid = ifetch_req_if.valid; assign icache_req_if.addr = ifetch_req_if.PC[31:2]; + assign icache_req_if.tag = {req_id, req_tag}; + + always @(posedge clk) begin + if (reset) begin + req_id <= `DBG_CACHE_REQ_ID(0, 0); + end else begin + if (icache_req_fire) begin + 
req_id <= req_id + 1; + end + end + end // Can accept new request? assign ifetch_req_if.ready = icache_req_if.ready; -`ifdef DBG_CACHE_REQ_INFO - assign icache_req_if.tag = {ifetch_req_if.wid, ifetch_req_if.PC, req_tag}; -`else - assign icache_req_if.tag = req_tag; -`endif - wire [`NW_BITS-1:0] rsp_wid = rsp_tag; wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid); @@ -90,11 +102,11 @@ module VX_icache_stage #( `ifdef DBG_TRACE_CORE_ICACHE always @(posedge clk) begin - if (icache_req_if.valid && icache_req_if.ready) begin - dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC); + if (icache_req_fire) begin + dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h, req_id=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, req_id); end if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin - dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data); + dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, req_id=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, rsp_req_id, ifetch_rsp_if.data); end end `endif diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv index 8541f4c6..de47dca0 100644 --- a/hw/rtl/VX_lsu_unit.sv +++ b/hw/rtl/VX_lsu_unit.sv @@ -24,8 +24,6 @@ module VX_lsu_unit #( localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE); - localparam ADDR_TYPEW = `NC_TAG_BIT + `SM_ENABLE; - `STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) `STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) `STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter")) @@ -44,7 +42,7 @@ module VX_lsu_unit #( wire mbuf_empty; - wire [`NUM_THREADS-1:0][ADDR_TYPEW-1:0] lsu_addr_type, req_addr_type; + wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type; wire [`NUM_THREADS-1:0][31:0] full_addr; for (genvar i = 0; i < 
`NUM_THREADS; i++) begin @@ -83,7 +81,7 @@ module VX_lsu_unit #( wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch; VX_pipe_register #( - .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) req_pipe_reg ( .clk (clk), @@ -104,19 +102,22 @@ module VX_lsu_unit #( wire rsp_is_dup; wire rsp_is_prefetch; - `UNUSED_VAR (rsp_type) - `UNUSED_VAR (rsp_is_prefetch) - reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask; wire [`NUM_THREADS-1:0] rsp_rem_mask_n; wire [`NUM_THREADS-1:0] rsp_tmask; + reg [`DBG_CACHE_REQ_IDW-1:0] req_id; + wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id; reg [`NUM_THREADS-1:0] req_sent_mask; reg is_req_start; wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr; wire mbuf_full; + `UNUSED_VAR (rsp_type) + `UNUSED_VAR (rsp_is_prefetch) + `UNUSED_VAR (rsp_req_id) + wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_offset, rsp_offset; for (genvar i = 0; i < `NUM_THREADS; i++) begin assign req_offset[i] = req_addr[i][1:0]; @@ -124,6 +125,8 @@ module VX_lsu_unit #( wire [`NUM_THREADS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready; + wire dcache_req_fire_any = (| dcache_req_fire); + wire dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready; wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1}; @@ -135,7 +138,8 @@ module VX_lsu_unit #( wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); - assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS]; + assign mbuf_raddr = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: `LSUQ_ADDR_BITS]; + assign rsp_req_id = dcache_rsp_if.tag[(`CACHE_ADDR_TYPE_BITS + `LSU_TAG_ID_BITS) +: `DBG_CACHE_REQ_IDW]; `UNUSED_VAR (dcache_rsp_if.tag) // do not writeback from software prefetch 
@@ -214,7 +218,7 @@ module VX_lsu_unit #( 0: mem_req_byteen[req_offset[i]] = 1; 1: begin mem_req_byteen[req_offset[i]] = 1; - mem_req_byteen[{req_addr[i][1], 1'b1}] = 1; + mem_req_byteen[{req_offset[i][1], 1'b1}] = 1; end default : mem_req_byteen = {4{1'b1}}; endcase @@ -235,12 +239,17 @@ module VX_lsu_unit #( assign dcache_req_if.addr[i] = req_addr[i][31:2]; assign dcache_req_if.byteen[i] = mem_req_byteen; assign dcache_req_if.data[i] = mem_req_data; + assign dcache_req_if.tag[i] = {req_id, `LSU_TAG_ID_BITS'(req_tag), req_addr_type[i]}; + end - `ifdef DBG_CACHE_REQ_INFO - assign dcache_req_if.tag[i] = {req_wid, req_pc, req_tag, req_addr_type[i]}; - `else - assign dcache_req_if.tag[i] = {req_tag, req_addr_type[i]}; - `endif + always @(posedge clk) begin + if (reset) begin + req_id <= `DBG_CACHE_REQ_ID(1, 0); + end else begin + if (dcache_req_fire_any) begin + req_id <= req_id + 1; + end + end end assign ready_in = req_dep_ready && dcache_req_ready; @@ -339,22 +348,21 @@ module VX_lsu_unit #( `endif `ifdef DBG_TRACE_CORE_DCACHE - wire dcache_req_fire_any = (| dcache_req_fire); always @(posedge clk) begin if (lsu_req_if.valid && fence_wait) begin dpi_trace("%d: *** D$%0d fence wait\n", $time, CORE_ID); end if (dcache_req_fire_any) begin if (dcache_req_if.rw[0]) begin - dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire); + dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_id); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); dpi_trace(", data="); `TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(", req_id=%0h\n", req_id); end else begin - dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire); + 
dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire, req_id); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); @@ -362,8 +370,8 @@ module VX_lsu_unit #( end end if (dcache_rsp_fire) begin - dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=", - $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd); + dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, rd=%0d, data=", + $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, rsp_req_id, mbuf_raddr, rsp_rd); `TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); dpi_trace(", is_dup=%b\n", rsp_is_dup); end diff --git a/hw/rtl/cache/VX_bank.sv b/hw/rtl/cache/VX_bank.sv index 14d50e29..2dfc51fe 100644 --- a/hw/rtl/cache/VX_bank.sv +++ b/hw/rtl/cache/VX_bank.sv @@ -33,9 +33,6 @@ module VX_bank #( // core request tag size parameter CORE_TAG_WIDTH = 1, - // size of tag id in core request tag - parameter CORE_TAG_ID_BITS = 0, - // bank offset from beginning of index range parameter BANK_ADDR_OFFSET = 0, @@ -96,14 +93,9 @@ module VX_bank #( input wire [`LINE_SELECT_BITS-1:0] flush_addr ); - `UNUSED_PARAM (CORE_TAG_ID_BITS) - -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - wire [31:0] debug_pc_sel, debug_pc_st0, debug_pc_st1; - wire [`NW_BITS-1:0] debug_wid_sel, debug_wid_st0, debug_wid_st1; + wire [`DBG_CACHE_REQ_IDW-1:0] req_id_sel, req_id_st0, req_id_st1; `IGNORE_UNUSED_END -`endif wire [NUM_PORTS-1:0] creq_pmask; wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel; @@ -197,13 +189,7 @@ module VX_bank #( wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; wire creq_fire = creq_valid && creq_ready; -`ifdef DBG_CACHE_REQ_INFO - if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) 
begin - assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[0][`CACHE_REQ_INFO_RNG] : creq_tag[0][`CACHE_REQ_INFO_RNG]; - end else begin - assign {debug_wid_sel, debug_pc_sel} = 0; - end -`endif + assign req_id_sel = mshr_enable ? mshr_tag[0][`CACHE_REQ_ID_RNG] : creq_tag[0][`CACHE_REQ_ID_RNG]; wire [`CACHE_LINE_WIDTH-1:0] wdata_sel; assign wdata_sel[(NUM_PORTS * `WORD_WIDTH)-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[(NUM_PORTS * `WORD_WIDTH)-1:0] : creq_data; @@ -237,13 +223,7 @@ module VX_bank #( .data_out ({valid_st0, is_flush_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0}) ); -`ifdef DBG_CACHE_REQ_INFO - if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st0, debug_pc_st0} = tag_st0[0][`CACHE_REQ_INFO_RNG]; - end else begin - assign {debug_wid_st0, debug_pc_st0} = 0; - end -`endif + assign req_id_st0 = tag_st0[0][`CACHE_REQ_ID_RNG]; wire do_fill_st0 = valid_st0 && is_fill_st0; wire do_flush_st0 = valid_st0 && is_flush_st0; @@ -263,11 +243,9 @@ module VX_bank #( .clk (clk), .reset (reset), - `ifdef DBG_CACHE_REQ_INFO - .debug_pc (debug_pc_st0), - .debug_wid (debug_wid_st0), - `endif - .stall (crsq_stall), + .req_id (req_id_st0), + + .stall (crsq_stall), // read/Fill .lookup (do_lookup_st0), @@ -293,13 +271,7 @@ module VX_bank #( .data_out ({valid_st1, is_mshr_st1, is_fill_st1, is_read_st1, is_write_st1, miss_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1}) ); -`ifdef DBG_CACHE_REQ_INFO - if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_wid_st1, debug_pc_st1} = tag_st1[0][`CACHE_REQ_INFO_RNG]; - end else begin - assign {debug_wid_st1, debug_pc_st1} = 0; - end -`endif + assign req_id_st1 = tag_st1[0][`CACHE_REQ_ID_RNG]; wire do_read_st0 = valid_st0 && is_read_st0; wire do_read_st1 = valid_st1 && 
is_read_st1; @@ -323,10 +295,8 @@ module VX_bank #( .clk (clk), .reset (reset), - `ifdef DBG_CACHE_REQ_INFO - .debug_pc (debug_pc_st1), - .debug_wid (debug_wid_st1), - `endif + .req_id (req_id_st1), + .stall (crsq_stall), .read (do_read_st1 || do_mshr_st1), @@ -372,14 +342,9 @@ module VX_bank #( .clk (clk), .reset (reset), - `ifdef DBG_CACHE_REQ_INFO - .deq_debug_pc (debug_pc_sel), - .deq_debug_wid (debug_wid_sel), - .lkp_debug_pc (debug_pc_st0), - .lkp_debug_wid (debug_wid_st0), - .rel_debug_pc (debug_pc_st1), - .rel_debug_wid (debug_wid_st1), - `endif + .deq_req_id (req_id_sel), + .lkp_req_id (req_id_st0), + .rel_req_id (req_id_st1), // allocate .allocate_valid (mshr_allocate), @@ -525,22 +490,22 @@ module VX_bank #( dpi_trace("%d: cache%0d:%0d fill-rsp: addr=%0h, id=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data); end if (mshr_fire) begin - dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, debug_wid_sel, debug_pc_sel); + dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, req_id_sel); end if (creq_fire) begin if (creq_rw) - dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel); + dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, req_id_sel); else - dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, 
byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel); + dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, req_id_sel); end if (crsq_fire) begin - dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1); + dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, req_id_st1); end if (mreq_push) begin if (is_write_st1) - dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1); + dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, req_id_st1); else - dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, debug_wid_st1, debug_pc_st1); + dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, req_id_st1); end end `endif diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 29e14892..6b6841dd 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -580,8 +580,7 @@ module VX_cache #( .MSHR_SIZE (MSHR_SIZE), .MREQ_SIZE (MREQ_SIZE), .WRITE_ENABLE (WRITE_ENABLE), - .CORE_TAG_WIDTH (CORE_TAG_X_WIDTH), - 
.CORE_TAG_ID_BITS (CORE_TAG_ID_X_BITS), + .CORE_TAG_WIDTH (CORE_TAG_X_WIDTH), .BANK_ADDR_OFFSET (BANK_ADDR_OFFSET) ) bank ( `SCOPE_BIND_VX_cache_bank(i) diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index 8af2921b..b8f2fdbc 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -3,9 +3,8 @@ `include "VX_platform.vh" -`ifdef DBG_CACHE_REQ_INFO -`include "VX_define.vh" -`endif +// cache request identifier +`define DBG_CACHE_REQ_IDW 48 `define REQS_BITS `LOG2UP(NUM_REQS) @@ -52,7 +51,7 @@ `define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS] -`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_MDATAW) +`define CACHE_REQ_ID_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_IDW) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/cache/VX_data_access.sv b/hw/rtl/cache/VX_data_access.sv index a1a5247b..887b4095 100644 --- a/hw/rtl/cache/VX_data_access.sv +++ b/hw/rtl/cache/VX_data_access.sv @@ -21,12 +21,9 @@ module VX_data_access #( input wire clk, input wire reset, -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - input wire[31:0] debug_pc, - input wire[`NW_BITS-1:0] debug_wid, + input wire[`DBG_CACHE_REQ_IDW-1:0] req_id, `IGNORE_UNUSED_END -`endif input wire stall, @@ -125,10 +122,10 @@ module VX_data_access #( dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data); end if (read && ~stall) begin - dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, read_data); + dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, req_id=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, line_addr, read_data); end if (write && ~stall) begin - 
dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, byteen, line_addr, write_data); + dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, req_id=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, byteen, line_addr, write_data); end end `endif diff --git a/hw/rtl/cache/VX_miss_resrv.sv b/hw/rtl/cache/VX_miss_resrv.sv index bda63bb1..08b76add 100644 --- a/hw/rtl/cache/VX_miss_resrv.sv +++ b/hw/rtl/cache/VX_miss_resrv.sv @@ -25,16 +25,11 @@ module VX_miss_resrv #( input wire clk, input wire reset, -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - input wire[31:0] deq_debug_pc, - input wire[`NW_BITS-1:0] deq_debug_wid, - input wire[31:0] lkp_debug_pc, - input wire[`NW_BITS-1:0] lkp_debug_wid, - input wire[31:0] rel_debug_pc, - input wire[`NW_BITS-1:0] rel_debug_wid, + input wire[`DBG_CACHE_REQ_IDW-1:0] deq_req_id, + input wire[`DBG_CACHE_REQ_IDW-1:0] lkp_req_id, + input wire[`DBG_CACHE_REQ_IDW-1:0] rel_req_id, `IGNORE_UNUSED_END -`endif // allocate input wire allocate_valid, @@ -206,23 +201,22 @@ module VX_miss_resrv #( always @(posedge clk) begin if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin if (allocate_fire) - dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_debug_wid, deq_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_req_id); if (fill_valid) dpi_trace("%d: cache%0d:%0d mshr-fill: addr=%0h, id=%0d, addr=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id, `LINE_TO_BYTE_ADDR(fill_addr, BANK_ID)); if (dequeue_fire) - dpi_trace("%d: 
cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_debug_wid, deq_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_id); if (lookup_replay) dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id); if (lookup_valid) - dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_debug_wid, lkp_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b, req_id=%0h\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_req_id); if (release_valid) - dpi_trace("%d: cache%0d:%0d mshr-release id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, - release_id, rel_debug_wid, rel_debug_pc); + dpi_trace("%d: cache%0d:%0d mshr-release id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, release_id, rel_req_id); dpi_trace("%d: cache%0d:%0d mshr-table", $time, CACHE_ID, BANK_ID); for (integer i = 0; i < MSHR_SIZE; ++i) begin if (valid_table[i]) begin diff --git a/hw/rtl/cache/VX_shared_mem.sv b/hw/rtl/cache/VX_shared_mem.sv index 46ea0cfc..257cf295 100644 --- a/hw/rtl/cache/VX_shared_mem.sv +++ b/hw/rtl/cache/VX_shared_mem.sv @@ -254,22 +254,19 @@ module VX_shared_mem #( .ready_out (core_rsp_ready) ); -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - wire [NUM_BANKS-1:0][31:0] debug_pc_st0, debug_pc_st1; - wire [NUM_BANKS-1:0][`NW_BITS-1:0] debug_wid_st0, debug_wid_st1; + wire [NUM_BANKS-1:0][`DBG_CACHE_REQ_IDW-1:0] req_id_st0, req_id_st1; `IGNORE_UNUSED_END for (genvar i = 0; i < NUM_BANKS; ++i) begin if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - 
assign {debug_wid_st0[i], debug_pc_st0[i]} = per_bank_core_req_tag_unqual[i][`CACHE_REQ_INFO_RNG]; - assign {debug_wid_st1[i], debug_pc_st1[i]} = per_bank_core_req_tag[i][`CACHE_REQ_INFO_RNG]; + assign req_id_st0[i] = per_bank_core_req_tag_unqual[i][`CACHE_REQ_ID_RNG]; + assign req_id_st1[i] = per_bank_core_req_tag[i][`CACHE_REQ_ID_RNG]; end else begin - assign {debug_wid_st0[i], debug_pc_st0[i]} = 0; - assign {debug_wid_st1[i], debug_pc_st1[i]} = 0; + assign req_id_st0[i] = 0; + assign req_id_st1[i] = 0; end end -`endif `ifdef DBG_TRACE_CACHE_BANK @@ -309,11 +306,11 @@ module VX_shared_mem #( for (integer i = 0; i < NUM_BANKS; ++i) begin if (per_bank_core_req_valid_unqual[i]) begin if (per_bank_core_req_rw_unqual[i]) begin - dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], debug_wid_st0[i], debug_pc_st0[i]); + dpi_trace("%d: smem%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, req_id=%0h\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], req_id_st0[i]); end else begin - dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], debug_wid_st0[i], debug_pc_st0[i]); + dpi_trace("%d: smem%0d:%0d core-rd-req: addr=%0h, tag=%0h, req_id=%0h\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], req_id_st0[i]); end end end @@ -322,11 +319,11 @@ module VX_shared_mem #( for (integer i = 0; i < NUM_BANKS; ++i) begin if (per_bank_core_req_valid[i]) begin if 
(per_bank_core_req_rw[i]) begin - dpi_trace("%d: cache%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_req_data[i], debug_wid_st1[i], debug_pc_st1[i]); + dpi_trace("%d: smem%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, data=%0h, req_id=%0h\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_data[i], req_id_st1[i]); end else begin - dpi_trace("%d: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", - $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_rsp_data[i], debug_wid_st1[i], debug_pc_st1[i]); + dpi_trace("%d: smem%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, data=%0h, req_id=%0h\n", + $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_rsp_data[i], req_id_st1[i]); end end end diff --git a/hw/rtl/cache/VX_tag_access.sv b/hw/rtl/cache/VX_tag_access.sv index 55124a65..808008d5 100644 --- a/hw/rtl/cache/VX_tag_access.sv +++ b/hw/rtl/cache/VX_tag_access.sv @@ -17,12 +17,9 @@ module VX_tag_access #( input wire clk, input wire reset, -`ifdef DBG_CACHE_REQ_INFO `IGNORE_UNUSED_BEGIN - input wire[31:0] debug_pc, - input wire[`NW_BITS-1:0] debug_wid, + input wire[`DBG_CACHE_REQ_IDW-1:0] req_id, `IGNORE_UNUSED_END -`endif input wire stall, @@ -71,9 +68,9 @@ module VX_tag_access #( end if (lookup && ~stall) begin if (tag_match) begin - dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag); + dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, req_id=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, 
BANK_ID), req_id, line_addr, line_tag); end else begin - dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag, read_tag); + dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, req_id=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, line_addr, line_tag, read_tag); end end end diff --git a/hw/rtl/tex_unit/VX_tex_addr.sv b/hw/rtl/tex_unit/VX_tex_addr.sv index 26a20566..c33cc47a 100644 --- a/hw/rtl/tex_unit/VX_tex_addr.sv +++ b/hw/rtl/tex_unit/VX_tex_addr.sv @@ -12,13 +12,13 @@ module VX_tex_addr #( input wire req_valid, input wire [NUM_REQS-1:0] req_tmask, - input wire [1:0][NUM_REQS-1:0][31:0] req_coords, + input wire [1:0][NUM_REQS-1:0][`TEX_FXD_BITS-1:0] req_coords, input wire [`TEX_FORMAT_BITS-1:0] req_format, input wire [`TEX_FILTER_BITS-1:0] req_filter, input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps, input wire [`TEX_ADDR_BITS-1:0] req_baseaddr, input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff, - input wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] req_logdims, + input wire [NUM_REQS-1:0][1:0][`TEX_LOD_BITS-1:0] req_logdims, input wire [REQ_INFOW-1:0] req_info, output wire req_ready, @@ -27,31 +27,33 @@ module VX_tex_addr #( output wire rsp_valid, output wire [NUM_REQS-1:0] rsp_tmask, output wire [`TEX_FILTER_BITS-1:0] rsp_filter, - output wire [`TEX_STRIDE_BITS-1:0] rsp_stride, + output wire [`TEX_LGSTRIDE_BITS-1:0] rsp_lgstride, output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr, - output wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends, + output wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends, output wire [REQ_INFOW-1:0] rsp_info, input wire rsp_ready ); `UNUSED_PARAM (CORE_ID) - localparam PITCH_BITS = `MAX(`TEX_DIM_BITS, `TEX_STRIDE_BITS) + 1; - localparam SCALED_U_W = `FIXED_INT + `TEX_STRIDE_BITS; - localparam 
SCALED_X_W = (2 * `FIXED_INT); - localparam SCALED_V_W = SCALED_X_W + `TEX_STRIDE_BITS; + localparam SHIFT_BITS = $clog2(`TEX_FXD_FRAC+1); + localparam PITCH_BITS = `MAX(`TEX_LOD_BITS, `TEX_LGSTRIDE_BITS) + 1; + localparam SCALED_X_W = `TEX_DIM_BITS + `TEX_BLEND_FRAC; + localparam OFFSET_U_W = `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX; + localparam OFFSET_V_W = `TEX_DIM_BITS + `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX; wire valid_s0; wire [NUM_REQS-1:0] tmask_s0; wire [`TEX_FILTER_BITS-1:0] filter_s0; wire [REQ_INFOW-1:0] req_info_s0; - wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_lo, clamped_lo_s0; - wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_hi, clamped_hi_s0; - wire [`TEX_STRIDE_BITS-1:0] log_stride, log_stride_s0; + wire [NUM_REQS-1:0][1:0][`TEX_FXD_FRAC-1:0] clamped_lo, clamped_lo_s0; + wire [NUM_REQS-1:0][1:0][`TEX_FXD_FRAC-1:0] clamped_hi, clamped_hi_s0; + wire [NUM_REQS-1:0][1:0][SHIFT_BITS-1:0] dim_shift, dim_shift_s0; + wire [`TEX_LGSTRIDE_BITS-1:0] log_stride, log_stride_s0; wire [NUM_REQS-1:0][31:0] mip_addr, mip_addr_s0; - wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] log_dims_s0; wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0; - + wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0; + wire stall_out; // stride @@ -67,9 +69,9 @@ module VX_tex_addr #( for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar j = 0; j < 2; ++j) begin - wire [`FIXED_FRAC-1:0] delta = (`FIXED_HALF >> req_logdims[i][j]); - wire [31:0] coord_lo = req_filter ? (req_coords[j][i] - 32'(delta)) : req_coords[j][i]; - wire [31:0] coord_hi = req_filter ? (req_coords[j][i] + 32'(delta)) : req_coords[j][i]; + wire [`TEX_FXD_FRAC-1:0] delta = (`TEX_FXD_HALF >> req_logdims[i][j]); + wire [`TEX_FXD_BITS-1:0] coord_lo = req_filter ? (req_coords[j][i] - `TEX_FXD_BITS'(delta)) : req_coords[j][i]; + wire [`TEX_FXD_BITS-1:0] coord_hi = req_filter ? 
(req_coords[j][i] + `TEX_FXD_BITS'(delta)) : req_coords[j][i]; VX_tex_wrap #( .CORE_ID (CORE_ID) @@ -86,66 +88,72 @@ module VX_tex_addr #( .coord_i (coord_hi), .coord_o (clamped_hi[i][j]) ); + + assign dim_shift[i][j] = (`TEX_FXD_FRAC - `TEX_BLEND_FRAC - req_logdims[i][j]); end assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride); assign mip_addr[i] = req_baseaddr + 32'(req_mipoff[i]); end VX_pipe_register #( - .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)), + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * SHIFT_BITS + 32 + 2 * 2 * `TEX_FXD_FRAC)), .RESETW (1) ) pipe_reg0 ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, req_logdims, mip_addr, clamped_lo, clamped_hi}), - .data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0}) + .data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, dim_shift, mip_addr, clamped_lo, clamped_hi}), + .data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, dim_shift_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0}) ); // addresses generation - wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_lo; - wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_hi; - wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] blends; + wire [NUM_REQS-1:0][1:0][SCALED_X_W-1:0] scaled_lo; + wire [NUM_REQS-1:0][1:0][SCALED_X_W-1:0] scaled_hi; + wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_lo; + wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_hi; + wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_lo; + wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_hi; + wire [NUM_REQS-1:0][31:0] base_addr_lo; + wire [NUM_REQS-1:0][31:0] base_addr_hi; + wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] blends; 
wire [NUM_REQS-1:0][3:0][31:0] addr; for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar j = 0; j < 2; ++j) begin - assign scaled_lo[i][j] = scale_to_dim(clamped_lo_s0[i][j], log_dims_s0[i][j]); - assign scaled_hi[i][j] = scale_to_dim(clamped_hi_s0[i][j], log_dims_s0[i][j]); - assign blends[i][j] = filter_s0 ? clamped_lo_s0[i][j][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0); + assign scaled_lo[i][j] = SCALED_X_W'(clamped_lo_s0[i][j] >> dim_shift_s0[i][j]); + assign scaled_hi[i][j] = SCALED_X_W'(clamped_hi_s0[i][j] >> dim_shift_s0[i][j]); + assign blends[i][j] = filter_s0 ? scaled_lo[i][j][`TEX_BLEND_FRAC-1:0] : `TEX_BLEND_FRAC'(0); end end - `UNUSED_VAR (log_pitch_s0) - for (genvar i = 0; i < NUM_REQS; ++i) begin - wire [SCALED_U_W-1:0] offset_u_lo = SCALED_U_W'(scaled_lo[i][0]) << log_stride_s0; - wire [SCALED_U_W-1:0] offset_u_hi = SCALED_U_W'(scaled_hi[i][0]) << log_stride_s0; + assign offset_u_lo[i] = OFFSET_U_W'(scaled_lo[i][0][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_stride_s0; + assign offset_u_hi[i] = OFFSET_U_W'(scaled_hi[i][0][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_stride_s0; - wire [SCALED_V_W-1:0] offset_v_lo = SCALED_V_W'(scaled_lo[i][1]) << log_pitch_s0[i]; - wire [SCALED_V_W-1:0] offset_v_hi = SCALED_V_W'(scaled_hi[i][1]) << log_pitch_s0[i]; + assign offset_v_lo[i] = OFFSET_V_W'(scaled_lo[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i]; + assign offset_v_hi[i] = OFFSET_V_W'(scaled_hi[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i]; - wire [31:0] base_addr_lo = mip_addr_s0[i] + 32'(offset_v_lo); - wire [31:0] base_addr_hi = mip_addr_s0[i] + 32'(offset_v_hi); + assign base_addr_lo[i] = mip_addr_s0[i] + 32'(offset_v_lo[i]); + assign base_addr_hi[i] = mip_addr_s0[i] + 32'(offset_v_hi[i]); - assign addr[i][0] = base_addr_lo + 32'(offset_u_lo); - assign addr[i][1] = base_addr_lo + 32'(offset_u_hi); - assign addr[i][2] = base_addr_hi + 32'(offset_u_lo); - assign addr[i][3] = base_addr_hi + 32'(offset_u_hi); + assign addr[i][0] = 
base_addr_lo[i] + 32'(offset_u_lo[i]); + assign addr[i][1] = base_addr_lo[i] + 32'(offset_u_hi[i]); + assign addr[i][2] = base_addr_hi[i] + 32'(offset_u_lo[i]); + assign addr[i][3] = base_addr_hi[i] + 32'(offset_u_hi[i]); end assign stall_out = rsp_valid && ~rsp_ready; VX_pipe_register #( - .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFOW), + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `TEX_BLEND_FRAC) + REQ_INFOW), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~stall_out), .data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}), - .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride, rsp_addr, rsp_blends, rsp_info}) + .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_lgstride, rsp_addr, rsp_blends, rsp_info}) ); assign req_ready = ~stall_out; @@ -157,22 +165,47 @@ module VX_tex_addr #( assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; always @(posedge clk) begin + if (req_valid && ~stall_out) begin + dpi_trace("%d: *** log_pitch=", $time); + `TRACE_ARRAY1D(log_pitch, NUM_REQS); + dpi_trace(", mip_addr="); + `TRACE_ARRAY1D(mip_addr, NUM_REQS); + dpi_trace(", req_logdims="); + `TRACE_ARRAY2D(req_logdims, 2, NUM_REQS); + dpi_trace(", clamped_lo="); + `TRACE_ARRAY2D(clamped_lo, 2, NUM_REQS); + dpi_trace(", clamped_hi="); + `TRACE_ARRAY2D(clamped_hi, 2, NUM_REQS); + dpi_trace("\n"); + end + + if (valid_s0 && ~stall_out) begin + dpi_trace("%d: *** scaled_lo=", $time); + `TRACE_ARRAY2D(scaled_lo, 2, NUM_REQS); + dpi_trace(", scaled_hi="); + `TRACE_ARRAY2D(scaled_hi, 2, NUM_REQS); + dpi_trace(", offset_u_lo="); + `TRACE_ARRAY1D(offset_u_lo, NUM_REQS); + dpi_trace(", offset_u_hi="); + `TRACE_ARRAY1D(offset_u_hi, NUM_REQS); + dpi_trace(", offset_v_lo="); + `TRACE_ARRAY1D(offset_v_lo, NUM_REQS); + dpi_trace(", offset_v_hi="); + `TRACE_ARRAY1D(offset_v_hi, NUM_REQS); + dpi_trace(", 
base_addr_lo="); + `TRACE_ARRAY1D(base_addr_lo, NUM_REQS); + dpi_trace(", base_addr_hi="); + `TRACE_ARRAY1D(base_addr_hi, NUM_REQS); + dpi_trace("\n"); + end + if (rsp_valid && rsp_ready) begin - dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, tride=%0d, addr=", - $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride); + dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, lgstride=%0d, addr=", + $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_lgstride); `TRACE_ARRAY2D(rsp_addr, 4, NUM_REQS); dpi_trace("\n"); end end `endif -function logic [`FIXED_INT-1:0] scale_to_dim (input logic [`FIXED_FRAC-1:0] src, - input logic [`TEX_DIM_BITS-1:0] dim); -`IGNORE_WARNINGS_BEGIN - logic [`FIXED_BITS-1:0] out; -`IGNORE_WARNINGS_END - out = `FIXED_BITS'(src) << dim; - return out[`FIXED_FRAC +: `FIXED_INT]; -endfunction - endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_define.vh b/hw/rtl/tex_unit/VX_tex_define.vh index 16272fc9..34564b39 100644 --- a/hw/rtl/tex_unit/VX_tex_define.vh +++ b/hw/rtl/tex_unit/VX_tex_define.vh @@ -3,31 +3,26 @@ `include "VX_define.vh" -`define FIXED_BITS 32 -`define FIXED_FRAC 20 -`define FIXED_INT (`FIXED_BITS - `FIXED_FRAC) -`define FIXED_ONE (2 ** `FIXED_FRAC) -`define FIXED_HALF (`FIXED_ONE >> 1) -`define FIXED_MASK (`FIXED_ONE - 1) +`define TEX_FXD_INT (`TEX_FXD_BITS - `TEX_FXD_FRAC) +`define TEX_FXD_ONE (2 ** `TEX_FXD_FRAC) +`define TEX_FXD_HALF (`TEX_FXD_ONE >> 1) +`define TEX_FXD_MASK (`TEX_FXD_ONE - 1) `define TEX_ADDR_BITS 32 `define TEX_FORMAT_BITS 3 `define TEX_WRAP_BITS 2 -`define TEX_DIM_BITS 4 `define TEX_FILTER_BITS 1 +`define TEX_MIPOFF_BITS (2*`TEX_DIM_BITS+1) -`define TEX_MIPOFF_BITS (2*12+1) -`define TEX_STRIDE_BITS 2 - -`define TEX_LOD_BITS 4 -`define TEX_MIP_BITS (`NTEX_BITS + `TEX_LOD_BITS) +`define TEX_LGSTRIDE_MAX 2 +`define TEX_LGSTRIDE_BITS 2 `define TEX_WRAP_CLAMP 0 `define TEX_WRAP_REPEAT 1 `define TEX_WRAP_MIRROR 2 
-`define BLEND_FRAC 8 -`define BLEND_ONE (2 ** `BLEND_FRAC) +`define TEX_BLEND_FRAC 8 +`define TEX_BLEND_ONE (2 ** `TEX_BLEND_FRAC) `define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0) `define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1) diff --git a/hw/rtl/tex_unit/VX_tex_mem.sv b/hw/rtl/tex_unit/VX_tex_mem.sv index 91aa0438..fc99466e 100644 --- a/hw/rtl/tex_unit/VX_tex_mem.sv +++ b/hw/rtl/tex_unit/VX_tex_mem.sv @@ -15,7 +15,7 @@ module VX_tex_mem #( input wire req_valid, input wire [NUM_REQS-1:0] req_tmask, input wire [`TEX_FILTER_BITS-1:0] req_filter, - input wire [`TEX_STRIDE_BITS-1:0] req_stride, + input wire [`TEX_LGSTRIDE_BITS-1:0] req_lgstride, input wire [NUM_REQS-1:0][3:0][31:0] req_addr, input wire [REQ_INFOW-1:0] req_info, output wire req_ready, @@ -63,23 +63,23 @@ module VX_tex_mem #( wire [NUM_REQS-1:0] q_req_tmask; wire [`TEX_FILTER_BITS-1:0] q_req_filter; wire [REQ_INFOW-1:0] q_req_info; - wire [`TEX_STRIDE_BITS-1:0] q_req_stride; + wire [`TEX_LGSTRIDE_BITS-1:0] q_req_lgstride; wire [3:0][NUM_REQS-1:0][1:0] q_align_offs; wire [3:0] q_dup_reqs; assign reqq_push = req_valid && req_ready; VX_fifo_queue #( - .DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4), - .SIZE (`LSUQ_SIZE), + .DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (4 * NUM_REQS * 2) + 4), + .SIZE (`TEXQ_SIZE), .OUT_REG (1) ) req_queue ( .clk (clk), .reset (reset), .push (reqq_push), .pop (reqq_pop), - .data_in ({req_addr_w, req_tmask, req_info, req_filter, req_stride, align_offs, dup_reqs}), - .data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}), + .data_in ({req_addr_w, req_tmask, req_info, req_filter, req_lgstride, align_offs, dup_reqs}), + .data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_lgstride, q_align_offs, q_dup_reqs}), .empty (reqq_empty), .full (reqq_full), `UNUSED_PIN (alm_full), @@ -96,8 +96,12 @@ module 
VX_tex_mem #( wire sent_all_ready, last_texel_sent; wire req_texel_dup; wire [NUM_REQS-1:0][29:0] req_texel_addr; + reg [`DBG_CACHE_REQ_IDW-1:0] req_id; + wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id; reg [1:0] req_texel_idx; reg req_texels_done; + + `UNUSED_VAR (rsp_req_id) always @(posedge clk) begin if (reset || last_texel_sent) begin @@ -146,14 +150,19 @@ module VX_tex_mem #( assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask; assign dcache_req_if.rw = {NUM_REQS{1'b0}}; assign dcache_req_if.addr = req_texel_addr; - assign dcache_req_if.byteen = {NUM_REQS{4'b1111}}; + assign dcache_req_if.byteen = {NUM_REQS{4'b0}}; assign dcache_req_if.data = 'x; + assign dcache_req_if.tag = {NUM_REQS{req_id, `LSU_TAG_ID_BITS'(req_texel_idx), `CACHE_ADDR_TYPE_BITS'(0)}}; -`ifdef DBG_CACHE_REQ_INFO - assign dcache_req_if.tag = {NUM_REQS{q_req_info[`DBG_CACHE_REQ_MDATAW-1:0], req_texel_idx}}; -`else - assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}}; -`endif + always @(posedge clk) begin + if (reset) begin + req_id <= `DBG_CACHE_REQ_ID(2, 0); + end else begin + if (dcache_req_fire_any) begin + req_id <= req_id + 1; + end + end + end // Dcache Response @@ -162,14 +171,17 @@ module VX_tex_mem #( reg [NUM_REQS-1:0][31:0] rsp_data_qual; reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init; wire [RSP_CTR_W-1:0] rsp_rem_ctr_n; + wire [NUM_REQS-1:0][1:0] rsp_align_offs; wire dcache_rsp_fire; wire [1:0] rsp_texel_idx; wire rsp_texel_dup; - - assign rsp_texel_idx = dcache_rsp_if.tag[1:0]; + + assign rsp_texel_idx = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: 2]; + assign rsp_req_id = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS + `LSU_TAG_ID_BITS +: `DBG_CACHE_REQ_IDW]; `UNUSED_VAR (dcache_rsp_if.tag) assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx]; + assign rsp_align_offs = q_align_offs[rsp_texel_idx]; assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready; @@ -180,12 +192,12 @@ module VX_tex_mem #( reg [31:0] rsp_data_shifted; 
always @(*) begin rsp_data_shifted[31:16] = src_data[31:16]; - rsp_data_shifted[15:0] = q_align_offs[rsp_texel_idx][i][1] ? src_data[31:16] : src_data[15:0]; - rsp_data_shifted[7:0] = q_align_offs[rsp_texel_idx][i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0]; + rsp_data_shifted[15:0] = rsp_align_offs[i][1] ? src_data[31:16] : src_data[15:0]; + rsp_data_shifted[7:0] = rsp_align_offs[i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0]; end always @(*) begin - case (q_req_stride) + case (q_req_lgstride) 0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]); 1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]); default: rsp_data_qual[i] = rsp_data_shifted; @@ -266,20 +278,20 @@ module VX_tex_mem #( always @(posedge clk) begin if (dcache_req_fire_any) begin - dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=", - $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_texel_idx); + dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, addr=", + $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_id, req_texel_idx); `TRACE_ARRAY1D(req_texel_addr, NUM_REQS); dpi_trace(", is_dup=%b\n", req_texel_dup); end if (dcache_rsp_fire) begin - dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=", - $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_texel_idx); + dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, data=", + $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_req_id, rsp_texel_idx); `TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS); dpi_trace("\n"); end if (req_valid && req_ready) begin - dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=", - $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride); + dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, lgstride=%0d, addr=", + $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_lgstride); 
`TRACE_ARRAY2D(req_addr, 4, NUM_REQS); dpi_trace("\n"); end diff --git a/hw/rtl/tex_unit/VX_tex_sampler.sv b/hw/rtl/tex_unit/VX_tex_sampler.sv index ac0f1496..63371337 100644 --- a/hw/rtl/tex_unit/VX_tex_sampler.sv +++ b/hw/rtl/tex_unit/VX_tex_sampler.sv @@ -12,7 +12,7 @@ module VX_tex_sampler #( input wire req_valid, input wire [NUM_REQS-1:0] req_tmask, input wire [`TEX_FORMAT_BITS-1:0] req_format, - input wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] req_blends, + input wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] req_blends, input wire [NUM_REQS-1:0][3:0][31:0] req_data, input wire [REQ_INFOW-1:0] req_info, output wire req_ready, @@ -32,7 +32,7 @@ module VX_tex_sampler #( wire [REQ_INFOW-1:0] req_info_s0; wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh; wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0; - wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] blend_v, blend_v_s0; + wire [NUM_REQS-1:0][`TEX_BLEND_FRAC-1:0] blend_v, blend_v_s0; wire [NUM_REQS-1:0][31:0] texel_v; wire stall_out; @@ -52,7 +52,7 @@ module VX_tex_sampler #( end wire [7:0] beta = req_blends[i][0]; - wire [8:0] alpha = `BLEND_ONE - beta; + wire [8:0] alpha = `TEX_BLEND_ONE - beta; VX_tex_lerp #( ) tex_lerp_ul ( @@ -76,7 +76,7 @@ module VX_tex_sampler #( end VX_pipe_register #( - .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)), + .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `TEX_BLEND_FRAC) + (2 * NUM_REQS * 32)), .RESETW (1) ) pipe_reg0 ( .clk (clk), @@ -88,7 +88,7 @@ module VX_tex_sampler #( for (genvar i = 0; i < NUM_REQS; i++) begin wire [7:0] beta = blend_v_s0[i]; - wire [8:0] alpha = `BLEND_ONE - beta; + wire [8:0] alpha = `TEX_BLEND_ONE - beta; VX_tex_lerp #( ) tex_lerp_v ( diff --git a/hw/rtl/tex_unit/VX_tex_stride.sv b/hw/rtl/tex_unit/VX_tex_stride.sv index 50393fe9..0e1eca6a 100644 --- a/hw/rtl/tex_unit/VX_tex_stride.sv +++ b/hw/rtl/tex_unit/VX_tex_stride.sv @@ -4,11 +4,11 @@ module VX_tex_stride #( parameter CORE_ID = 0 ) ( input wire [`TEX_FORMAT_BITS-1:0] 
format, - output wire [`TEX_STRIDE_BITS-1:0] log_stride + output wire [`TEX_LGSTRIDE_BITS-1:0] log_stride ); `UNUSED_PARAM (CORE_ID) - reg [`TEX_STRIDE_BITS-1:0] log_stride_r; + reg [`TEX_LGSTRIDE_BITS-1:0] log_stride_r; always @(*) begin case (format) diff --git a/hw/rtl/tex_unit/VX_tex_unit.sv b/hw/rtl/tex_unit/VX_tex_unit.sv index 6be6aa43..38f93eb2 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.sv +++ b/hw/rtl/tex_unit/VX_tex_unit.sv @@ -20,13 +20,13 @@ module VX_tex_unit #( localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32; localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S; - localparam REQ_INFOW_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFOW_A; + localparam REQ_INFOW_M = (2 * `NUM_THREADS * `TEX_BLEND_FRAC) + REQ_INFOW_A; - reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; - reg [1:0][`TEX_DIM_BITS-1:0] tex_dims [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; + reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][`TEX_LOD_MAX+1-1:0]; + reg [1:0][`TEX_LOD_BITS-1:0] tex_logdims [`NUM_TEX_UNITS-1:0]; + reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0]; reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0]; reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0]; - reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0]; reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0]; // CSRs programming @@ -35,38 +35,46 @@ module VX_tex_unit #( `UNUSED_VAR (csrs_dirty) for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin - wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS]; always @(posedge clk) begin if (tex_csr_if.write_enable) begin case (tex_csr_if.write_addr) - `CSR_TEX_ADDR(i) : begin + `CSR_TEX(i, `TEX_STATE_ADDR) : begin tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; csrs_dirty[i] <= 1; end - `CSR_TEX_FORMAT(i) : begin + `CSR_TEX(i, `TEX_STATE_FORMAT) : begin tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; csrs_dirty[i] <= 1; end - 
`CSR_TEX_WRAP(i) : begin - tex_wraps[i][0] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS]; - tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS]; + `CSR_TEX(i, `TEX_STATE_WRAPU) : begin + tex_wraps[i][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; csrs_dirty[i] <= 1; end - `CSR_TEX_FILTER(i) : begin + `CSR_TEX(i, `TEX_STATE_WRAPV) : begin + tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; + csrs_dirty[i] <= 1; + end + `CSR_TEX(i, `TEX_STATE_FILTER) : begin tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; csrs_dirty[i] <= 1; end - `CSR_TEX_MIPOFF(i) : begin - tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; + `CSR_TEX(i, `TEX_STATE_WIDTH) : begin + tex_logdims[i][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; csrs_dirty[i] <= 1; end - `CSR_TEX_WIDTH(i) : begin - tex_dims[i][mip_level][0] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; + `CSR_TEX(i, `TEX_STATE_HEIGHT) : begin + tex_logdims[i][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; csrs_dirty[i] <= 1; end - `CSR_TEX_HEIGHT(i) : begin - tex_dims[i][mip_level][1] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; - csrs_dirty[i] <= 1; + default: begin + for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin + `IGNORE_WARNINGS_BEGIN + if (tex_csr_if.write_addr == `CSR_ADDR_BITS'(`CSR_TEX(i, `TEX_STATE_MIPOFF(j)))) begin + `IGNORE_WARNINGS_END + tex_mipoff[i][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; + csrs_dirty[i] <= 1; + end + end end endcase end @@ -78,14 +86,15 @@ module VX_tex_unit #( // mipmap attributes - wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; - wire [`NUM_THREADS-1:0][1:0][`TEX_DIM_BITS-1:0] sel_dims; + wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; + wire [`NUM_THREADS-1:0][1:0][`TEX_LOD_BITS-1:0] sel_logdims; for (genvar i = 0; i < `NUM_THREADS; ++i) begin wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0]; - wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS]; - assign 
sel_mipoff[i] = tex_mipoff[unit][mip_level]; - assign sel_dims[i] = tex_dims[unit][mip_level]; + wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][`TEX_LOD_BITS-1:0]; + assign sel_mipoff[i] = tex_mipoff[unit][mip_level]; + assign sel_logdims[i][0] = (tex_logdims[unit][0] - mip_level); + assign sel_logdims[i][1] = (tex_logdims[unit][1] - mip_level); end // address generation @@ -93,8 +102,8 @@ module VX_tex_unit #( wire mem_req_valid; wire [`NUM_THREADS-1:0] mem_req_tmask; wire [`TEX_FILTER_BITS-1:0] mem_req_filter; - wire [`TEX_STRIDE_BITS-1:0] mem_req_stride; - wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] mem_req_blends; + wire [`TEX_LGSTRIDE_BITS-1:0] mem_req_lgstride; + wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] mem_req_blends; wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr; wire [REQ_INFOW_A-1:0] mem_req_info; wire mem_req_ready; @@ -113,16 +122,16 @@ module VX_tex_unit #( .req_format (tex_format[tex_req_if.unit]), .req_filter (tex_filter[tex_req_if.unit]), .req_wraps (tex_wraps[tex_req_if.unit]), - .req_baseaddr (tex_baddr[tex_req_if.unit]), + .req_baseaddr(tex_baddr[tex_req_if.unit]), .req_mipoff (sel_mipoff), - .req_logdims (sel_dims), + .req_logdims(sel_logdims), .req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), .req_ready (tex_req_if.ready), .rsp_valid (mem_req_valid), .rsp_tmask (mem_req_tmask), .rsp_filter (mem_req_filter), - .rsp_stride (mem_req_stride), + .rsp_lgstride(mem_req_lgstride), .rsp_addr (mem_req_addr), .rsp_blends (mem_req_blends), .rsp_info (mem_req_info), @@ -142,8 +151,8 @@ module VX_tex_unit #( .REQ_INFOW (REQ_INFOW_M), .NUM_REQS (`NUM_THREADS) ) tex_mem ( - .clk (clk), - .reset (reset), + .clk (clk), + .reset (reset), // memory interface .dcache_req_if (dcache_req_if), @@ -153,7 +162,7 @@ module VX_tex_unit #( .req_valid (mem_req_valid), .req_tmask (mem_req_tmask), .req_filter(mem_req_filter), - .req_stride(mem_req_stride), + .req_lgstride(mem_req_lgstride), .req_addr 
(mem_req_addr), .req_info ({mem_req_blends, mem_req_info}), .req_ready (mem_req_ready), @@ -168,7 +177,7 @@ module VX_tex_unit #( // apply sampler - wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends; + wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends; wire [`TEX_FORMAT_BITS-1:0] rsp_format; wire [REQ_INFOW_S-1:0] rsp_info; @@ -205,13 +214,12 @@ module VX_tex_unit #( for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin if (csrs_dirty[i]) begin dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_logwidth=%0h\n", $time, CORE_ID, i, tex_logdims[i][0]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_logheight=%0h\n", $time, CORE_ID, i, tex_logdims[i][1]); dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]); dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]); dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]); dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", $time, CORE_ID, i, tex_filter[i]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_mipoff[0]=%0h\n", $time, CORE_ID, i, tex_mipoff[i][0]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_width[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][0]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_height[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][1]); end end diff --git a/hw/rtl/tex_unit/VX_tex_wrap.sv b/hw/rtl/tex_unit/VX_tex_wrap.sv index 8cc7b2f5..fe2110ba 100644 --- a/hw/rtl/tex_unit/VX_tex_wrap.sv +++ b/hw/rtl/tex_unit/VX_tex_wrap.sv @@ -4,19 +4,19 @@ module VX_tex_wrap #( parameter CORE_ID = 0 ) ( input wire [`TEX_WRAP_BITS-1:0] wrap_i, - input wire [31:0] coord_i, - output wire [`FIXED_FRAC-1:0] coord_o + input wire [`TEX_FXD_BITS-1:0] coord_i, + output wire [`TEX_FXD_FRAC-1:0] coord_o ); `UNUSED_PARAM (CORE_ID) - reg [`FIXED_FRAC-1:0] coord_r; + reg [`TEX_FXD_FRAC-1:0] coord_r; - wire [`FIXED_FRAC-1:0] clamp; + wire 
[`TEX_FXD_FRAC-1:0] clamp; VX_tex_sat #( - .IN_W (32), - .OUT_W (`FIXED_FRAC) + .IN_W (`TEX_FXD_BITS), + .OUT_W (`TEX_FXD_FRAC) ) sat_fx ( .data_in (coord_i), .data_out (clamp) @@ -27,9 +27,9 @@ module VX_tex_wrap #( `TEX_WRAP_CLAMP: coord_r = clamp; `TEX_WRAP_MIRROR: - coord_r = coord_i[`FIXED_FRAC-1:0] ^ {`FIXED_FRAC{coord_i[`FIXED_FRAC]}}; + coord_r = coord_i[`TEX_FXD_FRAC-1:0] ^ {`TEX_FXD_FRAC{coord_i[`TEX_FXD_FRAC]}}; default: //`TEX_WRAP_REPEAT - coord_r = coord_i[`FIXED_FRAC-1:0]; + coord_r = coord_i[`TEX_FXD_FRAC-1:0]; endcase end diff --git a/hw/syn/opae/Makefile b/hw/syn/opae/Makefile index 010baea3..29b6a922 100644 --- a/hw/syn/opae/Makefile +++ b/hw/syn/opae/Makefile @@ -23,7 +23,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_FLAGS += $(DBG_TRACE_FLAGS) -DBG_FLAGS += -DDBG_CACHE_REQ_INFO CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) CONFIG2 := -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) diff --git a/runtime/Makefile b/runtime/Makefile index 60c3b398..c329e531 100644 --- a/runtime/Makefile +++ b/runtime/Makefile @@ -10,7 +10,7 @@ CFLAGS += -I./include -I../hw PROJECT = libvortexrt -SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c +SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c OBJS := $(addsuffix .o, $(notdir $(SRCS))) diff --git a/runtime/include/vx_intrinsics.h b/runtime/include/vx_intrinsics.h index abbca493..f3562872 100644 --- a/runtime/include/vx_intrinsics.h +++ b/runtime/include/vx_intrinsics.h @@ -5,62 +5,7 @@ #ifdef __cplusplus extern "C" { - #endif -#ifdef __ASSEMBLY__ -#define __ASM_STR(x) x -#else -#define __ASM_STR(x) #x -#endif - -#define vx_csr_swap(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" 
(__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_read(csr) ({ \ - register unsigned __v; \ - __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \ - __v; \ -}) - -#define vx_csr_write(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -#define vx_csr_read_set(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_set(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -#define vx_csr_read_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -// Texture load -#define vx_tex(unit, u, v, l) ({ \ - unsigned __r; \ - unsigned __u = u; \ - unsigned __v = v; \ - unsigned __l = l; \ - __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ - __r; \ -}) #ifdef __ASSEMBLY__ #define __ASM_STR(x) x @@ -68,72 +13,77 @@ extern "C" { #define __ASM_STR(x) #x #endif -#define vx_csr_swap(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_read(csr) ({ \ - register unsigned __v; \ - __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \ - __v; \ -}) - -#define vx_csr_write(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -#define vx_csr_read_set(csr, val) 
({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_set(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -#define vx_csr_read_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ - __v; \ -}) - -#define vx_csr_clear(csr, val) ({ \ - unsigned __v = (unsigned )(val); \ - __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ -}) - -// Texture load -#define vx_tex(unit, u, v, l) ({ \ - unsigned __r; \ - unsigned __u = u; \ - unsigned __v = v; \ - unsigned __l = l; \ - __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ +#define csr_read(csr) ({ \ + unsigned __r; \ + __asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \ __r; \ }) -// Lerp instruction -#define vx_lerp(a, b, s) ({ \ - unsigned __r; \ - unsigned __a = a; \ - unsigned __b = b; \ - unsigned __s = s; \ - __asm__ __volatile__ (".insn r4 0x6b, 7, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r : "r"(__a), "r"(__b), "r"(__s)); \ +#define csr_write(csr, val) ({ \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \ +}) + +#define csr_swap(csr, val) ({ \ + unsigned __r; \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ + __r; \ +}) + +#define csr_read_set(csr, val) ({ \ + unsigned __r; \ + unsigned __v = 
(unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ + __r; \ +}) + +#define csr_set(csr, val) ({ \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \ +}) + +#define csr_read_clear(csr, val) ({ \ + unsigned __r; \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \ + __r; \ +}) + +#define csr_clear(csr, val) ({ \ + unsigned __v = (unsigned)(val); \ + if (__builtin_constant_p(val) && __v < 32) \ + __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \ + else \ + __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \ +}) + +// Texture load +#define vx_tex(unit, u, v, lod) ({ \ + unsigned __r; \ + __asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(unit), "r"(u), "r"(v), "r"(lod)); \ __r; \ }) // Conditional move -#define vx_cmov(c, t, f) ({ \ +#define vx_cmov(c, t, f) ({ \ unsigned __r; \ - unsigned __c = c; \ - unsigned __t = t; \ - unsigned __f = f; \ - __asm__ __volatile__ (".insn r4 0x6b, 6, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r : "r"(__c), "r"(__t), "r"(__f)); \ + __asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r : "r"(c), "r"(t), "r"(f)); \ __r; \ }) @@ -171,7 +121,7 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) { // Prefetch inline void vx_prefetch(unsigned addr) { - asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) ); + asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) ); } // Return active 
warp's thread id diff --git a/runtime/src/tinyprintf.c b/runtime/src/tinyprintf.c new file mode 100644 index 00000000..4c88ef29 --- /dev/null +++ b/runtime/src/tinyprintf.c @@ -0,0 +1,890 @@ +/////////////////////////////////////////////////////////////////////////////// +// \author (c) Marco Paland (info@paland.com) +// 2014-2019, PALANDesign Hannover, Germany +// +// \license The MIT License (MIT) +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on +// embedded systems with a very limited resources. These routines are thread +// safe and reentrant! +// Use this instead of the bloated standard/newlib printf cause these use +// malloc for printf (and may not be thread safe). 
+// +/////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include "tinyprintf.h" +#include "vx_print.h" + + +// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...) to include the +// printf_config.h header file +// default: undefined +#ifdef PRINTF_INCLUDE_CONFIG_H +#include "printf_config.h" +#endif + + +// 'ntoa' conversion buffer size, this must be big enough to hold one converted +// numeric number including padded zeros (dynamically created on stack) +// default: 32 byte +#ifndef PRINTF_NTOA_BUFFER_SIZE +#define PRINTF_NTOA_BUFFER_SIZE 32U +#endif + +// 'ftoa' conversion buffer size, this must be big enough to hold one converted +// float number including padded zeros (dynamically created on stack) +// default: 32 byte +#ifndef PRINTF_FTOA_BUFFER_SIZE +#define PRINTF_FTOA_BUFFER_SIZE 32U +#endif + +// support for the floating point type (%f) +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_FLOAT +#define PRINTF_SUPPORT_FLOAT +#endif + +// support for exponential floating point notation (%e/%g) +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL +#define PRINTF_SUPPORT_EXPONENTIAL +#endif + +// define the default floating point precision +// default: 6 digits +#ifndef PRINTF_DEFAULT_FLOAT_PRECISION +#define PRINTF_DEFAULT_FLOAT_PRECISION 6U +#endif + +// define the largest float suitable to print with %f +// default: 1e9 +#ifndef PRINTF_MAX_FLOAT +#define PRINTF_MAX_FLOAT 1e9 +#endif + +// support for the long long types (%llu or %p) +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG +#define PRINTF_SUPPORT_LONG_LONG +#endif + +// support for the ptrdiff_t type (%t) +// ptrdiff_t is normally defined in as long or long long type +// default: activated +#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T +#define PRINTF_SUPPORT_PTRDIFF_T +#endif + +/////////////////////////////////////////////////////////////////////////////// + +// internal flag definitions +#define 
FLAGS_ZEROPAD (1U << 0U) +#define FLAGS_LEFT (1U << 1U) +#define FLAGS_PLUS (1U << 2U) +#define FLAGS_SPACE (1U << 3U) +#define FLAGS_HASH (1U << 4U) +#define FLAGS_UPPERCASE (1U << 5U) +#define FLAGS_CHAR (1U << 6U) +#define FLAGS_SHORT (1U << 7U) +#define FLAGS_LONG (1U << 8U) +#define FLAGS_LONG_LONG (1U << 9U) +#define FLAGS_PRECISION (1U << 10U) +#define FLAGS_ADAPT_EXP (1U << 11U) + + +// import float.h for DBL_MAX +#if defined(PRINTF_SUPPORT_FLOAT) +#include +#endif + + +// output function type +typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen); + + +// wrapper (used as buffer) for output function type +typedef struct { + void (*fct)(char character, void* arg); + void* arg; +} out_fct_wrap_type; + + +// internal buffer output +static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen) +{ + if (idx < maxlen) { + ((char*)buffer)[idx] = character; + } +} + + +// internal null output +static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen) +{ + (void)character; (void)buffer; (void)idx; (void)maxlen; +} + + +// internal _putchar wrapper +static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen) +{ + (void)buffer; (void)idx; (void)maxlen; + if (character) { + vx_putchar(character); + } +} + + +// internal output function wrapper +static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen) +{ + (void)idx; (void)maxlen; + if (character) { + // buffer is the output fct pointer + ((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg); + } +} + + +// internal secure strlen +// \return The length of the string (excluding the terminating 0) limited by 'maxsize' +static inline unsigned int _strnlen_s(const char* str, size_t maxsize) +{ + const char* s; + for (s = str; *s && maxsize--; ++s); + return (unsigned int)(s - str); +} + + +// internal test if char is a digit (0-9) +// \return true if char is a 
digit +static inline bool _is_digit(char ch) +{ + return (ch >= '0') && (ch <= '9'); +} + + +// internal ASCII string to unsigned int conversion +static unsigned int _atoi(const char** str) +{ + unsigned int i = 0U; + while (_is_digit(**str)) { + i = i * 10U + (unsigned int)(*((*str)++) - '0'); + } + return i; +} + + +// output the specified string in reverse, taking care of any zero-padding +static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags) +{ + const size_t start_idx = idx; + + // pad spaces up to given width + if (!(flags & FLAGS_LEFT) && !(flags & FLAGS_ZEROPAD)) { + for (size_t i = len; i < width; i++) { + out(' ', buffer, idx++, maxlen); + } + } + + // reverse string + while (len) { + out(buf[--len], buffer, idx++, maxlen); + } + + // append pad spaces up to given width + if (flags & FLAGS_LEFT) { + while (idx - start_idx < width) { + out(' ', buffer, idx++, maxlen); + } + } + + return idx; +} + + +// internal itoa format +static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags) +{ + // pad leading zeros + if (!(flags & FLAGS_LEFT)) { + if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) { + width--; + } + while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = '0'; + } + while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = '0'; + } + } + + // handle hash + if (flags & FLAGS_HASH) { + if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) { + len--; + if (len && (base == 16U)) { + len--; + } + } + if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = 'x'; + } + else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) 
{ + buf[len++] = 'X'; + } + else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) { + buf[len++] = 'b'; + } + if (len < PRINTF_NTOA_BUFFER_SIZE) { + buf[len++] = '0'; + } + } + + if (len < PRINTF_NTOA_BUFFER_SIZE) { + if (negative) { + buf[len++] = '-'; + } + else if (flags & FLAGS_PLUS) { + buf[len++] = '+'; // ignore the space if the '+' exists + } + else if (flags & FLAGS_SPACE) { + buf[len++] = ' '; + } + } + + return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags); +} + + +// internal itoa for 'long' type +static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags) +{ + char buf[PRINTF_NTOA_BUFFER_SIZE]; + size_t len = 0U; + + // no hash for 0 values + if (!value) { + flags &= ~FLAGS_HASH; + } + + // write if precision != 0 and value is != 0 + if (!(flags & FLAGS_PRECISION) || value) { + do { + const char digit = (char)(value % base); + buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10; + value /= base; + } while (value && (len < PRINTF_NTOA_BUFFER_SIZE)); + } + + return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags); +} + + +// internal itoa for 'long long' type +#if defined(PRINTF_SUPPORT_LONG_LONG) +static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags) +{ + char buf[PRINTF_NTOA_BUFFER_SIZE]; + size_t len = 0U; + + // no hash for 0 values + if (!value) { + flags &= ~FLAGS_HASH; + } + + // write if precision != 0 and value is != 0 + if (!(flags & FLAGS_PRECISION) || value) { + do { + const char digit = (char)(value % base); + buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 
'A' : 'a') + digit - 10; + value /= base; + } while (value && (len < PRINTF_NTOA_BUFFER_SIZE)); + } + + return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags); +} +#endif // PRINTF_SUPPORT_LONG_LONG + + +#if defined(PRINTF_SUPPORT_FLOAT) + +#if defined(PRINTF_SUPPORT_EXPONENTIAL) +// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT +static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags); +#endif + + +// internal ftoa for fixed decimal floating point +static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags) +{ + char buf[PRINTF_FTOA_BUFFER_SIZE]; + size_t len = 0U; + double diff = 0.0; + + // powers of 10 + static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 }; + + // test for special values + if (value != value) + return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags); + if (value < -DBL_MAX) + return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags); + if (value > DBL_MAX) + return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 
4U : 3U, width, flags); + + // test for very large values + // standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad + if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) { +#if defined(PRINTF_SUPPORT_EXPONENTIAL) + return _etoa(out, buffer, idx, maxlen, value, prec, width, flags); +#else + return 0U; +#endif + } + + // test for negative + bool negative = false; + if (value < 0) { + negative = true; + value = 0 - value; + } + + // set default precision, if not set explicitly + if (!(flags & FLAGS_PRECISION)) { + prec = PRINTF_DEFAULT_FLOAT_PRECISION; + } + // limit precision to 9, cause a prec >= 10 can lead to overflow errors + while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) { + buf[len++] = '0'; + prec--; + } + + int whole = (int)value; + double tmp = (value - whole) * pow10[prec]; + unsigned long frac = (unsigned long)tmp; + diff = tmp - frac; + + if (diff > 0.5) { + ++frac; + // handle rollover, e.g. 
case 0.99 with prec 1 is 1.0 + if (frac >= pow10[prec]) { + frac = 0; + ++whole; + } + } + else if (diff < 0.5) { + } + else if ((frac == 0U) || (frac & 1U)) { + // if halfway, round up if odd OR if last digit is 0 + ++frac; + } + + if (prec == 0U) { + diff = value - (double)whole; + if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) { + // exactly 0.5 and ODD, then round up + // 1.5 -> 2, but 2.5 -> 2 + ++whole; + } + } + else { + unsigned int count = prec; + // now do fractional part, as an unsigned number + while (len < PRINTF_FTOA_BUFFER_SIZE) { + --count; + buf[len++] = (char)(48U + (frac % 10U)); + if (!(frac /= 10U)) { + break; + } + } + // add extra 0s + while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) { + buf[len++] = '0'; + } + if (len < PRINTF_FTOA_BUFFER_SIZE) { + // add decimal + buf[len++] = '.'; + } + } + + // do whole part, number is reversed + while (len < PRINTF_FTOA_BUFFER_SIZE) { + buf[len++] = (char)(48 + (whole % 10)); + if (!(whole /= 10)) { + break; + } + } + + // pad leading zeros + if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) { + if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) { + width--; + } + while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) { + buf[len++] = '0'; + } + } + + if (len < PRINTF_FTOA_BUFFER_SIZE) { + if (negative) { + buf[len++] = '-'; + } + else if (flags & FLAGS_PLUS) { + buf[len++] = '+'; // ignore the space if the '+' exists + } + else if (flags & FLAGS_SPACE) { + buf[len++] = ' '; + } + } + + return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags); +} + + +#if defined(PRINTF_SUPPORT_EXPONENTIAL) +// internal ftoa variant for exponential floating-point type, contributed by Martijn Jasperse +static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags) +{ + // check for NaN and special values + if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) { + return _ftoa(out, 
buffer, idx, maxlen, value, prec, width, flags); + } + + // determine the sign + const bool negative = value < 0; + if (negative) { + value = -value; + } + + // default precision + if (!(flags & FLAGS_PRECISION)) { + prec = PRINTF_DEFAULT_FLOAT_PRECISION; + } + + // determine the decimal exponent + // based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c) + union { + uint64_t U; + double F; + } conv; + + conv.F = value; + int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023; // effectively log2 + conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U); // drop the exponent so conv.F is now in [1,2) + // now approximate log10 from the log2 integer part and an expansion of ln around 1.5 + int expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168); + // now we want to compute 10^expval but we want to be sure it won't overflow + exp2 = (int)(expval * 3.321928094887362 + 0.5); + const double z = expval * 2.302585092994046 - exp2 * 0.6931471805599453; + const double z2 = z * z; + conv.U = (uint64_t)(exp2 + 1023) << 52U; + // compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex + conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14))))); + // correct for rounding errors + if (value < conv.F) { + expval--; + conv.F /= 10; + } + + // the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters + unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U; + + // in "%g" mode, "prec" is the number of *significant figures* not decimals + if (flags & FLAGS_ADAPT_EXP) { + // do we want to fall-back to "%f" mode? 
+ if ((value >= 1e-4) && (value < 1e6)) { + if ((int)prec > expval) { + prec = (unsigned)((int)prec - expval - 1); + } + else { + prec = 0; + } + flags |= FLAGS_PRECISION; // make sure _ftoa respects precision + // no characters in exponent + minwidth = 0U; + expval = 0; + } + else { + // we use one sigfig for the whole part + if ((prec > 0) && (flags & FLAGS_PRECISION)) { + --prec; + } + } + } + + // will everything fit? + unsigned int fwidth = width; + if (width > minwidth) { + // we didn't fall-back so subtract the characters required for the exponent + fwidth -= minwidth; + } else { + // not enough characters, so go back to default sizing + fwidth = 0U; + } + if ((flags & FLAGS_LEFT) && minwidth) { + // if we're padding on the right, DON'T pad the floating part + fwidth = 0U; + } + + // rescale the float value + if (expval) { + value /= conv.F; + } + + // output the floating part + const size_t start_idx = idx; + idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP); + + // output the exponent part + if (minwidth) { + // output the exponential symbol + out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen); + // output the exponent value + idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS); + // might need to right-pad spaces + if (flags & FLAGS_LEFT) { + while (idx - start_idx < width) out(' ', buffer, idx++, maxlen); + } + } + return idx; +} +#endif // PRINTF_SUPPORT_EXPONENTIAL +#endif // PRINTF_SUPPORT_FLOAT + + +// internal vsnprintf +static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va) { + unsigned int flags, width, precision, n; + size_t idx = 0U; + + if (!buffer) { + // use null output function + out = _out_null; + } + + while (*format) + { + // format specifier? 
%[flags][width][.precision][length] + if (*format != '%') { + // no + out(*format, buffer, idx++, maxlen); + format++; + continue; + } + else { + // yes, evaluate it + format++; + } + + // evaluate flags + flags = 0U; + do { + switch (*format) { + case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break; + case '-': flags |= FLAGS_LEFT; format++; n = 1U; break; + case '+': flags |= FLAGS_PLUS; format++; n = 1U; break; + case ' ': flags |= FLAGS_SPACE; format++; n = 1U; break; + case '#': flags |= FLAGS_HASH; format++; n = 1U; break; + default : n = 0U; break; + } + } while (n); + + // evaluate width field + width = 0U; + if (_is_digit(*format)) { + width = _atoi(&format); + } + else if (*format == '*') { + const int w = va_arg(va, int); + if (w < 0) { + flags |= FLAGS_LEFT; // reverse padding + width = (unsigned int)-w; + } + else { + width = (unsigned int)w; + } + format++; + } + + // evaluate precision field + precision = 0U; + if (*format == '.') { + flags |= FLAGS_PRECISION; + format++; + if (_is_digit(*format)) { + precision = _atoi(&format); + } + else if (*format == '*') { + const int prec = (int)va_arg(va, int); + precision = prec > 0 ? (unsigned int)prec : 0U; + format++; + } + } + + // evaluate length field + switch (*format) { + case 'l' : + flags |= FLAGS_LONG; + format++; + if (*format == 'l') { + flags |= FLAGS_LONG_LONG; + format++; + } + break; + case 'h' : + flags |= FLAGS_SHORT; + format++; + if (*format == 'h') { + flags |= FLAGS_CHAR; + format++; + } + break; +#if defined(PRINTF_SUPPORT_PTRDIFF_T) + case 't' : + flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG); + format++; + break; +#endif + case 'j' : + flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG); + format++; + break; + case 'z' : + flags |= (sizeof(size_t) == sizeof(long) ? 
FLAGS_LONG : FLAGS_LONG_LONG); + format++; + break; + default : + break; + } + + // evaluate specifier + switch (*format) { + case 'd' : + case 'i' : + case 'u' : + case 'x' : + case 'X' : + case 'o' : + case 'b' : { + // set the base + unsigned int base; + if (*format == 'x' || *format == 'X') { + base = 16U; + } + else if (*format == 'o') { + base = 8U; + } + else if (*format == 'b') { + base = 2U; + } + else { + base = 10U; + flags &= ~FLAGS_HASH; // no hash for dec format + } + // uppercase + if (*format == 'X') { + flags |= FLAGS_UPPERCASE; + } + + // no plus or space flag for u, x, X, o, b + if ((*format != 'i') && (*format != 'd')) { + flags &= ~(FLAGS_PLUS | FLAGS_SPACE); + } + + // ignore '0' flag when precision is given + if (flags & FLAGS_PRECISION) { + flags &= ~FLAGS_ZEROPAD; + } + + // convert the integer + if ((*format == 'i') || (*format == 'd')) { + // signed + if (flags & FLAGS_LONG_LONG) { +#if defined(PRINTF_SUPPORT_LONG_LONG) + const long long value = va_arg(va, long long); + idx = _ntoa_long_long(out, buffer, idx, maxlen, (unsigned long long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags); +#endif + } + else if (flags & FLAGS_LONG) { + const long value = va_arg(va, long); + idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags); + } + else { + const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int); + idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned int)(value > 0 ? 
value : 0 - value), value < 0, base, precision, width, flags); + } + } + else { + // unsigned + if (flags & FLAGS_LONG_LONG) { +#if defined(PRINTF_SUPPORT_LONG_LONG) + idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags); +#endif + } + else if (flags & FLAGS_LONG) { + idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags); + } + else { + const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int); + idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags); + } + } + format++; + break; + } +#if defined(PRINTF_SUPPORT_FLOAT) + case 'f' : + case 'F' : + if (*format == 'F') flags |= FLAGS_UPPERCASE; + idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags); + format++; + break; +#if defined(PRINTF_SUPPORT_EXPONENTIAL) + case 'e': + case 'E': + case 'g': + case 'G': + if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP; + if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE; + idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags); + format++; + break; +#endif // PRINTF_SUPPORT_EXPONENTIAL +#endif // PRINTF_SUPPORT_FLOAT + case 'c' : { + unsigned int l = 1U; + // pre padding + if (!(flags & FLAGS_LEFT)) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + // char output + out((char)va_arg(va, int), buffer, idx++, maxlen); + // post padding + if (flags & FLAGS_LEFT) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + format++; + break; + } + + case 's' : { + const char* p = va_arg(va, char*); + unsigned int l = _strnlen_s(p, precision ? precision : (size_t)-1); + // pre padding + if (flags & FLAGS_PRECISION) { + l = (l < precision ? 
l : precision); + } + if (!(flags & FLAGS_LEFT)) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + // string output + while ((*p != 0) && (!(flags & FLAGS_PRECISION) || precision--)) { + out(*(p++), buffer, idx++, maxlen); + } + // post padding + if (flags & FLAGS_LEFT) { + while (l++ < width) { + out(' ', buffer, idx++, maxlen); + } + } + format++; + break; + } + + case 'p' : { + width = sizeof(void*) * 2U; + flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE; +#if defined(PRINTF_SUPPORT_LONG_LONG) + const bool is_ll = sizeof(uintptr_t) == sizeof(long long); + if (is_ll) { + idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags); + } + else { +#endif + idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags); +#if defined(PRINTF_SUPPORT_LONG_LONG) + } +#endif + format++; + break; + } + + case '%' : + out('%', buffer, idx++, maxlen); + format++; + break; + + default : + out(*format, buffer, idx++, maxlen); + format++; + break; + } + } + + // termination + out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen); + + // return written chars without terminating \0 + return (int)idx; +} + +int tiny_printf(const char* format, ...) { + va_list va; + va_start(va, format); + char buffer[1]; + const int ret = _vsnprintf(_out_char, buffer, (size_t)-1, format, va); + va_end(va); + return ret; +} + +int tiny_sprintf(char* buffer, const char* format, ...) { + va_list va; + va_start(va, format); + const int ret = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, va); + va_end(va); + return ret; +} + +int tiny_snprintf(char* buffer, size_t count, const char* format, ...) 
{ + va_list va; + va_start(va, format); + const int ret = _vsnprintf(_out_buffer, buffer, count, format, va); + va_end(va); + return ret; +} + +int tiny_vprintf(const char* format, va_list va) { + char buffer[1]; + return _vsnprintf(_out_char, buffer, (size_t)-1, format, va); +} + +int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va) { + return _vsnprintf(_out_buffer, buffer, count, format, va); +} \ No newline at end of file diff --git a/runtime/src/tinyprintf.h b/runtime/src/tinyprintf.h new file mode 100644 index 00000000..9aa79d9a --- /dev/null +++ b/runtime/src/tinyprintf.h @@ -0,0 +1,86 @@ +/////////////////////////////////////////////////////////////////////////////// +// \author (c) Marco Paland (info@paland.com) +// 2014-2019, PALANDesign Hannover, Germany +// +// \license The MIT License (MIT) +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +// \brief Tiny printf, sprintf and snprintf implementation, optimized for speed on +// embedded systems with a very limited resources. +// Use this instead of bloated standard/newlib printf. +// These routines are thread safe and reentrant. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _TINYPRINTF_H_ +#define _TINYPRINTF_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Tiny printf implementation + * You have to implement _putchar if you use printf() + * To avoid conflicts with the regular printf() API it is overridden by macro defines + * and internal underscore-appended functions like printf_() are used + * \param format A string that specifies the format of the output + * \return The number of characters that are written into the array, not counting the terminating null character + */ +int tiny_printf(const char* format, ...); + +/** + * Tiny sprintf implementation + * Due to security reasons (buffer overflow) YOU SHOULD CONSIDER USING (V)SNPRINTF INSTEAD! + * \param buffer A pointer to the buffer where to store the formatted string. MUST be big enough to store the output! + * \param format A string that specifies the format of the output + * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character + */ +int tiny_sprintf(char* buffer, const char* format, ...); + +/** + * Tiny snprintf/vsnprintf implementation + * \param buffer A pointer to the buffer where to store the formatted string + * \param count The maximum number of characters to store in the buffer, including a terminating null character + * \param format A string that specifies the format of the output + * \param va A value identifying a variable arguments list + * \return The number of characters that COULD have been written into the buffer, not counting the terminating + * null character. A value equal or larger than count indicates truncation. 
Only when the returned value + * is non-negative and less than count, the string has been completely written. + */ +int tiny_snprintf(char* buffer, size_t count, const char* format, ...); +int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va); + +/** + * Tiny vprintf implementation + * \param format A string that specifies the format of the output + * \param va A value identifying a variable arguments list + * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character + */ +int tiny_vprintf(const char* format, va_list va); + +#ifdef __cplusplus +} +#endif + +#endif // _TINYPRINTF_H_ \ No newline at end of file diff --git a/runtime/src/vx_perf.c b/runtime/src/vx_perf.c index edfecdeb..0fe74375 100644 --- a/runtime/src/vx_perf.c +++ b/runtime/src/vx_perf.c @@ -4,10 +4,10 @@ #include #define DUMP_CSR_4(d, s) \ - csr_mem[d + 0] = vx_csr_read(s + 0); \ - csr_mem[d + 1] = vx_csr_read(s + 1); \ - csr_mem[d + 2] = vx_csr_read(s + 2); \ - csr_mem[d + 3] = vx_csr_read(s + 3); + csr_mem[d + 0] = csr_read(s + 0); \ + csr_mem[d + 1] = csr_read(s + 1); \ + csr_mem[d + 2] = csr_read(s + 2); \ + csr_mem[d + 3] = csr_read(s + 3); #define DUMP_CSR_32(d, s) \ DUMP_CSR_4(d + 0, s + 0) \ diff --git a/runtime/src/vx_print.c b/runtime/src/vx_print.c index 86458644..e75993e2 100644 --- a/runtime/src/vx_print.c +++ b/runtime/src/vx_print.c @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include "tinyprintf.h" #ifdef __cplusplus extern "C" { @@ -26,46 +28,19 @@ typedef struct { int precision; } putfloat_arg_t; -static void __printf_cb(printf_arg_t* arg) { - arg->ret = vprintf(arg->format, *arg->va); -} - -int vx_vprintf(const char* format, va_list va) { - printf_arg_t arg; - arg.format = format; - arg.va = &va; - vx_serial((vx_serial_cb)__printf_cb, &arg); - return arg.ret; -} - -int vx_printf(const char * format, ...) 
{ - int ret; - va_list va; - va_start(va, format); - ret = vx_vprintf(format, va); - va_end(va); - return ret; -} - -static void __putint_cb(const putint_arg_t* arg) { +static void __putint_cb(const putint_arg_t* arg) { char tmp[33]; float value = arg->value; int base = arg->base; itoa(value, tmp, base); for (int i = 0; i < 33; ++i) { int c = tmp[i]; - if (!c) break; + if (!c) + break; vx_putchar(c); } } -void vx_putint(int value, int base) { - putint_arg_t arg; - arg.value = value; - arg.base = base; - vx_serial((vx_serial_cb)__putint_cb, &arg); -} - static void __putfloat_cb(const putfloat_arg_t* arg) { float value = arg->value; int precision = arg->precision; @@ -79,6 +54,17 @@ static void __putfloat_cb(const putfloat_arg_t* arg) { } } +static void __vprintf_cb(printf_arg_t* arg) { + arg->ret = tiny_vprintf(arg->format, *arg->va); +} + +void vx_putint(int value, int base) { + putint_arg_t arg; + arg.value = value; + arg.base = base; + vx_serial((vx_serial_cb)__putint_cb, &arg); +} + void vx_putfloat(float value, int precision) { putfloat_arg_t arg; arg.value = value; @@ -86,6 +72,23 @@ void vx_putfloat(float value, int precision) { vx_serial((vx_serial_cb)__putfloat_cb, &arg); } +int vx_vprintf(const char* format, va_list va) { + printf_arg_t arg; + arg.format = format; + arg.va = &va; + vx_serial((vx_serial_cb)__vprintf_cb, &arg); + return arg.ret; +} + +int vx_printf(const char * format, ...) 
{ + int ret; + va_list va; + va_start(va, format); + ret = vx_vprintf(format, va); + va_end(va); + return ret; +} + #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/runtime/src/vx_syscalls.c b/runtime/src/vx_syscalls.c index 37d60b8d..37e4d193 100644 --- a/runtime/src/vx_syscalls.c +++ b/runtime/src/vx_syscalls.c @@ -16,7 +16,10 @@ int _open(const char *name, int flags, int mode) { return -1; } int _read(int file, char *ptr, int len) { return -1; } -caddr_t _sbrk(int incr) { return 0; } +caddr_t _sbrk(int incr) { + __asm__ __volatile__("ebreak"); + return 0; +} int _write(int file, char *ptr, int len) { int i; diff --git a/sim/common/bitmanip.h b/sim/common/bitmanip.h new file mode 100644 index 00000000..f485cd6d --- /dev/null +++ b/sim/common/bitmanip.h @@ -0,0 +1,79 @@ +#pragma once + +#include +#include +#include + +constexpr uint32_t count_leading_zeros(uint32_t value) { + return value ? __builtin_clz(value) : 32; +} + +constexpr uint32_t count_trailing_zeros(uint32_t value) { + return value ? 
__builtin_ctz(value) : 32; +} + +constexpr bool ispow2(uint32_t value) { + return value && !(value & (value - 1)); +} + +constexpr uint32_t log2ceil(uint32_t value) { + return 32 - count_leading_zeros(value - 1); +} + +inline unsigned log2up(uint32_t value) { + return std::max(1, log2ceil(value)); +} + +constexpr unsigned log2floor(uint32_t value) { + return 31 - count_leading_zeros(value); +} + +constexpr unsigned ceil2(uint32_t value) { + return 32 - count_leading_zeros(value); +} + +inline uint64_t bit_clr(uint64_t bits, uint32_t index) { + assert(index <= 63); + return bits & ~(1ull << index); +} + +inline uint64_t bit_set(uint64_t bits, uint32_t index) { + assert(index <= 63); + return bits | (1ull << index); +} + +inline bool bit_get(uint64_t bits, uint32_t index) { + assert(index <= 63); + return (bits >> index) & 0x1; +} + +inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + uint64_t mask = (0xffffffffffffffff << (shift + start)) >> shift; + return bits & ~mask; +} + +inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + uint64_t dirty = (value << (shift + start)) >> shift; + return bit_clrw(bits, start, end) | dirty; +} + +inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) { + assert(end >= start); + assert(end <= 63); + uint32_t shift = 63 - end; + return (bits << shift) >> (shift + start); +} + +// Apply integer sign extension +inline uint32_t sext32(uint32_t word, uint32_t width) { + assert(width > 1); + assert(width <= 32); + uint32_t mask = (1 << width) - 1; + return ((word >> (width - 1)) & 0x1) ? 
(word | ~mask) : word; +} \ No newline at end of file diff --git a/sim/common/fixed.h b/sim/common/fixed.h new file mode 100644 index 00000000..8ef60d9a --- /dev/null +++ b/sim/common/fixed.h @@ -0,0 +1,419 @@ +#pragma once + +#include +#include +#include + +template +class Fixed { +private: + + template + struct Cast { + private: + template struct Tag {}; + + inline static T Convert(T2 value, Tag) { + return static_cast(value) << (F - F2); + } + + inline static T Convert(T2 value, Tag) { + return static_cast(value) >> (F2 - F); + } + + inline static T Convert(T2 value, Tag) { + return static_cast(value << (F - F2)); + } + + inline static T Convert(T2 value, Tag) { + return static_cast(value >> (F2 - F)); + } + + public: + inline static T Convert(T2 value) { + return Convert(value, Tag<(sizeof(T2) > sizeof(T)), (F2 > F)>{}); + } + }; + +public: + using data_type = T; + + static constexpr uint32_t FRAC = F; + static constexpr uint32_t INT = sizeof(T) * 8 - FRAC; + static constexpr uint32_t HFRAC = FRAC >> 1; + static constexpr T ONE = static_cast(1) << FRAC; + static constexpr T MASK = ONE - 1; + static constexpr T IMASK = ~MASK; + static constexpr T HALF = ONE >> 1; + static constexpr T TWO = ONE << 1; + + Fixed() {} + + explicit Fixed(int64_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(uint64_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(int32_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(uint32_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(int16_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(uint16_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(int8_t rhs) + 
: data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + explicit Fixed(uint8_t rhs) + : data_(static_cast(rhs << FRAC)) { + assert((static_cast(rhs) << FRAC) == data_); + } + + template + explicit Fixed(Fixed rhs) + : data_(Cast::Convert(rhs.data())) + {} + + explicit Fixed(float rhs) + : data_(static_cast(rhs * ONE)) { + assert(data_ == static_cast(rhs * ONE)); + } + + bool operator==(Fixed rhs) const { + return (data_ == rhs.data_); + } + + bool operator!=(Fixed rhs) const { + return (data_ != rhs.data_); + } + + bool operator<(Fixed rhs) const { + return (data_ < rhs.data_); + } + + bool operator<=(Fixed rhs) const { + return (data_ <= rhs.data_); + } + + bool operator>(Fixed rhs) const { + return (data_ > rhs.data_); + } + + bool operator>=(Fixed rhs) const { + return (data_ >= rhs.data_); + } + + Fixed operator-() const { + return make(-data_); + } + + Fixed operator+=(Fixed rhs) { + *this = (*this) + rhs; + return *this; + } + + Fixed operator-=(Fixed rhs) { + *this = (*this) - rhs; + return *this; + } + + Fixed operator*=(Fixed rhs) { + *this = (*this) * rhs; + return *this; + } + + Fixed operator/=(Fixed rhs) { + *this = (*this) / rhs; + return *this; + } + + template + Fixed operator*=(Fixed rhs) { + *this = (*this) * rhs; + return *this; + } + + template + Fixed operator/=(Fixed rhs) { + *this = (*this) / rhs; + return *this; + } + + Fixed operator*=(int32_t rhs) { + *this = (*this) * rhs; + return *this; + } + + Fixed operator*=(uint32_t rhs) { + *this = (*this) * rhs; + return *this; + } + + Fixed operator*=(float rhs) { + *this = (*this) * rhs; + return *this; + } + + Fixed operator/=(int32_t rhs) { + *this = (*this) / rhs; + return *this; + } + + Fixed operator/=(uint32_t rhs) { + *this = (*this) / rhs; + return *this; + } + + Fixed operator/=(float rhs) { + *this = (*this) / rhs; + return *this; + } + + friend Fixed operator+(Fixed lhs, Fixed rhs) { + assert((static_cast(lhs.data_) + rhs.data_) == + (lhs.data_ + 
rhs.data_)); + return Fixed::make(lhs.data_ + rhs.data_); + } + + friend Fixed operator-(Fixed lhs, Fixed rhs) { + assert((static_cast(lhs.data_) - rhs.data_) == + (lhs.data_ - rhs.data_)); + return Fixed::make(lhs.data_ - rhs.data_); + } + + friend Fixed operator*(Fixed lhs, Fixed rhs) { + return Fixed::make((static_cast(lhs.data_) * rhs.data_) >> FRAC); + } + + template + friend Fixed operator*(Fixed lhs, Fixed rhs) { + return Fixed::make((static_cast(lhs.data_) * rhs.data()) >> F2); + } + + friend Fixed operator/(Fixed lhs, Fixed rhs) { + assert(rhs.data_ != 0); + return Fixed::make((static_cast(lhs.data_) << FRAC) / rhs.data_); + } + + template + friend Fixed operator/(Fixed lhs, Fixed rhs) { + assert(rhs.data() != 0); + return Fixed::make((static_cast(lhs.data_) << F2) / rhs.data()); + } + + friend Fixed operator*(Fixed lhs, float rhs) { + return static_cast(lhs) * rhs; + } + + friend Fixed operator*(float lhs, Fixed rhs) { + return lhs * static_cast(rhs); + } + + friend Fixed operator/(Fixed lhs, float rhs) { + return static_cast(lhs) / rhs; + } + + friend Fixed operator/(float lhs, Fixed rhs) { + return lhs / static_cast(rhs); + } + + friend Fixed operator*(Fixed lhs, char rhs) { + return lhs * static_cast(rhs); + } + + friend Fixed operator*(char lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed operator/(Fixed lhs, char rhs) { + return lhs / static_cast(rhs); + } + + friend Fixed operator/(char lhs, Fixed rhs) { + return rhs / lhs; + } + + friend Fixed operator*(Fixed lhs, uint8_t rhs) { + return lhs * static_cast(rhs); + } + + friend Fixed operator*(uint8_t lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed operator/(Fixed lhs, uint8_t rhs) { + return lhs / static_cast(rhs); + } + + friend Fixed operator/(uint8_t lhs, Fixed rhs) { + return rhs / lhs; + } + + friend Fixed operator*(Fixed lhs, short rhs) { + return lhs * static_cast(rhs); + } + + friend Fixed operator*(short lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed 
operator/(Fixed lhs, short rhs) { + return lhs / static_cast(rhs); + } + + friend Fixed operator/(short lhs, Fixed rhs) { + return rhs / lhs; + } + + friend Fixed operator*(Fixed lhs, uint16_t rhs) { + return lhs * static_cast(rhs); + } + + friend Fixed operator*(uint16_t lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed operator/(Fixed lhs, uint16_t rhs) { + return lhs / static_cast(rhs); + } + + friend Fixed operator/(uint16_t lhs, Fixed rhs) { + return rhs / lhs; + } + + friend Fixed operator*(Fixed lhs, int32_t rhs) { + auto value = static_cast(lhs.data_ * rhs); + assert((lhs.data_ * static_cast(rhs)) == value); + return Fixed::make(value); + } + + friend Fixed operator*(int32_t lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed operator/(Fixed lhs, int32_t rhs) { + assert(rhs); + auto value = static_cast(lhs.data_ / rhs); + return Fixed::make(value); + } + + friend Fixed operator/(int32_t lhs, Fixed rhs) { + return rhs / lhs; + } + + friend Fixed operator*(Fixed lhs, uint32_t rhs) { + auto value = static_cast(lhs.data_ << rhs); + assert((lhs.data_ << static_cast(rhs)) == value); + return Fixed::make(value); + } + + friend Fixed operator*(uint32_t lhs, Fixed rhs) { + return rhs * lhs; + } + + friend Fixed operator/(Fixed lhs, uint32_t rhs) { + assert(rhs); + auto value = static_cast(lhs.data_ / rhs); + return Fixed::make(value); + } + + friend Fixed operator/(uint32_t lhs, Fixed rhs) { + return rhs / lhs; + } + + friend Fixed operator<<(Fixed lhs, int32_t rhs) { + auto value = static_cast(lhs.data_ << rhs); + assert((lhs.data_ << static_cast(rhs)) == value); + return Fixed::make(value); + } + + friend Fixed operator>>(Fixed lhs, int32_t rhs) { + auto value = static_cast(lhs.data_ >> rhs); + return Fixed::make(value); + } + + friend Fixed operator<<(Fixed lhs, uint32_t rhs) { + auto value = static_cast(lhs.data_ << rhs); + assert((lhs.data_ << static_cast(rhs)) == value); + return Fixed::make(value); + } + + friend Fixed operator>>(Fixed lhs, 
uint32_t rhs) { + auto value = static_cast(lhs.data_ >> rhs); + return Fixed::make(value); + } + + static Fixed make(T value) { + Fixed ret; + ret.data_ = value; + return ret; + } + + explicit operator int64_t() const { + return static_cast(data_ >> F); + } + + explicit operator uint64_t() const { + return static_cast(data_ >> F); + } + + explicit operator int32_t() const { + return static_cast(data_ >> F); + } + + explicit operator uint32_t() const { + return static_cast(data_ >> F); + } + + explicit operator int16_t() const { + return static_cast(data_ >> F); + } + + explicit operator uint16_t() const { + return static_cast(data_ >> F); + } + + explicit operator int8_t() const { + return static_cast(data_ >> F); + } + + explicit operator uint8_t() const { + return static_cast(data_ >> F); + } + + template + explicit operator Fixed() const { + return Fixed(*this); + } + + explicit operator float() const { + return static_cast(data_) / (static_cast(1) << F); + } + + T data() const { + return data_; + } + +private: + T data_; +}; \ No newline at end of file diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 52c74643..369a3503 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -5,10 +5,9 @@ #include #include #include +#include #include -namespace vortex { - class SimObjectBase; /////////////////////////////////////////////////////////////////////////////// @@ -59,32 +58,44 @@ protected: template class SimPort : public SimPortBase { public: - void send(const Pkt& pkt, uint64_t delay) const; + void send(const Pkt& pkt, uint64_t delay) const; - bool read(Pkt* out) { - if (!valid_) - return false; - *out = data_; - valid_ = false; - return true; + void bind(SimPort* peer) { + this->connect(peer); } + void unbind() { + this->disconnect(); + } + + bool empty() const { + return queue_.empty(); + } + + const Pkt& top() const { + return queue_.front(); + } + + Pkt& top() { + return queue_.front(); + } + + void pop() { + queue_.pop(); + } + 
protected: SimPort(SimObjectBase* module) : SimPortBase(module) - , valid_(false) {} - void write(const Pkt& data) { - assert(!valid_); - data_ = data; - valid_ = true; + void push(const Pkt& data) { + queue_.push(data); } SimPort& operator=(const SimPort&) = delete; - Pkt data_; - bool valid_; + std::queue queue_; template friend class SimPortEvent; }; @@ -94,15 +105,7 @@ protected: template class SlavePort : public SimPort { public: - SlavePort(SimObjectBase* module) : SimPort(module) {} - - void bind(SlavePort* peer) { - this->connect(peer); - } - - void unbind() { - this->disconnect(); - } + SlavePort(SimObjectBase* module) : SimPort(module) {} protected: SlavePort& operator=(const SlavePort&) = delete; @@ -115,18 +118,6 @@ class MasterPort : public SimPort { public: MasterPort(SimObjectBase* module) : SimPort(module) {} - void bind(SlavePort* peer) { - this->connect(peer); - } - - void bind(MasterPort* peer) { - this->connect(peer); - } - - void unbind() { - this->disconnect(); - } - protected: MasterPort& operator=(const MasterPort&) = delete; }; @@ -194,7 +185,7 @@ public: {} void fire() const override { - const_cast*>(port_)->write(pkt_); + const_cast*>(port_)->push(pkt_); } private: @@ -382,6 +373,4 @@ template void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) { auto callback = std::bind(entry, obj, std::placeholders::_1); SimPlatform::instance().schedule(callback, pkt, delay); -} - } \ No newline at end of file diff --git a/sim/common/texturing.h b/sim/common/texturing.h new file mode 100644 index 00000000..8d76519e --- /dev/null +++ b/sim/common/texturing.h @@ -0,0 +1,221 @@ +#pragma once + +#include +#include +#include +#include + +enum class WrapMode { + Clamp, + Repeat, + Mirror, +}; + +enum class TexFormat { + R8G8B8A8, + R5G6B5, + R4G4B4A4, + L8A8, + L8, + A8, +}; + +template +T Clamp(Fixed fx, WrapMode mode) { + switch (mode) { + case WrapMode::Clamp: return (fx.data() < 0) ? 
0 : ((fx.data() > Fixed::MASK) ? Fixed::MASK : fx.data()); + case WrapMode::Repeat: return (fx.data() & Fixed::MASK); + case WrapMode::Mirror: return (bit_get(fx.data(), Fixed::FRAC) ? ~fx.data() : fx.data()); + default: + std::abort(); + return 0; + } +} + +inline uint32_t Stride(TexFormat format) { + switch (format) { + case TexFormat::R8G8B8A8: + return 4; + case TexFormat::R5G6B5: + case TexFormat::R4G4B4A4: + case TexFormat::L8A8: + return 2; + case TexFormat::L8: + case TexFormat::A8: + return 1; + default: + std::abort(); + return 0; + } +} + +inline void Unpack8888(TexFormat format, + uint32_t texel, + uint32_t* lo, + uint32_t* hi) { + switch (format) { + case TexFormat::R8G8B8A8: + *lo = texel & 0x00ff00ff; + *hi = (texel >> 8) & 0x00ff00ff; + break; + case TexFormat::R5G6B5: + case TexFormat::R4G4B4A4: + *lo = texel; + *hi= 0; + break; + case TexFormat::L8A8: + *lo = (texel | (texel << 8)) & 0x00ff00ff; + *hi = 0; + break; + case TexFormat::L8: + *lo = (texel | (texel << 16)) & 0x07e0f81f; + *hi = 0; + break; + case TexFormat::A8: + *lo = (texel | (texel << 12)) & 0x0f0f0f0f; + *hi = 0; + break; + default: + std::abort(); + } +} + +inline uint32_t Pack8888(TexFormat format, uint32_t lo, uint32_t hi) { + switch (format) { + case TexFormat::R8G8B8A8: + return (hi << 8) | lo; + case TexFormat::R5G6B5: + case TexFormat::R4G4B4A4: + return lo; + case TexFormat::L8A8: + return (lo | (lo >> 8)) & 0xffff; + case TexFormat::L8: + return (lo | (lo >> 16)) & 0xffff; + case TexFormat::A8: + return (lo | (lo >> 12)) & 0xffff; + default: + std::abort(); + return 0; + } +} + +inline void Lerp8888(uint32_t al, + uint32_t ah, + uint32_t bl, + uint32_t bh, + uint32_t frac, + uint32_t* lo, + uint32_t* hi) { + *lo = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; + *hi = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; +} + +template +void TexAddressLinear(Fixed fu, + Fixed fv, + uint32_t log_width, + uint32_t log_height, + WrapMode wrapu, + WrapMode wrapv, + uint32_t* addr00, 
+ uint32_t* addr01, + uint32_t* addr10, + uint32_t* addr11, + uint32_t* alpha, + uint32_t* beta +) { + auto delta_x = Fixed::make(Fixed::HALF >> log_width); + auto delta_y = Fixed::make(Fixed::HALF >> log_height); + + uint32_t u0 = Clamp(fu - delta_x, wrapu); + uint32_t u1 = Clamp(fu + delta_x, wrapu); + uint32_t v0 = Clamp(fv - delta_y, wrapv); + uint32_t v1 = Clamp(fv + delta_y, wrapv); + + uint32_t shift_u = (Fixed::FRAC - log_width); + uint32_t shift_v = (Fixed::FRAC - log_height); + + uint32_t x0s = (u0 << 8) >> shift_u; + uint32_t y0s = (v0 << 8) >> shift_v; + + uint32_t x0 = x0s >> 8; + uint32_t y0 = y0s >> 8; + uint32_t x1 = u1 >> shift_u; + uint32_t y1 = v1 >> shift_v; + + *addr00 = x0 + (y0 << log_width); + *addr01 = x1 + (y0 << log_width); + *addr10 = x0 + (y1 << log_width); + *addr11 = x1 + (y1 << log_width); + + *alpha = x0s & 0xff; + *beta = y0s & 0xff; + + //printf("*** fu=0x%x, fv=0x%x, u0=0x%x, u1=0x%x, v0=0x%x, v1=0x%x, x0=0x%x, x1=0x%x, y0=0x%x, y1=0x%x, addr00=0x%x, addr01=0x%x, addr10=0x%x, addr11=0x%x\n", fu.data(), fv.data(), u0, u1, v0, v1, x0, x1, y0, y1, *addr00, *addr01, *addr10, *addr11); +} + +template +void TexAddressPoint(Fixed fu, + Fixed fv, + uint32_t log_width, + uint32_t log_height, + WrapMode wrapu, + WrapMode wrapv, + uint32_t* addr +) { + uint32_t u = Clamp(fu, wrapu); + uint32_t v = Clamp(fv, wrapv); + + uint32_t x = u >> (Fixed::FRAC - log_width); + uint32_t y = v >> (Fixed::FRAC - log_height); + + *addr = x + (y << log_width); + + //printf("*** fu=0x%x, fv=0x%x, u=0x%x, v=0x%x, x=0x%x, y=0x%x, addr=0x%x\n", fu.data(), fv.data(), u, v, x, y, *addr); +} + +inline uint32_t TexFilterLinear( + TexFormat format, + uint32_t texel00, + uint32_t texel01, + uint32_t texel10, + uint32_t texel11, + uint32_t alpha, + uint32_t beta +) { + uint32_t c01l, c01h; + { + uint32_t c0l, c0h; + uint32_t c1l, c1h; + Unpack8888(format, texel00, &c0l, &c0h); + Unpack8888(format, texel01, &c1l, &c1h); + Lerp8888(c0l, c0h, c1l, c1h, alpha, &c01l, 
&c01h); + } + + uint32_t c23l, c23h; + { + uint32_t c2l, c2h; + uint32_t c3l, c3h; + Unpack8888(format, texel10, &c2l, &c2h); + Unpack8888(format, texel11, &c3l, &c3h); + Lerp8888(c2l, c2h, c3l, c3h, alpha, &c23l, &c23h); + } + + uint32_t cl, ch; + Lerp8888(c01l, c01h, c23l, c23h, beta, &cl, &ch); + uint32_t color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + + //printf("*** texel00=0x%x, texel01=0x%x, texel10=0x%x, texel11=0x%x, color=0x%x\n", texel00, texel01, texel10, texel11, color); + + return color; +} + +inline uint32_t TexFilterPoint(TexFormat format, uint32_t texel) { + uint32_t cl, ch; + Unpack8888(format, texel, &cl, &ch); + uint32_t color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + + //printf("*** texel=0x%x, color=0x%x\n", texel, color); + + return color; +} \ No newline at end of file diff --git a/sim/common/util.h b/sim/common/util.h index b6137199..d66305ee 100644 --- a/sim/common/util.h +++ b/sim/common/util.h @@ -3,85 +3,12 @@ #include #include #include +#include template void unused(Args&&...) {} #define __unused(...) unused(__VA_ARGS__) -constexpr uint32_t count_leading_zeros(uint32_t value) { - return value ? __builtin_clz(value) : 32; -} - -constexpr uint32_t count_trailing_zeros(uint32_t value) { - return value ? 
__builtin_ctz(value) : 32; -} - -constexpr bool ispow2(uint32_t value) { - return value && !(value & (value - 1)); -} - -constexpr uint32_t log2ceil(uint32_t value) { - return 32 - count_leading_zeros(value - 1); -} - -inline unsigned log2up(uint32_t value) { - return std::max(1, log2ceil(value)); -} - -constexpr unsigned log2floor(uint32_t value) { - return 31 - count_leading_zeros(value); -} - -constexpr unsigned ceil2(uint32_t value) { - return 32 - count_leading_zeros(value); -} - -inline uint64_t bit_clr(uint64_t bits, uint32_t index) { - assert(index <= 63); - return bits & ~(1ull << index); -} - -inline uint64_t bit_set(uint64_t bits, uint32_t index) { - assert(index <= 63); - return bits | (1ull << index); -} - -inline bool bit_get(uint64_t bits, uint32_t index) { - assert(index <= 63); - return (bits >> index) & 0x1; -} - -inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) { - assert(end >= start); - assert(end <= 63); - uint32_t shift = 63 - end; - uint64_t mask = (0xffffffffffffffff << (shift + start)) >> shift; - return bits & ~mask; -} - -inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) { - assert(end >= start); - assert(end <= 63); - uint32_t shift = 63 - end; - uint64_t dirty = (value << (shift + start)) >> shift; - return bit_clrw(bits, start, end) | dirty; -} - -inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) { - assert(end >= start); - assert(end <= 63); - uint32_t shift = 63 - end; - return (bits << shift) >> (shift + start); -} - -// Apply integer sign extension -inline uint32_t sext32(uint32_t word, uint32_t width) { - assert(width > 1); - assert(width <= 32); - uint32_t mask = (1 << width) - 1; - return ((word >> (width - 1)) & 0x1) ? 
(word | ~mask) : word; -} - // return file extension const char* fileExtension(const char* filepath); \ No newline at end of file diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index a0c8d339..662fbf1d 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -23,8 +23,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_FLAGS += $(DBG_TRACE_FLAGS) -DBG_FLAGS += -DDBG_CACHE_REQ_INFO -DBG_FLAGS += -DVCD_OUTPUT FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src TEX_INCLUDE = -I$(RTL_DIR)/tex_unit @@ -51,10 +49,17 @@ VL_FLAGS += $(RTL_INCLUDE) VL_FLAGS += $(CONFIGS) CXXFLAGS += $(CONFIGS) +# Enable Verilator multithreaded simulation +#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') +#VL_FLAGS += --threads $(THREADS) + +# Enable VCD trace +VCD_TRACE = -DVCD_OUTPUT + # Debugigng ifdef DEBUG - VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS) - CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS) + VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS) + CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG CXXFLAGS += -O2 -DNDEBUG diff --git a/sim/simX/Makefile b/sim/simX/Makefile index 75a4a495..7ea54863 100644 --- a/sim/simX/Makefile +++ b/sim/simX/Makefile @@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a TOP = vx_cache_sim SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp -SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp +SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp main.cpp OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS))) VPATH := $(sort $(dir $(SRCS))) diff --git a/sim/simX/cache.cpp 
b/sim/simX/cache.cpp index 503d32c5..da69cf3a 100644 --- a/sim/simX/cache.cpp +++ b/sim/simX/cache.cpp @@ -13,6 +13,7 @@ struct params_t { uint32_t sets_per_bank; uint32_t blocks_per_set; uint32_t words_per_block; + uint32_t log2_num_inputs; uint32_t word_select_addr_start; uint32_t word_select_addr_end; @@ -31,8 +32,10 @@ struct params_t { uint32_t offset_bits = config.B - config.W; uint32_t log2_bank_size = config.C - bank_bits; uint32_t index_bits = log2_bank_size - (config.B << config.A); - assert(log2_bank_size >= config.B); - + assert(log2_bank_size >= config.B); + + this->log2_num_inputs = log2ceil(config.num_inputs); + this->words_per_block = 1 << offset_bits; this->blocks_per_set = 1 << config.A; this->sets_per_bank = 1 << index_bits; @@ -104,7 +107,7 @@ struct set_t { struct bank_req_info_t { bool valid; uint32_t req_id; - uint32_t req_tag; + uint64_t req_tag; }; struct bank_req_t { @@ -194,7 +197,7 @@ public: return root_entry; } - bool try_pop(bank_req_t* out) { + bool pop(bank_req_t* out) { for (auto& entry : entries_) { if (entry.valid && entry.mshr_replay) { *out = entry; @@ -208,16 +211,13 @@ public: }; struct bank_t { - std::vector sets; - MSHR mshr; - std::queue stall_buffer; - bank_req_t active_req; + std::vector sets; + MSHR mshr; bank_t(const CacheConfig& config, const params_t& params) : sets(params.sets_per_bank, params.blocks_per_set) , mshr(config.mshr_size) - , active_req(config.ports_per_bank) {} }; @@ -229,8 +229,8 @@ private: CacheConfig config_; params_t params_; std::vector banks_; - std::vector> core_rsps_; - Switch::Ptr mem_switch_; + Switch::Ptr mem_switch_; + Switch::Ptr bypass_switch_; std::vector> mem_req_ports_; std::vector> mem_rsp_ports_; @@ -240,241 +240,270 @@ public: , config_(config) , params_(config) , banks_(config.num_banks, {config, params_}) - , core_rsps_(config.num_inputs) , mem_req_ports_(config.num_banks, simobject) , mem_rsp_ports_(config.num_banks, simobject) { + bypass_switch_ = Switch::Create("bypass_arb", 
ArbiterType::Priority, 2); + bypass_switch_->ReqOut.bind(&simobject->MemReqPort); + simobject->MemRspPort.bind(&bypass_switch_->RspIn); + if (config.num_banks > 1) { mem_switch_ = Switch::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks); for (uint32_t i = 0, n = config.num_banks; i < n; ++i) { mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i)); mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i)); } - mem_switch_->ReqOut.bind(&simobject->MemReqPort); - simobject->MemRspPort.bind(&mem_switch_->RspIn); + mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn); } else { - mem_req_ports_.at(0).bind(&simobject->MemReqPort); - simobject->MemRspPort.bind(&mem_rsp_ports_.at(0)); + mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); + bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0)); } } void step(uint64_t /*cycle*/) { - // process core response - for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) { - auto& core_rsp = core_rsps_.at(req_id); - if (!core_rsp.empty()) { - simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_rsp.front()}, config_.latency); - core_rsp.pop(); - } + // handle bypasss responses + auto& bypass_port = bypass_switch_->RspOut.at(1); + if (!bypass_port.empty()) { + auto& mem_rsp = bypass_port.top(); + uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1); + uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; + MemRsp core_rsp(tag); + simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency); + bypass_port.pop(); } - for (auto& bank : banks_) { - auto& active_req = bank.active_req; + std::vector pipeline_reqs(config_.num_banks, config_.ports_per_bank); - // try chedule mshr replay - if (!active_req.valid) { - bank.mshr.try_pop(&active_req); - } - - // try schedule stall queue if MSHR has space - if (!active_req.valid - && !bank.stall_buffer.empty() - && !bank.mshr.full()) { - active_req = bank.stall_buffer.front(); - 
bank.stall_buffer.pop(); - } - } + // handle MSHR replay + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + auto& bank = banks_.at(bank_id); + auto& pipeline_req = pipeline_reqs.at(bank_id); + bank.mshr.pop(&pipeline_req); + } // handle memory fills - for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) { - MemRsp mem_rsp; - if (mem_rsp_ports_.at(i).read(&mem_rsp)) { - this->processMemoryFill(i, mem_rsp.tag); + std::vector pending_fill_req(config_.num_banks, false); + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + auto& mem_rsp_port = mem_rsp_ports_.at(bank_id); + if (!mem_rsp_port.empty()) { + auto& mem_rsp = mem_rsp_port.top(); + this->processMemoryFill(bank_id, mem_rsp.tag); + pending_fill_req.at(bank_id) = true; + mem_rsp_port.pop(); } } // handle incoming core requests - for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) { - MemReq core_req; - if (!simobject_->CoreReqPorts.at(i).read(&core_req)) + for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) { + auto& core_req_port = simobject_->CoreReqPorts.at(req_id); + if (core_req_port.empty()) continue; - auto bank_id = params_.addr_bank_id(core_req.addr); - auto set_id = params_.addr_set_id(core_req.addr); - auto tag = params_.addr_tag(core_req.addr); - auto port_id = i % config_.ports_per_bank; + auto& core_req = core_req_port.top(); + + // check cache bypassing + if (core_req.is_io) { + // send IO request + this->processIORequest(core_req, req_id); + + // remove request + core_req_port.pop(); + continue; + } + + auto bank_id = params_.addr_bank_id(core_req.addr); + auto set_id = params_.addr_set_id(core_req.addr); + auto tag = params_.addr_tag(core_req.addr); + auto port_id = req_id % config_.ports_per_bank; - // create abnk request + // create bank request bank_req_t bank_req(config_.ports_per_bank); bank_req.valid = true; bank_req.write = core_req.write; bank_req.mshr_replay = false; bank_req.tag = tag; bank_req.set_id = 
set_id; - bank_req.infos.at(port_id) = {true, i, core_req.tag}; + bank_req.infos.at(port_id) = {true, req_id, core_req.tag}; - auto& bank = banks_.at(bank_id); - - // check MSHR capacity - if (bank.mshr.full()) { - // add to stall buffer - bank.stall_buffer.emplace(bank_req); + auto& bank = banks_.at(bank_id); + auto& pipeline_req = pipeline_reqs.at(bank_id); + + // check pending MSHR replay + if (pipeline_req.valid + && pipeline_req.mshr_replay) { + // stall + continue; + } + + // check pending fill request + if (pending_fill_req.at(bank_id)) { + // stall continue; } - - auto& active_req = bank.active_req; - - // check pending MSHR request - if (active_req.valid - && active_req.mshr_replay) { - // add to stall buffer - bank.stall_buffer.emplace(bank_req); + + // check MSHR capacity if read or writeback + if ((!core_req.write || !config_.write_through) + && bank.mshr.full()) { + // stall continue; - } + } // check bank conflicts - if (active_req.valid) { + if (pipeline_req.valid) { // check port conflict - if (active_req.write != core_req.write - || active_req.set_id != set_id - || active_req.tag != tag - || active_req.infos[port_id].valid) { - // add to stall buffer - bank.stall_buffer.emplace(bank_req); + if (pipeline_req.write != core_req.write + || pipeline_req.set_id != set_id + || pipeline_req.tag != tag + || pipeline_req.infos[port_id].valid) { + // stall continue; } // update pending request infos - active_req.infos[port_id] = bank_req.infos[port_id]; + pipeline_req.infos[port_id] = bank_req.infos[port_id]; } else { // schedule new request - active_req = bank_req; + pipeline_req = bank_req; } + // remove request + core_req_port.pop(); } - // process active request - for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { - this->processBankRequest(bank_id); + // process active request + this->processBankRequest(pipeline_reqs); + } + + void processIORequest(const MemReq& core_req, uint32_t req_id) { + { + MemReq mem_req(core_req); + 
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id; + bypass_switch_->ReqIn.at(1).send(mem_req, 1); + } + + if (core_req.write && config_.write_reponse) { + simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1); } } void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) { // update block - auto& bank = banks_.at(bank_id); - auto& root_entry = bank.mshr.replay(mshr_id); - auto& set = bank.sets.at(root_entry.set_id); - auto& block = set.blocks.at(root_entry.block_id); + auto& bank = banks_.at(bank_id); + auto& entry = bank.mshr.replay(mshr_id); + auto& set = bank.sets.at(entry.set_id); + auto& block = set.blocks.at(entry.block_id); block.valid = true; - block.tag = root_entry.tag; + block.tag = entry.tag; } - void processBankRequest(uint32_t bank_id) { - auto& bank = banks_.at(bank_id); - auto& active_req = bank.active_req; - if (!active_req.valid) - return; + void processBankRequest(const std::vector& pipeline_reqs) { + for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + auto& pipeline_req = pipeline_reqs.at(bank_id); + if (!pipeline_req.valid) + continue; - active_req.valid = false; + auto& bank = banks_.at(bank_id); + auto& set = bank.sets.at(pipeline_req.set_id); - auto& set = bank.sets.at(active_req.set_id); - - if (active_req.mshr_replay) { - // send core response - for (auto& info : active_req.infos) { - core_rsps_.at(info.req_id).emplace(info.req_tag); - } - } else { - bool hit = false; - bool found_free_block = false; - int hit_block_id = 0; - int repl_block_id = 0; - uint32_t max_cnt = 0; - - for (int i = 0, n = set.blocks.size(); i < n; ++i) { - auto& block = set.blocks.at(i); - if (block.valid) { - if (block.tag == active_req.tag) { - block.lru_ctr = 0; - hit_block_id = i; - hit = true; - } else { - ++block.lru_ctr; - } - if (max_cnt < block.lru_ctr) { - max_cnt = block.lru_ctr; + if (pipeline_req.mshr_replay) { + // send core response + for (auto& info : pipeline_req.infos) { + 
simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency); + } + } else { + bool hit = false; + bool found_free_block = false; + int hit_block_id = 0; + int repl_block_id = 0; + uint32_t max_cnt = 0; + + for (int i = 0, n = set.blocks.size(); i < n; ++i) { + auto& block = set.blocks.at(i); + if (block.valid) { + if (block.tag == pipeline_req.tag) { + block.lru_ctr = 0; + hit_block_id = i; + hit = true; + } else { + ++block.lru_ctr; + } + if (max_cnt < block.lru_ctr) { + max_cnt = block.lru_ctr; + repl_block_id = i; + } + } else { + found_free_block = true; repl_block_id = i; } - } else { - found_free_block = true; - repl_block_id = i; - } - } - - if (hit) { - // - // MISS handling - // - if (active_req.write) { - // handle write hit - auto& hit_block = set.blocks.at(hit_block_id); - if (config_.write_through) { - // forward write request to memory - MemReq mem_req; - mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag); - mem_req.write = true; - mem_req.tag = 0; - mem_req_ports_.at(bank_id).send(mem_req, 1); - } else { - // mark block as dirty - hit_block.dirty = true; - } - } - // send core response - for (auto& info : active_req.infos) { - core_rsps_.at(info.req_id).emplace(info.req_tag); - } - } else { - // - // MISS handling - // - if (!found_free_block && !config_.write_through) { - // write back dirty block - auto& repl_block = set.blocks.at(repl_block_id); - if (repl_block.dirty) { - MemReq mem_req; - mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag); - mem_req.write = true; - mem_req.tag = 0; - mem_req_ports_.at(bank_id).send(mem_req, 1); - } } - if (active_req.write && config_.write_through) { - // forward write request to memory - { - MemReq mem_req; - mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); - mem_req.write = true; - mem_req.tag = 0; - mem_req_ports_.at(bank_id).send(mem_req, 1); + if (hit) { + // + // MISS handling + // + if 
(pipeline_req.write) { + // handle write hit + auto& hit_block = set.blocks.at(hit_block_id); + if (config_.write_through) { + // forward write request to memory + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag); + mem_req.write = true; + mem_req_ports_.at(bank_id).send(mem_req, 1); + } else { + // mark block as dirty + hit_block.dirty = true; + } } // send core response - for (auto& info : active_req.infos) { - core_rsps_.at(info.req_id).emplace(info.req_tag); + if (!pipeline_req.write || config_.write_reponse) { + for (auto& info : pipeline_req.infos) { + simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency); + } + } + } else { + // + // MISS handling + // + if (!found_free_block && !config_.write_through) { + // write back dirty block + auto& repl_block = set.blocks.at(repl_block_id); + if (repl_block.dirty) { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag); + mem_req.write = true; + mem_req_ports_.at(bank_id).send(mem_req, 1); + } } - } else { - // lookup - int pending = bank.mshr.lookup(active_req); - // allocate MSHR - int mshr_id = bank.mshr.allocate(active_req, repl_block_id); - - // send fill request - if (pending == -1) { - MemReq mem_req; - mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag); - mem_req.write = active_req.write; - mem_req.tag = mshr_id; - mem_req_ports_.at(bank_id).send(mem_req, 1); + if (pipeline_req.write && config_.write_through) { + // forward write request to memory + { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); + mem_req.write = true; + mem_req_ports_.at(bank_id).send(mem_req, 1); + } + // send core response + if (config_.write_reponse) { + for (auto& info : pipeline_req.infos) { + simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency); + } + } + } else { + // MSHR lookup + int pending = 
bank.mshr.lookup(pipeline_req); + + // allocate MSHR + int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id); + + // send fill request + if (pending == -1) { + MemReq mem_req; + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); + mem_req.write = pipeline_req.write; + mem_req.tag = mshr_id; + mem_req_ports_.at(bank_id).send(mem_req, 1); + } } } } diff --git a/sim/simX/cache.h b/sim/simX/cache.h index 58767d9f..0be8cf6e 100644 --- a/sim/simX/cache.h +++ b/sim/simX/cache.h @@ -14,7 +14,8 @@ struct CacheConfig { uint8_t num_banks; // number of banks uint8_t ports_per_bank; // number of ports per bank uint8_t num_inputs; // number of inputs - bool write_through; // is write-through cache + bool write_through; // is write-through + bool write_reponse; // enable write response uint16_t victim_size; // victim cache size uint16_t mshr_size; // MSHR buffer size uint8_t latency; // pipeline latency diff --git a/sim/simX/constants.h b/sim/simX/constants.h index d9171b8d..218fa5f9 100644 --- a/sim/simX/constants.h +++ b/sim/simX/constants.h @@ -10,11 +10,7 @@ namespace vortex { struct Constants { -static constexpr uint32_t CORE_TO_DCACHE_DELAY = 1 + SM_ENABLE; -static constexpr uint32_t CORE_TO_ICACHE_DELAY = 1; - -static constexpr uint32_t ICACHE_TO_MEM_DELAY = 2; -static constexpr uint32_t DCACHE_TO_MEM_DELAY = 2; +static constexpr uint32_t SMEM_DELAY = 1 + SM_ENABLE; }; diff --git a/sim/simX/core.cpp b/sim/simX/core.cpp index e1333dac..19b20967 100644 --- a/sim/simX/core.cpp +++ b/sim/simX/core.cpp @@ -19,6 +19,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , decoder_(arch) , mmu_(0, arch.wsize(), true) , shared_mem_(4096) + , tex_units_(NUM_TEX_UNITS, this) , warps_(arch.num_warps()) , barriers_(arch.num_barriers(), 0) , csrs_(arch.num_csrs(), 0) @@ -35,7 +36,8 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) 1, // number of banks 1, // number of ports 1, // request size - true, // write-throught 
+ true, // write-through + false, // write response 0, // victim size NUM_WARPS, // mshr 2, // pipeline latency @@ -49,12 +51,14 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) DCACHE_NUM_BANKS, // number of banks DCACHE_NUM_PORTS, // number of ports (uint8_t)arch.num_threads(), // request size - true, // write-throught + true, // write-through + false, // write response 0, // victim size DCACHE_MSHR_SIZE, // mshr 2, // pipeline latency })) - , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) + , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) + , dcache_switch_(arch.num_threads()) , fetch_stage_("fetch") , decode_stage_("decode") , issue_stage_("issue") @@ -65,10 +69,9 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , last_schedule_wid_(0) , issued_instrs_(0) , committed_instrs_(0) + , ecall_(false) , ebreak_(false) , stats_insts_(0) - , stats_loads_(0) - , stats_stores_(0) , MemRspPort(this) , MemReqPort(this) { @@ -92,6 +95,18 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) this->MemRspPort.bind(&l1_mem_switch_->RspIn); l1_mem_switch_->ReqOut.bind(&this->MemReqPort); + // lsu/tex switch + for (uint32_t i = 0, n = arch.num_threads(); i < n; ++i) { + auto& sw = dcache_switch_.at(i); +#ifdef EXT_TEX_ENABLE + sw = Switch::Create("lsu_arb", ArbiterType::Priority, 2); +#else + sw = Switch::Create("lsu_arb", ArbiterType::Priority, 1); +#endif + sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i)); + dcache_->CoreRspPorts.at(i).bind(&sw->RspIn); + } + // activate warp0 warps_.at(0)->setTmask(0, true); } @@ -147,44 +162,41 @@ void Core::warp_scheduler(uint64_t cycle) { auto& warp = warps_.at(scheduled_warp); stats_insts_ += warp->getActiveThreads(); - pipeline_state_t state; - state.clear(); - state.id = (issued_instrs_++ * arch_.num_cores()) + id_; + auto trace = new pipeline_trace_t((issued_instrs_++ * arch_.num_cores()) + id_, arch_); - warp->eval(&state); + warp->eval(trace); - 
DT(3, cycle, "pipeline-schedule: " << state); + DT(3, cycle, "pipeline-schedule: " << *trace); // advance to fetch stage - fetch_stage_.push(state); + fetch_stage_.push(trace); } void Core::fetch(uint64_t cycle) { // handle icache reponse - { - MemRsp mem_rsp; - if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){ - pipeline_state_t state; - pending_icache_.remove(mem_rsp.tag, &state); - auto latency = (SimPlatform::instance().cycles() - state.icache_latency); - state.icache_latency = latency; - decode_stage_.push(state); - DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state); - } + auto& icache_rsp_port = icache_->CoreRspPorts.at(0); + if (!icache_rsp_port.empty()){ + auto& mem_rsp = icache_rsp_port.top(); + auto trace = pending_icache_.at(mem_rsp.tag); + auto latency = (SimPlatform::instance().cycles() - trace->icache_latency); + trace->icache_latency = latency; + decode_stage_.push(trace); + DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); + pending_icache_.release(mem_rsp.tag); + icache_rsp_port.pop(); } // send icache request - { - pipeline_state_t state; - if (fetch_stage_.try_pop(&state)) { - state.icache_latency = SimPlatform::instance().cycles(); - MemReq mem_req; - mem_req.addr = state.PC; - mem_req.write = false; - mem_req.tag = pending_icache_.allocate(state); - icache_->CoreReqPorts.at(0).send(mem_req, 1); - DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state); - } + if (!fetch_stage_.empty()) { + auto trace = fetch_stage_.top(); + trace->icache_latency = SimPlatform::instance().cycles(); + MemReq mem_req; + mem_req.addr = trace->PC; + mem_req.write = false; + mem_req.tag = pending_icache_.allocate(trace); + icache_->CoreReqPorts.at(0).send(mem_req, 1); + DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); + fetch_stage_.pop(); } // schedule next 
warp @@ -194,19 +206,21 @@ void Core::fetch(uint64_t cycle) { void Core::decode(uint64_t cycle) { __unused (cycle); - pipeline_state_t state; - if (!decode_stage_.try_pop(&state)) - return; + if (decode_stage_.empty()) + return; + + auto trace = decode_stage_.top(); // release warp - if (!state.stall_warp) { - stalled_warps_.reset(state.wid); + if (!trace->fetch_stall) { + stalled_warps_.reset(trace->wid); } - DT(3, cycle, "pipeline-decode: " << state); + DT(3, cycle, "pipeline-decode: " << *trace); // advance to issue stage - issue_stage_.push(state); + issue_stage_.push(trace); + decode_stage_.pop(); } void Core::issue(uint64_t cycle) { @@ -214,12 +228,13 @@ void Core::issue(uint64_t cycle) { if (!issue_stage_.empty()) { // insert to ibuffer - auto& state = issue_stage_.top(); - auto& ibuffer = ibuffers_.at(state.wid); - if (ibuffer.full()) { - DT(3, cycle, "*** ibuffer-stall: " << state); - } else { - ibuffer.push(state); + auto trace = issue_stage_.top(); + auto& ibuffer = ibuffers_.at(trace->wid); + if (!trace->check_stalled(ibuffer.full())) { + DT(3, cycle, "*** ibuffer-stall: " << *trace); + } + if (!ibuffer.full()) { + ibuffer.push(trace); issue_stage_.pop(); } } @@ -229,27 +244,30 @@ void Core::issue(uint64_t cycle) { if (ibuffer.empty()) continue; - auto& state = ibuffer.top(); + auto trace = ibuffer.top(); // check scoreboard - if (scoreboard_.in_use(state)) { + if (!trace->check_stalled(scoreboard_.in_use(trace))) { DTH(3, cycle, "*** scoreboard-stall: dependents={"); - auto owners = scoreboard_.owners(state); - for (uint32_t i = 0, n = owners.size(); i < n; ++i) { - if (i) DTN(3, ", "); - DTN(3, "#" << owners.at(i)); + auto uses = scoreboard_.get_uses(trace); + for (uint32_t i = 0, n = uses.size(); i < n; ++i) { + auto& use = uses.at(i); + __unused(use); + if (i) DTN(3, ", "); + DTN(3, use.type << use.reg << "(#" << use.owner << ")"); } - DTN(3, "}, " << state << std::endl); - continue; + DTN(3, "}, " << *trace << std::endl); } + if 
(scoreboard_.in_use(trace)) + continue; - DT(3, cycle, "pipeline-issue: " << state); + DT(3, cycle, "pipeline-issue: " << *trace); // update scoreboard - scoreboard_.reserve(state); + scoreboard_.reserve(trace); // advance to execute stage - execute_stage_.push(state); + execute_stage_.push(trace); ibuffer.pop(); break; @@ -259,11 +277,11 @@ void Core::issue(uint64_t cycle) { void Core::execute(uint64_t cycle) { // process stage inputs if (!execute_stage_.empty()) { - auto& state = execute_stage_.top(); - auto& exe_unit = exe_units_.at((int)state.exe_type); - exe_unit->push_input(state); + auto trace = execute_stage_.top(); + auto& exe_unit = exe_units_.at((int)trace->exe_type); + exe_unit->push(trace); + DT(3, cycle, "pipeline-execute: " << *trace); execute_stage_.pop(); - DT(3, cycle, "pipeline-execute: " << state); } // advance execute units @@ -273,13 +291,14 @@ void Core::execute(uint64_t cycle) { // commit completed instructions for (auto& exe_unit : exe_units_) { - pipeline_state_t state; - if (exe_unit->pop_output(&state)) { - if (state.stall_warp) { - stalled_warps_.reset(state.wid); + if (!exe_unit->empty()) { + auto trace = exe_unit->top(); + if (trace->fetch_stall) { + stalled_warps_.reset(trace->wid); } // advance to commit stage - commit_stage_.push(state); + commit_stage_.push(trace); + exe_unit->pop(); } } } @@ -287,21 +306,28 @@ void Core::execute(uint64_t cycle) { void Core::commit(uint64_t cycle) { __unused (cycle); - pipeline_state_t state; - if (!commit_stage_.try_pop(&state)) + if (commit_stage_.empty()) return; - DT(3, cycle, "pipeline-commit: " << state); + auto trace = commit_stage_.top(); + + DT(3, cycle, "pipeline-commit: " << *trace); // update scoreboard - scoreboard_.release(state); + scoreboard_.release(trace); assert(committed_instrs_ <= issued_instrs_); ++committed_instrs_; + + commit_stage_.pop(); + + // delete the trace + delete trace; } bool Core::running() const { - return (committed_instrs_ != issued_instrs_); + bool is_running 
= (committed_instrs_ != issued_instrs_); + return is_running; } Word Core::get_csr(Addr addr, int tid, int wid) { @@ -355,6 +381,12 @@ Word Core::get_csr(Addr addr, int tid, int wid) { // NumCycles return (Word)(SimPlatform::instance().cycles() >> 32); } else { + if (addr >= CSR_TEX(0,0) + && addr < CSR_TEX(NUM_TEX_UNITS,0)) { + uint32_t unit = CSR_TEX_UNIT(addr); + uint32_t state = CSR_TEX_STATE(addr); + return tex_units_.at(unit).get_state(state); + } return csrs_.at(addr); } } @@ -367,6 +399,13 @@ void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) { } else if (addr == CSR_FCSR) { fcsrs_.at(wid) = value & 0xff; } else { + if (addr >= CSR_TEX(0,0) + && addr < CSR_TEX(NUM_TEX_UNITS,0)) { + uint32_t unit = CSR_TEX_UNIT(addr); + uint32_t state = CSR_TEX_STATE(addr); + tex_units_.at(unit).set_state(state, value); + return; + } csrs_.at(addr) = value; } } @@ -390,29 +429,27 @@ Word Core::icache_read(Addr addr, Size size) { return data; } -Word Core::dcache_read(Addr addr, Size size) { - ++stats_loads_; +Word Core::dcache_read(Addr addr, Size size) { Word data = 0; -#ifdef SM_ENABLE - if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SMEM_BASE_ADDR)) { - shared_mem_.read(&data, addr & (SMEM_SIZE-1), size); - return data; + if (SM_ENABLE) { + if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) + && ((addr + 3) < SMEM_BASE_ADDR)) { + shared_mem_.read(&data, addr & (SMEM_SIZE-1), size); + return data; + } } -#endif mmu_.read(&data, addr, size, 0); return data; } -void Core::dcache_write(Addr addr, Word data, Size size) { - ++stats_stores_; -#ifdef SM_ENABLE - if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SMEM_BASE_ADDR)) { - shared_mem_.write(&data, addr & (SMEM_SIZE-1), size); - return; +void Core::dcache_write(Addr addr, Word data, Size size) { + if (SM_ENABLE) { + if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) + && ((addr + 3) < SMEM_BASE_ADDR)) { + shared_mem_.write(&data, addr & (SMEM_SIZE-1), size); + return; + } } -#endif if (addr >= 
IO_COUT_ADDR && addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { this->writeToStdOut(addr, data); @@ -421,11 +458,8 @@ void Core::dcache_write(Addr addr, Word data, Size size) { mmu_.write(&data, addr, size, 0); } -void Core::printStats() const { - std::cout << "Cycles: " << SimPlatform::instance().cycles() << std::endl - << "Insts : " << stats_insts_ << std::endl - << "Loads : " << stats_loads_ << std::endl - << "Stores: " << stats_stores_ << std::endl; +Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector* mem_addrs) { + return tex_units_.at(unit).read(u, v, lod, mem_addrs); } void Core::writeToStdOut(Addr addr, Word data) { @@ -439,10 +473,14 @@ void Core::writeToStdOut(Addr addr, Word data) { } } +void Core::trigger_ecall() { + ecall_ = true; +} + void Core::trigger_ebreak() { ebreak_ = true; } -bool Core::check_ebreak() const { - return ebreak_; +bool Core::check_exit() const { + return ebreak_ || ecall_; } \ No newline at end of file diff --git a/sim/simX/core.h b/sim/simX/core.h index ea1a6582..5066d8af 100644 --- a/sim/simX/core.h +++ b/sim/simX/core.h @@ -20,6 +20,7 @@ #include "ibuffer.h" #include "scoreboard.h" #include "exeunit.h" +#include "tex_unit.h" namespace vortex { @@ -34,8 +35,6 @@ public: void step(uint64_t cycle); - void printStats() const; - Word id() const { return id_; } @@ -72,9 +71,13 @@ public: void dcache_write(Addr, Word, Size); + Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector* mem_addrs); + + void trigger_ecall(); + void trigger_ebreak(); - bool check_ebreak() const; + bool check_exit() const; private: @@ -92,10 +95,8 @@ private: const ArchDef arch_; const Decoder decoder_; MemoryUnit mmu_; - -#ifdef SM_ENABLE RAM shared_mem_; -#endif + std::vector tex_units_; std::vector> warps_; std::vector barriers_; @@ -107,6 +108,7 @@ private: Cache::Ptr icache_; Cache::Ptr dcache_; Switch::Ptr l1_mem_switch_; + std::vector::Ptr> dcache_switch_; PipelineStage fetch_stage_; PipelineStage decode_stage_; @@ 
-114,20 +116,20 @@ private: PipelineStage execute_stage_; PipelineStage commit_stage_; - HashTable pending_icache_; + HashTable pending_icache_; WarpMask stalled_warps_; uint32_t last_schedule_wid_; uint32_t issued_instrs_; uint32_t committed_instrs_; + bool ecall_; bool ebreak_; std::unordered_map print_bufs_; uint64_t stats_insts_; - uint64_t stats_loads_; - uint64_t stats_stores_; friend class LsuUnit; + friend class GpuUnit; public: SlavePort MemRspPort; diff --git a/sim/simX/decode.cpp b/sim/simX/decode.cpp index 6530d223..a2957c64 100644 --- a/sim/simX/decode.cpp +++ b/sim/simX/decode.cpp @@ -41,14 +41,18 @@ static const std::unordered_map sc_instTable = { {Opcode::FMNMSUB, {false, InstType::R4_TYPE}}, {Opcode::VSET, {false, InstType::V_TYPE}}, {Opcode::GPGPU, {false, InstType::R_TYPE}}, + {Opcode::GPU, {false, InstType::R4_TYPE}}, }; -static const char* op_string(const Instr &instr) { - Word func3 = instr.getFunc3(); - Word func7 = instr.getFunc7(); - Word rs2 = instr.getRSrc(1); - Word imm = instr.getImm(); - switch (instr.getOpcode()) { +static const char* op_string(const Instr &instr) { + auto opcode = instr.getOpcode(); + Word func2 = instr.getFunc2(); + Word func3 = instr.getFunc3(); + Word func7 = instr.getFunc7(); + Word rs2 = instr.getRSrc(1); + Word imm = instr.getImm(); + + switch (opcode) { case Opcode::NOP: return "NOP"; case Opcode::LUI_INST: return "LUI"; case Opcode::AUIPC_INST: return "AUIPC"; @@ -120,7 +124,16 @@ static const char* op_string(const Instr &instr) { } case Opcode::SYS_INST: switch (func3) { - case 0: return imm ? 
"EBREAK" : "ECALL"; + case 0: + switch (imm) { + case 0x000: return "ECALL"; + case 0x001: return "EBREAK"; + case 0x002: return "URET"; + case 0x102: return "SRET"; + case 0x302: return "MRET"; + default: + std::abort(); + } case 1: return "CSRRW"; case 2: return "CSRRS"; case 3: return "CSRRC"; @@ -181,29 +194,43 @@ static const char* op_string(const Instr &instr) { case 1: return "WSPAWN"; case 2: return "SPLIT"; case 3: return "JOIN"; - case 4: return "BAR"; - case 6: return "PREFETCH"; + case 4: return "BAR"; + default: + std::abort(); + } + case Opcode::GPU: + switch (func3) { + case 0: return "TEX"; + case 1: { + switch (func2) { + case 0: return "CMOV"; + default: + std::abort(); + } + } default: std::abort(); } default: std::abort(); - } + } } namespace vortex { -std::ostream &operator<<(std::ostream &os, const Instr &instr) { - os << op_string(instr) << ": "; +std::ostream &operator<<(std::ostream &os, const Instr &instr) { auto opcode = instr.getOpcode(); + Word func2 = instr.getFunc2(); + Word func3 = instr.getFunc3(); + + os << op_string(instr) << ": "; + if (opcode == S_INST - || opcode == FS - || opcode == VS) { + || opcode == FS) { os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- "; os << instr.getRSType(1) << std::dec << instr.getRSrc(1); } else if (opcode == L_INST - || opcode == FL - || opcode == VL) { + || opcode == FL) { os << instr.getRDType() << std::dec << instr.getRDest() << " <- "; os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]"; } else { @@ -219,8 +246,10 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) { if (i) os << ", "; os << "imm=0x" << std::hex << instr.getImm(); } - } - + if (opcode == GPU && func3 == 0) { + os << ", unit=" << std::dec << func2; + } + } return os; } } @@ -239,6 +268,7 @@ Decoder::Decoder(const ArchDef &arch) { shift_func3_ = shift_rd_ + reg_s_; shift_rs1_ = shift_func3_ + func3_s_; shift_rs2_ = shift_rs1_ + 
reg_s_; + shift_func2_ = shift_rs2_ + reg_s_; shift_func7_ = shift_rs2_ + reg_s_; shift_rs3_ = shift_func7_ + func2_s_; shift_vmop_ = shift_func7_ + vmask_s_; @@ -247,7 +277,7 @@ Decoder::Decoder(const ArchDef &arch) { shift_vset_ = shift_func7_ + 6; reg_mask_ = 0x1f; - func2_mask_ = 0x2; + func2_mask_ = 0x3; func3_mask_ = 0x7; func6_mask_ = 0x3f; func7_mask_ = 0x7f; @@ -265,6 +295,7 @@ std::shared_ptr Decoder::decode(Word code) const { Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_); instr->setOpcode(op); + Word func2 = (code >> shift_func2_) & func2_mask_; Word func3 = (code >> shift_func3_) & func3_mask_; Word func6 = (code >> shift_func6_) & func6_mask_; Word func7 = (code >> shift_func7_) & func7_mask_; @@ -403,7 +434,7 @@ std::shared_ptr Decoder::decode(Word code) const { } } break; - case Opcode::VL: + case Opcode::FL: instr->setDestVReg(rd); instr->setSrcVReg(rs1); instr->setVlsWidth(func3); @@ -413,7 +444,7 @@ std::shared_ptr Decoder::decode(Word code) const { instr->setVnf((code >> shift_vnf_) & func3_mask_); break; - case Opcode::VS: + case Opcode::FS: instr->setVs3(rd); instr->setSrcVReg(rs1); instr->setVlsWidth(func3); @@ -428,10 +459,18 @@ std::shared_ptr Decoder::decode(Word code) const { } break; case R4_TYPE: - instr->setDestFReg(rd); - instr->setSrcFReg(rs1); - instr->setSrcFReg(rs2); - instr->setSrcFReg(rs3); + if (op == Opcode::GPU) { + instr->setDestReg(rd); + instr->setSrcReg(rs1); + instr->setSrcReg(rs2); + instr->setSrcReg(rs3); + } else { + instr->setDestFReg(rd); + instr->setSrcFReg(rs1); + instr->setSrcFReg(rs2); + instr->setSrcFReg(rs3); + } + instr->setFunc2(func2); instr->setFunc3(func3); break; default: diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp index ff705d82..d55ba2f9 100644 --- a/sim/simX/execute.cpp +++ b/sim/simX/execute.cpp @@ -49,11 +49,12 @@ inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid) } } -void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { 
+void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { assert(tmask_.any()); Word nextPC = PC_ + core_->arch().wsize(); + Word func2 = instr.getFunc2(); Word func3 = instr.getFunc3(); Word func6 = instr.getFunc6(); Word func7 = instr.getFunc7(); @@ -117,8 +118,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { case NOP: break; case LUI_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::ARITH; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -127,8 +128,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rd_write = true; break; case AUIPC_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::ARITH; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -137,10 +138,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rd_write = true; break; case R_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::ARITH; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -149,7 +150,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { case 0: // MUL rddata[t] = ((WordI)rsdata[t][0]) * ((WordI)rsdata[t][1]); - pipeline_state->alu.type = AluType::IMUL; + trace->alu.type = AluType::IMUL; break; case 1: { // MULH @@ -163,7 +164,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } uint64_t result = first * second; rddata[t] = (result >> 32) & 0xFFFFFFFF; - pipeline_state->alu.type = AluType::IMUL; + trace->alu.type = 
AluType::IMUL; } break; case 2: { // MULHSU @@ -173,14 +174,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } int64_t second = (int64_t)rsdata[t][1]; rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; - pipeline_state->alu.type = AluType::IMUL; + trace->alu.type = AluType::IMUL; } break; case 3: { // MULHU uint64_t first = (uint64_t)rsdata[t][0]; uint64_t second = (uint64_t)rsdata[t][1]; rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF; - pipeline_state->alu.type = AluType::IMUL; + trace->alu.type = AluType::IMUL; } break; case 4: { // DIV @@ -193,7 +194,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } else { rddata[t] = dividen / divisor; } - pipeline_state->alu.type = AluType::IDIV; + trace->alu.type = AluType::IDIV; } break; case 5: { // DIVU @@ -204,7 +205,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } else { rddata[t] = dividen / divisor; } - pipeline_state->alu.type = AluType::IDIV; + trace->alu.type = AluType::IDIV; } break; case 6: { // REM @@ -217,7 +218,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } else { rddata[t] = dividen % divisor; } - pipeline_state->alu.type = AluType::IDIV; + trace->alu.type = AluType::IDIV; } break; case 7: { // REMU @@ -228,7 +229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } else { rddata[t] = dividen % divisor; } - pipeline_state->alu.type = AluType::IDIV; + trace->alu.type = AluType::IDIV; } break; default: std::abort(); @@ -285,9 +286,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rd_write = true; break; case I_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::ARITH; - pipeline_state->used_iregs[rsrc0] = 1; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::ARITH; + trace->used_iregs.set(rsrc0); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ 
-336,10 +337,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rd_write = true; break; case B_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::BRANCH; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::BRANCH; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -385,107 +386,149 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; // runonce } - pipeline_state->stall_warp = true; + trace->fetch_stall = true; break; case JAL_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::BRANCH; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::BRANCH; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; rddata[t] = nextPC; nextPC = PC_ + immsrc; - pipeline_state->stall_warp = true; + trace->fetch_stall = true; break; // runonce } rd_write = true; break; case JALR_INST: - pipeline_state->exe_type = ExeType::ALU; - pipeline_state->alu.type = AluType::BRANCH; - pipeline_state->used_iregs[rsrc0] = 1; + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::BRANCH; + trace->used_iregs.set(rsrc0); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; rddata[t] = nextPC; nextPC = rsdata[t][0] + immsrc; - pipeline_state->stall_warp = true; + trace->fetch_stall = true; break; // runOnce } rd_write = true; break; case L_INST: - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::LOAD; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->mem_addrs.resize(num_threads); - for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned - Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; - Word 
data_read = core_->dcache_read(memAddr, 4); - pipeline_state->mem_addrs.at(t) = memAddr; - DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); - switch (func3) { - case 0: - // LBI - rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8); - break; - case 1: - // LHI - rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16); - break; - case 2: - // LW - rddata[t] = data_read; - break; - case 4: - // LBU - rddata[t] = Word((data_read >> shift_by) & 0xFF); - break; - case 5: - // LHU - rddata[t] = Word((data_read >> shift_by) & 0xFFFF); - break; - default: - std::abort(); + case FL: + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::LOAD; + trace->used_iregs.set(rsrc0); + if (opcode == L_INST + || (opcode == FL && func3 == 2)) { + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned + Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; + Word data_read = core_->dcache_read(memAddr, 4); + trace->mem_addrs.at(t).push_back(memAddr); + DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); + switch (func3) { + case 0: + // LBI + rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8); + break; + case 1: + // LHI + rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16); + break; + case 2: + // LW + rddata[t] = data_read; + break; + case 4: + // LBU + rddata[t] = Word((data_read >> shift_by) & 0xFF); + break; + case 5: + // LHU + rddata[t] = Word((data_read >> shift_by) & 0xFFFF); + break; + default: + std::abort(); + } } - } - rd_write = true; - break; - case S_INST: - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::STORE; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - pipeline_state->mem_addrs.resize(num_threads); - for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - Word memAddr = rsdata[t][0] + immsrc; - 
pipeline_state->mem_addrs.at(t) = memAddr; - DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - switch (func3) { - case 0: - // SB - core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1); - break; - case 1: - // SH - core_->dcache_write(memAddr, rsdata[t][1], 2); - break; - case 2: - // SW - core_->dcache_write(memAddr, rsdata[t][1], 4); - break; + } else { + DP(4, "Executing vector load"); + DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); + DP(4, "dest: v" << rdest); + DP(4, "width" << instr.getVlsWidth()); + auto &vd = vRegFile_.at(rdest); + switch (instr.getVlsWidth()) { + case 6: { + // load word and unit strided (not checking for unit stride) + for (int i = 0; i < vl_; i++) { + Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); + DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr); + Word data_read = core_->dcache_read(memAddr, 4); + DP(4, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); + int *result_ptr = (int *)(vd.data() + i); + *result_ptr = data_read; + } + } break; default: std::abort(); } } + rd_write = true; + break; + case S_INST: + case FS: + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::STORE; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + if (opcode == S_INST + || (opcode == FS && func3 == 2)) { + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + Word memAddr = rsdata[t][0] + immsrc; + trace->mem_addrs.at(t).push_back(memAddr); + DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + switch (func3) { + case 0: + // SB + core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1); + break; + case 1: + // SH + core_->dcache_write(memAddr, rsdata[t][1], 2); + break; + case 2: + // SW + core_->dcache_write(memAddr, rsdata[t][1], 4); + break; + default: + std::abort(); + } + } + } else { + for (int i = 0; i < vl_; i++) { + Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8); + DP(4, "STORE 
MEM: ADDRESS=0x" << std::hex << memAddr); + switch (instr.getVlsWidth()) { + case 6: { + // store word and unit strided (not checking for unit stride) + uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i); + core_->dcache_write(memAddr, value, 4); + DP(4, "store: " << memAddr << " value:" << value); + } break; + default: + std::abort(); + } + } + } break; case SYS_INST: - pipeline_state->exe_type = ExeType::CSR; + trace->exe_type = ExeType::CSR; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -493,30 +536,40 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { Word csr_value = core_->get_csr(csr_addr, t, id_); switch (func3) { case 0: - if (csr_addr < 2) { - // ECALL/EBREAK + switch (csr_addr) { + case 0: // ECALL + core_->trigger_ecall(); + break; + case 1: // EBREAK core_->trigger_ebreak(); - } + break; + case 0x002: // URET + case 0x102: // SRET + case 0x302: // MRET + break; + default: + std::abort(); + } break; case 1: // CSRRW rddata[t] = csr_value; core_->set_csr(csr_addr, rsdata[t][0], t, id_); - pipeline_state->used_iregs[rsrc0] = 1; + trace->used_iregs.set(rsrc0); rd_write = true; break; case 2: // CSRRS rddata[t] = csr_value; core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_); - pipeline_state->used_iregs[rsrc0] = 1; + trace->used_iregs.set(rsrc0); rd_write = true; break; case 3: // CSRRC rddata[t] = csr_value; core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_); - pipeline_state->used_iregs[rsrc0] = 1; + trace->used_iregs.set(rsrc0); rd_write = true; break; case 5: @@ -543,88 +596,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; case FENCE: - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::FENCE; - pipeline_state->stall_warp = true; - break; - case (FL | VL): - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::LOAD; - pipeline_state->used_iregs[rsrc0] = 1; - if (func3 
== 0x2) { - pipeline_state->mem_addrs.resize(num_threads); - for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - Word memAddr = rsdata[t][0] + immsrc; - pipeline_state->mem_addrs.at(t) = memAddr; - Word data_read = core_->dcache_read(memAddr, 4); - DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); - rddata[t] = data_read; - } - } else { - DP(3, "Executing vector load"); - DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); - DP(3, "dest: v" << rdest); - DP(3, "width" << instr.getVlsWidth()); - pipeline_state->mem_addrs.resize(vl_); - auto &vd = vRegFile_.at(rdest); - switch (instr.getVlsWidth()) { - case 6: { - // load word and unit strided (not checking for unit stride) - for (int i = 0; i < vl_; i++) { - Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); - pipeline_state->mem_addrs.at(i) = memAddr; - DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - Word data_read = core_->dcache_read(memAddr, 4); - DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); - int *result_ptr = (int *)(vd.data() + i); - *result_ptr = data_read; - } - } break; - default: - std::abort(); - } - break; - } - rd_write = true; - break; - case (FS | VS): - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::STORE; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - if (func3 == 0x2) { - pipeline_state->mem_addrs.resize(num_threads); - for (int t = 0; t < num_threads; ++t) { - if (!tmask_.test(t)) - continue; - Word memAddr = rsdata[t][0] + immsrc; - pipeline_state->mem_addrs.at(t) = memAddr; - core_->dcache_write(memAddr, rsdata[t][1], 4); - DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - } - } else { - pipeline_state->mem_addrs.resize(vl_); - for (int i = 0; i < vl_; i++) { - Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8); - pipeline_state->mem_addrs.at(i) = memAddr; - 
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); - switch (instr.getVlsWidth()) { - case 6: { - //store word and unit strided (not checking for unit stride) - uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i); - core_->dcache_write(memAddr, value, 4); - DP(3, "store: " << memAddr << " value:" << value); - } break; - default: - std::abort(); - } - } - } - break; + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::FENCE; + trace->fetch_stall = true; + break; case FCI: - pipeline_state->exe_type = ExeType::FPU; + trace->exe_type = ExeType::FPU; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -633,32 +610,32 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { switch (func7) { case 0x00: //FADD rddata[t] = rv_fadd(rsdata[t][0], rsdata[t][1], frm, &fflags); - pipeline_state->fpu.type = FpuType::FMA; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x04: //FSUB rddata[t] = rv_fsub(rsdata[t][0], rsdata[t][1], frm, &fflags); - pipeline_state->fpu.type = FpuType::FMA; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x08: //FMUL rddata[t] = rv_fmul(rsdata[t][0], rsdata[t][1], frm, &fflags); - pipeline_state->fpu.type = FpuType::FMA; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x0c: //FDIV rddata[t] = rv_fdiv(rsdata[t][0], rsdata[t][1], frm, &fflags); - pipeline_state->fpu.type = FpuType::FDIV; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FDIV; + trace->used_fregs.set(rsrc0); + 
trace->used_fregs.set(rsrc1); break; case 0x2c: //FSQRT rddata[t] = rv_fsqrt(rsdata[t][0], frm, &fflags); - pipeline_state->fpu.type = FpuType::FSQRT; - pipeline_state->used_fregs[rsrc0] = 1; + trace->fpu.type = FpuType::FSQRT; + trace->used_fregs.set(rsrc0); break; case 0x10: switch (func3) { @@ -672,9 +649,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rddata[t] = rv_fsgnjx(rsdata[t][0], rsdata[t][1]); break; } - pipeline_state->fpu.type = FpuType::FNCP; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x14: if (func3) { @@ -684,9 +661,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { // FMIN.S rddata[t] = rv_fmin(rsdata[t][0], rsdata[t][1], &fflags); } - pipeline_state->fpu.type = FpuType::FNCP; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x60: if (rsrc1 == 0) { @@ -696,8 +673,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { // FCVT.WU.S rddata[t] = rv_ftou(rsdata[t][0], frm, &fflags); } - pipeline_state->fpu.type = FpuType::FCVT; - pipeline_state->used_fregs[rsrc0] = 1; + trace->fpu.type = FpuType::FCVT; + trace->used_fregs.set(rsrc0); break; case 0x70: if (func3) { @@ -707,8 +684,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { // FMV.X.W rddata[t] = rsdata[t][0]; } - pipeline_state->fpu.type = FpuType::FNCP; - pipeline_state->used_fregs[rsrc0] = 1; + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); break; case 0x50: switch(func3) { @@ -725,9 +702,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { rddata[t] = rv_feq(rsdata[t][0], rsdata[t][1], &fflags); break; } - pipeline_state->fpu.type = FpuType::FNCP; 
- pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; + trace->fpu.type = FpuType::FNCP; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); break; case 0x68: if (rsrc1) { @@ -737,14 +714,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { // FCVT.S.W: rddata[t] = rv_itof(rsdata[t][0], frm, &fflags); } - pipeline_state->fpu.type = FpuType::FCVT; - pipeline_state->used_iregs[rsrc0] = 1; + trace->fpu.type = FpuType::FCVT; + trace->used_iregs.set(rsrc0); break; case 0x78: // FMV.W.X rddata[t] = rsdata[t][0]; - pipeline_state->fpu.type = FpuType::FNCP; - pipeline_state->used_iregs[rsrc0] = 1; + trace->fpu.type = FpuType::FNCP; + trace->used_iregs.set(rsrc0); break; } update_fcrs(fflags, core_, t, id_); @@ -755,10 +732,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { case FMSUB: case FMNMADD: case FMNMSUB: - pipeline_state->fpu.type = FpuType::FMA; - pipeline_state->used_fregs[rsrc0] = 1; - pipeline_state->used_fregs[rsrc1] = 1; - pipeline_state->used_fregs[rsrc2] = 1; + trace->fpu.type = FpuType::FMA; + trace->used_fregs.set(rsrc0); + trace->used_fregs.set(rsrc1); + trace->used_fregs.set(rsrc2); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -784,8 +761,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } rd_write = true; break; - case GPGPU: { - pipeline_state->exe_type = ExeType::GPU; + case GPGPU: { int ts = 0; for (int t = 0; t < num_threads; ++t) { if (tmask_.test(t)) { @@ -795,10 +771,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } switch (func3) { case 0: { - // TMC - pipeline_state->gpu.type = GpuType::TMC; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->stall_warp = true; + // TMC + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::TMC; + trace->used_iregs.set(rsrc0); + trace->fetch_stall = true; if (rsrc1) { // predicate mode ThreadMask pred; @@ 
-823,10 +800,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; case 1: { // WSPAWN - pipeline_state->gpu.type = GpuType::WSPAWN; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - pipeline_state->stall_warp = true; + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::WSPAWN; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + trace->fetch_stall = true; int active_warps = std::min(rsdata.at(ts)[0], core_->arch().num_warps()); DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]); for (int i = 1; i < active_warps; ++i) { @@ -837,9 +815,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; case 2: { // SPLIT - pipeline_state->gpu.type = GpuType::SPLIT; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->stall_warp = true; + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::SPLIT; + trace->used_iregs.set(rsrc0); + trace->fetch_stall = true; if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) { ThreadMask tmask; for (int i = 0; i < num_threads; ++i) { @@ -868,8 +847,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; case 3: { // JOIN - pipeline_state->gpu.type = GpuType::JOIN; - pipeline_state->stall_warp = true; + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::JOIN; + trace->fetch_stall = true; if (!domStack_.empty() && domStack_.top().unanimous) { DP(3, "*** Uninimous branch at join"); tmask_ = domStack_.top().tmask; @@ -893,18 +873,19 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } break; case 4: { // BAR - pipeline_state->gpu.type = GpuType::BAR; - pipeline_state->used_iregs[rsrc0] = 1; - pipeline_state->used_iregs[rsrc1] = 1; - pipeline_state->stall_warp = true; + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::BAR; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + 
trace->fetch_stall = true; active_ = false; core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); } break; - case 6: { + case 5: { // PREFETCH - pipeline_state->exe_type = ExeType::LSU; - pipeline_state->lsu.type = LsuType::PREFETCH; - pipeline_state->used_iregs[rsrc0] = 1; + trace->exe_type = ExeType::LSU; + trace->lsu.type = LsuType::PREFETCH; + trace->used_iregs.set(rsrc0); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; @@ -915,7 +896,50 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { default: std::abort(); } - } break; + } break; + case GPU: { + switch (func3) { + case 0: { // TEX + trace->exe_type = ExeType::GPU; + trace->gpu.type = GpuType::TEX; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + trace->used_iregs.set(rsrc2); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + auto unit = func2; + auto u = rsdata[t][0]; + auto v = rsdata[t][1]; + auto lod = rsdata[t][2]; + auto color = core_->tex_read(unit, u, v, lod, &trace->mem_addrs.at(t)); + rddata[t] = color; + } + rd_write = true; + } break; + case 1: + switch (func2) { + case 0: { // CMOV + trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::CMOV; + trace->used_iregs.set(rsrc0); + trace->used_iregs.set(rsrc1); + trace->used_iregs.set(rsrc2); + for (int t = 0; t < num_threads; ++t) { + if (!tmask_.test(t)) + continue; + rddata[t] = rsdata[t][0] ? 
rsdata[t][1] : rsdata[t][2]; + } + rd_write = true; + } break; + default: + std::abort(); + } + break; + default: + std::abort(); + } + } break; case VSET: { int VLEN = core_->arch().vsize() * 8; int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); @@ -966,7 +990,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 24: { - //vmseq + // vmseq auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -997,7 +1021,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 25: { - //vmsne + // vmsne auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1028,7 +1052,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 26: { - //vmsltu + // vmsltu auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1059,7 +1083,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 27: { - //vmslt + // vmslt auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1090,7 +1114,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 28: { - //vmsleu + // vmsleu auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1121,7 +1145,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 29: { - //vmsle + // vmsle auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1152,7 +1176,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 30: { - //vmsgtu + // vmsgtu auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1183,7 +1207,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t 
*pipeline_state) { } } break; case 31: { - //vmsgt + // vmsgt auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1356,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 27: { - //vmxor + // vmxor auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1402,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 28: { - //vmornot + // vmornot auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1448,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 29: { - //vmnand + // vmnand auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1494,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 30: { - //vmnor + // vmnor auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1540,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 31: { - //vmxnor + // vmxnor auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1586,7 +1610,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } } break; case 37: { - //vmul + // vmul auto &vr1 = vRegFile_.at(rsrc0); auto &vr2 = vRegFile_.at(rsrc1); auto &vd = vRegFile_.at(rdest); @@ -1769,7 +1793,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { } if (rd_write) { - pipeline_state->wb = true; + trace->wb = true; DPH(2, "Dest Reg: "); auto rdt = instr.getRDType(); switch (rdt) { @@ -1786,7 +1810,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); - 
pipeline_state->used_iregs[rdest] = 1; + trace->used_iregs[rdest] = 1; } break; case RegType::Float: @@ -1801,7 +1825,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) { DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); - pipeline_state->used_fregs[rdest] = 1; + trace->used_fregs[rdest] = 1; break; default: std::abort(); diff --git a/sim/simX/exeunit.cpp b/sim/simX/exeunit.cpp index ba280812..1d0a3cfc 100644 --- a/sim/simX/exeunit.cpp +++ b/sim/simX/exeunit.cpp @@ -6,16 +6,18 @@ #include #include "debug.h" #include "core.h" +#include "constants.h" using namespace vortex; NopUnit::NopUnit(Core*) : ExeUnit("NOP") {} void NopUnit::step(uint64_t /*cycle*/) { - pipeline_state_t state; - if (!inputs_.try_pop(&state)) + if (inputs_.empty()) return; - this->schedule_output(state, 1); + auto trace = inputs_.top(); + this->schedule_output(trace, 1); + inputs_.pop(); } /////////////////////////////////////////////////////////////////////////////// @@ -33,19 +35,23 @@ void LsuUnit::step(uint64_t cycle) { // handle dcache response for (uint32_t t = 0; t < num_threads_; ++t) { - MemRsp mem_rsp; - if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp)) + auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0); + if (dcache_rsp_port.empty()) continue; - auto& entry = pending_dcache_.at(mem_rsp.tag); - DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first); - assert(entry.second.test(t)); - entry.second.reset(t); // track remaining blocks - if (!entry.second.any()) { - auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); - entry.first.dcache_latency = latency; - this->schedule_output(entry.first, 1); + auto& mem_rsp = dcache_rsp_port.top(); + auto& entry = pending_dcache_.at(mem_rsp.tag); + auto trace = entry.first; + DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" 
<< trace->lsu.type + << ", tid=" << t << ", " << *trace); + assert(entry.second); + --entry.second; // track remaining blocks + if (0 == entry.second) { + auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency); + trace->dcache_latency = latency; + this->schedule_output(trace, 1); pending_dcache_.release(mem_rsp.tag); - } + } + dcache_rsp_port.pop(); } if (fence_lock_) { @@ -61,36 +67,83 @@ void LsuUnit::step(uint64_t cycle) { if (inputs_.empty()) return; - auto state = inputs_.top(); + auto trace = inputs_.top(); - if (state.lsu.type == LsuType::FENCE) { + if (trace->lsu.type == LsuType::FENCE) { // schedule fence lock - fence_state_ = state; - fence_lock_ = true; - inputs_.pop(); - DT(3, cycle, "fence-lock: " << state); + fence_state_ = trace; + fence_lock_ = true; + DT(3, cycle, "fence-lock: " << *trace); + // remove input + inputs_.pop(); return; } // check pending queue capacity - if (pending_dcache_.full()) { - DT(3, cycle, "*** lsu-queue-stall: " << state); + if (!trace->check_stalled(pending_dcache_.full())) { + DT(3, cycle, "*** lsu-queue-stall: " << *trace); + } + if (pending_dcache_.full()) return; + + // send memory request + + bool has_shared_memory = false; + bool mem_rsp_pending = false; + bool is_write = (trace->lsu.type == LsuType::STORE); + + uint32_t valid_addrs = 0; + for (auto& mem_addr : trace->mem_addrs) { + valid_addrs += mem_addr.size(); + } + + trace->dcache_latency = SimPlatform::instance().cycles(); + auto tag = pending_dcache_.allocate({trace, valid_addrs}); + + for (uint32_t t = 0; t < num_threads_; ++t) { + if (!trace->tmask.test(t)) + continue; + + auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0); + for (auto mem_addr : trace->mem_addrs.at(t)) { + // check shared memory address + if (SM_ENABLE) { + if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE)) + && (mem_addr < SMEM_BASE_ADDR)) { + DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag + << ", type=" << trace->lsu.type << ", tid=" 
<< t << ", " << *trace); + has_shared_memory = true; + continue; + } + } + + bool is_io = (mem_addr >= IO_BASE_ADDR); + + MemReq mem_req; + mem_req.addr = mem_addr; + mem_req.write = is_write; + mem_req.tag = tag; + mem_req.is_io = is_io; + dcache_req_port.send(mem_req, 1); + DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag + << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace); + // do not wait on writes + mem_rsp_pending = !is_write; + } } - // send dcache request - state.dcache_latency = SimPlatform::instance().cycles(); - auto tag = pending_dcache_.allocate({state, state.tmask}); - for (uint32_t t = 0; t < num_threads_; ++t) { - if (!state.tmask.test(t)) - continue; - MemReq mem_req; - mem_req.addr = state.mem_addrs.at(t); - mem_req.write = (state.lsu.type == LsuType::STORE); - mem_req.tag = tag; - core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1); - DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state); - } + // do not wait + if (!mem_rsp_pending) { + pending_dcache_.release(tag); + uint32_t delay = 1; + if (has_shared_memory) { + // all threads accessed shared memory + delay += Constants::SMEM_DELAY; + } + this->schedule_output(trace, delay); + } + + // remove input inputs_.pop(); } @@ -98,23 +151,27 @@ void LsuUnit::step(uint64_t cycle) { AluUnit::AluUnit(Core*) : ExeUnit("ALU") {} -void AluUnit::step(uint64_t /*cycle*/) { - pipeline_state_t state; - if (!inputs_.try_pop(&state)) +void AluUnit::step(uint64_t /*cycle*/) { + if (inputs_.empty()) return; - switch (state.alu.type) { - case AluType::ARITH: - this->schedule_output(state, 1); - break; + auto trace = inputs_.top(); + switch (trace->alu.type) { + case AluType::ARITH: case AluType::BRANCH: - this->schedule_output(state, 1); + case AluType::CMOV: + this->schedule_output(trace, 1); + inputs_.pop(); break; case AluType::IMUL: - 
this->schedule_output(state, LATENCY_IMUL); + this->schedule_output(trace, LATENCY_IMUL); + inputs_.pop(); break; case AluType::IDIV: - this->schedule_output(state, XLEN); + this->schedule_output(trace, XLEN); + inputs_.pop(); break; + default: + std::abort(); } } @@ -123,10 +180,11 @@ void AluUnit::step(uint64_t /*cycle*/) { CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {} void CsrUnit::step(uint64_t /*cycle*/) { - pipeline_state_t state; - if (!inputs_.try_pop(&state)) + if (inputs_.empty()) return; - this->schedule_output(state, 1); + auto trace = inputs_.top(); + this->schedule_output(trace, 1); + inputs_.pop(); } /////////////////////////////////////////////////////////////////////////////// @@ -134,46 +192,127 @@ void CsrUnit::step(uint64_t /*cycle*/) { FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {} void FpuUnit::step(uint64_t /*cycle*/) { - pipeline_state_t state; - if (!inputs_.try_pop(&state)) + if (inputs_.empty()) return; - switch (state.fpu.type) { + auto trace = inputs_.top(); + switch (trace->fpu.type) { case FpuType::FNCP: - this->schedule_output(state, 1); + this->schedule_output(trace, 1); + inputs_.pop(); break; case FpuType::FMA: - this->schedule_output(state, LATENCY_FMA); + this->schedule_output(trace, LATENCY_FMA); + inputs_.pop(); break; case FpuType::FDIV: - this->schedule_output(state, LATENCY_FDIV); + this->schedule_output(trace, LATENCY_FDIV); + inputs_.pop(); break; case FpuType::FSQRT: - this->schedule_output(state, LATENCY_FSQRT); + this->schedule_output(trace, LATENCY_FSQRT); + inputs_.pop(); break; case FpuType::FCVT: - this->schedule_output(state, LATENCY_FCVT); + this->schedule_output(trace, LATENCY_FCVT); + inputs_.pop(); break; + default: + std::abort(); } } /////////////////////////////////////////////////////////////////////////////// -GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {} +GpuUnit::GpuUnit(Core* core) + : ExeUnit("GPU") + , core_(core) + , num_threads_(core->arch().num_threads()) + , pending_tex_reqs_(TEXQ_SIZE) +{} -void 
GpuUnit::step(uint64_t /*cycle*/) { - pipeline_state_t state; - if (!inputs_.try_pop(&state)) +void GpuUnit::step(uint64_t cycle) { + __unused (cycle); +#ifdef EXT_TEX_ENABLE + // handle memory response + for (uint32_t t = 0; t < num_threads_; ++t) { + auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1); + if (dcache_rsp_port.empty()) + continue; + auto& mem_rsp = dcache_rsp_port.top(); + auto& entry = pending_tex_reqs_.at(mem_rsp.tag); + auto trace = entry.first; + DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace); + assert(entry.second); + --entry.second; // track remaining blocks + if (0 == entry.second) { + auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency); + trace->dcache_latency = latency; + this->schedule_output(trace, 1); + pending_tex_reqs_.release(mem_rsp.tag); + } + dcache_rsp_port.pop(); + } +#endif + + // check input queue + if (inputs_.empty()) return; - switch (state.gpu.type) { + + auto trace = inputs_.top(); + + switch (trace->gpu.type) { case GpuType::TMC: case GpuType::WSPAWN: case GpuType::SPLIT: case GpuType::JOIN: case GpuType::BAR: - this->schedule_output(state, 1); - break; - case GpuType::TEX: - /* TODO */ + this->schedule_output(trace, 1); + inputs_.pop(); break; + case GpuType::TEX: { + if (this->processTexRequest(cycle, trace)) + inputs_.pop(); + } break; + default: + std::abort(); } +} + +bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { + __unused (cycle); + + // check pending queue capacity + if (!trace->check_stalled(pending_tex_reqs_.full())) { + DT(3, cycle, "*** tex-queue-stall: " << *trace); + } + if (pending_tex_reqs_.full()) + return false; + + // send memory request + + uint32_t valid_addrs = 0; + for (auto& mem_addr : trace->mem_addrs) { + valid_addrs += mem_addr.size(); + } + + trace->tex_latency = SimPlatform::instance().cycles(); + auto tag = pending_tex_reqs_.allocate({trace, valid_addrs}); + + for (uint32_t t = 0; t < 
num_threads_; ++t) { + if (!trace->tmask.test(t)) + continue; + + auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1); + for (auto mem_addr : trace->mem_addrs.at(t)) { + MemReq mem_req; + mem_req.addr = mem_addr; + mem_req.write = (trace->lsu.type == LsuType::STORE); + mem_req.tag = tag; + dcache_req_port.send(mem_req, 1); + DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag + << ", tid=" << t << ", "<< trace); + } + } + + return true; } \ No newline at end of file diff --git a/sim/simX/exeunit.h b/sim/simX/exeunit.h index 3b2bbf91..83e69463 100644 --- a/sim/simX/exeunit.h +++ b/sim/simX/exeunit.h @@ -11,36 +11,43 @@ class Core; class ExeUnit { protected: const char* name_; - Queue inputs_; - Queue outputs_; + Queue inputs_; + Queue outputs_; - void schedule_output(const pipeline_state_t& state, uint32_t delay) { + void schedule_output(pipeline_trace_t* trace, uint32_t delay) { if (delay > 1) { SimPlatform::instance().schedule( - [&](const pipeline_state_t& req) { + [&](pipeline_trace_t* req) { outputs_.push(req); }, - state, + trace, (delay - 1) ); } else { - outputs_.push(state); + outputs_.push(trace); } } public: typedef std::shared_ptr Ptr; - ExeUnit(const char* name) : name_(name) {} - + ExeUnit(const char* name) : name_(name) {} virtual ~ExeUnit() {} - void push_input(const pipeline_state_t& state) { - inputs_.push(state); + void push(pipeline_trace_t* trace) { + inputs_.push(trace); } - bool pop_output(pipeline_state_t* state) { - return outputs_.try_pop(state); + bool empty() const { + return outputs_.empty(); + } + + pipeline_trace_t* top() const { + return outputs_.top(); + } + + void pop() { + outputs_.pop(); } virtual void step(uint64_t cycle) = 0; @@ -61,8 +68,8 @@ class LsuUnit : public ExeUnit { private: Core* core_; uint32_t num_threads_; - HashTable> pending_dcache_; - pipeline_state_t fence_state_; + HashTable> pending_dcache_; + pipeline_trace_t* fence_state_; bool fence_lock_; public: @@ -101,6 +108,13 @@ 
public: /////////////////////////////////////////////////////////////////////////////// class GpuUnit : public ExeUnit { +private: + Core* core_; + uint32_t num_threads_; + HashTable> pending_tex_reqs_; + + bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace); + public: GpuUnit(Core*); diff --git a/sim/simX/ibuffer.h b/sim/simX/ibuffer.h index 86bdeed7..b4c6f51e 100644 --- a/sim/simX/ibuffer.h +++ b/sim/simX/ibuffer.h @@ -7,7 +7,7 @@ namespace vortex { class IBuffer { private: - std::queue entries_; + std::queue entries_; uint32_t capacity_; public: @@ -23,12 +23,12 @@ public: return (entries_.size() == capacity_); } - const pipeline_state_t& top() const { + pipeline_trace_t* top() const { return entries_.front(); } - void push(const pipeline_state_t& state) { - entries_.emplace(state); + void push(pipeline_trace_t* trace) { + entries_.emplace(trace); } void pop() { diff --git a/sim/simX/instr.h b/sim/simX/instr.h index 5deace6c..334b8565 100644 --- a/sim/simX/instr.h +++ b/sim/simX/instr.h @@ -29,10 +29,9 @@ enum Opcode { FMNMADD = 0x4f, // Vector Extension VSET = 0x57, - VL = 0x7, - VS = 0x27, // GPGPU Extension GPGPU = 0x6b, + GPU = 0x5b, }; enum InstType { @@ -70,6 +69,7 @@ public: void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; } void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; } void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; } + void setFunc2(Word func2) { func2_ = func2; } void setFunc3(Word func3) { func3_ = func3; } void setFunc7(Word func7) { func7_ = func7; } void setImm(Word imm) { has_imm_ = true; imm_ = imm; } @@ -85,6 +85,7 @@ public: /* Getters used by encoders. 
*/ Opcode getOpcode() const { return opcode_; } + Word getFunc2() const { return func2_; } Word getFunc3() const { return func3_; } Word getFunc6() const { return func6_; } Word getFunc7() const { return func7_; } @@ -118,6 +119,7 @@ private: RegType rsrc_type_[MAX_REG_SOURCES]; int rsrc_[MAX_REG_SOURCES]; int rdest_; + Word func2_; Word func3_; Word func6_; diff --git a/sim/simX/memsim.cpp b/sim/simX/memsim.cpp index 63ba571a..6559000d 100644 --- a/sim/simX/memsim.cpp +++ b/sim/simX/memsim.cpp @@ -20,14 +20,16 @@ public: void step(uint64_t /*cycle*/) { for (uint32_t i = 0, n = num_banks_; i < n; ++i) { - MemReq mem_req; - if (!simobject_->MemReqPorts.at(i).read(&mem_req)) + auto& mem_req_port = simobject_->MemReqPorts.at(i); + if (mem_req_port.empty()) continue; + auto& mem_req = mem_req_port.top(); if (!mem_req.write) { MemRsp mem_rsp; mem_rsp.tag = mem_req.tag; simobject_->MemRspPorts.at(i).send(mem_rsp, latency_); } + mem_req_port.pop(); } } }; diff --git a/sim/simX/memsim.h b/sim/simX/memsim.h index 24d8e6ca..3d5b33fe 100644 --- a/sim/simX/memsim.h +++ b/sim/simX/memsim.h @@ -10,10 +10,22 @@ struct MemReq { uint64_t addr; uint32_t tag; bool write; + bool is_io; + + MemReq(uint64_t _addr = 0, + uint64_t _tag = 0, + bool _write = false, + bool _is_io = false + ) : addr(_addr) + , tag(_tag) + , write(_write) + , is_io(_is_io) + {} }; struct MemRsp { - uint32_t tag; + uint64_t tag; + MemRsp(uint64_t _tag = 0) : tag (_tag) {} }; class MemSim : public SimObject{ diff --git a/sim/simX/pipeline.h b/sim/simX/pipeline.h index b5937b29..a5bf6d52 100644 --- a/sim/simX/pipeline.h +++ b/sim/simX/pipeline.h @@ -5,11 +5,12 @@ #include #include #include "types.h" +#include "archdef.h" #include "debug.h" namespace vortex { -struct pipeline_state_t { +struct pipeline_trace_t { //-- uint64_t id; @@ -20,17 +21,24 @@ struct pipeline_state_t { Word PC; //-- - bool stall_warp; + bool fetch_stall; + bool pipeline_stall; + + //-- bool wb; RegType rdest_type; int rdest; + + //-- RegMask 
used_iregs; RegMask used_fregs; RegMask used_vregs; //- ExeType exe_type; - std::vector mem_addrs; + + //-- + std::vector> mem_addrs; //-- union { @@ -51,27 +59,37 @@ struct pipeline_state_t { // stats uint64_t icache_latency; uint64_t dcache_latency; + uint64_t tex_latency; - void clear() { + pipeline_trace_t(uint64_t id_, const ArchDef& arch) { + id = id_; cid = 0; wid = 0; tmask.reset(); - PC = 0; - stall_warp = false; - wb = false; + PC = 0; + fetch_stall = false; + pipeline_stall = false; + wb = false; rdest = 0; rdest_type = RegType::None; used_iregs.reset(); used_fregs.reset(); used_vregs.reset(); exe_type = ExeType::NOP; - mem_addrs.clear(); + mem_addrs.resize(arch.num_threads()); icache_latency = 0; dcache_latency = 0; + tex_latency = 0; + } + + bool check_stalled(bool stall) { + bool old = pipeline_stall; + pipeline_stall = stall; + return stall ? old : true; } }; -inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) { +inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) { os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC; os << ", wb=" << state.wb; if (state.wb) { @@ -82,10 +100,9 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) return os; } -class PipelineStage : public Queue { +class PipelineStage : public Queue { protected: const char* name_; - friend std::ostream &operator<<(std::ostream &, const pipeline_state_t&); public: PipelineStage(const char* name = nullptr) diff --git a/sim/simX/processor.cpp b/sim/simX/processor.cpp index be5cd4f4..7b54b505 100644 --- a/sim/simX/processor.cpp +++ b/sim/simX/processor.cpp @@ -33,7 +33,8 @@ Processor::Processor(const ArchDef& arch) L3_NUM_BANKS, // number of banks L3_NUM_PORTS, // number of ports NUM_CLUSTERS, // request size - true, // write-throught + true, // write-through + false, // write response 0, // victim size L3_MSHR_SIZE, // mshr 2, // pipeline latency @@ -74,7 +75,8 @@ 
Processor::Processor(const ArchDef& arch) L2_NUM_BANKS, // number of banks L2_NUM_PORTS, // number of ports NUM_CORES, // request size - true, // write-throught + true, // write-through + false, // write response 0, // victim size L2_MSHR_SIZE, // mshr 2, // pipeline latency @@ -129,7 +131,7 @@ int Processor::run() { if (core->running()) { running = true; } - if (core->check_ebreak()) { + if (core->check_exit()) { exitcode = core->getIRegValue(3); running = false; break; @@ -137,5 +139,7 @@ int Processor::run() { } } while (running); + std::cout << std::flush; + return exitcode; } \ No newline at end of file diff --git a/sim/simX/scoreboard.h b/sim/simX/scoreboard.h index 46bf3bdc..95ba0700 100644 --- a/sim/simX/scoreboard.h +++ b/sim/simX/scoreboard.h @@ -7,6 +7,12 @@ namespace vortex { class Scoreboard { private: + struct reg_use_t { + RegType type; + uint32_t reg; + uint64_t owner; + }; + std::vector in_use_iregs_; std::vector in_use_fregs_; std::vector in_use_vregs_; @@ -25,21 +31,21 @@ public: } } - bool in_use(const pipeline_state_t& state) const { - return (state.used_iregs & in_use_iregs_.at(state.wid)) != 0 - || (state.used_fregs & in_use_fregs_.at(state.wid)) != 0 - || (state.used_vregs & in_use_vregs_.at(state.wid)) != 0; + bool in_use(pipeline_trace_t* state) const { + return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0 + || (state->used_fregs & in_use_fregs_.at(state->wid)) != 0 + || (state->used_vregs & in_use_vregs_.at(state->wid)) != 0; } - std::vector owners(const pipeline_state_t& state) const { - std::vector out; + std::vector get_uses(pipeline_trace_t* state) const { + std::vector out; { uint32_t r = 0; - auto used_iregs = state.used_iregs & in_use_iregs_.at(state.wid); + auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid); while (used_iregs.any()) { if (used_iregs.test(0)) { - uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Integer; - out.push_back(owners_.at(tag)); + uint32_t tag = (r << 16) | (state->wid 
<< 4) | (int)RegType::Integer; + out.push_back({RegType::Integer, r, owners_.at(tag)}); } used_iregs >>= 1; ++r; @@ -47,11 +53,11 @@ public: } { uint32_t r = 0; - auto used_fregs = state.used_fregs & in_use_fregs_.at(state.wid); + auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid); while (used_fregs.any()) { if (used_fregs.test(0)) { - uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Float; - out.push_back(owners_.at(tag)); + uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float; + out.push_back({RegType::Float, r, owners_.at(tag)}); } used_fregs >>= 1; ++r; @@ -59,11 +65,11 @@ public: } { uint32_t r = 0; - auto used_vregs = state.used_vregs & in_use_vregs_.at(state.wid); + auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid); while (used_vregs.any()) { if (used_vregs.test(0)) { - uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Vector; - out.push_back(owners_.at(tag)); + uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector; + out.push_back({RegType::Vector, r, owners_.at(tag)}); } used_vregs >>= 1; ++r; @@ -72,44 +78,44 @@ public: return std::move(out); } - void reserve(const pipeline_state_t& state) { - if (!state.wb) + void reserve(pipeline_trace_t* state) { + if (!state->wb) return; - switch (state.rdest_type) { + switch (state->rdest_type) { case RegType::Integer: - in_use_iregs_.at(state.wid).set(state.rdest); + in_use_iregs_.at(state->wid).set(state->rdest); break; case RegType::Float: - in_use_fregs_.at(state.wid).set(state.rdest); + in_use_fregs_.at(state->wid).set(state->rdest); break; case RegType::Vector: - in_use_vregs_.at(state.wid).set(state.rdest); + in_use_vregs_.at(state->wid).set(state->rdest); break; default: break; } - uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type; + uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; assert(owners_.count(tag) == 0); - owners_[tag] = state.id; + owners_[tag] = state->id; 
} - void release(const pipeline_state_t& state) { - if (!state.wb) + void release(pipeline_trace_t* state) { + if (!state->wb) return; - switch (state.rdest_type) { + switch (state->rdest_type) { case RegType::Integer: - in_use_iregs_.at(state.wid).reset(state.rdest); + in_use_iregs_.at(state->wid).reset(state->rdest); break; case RegType::Float: - in_use_fregs_.at(state.wid).reset(state.rdest); + in_use_fregs_.at(state->wid).reset(state->rdest); break; case RegType::Vector: - in_use_vregs_.at(state.wid).reset(state.rdest); + in_use_vregs_.at(state->wid).reset(state->rdest); break; default: break; } - uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type; + uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; owners_.erase(tag); } }; diff --git a/sim/simX/tex_unit.cpp b/sim/simX/tex_unit.cpp new file mode 100644 index 00000000..d73bd728 --- /dev/null +++ b/sim/simX/tex_unit.cpp @@ -0,0 +1,91 @@ +#include "tex_unit.h" +#include "core.h" +#include +#include + +using namespace vortex; + +enum class FilterMode { + Point, + Bilinear, + Trilinear, +}; + +TexUnit::TexUnit(Core* core) : core_(core) {} + +TexUnit::~TexUnit() {} + +uint32_t TexUnit::get_state(uint32_t state) { + return states_.at(state); +} + +void TexUnit::set_state(uint32_t state, uint32_t value) { + states_.at(state) = value; +} + +uint32_t TexUnit::read(int32_t u, + int32_t v, + int32_t lod, + std::vector* mem_addrs) { + //-- + auto xu = Fixed::make(u); + auto xv = Fixed::make(v); + uint32_t base_addr = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod)); + uint32_t log_width = std::max(states_.at(TEX_STATE_WIDTH) - lod, 0); + uint32_t log_height = std::max(states_.at(TEX_STATE_HEIGHT) - lod, 0); + auto format = (TexFormat)states_.at(TEX_STATE_FORMAT); + auto filter = (FilterMode)states_.at(TEX_STATE_FILTER); + auto wrapu = (WrapMode)states_.at(TEX_STATE_WRAPU); + auto wrapv = (WrapMode)states_.at(TEX_STATE_WRAPV); + + auto stride = 
Stride(format); + + switch (filter) { + case FilterMode::Bilinear: { + // addressing + uint32_t offset00, offset01, offset10, offset11; + uint32_t alpha, beta; + TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, + &offset00, &offset01, &offset10, &offset11, &alpha, &beta); + + uint32_t addr00 = base_addr + offset00 * stride; + uint32_t addr01 = base_addr + offset01 * stride; + uint32_t addr10 = base_addr + offset10 * stride; + uint32_t addr11 = base_addr + offset11 * stride; + + // memory lookup + uint32_t texel00 = core_->dcache_read(addr00, stride); + uint32_t texel01 = core_->dcache_read(addr01, stride); + uint32_t texel10 = core_->dcache_read(addr10, stride); + uint32_t texel11 = core_->dcache_read(addr11, stride); + + mem_addrs->push_back(addr00); + mem_addrs->push_back(addr01); + mem_addrs->push_back(addr10); + mem_addrs->push_back(addr11); + + // filtering + auto color = TexFilterLinear( + format, texel00, texel01, texel10, texel11, alpha, beta); + return color; + } + case FilterMode::Point: { + // addressing + uint32_t offset; + TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset); + + uint32_t addr = base_addr + offset * stride; + + // memory lookup + uint32_t texel = core_->dcache_read(addr, stride); + mem_addrs->push_back(addr); + + // filtering + auto color = TexFilterPoint(format, texel); + return color; + } + default: + std::abort(); + return 0; + } +} \ No newline at end of file diff --git a/sim/simX/tex_unit.h b/sim/simX/tex_unit.h new file mode 100644 index 00000000..759dda2a --- /dev/null +++ b/sim/simX/tex_unit.h @@ -0,0 +1,26 @@ +#pragma once + +#include "types.h" + +namespace vortex { + +class Core; + +class TexUnit { +public: + TexUnit(Core* core); + ~TexUnit(); + + uint32_t get_state(uint32_t state); + + void set_state(uint32_t state, uint32_t value); + + uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector* mem_addrs); + +private: + + std::array states_; + Core* core_; +}; + +} \ No newline at end of file 
diff --git a/sim/simX/types.h b/sim/simX/types.h index f53c3754..d4feb1cb 100644 --- a/sim/simX/types.h +++ b/sim/simX/types.h @@ -66,6 +66,7 @@ enum class AluType { BRANCH, IMUL, IDIV, + CMOV, }; inline std::ostream &operator<<(std::ostream &os, const AluType& type) { @@ -74,6 +75,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) { case AluType::BRANCH: os << "BRANCH"; break; case AluType::IMUL: os << "IMUL"; break; case AluType::IDIV: os << "IDIV"; break; + case AluType::CMOV: os << "CMOV"; break; } return os; } @@ -155,8 +157,6 @@ class Queue { protected: std::queue queue_; - uint32_t count; - public: Queue() {} @@ -168,21 +168,16 @@ public: return queue_.front(); } - void push(const T& value) { - ++count; - queue_.push(value); + T& top() { + return queue_.front(); } void pop() { queue_.pop(); } - bool try_pop(T* value) { - if (queue_.empty()) - return false; - *value = queue_.front(); - queue_.pop(); - return true; + void push(const T& value) { + queue_.push(value); } }; @@ -244,14 +239,6 @@ public: entry.first = false; --capacity_; } - - void remove(uint32_t index, T* value) { - auto& entry = entries_.at(index); - assert(entry.first); - *value = entry.second; - entry.first = false; - --capacity_; - } }; /////////////////////////////////////////////////////////////////////////////// @@ -259,18 +246,7 @@ public: template class Switch : public SimObject> { private: - struct req_batch_t { - std::vector data; - std::bitset valid; - req_batch_t() {} - req_batch_t(uint32_t size) - : data(size) - , valid(0) - {} - }; - ArbiterType type_; - std::queue reqq_; uint32_t delay_; uint32_t cursor_; uint32_t tag_shift_; @@ -295,55 +271,43 @@ public: { assert(delay_ != 0); assert(num_inputs <= MaxInputs); + if (num_inputs == 1) { + // bypass + ReqIn.at(0).bind(&ReqOut); + RspIn.bind(&RspOut.at(0)); + } } - void step(uint64_t /*cycle*/) { - // process incomming requests - { - req_batch_t req_batch(ReqIn.size()); - for (uint32_t i = 0, n = ReqIn.size(); 
i < n; ++i) { - Req req; - if (ReqIn.at(i).read(&req)) { - req_batch.data.at(i) = req; - req_batch.valid.set(i); + void step(uint64_t /*cycle*/) { + if (ReqIn.size() == 1) + return; + + // process incomming requests + for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) { + uint32_t j = (cursor_ + i) % n; + auto& req_in = ReqIn.at(j); + if (!req_in.empty()) { + auto& req = req_in.top(); + if (tag_shift_) { + req.tag = (req.tag << tag_shift_) | j; } + ReqOut.send(req, delay_); + req_in.pop(); + this->update_cursor(j); + break; } - if (req_batch.valid.any()) { - reqq_.push(req_batch); - } - } - - // apply arbitration - if (!reqq_.empty()) { - auto& req_batch = reqq_.front(); - for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) { - auto j = (cursor_ + i) % n; - if (req_batch.valid.test(j)) { - auto& req = req_batch.data.at(j); - if (tag_shift_) { - req.tag = (req.tag << tag_shift_) | j; - } - ReqOut.send(req, delay_); - req_batch.valid.reset(j); - this->update_cursor(j); - if (!req_batch.valid.any()) - reqq_.pop(); // pop when empty - break; - } - } } // process incoming reponses - { - Rsp rsp; - if (RspIn.read(&rsp)) { - uint32_t port_id = 0; - if (tag_shift_) { - port_id = rsp.tag & ((1 << tag_shift_)-1); - rsp.tag >>= tag_shift_; - } - RspOut.at(port_id).send(rsp, 1); - } + if (!RspIn.empty()) { + auto& rsp = RspIn.top(); + uint32_t port_id = 0; + if (tag_shift_) { + port_id = rsp.tag & ((1 << tag_shift_)-1); + rsp.tag >>= tag_shift_; + } + RspOut.at(port_id).send(rsp, 1); + RspIn.pop(); } } diff --git a/sim/simX/warp.cpp b/sim/simX/warp.cpp index 89b9cc39..0392c1b9 100644 --- a/sim/simX/warp.cpp +++ b/sim/simX/warp.cpp @@ -21,7 +21,7 @@ Warp::Warp(Core *core, Word id) vRegFile_.resize(core_->arch().num_regs(), std::vector(core_->arch().vsize(), 0)); } -void Warp::eval(pipeline_state_t *pipeline_state) { +void Warp::eval(pipeline_trace_t *trace) { assert(tmask_.any()); DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask="); @@ -38,18 +38,18 
@@ void Warp::eval(pipeline_state_t *pipeline_state) { std::abort(); } - DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr); + DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr << " (#" << trace->id << ")"); - // Update state - pipeline_state->cid = core_->id(); - pipeline_state->wid = id_; - pipeline_state->PC = PC_; - pipeline_state->tmask = tmask_; - pipeline_state->rdest = instr->getRDest(); - pipeline_state->rdest_type = instr->getRDType(); + // Update trace + trace->cid = core_->id(); + trace->wid = id_; + trace->PC = PC_; + trace->tmask = tmask_; + trace->rdest = instr->getRDest(); + trace->rdest_type = instr->getRDType(); // Execute - this->execute(*instr, pipeline_state); + this->execute(*instr, trace); DP(4, "Register state:"); for (int i = 0; i < core_->arch().num_regs(); ++i) { diff --git a/sim/simX/warp.h b/sim/simX/warp.h index 99b372ca..5af5eb02 100644 --- a/sim/simX/warp.h +++ b/sim/simX/warp.h @@ -9,7 +9,7 @@ namespace vortex { class Core; class Instr; -class pipeline_state_t; +class pipeline_trace_t; struct DomStackEntry { DomStackEntry(const ThreadMask &tmask, Word PC) : tmask(tmask) @@ -83,11 +83,11 @@ public: return iRegFile_.at(0).at(reg); } - void eval(pipeline_state_t *); + void eval(pipeline_trace_t *); private: - void execute(const Instr &instr, pipeline_state_t *pipeline_state); + void execute(const Instr &instr, pipeline_trace_t *trace); Word id_; Core *core_; diff --git a/sim/vlsim/Makefile b/sim/vlsim/Makefile index ce01395d..57e114a2 100644 --- a/sim/vlsim/Makefile +++ b/sim/vlsim/Makefile @@ -24,7 +24,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_FLAGS += $(DBG_TRACE_FLAGS) -DBG_FLAGS += -DDBG_CACHE_REQ_INFO SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp @@ -51,10 +50,13 @@ CXXFLAGS += $(CONFIGS) #THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') 
#VL_FLAGS += --threads $(THREADS) +# Enable VCD trace +#VCD_TRACE = -DVCD_OUTPUT + # Debugigng ifdef DEBUG - VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS) - CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS) + VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS) + CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG CXXFLAGS += -O2 -DNDEBUG diff --git a/tests/regression/tex/Makefile b/tests/regression/tex/Makefile index 8b313d25..1a771373 100644 --- a/tests/regression/tex/Makefile +++ b/tests/regression/tex/Makefile @@ -9,8 +9,8 @@ VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy -VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections -VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw +VX_CFLAGS += -std=c++11 -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections +VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a @@ -21,7 +21,7 @@ CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors CXXFLAGS += -DLUPNG_USE_ZLIB -CXXFLAGS += -I$(VORTEX_DRV_PATH)/include +CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex -lz @@ -38,7 +38,7 @@ kernel.bin: kernel.elf $(VX_CP) -O binary kernel.elf kernel.bin kernel.elf: $(VX_SRCS) - $(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf + $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ diff --git a/tests/regression/tex/common.h b/tests/regression/tex/common.h 
index 2abb7234..1a7f53d0 100644 --- a/tests/regression/tex/common.h +++ b/tests/regression/tex/common.h @@ -1,25 +1,27 @@ #ifndef _COMMON_H_ #define _COMMON_H_ +#include + #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 typedef struct { - uint32_t num_tasks; - uint8_t format; - uint8_t filter; - uint8_t wrap; - uint8_t use_sw; - uint32_t lod; - uint8_t src_logWidth; - uint8_t src_logHeight; - uint8_t src_stride; - uint8_t src_pitch; - uint32_t src_ptr; - uint32_t dst_width; - uint32_t dst_height; - uint8_t dst_stride; - uint32_t dst_pitch; - uint32_t dst_ptr; + bool use_sw; + uint32_t num_tasks; + uint8_t format; + uint8_t filter; + uint8_t wrapu; + uint8_t wrapv; + uint8_t src_logwidth; + uint8_t src_logheight; + uint32_t src_addr; + float lod; + uint32_t mip_offs[TEX_LOD_MAX+1]; + uint32_t dst_width; + uint32_t dst_height; + uint8_t dst_stride; + uint32_t dst_pitch; + uint32_t dst_addr; } kernel_arg_t; #endif \ No newline at end of file diff --git a/tests/regression/tex/kernel.c b/tests/regression/tex/kernel.c index bd0cebb4..9aaaad24 100644 --- a/tests/regression/tex/kernel.c +++ b/tests/regression/tex/kernel.c @@ -1,11 +1,9 @@ #include #include #include -#include "common.h" +#include #include "texsw.h" -#define ENABLE_SW - typedef struct { kernel_arg_t* state; uint32_t tile_width; @@ -14,29 +12,50 @@ typedef struct { float deltaY; } tile_arg_t; +template +struct static_for_t { + template + inline void operator()(const Fn& callback) const { + callback(Start); + static_for_t()(callback); + } +}; + +template +struct static_for_t { + template + inline void operator()(const Fn& callback) const {} +}; + void kernel_body(int task_id, tile_arg_t* arg) { kernel_arg_t* state = arg->state; uint32_t xoffset = 0; - uint32_t yoffset = task_id * arg->tile_height; - uint8_t* dst_ptr = (uint8_t*)(state->dst_ptr + xoffset * state->dst_stride + yoffset * state->dst_pitch); + uint32_t yoffset = task_id * arg->tile_height; - float fv = yoffset * arg->deltaY; + uint8_t* dst_ptr = 
(uint8_t*)(state->dst_addr + xoffset * state->dst_stride + yoffset * state->dst_pitch); + + Fixed<16> xlod(state->lod); + + /*vx_printf("task_id=%d, deltaX=%f, deltaY=%f, tile_width=%d, tile_height=%d\n", + task_id, arg->deltaX, arg->deltaY, arg->tile_width, arg->tile_height);*/ + + float fv = (yoffset + 0.5f) * arg->deltaY; for (uint32_t y = 0; y < arg->tile_height; ++y) { uint32_t* dst_row = (uint32_t*)dst_ptr; - float fu = xoffset * arg->deltaX; + float fu = (xoffset + 0.5f) * arg->deltaX; for (uint32_t x = 0; x < arg->tile_width; ++x) { - int32_t u = (int32_t)(fu * (1<<20)); - int32_t v = (int32_t)(fv * (1<<20)); + Fixed xu(fu); + Fixed xv(fv); + uint32_t color; #ifdef ENABLE_SW - if (state->use_sw) { - dst_row[x] = (state->filter == 2) ? tex3_sw(state, 0, u, v, state->lod) : tex_sw(state, 0, u, v, state->lod); - } else { - #endif - dst_row[x] = (state->filter == 2) ? vx_tex3(0, u, v, state->lod) : vx_tex(0, u, v, state->lod); - #ifdef ENABLE_SW - } + if (state->use_sw) + color = tex_load_sw(state, xu, xv, xlod); + else #endif + color = tex_load_hw(state, xu, xv, xlod); + //vx_printf("task_id=%d, x=%d, y=%d, fu=%f, fv=%f, xu=0x%x, xv=0x%x, color=0x%x\n", task_id, x, y, fu, fv, xu.data(), xv.data(), color); + dst_row[x] = color; fu += arg->deltaX; } dst_ptr += state->dst_pitch; @@ -48,13 +67,16 @@ int main() { kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; // configure texture unit - vx_csr_write(CSR_TEX_ADDR(0), arg->src_ptr); - vx_csr_write(CSR_TEX_MIPOFF(0), 0); - vx_csr_write(CSR_TEX_WIDTH(0), arg->src_logWidth); - vx_csr_write(CSR_TEX_HEIGHT(0), arg->src_logHeight); - vx_csr_write(CSR_TEX_FORMAT(0), arg->format); - vx_csr_write(CSR_TEX_WRAP(0), (arg->wrap << 2) | arg->wrap); - vx_csr_write(CSR_TEX_FILTER(0), (arg->filter ? 
1 : 0)); + csr_write(CSR_TEX(0, TEX_STATE_WIDTH), arg->src_logwidth); + csr_write(CSR_TEX(0, TEX_STATE_HEIGHT), arg->src_logheight); + csr_write(CSR_TEX(0, TEX_STATE_FORMAT), arg->format); + csr_write(CSR_TEX(0, TEX_STATE_WRAPU), arg->wrapu); + csr_write(CSR_TEX(0, TEX_STATE_WRAPV), arg->wrapv); + csr_write(CSR_TEX(0, TEX_STATE_FILTER), (arg->filter ? 1 : 0)); + csr_write(CSR_TEX(0, TEX_STATE_ADDR), arg->src_addr); + static_for_t()([&](int i) { + csr_write(CSR_TEX(0, TEX_STATE_MIPOFF(i)), arg->mip_offs[i]); + }); tile_arg_t targ; targ.state = arg; @@ -64,4 +86,9 @@ int main() { targ.deltaY = 1.0f / arg->dst_height; vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, &targ); + /*for (uint32_t t=0; t < arg->num_tasks; ++t) { + kernel_body(t, &targ); + }*/ + + return 0; } \ No newline at end of file diff --git a/tests/regression/tex/main.cpp b/tests/regression/tex/main.cpp index a83651ee..ffdfb593 100644 --- a/tests/regression/tex/main.cpp +++ b/tests/regression/tex/main.cpp @@ -25,10 +25,11 @@ const char* kernel_file = "kernel.bin"; const char* input_file = "palette64.png"; const char* output_file = "output.png"; int wrap = 0; -int filter = 0; +int filter = 0; // 0-> point, 1->bilinear, 2->trilinear float scale = 1.0f; int format = 0; bool use_sw = false; +float lod = 1.0f; // >= 1.0f ePixelFormat eformat = FORMAT_A8R8G8B8; vx_device_h device = nullptr; @@ -36,7 +37,7 @@ vx_buffer_h buffer = nullptr; static void show_usage() { std::cout << "Vortex Texture Test." 
<< std::endl; - std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-z no_hw] [-h: help]" << std::endl; + std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-l lod] [-z no_hw] [-h: help]" << std::endl; } static void parse_args(int argc, char **argv) { @@ -55,6 +56,9 @@ static void parse_args(int argc, char **argv) { case 'w': wrap = std::atoi(optarg); break; + case 'l': + lod = std::stof(optarg, NULL); + break; case 'z': use_sw = true; break; @@ -118,7 +122,7 @@ int run_test(const kernel_arg_t& kernel_arg, // download destination buffer std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); + RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_addr, buf_size, 0)); std::vector dst_pixels(buf_size); auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); @@ -137,25 +141,39 @@ int run_test(const kernel_arg_t& kernel_arg, int main(int argc, char *argv[]) { kernel_arg_t kernel_arg; std::vector src_pixels; + std::vector mip_offsets; uint32_t src_width; uint32_t src_height; // parse command arguments parse_args(argc, argv); - RT_CHECK(LoadImage(input_file, eformat, src_pixels, &src_width, &src_height)); + { + std::vector staging; + RT_CHECK(LoadImage(input_file, eformat, staging, &src_width, &src_height)); + + RT_CHECK(GenerateMipmaps(src_pixels, mip_offsets, staging, eformat, src_width, src_height)); + + //uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel; + //dump_image(src_pixels, src_pixels.size() / src_bpp, 1, src_bpp); + } // check power of two support - if (!ISPOW2(src_width) || !ISPOW2(src_height)) { + if (!ispow2(src_width) || !ispow2(src_height)) { std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl; return -1; } - uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel; - - //dump_image(src_pixels, src_width, src_height, 
src_bpp); + uint32_t src_logwidth = log2ceil(src_width); + uint32_t src_logheight = log2ceil(src_height); - uint32_t src_bufsize = src_bpp * src_width * src_height; + uint32_t src_max_lod = std::max(src_logwidth, src_logheight); + if (lod > src_max_lod) { + std::cout << "Error: out-of-bound level-of-detail: lod=" << lod << ", source image=" << src_max_lod << std::endl; + return -1; + } + + uint32_t src_bufsize = src_pixels.size(); uint32_t dst_width = (uint32_t)(src_width * scale); uint32_t dst_height = (uint32_t)(src_height * scale); @@ -183,7 +201,7 @@ int main(int argc, char *argv[]) { // allocate device memory std::cout << "allocate device memory" << std::endl; - size_t src_addr, dst_addr; + uint64_t src_addr, dst_addr; RT_CHECK(vx_alloc_dev_mem(device, src_bufsize, &src_addr)); RT_CHECK(vx_alloc_dev_mem(device, dst_bufsize, &dst_addr)); @@ -192,32 +210,37 @@ int main(int argc, char *argv[]) { // allocate staging shared memory std::cout << "allocate shared memory" << std::endl; - uint32_t alloc_size = std::max(sizeof(kernel_arg_t), std::max(src_bufsize, dst_bufsize)); + uint32_t alloc_size = std::max(sizeof(kernel_arg_t), + std::max(src_bufsize, dst_bufsize)); RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer)); // upload kernel argument std::cout << "upload kernel argument" << std::endl; { + kernel_arg.use_sw = use_sw; kernel_arg.num_tasks = std::min(num_tasks, dst_height); kernel_arg.format = format; kernel_arg.filter = filter; - kernel_arg.wrap = wrap; - kernel_arg.use_sw = use_sw; - kernel_arg.lod = 0x0; + kernel_arg.wrapu = wrap; + kernel_arg.wrapv = wrap; - kernel_arg.src_logWidth = (uint32_t)std::log2(src_width); - kernel_arg.src_logHeight = (uint32_t)std::log2(src_height); - kernel_arg.src_stride = src_bpp; - kernel_arg.src_pitch = src_bpp * src_width; - kernel_arg.src_ptr = src_addr; + kernel_arg.src_logwidth = src_logwidth; + kernel_arg.src_logheight = src_logheight; + kernel_arg.src_addr = src_addr; + kernel_arg.lod = lod; + + for (uint32_t i 
= 0; i < mip_offsets.size(); ++i) { + assert(i < TEX_LOD_MAX); + kernel_arg.mip_offs[i] = mip_offsets.at(i); + } kernel_arg.dst_width = dst_width; kernel_arg.dst_height = dst_height; kernel_arg.dst_stride = dst_bpp; kernel_arg.dst_pitch = dst_bpp * dst_width; - kernel_arg.dst_ptr = dst_addr; + kernel_arg.dst_addr = dst_addr; - auto buf_ptr = (int*)vx_host_ptr(buffer); + auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); } @@ -225,21 +248,21 @@ int main(int argc, char *argv[]) { // upload source buffer std::cout << "upload source buffer" << std::endl; { - auto buf_ptr = (int8_t*)vx_host_ptr(buffer); + auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); for (uint32_t i = 0; i < src_bufsize; ++i) { buf_ptr[i] = src_pixels[i]; } - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, src_bufsize, 0)); + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_addr, src_bufsize, 0)); } // clear destination buffer std::cout << "clear destination buffer" << std::endl; { - auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + auto buf_ptr = (uint32_t*)vx_host_ptr(buffer); for (uint32_t i = 0; i < (dst_bufsize/4); ++i) { buf_ptr[i] = 0xdeadbeef; } - RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, dst_bufsize, 0)); + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_addr, dst_bufsize, 0)); } // run tests diff --git a/tests/regression/tex/texsw.h b/tests/regression/tex/texsw.h index 96b9a19e..c9961ab8 100644 --- a/tests/regression/tex/texsw.h +++ b/tests/regression/tex/texsw.h @@ -1,167 +1,122 @@ -#ifndef _TEXSW_H_ +#pragma once +#include +#include #include "common.h" -#define TEX_LOD_MAX 11 - -#define MIN(x, y) ((x < y) ? (x) : (y)) - -#define MAX(x, y) ((x > y) ? 
(x) : (y)) - -inline int address(int wrap, int value) { - switch (wrap) { - case 1: return value & 0xfffff; - default: - case 0: return MIN(MAX(value, 0), 0xfffff); +inline uint32_t texel_read(uint8_t* address, uint32_t stride) { + switch (stride) { + case 1: return *(uint8_t*)address; + case 2: return *(uint16_t*)address; + case 4: return *(uint32_t*)address; + default: + std::abort(); + return 0; } } -inline void unpack(int format, int value, int* l, int* h) { - switch (format) { - case 1: - case 2: - *l = value; - *h = 0; - break; - case 3: - *l = (value | (value << 8)) & 0x00ff00ff; - *h = 0; - break; - case 4: - *l = (value | (value << 16)) & 0x07e0f81f; - *h = 0; - break; - case 5: - *l = (value | (value << 12)) & 0x0f0f0f0f; - *h = 0; - break; - default: - case 0: - *l = value & 0x00ff00ff; - *h = (value >> 8) & 0x00ff00ff; - break; - } -} +inline uint32_t vx_tex_sw(kernel_arg_t* state, + Fixed xu, + Fixed xv, + uint32_t lod) { + uint8_t* base_addr = ((uint8_t*)state->src_addr) + state->mip_offs[lod]; + uint32_t log_width = std::max(state->src_logwidth - lod, 0); + uint32_t log_height = std::max(state->src_logheight - lod, 0); + auto format = (TexFormat)state->format; + auto wrapu = (WrapMode)state->wrapu; + auto wrapv = (WrapMode)state->wrapv; + auto filter = state->filter; + auto stride = Stride(format); -inline void lerp(int al, int ah, int bl, int bh, int frac, int* l, int* h) { - *l = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; - *h = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; -} - -inline int pack(int format, int l, int h) { - switch (format) { - case 1: - case 2: - return l; - case 3: - return (l | (l >> 8)) & 0xffff; - case 4: - return (l | (l >> 16)) & 0xffff; - case 5: - return (l | (l >> 12)) & 0xffff; - default: - case 0: - return (h << 8) | l; - } -} - -inline int tex_sw(kernel_arg_t* state, int stage, int u, int v, int lod) { - int base_addr = state->src_ptr; - int mip_offset = 0; - int log_width = state->src_logWidth; - int log_height = 
state->src_logHeight; - int format = state->format; - int wrap = state->wrap; - int filter = state->filter; - - int32_t* pBits = ((uint32_t*)base_addr) + mip_offset; + uint32_t color; if (filter) { - int u0 = address(wrap, u - (0x80000 >> log_width)); - int v0 = address(wrap, v - (0x80000 >> log_height)); - int u1 = address(wrap, u + (0x80000 >> log_width)); - int v1 = address(wrap, v + (0x80000 >> log_height)); + // addressing + uint32_t offset00, offset01, offset10, offset11; + uint32_t alpha, beta; + TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, + &offset00, &offset01, &offset10, &offset11, &alpha, &beta); - int x0 = u0 >> (20 - log_width); - int y0 = v0 >> (20 - log_height); - int x1 = u1 >> (20 - log_width); - int y1 = v1 >> (20 - log_height); + uint8_t* addr00 = base_addr + offset00 * stride; + uint8_t* addr01 = base_addr + offset01 * stride; + uint8_t* addr10 = base_addr + offset10 * stride; + uint8_t* addr11 = base_addr + offset11 * stride; // memory lookup - - int c0 = pBits[x0 + (y0 << log_width)]; - int c1 = pBits[x1 + (y0 << log_width)]; - int c2 = pBits[x0 + (y1 << log_width)]; - int c3 = pBits[x1 + (y1 << log_width)]; + uint32_t texel00 = texel_read(addr00, stride); + uint32_t texel01 = texel_read(addr01, stride); + uint32_t texel10 = texel_read(addr10, stride); + uint32_t texel11 = texel_read(addr11, stride); // filtering - - int alpha = x0 & 0xff; - int beta = y0 & 0xff; - - int c0a, c0b; - int c1a, c1b; - int c01a, c01b; - - unpack(format, c0, &c0a, &c0b); - unpack(format, c1, &c1a, &c1b); - lerp(c0a, c0b, c1a, c1b, alpha, &c01a, &c01b); - - int c2a, c2b; - int c3a, c3b; - int c23a, c23b; - - unpack(format, c2, &c2a, &c2b); - unpack(format, c3, &c3a, &c3b); - lerp(c2a, c2b, c3a, c3b, alpha, &c23a, &c23b); - - int c4a, c4b; - lerp(c01a, c01b, c23a, c23b, beta, &c4a, &c4b); - return pack(format, c4a, c4b); + color = TexFilterLinear( + format, texel00, texel01, texel10, texel11, alpha, beta); } else { - int u0 = address(wrap, u); - int 
v0 = address(wrap, v); + // addressing + uint32_t offset; + TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset); + + uint8_t* addr = base_addr + offset * stride; + + // memory lookup + uint32_t texel = texel_read(addr, stride); - int x0 = u0 >> (20 - log_width); - int y0 = v0 >> (20 - log_height); - - int c0 = pBits[x0 + (y0 <> 8) & 0x00ff00ff; - int bl = b & 0x00ff00ff; - int bh = (b >> 8) & 0x00ff00ff; - int frac = (lod >> 12) & 0xff; - int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; - int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; - int c = al | (ah << 8); - return c; +inline uint32_t tex_load_hw(kernel_arg_t* state, + Fixed xu, + Fixed xv, + Fixed<16> xlod) { + uint32_t color; + int32_t ilod = std::max(xlod.data(), Fixed<16>::ONE); + uint32_t lod = std::min(log2floor(ilod) - 16, TEX_LOD_MAX); + if (state->filter == 2) { + uint32_t lod_n = std::min(lod + 1, TEX_LOD_MAX); + uint32_t frac = ilod >> (lod + 16 - 8); + uint32_t texel0 = vx_tex(0, xu.data(), xv.data(), lod); + uint32_t texel1 = vx_tex(0, xu.data(), xv.data(), lod_n); + uint32_t cl, ch; + { + uint32_t c0l, c0h; + uint32_t c1l, c1h; + Unpack8888(TexFormat::R8G8B8A8, texel0, &c0l, &c0h); + Unpack8888(TexFormat::R8G8B8A8, texel1, &c1l, &c1h); + Lerp8888(c0l, c0h, c1l, c1h, frac, &cl, &ch); + } + color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + } else { + color = vx_tex(0, xu.data(), xv.data(), lod); + } + return color; } -inline int tex3_sw(kernel_arg_t* state, int stage, int u, int v, int lod) { - int lodn = MIN(lod + 0x10000, TEX_LOD_MAX); - int a = tex_sw(state, 0, u, v, lod); - int b = tex_sw(state, 0, u, v, lodn); - int al = a & 0x00ff00ff; - int ah = (a >> 8) & 0x00ff00ff; - - int bl = b & 0x00ff00ff; - int bh = (b >> 8) & 0x00ff00ff; - int frac = (lod >> 12) & 0xff; - int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; - int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; - int c = al | (ah << 8); - return c; -} - -#endif \ No newline at end of file 
+inline uint32_t tex_load_sw(kernel_arg_t* state, + Fixed xu, + Fixed xv, + Fixed<16> xlod) { + uint32_t color; + int32_t ilod = std::max(xlod.data(), Fixed<16>::ONE); + uint32_t lod = std::min(log2floor(ilod) - 16, TEX_LOD_MAX); + if (state->filter == 2) { + uint32_t lod_n = std::min(lod + 1, TEX_LOD_MAX); + uint32_t frac = ilod >> (lod + 16 - 8); + uint32_t texel0 = vx_tex_sw(state, xu, xv, lod); + uint32_t texel1 = vx_tex_sw(state, xu, xv, lod_n); + uint32_t cl, ch; + { + uint32_t c0l, c0h; + uint32_t c1l, c1h; + Unpack8888(TexFormat::R8G8B8A8, texel0, &c0l, &c0h); + Unpack8888(TexFormat::R8G8B8A8, texel1, &c1l, &c1h); + Lerp8888(c0l, c0h, c1l, c1h, frac, &cl, &ch); + } + color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + } else { + color = vx_tex_sw(state, xu, xv, lod); + } + return color; +} \ No newline at end of file diff --git a/tests/regression/tex/utils.cpp b/tests/regression/tex/utils.cpp index 8a2ff760..81a47158 100644 --- a/tests/regression/tex/utils.cpp +++ b/tests/regression/tex/utils.cpp @@ -191,4 +191,112 @@ int ConvertImage(std::vector& dst_pixels, SurfaceDesc dstDesc{dst_format, dst_pixels.data(), width, height, dst_pitch}; return CopyBuffers(dstDesc, 0, 0, width, height, srcDesc, 0, 0); +} + + + +int GenerateMipmaps(std::vector& dst_pixels, + std::vector& mip_offsets, + const std::vector& src_pixels, + ePixelFormat format, + uint32_t src_width, + uint32_t src_height) { + std::vector src_staging, dst_staging; + const std::vector *pSrcPixels; + std::vector *pDstPixels; + + // convert source image if needed + bool need_conversion = (format != FORMAT_A8R8G8B8); + if (need_conversion) { + ConvertImage(src_staging, src_pixels, src_width, src_height, format, FORMAT_A8R8G8B8); + pSrcPixels = &src_staging; + pDstPixels = &dst_staging; + } else { + pSrcPixels = &src_pixels; + pDstPixels = &dst_pixels; + } + + uint32_t src_logwidth = log2ceil(src_width); + uint32_t src_logheight = log2ceil(src_height); + uint32_t max_lod = std::max(src_logwidth, 
src_logheight) + 1; + + mip_offsets.resize(max_lod); + + // Calculate mipmaps buffer size + uint32_t dst_height = 1; + uint32_t dst_width = 0; + for (uint32_t lod = 0, w = src_width, h = src_height; lod < max_lod; ++lod) { + assert((w > 0) || (w > 0)); + uint32_t pw = std::max(w, 1); + uint32_t ph = std::max(h, 1); + mip_offsets.at(lod) = dst_width; + dst_width += pw * ph; + w >>= 1; + h >>= 1; + } + + // allocate mipmap + pDstPixels->resize(dst_width * 4); + + // generate mipmaps + { + auto pSrc = reinterpret_cast(pSrcPixels->data()); + auto pDst = reinterpret_cast(pDstPixels->data()); + + // copy level 0 + memcpy(pDst, pSrc, pSrcPixels->size()); + assert(pSrcPixels->size() == 4 * src_width * src_height); + pSrc = pDst; + pDst += src_width * src_height; + + // copy lower levels + for (uint32_t lod = 1, w = (src_width/2), h = (src_height/2); lod < max_lod;) { + assert((w > 0) || (w > 0)); + uint32_t pw = std::max(w, 1); + uint32_t ph = std::max(h, 1); + for (uint32_t y = 0; y < pw; ++y) { + auto v0 = 2 * y; + auto v1 = 2 * y + ((ph > 1) ? 1 : 0); + auto pSrc0 = pSrc + v0 * (2 * pw); + auto pSrc1 = pSrc + v1 * (2 * pw); + + for (uint32_t x = 0; x 1) ? 
1 : 0); + + auto c00 = Format::ConvertFrom(pSrc0 + u0); + auto c01 = Format::ConvertFrom(pSrc0 + u1); + auto c10 = Format::ConvertFrom(pSrc1 + u0); + auto c11 = Format::ConvertFrom(pSrc1 + u1); + + const ColorARGB color((c00.a + c01.a + c10.a + c11.a+2) >> 2, + (c00.r + c01.r + c10.r + c11.r+2) >> 2, + (c00.g + c01.g + c10.g + c11.g+2) >> 2, + (c00.b + c01.b + c10.b + c11.b+2) >> 2); + + uint32_t ncolor; + Format::ConvertTo(&ncolor, color); + pDst[x + y * pw] = ncolor; + } + } + ++lod; + pSrc = pDst; + pDst += pw * ph; + w >>= 1; + h >>= 1; + } + assert((pDst - reinterpret_cast(pDstPixels->data())) == dst_width); + } + + // convert destination image if needed + if (need_conversion) { + ConvertImage(dst_staging, dst_staging, dst_width, dst_height, FORMAT_A8R8G8B8, format); + } + + uint32_t bpp = Format::GetInfo(format).BytePerPixel; + for (auto& offset : mip_offsets) { + offset *= bpp; + } + + return 0; } \ No newline at end of file diff --git a/tests/regression/tex/utils.h b/tests/regression/tex/utils.h index 48b1ad55..7ce58941 100644 --- a/tests/regression/tex/utils.h +++ b/tests/regression/tex/utils.h @@ -1,14 +1,9 @@ #include #include #include +#include #include "surfacedesc.h" -#define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1)))) - -inline uint32_t ilog2 (uint32_t value) { - return (uint32_t)(sizeof(uint32_t) * 8UL) - (uint32_t)__builtin_clzl((value << 1) - 1UL) - 1; -} - int LoadImage(const char *filename, ePixelFormat format, std::vector &pixels, @@ -37,7 +32,14 @@ int ConvertImage(std::vector& dst_pixels, ePixelFormat src_format, ePixelFormat dst_format); +int GenerateMipmaps(std::vector& dst_pixels, + std::vector& mip_offsets, + const std::vector& src_pixels, + ePixelFormat format, + uint32_t src_width, + uint32_t src_height); + void dump_image(const std::vector& pixels, uint32_t width, uint32_t height, - uint32_t bpp); + uint32_t bpp); \ No newline at end of file diff --git a/tests/runtime/fibonacci/main.cpp b/tests/runtime/fibonacci/main.cpp index 
f6612c29..c6fc036a 100644 --- a/tests/runtime/fibonacci/main.cpp +++ b/tests/runtime/fibonacci/main.cpp @@ -1,4 +1,5 @@ #include +#include const int Num = 9; const int Ans = 34; @@ -14,12 +15,12 @@ int main() { int fib = fibonacci(Num); - printf("fibonacci(%d) = %d\n", Num, fib); + vx_printf("fibonacci(%d) = %d\n", Num, fib); if (fib == Ans) { - printf("Passed!\n"); + vx_printf("Passed!\n"); } else { - printf("Failed! value=%d, expected=%d\n", fib, Ans); + vx_printf("Failed! value=%d, expected=%d\n", fib, Ans); errors = 1; } diff --git a/tests/runtime/hello/main.cpp b/tests/runtime/hello/main.cpp index 69904cfd..94aff07e 100644 --- a/tests/runtime/hello/main.cpp +++ b/tests/runtime/hello/main.cpp @@ -1,8 +1,9 @@ #include +#include int main() { - printf("Hello World!\n"); + vx_printf("Hello World!\n"); return 0; } \ No newline at end of file From d25fa21a5931c7387bab73151c0631218f62e9b9 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 24 Nov 2021 18:03:09 -0500 Subject: [PATCH 08/27] adding cocogfx submodule --- .gitmodules | 3 +++ third_party/cocogfx | 1 + 2 files changed, 4 insertions(+) create mode 160000 third_party/cocogfx diff --git a/.gitmodules b/.gitmodules index 96aeefdb..fb38564e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "sim/common/softfloat"] path = sim/common/softfloat url = https://github.com/ucb-bar/berkeley-softfloat-3.git +[submodule "third_party/cocogfx"] + path = third_party/cocogfx + url = https://github.com/gtcasl/cocogfx.git diff --git a/third_party/cocogfx b/third_party/cocogfx new file mode 160000 index 00000000..68e3625e --- /dev/null +++ b/third_party/cocogfx @@ -0,0 +1 @@ +Subproject commit 68e3625e70acd1fbd5fcfc629223d370f7b6806e From a671e1a05dd2ee7f07ede45fba4133282bd91cea Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 24 Nov 2021 18:10:00 -0500 Subject: [PATCH 09/27] moving submodules into third_party folder --- .gitmodules | 12 ++++++------ hw/rtl/fp_cores/fpnew | 1 - third_party/fpnew | 1 + 
{sim/common => third_party}/softfloat | 0 4 files changed, 7 insertions(+), 7 deletions(-) delete mode 160000 hw/rtl/fp_cores/fpnew create mode 160000 third_party/fpnew rename {sim/common => third_party}/softfloat (100%) diff --git a/.gitmodules b/.gitmodules index fb38564e..360e5c00 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,9 @@ -[submodule "hw/rtl/fp_cores/fpnew"] - path = hw/rtl/fp_cores/fpnew - url = https://github.com/pulp-platform/fpnew.git -[submodule "sim/common/softfloat"] - path = sim/common/softfloat - url = https://github.com/ucb-bar/berkeley-softfloat-3.git [submodule "third_party/cocogfx"] path = third_party/cocogfx url = https://github.com/gtcasl/cocogfx.git +[submodule "third_party/fpnew"] + path = third_party/fpnew + url = https://github.com/pulp-platform/fpnew.git +[submodule "third_party/softfloat"] + path = third_party/softfloat + url = https://github.com/ucb-bar/berkeley-softfloat-3.git diff --git a/hw/rtl/fp_cores/fpnew b/hw/rtl/fp_cores/fpnew deleted file mode 160000 index 1def7bb6..00000000 --- a/hw/rtl/fp_cores/fpnew +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1def7bb630ceae2ebc58921f6b5ee3e686fb6d5a diff --git a/third_party/fpnew b/third_party/fpnew new file mode 160000 index 00000000..0bfbeede --- /dev/null +++ b/third_party/fpnew @@ -0,0 +1 @@ +Subproject commit 0bfbeede0e01b2e44e41bb14c70a80efeffa1bbd diff --git a/sim/common/softfloat b/third_party/softfloat similarity index 100% rename from sim/common/softfloat rename to third_party/softfloat From b995843a5b77376f50bef46a33a2005cb5fce01a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 25 Nov 2021 13:58:09 -0500 Subject: [PATCH 10/27] cocogfx fixes and refactoring --- Makefile | 3 +- hw/rtl/tex_unit/VX_tex_define.vh | 11 +- hw/rtl/tex_unit/VX_tex_format.sv | 26 +- hw/rtl/tex_unit/VX_tex_stride.sv | 15 +- hw/syn/quartus/core/Makefile | 3 +- hw/syn/quartus/fpu_core/Makefile | 3 +- hw/syn/quartus/pipeline/Makefile | 3 +- hw/syn/quartus/top1/Makefile | 3 +- 
hw/syn/quartus/top16/Makefile | 3 +- hw/syn/quartus/top2/Makefile | 3 +- hw/syn/quartus/top32/Makefile | 3 +- hw/syn/quartus/top4/Makefile | 3 +- hw/syn/quartus/top64/Makefile | 5 +- hw/syn/quartus/top8/Makefile | 3 +- hw/syn/quartus/unittest/Makefile | 3 +- hw/syn/quartus/vortex/Makefile | 3 +- runtime/Makefile | 2 +- sim/Makefile | 2 - sim/common/Makefile | 5 - sim/common/fixed.h | 419 --------- sim/common/rvfloats.cpp | 4 +- sim/common/texturing.h | 138 +-- sim/rtlsim/Makefile | 13 +- sim/simX/Makefile | 8 +- sim/simX/tex_unit.cpp | 1 + sim/vlsim/Makefile | 11 +- sim/vlsim/vortex_afu.h | 49 ++ tests/regression/tex/Makefile | 10 +- tests/regression/tex/blitter.h | 268 ------ tests/regression/tex/color.h | 68 -- tests/regression/tex/common.h | 1 - tests/regression/tex/format.h | 1022 ---------------------- tests/regression/tex/int24.h | 37 - tests/regression/tex/kernel.c | 25 +- tests/regression/tex/lupng.c | 1313 ---------------------------- tests/regression/tex/lupng.h | 186 ---- tests/regression/tex/main.cpp | 45 +- tests/regression/tex/surfacedesc.h | 25 - tests/regression/tex/texsw.h | 125 +-- tests/regression/tex/tga.cpp | 122 --- tests/regression/tex/tga.h | 14 - tests/regression/tex/utils.cpp | 208 +---- tests/regression/tex/utils.h | 31 +- third_party/Makefile | 15 + 44 files changed, 339 insertions(+), 3921 deletions(-) delete mode 100644 sim/common/Makefile delete mode 100644 sim/common/fixed.h create mode 100644 sim/vlsim/vortex_afu.h delete mode 100644 tests/regression/tex/blitter.h delete mode 100644 tests/regression/tex/color.h delete mode 100644 tests/regression/tex/format.h delete mode 100644 tests/regression/tex/int24.h delete mode 100644 tests/regression/tex/lupng.c delete mode 100644 tests/regression/tex/lupng.h delete mode 100644 tests/regression/tex/surfacedesc.h delete mode 100644 tests/regression/tex/tga.cpp delete mode 100644 tests/regression/tex/tga.h create mode 100644 third_party/Makefile diff --git a/Makefile b/Makefile index 
859c597d..8142a1be 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,11 @@ all: + $(MAKE) -C third_party $(MAKE) -C hw $(MAKE) -C sim $(MAKE) -C driver $(MAKE) -C runtime $(MAKE) -C tests - + clean: $(MAKE) -C hw clean $(MAKE) -C sim clean diff --git a/hw/rtl/tex_unit/VX_tex_define.vh b/hw/rtl/tex_unit/VX_tex_define.vh index 34564b39..a3e1a926 100644 --- a/hw/rtl/tex_unit/VX_tex_define.vh +++ b/hw/rtl/tex_unit/VX_tex_define.vh @@ -24,11 +24,12 @@ `define TEX_BLEND_FRAC 8 `define TEX_BLEND_ONE (2 ** `TEX_BLEND_FRAC) -`define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0) +`define TEX_FORMAT_A8R8G8B8 `TEX_FORMAT_BITS'(0) `define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1) -`define TEX_FORMAT_R4G4B4A4 `TEX_FORMAT_BITS'(2) -`define TEX_FORMAT_L8A8 `TEX_FORMAT_BITS'(3) -`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(4) -`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(5) +`define TEX_FORMAT_A1R5G5B5 `TEX_FORMAT_BITS'(2) +`define TEX_FORMAT_A4R4G4B4 `TEX_FORMAT_BITS'(3) +`define TEX_FORMAT_A8L8 `TEX_FORMAT_BITS'(4) +`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(5) +`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(6) `endif \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_format.sv b/hw/rtl/tex_unit/VX_tex_format.sv index 91e0e6f8..e299ed17 100644 --- a/hw/rtl/tex_unit/VX_tex_format.sv +++ b/hw/rtl/tex_unit/VX_tex_format.sv @@ -13,25 +13,31 @@ module VX_tex_format #( always @(*) begin case (format) - `TEX_FORMAT_R8G8B8A8: begin + `TEX_FORMAT_A8R8G8B8: begin texel_out_r[07:00] = texel_in[7:0]; texel_out_r[15:08] = texel_in[15:8]; texel_out_r[23:16] = texel_in[23:16]; texel_out_r[31:24] = texel_in[31:24]; end `TEX_FORMAT_R5G6B5: begin - texel_out_r[07:00] = {texel_in[15:11], texel_in[15:13]}; + texel_out_r[07:00] = {texel_in[4:0], texel_in[4:2]}; texel_out_r[15:08] = {texel_in[10:5], texel_in[10:9]}; - texel_out_r[23:16] = {texel_in[4:0], texel_in[4:2]}; + texel_out_r[23:16] = {texel_in[15:11], texel_in[15:13]}; texel_out_r[31:24] = 8'hff; end - `TEX_FORMAT_R4G4B4A4: begin - texel_out_r[07:00] = 
{texel_in[11:8], texel_in[15:12]}; + `TEX_FORMAT_A1R5G5B5: begin + texel_out_r[07:00] = {texel_in[4:0], texel_in[4:2]}; + texel_out_r[15:08] = {texel_in[9:5], texel_in[9:7]}; + texel_out_r[23:16] = {texel_in[14:10], texel_in[14:12]}; + texel_out_r[31:24] = {8{texel_in[15]}}; + end + `TEX_FORMAT_A4R4G4B4: begin + texel_out_r[07:00] = {2{texel_in[3:0]}}; texel_out_r[15:08] = {2{texel_in[7:4]}}; - texel_out_r[23:16] = {2{texel_in[3:0]}}; + texel_out_r[23:16] = {2{texel_in[11:8]}}; texel_out_r[31:24] = {2{texel_in[15:12]}}; end - `TEX_FORMAT_L8A8: begin + `TEX_FORMAT_A8L8: begin texel_out_r[07:00] = texel_in[7:0]; texel_out_r[15:08] = texel_in[7:0]; texel_out_r[23:16] = texel_in[7:0]; @@ -45,9 +51,9 @@ module VX_tex_format #( end //`TEX_FORMAT_A8 default: begin - texel_out_r[07:00] = 0; - texel_out_r[15:08] = 0; - texel_out_r[23:16] = 0; + texel_out_r[07:00] = 8'hff; + texel_out_r[15:08] = 8'hff; + texel_out_r[23:16] = 8'hff; texel_out_r[31:24] = texel_in[7:0]; end endcase diff --git a/hw/rtl/tex_unit/VX_tex_stride.sv b/hw/rtl/tex_unit/VX_tex_stride.sv index 0e1eca6a..3f1427bb 100644 --- a/hw/rtl/tex_unit/VX_tex_stride.sv +++ b/hw/rtl/tex_unit/VX_tex_stride.sv @@ -12,13 +12,14 @@ module VX_tex_stride #( always @(*) begin case (format) - `TEX_FORMAT_A8: log_stride_r = 0; - `TEX_FORMAT_L8: log_stride_r = 0; - `TEX_FORMAT_L8A8: log_stride_r = 1; - `TEX_FORMAT_R5G6B5: log_stride_r = 1; - `TEX_FORMAT_R4G4B4A4: log_stride_r = 1; - //`TEX_FORMAT_R8G8B8A8 - default: log_stride_r = 2; + `TEX_FORMAT_A8R8G8B8: log_stride_r = 2; + `TEX_FORMAT_R5G6B5, + `TEX_FORMAT_A1R5G5B5, + `TEX_FORMAT_A4R4G4B4, + `TEX_FORMAT_A8L8: log_stride_r = 1; + // `TEX_FORMAT_L8: + // `TEX_FORMAT_A8: + default: log_stride_r = 0; endcase end diff --git a/hw/syn/quartus/core/Makefile b/hw/syn/quartus/core/Makefile index b976110c..d209c80d 100644 --- a/hw/syn/quartus/core/Makefile +++ b/hw/syn/quartus/core/Makefile @@ -2,6 +2,7 @@ PROJECT = Core TOP_LEVEL_ENTITY = VX_core SRC_FILE = VX_core.v RTL_DIR = 
../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/fpu_core/Makefile b/hw/syn/quartus/fpu_core/Makefile index 291d8124..26ca51ac 100644 --- a/hw/syn/quartus/fpu_core/Makefile +++ b/hw/syn/quartus/fpu_core/Makefile @@ -2,6 +2,7 @@ PROJECT = VX_fpu_fpga TOP_LEVEL_ENTITY = VX_fpu_fpga SRC_FILE = VX_fpu_fpga.v RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff 
--git a/hw/syn/quartus/pipeline/Makefile b/hw/syn/quartus/pipeline/Makefile index e4cad107..665f7829 100644 --- a/hw/syn/quartus/pipeline/Makefile +++ b/hw/syn/quartus/pipeline/Makefile @@ -2,6 +2,7 @@ PROJECT = VX_pipeline TOP_LEVEL_ENTITY = VX_pipeline SRC_FILE = VX_pipeline.v RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top1/Makefile b/hw/syn/quartus/top1/Makefile index 374f84e1..9494b2d3 100644 --- a/hw/syn/quartus/top1/Makefile +++ b/hw/syn/quartus/top1/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top16/Makefile b/hw/syn/quartus/top16/Makefile index 78f4df68..836e3558 100644 --- a/hw/syn/quartus/top16/Makefile +++ b/hw/syn/quartus/top16/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top2/Makefile b/hw/syn/quartus/top2/Makefile index f8801373..d4c6abbc 100644 --- a/hw/syn/quartus/top2/Makefile +++ b/hw/syn/quartus/top2/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ 
FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top32/Makefile b/hw/syn/quartus/top32/Makefile index cea702f5..d07a515c 100644 --- a/hw/syn/quartus/top32/Makefile +++ b/hw/syn/quartus/top32/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top4/Makefile 
b/hw/syn/quartus/top4/Makefile index bfe734a7..af33661c 100644 --- a/hw/syn/quartus/top4/Makefile +++ b/hw/syn/quartus/top4/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top64/Makefile b/hw/syn/quartus/top64/Makefile index 604f794f..1d60b214 100644 --- a/hw/syn/quartus/top64/Makefile +++ b/hw/syn/quartus/top64/Makefile @@ -1,7 +1,8 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv -RTL_DIR=../../../../rtl +RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party #FAMILY = "Arria 10" #DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FAMILY = "Stratix 10" DEVICE = 1SX280HN2F43E2VG FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/top8/Makefile b/hw/syn/quartus/top8/Makefile index 0614e0d5..b2efcc6d 100644 --- a/hw/syn/quartus/top8/Makefile +++ b/hw/syn/quartus/top8/Makefile @@ -2,6 +2,7 @@ PROJECT = vortex_afu TOP_LEVEL_ENTITY = vortex_afu SRC_FILE = vortex_afu.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/unittest/Makefile b/hw/syn/quartus/unittest/Makefile index 3b1bc6da..975ec0a1 100644 --- a/hw/syn/quartus/unittest/Makefile +++ b/hw/syn/quartus/unittest/Makefile @@ -2,6 +2,7 @@ PROJECT = Unittest TOP_LEVEL_ENTITY = VX_core_req_bank_sel SRC_FILE = VX_core_req_bank_sel.v RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ 
FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/hw/syn/quartus/vortex/Makefile b/hw/syn/quartus/vortex/Makefile index 6874cce3..b2046cf8 100644 --- a/hw/syn/quartus/vortex/Makefile +++ b/hw/syn/quartus/vortex/Makefile @@ -2,6 +2,7 @@ PROJECT = Vortex TOP_LEVEL_ENTITY = Vortex SRC_FILE = Vortex.sv RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party FAMILY = "Arria 10" DEVICE = 10AX115N3F40E2SG @@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #DEVICE = 1SX280HN2F43E2VG #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 -FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src TEX_INCLUDE = $(RTL_DIR)/tex_unit RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) diff --git a/runtime/Makefile b/runtime/Makefile index c329e531..d72eb665 100644 --- a/runtime/Makefile +++ b/runtime/Makefile @@ 
-26,7 +26,7 @@ $(PROJECT).dump: $(PROJECT).a $(CC) $(CFLAGS) -c $< -o $@ $(PROJECT).a: $(OBJS) - $(AR) rcs $(PROJECT).a $^ + $(AR) rcs $@ $^ .depend: $(SRCS) $(CC) $(CFLAGS) -MM $^ > .depend; diff --git a/sim/Makefile b/sim/Makefile index eca60c0b..e0361709 100644 --- a/sim/Makefile +++ b/sim/Makefile @@ -1,11 +1,9 @@ all: - $(MAKE) -C common $(MAKE) -C simX $(MAKE) -C rtlsim $(MAKE) -C vlsim clean: - $(MAKE) -C common clean $(MAKE) -C simX clean $(MAKE) -C rtlsim clean $(MAKE) -C vlsim clean \ No newline at end of file diff --git a/sim/common/Makefile b/sim/common/Makefile deleted file mode 100644 index b17dc25b..00000000 --- a/sim/common/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -all: - SPECIALIZE_TYPE=RISCV SOFTFLOAT_OPTS="-fPIC -DSOFTFLOAT_ROUND_ODD -DINLINE_LEVEL=5 -DSOFTFLOAT_FAST_DIV32TO16 -DSOFTFLOAT_FAST_DIV64TO32" $(MAKE) -C softfloat/build/Linux-x86_64-GCC - -clean: - $(MAKE) -C softfloat/build/Linux-x86_64-GCC clean \ No newline at end of file diff --git a/sim/common/fixed.h b/sim/common/fixed.h deleted file mode 100644 index 8ef60d9a..00000000 --- a/sim/common/fixed.h +++ /dev/null @@ -1,419 +0,0 @@ -#pragma once - -#include -#include -#include - -template -class Fixed { -private: - - template - struct Cast { - private: - template struct Tag {}; - - inline static T Convert(T2 value, Tag) { - return static_cast(value) << (F - F2); - } - - inline static T Convert(T2 value, Tag) { - return static_cast(value) >> (F2 - F); - } - - inline static T Convert(T2 value, Tag) { - return static_cast(value << (F - F2)); - } - - inline static T Convert(T2 value, Tag) { - return static_cast(value >> (F2 - F)); - } - - public: - inline static T Convert(T2 value) { - return Convert(value, Tag<(sizeof(T2) > sizeof(T)), (F2 > F)>{}); - } - }; - -public: - using data_type = T; - - static constexpr uint32_t FRAC = F; - static constexpr uint32_t INT = sizeof(T) * 8 - FRAC; - static constexpr uint32_t HFRAC = FRAC >> 1; - static constexpr T ONE = static_cast(1) << FRAC; - static 
constexpr T MASK = ONE - 1; - static constexpr T IMASK = ~MASK; - static constexpr T HALF = ONE >> 1; - static constexpr T TWO = ONE << 1; - - Fixed() {} - - explicit Fixed(int64_t rhs) - : data_(static_cast(rhs << FRAC)) { - assert((static_cast(rhs) << FRAC) == data_); - } - - explicit Fixed(uint64_t rhs) - : data_(static_cast(rhs << FRAC)) { - assert((static_cast(rhs) << FRAC) == data_); - } - - explicit Fixed(int32_t rhs) - : data_(static_cast(rhs << FRAC)) { - assert((static_cast(rhs) << FRAC) == data_); - } - - explicit Fixed(uint32_t rhs) - : data_(static_cast(rhs << FRAC)) { - assert((static_cast(rhs) << FRAC) == data_); - } - - explicit Fixed(int16_t rhs) - : data_(static_cast(rhs << FRAC)) { - assert((static_cast(rhs) << FRAC) == data_); - } - - explicit Fixed(uint16_t rhs) - : data_(static_cast(rhs << FRAC)) { - assert((static_cast(rhs) << FRAC) == data_); - } - - explicit Fixed(int8_t rhs) - : data_(static_cast(rhs << FRAC)) { - assert((static_cast(rhs) << FRAC) == data_); - } - - explicit Fixed(uint8_t rhs) - : data_(static_cast(rhs << FRAC)) { - assert((static_cast(rhs) << FRAC) == data_); - } - - template - explicit Fixed(Fixed rhs) - : data_(Cast::Convert(rhs.data())) - {} - - explicit Fixed(float rhs) - : data_(static_cast(rhs * ONE)) { - assert(data_ == static_cast(rhs * ONE)); - } - - bool operator==(Fixed rhs) const { - return (data_ == rhs.data_); - } - - bool operator!=(Fixed rhs) const { - return (data_ != rhs.data_); - } - - bool operator<(Fixed rhs) const { - return (data_ < rhs.data_); - } - - bool operator<=(Fixed rhs) const { - return (data_ <= rhs.data_); - } - - bool operator>(Fixed rhs) const { - return (data_ > rhs.data_); - } - - bool operator>=(Fixed rhs) const { - return (data_ >= rhs.data_); - } - - Fixed operator-() const { - return make(-data_); - } - - Fixed operator+=(Fixed rhs) { - *this = (*this) + rhs; - return *this; - } - - Fixed operator-=(Fixed rhs) { - *this = (*this) - rhs; - return *this; - } - - Fixed 
operator*=(Fixed rhs) { - *this = (*this) * rhs; - return *this; - } - - Fixed operator/=(Fixed rhs) { - *this = (*this) / rhs; - return *this; - } - - template - Fixed operator*=(Fixed rhs) { - *this = (*this) * rhs; - return *this; - } - - template - Fixed operator/=(Fixed rhs) { - *this = (*this) / rhs; - return *this; - } - - Fixed operator*=(int32_t rhs) { - *this = (*this) * rhs; - return *this; - } - - Fixed operator*=(uint32_t rhs) { - *this = (*this) * rhs; - return *this; - } - - Fixed operator*=(float rhs) { - *this = (*this) * rhs; - return *this; - } - - Fixed operator/=(int32_t rhs) { - *this = (*this) / rhs; - return *this; - } - - Fixed operator/=(uint32_t rhs) { - *this = (*this) / rhs; - return *this; - } - - Fixed operator/=(float rhs) { - *this = (*this) / rhs; - return *this; - } - - friend Fixed operator+(Fixed lhs, Fixed rhs) { - assert((static_cast(lhs.data_) + rhs.data_) == - (lhs.data_ + rhs.data_)); - return Fixed::make(lhs.data_ + rhs.data_); - } - - friend Fixed operator-(Fixed lhs, Fixed rhs) { - assert((static_cast(lhs.data_) - rhs.data_) == - (lhs.data_ - rhs.data_)); - return Fixed::make(lhs.data_ - rhs.data_); - } - - friend Fixed operator*(Fixed lhs, Fixed rhs) { - return Fixed::make((static_cast(lhs.data_) * rhs.data_) >> FRAC); - } - - template - friend Fixed operator*(Fixed lhs, Fixed rhs) { - return Fixed::make((static_cast(lhs.data_) * rhs.data()) >> F2); - } - - friend Fixed operator/(Fixed lhs, Fixed rhs) { - assert(rhs.data_ != 0); - return Fixed::make((static_cast(lhs.data_) << FRAC) / rhs.data_); - } - - template - friend Fixed operator/(Fixed lhs, Fixed rhs) { - assert(rhs.data() != 0); - return Fixed::make((static_cast(lhs.data_) << F2) / rhs.data()); - } - - friend Fixed operator*(Fixed lhs, float rhs) { - return static_cast(lhs) * rhs; - } - - friend Fixed operator*(float lhs, Fixed rhs) { - return lhs * static_cast(rhs); - } - - friend Fixed operator/(Fixed lhs, float rhs) { - return static_cast(lhs) / rhs; - } - - 
friend Fixed operator/(float lhs, Fixed rhs) { - return lhs / static_cast(rhs); - } - - friend Fixed operator*(Fixed lhs, char rhs) { - return lhs * static_cast(rhs); - } - - friend Fixed operator*(char lhs, Fixed rhs) { - return rhs * lhs; - } - - friend Fixed operator/(Fixed lhs, char rhs) { - return lhs / static_cast(rhs); - } - - friend Fixed operator/(char lhs, Fixed rhs) { - return rhs / lhs; - } - - friend Fixed operator*(Fixed lhs, uint8_t rhs) { - return lhs * static_cast(rhs); - } - - friend Fixed operator*(uint8_t lhs, Fixed rhs) { - return rhs * lhs; - } - - friend Fixed operator/(Fixed lhs, uint8_t rhs) { - return lhs / static_cast(rhs); - } - - friend Fixed operator/(uint8_t lhs, Fixed rhs) { - return rhs / lhs; - } - - friend Fixed operator*(Fixed lhs, short rhs) { - return lhs * static_cast(rhs); - } - - friend Fixed operator*(short lhs, Fixed rhs) { - return rhs * lhs; - } - - friend Fixed operator/(Fixed lhs, short rhs) { - return lhs / static_cast(rhs); - } - - friend Fixed operator/(short lhs, Fixed rhs) { - return rhs / lhs; - } - - friend Fixed operator*(Fixed lhs, uint16_t rhs) { - return lhs * static_cast(rhs); - } - - friend Fixed operator*(uint16_t lhs, Fixed rhs) { - return rhs * lhs; - } - - friend Fixed operator/(Fixed lhs, uint16_t rhs) { - return lhs / static_cast(rhs); - } - - friend Fixed operator/(uint16_t lhs, Fixed rhs) { - return rhs / lhs; - } - - friend Fixed operator*(Fixed lhs, int32_t rhs) { - auto value = static_cast(lhs.data_ * rhs); - assert((lhs.data_ * static_cast(rhs)) == value); - return Fixed::make(value); - } - - friend Fixed operator*(int32_t lhs, Fixed rhs) { - return rhs * lhs; - } - - friend Fixed operator/(Fixed lhs, int32_t rhs) { - assert(rhs); - auto value = static_cast(lhs.data_ / rhs); - return Fixed::make(value); - } - - friend Fixed operator/(int32_t lhs, Fixed rhs) { - return rhs / lhs; - } - - friend Fixed operator*(Fixed lhs, uint32_t rhs) { - auto value = static_cast(lhs.data_ << rhs); - 
assert((lhs.data_ << static_cast(rhs)) == value); - return Fixed::make(value); - } - - friend Fixed operator*(uint32_t lhs, Fixed rhs) { - return rhs * lhs; - } - - friend Fixed operator/(Fixed lhs, uint32_t rhs) { - assert(rhs); - auto value = static_cast(lhs.data_ / rhs); - return Fixed::make(value); - } - - friend Fixed operator/(uint32_t lhs, Fixed rhs) { - return rhs / lhs; - } - - friend Fixed operator<<(Fixed lhs, int32_t rhs) { - auto value = static_cast(lhs.data_ << rhs); - assert((lhs.data_ << static_cast(rhs)) == value); - return Fixed::make(value); - } - - friend Fixed operator>>(Fixed lhs, int32_t rhs) { - auto value = static_cast(lhs.data_ >> rhs); - return Fixed::make(value); - } - - friend Fixed operator<<(Fixed lhs, uint32_t rhs) { - auto value = static_cast(lhs.data_ << rhs); - assert((lhs.data_ << static_cast(rhs)) == value); - return Fixed::make(value); - } - - friend Fixed operator>>(Fixed lhs, uint32_t rhs) { - auto value = static_cast(lhs.data_ >> rhs); - return Fixed::make(value); - } - - static Fixed make(T value) { - Fixed ret; - ret.data_ = value; - return ret; - } - - explicit operator int64_t() const { - return static_cast(data_ >> F); - } - - explicit operator uint64_t() const { - return static_cast(data_ >> F); - } - - explicit operator int32_t() const { - return static_cast(data_ >> F); - } - - explicit operator uint32_t() const { - return static_cast(data_ >> F); - } - - explicit operator int16_t() const { - return static_cast(data_ >> F); - } - - explicit operator uint16_t() const { - return static_cast(data_ >> F); - } - - explicit operator int8_t() const { - return static_cast(data_ >> F); - } - - explicit operator uint8_t() const { - return static_cast(data_ >> F); - } - - template - explicit operator Fixed() const { - return Fixed(*this); - } - - explicit operator float() const { - return static_cast(data_) / (static_cast(1) << F); - } - - T data() const { - return data_; - } - -private: - T data_; -}; \ No newline at end of 
file diff --git a/sim/common/rvfloats.cpp b/sim/common/rvfloats.cpp index c23cb8da..17fab394 100644 --- a/sim/common/rvfloats.cpp +++ b/sim/common/rvfloats.cpp @@ -3,8 +3,8 @@ extern "C" { #include -#include -#include +#include +#include <../RISCV/specialize.h> } #define F32_SIGN 0x80000000 diff --git a/sim/common/texturing.h b/sim/common/texturing.h index 8d76519e..9b0e4526 100644 --- a/sim/common/texturing.h +++ b/sim/common/texturing.h @@ -1,10 +1,11 @@ #pragma once #include -#include #include #include +using namespace cocogfx; + enum class WrapMode { Clamp, Repeat, @@ -12,10 +13,11 @@ enum class WrapMode { }; enum class TexFormat { - R8G8B8A8, - R5G6B5, - R4G4B4A4, - L8A8, + A8R8G8B8, + R5G6B5, + A1R5G5B5, + A4R4G4B4, + A8L8, L8, A8, }; @@ -34,11 +36,12 @@ T Clamp(Fixed fx, WrapMode mode) { inline uint32_t Stride(TexFormat format) { switch (format) { - case TexFormat::R8G8B8A8: + case TexFormat::A8R8G8B8: return 4; case TexFormat::R5G6B5: - case TexFormat::R4G4B4A4: - case TexFormat::L8A8: + case TexFormat::A1R5G5B5: + case TexFormat::A4R4G4B4: + case TexFormat::A8L8: return 2; case TexFormat::L8: case TexFormat::A8: @@ -53,61 +56,68 @@ inline void Unpack8888(TexFormat format, uint32_t texel, uint32_t* lo, uint32_t* hi) { + int r, g, b, a; switch (format) { - case TexFormat::R8G8B8A8: - *lo = texel & 0x00ff00ff; - *hi = (texel >> 8) & 0x00ff00ff; + case TexFormat::A8R8G8B8: + r = (texel >> 16) & 0xff; + g = (texel >> 8) & 0xff; + b = texel & 0xff; + a = texel >> 24; break; - case TexFormat::R5G6B5: - case TexFormat::R4G4B4A4: - *lo = texel; - *hi= 0; + case TexFormat::R5G6B5: + r = ((texel >> 11) << 3) | (texel >> 13); + g = ((texel >> 3) & 0xfc) | ((texel >> 9) & 0x3); + b = ((texel & 0x1f) << 3) | ((texel & 0x1c) >> 2); + a = 0xff; break; - case TexFormat::L8A8: - *lo = (texel | (texel << 8)) & 0x00ff00ff; - *hi = 0; + case TexFormat::A1R5G5B5: + r = ((texel >> 7) & 0xf8) | ((texel << 1) >> 13); + g = ((texel >> 2) & 0xf8) | ((texel >> 7) & 7); + b = ((texel 
& 0x1f) << 3) | ((texel & 0x1c) >> 2); + a = 0xff * (texel >> 15); + break; + case TexFormat::A4R4G4B4: + r = ((texel >> 4) & 0xf0) | ((texel >> 8) & 0x0f); + g = ((texel & 0xf0) >> 0) | ((texel & 0xf0) >> 4); + b = ((texel & 0x0f) << 4) | ((texel & 0x0f) >> 0); + a = ((texel >> 8) & 0xf0) | (texel >> 12); + break; + case TexFormat::A8L8: + r = texel & 0xff; + g = r; + b = r; + a = texel >> 8; break; case TexFormat::L8: - *lo = (texel | (texel << 16)) & 0x07e0f81f; - *hi = 0; + r = texel & 0xff; + g = r; + b = r; + a = 0xff; break; case TexFormat::A8: - *lo = (texel | (texel << 12)) & 0x0f0f0f0f; - *hi = 0; + r = 0xff; + g = 0xff; + b = 0xff; + a = texel & 0xff; break; default: std::abort(); - } + } + *lo = (r << 16) + b; + *hi = (a << 16) + g; } -inline uint32_t Pack8888(TexFormat format, uint32_t lo, uint32_t hi) { - switch (format) { - case TexFormat::R8G8B8A8: - return (hi << 8) | lo; - case TexFormat::R5G6B5: - case TexFormat::R4G4B4A4: - return lo; - case TexFormat::L8A8: - return (lo | (lo >> 8)) & 0xffff; - case TexFormat::L8: - return (lo | (lo >> 16)) & 0xffff; - case TexFormat::A8: - return (lo | (lo >> 12)) & 0xffff; - default: - std::abort(); - return 0; - } +inline void Unpack8888(uint32_t texel, uint32_t* lo, uint32_t* hi) { + *lo = texel & 0x00ff00ff; + *hi = (texel >> 8) & 0x00ff00ff; } -inline void Lerp8888(uint32_t al, - uint32_t ah, - uint32_t bl, - uint32_t bh, - uint32_t frac, - uint32_t* lo, - uint32_t* hi) { - *lo = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; - *hi = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; +inline uint32_t Pack8888(uint32_t lo, uint32_t hi) { + return (hi << 8) | lo; +} + +inline uint32_t Lerp8888(uint32_t a, uint32_t b, uint32_t f) { + return (a + (((b - a) * f) >> 8)) & 0x00ff00ff; } template @@ -185,25 +195,28 @@ inline uint32_t TexFilterLinear( ) { uint32_t c01l, c01h; { - uint32_t c0l, c0h; - uint32_t c1l, c1h; + uint32_t c0l, c0h, c1l, c1h; Unpack8888(format, texel00, &c0l, &c0h); Unpack8888(format, texel01, 
&c1l, &c1h); - Lerp8888(c0l, c0h, c1l, c1h, alpha, &c01l, &c01h); + c01l = Lerp8888(c0l, c1l, alpha); + c01h = Lerp8888(c0h, c1h, alpha); } uint32_t c23l, c23h; { - uint32_t c2l, c2h; - uint32_t c3l, c3h; + uint32_t c2l, c2h, c3l, c3h; Unpack8888(format, texel10, &c2l, &c2h); Unpack8888(format, texel11, &c3l, &c3h); - Lerp8888(c2l, c2h, c3l, c3h, alpha, &c23l, &c23h); + c23l = Lerp8888(c2l, c3l, alpha); + c23h = Lerp8888(c2h, c3h, alpha); } - uint32_t cl, ch; - Lerp8888(c01l, c01h, c23l, c23h, beta, &cl, &ch); - uint32_t color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + uint32_t color; + { + uint32_t cl = Lerp8888(c01l, c23l, beta); + uint32_t ch = Lerp8888(c01h, c23h, beta); + color = Pack8888(cl, ch); + } //printf("*** texel00=0x%x, texel01=0x%x, texel10=0x%x, texel11=0x%x, color=0x%x\n", texel00, texel01, texel10, texel11, color); @@ -211,9 +224,12 @@ inline uint32_t TexFilterLinear( } inline uint32_t TexFilterPoint(TexFormat format, uint32_t texel) { - uint32_t cl, ch; - Unpack8888(format, texel, &cl, &ch); - uint32_t color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + uint32_t color; + { + uint32_t cl, ch; + Unpack8888(format, texel, &cl, &ch); + color = Pack8888(cl, ch); + } //printf("*** texel=0x%x, color=0x%x\n", texel, color); diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index 662fbf1d..df9970d5 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -1,12 +1,13 @@ -RTL_DIR=../../hw/rtl -DPI_DIR=../../hw/dpi +RTL_DIR = ../../hw/rtl +DPI_DIR = ../../hw/dpi +THIRD_PARTY_DIR = ../../third_party CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I../../../hw -I../../common -CXXFLAGS += -I../../common/softfloat/source/include +CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include -LDFLAGS += ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a +LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a # control RTL debug tracing states 
DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE @@ -24,7 +25,7 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_TEX DBG_FLAGS += $(DBG_TRACE_FLAGS) -FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src +FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src TEX_INCLUDE = -I$(RTL_DIR)/tex_unit RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) $(TEX_INCLUDE) @@ -90,7 +91,7 @@ $(PROJECT): $(SRCS) static: $(SRCS) verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' - $(AR) rcs lib$(PROJECT).a obj_dir/*.o ../common/softfloat/build/Linux-x86_64-GCC/*.o + $(AR) rcs lib$(PROJECT).a obj_dir/*.o $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/*.o clean-static: rm -rf lib$(PROJECT).a obj_dir diff --git a/sim/simX/Makefile b/sim/simX/Makefile index 7ea54863..b3312bb0 100644 --- a/sim/simX/Makefile +++ b/sim/simX/Makefile @@ -1,12 +1,14 @@ RTL_DIR = ../hw/rtl +THIRD_PARTY_DIR = ../../third_party CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I. 
-I../common -I../../hw -CXXFLAGS += -I../common/softfloat/source/include +CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include +CXXFLAGS += -I$(THIRD_PARTY_DIR)/cocogfx/include CXXFLAGS += $(CONFIGS) -LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a +LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx TOP = vx_cache_sim @@ -38,7 +40,7 @@ obj_dir/%.o: %.cpp $(CXX) $(CXXFLAGS) -c $< -o $@ static: $(OBJS) - $(AR) rcs lib$(PROJECT).a $(OBJS) ../common/softfloat/build/Linux-x86_64-GCC/*.o + $(AR) rcs lib$(PROJECT).a $(OBJS) $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/*.o .depend: $(SRCS) $(CXX) $(CXXFLAGS) -MM $^ > .depend; diff --git a/sim/simX/tex_unit.cpp b/sim/simX/tex_unit.cpp index d73bd728..bfbcef1a 100644 --- a/sim/simX/tex_unit.cpp +++ b/sim/simX/tex_unit.cpp @@ -4,6 +4,7 @@ #include using namespace vortex; +using namespace cocogfx; enum class FilterMode { Point, diff --git a/sim/vlsim/Makefile b/sim/vlsim/Makefile index 57e114a2..879bd954 100644 --- a/sim/vlsim/Makefile +++ b/sim/vlsim/Makefile @@ -1,13 +1,14 @@ RTL_DIR = ../../hw/rtl DPI_DIR = ../../hw/dpi -SCRIPT_DIR=../../hw/scripts +SCRIPT_DIR = ../../hw/scripts +THIRD_PARTY_DIR = ../../third_party CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I.. 
-I../../../hw -I../../common -CXXFLAGS += -I../../common/softfloat/source/include +CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include -LDFLAGS += -shared ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a +LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a # control RTL debug tracing states DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE @@ -29,7 +30,7 @@ SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += fpga.cpp opae_sim.cpp -FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src +FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src TEX_INCLUDE = -I$(RTL_DIR)/tex_unit RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE) RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip @@ -98,7 +99,7 @@ $(PROJECT).so: $(SRCS) vortex_afu.h static: $(SRCS) vortex_afu.h verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' - $(AR) rcs $(PROJECT).a obj_dir/*.o ../common/softfloat/build/Linux-x86_64-GCC/*.o + $(AR) rcs $(PROJECT).a obj_dir/*.o $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/*.o clean-static: rm -rf $(PROJECT).a obj_dir vortex_afu.h diff --git a/sim/vlsim/vortex_afu.h b/sim/vlsim/vortex_afu.h new file mode 100644 index 00000000..1a1dee44 --- /dev/null +++ b/sim/vlsim/vortex_afu.h @@ -0,0 +1,49 @@ +// auto-generated by gen_config.py. 
DO NOT EDIT +// Generated at 2021-11-25 13:43:13.259966 + +// Translated from VX_config.vh: + +#ifndef __VORTEX_AFU__ +#define __VORTEX_AFU__ + + + +#define PLATFORM_PROVIDES_LOCAL_MEMORY + +#ifndef PLATFORM_PARAM_LOCAL_MEMORY_BANKS +#define PLATFORM_PARAM_LOCAL_MEMORY_BANKS 2 +#endif + +#ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH +#define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH 26 +#endif + +#ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH +#define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH 512 +#endif + +#ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH +#define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4 +#endif + + + +#define AFU_ACCEL_NAME "vortex_afu" +#define AFU_ACCEL_UUID 0x35f9452b_25c2_434c_93d5_6f8c60db361c + +#define AFU_IMAGE_CMD_MEM_READ 1 +#define AFU_IMAGE_CMD_MEM_WRITE 2 +#define AFU_IMAGE_CMD_RUN 3 +#define AFU_IMAGE_MMIO_CMD_TYPE 10 +#define AFU_IMAGE_MMIO_DATA_SIZE 16 +#define AFU_IMAGE_MMIO_IO_ADDR 12 +#define AFU_IMAGE_MMIO_MEM_ADDR 14 +#define AFU_IMAGE_MMIO_SCOPE_READ 20 +#define AFU_IMAGE_MMIO_SCOPE_WRITE 22 +#define AFU_IMAGE_MMIO_DEV_CAPS 24 +#define AFU_IMAGE_MMIO_STATUS 18 + +#define AFU_IMAGE_POWER 0 +#define AFU_TOP_IFC "ccip_std_afu_avalon_mm" + +#endif diff --git a/tests/regression/tex/Makefile b/tests/regression/tex/Makefile index 1a771373..fd9f195d 100644 --- a/tests/regression/tex/Makefile +++ b/tests/regression/tex/Makefile @@ -10,7 +10,7 @@ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy VX_CFLAGS += -std=c++11 -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections -VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common +VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party/cocogfx/include VX_LDFLAGS += 
-Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a @@ -19,15 +19,13 @@ VX_SRCS = kernel.c #CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors -CXXFLAGS += -DLUPNG_USE_ZLIB +CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party/cocogfx/include -CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common - -LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex -lz +LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex $(VORTEX_RT_PATH)/../third_party/cocogfx/libcocogfx.a -lz PROJECT = tex -SRCS = main.cpp utils.cpp tga.cpp lupng.c +SRCS = main.cpp utils.cpp all: $(PROJECT) kernel.bin kernel.dump diff --git a/tests/regression/tex/blitter.h b/tests/regression/tex/blitter.h deleted file mode 100644 index e05f64b8..00000000 --- a/tests/regression/tex/blitter.h +++ /dev/null @@ -1,268 +0,0 @@ -// -// Copyright (c) Blaise Tine. All rights reserved. -// -// -// Use of this sample source code is subject to the terms of the Microsoft -// license agreement under which you licensed this sample source code. If -// you did not accept the terms of the license agreement, you are not -// authorized to use this sample source code. For the terms of the license, -// please see the license agreement between you and Microsoft or, if applicable, -// see the LICENSE.RTF on your install media or the root of your tools -// installation. -// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR -// INDEMNITIES. 
-// -#pragma once - -#include "surfacedesc.h" - -class BlitTable { -public: - typedef int (*PfnCopy)(const SurfaceDesc &dstDesc, - uint32_t dstOffsetX, - uint32_t dstOffsetY, - uint32_t copyWidth, - uint32_t copyHeight, - const SurfaceDesc &srcDesc, - uint32_t srcOffsetX, - uint32_t srcOffsetY); - - BlitTable() { - for (uint32_t s = 0; s < FORMAT_COLOR_SIZE_; ++s) { - for (uint32_t d = 0; d < FORMAT_COLOR_SIZE_; ++d) { - copyFuncs_[s][d] = CopyInvalid; - } - } - - for (uint32_t s = 0; s < FORMAT_COLOR_SIZE_; ++s) { - switch (s) { - case FORMAT_A8: - case FORMAT_L8: - copyFuncs_[s][s] = CopyFast; - break; - - case FORMAT_A8L8: - copyFuncs_[FORMAT_A8L8][FORMAT_A8] = Copy; - copyFuncs_[FORMAT_A8L8][FORMAT_A8L8] = CopyFast; - break; - - case FORMAT_R5G6B5: - copyFuncs_[FORMAT_R5G6B5][FORMAT_L8] = Copy; - copyFuncs_[FORMAT_R5G6B5][FORMAT_R5G6B5] = CopyFast; - copyFuncs_[FORMAT_R5G6B5][FORMAT_R8G8B8] = - Copy; - copyFuncs_[FORMAT_R5G6B5][FORMAT_B8G8R8] = - Copy; - copyFuncs_[FORMAT_R5G6B5][FORMAT_A8B8G8R8] = - Copy; - copyFuncs_[FORMAT_R5G6B5][FORMAT_A8R8G8B8] = - Copy; - break; - - case FORMAT_A1R5G5B5: - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_L8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R8G8B8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8R8G8B8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R5G5B5A1] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R4G4B4A4] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_B8G8R8] = - Copy; - copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8B8G8R8] = - Copy; - break; - - case FORMAT_A4R4G4B4: - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_L8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R8G8B8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8R8G8B8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R5G5B5A1] = - Copy; - 
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R4G4B4A4] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_B8G8R8] = - Copy; - copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8B8G8R8] = - Copy; - break; - - case FORMAT_R8G8B8: - copyFuncs_[FORMAT_R8G8B8][FORMAT_L8] = Copy; - copyFuncs_[FORMAT_R8G8B8][FORMAT_R5G6B5] = - Copy; - copyFuncs_[FORMAT_R8G8B8][FORMAT_R8G8B8] = CopyFast; - copyFuncs_[FORMAT_R8G8B8][FORMAT_B8G8R8] = - Copy; - copyFuncs_[FORMAT_R8G8B8][FORMAT_A8B8G8R8] = - Copy; - copyFuncs_[FORMAT_R8G8B8][FORMAT_A8R8G8B8] = - Copy; - break; - - case FORMAT_A8R8G8B8: - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_L8] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R5G6B5] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R8G8B8] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8R8G8B8] = CopyFast; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R5G5B5A1] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R4G4B4A4] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_B8G8R8] = - Copy; - copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8B8G8R8] = - Copy; - break; - - case FORMAT_R5G5B5A1: - copyFuncs_[FORMAT_R5G5B5A1][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_R5G5B5A1][FORMAT_L8] = - Copy; - copyFuncs_[FORMAT_R5G5B5A1][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_R5G5B5A1][FORMAT_RGB] = - Copy; - copyFuncs_[FORMAT_R5G5B5A1][FORMAT_ARGB] = - Copy; - break; - - case FORMAT_R4G4B4A4: - copyFuncs_[FORMAT_R4G4B4A4][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_R4G4B4A4][FORMAT_L8] = - Copy; - copyFuncs_[FORMAT_R4G4B4A4][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_R4G4B4A4][FORMAT_RGB] = - Copy; - copyFuncs_[FORMAT_R4G4B4A4][FORMAT_ARGB] = - Copy; - break; - - case FORMAT_B8G8R8: - copyFuncs_[FORMAT_B8G8R8][FORMAT_L8] = Copy; - copyFuncs_[FORMAT_B8G8R8][FORMAT_RGB] = Copy; - break; - - case FORMAT_A8B8G8R8: - copyFuncs_[FORMAT_A8B8G8R8][FORMAT_A8] = - Copy; - copyFuncs_[FORMAT_A8B8G8R8][FORMAT_L8] = - Copy; - 
copyFuncs_[FORMAT_A8B8G8R8][FORMAT_A8L8] = - Copy; - copyFuncs_[FORMAT_A8B8G8R8][FORMAT_RGB] = - Copy; - copyFuncs_[FORMAT_A8B8G8R8][FORMAT_ARGB] = - Copy; - break; - } - } - } - - PfnCopy get(uint32_t srcFormat, uint32_t dstFormat) const { - assert(srcFormat < FORMAT_COLOR_SIZE_); - assert(dstFormat < FORMAT_COLOR_SIZE_); - return copyFuncs_[srcFormat][dstFormat]; - } - -private: - template - static int Copy(const SurfaceDesc &dstDesc, - uint32_t dstOffsetX, - uint32_t dstOffsetY, - uint32_t copyWidth, - uint32_t copyHeight, - const SurfaceDesc &srcDesc, - uint32_t srcOffsetX, - uint32_t srcOffsetY) { - auto srcBPP = TFormatInfo::CBSIZE; - auto dstBPP = TFormatInfo::CBSIZE; - auto srcNextLine = srcDesc.Pitch; - auto dstNextLine = dstDesc.Pitch; - - auto pbSrc = srcDesc.pBits + srcOffsetX * srcBPP + srcOffsetY * srcDesc.Pitch; - auto pbDst = dstDesc.pBits + dstOffsetX * dstBPP + dstOffsetY * dstDesc.Pitch; - - while (copyHeight--) { - auto pSrc = reinterpret_cast::TYPE *>(pbSrc); - for (auto *pDst = reinterpret_cast::TYPE *>( - pbDst), - *const pEnd = pDst + copyWidth; - pDst != pEnd; ++pDst, ++pSrc) { - auto tmp = Format::ConvertFrom(pSrc); - Format::ConvertTo(pDst, tmp); - } - - pbSrc += srcNextLine; - pbDst += dstNextLine; - } - return 0; - } - - template - static int CopyFast(const SurfaceDesc &dstDesc, - uint32_t dstOffsetX, - uint32_t dstOffsetY, - uint32_t copyWidth, - uint32_t copyHeight, - const SurfaceDesc &srcDesc, - uint32_t srcOffsetX, - uint32_t srcOffsetY) { - auto nBPP = sizeof(Type); - auto srcNextLine = srcDesc.Pitch; - auto dstNextLine = dstDesc.Pitch; - - auto pbSrc = srcDesc.pBits + srcOffsetX * nBPP + srcOffsetY * srcDesc.Pitch; - auto pbDst = dstDesc.pBits + dstOffsetX * nBPP + dstOffsetY * dstDesc.Pitch; - - while (copyHeight--) { - auto pSrc = reinterpret_cast(pbSrc); - for (auto *pDst = reinterpret_cast(pbDst), *const pEnd = pDst + copyWidth; - pDst != pEnd; ++pDst, ++pSrc) { - *pDst = *pSrc; - } - pbSrc += srcNextLine; - pbDst += 
dstNextLine; - } - return 0; - } - - static int CopyInvalid(const SurfaceDesc & /*dstDesc*/, - uint32_t /*dstOffsetX*/, - uint32_t /*dstOffsetY*/, - uint32_t /*copyWidth*/, - uint32_t /*copyHeight*/, - const SurfaceDesc & /*srcDesc*/, - uint32_t /*srcOffsetX*/, - uint32_t /*srcOffsetY*/) - { - std::cout << "Error: invalid format" << std::endl; - return -1; - } - - PfnCopy copyFuncs_[FORMAT_COLOR_SIZE_][FORMAT_COLOR_SIZE_]; -}; \ No newline at end of file diff --git a/tests/regression/tex/color.h b/tests/regression/tex/color.h deleted file mode 100644 index 708565a3..00000000 --- a/tests/regression/tex/color.h +++ /dev/null @@ -1,68 +0,0 @@ -// -// Copyright (c) Blaise Tine. All rights reserved. -// -// -// Use of this sample source code is subject to the terms of the Microsoft -// license agreement under which you licensed this sample source code. If -// you did not accept the terms of the license agreement, you are not -// authorized to use this sample source code. For the terms of the license, -// please see the license agreement between you and Microsoft or, if applicable, -// see the LICENSE.RTF on your install media or the root of your tools -// installation. -// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR -// INDEMNITIES. 
-// -#pragma once - -#include -#include - -struct ColorARGB { - union { - struct { - uint32_t value; - }; - struct { - uint8_t b, g, r, a; - }; - struct { - uint8_t m[4]; - }; - }; - - ColorARGB() {} - - ColorARGB(int a, int r, int g, int b) { - assert((a >= 0) && (a <= 0xff)); - assert((r >= 0) && (r <= 0xff)); - assert((g >= 0) && (g <= 0xff)); - assert((b >= 0) && (b <= 0xff)); - - this->b = static_cast(b); - this->g = static_cast(g); - this->r = static_cast(r); - this->a = static_cast(a); - } - - ColorARGB(int r, int g, int b) { - assert((r >= 0) && (r <= 0xff)); - assert((g >= 0) && (g <= 0xff)); - assert((b >= 0) && (b <= 0xff)); - - this->b = static_cast(b); - this->g = static_cast(g); - this->r = static_cast(r); - } - - ColorARGB(int value) { - this->value = value; - } - - void operator=(const ColorARGB &rhs) { - this->value = rhs.value; - } - - operator uint32_t() const { - return this->value; - } -}; \ No newline at end of file diff --git a/tests/regression/tex/common.h b/tests/regression/tex/common.h index 1a7f53d0..00d7148f 100644 --- a/tests/regression/tex/common.h +++ b/tests/regression/tex/common.h @@ -15,7 +15,6 @@ typedef struct { uint8_t src_logwidth; uint8_t src_logheight; uint32_t src_addr; - float lod; uint32_t mip_offs[TEX_LOD_MAX+1]; uint32_t dst_width; uint32_t dst_height; diff --git a/tests/regression/tex/format.h b/tests/regression/tex/format.h deleted file mode 100644 index 4ee8268e..00000000 --- a/tests/regression/tex/format.h +++ /dev/null @@ -1,1022 +0,0 @@ -// -// Copyright (c) Blaise Tine. All rights reserved. -// -// -// Use of this sample source code is subject to the terms of the Microsoft -// license agreement under which you licensed this sample source code. If -// you did not accept the terms of the license agreement, you are not -// authorized to use this sample source code. 
For the terms of the license, -// please see the license agreement between you and Microsoft or, if applicable, -// see the LICENSE.RTF on your install media or the root of your tools -// installation. -// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR -// INDEMNITIES. -// -#pragma once - -#include "int24.h" -#include "color.h" -#include - -enum ePixelFormat { - FORMAT_UNKNOWN, - FORMAT_A8, - FORMAT_L8, - FORMAT_A8L8, - FORMAT_R5G6B5, - FORMAT_A8R8G8B8, - FORMAT_A1R5G5B5, - FORMAT_R8G8B8, - FORMAT_A4R4G4B4, - FORMAT_A8B8G8R8, - FORMAT_R5G5B5A1, - FORMAT_B8G8R8, - FORMAT_R4G4B4A4, - FORMAT_COLOR_SIZE_, - FORMAT_D16 = FORMAT_COLOR_SIZE_, - FORMAT_X8S8D16, - FORMAT_PAL4_B8G8R8, - FORMAT_PAL4_A8B8G8R8, - FORMAT_PAL4_R5G6B5, - FORMAT_PAL4_R4G4B4A4, - FORMAT_PAL4_R5G5B5A1, - FORMAT_PAL8_B8G8R8, - FORMAT_PAL8_A8B8G8R8, - FORMAT_PAL8_R5G6B5, - FORMAT_PAL8_R4G4B4A4, - FORMAT_PAL8_R5G5B5A1, - FORMAT_SIZE_, -}; - -#define FORMAT_A FORMAT_A8 -#define FORMAT_RGB FORMAT_R5G6B5 -#define FORMAT_RGB_ FORMAT_R8G8B8 -#define FORMAT_ARGB FORMAT_A8R8G8B8 -#define FORMAT_ARGB_ FORMAT_A4R4G4B4 - -template -struct TFormatInfo {}; - -template <> -struct TFormatInfo { - typedef uint8_t TYPE; - - enum { - CBSIZE = 0, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 4, - RED = 4, - GREEN = 4, - BLUE = 4, - LERP = 4, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 4, - RED = 4, - GREEN = 4, - BLUE = 4, - LERP = 4, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 1, - RED = 5, - GREEN = 5, - BLUE = 5, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 1, - RED = 5, - GREEN = 5, - BLUE = 5, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - RED = 5, - GREEN = 6, - BLUE = 5, - LERP = 
5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint24_t TYPE; - - enum { - CBSIZE = 3, - RED = 8, - GREEN = 8, - BLUE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint24_t TYPE; - - enum { - CBSIZE = 3, - RED = 8, - GREEN = 8, - BLUE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint32_t TYPE; - - enum { - CBSIZE = 4, - ALPHA = 8, - RED = 8, - GREEN = 8, - BLUE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint32_t TYPE; - - enum { - CBSIZE = 4, - ALPHA = 8, - RED = 8, - GREEN = 8, - BLUE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint8_t TYPE; - - enum { - CBSIZE = 1, - ALPHA = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint8_t TYPE; - - enum { - CBSIZE = 1, - LUMINANCE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 8, - LUMINANCE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - DEPTH = 16, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 4, - DEPTH = 16, - STENCIL = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 3, - RED = 8, - GREEN = 8, - BLUE = 8, - PALETTE = 4, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 4, - ALPHA = 8, - RED = 8, - GREEN = 8, - BLUE = 8, - PALETTE = 4, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - RED = 5, - GREEN = 6, - BLUE = 5, - PALETTE = 4, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 4, - RED = 4, - GREEN = 4, - BLUE = 4, - PALETTE = 4, - LERP = 4, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 1, - RED 
= 5, - GREEN = 5, - BLUE = 5, - PALETTE = 4, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 3, - RED = 8, - GREEN = 8, - BLUE = 8, - PALETTE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 4, - ALPHA = 8, - RED = 8, - GREEN = 8, - BLUE = 8, - PALETTE = 8, - LERP = 8, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - RED = 5, - GREEN = 6, - BLUE = 5, - PALETTE = 8, - LERP = 5, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 4, - RED = 4, - GREEN = 4, - BLUE = 4, - PALETTE = 8, - LERP = 4, - }; -}; - -template <> -struct TFormatInfo { - typedef uint16_t TYPE; - - enum { - CBSIZE = 2, - ALPHA = 1, - RED = 5, - GREEN = 5, - BLUE = 5, - PALETTE = 8, - LERP = 5, - }; -}; - -/////////////////////////////////////////////////////////////////////////////// - -#define DEF_GET_ENUM_VALUE(Name, Default) \ - template \ - struct enum_get_##Name { \ - static constexpr int value = Default; \ - }; \ - template \ - struct enum_get_##Name::type> { \ - static constexpr int value = T::Name; \ - } - -#define __formatInfo(format) \ - { \ - TFormatInfo::CBSIZE, FormatSize>::RED, \ - FormatSize>::GREEN, \ - FormatSize>::BLUE, \ - FormatSize>::ALPHA, \ - FormatSize>::LUMINANCE, \ - FormatSize>::DEPTH, \ - FormatSize>::STENCIL, \ - FormatSize>::PALETTE, \ - FormatSize>::LERP \ - } - -/////////////////////////////////////////////////////////////////////////////// - -struct FormatInfo { - uint8_t BytePerPixel; - uint8_t Red; - uint8_t Green; - uint8_t Blue; - uint8_t Alpha; - uint8_t Luminance; - uint8_t Depth; - uint8_t Stencil; - uint8_t PaletteBits; - uint8_t LerpBits; -}; - -template -class FormatSize { -protected: - DEF_GET_ENUM_VALUE(RED, 0); - DEF_GET_ENUM_VALUE(GREEN, 0); - DEF_GET_ENUM_VALUE(BLUE, 0); - DEF_GET_ENUM_VALUE(ALPHA, 0); - DEF_GET_ENUM_VALUE(LUMINANCE, 
0); - DEF_GET_ENUM_VALUE(DEPTH, 0); - DEF_GET_ENUM_VALUE(STENCIL, 0); - DEF_GET_ENUM_VALUE(PALETTE, 0); - DEF_GET_ENUM_VALUE(LERP, 0); - -public: - enum { - RED = enum_get_RED::value, - GREEN = enum_get_GREEN::value, - BLUE = enum_get_BLUE::value, - ALPHA = enum_get_ALPHA::value, - LUMINANCE = enum_get_LUMINANCE::value, - DEPTH = enum_get_DEPTH::value, - STENCIL = enum_get_STENCIL::value, - PALETTE = enum_get_PALETTE::value, - LERP = enum_get_LERP::value, - - RGB = RED + GREEN + BLUE + LUMINANCE, - RGBA = RGB + ALPHA - }; -}; - -namespace Format { - -inline static const FormatInfo &GetInfo(ePixelFormat pixelFormat) { - static const FormatInfo sc_formatInfos[FORMAT_SIZE_] = { - __formatInfo(FORMAT_UNKNOWN), - __formatInfo(FORMAT_A8), - __formatInfo(FORMAT_L8), - __formatInfo(FORMAT_A8L8), - __formatInfo(FORMAT_RGB), - __formatInfo(FORMAT_ARGB), - __formatInfo(FORMAT_A1R5G5B5), - __formatInfo(FORMAT_RGB_), - __formatInfo(FORMAT_ARGB_), - __formatInfo(FORMAT_R4G4B4A4), - __formatInfo(FORMAT_R5G5B5A1), - __formatInfo(FORMAT_B8G8R8), - __formatInfo(FORMAT_A8B8G8R8), - __formatInfo(FORMAT_D16), - __formatInfo(FORMAT_X8S8D16), - __formatInfo(FORMAT_PAL4_B8G8R8), - __formatInfo(FORMAT_PAL4_A8B8G8R8), - __formatInfo(FORMAT_PAL4_R5G6B5), - __formatInfo(FORMAT_PAL4_R4G4B4A4), - __formatInfo(FORMAT_PAL4_R5G5B5A1), - __formatInfo(FORMAT_PAL8_B8G8R8), - __formatInfo(FORMAT_PAL8_A8B8G8R8), - __formatInfo(FORMAT_PAL8_R5G6B5), - __formatInfo(FORMAT_PAL8_R4G4B4A4), - __formatInfo(FORMAT_PAL8_R5G5B5A1), - }; - assert(pixelFormat < FORMAT_SIZE_); - return sc_formatInfos[pixelFormat]; -} - -#undef __formatInfo -#undef DEF_GET_ENUM_VALUE - -typedef ColorARGB (*pfn_convert_from)(const void *pIn); - -typedef void (*pfn_convert_to)(void *pOut, const ColorARGB &in); - -template -static uint32_t ConvertTo(const ColorARGB &color); - -template -static void ConvertTo(void *pOut, const ColorARGB &in) { - *reinterpret_cast::TYPE *>(pOut) = - static_cast::TYPE>( - ConvertTo(in)); -} - -template 
-static ColorARGB ConvertFrom(uint32_t in); - -template -static ColorARGB ConvertFrom(const void *pIn) { - return ConvertFrom( - *reinterpret_cast::TYPE *>(pIn)); -} - -inline static pfn_convert_to GetConvertTo(ePixelFormat pixelFormat) { - switch (pixelFormat) { - case FORMAT_A8: - return &ConvertTo; - case FORMAT_L8: - return &ConvertTo; - case FORMAT_A8L8: - return &ConvertTo; - case FORMAT_R5G6B5: - return &ConvertTo; - case FORMAT_A1R5G5B5: - return &ConvertTo; - case FORMAT_A4R4G4B4: - return &ConvertTo; - case FORMAT_R8G8B8: - return &ConvertTo; - case FORMAT_A8R8G8B8: - return &ConvertTo; - case FORMAT_R5G5B5A1: - return &ConvertTo; - case FORMAT_R4G4B4A4: - return &ConvertTo; - case FORMAT_B8G8R8: - return &ConvertTo; - case FORMAT_A8B8G8R8: - return &ConvertTo; - case FORMAT_D16: - return &ConvertTo; - case FORMAT_X8S8D16: - return &ConvertTo; - default: - return &ConvertTo; - } - return nullptr; -} - -inline static pfn_convert_from GetConvertFrom(ePixelFormat pixelFormat, - bool bForceAlpha) { - if (bForceAlpha) { - switch (pixelFormat) { - case FORMAT_A8: - return &ConvertFrom; - case FORMAT_L8: - return &ConvertFrom; - case FORMAT_A8L8: - return &ConvertFrom; - case FORMAT_R5G6B5: - return &ConvertFrom; - case FORMAT_A1R5G5B5: - return &ConvertFrom; - case FORMAT_A4R4G4B4: - return &ConvertFrom; - case FORMAT_R8G8B8: - return &ConvertFrom; - case FORMAT_A8R8G8B8: - return &ConvertFrom; - case FORMAT_R5G5B5A1: - return &ConvertFrom; - case FORMAT_R4G4B4A4: - return &ConvertFrom; - case FORMAT_B8G8R8: - return &ConvertFrom; - case FORMAT_A8B8G8R8: - return &ConvertFrom; - case FORMAT_D16: - return &ConvertFrom; - case FORMAT_X8S8D16: - return &ConvertFrom; - default: - return &ConvertFrom; - } - } else { - switch (pixelFormat) { - case FORMAT_A8: - return &ConvertFrom; - case FORMAT_L8: - return &ConvertFrom; - case FORMAT_A8L8: - return &ConvertFrom; - case FORMAT_R5G6B5: - return &ConvertFrom; - case FORMAT_A1R5G5B5: - return &ConvertFrom; - case 
FORMAT_A4R4G4B4: - return &ConvertFrom; - case FORMAT_R8G8B8: - return &ConvertFrom; - case FORMAT_A8R8G8B8: - return &ConvertFrom; - case FORMAT_R5G5B5A1: - return &ConvertFrom; - case FORMAT_R4G4B4A4: - return &ConvertFrom; - case FORMAT_B8G8R8: - return &ConvertFrom; - case FORMAT_A8B8G8R8: - return &ConvertFrom; - case FORMAT_D16: - return &ConvertFrom; - case FORMAT_X8S8D16: - return &ConvertFrom; - default: - return &ConvertFrom; - } - } - - return nullptr; -} - -inline static uint32_t GetNativeFormat(ePixelFormat pixelFormat) { - switch (pixelFormat) { - case FORMAT_PAL4_B8G8R8: - case FORMAT_PAL8_B8G8R8: - return FORMAT_B8G8R8; - - case FORMAT_PAL4_A8B8G8R8: - case FORMAT_PAL8_A8B8G8R8: - return FORMAT_A8B8G8R8; - - case FORMAT_PAL4_R5G6B5: - case FORMAT_PAL8_R5G6B5: - return FORMAT_R5G6B5; - - case FORMAT_PAL4_R4G4B4A4: - case FORMAT_PAL8_R4G4B4A4: - return FORMAT_R4G4B4A4; - - case FORMAT_PAL4_R5G5B5A1: - case FORMAT_PAL8_R5G5B5A1: - return FORMAT_R5G5B5A1; - - default: - return pixelFormat; - } -} - -/////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &/*in*/) { - return 0; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t /*in*/) { - return 0; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t /*in*/) { - return 0; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return ((in.r & 0xf8) << 8) | ((in.g & 0xfc) << 3) | (in.b >> 3); -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.r = ((in >> 11) << 3) | (in >> 13); - ret.g = ((in >> 3) & 0xfc) | ((in >> 9) & 0x3); - ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff; - ret.r = ((in >> 11) << 3) | (in >> 13); - ret.g = ((in >> 3) & 0xfc) | ((in >> 9) & 0x3); - ret.b = 
((in & 0x1f) << 3) | ((in & 0x1c) >> 2); - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return (in.a ? 0x8000 : 0) | ((in.r & 0xf8) << 7) | ((in.g & 0xf8) << 2) | - (in.b >> 3); -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff * (in >> 15); - ret.r = ((in >> 7) & 0xf8) | ((in << 1) >> 13); - ret.g = ((in >> 2) & 0xf8) | ((in >> 7) & 7); - ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff * (in >> 15); - ret.r = ((in >> 7) & 0xf8) | ((in << 1) >> 13); - ret.g = ((in >> 2) & 0xf8) | ((in >> 7) & 7); - ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return ((in.r & 0xf8) << 8) | ((in.g & 0xf8) << 3) | ((in.b & 0xf8) >> 2) | - (in.a ? 
0x1 : 0); -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff * (in & 0x1); - ret.r = ((in >> 8) & 0xf8) | (in >> 13); - ret.g = ((in >> 3) & 0xf8) | ((in >> 8) & 7); - ret.b = ((in & 0x3e) << 2) | ((in & 0x3e) >> 3); - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff * (in & 0x1); - ret.r = ((in >> 8) & 0xf8) | (in >> 13); - ret.g = ((in >> 3) & 0xf8) | ((in >> 8) & 7); - ret.b = ((in & 0x3e) << 2) | ((in & 0x3e) >> 3); - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return ((in.a & 0xf0) << 8) | ((in.r & 0xf0) << 4) | ((in.g & 0xf0) << 0) | - (in.b >> 4); -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = ((in >> 8) & 0xf0) | (in >> 12); - ret.r = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); - ret.g = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); - ret.b = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = ((in >> 8) & 0xf0) | (in >> 12); - ret.r = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); - ret.g = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); - ret.b = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return ((in.r & 0xf0) << 8) | ((in.g & 0xf0) << 4) | ((in.b & 0xf0) << 0) | - (in.a >> 4); -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); - ret.r = ((in >> 8) & 0xf0) | (in >> 12); - ret.g = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); - ret.b = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = ((in & 0x0f) 
<< 4) | ((in & 0x0f) >> 0); - ret.r = ((in >> 8) & 0xf0) | (in >> 12); - ret.g = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); - ret.b = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return (in.r << 16) | (in.g << 8) | in.b; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.r = in >> 16; - ret.g = (in >> 8) & 0xff; - ret.b = in & 0xff; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff; - ret.r = in >> 16; - ret.g = (in >> 8) & 0xff; - ret.b = in & 0xff; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return (in.b << 16) | (in.g << 8) | in.r; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.r = in & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = in >> 16; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff; - ret.r = in & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = in >> 16; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return (in.a << 24) | (in.r << 16) | (in.g << 8) | in.b; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 24; - ret.r = (in >> 16) & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = in & 0xff; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 24; - ret.r = (in >> 16) & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = in & 0xff; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) 
{ - return (in.a << 24) | (in.b << 16) | (in.g << 8) | in.r; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 24; - ret.r = in & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = (in >> 16) & 0xff; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 24; - ret.r = in & 0xff; - ret.g = (in >> 8) & 0xff; - ret.b = (in >> 16) & 0xff; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return in.a; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return in.r; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.r = in; - ret.g = in; - ret.b = in; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = 0xff; - ret.r = in; - ret.g = in; - ret.b = in; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return (in.a << 8) | in.r; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 8; - ret.r = in & 0xff; - ret.g = in & 0xff; - ret.b = in & 0xff; - return ret; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.a = in >> 8; - ret.r = in & 0xff; - ret.g = in & 0xff; - ret.b = in & 0xff; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return in.value; -} - -template <> 
-inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.value = in; - return ret; -} - -////////////////////////////////////////////////////////////////////////////// - -template <> -inline uint32_t ConvertTo(const ColorARGB &in) { - return in.b; -} - -template <> -inline ColorARGB ConvertFrom(uint32_t in) { - ColorARGB ret; - ret.value = in; - return ret; -} - -} // namespace Format \ No newline at end of file diff --git a/tests/regression/tex/int24.h b/tests/regression/tex/int24.h deleted file mode 100644 index b08537a7..00000000 --- a/tests/regression/tex/int24.h +++ /dev/null @@ -1,37 +0,0 @@ -// -// Copyright (c) Blaise Tine. All rights reserved. -// -// -// Use of this sample source code is subject to the terms of the Microsoft -// license agreement under which you licensed this sample source code. If -// you did not accept the terms of the license agreement, you are not -// authorized to use this sample source code. For the terms of the license, -// please see the license agreement between you and Microsoft or, if applicable, -// see the LICENSE.RTF on your install media or the root of your tools -// installation. -// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR -// INDEMNITIES. 
-// -#pragma once - -#include - -struct uint24_t { - uint8_t m[3]; - - explicit uint24_t(uint32_t value) { - m[0] = (value >> 0) & 0xff; - m[1] = (value >> 8) & 0xff; - m[2] = (value >> 16) & 0xff; - } - - explicit uint24_t(uint8_t x, uint8_t y, uint8_t z) { - m[0] = x; - m[1] = y; - m[2] = z; - } - - operator uint32_t() const { - return (m[2] << 16) | (m[1] << 8) | m[0]; - } -}; diff --git a/tests/regression/tex/kernel.c b/tests/regression/tex/kernel.c index 9aaaad24..88aec50c 100644 --- a/tests/regression/tex/kernel.c +++ b/tests/regression/tex/kernel.c @@ -10,6 +10,7 @@ typedef struct { uint32_t tile_height; float deltaX; float deltaY; + float minification; } tile_arg_t; template @@ -35,10 +36,10 @@ void kernel_body(int task_id, tile_arg_t* arg) { uint8_t* dst_ptr = (uint8_t*)(state->dst_addr + xoffset * state->dst_stride + yoffset * state->dst_pitch); - Fixed<16> xlod(state->lod); + Fixed<16> xj(arg->minification); - /*vx_printf("task_id=%d, deltaX=%f, deltaY=%f, tile_width=%d, tile_height=%d\n", - task_id, arg->deltaX, arg->deltaY, arg->tile_width, arg->tile_height);*/ + /*vx_printf("task_id=%d, tile_width=%d, tile_height=%d, deltaX=%f, deltaY=%f, minification=%f\n", + task_id, arg->tile_width, arg->tile_height, arg->deltaX, arg->deltaY, arg->minification);*/ float fv = (yoffset + 0.5f) * arg->deltaY; for (uint32_t y = 0; y < arg->tile_height; ++y) { @@ -47,13 +48,7 @@ void kernel_body(int task_id, tile_arg_t* arg) { for (uint32_t x = 0; x < arg->tile_width; ++x) { Fixed xu(fu); Fixed xv(fv); - uint32_t color; - #ifdef ENABLE_SW - if (state->use_sw) - color = tex_load_sw(state, xu, xv, xlod); - else - #endif - color = tex_load_hw(state, xu, xv, xlod); + uint32_t color = tex_load(state, xu, xv, xj); //vx_printf("task_id=%d, x=%d, y=%d, fu=%f, fv=%f, xu=0x%x, xv=0x%x, color=0x%x\n", task_id, x, y, fu, fv, xu.data(), xv.data(), color); dst_row[x] = color; fu += arg->deltaX; @@ -76,7 +71,7 @@ int main() { csr_write(CSR_TEX(0, TEX_STATE_ADDR), arg->src_addr); 
static_for_t()([&](int i) { csr_write(CSR_TEX(0, TEX_STATE_MIPOFF(i)), arg->mip_offs[i]); - }); + }); tile_arg_t targ; targ.state = arg; @@ -84,6 +79,14 @@ int main() { targ.tile_height = (arg->dst_height + arg->num_tasks - 1) / arg->num_tasks; targ.deltaX = 1.0f / arg->dst_width; targ.deltaY = 1.0f / arg->dst_height; + + { + uint32_t src_width = (1 << arg->src_logwidth); + uint32_t src_height = (1 << arg->src_logheight); + float width_ratio = float(src_width) / arg->dst_width; + float height_ratio = float(src_height) / arg->dst_height; + targ.minification = std::max(width_ratio, height_ratio); + } vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, &targ); /*for (uint32_t t=0; t < arg->num_tasks; ++t) { diff --git a/tests/regression/tex/lupng.c b/tests/regression/tex/lupng.c deleted file mode 100644 index f612fbc9..00000000 --- a/tests/regression/tex/lupng.c +++ /dev/null @@ -1,1313 +0,0 @@ -/* - * The MIT License (MIT) - * - * Copyright (c) 2014 Jan Solanti - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include - -#ifndef LUPNG_USE_ZLIB -#include -#else -#include -#endif - -#include "lupng.h" - -#define PNG_NONE 0 -#define PNG_IHDR 0x01 -#define PNG_PLTE 0x02 -#define PNG_IDAT 0x04 -#define PNG_IEND 0x08 - -#define PNG_GRAYSCALE 0 -#define PNG_TRUECOLOR 2 -/* 24bpp RGB palette */ -#define PNG_PALETTED 3 -#define PNG_GRAYSCALE_ALPHA 4 -#define PNG_TRUECOLOR_ALPHA 6 - -#define PNG_FILTER_NONE 0 -#define PNG_FILTER_SUB 1 -#define PNG_FILTER_UP 2 -#define PNG_FILTER_AVERAGE 3 -#define PNG_FILTER_PAETH 4 - -#define PNG_SIG_SIZE 8 - -#define PNG_DONE 1 -#define PNG_OK 0 -#define PNG_ERROR -1 - -#define BUF_SIZE 8192 -#define MAX(x, y) (x > y ? x : y) - -#if defined(_MSC_VER) -#define LU_INLINE __inline /* MS-specific inline */ -#else -#define LU_INLINE inline /* rest of the world... */ -#endif - -#define SIZE_T_MAX_POSITIVE ( ((size_t)-1) >> 1 ) - -/******************************************************** - * CRC computation as per PNG spec - ********************************************************/ - -/* Precomputed table of CRCs of all 8-bit messages - using the polynomial from the PNG spec, 0xEDB88320L. 
*/ -static const uint32_t crcTable[] = -{ - 0x0, 0x77073096, 0xEE0E612C, 0x990951BA, 0x76DC419, 0x706AF48F, - 0xE963A535, 0x9E6495A3, 0xEDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, - 0x9B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2, - 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, - 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, - 0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, - 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C, - 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, - 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, - 0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, - 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x1DB7106, - 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x6B6B51F, 0x9FBFE4A5, 0xE8B8D433, - 0x7807C9A2, 0xF00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x86D3D2D, - 0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, - 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950, - 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, - 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, - 0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, - 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA, - 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, - 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, - 0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x3B6E20C, 0x74B1D29A, - 0xEAD54739, 0x9DD277AF, 0x4DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84, - 0xD6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0xA00AE27, 0x7D079EB1, - 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, - 0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, - 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 
0xD6D6A3E8, 0xA1D1937E, - 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, - 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, - 0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, - 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28, - 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, - 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x26D930A, 0x9C0906A9, 0xEB0E363F, - 0x72076785, 0x5005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0xCB61B38, - 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0xBDBDF21, 0x86D3D2D4, 0xF1D4E242, - 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, - 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, - 0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, - 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC, - 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, - 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, - 0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, - 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D -}; - -/* Update a running CRC with the bytes buf[0..len-1]--the CRC - should be initialized to all 1's, and the transmitted value - is the 1's complement of the final running CRC (see the - crc() routine below)). */ -static uint32_t updateCrc(uint32_t crc, unsigned char *buf, - size_t len) -{ - uint32_t c = crc; - size_t n; - - for (n = 0; n < len; n++) - c = crcTable[(c ^ buf[n]) & 0xFF] ^ (c >> 8); - - return c; -} - -/* Return the CRC of the bytes buf[0..len-1]. 
*/ -static uint32_t crc(unsigned char *buf, size_t len) -{ - return updateCrc(0xFFFFFFFFL, buf, len) ^ 0xFFFFFFFFL; -} - - - -/******************************************************** - * Helper structs - ********************************************************/ - -typedef struct -{ - uint32_t length; - uint8_t *type; - uint8_t *data; - uint32_t crc; -} PngChunk; - -typedef struct { - const LuUserContext *userCtx; - int8_t chunksFound; - - /* IHDR info */ - int32_t width; - int32_t height; - uint8_t depth; - uint8_t colorType; - uint8_t channels; - uint8_t compression; - uint8_t filter; - uint8_t interlace; - - /* PLTE info */ - uint32_t paletteItems; - uint8_t *palette; - - /* fields used for (de)compression & (de-)filtering */ - z_stream stream; - size_t scanlineBytes; - int32_t currentCol; - int32_t currentRow; - uint32_t currentElem; - size_t currentByte; - int bytesPerPixel; - uint8_t *currentScanline; - uint8_t *previousScanline; - uint8_t currentFilter; - uint8_t interlacePass; - size_t compressedBytes; - - /* used for constructing 16 bit deep pixels */ - int tmpCount; - uint8_t tmpBytes[2]; - - /* the output image */ - LuImage *img; - const LuImage *cimg; /* constant pointer version */ -} PngInfoStruct; - -/* helper macro to output warning via user context of the info struct */ -#define LUPNG_WARN_UC(uc,...) do { if ((uc)->warnProc) { (uc)->warnProc((uc)->warnProcUserPtr, __VA_ARGS__); }} while(0) -#define LUPNG_WARN(info,...) 
LUPNG_WARN_UC((info)->userCtx, __VA_ARGS__) - -/* PNG header: */ -static const uint8_t PNG_SIG[] = -/* P N G \r \n SUB \n */ -{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}; - -static const int startingRow[] = { 0, 0, 0, 4, 0, 2, 0, 1 }; -static const int startingCol[] = { 0, 0, 4, 0, 2, 0, 1, 0 }; -static const int rowIncrement[] = { 1, 8, 8, 8, 4, 4, 2, 2 }; -static const int colIncrement[] = { 1, 8, 8, 4, 4, 2, 2, 1 }; - - - -/******************************************************** - * Helper functions - ********************************************************/ - -static LU_INLINE void releaseChunk(PngChunk *chunk, const LuUserContext *userCtx) -{ - /* Only release chunk->type since chunk->data points to the same memory. */ - userCtx->freeProc(chunk->type, userCtx->freeProcUserPtr); - userCtx->freeProc(chunk, userCtx->freeProcUserPtr); -} - -static LU_INLINE uint32_t swap32(uint32_t n) -{ - union { - unsigned char np[4]; - uint32_t i; - } u; - u.i = n; - - return ((uint32_t)u.np[0] << 24) | - ((uint32_t)u.np[1] << 16) | - ((uint32_t)u.np[2] << 8) | - (uint32_t)u.np[3]; -} - -static LU_INLINE uint16_t swap16(uint16_t n) -{ - union { - unsigned char np[2]; - uint16_t i; - } u; - u.i = n; - - return ((uint16_t)u.np[0] << 8) | (uint16_t)u.np[1]; -} - -static int bytesEqual(const uint8_t *a, const uint8_t *b, size_t count) -{ - size_t i; - for (i = 0; i < count; ++i) - { - if (*(a+i) != *(b+i)) - return 0; - } - - return 1; -} - -static void* internalMalloc(size_t size, void *userPtr) -{ - (void)userPtr; /* not used */ - return malloc(size); -} - -static void internalFree(void *ptr, void *userPtr) -{ - (void)userPtr; /* not used */ - free(ptr); -} - -static void internalPrintf(void *userPtr, const char *fmt, ...) 
-{ - FILE *outStream = (FILE*)userPtr; - va_list args; - - va_start(args, fmt); - vfprintf(outStream, fmt, args); - va_end(args); - fputc('\n', outStream); -} - -static size_t internalFread(void *ptr, size_t size, size_t count, void *userPtr) -{ - return fread(ptr, size, count, (FILE *)userPtr); -} - -static size_t internalFwrite(const void *ptr, size_t size, size_t count, void *userPtr) -{ - return fwrite(ptr, size, count, (FILE *)userPtr); -} - -/******************************************************** - * Png filter functions - ********************************************************/ -static LU_INLINE int absi(int val) -{ - return val > 0 ? val : -val; -} - -static LU_INLINE uint8_t raw(PngInfoStruct *info, size_t col) -{ - if (col > SIZE_T_MAX_POSITIVE) - return 0; - return info->currentScanline[col]; -} - -static LU_INLINE uint8_t prior(PngInfoStruct *info, size_t col) -{ - if (info->currentRow <= startingRow[info->interlacePass] || col > SIZE_T_MAX_POSITIVE) - return 0; - return info->previousScanline[col]; -} - - -static LU_INLINE uint8_t paethPredictor(uint8_t a, uint8_t b, uint8_t c) -{ - unsigned int A = a, B = b, C = c; - int p = (int)A + (int)B - (int)C; - int pa = absi(p - (int)A); - int pb = absi(p - (int)B); - int pc = absi(p - (int)C); - - if (pa <= pb && pa <= pc) - return a; - if (pb <= pc) - return b; - return c; -} - -static LU_INLINE uint8_t deSub(PngInfoStruct *info, uint8_t filtered) -{ - return filtered + raw(info, info->currentByte-info->bytesPerPixel); -} - -static LU_INLINE uint8_t deUp(PngInfoStruct *info, uint8_t filtered) -{ - return filtered + prior(info, info->currentByte); -} - -static LU_INLINE uint8_t deAverage(PngInfoStruct *info, uint8_t filtered) -{ - uint16_t avg = (uint16_t)(raw(info, info->currentByte-info->bytesPerPixel) - + prior(info, info->currentByte)); - avg >>= 1; - return filtered + avg; -} - -static LU_INLINE uint8_t dePaeth(PngInfoStruct *info, uint8_t filtered) -{ - return filtered + paethPredictor( - raw(info, 
info->currentByte-info->bytesPerPixel), - prior(info, info->currentByte), - prior(info, info->currentByte-info->bytesPerPixel)); -} - -static LU_INLINE uint8_t none(PngInfoStruct *info) -{ - return raw(info, info->currentByte); -} - -static LU_INLINE uint8_t sub(PngInfoStruct *info) -{ - return raw(info, info->currentByte) - raw(info, info->currentByte-info->bytesPerPixel); -} - -static LU_INLINE uint8_t up(PngInfoStruct *info) -{ - return raw(info, info->currentByte) - prior(info, info->currentByte); -} - -static LU_INLINE uint8_t average(PngInfoStruct *info) -{ - uint16_t avg = (uint16_t)(raw(info, info->currentByte-info->bytesPerPixel) - + prior(info, info->currentByte)); - avg >>= 1; - return raw(info, info->currentByte) - avg; -} - -static LU_INLINE uint8_t paeth(PngInfoStruct *info) -{ - return raw(info, info->currentByte) - paethPredictor( - raw(info, info->currentByte-info->bytesPerPixel), - prior(info, info->currentByte), - prior(info, info->currentByte-info->bytesPerPixel)); -} - - - -/******************************************************** - * Actual implementation - ********************************************************/ -static LU_INLINE int parseIhdr(PngInfoStruct *info, PngChunk *chunk) -{ - if (info->chunksFound) - { - LUPNG_WARN(info,"PNG: malformed PNG file!"); - return PNG_ERROR; - } - - info->chunksFound |= PNG_IHDR; - info->width = swap32(*(uint32_t *)chunk->data); - info->height = swap32(*((uint32_t *)chunk->data + 1)); - info->depth = *(chunk->data + 8); - info->colorType = *(chunk->data + 9); - info->compression = *(chunk->data + 10); - info->filter = *(chunk->data + 11); - info->interlace = *(chunk->data + 12); - - switch (info->colorType) - { - case PNG_GRAYSCALE: - info->channels = 1; - break; - case PNG_TRUECOLOR: - info->channels = 3; - break; - case PNG_PALETTED: - info->channels = 3; - break; - case PNG_GRAYSCALE_ALPHA: - info->channels = 2; - break; - case PNG_TRUECOLOR_ALPHA: - info->channels = 4; - break; - default: - 
LUPNG_WARN(info,"PNG: illegal color type: %u", - (unsigned int)info->colorType); - return PNG_ERROR; - break; - } - - if (info->width <= 0 || info->height <= 0) - { - LUPNG_WARN(info, "PNG: illegal dimensions"); - return PNG_ERROR; - } - - if ((info->colorType != PNG_GRAYSCALE && info->colorType != PNG_PALETTED && - info->depth < 8) || - (info->colorType == PNG_PALETTED && info->depth == 16) || - info->depth > 16) - { - LUPNG_WARN(info, "PNG: illegal bit depth for color type"); - return PNG_ERROR; - } - - if (info->compression) - { - LUPNG_WARN(info,"PNG: unknown compression method: %u", - (unsigned int)info->compression); - return PNG_ERROR; - } - - if (info->filter) - { - LUPNG_WARN(info,"PNG: unknown filter scheme: %u", - (unsigned int)info->filter); - return PNG_ERROR; - } - - memset(&(info->stream), 0, sizeof(info->stream)); - if(inflateInit(&(info->stream)) != Z_OK) - { - LUPNG_WARN(info, "PNG: inflateInit failed!"); - return PNG_ERROR; - } - info->img = luImageCreate(info->width, info->height, - info->channels, info->depth < 16 ? 8 : 16, NULL, info->userCtx); - info->cimg = info->img; - info->scanlineBytes = MAX((info->width * info->channels * info->depth) >> 3, 1); - info->currentScanline = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes, info->userCtx->allocProcUserPtr); - info->previousScanline = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes, info->userCtx->allocProcUserPtr); - info->currentCol = -1; - info->interlacePass = info->interlace ? 
1 : 0; - info->bytesPerPixel = MAX((info->channels * info->depth) >> 3, 1); - if (!info->img || !info->currentScanline || !info->previousScanline) - { - LUPNG_WARN(info, "PNG: memory allocation failed!"); - return PNG_ERROR; - } - - return PNG_OK; -} - -static LU_INLINE int parsePlte(PngInfoStruct *info, PngChunk *chunk) -{ - if (info->chunksFound & PNG_PLTE) - { - LUPNG_WARN(info, "PNG: too many palette chunks in file!"); - return PNG_ERROR; - } - info->chunksFound |= PNG_PLTE; - - if (info->chunksFound & PNG_IDAT || !(info->chunksFound & PNG_IHDR)) - { - LUPNG_WARN(info, "PNG: malformed PNG file!"); - return PNG_ERROR; - } - - if (info->colorType == PNG_GRAYSCALE || info->colorType == PNG_GRAYSCALE_ALPHA) - { - LUPNG_WARN(info, "PNG: palettes are not allowed in grayscale images!"); - return PNG_ERROR; - } - - if (chunk->length % 3 != 0) - { - LUPNG_WARN(info, "PNG: invalid palette size!"); - return PNG_ERROR; - } - - info->paletteItems = chunk->length/3; - info->palette = (uint8_t *)info->userCtx->allocProc(chunk->length,info->userCtx->allocProcUserPtr); - if (!info->palette) - { - LUPNG_WARN(info, "PNG: memory allocation failed!"); - return PNG_ERROR; - } - memcpy(info->palette, chunk->data, chunk->length); - - return PNG_OK; -} - -static LU_INLINE void stretchBits(uint8_t inByte, uint8_t outBytes[8], int depth) -{ - int i; - switch (depth) { - case 1: - for (i = 0; i < 8; ++i) - outBytes[i] = (inByte >> (7-i)) & 0x01; - break; - - case 2: - outBytes[0] = (inByte >> 6) & 0x03; - outBytes[1] = (inByte >> 4) & 0x03; - outBytes[2] = (inByte >> 2) & 0x03; - outBytes[3] = inByte & 0x03; - break; - - case 4: - outBytes[0] = (inByte >> 4) & 0x0F; - outBytes[1] = inByte & 0x0F; - break; - - default: - break; - } -} - -/* returns: 1 if at end of scanline, 0 otherwise */ -static LU_INLINE int insertByte(PngInfoStruct *info, uint8_t byte) -{ - int advance = 0; - const uint8_t scale[] = {0x00, 0xFF, 0x55, 0x00, 0x11, 0x00, 0x00, 0x00}; - - /* for paletted images currentElem 
will always be 0 */ - size_t idx = info->currentRow * info->width * info->channels - + info->currentCol * info->channels - + info->currentElem; - - if (info->colorType != PNG_PALETTED) - { - if (info->depth == 8) - info->cimg->data[idx] = byte; - - else if (info->depth < 8) - info->cimg->data[idx] = byte * scale[info->depth]; - - else /* depth == 16 */ - { - info->tmpBytes[info->tmpCount] = byte; - if (info->tmpCount) /* just inserted 2nd byte */ - { - uint16_t val = *(uint16_t *)info->tmpBytes; - val = swap16(val); - info->tmpCount = 0; - - ((uint16_t *)(info->cimg->data))[idx] = val; - } - else - { - ++info->tmpCount; - return 0; - } - } - - ++info->currentElem; - if (info->currentElem >= info->channels) - { - advance = 1; - info->currentElem = 0; - } - } - else - { - /* The spec limits palette size to 256 entries */ - if (byte < info->paletteItems) - { - info->cimg->data[idx ] = info->palette[3*byte ]; - info->cimg->data[idx+1] = info->palette[3*byte+1]; - info->cimg->data[idx+2] = info->palette[3*byte+2]; - } - else - { - LUPNG_WARN(info,"PNG: invalid palette index encountered!"); - } - advance = 1; - } - - if (advance) - { - /* advance to next pixel */ - info->currentCol += colIncrement[info->interlacePass]; - - if (info->currentCol >= info->width) - { - uint8_t *tmp = info->currentScanline; - info->currentScanline = info->previousScanline; - info->previousScanline = tmp; - - info->currentCol = -1; - info->currentByte = 0; - - info->currentRow += rowIncrement[info->interlacePass]; - if (info->currentRow >= info->height && info->interlace) - { - ++info->interlacePass; - while (startingCol[info->interlacePass] >= info->width || - startingRow[info->interlacePass] >= info->height) - ++info->interlacePass; - info->currentRow = startingRow[info->interlacePass]; - } - return 1; - } - } - - return 0; -} - -static LU_INLINE int parseIdat(PngInfoStruct *info, PngChunk *chunk) -{ - unsigned char filtered[BUF_SIZE]; - int status = Z_OK; - - if (!(info->chunksFound & 
PNG_IHDR)) - { - LUPNG_WARN(info,"PNG: malformed PNG file!"); - return PNG_ERROR; - } - - if (info->colorType == PNG_PALETTED && !(info->chunksFound & PNG_PLTE)) - { - LUPNG_WARN(info,"PNG: palette required but missing!"); - return PNG_ERROR; - } - - info->chunksFound |= PNG_IDAT; - info->stream.next_in = (unsigned char *)chunk->data; - info->stream.avail_in = chunk->length; - do - { - size_t decompressed; - size_t i; - - info->stream.next_out = filtered; - info->stream.avail_out = BUF_SIZE; - status = inflate(&(info->stream), Z_NO_FLUSH); - decompressed = BUF_SIZE - info->stream.avail_out; - - if (status != Z_OK && - status != Z_STREAM_END && - status != Z_BUF_ERROR && - status != Z_NEED_DICT) - { - LUPNG_WARN(info, "PNG: inflate error!"); - return PNG_ERROR; - } - - for (i = 0; - i < decompressed && info->currentCol < info->width && info->currentRow < info->height; - ++i) - { - if (info->currentCol < 0) - { - info->currentCol = startingCol[info->interlacePass]; - info->currentFilter = filtered[i]; - } - else - { - uint8_t rawByte = 0; - uint8_t fullBytes[8] = {0}; - switch (info->currentFilter) - { - case PNG_FILTER_NONE: - rawByte = filtered[i]; - break; - case PNG_FILTER_SUB: - rawByte = deSub(info, filtered[i]); - break; - case PNG_FILTER_UP: - rawByte = deUp(info, filtered[i]); - break; - case PNG_FILTER_AVERAGE: - rawByte = deAverage(info, filtered[i]); - break; - case PNG_FILTER_PAETH: - rawByte = dePaeth(info, filtered[i]); - break; - default: - break; - } - - info->currentScanline[info->currentByte] = rawByte; - ++info->currentByte; - - if (info->depth < 8) - { - int j; - stretchBits(rawByte, fullBytes, info->depth); - for (j = 0; j < 8/info->depth; ++j) - if(insertByte(info, fullBytes[j])) - break; - } - else - insertByte(info, rawByte); - } - } - } while ((info->stream.avail_in > 0 || info->stream.avail_out == 0) - && info->currentCol < info->width && info->currentRow < info->height); - - return PNG_OK; -} - -static LU_INLINE PngChunk 
*readChunk(PngInfoStruct *info) -{ - PngChunk *chunk = (PngChunk *)info->userCtx->allocProc(sizeof(PngChunk),info->userCtx->allocProcUserPtr); - size_t read = 0; - if (!chunk) - { - LUPNG_WARN(info,"PNG: memory allocation failed!"); - return NULL; - } - - info->userCtx->readProc((void *)&chunk->length, 4, 1, info->userCtx->readProcUserPtr); - chunk->length = swap32(chunk->length); - if (chunk->length+4 < chunk->length) - { - LUPNG_WARN(info, "PNG: chunk claims to be absurdly large"); - info->userCtx->freeProc(chunk, info->userCtx->freeProcUserPtr); - return NULL; - } - - // Store chunk type and contents in the same buffer for convenience - chunk->type = (uint8_t *)info->userCtx->allocProc(chunk->length + 4, info->userCtx->allocProcUserPtr); - if (!chunk->type) - { - LUPNG_WARN(info,"PNG: memory allocation failed!"); - info->userCtx->freeProc(chunk, info->userCtx->freeProcUserPtr); - return NULL; - } - chunk->data = chunk->type + 4; - info->userCtx->readProc((void *)chunk->type, 1, chunk->length + 4, info->userCtx->readProcUserPtr); - read = info->userCtx->readProc((void *)&chunk->crc, 4, 1, info->userCtx->readProcUserPtr); - chunk->crc = swap32(chunk->crc); - - for (int i = 0; i < 4; ++i) - { - char byte = chunk->type[i]; - if ((byte < 'a' || byte > 'z') && (byte < 'A' || byte > 'Z')) - { - LUPNG_WARN(info, "PNG: invalid chunk name, possibly unprintable"); - releaseChunk(chunk, info->userCtx); - return NULL; - } - } - if (read != 1) - { - LUPNG_WARN(info, "PNG: read error"); - releaseChunk(chunk, info->userCtx); - return NULL; - } - - if (crc(chunk->type, chunk->length+4) != chunk->crc) - { - LUPNG_WARN(info, "PNG: CRC mismatch in \'%.4s\' chunk", (char *)chunk->type); - releaseChunk(chunk, info->userCtx); - return NULL; - } - - return chunk; -} - -static LU_INLINE int handleChunk(PngInfoStruct *info, PngChunk *chunk) -{ - /* critical chunk */ - if (!(chunk->type[0] & 0x20)) - { - if (bytesEqual(chunk->type, (const uint8_t *)"IHDR", 4)) - return parseIhdr(info, 
chunk); - if (bytesEqual(chunk->type, (const uint8_t *)"PLTE", 4)) - return parsePlte(info, chunk); - if (bytesEqual(chunk->type, (const uint8_t *)"IDAT", 4)) - return parseIdat(info, chunk); - if (bytesEqual(chunk->type, (const uint8_t *)"IEND", 4)) - { - info->chunksFound |= PNG_IEND; - if (!(info->chunksFound & PNG_IDAT)) - { - LUPNG_WARN(info, "PNG: no IDAT chunk found"); - return PNG_ERROR; - } - return PNG_DONE; - } - } - /* ignore ancillary chunks for now */ - - return PNG_OK; -} - -LuImage *luPngReadUC(const LuUserContext *userCtx) -{ - - uint8_t signature[PNG_SIG_SIZE]; - int status = PNG_ERROR; - - PngInfoStruct info; - memset(&info, 0, sizeof(PngInfoStruct)); - info.userCtx = userCtx; - - if (!userCtx->skipSig) - { - info.userCtx->readProc((void *)signature, 1, PNG_SIG_SIZE, info.userCtx->readProcUserPtr); - status = bytesEqual(signature, PNG_SIG, PNG_SIG_SIZE) ? PNG_OK : PNG_ERROR; - } - - if (status == PNG_OK) - { - PngChunk *chunk; - while ((chunk = readChunk(&info))) - { - status = handleChunk(&info, chunk); - releaseChunk(chunk, info.userCtx); - - if (status != PNG_OK) - break; - } - } - else - LUPNG_WARN(&info, "PNG: invalid header"); - - userCtx->freeProc(info.currentScanline, userCtx->freeProcUserPtr); - userCtx->freeProc(info.previousScanline, userCtx->freeProcUserPtr); - userCtx->freeProc(info.palette, userCtx->freeProcUserPtr); - inflateEnd(&info.stream); - - if (status == PNG_DONE) - return info.img; - else - if (info.img) - luImageRelease(info.img, info.userCtx); - - return NULL; -} - -LuImage *luPngRead(PngReadProc readProc, void *userPtr, int skipSig) -{ - LuUserContext userCtx; - - luUserContextInitDefault(&userCtx); - userCtx.readProc = readProc; - userCtx.readProcUserPtr = userPtr; - userCtx.skipSig = skipSig; - return luPngReadUC(&userCtx); -} - -LuImage *luPngReadFile(const char *filename, LuUserContext *userCtx) -{ - LuUserContext tmp_userCtx; - if (userCtx == NULL) { - luUserContextInitDefault(&tmp_userCtx); - userCtx = 
&tmp_userCtx; - } - - LuImage *img; - FILE *f = fopen(filename,"rb"); - - if (f) { - userCtx->readProc = internalFread; - userCtx->readProcUserPtr = f; - img = luPngReadUC(userCtx); - fclose(f); - } else { - LUPNG_WARN_UC(userCtx, "PNG: failed to open '%s'", filename); - img = NULL; - } - - return img; -} - -static LU_INLINE int writeIhdr(PngInfoStruct *info) -{ - static uint8_t buf[17]; - static const uint8_t colorType[] = { - PNG_GRAYSCALE, - PNG_GRAYSCALE_ALPHA, - PNG_TRUECOLOR, - PNG_TRUECOLOR_ALPHA - }; - size_t written = 0; - PngChunk c; - - if (info->cimg->channels > 4) - { - LUPNG_WARN(info, "PNG: too many channels in image"); - return PNG_ERROR; - } - - c.length = swap32(13); - c.type = buf; /* 4 (type) + 4 + 4 + 5x1 */ - c.data = c.type + 4; - - memcpy((void *)c.type, (void *)"IHDR", 4); - *(uint32_t *)(c.data) = swap32((uint32_t)info->cimg->width); - *(uint32_t *)(c.data + 4) = swap32((uint32_t)info->cimg->height); - *(c.data + 8) = info->cimg->depth; - *(c.data + 9) = colorType[info->cimg->channels-1]; - *(c.data + 10) = 0; /* compression method */ - *(c.data + 11) = 0; /* filter method */ - *(c.data + 12) = 0; /* interlace method: none */ - - c.crc = swap32(crc(c.type, 17)); - - written += info->userCtx->writeProc((void *)&c.length, 4, 1, info->userCtx->writeProcUserPtr) * 4; - written += info->userCtx->writeProc((void *)c.type, 1, 4, info->userCtx->writeProcUserPtr); - written += info->userCtx->writeProc((void *)c.data, 1, 13, info->userCtx->writeProcUserPtr); - written += info->userCtx->writeProc((void *)&c.crc, 4, 1, info->userCtx->writeProcUserPtr) * 4; - - if (written != 25) - { - LUPNG_WARN(info, "PNG: write error"); - return PNG_ERROR; - } - - return PNG_OK; -} - -static LU_INLINE int writeIdat(PngInfoStruct *info, uint8_t *buf, size_t buflen) -{ - size_t written = 0; - PngChunk c; - - c.length = swap32((uint32_t)(buflen-4)); - c.crc = swap32(crc(buf, buflen)); - - written += info->userCtx->writeProc((void *)&c.length, 4, 1, 
info->userCtx->writeProcUserPtr) * 4; - written += info->userCtx->writeProc((void *)buf, 1, buflen, info->userCtx->writeProcUserPtr); - written += info->userCtx->writeProc((void *)&c.crc, 4, 1, info->userCtx->writeProcUserPtr) * 4; - - if (written != buflen+8) - { - LUPNG_WARN(info, "PNG: write error"); - return PNG_ERROR; - } - - return PNG_OK; -} - -static LU_INLINE void advanceBytep(PngInfoStruct *info, int is16bit) -{ - if (is16bit) - { - if (info->currentByte%2) - --info->currentByte; - else - info->currentByte+=3; - } - else - ++info->currentByte; -} - -static LU_INLINE size_t filterScanline(PngInfoStruct *info, - uint8_t(*f)(PngInfoStruct *info), - uint8_t filter, - uint8_t *filterCandidate, - int is16bit) -{ - size_t curSum = 0; - size_t fc; - - filterCandidate[0] = filter; - for (info->currentByte = is16bit ? 1 : 0, fc = 1; - info->currentByte < info->scanlineBytes; ++fc, advanceBytep(info, is16bit) ) - { - uint8_t val = f(info); - filterCandidate[fc] = val; - curSum += val; - } - - return curSum; -} - -/* - * Processes the input image and calls writeIdat for every BUF_SIZE compressed - * bytes. 
- */ -static LU_INLINE int processPixels(PngInfoStruct *info) -{ - uint8_t idatBuf[BUF_SIZE+4] = {'I', 'D', 'A', 'T'}; - uint8_t *compressed = idatBuf+4; - uint8_t *filterCandidate = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes+1, info->userCtx->allocProcUserPtr); - uint8_t *bestCandidate = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes+1, info->userCtx->allocProcUserPtr); - size_t minSum = (size_t)-1, curSum = 0; - int status = Z_OK; - int is16bit = info->cimg->depth == 16; - - if (!filterCandidate || !bestCandidate) - { - LUPNG_WARN(info, "PNG: memory allocation failed!"); - } - - memset(&(info->stream), 0, sizeof(info->stream)); - if(deflateInit(&(info->stream), info->userCtx->compressionLevel) != Z_OK) - { - LUPNG_WARN(info, "PNG: deflateInit failed!"); - info->userCtx->freeProc(filterCandidate, info->userCtx->freeProcUserPtr); - info->userCtx->freeProc(bestCandidate, info->userCtx->freeProcUserPtr); - return PNG_ERROR; - } - - info->stream.avail_out = BUF_SIZE; - info->stream.next_out = compressed; - - for (info->currentRow = 0; info->currentRow < info->cimg->height; - ++info->currentRow) - { - int flush = (info->currentRow < info->cimg->height-1) ? - Z_NO_FLUSH : Z_FINISH; - minSum = (size_t)-1; - - /* - * 1st time it doesn't matter, the filters never look at the previous - * scanline when processing row 0. And next time it'll be valid. - */ - info->previousScanline = info->currentScanline; - info->currentScanline = info->cimg->data + (info->currentRow*info->scanlineBytes); - - /* - * Try to choose the best filter for each scanline. - * Breaks in case of overflow, but hey it's just a heuristic. 
- */ - for (info->currentFilter = PNG_FILTER_NONE; info->currentFilter <= PNG_FILTER_PAETH; ++info->currentFilter) - { - - switch (info->currentFilter) - { - case PNG_FILTER_NONE: - curSum = filterScanline(info, none, PNG_FILTER_NONE, filterCandidate, is16bit); - break; - - case PNG_FILTER_SUB: - curSum = filterScanline(info, sub, PNG_FILTER_SUB, filterCandidate, is16bit); - break; - - case PNG_FILTER_UP: - curSum = filterScanline(info, up, PNG_FILTER_UP, filterCandidate, is16bit); - break; - - case PNG_FILTER_AVERAGE: - curSum = filterScanline(info, average, PNG_FILTER_AVERAGE, filterCandidate, is16bit); - break; - - case PNG_FILTER_PAETH: - curSum = filterScanline(info, paeth, PNG_FILTER_PAETH, filterCandidate, is16bit); - break; - - default: - break; - } - - if (curSum < minSum || !info->currentFilter) - { - uint8_t *tmp = bestCandidate; - bestCandidate = filterCandidate; - filterCandidate = tmp; - minSum = curSum; - } - } - - info->stream.avail_in = (unsigned int)info->scanlineBytes+1; - info->stream.next_in = bestCandidate; - - /* compress bestCandidate */ - do - { - status = deflate(&info->stream, flush); - - if (info->stream.avail_out < BUF_SIZE) - { - writeIdat(info, idatBuf, BUF_SIZE-info->stream.avail_out+4); - info->stream.next_out = compressed; - info->stream.avail_out = BUF_SIZE; - } - } while ((flush == Z_FINISH && status != Z_STREAM_END) - || (flush == Z_NO_FLUSH && info->stream.avail_in)); - } - - info->userCtx->freeProc(filterCandidate, info->userCtx->freeProcUserPtr); - info->userCtx->freeProc(bestCandidate, info->userCtx->freeProcUserPtr); - - return PNG_OK; -} - -static LU_INLINE int writeIend(PngInfoStruct *info) -{ - PngChunk c = { 0, (uint8_t *)"IEND", 0, 0 }; - size_t written = 0; - c.crc = swap32(crc(c.type, 4)); - - written += info->userCtx->writeProc((void *)&c.length, 4, 1, info->userCtx->writeProcUserPtr) * 4; - written += info->userCtx->writeProc((void *)c.type, 1, 4, info->userCtx->writeProcUserPtr); - written += 
info->userCtx->writeProc((void *)&c.crc, 4, 1, info->userCtx->writeProcUserPtr) * 4; - - if (written != 12) - { - LUPNG_WARN(info, "PNG: write error"); - return PNG_ERROR; - } - - return PNG_OK; -} - -int luPngWriteUC(const LuUserContext *userCtx, const LuImage *img) -{ - PngInfoStruct info; - memset(&info, 0, sizeof(PngInfoStruct)); - info.userCtx = userCtx; - info.cimg = img; - info.bytesPerPixel = (info.cimg->channels * info.cimg->depth) >> 3; - - if (info.userCtx->writeProc((void *)PNG_SIG, 1, PNG_SIG_SIZE, info.userCtx->writeProcUserPtr) != PNG_SIG_SIZE) - { - LUPNG_WARN(&info, "PNG: write error"); - return PNG_ERROR; - } - - if (writeIhdr(&info) != PNG_OK) - return PNG_ERROR; - - info.scanlineBytes = (info.cimg->depth >> 3) * info.cimg->channels * info.cimg->width; - if (processPixels(&info) != PNG_OK) - { - deflateEnd(&(info.stream)); - return PNG_ERROR; - } - - deflateEnd(&(info.stream)); - return writeIend(&info); -} - -int luPngWrite(PngWriteProc writeProc, void *userPtr, const LuImage *img) -{ - LuUserContext userCtx; - - luUserContextInitDefault(&userCtx); - userCtx.writeProc = writeProc; - userCtx.writeProcUserPtr = userPtr; - return luPngWriteUC(&userCtx, img); -} - -int luPngWriteFile(const char *filename, const LuImage *img) -{ - LuUserContext userCtx; - FILE *f; - - if (!img) - { - return PNG_ERROR; - } - - f = fopen(filename,"wb"); - luUserContextInitDefault(&userCtx); - if (f) - { - userCtx.writeProc = internalFwrite; - userCtx.writeProcUserPtr = f; - luPngWriteUC(&userCtx, img); - fclose(f); - } - else - { - LUPNG_WARN_UC(&userCtx, "PNG: failed to open '%s'", filename); - return PNG_ERROR; - } - - return PNG_OK; -} - -void luImageRelease(LuImage *img, const LuUserContext *userCtx) -{ - LuUserContext ucDefault; - - if (userCtx == NULL) - { - luUserContextInitDefault(&ucDefault); - userCtx = &ucDefault; - } - - userCtx->freeProc(img->data, userCtx->freeProcUserPtr); - if (userCtx->overrideImage != img) - userCtx->freeProc(img, 
userCtx->freeProcUserPtr); -} - -LuImage *luImageCreate(size_t width, size_t height, uint8_t channels, uint8_t depth, - uint8_t *buffer, const LuUserContext *userCtx) -{ - LuImage *img; - LuUserContext ucDefault; - - if (userCtx == NULL) { - luUserContextInitDefault(&ucDefault); - userCtx = &ucDefault; - } - - if (depth != 8 && depth != 16) - { - LUPNG_WARN_UC(userCtx,"Image: only bit depths 8 and 16 are supported!"); - return NULL; - } - if (width > 0x7FFFFFFF || height > 0x7FFFFFFF) { - LUPNG_WARN_UC(userCtx, "Image: only 32 bit signed image dimensions are supported!"); - return NULL; - } - - if (userCtx->overrideImage) - img = userCtx->overrideImage; - else - img = (LuImage *)userCtx->allocProc(sizeof(LuImage), userCtx->allocProcUserPtr); - if (!img) - return NULL; - - img->width = (int32_t)width; - img->height = (int32_t)height; - img->channels = channels; - img->depth = depth; - img->dataSize = (size_t)((depth >> 3) * width * height * channels); - if (buffer) - img->data = buffer; - else - img->data = (uint8_t *)userCtx->allocProc(img->dataSize, userCtx->allocProcUserPtr); - - if (img->data == NULL) - { - luImageRelease(img, userCtx); - return NULL; - } - - return img; -} - -uint8_t *luImageExtractBufAndRelease(LuImage *img, const LuUserContext *userCtx) -{ - uint8_t *data; - LuUserContext ucDefault; - - if (userCtx == NULL) { - luUserContextInitDefault(&ucDefault); - userCtx = &ucDefault; - } - - if (img) - { - data = img->data; - img->data = NULL; - luImageRelease(img, userCtx); - } - else - { - data = NULL; - } - - return data; -} - -void luUserContextInitDefault(LuUserContext *userCtx) -{ - userCtx->readProc=NULL; - userCtx->readProcUserPtr=NULL; - userCtx->skipSig = 0; - - userCtx->writeProc=NULL; - userCtx->writeProcUserPtr=NULL; - userCtx->compressionLevel=Z_DEFAULT_COMPRESSION; - - userCtx->allocProc=internalMalloc; - userCtx->allocProcUserPtr=NULL; - userCtx->freeProc=internalFree; - userCtx->freeProcUserPtr=NULL; - - userCtx->warnProc=internalPrintf; 
- userCtx->warnProcUserPtr=(void*)stderr; - - userCtx->overrideImage=NULL; -} \ No newline at end of file diff --git a/tests/regression/tex/lupng.h b/tests/regression/tex/lupng.h deleted file mode 100644 index 5c3f8465..00000000 --- a/tests/regression/tex/lupng.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - * The MIT License (MIT) - * - * Copyright (c) 2014 Jan Solanti - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifdef __cplusplus -extern "C" { -#endif - -#pragma once - -#if defined(_MSC_VER) && (_MSC_VER < 1600) -typedef __int8 int8_t; -typedef __int16 int16_t; -typedef __int32 int32_t; -typedef unsigned __int8 uint8_t; -typedef unsigned __int16 uint16_t; -typedef unsigned __int32 uint32_t; -#else -#include -#include -#endif - -typedef struct { - int32_t width; - int32_t height; - uint8_t channels; - uint8_t depth; /* must be 8 or 16 */ - size_t dataSize; - uint8_t *data; -} LuImage; - -typedef size_t (*PngReadProc)(void *outPtr, size_t size, size_t count, void *userPtr); -typedef size_t (*PngWriteProc)(const void *inPtr, size_t size, size_t count, void *userPtr); -typedef void* (*PngAllocProc)(size_t size, void *userPtr); -typedef void (*PngFreeProc)(void *ptr, void *userPtr); -typedef void (*PngWarnProc)(void *userPtr, const char *fmt, ...); - -typedef struct { - /* loader */ - PngReadProc readProc; - void *readProcUserPtr; - int skipSig; - - /* writer */ - PngWriteProc writeProc; - void *writeProcUserPtr; - int compressionLevel; - - /* memory allocation */ - PngAllocProc allocProc; - void *allocProcUserPtr; - PngFreeProc freeProc; - void *freeProcUserPtr; - - /* warnings/error output */ - PngWarnProc warnProc; /* set to NULL to disable output altogether */ - void *warnProcUserPtr; - - /* special case: avoid allocating a LuImage when loading or creating - * an image, just use this one */ - LuImage *overrideImage; -} LuUserContext; - -/** - * Initializes a LuUserContext to use the defaul malloc implementation. - * - * @param userCtx the LuUserContext to initialize - */ -void luUserContextInitDefault(LuUserContext *userCtx); - -/** - * Creates a new Image object with the specified attributes. - * The data store of the Image is allocated but its contents are undefined. - * Only 8 and 16 bits deep images with 1-4 channels are supported. 
- * - * @param buffer pointer to an existing buffer (which may already contain the - * image data), or NULL to internally allocate a new buffer - * @param userCtx the user context (with the memory allocator function - * pointers to use), or NULL to use the default allocator - * (malloc). - */ -LuImage *luImageCreate(size_t width, size_t height, uint8_t channels, uint8_t depth, - uint8_t *buffer, const LuUserContext *usrCtx); - -/** - * Releases the memory associated with the given Image object. - * - * @param userCtx the user context (with the memory deallocator function - * pointers to use), or NULL to use the default deallocator - * (free). The deallocator should match the ones used for - * allocation. - */ -void luImageRelease(LuImage *img, const LuUserContext *usrCtx); - -/** - * Extracts the raw image buffer form a LuImage and releases the - * then-orphaned LuImage object. This can be used if you want to use - * the image data in your own structures. - * - * @param userCtx the user context (with the memory deallocator function - * pointers to use), or NULL to use the default deallocator - * (free). The deallocator should match the ones used for - * allocation. - */ -uint8_t *luImageExtractBufAndRelease(LuImage *img, const LuUserContext *userCtx); - -/** - * Decodes a PNG image from a file - * - * @param filename the file name (optionally with full path) to read from. - * @param userCtx the user context (with the memory allocator function - * pointers to use), or NULL to use the default allocator - * (malloc). - */ -LuImage *luPngReadFile(const char *filename, LuUserContext *userCtx); - -/** - * Decodes a PNG image with the provided read function into a LuImage struct - * - * @param readProc a function pointer to a user-defined function to use for - * reading the PNG data. 
- * @param userPtr an opaque pointer provided as an argument to readProc - * @param skipSig don't verify PNG signature - the bytes have already been - * removed from the input stream - */ -LuImage *luPngRead(PngReadProc readProc, void *userPtr, int skipSig); - -/** - * Decodes a PNG image with the provided user context into a LuImage struct - * - * @param userCtx the LuUserContext to use - */ -LuImage *luPngReadUC(const LuUserContext *userCtx); - -/** - * Encodes a LuImage struct to PNG and writes it out to a file. - * - * @param filename the file name (optionally with full path) to write to. - * Existing files will be overwritten! - * @param img the LuImage to encode - */ -int luPngWriteFile(const char *filename, const LuImage *img); - -/** - * Encodes a LuImage struct to PNG and writes it out using a user-defined write - * function. - * - * @param writeProc a function pointer to a user-defined function that will be - * used for writing the final PNG data. - * @param userPtr an opaque pointer provided as an argument to writeProc - * @param img the LuImage to encode - */ -int luPngWrite(PngWriteProc writeProc, void *userPtr, const LuImage *img); - -/** - * Encodes a LuImage struct to PNG and writes it out with the provided user - * context. 
- * - * @param userCtx the LuUserContext to use - * @param img the LuImage to encode - */ -int luPngWriteUC(const LuUserContext *userCtx, const LuImage *img); - -#ifdef __cplusplus -} -#endif \ No newline at end of file diff --git a/tests/regression/tex/main.cpp b/tests/regression/tex/main.cpp index ffdfb593..5ea47cc0 100644 --- a/tests/regression/tex/main.cpp +++ b/tests/regression/tex/main.cpp @@ -9,6 +9,8 @@ #include "common.h" #include "utils.h" +using namespace cocogfx; + #define RT_CHECK(_expr) \ do { \ int _ret = _expr; \ @@ -29,7 +31,6 @@ int filter = 0; // 0-> point, 1->bilinear, 2->trilinear float scale = 1.0f; int format = 0; bool use_sw = false; -float lod = 1.0f; // >= 1.0f ePixelFormat eformat = FORMAT_A8R8G8B8; vx_device_h device = nullptr; @@ -37,18 +38,18 @@ vx_buffer_h buffer = nullptr; static void show_usage() { std::cout << "Vortex Texture Test." << std::endl; - std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-l lod] [-z no_hw] [-h: help]" << std::endl; + std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-z no_hw] [-h: help]" << std::endl; } static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "zi:o:k:w:f:g:h?")) != -1) { + while ((c = getopt(argc, argv, "zi:o:k:w:f:g:s:h?")) != -1) { switch (c) { case 'i': - input_file = optarg; + input_file = optarg; break; case 'o': - output_file = optarg; + output_file = optarg; break; case 's': scale = std::stof(optarg, NULL); @@ -56,9 +57,6 @@ static void parse_args(int argc, char **argv) { case 'w': wrap = std::atoi(optarg); break; - case 'l': - lod = std::stof(optarg, NULL); - break; case 'z': use_sw = true; break; @@ -67,9 +65,11 @@ static void parse_args(int argc, char **argv) { switch (format) { case 0: eformat = FORMAT_A8R8G8B8; break; case 1: eformat = FORMAT_R5G6B5; break; - case 2: eformat = FORMAT_R4G4B4A4; break; - case 3: eformat = FORMAT_L8; break; - case 
4: eformat = FORMAT_A8; break; + case 2: eformat = FORMAT_A1R5G5B5; break; + case 3: eformat = FORMAT_A4R4G4B4; break; + case 4: eformat = FORMAT_A8L8; break; + case 5: eformat = FORMAT_L8; break; + case 6: eformat = FORMAT_A8; break; default: std::cout << "Error: invalid format: " << format << std::endl; exit(1); @@ -105,7 +105,9 @@ void cleanup() { int run_test(const kernel_arg_t& kernel_arg, uint32_t buf_size, uint32_t width, - uint32_t height) { + uint32_t height, + uint32_t bpp) { + (void)bpp; auto time_start = std::chrono::high_resolution_clock::now(); // start device @@ -132,7 +134,7 @@ int run_test(const kernel_arg_t& kernel_arg, // save output image std::cout << "save output image" << std::endl; - //dump_image(dst_pixels, width, height, bpp); + //dump_image(dst_pixels, width, height, bpp); RT_CHECK(SaveImage(output_file, FORMAT_A8R8G8B8, dst_pixels, width, height)); return 0; @@ -151,11 +153,9 @@ int main(int argc, char *argv[]) { { std::vector staging; RT_CHECK(LoadImage(input_file, eformat, staging, &src_width, &src_height)); - - RT_CHECK(GenerateMipmaps(src_pixels, mip_offsets, staging, eformat, src_width, src_height)); - - //uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel; - //dump_image(src_pixels, src_pixels.size() / src_bpp, 1, src_bpp); + uint32_t src_bpp = GetInfo(eformat).BytePerPixel; + //dump_image(staging, src_width, src_height, src_bpp); + RT_CHECK(GenerateMipmaps(src_pixels, mip_offsets, staging, eformat, src_width, src_height, src_width * src_bpp)); } // check power of two support @@ -167,12 +167,6 @@ int main(int argc, char *argv[]) { uint32_t src_logwidth = log2ceil(src_width); uint32_t src_logheight = log2ceil(src_height); - uint32_t src_max_lod = std::max(src_logwidth, src_logheight); - if (lod > src_max_lod) { - std::cout << "Error: out-of-bound level-of-detail: lod=" << lod << ", source image=" << src_max_lod << std::endl; - return -1; - } - uint32_t src_bufsize = src_pixels.size(); uint32_t dst_width = (uint32_t)(src_width * 
scale); @@ -227,7 +221,6 @@ int main(int argc, char *argv[]) { kernel_arg.src_logwidth = src_logwidth; kernel_arg.src_logheight = src_logheight; kernel_arg.src_addr = src_addr; - kernel_arg.lod = lod; for (uint32_t i = 0; i < mip_offsets.size(); ++i) { assert(i < TEX_LOD_MAX); @@ -267,7 +260,7 @@ int main(int argc, char *argv[]) { // run tests std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, dst_bufsize, dst_width, dst_height)); + RT_CHECK(run_test(kernel_arg, dst_bufsize, dst_width, dst_height, dst_bpp)); // cleanup std::cout << "cleanup" << std::endl; diff --git a/tests/regression/tex/surfacedesc.h b/tests/regression/tex/surfacedesc.h deleted file mode 100644 index cf303584..00000000 --- a/tests/regression/tex/surfacedesc.h +++ /dev/null @@ -1,25 +0,0 @@ -// -// Copyright (c) Blaise Tine. All rights reserved. -// -// -// Use of this sample source code is subject to the terms of the Microsoft -// license agreement under which you licensed this sample source code. If -// you did not accept the terms of the license agreement, you are not -// authorized to use this sample source code. For the terms of the license, -// please see the license agreement between you and Microsoft or, if applicable, -// see the LICENSE.RTF on your install media or the root of your tools -// installation. -// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR -// INDEMNITIES. 
-// -#pragma once - -#include "format.h" - -struct SurfaceDesc { - ePixelFormat Format; - uint8_t *pBits; - uint32_t Width; - uint32_t Height; - uint32_t Pitch; -}; \ No newline at end of file diff --git a/tests/regression/tex/texsw.h b/tests/regression/tex/texsw.h index c9961ab8..2eecb079 100644 --- a/tests/regression/tex/texsw.h +++ b/tests/regression/tex/texsw.h @@ -4,14 +4,30 @@ #include #include "common.h" -inline uint32_t texel_read(uint8_t* address, uint32_t stride) { +using namespace cocogfx; + +inline void texel_read(uint32_t* texels, + uint8_t** addresses, + uint32_t count, + uint32_t stride) { switch (stride) { - case 1: return *(uint8_t*)address; - case 2: return *(uint16_t*)address; - case 4: return *(uint32_t*)address; + case 1: + for (uint32_t i = 0; i < count; ++i) { + texels[i] = *(uint8_t*)addresses[i]; + } + break; + case 2: + for (uint32_t i = 0; i < count; ++i) { + texels[i] = *(uint16_t*)addresses[i]; + } + break; + case 4: + for (uint32_t i = 0; i < count; ++i) { + texels[i] = *(uint32_t*)addresses[i]; + } + break; default: std::abort(); - return 0; } } @@ -34,32 +50,35 @@ inline uint32_t vx_tex_sw(kernel_arg_t* state, // addressing uint32_t offset00, offset01, offset10, offset11; uint32_t alpha, beta; + uint8_t* addr[4]; + uint32_t texel[4]; + TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, &offset00, &offset01, &offset10, &offset11, &alpha, &beta); - uint8_t* addr00 = base_addr + offset00 * stride; - uint8_t* addr01 = base_addr + offset01 * stride; - uint8_t* addr10 = base_addr + offset10 * stride; - uint8_t* addr11 = base_addr + offset11 * stride; + addr[0] = base_addr + offset00 * stride; + addr[1] = base_addr + offset01 * stride; + addr[2] = base_addr + offset10 * stride; + addr[3] = base_addr + offset11 * stride; - // memory lookup - uint32_t texel00 = texel_read(addr00, stride); - uint32_t texel01 = texel_read(addr01, stride); - uint32_t texel10 = texel_read(addr10, stride); - uint32_t texel11 = texel_read(addr11, 
stride); + // memory fetch + texel_read(texel, addr, 4, stride); // filtering color = TexFilterLinear( - format, texel00, texel01, texel10, texel11, alpha, beta); + format, texel[0], texel[1], texel[2], texel[3], alpha, beta); } else { // addressing uint32_t offset; + uint8_t* addr; + uint32_t texel; + TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset); - uint8_t* addr = base_addr + offset * stride; + addr = base_addr + offset * stride; - // memory lookup - uint32_t texel = texel_read(addr, stride); + // memory fetch + texel_read(&texel, &addr, 1, stride); // filtering color = TexFilterPoint(format, texel); @@ -67,56 +86,40 @@ inline uint32_t vx_tex_sw(kernel_arg_t* state, return color; } -inline uint32_t tex_load_hw(kernel_arg_t* state, - Fixed xu, - Fixed xv, - Fixed<16> xlod) { +inline uint32_t tex_load(kernel_arg_t* state, + Fixed xu, + Fixed xv, + Fixed<16> xj) { uint32_t color; - int32_t ilod = std::max(xlod.data(), Fixed<16>::ONE); - uint32_t lod = std::min(log2floor(ilod) - 16, TEX_LOD_MAX); + uint32_t j = std::max(xj.data(), Fixed<16>::ONE); + uint32_t l = std::min(log2floor(j) - 16, TEX_LOD_MAX); if (state->filter == 2) { - uint32_t lod_n = std::min(lod + 1, TEX_LOD_MAX); - uint32_t frac = ilod >> (lod + 16 - 8); - uint32_t texel0 = vx_tex(0, xu.data(), xv.data(), lod); - uint32_t texel1 = vx_tex(0, xu.data(), xv.data(), lod_n); + uint32_t ln = std::min(l + 1, TEX_LOD_MAX); + uint32_t f = (j - (1 << (l + 16))) >> (l + 16 - 8); + uint32_t texel0, texel1; + if (state->use_sw) { + texel0 = vx_tex_sw(state, xu, xv, l); + texel1 = vx_tex_sw(state, xu, xv, ln); + } else { + texel0 = vx_tex(0, xu.data(), xv.data(), l); + texel1 = vx_tex(0, xu.data(), xv.data(), ln); + } uint32_t cl, ch; { - uint32_t c0l, c0h; - uint32_t c1l, c1h; - Unpack8888(TexFormat::R8G8B8A8, texel0, &c0l, &c0h); - Unpack8888(TexFormat::R8G8B8A8, texel1, &c1l, &c1h); - Lerp8888(c0l, c0h, c1l, c1h, frac, &cl, &ch); + uint32_t c0l, c0h, c1l, c1h; + Unpack8888(texel0, &c0l, 
&c0h); + Unpack8888(texel1, &c1l, &c1h); + cl = Lerp8888(c0l, c1l, f); + ch = Lerp8888(c0h, c1h, f); } - color = Pack8888(TexFormat::R8G8B8A8, cl, ch); + color = Pack8888(cl, ch); + //vx_printf("j=0x%x, l=%d, ln=%d, f=%d, texel0=0x%x, texel1=0x%x, color=0x%x\n", j, l, ln, f, texel0, texel1, color); } else { - color = vx_tex(0, xu.data(), xv.data(), lod); - } - return color; -} - -inline uint32_t tex_load_sw(kernel_arg_t* state, - Fixed xu, - Fixed xv, - Fixed<16> xlod) { - uint32_t color; - int32_t ilod = std::max(xlod.data(), Fixed<16>::ONE); - uint32_t lod = std::min(log2floor(ilod) - 16, TEX_LOD_MAX); - if (state->filter == 2) { - uint32_t lod_n = std::min(lod + 1, TEX_LOD_MAX); - uint32_t frac = ilod >> (lod + 16 - 8); - uint32_t texel0 = vx_tex_sw(state, xu, xv, lod); - uint32_t texel1 = vx_tex_sw(state, xu, xv, lod_n); - uint32_t cl, ch; - { - uint32_t c0l, c0h; - uint32_t c1l, c1h; - Unpack8888(TexFormat::R8G8B8A8, texel0, &c0l, &c0h); - Unpack8888(TexFormat::R8G8B8A8, texel1, &c1l, &c1h); - Lerp8888(c0l, c0h, c1l, c1h, frac, &cl, &ch); + if (state->use_sw) { + color = vx_tex_sw(state, xu, xv, l); + } else { + color = vx_tex(0, xu.data(), xv.data(), l); } - color = Pack8888(TexFormat::R8G8B8A8, cl, ch); - } else { - color = vx_tex_sw(state, xu, xv, lod); } return color; } \ No newline at end of file diff --git a/tests/regression/tex/tga.cpp b/tests/regression/tex/tga.cpp deleted file mode 100644 index 62641587..00000000 --- a/tests/regression/tex/tga.cpp +++ /dev/null @@ -1,122 +0,0 @@ -#include "tga.h" -#include -#include -#include "format.h" - -struct __attribute__((__packed__)) tga_header_t { - int8_t idlength; - int8_t colormaptype; - int8_t imagetype; - int16_t colormaporigin; - int16_t colormaplength; - int8_t colormapdepth; - int16_t xoffset; - int16_t yoffset; - int16_t width; - int16_t height; - int8_t bitsperpixel; - int8_t imagedescriptor; -}; - -int LoadTGA(const char *filename, - std::vector &pixels, - uint32_t *width, - uint32_t *height, - 
uint32_t *bpp) { - std::ifstream ifs(filename, std::ios::in | std::ios::binary); - if (!ifs.is_open()) { - std::cerr << "couldn't open file: " << filename << "!" << std::endl; - return -1; - } - - tga_header_t header; - ifs.read(reinterpret_cast(&header), sizeof(tga_header_t)); - if (ifs.fail()) { - std::cerr << "invalid TGA file header!" << std::endl; - return -1; - } - - if (header.imagetype != 2) { - std::cerr << "unsupported TGA encoding format!" << std::endl; - return -1; - } - - ifs.seekg(header.idlength, std::ios::cur); // skip string - if (ifs.fail()) { - std::cerr << "invalid TGA file!" << std::endl; - return -1; - } - - switch (header.bitsperpixel) { - case 16: - case 24: - case 32: { - // Read pixels data - auto stride = header.bitsperpixel / 8; - pixels.resize(stride * header.width * header.height); - ifs.read((char*)pixels.data(), pixels.size()); - if (ifs.fail()) { - std::cerr << "invalid TGA file!" << std::endl; - return -1; - } - *bpp = stride; - break; - } - default: - std::cerr << "unsupported TGA bitsperpixel!" << std::endl; - return -1; - } - - *width = header.width; - *height = header.height; - - return 0; -} - -int SaveTGA(const char *filename, - const std::vector &pixels, - uint32_t width, - uint32_t height, - uint32_t bpp) { - std::ofstream ofs(filename, std::ios::out | std::ios::binary); - if (!ofs.is_open()) { - std::cerr << "couldn't create file: " << filename << "!" << std::endl; - return -1; - } - - if (bpp < 2 || bpp > 4) { - std::cerr << "unsupported pixel stride: " << bpp << "!" 
<< std::endl; - return -1; - } - - tga_header_t header; - header.idlength = 0; - header.colormaptype = 0; // no palette - header.imagetype = 2; // color mapped data - header.colormaporigin = 0; - header.colormaplength = 0; - header.colormapdepth = 0; - header.xoffset = 0; - header.yoffset = 0; - header.width = width; - header.height = height; - header.bitsperpixel = bpp * 8; - header.imagedescriptor = 0; - - // write header - ofs.write(reinterpret_cast(&header), sizeof(tga_header_t)); - - // write pixel data - uint32_t pitch = bpp * width; - const uint8_t* pixel_bytes = pixels.data() + (height - 1) * pitch; - for (uint32_t y = 0; y < height; ++y) { - const uint8_t* pixel_row = pixel_bytes; - for (uint32_t x = 0; x < width; ++x) { - ofs.write((const char*)pixel_row, bpp); - pixel_row += bpp; - } - pixel_bytes -= pitch; - } - - return 0; -} \ No newline at end of file diff --git a/tests/regression/tex/tga.h b/tests/regression/tex/tga.h deleted file mode 100644 index 24b92a75..00000000 --- a/tests/regression/tex/tga.h +++ /dev/null @@ -1,14 +0,0 @@ -#include -#include - -int LoadTGA(const char *filename, - std::vector &pixels, - uint32_t *width, - uint32_t *height, - uint32_t *bpp); - -int SaveTGA(const char *filename, - const std::vector &pixels, - uint32_t width, - uint32_t height, - uint32_t bpp); \ No newline at end of file diff --git a/tests/regression/tex/utils.cpp b/tests/regression/tex/utils.cpp index 81a47158..e76b72f9 100644 --- a/tests/regression/tex/utils.cpp +++ b/tests/regression/tex/utils.cpp @@ -1,10 +1,12 @@ #include "utils.h" #include -#include -#include "blitter.h" -#include "format.h" -#include "tga.h" -#include "lupng.h" +#include +#include +#include +#include +#include + +using namespace cocogfx; std::string getFileExt(const std::string& str) { auto i = str.rfind('.'); @@ -41,22 +43,9 @@ int LoadImage(const char *filename, return ret; } else if (iequals(ext, "png")) { - auto image = luPngReadFile(filename, NULL); - if (image == NULL) - return -1; 
- if (image->depth != 8 - || (image->channels != 3 - && image->channels != 4)) { - luImageRelease(image, NULL); - std::cerr << "invalid png file format!" << std::endl; - return -1; - } - pixels.resize(image->channels * image->width * image->height); - memcpy(pixels.data(), image->data, pixels.size()); - img_width = image->width; - img_height = image->height; - img_bpp = image->channels; - luImageRelease(image, NULL); + int ret = LoadPNG(filename, pixels, &img_width, &img_height, &img_bpp); + if (ret) + return ret; } else { std::cerr << "invalid file extension: " << ext << "!" << std::endl; return -1; @@ -83,7 +72,7 @@ int LoadImage(const char *filename, if (img_format != format) { // format conversion to RGBA std::vector staging; - int ret = ConvertImage(staging, pixels, img_width, img_height, img_format, format); + int ret = ConvertImage(staging, format, pixels, img_format, img_width, img_height, img_width * img_bpp); if (ret) return ret; pixels.swap(staging); @@ -100,19 +89,13 @@ int SaveImage(const char *filename, const std::vector &pixels, uint32_t width, uint32_t height) { - uint32_t bpp = Format::GetInfo(format).BytePerPixel; + uint32_t bpp = GetInfo(format).BytePerPixel; auto ext = getFileExt(filename); if (iequals(ext, "tga")) { return SaveTGA(filename, pixels, width, height, bpp); } else if (iequals(ext, "png")) { - LuImage image; - image.width = width; - image.height = height; - image.depth = 8; - image.channels = bpp; - image.data = (uint8_t*)pixels.data(); - return luPngWriteFile(filename, &image); + return SavePNG(filename, pixels, width, height, bpp); } else { std::cerr << "invalid file extension: " << ext << "!" 
<< std::endl; return -1; @@ -132,171 +115,8 @@ void dump_image(const std::vector& pixels, uint32_t width, uint32_t hei pixel32 |= pixel8 << (b * 8); } if (x) std::cout << ", "; - std::cout << std::hex << pixel32; + std::cout << std::hex << std::setw(bpp * 2) << std::setfill('0') << pixel32; } std::cout << std::endl; } -} - -int CopyBuffers(SurfaceDesc &dstDesc, - int32_t dstOffsetX, - int32_t dstOffsetY, - uint32_t copyWidth, - uint32_t copyHeight, - const SurfaceDesc &srcDesc, - int32_t srcOffsetX, - int32_t srcOffsetY) { - - static const BlitTable s_blitTable; - - if ((srcOffsetX >= (int32_t)srcDesc.Width) || (srcOffsetY >= (int32_t)srcDesc.Height) || - (dstOffsetX >= (int32_t)dstDesc.Width) || (dstOffsetY >= (int32_t)dstDesc.Height)) { - return -1; - } - - if (copyWidth > dstDesc.Width) { - copyWidth = dstDesc.Width; - } - - if (copyWidth > srcDesc.Width) { - copyWidth = srcDesc.Width; - } - - if (copyHeight > dstDesc.Height) { - copyHeight = dstDesc.Height; - } - - if (copyHeight > srcDesc.Height) { - copyHeight = srcDesc.Height; - } - - return s_blitTable.get(srcDesc.Format, dstDesc.Format)( - dstDesc, dstOffsetX, dstOffsetY, copyWidth, copyHeight, srcDesc, - srcOffsetX, srcOffsetY); -} - -int ConvertImage(std::vector& dst_pixels, - const std::vector& src_pixels, - uint32_t width, - uint32_t height, - ePixelFormat src_format, - ePixelFormat dst_format) { - - uint32_t src_pitch = Format::GetInfo(src_format).BytePerPixel * width; - uint32_t dst_pitch = Format::GetInfo(dst_format).BytePerPixel * width; - - dst_pixels.resize(dst_pitch * height); - - SurfaceDesc srcDesc{src_format, (uint8_t*)src_pixels.data(), width, height, src_pitch}; - SurfaceDesc dstDesc{dst_format, dst_pixels.data(), width, height, dst_pitch}; - - return CopyBuffers(dstDesc, 0, 0, width, height, srcDesc, 0, 0); -} - - - -int GenerateMipmaps(std::vector& dst_pixels, - std::vector& mip_offsets, - const std::vector& src_pixels, - ePixelFormat format, - uint32_t src_width, - uint32_t src_height) { 
- std::vector src_staging, dst_staging; - const std::vector *pSrcPixels; - std::vector *pDstPixels; - - // convert source image if needed - bool need_conversion = (format != FORMAT_A8R8G8B8); - if (need_conversion) { - ConvertImage(src_staging, src_pixels, src_width, src_height, format, FORMAT_A8R8G8B8); - pSrcPixels = &src_staging; - pDstPixels = &dst_staging; - } else { - pSrcPixels = &src_pixels; - pDstPixels = &dst_pixels; - } - - uint32_t src_logwidth = log2ceil(src_width); - uint32_t src_logheight = log2ceil(src_height); - uint32_t max_lod = std::max(src_logwidth, src_logheight) + 1; - - mip_offsets.resize(max_lod); - - // Calculate mipmaps buffer size - uint32_t dst_height = 1; - uint32_t dst_width = 0; - for (uint32_t lod = 0, w = src_width, h = src_height; lod < max_lod; ++lod) { - assert((w > 0) || (w > 0)); - uint32_t pw = std::max(w, 1); - uint32_t ph = std::max(h, 1); - mip_offsets.at(lod) = dst_width; - dst_width += pw * ph; - w >>= 1; - h >>= 1; - } - - // allocate mipmap - pDstPixels->resize(dst_width * 4); - - // generate mipmaps - { - auto pSrc = reinterpret_cast(pSrcPixels->data()); - auto pDst = reinterpret_cast(pDstPixels->data()); - - // copy level 0 - memcpy(pDst, pSrc, pSrcPixels->size()); - assert(pSrcPixels->size() == 4 * src_width * src_height); - pSrc = pDst; - pDst += src_width * src_height; - - // copy lower levels - for (uint32_t lod = 1, w = (src_width/2), h = (src_height/2); lod < max_lod;) { - assert((w > 0) || (w > 0)); - uint32_t pw = std::max(w, 1); - uint32_t ph = std::max(h, 1); - for (uint32_t y = 0; y < pw; ++y) { - auto v0 = 2 * y; - auto v1 = 2 * y + ((ph > 1) ? 1 : 0); - auto pSrc0 = pSrc + v0 * (2 * pw); - auto pSrc1 = pSrc + v1 * (2 * pw); - - for (uint32_t x = 0; x 1) ? 
1 : 0); - - auto c00 = Format::ConvertFrom(pSrc0 + u0); - auto c01 = Format::ConvertFrom(pSrc0 + u1); - auto c10 = Format::ConvertFrom(pSrc1 + u0); - auto c11 = Format::ConvertFrom(pSrc1 + u1); - - const ColorARGB color((c00.a + c01.a + c10.a + c11.a+2) >> 2, - (c00.r + c01.r + c10.r + c11.r+2) >> 2, - (c00.g + c01.g + c10.g + c11.g+2) >> 2, - (c00.b + c01.b + c10.b + c11.b+2) >> 2); - - uint32_t ncolor; - Format::ConvertTo(&ncolor, color); - pDst[x + y * pw] = ncolor; - } - } - ++lod; - pSrc = pDst; - pDst += pw * ph; - w >>= 1; - h >>= 1; - } - assert((pDst - reinterpret_cast(pDstPixels->data())) == dst_width); - } - - // convert destination image if needed - if (need_conversion) { - ConvertImage(dst_staging, dst_staging, dst_width, dst_height, FORMAT_A8R8G8B8, format); - } - - uint32_t bpp = Format::GetInfo(format).BytePerPixel; - for (auto& offset : mip_offsets) { - offset *= bpp; - } - - return 0; } \ No newline at end of file diff --git a/tests/regression/tex/utils.h b/tests/regression/tex/utils.h index 7ce58941..a3ffccae 100644 --- a/tests/regression/tex/utils.h +++ b/tests/regression/tex/utils.h @@ -1,44 +1,21 @@ #include #include -#include +#include +#include #include -#include "surfacedesc.h" int LoadImage(const char *filename, - ePixelFormat format, + cocogfx::ePixelFormat format, std::vector &pixels, uint32_t *width, uint32_t *height); int SaveImage(const char *filename, - ePixelFormat format, + cocogfx::ePixelFormat format, const std::vector &pixels, uint32_t width, uint32_t height); -int CopyBuffers(SurfaceDesc &dstDesc, - int32_t dstOffsetX, - int32_t dstOffsetY, - uint32_t copyWidth, - uint32_t copyHeight, - const SurfaceDesc &srcDesc, - int32_t srcOffsetX, - int32_t srcOffsetY); - -int ConvertImage(std::vector& dst_pixels, - const std::vector& src_pixels, - uint32_t width, - uint32_t height, - ePixelFormat src_format, - ePixelFormat dst_format); - -int GenerateMipmaps(std::vector& dst_pixels, - std::vector& mip_offsets, - const std::vector& 
src_pixels, - ePixelFormat format, - uint32_t src_width, - uint32_t src_height); - void dump_image(const std::vector& pixels, uint32_t width, uint32_t height, diff --git a/third_party/Makefile b/third_party/Makefile new file mode 100644 index 00000000..8a9ed890 --- /dev/null +++ b/third_party/Makefile @@ -0,0 +1,15 @@ +all: fpnew cocogfx softfloat + +fpnew: + +cocogfx: + $(MAKE) -C cocogfx + +softfloat: + SPECIALIZE_TYPE=RISCV SOFTFLOAT_OPTS="-fPIC -DSOFTFLOAT_ROUND_ODD -DINLINE_LEVEL=5 -DSOFTFLOAT_FAST_DIV32TO16 -DSOFTFLOAT_FAST_DIV64TO32" $(MAKE) -C softfloat/build/Linux-x86_64-GCC + +clean: + $(MAKE) clean -C cocogfx + $(MAKE) -C softfloat/build/Linux-x86_64-GCC clean + +.PHONY: all fpnew cocogfx softfloat \ No newline at end of file From 41d7e6c63afd8a90f69559a22426ca71e98a6c12 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Nov 2021 07:08:15 -0500 Subject: [PATCH 11/27] cummulative fixes, RTL uuid trace, texture unit fixes, simx timing fixes --- ci/blackbox.sh | 12 +- ci/regression.sh | 9 + driver/common/vx_utils.cpp | 81 ++-- hw/rtl/VX_alu_unit.sv | 16 +- hw/rtl/VX_commit.sv | 46 +- hw/rtl/VX_config.vh | 96 ++-- hw/rtl/VX_csr_data.sv | 57 ++- hw/rtl/VX_csr_unit.sv | 21 +- hw/rtl/VX_decode.sv | 49 +- hw/rtl/VX_define.vh | 3 +- hw/rtl/VX_dispatch.sv | 30 +- hw/rtl/VX_execute.sv | 10 + hw/rtl/VX_fpu_unit.sv | 13 +- hw/rtl/VX_gpu_unit.sv | 31 +- hw/rtl/VX_ibuffer.sv | 8 +- hw/rtl/VX_icache_stage.sv | 33 +- hw/rtl/VX_issue.sv | 68 ++- hw/rtl/VX_lsu_unit.sv | 47 +- hw/rtl/VX_mem_unit.sv | 30 +- hw/rtl/VX_muldiv.sv | 35 +- hw/rtl/VX_pipeline.sv | 5 +- hw/rtl/VX_scoreboard.sv | 12 +- hw/rtl/VX_trace_instr.vh | 4 +- hw/rtl/VX_warp_sched.sv | 20 +- hw/rtl/VX_writeback.sv | 27 +- hw/rtl/afu/VX_to_mem.sv | 3 +- hw/rtl/cache/VX_bank.sv | 2 - hw/rtl/cache/VX_cache.sv | 244 +++++----- hw/rtl/cache/VX_shared_mem.sv | 25 +- hw/rtl/interfaces/VX_alu_req_if.sv | 5 +- hw/rtl/interfaces/VX_cmt_to_csr_if.sv | 9 +- hw/rtl/interfaces/VX_commit_if.sv | 3 + 
hw/rtl/interfaces/VX_csr_req_if.sv | 3 + hw/rtl/interfaces/VX_decode_if.sv | 7 +- hw/rtl/interfaces/VX_fpu_req_if.sv | 3 + hw/rtl/interfaces/VX_gpu_req_if.sv | 5 +- hw/rtl/interfaces/VX_ibuffer_if.sv | 3 + hw/rtl/interfaces/VX_ifetch_req_if.sv | 9 +- hw/rtl/interfaces/VX_ifetch_rsp_if.sv | 7 +- hw/rtl/interfaces/VX_lsu_req_if.sv | 3 + hw/rtl/interfaces/VX_perf_cache_if.sv | 6 +- hw/rtl/interfaces/VX_perf_memsys_if.sv | 24 +- hw/rtl/interfaces/VX_perf_pipeline_if.sv | 32 +- hw/rtl/interfaces/VX_perf_tex_if.sv | 23 + hw/rtl/interfaces/VX_tex_req_if.sv | 3 + hw/rtl/interfaces/VX_tex_rsp_if.sv | 3 + hw/rtl/interfaces/VX_writeback_if.sv | 3 + hw/rtl/libs/VX_axi_adapter.sv | 4 +- hw/rtl/libs/VX_index_queue.sv | 2 +- hw/rtl/libs/VX_popcount.sv | 11 +- hw/rtl/libs/VX_skid_buffer.sv | 2 +- hw/rtl/tex_unit/VX_tex_unit.sv | 147 ++++-- hw/scripts/scope.json | 76 +-- runtime/src/vx_start.S | 12 +- sim/common/mempool.h | 47 ++ sim/common/simobject.h | 254 +++++----- sim/simX/archdef.h | 38 +- sim/simX/args.h | 4 +- sim/simX/cache.cpp | 79 ++- sim/simX/cache.h | 69 ++- sim/simX/constants.h | 6 +- sim/simX/core.cpp | 594 ++++++++++++++--------- sim/simX/core.h | 85 +++- sim/simX/decode.cpp | 38 +- sim/simX/execute.cpp | 120 +++-- sim/simX/exeunit.cpp | 305 +++++++----- sim/simX/exeunit.h | 69 +-- sim/simX/memsim.cpp | 12 +- sim/simX/memsim.h | 49 +- sim/simX/pipeline.h | 42 +- sim/simX/processor.cpp | 8 +- sim/simX/scoreboard.h | 2 +- sim/simX/sharedmem.h | 93 ++++ sim/simX/tex_unit.cpp | 12 +- sim/simX/tex_unit.h | 2 +- sim/simX/types.h | 125 ++++- sim/simX/warp.cpp | 4 +- sim/simX/warp.h | 4 + tests/regression/tex/kernel.c | 17 +- 79 files changed, 2148 insertions(+), 1372 deletions(-) create mode 100644 hw/rtl/interfaces/VX_perf_tex_if.sv create mode 100644 sim/common/mempool.h create mode 100644 sim/simX/sharedmem.h diff --git a/ci/blackbox.sh b/ci/blackbox.sh index 5ba7a29a..f2c6ec2b 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -124,7 +124,17 @@ 
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_TH echo "CONFIGS=$CONFIGS" -make -C $DRIVER_PATH clean +if [ -f "blackbox.cache" ] +then + LAST_CONFIGS=`cat blackbox.cache` +fi + +if [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; +then + make -C $DRIVER_PATH clean +fi + +echo "$CONFIGS+$DEBUG+$SCOPE" > blackbox.cache status=0 diff --git a/ci/regression.sh b/ci/regression.sh index 936ca13b..2be58140 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -27,8 +27,11 @@ tex() echo "begin texture tests..." CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-isoccer.png -osoccer_result.png -g0" CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-irainbow.png -orainbow_result.png -g2" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" --perf +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-itoad.png -otoad_result.png -g1" --perf echo "coverage texture done!" } @@ -58,7 +61,9 @@ debug() echo "begin debugging tests..." ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1" ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1" echo "debugging tests done!" 
@@ -73,9 +78,13 @@ CONFIGS=-DEXT_M_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_e # disabling F extension CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext +CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf +CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf # disable shared memory CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem +CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf +CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf # using Default FPU core FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index 5b70e09b..a69df27c 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -114,11 +114,13 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t csr_stalls = 0; uint64_t alu_stalls = 0; uint64_t gpu_stalls = 0; + // PERF: decode + uint64_t loads = 0; + uint64_t stores = 0; + uint64_t branches = 0; // PERF: Icache uint64_t icache_reads = 0; uint64_t icache_read_misses = 0; - uint64_t icache_pipe_stalls = 0; - uint64_t icache_rsp_stalls = 0; // PERF: Dcache uint64_t dcache_reads = 0; uint64_t dcache_writes = 0; @@ -126,17 +128,19 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t dcache_write_misses = 0; uint64_t dcache_bank_stalls = 0; uint64_t dcache_mshr_stalls = 0; - uint64_t dcache_pipe_stalls = 0; - uint64_t dcache_rsp_stalls = 0; - // PERF: SMEM + // PERF: shared memory uint64_t smem_reads = 0; uint64_t smem_writes = 0; uint64_t smem_bank_stalls = 0; // PERF: memory uint64_t mem_reads = 0; uint64_t mem_writes = 0; - uint64_t mem_stalls = 0; uint64_t mem_lat = 0; +#ifdef EXT_TEX_ENABLE + // PERF: texunit + uint64_t tex_mem_reads = 0; + uint64_t tex_mem_lat = 0; +#endif #endif 
uint64_t num_cores; @@ -196,6 +200,20 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu unit stalls=%ld\n", core_id, gpu_stalls_per_core); gpu_stalls += gpu_stalls_per_core; + // PERF: decode + // loads + uint64_t loads_per_core = get_csr_64(staging_ptr, CSR_MPM_LOADS); + if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); + loads += loads_per_core; + // stores + uint64_t stores_per_core = get_csr_64(staging_ptr, CSR_MPM_STORES); + if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core); + stores += stores_per_core; + // branches + uint64_t branches_per_core = get_csr_64(staging_ptr, CSR_MPM_BRANCHES); + if (num_cores > 1) fprintf(stream, "PERF: core%d: branches=%ld\n", core_id, branches_per_core); + branches += branches_per_core; + // PERF: Icache // total reads uint64_t icache_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_READS); @@ -204,16 +222,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // read misses uint64_t icache_miss_r_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_MISS_R); int icache_read_hit_ratio = (int)((1.0 - (double(icache_miss_r_per_core) / double(icache_reads_per_core))) * 100); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio); icache_read_misses += icache_miss_r_per_core; - // pipeline stalls - uint64_t icache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_PIPE_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache pipeline stalls=%ld\n", core_id, icache_pipe_st_per_core); - icache_pipe_stalls += icache_pipe_st_per_core; - // response stalls - uint64_t icache_crsp_st_per_core = get_csr_64(staging_ptr, 
CSR_MPM_ICACHE_CRSP_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reponse stalls=%ld\n", core_id, icache_crsp_st_per_core); - icache_rsp_stalls += icache_crsp_st_per_core; // PERF: Dcache // total reads @@ -243,14 +253,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t dcache_mshr_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MSHR_ST); if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core); dcache_mshr_stalls += dcache_mshr_st_per_core; - // pipeline stalls - uint64_t dcache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_PIPE_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core); - dcache_pipe_stalls += dcache_pipe_st_per_core; - // response stalls - uint64_t dcache_crsp_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_CRSP_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core); - dcache_rsp_stalls += dcache_crsp_st_per_core; // PERF: SMEM // total reads @@ -270,17 +272,26 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // PERF: memory uint64_t mem_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_READS); uint64_t mem_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_WRITES); - uint64_t mem_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_ST); uint64_t mem_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_LAT); - int mem_utilization = (int)((double(mem_reads_per_core + mem_writes_per_core) / double(mem_reads_per_core + mem_writes_per_core + mem_stalls_per_core)) * 100); int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core)); if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld (reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: 
memory stalls=%ld (utilization=%d%%)\n", core_id, mem_stalls_per_core, mem_utilization); - if (num_cores > 1) fprintf(stream, "PERF: core%d: memory average latency=%d cycles\n", core_id, mem_avg_lat); + if (num_cores > 1) fprintf(stream, "PERF: core%d: memory latency=%d cycles\n", core_id, mem_avg_lat); mem_reads += mem_reads_per_core; mem_writes += mem_writes_per_core; - mem_stalls += mem_stalls_per_core; mem_lat += mem_lat_per_core; + + #ifdef EXT_TEX_ENABLE + // total reads + uint64_t tex_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_READS); + if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory reads=%ld\n", core_id, tex_reads_per_core); + tex_mem_reads += tex_reads_per_core; + + // read latency + uint64_t tex_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_LAT); + int tex_avg_lat = (int)(double(tex_lat_per_core) / double(tex_reads_per_core)); + if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory latency=%d cycles\n", core_id, tex_avg_lat); + tex_mem_lat += tex_lat_per_core; + #endif #endif } @@ -293,7 +304,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100); int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100); int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100); - int mem_utilization = (int)((double(mem_reads + mem_writes) / double(mem_reads + mem_writes + mem_stalls)) * 100); int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads)); fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); @@ -302,24 +312,27 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { fprintf(stream, "PERF: csr unit stalls=%ld\n", csr_stalls); fprintf(stream, "PERF: fpu unit stalls=%ld\n", 
fpu_stalls); fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls); + fprintf(stream, "PERF: loads=%ld\n", loads); + fprintf(stream, "PERF: stores=%ld\n", stores); + fprintf(stream, "PERF: branches=%ld\n", branches); fprintf(stream, "PERF: icache reads=%ld\n", icache_reads); fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio); - fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls); - fprintf(stream, "PERF: icache reponse stalls=%ld\n", icache_rsp_stalls); fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads); fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes); fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio); fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio); fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization); fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls); - fprintf(stream, "PERF: dcache pipeline stalls=%ld\n", dcache_pipe_stalls); - fprintf(stream, "PERF: dcache reponse stalls=%ld\n", dcache_rsp_stalls); fprintf(stream, "PERF: smem reads=%ld\n", smem_reads); fprintf(stream, "PERF: smem writes=%ld\n", smem_writes); fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization); fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); - fprintf(stream, "PERF: memory stalls=%ld (utilization=%d%%)\n", mem_stalls, mem_utilization); fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat); +#ifdef EXT_TEX_ENABLE + int tex_avg_lat = (int)(double(tex_mem_lat) / double(tex_mem_reads)); + fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads); + fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat); +#endif #endif // release 
allocated resources diff --git a/hw/rtl/VX_alu_unit.sv b/hw/rtl/VX_alu_unit.sv index 8840f044..da20eb6d 100644 --- a/hw/rtl/VX_alu_unit.sv +++ b/hw/rtl/VX_alu_unit.sv @@ -96,6 +96,7 @@ module VX_alu_unit #( wire alu_ready_in; wire alu_valid_out; wire alu_ready_out; + wire [63:0] alu_uuid; wire [`NW_BITS-1:0] alu_wid; wire [`NUM_THREADS-1:0] alu_tmask; wire [31:0] alu_PC; @@ -112,14 +113,14 @@ module VX_alu_unit #( assign alu_ready_in = alu_ready_out || ~alu_valid_out; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (alu_ready_in), - .data_in ({alu_valid_in, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}), - .data_out ({alu_valid_out, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r}) + .data_in ({alu_valid_in, alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}), + .data_out ({alu_valid_out, alu_uuid, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r}) ); `UNUSED_VAR (br_op_r) @@ -138,6 +139,7 @@ module VX_alu_unit #( wire mul_ready_in; wire mul_valid_out; wire mul_ready_out; + wire [63:0] mul_uuid; wire [`NW_BITS-1:0] mul_wid; wire [`NUM_THREADS-1:0] mul_tmask; wire [31:0] mul_PC; @@ -153,6 +155,7 @@ module VX_alu_unit #( // Inputs .alu_op (mul_op), + .uuid_in (alu_req_if.uuid), .wid_in (alu_req_if.wid), .tmask_in (alu_req_if.tmask), .PC_in (alu_req_if.PC), @@ -163,6 +166,7 @@ module VX_alu_unit #( // Outputs .wid_out (mul_wid), + .uuid_out (mul_uuid), .tmask_out (mul_tmask), .PC_out 
(mul_PC), .rd_out (mul_rd), @@ -184,6 +188,7 @@ module VX_alu_unit #( assign mul_valid_in = alu_req_if.valid && is_mul_op; assign alu_commit_if.valid = alu_valid_out || mul_valid_out; + assign alu_commit_if.uuid = alu_valid_out ? alu_uuid : mul_uuid; assign alu_commit_if.wid = alu_valid_out ? alu_wid : mul_wid; assign alu_commit_if.tmask = alu_valid_out ? alu_tmask : mul_tmask; assign alu_commit_if.PC = alu_valid_out ? alu_PC : mul_PC; @@ -201,6 +206,7 @@ module VX_alu_unit #( assign alu_valid_in = alu_req_if.valid; assign alu_commit_if.valid = alu_valid_out; + assign alu_commit_if.uuid = alu_uuid; assign alu_commit_if.wid = alu_wid; assign alu_commit_if.tmask = alu_tmask; assign alu_commit_if.PC = alu_PC; @@ -220,8 +226,8 @@ module VX_alu_unit #( `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (branch_ctl_if.valid) begin - dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n", - $time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest); + dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h (#%0d)\n", + $time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest, alu_uuid); end end `endif diff --git a/hw/rtl/VX_commit.sv b/hw/rtl/VX_commit.sv index 07b83df0..574ed36e 100644 --- a/hw/rtl/VX_commit.sv +++ b/hw/rtl/VX_commit.sv @@ -40,27 +40,35 @@ module VX_commit #( `endif || gpu_commit_fire; - wire [`NUM_THREADS-1:0] commit_tmask; - assign commit_tmask = alu_commit_fire ? alu_commit_if.tmask: - ld_commit_fire ? ld_commit_if.tmask: - st_commit_fire ? st_commit_if.tmask: - csr_commit_fire ? csr_commit_if.tmask: - `ifdef EXT_F_ENABLE - fpu_commit_fire ? 
fpu_commit_if.tmask: - `endif - /*gpu_commit_fire ?*/ gpu_commit_if.tmask; +`ifdef EXT_F_ENABLE + wire [(6*`NUM_THREADS)-1:0] commit_tmask; +`else + wire [(5*`NUM_THREADS)-1:0] commit_tmask; +`endif - wire [$clog2(`NUM_THREADS+1)-1:0] commit_cnt; - `POP_COUNT(commit_cnt, commit_tmask); + wire [$clog2($bits(commit_tmask)+1)-1:0] commit_size; + + assign commit_tmask = { + {`NUM_THREADS{alu_commit_fire}} & alu_commit_if.tmask, + {`NUM_THREADS{ld_commit_fire}} & ld_commit_if.tmask, + {`NUM_THREADS{st_commit_fire}} & st_commit_if.tmask, + {`NUM_THREADS{csr_commit_fire}} & csr_commit_if.tmask, + `ifdef EXT_F_ENABLE + {`NUM_THREADS{fpu_commit_fire}} & fpu_commit_if.tmask, + `endif + {`NUM_THREADS{gpu_commit_fire}} & gpu_commit_if.tmask + }; + + `POP_COUNT(commit_size, commit_tmask); VX_pipe_register #( - .DATAW (1 + $clog2(`NUM_THREADS+1)), + .DATAW (1 + $bits(commit_size)), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (1'b1), - .data_in ({commit_fire, commit_cnt}), + .data_in ({commit_fire, commit_size}), .data_out ({cmt_to_csr_if.valid, cmt_to_csr_if.commit_size}) ); @@ -90,32 +98,32 @@ module VX_commit #( if (alu_commit_if.valid && alu_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd); `TRACE_ARRAY1D(alu_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", alu_commit_if.uuid); end if (ld_commit_if.valid && ld_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.wb, ld_commit_if.rd); `TRACE_ARRAY1D(ld_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", ld_commit_if.uuid); end if (st_commit_if.valid && st_commit_if.ready) begin - dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, 
wb=%0d, rd=%0d\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd); + dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d (#%0d)\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd, st_commit_if.uuid); end if (csr_commit_if.valid && csr_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.wb, csr_commit_if.rd); `TRACE_ARRAY1D(csr_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", csr_commit_if.uuid); end `ifdef EXT_F_ENABLE if (fpu_commit_if.valid && fpu_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.wb, fpu_commit_if.rd); `TRACE_ARRAY1D(fpu_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", fpu_commit_if.uuid); end `endif if (gpu_commit_if.valid && gpu_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.wb, gpu_commit_if.rd); `TRACE_ARRAY1D(gpu_commit_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", gpu_commit_if.uuid); end end `endif diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 82da10c2..8e0bbaa8 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -171,48 +171,50 @@ `define CSR_MPM_FPU_ST_H 12'hB88 `define CSR_MPM_GPU_ST 12'hB09 `define CSR_MPM_GPU_ST_H 12'hB89 +// PERF: decode +`define CSR_MPM_LOADS 12'hB0A +`define CSR_MPM_LOADS_H 12'hB8A +`define CSR_MPM_STORES 12'hB0B +`define CSR_MPM_STORES_H 12'hB8B +`define CSR_MPM_BRANCHES 12'hB0C +`define CSR_MPM_BRANCHES_H 12'hB8C // PERF: icache 
-`define CSR_MPM_ICACHE_READS 12'hB0A // total reads -`define CSR_MPM_ICACHE_READS_H 12'hB8A -`define CSR_MPM_ICACHE_MISS_R 12'hB0B // total misses -`define CSR_MPM_ICACHE_MISS_R_H 12'hB8B -`define CSR_MPM_ICACHE_PIPE_ST 12'hB0C // pipeline stalls -`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB8C -`define CSR_MPM_ICACHE_CRSP_ST 12'hB0D // core response stalls -`define CSR_MPM_ICACHE_CRSP_ST_H 12'hB8D +`define CSR_MPM_ICACHE_READS 12'hB0D // total reads +`define CSR_MPM_ICACHE_READS_H 12'hB8D +`define CSR_MPM_ICACHE_MISS_R 12'hB0E // read misses +`define CSR_MPM_ICACHE_MISS_R_H 12'hB8E // PERF: dcache -`define CSR_MPM_DCACHE_READS 12'hB0E // total reads -`define CSR_MPM_DCACHE_READS_H 12'hB8E -`define CSR_MPM_DCACHE_WRITES 12'hB0F // total writes -`define CSR_MPM_DCACHE_WRITES_H 12'hB8F -`define CSR_MPM_DCACHE_MISS_R 12'hB10 // read misses -`define CSR_MPM_DCACHE_MISS_R_H 12'hB90 -`define CSR_MPM_DCACHE_MISS_W 12'hB11 // write misses -`define CSR_MPM_DCACHE_MISS_W_H 12'hB91 -`define CSR_MPM_DCACHE_BANK_ST 12'hB12 // bank conflicts stalls -`define CSR_MPM_DCACHE_BANK_ST_H 12'hB92 -`define CSR_MPM_DCACHE_MSHR_ST 12'hB13 // MSHR stalls -`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB93 -`define CSR_MPM_DCACHE_PIPE_ST 12'hB14 // pipeline stalls -`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB94 -`define CSR_MPM_DCACHE_CRSP_ST 12'hB15 // core response stalls -`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB95 +`define CSR_MPM_DCACHE_READS 12'hB0F // total reads +`define CSR_MPM_DCACHE_READS_H 12'hB8F +`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes +`define CSR_MPM_DCACHE_WRITES_H 12'hB90 +`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses +`define CSR_MPM_DCACHE_MISS_R_H 12'hB91 +`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses +`define CSR_MPM_DCACHE_MISS_W_H 12'hB92 +`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts +`define CSR_MPM_DCACHE_BANK_ST_H 12'hB93 +`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls +`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94 // PERF: smem -`define 
CSR_MPM_SMEM_READS 12'hB16 // total reads -`define CSR_MPM_SMEM_READS_H 12'hB96 -`define CSR_MPM_SMEM_WRITES 12'hB17 // total writes -`define CSR_MPM_SMEM_WRITES_H 12'hB97 -`define CSR_MPM_SMEM_BANK_ST 12'hB18 // bank conflicts stalls -`define CSR_MPM_SMEM_BANK_ST_H 12'hB98 +`define CSR_MPM_SMEM_READS 12'hB15 // total reads +`define CSR_MPM_SMEM_READS_H 12'hB95 +`define CSR_MPM_SMEM_WRITES 12'hB16 // total writes +`define CSR_MPM_SMEM_WRITES_H 12'hB96 +`define CSR_MPM_SMEM_BANK_ST 12'hB17 // bank conflicts +`define CSR_MPM_SMEM_BANK_ST_H 12'hB97 // PERF: memory -`define CSR_MPM_MEM_READS 12'hB19 // memory reads -`define CSR_MPM_MEM_READS_H 12'hB99 -`define CSR_MPM_MEM_WRITES 12'hB1A // memory writes -`define CSR_MPM_MEM_WRITES_H 12'hB9A -`define CSR_MPM_MEM_ST 12'hB1B // memory request stalls -`define CSR_MPM_MEM_ST_H 12'hB9B -`define CSR_MPM_MEM_LAT 12'hB1C // memory latency (total) -`define CSR_MPM_MEM_LAT_H 12'hB9C +`define CSR_MPM_MEM_READS 12'hB18 // memory reads +`define CSR_MPM_MEM_READS_H 12'hB98 +`define CSR_MPM_MEM_WRITES 12'hB19 // memory writes +`define CSR_MPM_MEM_WRITES_H 12'hB99 +`define CSR_MPM_MEM_LAT 12'hB1A // memory latency +`define CSR_MPM_MEM_LAT_H 12'hB9A +// PERF: texunit +`define CSR_MPM_TEX_READS 12'hB1B // texture accesses +`define CSR_MPM_TEX_READS_H 12'hB9B +`define CSR_MPM_TEX_LAT 12'hB1C // texture latency +`define CSR_MPM_TEX_LAT_H 12'hB9C // Machine Information Registers `define CSR_MVENDORID 12'hF11 @@ -254,12 +256,22 @@ `define TEX_STATE_WRAPU 5 `define TEX_STATE_WRAPV 6 `define TEX_STATE_MIPOFF(lod) (7+(lod)) +`define NUM_TEX_STATES (`TEX_STATE_MIPOFF(`TEX_LOD_MAX)+1) -`define NUM_TEX_STATES (7+`TEX_LOD_MAX) +`define CSR_TEX_UNIT 12'hFD0 -`define CSR_TEX(unit,state) (12'hFD0 + ((unit) * `NUM_TEX_STATES) + (state)) -`define CSR_TEX_UNIT(csr) (((csr) - 12'hFD0) / `NUM_TEX_STATES) -`define CSR_TEX_STATE(csr) (((csr) - 12'hFD0) % `NUM_TEX_STATES) +`define CSR_TEX_STATE_BEGIN 12'hFD1 +`define CSR_TEX_ADDR 
(`CSR_TEX_STATE_BEGIN+`TEX_STATE_ADDR) +`define CSR_TEX_WIDTH (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WIDTH) +`define CSR_TEX_HEIGHT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_HEIGHT) +`define CSR_TEX_FORMAT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FORMAT) +`define CSR_TEX_FILTER (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FILTER) +`define CSR_TEX_WRAPU (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPU) +`define CSR_TEX_WRAPV (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPV) +`define CSR_TEX_MIPOFF(lod) (`CSR_TEX_STATE_BEGIN+`TEX_STATE_MIPOFF(lod)) +`define CSR_TEX_STATE_END (`CSR_TEX_STATE_BEGIN + `NUM_TEX_STATES) + +`define CSR_TEX_STATE(addr) ((addr) - `CSR_TEX_STATE_BEGIN) // Pipeline Queues //////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_csr_data.sv b/hw/rtl/VX_csr_data.sv index 396358d1..d63d1b7d 100644 --- a/hw/rtl/VX_csr_data.sv +++ b/hw/rtl/VX_csr_data.sv @@ -7,6 +7,9 @@ module VX_csr_data #( input wire reset, `ifdef PERF_ENABLE +`ifdef EXT_TEX_ENABLE + VX_perf_tex_if.slave perf_tex_if, +`endif VX_perf_memsys_if.slave perf_memsys_if, VX_perf_pipeline_if.slave perf_pipeline_if, `endif @@ -22,11 +25,13 @@ module VX_csr_data #( `endif input wire read_enable, + input wire [63:0] read_uuid, input wire[`CSR_ADDR_BITS-1:0] read_addr, input wire[`NW_BITS-1:0] read_wid, output wire[31:0] read_data, input wire write_enable, + input wire [63:0] write_uuid, input wire[`CSR_ADDR_BITS-1:0] write_addr, input wire[`NW_BITS-1:0] write_wid, input wire[31:0] write_data, @@ -56,7 +61,7 @@ module VX_csr_data #( `ifdef EXT_F_ENABLE if (fpu_to_csr_if.write_enable) begin fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] - | fpu_to_csr_if.write_fflags; + | fpu_to_csr_if.write_fflags; end `endif if (write_enable) begin @@ -75,11 +80,12 @@ module VX_csr_data #( `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; default: begin `ifdef EXT_TEX_ENABLE - `ASSERT(write_addr >= `CSR_TEX(0,0) - && write_addr < `CSR_TEX(`NUM_TEX_UNITS, 0), - ("%t: 
invalid CSR write address: %0h", $time, write_addr)); + `ASSERT((write_addr == `CSR_TEX_UNIT) + || (write_addr >= `CSR_TEX_STATE_BEGIN + && write_addr < `CSR_TEX_STATE_END), + ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid)); `else - `ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, write_addr)); + `ASSERT(~write_enable, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid)); `endif end endcase @@ -152,20 +158,28 @@ module VX_csr_data #( `CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0]; `CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]); + `ifdef EXT_F_ENABLE `CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0]; `CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]); + `else + `CSR_MPM_FPU_ST : read_data_r = '0; + `CSR_MPM_FPU_ST_H : read_data_r = '0; + `endif `CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0]; `CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]); + // PERF: decode + `CSR_MPM_LOADS : read_data_r = perf_pipeline_if.loads[31:0]; + `CSR_MPM_LOADS_H : read_data_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]); + `CSR_MPM_STORES : read_data_r = perf_pipeline_if.stores[31:0]; + `CSR_MPM_STORES_H : read_data_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]); + `CSR_MPM_BRANCHES : read_data_r = perf_pipeline_if.branches[31:0]; + `CSR_MPM_BRANCHES_H : read_data_r = 32'(perf_pipeline_if.branches[`PERF_CTR_BITS-1:32]); // PERF: icache `CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0]; `CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0]; `CSR_MPM_ICACHE_MISS_R_H : read_data_r = 
32'(perf_memsys_if.icache_read_misses[`PERF_CTR_BITS-1:32]); - `CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0]; - `CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[`PERF_CTR_BITS-1:32]); - `CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0]; - `CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[`PERF_CTR_BITS-1:32]); - // PERF: dcache + // PERF: dcache `CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0]; `CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0]; @@ -178,26 +192,27 @@ module VX_csr_data #( `CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0]; `CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]); - `CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0]; - `CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[`PERF_CTR_BITS-1:32]); - `CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0]; - `CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[`PERF_CTR_BITS-1:32]); - // PERF: smem + // PERF: smem `CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0]; `CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0]; `CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]); `CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0]; `CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]); - // PERF: MEM + // PERF: memory 
`CSR_MPM_MEM_READS : read_data_r = perf_memsys_if.mem_reads[31:0]; `CSR_MPM_MEM_READS_H : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]); `CSR_MPM_MEM_WRITES : read_data_r = perf_memsys_if.mem_writes[31:0]; `CSR_MPM_MEM_WRITES_H : read_data_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]); - `CSR_MPM_MEM_ST : read_data_r = perf_memsys_if.mem_stalls[31:0]; - `CSR_MPM_MEM_ST_H : read_data_r = 32'(perf_memsys_if.mem_stalls[`PERF_CTR_BITS-1:32]); `CSR_MPM_MEM_LAT : read_data_r = perf_memsys_if.mem_latency[31:0]; `CSR_MPM_MEM_LAT_H : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]); + `ifdef EXT_TEX_ENABLE + // PERF: texunit + `CSR_MPM_TEX_READS : read_data_r = perf_tex_if.mem_reads[31:0]; + `CSR_MPM_TEX_READS_H : read_data_r = 32'(perf_tex_if.mem_reads[`PERF_CTR_BITS-1:32]); + `CSR_MPM_TEX_LAT : read_data_r = perf_tex_if.mem_latency[31:0]; + `CSR_MPM_TEX_LAT_H : read_data_r = 32'(perf_tex_if.mem_latency[`PERF_CTR_BITS-1:32]); + `endif // PERF: reserved `CSR_MPM_RESERVED : read_data_r = '0; `CSR_MPM_RESERVED_H : read_data_r = '0; @@ -227,7 +242,9 @@ module VX_csr_data #( read_addr_valid_r = 1; end else `ifdef EXT_TEX_ENABLE - if (read_addr >= `CSR_TEX(0,0) && read_addr < `CSR_TEX(`NUM_TEX_UNITS,0)) begin + if ((read_addr == `CSR_TEX_UNIT) + || (read_addr >= `CSR_TEX_STATE_BEGIN + && read_addr < `CSR_TEX_STATE_END)) begin read_addr_valid_r = 1; end else `endif @@ -236,7 +253,7 @@ module VX_csr_data #( endcase end - `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("invalid CSR read address: %0h", read_addr)) + `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: %0h (#%0d)", $time, read_addr, read_uuid)) assign read_data = read_data_r; diff --git a/hw/rtl/VX_csr_unit.sv b/hw/rtl/VX_csr_unit.sv index 0b05ca9c..6f7b35c9 100644 --- a/hw/rtl/VX_csr_unit.sv +++ b/hw/rtl/VX_csr_unit.sv @@ -7,6 +7,9 @@ module VX_csr_unit #( input wire reset, `ifdef PERF_ENABLE +`ifdef EXT_TEX_ENABLE + 
VX_perf_tex_if.slave perf_tex_if, +`endif VX_perf_memsys_if.slave perf_memsys_if, VX_perf_pipeline_if.slave perf_pipeline_if, `endif @@ -29,7 +32,8 @@ module VX_csr_unit #( ); wire csr_we_s1; wire [`CSR_ADDR_BITS-1:0] csr_addr_s1; - wire [31:0] csr_read_data, csr_read_data_s1; + wire [31:0] csr_read_data; + wire [31:0] csr_read_data_s1; wire [31:0] csr_updated_data_s1; wire write_enable = csr_commit_if.valid && csr_we_s1; @@ -42,8 +46,11 @@ module VX_csr_unit #( .clk (clk), .reset (reset), `ifdef PERF_ENABLE - .perf_memsys_if (perf_memsys_if), - .perf_pipeline_if (perf_pipeline_if), + `ifdef EXT_TEX_ENABLE + .perf_tex_if (perf_tex_if), + `endif + .perf_memsys_if (perf_memsys_if), + .perf_pipeline_if(perf_pipeline_if), `endif .cmt_to_csr_if (cmt_to_csr_if), .fetch_to_csr_if(fetch_to_csr_if), @@ -54,10 +61,12 @@ module VX_csr_unit #( .tex_csr_if (tex_csr_if), `endif .read_enable (csr_req_if.valid), + .read_uuid (csr_req_if.uuid), .read_addr (csr_req_if.addr), .read_wid (csr_req_if.wid), .read_data (csr_read_data), .write_enable (write_enable), + .write_uuid (csr_commit_if.uuid), .write_addr (csr_addr_s1), .write_wid (csr_commit_if.wid), .write_data (csr_updated_data_s1), @@ -101,14 +110,14 @@ module VX_csr_unit #( wire stall_out = ~csr_commit_if.ready && csr_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({csr_req_valid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}), - .data_out ({csr_commit_if.valid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1}) + .data_in ({csr_req_valid, csr_req_if.uuid, 
csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}), + .data_out ({csr_commit_if.valid, csr_commit_if.uuid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1}) ); for (genvar i = 0; i < `NUM_THREADS; i++) begin diff --git a/hw/rtl/VX_decode.sv b/hw/rtl/VX_decode.sv index 2c6f09fb..3f9af431 100644 --- a/hw/rtl/VX_decode.sv +++ b/hw/rtl/VX_decode.sv @@ -20,6 +20,10 @@ module VX_decode #( input wire clk, input wire reset, +`ifdef PERF_ENABLE + VX_perf_pipeline_if.decode perf_decode_if, +`endif + // inputs VX_ifetch_rsp_if.slave ifetch_rsp_if, @@ -57,7 +61,6 @@ module VX_decode #( wire [11:0] s_imm = {func7, rd}; wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0}; wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0}; - wire [11:0] jalr_imm = {func7, rs2}; `UNUSED_VAR (rs3) @@ -169,7 +172,7 @@ module VX_decode #( use_rd = 1; use_imm = 1; is_wstall = 1; - imm = {{20{jalr_imm[11]}}, jalr_imm}; + imm = {{20{u_12[11]}}, u_12}; `USED_IREG (rd); `USED_IREG (rs1); end @@ -192,7 +195,7 @@ module VX_decode #( `USED_IREG (rs1); `USED_IREG (rs2); end - `INST_F: begin + `INST_FENCE: begin ex_type = `EX_LSU; op_mod = `INST_MOD_BITS'(1); end @@ -411,6 +414,7 @@ module VX_decode #( wire wb = use_rd && (| rd_r); assign decode_if.valid = ifetch_rsp_if.valid; + assign decode_if.uuid = ifetch_rsp_if.uuid; assign decode_if.wid = ifetch_rsp_if.wid; assign decode_if.tmask = ifetch_rsp_if.tmask; assign decode_if.PC = ifetch_rsp_if.PC; @@ -439,6 +443,42 @@ module VX_decode #( assign ifetch_rsp_if.ready = decode_if.ready; +`ifdef PERF_ENABLE + wire [$clog2(`NUM_THREADS+1)-1:0] perf_loads_per_cycle; + wire [$clog2(`NUM_THREADS+1)-1:0] perf_stores_per_cycle; + wire [$clog2(`NUM_THREADS+1)-1:0] perf_branches_per_cycle; + + wire 
[`NUM_THREADS-1:0] perf_loads_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && decode_if.wb}}; + wire [`NUM_THREADS-1:0] perf_stores_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && ~decode_if.wb}}; + wire [`NUM_THREADS-1:0] perf_branches_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_ALU && `INST_ALU_IS_BR(decode_if.op_mod)}}; + + `POP_COUNT(perf_loads_per_cycle, perf_loads_per_mask); + `POP_COUNT(perf_stores_per_cycle, perf_stores_per_mask); + `POP_COUNT(perf_branches_per_cycle, perf_branches_per_mask); + + reg [`PERF_CTR_BITS-1:0] perf_loads; + reg [`PERF_CTR_BITS-1:0] perf_stores; + reg [`PERF_CTR_BITS-1:0] perf_branches; + + always @(posedge clk) begin + if (reset) begin + perf_loads <= 0; + perf_stores <= 0; + perf_branches <= 0; + end else begin + if (decode_if.valid && decode_if.ready) begin + perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_loads_per_cycle); + perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_stores_per_cycle); + perf_branches <= perf_branches + `PERF_CTR_BITS'(perf_branches_per_cycle); + end + end + end + + assign perf_decode_if.loads = perf_loads; + assign perf_decode_if.stores = perf_stores; + assign perf_decode_if.branches = perf_branches; +`endif + `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (decode_if.valid && decode_if.ready) begin @@ -446,7 +486,8 @@ module VX_decode #( trace_ex_type(decode_if.ex_type); dpi_trace(", op="); trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod); - dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm); + dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b (#%0d)\n", 
+ decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.uuid); end end `endif diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 696b6eaa..d4cf83fa 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -55,7 +55,7 @@ `define INST_S 7'b0100011 // store instructions `define INST_I 7'b0010011 // immediate instructions `define INST_R 7'b0110011 // register instructions -`define INST_F 7'b0001111 // Fence instructions +`define INST_FENCE 7'b0001111 // Fence instructions `define INST_SYS 7'b1110011 // system instructions `define INST_FL 7'b0000111 // float load instruction @@ -155,6 +155,7 @@ `define INST_LSU_BITS 4 `define INST_LSU_FMT(x) x[2:0] `define INST_LSU_WSIZE(x) x[1:0] +`define INST_LSU_IS_MEM(x) (3'h0 == x) `define INST_LSU_IS_FENCE(x) (3'h1 == x) `define INST_LSU_IS_PREFETCH(x) (3'h2 == x) diff --git a/hw/rtl/VX_dispatch.sv b/hw/rtl/VX_dispatch.sv index 008a7c62..5715d14b 100644 --- a/hw/rtl/VX_dispatch.sv +++ b/hw/rtl/VX_dispatch.sv @@ -42,15 +42,15 @@ module VX_dispatch ( wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), .OUT_REG (1) ) alu_buffer ( .clk (clk), .reset (reset), .valid_in (alu_req_valid), .ready_in (alu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), - .data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, 
alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), + .data_out ({alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}), .valid_out (alu_req_if.valid), .ready_out (alu_req_if.ready) ); @@ -63,15 +63,15 @@ module VX_dispatch ( wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), .OUT_REG (1) ) lsu_buffer ( .clk (clk), .reset (reset), .valid_in (lsu_req_valid), .ready_in (lsu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}), - .data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}), + .data_out ({lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, 
lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}), .valid_out (lsu_req_if.valid), .ready_out (lsu_req_if.ready) ); @@ -85,15 +85,15 @@ module VX_dispatch ( wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid]; VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), .OUT_REG (1) ) csr_buffer ( .clk (clk), .reset (reset), .valid_in (csr_req_valid), .ready_in (csr_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}), - .data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}), + .data_out ({csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}), .valid_out (csr_req_if.valid), .ready_out (csr_req_if.ready) ); @@ -105,15 +105,15 @@ module VX_dispatch ( wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), .OUT_REG (1) ) fpu_buffer ( .clk (clk), .reset (reset), .valid_in (fpu_req_valid), .ready_in (fpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, 
ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), - .data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), .valid_out (fpu_req_if.valid), .ready_out (fpu_req_if.ready) ); @@ -127,15 +127,15 @@ module VX_dispatch ( wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)), .OUT_REG (1) ) gpu_buffer ( .clk (clk), .reset (reset), .valid_in (gpu_req_valid), .ready_in (gpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), - .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}), + .data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({gpu_req_if.uuid, gpu_req_if.wid, gpu_req_if.tmask, 
gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}), .valid_out (gpu_req_if.valid), .ready_out (gpu_req_if.ready) ); diff --git a/hw/rtl/VX_execute.sv b/hw/rtl/VX_execute.sv index 029d58ab..3549465a 100644 --- a/hw/rtl/VX_execute.sv +++ b/hw/rtl/VX_execute.sv @@ -75,6 +75,10 @@ module VX_execute #( VX_tex_csr_if tex_csr_if(); +`ifdef PERF_ENABLE + VX_perf_tex_if perf_tex_if(); +`endif + VX_cache_arb #( .NUM_REQS (2), .LANES (`NUM_THREADS), @@ -165,6 +169,9 @@ module VX_execute #( .clk (clk), .reset (csr_reset), `ifdef PERF_ENABLE + `ifdef EXT_TEX_ENABLE + .perf_tex_if (perf_tex_if), + `endif .perf_memsys_if (perf_memsys_if), .perf_pipeline_if(perf_pipeline_if), `endif @@ -209,6 +216,9 @@ module VX_execute #( .reset (gpu_reset), .gpu_req_if (gpu_req_if), `ifdef EXT_TEX_ENABLE + `ifdef PERF_ENABLE + .perf_tex_if (perf_tex_if), + `endif .tex_csr_if (tex_csr_if), .dcache_req_if (tex_dcache_req_if), .dcache_rsp_if (tex_dcache_rsp_if), diff --git a/hw/rtl/VX_fpu_unit.sv b/hw/rtl/VX_fpu_unit.sv index 7b0f07cc..84af116b 100644 --- a/hw/rtl/VX_fpu_unit.sv +++ b/hw/rtl/VX_fpu_unit.sv @@ -22,6 +22,7 @@ module VX_fpu_unit #( wire valid_out; wire ready_out; + wire [63:0] rsp_uuid; wire [`NW_BITS-1:0] rsp_wid; wire [`NUM_THREADS-1:0] rsp_tmask; wire [31:0] rsp_PC; @@ -39,7 +40,7 @@ module VX_fpu_unit #( wire fpuq_pop = valid_out && ready_out; VX_index_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), + .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), .SIZE (`FPUQ_SIZE) ) req_metadata ( .clk (clk), @@ -48,8 +49,8 @@ module VX_fpu_unit #( .write_addr (tag_in), .read_addr (tag_out), .release_addr (tag_out), - .write_data ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}), - .read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), + .write_data ({fpu_req_if.uuid, fpu_req_if.wid, 
fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}), + .read_data ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), .release_slot (fpuq_pop), .full (fpuq_full), `UNUSED_PIN (empty) @@ -180,14 +181,14 @@ module VX_fpu_unit #( wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({valid_out, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}), - .data_out ({fpu_commit_if.valid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r}) + .data_in ({valid_out, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}), + .data_out ({fpu_commit_if.valid, fpu_commit_if.uuid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r}) ); assign fpu_commit_if.eop = 1'b1; diff --git a/hw/rtl/VX_gpu_unit.sv b/hw/rtl/VX_gpu_unit.sv index 06d5fbc7..6db637a2 100644 --- a/hw/rtl/VX_gpu_unit.sv +++ b/hw/rtl/VX_gpu_unit.sv @@ -12,6 +12,10 @@ module VX_gpu_unit #( VX_gpu_req_if.slave gpu_req_if, `ifdef EXT_TEX_ENABLE + // PERF +`ifdef PERF_ENABLE + VX_perf_tex_if.master perf_tex_if, +`endif VX_dcache_req_if.master dcache_req_if, VX_dcache_rsp_if.slave dcache_rsp_if, VX_tex_csr_if.slave tex_csr_if, @@ -28,12 +32,13 @@ module VX_gpu_unit #( localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS; localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW); - wire rsp_valid; - wire [`NW_BITS-1:0] rsp_wid; - wire [`NUM_THREADS-1:0] rsp_tmask; - wire [31:0] rsp_PC; - wire [`NR_BITS-1:0] rsp_rd; - wire 
rsp_wb; + wire rsp_valid; + wire [63:0] rsp_uuid; + wire [`NW_BITS-1:0] rsp_wid; + wire [`NUM_THREADS-1:0] rsp_tmask; + wire [31:0] rsp_PC; + wire [`NR_BITS-1:0] rsp_rd; + wire rsp_wb; wire [RSP_DATAW-1:0] rsp_data, rsp_data_r; @@ -112,6 +117,7 @@ module VX_gpu_unit #( wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX); assign tex_req_if.valid = gpu_req_if.valid && is_tex; + assign tex_req_if.uuid = gpu_req_if.uuid; assign tex_req_if.wid = gpu_req_if.wid; assign tex_req_if.tmask = gpu_req_if.tmask; assign tex_req_if.PC = gpu_req_if.PC; @@ -128,6 +134,9 @@ module VX_gpu_unit #( ) tex_unit ( .clk (clk), .reset (reset), + `ifdef PERF_ENABLE + .perf_tex_if (perf_tex_if), + `endif .tex_req_if (tex_req_if), .tex_csr_if (tex_csr_if), .tex_rsp_if (tex_rsp_if), @@ -143,6 +152,7 @@ module VX_gpu_unit #( assign is_warp_ctl = !(is_tex || tex_rsp_if.valid); assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex); + assign rsp_uuid = tex_rsp_if.valid ? tex_rsp_if.uuid : gpu_req_if.uuid; assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid; assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask; assign rsp_PC = tex_rsp_if.valid ? 
tex_rsp_if.PC : gpu_req_if.PC; @@ -161,6 +171,7 @@ module VX_gpu_unit #( assign is_warp_ctl = 1; assign rsp_valid = gpu_req_if.valid; + assign rsp_uuid = gpu_req_if.uuid; assign rsp_wid = gpu_req_if.wid; assign rsp_tmask = gpu_req_if.tmask; assign rsp_PC = gpu_req_if.PC; @@ -176,14 +187,14 @@ module VX_gpu_unit #( assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}), - .data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r}) + .data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}), + .data_out ({gpu_commit_if.valid, gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r}) ); assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0]; @@ -200,7 +211,7 @@ module VX_gpu_unit #( assign gpu_req_if.ready = ~stall_in; `SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid); - `SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid); + `SCOPE_ASSIGN (gpu_rsp_uuid, gpu_commit_if.uuid); `SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc.valid); `SCOPE_ASSIGN (gpu_rsp_wspawn, warp_ctl_if.wspawn.valid); `SCOPE_ASSIGN (gpu_rsp_split, warp_ctl_if.split.valid); diff --git a/hw/rtl/VX_ibuffer.sv b/hw/rtl/VX_ibuffer.sv index 9b9fd397..953f1426 100644 --- a/hw/rtl/VX_ibuffer.sv +++ b/hw/rtl/VX_ibuffer.sv @@ -15,7 +15,7 @@ module VX_ibuffer #( `UNUSED_PARAM (CORE_ID) - localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1; + localparam DATAW = 64 + `NUM_THREADS + 32 + 
`EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1; localparam ADDRW = $clog2(`IBUF_SIZE+1); localparam NWARPSW = $clog2(`NUM_WARPS+1); @@ -168,7 +168,8 @@ module VX_ibuffer #( assign decode_if.ready = ~q_full[decode_if.wid]; - assign q_data_in = {decode_if.tmask, + assign q_data_in = {decode_if.uuid, + decode_if.tmask, decode_if.PC, decode_if.ex_type, decode_if.op_type, @@ -184,7 +185,8 @@ module VX_ibuffer #( assign ibuffer_if.valid = deq_valid; assign ibuffer_if.wid = deq_wid; - assign {ibuffer_if.tmask, + assign {ibuffer_if.uuid, + ibuffer_if.tmask, ibuffer_if.PC, ibuffer_if.ex_type, ibuffer_if.op_type, diff --git a/hw/rtl/VX_icache_stage.sv b/hw/rtl/VX_icache_stage.sv index ad296649..77a20b47 100644 --- a/hw/rtl/VX_icache_stage.sv +++ b/hw/rtl/VX_icache_stage.sv @@ -25,35 +25,36 @@ module VX_icache_stage #( localparam OUT_REG = 0; reg [`DBG_CACHE_REQ_IDW-1:0] req_id; - wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id; + wire [`DBG_CACHE_REQ_IDW-1:0] rsp_id; wire [`NW_BITS-1:0] req_tag, rsp_tag; - `UNUSED_VAR (rsp_req_id) + `UNUSED_VAR (rsp_id) wire icache_req_fire = icache_req_if.valid && icache_req_if.ready; - assign req_tag = ifetch_req_if.wid; - assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; - assign rsp_req_id = icache_rsp_if.tag[`NW_BITS +: `DBG_CACHE_REQ_IDW]; + assign req_tag = ifetch_req_if.wid; + assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; + assign rsp_id = icache_rsp_if.tag[`NW_BITS +: `DBG_CACHE_REQ_IDW]; + wire [63:0] rsp_uuid; wire [31:0] rsp_PC; wire [`NUM_THREADS-1:0] rsp_tmask; VX_dp_ram #( - .DATAW (32 + `NUM_THREADS), + .DATAW (32 + `NUM_THREADS + 64), .SIZE (`NUM_WARPS), .LUTRAM (1) ) req_metadata ( .clk (clk), .wren (icache_req_fire), .waddr (req_tag), - .wdata ({ifetch_req_if.PC, ifetch_req_if.tmask}), + .wdata ({ifetch_req_if.PC, ifetch_req_if.tmask, ifetch_req_if.uuid}), .raddr (rsp_tag), - .rdata ({rsp_PC, rsp_tmask}) + .rdata ({rsp_PC, rsp_tmask, rsp_uuid}) ); `RUNTIME_ASSERT((!ifetch_req_if.valid || 
ifetch_req_if.PC >= `STARTUP_ADDR), - ("invalid PC=%0h, wid=%0d, tmask=%b", ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask)) + ("%t: *** invalid PC=%0h, wid=%0d, tmask=%b (#%0d)", $time, ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask, ifetch_req_if.uuid)) // Icache Request assign icache_req_if.valid = ifetch_req_if.valid; @@ -78,35 +79,37 @@ module VX_icache_stage #( wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid); VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32), + .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32 + 64), .RESETW (1), .DEPTH (OUT_REG) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data}), - .data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data}) + .data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data, rsp_uuid}), + .data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid}) ); // Can accept new response? 
assign icache_rsp_if.ready = ~stall_out; `SCOPE_ASSIGN (icache_req_fire, icache_req_fire); - `SCOPE_ASSIGN (icache_req_wid, ifetch_req_if.wid); + `SCOPE_ASSIGN (icache_req_uuid, ifetch_req_if.uuid); `SCOPE_ASSIGN (icache_req_addr, {icache_req_if.addr, 2'b0}); `SCOPE_ASSIGN (icache_req_tag, req_tag); + `SCOPE_ASSIGN (icache_rsp_fire, icache_rsp_if.valid && icache_rsp_if.ready); + `SCOPE_ASSIGN (icache_rsp_uuid, rsp_uuid); `SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data); `SCOPE_ASSIGN (icache_rsp_tag, rsp_tag); `ifdef DBG_TRACE_CORE_ICACHE always @(posedge clk) begin if (icache_req_fire) begin - dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h, req_id=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, req_id); + dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h, req_id=%0h (#%0d)\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, req_id, ifetch_req_if.uuid); end if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin - dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, req_id=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, rsp_req_id, ifetch_rsp_if.data); + dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, req_id=%0h, data=%0h (#%0d)\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, rsp_id, ifetch_rsp_if.data, ifetch_rsp_if.uuid); end end `endif diff --git a/hw/rtl/VX_issue.sv b/hw/rtl/VX_issue.sv index abbb5241..e20f5fce 100644 --- a/hw/rtl/VX_issue.sv +++ b/hw/rtl/VX_issue.sv @@ -9,7 +9,7 @@ module VX_issue #( input wire reset, `ifdef PERF_ENABLE - VX_perf_pipeline_if.master perf_pipeline_if, + VX_perf_pipeline_if.issue perf_issue_if, `endif VX_decode_if.slave decode_if, @@ -38,6 +38,7 @@ module VX_issue #( // scoreboard writeback interface assign sboard_wb_if.valid = writeback_if.valid; + assign sboard_wb_if.uuid = writeback_if.uuid; assign sboard_wb_if.wid = writeback_if.wid; assign sboard_wb_if.PC = writeback_if.PC; assign sboard_wb_if.rd = writeback_if.rd; @@ -45,6 +46,7 @@ module VX_issue #( // scoreboard interface assign scoreboard_if.valid 
= ibuffer_if.valid && dispatch_if.ready; + assign scoreboard_if.uuid = ibuffer_if.uuid; assign scoreboard_if.wid = ibuffer_if.wid; assign scoreboard_if.PC = ibuffer_if.PC; assign scoreboard_if.wb = ibuffer_if.wb; @@ -57,6 +59,7 @@ module VX_issue #( // dispatch interface assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready; + assign dispatch_if.uuid = ibuffer_if.uuid; assign dispatch_if.wid = ibuffer_if.wid; assign dispatch_if.tmask = ibuffer_if.tmask; assign dispatch_if.PC = ibuffer_if.PC; @@ -121,9 +124,8 @@ module VX_issue #( ); `SCOPE_ASSIGN (issue_fire, ibuffer_if.valid && ibuffer_if.ready); - `SCOPE_ASSIGN (issue_wid, ibuffer_if.wid); + `SCOPE_ASSIGN (issue_uuid, ibuffer_if.uuid); `SCOPE_ASSIGN (issue_tmask, ibuffer_if.tmask); - `SCOPE_ASSIGN (issue_pc, ibuffer_if.PC); `SCOPE_ASSIGN (issue_ex_type, ibuffer_if.ex_type); `SCOPE_ASSIGN (issue_op_type, ibuffer_if.op_type); `SCOPE_ASSIGN (issue_op_mod, ibuffer_if.op_mod); @@ -140,10 +142,9 @@ module VX_issue #( `SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data); `SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data); `SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data); - `SCOPE_ASSIGN (writeback_valid, writeback_if.valid); + `SCOPE_ASSIGN (writeback_valid, writeback_if.valid); + `SCOPE_ASSIGN (writeback_uuid, writeback_if.uuid); `SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask); - `SCOPE_ASSIGN (writeback_wid, writeback_if.wid); - `SCOPE_ASSIGN (writeback_pc, writeback_if.PC); `SCOPE_ASSIGN (writeback_rd, writeback_if.rd); `SCOPE_ASSIGN (writeback_data, writeback_if.data); `SCOPE_ASSIGN (writeback_eop, writeback_if.eop); @@ -171,40 +172,35 @@ module VX_issue #( perf_fpu_stalls <= 0; `endif end else begin - if (decode_if.valid & !decode_if.ready) begin + if (decode_if.valid & ~decode_if.ready) begin perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1; end - if (scoreboard_if.valid & !scoreboard_if.ready) begin + if (scoreboard_if.valid & ~scoreboard_if.ready) begin perf_scb_stalls <= perf_scb_stalls + 
`PERF_CTR_BITS'd1; end - if (alu_req_if.valid & !alu_req_if.ready) begin - perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1; + if (dispatch_if.valid & ~dispatch_if.ready) begin + case (dispatch_if.ex_type) + `EX_ALU: perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1; + `ifdef EXT_F_ENABLE + `EX_FPU: perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1; + `endif + `EX_LSU: perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1; + `EX_CSR: perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1; + //`EX_GPU: + default: perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1; + endcase end - if (lsu_req_if.valid & !lsu_req_if.ready) begin - perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1; - end - if (csr_req_if.valid & !csr_req_if.ready) begin - perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1; - end - if (gpu_req_if.valid & !gpu_req_if.ready) begin - perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1; - end - `ifdef EXT_F_ENABLE - if (fpu_req_if.valid & !fpu_req_if.ready) begin - perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1; - end - `endif end end - assign perf_pipeline_if.ibf_stalls = perf_ibf_stalls; - assign perf_pipeline_if.scb_stalls = perf_scb_stalls; - assign perf_pipeline_if.alu_stalls = perf_alu_stalls; - assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls; - assign perf_pipeline_if.csr_stalls = perf_csr_stalls; - assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls; + assign perf_issue_if.ibf_stalls = perf_ibf_stalls; + assign perf_issue_if.scb_stalls = perf_scb_stalls; + assign perf_issue_if.alu_stalls = perf_alu_stalls; + assign perf_issue_if.lsu_stalls = perf_lsu_stalls; + assign perf_issue_if.csr_stalls = perf_csr_stalls; + assign perf_issue_if.gpu_stalls = perf_gpu_stalls; `ifdef EXT_F_ENABLE - assign perf_pipeline_if.fpu_stalls = perf_fpu_stalls; + assign perf_issue_if.fpu_stalls = perf_fpu_stalls; `endif `endif @@ -216,7 +212,7 @@ module VX_issue #( `TRACE_ARRAY1D(alu_req_if.rs1_data, `NUM_THREADS); dpi_trace(", 
rs2_data="); `TRACE_ARRAY1D(alu_req_if.rs2_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", alu_req_if.uuid); end if (lsu_req_if.valid && lsu_req_if.ready) begin dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, offset=%0h, addr=", @@ -224,13 +220,13 @@ module VX_issue #( `TRACE_ARRAY1D(lsu_req_if.base_addr, `NUM_THREADS); dpi_trace(", data="); `TRACE_ARRAY1D(lsu_req_if.store_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", lsu_req_if.uuid); end if (csr_req_if.valid && csr_req_if.ready) begin dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=", $time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.addr); `TRACE_ARRAY1D(csr_req_if.rs1_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", csr_req_if.uuid); end `ifdef EXT_F_ENABLE if (fpu_req_if.valid && fpu_req_if.ready) begin @@ -241,7 +237,7 @@ module VX_issue #( `TRACE_ARRAY1D(fpu_req_if.rs2_data, `NUM_THREADS); dpi_trace(", rs3_data="); `TRACE_ARRAY1D(fpu_req_if.rs3_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", fpu_req_if.uuid); end `endif if (gpu_req_if.valid && gpu_req_if.ready) begin @@ -252,7 +248,7 @@ module VX_issue #( `TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS); dpi_trace(", rs3_data="); `TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", gpu_req_if.uuid); end end `endif diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv index de47dca0..e0ed73b5 100644 --- a/hw/rtl/VX_lsu_unit.sv +++ b/hw/rtl/VX_lsu_unit.sv @@ -21,7 +21,6 @@ module VX_lsu_unit #( ); localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE); localparam MEM_ADDRW = 32 - MEM_ASHIFT; - localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE); `STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) @@ -29,6 +28,7 @@ module VX_lsu_unit #( `STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid 
parameter")) wire req_valid; + wire [63:0] req_uuid; wire [`NUM_THREADS-1:0] req_tmask; wire [`NUM_THREADS-1:0][31:0] req_addr; wire [`INST_LSU_BITS-1:0] req_type; @@ -54,16 +54,16 @@ module VX_lsu_unit #( for (genvar i = 0; i < (`NUM_THREADS-1); i++) begin assign addr_matches[i] = (lsu_req_if.base_addr[i+1] == lsu_req_if.base_addr[0]) || ~lsu_req_if.tmask[i+1]; end + wire lsu_is_dup = lsu_req_if.tmask[0] && (& addr_matches); for (genvar i = 0; i < `NUM_THREADS; i++) begin // is non-cacheable address wire is_addr_nc = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'(`IO_BASE_ADDR >> MEM_ASHIFT)); - if (`SM_ENABLE) begin // is shared memory address wire is_addr_sm = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'((`SMEM_BASE_ADDR - `SMEM_SIZE) >> MEM_ASHIFT)) - & (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT)); + & (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT)); assign lsu_addr_type[i] = {is_addr_nc, is_addr_sm}; end else begin assign lsu_addr_type[i] = is_addr_nc; @@ -81,19 +81,20 @@ module VX_lsu_unit #( wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch; VX_pipe_register #( - .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 1 + 1 + 64 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) req_pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_in), - .data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}), - .data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) + .data_in ({lsu_valid, lsu_is_dup, 
lsu_req_if.is_prefetch, lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}), + .data_out ({req_valid, req_is_dup, req_is_prefetch, req_uuid, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) ); // Can accept new request? assign lsu_req_if.ready = ~stall_in && ~fence_wait; + wire [63:0] rsp_uuid; wire [`NW_BITS-1:0] rsp_wid; wire [31:0] rsp_pc; wire [`NR_BITS-1:0] rsp_rd; @@ -146,7 +147,7 @@ module VX_lsu_unit #( wire req_wb2 = req_wb && ~req_is_prefetch; VX_index_buffer #( - .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), + .DATAW (64 + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), .SIZE (`LSUQ_SIZE) ) req_metadata ( .clk (clk), @@ -154,8 +155,8 @@ module VX_lsu_unit #( .write_addr (mbuf_waddr), .acquire_slot (mbuf_push), .read_addr (mbuf_raddr), - .write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}), - .read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}), + .write_data ({req_uuid, req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}), + .read_data ({rsp_uuid, rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}), .release_addr (mbuf_raddr), .release_slot (mbuf_pop), .full (mbuf_full), @@ -259,6 +260,7 @@ module VX_lsu_unit #( wire is_store_rsp = req_valid && ~req_wb && dcache_req_ready; assign st_commit_if.valid = is_store_rsp; + assign st_commit_if.uuid = req_uuid; assign st_commit_if.wid = req_wid; assign st_commit_if.tmask = req_tmask; assign st_commit_if.PC = req_pc; @@ -295,14 +297,14 @@ module VX_lsu_unit #( wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid; VX_pipe_register #( - .DATAW (1 + 
`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1), .RESETW (1) ) rsp_pipe_reg ( .clk (clk), .reset (reset), .enable (!load_rsp_stall), - .data_in ({dcache_rsp_if.valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), - .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) + .data_in ({dcache_rsp_if.valid, rsp_uuid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), + .data_out ({ld_commit_if.valid, ld_commit_if.uuid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) ); // Can accept new cache response? @@ -310,19 +312,19 @@ module VX_lsu_unit #( // scope registration `SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire); - `SCOPE_ASSIGN (dcache_req_wid, req_wid); - `SCOPE_ASSIGN (dcache_req_pc, req_pc); + `SCOPE_ASSIGN (dcache_req_uuid, req_uuid); `SCOPE_ASSIGN (dcache_req_addr, req_addr); `SCOPE_ASSIGN (dcache_req_rw, ~req_wb); `SCOPE_ASSIGN (dcache_req_byteen,dcache_req_if.byteen); `SCOPE_ASSIGN (dcache_req_data, dcache_req_if.data); `SCOPE_ASSIGN (dcache_req_tag, req_tag); `SCOPE_ASSIGN (dcache_rsp_fire, dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_fire}}); + `SCOPE_ASSIGN (dcache_rsp_uuid, rsp_uuid); `SCOPE_ASSIGN (dcache_rsp_data, dcache_rsp_if.data); `SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr); `ifndef SYNTHESIS - reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + 64 + 1)-1:0] pending_reqs; + reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + 64 + 64 + 1)-1:0] pending_reqs; wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE)); always @(posedge clk) begin @@ -330,7 +332,7 @@ module VX_lsu_unit #( pending_reqs <= '0; end begin if (mbuf_push) begin - pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, $time, 1'b1}; + 
pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, req_uuid, $time, 1'b1}; end if (mbuf_pop) begin pending_reqs[mbuf_raddr] <= '0; @@ -340,8 +342,11 @@ module VX_lsu_unit #( for (integer i = 0; i < `LSUQ_SIZE; ++i) begin if (pending_reqs[i][0]) begin `ASSERT(($time - pending_reqs[i][1 +: 64]) < delay_timeout, - ("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d", - $time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+32+`NR_BITS +: `NW_BITS], pending_reqs[i][1+64+`NR_BITS +: 32], pending_reqs[i][1+64 +: `NR_BITS])); + ("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d (#%0d)", + $time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+64+32+`NR_BITS +: `NW_BITS], + pending_reqs[i][1+64+64+`NR_BITS +: 32], + pending_reqs[i][1+64+64 +: `NR_BITS], + pending_reqs[i][1+64 +: 64])); end end end @@ -360,20 +365,20 @@ module VX_lsu_unit #( `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); dpi_trace(", data="); `TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS); - dpi_trace(", req_id=%0h\n", req_id); + dpi_trace(", (#%0d)\n", req_uuid); end else begin dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire, req_id); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); - dpi_trace(", rd=%0d, is_dup=%b\n", req_rd, req_is_dup); + dpi_trace(", rd=%0d, is_dup=%b (#%0d)\n", req_rd, req_is_dup, req_uuid); end end if (dcache_rsp_fire) begin dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, rd=%0d, data=", $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, rsp_req_id, mbuf_raddr, rsp_rd); `TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); - dpi_trace(", is_dup=%b\n", rsp_is_dup); + dpi_trace(", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid); end end `endif diff --git a/hw/rtl/VX_mem_unit.sv 
b/hw/rtl/VX_mem_unit.sv index 56de47ef..ade9600f 100644 --- a/hw/rtl/VX_mem_unit.sv +++ b/hw/rtl/VX_mem_unit.sv @@ -358,19 +358,17 @@ module VX_mem_unit # ( `ifdef PERF_ENABLE + `UNUSED_VAR (perf_dcache_if.mem_stalls) + `UNUSED_VAR (perf_dcache_if.crsp_stalls) + assign perf_memsys_if.icache_reads = perf_icache_if.reads; assign perf_memsys_if.icache_read_misses = perf_icache_if.read_misses; - assign perf_memsys_if.icache_pipe_stalls = perf_icache_if.pipe_stalls; - assign perf_memsys_if.icache_crsp_stalls = perf_icache_if.crsp_stalls; - assign perf_memsys_if.dcache_reads = perf_dcache_if.reads; assign perf_memsys_if.dcache_writes = perf_dcache_if.writes; assign perf_memsys_if.dcache_read_misses = perf_dcache_if.read_misses; assign perf_memsys_if.dcache_write_misses= perf_dcache_if.write_misses; assign perf_memsys_if.dcache_bank_stalls = perf_dcache_if.bank_stalls; assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls; - assign perf_memsys_if.dcache_pipe_stalls = perf_dcache_if.pipe_stalls; - assign perf_memsys_if.dcache_crsp_stalls = perf_dcache_if.crsp_stalls; if (`SM_ENABLE) begin assign perf_memsys_if.smem_reads = perf_smem_if.reads; @@ -382,47 +380,41 @@ end else begin assign perf_memsys_if.smem_bank_stalls = 0; end - reg [`PERF_CTR_BITS-1:0] perf_mem_lat_per_cycle; + reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; always @(posedge clk) begin if (reset) begin - perf_mem_lat_per_cycle <= 0; + perf_mem_pending_reads <= 0; end else begin - perf_mem_lat_per_cycle <= perf_mem_lat_per_cycle + - `PERF_CTR_BITS'($signed(2'((mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready) && !(mem_rsp_if.valid && mem_rsp_if.ready)) - - 2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready)))); + perf_mem_pending_reads <= perf_mem_pending_reads + + `PERF_CTR_BITS'($signed(2'((mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) && !(mem_rsp_if.valid && mem_rsp_if.ready)) - + 2'((mem_rsp_if.valid && 
mem_rsp_if.ready) && !(mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw)))); end end reg [`PERF_CTR_BITS-1:0] perf_mem_reads; reg [`PERF_CTR_BITS-1:0] perf_mem_writes; reg [`PERF_CTR_BITS-1:0] perf_mem_lat; - reg [`PERF_CTR_BITS-1:0] perf_mem_stalls; always @(posedge clk) begin if (reset) begin perf_mem_reads <= 0; perf_mem_writes <= 0; perf_mem_lat <= 0; - perf_mem_stalls <= 0; end else begin if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'd1; end if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'd1; - end - if (mem_req_if.valid && !mem_req_if.ready) begin - perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'd1; - end - perf_mem_lat <= perf_mem_lat + perf_mem_lat_per_cycle; + end + perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads; end end assign perf_memsys_if.mem_reads = perf_mem_reads; assign perf_memsys_if.mem_writes = perf_mem_writes; - assign perf_memsys_if.mem_latency = perf_mem_lat; - assign perf_memsys_if.mem_stalls = perf_mem_stalls; + assign perf_memsys_if.mem_latency = perf_mem_lat; `endif endmodule diff --git a/hw/rtl/VX_muldiv.sv b/hw/rtl/VX_muldiv.sv index 5cd13f5c..c4dda93b 100644 --- a/hw/rtl/VX_muldiv.sv +++ b/hw/rtl/VX_muldiv.sv @@ -6,6 +6,7 @@ module VX_muldiv ( // Inputs input wire [`INST_MUL_BITS-1:0] alu_op, + input wire [63:0] uuid_in, input wire [`NW_BITS-1:0] wid_in, input wire [`NUM_THREADS-1:0] tmask_in, input wire [31:0] PC_in, @@ -15,6 +16,7 @@ module VX_muldiv ( input wire [`NUM_THREADS-1:0][31:0] alu_in2, // Outputs + output wire [63:0] uuid_out, output wire [`NW_BITS-1:0] wid_out, output wire [`NUM_THREADS-1:0] tmask_out, output wire [31:0] PC_out, @@ -32,6 +34,7 @@ module VX_muldiv ( wire is_div_op = `INST_MUL_IS_DIV(alu_op); wire [`NUM_THREADS-1:0][31:0] mul_result; + wire [63:0] mul_uuid_out; wire [`NW_BITS-1:0] mul_wid_out; wire [`NUM_THREADS-1:0] mul_tmask_out; wire [31:0] 
mul_PC_out; @@ -63,15 +66,15 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .DEPTH (`LATENCY_IMUL), .RESETW (1) ) mul_shift_reg ( .clk(clk), .reset (reset), .enable (mul_ready_in), - .data_in ({mul_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}), - .data_out ({mul_valid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result}) + .data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}), + .data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result}) ); `else @@ -103,15 +106,15 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1), .DEPTH (`LATENCY_IMUL), .RESETW (1) ) mul_shift_reg ( .clk(clk), .reset (reset), .enable (mul_ready_in), - .data_in ({mul_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}), - .data_out ({mul_valid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out}) + .data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}), + .data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out}) ); `endif @@ -119,6 +122,7 @@ module VX_muldiv ( /////////////////////////////////////////////////////////////////////////// wire [`NUM_THREADS-1:0][31:0] div_result; + wire [63:0] div_uuid_out; wire [`NW_BITS-1:0] div_wid_out; wire [`NUM_THREADS-1:0] div_tmask_out; wire [31:0] div_PC_out; @@ -147,15 +151,15 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .DEPTH 
(`LATENCY_IMUL), .RESETW (1) ) div_shift_reg ( .clk(clk), .reset (reset), .enable (div_ready_in), - .data_in ({div_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}), - .data_out ({div_valid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result}) + .data_in ({div_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}), + .data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result}) ); assign div_ready_in = div_ready_out || ~div_valid_out; @@ -171,21 +175,21 @@ module VX_muldiv ( .WIDTHQ (32), .WIDTHR (32), .LANES (`NUM_THREADS), - .TAGW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1) + .TAGW (64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1) ) divide ( .clk (clk), .reset (reset), .valid_in (div_valid_in), .ready_in (div_ready_in), .signed_mode(is_signed_div), - .tag_in ({wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}), + .tag_in ({uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}), .numer (alu_in1), .denom (alu_in2), .quotient (div_result_tmp), .remainder (rem_result_tmp), .ready_out (div_ready_out), .valid_out (div_valid_out), - .tag_out ({div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out}) + .tag_out ({div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out}) ); assign div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp; @@ -195,6 +199,7 @@ module VX_muldiv ( /////////////////////////////////////////////////////////////////////////// wire rsp_valid = mul_valid_out || div_valid_out; + wire [63:0] rsp_uuid = mul_valid_out ? mul_uuid_out : div_uuid_out; wire [`NW_BITS-1:0] rsp_wid = mul_valid_out ? mul_wid_out : div_wid_out; wire [`NUM_THREADS-1:0] rsp_tmask = mul_valid_out ? mul_tmask_out : div_tmask_out; wire [31:0] rsp_PC = mul_valid_out ? 
mul_PC_out : div_PC_out; @@ -205,14 +210,14 @@ module VX_muldiv ( assign stall_out = ~ready_out && valid_out; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}), - .data_out ({valid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out}) + .data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}), + .data_out ({valid_out, uuid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out}) ); // can accept new request? diff --git a/hw/rtl/VX_pipeline.sv b/hw/rtl/VX_pipeline.sv index 8bbc7ead..1ab20c4a 100644 --- a/hw/rtl/VX_pipeline.sv +++ b/hw/rtl/VX_pipeline.sv @@ -165,6 +165,9 @@ module VX_pipeline #( ) decode ( .clk (clk), .reset (decode_reset), + `ifdef PERF_ENABLE + .perf_decode_if (perf_pipeline_if.decode), + `endif .ifetch_rsp_if (ifetch_rsp_if), .decode_if (decode_if), .wstall_if (wstall_if), @@ -180,7 +183,7 @@ module VX_pipeline #( .reset (issue_reset), `ifdef PERF_ENABLE - .perf_pipeline_if (perf_pipeline_if), + .perf_issue_if (perf_pipeline_if.issue), `endif .decode_if (decode_if), diff --git a/hw/rtl/VX_scoreboard.sv b/hw/rtl/VX_scoreboard.sv index 6ba4e998..9a3fed37 100644 --- a/hw/rtl/VX_scoreboard.sv +++ b/hw/rtl/VX_scoreboard.sv @@ -60,22 +60,22 @@ module VX_scoreboard #( end else begin `ifdef DBG_TRACE_PIPELINE if (ibuffer_if.valid && ~ibuffer_if.ready) begin - dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n", + dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)\n", $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb, - deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3); + deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, 
deq_inuse_rs3, ibuffer_if.uuid); end `endif if (release_reg) begin `ASSERT(inuse_regs[writeback_if.wid][writeback_if.rd] != 0, - ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d", - $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd)); + ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d (#%0d)", + $time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd,writeback_if.uuid)); end if (ibuffer_if.valid && ~ibuffer_if.ready) begin deadlock_ctr <= deadlock_ctr + 1; `ASSERT(deadlock_ctr < deadlock_timeout, - ("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b", + ("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)", $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb, - deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3)); + deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, ibuffer_if.uuid)); end else if (ibuffer_if.valid && ibuffer_if.ready) begin deadlock_ctr <= 0; end diff --git a/hw/rtl/VX_trace_instr.vh b/hw/rtl/VX_trace_instr.vh index e228179e..5e8e031e 100644 --- a/hw/rtl/VX_trace_instr.vh +++ b/hw/rtl/VX_trace_instr.vh @@ -35,9 +35,9 @@ task trace_ex_op ( `INST_BR_JALR: dpi_trace("JALR"); `INST_BR_ECALL: dpi_trace("ECALL"); `INST_BR_EBREAK:dpi_trace("EBREAK"); - `INST_BR_MRET: dpi_trace("MRET"); + `INST_BR_URET: dpi_trace("URET"); `INST_BR_SRET: dpi_trace("SRET"); - `INST_BR_DRET: dpi_trace("DRET"); + `INST_BR_MRET: dpi_trace("MRET"); default: dpi_trace("?"); endcase end else if (`INST_ALU_IS_MUL(op_mod)) begin diff --git a/hw/rtl/VX_warp_sched.sv b/hw/rtl/VX_warp_sched.sv index 979a3536..b8ec17bf 100644 --- a/hw/rtl/VX_warp_sched.sv +++ b/hw/rtl/VX_warp_sched.sv @@ -46,6 +46,8 @@ module VX_warp_sched #( wire schedule_valid; wire warp_scheduled; + reg [63:0] issued_instrs; + wire ifetch_req_fire = ifetch_req_if.valid && ifetch_req_if.ready; wire tmc_active = (warp_ctl_if.tmc.tmask != 0); @@ -62,12 
+64,13 @@ module VX_warp_sched #( always @(posedge clk) begin if (reset) begin - barrier_masks <= 0; - use_wspawn <= 0; - stalled_warps <= 0; + barrier_masks <= '0; + use_wspawn <= '0; + stalled_warps <= '0; warp_pcs <= '0; active_warps <= '0; thread_masks <= '0; + issued_instrs <= '0; // activate first warp warp_pcs[0] <= `STARTUP_ADDR; @@ -117,6 +120,8 @@ module VX_warp_sched #( if (use_wspawn[schedule_wid]) begin thread_masks[schedule_wid] <= 1; end + + issued_instrs <= issued_instrs + 1; end if (ifetch_req_fire) begin @@ -223,20 +228,23 @@ module VX_warp_sched #( assign warp_scheduled = schedule_valid && ~stall_out; + wire [63:0] instr_uuid = (issued_instrs * `NUM_CORES * `NUM_CLUSTERS) + 64'(CORE_ID); + VX_pipe_register #( - .DATAW (1 + `NUM_THREADS + 32 + `NW_BITS), + .DATAW (1 + 64 + `NUM_THREADS + 32 + `NW_BITS), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_out), - .data_in ({schedule_valid, schedule_tmask, schedule_pc, schedule_wid}), - .data_out ({ifetch_req_if.valid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid}) + .data_in ({schedule_valid, instr_uuid, schedule_tmask, schedule_pc, schedule_wid}), + .data_out ({ifetch_req_if.valid, ifetch_req_if.uuid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid}) ); assign busy = (active_warps != 0); `SCOPE_ASSIGN (wsched_scheduled, warp_scheduled); + `SCOPE_ASSIGN (wsched_schedule_uuid, instr_uuid); `SCOPE_ASSIGN (wsched_active_warps, active_warps); `SCOPE_ASSIGN (wsched_stalled_warps, stalled_warps); `SCOPE_ASSIGN (wsched_schedule_wid, schedule_wid); diff --git a/hw/rtl/VX_writeback.sv b/hw/rtl/VX_writeback.sv index cdf7f988..5b67256c 100644 --- a/hw/rtl/VX_writeback.sv +++ b/hw/rtl/VX_writeback.sv @@ -23,17 +23,9 @@ module VX_writeback #( localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1; `ifdef EXT_F_ENABLE -`ifdef EXT_TEX_ENABLE localparam NUM_RSPS = 5; `else localparam NUM_RSPS = 4; -`endif -`else -`ifdef EXT_TEX_ENABLE - localparam 
NUM_RSPS = 4; -`else - localparam NUM_RSPS = 3; -`endif `endif wire wb_valid; @@ -50,9 +42,7 @@ module VX_writeback #( wire stall; assign rsp_valid = { - `ifdef EXT_TEX_ENABLE gpu_commit_if.valid && gpu_commit_if.wb, - `endif csr_commit_if.valid && csr_commit_if.wb, alu_commit_if.valid && alu_commit_if.wb, `ifdef EXT_F_ENABLE @@ -62,9 +52,7 @@ module VX_writeback #( }; assign rsp_data = { - `ifdef EXT_TEX_ENABLE {gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop}, - `endif {csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop}, {alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop}, `ifdef EXT_F_ENABLE @@ -88,28 +76,17 @@ module VX_writeback #( .ready_out (~stall) ); - assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb; + assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb; `ifdef EXT_F_ENABLE assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb; assign alu_commit_if.ready = rsp_ready[2] || ~alu_commit_if.wb; assign csr_commit_if.ready = rsp_ready[3] || ~csr_commit_if.wb; + assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb; `else assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb; assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb; -`ifdef EXT_TEX_ENABLE assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb; `endif -`endif - -`ifdef EXT_TEX_ENABLE -`ifdef EXT_F_ENABLE - assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb; -`else - assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb; -`endif -`else - assign gpu_commit_if.ready = 1; -`endif assign stall = ~writeback_if.ready && writeback_if.valid; diff --git a/hw/rtl/afu/VX_to_mem.sv b/hw/rtl/afu/VX_to_mem.sv index 472f8cb3..acc2899b 100644 --- a/hw/rtl/afu/VX_to_mem.sv +++ b/hw/rtl/afu/VX_to_mem.sv @@ -124,7 +124,8 @@ module VX_to_mem #( 
end end assign mem_rsp_tag_in_w = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_in; - `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in), ("out-of-order memory reponse! cur=%d, expected=%d", mem_rsp_tag_in_w, mem_rsp_tag_in)) + `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in), + ("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_w, mem_rsp_tag_in)) wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr}; diff --git a/hw/rtl/cache/VX_bank.sv b/hw/rtl/cache/VX_bank.sv index 2dfc51fe..9e1f3552 100644 --- a/hw/rtl/cache/VX_bank.sv +++ b/hw/rtl/cache/VX_bank.sv @@ -48,7 +48,6 @@ module VX_bank #( output wire perf_read_misses, output wire perf_write_misses, output wire perf_mshr_stalls, - output wire perf_pipe_stalls, `endif // Core Request @@ -470,7 +469,6 @@ module VX_bank #( `ifdef PERF_ENABLE assign perf_read_misses = do_read_st1 && miss_st1; assign perf_write_misses = do_write_st1 && miss_st1; - assign perf_pipe_stalls = crsq_stall || mreq_alm_full || mshr_alm_full; assign perf_mshr_stalls = mshr_alm_full; `endif diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 6b6841dd..1b7d7abf 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -102,7 +102,6 @@ module VX_cache #( wire [NUM_BANKS-1:0] perf_read_miss_per_bank; wire [NUM_BANKS-1:0] perf_write_miss_per_bank; wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank; - wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank; `endif /////////////////////////////////////////////////////////////////////////// @@ -219,37 +218,37 @@ module VX_cache #( /////////////////////////////////////////////////////////////////////////// // Core request - wire [NUM_REQS-1:0] core_req_valid_nc; - wire [NUM_REQS-1:0] core_req_rw_nc; - wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_nc; - wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_nc; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_nc; - wire 
[NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_nc; - wire [NUM_REQS-1:0] core_req_ready_nc; + wire [NUM_REQS-1:0] core_req_valid_c; + wire [NUM_REQS-1:0] core_req_rw_c; + wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_c; + wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_c; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_c; + wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_c; + wire [NUM_REQS-1:0] core_req_ready_c; // Core response - wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_nc; - wire [NUM_REQS-1:0] core_rsp_tmask_nc; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_nc; - wire [`CORE_RSP_TAGS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_nc; - wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_nc; + wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_c; + wire [NUM_REQS-1:0] core_rsp_tmask_c; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_c; + wire [`CORE_RSP_TAGS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_c; + wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_c; // Memory request - wire mem_req_valid_nc; - wire mem_req_rw_nc; - wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_nc; - wire [NUM_PORTS-1:0] mem_req_pmask_nc; - wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_nc; - wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_nc; - wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_nc; - wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_nc; - wire mem_req_ready_nc; + wire mem_req_valid_c; + wire mem_req_rw_c; + wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_c; + wire [NUM_PORTS-1:0] mem_req_pmask_c; + wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_c; + wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_c; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_c; + wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_c; + wire mem_req_ready_c; // Memory response - wire mem_rsp_valid_nc; - wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_nc; - wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_nc; - wire mem_rsp_ready_nc; + wire mem_rsp_valid_c; + wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_c; + wire 
[MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_c; + wire mem_rsp_ready_c; if (NC_ENABLE) begin VX_nc_bypass #( @@ -280,20 +279,20 @@ module VX_cache #( .core_req_ready_in (core_req_ready), // Core request out - .core_req_valid_out (core_req_valid_nc), - .core_req_rw_out (core_req_rw_nc), - .core_req_byteen_out(core_req_byteen_nc), - .core_req_addr_out (core_req_addr_nc), - .core_req_data_out (core_req_data_nc), - .core_req_tag_out (core_req_tag_nc), - .core_req_ready_out (core_req_ready_nc), + .core_req_valid_out (core_req_valid_c), + .core_req_rw_out (core_req_rw_c), + .core_req_byteen_out(core_req_byteen_c), + .core_req_addr_out (core_req_addr_c), + .core_req_data_out (core_req_data_c), + .core_req_tag_out (core_req_tag_c), + .core_req_ready_out (core_req_ready_c), // Core response in - .core_rsp_valid_in (core_rsp_valid_nc), - .core_rsp_tmask_in (core_rsp_tmask_nc), - .core_rsp_data_in (core_rsp_data_nc), - .core_rsp_tag_in (core_rsp_tag_nc), - .core_rsp_ready_in (core_rsp_ready_nc), + .core_rsp_valid_in (core_rsp_valid_c), + .core_rsp_tmask_in (core_rsp_tmask_c), + .core_rsp_data_in (core_rsp_data_c), + .core_rsp_tag_in (core_rsp_tag_c), + .core_rsp_ready_in (core_rsp_ready_c), // Core response out .core_rsp_valid_out (core_rsp_valid_sb), @@ -303,15 +302,15 @@ module VX_cache #( .core_rsp_ready_out (core_rsp_ready_sb), // Memory request in - .mem_req_valid_in (mem_req_valid_nc), - .mem_req_rw_in (mem_req_rw_nc), - .mem_req_addr_in (mem_req_addr_nc), - .mem_req_pmask_in (mem_req_pmask_nc), - .mem_req_byteen_in (mem_req_byteen_nc), - .mem_req_wsel_in (mem_req_wsel_nc), - .mem_req_data_in (mem_req_data_nc), - .mem_req_tag_in (mem_req_tag_nc), - .mem_req_ready_in (mem_req_ready_nc), + .mem_req_valid_in (mem_req_valid_c), + .mem_req_rw_in (mem_req_rw_c), + .mem_req_addr_in (mem_req_addr_c), + .mem_req_pmask_in (mem_req_pmask_c), + .mem_req_byteen_in (mem_req_byteen_c), + .mem_req_wsel_in (mem_req_wsel_c), + .mem_req_data_in (mem_req_data_c), + .mem_req_tag_in (mem_req_tag_c), + 
.mem_req_ready_in (mem_req_ready_c), // Memory request out .mem_req_valid_out (mem_req_valid_sb), @@ -331,40 +330,40 @@ module VX_cache #( .mem_rsp_ready_in (mem_rsp_ready), // Memory response out - .mem_rsp_valid_out (mem_rsp_valid_nc), - .mem_rsp_data_out (mem_rsp_data_nc), - .mem_rsp_tag_out (mem_rsp_tag_nc), - .mem_rsp_ready_out (mem_rsp_ready_nc) + .mem_rsp_valid_out (mem_rsp_valid_c), + .mem_rsp_data_out (mem_rsp_data_c), + .mem_rsp_tag_out (mem_rsp_tag_c), + .mem_rsp_ready_out (mem_rsp_ready_c) ); end else begin - assign core_req_valid_nc = core_req_valid; - assign core_req_rw_nc = core_req_rw; - assign core_req_addr_nc = core_req_addr; - assign core_req_byteen_nc = core_req_byteen; - assign core_req_data_nc = core_req_data; - assign core_req_tag_nc = core_req_tag; - assign core_req_ready = core_req_ready_nc; + assign core_req_valid_c = core_req_valid; + assign core_req_rw_c = core_req_rw; + assign core_req_addr_c = core_req_addr; + assign core_req_byteen_c = core_req_byteen; + assign core_req_data_c = core_req_data; + assign core_req_tag_c = core_req_tag; + assign core_req_ready = core_req_ready_c; - assign core_rsp_valid_sb = core_rsp_valid_nc; - assign core_rsp_tmask_sb = core_rsp_tmask_nc; - assign core_rsp_data_sb = core_rsp_data_nc; - assign core_rsp_tag_sb = core_rsp_tag_nc; - assign core_rsp_ready_nc = core_rsp_ready_sb; + assign core_rsp_valid_sb = core_rsp_valid_c; + assign core_rsp_tmask_sb = core_rsp_tmask_c; + assign core_rsp_data_sb = core_rsp_data_c; + assign core_rsp_tag_sb = core_rsp_tag_c; + assign core_rsp_ready_c = core_rsp_ready_sb; - assign mem_req_valid_sb = mem_req_valid_nc; - assign mem_req_addr_sb = mem_req_addr_nc; - assign mem_req_rw_p = mem_req_rw_nc; - assign mem_req_pmask_p = mem_req_pmask_nc; - assign mem_req_byteen_p = mem_req_byteen_nc; - assign mem_req_wsel_p = mem_req_wsel_nc; - assign mem_req_data_p = mem_req_data_nc; - assign mem_req_tag_sb = mem_req_tag_nc; - assign mem_req_ready_nc = mem_req_ready_sb; + assign 
mem_req_valid_sb = mem_req_valid_c; + assign mem_req_addr_sb = mem_req_addr_c; + assign mem_req_rw_p = mem_req_rw_c; + assign mem_req_pmask_p = mem_req_pmask_c; + assign mem_req_byteen_p = mem_req_byteen_c; + assign mem_req_wsel_p = mem_req_wsel_c; + assign mem_req_data_p = mem_req_data_c; + assign mem_req_tag_sb = mem_req_tag_c; + assign mem_req_ready_c = mem_req_ready_sb; - assign mem_rsp_valid_nc = mem_rsp_valid; - assign mem_rsp_data_nc = mem_rsp_data; - assign mem_rsp_tag_nc = mem_rsp_tag; - assign mem_rsp_ready = mem_rsp_ready_nc; + assign mem_rsp_valid_c = mem_rsp_valid; + assign mem_rsp_data_c = mem_rsp_data; + assign mem_rsp_tag_c = mem_rsp_tag; + assign mem_rsp_ready = mem_rsp_ready_c; end /////////////////////////////////////////////////////////////////////////// @@ -383,15 +382,15 @@ module VX_cache #( ) mem_rsp_queue ( .clk (clk), .reset (mrsq_reset), - .ready_in (mem_rsp_ready_nc), - .valid_in (mem_rsp_valid_nc), - .data_in ({mem_rsp_tag_nc, mem_rsp_data_nc}), + .ready_in (mem_rsp_ready_c), + .valid_in (mem_rsp_valid_c), + .data_in ({mem_rsp_tag_c, mem_rsp_data_c}), .data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}), .ready_out (mrsq_out_ready), .valid_out (mrsq_out_valid) ); - `UNUSED_VAR (mem_rsp_tag_nc) + `UNUSED_VAR (mem_rsp_tag_c) /////////////////////////////////////////////////////////////////////////// @@ -464,13 +463,13 @@ module VX_cache #( `ifdef PERF_ENABLE .bank_stalls(perf_cache_if.bank_stalls), `endif - .core_req_valid (core_req_valid_nc), - .core_req_rw (core_req_rw_nc), - .core_req_addr (core_req_addr_nc), - .core_req_byteen (core_req_byteen_nc), - .core_req_data (core_req_data_nc), - .core_req_tag (core_req_tag_nc), - .core_req_ready (core_req_ready_nc), + .core_req_valid (core_req_valid_c), + .core_req_rw (core_req_rw_c), + .core_req_addr (core_req_addr_c), + .core_req_byteen (core_req_byteen_c), + .core_req_data (core_req_data_c), + .core_req_tag (core_req_tag_c), + .core_req_ready (core_req_ready_c), .per_bank_core_req_valid 
(per_bank_core_req_valid), .per_bank_core_req_pmask (per_bank_core_req_pmask), .per_bank_core_req_rw (per_bank_core_req_rw), @@ -592,7 +591,6 @@ module VX_cache #( .perf_read_misses (perf_read_miss_per_bank[i]), .perf_write_misses (perf_write_miss_per_bank[i]), .perf_mshr_stalls (perf_mshr_stall_per_bank[i]), - .perf_pipe_stalls (perf_pipe_stall_per_bank[i]), `endif // Core request @@ -655,11 +653,11 @@ module VX_cache #( .per_bank_core_rsp_tag (per_bank_core_rsp_tag), .per_bank_core_rsp_tid (per_bank_core_rsp_tid), .per_bank_core_rsp_ready (per_bank_core_rsp_ready), - .core_rsp_valid (core_rsp_valid_nc), - .core_rsp_tmask (core_rsp_tmask_nc), - .core_rsp_tag (core_rsp_tag_nc), - .core_rsp_data (core_rsp_data_nc), - .core_rsp_ready (core_rsp_ready_nc) + .core_rsp_valid (core_rsp_valid_c), + .core_rsp_tmask (core_rsp_tmask_c), + .core_rsp_tag (core_rsp_tag_c), + .core_rsp_data (core_rsp_data_c), + .core_rsp_ready (core_rsp_ready_c) ); wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH))-1:0] data_in; @@ -681,15 +679,15 @@ module VX_cache #( .valid_in (per_bank_mem_req_valid), .data_in (data_in), .ready_in (per_bank_mem_req_ready), - .valid_out (mem_req_valid_nc), - .data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_pmask_nc, mem_req_byteen_nc, mem_req_wsel_nc, mem_req_data_nc}), - .ready_out (mem_req_ready_nc) + .valid_out (mem_req_valid_c), + .data_out ({mem_req_addr_c, mem_req_id, mem_req_rw_c, mem_req_pmask_c, mem_req_byteen_c, mem_req_wsel_c, mem_req_data_c}), + .ready_out (mem_req_ready_c) ); if (NUM_BANKS == 1) begin - assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'(mem_req_id); + assign mem_req_tag_c = MEM_TAG_IN_WIDTH'(mem_req_id); end else begin - assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_nc), mem_req_id}); + assign mem_req_tag_c = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_c), mem_req_id}); end `ifdef PERF_ENABLE @@ -697,12 +695,21 @@ 
module VX_cache #( wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; - - wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw; - wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw; - - `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask); + + wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid_c & core_req_ready_c & ~core_req_rw; + wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid_c & core_req_ready_c & core_req_rw; + + // per cycle: read misses, write misses, msrq stalls, pipeline stalls + wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; + wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; + wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; + wire [$clog2(NUM_BANKS+1)-1:0] perf_crsp_stall_per_cycle; + + `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask); + `POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank); + `POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank); + `POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank); if (CORE_TAG_ID_BITS != 0) begin wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}}; @@ -712,23 +719,14 @@ module VX_cache #( `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask); end - // per cycle: read misses, write misses, msrq stalls, pipeline stalls - wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; - wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; - wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; - wire [$clog2(NUM_BANKS+1)-1:0] perf_pipe_stall_per_cycle; - - `POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank); - `POP_COUNT(perf_write_miss_per_cycle, 
perf_write_miss_per_bank); - `POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank); - `POP_COUNT(perf_pipe_stall_per_cycle, perf_pipe_stall_per_bank); + wire perf_mem_stall_per_cycle = mem_req_valid & ~mem_req_ready; reg [`PERF_CTR_BITS-1:0] perf_core_reads; reg [`PERF_CTR_BITS-1:0] perf_core_writes; reg [`PERF_CTR_BITS-1:0] perf_read_misses; reg [`PERF_CTR_BITS-1:0] perf_write_misses; reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls; - reg [`PERF_CTR_BITS-1:0] perf_pipe_stalls; + reg [`PERF_CTR_BITS-1:0] perf_mem_stalls; reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; always @(posedge clk) begin @@ -738,16 +736,16 @@ module VX_cache #( perf_read_misses <= 0; perf_write_misses <= 0; perf_mshr_stalls <= 0; - perf_pipe_stalls <= 0; + perf_mem_stalls <= 0; perf_crsp_stalls <= 0; end else begin - perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); - perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); - perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle); - perf_write_misses <= perf_write_misses+ `PERF_CTR_BITS'(perf_write_miss_per_cycle); - perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle); - perf_pipe_stalls <= perf_pipe_stalls + `PERF_CTR_BITS'(perf_pipe_stall_per_cycle); - perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); + perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); + perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle); + perf_write_misses <= perf_write_misses + `PERF_CTR_BITS'(perf_write_miss_per_cycle); + perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle); + perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle); + perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); end end @@ 
-756,7 +754,7 @@ module VX_cache #( assign perf_cache_if.read_misses = perf_read_misses; assign perf_cache_if.write_misses = perf_write_misses; assign perf_cache_if.mshr_stalls = perf_mshr_stalls; - assign perf_cache_if.pipe_stalls = perf_pipe_stalls; + assign perf_cache_if.mem_stalls = perf_mem_stalls; assign perf_cache_if.crsp_stalls = perf_crsp_stalls; `endif diff --git a/hw/rtl/cache/VX_shared_mem.sv b/hw/rtl/cache/VX_shared_mem.sv index 257cf295..971795e0 100644 --- a/hw/rtl/cache/VX_shared_mem.sv +++ b/hw/rtl/cache/VX_shared_mem.sv @@ -335,21 +335,13 @@ module VX_shared_mem #( // per cycle: core_reads, core_writes wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; - wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw; wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw; `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask); - - if (CORE_TAG_ID_BITS != 0) begin - wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}}; - `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask); - end else begin - wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_valid & ~core_rsp_ready; - `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask); - end + wire perf_crsp_stall_per_cycle = core_rsp_valid & ~core_rsp_ready; reg [`PERF_CTR_BITS-1:0] perf_core_reads; reg [`PERF_CTR_BITS-1:0] perf_core_writes; @@ -357,13 +349,13 @@ module VX_shared_mem #( always @(posedge clk) begin if (reset) begin - perf_core_reads <= 0; - perf_core_writes <= 0; - perf_crsp_stalls <= 0; + perf_core_reads <= 0; + perf_core_writes <= 0; + perf_crsp_stalls <= 0; end else begin - perf_core_reads <= perf_core_reads + 
`PERF_CTR_BITS'(perf_core_reads_per_cycle); - perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); - perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); + perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); + perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); end end @@ -371,7 +363,8 @@ module VX_shared_mem #( assign perf_cache_if.writes = perf_core_writes; assign perf_cache_if.read_misses = '0; assign perf_cache_if.write_misses = '0; - assign perf_cache_if.pipe_stalls = '0; + assign perf_cache_if.mshr_stalls = '0; + assign perf_cache_if.mem_stalls = '0; assign perf_cache_if.crsp_stalls = perf_crsp_stalls; `endif diff --git a/hw/rtl/interfaces/VX_alu_req_if.sv b/hw/rtl/interfaces/VX_alu_req_if.sv index 2c6ffd5e..35049542 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.sv +++ b/hw/rtl/interfaces/VX_alu_req_if.sv @@ -5,7 +5,8 @@ interface VX_alu_req_if (); - wire valid; + wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -24,6 +25,7 @@ interface VX_alu_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -43,6 +45,7 @@ interface VX_alu_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_cmt_to_csr_if.sv b/hw/rtl/interfaces/VX_cmt_to_csr_if.sv index 800d428d..ed5ffc24 100644 --- a/hw/rtl/interfaces/VX_cmt_to_csr_if.sv +++ b/hw/rtl/interfaces/VX_cmt_to_csr_if.sv @@ -5,9 +5,12 @@ interface VX_cmt_to_csr_if (); - wire valid; - wire [$clog2(`NUM_THREADS+1)-1:0] commit_size; - + wire valid; +`ifdef EXT_F_ENABLE + wire [$clog2(6*`NUM_THREADS+1)-1:0] commit_size; +`else + wire [$clog2(5*`NUM_THREADS+1)-1:0] commit_size; +`endif modport master ( output valid, output commit_size diff --git 
a/hw/rtl/interfaces/VX_commit_if.sv b/hw/rtl/interfaces/VX_commit_if.sv index 4b6844d6..e85d310f 100644 --- a/hw/rtl/interfaces/VX_commit_if.sv +++ b/hw/rtl/interfaces/VX_commit_if.sv @@ -6,6 +6,7 @@ interface VX_commit_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -17,6 +18,7 @@ interface VX_commit_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -29,6 +31,7 @@ interface VX_commit_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_csr_req_if.sv b/hw/rtl/interfaces/VX_csr_req_if.sv index 23345d53..0639f3aa 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.sv +++ b/hw/rtl/interfaces/VX_csr_req_if.sv @@ -6,6 +6,7 @@ interface VX_csr_req_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,6 +21,7 @@ interface VX_csr_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -35,6 +37,7 @@ interface VX_csr_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_decode_if.sv b/hw/rtl/interfaces/VX_decode_if.sv index 90c5d70e..23039847 100644 --- a/hw/rtl/interfaces/VX_decode_if.sv +++ b/hw/rtl/interfaces/VX_decode_if.sv @@ -6,6 +6,7 @@ interface VX_decode_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -23,7 +24,8 @@ interface VX_decode_if (); wire ready; modport master ( - output valid, + output valid, + output uuid, output wid, output tmask, output PC, @@ -42,7 +44,8 @@ interface VX_decode_if (); ); modport slave ( - input valid, + input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_fpu_req_if.sv b/hw/rtl/interfaces/VX_fpu_req_if.sv index 25867e42..2b7d69f0 100644 --- a/hw/rtl/interfaces/VX_fpu_req_if.sv +++ 
b/hw/rtl/interfaces/VX_fpu_req_if.sv @@ -6,6 +6,7 @@ interface VX_fpu_req_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,6 +21,7 @@ interface VX_fpu_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -35,6 +37,7 @@ interface VX_fpu_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_gpu_req_if.sv b/hw/rtl/interfaces/VX_gpu_req_if.sv index 50ac8c7c..06ef6cc7 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.sv +++ b/hw/rtl/interfaces/VX_gpu_req_if.sv @@ -6,7 +6,7 @@ interface VX_gpu_req_if(); wire valid; - + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -19,11 +19,11 @@ interface VX_gpu_req_if(); wire [`NUM_THREADS-1:0][31:0] rs3_data; wire [`NR_BITS-1:0] rd; wire wb; - wire ready; modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -41,6 +41,7 @@ interface VX_gpu_req_if(); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_ibuffer_if.sv b/hw/rtl/interfaces/VX_ibuffer_if.sv index bb791737..a436ae7b 100644 --- a/hw/rtl/interfaces/VX_ibuffer_if.sv +++ b/hw/rtl/interfaces/VX_ibuffer_if.sv @@ -6,6 +6,7 @@ interface VX_ibuffer_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -31,6 +32,7 @@ interface VX_ibuffer_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -55,6 +57,7 @@ interface VX_ibuffer_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_ifetch_req_if.sv b/hw/rtl/interfaces/VX_ifetch_req_if.sv index 3d75e736..4132f90b 100644 --- a/hw/rtl/interfaces/VX_ifetch_req_if.sv +++ b/hw/rtl/interfaces/VX_ifetch_req_if.sv @@ -5,14 +5,16 @@ interface VX_ifetch_req_if (); - wire valid; + 
wire valid; + wire [63:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; wire ready; modport master ( - output valid, + output valid, + output uuid, output tmask, output wid, output PC, @@ -20,7 +22,8 @@ interface VX_ifetch_req_if (); ); modport slave ( - input valid, + input valid, + input uuid, input tmask, input wid, input PC, diff --git a/hw/rtl/interfaces/VX_ifetch_rsp_if.sv b/hw/rtl/interfaces/VX_ifetch_rsp_if.sv index a2f04fe4..350af081 100644 --- a/hw/rtl/interfaces/VX_ifetch_rsp_if.sv +++ b/hw/rtl/interfaces/VX_ifetch_rsp_if.sv @@ -6,6 +6,7 @@ interface VX_ifetch_rsp_if (); wire valid; + wire [63:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; @@ -13,7 +14,8 @@ interface VX_ifetch_rsp_if (); wire ready; modport master ( - output valid, + output valid, + output uuid, output tmask, output wid, output PC, @@ -22,7 +24,8 @@ interface VX_ifetch_rsp_if (); ); modport slave ( - input valid, + input valid, + input uuid, input tmask, input wid, input PC, diff --git a/hw/rtl/interfaces/VX_lsu_req_if.sv b/hw/rtl/interfaces/VX_lsu_req_if.sv index 4f31b17c..128b3c20 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.sv +++ b/hw/rtl/interfaces/VX_lsu_req_if.sv @@ -6,6 +6,7 @@ interface VX_lsu_req_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -21,6 +22,7 @@ interface VX_lsu_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -37,6 +39,7 @@ interface VX_lsu_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_perf_cache_if.sv b/hw/rtl/interfaces/VX_perf_cache_if.sv index d9efb2cc..0ec8d582 100644 --- a/hw/rtl/interfaces/VX_perf_cache_if.sv +++ b/hw/rtl/interfaces/VX_perf_cache_if.sv @@ -11,7 +11,7 @@ interface VX_perf_cache_if (); wire [`PERF_CTR_BITS-1:0] write_misses; wire [`PERF_CTR_BITS-1:0] bank_stalls; wire [`PERF_CTR_BITS-1:0] 
mshr_stalls; - wire [`PERF_CTR_BITS-1:0] pipe_stalls; + wire [`PERF_CTR_BITS-1:0] mem_stalls; wire [`PERF_CTR_BITS-1:0] crsp_stalls; modport master ( @@ -21,7 +21,7 @@ interface VX_perf_cache_if (); output write_misses, output bank_stalls, output mshr_stalls, - output pipe_stalls, + output mem_stalls, output crsp_stalls ); @@ -32,7 +32,7 @@ interface VX_perf_cache_if (); input write_misses, input bank_stalls, input mshr_stalls, - input pipe_stalls, + input mem_stalls, input crsp_stalls ); diff --git a/hw/rtl/interfaces/VX_perf_memsys_if.sv b/hw/rtl/interfaces/VX_perf_memsys_if.sv index f0e27ed6..9a38dc26 100644 --- a/hw/rtl/interfaces/VX_perf_memsys_if.sv +++ b/hw/rtl/interfaces/VX_perf_memsys_if.sv @@ -7,68 +7,50 @@ interface VX_perf_memsys_if (); wire [`PERF_CTR_BITS-1:0] icache_reads; wire [`PERF_CTR_BITS-1:0] icache_read_misses; - wire [`PERF_CTR_BITS-1:0] icache_pipe_stalls; - wire [`PERF_CTR_BITS-1:0] icache_crsp_stalls; - wire [`PERF_CTR_BITS-1:0] dcache_reads; - wire [`PERF_CTR_BITS-1:0] dcache_writes; + wire [`PERF_CTR_BITS-1:0] dcache_writes; wire [`PERF_CTR_BITS-1:0] dcache_read_misses; wire [`PERF_CTR_BITS-1:0] dcache_write_misses; wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls; wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls; - wire [`PERF_CTR_BITS-1:0] dcache_pipe_stalls; - wire [`PERF_CTR_BITS-1:0] dcache_crsp_stalls; - wire [`PERF_CTR_BITS-1:0] smem_reads; wire [`PERF_CTR_BITS-1:0] smem_writes; wire [`PERF_CTR_BITS-1:0] smem_bank_stalls; - wire [`PERF_CTR_BITS-1:0] mem_reads; wire [`PERF_CTR_BITS-1:0] mem_writes; - wire [`PERF_CTR_BITS-1:0] mem_stalls; wire [`PERF_CTR_BITS-1:0] mem_latency; modport master ( output icache_reads, output icache_read_misses, - output icache_pipe_stalls, - output icache_crsp_stalls, output dcache_reads, - output dcache_writes, + output dcache_writes, output dcache_read_misses, output dcache_write_misses, output dcache_bank_stalls, output dcache_mshr_stalls, - output dcache_pipe_stalls, - output dcache_crsp_stalls, output 
smem_reads, output smem_writes, output smem_bank_stalls, output mem_reads, output mem_writes, - output mem_stalls, output mem_latency ); modport slave ( input icache_reads, input icache_read_misses, - input icache_pipe_stalls, - input icache_crsp_stalls, input dcache_reads, - input dcache_writes, + input dcache_writes, input dcache_read_misses, input dcache_write_misses, input dcache_bank_stalls, input dcache_mshr_stalls, - input dcache_pipe_stalls, - input dcache_crsp_stalls, input smem_reads, input smem_writes, input smem_bank_stalls, input mem_reads, input mem_writes, - input mem_stalls, input mem_latency ); diff --git a/hw/rtl/interfaces/VX_perf_pipeline_if.sv b/hw/rtl/interfaces/VX_perf_pipeline_if.sv index 19cc15c3..a4470e4c 100644 --- a/hw/rtl/interfaces/VX_perf_pipeline_if.sv +++ b/hw/rtl/interfaces/VX_perf_pipeline_if.sv @@ -4,18 +4,27 @@ `include "VX_define.vh" interface VX_perf_pipeline_if (); - - wire [`PERF_CTR_BITS-1:0] ibf_stalls; - wire [`PERF_CTR_BITS-1:0] scb_stalls; - wire [`PERF_CTR_BITS-1:0] lsu_stalls; - wire [`PERF_CTR_BITS-1:0] csr_stalls; - wire [`PERF_CTR_BITS-1:0] alu_stalls; + wire [`PERF_CTR_BITS-1:0] loads; + wire [`PERF_CTR_BITS-1:0] stores; + wire [`PERF_CTR_BITS-1:0] branches; + + wire [`PERF_CTR_BITS-1:0] ibf_stalls; + wire [`PERF_CTR_BITS-1:0] scb_stalls; + wire [`PERF_CTR_BITS-1:0] lsu_stalls; + wire [`PERF_CTR_BITS-1:0] csr_stalls; + wire [`PERF_CTR_BITS-1:0] alu_stalls; `ifdef EXT_F_ENABLE - wire [`PERF_CTR_BITS-1:0] fpu_stalls; + wire [`PERF_CTR_BITS-1:0] fpu_stalls; `endif - wire [`PERF_CTR_BITS-1:0] gpu_stalls; + wire [`PERF_CTR_BITS-1:0] gpu_stalls; - modport master ( + modport decode ( + output loads, + output stores, + output branches + ); + + modport issue ( output ibf_stalls, output scb_stalls, output lsu_stalls, @@ -25,9 +34,12 @@ interface VX_perf_pipeline_if (); output fpu_stalls, `endif output gpu_stalls - ); + ); modport slave ( + input loads, + input stores, + input branches, input ibf_stalls, input scb_stalls, 
input lsu_stalls, diff --git a/hw/rtl/interfaces/VX_perf_tex_if.sv b/hw/rtl/interfaces/VX_perf_tex_if.sv new file mode 100644 index 00000000..222ade53 --- /dev/null +++ b/hw/rtl/interfaces/VX_perf_tex_if.sv @@ -0,0 +1,23 @@ +`ifndef VX_PERF_TEX_IF +`define VX_PERF_TEX_IF + +`include "VX_define.vh" + +interface VX_perf_tex_if (); + + wire [`PERF_CTR_BITS-1:0] mem_reads; + wire [`PERF_CTR_BITS-1:0] mem_latency; + + modport master ( + output mem_reads, + output mem_latency + ); + + modport slave ( + input mem_reads, + input mem_latency + ); + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_tex_req_if.sv b/hw/rtl/interfaces/VX_tex_req_if.sv index f1eaa1be..0059de59 100644 --- a/hw/rtl/interfaces/VX_tex_req_if.sv +++ b/hw/rtl/interfaces/VX_tex_req_if.sv @@ -6,6 +6,7 @@ interface VX_tex_req_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -20,6 +21,7 @@ interface VX_tex_req_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -33,6 +35,7 @@ interface VX_tex_req_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_tex_rsp_if.sv b/hw/rtl/interfaces/VX_tex_rsp_if.sv index b3dbd65d..5966124c 100644 --- a/hw/rtl/interfaces/VX_tex_rsp_if.sv +++ b/hw/rtl/interfaces/VX_tex_rsp_if.sv @@ -6,6 +6,7 @@ interface VX_tex_rsp_if (); wire valid; + wire [63:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; @@ -16,6 +17,7 @@ interface VX_tex_rsp_if (); modport master ( output valid, + output uuid, output wid, output tmask, output PC, @@ -27,6 +29,7 @@ interface VX_tex_rsp_if (); modport slave ( input valid, + input uuid, input wid, input tmask, input PC, diff --git a/hw/rtl/interfaces/VX_writeback_if.sv b/hw/rtl/interfaces/VX_writeback_if.sv index 8f05fc7a..00cab3b8 100644 --- a/hw/rtl/interfaces/VX_writeback_if.sv +++ 
b/hw/rtl/interfaces/VX_writeback_if.sv @@ -6,6 +6,7 @@ interface VX_writeback_if (); wire valid; + wire [63:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; @@ -16,6 +17,7 @@ interface VX_writeback_if (); modport master ( output valid, + output uuid, output tmask, output wid, output PC, @@ -27,6 +29,7 @@ interface VX_writeback_if (); modport slave ( input valid, + input uuid, input tmask, input wid, input PC, diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index 2788c315..9e96eedb 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -125,7 +125,7 @@ module VX_axi_adapter #( // AXI write response channel `UNUSED_VAR (m_axi_bid); - `RUNTIME_ASSERT(~m_axi_bvalid || m_axi_bresp == 0, ("AXI response error")); + `RUNTIME_ASSERT(~m_axi_bvalid || m_axi_bresp == 0, ("%t: *** AXI response error", $time)); assign m_axi_bready = 1'b1; // AXI read request channel @@ -144,7 +144,7 @@ module VX_axi_adapter #( assign mem_rsp_valid = m_axi_rvalid; assign mem_rsp_tag = m_axi_rid; assign mem_rsp_data = m_axi_rdata; - `RUNTIME_ASSERT(~m_axi_rvalid || m_axi_rresp == 0, ("AXI response error")); + `RUNTIME_ASSERT(~m_axi_rvalid || m_axi_rresp == 0, ("%t: *** AXI response error", $time)); `UNUSED_VAR (m_axi_rlast); assign m_axi_rready = mem_rsp_ready; diff --git a/hw/rtl/libs/VX_index_queue.sv b/hw/rtl/libs/VX_index_queue.sv index 66307d74..201287fb 100644 --- a/hw/rtl/libs/VX_index_queue.sv +++ b/hw/rtl/libs/VX_index_queue.sv @@ -32,7 +32,7 @@ module VX_index_queue #( assign enqueue = push; assign dequeue = !empty && !valid[rd_a]; // auto-remove when head is invalid - `RUNTIME_ASSERT(!push || !full, ("invalid inputs")); + `RUNTIME_ASSERT(!push || !full, ("%t: *** invalid inputs", $time)); always @(posedge clk) begin if (reset) begin diff --git a/hw/rtl/libs/VX_popcount.sv b/hw/rtl/libs/VX_popcount.sv index 3144f106..8c8b08d3 100644 --- a/hw/rtl/libs/VX_popcount.sv +++ b/hw/rtl/libs/VX_popcount.sv @@ 
-4,12 +4,17 @@ module VX_popcount #( parameter MODEL = 1, parameter N = 1, - parameter LOGN = $clog2(N), - parameter M = LOGN+1 + parameter M = $clog2(N+1) ) ( input wire [N-1:0] in_i, output wire [M-1:0] cnt_o ); +`ifndef SYNTHESIS + assign cnt_o = $countones(in_i); +`else +`ifdef QUARTUS + assign cnt_o = $countones(in_i); +`else if (N == 1) begin assign cnt_o = in_i; @@ -53,6 +58,8 @@ module VX_popcount #( assign cnt_o = cnt_r; end +`endif +`endif endmodule `TRACING_ON \ No newline at end of file diff --git a/hw/rtl/libs/VX_skid_buffer.sv b/hw/rtl/libs/VX_skid_buffer.sv index ba6c8b6c..c6820f75 100644 --- a/hw/rtl/libs/VX_skid_buffer.sv +++ b/hw/rtl/libs/VX_skid_buffer.sv @@ -30,7 +30,7 @@ module VX_skid_buffer #( end else if (NOBACKPRESSURE) begin - `RUNTIME_ASSERT(ready_out, ("ready_out should always be asserted")) + `RUNTIME_ASSERT(ready_out, ("%t: *** ready_out should always be asserted", $time)) wire stall = valid_out && ~ready_out; diff --git a/hw/rtl/tex_unit/VX_tex_unit.sv b/hw/rtl/tex_unit/VX_tex_unit.sv index 38f93eb2..c9510827 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.sv +++ b/hw/rtl/tex_unit/VX_tex_unit.sv @@ -6,6 +6,11 @@ module VX_tex_unit #( input wire clk, input wire reset, + // PERF +`ifdef PERF_ENABLE + VX_perf_tex_if.master perf_tex_if, +`endif + // Texture unit <-> Memory Unit VX_dcache_req_if.master dcache_req_if, VX_dcache_rsp_if.slave dcache_rsp_if, @@ -18,10 +23,11 @@ module VX_tex_unit #( VX_tex_rsp_if.master tex_rsp_if ); - localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32; + localparam REQ_INFOW_S = 64 + `NR_BITS + 1 + `NW_BITS + 32; localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S; localparam REQ_INFOW_M = (2 * `NUM_THREADS * `TEX_BLEND_FRAC) + REQ_INFOW_A; + reg [$clog2(`NUM_TEX_UNITS)-1:0] csr_tex_unit; reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][`TEX_LOD_MAX+1-1:0]; reg [1:0][`TEX_LOD_BITS-1:0] tex_logdims [`NUM_TEX_UNITS-1:0]; reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0]; @@ -29,57 +35,60 @@ 
module VX_tex_unit #( reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0]; reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0]; - // CSRs programming + // CSRs programming - reg [`NUM_TEX_UNITS-1:0] csrs_dirty; + reg csrs_dirty [`NUM_TEX_UNITS-1:0]; `UNUSED_VAR (csrs_dirty) - for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin - always @(posedge clk) begin - if (tex_csr_if.write_enable) begin - case (tex_csr_if.write_addr) - `CSR_TEX(i, `TEX_STATE_ADDR) : begin - tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_FORMAT) : begin - tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_WRAPU) : begin - tex_wraps[i][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_WRAPV) : begin - tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_FILTER) : begin - tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_WIDTH) : begin - tex_logdims[i][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; - csrs_dirty[i] <= 1; - end - `CSR_TEX(i, `TEX_STATE_HEIGHT) : begin - tex_logdims[i][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; - csrs_dirty[i] <= 1; - end - default: begin - for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin - `IGNORE_WARNINGS_BEGIN - if (tex_csr_if.write_addr == `CSR_ADDR_BITS'(`CSR_TEX(i, `TEX_STATE_MIPOFF(j)))) begin - `IGNORE_WARNINGS_END - tex_mipoff[i][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; - csrs_dirty[i] <= 1; - end + always @(posedge clk) begin + if (tex_csr_if.write_enable) begin + case (tex_csr_if.write_addr) + `CSR_TEX_UNIT: begin + csr_tex_unit <= tex_csr_if.write_data[$clog2(`NUM_TEX_UNITS)-1:0]; + end + `CSR_TEX_ADDR: begin + tex_baddr[csr_tex_unit] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + 
`CSR_TEX_FORMAT: begin + tex_format[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + `CSR_TEX_WRAPU: begin + tex_wraps[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + `CSR_TEX_WRAPV: begin + tex_wraps[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + `CSR_TEX_FILTER: begin + tex_filter[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + `CSR_TEX_WIDTH: begin + tex_logdims[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + `CSR_TEX_HEIGHT: begin + tex_logdims[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; + end + default: begin + for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin + `IGNORE_WARNINGS_BEGIN + if (tex_csr_if.write_addr == `CSR_TEX_MIPOFF(j)) begin + `IGNORE_WARNINGS_END + tex_mipoff[csr_tex_unit][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; + csrs_dirty[csr_tex_unit] <= 1; end end - endcase - end - if (reset || (tex_req_if.valid && tex_req_if.ready)) begin - csrs_dirty[i] <= '0; + end + endcase + end + if (reset || (tex_req_if.valid && tex_req_if.ready)) begin + for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin + csrs_dirty[i] <= 0; end end end @@ -125,7 +134,7 @@ module VX_tex_unit #( .req_baseaddr(tex_baddr[tex_req_if.unit]), .req_mipoff (sel_mipoff), .req_logdims(sel_logdims), - .req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), + .req_info ({tex_format[tex_req_if.unit], tex_req_if.uuid, tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), .req_ready (tex_req_if.ready), .rsp_valid (mem_req_valid), @@ -204,9 +213,47 @@ module VX_tex_unit #( .rsp_valid (tex_rsp_if.valid), .rsp_tmask (tex_rsp_if.tmask), .rsp_data (tex_rsp_if.data), - .rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, 
tex_rsp_if.wid, tex_rsp_if.PC}), + .rsp_info ({tex_rsp_if.uuid, tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}), .rsp_ready (tex_rsp_if.ready) - ); + ); + +`ifdef PERF_ENABLE + wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_req_per_cycle; + wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_rsp_per_cycle; + + wire [`NUM_THREADS-1:0] perf_mem_req_per_mask = dcache_req_if.valid & dcache_req_if.ready; + wire [`NUM_THREADS-1:0] perf_mem_rsp_per_mask = dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_if.valid & dcache_rsp_if.ready}}; + + `POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_per_mask); + `POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_per_mask); + + reg [`PERF_CTR_BITS-1:0] perf_pending_reads; + wire [$clog2(`NUM_THREADS+1)+1-1:0] perf_pending_reads_cycle = perf_mem_req_per_cycle - perf_mem_rsp_per_cycle; + + always @(posedge clk) begin + if (reset) begin + perf_pending_reads <= 0; + end else begin + perf_pending_reads <= perf_pending_reads + `PERF_CTR_BITS'($signed(perf_pending_reads_cycle)); + end + end + + reg [`PERF_CTR_BITS-1:0] perf_mem_reads; + reg [`PERF_CTR_BITS-1:0] perf_mem_latency; + + always @(posedge clk) begin + if (reset) begin + perf_mem_reads <= 0; + perf_mem_latency <= 0; + end else begin + perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(perf_mem_req_per_cycle); + perf_mem_latency <= perf_mem_latency + `PERF_CTR_BITS'(perf_pending_reads); + end + end + + assign perf_tex_if.mem_reads = perf_mem_reads; + assign perf_tex_if.mem_latency = perf_mem_latency; +`endif `ifdef DBG_TRACE_TEX always @(posedge clk) begin diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json index 2c9f8355..c9c49ebe 100644 --- a/hw/scripts/scope.json +++ b/hw/scripts/scope.json @@ -123,9 +123,9 @@ "!cci_pending_writes_full": 1, "?afu_mem_req_fire": 1, "afu_mem_req_addr": 26, - "afu_mem_req_tag": 27, + "afu_mem_req_tag": "`VX_MEM_TAG_WIDTH+1", "?afu_mem_rsp_fire": 1, - "afu_mem_rsp_tag": 27 + "afu_mem_rsp_tag": "`VX_MEM_TAG_WIDTH+1" }, "afu/vortex": { "!reset": 1, 
@@ -140,49 +140,29 @@ "mem_rsp_tag":"`VX_MEM_TAG_WIDTH", "busy": 1 }, - "afu/vortex/cluster/core/pipeline/fetch/icache_stage": { - "?icache_req_fire": 1, - "icache_req_wid":"`NW_BITS", - "icache_req_addr": 32, - "icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS", - "?icache_rsp_fire": 1, - "icache_rsp_data": 32, - "icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS" - }, "afu/vortex/cluster/core/pipeline/fetch/warp_sched": { "?wsched_scheduled": 1, + "wsched_schedule_uuid": 64, "wsched_active_warps": "`NUM_WARPS", "wsched_stalled_warps": "`NUM_WARPS", "wsched_schedule_tmask": "`NUM_THREADS", "wsched_schedule_wid": "`NW_BITS", - "wsched_schedule_pc": "32" + "wsched_schedule_pc": 32 }, - "afu/vortex/cluster/core/pipeline/execute/gpu_unit": { - "?gpu_rsp_valid": 1, - "gpu_rsp_wid": "`NW_BITS", - "gpu_rsp_tmc": 1, - "gpu_rsp_wspawn": 1, - "gpu_rsp_split": 1, - "gpu_rsp_barrier": 1 - }, - "afu/vortex/cluster/core/pipeline/execute/lsu_unit": { - "?dcache_req_fire":"`NUM_THREADS", - "dcache_req_wid":"`NW_BITS", - "dcache_req_pc": 32, - "dcache_req_addr":"`NUM_THREADS * 32", - "dcache_req_rw": 1, - "dcache_req_byteen":"`NUM_THREADS * 4", - "dcache_req_data": "`NUM_THREADS * 32", - "dcache_req_tag":"`LSUQ_ADDR_BITS", - "?dcache_rsp_fire":"`NUM_THREADS", - "dcache_rsp_data":"`NUM_THREADS * 32", - "dcache_rsp_tag":"`LSUQ_ADDR_BITS" + "afu/vortex/cluster/core/pipeline/fetch/icache_stage": { + "?icache_req_fire": 1, + "icache_req_uuid": 64, + "icache_req_addr": 32, + "icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS", + "?icache_rsp_fire": 1, + "icache_rsp_uuid": 64, + "icache_rsp_data": 32, + "icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS" }, "afu/vortex/cluster/core/pipeline/issue": { "?issue_fire": 1, - "issue_wid":"`NW_BITS", - "issue_tmask":"`NUM_THREADS", - "issue_pc": 32, + "issue_uuid": 64, + "issue_tmask":"`NUM_THREADS", "issue_ex_type":"`EX_BITS", "issue_op_type":"`INST_OP_BITS", "issue_op_mod":"`INST_MOD_BITS", @@ -198,15 +178,35 @@ "gpr_rs2":"`NUM_THREADS * 32", "gpr_rs3":"`NUM_THREADS * 32", 
"?writeback_valid": 1, - "writeback_wid":"`NW_BITS", - "writeback_pc": 32, + "writeback_uuid": 64, "writeback_tmask":"`NUM_THREADS", "writeback_rd":"`NR_BITS", "writeback_data":"`NUM_THREADS * 32", "writeback_eop": 1, "!scoreboard_delay": 1, "!dispatch_delay": 1 - }, + }, + "afu/vortex/cluster/core/pipeline/execute/lsu_unit": { + "?dcache_req_fire":"`NUM_THREADS", + "dcache_req_uuid": 64, + "dcache_req_addr":"`NUM_THREADS * 32", + "dcache_req_rw": 1, + "dcache_req_byteen":"`NUM_THREADS * 4", + "dcache_req_data":"`NUM_THREADS * 32", + "dcache_req_tag":"`LSUQ_ADDR_BITS", + "?dcache_rsp_fire":"`NUM_THREADS", + "dcache_rsp_uuid": 64, + "dcache_rsp_data":"`NUM_THREADS * 32", + "dcache_rsp_tag":"`LSUQ_ADDR_BITS" + }, + "afu/vortex/cluster/core/pipeline/execute/gpu_unit": { + "?gpu_rsp_valid": 1, + "gpu_rsp_uuid": 64, + "gpu_rsp_tmc": 1, + "gpu_rsp_wspawn": 1, + "gpu_rsp_split": 1, + "gpu_rsp_barrier": 1 + }, "afu/vortex/l3cache/bank, afu/vortex/cluster/l2cache/bank, afu/vortex/cluster/core/mem_unit/dcache/bank, afu/vortex/cluster/core/mem_unit/icache/bank": { "?valid_st0": 1, "?valid_st1": 1, diff --git a/runtime/src/vx_start.S b/runtime/src/vx_start.S index 0d2a0078..16e91a15 100644 --- a/runtime/src/vx_start.S +++ b/runtime/src/vx_start.S @@ -42,15 +42,9 @@ _start: .type _exit, @function .global _exit _exit: - beqz a0, label_exit_next - mv gp, a0 - ecall; - -label_exit_next: - # dump performance CSRs - call vx_perf_dump - - # disable all threads in current warp + mv s0, a0 + call vx_perf_dump + mv gp, s0 li a0, 0 .insn s 0x6b, 0, x0, 0(a0) # tmc a0 diff --git a/sim/common/mempool.h b/sim/common/mempool.h new file mode 100644 index 00000000..a5c0429d --- /dev/null +++ b/sim/common/mempool.h @@ -0,0 +1,47 @@ +#pragma once + +#include + +template +class MemoryPool { +public: + MemoryPool(uint32_t max_size) : max_size_(max_size) {} + + MemoryPool(MemoryPool && other) + : free_list_(std::move(other.free_list_)) + {} + + ~MemoryPool() { + this->flush(); + } + + void* 
allocate() { + void* mem; + if (!free_list_.empty()) { + mem = static_cast(free_list_.top()); + free_list_.pop(); + } else { + mem = ::operator new(sizeof(T)); + } + return mem; + } + + void deallocate(void * object) { + if (free_list_.size() < max_size_) { + free_list_.push(static_cast(object)); + } else { + ::operator delete(object); + } + } + + void flush() { + while (!free_list_.empty()) { + ::operator delete(free_list_.top()); + free_list_.pop(); + } + } + +private: + std::stack free_list_; + uint32_t max_size_; +}; \ No newline at end of file diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 369a3503..3a5ab2b6 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -7,6 +7,7 @@ #include #include #include +#include "mempool.h" class SimObjectBase; @@ -20,37 +21,14 @@ public: return module_; } - SimPortBase* peer() const { - return peer_; - } - - bool connected() const { - return (peer_ != nullptr); - } - protected: SimPortBase(SimObjectBase* module) : module_(module) - , peer_(nullptr) {} - void connect(SimPortBase* peer) { - assert(peer_ == nullptr); - peer_ = peer; - } - - void disconnect() { - assert(peer_ == nullptr); - peer_ = nullptr; - } - SimPortBase& operator=(const SimPortBase&) = delete; SimObjectBase* module_; - SimPortBase* peer_; - - template friend class SlavePort; - template friend class MasterPort; }; /////////////////////////////////////////////////////////////////////////////// @@ -58,72 +36,92 @@ protected: template class SimPort : public SimPortBase { public: - void send(const Pkt& pkt, uint64_t delay) const; + typedef std::function TxCallback; + + SimPort(SimObjectBase* module) + : SimPortBase(module) + , peer_(nullptr) + , tx_cb_(nullptr) + {} + + void send(const Pkt& pkt, uint64_t delay = 1) const; void bind(SimPort* peer) { - this->connect(peer); + assert(peer_ == nullptr); + peer_ = peer; } void unbind() { - this->disconnect(); + assert(peer_ == nullptr); + peer_ = nullptr; + } + + bool connected() const { 
+ return (peer_ != nullptr); + } + + SimPort* peer() const { + return peer_; } bool empty() const { return queue_.empty(); } - const Pkt& top() const { + const Pkt& front() const { return queue_.front(); } - Pkt& top() { - return queue_.front(); + Pkt& front() { + return queue_.front().pkt; } - void pop() { + const Pkt& back() const { + return queue_.back(); + } + + Pkt& back() { + return queue_.back().pkt; + } + + uint64_t pop() { + auto cycle = queue_.front().cycle; queue_.pop(); - } + return cycle; + } + + void tx_callback(const TxCallback& callback) { + tx_cb_ = callback; + } protected: - SimPort(SimObjectBase* module) - : SimPortBase(module) - {} + struct timed_pkt_t { + Pkt pkt; + uint64_t cycle; + }; - void push(const Pkt& data) { - queue_.push(data); + std::queue queue_; + SimPort* peer_; + TxCallback tx_cb_; + + void push(const Pkt& data, uint64_t cycle) { + if (tx_cb_) { + tx_cb_(data, cycle); + } + if (peer_) { + peer_->push(data, cycle); + } else { + queue_.push({data, cycle}); + } } SimPort& operator=(const SimPort&) = delete; - std::queue queue_; - template friend class SimPortEvent; }; /////////////////////////////////////////////////////////////////////////////// -template -class SlavePort : public SimPort { -public: - SlavePort(SimObjectBase* module) : SimPort(module) {} - -protected: - SlavePort& operator=(const SlavePort&) = delete; -}; - -/////////////////////////////////////////////////////////////////////////////// - -template -class MasterPort : public SimPort { -public: - MasterPort(SimObjectBase* module) : SimPort(module) {} - -protected: - MasterPort& operator=(const MasterPort&) = delete; -}; - -/////////////////////////////////////////////////////////////////////////////// - class SimEventBase { public: typedef std::shared_ptr Ptr; @@ -132,14 +130,14 @@ public: virtual void fire() const = 0; - bool step() { - return (0 == --delay_); + uint64_t time() const { + return time_; } protected: - SimEventBase(uint64_t delay) : delay_(delay) {} + 
SimEventBase(uint64_t time) : time_(time) {} - uint64_t delay_; + uint64_t time_; }; /////////////////////////////////////////////////////////////////////////////// @@ -147,26 +145,34 @@ protected: template class SimCallEvent : public SimEventBase { public: - typedef std::function Func; - - template - static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) { - return std::make_shared(func, pkt, delay); - } - - SimCallEvent(const Func& func, const Pkt& pkt, uint64_t delay) - : SimEventBase(delay) - , func_(func) - , pkt_(pkt) - {} - void fire() const override { func_(pkt_); } -protected: + typedef std::function Func; + + SimCallEvent(const Func& func, const Pkt& pkt, uint64_t time) + : SimEventBase(time) + , func_(func) + , pkt_(pkt) + {} + + void* operator new(size_t /*size*/) { + return allocator().allocate(); + } + + void operator delete(void* ptr) { + allocator().deallocate(ptr); + } + +protected: Func func_; - Pkt pkt_; + Pkt pkt_; + + static MemoryPool>& allocator() { + static MemoryPool> instance(64); + return instance; + } }; /////////////////////////////////////////////////////////////////////////////// @@ -174,23 +180,32 @@ protected: template class SimPortEvent : public SimEventBase { public: - static Ptr Create(const SimPort* port, const Pkt& pkt, uint64_t delay) { - return std::make_shared(port, pkt, delay); + void fire() const override { + const_cast*>(port_)->push(pkt_, time_); } - SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t delay) - : SimEventBase(delay) + SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t time) + : SimEventBase(time) , port_(port) , pkt_(pkt) {} - - void fire() const override { - const_cast*>(port_)->push(pkt_); + + void* operator new(size_t /*size*/) { + return allocator().allocate(); } -private: + void operator delete(void* ptr) { + allocator().deallocate(ptr); + } + +protected: const SimPort* port_; Pkt pkt_; + + static MemoryPool>& allocator() { + static MemoryPool> instance(64); + return 
instance; + } }; /////////////////////////////////////////////////////////////////////////////// @@ -203,24 +218,17 @@ public: virtual ~SimObjectBase() {} - template - void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay); - const std::string& name() const { return name_; } -protected: - virtual void step(uint64_t cycle) = 0; - SimObjectBase(const SimContext& ctx, const char* name); +protected: + + SimObjectBase(const SimContext& ctx, const char* name); -private: std::string name_; - - friend class SimPlatform; - friend class SimPortBase; }; /////////////////////////////////////////////////////////////////////////////// @@ -228,14 +236,16 @@ private: template class SimObject : public SimObjectBase { public: - typedef std::shared_ptr Ptr; + typedef std::shared_ptr Ptr; template static Ptr Create(Args&&... args); protected: - SimObject(const SimContext& ctx, const char* name) : SimObjectBase(ctx, name) {} + SimObject(const SimContext& ctx, const char* name) + : SimObjectBase(ctx, name) + {} void step(uint64_t cycle) override { this->impl().step(cycle); @@ -255,8 +265,8 @@ private: class SimContext { private: SimContext() {} - template template - friend typename SimObject::Ptr SimObject::Create(Args&&... args); + + friend class SimPlatform; }; /////////////////////////////////////////////////////////////////////////////// @@ -281,25 +291,19 @@ public: instance().clear(); } - void register_object(const SimObjectBase::Ptr& obj) { + template + typename SimObject::Ptr CreateObject(Args&&... 
args) { + auto obj = std::make_shared(SimContext{}, std::forward(args)...); objects_.push_back(obj); + return obj; } template - void schedule(const typename SimCallEvent::Func& callback, + void schedule(const typename SimCallEvent::Func& callback, const Pkt& pkt, uint64_t delay) { - auto evt = SimCallEvent::Create(callback, pkt, delay); - assert(delay != 0); - events_.emplace_back(evt); - } - - template - void schedule(const SimPort* port, - const Pkt& pkt, - uint64_t delay) { - auto evt = SimPortEvent::Create(port, pkt, delay); assert(delay != 0); + auto evt = std::make_shared>(callback, pkt, cycles_ + delay); events_.emplace_back(evt); } @@ -309,7 +313,7 @@ public: auto evt_it_end = events_.end(); while (evt_it != evt_it_end) { auto& event = *evt_it; - if (event->step()) { + if (cycles_ >= event->time()) { event->fire(); evt_it = events_.erase(evt_it); } else { @@ -341,9 +345,19 @@ private: events_.clear(); } + template + void schedule(const SimPort* port, const Pkt& pkt, uint64_t delay) { + assert(delay != 0); + auto evt = SimEventBase::Ptr(new SimPortEvent(port, pkt, cycles_ + delay)); + events_.emplace_back(evt); + } + std::vector objects_; std::list events_; uint64_t cycles_; + + template friend class SimPort; + friend class SimObjectBase; }; /////////////////////////////////////////////////////////////////////////////// @@ -355,22 +369,14 @@ inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) template template typename SimObject::Ptr SimObject::Create(Args&&... 
args) { - auto obj = std::make_shared(SimContext{}, std::forward(args)...); - SimPlatform::instance().register_object(obj); - return obj; + return SimPlatform::instance().CreateObject(std::forward(args)...); } template void SimPort::send(const Pkt& pkt, uint64_t delay) const { - if (peer_) { + if (peer_ && !tx_cb_) { reinterpret_cast*>(peer_)->send(pkt, delay); } else { SimPlatform::instance().schedule(this, pkt, delay); } -} - -template -void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) { - auto callback = std::bind(entry, obj, std::placeholders::_1); - SimPlatform::instance().schedule(callback, pkt, delay); } \ No newline at end of file diff --git a/sim/simX/archdef.h b/sim/simX/archdef.h index c6728831..c2a28f78 100644 --- a/sim/simX/archdef.h +++ b/sim/simX/archdef.h @@ -11,20 +11,20 @@ namespace vortex { class ArchDef { private: - int num_cores_; - int num_warps_; - int num_threads_; - int wsize_; - int vsize_; - int num_regs_; - int num_csrs_; - int num_barriers_; + uint16_t num_cores_; + uint16_t num_warps_; + uint16_t num_threads_; + uint16_t wsize_; + uint16_t vsize_; + uint16_t num_regs_; + uint16_t num_csrs_; + uint16_t num_barriers_; public: ArchDef(const std::string& /*arch*/, - int num_cores, - int num_warps, - int num_threads) + uint16_t num_cores, + uint16_t num_warps, + uint16_t num_threads) : num_cores_(num_cores) , num_warps_(num_warps) , num_threads_(num_threads) @@ -35,35 +35,35 @@ public: , num_barriers_(NUM_BARRIERS) {} - int wsize() const { + uint16_t wsize() const { return wsize_; } - int vsize() const { + uint16_t vsize() const { return vsize_; } - int num_regs() const { + uint16_t num_regs() const { return num_regs_; } - int num_csrs() const { + uint16_t num_csrs() const { return num_csrs_; } - int num_barriers() const { + uint16_t num_barriers() const { return num_barriers_; } - int num_threads() const { + uint16_t num_threads() const { return num_threads_; } - int num_warps() const { + 
uint16_t num_warps() const { return num_warps_; } - int num_cores() const { + uint16_t num_cores() const { return num_cores_; } }; diff --git a/sim/simX/args.h b/sim/simX/args.h index aeaba4e5..fd7de5bc 100644 --- a/sim/simX/args.h +++ b/sim/simX/args.h @@ -35,7 +35,7 @@ public: CommandLineArg(l, ht), arg_(x) {} int read(int argc, char **argv) { - __unused(argc); + __unused (argc); std::istringstream iss(argv[1]); iss >> arg_; return 1; @@ -53,7 +53,7 @@ public: CommandLineArg(l, ht), arg_(x) { arg_ = false; } int read(int argc, char **argv) { - __unused(argc, argv); + __unused (argc, argv); arg_ = true; return 0; } diff --git a/sim/simX/cache.cpp b/sim/simX/cache.cpp index da69cf3a..36da1b27 100644 --- a/sim/simX/cache.cpp +++ b/sim/simX/cache.cpp @@ -27,7 +27,7 @@ struct params_t { uint32_t tag_select_addr_start; uint32_t tag_select_addr_end; - params_t(const CacheConfig& config) { + params_t(const Cache::Config& config) { uint32_t bank_bits = log2ceil(config.num_banks); uint32_t offset_bits = config.B - config.W; uint32_t log2_bank_size = config.C - bank_bits; @@ -214,7 +214,7 @@ struct bank_t { std::vector sets; MSHR mshr; - bank_t(const CacheConfig& config, + bank_t(const Cache::Config& config, const params_t& params) : sets(params.sets_per_bank, params.blocks_per_set) , mshr(config.mshr_size) @@ -226,22 +226,30 @@ struct bank_t { class Cache::Impl { private: Cache* const simobject_; - CacheConfig config_; + Config config_; params_t params_; std::vector banks_; Switch::Ptr mem_switch_; Switch::Ptr bypass_switch_; - std::vector> mem_req_ports_; - std::vector> mem_rsp_ports_; + std::vector> mem_req_ports_; + std::vector> mem_rsp_ports_; + PerfStats perf_stats_; + uint64_t pending_read_reqs_; + uint64_t pending_write_reqs_; + uint64_t pending_fill_reqs_; + uint32_t flush_cycles_; public: - Impl(Cache* simobject, const CacheConfig& config) + Impl(Cache* simobject, const Config& config) : simobject_(simobject) , config_(config) , params_(config) , 
banks_(config.num_banks, {config, params_}) , mem_req_ports_(config.num_banks, simobject) , mem_rsp_ports_(config.num_banks, simobject) + , pending_read_reqs_(0) + , pending_write_reqs_(0) + , pending_fill_reqs_(0) { bypass_switch_ = Switch::Create("bypass_arb", ArbiterType::Priority, 2); bypass_switch_->ReqOut.bind(&simobject->MemReqPort); @@ -259,13 +267,29 @@ public: mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0)); } + + // calculate tag flush cycles + flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set; + } + + const PerfStats& perf_stats() const { + return perf_stats_; } - void step(uint64_t /*cycle*/) { + void step(uint64_t cycle) { + // wait on flush cycles + if (flush_cycles_ != 0) { + --flush_cycles_; + return; + } + + // calculate memory latency + perf_stats_.mem_latency += pending_fill_reqs_; + // handle bypasss responses auto& bypass_port = bypass_switch_->RspOut.at(1); if (!bypass_port.empty()) { - auto& mem_rsp = bypass_port.top(); + auto& mem_rsp = bypass_port.front(); uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1); uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; MemRsp core_rsp(tag); @@ -287,7 +311,7 @@ public: for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { auto& mem_rsp_port = mem_rsp_ports_.at(bank_id); if (!mem_rsp_port.empty()) { - auto& mem_rsp = mem_rsp_port.top(); + auto& mem_rsp = mem_rsp_port.front(); this->processMemoryFill(bank_id, mem_rsp.tag); pending_fill_req.at(bank_id) = true; mem_rsp_port.pop(); @@ -300,7 +324,7 @@ public: if (core_req_port.empty()) continue; - auto& core_req = core_req_port.top(); + auto& core_req = core_req_port.front(); // check cache bypassing if (core_req.is_io) { @@ -345,7 +369,7 @@ public: // check MSHR capacity if read or writeback if ((!core_req.write || !config_.write_through) && bank.mshr.full()) { - // stall + ++perf_stats_.mshr_stalls; continue; } @@ -356,7 +380,7 @@ 
public: || pipeline_req.set_id != set_id || pipeline_req.tag != tag || pipeline_req.infos[port_id].valid) { - // stall + ++perf_stats_.bank_stalls; continue; } // update pending request infos @@ -365,8 +389,15 @@ public: // schedule new request pipeline_req = bank_req; } + + if (core_req.write) + ++perf_stats_.writes; + else + ++perf_stats_.reads; + // remove request - core_req_port.pop(); + auto time = core_req_port.pop(); + perf_stats_.pipeline_stalls += (cycle - time); } // process active request @@ -393,6 +424,7 @@ public: auto& block = set.blocks.at(entry.block_id); block.valid = true; block.tag = entry.tag; + --pending_fill_reqs_; } void processBankRequest(const std::vector& pipeline_reqs) { @@ -438,7 +470,7 @@ public: if (hit) { // - // MISS handling + // Hit handling // if (pipeline_req.write) { // handle write hit @@ -462,8 +494,13 @@ public: } } else { // - // MISS handling - // + // Miss handling + // + if (pipeline_req.write) + ++perf_stats_.write_misses; + else + ++perf_stats_.read_misses; + if (!found_free_block && !config_.write_through) { // write back dirty block auto& repl_block = set.blocks.at(repl_block_id); @@ -472,6 +509,7 @@ public: mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag); mem_req.write = true; mem_req_ports_.at(bank_id).send(mem_req, 1); + ++perf_stats_.evictions; } } @@ -500,9 +538,10 @@ public: if (pending == -1) { MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); - mem_req.write = pipeline_req.write; + mem_req.write = false; mem_req.tag = mshr_id; mem_req_ports_.at(bank_id).send(mem_req, 1); + ++pending_fill_reqs_; } } } @@ -513,7 +552,7 @@ public: /////////////////////////////////////////////////////////////////////////////// -Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config) +Cache::Cache(const SimContext& ctx, const char* name, const Config& config) : SimObject(ctx, name) , CoreReqPorts(config.num_inputs, this) , 
CoreRspPorts(config.num_inputs, this) @@ -528,4 +567,8 @@ Cache::~Cache() { void Cache::step(uint64_t cycle) { impl_->step(cycle); +} + +const Cache::PerfStats& Cache::perf_stats() const { + return impl_->perf_stats(); } \ No newline at end of file diff --git a/sim/simX/cache.h b/sim/simX/cache.h index 0be8cf6e..8f4b3932 100644 --- a/sim/simX/cache.h +++ b/sim/simX/cache.h @@ -5,33 +5,58 @@ namespace vortex { -struct CacheConfig { - uint8_t C; // log2 cache size - uint8_t B; // log2 block size - uint8_t W; // log2 word size - uint8_t A; // log2 associativity - uint8_t addr_width; // word address bits - uint8_t num_banks; // number of banks - uint8_t ports_per_bank; // number of ports per bank - uint8_t num_inputs; // number of inputs - bool write_through; // is write-through - bool write_reponse; // enable write response - uint16_t victim_size; // victim cache size - uint16_t mshr_size; // MSHR buffer size - uint8_t latency; // pipeline latency -}; - -class Cache : public SimObject { +class Cache : public SimObject { public: - Cache(const SimContext& ctx, const char* name, const CacheConfig& config); + struct Config { + uint8_t C; // log2 cache size + uint8_t B; // log2 block size + uint8_t W; // log2 word size + uint8_t A; // log2 associativity + uint8_t addr_width; // word address bits + uint8_t num_banks; // number of banks + uint8_t ports_per_bank; // number of ports per bank + uint8_t num_inputs; // number of inputs + bool write_through; // is write-through + bool write_reponse; // enable write response + uint16_t victim_size; // victim cache size + uint16_t mshr_size; // MSHR buffer size + uint8_t latency; // pipeline latency + }; + struct PerfStats { + uint64_t reads; + uint64_t writes; + uint64_t read_misses; + uint64_t write_misses; + uint64_t evictions; + uint64_t pipeline_stalls; + uint64_t bank_stalls; + uint64_t mshr_stalls; + uint64_t mem_latency; + + PerfStats() + : reads(0) + , writes(0) + , read_misses(0) + , write_misses(0) + , evictions(0) + , 
pipeline_stalls(0) + , bank_stalls(0) + , mshr_stalls(0) + , mem_latency(0) + {} + }; + + std::vector> CoreReqPorts; + std::vector> CoreRspPorts; + SimPort MemReqPort; + SimPort MemRspPort; + + Cache(const SimContext& ctx, const char* name, const Config& config); ~Cache(); void step(uint64_t cycle); - std::vector> CoreReqPorts; - std::vector> CoreRspPorts; - MasterPort MemReqPort; - SlavePort MemRspPort; + const PerfStats& perf_stats() const; private: class Impl; diff --git a/sim/simX/constants.h b/sim/simX/constants.h index 218fa5f9..b173a03f 100644 --- a/sim/simX/constants.h +++ b/sim/simX/constants.h @@ -3,14 +3,14 @@ #include "types.h" #ifndef MEM_LATENCY -#define MEM_LATENCY 18 +#define MEM_LATENCY 24 #endif namespace vortex { -struct Constants { +enum Constants { -static constexpr uint32_t SMEM_DELAY = 1 + SM_ENABLE; + SMEM_BANK_OFFSET = log2ceil(sizeof(Word)) + log2ceil(STACK_SIZE / sizeof(Word)), }; diff --git a/sim/simX/core.cpp b/sim/simX/core.cpp index 19b20967..7c6cbffa 100644 --- a/sim/simX/core.cpp +++ b/sim/simX/core.cpp @@ -9,16 +9,18 @@ #include "decode.h" #include "core.h" #include "debug.h" +#include "constants.h" using namespace vortex; Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) : SimObject(ctx, "Core") + , MemRspPort(this) + , MemReqPort(this) , id_(id) , arch_(arch) , decoder_(arch) , mmu_(0, arch.wsize(), true) - , shared_mem_(4096) , tex_units_(NUM_TEX_UNITS, this) , warps_(arch.num_warps()) , barriers_(arch.num_barriers(), 0) @@ -27,7 +29,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , ibuffers_(arch.num_warps(), IBUF_SIZE) , scoreboard_(arch_) , exe_units_((int)ExeType::MAX) - , icache_(Cache::Create("Icache", CacheConfig{ + , icache_(Cache::Create("Icache", Cache::Config{ log2ceil(ICACHE_SIZE), // C log2ceil(L1_BLOCK_SIZE),// B 2, // W @@ -42,7 +44,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) NUM_WARPS, // mshr 2, // pipeline latency })) - , 
dcache_(Cache::Create("Dcache", CacheConfig{ + , dcache_(Cache::Create("Dcache", Cache::Config{ log2ceil(DCACHE_SIZE), // C log2ceil(L1_BLOCK_SIZE),// B 2, // W @@ -55,37 +57,41 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) false, // write response 0, // victim size DCACHE_MSHR_SIZE, // mshr - 2, // pipeline latency + 4, // pipeline latency + })) + , shared_mem_(SharedMem::Create("sharedmem", SharedMem::Config{ + arch.num_threads(), + arch.num_threads(), + Constants::SMEM_BANK_OFFSET, + 1, + false })) , l1_mem_switch_(Switch::Create("l1_arb", ArbiterType::Priority, 2)) , dcache_switch_(arch.num_threads()) - , fetch_stage_("fetch") - , decode_stage_("decode") - , issue_stage_("issue") - , execute_stage_("execute") - , commit_stage_("writeback") + , fetch_latch_("fetch") + , decode_latch_("decode") , pending_icache_(arch_.num_warps()) + , active_warps_(1) , stalled_warps_(0) , last_schedule_wid_(0) , issued_instrs_(0) , committed_instrs_(0) + , csr_tex_unit_(0) , ecall_(false) , ebreak_(false) - , stats_insts_(0) - , MemRspPort(this) - , MemReqPort(this) + , perf_mem_pending_reads_(0) { for (int i = 0; i < arch_.num_warps(); ++i) { warps_.at(i) = std::make_shared(this, i); } // register execute units - exe_units_.at((int)ExeType::NOP) = std::make_shared(this); - exe_units_.at((int)ExeType::ALU) = std::make_shared(this); - exe_units_.at((int)ExeType::LSU) = std::make_shared(this); - exe_units_.at((int)ExeType::CSR) = std::make_shared(this); - exe_units_.at((int)ExeType::FPU) = std::make_shared(this); - exe_units_.at((int)ExeType::GPU) = std::make_shared(this); + exe_units_.at((int)ExeType::NOP) = SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::ALU) = SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::CSR) = SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::FPU) = 
SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::GPU) = SimPlatform::instance().CreateObject(this); // connect l1 switch icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]); @@ -109,6 +115,18 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) // activate warp0 warps_.at(0)->setTmask(0, true); + + // memory perf callbacks + MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){ + __unused (cycle); + perf_stats_.mem_reads += !req.write; + perf_stats_.mem_writes += req.write; + perf_mem_pending_reads_ += !req.write; + }); + MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){ + __unused (cycle); + --perf_mem_pending_reads_; + }); } Core::~Core() { @@ -128,23 +146,26 @@ void Core::attach_ram(RAM* ram) { void Core::step(uint64_t cycle) { this->commit(cycle); this->execute(cycle); - this->issue(cycle); this->decode(cycle); this->fetch(cycle); + this->schedule(cycle); + + // update perf counter + perf_stats_.mem_latency += perf_mem_pending_reads_; DPN(2, std::flush); } -void Core::warp_scheduler(uint64_t cycle) { +void Core::schedule(uint64_t cycle) { __unused (cycle); bool foundSchedule = false; int scheduled_warp = last_schedule_wid_; // round robin scheduling - for (size_t wid = 0; wid < warps_.size(); ++wid) { - scheduled_warp = (scheduled_warp + 1) % warps_.size(); - bool warp_active = warps_.at(scheduled_warp)->active(); + for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) { + scheduled_warp = (scheduled_warp + 1) % nw; + bool warp_active = active_warps_.test(scheduled_warp); bool warp_stalled = stalled_warps_.test(scheduled_warp); if (warp_active && !warp_stalled) { last_schedule_wid_ = scheduled_warp; @@ -159,85 +180,91 @@ void Core::warp_scheduler(uint64_t cycle) { // suspend warp until decode stalled_warps_.set(scheduled_warp); - auto& warp = warps_.at(scheduled_warp); - stats_insts_ += warp->getActiveThreads(); - - auto trace = new pipeline_trace_t((issued_instrs_++ * arch_.num_cores()) + id_, 
arch_); + auto& warp = warps_.at(scheduled_warp); + + uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_; + + auto trace = new pipeline_trace_t(uuid, arch_); warp->eval(trace); DT(3, cycle, "pipeline-schedule: " << *trace); // advance to fetch stage - fetch_stage_.push(trace); + fetch_latch_.push(trace); } void Core::fetch(uint64_t cycle) { + __unused (cycle); + // handle icache reponse auto& icache_rsp_port = icache_->CoreRspPorts.at(0); if (!icache_rsp_port.empty()){ - auto& mem_rsp = icache_rsp_port.top(); + auto& mem_rsp = icache_rsp_port.front(); auto trace = pending_icache_.at(mem_rsp.tag); - auto latency = (SimPlatform::instance().cycles() - trace->icache_latency); - trace->icache_latency = latency; - decode_stage_.push(trace); + decode_latch_.push(trace); DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); pending_icache_.release(mem_rsp.tag); icache_rsp_port.pop(); } // send icache request - if (!fetch_stage_.empty()) { - auto trace = fetch_stage_.top(); - trace->icache_latency = SimPlatform::instance().cycles(); + if (!fetch_latch_.empty()) { + auto trace = fetch_latch_.front(); MemReq mem_req; mem_req.addr = trace->PC; mem_req.write = false; mem_req.tag = pending_icache_.allocate(trace); icache_->CoreReqPorts.at(0).send(mem_req, 1); DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); - fetch_stage_.pop(); - } - - // schedule next warp - this->warp_scheduler(cycle); + fetch_latch_.pop(); + } } void Core::decode(uint64_t cycle) { __unused (cycle); - if (decode_stage_.empty()) + if (decode_latch_.empty()) return; - auto trace = decode_stage_.top(); + auto trace = decode_latch_.front(); + + // check ibuffer capacity + auto& ibuffer = ibuffers_.at(trace->wid); + if (ibuffer.full()) { + if (!trace->suspend()) { + DT(3, cycle, "*** ibuffer-stall: " << *trace); + } + ++perf_stats_.ibuf_stalls; + return; + } else { + trace->resume(); + } // 
release warp if (!trace->fetch_stall) { stalled_warps_.reset(trace->wid); } + // update perf counters + uint32_t active_threads = trace->tmask.count(); + if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::LOAD) + perf_stats_.loads += active_threads; + if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::STORE) + perf_stats_.stores += active_threads; + if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH) + perf_stats_.branches += active_threads; + DT(3, cycle, "pipeline-decode: " << *trace); - - // advance to issue stage - issue_stage_.push(trace); - decode_stage_.pop(); + + // insert to ibuffer + ibuffer.push(trace); + + decode_latch_.pop(); } -void Core::issue(uint64_t cycle) { - __unused (cycle); - - if (!issue_stage_.empty()) { - // insert to ibuffer - auto trace = issue_stage_.top(); - auto& ibuffer = ibuffers_.at(trace->wid); - if (!trace->check_stalled(ibuffer.full())) { - DT(3, cycle, "*** ibuffer-stall: " << *trace); - } - if (!ibuffer.full()) { - ibuffer.push(trace); - issue_stage_.pop(); - } - } +void Core::execute(uint64_t cycle) { + __unused (cycle); // issue ibuffer instructions for (auto& ibuffer : ibuffers_) { @@ -247,180 +274,102 @@ void Core::issue(uint64_t cycle) { auto trace = ibuffer.top(); // check scoreboard - if (!trace->check_stalled(scoreboard_.in_use(trace))) { - DTH(3, cycle, "*** scoreboard-stall: dependents={"); - auto uses = scoreboard_.get_uses(trace); - for (uint32_t i = 0, n = uses.size(); i < n; ++i) { - auto& use = uses.at(i); - __unused(use); - if (i) DTN(3, ", "); - DTN(3, use.type << use.reg << "(#" << use.owner << ")"); + if (scoreboard_.in_use(trace)) { + if (!trace->suspend()) { + DTH(3, cycle, "*** scoreboard-stall: dependents={"); + auto uses = scoreboard_.get_uses(trace); + for (uint32_t i = 0, n = uses.size(); i < n; ++i) { + auto& use = uses.at(i); + __unused (use); + if (i) DTN(3, ", "); + DTN(3, use.type << use.reg << "(#" << use.owner << ")"); + } + DTN(3, "}, " << 
*trace << std::endl); } - DTN(3, "}, " << *trace << std::endl); - } - if (scoreboard_.in_use(trace)) + ++perf_stats_.scrb_stalls; continue; - - DT(3, cycle, "pipeline-issue: " << *trace); + } else { + trace->resume(); + } // update scoreboard scoreboard_.reserve(trace); - // advance to execute stage - execute_stage_.push(trace); + DT(3, cycle, "pipeline-issue: " << *trace); + + // push to execute units + auto& exe_unit = exe_units_.at((int)trace->exe_type); + exe_unit->Input.send(trace, 1); ibuffer.pop(); break; } } -void Core::execute(uint64_t cycle) { - // process stage inputs - if (!execute_stage_.empty()) { - auto trace = execute_stage_.top(); - auto& exe_unit = exe_units_.at((int)trace->exe_type); - exe_unit->push(trace); - DT(3, cycle, "pipeline-execute: " << *trace); - execute_stage_.pop(); - } - - // advance execute units - for (auto& exe_unit : exe_units_) { - exe_unit->step(cycle); - } - - // commit completed instructions - for (auto& exe_unit : exe_units_) { - if (!exe_unit->empty()) { - auto trace = exe_unit->top(); - if (trace->fetch_stall) { - stalled_warps_.reset(trace->wid); - } - // advance to commit stage - commit_stage_.push(trace); - exe_unit->pop(); - } - } -} - void Core::commit(uint64_t cycle) { __unused (cycle); - if (commit_stage_.empty()) - return; + // commit completed instructions + bool wb = false; + for (auto& exe_unit : exe_units_) { + if (!exe_unit->Output.empty()) { + auto trace = exe_unit->Output.front(); - auto trace = commit_stage_.top(); + // allow only one commit that updates registers + if (trace->wb && wb) + continue; + wb |= trace->wb; - DT(3, cycle, "pipeline-commit: " << *trace); + // advance to commit stage + DT(3, cycle, "pipeline-commit: " << *trace); - // update scoreboard - scoreboard_.release(trace); + // update scoreboard + scoreboard_.release(trace); - assert(committed_instrs_ <= issued_instrs_); - ++committed_instrs_; + assert(committed_instrs_ <= issued_instrs_); + ++committed_instrs_; - commit_stage_.pop(); + 
perf_stats_.instrs += trace->tmask.count(); - // delete the trace - delete trace; -} + // delete the trace + delete trace; -bool Core::running() const { - bool is_running = (committed_instrs_ != issued_instrs_); - return is_running; -} - -Word Core::get_csr(Addr addr, int tid, int wid) { - if (addr == CSR_FFLAGS) { - return fcsrs_.at(wid) & 0x1F; - } else if (addr == CSR_FRM) { - return (fcsrs_.at(wid) >> 5); - } else if (addr == CSR_FCSR) { - return fcsrs_.at(wid); - } else if (addr == CSR_WTID) { - // Warp threadID - return tid; - } else if (addr == CSR_LTID) { - // Core threadID - return tid + (wid * arch_.num_threads()); - } else if (addr == CSR_GTID) { - // Processor threadID - return tid + (wid * arch_.num_threads()) + - (arch_.num_threads() * arch_.num_warps() * id_); - } else if (addr == CSR_LWID) { - // Core warpID - return wid; - } else if (addr == CSR_GWID) { - // Processor warpID - return wid + (arch_.num_warps() * id_); - } else if (addr == CSR_GCID) { - // Processor coreID - return id_; - } else if (addr == CSR_TMASK) { - // Processor coreID - return warps_.at(wid)->getTmask(); - } else if (addr == CSR_NT) { - // Number of threads per warp - return arch_.num_threads(); - } else if (addr == CSR_NW) { - // Number of warps per core - return arch_.num_warps(); - } else if (addr == CSR_NC) { - // Number of cores - return arch_.num_cores(); - } else if (addr == CSR_MINSTRET) { - // NumInsts - return stats_insts_; - } else if (addr == CSR_MINSTRET_H) { - // NumInsts - return (Word)(stats_insts_ >> 32); - } else if (addr == CSR_MCYCLE) { - // NumCycles - return (Word)SimPlatform::instance().cycles(); - } else if (addr == CSR_MCYCLE_H) { - // NumCycles - return (Word)(SimPlatform::instance().cycles() >> 32); - } else { - if (addr >= CSR_TEX(0,0) - && addr < CSR_TEX(NUM_TEX_UNITS,0)) { - uint32_t unit = CSR_TEX_UNIT(addr); - uint32_t state = CSR_TEX_STATE(addr); - return tex_units_.at(unit).get_state(state); + exe_unit->Output.pop(); } - return csrs_.at(addr); 
} } -void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) { - if (addr == CSR_FFLAGS) { - fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F); - } else if (addr == CSR_FRM) { - fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5); - } else if (addr == CSR_FCSR) { - fcsrs_.at(wid) = value & 0xff; - } else { - if (addr >= CSR_TEX(0,0) - && addr < CSR_TEX(NUM_TEX_UNITS,0)) { - uint32_t unit = CSR_TEX_UNIT(addr); - uint32_t state = CSR_TEX_STATE(addr); - tex_units_.at(unit).set_state(state, value); - return; - } - csrs_.at(addr) = value; +WarpMask Core::wspawn(int num_warps, int nextPC) { + WarpMask ret(1); + int active_warps = std::min(num_warps, arch_.num_warps()); + DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC); + for (int i = 1; i < active_warps; ++i) { + auto warp = warps_.at(i); + warp->setPC(nextPC); + warp->setTmask(0, true); + ret.set(i); } + return std::move(ret); } -void Core::barrier(int bar_id, int count, int warp_id) { +WarpMask Core::barrier(int bar_id, int count, int warp_id) { + WarpMask ret(0); auto& barrier = barriers_.at(bar_id); barrier.set(warp_id); - if (barrier.count() < (size_t)count) - return; + if (barrier.count() < (size_t)count) { + warps_.at(warp_id)->suspend(); + DP(3, "*** Suspend warp #" << warp_id << " at barrier #" << bar_id); + return std::move(ret); + } for (int i = 0; i < arch_.num_warps(); ++i) { if (barrier.test(i)) { + DP(3, "*** Resume warp #" << i << " at barrier #" << bar_id); warps_.at(i)->activate(); + ret.set(i); } } barrier.reset(); + return std::move(ret); } Word Core::icache_read(Addr addr, Size size) { @@ -430,35 +379,21 @@ Word Core::icache_read(Addr addr, Size size) { } Word Core::dcache_read(Addr addr, Size size) { - Word data = 0; - if (SM_ENABLE) { - if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SMEM_BASE_ADDR)) { - shared_mem_.read(&data, addr & (SMEM_SIZE-1), size); - return data; - } - } + Word data; mmu_.read(&data, addr, size, 0); 
return data; } void Core::dcache_write(Addr addr, Word data, Size size) { - if (SM_ENABLE) { - if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SMEM_BASE_ADDR)) { - shared_mem_.write(&data, addr & (SMEM_SIZE-1), size); - return; - } - } if (addr >= IO_COUT_ADDR && addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { this->writeToStdOut(addr, data); - return; + } else { + mmu_.write(&data, addr, size, 0); } - mmu_.write(&data, addr, size, 0); } -Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector* mem_addrs) { +Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector* mem_addrs) { return tex_units_.at(unit).read(u, v, lod, mem_addrs); } @@ -473,6 +408,228 @@ void Core::writeToStdOut(Addr addr, Word data) { } } +Word Core::get_csr(Addr addr, int tid, int wid) { + switch (addr) { + case CSR_SATP: + case CSR_PMPCFG0: + case CSR_PMPADDR0: + case CSR_MSTATUS: + case CSR_MISA: + case CSR_MEDELEG: + case CSR_MIDELEG: + case CSR_MIE: + case CSR_MTVEC: + case CSR_MEPC: + return 0; + + case CSR_FFLAGS: + return fcsrs_.at(wid) & 0x1F; + case CSR_FRM: + return (fcsrs_.at(wid) >> 5); + case CSR_FCSR: + return fcsrs_.at(wid); + case CSR_WTID: + // Warp threadID + return tid; + case CSR_LTID: + // Core threadID + return tid + (wid * arch_.num_threads()); + case CSR_GTID: + // Processor threadID + return tid + (wid * arch_.num_threads()) + + (arch_.num_threads() * arch_.num_warps() * id_); + case CSR_LWID: + // Core warpID + return wid; + case CSR_GWID: + // Processor warpID + return wid + (arch_.num_warps() * id_); + case CSR_GCID: + // Processor coreID + return id_; + case CSR_TMASK: + // Processor coreID + return warps_.at(wid)->getTmask(); + case CSR_NT: + // Number of threads per warp + return arch_.num_threads(); + case CSR_NW: + // Number of warps per core + return arch_.num_warps(); + case CSR_NC: + // Number of cores + return arch_.num_cores(); + case CSR_MINSTRET: + // NumInsts + return perf_stats_.instrs & 0xffffffff; + case 
CSR_MINSTRET_H: + // NumInsts + return (Word)(perf_stats_.instrs >> 32); + case CSR_MCYCLE: + // NumCycles + return (Word)SimPlatform::instance().cycles(); + case CSR_MCYCLE_H: + // NumCycles + return (Word)(SimPlatform::instance().cycles() >> 32); + case CSR_MPM_IBUF_ST: + return perf_stats_.ibuf_stalls & 0xffffffff; + case CSR_MPM_IBUF_ST_H: + return perf_stats_.ibuf_stalls >> 32; + case CSR_MPM_SCRB_ST: + return perf_stats_.scrb_stalls & 0xffffffff; + case CSR_MPM_SCRB_ST_H: + return perf_stats_.scrb_stalls >> 32; + case CSR_MPM_ALU_ST: + return perf_stats_.alu_stalls & 0xffffffff; + case CSR_MPM_ALU_ST_H: + return perf_stats_.alu_stalls >> 32; + case CSR_MPM_LSU_ST: + return perf_stats_.lsu_stalls & 0xffffffff; + case CSR_MPM_LSU_ST_H: + return perf_stats_.lsu_stalls >> 32; + case CSR_MPM_CSR_ST: + return perf_stats_.csr_stalls & 0xffffffff; + case CSR_MPM_CSR_ST_H: + return perf_stats_.csr_stalls >> 32; + case CSR_MPM_FPU_ST: + return perf_stats_.fpu_stalls & 0xffffffff; + case CSR_MPM_FPU_ST_H: + return perf_stats_.fpu_stalls >> 32; + case CSR_MPM_GPU_ST: + return perf_stats_.gpu_stalls & 0xffffffff; + case CSR_MPM_GPU_ST_H: + return perf_stats_.gpu_stalls >> 32; + + case CSR_MPM_LOADS: + return perf_stats_.loads & 0xffffffff; + case CSR_MPM_LOADS_H: + return perf_stats_.loads >> 32; + case CSR_MPM_STORES: + return perf_stats_.stores & 0xffffffff; + case CSR_MPM_STORES_H: + return perf_stats_.stores >> 32; + case CSR_MPM_BRANCHES: + return perf_stats_.branches & 0xffffffff; + case CSR_MPM_BRANCHES_H: + return perf_stats_.branches >> 32; + + case CSR_MPM_ICACHE_READS: + return icache_->perf_stats().reads & 0xffffffff; + case CSR_MPM_ICACHE_READS_H: + return icache_->perf_stats().reads >> 32; + case CSR_MPM_ICACHE_MISS_R: + return icache_->perf_stats().read_misses & 0xffffffff; + case CSR_MPM_ICACHE_MISS_R_H: + return icache_->perf_stats().read_misses >> 32; + + case CSR_MPM_DCACHE_READS: + return dcache_->perf_stats().reads & 0xffffffff; + case 
CSR_MPM_DCACHE_READS_H: + return dcache_->perf_stats().reads >> 32; + case CSR_MPM_DCACHE_WRITES: + return dcache_->perf_stats().writes & 0xffffffff; + case CSR_MPM_DCACHE_WRITES_H: + return dcache_->perf_stats().writes >> 32; + case CSR_MPM_DCACHE_MISS_R: + return dcache_->perf_stats().read_misses & 0xffffffff; + case CSR_MPM_DCACHE_MISS_R_H: + return dcache_->perf_stats().read_misses >> 32; + case CSR_MPM_DCACHE_MISS_W: + return dcache_->perf_stats().write_misses & 0xffffffff; + case CSR_MPM_DCACHE_MISS_W_H: + return dcache_->perf_stats().write_misses >> 32; + case CSR_MPM_DCACHE_BANK_ST: + return dcache_->perf_stats().bank_stalls & 0xffffffff; + case CSR_MPM_DCACHE_BANK_ST_H: + return dcache_->perf_stats().bank_stalls >> 32; + case CSR_MPM_DCACHE_MSHR_ST: + return dcache_->perf_stats().mshr_stalls & 0xffffffff; + case CSR_MPM_DCACHE_MSHR_ST_H: + return dcache_->perf_stats().mshr_stalls >> 32; + + case CSR_MPM_SMEM_READS: + return shared_mem_->perf_stats().reads & 0xffffffff; + case CSR_MPM_SMEM_READS_H: + return shared_mem_->perf_stats().reads >> 32; + case CSR_MPM_SMEM_WRITES: + return shared_mem_->perf_stats().writes & 0xffffffff; + case CSR_MPM_SMEM_WRITES_H: + return shared_mem_->perf_stats().writes >> 32; + case CSR_MPM_SMEM_BANK_ST: + return shared_mem_->perf_stats().bank_stalls & 0xffffffff; + case CSR_MPM_SMEM_BANK_ST_H: + return shared_mem_->perf_stats().bank_stalls >> 32; + + case CSR_MPM_MEM_READS: + return perf_stats_.mem_reads & 0xffffffff; + case CSR_MPM_MEM_READS_H: + return perf_stats_.mem_reads >> 32; + case CSR_MPM_MEM_WRITES: + return perf_stats_.mem_writes & 0xffffffff; + case CSR_MPM_MEM_WRITES_H: + return perf_stats_.mem_writes >> 32; + case CSR_MPM_MEM_LAT: + return perf_stats_.mem_latency & 0xffffffff; + case CSR_MPM_MEM_LAT_H: + return perf_stats_.mem_latency >> 32; + +#ifdef EXT_TEX_ENABLE + case CSR_MPM_TEX_READS: + return perf_stats_.tex_reads & 0xffffffff; + case CSR_MPM_TEX_READS_H: + return perf_stats_.tex_reads >> 32; + case 
CSR_MPM_TEX_LAT: + return perf_stats_.tex_latency & 0xffffffff; + case CSR_MPM_TEX_LAT_H: + return perf_stats_.tex_latency >> 32; +#endif + default: + if ((addr >= CSR_MPM_BASE && addr < (CSR_MPM_BASE + 32)) + || (addr >= CSR_MPM_BASE_H && addr < (CSR_MPM_BASE_H + 32))) { + // user-defined MPM CSRs + } else + #ifdef EXT_TEX_ENABLE + if (addr == CSR_TEX_UNIT) { + return csr_tex_unit_; + } else + if (addr >= CSR_TEX_STATE_BEGIN + && addr < CSR_TEX_STATE_END) { + uint32_t state = CSR_TEX_STATE(addr); + return tex_units_.at(csr_tex_unit_).get_state(state); + } else + #endif + { + std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl; + std::abort(); + } + } + return 0; +} + +void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) { + if (addr == CSR_FFLAGS) { + fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F); + } else if (addr == CSR_FRM) { + fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5); + } else if (addr == CSR_FCSR) { + fcsrs_.at(wid) = value & 0xff; + } else +#ifdef EXT_TEX_ENABLE + if (addr == CSR_TEX_UNIT) { + csr_tex_unit_ = value; + } else + if (addr >= CSR_TEX_STATE_BEGIN + && addr < CSR_TEX_STATE_END) { + uint32_t state = CSR_TEX_STATE(addr); + tex_units_.at(csr_tex_unit_).set_state(state, value); + return; + } else +#endif + { + csrs_.at(addr) = value; + } +} + void Core::trigger_ecall() { ecall_ = true; } @@ -483,4 +640,9 @@ void Core::trigger_ebreak() { bool Core::check_exit() const { return ebreak_ || ecall_; +} + +bool Core::running() const { + bool is_running = (committed_instrs_ != issued_instrs_); + return is_running; } \ No newline at end of file diff --git a/sim/simX/core.h b/sim/simX/core.h index 5066d8af..e4a6034e 100644 --- a/sim/simX/core.h +++ b/sim/simX/core.h @@ -17,6 +17,7 @@ #include "warp.h" #include "pipeline.h" #include "cache.h" +#include "sharedmem.h" #include "ibuffer.h" #include "scoreboard.h" #include "exeunit.h" @@ -26,6 +27,47 @@ namespace vortex { class Core : public 
SimObject { public: + struct PerfStats { + uint64_t instrs; + uint64_t ibuf_stalls; + uint64_t scrb_stalls; + uint64_t alu_stalls; + uint64_t lsu_stalls; + uint64_t csr_stalls; + uint64_t fpu_stalls; + uint64_t gpu_stalls; + uint64_t loads; + uint64_t stores; + uint64_t branches; + uint64_t mem_reads; + uint64_t mem_writes; + uint64_t mem_latency; + uint64_t tex_reads; + uint64_t tex_latency; + + PerfStats() + : instrs(0) + , ibuf_stalls(0) + , scrb_stalls(0) + , alu_stalls(0) + , lsu_stalls(0) + , csr_stalls(0) + , fpu_stalls(0) + , gpu_stalls(0) + , loads(0) + , stores(0) + , branches(0) + , mem_reads(0) + , mem_writes(0) + , mem_latency(0) + , tex_reads(0) + , tex_latency(0) + {} + }; + + SimPort MemRspPort; + SimPort MemReqPort; + Core(const SimContext& ctx, const ArchDef &arch, Word id); ~Core(); @@ -51,8 +93,8 @@ public: return arch_; } - unsigned long stats_insts() const { - return stats_insts_; + const PerfStats& perf_stats() const { + return perf_stats_; } Word getIRegValue(int reg) const { @@ -63,7 +105,9 @@ public: void set_csr(Addr addr, Word value, int tid, int wid); - void barrier(int bar_id, int count, int warp_id); + WarpMask wspawn(int num_warps, int nextPC); + + WarpMask barrier(int bar_id, int count, int warp_id); Word icache_read(Addr, Size); @@ -71,7 +115,7 @@ public: void dcache_write(Addr, Word, Size); - Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector* mem_addrs); + Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector* mem_addrs); void trigger_ecall(); @@ -81,21 +125,18 @@ public: private: + void schedule(uint64_t cycle); void fetch(uint64_t cycle); void decode(uint64_t cycle); - void issue(uint64_t cycle); void execute(uint64_t cycle); void commit(uint64_t cycle); - - void warp_scheduler(uint64_t cycle); - + void writeToStdOut(Addr addr, Word data); Word id_; const ArchDef arch_; const Decoder decoder_; MemoryUnit mmu_; - RAM shared_mem_; std::vector tex_units_; std::vector> warps_; @@ -107,33 +148,33 @@ 
private: std::vector exe_units_; Cache::Ptr icache_; Cache::Ptr dcache_; + SharedMem::Ptr shared_mem_; Switch::Ptr l1_mem_switch_; std::vector::Ptr> dcache_switch_; - PipelineStage fetch_stage_; - PipelineStage decode_stage_; - PipelineStage issue_stage_; - PipelineStage execute_stage_; - PipelineStage commit_stage_; + PipelineLatch fetch_latch_; + PipelineLatch decode_latch_; HashTable pending_icache_; - WarpMask stalled_warps_; + WarpMask active_warps_; + WarpMask stalled_warps_; uint32_t last_schedule_wid_; - uint32_t issued_instrs_; - uint32_t committed_instrs_; + uint64_t issued_instrs_; + uint64_t committed_instrs_; + uint32_t csr_tex_unit_; bool ecall_; bool ebreak_; std::unordered_map print_bufs_; - uint64_t stats_insts_; + PerfStats perf_stats_; + uint64_t perf_mem_pending_reads_; friend class LsuUnit; + friend class AluUnit; + friend class CsrUnit; + friend class FpuUnit; friend class GpuUnit; - -public: - SlavePort MemRspPort; - MasterPort MemReqPort; }; } // namespace vortex \ No newline at end of file diff --git a/sim/simX/decode.cpp b/sim/simX/decode.cpp index a2957c64..f890d2f9 100644 --- a/sim/simX/decode.cpp +++ b/sim/simX/decode.cpp @@ -359,14 +359,28 @@ std::shared_ptr Decoder::decode(Word code) const { instr->setDestReg(rd); } instr->setFunc3(func3); - instr->setFunc7(func7); - if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) { - instr->setImm(sext32(rs2, 5)); - } else { + instr->setFunc7(func7); + switch (op) { + case Opcode::SYS_INST: + case Opcode::FENCE: + // uint12 + instr->setImm(code >> shift_rs2_); + break; + case Opcode::I_INST: + if (func3 == 0x1 || func3 == 0x5) { + // int5 + instr->setImm(sext32(rs2, 5)); + } else { + // int12 + instr->setImm(sext32(code >> shift_rs2_, 12)); + } + break; + default: + // int12 instr->setImm(sext32(code >> shift_rs2_, 12)); + break; } } break; - case InstType::S_TYPE: { instr->setSrcReg(rs1); if (op == Opcode::FS) { @@ -375,8 +389,8 @@ std::shared_ptr Decoder::decode(Word code) const { 
instr->setSrcReg(rs2); } instr->setFunc3(func3); - Word imeed = (func7 << reg_s_) | rd; - instr->setImm(sext32(imeed, 12)); + Word imm = (func7 << reg_s_) | rd; + instr->setImm(sext32(imm, 12)); } break; case InstType::B_TYPE: { @@ -387,8 +401,8 @@ std::shared_ptr Decoder::decode(Word code) const { Word bits_4_1 = rd >> 1; Word bit_10_5 = func7 & 0x3f; Word bit_12 = func7 >> 6; - Word imeed = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12); - instr->setImm(sext32(imeed, 13)); + Word imm = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12); + instr->setImm(sext32(imm, 13)); } break; case InstType::U_TYPE: @@ -403,11 +417,11 @@ std::shared_ptr Decoder::decode(Word code) const { Word bit_11 = (unordered >> 8) & 0x1; Word bits_10_1 = (unordered >> 9) & 0x3ff; Word bit_20 = (unordered >> 19) & 0x1; - Word imeed = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20); + Word imm = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20); if (bit_20) { - imeed |= ~j_imm_mask_; + imm |= ~j_imm_mask_; } - instr->setImm(imeed); + instr->setImm(imm); } break; case InstType::V_TYPE: diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp index d55ba2f9..be172830 100644 --- a/sim/simX/execute.cpp +++ b/sim/simX/execute.cpp @@ -428,7 +428,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; Word data_read = core_->dcache_read(memAddr, 4); - trace->mem_addrs.at(t).push_back(memAddr); + trace->mem_addrs.at(t).push_back({memAddr, 4}); DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); switch (func3) { case 0: @@ -491,7 +491,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { if (!tmask_.test(t)) continue; Word memAddr = rsdata[t][0] + immsrc; - trace->mem_addrs.at(t).push_back(memAddr); + 
trace->mem_addrs.at(t).push_back({memAddr, (1u << func3)}); DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); switch (func3) { case 0: @@ -528,14 +528,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case SYS_INST: - trace->exe_type = ExeType::CSR; for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; - Word csr_addr = immsrc & 0x00000FFF; - Word csr_value = core_->get_csr(csr_addr, t, id_); - switch (func3) { - case 0: + Word csr_addr = immsrc; + Word csr_value; + if (func3 == 0) { + trace->exe_type = ExeType::ALU; + trace->fetch_stall = true; switch (csr_addr) { case 0: // ECALL core_->trigger_ecall(); @@ -549,56 +549,59 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { break; default: std::abort(); - } - break; - case 1: - // CSRRW - rddata[t] = csr_value; - core_->set_csr(csr_addr, rsdata[t][0], t, id_); - trace->used_iregs.set(rsrc0); - rd_write = true; - break; - case 2: - // CSRRS - rddata[t] = csr_value; - core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_); - trace->used_iregs.set(rsrc0); - rd_write = true; - break; - case 3: - // CSRRC - rddata[t] = csr_value; - core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_); - trace->used_iregs.set(rsrc0); - rd_write = true; - break; - case 5: - // CSRRWI - rddata[t] = csr_value; - core_->set_csr(csr_addr, rsrc0, t, id_); - rd_write = true; - break; - case 6: - // CSRRSI - rddata[t] = csr_value; - core_->set_csr(csr_addr, csr_value | rsrc0, t, id_); - rd_write = true; - break; - case 7: - // CSRRCI - rddata[t] = csr_value; - core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_); - rd_write = true; - break; - default: - break; + } + } else { + trace->exe_type = ExeType::CSR; + csr_value = core_->get_csr(csr_addr, t, id_); + switch (func3) { + case 1: + // CSRRW + rddata[t] = csr_value; + core_->set_csr(csr_addr, rsdata[t][0], t, id_); + trace->used_iregs.set(rsrc0); + rd_write = true; + break; + case 2: + // CSRRS + rddata[t] = 
csr_value; + core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_); + trace->used_iregs.set(rsrc0); + rd_write = true; + break; + case 3: + // CSRRC + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_); + trace->used_iregs.set(rsrc0); + rd_write = true; + break; + case 5: + // CSRRWI + rddata[t] = csr_value; + core_->set_csr(csr_addr, rsrc0, t, id_); + rd_write = true; + break; + case 6: + // CSRRSI; + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value | rsrc0, t, id_); + rd_write = true; + break; + case 7: + // CSRRCI + rddata[t] = csr_value; + core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_); + rd_write = true; + break; + default: + break; + } } } break; case FENCE: trace->exe_type = ExeType::LSU; trace->lsu.type = LsuType::FENCE; - trace->fetch_stall = true; break; case FCI: trace->exe_type = ExeType::FPU; @@ -797,6 +800,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DPN(3, std::endl); active_ = tmask_.any(); + trace->gpu.active_warps.reset(); + trace->gpu.active_warps.set(id_, active_); } break; case 1: { // WSPAWN @@ -805,13 +810,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { trace->used_iregs.set(rsrc0); trace->used_iregs.set(rsrc1); trace->fetch_stall = true; - int active_warps = std::min(rsdata.at(ts)[0], core_->arch().num_warps()); - DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]); - for (int i = 1; i < active_warps; ++i) { - Warp &newWarp = core_->warp(i); - newWarp.setPC(rsdata[ts][1]); - newWarp.setTmask(0, true); - } + trace->gpu.active_warps = core_->wspawn(rsdata.at(ts)[0], rsdata.at(ts)[1]); } break; case 2: { // SPLIT @@ -877,9 +876,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { trace->gpu.type = GpuType::BAR; trace->used_iregs.set(rsrc0); trace->used_iregs.set(rsrc1); - trace->fetch_stall = true; - active_ = false; - core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); + 
trace->fetch_stall = true; + trace->gpu.active_warps = core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); } break; case 5: { // PREFETCH diff --git a/sim/simX/exeunit.cpp b/sim/simX/exeunit.cpp index 1d0a3cfc..1736101c 100644 --- a/sim/simX/exeunit.cpp +++ b/sim/simX/exeunit.cpp @@ -10,64 +10,78 @@ using namespace vortex; -NopUnit::NopUnit(Core*) : ExeUnit("NOP") {} +NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {} void NopUnit::step(uint64_t /*cycle*/) { - if (inputs_.empty()) + if (Input.empty()) return; - auto trace = inputs_.top(); - this->schedule_output(trace, 1); - inputs_.pop(); + auto trace = Input.front(); + Output.send(trace, 1); + Input.pop(); } /////////////////////////////////////////////////////////////////////////////// -LsuUnit::LsuUnit(Core* core) - : ExeUnit("LSU") - , core_(core) +LsuUnit::LsuUnit(const SimContext& ctx, Core* core) + : ExeUnit(ctx, core, "LSU") , num_threads_(core->arch().num_threads()) , pending_dcache_(LSUQ_SIZE) , fence_lock_(false) {} void LsuUnit::step(uint64_t cycle) { - __unused (cycle); - // handle dcache response for (uint32_t t = 0; t < num_threads_; ++t) { auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0); if (dcache_rsp_port.empty()) continue; - auto& mem_rsp = dcache_rsp_port.top(); + auto& mem_rsp = dcache_rsp_port.front(); auto& entry = pending_dcache_.at(mem_rsp.tag); auto trace = entry.first; DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); assert(entry.second); --entry.second; // track remaining blocks - if (0 == entry.second) { - auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency); - trace->dcache_latency = latency; - this->schedule_output(trace, 1); + if (0 == entry.second) { + Output.send(trace, 1); pending_dcache_.release(mem_rsp.tag); } dcache_rsp_port.pop(); } + // handle shared memory response + for (uint32_t t = 0; t < num_threads_; ++t) { + auto& smem_rsp_port = 
core_->shared_mem_->Outputs.at(t); + if (smem_rsp_port.empty()) + continue; + auto& mem_rsp = smem_rsp_port.front(); + auto& entry = pending_dcache_.at(mem_rsp.tag); + auto trace = entry.first; + DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type + << ", tid=" << t << ", " << *trace); + assert(entry.second); + --entry.second; // track remaining blocks + if (0 == entry.second) { + Output.send(trace, 1); + pending_dcache_.release(mem_rsp.tag); + } + smem_rsp_port.pop(); + } + if (fence_lock_) { // wait for all pending memory operations to complete if (!pending_dcache_.empty()) return; - this->schedule_output(fence_state_, 1); + Output.send(fence_state_, 1); fence_lock_ = false; DT(3, cycle, "fence-unlock: " << fence_state_); } // check input queue - if (inputs_.empty()) + if (Input.empty()) return; - auto trace = inputs_.top(); + auto trace = Input.front(); if (trace->lsu.type == LsuType::FENCE) { // schedule fence lock @@ -75,179 +89,188 @@ void LsuUnit::step(uint64_t cycle) { fence_lock_ = true; DT(3, cycle, "fence-lock: " << *trace); // remove input - inputs_.pop(); + auto time = Input.pop(); + core_->perf_stats_.lsu_stalls += (cycle - time); return; } - // check pending queue capacity - if (!trace->check_stalled(pending_dcache_.full())) { - DT(3, cycle, "*** lsu-queue-stall: " << *trace); - } - if (pending_dcache_.full()) + // check pending queue capacity + if (pending_dcache_.full()) { + if (!trace->suspend()) { + DT(3, cycle, "*** lsu-queue-stall: " << *trace); + } return; - - // send memory request - - bool has_shared_memory = false; - bool mem_rsp_pending = false; + } else { + trace->resume(); + } + bool is_write = (trace->lsu.type == LsuType::STORE); - uint32_t valid_addrs = 0; - for (auto& mem_addr : trace->mem_addrs) { - valid_addrs += mem_addr.size(); - } + // duplicates detection + bool is_dup = false; + if (trace->tmask.test(0)) { + uint64_t addr_mask = sizeof(Word)-1; + Word addr0 = trace->mem_addrs.at(0).at(0).addr & 
~addr_mask; + uint32_t matches = 1; + for (uint32_t t = 1; t < num_threads_; ++t) { + if (!trace->tmask.test(t)) + continue; + auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask; + matches += (addr0 == mem_addr); + } + is_dup = (matches == trace->tmask.count()); + } + + uint32_t valid_addrs = 0; + if (is_dup) { + valid_addrs = 1; + } else { + for (auto& mem_addr : trace->mem_addrs) { + valid_addrs += mem_addr.size(); + } + } - trace->dcache_latency = SimPlatform::instance().cycles(); auto tag = pending_dcache_.allocate({trace, valid_addrs}); for (uint32_t t = 0; t < num_threads_; ++t) { if (!trace->tmask.test(t)) continue; + + auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0); + auto mem_addr = trace->mem_addrs.at(t).at(0); + auto type = get_addr_type(mem_addr.addr, mem_addr.size); - auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0); - for (auto mem_addr : trace->mem_addrs.at(t)) { - // check shared memory address - if (SM_ENABLE) { - if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE)) - && (mem_addr < SMEM_BASE_ADDR)) { - DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag - << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); - has_shared_memory = true; - continue; - } - } - - bool is_io = (mem_addr >= IO_BASE_ADDR); - - MemReq mem_req; - mem_req.addr = mem_addr; - mem_req.write = is_write; - mem_req.tag = tag; - mem_req.is_io = is_io; - dcache_req_port.send(mem_req, 1); - DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag - << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace); - // do not wait on writes - mem_rsp_pending = !is_write; - } + MemReq mem_req; + mem_req.addr = mem_addr.addr; + mem_req.write = is_write; + mem_req.tag = tag; + mem_req.is_io = (type == AddrType::IO); + + if (type == AddrType::Shared) { + core_->shared_mem_->Inputs.at(t).send(mem_req, 2); + DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", 
tag=" << tag + << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); + } else { + dcache_req_port.send(mem_req, 2); + DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << mem_req.is_io << ", " << *trace); + } + + if (is_dup) + break; } - // do not wait - if (!mem_rsp_pending) { + // do not wait on writes + if (is_write) { pending_dcache_.release(tag); - uint32_t delay = 1; - if (has_shared_memory) { - // all threads accessed shared memory - delay += Constants::SMEM_DELAY; - } - this->schedule_output(trace, delay); + Output.send(trace, 1); } // remove input - inputs_.pop(); + auto time = Input.pop(); + core_->perf_stats_.lsu_stalls += (cycle - time); } /////////////////////////////////////////////////////////////////////////////// -AluUnit::AluUnit(Core*) : ExeUnit("ALU") {} +AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {} -void AluUnit::step(uint64_t /*cycle*/) { - if (inputs_.empty()) +void AluUnit::step(uint64_t cycle) { + if (Input.empty()) return; - auto trace = inputs_.top(); + auto trace = Input.front(); switch (trace->alu.type) { case AluType::ARITH: case AluType::BRANCH: case AluType::CMOV: - this->schedule_output(trace, 1); - inputs_.pop(); + Output.send(trace, 1); break; case AluType::IMUL: - this->schedule_output(trace, LATENCY_IMUL); - inputs_.pop(); + Output.send(trace, LATENCY_IMUL+1); break; case AluType::IDIV: - this->schedule_output(trace, XLEN); - inputs_.pop(); + Output.send(trace, XLEN+1); break; default: std::abort(); } + DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace); + if (trace->fetch_stall) { + core_->stalled_warps_.reset(trace->wid); + } + auto time = Input.pop(); + core_->perf_stats_.alu_stalls += (cycle - time); } /////////////////////////////////////////////////////////////////////////////// -CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {} +CsrUnit::CsrUnit(const 
SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {} -void CsrUnit::step(uint64_t /*cycle*/) { - if (inputs_.empty()) +void CsrUnit::step(uint64_t cycle) { + if (Input.empty()) return; - auto trace = inputs_.top(); - this->schedule_output(trace, 1); - inputs_.pop(); + auto trace = Input.front(); + Output.send(trace, 1); + auto time = Input.pop(); + core_->perf_stats_.csr_stalls += (cycle - time); + DT(3, cycle, "pipeline-execute: op=CSR, " << *trace); } /////////////////////////////////////////////////////////////////////////////// -FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {} +FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {} -void FpuUnit::step(uint64_t /*cycle*/) { - if (inputs_.empty()) +void FpuUnit::step(uint64_t cycle) { + if (Input.empty()) return; - auto trace = inputs_.top(); + auto trace = Input.front(); switch (trace->fpu.type) { case FpuType::FNCP: - this->schedule_output(trace, 1); - inputs_.pop(); + Output.send(trace, 2); break; case FpuType::FMA: - this->schedule_output(trace, LATENCY_FMA); - inputs_.pop(); + Output.send(trace, LATENCY_FMA+1); break; case FpuType::FDIV: - this->schedule_output(trace, LATENCY_FDIV); - inputs_.pop(); + Output.send(trace, LATENCY_FDIV+1); break; case FpuType::FSQRT: - this->schedule_output(trace, LATENCY_FSQRT); - inputs_.pop(); + Output.send(trace, LATENCY_FSQRT+1); break; case FpuType::FCVT: - this->schedule_output(trace, LATENCY_FCVT); - inputs_.pop(); + Output.send(trace, LATENCY_FCVT+1); break; default: std::abort(); - } + } + DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace); + auto time = Input.pop(); + core_->perf_stats_.fpu_stalls += (cycle - time); } /////////////////////////////////////////////////////////////////////////////// -GpuUnit::GpuUnit(Core* core) - : ExeUnit("GPU") - , core_(core) +GpuUnit::GpuUnit(const SimContext& ctx, Core* core) + : ExeUnit(ctx, core, "GPU") , num_threads_(core->arch().num_threads()) , pending_tex_reqs_(TEXQ_SIZE) 
{} void GpuUnit::step(uint64_t cycle) { - __unused (cycle); #ifdef EXT_TEX_ENABLE // handle memory response for (uint32_t t = 0; t < num_threads_; ++t) { auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1); if (dcache_rsp_port.empty()) continue; - auto& mem_rsp = dcache_rsp_port.top(); + auto& mem_rsp = dcache_rsp_port.front(); auto& entry = pending_tex_reqs_.at(mem_rsp.tag); auto trace = entry.first; DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace); assert(entry.second); --entry.second; // track remaining blocks - if (0 == entry.second) { - auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency); - trace->dcache_latency = latency; - this->schedule_output(trace, 1); + if (0 == entry.second) { + Output.send(trace, 1); pending_tex_reqs_.release(mem_rsp.tag); } dcache_rsp_port.pop(); @@ -255,38 +278,67 @@ void GpuUnit::step(uint64_t cycle) { #endif // check input queue - if (inputs_.empty()) + if (Input.empty()) return; - auto trace = inputs_.top(); + auto trace = Input.front(); + + bool issued = false; switch (trace->gpu.type) { case GpuType::TMC: + Output.send(trace, 1); + core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid)); + issued = true; + break; case GpuType::WSPAWN: + Output.send(trace, 1); + core_->active_warps_ = trace->gpu.active_warps; + issued = true; + break; case GpuType::SPLIT: case GpuType::JOIN: - case GpuType::BAR: - this->schedule_output(trace, 1); - inputs_.pop(); + Output.send(trace, 1); + issued = true; break; - case GpuType::TEX: { + case GpuType::BAR: + Output.send(trace, 1); + if (trace->gpu.active_warps != 0) + core_->active_warps_ |= trace->gpu.active_warps; + else + core_->active_warps_.reset(trace->wid); + issued = true; + break; + case GpuType::TEX: if (this->processTexRequest(cycle, trace)) - inputs_.pop(); - } break; + issued = true; + break; default: std::abort(); } + + if (issued) { + DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", 
" << *trace); + if (trace->fetch_stall) { + core_->stalled_warps_.reset(trace->wid); + } + auto time = Input.pop(); + core_->perf_stats_.fpu_stalls += (cycle - time); + } } bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { __unused (cycle); - // check pending queue capacity - if (!trace->check_stalled(pending_tex_reqs_.full())) { - DT(3, cycle, "*** tex-queue-stall: " << *trace); - } - if (pending_tex_reqs_.full()) + // check pending queue capacity + if (pending_tex_reqs_.full()) { + if (!trace->suspend()) { + DT(3, cycle, "*** tex-queue-stall: " << *trace); + } return false; + } else { + trace->resume(); + } // send memory request @@ -295,7 +347,6 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { valid_addrs += mem_addr.size(); } - trace->tex_latency = SimPlatform::instance().cycles(); auto tag = pending_tex_reqs_.allocate({trace, valid_addrs}); for (uint32_t t = 0; t < num_threads_; ++t) { @@ -305,12 +356,14 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1); for (auto mem_addr : trace->mem_addrs.at(t)) { MemReq mem_req; - mem_req.addr = mem_addr; + mem_req.addr = mem_addr.addr; mem_req.write = (trace->lsu.type == LsuType::STORE); mem_req.tag = tag; - dcache_req_port.send(mem_req, 1); - DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag + dcache_req_port.send(mem_req, 3); + DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag << ", tid=" << t << ", "<< trace); + ++ core_->perf_stats_.tex_reads; + ++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size(); } } diff --git a/sim/simX/exeunit.h b/sim/simX/exeunit.h index 83e69463..bea714ea 100644 --- a/sim/simX/exeunit.h +++ b/sim/simX/exeunit.h @@ -8,56 +8,29 @@ namespace vortex { class Core; -class ExeUnit { -protected: - const char* name_; - Queue inputs_; - Queue outputs_; +class ExeUnit : public SimObject { +public: + 
SimPort Input; + SimPort Output; - void schedule_output(pipeline_trace_t* trace, uint32_t delay) { - if (delay > 1) { - SimPlatform::instance().schedule( - [&](pipeline_trace_t* req) { - outputs_.push(req); - }, - trace, - (delay - 1) - ); - } else { - outputs_.push(trace); - } - } - -public: - typedef std::shared_ptr Ptr; - - ExeUnit(const char* name) : name_(name) {} + ExeUnit(const SimContext& ctx, Core* core, const char* name) + : SimObject(ctx, name) + , Input(this) + , Output(this) + , core_(core) + {} + virtual ~ExeUnit() {} - void push(pipeline_trace_t* trace) { - inputs_.push(trace); - } - - bool empty() const { - return outputs_.empty(); - } - - pipeline_trace_t* top() const { - return outputs_.top(); - } - - void pop() { - outputs_.pop(); - } - - virtual void step(uint64_t cycle) = 0; +protected: + Core* core_; }; /////////////////////////////////////////////////////////////////////////////// class NopUnit : public ExeUnit { public: - NopUnit(Core*); + NopUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; @@ -65,15 +38,14 @@ public: /////////////////////////////////////////////////////////////////////////////// class LsuUnit : public ExeUnit { -private: - Core* core_; +private: uint32_t num_threads_; HashTable> pending_dcache_; pipeline_trace_t* fence_state_; bool fence_lock_; public: - LsuUnit(Core*); + LsuUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; @@ -82,7 +54,7 @@ public: class AluUnit : public ExeUnit { public: - AluUnit(Core*); + AluUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; @@ -91,7 +63,7 @@ public: class CsrUnit : public ExeUnit { public: - CsrUnit(Core*); + CsrUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; @@ -100,7 +72,7 @@ public: class FpuUnit : public ExeUnit { public: - FpuUnit(Core*); + FpuUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; @@ -109,14 +81,13 @@ public: class GpuUnit : public ExeUnit { private: - Core* core_; uint32_t num_threads_; 
HashTable> pending_tex_reqs_; bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace); public: - GpuUnit(Core*); + GpuUnit(const SimContext& ctx, Core*); void step(uint64_t cycle); }; diff --git a/sim/simX/memsim.cpp b/sim/simX/memsim.cpp index 6559000d..012082d9 100644 --- a/sim/simX/memsim.cpp +++ b/sim/simX/memsim.cpp @@ -10,6 +10,7 @@ private: MemSim* simobject_; uint32_t num_banks_; uint32_t latency_; + PerfStats perf_stats_; public: Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency) @@ -18,16 +19,23 @@ public: , latency_(latency) {} + const PerfStats& perf_stats() const { + return perf_stats_; + } + void step(uint64_t /*cycle*/) { for (uint32_t i = 0, n = num_banks_; i < n; ++i) { auto& mem_req_port = simobject_->MemReqPorts.at(i); if (mem_req_port.empty()) continue; - auto& mem_req = mem_req_port.top(); + auto& mem_req = mem_req_port.front(); if (!mem_req.write) { MemRsp mem_rsp; mem_rsp.tag = mem_req.tag; simobject_->MemRspPorts.at(i).send(mem_rsp, latency_); + ++perf_stats_.reads; + } else { + ++perf_stats_.writes; } mem_req_port.pop(); } @@ -40,9 +48,9 @@ MemSim::MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency) : SimObject(ctx, "MemSim") - , impl_(new Impl(this, num_banks, latency)) , MemReqPorts(num_banks, this) , MemRspPorts(num_banks, this) + , impl_(new Impl(this, num_banks, latency)) {} MemSim::~MemSim() { diff --git a/sim/simX/memsim.h b/sim/simX/memsim.h index 3d5b33fe..c48361bc 100644 --- a/sim/simX/memsim.h +++ b/sim/simX/memsim.h @@ -1,47 +1,36 @@ #pragma once #include +#include "types.h" #include -#include namespace vortex { -struct MemReq { - uint64_t addr; - uint32_t tag; - bool write; - bool is_io; - - MemReq(uint64_t _addr = 0, - uint64_t _tag = 0, - bool _write = false, - bool _is_io = false - ) : addr(_addr) - , tag(_tag) - , write(_write) - , is_io(_is_io) - {} -}; - -struct MemRsp { - uint64_t tag; - MemRsp(uint64_t _tag = 0) : tag (_tag) {} -}; - class MemSim : public SimObject{ -private: - class 
Impl; - Impl* impl_; - public: + struct PerfStats { + uint64_t reads; + uint64_t writes; - MemSim(const SimContext& ctx, uint32_t num_inputs, uint32_t latency); + PerfStats() + : reads(0) + , writes(0) + {} + }; + + std::vector> MemReqPorts; + std::vector> MemRspPorts; + + MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency); ~MemSim(); void step(uint64_t cycle); - std::vector> MemReqPorts; - std::vector> MemRspPorts; + const PerfStats& perf_stats() const; + +private: + class Impl; + Impl* impl_; }; }; \ No newline at end of file diff --git a/sim/simX/pipeline.h b/sim/simX/pipeline.h index a5bf6d52..9ac09352 100644 --- a/sim/simX/pipeline.h +++ b/sim/simX/pipeline.h @@ -12,7 +12,7 @@ namespace vortex { struct pipeline_trace_t { //-- - uint64_t id; + uint64_t uuid; //-- int cid; @@ -22,7 +22,6 @@ struct pipeline_trace_t { //-- bool fetch_stall; - bool pipeline_stall; //-- bool wb; @@ -38,7 +37,7 @@ struct pipeline_trace_t { ExeType exe_type; //-- - std::vector> mem_addrs; + std::vector> mem_addrs; //-- union { @@ -53,22 +52,19 @@ struct pipeline_trace_t { } fpu; struct { GpuType type; + WarpMask active_warps; } gpu; }; - // stats - uint64_t icache_latency; - uint64_t dcache_latency; - uint64_t tex_latency; + bool stalled; - pipeline_trace_t(uint64_t id_, const ArchDef& arch) { - id = id_; + pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) { + uuid = uuid_; cid = 0; wid = 0; tmask.reset(); - PC = 0; + PC = 0; fetch_stall = false; - pipeline_stall = false; wb = false; rdest = 0; rdest_type = RegType::None; @@ -76,16 +72,18 @@ struct pipeline_trace_t { used_fregs.reset(); used_vregs.reset(); exe_type = ExeType::NOP; - mem_addrs.resize(arch.num_threads()); - icache_latency = 0; - dcache_latency = 0; - tex_latency = 0; + mem_addrs.resize(arch.num_threads()); + stalled = false; } - bool check_stalled(bool stall) { - bool old = pipeline_stall; - pipeline_stall = stall; - return stall ? 
old : true; + bool suspend() { + bool old = stalled; + stalled = true; + return old; + } + + void resume() { + stalled = false; } }; @@ -96,16 +94,16 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) os << ", rd=" << state.rdest_type << std::dec << state.rdest; } os << ", ex=" << state.exe_type; - os << " (#" << std::dec << state.id << ")"; + os << " (#" << std::dec << state.uuid << ")"; return os; } -class PipelineStage : public Queue { +class PipelineLatch : public Queue { protected: const char* name_; public: - PipelineStage(const char* name = nullptr) + PipelineLatch(const char* name = nullptr) : name_(name) {} }; diff --git a/sim/simX/processor.cpp b/sim/simX/processor.cpp index 7b54b505..ca9d46a6 100644 --- a/sim/simX/processor.cpp +++ b/sim/simX/processor.cpp @@ -18,13 +18,13 @@ Processor::Processor(const ArchDef& arch) // connect memory sub-systen memsim_ = MemSim::Create(1, MEM_LATENCY); - std::vector*> mem_req_ports(1); - std::vector*> mem_rsp_ports(1); + std::vector*> mem_req_ports(1); + std::vector*> mem_rsp_ports(1); mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); if (L3_ENABLE) { - l3cache_ = Cache::Create("l3cache", CacheConfig{ + l3cache_ = Cache::Create("l3cache", Cache::Config{ log2ceil(L3_CACHE_SIZE), // C log2ceil(MEM_BLOCK_SIZE), // B 2, // W @@ -66,7 +66,7 @@ Processor::Processor(const ArchDef& arch) for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { if (L2_ENABLE) { auto& l2cache = l2caches_.at(i); - l2cache = Cache::Create("l2cache", CacheConfig{ + l2cache = Cache::Create("l2cache", Cache::Config{ log2ceil(L2_CACHE_SIZE), // C log2ceil(MEM_BLOCK_SIZE), // B 2, // W diff --git a/sim/simX/scoreboard.h b/sim/simX/scoreboard.h index 95ba0700..b36d60b3 100644 --- a/sim/simX/scoreboard.h +++ b/sim/simX/scoreboard.h @@ -96,7 +96,7 @@ public: } uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; assert(owners_.count(tag) == 0); - 
owners_[tag] = state->id; + owners_[tag] = state->uuid; } void release(pipeline_trace_t* state) { diff --git a/sim/simX/sharedmem.h b/sim/simX/sharedmem.h new file mode 100644 index 00000000..d984422d --- /dev/null +++ b/sim/simX/sharedmem.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include +#include "types.h" + +namespace vortex { + +class Core; + +class SharedMem : public SimObject { +public: + struct Config { + uint32_t num_reqs; + uint32_t num_banks; + uint32_t bank_offset; + uint32_t latency; + bool write_reponse; + }; + + struct PerfStats { + uint64_t reads; + uint64_t writes; + uint64_t bank_stalls; + + PerfStats() + : reads(0) + , writes(0) + , bank_stalls(0) + {} + }; + + std::vector> Inputs; + std::vector> Outputs; + + SharedMem(const SimContext& ctx, const char* name, const Config& config) + : SimObject(ctx, name) + , Inputs(config.num_reqs, this) + , Outputs(config.num_reqs, this) + , config_(config) + , bank_sel_addr_start_(config.bank_offset) + , bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1) + {} + + virtual ~SharedMem() {} + + void step(uint64_t /*cycle*/) { + std::vector in_used_banks(config_.num_banks); + for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) { + auto& core_req_port = this->Inputs.at(req_id); + if (core_req_port.empty()) + continue; + + auto& core_req = core_req_port.front(); + + uint32_t bank_id = (uint32_t)bit_getw( + core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_); + + // bank conflict check + if (in_used_banks.at(bank_id)) + continue; + + in_used_banks.at(bank_id) = true; + + if (!core_req.write || config_.write_reponse) { + // send response + MemRsp core_rsp; + core_rsp.tag = core_req.tag; + this->Outputs.at(req_id).send(core_rsp, 1); + } + + // update perf counters + perf_stats_.reads += !core_req.write; + perf_stats_.writes += core_req.write; + + // remove input + core_req_port.pop(); + } + } + + const PerfStats& perf_stats() const { + return perf_stats_; + } + +protected: 
+ Config config_; + uint32_t bank_sel_addr_start_; + uint32_t bank_sel_addr_end_; + PerfStats perf_stats_; +}; + +} \ No newline at end of file diff --git a/sim/simX/tex_unit.cpp b/sim/simX/tex_unit.cpp index bfbcef1a..8dedef38 100644 --- a/sim/simX/tex_unit.cpp +++ b/sim/simX/tex_unit.cpp @@ -27,7 +27,7 @@ void TexUnit::set_state(uint32_t state, uint32_t value) { uint32_t TexUnit::read(int32_t u, int32_t v, int32_t lod, - std::vector* mem_addrs) { + std::vector* mem_addrs) { //-- auto xu = Fixed::make(u); auto xv = Fixed::make(v); @@ -60,10 +60,10 @@ uint32_t TexUnit::read(int32_t u, uint32_t texel10 = core_->dcache_read(addr10, stride); uint32_t texel11 = core_->dcache_read(addr11, stride); - mem_addrs->push_back(addr00); - mem_addrs->push_back(addr01); - mem_addrs->push_back(addr10); - mem_addrs->push_back(addr11); + mem_addrs->push_back({addr00, stride}); + mem_addrs->push_back({addr01, stride}); + mem_addrs->push_back({addr10, stride}); + mem_addrs->push_back({addr11, stride}); // filtering auto color = TexFilterLinear( @@ -79,7 +79,7 @@ uint32_t TexUnit::read(int32_t u, // memory lookup uint32_t texel = core_->dcache_read(addr, stride); - mem_addrs->push_back(addr); + mem_addrs->push_back({addr, stride}); // filtering auto color = TexFilterPoint(format, texel); diff --git a/sim/simX/tex_unit.h b/sim/simX/tex_unit.h index 759dda2a..b41cd8c7 100644 --- a/sim/simX/tex_unit.h +++ b/sim/simX/tex_unit.h @@ -15,7 +15,7 @@ public: void set_state(uint32_t state, uint32_t value); - uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector* mem_addrs); + uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector* mem_addrs); private: diff --git a/sim/simX/types.h b/sim/simX/types.h index d4feb1cb..7675ab82 100644 --- a/sim/simX/types.h +++ b/sim/simX/types.h @@ -21,6 +21,8 @@ typedef std::bitset<32> RegMask; typedef std::bitset<32> ThreadMask; typedef std::bitset<32> WarpMask; +/////////////////////////////////////////////////////////////////////////////// + enum 
class RegType { None, Integer, @@ -38,6 +40,8 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + enum class ExeType { NOP, ALU, @@ -61,6 +65,8 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + enum class AluType { ARITH, BRANCH, @@ -80,6 +86,8 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + enum class LsuType { LOAD, STORE, @@ -97,6 +105,47 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + +enum class AddrType { + Global, + Shared, + IO, +}; + +inline std::ostream &operator<<(std::ostream &os, const AddrType& type) { + switch (type) { + case AddrType::Global: os << "Global"; break; + case AddrType::Shared: os << "Shared"; break; + case AddrType::IO: os << "IO"; break; + } + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + +struct mem_addr_size_t { + uint64_t addr; + uint32_t size; +}; + +inline AddrType get_addr_type(Word addr, uint32_t size) { + __unused (size); + if (SM_ENABLE) { + if (addr >= (SMEM_BASE_ADDR - SMEM_SIZE) + && addr < SMEM_BASE_ADDR) { + assert((addr + size) <= SMEM_BASE_ADDR); + return AddrType::Shared; + } + } + if (addr >= IO_BASE_ADDR) { + return AddrType::IO; + } + return AddrType::Global; +} + +/////////////////////////////////////////////////////////////////////////////// + enum class FpuType { FNCP, FMA, @@ -116,6 +165,8 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + enum class GpuType { 
TMC, WSPAWN, @@ -137,6 +188,8 @@ inline std::ostream &operator<<(std::ostream &os, const GpuType& type) { return os; } +/////////////////////////////////////////////////////////////////////////////// + enum class ArbiterType { Priority, RoundRobin @@ -152,6 +205,30 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) { /////////////////////////////////////////////////////////////////////////////// +struct MemReq { + uint64_t addr; + uint32_t tag; + bool write; + bool is_io; + + MemReq(uint64_t _addr = 0, + uint64_t _tag = 0, + bool _write = false, + bool _is_io = false + ) : addr(_addr) + , tag(_tag) + , write(_write) + , is_io(_is_io) + {} +}; + +struct MemRsp { + uint64_t tag; + MemRsp(uint64_t _tag = 0) : tag (_tag) {} +}; + +/////////////////////////////////////////////////////////////////////////////// + template class Queue { protected: @@ -164,21 +241,29 @@ public: return queue_.empty(); } - const T& top() const { + const T& front() const { return queue_.front(); } - T& top() { + T& front() { return queue_.front(); } - void pop() { - queue_.pop(); + const T& back() const { + return queue_.back(); + } + + T& back() { + return queue_.back(); } void push(const T& value) { queue_.push(value); } + + void pop() { + queue_.pop(); + } }; /////////////////////////////////////////////////////////////////////////////// @@ -187,20 +272,24 @@ template class HashTable { private: std::vector> entries_; - uint32_t capacity_; + uint32_t size_; public: - HashTable(uint32_t size) - : entries_(size) - , capacity_(0) + HashTable(uint32_t capacity) + : entries_(capacity) + , size_(0) {} bool empty() const { - return (0 == capacity_); + return (0 == size_); } bool full() const { - return (capacity_ == entries_.size()); + return (size_ == entries_.size()); + } + + uint32_t size() const { + return size_; } bool contains(uint32_t index) const { @@ -225,7 +314,7 @@ public: if (!entry.first) { entry.first = true; entry.second = value; - ++capacity_; + ++size_; 
return i; } } @@ -237,7 +326,7 @@ public: auto& entry = entries_.at(index); assert(entry.first); entry.first = false; - --capacity_; + --size_; } }; @@ -287,7 +376,7 @@ public: uint32_t j = (cursor_ + i) % n; auto& req_in = ReqIn.at(j); if (!req_in.empty()) { - auto& req = req_in.top(); + auto& req = req_in.front(); if (tag_shift_) { req.tag = (req.tag << tag_shift_) | j; } @@ -300,7 +389,7 @@ public: // process incoming reponses if (!RspIn.empty()) { - auto& rsp = RspIn.top(); + auto& rsp = RspIn.front(); uint32_t port_id = 0; if (tag_shift_) { port_id = rsp.tag & ((1 << tag_shift_)-1); @@ -317,10 +406,10 @@ public: } } - std::vector> ReqIn; - MasterPort ReqOut; - SlavePort RspIn; - std::vector> RspOut; + std::vector> ReqIn; + SimPort ReqOut; + SimPort RspIn; + std::vector> RspOut; }; } \ No newline at end of file diff --git a/sim/simX/warp.cpp b/sim/simX/warp.cpp index 0392c1b9..df0c0e75 100644 --- a/sim/simX/warp.cpp +++ b/sim/simX/warp.cpp @@ -27,7 +27,7 @@ void Warp::eval(pipeline_trace_t *trace) { DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask="); for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) DPN(2, tmask_.test(n-i-1)); - DPN(2, ", PC=0x" << std::hex << PC_ << std::endl); + DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl); /* Fetch and decode. 
*/ @@ -38,7 +38,7 @@ void Warp::eval(pipeline_trace_t *trace) { std::abort(); } - DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr << " (#" << trace->id << ")"); + DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr); // Update trace trace->cid = core_->id(); diff --git a/sim/simX/warp.h b/sim/simX/warp.h index 5af5eb02..c5a54205 100644 --- a/sim/simX/warp.h +++ b/sim/simX/warp.h @@ -46,6 +46,10 @@ public: return active_; } + void suspend() { + active_ = false; + } + void activate() { active_ = true; } diff --git a/tests/regression/tex/kernel.c b/tests/regression/tex/kernel.c index 88aec50c..9a36d8cb 100644 --- a/tests/regression/tex/kernel.c +++ b/tests/regression/tex/kernel.c @@ -62,15 +62,16 @@ int main() { kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; // configure texture unit - csr_write(CSR_TEX(0, TEX_STATE_WIDTH), arg->src_logwidth); - csr_write(CSR_TEX(0, TEX_STATE_HEIGHT), arg->src_logheight); - csr_write(CSR_TEX(0, TEX_STATE_FORMAT), arg->format); - csr_write(CSR_TEX(0, TEX_STATE_WRAPU), arg->wrapu); - csr_write(CSR_TEX(0, TEX_STATE_WRAPV), arg->wrapv); - csr_write(CSR_TEX(0, TEX_STATE_FILTER), (arg->filter ? 1 : 0)); - csr_write(CSR_TEX(0, TEX_STATE_ADDR), arg->src_addr); + csr_write(CSR_TEX_UNIT, 0); + csr_write(CSR_TEX_WIDTH, arg->src_logwidth); + csr_write(CSR_TEX_HEIGHT, arg->src_logheight); + csr_write(CSR_TEX_FORMAT, arg->format); + csr_write(CSR_TEX_WRAPU, arg->wrapu); + csr_write(CSR_TEX_WRAPV, arg->wrapv); + csr_write(CSR_TEX_FILTER, (arg->filter ? 
1 : 0)); + csr_write(CSR_TEX_ADDR, arg->src_addr); static_for_t()([&](int i) { - csr_write(CSR_TEX(0, TEX_STATE_MIPOFF(i)), arg->mip_offs[i]); + csr_write(CSR_TEX_MIPOFF(i), arg->mip_offs[i]); }); tile_arg_t targ; From 2a7a4df342fd2602097657674ea1fde62a2a7922 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Nov 2021 07:17:58 -0500 Subject: [PATCH 12/27] simx directory name fix --- driver/simx/Makefile | 4 +-- miscs/rvvector/basic/Makefile | 2 +- sim/Makefile | 4 +-- sim/{simX => simx}/Makefile | 2 +- sim/{simX => simx}/archdef.h | 0 sim/{simX => simx}/args.cpp | 0 sim/{simX => simx}/args.h | 0 sim/{simX => simx}/cache.cpp | 0 sim/{simX => simx}/cache.h | 0 sim/{simX => simx}/constants.h | 0 sim/{simX => simx}/core.cpp | 0 sim/{simX => simx}/core.h | 0 sim/{simX => simx}/debug.h | 0 sim/{simX => simx}/decode.cpp | 0 sim/{simX => simx}/decode.h | 0 sim/{simX => simx}/execute.cpp | 0 sim/{simX => simx}/exeunit.cpp | 0 sim/{simX => simx}/exeunit.h | 0 sim/{simX => simx}/ibuffer.h | 0 sim/{simX => simx}/instr.h | 0 sim/{simX => simx}/main.cpp | 0 sim/{simX => simx}/memsim.cpp | 0 sim/{simX => simx}/memsim.h | 0 sim/{simX => simx}/pipeline.h | 0 sim/{simX => simx}/processor.cpp | 0 sim/{simX => simx}/processor.h | 0 sim/{simX => simx}/scoreboard.h | 0 sim/{simX => simx}/sharedmem.h | 0 sim/{simX => simx}/tex_unit.cpp | 0 sim/{simX => simx}/tex_unit.h | 0 sim/{simX => simx}/types.h | 0 sim/{simX => simx}/warp.cpp | 0 sim/{simX => simx}/warp.h | 0 sim/vlsim/vortex_afu.h | 49 ------------------------------ tests/opencl/BlackScholes/Makefile | 2 +- tests/opencl/DotProduct/Makefile | 2 +- tests/opencl/VectorHypot/Makefile | 2 +- tests/opencl/cutcp/Makefile | 2 +- tests/opencl/lbm/Makefile | 2 +- tests/opencl/mri-q/Makefile | 2 +- tests/opencl/reduce0/Makefile | 2 +- tests/opencl/sad/Makefile | 2 +- tests/opencl/spmv/Makefile | 2 +- tests/opencl/stencil/Makefile | 2 +- tests/riscv/isa/Makefile | 2 +- tests/runtime/fibonacci/Makefile | 2 +- tests/runtime/hello/Makefile 
| 2 +- tests/runtime/simple/Makefile | 2 +- 48 files changed, 20 insertions(+), 69 deletions(-) rename sim/{simX => simx}/Makefile (98%) rename sim/{simX => simx}/archdef.h (100%) rename sim/{simX => simx}/args.cpp (100%) rename sim/{simX => simx}/args.h (100%) rename sim/{simX => simx}/cache.cpp (100%) rename sim/{simX => simx}/cache.h (100%) rename sim/{simX => simx}/constants.h (100%) rename sim/{simX => simx}/core.cpp (100%) rename sim/{simX => simx}/core.h (100%) rename sim/{simX => simx}/debug.h (100%) rename sim/{simX => simx}/decode.cpp (100%) rename sim/{simX => simx}/decode.h (100%) rename sim/{simX => simx}/execute.cpp (100%) rename sim/{simX => simx}/exeunit.cpp (100%) rename sim/{simX => simx}/exeunit.h (100%) rename sim/{simX => simx}/ibuffer.h (100%) rename sim/{simX => simx}/instr.h (100%) rename sim/{simX => simx}/main.cpp (100%) rename sim/{simX => simx}/memsim.cpp (100%) rename sim/{simX => simx}/memsim.h (100%) rename sim/{simX => simx}/pipeline.h (100%) rename sim/{simX => simx}/processor.cpp (100%) rename sim/{simX => simx}/processor.h (100%) rename sim/{simX => simx}/scoreboard.h (100%) rename sim/{simX => simx}/sharedmem.h (100%) rename sim/{simX => simx}/tex_unit.cpp (100%) rename sim/{simX => simx}/tex_unit.h (100%) rename sim/{simX => simx}/types.h (100%) rename sim/{simX => simx}/warp.cpp (100%) rename sim/{simX => simx}/warp.h (100%) delete mode 100644 sim/vlsim/vortex_afu.h diff --git a/driver/simx/Makefile b/driver/simx/Makefile index dea65c35..14114f2a 100644 --- a/driver/simx/Makefile +++ b/driver/simx/Makefile @@ -1,4 +1,4 @@ -SIMX_DIR = ../../sim/simX +SIMX_DIR = ../../sim/simx CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors @@ -9,7 +9,7 @@ CXXFLAGS += $(CONFIGS) CXXFLAGS += -DDUMP_PERF_STATS LDFLAGS += -shared -pthread -LDFLAGS += $(SIMX_DIR)/libsimX.a +LDFLAGS += $(SIMX_DIR)/libsimx.a SRCS = vortex.cpp ../common/vx_utils.cpp diff --git 
a/miscs/rvvector/basic/Makefile b/miscs/rvvector/basic/Makefile index 5a796fe7..66aece0c 100644 --- a/miscs/rvvector/basic/Makefile +++ b/miscs/rvvector/basic/Makefile @@ -36,6 +36,6 @@ ELF: $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf run: - ../../simX/obj_dir/Vcache_simX -E -a rv32i --core vx_vector_main.hex -s -b 1> emulator.debug + ../../simx/obj_dir/Vcache_simX -E -a rv32i --core vx_vector_main.hex -s -b 1> emulator.debug diff --git a/sim/Makefile b/sim/Makefile index e0361709..5c4584f4 100644 --- a/sim/Makefile +++ b/sim/Makefile @@ -1,9 +1,9 @@ all: - $(MAKE) -C simX + $(MAKE) -C simx $(MAKE) -C rtlsim $(MAKE) -C vlsim clean: - $(MAKE) -C simX clean + $(MAKE) -C simx clean $(MAKE) -C rtlsim clean $(MAKE) -C vlsim clean \ No newline at end of file diff --git a/sim/simX/Makefile b/sim/simx/Makefile similarity index 98% rename from sim/simX/Makefile rename to sim/simx/Makefile index b3312bb0..ad4e38c8 100644 --- a/sim/simX/Makefile +++ b/sim/simx/Makefile @@ -28,7 +28,7 @@ else CXXFLAGS += -O2 -DNDEBUG endif -PROJECT = simX +PROJECT = simx all: $(PROJECT) diff --git a/sim/simX/archdef.h b/sim/simx/archdef.h similarity index 100% rename from sim/simX/archdef.h rename to sim/simx/archdef.h diff --git a/sim/simX/args.cpp b/sim/simx/args.cpp similarity index 100% rename from sim/simX/args.cpp rename to sim/simx/args.cpp diff --git a/sim/simX/args.h b/sim/simx/args.h similarity index 100% rename from sim/simX/args.h rename to sim/simx/args.h diff --git a/sim/simX/cache.cpp b/sim/simx/cache.cpp similarity index 100% rename from sim/simX/cache.cpp rename to sim/simx/cache.cpp diff --git a/sim/simX/cache.h b/sim/simx/cache.h similarity index 100% rename from sim/simX/cache.h rename to sim/simx/cache.h diff --git a/sim/simX/constants.h b/sim/simx/constants.h similarity index 100% rename from sim/simX/constants.h rename to sim/simx/constants.h diff --git a/sim/simX/core.cpp 
b/sim/simx/core.cpp similarity index 100% rename from sim/simX/core.cpp rename to sim/simx/core.cpp diff --git a/sim/simX/core.h b/sim/simx/core.h similarity index 100% rename from sim/simX/core.h rename to sim/simx/core.h diff --git a/sim/simX/debug.h b/sim/simx/debug.h similarity index 100% rename from sim/simX/debug.h rename to sim/simx/debug.h diff --git a/sim/simX/decode.cpp b/sim/simx/decode.cpp similarity index 100% rename from sim/simX/decode.cpp rename to sim/simx/decode.cpp diff --git a/sim/simX/decode.h b/sim/simx/decode.h similarity index 100% rename from sim/simX/decode.h rename to sim/simx/decode.h diff --git a/sim/simX/execute.cpp b/sim/simx/execute.cpp similarity index 100% rename from sim/simX/execute.cpp rename to sim/simx/execute.cpp diff --git a/sim/simX/exeunit.cpp b/sim/simx/exeunit.cpp similarity index 100% rename from sim/simX/exeunit.cpp rename to sim/simx/exeunit.cpp diff --git a/sim/simX/exeunit.h b/sim/simx/exeunit.h similarity index 100% rename from sim/simX/exeunit.h rename to sim/simx/exeunit.h diff --git a/sim/simX/ibuffer.h b/sim/simx/ibuffer.h similarity index 100% rename from sim/simX/ibuffer.h rename to sim/simx/ibuffer.h diff --git a/sim/simX/instr.h b/sim/simx/instr.h similarity index 100% rename from sim/simX/instr.h rename to sim/simx/instr.h diff --git a/sim/simX/main.cpp b/sim/simx/main.cpp similarity index 100% rename from sim/simX/main.cpp rename to sim/simx/main.cpp diff --git a/sim/simX/memsim.cpp b/sim/simx/memsim.cpp similarity index 100% rename from sim/simX/memsim.cpp rename to sim/simx/memsim.cpp diff --git a/sim/simX/memsim.h b/sim/simx/memsim.h similarity index 100% rename from sim/simX/memsim.h rename to sim/simx/memsim.h diff --git a/sim/simX/pipeline.h b/sim/simx/pipeline.h similarity index 100% rename from sim/simX/pipeline.h rename to sim/simx/pipeline.h diff --git a/sim/simX/processor.cpp b/sim/simx/processor.cpp similarity index 100% rename from sim/simX/processor.cpp rename to sim/simx/processor.cpp diff 
--git a/sim/simX/processor.h b/sim/simx/processor.h similarity index 100% rename from sim/simX/processor.h rename to sim/simx/processor.h diff --git a/sim/simX/scoreboard.h b/sim/simx/scoreboard.h similarity index 100% rename from sim/simX/scoreboard.h rename to sim/simx/scoreboard.h diff --git a/sim/simX/sharedmem.h b/sim/simx/sharedmem.h similarity index 100% rename from sim/simX/sharedmem.h rename to sim/simx/sharedmem.h diff --git a/sim/simX/tex_unit.cpp b/sim/simx/tex_unit.cpp similarity index 100% rename from sim/simX/tex_unit.cpp rename to sim/simx/tex_unit.cpp diff --git a/sim/simX/tex_unit.h b/sim/simx/tex_unit.h similarity index 100% rename from sim/simX/tex_unit.h rename to sim/simx/tex_unit.h diff --git a/sim/simX/types.h b/sim/simx/types.h similarity index 100% rename from sim/simX/types.h rename to sim/simx/types.h diff --git a/sim/simX/warp.cpp b/sim/simx/warp.cpp similarity index 100% rename from sim/simX/warp.cpp rename to sim/simx/warp.cpp diff --git a/sim/simX/warp.h b/sim/simx/warp.h similarity index 100% rename from sim/simX/warp.h rename to sim/simx/warp.h diff --git a/sim/vlsim/vortex_afu.h b/sim/vlsim/vortex_afu.h deleted file mode 100644 index 1a1dee44..00000000 --- a/sim/vlsim/vortex_afu.h +++ /dev/null @@ -1,49 +0,0 @@ -// auto-generated by gen_config.py. 
DO NOT EDIT -// Generated at 2021-11-25 13:43:13.259966 - -// Translated from VX_config.vh: - -#ifndef __VORTEX_AFU__ -#define __VORTEX_AFU__ - - - -#define PLATFORM_PROVIDES_LOCAL_MEMORY - -#ifndef PLATFORM_PARAM_LOCAL_MEMORY_BANKS -#define PLATFORM_PARAM_LOCAL_MEMORY_BANKS 2 -#endif - -#ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH -#define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH 26 -#endif - -#ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH -#define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH 512 -#endif - -#ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH -#define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4 -#endif - - - -#define AFU_ACCEL_NAME "vortex_afu" -#define AFU_ACCEL_UUID 0x35f9452b_25c2_434c_93d5_6f8c60db361c - -#define AFU_IMAGE_CMD_MEM_READ 1 -#define AFU_IMAGE_CMD_MEM_WRITE 2 -#define AFU_IMAGE_CMD_RUN 3 -#define AFU_IMAGE_MMIO_CMD_TYPE 10 -#define AFU_IMAGE_MMIO_DATA_SIZE 16 -#define AFU_IMAGE_MMIO_IO_ADDR 12 -#define AFU_IMAGE_MMIO_MEM_ADDR 14 -#define AFU_IMAGE_MMIO_SCOPE_READ 20 -#define AFU_IMAGE_MMIO_SCOPE_WRITE 22 -#define AFU_IMAGE_MMIO_DEV_CAPS 24 -#define AFU_IMAGE_MMIO_STATUS 18 - -#define AFU_IMAGE_POWER 0 -#define AFU_TOP_IFC "ccip_std_afu_avalon_mm" - -#endif diff --git a/tests/opencl/BlackScholes/Makefile b/tests/opencl/BlackScholes/Makefile index 54ffe7ab..30091c87 100644 --- a/tests/opencl/BlackScholes/Makefile +++ b/tests/opencl/BlackScholes/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/DotProduct/Makefile b/tests/opencl/DotProduct/Makefile index 44eaf258..3f3a68f3 100644 --- a/tests/opencl/DotProduct/Makefile +++ 
b/tests/opencl/DotProduct/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/VectorHypot/Makefile b/tests/opencl/VectorHypot/Makefile index bd5cf982..e58561ca 100644 --- a/tests/opencl/VectorHypot/Makefile +++ b/tests/opencl/VectorHypot/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/cutcp/Makefile b/tests/opencl/cutcp/Makefile index 7d4ed97b..3d694a63 100644 --- a/tests/opencl/cutcp/Makefile +++ b/tests/opencl/cutcp/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/lbm/Makefile b/tests/opencl/lbm/Makefile index 2f0116a8..ffa85d1a 100644 --- a/tests/opencl/lbm/Makefile +++ b/tests/opencl/lbm/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard 
../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/mri-q/Makefile b/tests/opencl/mri-q/Makefile index ff9f420c..0aa409b6 100644 --- a/tests/opencl/mri-q/Makefile +++ b/tests/opencl/mri-q/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/reduce0/Makefile b/tests/opencl/reduce0/Makefile index b4aede1d..bb72241f 100644 --- a/tests/opencl/reduce0/Makefile +++ b/tests/opencl/reduce0/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/sad/Makefile b/tests/opencl/sad/Makefile index a8314b9c..129996be 100644 --- a/tests/opencl/sad/Makefile +++ b/tests/opencl/sad/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ 
diff --git a/tests/opencl/spmv/Makefile b/tests/opencl/spmv/Makefile index eedbed22..f3c7a13f 100644 --- a/tests/opencl/spmv/Makefile +++ b/tests/opencl/spmv/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/opencl/stencil/Makefile b/tests/opencl/stencil/Makefile index ba69490b..41e05787 100644 --- a/tests/opencl/stencil/Makefile +++ b/tests/opencl/stencil/Makefile @@ -3,7 +3,7 @@ POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc) POCL_INC_PATH ?= $(wildcard ../include) POCL_LIB_PATH ?= $(wildcard ../lib) VORTEX_RT_PATH ?= $(wildcard ../../../runtime) -VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir) +VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir) CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/riscv/isa/Makefile b/tests/riscv/isa/Makefile index fba3bdd1..de35c0d0 100644 --- a/tests/riscv/isa/Makefile +++ b/tests/riscv/isa/Makefile @@ -10,7 +10,7 @@ TESTS := $(filter-out $(EXCLUDED_TESTS), $(ALL_TESTS)) all: run-simx: - $(foreach test, $(TESTS), ../../../sim/simX/simX -r -a rv32i -c 1 -i $(test) || exit;) + $(foreach test, $(TESTS), ../../../sim/simx/simx -r -a rv32i -c 1 -i $(test) || exit;) run-rtlsim: $(foreach test, $(TESTS), ../../../sim/rtlsim/rtlsim -r $(test) || exit;) diff --git a/tests/runtime/fibonacci/Makefile b/tests/runtime/fibonacci/Makefile index cd5195e0..1ea96718 100644 --- a/tests/runtime/fibonacci/Makefile +++ b/tests/runtime/fibonacci/Makefile @@ -30,7 +30,7 @@ run-rtlsim: $(PROJECT).bin ../../../sim/rtlsim/rtlsim $(PROJECT).bin run-simx: $(PROJECT).bin - 
../../../sim/simX/simX -a rv32i -c 1 -i $(PROJECT).bin + ../../../sim/simx/simx -a rv32i -c 1 -i $(PROJECT).bin .depend: $(SRCS) $(CC) $(CFLAGS) -MM $^ > .depend; diff --git a/tests/runtime/hello/Makefile b/tests/runtime/hello/Makefile index 43e768b6..9c83df0c 100644 --- a/tests/runtime/hello/Makefile +++ b/tests/runtime/hello/Makefile @@ -30,7 +30,7 @@ run-rtlsim: $(PROJECT).bin ../../../sim/rtlsim/rtlsim $(PROJECT).bin run-simx: $(PROJECT).bin - ../../../sim/simX/simX -a rv32i -c 1 -i $(PROJECT).bin + ../../../sim/simx/simx -a rv32i -c 1 -i $(PROJECT).bin .depend: $(SRCS) $(CC) $(CFLAGS) -MM $^ > .depend; diff --git a/tests/runtime/simple/Makefile b/tests/runtime/simple/Makefile index dabb4cc0..79e2a2e5 100644 --- a/tests/runtime/simple/Makefile +++ b/tests/runtime/simple/Makefile @@ -30,7 +30,7 @@ run-rtlsim: $(PROJECT).bin ../../../sim/rtlsim/rtlsim $(PROJECT).bin run-simx: $(PROJECT).bin - ../../../sim/simX/simX -a rv32i -c 1 -i $(PROJECT).bin + ../../../sim/simx/simx -a rv32i -c 1 -i $(PROJECT).bin .depend: $(SRCS) $(CC) $(CFLAGS) -MM $^ > .depend; From d4addc65ab10ad4722266ee269d321563f4ebc65 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Nov 2021 08:55:40 -0500 Subject: [PATCH 13/27] minor update --- third_party/cocogfx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/cocogfx b/third_party/cocogfx index 68e3625e..04b10969 160000 --- a/third_party/cocogfx +++ b/third_party/cocogfx @@ -1 +1 @@ -Subproject commit 68e3625e70acd1fbd5fcfc629223d370f7b6806e +Subproject commit 04b109692cf6d0128f5ae89cbb4a7d77bfbc9f6a From d4cb3b8410e21b5f18c916c074204c3fe857e288 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Nov 2021 14:24:34 -0500 Subject: [PATCH 14/27] removed cocogfx --- .gitmodules | 3 --- third_party/cocogfx | 1 - 2 files changed, 4 deletions(-) delete mode 160000 third_party/cocogfx diff --git a/.gitmodules b/.gitmodules index 360e5c00..0acd81d3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ 
-[submodule "third_party/cocogfx"] - path = third_party/cocogfx - url = https://github.com/gtcasl/cocogfx.git [submodule "third_party/fpnew"] path = third_party/fpnew url = https://github.com/pulp-platform/fpnew.git diff --git a/third_party/cocogfx b/third_party/cocogfx deleted file mode 160000 index 04b10969..00000000 --- a/third_party/cocogfx +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 04b109692cf6d0128f5ae89cbb4a7d77bfbc9f6a From 7c4b3cab290dd89765ec239d663e69e2338156e3 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Nov 2021 14:26:54 -0500 Subject: [PATCH 15/27] adding cocogfx --- .gitmodules | 3 +++ third_party/cocogfx | 1 + 2 files changed, 4 insertions(+) create mode 160000 third_party/cocogfx diff --git a/.gitmodules b/.gitmodules index 0acd81d3..26a07f16 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "third_party/softfloat"] path = third_party/softfloat url = https://github.com/ucb-bar/berkeley-softfloat-3.git +[submodule "third_party/cocogfx"] + path = third_party/cocogfx + url = https://github.com/gtcasl/cocogfx.git diff --git a/third_party/cocogfx b/third_party/cocogfx new file mode 160000 index 00000000..04b10969 --- /dev/null +++ b/third_party/cocogfx @@ -0,0 +1 @@ +Subproject commit 04b109692cf6d0128f5ae89cbb4a7d77bfbc9f6a From 4477cbeed135d07dc5077a73a14d433526abff7a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 30 Nov 2021 15:36:59 -0500 Subject: [PATCH 16/27] blackbox caching fix --- ci/blackbox.sh | 8 +++++--- hw/rtl/VX_config.vh | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ci/blackbox.sh b/ci/blackbox.sh index f2c6ec2b..88930faf 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -124,9 +124,11 @@ CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_TH echo "CONFIGS=$CONFIGS" -if [ -f "blackbox.cache" ] +BLACKBOX_CACHE=blackbox.$DRIVER.cache + +if [ -f "$BLACKBOX_CACHE" ] then - LAST_CONFIGS=`cat blackbox.cache` + LAST_CONFIGS=`cat $BLACKBOX_CACHE` fi if 
[ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; @@ -134,7 +136,7 @@ then make -C $DRIVER_PATH clean fi -echo "$CONFIGS+$DEBUG+$SCOPE" > blackbox.cache +echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE status=0 diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 8e0bbaa8..e9e57b03 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -269,7 +269,7 @@ `define CSR_TEX_WRAPU (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPU) `define CSR_TEX_WRAPV (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPV) `define CSR_TEX_MIPOFF(lod) (`CSR_TEX_STATE_BEGIN+`TEX_STATE_MIPOFF(lod)) -`define CSR_TEX_STATE_END (`CSR_TEX_STATE_BEGIN + `NUM_TEX_STATES) +`define CSR_TEX_STATE_END (`CSR_TEX_STATE_BEGIN+`NUM_TEX_STATES) `define CSR_TEX_STATE(addr) ((addr) - `CSR_TEX_STATE_BEGIN) From 092ff42ab42bbf20aeaba4788c0ee73fd3d22fda Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 1 Dec 2021 00:12:16 -0500 Subject: [PATCH 17/27] simx multicore fix --- ci/regression.sh | 7 +++++++ driver/simx/vortex.cpp | 5 ++--- sim/simx/constants.h | 2 ++ sim/simx/core.cpp | 15 +++++++++++++-- sim/simx/core.h | 1 + sim/simx/main.cpp | 3 +-- sim/simx/processor.cpp | 38 +++++++++++++++++++++----------------- 7 files changed, 47 insertions(+), 24 deletions(-) diff --git a/ci/regression.sh b/ci/regression.sh index 2be58140..b6125ce1 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -43,15 +43,20 @@ echo "begin clustering tests..." 
# warp/threads configurations ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo +./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=demo # cores clustering ./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1" +./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=demo --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=demo --args="-n1" # L2/L3 ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=demo --args="-n1" echo "clustering tests done!" 
} @@ -101,12 +106,14 @@ CONFIGS="-DMEM_BLOCK_SIZE=16 -DL1_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsi # test cache banking CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr +CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=io_addr # test cache multi-porting CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --debug --args="-n1" CONFIGS="-DL2_NUM_PORTS=2 -DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=io_addr +CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=io_addr # test 128-bit MEM block CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index d63005d6..2aaef1e9 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -9,11 +9,10 @@ #include #include #include +#include #include #include -#define RAM_PAGE_SIZE 4096 - using namespace vortex; /////////////////////////////////////////////////////////////////////////////// @@ -58,7 +57,7 @@ private: class vx_device { public: vx_device() - : arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS) + : arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS) , ram_(RAM_PAGE_SIZE) , mem_allocation_(ALLOC_BASE_ADDR) {} diff --git a/sim/simx/constants.h b/sim/simx/constants.h index b173a03f..7d8daed5 100644 --- a/sim/simx/constants.h +++ b/sim/simx/constants.h @@ -6,6 +6,8 @@ #define MEM_LATENCY 24 #endif +#define RAM_PAGE_SIZE 4096 + namespace vortex { enum Constants { diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 7c6cbffa..934ce1f8 100644 --- a/sim/simx/core.cpp +++ 
b/sim/simx/core.cpp @@ -21,6 +21,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , arch_(arch) , decoder_(arch) , mmu_(0, arch.wsize(), true) + , smem_(RAM_PAGE_SIZE) , tex_units_(NUM_TEX_UNITS, this) , warps_(arch.num_warps()) , barriers_(arch.num_barriers(), 0) @@ -380,7 +381,12 @@ Word Core::icache_read(Addr addr, Size size) { Word Core::dcache_read(Addr addr, Size size) { Word data; - mmu_.read(&data, addr, size, 0); + auto type = get_addr_type(addr, size); + if (type == AddrType::Shared) { + smem_.read(&data, addr & (SMEM_SIZE-1), size); + } else { + mmu_.read(&data, addr, size, 0); + } return data; } @@ -389,7 +395,12 @@ void Core::dcache_write(Addr addr, Word data, Size size) { && addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { this->writeToStdOut(addr, data); } else { - mmu_.write(&data, addr, size, 0); + auto type = get_addr_type(addr, size); + if (type == AddrType::Shared) { + smem_.write(&data, addr & (SMEM_SIZE-1), size); + } else { + mmu_.write(&data, addr, size, 0); + } } } diff --git a/sim/simx/core.h b/sim/simx/core.h index e4a6034e..b9c01383 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -137,6 +137,7 @@ private: const ArchDef arch_; const Decoder decoder_; MemoryUnit mmu_; + RAM smem_; std::vector tex_units_; std::vector> warps_; diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index a0e07faf..86829f3a 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -6,11 +6,10 @@ #include #include #include "processor.h" +#include "constants.h" #include #include "args.h" -#define RAM_PAGE_SIZE 4096 - using namespace vortex; int main(int argc, char **argv) { diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index ca9d46a6..f069a6b7 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -18,8 +18,9 @@ Processor::Processor(const ArchDef& arch) // connect memory sub-systen memsim_ = MemSim::Create(1, MEM_LATENCY); - std::vector*> mem_req_ports(1); + std::vector*> mem_req_ports(1); std::vector*> 
mem_rsp_ports(1); + mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); @@ -46,6 +47,7 @@ Processor::Processor(const ArchDef& arch) mem_req_ports.resize(NUM_CLUSTERS); mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); @@ -57,13 +59,17 @@ Processor::Processor(const ArchDef& arch) mem_req_ports.resize(NUM_CLUSTERS); mem_rsp_ports.resize(NUM_CLUSTERS); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); } } - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + std::vector*> cluster_mem_req_ports(cores_per_cluster); + std::vector*> cluster_mem_rsp_ports(cores_per_cluster); + if (L2_ENABLE) { auto& l2cache = l2caches_.at(i); l2cache = Cache::Create("l2cache", Cache::Config{ @@ -74,40 +80,38 @@ Processor::Processor(const ArchDef& arch) 32, // address bits L2_NUM_BANKS, // number of banks L2_NUM_PORTS, // number of ports - NUM_CORES, // request size + (uint8_t)cores_per_cluster, // request size true, // write-through false, // write response 0, // victim size L2_MSHR_SIZE, // mshr 2, // pipeline latency }); + mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); l2cache->MemReqPort.bind(mem_req_ports.at(i)); - mem_req_ports.resize(cores_per_cluster); - mem_rsp_ports.resize(cores_per_cluster); for (uint32_t j = 0; j < cores_per_cluster; ++j) { - mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); - mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); + cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); + cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); } - } else if (cores_per_cluster > 1) { + } else { auto& l2_mem_switch = l2_mem_switches_.at(i); - l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES); - 
mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); - l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); + l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster); + + mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); + l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); - mem_req_ports.resize(cores_per_cluster); - mem_rsp_ports.resize(cores_per_cluster); for (uint32_t j = 0; j < cores_per_cluster; ++j) { - mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); - mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); + cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); + cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); } } for (uint32_t j = 0; j < cores_per_cluster; ++j) { auto& core = cores_.at((i * NUM_CLUSTERS) + j); - mem_rsp_ports.at(i)->bind(&core->MemRspPort); - core->MemReqPort.bind(mem_req_ports.at(j)); + cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort); + core->MemReqPort.bind(cluster_mem_req_ports.at(j)); } } } From 189cec3ca2529c0061dd121bb27fc92e7332d032 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 1 Dec 2021 10:36:50 -0500 Subject: [PATCH 18/27] minor update --- ci/regression.sh | 1 + sim/simx/processor.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/regression.sh b/ci/regression.sh index b6125ce1..4a1336c8 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -56,6 +56,7 @@ echo "begin clustering tests..." ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1" +./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=demo --args="-n1" ./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=demo --args="-n1" echo "clustering tests done!" 
diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index f069a6b7..6bb46229 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -109,7 +109,7 @@ Processor::Processor(const ArchDef& arch) } for (uint32_t j = 0; j < cores_per_cluster; ++j) { - auto& core = cores_.at((i * NUM_CLUSTERS) + j); + auto& core = cores_.at((i * cores_per_cluster) + j); cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort); core->MemReqPort.bind(cluster_mem_req_ports.at(j)); } From 38f166f09039e18e5ac9d752f511b27abc698d55 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 2 Dec 2021 10:22:21 -0800 Subject: [PATCH 19/27] texture unit hardware optimizations --- hw/rtl/VX_lsu_unit.sv | 1 + hw/rtl/VX_writeback.sv | 3 +- hw/rtl/tex_unit/VX_tex_addr.sv | 38 ++++++------- hw/rtl/tex_unit/VX_tex_lerp.sv | 7 +-- hw/rtl/tex_unit/VX_tex_mem.sv | 40 +++++++++---- hw/rtl/tex_unit/VX_tex_sampler.sv | 95 ++++++++++++++++--------------- hw/rtl/tex_unit/VX_tex_unit.sv | 40 +++++++------ hw/syn/quartus/Makefile | 9 ++- hw/syn/quartus/texunit/Makefile | 81 ++++++++++++++++++++++++++ 9 files changed, 208 insertions(+), 106 deletions(-) create mode 100644 hw/syn/quartus/texunit/Makefile diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv index e0ed73b5..ec8fca80 100644 --- a/hw/rtl/VX_lsu_unit.sv +++ b/hw/rtl/VX_lsu_unit.sv @@ -44,6 +44,7 @@ module VX_lsu_unit #( wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type; + // full address calculation wire [`NUM_THREADS-1:0][31:0] full_addr; for (genvar i = 0; i < `NUM_THREADS; i++) begin assign full_addr[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset; diff --git a/hw/rtl/VX_writeback.sv b/hw/rtl/VX_writeback.sv index 5b67256c..f4471046 100644 --- a/hw/rtl/VX_writeback.sv +++ b/hw/rtl/VX_writeback.sv @@ -64,7 +64,8 @@ module VX_writeback #( VX_stream_arbiter #( .NUM_REQS (NUM_RSPS), .DATAW (DATAW), - .TYPE ("P") + .BUFFERED (1), + .TYPE ("R") ) rsp_arb ( .clk (clk), .reset (reset), diff --git 
a/hw/rtl/tex_unit/VX_tex_addr.sv b/hw/rtl/tex_unit/VX_tex_addr.sv index c33cc47a..87da9cef 100644 --- a/hw/rtl/tex_unit/VX_tex_addr.sv +++ b/hw/rtl/tex_unit/VX_tex_addr.sv @@ -17,6 +17,7 @@ module VX_tex_addr #( input wire [`TEX_FILTER_BITS-1:0] req_filter, input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps, input wire [`TEX_ADDR_BITS-1:0] req_baseaddr, + input wire [NUM_REQS-1:0][`TEX_LOD_BITS-1:0] mip_level, input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff, input wire [NUM_REQS-1:0][1:0][`TEX_LOD_BITS-1:0] req_logdims, input wire [REQ_INFOW-1:0] req_info, @@ -28,6 +29,7 @@ module VX_tex_addr #( output wire [NUM_REQS-1:0] rsp_tmask, output wire [`TEX_FILTER_BITS-1:0] rsp_filter, output wire [`TEX_LGSTRIDE_BITS-1:0] rsp_lgstride, + output wire [NUM_REQS-1:0][31:0] rsp_baseaddr, output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr, output wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends, output wire [REQ_INFOW-1:0] rsp_info, @@ -38,6 +40,7 @@ module VX_tex_addr #( localparam SHIFT_BITS = $clog2(`TEX_FXD_FRAC+1); localparam PITCH_BITS = `MAX(`TEX_LOD_BITS, `TEX_LGSTRIDE_BITS) + 1; + localparam SCALED_DIM = `TEX_FXD_FRAC + `TEX_DIM_BITS; localparam SCALED_X_W = `TEX_DIM_BITS + `TEX_BLEND_FRAC; localparam OFFSET_U_W = `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX; localparam OFFSET_V_W = `TEX_DIM_BITS + `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX; @@ -69,7 +72,7 @@ module VX_tex_addr #( for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar j = 0; j < 2; ++j) begin - wire [`TEX_FXD_FRAC-1:0] delta = (`TEX_FXD_HALF >> req_logdims[i][j]); + wire [`TEX_FXD_FRAC-1:0] delta = `TEX_FXD_FRAC'((SCALED_DIM'(`TEX_FXD_HALF) << mip_level[i]) >> req_logdims[i][j]); wire [`TEX_FXD_BITS-1:0] coord_lo = req_filter ? (req_coords[j][i] - `TEX_FXD_BITS'(delta)) : req_coords[j][i]; wire [`TEX_FXD_BITS-1:0] coord_hi = req_filter ? 
(req_coords[j][i] + `TEX_FXD_BITS'(delta)) : req_coords[j][i]; @@ -89,14 +92,14 @@ module VX_tex_addr #( .coord_o (clamped_hi[i][j]) ); - assign dim_shift[i][j] = (`TEX_FXD_FRAC - `TEX_BLEND_FRAC - req_logdims[i][j]); + assign dim_shift[i][j] = (`TEX_FXD_FRAC - `TEX_BLEND_FRAC - (req_logdims[i][j] - mip_level[i])); end - assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride); - assign mip_addr[i] = req_baseaddr + 32'(req_mipoff[i]); + assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0] - mip_level[i]) + PITCH_BITS'(log_stride); + assign mip_addr[i] = req_baseaddr + `TEX_ADDR_BITS'(req_mipoff[i]); end VX_pipe_register #( - .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * SHIFT_BITS + 32 + 2 * 2 * `TEX_FXD_FRAC)), + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * SHIFT_BITS + `TEX_ADDR_BITS + 2 * 2 * `TEX_FXD_FRAC)), .RESETW (1) ) pipe_reg0 ( .clk (clk), @@ -114,8 +117,6 @@ module VX_tex_addr #( wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_hi; wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_lo; wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_hi; - wire [NUM_REQS-1:0][31:0] base_addr_lo; - wire [NUM_REQS-1:0][31:0] base_addr_hi; wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] blends; wire [NUM_REQS-1:0][3:0][31:0] addr; @@ -134,26 +135,23 @@ module VX_tex_addr #( assign offset_v_lo[i] = OFFSET_V_W'(scaled_lo[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i]; assign offset_v_hi[i] = OFFSET_V_W'(scaled_hi[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i]; - assign base_addr_lo[i] = mip_addr_s0[i] + 32'(offset_v_lo[i]); - assign base_addr_hi[i] = mip_addr_s0[i] + 32'(offset_v_hi[i]); - - assign addr[i][0] = base_addr_lo[i] + 32'(offset_u_lo[i]); - assign addr[i][1] = base_addr_lo[i] + 32'(offset_u_hi[i]); - assign addr[i][2] = base_addr_hi[i] + 32'(offset_u_lo[i]); - assign addr[i][3] = base_addr_hi[i] + 32'(offset_u_hi[i]); + 
assign addr[i][0] = 32'(offset_v_lo[i]) + 32'(offset_u_lo[i]); + assign addr[i][1] = 32'(offset_v_lo[i]) + 32'(offset_u_hi[i]); + assign addr[i][2] = 32'(offset_v_hi[i]) + 32'(offset_u_lo[i]); + assign addr[i][3] = 32'(offset_v_hi[i]) + 32'(offset_u_hi[i]); end assign stall_out = rsp_valid && ~rsp_ready; VX_pipe_register #( - .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `TEX_BLEND_FRAC) + REQ_INFOW), + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (NUM_REQS * 32) + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `TEX_BLEND_FRAC) + REQ_INFOW), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}), - .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_lgstride, rsp_addr, rsp_blends, rsp_info}) + .data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, mip_addr_s0, addr, blends, req_info_s0}), + .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_lgstride, rsp_baseaddr, rsp_addr, rsp_blends, rsp_info}) ); assign req_ready = ~stall_out; @@ -176,6 +174,8 @@ module VX_tex_addr #( `TRACE_ARRAY2D(clamped_lo, 2, NUM_REQS); dpi_trace(", clamped_hi="); `TRACE_ARRAY2D(clamped_hi, 2, NUM_REQS); + dpi_trace(", mip_addr="); + `TRACE_ARRAY1D(mip_addr, NUM_REQS); dpi_trace("\n"); end @@ -192,10 +192,6 @@ module VX_tex_addr #( `TRACE_ARRAY1D(offset_v_lo, NUM_REQS); dpi_trace(", offset_v_hi="); `TRACE_ARRAY1D(offset_v_hi, NUM_REQS); - dpi_trace(", base_addr_lo="); - `TRACE_ARRAY1D(base_addr_lo, NUM_REQS); - dpi_trace(", base_addr_hi="); - `TRACE_ARRAY1D(base_addr_hi, NUM_REQS); dpi_trace("\n"); end diff --git a/hw/rtl/tex_unit/VX_tex_lerp.sv b/hw/rtl/tex_unit/VX_tex_lerp.sv index 6dce57e3..7f35ac38 100644 --- a/hw/rtl/tex_unit/VX_tex_lerp.sv +++ b/hw/rtl/tex_unit/VX_tex_lerp.sv @@ -3,12 +3,11 @@ module VX_tex_lerp ( input wire [3:0][7:0] in1, input wire [3:0][7:0] in2, - input wire [8:0] alpha, - input wire [7:0] beta, 
+ input wire [7:0] frac, output wire [3:0][7:0] out -); +); for (genvar i = 0; i < 4; ++i) begin - wire [16:0] sum = in1[i] * alpha + in2[i] * beta; + wire [16:0] sum = in1[i] * 8'(8'hff - frac) + in2[i] * frac; `UNUSED_VAR (sum) assign out[i] = sum[15:8]; end diff --git a/hw/rtl/tex_unit/VX_tex_mem.sv b/hw/rtl/tex_unit/VX_tex_mem.sv index fc99466e..dd9878a2 100644 --- a/hw/rtl/tex_unit/VX_tex_mem.sv +++ b/hw/rtl/tex_unit/VX_tex_mem.sv @@ -16,6 +16,7 @@ module VX_tex_mem #( input wire [NUM_REQS-1:0] req_tmask, input wire [`TEX_FILTER_BITS-1:0] req_filter, input wire [`TEX_LGSTRIDE_BITS-1:0] req_lgstride, + input wire [NUM_REQS-1:0][31:0] req_baseaddr, input wire [NUM_REQS-1:0][3:0][31:0] req_addr, input wire [REQ_INFOW-1:0] req_info, output wire req_ready, @@ -32,6 +33,14 @@ module VX_tex_mem #( localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1); + // full address calculation + wire [NUM_REQS-1:0][3:0][31:0] full_addr; + for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar j = 0; j < 4; ++j) begin + assign full_addr[i][j] = req_baseaddr[i] + req_addr[i][j]; + end + end + wire [3:0] dup_reqs; wire [3:0][NUM_REQS-1:0][29:0] req_addr_w; wire [3:0][NUM_REQS-1:0][1:0] align_offs; @@ -40,17 +49,17 @@ module VX_tex_mem #( for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar j = 0; j < 4; ++j) begin - assign req_addr_w[j][i] = req_addr[i][j][31:2]; - assign align_offs[j][i] = req_addr[i][j][1:0]; + assign req_addr_w[j][i] = full_addr[i][j][31:2]; + assign align_offs[j][i] = full_addr[i][j][1:0]; end end - // find duplicate addresses + // detect duplicate addresses for (genvar i = 0; i < 4; ++i) begin - wire [NUM_REQS-1:0] addr_matches; - for (genvar j = 0; j < NUM_REQS; j++) begin - assign addr_matches[j] = (req_addr_w[i][0] == req_addr_w[i][j]) || ~req_tmask[j]; + wire [NUM_REQS-2:0] addr_matches; + for (genvar j = 0; j < (NUM_REQS-1); ++j) begin + assign addr_matches[j] = (req_addr_w[i][j+1] == req_addr_w[i][0]) || ~req_tmask[j+1]; end assign dup_reqs[i] = 
req_tmask[0] && (& addr_matches); end @@ -172,6 +181,8 @@ module VX_tex_mem #( reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init; wire [RSP_CTR_W-1:0] rsp_rem_ctr_n; wire [NUM_REQS-1:0][1:0] rsp_align_offs; + wire [$clog2(NUM_REQS+1)-1:0] q_req_size; + wire [$clog2(NUM_REQS+1)-1:0] dcache_rsp_size; wire dcache_rsp_fire; wire [1:0] rsp_texel_idx; wire rsp_texel_dup; @@ -218,16 +229,21 @@ module VX_tex_mem #( end end + `POP_COUNT(q_req_size, q_req_tmask); + always @(*) begin - rsp_rem_ctr_init = RSP_CTR_W'($countones(q_dup_reqs[0] ? NUM_REQS'(1) : q_req_tmask)); + rsp_rem_ctr_init = q_dup_reqs[0] ? RSP_CTR_W'(1) : RSP_CTR_W'(q_req_size); if (q_req_filter) begin for (integer i = 1; i < 4; ++i) begin - rsp_rem_ctr_init += RSP_CTR_W'($countones(q_dup_reqs[i] ? NUM_REQS'(1) : q_req_tmask)); + rsp_rem_ctr_init += q_dup_reqs[i] ? RSP_CTR_W'(1) : RSP_CTR_W'(q_req_size); end end end - assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'($countones(dcache_rsp_if.tmask)); + wire [NUM_REQS-1:0] dcache_rsp_tmask = dcache_rsp_if.tmask; + `POP_COUNT(dcache_rsp_size, dcache_rsp_tmask); + + assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'(dcache_rsp_size); always @(posedge clk) begin if (reset) begin @@ -249,7 +265,7 @@ module VX_tex_mem #( wire stall_out = rsp_valid && ~rsp_ready; - wire is_last_rsp = (0 == rsp_rem_ctr_n); + wire is_last_rsp = (rsp_rem_ctr == RSP_CTR_W'(dcache_rsp_size)); wire rsp_texels_done = dcache_rsp_fire && is_last_rsp; @@ -290,8 +306,10 @@ module VX_tex_mem #( dpi_trace("\n"); end if (req_valid && req_ready) begin - dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, lgstride=%0d, addr=", + dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, lgstride=%0d, baseaddr=", $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_lgstride); + `TRACE_ARRAY1D(req_baseaddr, NUM_REQS); + dpi_trace(", addr="); `TRACE_ARRAY2D(req_addr, 4, NUM_REQS); dpi_trace("\n"); end diff --git a/hw/rtl/tex_unit/VX_tex_sampler.sv 
b/hw/rtl/tex_unit/VX_tex_sampler.sv index 63371337..dffc5cf0 100644 --- a/hw/rtl/tex_unit/VX_tex_sampler.sv +++ b/hw/rtl/tex_unit/VX_tex_sampler.sv @@ -27,75 +27,78 @@ module VX_tex_sampler #( `UNUSED_PARAM (CORE_ID) - wire valid_s0; - wire [NUM_REQS-1:0] tmask_s0; - wire [REQ_INFOW-1:0] req_info_s0; + wire valid_s0, valid_s1; + wire [NUM_REQS-1:0] req_tmask_s0, req_tmask_s1; + wire [REQ_INFOW-1:0] req_info_s0, req_info_s1; wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh; - wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0; - wire [NUM_REQS-1:0][`TEX_BLEND_FRAC-1:0] blend_v, blend_v_s0; + wire [NUM_REQS-1:0][31:0] texel_ul_s1, texel_uh_s1; + wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] req_blends_s0; + wire [NUM_REQS-1:0][`TEX_BLEND_FRAC-1:0] blend_v, blend_v_s1; wire [NUM_REQS-1:0][31:0] texel_v; + wire [NUM_REQS-1:0][3:0][31:0] fmt_texels, fmt_texels_s0; wire stall_out; for (genvar i = 0; i < NUM_REQS; ++i) begin - - wire [3:0][31:0] fmt_texels; - for (genvar j = 0; j < 4; ++j) begin VX_tex_format #( .CORE_ID (CORE_ID) ) tex_format ( .format (req_format), .texel_in (req_data[i][j]), - .texel_out (fmt_texels[j]) + .texel_out (fmt_texels[i][j]) ); - end - - wire [7:0] beta = req_blends[i][0]; - wire [8:0] alpha = `TEX_BLEND_ONE - beta; - - VX_tex_lerp #( - ) tex_lerp_ul ( - .in1 (fmt_texels[0]), - .in2 (fmt_texels[1]), - .alpha (alpha), - .beta (beta), - .out (texel_ul[i]) - ); - - VX_tex_lerp #( - ) tex_lerp_uh ( - .in1 (fmt_texels[2]), - .in2 (fmt_texels[3]), - .alpha (alpha), - .beta (beta), - .out (texel_uh[i]) - ); - - assign blend_v[i] = req_blends[i][1]; + end end VX_pipe_register #( - .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `TEX_BLEND_FRAC) + (2 * NUM_REQS * 32)), + .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 2 * `TEX_BLEND_FRAC) + (NUM_REQS * 4 * 32)), .RESETW (1) ) pipe_reg0 ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({req_valid, req_tmask, req_info, blend_v, texel_ul, texel_uh}), - .data_out ({valid_s0, tmask_s0, req_info_s0, 
blend_v_s0, texel_ul_s0, texel_uh_s0}) + .data_in ({req_valid, req_tmask, req_info, req_blends, fmt_texels}), + .data_out ({valid_s0, req_tmask_s0, req_info_s0, req_blends_s0, fmt_texels_s0}) + ); + + for (genvar i = 0; i < NUM_REQS; ++i) begin + VX_tex_lerp #( + ) tex_lerp_ul ( + .in1 (fmt_texels_s0[i][0]), + .in2 (fmt_texels_s0[i][1]), + .frac (req_blends_s0[i][0]), + .out (texel_ul[i]) + ); + + VX_tex_lerp #( + ) tex_lerp_uh ( + .in1 (fmt_texels_s0[i][2]), + .in2 (fmt_texels_s0[i][3]), + .frac (req_blends_s0[i][0]), + .out (texel_uh[i]) + ); + + assign blend_v[i] = req_blends_s0[i][1]; + end + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `TEX_BLEND_FRAC) + (2 * NUM_REQS * 32)), + .RESETW (1) + ) pipe_reg1 ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({valid_s0, req_tmask_s0, req_info_s0, blend_v, texel_ul, texel_uh}), + .data_out ({valid_s1, req_tmask_s1, req_info_s1, blend_v_s1, texel_ul_s1, texel_uh_s1}) ); for (genvar i = 0; i < NUM_REQS; i++) begin - wire [7:0] beta = blend_v_s0[i]; - wire [8:0] alpha = `TEX_BLEND_ONE - beta; - VX_tex_lerp #( ) tex_lerp_v ( - .in1 (texel_ul_s0[i]), - .in2 (texel_uh_s0[i]), - .alpha (alpha), - .beta (beta), + .in1 (texel_ul_s1[i]), + .in2 (texel_uh_s1[i]), + .frac (blend_v_s1[i]), .out (texel_v[i]) ); end @@ -105,12 +108,12 @@ module VX_tex_sampler #( VX_pipe_register #( .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 32)), .RESETW (1) - ) pipe_reg1 ( + ) pipe_reg2 ( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({valid_s0, tmask_s0, req_info_s0, texel_v}), - .data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data}) + .data_in ({valid_s1, req_tmask_s1, req_info_s1, texel_v}), + .data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data}) ); // can accept new request? 
diff --git a/hw/rtl/tex_unit/VX_tex_unit.sv b/hw/rtl/tex_unit/VX_tex_unit.sv index c9510827..c10cdf64 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.sv +++ b/hw/rtl/tex_unit/VX_tex_unit.sv @@ -23,9 +23,8 @@ module VX_tex_unit #( VX_tex_rsp_if.master tex_rsp_if ); - localparam REQ_INFOW_S = 64 + `NR_BITS + 1 + `NW_BITS + 32; - localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S; - localparam REQ_INFOW_M = (2 * `NUM_THREADS * `TEX_BLEND_FRAC) + REQ_INFOW_A; + localparam REQ_INFO_W = 64 + `NR_BITS + 1 + `NW_BITS + 32; + localparam BLEND_FRAC_W = (2 * `NUM_THREADS * `TEX_BLEND_FRAC); reg [$clog2(`NUM_TEX_UNITS)-1:0] csr_tex_unit; reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][`TEX_LOD_MAX+1-1:0]; @@ -95,15 +94,16 @@ module VX_tex_unit #( // mipmap attributes + wire [`NUM_THREADS-1:0][`TEX_LOD_BITS-1:0] mip_level; wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; wire [`NUM_THREADS-1:0][1:0][`TEX_LOD_BITS-1:0] sel_logdims; for (genvar i = 0; i < `NUM_THREADS; ++i) begin wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0]; - wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][`TEX_LOD_BITS-1:0]; - assign sel_mipoff[i] = tex_mipoff[unit][mip_level]; - assign sel_logdims[i][0] = (tex_logdims[unit][0] - mip_level); - assign sel_logdims[i][1] = (tex_logdims[unit][1] - mip_level); + assign mip_level[i] = tex_req_if.lod[i][`TEX_LOD_BITS-1:0]; + assign sel_mipoff[i] = tex_mipoff[unit][mip_level[i]]; + assign sel_logdims[i][0] = tex_logdims[unit][0]; + assign sel_logdims[i][1] = tex_logdims[unit][1]; end // address generation @@ -114,12 +114,13 @@ module VX_tex_unit #( wire [`TEX_LGSTRIDE_BITS-1:0] mem_req_lgstride; wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] mem_req_blends; wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr; - wire [REQ_INFOW_A-1:0] mem_req_info; + wire [`NUM_THREADS-1:0][31:0] mem_req_baseaddr; + wire [(`TEX_FORMAT_BITS + REQ_INFO_W)-1:0] mem_req_info; wire mem_req_ready; VX_tex_addr #( .CORE_ID (CORE_ID), - .REQ_INFOW 
(REQ_INFOW_A), + .REQ_INFOW (`TEX_FORMAT_BITS + REQ_INFO_W), .NUM_REQS (`NUM_THREADS) ) tex_addr ( .clk (clk), @@ -132,6 +133,7 @@ module VX_tex_unit #( .req_filter (tex_filter[tex_req_if.unit]), .req_wraps (tex_wraps[tex_req_if.unit]), .req_baseaddr(tex_baddr[tex_req_if.unit]), + .mip_level (mip_level), .req_mipoff (sel_mipoff), .req_logdims(sel_logdims), .req_info ({tex_format[tex_req_if.unit], tex_req_if.uuid, tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), @@ -141,6 +143,7 @@ module VX_tex_unit #( .rsp_tmask (mem_req_tmask), .rsp_filter (mem_req_filter), .rsp_lgstride(mem_req_lgstride), + .rsp_baseaddr(mem_req_baseaddr), .rsp_addr (mem_req_addr), .rsp_blends (mem_req_blends), .rsp_info (mem_req_info), @@ -152,12 +155,12 @@ module VX_tex_unit #( wire mem_rsp_valid; wire [`NUM_THREADS-1:0] mem_rsp_tmask; wire [`NUM_THREADS-1:0][3:0][31:0] mem_rsp_data; - wire [REQ_INFOW_M-1:0] mem_rsp_info; + wire [(BLEND_FRAC_W + `TEX_FORMAT_BITS + REQ_INFO_W)-1:0] mem_rsp_info; wire mem_rsp_ready; VX_tex_mem #( .CORE_ID (CORE_ID), - .REQ_INFOW (REQ_INFOW_M), + .REQ_INFOW (BLEND_FRAC_W + `TEX_FORMAT_BITS + REQ_INFO_W), .NUM_REQS (`NUM_THREADS) ) tex_mem ( .clk (clk), @@ -172,6 +175,7 @@ module VX_tex_unit #( .req_tmask (mem_req_tmask), .req_filter(mem_req_filter), .req_lgstride(mem_req_lgstride), + .req_baseaddr(mem_req_baseaddr), .req_addr (mem_req_addr), .req_info ({mem_req_blends, mem_req_info}), .req_ready (mem_req_ready), @@ -186,15 +190,9 @@ module VX_tex_unit #( // apply sampler - wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends; - wire [`TEX_FORMAT_BITS-1:0] rsp_format; - wire [REQ_INFOW_S-1:0] rsp_info; - - assign {rsp_blends, rsp_format, rsp_info} = mem_rsp_info; - VX_tex_sampler #( .CORE_ID (CORE_ID), - .REQ_INFOW (REQ_INFOW_S), + .REQ_INFOW (REQ_INFO_W), .NUM_REQS (`NUM_THREADS) ) tex_sampler ( .clk (clk), @@ -204,9 +202,9 @@ module VX_tex_unit #( .req_valid (mem_rsp_valid), .req_tmask (mem_rsp_tmask), .req_data (mem_rsp_data), - 
.req_format (rsp_format), - .req_blends (rsp_blends), - .req_info (rsp_info), + .req_blends (mem_rsp_info[(REQ_INFO_W+`TEX_FORMAT_BITS) +: BLEND_FRAC_W]), + .req_format (mem_rsp_info[REQ_INFO_W +: `TEX_FORMAT_BITS]), + .req_info (mem_rsp_info[0 +: REQ_INFO_W]), .req_ready (mem_rsp_ready), // outputs diff --git a/hw/syn/quartus/Makefile b/hw/syn/quartus/Makefile index 662848e1..1dd63335 100644 --- a/hw/syn/quartus/Makefile +++ b/hw/syn/quartus/Makefile @@ -1,6 +1,6 @@ BUILD_DIR ?= build -.PHONY: dogfood unittest pipeline smem cache fpu_core core vortex top1 top2 top4 top8 top16 top32 top64 +.PHONY: dogfood unittest pipeline smem cache fpu_core core vortex top1 top2 top4 top8 top16 top32 top64 texunit dogfood: mkdir -p dogfood/$(BUILD_DIR) @@ -75,4 +75,9 @@ top32: top64: mkdir -p top64/$(BUILD_DIR) cp top64/Makefile top64/$(BUILD_DIR) - $(MAKE) -C top64/$(BUILD_DIR) clean && $(MAKE) -C top64/$(BUILD_DIR) > top64/$(BUILD_DIR)/build.log 2>&1 & \ No newline at end of file + $(MAKE) -C top64/$(BUILD_DIR) clean && $(MAKE) -C top64/$(BUILD_DIR) > top64/$(BUILD_DIR)/build.log 2>&1 & + +texunit: + mkdir -p texunit/$(BUILD_DIR) + cp texunit/Makefile texunit/$(BUILD_DIR) + $(MAKE) -C texunit/$(BUILD_DIR) clean && $(MAKE) -C texunit/$(BUILD_DIR) > texunit/$(BUILD_DIR)/build.log 2>&1 & \ No newline at end of file diff --git a/hw/syn/quartus/texunit/Makefile b/hw/syn/quartus/texunit/Makefile new file mode 100644 index 00000000..3ecfa892 --- /dev/null +++ b/hw/syn/quartus/texunit/Makefile @@ -0,0 +1,81 @@ +PROJECT = Core +TOP_LEVEL_ENTITY = VX_core +SRC_FILE = VX_core.v +RTL_DIR = ../../../../rtl +THIRD_PARTY_DIR = ../../../../../third_party + +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 + +FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) + +PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf + +# Executable Configuration +SYN_ARGS = --parallel --read_settings_files=on +FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on +ASM_ARGS = +STA_ARGS = --parallel --do_report_timing + +# Build targets +all: $(PROJECT).sta.rpt + +syn: $(PROJECT).syn.rpt + +fit: $(PROJECT).fit.rpt + +asm: $(PROJECT).asm.rpt + +sta: $(PROJECT).sta.rpt + +smart: smart.log + +# Target implementations +STAMP = echo done > + +$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES) + quartus_syn $(PROJECT) $(SYN_ARGS) + $(STAMP) fit.chg + +$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt + quartus_fit $(PROJECT) $(FIT_ARGS) + $(STAMP) asm.chg + $(STAMP) sta.chg + +$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt + quartus_asm $(PROJECT) $(ASM_ARGS) + +$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt + quartus_sta $(PROJECT) $(STA_ARGS) + +smart.log: $(PROJECT_FILES) + quartus_sh --determine_smart_action $(PROJECT) > smart.log + +# Project initialization +$(PROJECT_FILES): + quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "EXT_TEX_ENABLE=1" + +syn.chg: + $(STAMP) syn.chg + +fit.chg: + $(STAMP) fit.chg + +sta.chg: + $(STAMP) sta.chg + +asm.chg: + $(STAMP) asm.chg + +program: $(PROJECT).sof + quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof" + +clean: + rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox From b741807f8cc58e8b875dc49f1c16ef837850fb13 Mon Sep 17 
00:00:00 2001 From: Blaise Tine Date: Mon, 6 Dec 2021 01:22:45 -0500 Subject: [PATCH 20/27] using ramulator dram simulator --- .gitmodules | 3 + ci/regression.sh | 12 +- driver/rtlsim/Makefile | 7 +- driver/rtlsim/vortex.cpp | 16 +- driver/simx/Makefile | 8 +- driver/simx/vortex.cpp | 37 +- driver/vlsim/Makefile | 9 +- sim/common/simobject.h | 13 +- sim/common/texturing.h | 2 +- sim/common/util.h | 40 ++- sim/rtlsim/Makefile | 21 +- sim/rtlsim/main.cpp | 9 +- sim/rtlsim/processor.cpp | 599 +++++++++++++++++++++++++++++++++ sim/rtlsim/processor.h | 25 ++ sim/rtlsim/simulator.cpp | 579 ------------------------------- sim/rtlsim/simulator.h | 81 ----- sim/simx/Makefile | 32 +- sim/simx/cache.cpp | 24 +- sim/simx/constants.h | 6 +- sim/simx/core.cpp | 13 +- sim/simx/execute.cpp | 3 +- sim/simx/exeunit.cpp | 7 +- sim/simx/main.cpp | 52 +-- sim/simx/memsim.cpp | 95 ++++-- sim/simx/memsim.h | 11 +- sim/simx/processor.cpp | 242 +++++++------ sim/simx/processor.h | 19 +- sim/simx/sharedmem.h | 3 +- sim/simx/types.h | 33 +- sim/vlsim/Makefile | 22 +- sim/vlsim/opae_sim.cpp | 711 +++++++++++++++++++++------------------ sim/vlsim/opae_sim.h | 76 +---- third_party/Makefile | 7 +- 33 files changed, 1473 insertions(+), 1344 deletions(-) create mode 100644 sim/rtlsim/processor.cpp create mode 100644 sim/rtlsim/processor.h delete mode 100644 sim/rtlsim/simulator.cpp delete mode 100644 sim/rtlsim/simulator.h diff --git a/.gitmodules b/.gitmodules index 26a07f16..0db51e41 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "third_party/cocogfx"] path = third_party/cocogfx url = https://github.com/gtcasl/cocogfx.git +[submodule "third_party/ramulator"] + path = third_party/ramulator + url = https://github.com/CMU-SAFARI/ramulator.git diff --git a/ci/regression.sh b/ci/regression.sh index 4a1336c8..b99754af 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -102,7 +102,7 @@ FPU_CORE=FPU_FPNEW ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood AXI_BUS=1 
./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo # adjust l1 block size to match l2 -CONFIGS="-DMEM_BLOCK_SIZE=16 -DL1_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1" +CONFIGS="-DL1_BLOCK_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1" # test cache banking CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr @@ -119,18 +119,12 @@ CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=simx --cores # test 128-bit MEM block CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo -# test 128-bit MEM and DRAM block -CONFIGS="-DMEM_BLOCK_SIZE=16 -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo +# test single-bank DRAM +CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo # test 27-bit DRAM address CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo -# test 128-bit DRAM block -CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo - -# test long memory latency -CONFIGS="-DMEM_LATENCY=100 -DMEM_RQ_SIZE=4 -DMEM_STALLS_MODULO=4" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo - echo "configuration tests done!" 
} diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 4626eeb3..72d3a07a 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -5,8 +5,6 @@ CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -I../include -I../common -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common -LDFLAGS += $(RTLSIM_DIR)/librtlsim.a - # Position independent code CXXFLAGS += -fPIC @@ -17,6 +15,7 @@ CXXFLAGS += $(CONFIGS) CXXFLAGS += -DDUMP_PERF_STATS LDFLAGS += -shared -pthread +LDFLAGS += -L. -lrtlsim SRCS = vortex.cpp ../common/vx_utils.cpp @@ -30,9 +29,9 @@ PROJECT = libvortex.so all: $(PROJECT) $(PROJECT): $(SRCS) - $(MAKE) -C $(RTLSIM_DIR) static + DESTDIR=../../driver/rtlsim $(MAKE) -C $(RTLSIM_DIR) ../../driver/rtlsim/librtlsim.so $(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT) clean: - $(MAKE) -C $(RTLSIM_DIR) clean-static + DESTDIR=../../driver/rtlsim $(MAKE) -C $(RTLSIM_DIR) clean rm -rf $(PROJECT) *.o \ No newline at end of file diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index cc16f0d3..52c290cd 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -11,7 +12,7 @@ #include #include #include -#include +#include #define RAM_PAGE_SIZE 4096 @@ -60,7 +61,9 @@ public: vx_device() : ram_(RAM_PAGE_SIZE) , mem_allocation_(ALLOC_BASE_ADDR) - {} + { + processor_.attach_ram(&ram_); + } ~vx_device() { if (future_.valid()) { @@ -121,12 +124,9 @@ public: future_.wait(); } // start new run - simulator_.attach_ram(&ram_); future_ = std::async(std::launch::async, [&]{ - simulator_.reset(); - while (simulator_.is_busy()) { - simulator_.step(); - } + processor_.reset(); + processor_.run(); }); return 0; } @@ -149,7 +149,7 @@ public: private: RAM ram_; - Simulator simulator_; + Processor processor_; uint64_t mem_allocation_; std::future future_; }; diff --git a/driver/simx/Makefile b/driver/simx/Makefile index 
14114f2a..b5723972 100644 --- a/driver/simx/Makefile +++ b/driver/simx/Makefile @@ -9,7 +9,7 @@ CXXFLAGS += $(CONFIGS) CXXFLAGS += -DDUMP_PERF_STATS LDFLAGS += -shared -pthread -LDFLAGS += $(SIMX_DIR)/libsimx.a +LDFLAGS += -L. -lsimx SRCS = vortex.cpp ../common/vx_utils.cpp @@ -18,9 +18,9 @@ PROJECT = libvortex.so all: $(PROJECT) $(PROJECT): $(SRCS) - $(MAKE) -C $(SIMX_DIR) static + DESTDIR=../../driver/simx $(MAKE) -C $(SIMX_DIR) ../../driver/simx/libsimx.so $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ clean: - $(MAKE) -C $(SIMX_DIR) clean-static - rm -rf $(PROJECT) *.o \ No newline at end of file + DESTDIR=../../driver/simx $(MAKE) -C $(SIMX_DIR) clean + rm -rf libsimx.so $(PROJECT) *.o \ No newline at end of file diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 2aaef1e9..4b086d7e 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -60,7 +60,13 @@ public: : arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS) , ram_(RAM_PAGE_SIZE) , mem_allocation_(ALLOC_BASE_ADDR) - {} + { + // setup memory simulator + memsim_ = MemSim::Create(MemSim::Config{ + DRAM_CHANNELS, + arch_.num_cores() + }); + } ~vx_device() { if (future_.valid()) { @@ -113,13 +119,33 @@ public: if (future_.valid()) { future_.wait(); } + // start new run - SimPlatform::instance().flush(); - processor_ = std::make_shared(arch_); - processor_->attach_ram(&ram_); future_ = std::async(std::launch::async, [&]{ - processor_->run(); + if (processor_) { + // release current processor instance + processor_->MemReqPort.unbind(); + memsim_->MemRspPort.unbind(); + SimPlatform::instance().release_object(processor_); + } + + // create new processor instance + processor_ = Processor::Create(arch_); + processor_->MemReqPort.bind(&memsim_->MemReqPort); + memsim_->MemRspPort.bind(&processor_->MemRspPort); + + // attach memory object + processor_->attach_ram(&ram_); + + // run simulation + int exitcode; + for (;;) { + SimPlatform::instance().step(); + if 
(processor_->check_exit(&exitcode)) + break; + }; }); + return 0; } @@ -141,6 +167,7 @@ public: private: ArchDef arch_; RAM ram_; + MemSim::Ptr memsim_; Processor::Ptr processor_; uint64_t mem_allocation_; std::future future_; diff --git a/driver/vlsim/Makefile b/driver/vlsim/Makefile index 5608ad11..23c07635 100644 --- a/driver/vlsim/Makefile +++ b/driver/vlsim/Makefile @@ -9,8 +9,6 @@ CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -I. -I../include -I../../hw -I$(VLSIM_DIR) -LDFLAGS += $(VLSIM_DIR)/libopae-c-vlsim.a - # Position independent code CXXFLAGS += -fPIC @@ -21,6 +19,7 @@ CXXFLAGS += $(CONFIGS) CXXFLAGS += -DDUMP_PERF_STATS LDFLAGS += -shared -pthread +LDFLAGS += -L. -lopae-c-vlsim SRCS = ../common/opae.cpp ../common/vx_utils.cpp @@ -47,9 +46,9 @@ scope-defs.h: $(SCRIPT_DIR)/scope.json scope: scope-defs.h $(PROJECT): $(SRCS) $(SCOPE_H) - $(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C $(VLSIM_DIR) static + DESTDIR=../../driver/vlsim $(MAKE) -C $(VLSIM_DIR) ../../driver/vlsim/libopae-c-vlsim.so $(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT) clean: - $(MAKE) -C $(VLSIM_DIR) clean-static - rm -rf $(PROJECT) *.o scope-defs.h \ No newline at end of file + DESTDIR=../../driver/vlsim $(MAKE) -C $(VLSIM_DIR) clean + rm -rf libopae-c-vlsim.so $(PROJECT) *.o scope-defs.h \ No newline at end of file diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 3a5ab2b6..2830ea06 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -51,8 +51,7 @@ public: peer_ = peer; } - void unbind() { - assert(peer_ == nullptr); + void unbind() { peer_ = nullptr; } @@ -292,12 +291,16 @@ public: } template - typename SimObject::Ptr CreateObject(Args&&... args) { + typename SimObject::Ptr create_object(Args&&... 
args) { auto obj = std::make_shared(SimContext{}, std::forward(args)...); objects_.push_back(obj); return obj; } + void release_object(const SimObjectBase::Ptr& object) { + objects_.remove(object); + } + template void schedule(const typename SimCallEvent::Func& callback, const Pkt& pkt, @@ -352,7 +355,7 @@ private: events_.emplace_back(evt); } - std::vector objects_; + std::list objects_; std::list events_; uint64_t cycles_; @@ -369,7 +372,7 @@ inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) template template typename SimObject::Ptr SimObject::Create(Args&&... args) { - return SimPlatform::instance().CreateObject(std::forward(args)...); + return SimPlatform::instance().create_object(std::forward(args)...); } template diff --git a/sim/common/texturing.h b/sim/common/texturing.h index 9b0e4526..5941594e 100644 --- a/sim/common/texturing.h +++ b/sim/common/texturing.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include using namespace cocogfx; diff --git a/sim/common/util.h b/sim/common/util.h index d66305ee..171bbe68 100644 --- a/sim/common/util.h +++ b/sim/common/util.h @@ -11,4 +11,42 @@ void unused(Args&&...) {} #define __unused(...) 
unused(__VA_ARGS__) // return file extension -const char* fileExtension(const char* filepath); \ No newline at end of file +const char* fileExtension(const char* filepath); + +#if defined(_MSC_VER) +#define DISABLE_WARNING_PUSH __pragma(warning(push)) +#define DISABLE_WARNING_POP __pragma(warning(pop)) +#define DISABLE_WARNING_UNUSED_PARAMETER \ + __pragma(warning(disable : 4100)) +#define DISABLE_WARNING_UNREFERENCED_FUNCTION __pragma(warning(disable : 4505)) +#define DISABLE_WARNING_ANONYMOUS_STRUCT __pragma(warning(disable : 4201)) +#define DISABLE_WARNING_UNUSED_VARIABLE __pragma(warning(disable : 4189)) +#elif defined(__GNUC__) +#define DISABLE_WARNING_PUSH _Pragma("GCC diagnostic push") +#define DISABLE_WARNING_POP _Pragma("GCC diagnostic pop") +#define DISABLE_WARNING_UNUSED_PARAMETER \ + _Pragma("GCC diagnostic ignored \"-Wunused-parameter\"") +#define DISABLE_WARNING_UNREFERENCED_FUNCTION \ + _Pragma("GCC diagnostic ignored \"-Wunused-function\"") +#define DISABLE_WARNING_ANONYMOUS_STRUCT \ + _Pragma("GCC diagnostic ignored \"-Wpedantic\"") +#define DISABLE_WARNING_UNUSED_VARIABLE \ + _Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"") +#elif defined(__clang__) +#define DISABLE_WARNING_PUSH _Pragma("clang diagnostic push") +#define DISABLE_WARNING_POP _Pragma("clang diagnostic pop") +#define DISABLE_WARNING_UNUSED_PARAMETER \ + _Pragma("clang diagnostic ignored \"-Wunused-parameter\"") +#define DISABLE_WARNING_UNREFERENCED_FUNCTION \ + _Pragma("clang diagnostic ignored \"-Wunused-function\"") +#define DISABLE_WARNING_ANONYMOUS_STRUCT \ + _Pragma("clang diagnostic ignored \"-Wgnu-anonymous-struct\"") +#define DISABLE_WARNING_UNUSED_VARIABLE \ + _Pragma("clang diagnostic ignored \"-Wunused-but-set-variable\"") +#else +#define DISABLE_WARNING_PUSH +#define DISABLE_WARNING_POP +#define DISABLE_WARNING_UNUSED_PARAMETER +#define DISABLE_WARNING_UNREFERENCED_FUNCTION +#define DISABLE_WARNING_ANONYMOUS_STRUCT +#endif \ No newline at end of file diff 
--git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index df9970d5..607dcf41 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -1,3 +1,4 @@ +DESTDIR ?= . RTL_DIR = ../../hw/rtl DPI_DIR = ../../hw/dpi THIRD_PARTY_DIR = ../../third_party @@ -6,8 +7,10 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I../../../hw -I../../common CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include +CXXFLAGS += -I../$(THIRD_PARTY_DIR) LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a +LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator # control RTL debug tracing states DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE @@ -31,7 +34,7 @@ RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interface SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp -SRCS += main.cpp simulator.cpp +SRCS += processor.cpp ifdef AXI_BUS TOP = Vortex_axi @@ -86,15 +89,11 @@ PROJECT = rtlsim all: $(PROJECT) -$(PROJECT): $(SRCS) - verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT) +$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp + verilator --build $(VL_FLAGS) $^ $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$@ -static: $(SRCS) - verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' - $(AR) rcs lib$(PROJECT).a obj_dir/*.o $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/*.o +$(DESTDIR)/lib$(PROJECT).so: $(SRCS) + verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' -o ../$@ -clean-static: - rm -rf lib$(PROJECT).a obj_dir - -clean: clean-static - rm -rf $(PROJECT) +clean: + rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so diff --git a/sim/rtlsim/main.cpp b/sim/rtlsim/main.cpp index 652e550f..c61fbec8 100644 --- a/sim/rtlsim/main.cpp +++ b/sim/rtlsim/main.cpp @@ -5,7 +5,8 @@ 
#include #include #include -#include "simulator.h" +#include +#include "processor.h" #define RAM_PAGE_SIZE 4096 @@ -52,8 +53,8 @@ int main(int argc, char **argv) { std::cout << "Running " << program << "..." << std::endl; vortex::RAM ram(RAM_PAGE_SIZE); - vortex::Simulator simulator; - simulator.attach_ram(&ram); + vortex::Processor processor; + processor.attach_ram(&ram); std::string program_ext(fileExtension(program)); if (program_ext == "bin") { @@ -65,7 +66,7 @@ int main(int argc, char **argv) { return -1; } - exitcode = simulator.run(); + exitcode = processor.run(); if (riscv_test) { if (1 == exitcode) { diff --git a/sim/rtlsim/processor.cpp b/sim/rtlsim/processor.cpp new file mode 100644 index 00000000..7c20a442 --- /dev/null +++ b/sim/rtlsim/processor.cpp @@ -0,0 +1,599 @@ +#include "processor.h" + +#include + +#ifdef AXI_BUS +#include "VVortex_axi.h" +#include "VVortex_axi__Syms.h" +#else +#include "VVortex.h" +#include "VVortex__Syms.h" +#endif + +#ifdef VCD_OUTPUT +#include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define RAMULATOR +#include +#include +#include + +#ifndef MEMORY_BANKS + #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS + #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS + #else + #define MEMORY_BANKS 2 + #endif +#endif + +#define ENABLE_MEM_STALLS + +#ifndef TRACE_START_TIME +#define TRACE_START_TIME 0ull +#endif + +#ifndef TRACE_STOP_TIME +#define TRACE_STOP_TIME -1ull +#endif + +#ifndef VERILATOR_RESET_VALUE +#define VERILATOR_RESET_VALUE 2 +#endif + +#define VL_WDATA_GETW(lwp, i, n, w) \ + VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w) + +using namespace vortex; + +static uint64_t timestamp = 0; + +double sc_time_stamp() { + return timestamp; +} + +/////////////////////////////////////////////////////////////////////////////// + +static bool trace_enabled = false; +static uint64_t trace_start_time = TRACE_START_TIME; +static uint64_t trace_stop_time = TRACE_STOP_TIME; + 
+// Returns true when waveform tracing should be active for the current
+// timestamp: always inside [trace_start_time, trace_stop_time), otherwise
+// whatever was last requested through sim_trace_enable().
+bool sim_trace_enabled() {
+  if (timestamp >= trace_start_time
+   && timestamp < trace_stop_time)
+    return true;
+  return trace_enabled;
+}
+
+// Sets the tracing flag consulted by sim_trace_enabled() outside the
+// start/stop window.
+void sim_trace_enable(bool enable) {
+  trace_enabled = enable;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Private implementation of Processor: owns the Verilated RTL model, the
+// optional VCD trace and the ramulator DRAM timing model, and services the
+// device memory bus (AXI or the plain request/response bus) every clock.
+class Processor::Impl {
+public:
+  // Creates the RTL model and the DRAM model, then holds the device in reset
+  // for RESET_DELAY clock cycles.
+  Impl() {
+    // force random values for uninitialized signals
+    Verilated::randReset(VERILATOR_RESET_VALUE);
+    Verilated::randSeed(50);
+
+    // turn off assertion before reset
+    Verilated::assertOn(false);
+
+    // create RTL module instance
+  #ifdef AXI_BUS
+    device_ = new VVortex_axi();
+  #else
+    device_ = new VVortex();
+  #endif
+
+  #ifdef VCD_OUTPUT
+    Verilated::traceEverOn(true);
+    trace_ = new VerilatedVcdC();
+    device_->trace(trace_, 99);
+    trace_->open("trace.vcd");
+  #endif
+
+    ram_ = nullptr;
+
+    // initialize dram simulator
+    ramulator::Config ram_config;
+    ram_config.add("standard", "DDR4");
+    ram_config.add("channels", std::to_string(MEMORY_BANKS));
+    ram_config.add("ranks", "1");
+    ram_config.add("speed", "DDR4_2400R");
+    ram_config.add("org", "DDR4_4Gb_x8");
+    ram_config.add("mapping", "defaultmapping");
+    ram_config.set_core_num(1);
+    dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE);
+    Stats::statlist.output("ramulator.ddr4.log");
+
+    // reset the device
+    this->reset();
+  }
+
+  // Prints console text that never received its terminating newline, then
+  // releases the trace, the RTL model, the DRAM model and any memory
+  // responses that were never delivered.
+  ~Impl() {
+    for (auto& buf : print_bufs_) {
+      auto str = buf.second.str();
+      if (!str.empty()) {
+        std::cout << "#" << buf.first << ": " << str << std::endl;
+      }
+    }
+
+  #ifdef VCD_OUTPUT
+    trace_->close();
+    delete trace_;
+  #endif
+
+    delete device_;
+
+    if (dram_) {
+      dram_->finish();
+      Stats::statlist.printall();
+      delete dram_;
+    }
+
+    // the DRAM model is gone, so no completion callback can reference these
+    // records any more; free the ones that were still waiting
+    dram_queue_.clear();
+    for (auto mem_req : pending_mem_reqs_) {
+      delete mem_req;
+    }
+    pending_mem_reqs_.clear();
+  }
+
+  // Sets the backing RAM used to serve device reads and writes. While no RAM
+  // is attached the bus handlers keep the request-ready signals low.
+  void attach_ram(RAM* ram) {
+    ram_ = ram;
+  }
+
+  // Clears the simulator-side bus state and pulses the device reset for
+  // RESET_DELAY clock cycles.
+  void reset() {
+    print_bufs_.clear();
+
+    // requests not yet accepted by the DRAM model belong to the old run
+    dram_queue_.clear();
+
+    // NOTE(review): the records are dropped without delete. Requests already
+    // inside the DRAM model still hold pointers to them through their
+    // completion callbacks, so freeing here would first require draining
+    // the DRAM model - confirm whether reset() can run with reads in flight.
+    pending_mem_reqs_.clear();
+
+    mem_rd_rsp_active_ = false;
+    mem_wr_rsp_active_ = false;
+
+  #ifdef AXI_BUS
+    this->reset_axi_bus();
+  #else
+    this->reset_avs_bus();
+  #endif
+
+    device_->reset = 1;
+
+    for (int i = 0; i < RESET_DELAY; ++i) {
+      device_->clk = 0;
+      this->eval();
+      device_->clk = 1;
+      this->eval();
+    }
+
+    device_->reset = 0;
+
+    // Turn on assertion after reset
+    Verilated::assertOn(true);
+  }
+
+  // Clocks the device until it is no longer busy or an ebreak is seen.
+  // Returns 0, or the last value written back to register 3 when the run
+  // ended on an ebreak.
+  int run() {
+    int exitcode = 0;
+
+  #ifndef NDEBUG
+    std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
+  #endif
+
+    // execute program
+    while (device_->busy) {
+      if (get_ebreak()) {
+        exitcode = get_last_wb_value(3);
+        break;
+      }
+      this->step();
+    }
+
+    // wait 5 cycles to flush the pipeline
+    this->wait(5);
+
+    return exitcode;
+  }
+
+private:
+
+  // Advances the simulation by one full clock cycle: low phase, bus sampling,
+  // high phase, bus servicing, then one DRAM model tick.
+  void step() {
+
+    device_->clk = 0;
+    this->eval();
+
+  #ifdef AXI_BUS
+    this->eval_axi_bus(0);
+  #else
+    this->eval_avs_bus(0);
+  #endif
+
+    device_->clk = 1;
+    this->eval();
+
+  #ifdef AXI_BUS
+    this->eval_axi_bus(1);
+  #else
+    this->eval_avs_bus(1);
+  #endif
+
+    // retry requests the DRAM model refused earlier, then advance it
+    this->flush_dram_queue();
+    dram_->tick();
+
+  #ifndef NDEBUG
+    fflush(stdout);
+  #endif
+  }
+
+  // Evaluates the RTL model once, dumps a trace sample when tracing is
+  // active, and advances the global timestamp (two evals per clock cycle).
+  void eval() {
+    device_->eval();
+  #ifdef VCD_OUTPUT
+    if (sim_trace_enabled()) {
+      trace_->dump(timestamp);
+    }
+  #endif
+    ++timestamp;
+  }
+
+  // Appends a request to the DRAM retry queue and immediately tries to hand
+  // the queue over, so an accepted request is issued in the same cycle.
+  void send_dram_request(const ramulator::Request& dram_req) {
+    dram_queue_.push_back(dram_req);
+    this->flush_dram_queue();
+  }
+
+  // Hands queued requests to the DRAM model in order and stops at the first
+  // one it refuses; send() returns false in that case and the request would
+  // otherwise be lost, leaving its read response pending forever.
+  void flush_dram_queue() {
+    while (!dram_queue_.empty()
+        && dram_->send(dram_queue_.front())) {
+      dram_queue_.pop_front();
+    }
+  }
+
+#ifdef AXI_BUS
+
+  // Drives every simulator-owned AXI handshake signal low.
+  void reset_axi_bus() {
+    device_->m_axi_wready = 0;
+    device_->m_axi_awready = 0;
+    device_->m_axi_arready = 0;
+    device_->m_axi_rvalid = 0;
+    device_->m_axi_bvalid = 0;
+  }
+
+  // Services the AXI memory interface. On the low clock phase it only samples
+  // the device's response-ready signals; on the high phase it retires or
+  // issues read/write responses and accepts at most one new request.
+  void eval_axi_bus(bool clk) {
+    if (!clk) {
+      mem_rd_rsp_ready_ = device_->m_axi_rready;
+      mem_wr_rsp_ready_ = device_->m_axi_bready;
+      return;
+    }
+
+    if (ram_ == nullptr) {
+      device_->m_axi_wready = 0;
+      device_->m_axi_awready = 0;
+      device_->m_axi_arready = 0;
+      return;
+    }
+
+    // process memory responses
+    if (mem_rd_rsp_active_
+    && device_->m_axi_rvalid && mem_rd_rsp_ready_) {
+      mem_rd_rsp_active_ = false;
+    }
+    if (!mem_rd_rsp_active_) {
+      // only the oldest pending request is considered, so responses are
+      // returned in request order
+      if (!pending_mem_reqs_.empty()
+      && (*pending_mem_reqs_.begin())->ready
+      && !(*pending_mem_reqs_.begin())->write) {
+        auto mem_rsp_it = pending_mem_reqs_.begin();
+        auto mem_req = *mem_rsp_it;
+        /*
+        printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr);
+        for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+          printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]);
+        }
+        printf("\n");
+        */
+        device_->m_axi_rvalid = 1;
+        device_->m_axi_rid = mem_req->tag;
+        device_->m_axi_rresp = 0;
+        device_->m_axi_rlast = 1;
+        memcpy((uint8_t*)device_->m_axi_rdata, mem_req->block.data(), MEM_BLOCK_SIZE);
+        pending_mem_reqs_.erase(mem_rsp_it);
+        mem_rd_rsp_active_ = true;
+        delete mem_req;
+      } else {
+        device_->m_axi_rvalid = 0;
+      }
+    }
+
+    // send memory write response
+    if (mem_wr_rsp_active_
+    && device_->m_axi_bvalid && mem_wr_rsp_ready_) {
+      mem_wr_rsp_active_ = false;
+    }
+    if (!mem_wr_rsp_active_) {
+      if (!pending_mem_reqs_.empty()
+      && (*pending_mem_reqs_.begin())->ready
+      && (*pending_mem_reqs_.begin())->write) {
+        auto mem_rsp_it = pending_mem_reqs_.begin();
+        auto mem_req = *mem_rsp_it;
+        /*
+        printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_req->addr);
+        */
+        device_->m_axi_bvalid = 1;
+        device_->m_axi_bid = mem_req->tag;
+        device_->m_axi_bresp = 0;
+        pending_mem_reqs_.erase(mem_rsp_it);
+        mem_wr_rsp_active_ = true;
+        delete mem_req;
+      } else {
+        device_->m_axi_bvalid = 0;
+      }
+    }
+
+    // process memory requests
+    if (device_->m_axi_wvalid || device_->m_axi_arvalid) {
+      if (device_->m_axi_wvalid) {
+        uint64_t byteen = device_->m_axi_wstrb;
+        unsigned base_addr = device_->m_axi_awaddr;
+        uint8_t* data = (uint8_t*)(device_->m_axi_wdata);
+
+        // check console output
+        if (base_addr >= IO_COUT_ADDR
+         && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {
+          // each enabled byte lane i feeds its own line buffer, which is
+          // printed and emptied when a newline arrives
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            if ((byteen >> i) & 0x1) {
+              auto& ss_buf = print_bufs_[i];
+              char c = data[i];
+              ss_buf << c;
+              if (c == '\n') {
+                std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush;
+                ss_buf.str("");
+              }
+            }
+          }
+        } else {
+          /*
+          printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen);
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
+          }
+          printf("\n");
+          */
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            if ((byteen >> i) & 0x1) {
+              (*ram_)[base_addr + i] = data[i];
+            }
+          }
+
+          // the write response does not wait for the DRAM model
+          auto mem_req = new mem_req_t();
+          mem_req->tag = device_->m_axi_awid;
+          mem_req->addr = device_->m_axi_awaddr;
+          mem_req->write = true;
+          mem_req->ready = true;
+          pending_mem_reqs_.emplace_back(mem_req);
+
+          // send dram request
+          ramulator::Request dram_req(
+            device_->m_axi_awaddr,
+            ramulator::Request::Type::WRITE,
+            0
+          );
+          this->send_dram_request(dram_req);
+        }
+      } else {
+        // process reads
+        auto mem_req = new mem_req_t();
+        mem_req->tag = device_->m_axi_arid;
+        mem_req->addr = device_->m_axi_araddr;
+        ram_->read(mem_req->block.data(), device_->m_axi_araddr, MEM_BLOCK_SIZE);
+        mem_req->write = false;
+        mem_req->ready = false;
+        pending_mem_reqs_.emplace_back(mem_req);
+
+        // send dram request; the response becomes deliverable once the DRAM
+        // model reports completion
+        ramulator::Request dram_req(
+          device_->m_axi_araddr,
+          ramulator::Request::Type::READ,
+          [mem_req](ramulator::Request&) {
+            mem_req->ready = true;
+          },
+          0
+        );
+        this->send_dram_request(dram_req);
+      }
+    }
+
+    device_->m_axi_wready = 1;
+    device_->m_axi_awready = 1;
+    device_->m_axi_arready = 1;
+  }
+
+#else
+
+  // Drives the simulator-owned memory bus handshake signals low.
+  void reset_avs_bus() {
+    device_->mem_req_ready = 0;
+    device_->mem_rsp_valid = 0;
+  }
+
+  // Services the request/response memory bus. On the low clock phase it only
+  // samples the device's response-ready signal; on the high phase it retires
+  // or issues a read response and accepts at most one new request.
+  void eval_avs_bus(bool clk) {
+    if (!clk) {
+      mem_rd_rsp_ready_ = device_->mem_rsp_ready;
+      return;
+    }
+
+    if (ram_ == nullptr) {
+      device_->mem_req_ready = 0;
+      return;
+    }
+
+    // process memory responses
+    if (mem_rd_rsp_active_
+    && device_->mem_rsp_valid && mem_rd_rsp_ready_) {
+      mem_rd_rsp_active_ = false;
+    }
+    if (!mem_rd_rsp_active_) {
+      // only the oldest pending read is considered, so responses are
+      // returned in request order
+      if (!pending_mem_reqs_.empty()
+       && (*pending_mem_reqs_.begin())->ready) {
+        device_->mem_rsp_valid = 1;
+        auto mem_rsp_it = pending_mem_reqs_.begin();
+        auto mem_req = *mem_rsp_it;
+        /*
+        printf("%0ld: [sim] MEM Rd: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr);
+        for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+          printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]);
+        }
+        printf("\n");
+        */
+        memcpy((uint8_t*)device_->mem_rsp_data, mem_req->block.data(), MEM_BLOCK_SIZE);
+        device_->mem_rsp_tag = mem_req->tag;
+        pending_mem_reqs_.erase(mem_rsp_it);
+        mem_rd_rsp_active_ = true;
+        delete mem_req;
+      } else {
+        device_->mem_rsp_valid = 0;
+      }
+    }
+
+    // process memory requests
+    if (device_->mem_req_valid) {
+      // the bus address counts blocks; convert it to a byte address
+      uint32_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
+      if (device_->mem_req_rw) {
+        // process writes
+        uint64_t byteen = device_->mem_req_byteen;
+        uint8_t* data = (uint8_t*)(device_->mem_req_data);
+
+        // check console output
+        if (byte_addr >= IO_COUT_ADDR
+         && byte_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {
+          // NOTE(review): the bound is IO_COUT_SIZE here but MEM_BLOCK_SIZE
+          // in the AXI handler, and byteen carries only 64 bits - confirm
+          // that IO_COUT_SIZE never exceeds 64.
+          for (int i = 0; i < IO_COUT_SIZE; i++) {
+            if ((byteen >> i) & 0x1) {
+              auto& ss_buf = print_bufs_[i];
+              char c = data[i];
+              ss_buf << c;
+              if (c == '\n') {
+                std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush;
+                ss_buf.str("");
+              }
+            }
+          }
+        } else {
+          /*
+          printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, byte_addr, byteen);
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
+          }
+          printf("\n");
+          */
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            if ((byteen >> i) & 0x1) {
+              (*ram_)[byte_addr + i] = data[i];
+            }
+          }
+
+          // send dram request
+          ramulator::Request dram_req(
+            byte_addr,
+            ramulator::Request::Type::WRITE,
+            0
+          );
+          this->send_dram_request(dram_req);
+        }
+      } else {
+        // process reads
+        auto mem_req = new mem_req_t();
+        mem_req->tag = device_->mem_req_tag;
+        mem_req->addr = byte_addr;
+        mem_req->write = false;
+        mem_req->ready = false;
+        ram_->read(mem_req->block.data(), byte_addr, MEM_BLOCK_SIZE);
+        pending_mem_reqs_.emplace_back(mem_req);
+
+        // send dram request; the response becomes deliverable once the DRAM
+        // model reports completion
+        ramulator::Request dram_req(
+          byte_addr,
+          ramulator::Request::Type::READ,
+          [mem_req](ramulator::Request&) {
+            mem_req->ready = true;
+          },
+          0
+        );
+        this->send_dram_request(dram_req);
+      }
+    }
+
+    device_->mem_req_ready = 1;
+  }
+
+#endif
+
+  // Runs the given number of full clock cycles.
+  void wait(uint32_t cycles) {
+    for (uint32_t i = 0; i < cycles; ++i) {
+      this->step();
+    }
+  }
+
+  // Reads the ebreak signal of the execute stage of cluster 0 / core 0.
+  bool get_ebreak() const {
+  #ifdef AXI_BUS
+    return (bool)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
+  #else
+    return (bool)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
+  #endif
+  }
+
+  // Reads the last value written back to register `reg` on cluster 0 / core 0.
+  int get_last_wb_value(int reg) const {
+  #ifdef AXI_BUS
+    return (int)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
+  #else
+    return (int)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
+  #endif
+  }
+
+private:
+
+  // One outstanding memory request awaiting its response to the device.
+  typedef struct {
+    bool ready;   // response may be returned to the device
+    std::array<uint8_t, MEM_BLOCK_SIZE> block; // data captured when the read was accepted
+    uint64_t addr;
+    uint64_t tag; // bus tag echoed back with the response
+    bool write;
+  } mem_req_t;
+
+#ifdef AXI_BUS
+  VVortex_axi *device_;
+#else
+  VVortex *device_;
+#endif
+#ifdef VCD_OUTPUT
+  VerilatedVcdC *trace_;
+#endif
+
+  // console line buffers, keyed by the byte lane the characters arrived on
+  std::unordered_map<int, std::stringstream> print_bufs_;
+
+  // outstanding requests in arrival order; entries are heap-allocated
+  std::list<mem_req_t*> pending_mem_reqs_;
+
+  // requests waiting to be accepted by the DRAM model, in issue order
+  std::list<ramulator::Request> dram_queue_;
+
+  bool mem_rd_rsp_active_;
+  bool mem_rd_rsp_ready_;
+
+  bool mem_wr_rsp_active_;
+  bool mem_wr_rsp_ready_;
+
+  RAM *ram_;
+
+  ramulator::Gem5Wrapper* dram_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+Processor::Processor()
+  : impl_(new Impl())
+{}
+
+Processor::~Processor() {
+  delete impl_;
+}
+
+void Processor::attach_ram(RAM* mem) {
+  impl_->attach_ram(mem);
+}
+
+void Processor::reset() {
+  impl_->reset();
+}
+
+int Processor::run() {
+  return impl_->run();
+}
\ No newline at end of file
diff --git a/sim/rtlsim/processor.h b/sim/rtlsim/processor.h
new file mode 100644
index 00000000..a877044f
--- /dev/null
+++ b/sim/rtlsim/processor.h
@@ -0,0 +1,25 @@
+#pragma once
+
+namespace vortex {
+
+class RAM;
+
+class Processor {
+public:
+
+  Processor();
+  virtual ~Processor();
+
+  void attach_ram(RAM* ram);
+
+  void reset();
+
+  int run();
+
+private:
+
+  class Impl;
+  Impl* impl_;
+};
+
+}
\ No newline at end of file
diff --git a/sim/rtlsim/simulator.cpp b/sim/rtlsim/simulator.cpp
deleted file mode 100644
index 0f6df7d7..00000000
--- a/sim/rtlsim/simulator.cpp
+++ /dev/null
@@ -1,579 +0,0 @@
-#include "simulator.h"
-
-#include
-
-#ifdef AXI_BUS
-#include "VVortex_axi.h"
-#include "VVortex_axi__Syms.h"
-#else
-#include "VVortex.h"
-#include "VVortex__Syms.h"
-#endif
-
-#ifdef VCD_OUTPUT
-#include
-#endif
-
-#include
-#include
-#include
-#include
-
-#define ENABLE_MEM_STALLS
-
-#ifndef TRACE_START_TIME
-#define TRACE_START_TIME 0ull
-#endif
-
-#ifndef TRACE_STOP_TIME
-#define TRACE_STOP_TIME -1ull
-#endif
-
-#ifndef MEM_LATENCY
-#define MEM_LATENCY 24
-#endif
-
-#ifndef MEM_RQ_SIZE
-#define MEM_RQ_SIZE 16
-#endif
-
-#ifndef MEM_STALLS_MODULO
-#define MEM_STALLS_MODULO 16
-#endif
-
-#ifndef VERILATOR_RESET_VALUE
-#define VERILATOR_RESET_VALUE 2
-#endif
-
-#define VL_WDATA_GETW(lwp, i, n, w) \
- VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w) - -using namespace vortex; - -static uint64_t timestamp = 0; - -double sc_time_stamp() { - return timestamp; -} - -/////////////////////////////////////////////////////////////////////////////// - -static bool trace_enabled = false; -static uint64_t trace_start_time = TRACE_START_TIME; -static uint64_t trace_stop_time = TRACE_STOP_TIME; - -bool sim_trace_enabled() { - if (timestamp >= trace_start_time - && timestamp < trace_stop_time) - return true; - return trace_enabled; -} - -void sim_trace_enable(bool enable) { - trace_enabled = enable; -} - -/////////////////////////////////////////////////////////////////////////////// - -namespace vortex { -class VL_OBJ { -public: -#ifdef AXI_BUS - VVortex_axi *device; -#else - VVortex *device; -#endif -#ifdef VCD_OUTPUT - VerilatedVcdC *trace; -#endif - - VL_OBJ() { - // force random values for unitialized signals - Verilated::randReset(VERILATOR_RESET_VALUE); - Verilated::randSeed(50); - - // Turn off assertion before reset - Verilated::assertOn(false); - - #ifdef AXI_BUS - this->device = new VVortex_axi(); - #else - this->device = new VVortex(); - #endif - - #ifdef VCD_OUTPUT - Verilated::traceEverOn(true); - this->trace = new VerilatedVcdC(); - this->device->trace(this->trace, 99); - this->trace->open("trace.vcd"); - #endif - } - - ~VL_OBJ() { - #ifdef VCD_OUTPUT - this->trace->close(); - delete this->trace; - #endif - delete this->device; - } -}; -} - -/////////////////////////////////////////////////////////////////////////////// - -Simulator::Simulator() { - vl_obj_ = new VL_OBJ(); - ram_ = nullptr; - // reset the device - this->reset(); -} - -Simulator::~Simulator() { - for (auto& buf : print_bufs_) { - auto str = buf.second.str(); - if (!str.empty()) { - std::cout << "#" << buf.first << ": " << str << std::endl; - } - } - delete vl_obj_; -} - -void Simulator::attach_ram(RAM* ram) { - ram_ = ram; - for (int b = 0; b < MEMORY_BANKS; ++b) { - mem_rsp_vec_[b].clear(); - } - 
last_mem_rsp_bank_ = 0; -} - -void Simulator::reset() { - print_bufs_.clear(); - - for (int b = 0; b < MEMORY_BANKS; ++b) { - mem_rsp_vec_[b].clear(); - } - last_mem_rsp_bank_ = 0; - mem_rd_rsp_active_ = false; - mem_wr_rsp_active_ = false; - -#ifdef AXI_BUS - this->reset_axi_bus(); -#else - this->reset_mem_bus(); -#endif - - vl_obj_->device->reset = 1; - - for (int i = 0; i < RESET_DELAY; ++i) { - vl_obj_->device->clk = 0; - this->eval(); - vl_obj_->device->clk = 1; - this->eval(); - } - - vl_obj_->device->reset = 0; - - // Turn on assertion after reset - Verilated::assertOn(true); -} - -void Simulator::step() { - - vl_obj_->device->clk = 0; - this->eval(); - -#ifdef AXI_BUS - this->eval_axi_bus(0); -#else - this->eval_mem_bus(0); -#endif - - vl_obj_->device->clk = 1; - this->eval(); - -#ifdef AXI_BUS - this->eval_axi_bus(1); -#else - this->eval_mem_bus(1); -#endif - -#ifndef NDEBUG - fflush(stdout); -#endif -} - -void Simulator::eval() { - vl_obj_->device->eval(); -#ifdef VCD_OUTPUT - if (sim_trace_enabled()) { - vl_obj_->trace->dump(timestamp); - } -#endif - ++timestamp; -} - -#ifdef AXI_BUS - -void Simulator::reset_axi_bus() { - vl_obj_->device->m_axi_wready = 0; - vl_obj_->device->m_axi_awready = 0; - vl_obj_->device->m_axi_arready = 0; - vl_obj_->device->m_axi_rvalid = 0; - vl_obj_->device->m_axi_bvalid = 0; -} - -void Simulator::eval_axi_bus(bool clk) { - if (!clk) { - mem_rd_rsp_ready_ = vl_obj_->device->m_axi_rready; - mem_wr_rsp_ready_ = vl_obj_->device->m_axi_bready; - return; - } - - if (ram_ == nullptr) { - vl_obj_->device->m_axi_wready = 0; - vl_obj_->device->m_axi_awready = 0; - vl_obj_->device->m_axi_arready = 0; - return; - } - - // update memory responses schedule - for (int b = 0; b < MEMORY_BANKS; ++b) { - for (auto& rsp : mem_rsp_vec_[b]) { - if (rsp.cycles_left > 0) - rsp.cycles_left -= 1; - } - } - - bool has_rd_response = false; - bool has_wr_response = false; - - // schedule memory responses that are ready - for (int i = 0; i < 
MEMORY_BANKS; ++i) { - uint32_t b = (i + last_mem_rsp_bank_ + 1) % MEMORY_BANKS; - if (!mem_rsp_vec_[b].empty()) { - auto mem_rsp_it = mem_rsp_vec_[b].begin(); - if (mem_rsp_it->cycles_left <= 0) { - has_rd_response = !mem_rsp_it->write; - has_wr_response = mem_rsp_it->write; - last_mem_rsp_bank_ = b; - break; - } - } - } - - // send memory read response - if (mem_rd_rsp_active_ - && vl_obj_->device->m_axi_rvalid && mem_rd_rsp_ready_) { - mem_rd_rsp_active_ = false; - } - if (!mem_rd_rsp_active_) { - if (has_rd_response) { - auto mem_rsp_it = mem_rsp_vec_[last_mem_rsp_bank_].begin(); - /* - printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp_it->addr); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", mem_rsp_it->block[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); - */ - vl_obj_->device->m_axi_rvalid = 1; - vl_obj_->device->m_axi_rid = mem_rsp_it->tag; - vl_obj_->device->m_axi_rresp = 0; - vl_obj_->device->m_axi_rlast = 1; - memcpy((uint8_t*)vl_obj_->device->m_axi_rdata, mem_rsp_it->block.data(), MEM_BLOCK_SIZE); - mem_rsp_vec_[last_mem_rsp_bank_].erase(mem_rsp_it); - mem_rd_rsp_active_ = true; - } else { - vl_obj_->device->m_axi_rvalid = 0; - } - } - - // send memory write response - if (mem_wr_rsp_active_ - && vl_obj_->device->m_axi_bvalid && mem_wr_rsp_ready_) { - mem_wr_rsp_active_ = false; - } - if (!mem_wr_rsp_active_) { - if (has_wr_response) { - auto mem_rsp_it = mem_rsp_vec_[last_mem_rsp_bank_].begin(); - /* - printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_rsp_it->addr); - */ - vl_obj_->device->m_axi_bvalid = 1; - vl_obj_->device->m_axi_bid = mem_rsp_it->tag; - vl_obj_->device->m_axi_bresp = 0; - mem_rsp_vec_[last_mem_rsp_bank_].erase(mem_rsp_it); - mem_wr_rsp_active_ = true; - } else { - vl_obj_->device->m_axi_bvalid = 0; - } - } - - // select the memory bank - uint32_t req_addr = vl_obj_->device->m_axi_wvalid ? 
vl_obj_->device->m_axi_awaddr : vl_obj_->device->m_axi_araddr; - uint32_t req_bank = (MEMORY_BANKS >= 2) ? ((req_addr / MEM_BLOCK_SIZE) % MEMORY_BANKS) : 0; - - // handle memory stalls - bool mem_stalled = false; -#ifdef ENABLE_MEM_STALLS - if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { - mem_stalled = true; - } else - if (mem_rsp_vec_[req_bank].size() >= MEM_RQ_SIZE) { - mem_stalled = true; - } -#endif - - // process memory requests - if (!mem_stalled) { - if (vl_obj_->device->m_axi_wvalid || vl_obj_->device->m_axi_arvalid) { - if (vl_obj_->device->m_axi_wvalid) { - uint64_t byteen = vl_obj_->device->m_axi_wstrb; - unsigned base_addr = vl_obj_->device->m_axi_awaddr; - uint8_t* data = (uint8_t*)(vl_obj_->device->m_axi_wdata); - - // detect stdout write - if (base_addr >= IO_COUT_ADDR - && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - auto& ss_buf = print_bufs_[i]; - char c = data[i]; - ss_buf << c; - if (c == '\n') { - std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush; - ss_buf.str(""); - } - } - } - } else { - /* - printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); - */ - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - (*ram_)[base_addr + i] = data[i]; - } - } - mem_req_t mem_req; - mem_req.tag = vl_obj_->device->m_axi_arid; - mem_req.addr = vl_obj_->device->m_axi_araddr; - mem_req.cycles_left = 0; - mem_req.write = 1; - mem_rsp_vec_[req_bank].emplace_back(mem_req); - } - } else { - mem_req_t mem_req; - mem_req.tag = vl_obj_->device->m_axi_arid; - mem_req.addr = vl_obj_->device->m_axi_araddr; - ram_->read(mem_req.block.data(), vl_obj_->device->m_axi_araddr, MEM_BLOCK_SIZE); - mem_req.cycles_left = MEM_LATENCY; - mem_req.write = 0; - for (auto& rsp : mem_rsp_vec_[req_bank]) { - if 
(mem_req.addr == rsp.addr) { - // duplicate requests receive the same cycle delay - mem_req.cycles_left = rsp.cycles_left; - break; - } - } - mem_rsp_vec_[req_bank].emplace_back(mem_req); - } - } - } - - vl_obj_->device->m_axi_wready = !mem_stalled; - vl_obj_->device->m_axi_awready = !mem_stalled; - vl_obj_->device->m_axi_arready = !mem_stalled; -} - -#else - -void Simulator::reset_mem_bus() { - vl_obj_->device->mem_req_ready = 0; - vl_obj_->device->mem_rsp_valid = 0; -} - -void Simulator::eval_mem_bus(bool clk) { - if (!clk) { - mem_rd_rsp_ready_ = vl_obj_->device->mem_rsp_ready; - return; - } - - if (ram_ == nullptr) { - vl_obj_->device->mem_req_ready = 0; - return; - } - - // update memory responses schedule - for (int b = 0; b < MEMORY_BANKS; ++b) { - for (auto& rsp : mem_rsp_vec_[b]) { - if (rsp.cycles_left > 0) - rsp.cycles_left -= 1; - } - } - - bool has_response = false; - - // schedule memory responses that are ready - for (int i = 0; i < MEMORY_BANKS; ++i) { - uint32_t b = (i + last_mem_rsp_bank_ + 1) % MEMORY_BANKS; - if (!mem_rsp_vec_[b].empty() - && (mem_rsp_vec_[b].begin()->cycles_left) <= 0) { - has_response = true; - last_mem_rsp_bank_ = b; - break; - } - } - - // send memory response - if (mem_rd_rsp_active_ - && vl_obj_->device->mem_rsp_valid && mem_rd_rsp_ready_) { - mem_rd_rsp_active_ = false; - } - if (!mem_rd_rsp_active_) { - if (has_response) { - vl_obj_->device->mem_rsp_valid = 1; - auto mem_rsp_it = mem_rsp_vec_[last_mem_rsp_bank_].begin(); - /* - printf("%0ld: [sim] MEM Rd: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp_it->addr); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", mem_rsp_it->block[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); - */ - memcpy((uint8_t*)vl_obj_->device->mem_rsp_data, mem_rsp_it->block.data(), MEM_BLOCK_SIZE); - vl_obj_->device->mem_rsp_tag = mem_rsp_it->tag; - mem_rsp_vec_[last_mem_rsp_bank_].erase(mem_rsp_it); - mem_rd_rsp_active_ = true; - } else { - 
vl_obj_->device->mem_rsp_valid = 0; - } - } - - // select the memory bank - uint32_t req_bank = (MEMORY_BANKS >= 2) ? (vl_obj_->device->mem_req_addr % MEMORY_BANKS) : 0; - - // handle memory stalls - bool mem_stalled = false; -#ifdef ENABLE_MEM_STALLS - if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { - mem_stalled = true; - } else - if (mem_rsp_vec_[req_bank].size() >= MEM_RQ_SIZE) { - mem_stalled = true; - } -#endif - - // process memory requests - if (!mem_stalled) { - if (vl_obj_->device->mem_req_valid) { - if (vl_obj_->device->mem_req_rw) { - uint64_t byteen = vl_obj_->device->mem_req_byteen; - unsigned base_addr = (vl_obj_->device->mem_req_addr * MEM_BLOCK_SIZE); - uint8_t* data = (uint8_t*)(vl_obj_->device->mem_req_data); - if (base_addr >= IO_COUT_ADDR - && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) { - for (int i = 0; i < IO_COUT_SIZE; i++) { - if ((byteen >> i) & 0x1) { - auto& ss_buf = print_bufs_[i]; - char c = data[i]; - ss_buf << c; - if (c == '\n') { - std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush; - ss_buf.str(""); - } - } - } - } else { - /* - printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); - } - printf("\n"); - */ - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - (*ram_)[base_addr + i] = data[i]; - } - } - } - } else { - mem_req_t mem_req; - mem_req.tag = vl_obj_->device->mem_req_tag; - mem_req.addr = (vl_obj_->device->mem_req_addr * MEM_BLOCK_SIZE); - ram_->read(mem_req.block.data(), vl_obj_->device->mem_req_addr * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE); - mem_req.cycles_left = MEM_LATENCY; - for (auto& rsp : mem_rsp_vec_[req_bank]) { - if (mem_req.addr == rsp.addr) { - // duplicate requests receive the same cycle delay - mem_req.cycles_left = rsp.cycles_left; - break; - } - } - mem_rsp_vec_[req_bank].emplace_back(mem_req); - } - } - } - - 
vl_obj_->device->mem_req_ready = !mem_stalled; -} - -#endif - -void Simulator::wait(uint32_t cycles) { - for (int i = 0; i < cycles; ++i) { - this->step(); - } -} - -bool Simulator::is_busy() const { - return vl_obj_->device->busy; -} - -int Simulator::run() { - int exitcode = 0; - -#ifndef NDEBUG - std::cout << std::dec << timestamp << ": [sim] run()" << std::endl; -#endif - - // execute program - while (vl_obj_->device->busy) { - if (get_ebreak()) { - exitcode = get_last_wb_value(3); - break; - } - this->step(); - } - - // wait 5 cycles to flush the pipeline - this->wait(5); - - return exitcode; -} - -bool Simulator::get_ebreak() const { -#ifdef AXI_BUS - return (int)vl_obj_->device->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak; -#else - return (int)vl_obj_->device->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak; -#endif -} - -int Simulator::get_last_wb_value(int reg) const { -#ifdef AXI_BUS - return (int)vl_obj_->device->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg]; -#else - return (int)vl_obj_->device->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg]; -#endif -} - -void Simulator::print_stats(std::ostream& out) { - out << std::left; - out << std::setw(24) << "# of total cycles:" << std::dec << timestamp/2 << std::endl; -} \ No newline at end of file diff --git a/sim/rtlsim/simulator.h b/sim/rtlsim/simulator.h deleted file mode 100644 index 3b36c520..00000000 --- a/sim/rtlsim/simulator.h +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -#ifndef MEMORY_BANKS - #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #else - #define MEMORY_BANKS 2 - #endif 
-#endif - -namespace vortex { - -class VL_OBJ; -class RAM; - -class Simulator { -public: - - Simulator(); - virtual ~Simulator(); - - void attach_ram(RAM* ram); - - bool is_busy() const; - - void reset(); - void step(); - void wait(uint32_t cycles); - - int run(); - - void print_stats(std::ostream& out); - -private: - - typedef struct { - int cycles_left; - std::array block; - uint64_t addr; - uint64_t tag; - bool write; - } mem_req_t; - - std::unordered_map print_bufs_; - - void eval(); - -#ifdef AXI_BUS - void reset_axi_bus(); - void eval_axi_bus(bool clk); -#else - void reset_mem_bus(); - void eval_mem_bus(bool clk); -#endif - - int get_last_wb_value(int reg) const; - - bool get_ebreak() const; - - std::list mem_rsp_vec_ [MEMORY_BANKS]; - uint32_t last_mem_rsp_bank_; - - bool mem_rd_rsp_active_; - bool mem_rd_rsp_ready_; - - bool mem_wr_rsp_active_; - bool mem_wr_rsp_ready_; - - RAM *ram_; - - VL_OBJ* vl_obj_; -}; - -} \ No newline at end of file diff --git a/sim/simx/Makefile b/sim/simx/Makefile index ad4e38c8..1d081c9e 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -1,3 +1,4 @@ +DESTDIR ?= . RTL_DIR = ../hw/rtl THIRD_PARTY_DIR = ../../third_party @@ -5,15 +6,17 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I. 
-I../common -I../../hw CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include -CXXFLAGS += -I$(THIRD_PARTY_DIR)/cocogfx/include +CXXFLAGS += -I$(THIRD_PARTY_DIR) CXXFLAGS += $(CONFIGS) -LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx +LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a +LDFLAGS += -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx +LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator TOP = vx_cache_sim -SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp -SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp main.cpp +SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp +SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS))) VPATH := $(sort $(dir $(SRCS))) @@ -30,23 +33,16 @@ endif PROJECT = simx -all: $(PROJECT) - -$(PROJECT): $(SRCS) +all: $(DESTDIR)/$(PROJECT) + +$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ -obj_dir/%.o: %.cpp - mkdir -p obj_dir - $(CXX) $(CXXFLAGS) -c $< -o $@ - -static: $(OBJS) - $(AR) rcs lib$(PROJECT).a $(OBJS) $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/*.o +$(DESTDIR)/lib$(PROJECT).so: $(SRCS) + $(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -o $@ .depend: $(SRCS) $(CXX) $(CXXFLAGS) -MM $^ > .depend; -clean-static: - rm -rf lib$(PROJECT).a obj_dir .depend - -clean: clean-static - rm -rf $(PROJECT) \ No newline at end of file +clean: + rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so \ No newline at end of file diff --git a/sim/simx/cache.cpp b/sim/simx/cache.cpp index 36da1b27..36c03eb9 100644 --- a/sim/simx/cache.cpp +++ b/sim/simx/cache.cpp @@ -116,6 +116,7 @@ struct bank_req_t { bool mshr_replay; uint64_t tag; uint32_t set_id; + uint32_t core_id; std::vector infos; 
bank_req_t(uint32_t size) @@ -124,6 +125,7 @@ struct bank_req_t { , mshr_replay(false) , tag(0) , set_id(0) + , core_id(0) , infos(size) {} }; @@ -292,7 +294,7 @@ public: auto& mem_rsp = bypass_port.front(); uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1); uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; - MemRsp core_rsp(tag); + MemRsp core_rsp{tag, mem_rsp.core_id}; simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency); bypass_port.pop(); } @@ -327,7 +329,7 @@ public: auto& core_req = core_req_port.front(); // check cache bypassing - if (core_req.is_io) { + if (core_req.non_cacheable) { // send IO request this->processIORequest(core_req, req_id); @@ -348,6 +350,7 @@ public: bank_req.mshr_replay = false; bank_req.tag = tag; bank_req.set_id = set_id; + bank_req.core_id = core_req.core_id; bank_req.infos.at(port_id) = {true, req_id, core_req.tag}; auto& bank = banks_.at(bank_id); @@ -439,7 +442,8 @@ public: if (pipeline_req.mshr_replay) { // send core response for (auto& info : pipeline_req.infos) { - simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency); + MemRsp core_rsp{info.req_tag, pipeline_req.core_id}; + simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); } } else { bool hit = false; @@ -480,6 +484,7 @@ public: MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag); mem_req.write = true; + mem_req.core_id = pipeline_req.core_id; mem_req_ports_.at(bank_id).send(mem_req, 1); } else { // mark block as dirty @@ -488,8 +493,9 @@ public: } // send core response if (!pipeline_req.write || config_.write_reponse) { - for (auto& info : pipeline_req.infos) { - simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency); + for (auto& info : pipeline_req.infos) { + MemRsp core_rsp{info.req_tag, pipeline_req.core_id}; + simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); } } } else { @@ -508,6 
+514,7 @@ public: MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag); mem_req.write = true; + mem_req.core_id = pipeline_req.core_id; mem_req_ports_.at(bank_id).send(mem_req, 1); ++perf_stats_.evictions; } @@ -519,12 +526,14 @@ public: MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.write = true; + mem_req.core_id = pipeline_req.core_id; mem_req_ports_.at(bank_id).send(mem_req, 1); } // send core response if (config_.write_reponse) { - for (auto& info : pipeline_req.infos) { - simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency); + for (auto& info : pipeline_req.infos) { + MemRsp core_rsp{info.req_tag, pipeline_req.core_id}; + simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); } } } else { @@ -540,6 +549,7 @@ public: mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.write = false; mem_req.tag = mshr_id; + mem_req.core_id = pipeline_req.core_id; mem_req_ports_.at(bank_id).send(mem_req, 1); ++pending_fill_reqs_; } diff --git a/sim/simx/constants.h b/sim/simx/constants.h index 7d8daed5..a28bd806 100644 --- a/sim/simx/constants.h +++ b/sim/simx/constants.h @@ -2,12 +2,10 @@ #include "types.h" -#ifndef MEM_LATENCY -#define MEM_LATENCY 24 -#endif - #define RAM_PAGE_SIZE 4096 +#define DRAM_CHANNELS 2 + namespace vortex { enum Constants { diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 934ce1f8..0540151c 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -87,12 +87,12 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) } // register execute units - exe_units_.at((int)ExeType::NOP) = SimPlatform::instance().CreateObject(this); - exe_units_.at((int)ExeType::ALU) = SimPlatform::instance().CreateObject(this); - exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().CreateObject(this); - exe_units_.at((int)ExeType::CSR) = 
SimPlatform::instance().CreateObject(this); - exe_units_.at((int)ExeType::FPU) = SimPlatform::instance().CreateObject(this); - exe_units_.at((int)ExeType::GPU) = SimPlatform::instance().CreateObject(this); + exe_units_.at((int)ExeType::NOP) = SimPlatform::instance().create_object(this); + exe_units_.at((int)ExeType::ALU) = SimPlatform::instance().create_object(this); + exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object(this); + exe_units_.at((int)ExeType::CSR) = SimPlatform::instance().create_object(this); + exe_units_.at((int)ExeType::FPU) = SimPlatform::instance().create_object(this); + exe_units_.at((int)ExeType::GPU) = SimPlatform::instance().create_object(this); // connect l1 switch icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]); @@ -216,6 +216,7 @@ void Core::fetch(uint64_t cycle) { mem_req.addr = trace->PC; mem_req.write = false; mem_req.tag = pending_icache_.allocate(trace); + mem_req.core_id = id_; icache_->CoreReqPorts.at(0).send(mem_req, 1); DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); fetch_latch_.pop(); diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index be172830..d1df2637 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -403,7 +403,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { break; case JALR_INST: trace->exe_type = ExeType::ALU; - trace->alu.type = AluType::BRANCH; + trace->alu.type = AluType::BRANCH; trace->used_iregs.set(rsrc0); for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) @@ -535,6 +535,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { Word csr_value; if (func3 == 0) { trace->exe_type = ExeType::ALU; + trace->alu.type = AluType::SYSCALL; trace->fetch_stall = true; switch (csr_addr) { case 0: // ECALL diff --git a/sim/simx/exeunit.cpp b/sim/simx/exeunit.cpp index 1736101c..3b84ee8a 100644 --- a/sim/simx/exeunit.cpp +++ b/sim/simx/exeunit.cpp @@ -143,8 +143,9 @@ void 
LsuUnit::step(uint64_t cycle) { MemReq mem_req; mem_req.addr = mem_addr.addr; mem_req.write = is_write; + mem_req.non_cacheable = (type == AddrType::IO); mem_req.tag = tag; - mem_req.is_io = (type == AddrType::IO); + mem_req.core_id = core_->id(); if (type == AddrType::Shared) { core_->shared_mem_->Inputs.at(t).send(mem_req, 2); @@ -153,7 +154,7 @@ void LsuUnit::step(uint64_t cycle) { } else { dcache_req_port.send(mem_req, 2); DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag - << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << mem_req.is_io << ", " << *trace); + << ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace); } if (is_dup) @@ -182,6 +183,7 @@ void AluUnit::step(uint64_t cycle) { switch (trace->alu.type) { case AluType::ARITH: case AluType::BRANCH: + case AluType::SYSCALL: case AluType::CMOV: Output.send(trace, 1); break; @@ -359,6 +361,7 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { mem_req.addr = mem_addr.addr; mem_req.write = (trace->lsu.type == LsuType::STORE); mem_req.tag = tag; + mem_req.core_id = core_->id(); dcache_req_port.send(mem_req, 3); DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag << ", tid=" << t << ", "<< trace); diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index 86829f3a..159fdab6 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -13,7 +13,7 @@ using namespace vortex; int main(int argc, char **argv) { - int exitcode; + int exitcode = 0; std::string archStr("rv32imf"); std::string imgFileName; @@ -54,12 +54,7 @@ int main(int argc, char **argv) { return -1; { - ArchDef arch(archStr, num_cores, num_warps, num_threads); - - Processor processor(arch); - RAM ram(RAM_PAGE_SIZE); - { std::string program_ext(fileExtension(imgFileName.c_str())); if (program_ext == "bin") { @@ -72,25 +67,40 @@ int main(int argc, char **argv) { } } - processor.attach_ram(&ram); + ArchDef arch(archStr, 
num_cores, num_warps, num_threads); + auto processor = Processor::Create(arch); + processor->attach_ram(&ram); - exitcode = processor.run(); + // setup memory simulator + auto memsim = MemSim::Create(MemSim::Config{ + DRAM_CHANNELS, + arch.num_cores() + }); + processor->MemReqPort.bind(&memsim->MemReqPort); + memsim->MemRspPort.bind(&processor->MemRspPort); - if (riscv_test) { - if (1 == exitcode) { - std::cout << "Passed." << std::endl; - exitcode = 0; - } else { - std::cout << "Failed." << std::endl; - } - } else { - if (exitcode != 0) { - std::cout << "*** error: exitcode=" << exitcode << std::endl; - } - } - } + // run simulation + for (;;) { + SimPlatform::instance().step(); + if (processor->check_exit(&exitcode)) + break; + }; + } SimPlatform::instance().finalize(); + if (riscv_test) { + if (1 == exitcode) { + std::cout << "Passed." << std::endl; + exitcode = 0; + } else { + std::cout << "Failed." << std::endl; + } + } else { + if (exitcode != 0) { + std::cout << "*** error: exitcode=" << exitcode << std::endl; + } + } + return exitcode; } diff --git a/sim/simx/memsim.cpp b/sim/simx/memsim.cpp index 012082d9..74979bc8 100644 --- a/sim/simx/memsim.cpp +++ b/sim/simx/memsim.cpp @@ -1,56 +1,99 @@ #include "memsim.h" #include #include +#include + +DISABLE_WARNING_PUSH +DISABLE_WARNING_UNUSED_PARAMETER +#define RAMULATOR +#include +#include +#include +DISABLE_WARNING_POP + #include "constants.h" +#include "types.h" using namespace vortex; class MemSim::Impl { private: MemSim* simobject_; - uint32_t num_banks_; - uint32_t latency_; + Config config_; PerfStats perf_stats_; + ramulator::Gem5Wrapper* dram_; public: - Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency) + + Impl(MemSim* simobject, const Config& config) : simobject_(simobject) - , num_banks_(num_banks) - , latency_(latency) - {} + , config_(config) + { + ramulator::Config ram_config; + ram_config.add("standard", "DDR4"); + ram_config.add("channels", std::to_string(config.channels)); + 
ram_config.add("ranks", "1"); + ram_config.add("speed", "DDR4_2400R"); + ram_config.add("org", "DDR4_4Gb_x8"); + ram_config.add("mapping", "defaultmapping"); + ram_config.set_core_num(config.num_cores); + dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE); + Stats::statlist.output("ramulator.ddr4.log"); + } + + ~Impl() { + dram_->finish(); + Stats::statlist.printall(); + delete dram_; + } const PerfStats& perf_stats() const { return perf_stats_; } + void dram_callback(ramulator::Request& req, uint32_t tag) { + MemRsp mem_rsp{tag, (uint32_t)req.coreid}; + simobject_->MemRspPort.send(mem_rsp, 1); + } + void step(uint64_t /*cycle*/) { - for (uint32_t i = 0, n = num_banks_; i < n; ++i) { - auto& mem_req_port = simobject_->MemReqPorts.at(i); - if (mem_req_port.empty()) - continue; - auto& mem_req = mem_req_port.front(); - if (!mem_req.write) { - MemRsp mem_rsp; - mem_rsp.tag = mem_req.tag; - simobject_->MemRspPorts.at(i).send(mem_rsp, latency_); - ++perf_stats_.reads; - } else { - ++perf_stats_.writes; - } - mem_req_port.pop(); + dram_->tick(); + + if (simobject_->MemReqPort.empty()) + return; + + auto& mem_req = simobject_->MemReqPort.front(); + + if (mem_req.write) { + ramulator::Request dram_req( + mem_req.addr, + ramulator::Request::Type::WRITE, + mem_req.core_id + ); + dram_->send(dram_req); + ++perf_stats_.writes; + } else { + ramulator::Request dram_req( + mem_req.addr, + ramulator::Request::Type::READ, + std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag), + mem_req.core_id + ); + dram_->send(dram_req); + ++perf_stats_.reads; } + + simobject_->MemReqPort.pop(); } }; /////////////////////////////////////////////////////////////////////////////// -MemSim::MemSim(const SimContext& ctx, - uint32_t num_banks, - uint32_t latency) +MemSim::MemSim(const SimContext& ctx, const Config& config) : SimObject(ctx, "MemSim") - , MemReqPorts(num_banks, this) - , MemRspPorts(num_banks, this) - , impl_(new Impl(this, num_banks, latency)) + , 
MemReqPort(this) + , MemRspPort(this) + , impl_(new Impl(this, config)) {} MemSim::~MemSim() { diff --git a/sim/simx/memsim.h b/sim/simx/memsim.h index c48361bc..24918a2e 100644 --- a/sim/simx/memsim.h +++ b/sim/simx/memsim.h @@ -8,6 +8,11 @@ namespace vortex { class MemSim : public SimObject{ public: + struct Config { + uint32_t channels; + uint32_t num_cores; + }; + struct PerfStats { uint64_t reads; uint64_t writes; @@ -18,10 +23,10 @@ public: {} }; - std::vector> MemReqPorts; - std::vector> MemRspPorts; + SimPort MemReqPort; + SimPort MemRspPort; - MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency); + MemSim(const SimContext& ctx, const Config& config); ~MemSim(); void step(uint64_t cycle); diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index 6bb46229..bfda986e 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -3,147 +3,173 @@ using namespace vortex; -Processor::Processor(const ArchDef& arch) - : cores_(arch.num_cores()) - , l2caches_(NUM_CLUSTERS) - , l2_mem_switches_(NUM_CLUSTERS) -{ - uint32_t num_cores = arch.num_cores(); - uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; +class Processor::Impl { +private: + Processor* simobject_; + std::vector cores_; + std::vector l2caches_; + std::vector::Ptr> l2_mem_switches_; + Cache::Ptr l3cache_; + Switch::Ptr l3_mem_switch_; - // create cores - for (uint32_t i = 0; i < num_cores; ++i) { - cores_.at(i) = Core::Create(arch, i); - } +public: + Impl(Processor* simobject, const ArchDef& arch) + : simobject_(simobject) + , cores_(arch.num_cores()) + , l2caches_(NUM_CLUSTERS) + , l2_mem_switches_(NUM_CLUSTERS) + { + uint32_t num_cores = arch.num_cores(); + uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; - // connect memory sub-systen - memsim_ = MemSim::Create(1, MEM_LATENCY); - std::vector*> mem_req_ports(1); - std::vector*> mem_rsp_ports(1); - - mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0); - mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0); - - if 
(L3_ENABLE) { - l3cache_ = Cache::Create("l3cache", Cache::Config{ - log2ceil(L3_CACHE_SIZE), // C - log2ceil(MEM_BLOCK_SIZE), // B - 2, // W - 0, // A - 32, // address bits - L3_NUM_BANKS, // number of banks - L3_NUM_PORTS, // number of ports - NUM_CLUSTERS, // request size - true, // write-through - false, // write response - 0, // victim size - L3_MSHR_SIZE, // mshr - 2, // pipeline latency - } - ); - - mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort); - l3cache_->MemReqPort.bind(mem_req_ports.at(0)); - - mem_req_ports.resize(NUM_CLUSTERS); - mem_rsp_ports.resize(NUM_CLUSTERS); - - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { - mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); - mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); + // create cores + for (uint32_t i = 0; i < num_cores; ++i) { + cores_.at(i) = Core::Create(arch, i); } - } else if (NUM_CLUSTERS > 1) { - l3_mem_switch_ = Switch::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS); - mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn); - l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0)); + + std::vector*> mem_req_ports(1); + std::vector*> mem_rsp_ports(1); - mem_req_ports.resize(NUM_CLUSTERS); - mem_rsp_ports.resize(NUM_CLUSTERS); + mem_req_ports.at(0) = &simobject_->MemReqPort; + mem_rsp_ports.at(0) = &simobject_->MemRspPort; - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { - mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); - mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); - } - } - - for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { - std::vector*> cluster_mem_req_ports(cores_per_cluster); - std::vector*> cluster_mem_rsp_ports(cores_per_cluster); - - if (L2_ENABLE) { - auto& l2cache = l2caches_.at(i); - l2cache = Cache::Create("l2cache", Cache::Config{ - log2ceil(L2_CACHE_SIZE), // C + if (L3_ENABLE) { + l3cache_ = Cache::Create("l3cache", Cache::Config{ + log2ceil(L3_CACHE_SIZE), // C log2ceil(MEM_BLOCK_SIZE), // B 2, // W 0, // A - 32, // address bits - L2_NUM_BANKS, // number of banks - 
L2_NUM_PORTS, // number of ports - (uint8_t)cores_per_cluster, // request size + 32, // address bits + L3_NUM_BANKS, // number of banks + L3_NUM_PORTS, // number of ports + NUM_CLUSTERS, // request size true, // write-through false, // write response 0, // victim size - L2_MSHR_SIZE, // mshr + L3_MSHR_SIZE, // mshr 2, // pipeline latency - }); + } + ); + l3cache_->MemReqPort.bind(mem_req_ports.at(0)); + mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort); - mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); - l2cache->MemReqPort.bind(mem_req_ports.at(i)); + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); - for (uint32_t j = 0; j < cores_per_cluster; ++j) { - cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); - cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i); + mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i); } - } else { - auto& l2_mem_switch = l2_mem_switches_.at(i); - l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster); + } else if (NUM_CLUSTERS > 1) { + l3_mem_switch_ = Switch::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS); + l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0)); + mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn); - mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); - l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); + mem_req_ports.resize(NUM_CLUSTERS); + mem_rsp_ports.resize(NUM_CLUSTERS); - for (uint32_t j = 0; j < cores_per_cluster; ++j) { - cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); - cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i); + mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i); } } - for (uint32_t j = 0; j < cores_per_cluster; ++j) { - auto& core = cores_.at((i * cores_per_cluster) + j); - 
cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort); - core->MemReqPort.bind(cluster_mem_req_ports.at(j)); + for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) { + std::vector*> cluster_mem_req_ports(cores_per_cluster); + std::vector*> cluster_mem_rsp_ports(cores_per_cluster); + + if (L2_ENABLE) { + auto& l2cache = l2caches_.at(i); + l2cache = Cache::Create("l2cache", Cache::Config{ + log2ceil(L2_CACHE_SIZE), // C + log2ceil(MEM_BLOCK_SIZE), // B + 2, // W + 0, // A + 32, // address bits + L2_NUM_BANKS, // number of banks + L2_NUM_PORTS, // number of ports + (uint8_t)cores_per_cluster, // request size + true, // write-through + false, // write response + 0, // victim size + L2_MSHR_SIZE, // mshr + 2, // pipeline latency + }); + l2cache->MemReqPort.bind(mem_req_ports.at(i)); + mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort); + + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j); + cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j); + } + } else { + auto& l2_mem_switch = l2_mem_switches_.at(i); + l2_mem_switch = Switch::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster); + l2_mem_switch->ReqOut.bind(mem_req_ports.at(i)); + mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn); + + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j); + cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j); + } + } + + for (uint32_t j = 0; j < cores_per_cluster; ++j) { + auto& core = cores_.at((i * cores_per_cluster) + j); + core->MemReqPort.bind(cluster_mem_req_ports.at(j)); + cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort); + } } } -} -void Processor::attach_ram(RAM* ram) { - for (auto core : cores_) { - core->attach_ram(ram); + ~Impl() {} + + void step(uint64_t cycle) { + __unused (cycle); } -} -Processor::~Processor() {} + void attach_ram(RAM* ram) { + for (auto core : cores_) { + core->attach_ram(ram); + } + } -int Processor::run() { - bool 
running; - int exitcode = 0; - do { - SimPlatform::instance().step(); - - running = false; + bool check_exit(int* exitcode) { + bool running = false; for (auto& core : cores_) { if (core->running()) { running = true; } if (core->check_exit()) { - exitcode = core->getIRegValue(3); - running = false; - break; + *exitcode = core->getIRegValue(3); + return true; } } - } while (running); + return !running; + } +}; - std::cout << std::flush; +/////////////////////////////////////////////////////////////////////////////// - return exitcode; +Processor::Processor(const SimContext& ctx, const ArchDef& arch) + : SimObject(ctx, "Vortex") + , MemReqPort(this) + , MemRspPort(this) + , impl_(new Impl(this, arch)) +{} + +Processor::~Processor() { + delete impl_; +} + +void Processor::attach_ram(RAM* mem) { + impl_->attach_ram(mem); +} + +bool Processor::check_exit(int* exitcode) { + return impl_->check_exit(exitcode); +} + +void Processor::step(uint64_t cycle) { + impl_->step(cycle); } \ No newline at end of file diff --git a/sim/simx/processor.h b/sim/simx/processor.h index e41fd740..cfcde4da 100644 --- a/sim/simx/processor.h +++ b/sim/simx/processor.h @@ -4,24 +4,23 @@ namespace vortex { -class Processor { +class Processor : public SimObject { public: - typedef std::shared_ptr Ptr; + SimPort MemReqPort; + SimPort MemRspPort; - Processor(const ArchDef& arch); + Processor(const SimContext& ctx, const ArchDef& arch); ~Processor(); void attach_ram(RAM* mem); - int run(); + bool check_exit(int* exitcode); + + void step(uint64_t cycle); private: - std::vector cores_; - std::vector l2caches_; - std::vector::Ptr> l2_mem_switches_; - Cache::Ptr l3cache_; - Switch::Ptr l3_mem_switch_; - MemSim::Ptr memsim_; + class Impl; + Impl* impl_; }; } \ No newline at end of file diff --git a/sim/simx/sharedmem.h b/sim/simx/sharedmem.h index d984422d..6106ad25 100644 --- a/sim/simx/sharedmem.h +++ b/sim/simx/sharedmem.h @@ -65,8 +65,7 @@ public: if (!core_req.write || config_.write_reponse) { // 
send response - MemRsp core_rsp; - core_rsp.tag = core_req.tag; + MemRsp core_rsp{core_req.tag, core_req.core_id}; this->Outputs.at(req_id).send(core_rsp, 1); } diff --git a/sim/simx/types.h b/sim/simx/types.h index 7675ab82..67a14b5d 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -70,6 +70,7 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) { enum class AluType { ARITH, BRANCH, + SYSCALL, IMUL, IDIV, CMOV, @@ -77,11 +78,12 @@ enum class AluType { inline std::ostream &operator<<(std::ostream &os, const AluType& type) { switch (type) { - case AluType::ARITH: os << "ARITH"; break; - case AluType::BRANCH: os << "BRANCH"; break; - case AluType::IMUL: os << "IMUL"; break; - case AluType::IDIV: os << "IDIV"; break; - case AluType::CMOV: os << "CMOV"; break; + case AluType::ARITH: os << "ARITH"; break; + case AluType::BRANCH: os << "BRANCH"; break; + case AluType::SYSCALL: os << "SYSCALL"; break; + case AluType::IMUL: os << "IMUL"; break; + case AluType::IDIV: os << "IDIV"; break; + case AluType::CMOV: os << "CMOV"; break; } return os; } @@ -207,24 +209,31 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) { struct MemReq { uint64_t addr; - uint32_t tag; bool write; - bool is_io; + bool non_cacheable; + uint32_t tag; + uint32_t core_id; MemReq(uint64_t _addr = 0, + bool _write = false, + bool _non_cacheable = false, uint64_t _tag = 0, - bool _write = false, - bool _is_io = false + uint32_t _core_id = 0 ) : addr(_addr) - , tag(_tag) , write(_write) - , is_io(_is_io) + , non_cacheable(_non_cacheable) + , tag(_tag) + , core_id(_core_id) {} }; struct MemRsp { uint64_t tag; - MemRsp(uint64_t _tag = 0) : tag (_tag) {} + uint32_t core_id; + MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0) + : tag (_tag) + , core_id(_core_id) + {} }; /////////////////////////////////////////////////////////////////////////////// diff --git a/sim/vlsim/Makefile b/sim/vlsim/Makefile index 879bd954..bd34e60f 100644 --- a/sim/vlsim/Makefile 
+++ b/sim/vlsim/Makefile @@ -1,3 +1,4 @@ +DESTDIR ?= . RTL_DIR = ../../hw/rtl DPI_DIR = ../../hw/dpi SCRIPT_DIR = ../../hw/scripts @@ -7,8 +8,10 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I.. -I../../../hw -I../../common CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include +CXXFLAGS += -I../$(THIRD_PARTY_DIR) LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a +LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator # control RTL debug tracing states DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE @@ -87,22 +90,15 @@ VL_FLAGS += -DIDIV_DPI FPU_CORE ?= FPU_DPI VL_FLAGS += -D$(FPU_CORE) -PROJECT = libopae-c-vlsim +PROJECT = libopae-c-vlsim.so -all: $(PROJECT).so +all: $(PROJECT) vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh $(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h -$(PROJECT).so: $(SRCS) vortex_afu.h - verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT).so +$(DESTDIR)/$(PROJECT): $(SRCS) vortex_afu.h + verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT) -static: $(SRCS) vortex_afu.h - verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' - $(AR) rcs $(PROJECT).a obj_dir/*.o $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/*.o - -clean-static: - rm -rf $(PROJECT).a obj_dir vortex_afu.h - -clean: clean-static - rm -rf $(PROJECT).so +clean: + rm -rf obj_dir $(DESTDIR)/$(PROJECT) diff --git a/sim/vlsim/opae_sim.cpp b/sim/vlsim/opae_sim.cpp index 5da617b5..d165dba6 100644 --- a/sim/vlsim/opae_sim.cpp +++ b/sim/vlsim/opae_sim.cpp @@ -13,6 +13,31 @@ #include #include +#define RAMULATOR +#include +#include +#include + +#include +#include + +#include +#include +#include + +#ifndef MEMORY_BANKS + #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS + #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS 
+ #else + #define MEMORY_BANKS 2 + #endif +#endif + +#undef MEM_BLOCK_SIZE +#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8) + +#define CACHE_BLOCK_SIZE 64 + #define CCI_LATENCY 8 #define CCI_RAND_MOD 8 #define CCI_RQ_SIZE 16 @@ -28,18 +53,6 @@ #define TRACE_STOP_TIME -1ull #endif -#ifndef MEM_LATENCY -#define MEM_LATENCY 24 -#endif - -#ifndef MEM_RQ_SIZE -#define MEM_RQ_SIZE 16 -#endif - -#ifndef MEM_STALLS_MODULO -#define MEM_STALLS_MODULO 16 -#endif - #ifndef VERILATOR_RESET_VALUE #define VERILATOR_RESET_VALUE 2 #endif @@ -88,357 +101,417 @@ void sim_trace_enable(bool enable) { /////////////////////////////////////////////////////////////////////////////// -namespace vortex { -class VL_OBJ { +class opae_sim::Impl { public: -#ifdef AXI_BUS - VVortex_axi *device; -#else - Vvortex_afu_shim *device; -#endif -#ifdef VCD_OUTPUT - VerilatedVcdC *trace; -#endif - - VL_OBJ() { + Impl() + : stop_(false) + , host_buffer_ids_(0) { // force random values for unitialized signals Verilated::randReset(VERILATOR_RESET_VALUE); Verilated::randSeed(50); - // Turn off assertion before reset + // turn off assertion before reset Verilated::assertOn(false); - #ifdef AXI_BUS - this->device = new Vvortex_afu_shim(); - #else - this->device = new Vvortex_afu_shim(); - #endif + // create RTL module instance + device_ = new Vvortex_afu_shim(); #ifdef VCD_OUTPUT Verilated::traceEverOn(true); - this->trace = new VerilatedVcdC(); - this->device->trace(this->trace, 99); - this->trace->open("trace.vcd"); + trace_ = new VerilatedVcdC(); + device_->trace(this->trace, 99); + trace_->open("trace.vcd"); #endif + + ram_ = new RAM(RAM_PAGE_SIZE); + + // initialize dram simulator + ramulator::Config ram_config; + ram_config.add("standard", "DDR4"); + ram_config.add("channels", std::to_string(MEMORY_BANKS)); + ram_config.add("ranks", "1"); + ram_config.add("speed", "DDR4_2400R"); + ram_config.add("org", "DDR4_4Gb_x8"); + ram_config.add("mapping", "defaultmapping"); + 
ram_config.set_core_num(1); + dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE); + Stats::statlist.output("ramulator.ddr4.log"); + + // reset the device + this->reset(); + + // launch execution thread + future_ = std::async(std::launch::async, [&]{ + while (!stop_) { + std::lock_guard guard(mutex_); + this->step(); + } + }); } - ~VL_OBJ() { + ~Impl() { + stop_ = true; + if (future_.valid()) { + future_.wait(); + } + for (auto& buffer : host_buffers_) { + __aligned_free(buffer.second.data); + } #ifdef VCD_OUTPUT - this->trace->close(); - delete this->trace; + trace_->close(); + delete trace_; #endif - delete this->device; - } -}; -} + delete device_; + + delete ram_; -/////////////////////////////////////////////////////////////////////////////// - -opae_sim::opae_sim() - : stop_(false) - , host_buffer_ids_(0) { - vl_obj_ = new VL_OBJ(); - ram_ = new RAM(RAM_PAGE_SIZE); - - // reset the device - this->reset(); - - // launch execution thread - future_ = std::async(std::launch::async, [&]{ - while (!stop_) { - std::lock_guard guard(mutex_); - this->step(); - } - }); -} - -opae_sim::~opae_sim() { - stop_ = true; - if (future_.valid()) { - future_.wait(); - } - for (auto& buffer : host_buffers_) { - __aligned_free(buffer.second.data); - } - delete vl_obj_; - delete ram_; -} - -int opae_sim::prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) { - auto alloc = __aligned_malloc(CACHE_BLOCK_SIZE, len); - if (alloc == NULL) - return -1; - host_buffer_t buffer; - buffer.data = (uint64_t*)alloc; - buffer.size = len; - buffer.ioaddr = uintptr_t(alloc); - auto buffer_id = host_buffer_ids_++; - host_buffers_.emplace(buffer_id, buffer); - *buf_addr = alloc; - *wsid = buffer_id; - return 0; -} - -void opae_sim::release_buffer(uint64_t wsid) { - auto it = host_buffers_.find(wsid); - if (it != host_buffers_.end()) { - __aligned_free(it->second.data); - host_buffers_.erase(it); - } -} - -void opae_sim::get_io_address(uint64_t wsid, uint64_t *ioaddr) { 
- *ioaddr = host_buffers_[wsid].ioaddr; -} - -void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) { - std::lock_guard guard(mutex_); - - vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid = 1; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; - this->step(); - vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid = 0; - assert(vl_obj_->device->af2cp_sTxPort_c2_mmioRdValid); - *value = vl_obj_->device->af2cp_sTxPort_c2_data; -} - -void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) { - std::lock_guard guard(mutex_); - - vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid = 1; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; - vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; - memcpy(vl_obj_->device->vcp2af_sRxPort_c0_data, &value, 8); - this->step(); - vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid = 0; -} - -/////////////////////////////////////////////////////////////////////////////// - -void opae_sim::reset() { - cci_reads_.clear(); - cci_writes_.clear(); - vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid = 0; - vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid = 0; - vl_obj_->device->vcp2af_sRxPort_c0_rspValid = 0; - vl_obj_->device->vcp2af_sRxPort_c1_rspValid = 0; - vl_obj_->device->vcp2af_sRxPort_c0_TxAlmFull = 0; - vl_obj_->device->vcp2af_sRxPort_c1_TxAlmFull = 0; - - for (int b = 0; b < MEMORY_BANKS; ++b) { - mem_reads_[b].clear(); - vl_obj_->device->avs_readdatavalid[b] = 0; - vl_obj_->device->avs_waitrequest[b] = 0; + if (dram_) { + dram_->finish(); + Stats::statlist.printall(); + delete dram_; + } } - vl_obj_->device->reset = 1; + int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) { + auto alloc = __aligned_malloc(CACHE_BLOCK_SIZE, len); + if (alloc == NULL) + return -1; + 
host_buffer_t buffer; + buffer.data = (uint64_t*)alloc; + buffer.size = len; + buffer.ioaddr = uintptr_t(alloc); + auto buffer_id = host_buffer_ids_++; + host_buffers_.emplace(buffer_id, buffer); + *buf_addr = alloc; + *wsid = buffer_id; + return 0; + } - for (int i = 0; i < RESET_DELAY; ++i) { - vl_obj_->device->clk = 0; + void release_buffer(uint64_t wsid) { + auto it = host_buffers_.find(wsid); + if (it != host_buffers_.end()) { + __aligned_free(it->second.data); + host_buffers_.erase(it); + } + } + + void get_io_address(uint64_t wsid, uint64_t *ioaddr) { + *ioaddr = host_buffers_[wsid].ioaddr; + } + + void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) { + std::lock_guard guard(mutex_); + + device_->vcp2af_sRxPort_c0_mmioRdValid = 1; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; + this->step(); + device_->vcp2af_sRxPort_c0_mmioRdValid = 0; + assert(device_->af2cp_sTxPort_c2_mmioRdValid); + *value = device_->af2cp_sTxPort_c2_data; + } + + void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) { + std::lock_guard guard(mutex_); + + device_->vcp2af_sRxPort_c0_mmioWrValid = 1; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; + device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; + memcpy(device_->vcp2af_sRxPort_c0_data, &value, 8); + this->step(); + device_->vcp2af_sRxPort_c0_mmioWrValid = 0; + } + +private: + + void reset() { + cci_reads_.clear(); + cci_writes_.clear(); + device_->vcp2af_sRxPort_c0_mmioRdValid = 0; + device_->vcp2af_sRxPort_c0_mmioWrValid = 0; + device_->vcp2af_sRxPort_c0_rspValid = 0; + device_->vcp2af_sRxPort_c1_rspValid = 0; + device_->vcp2af_sRxPort_c0_TxAlmFull = 0; + device_->vcp2af_sRxPort_c1_TxAlmFull = 0; + + for (int b = 0; b < MEMORY_BANKS; ++b) { + pending_mem_reqs_[b].clear(); + device_->avs_readdatavalid[b] = 0; + 
device_->avs_waitrequest[b] = 0; + } + + device_->reset = 1; + + for (int i = 0; i < RESET_DELAY; ++i) { + device_->clk = 0; + this->eval(); + device_->clk = 1; + this->eval(); + } + + device_->reset = 0; + + // Turn on assertion after reset + Verilated::assertOn(true); + } + + void step() { + this->sRxPort_bus(); + this->sTxPort_bus(); + this->avs_bus(); + + device_->clk = 0; this->eval(); - vl_obj_->device->clk = 1; + device_->clk = 1; this->eval(); - } - vl_obj_->device->reset = 0; - - // Turn on assertion after reset - Verilated::assertOn(true); -} + dram_->tick(); -void opae_sim::step() { - this->sRxPort_bus(); - this->sTxPort_bus(); - this->avs_bus(); - - vl_obj_->device->clk = 0; - this->eval(); - vl_obj_->device->clk = 1; - this->eval(); - -#ifndef NDEBUG - fflush(stdout); -#endif -} - -void opae_sim::eval() { - vl_obj_->device->eval(); -#ifdef VCD_OUTPUT - if (sim_trace_enabled()) { - vl_obj_->trace->dump(timestamp); - } -#endif - ++timestamp; -} - -void opae_sim::sRxPort_bus() { - // check mmio request - bool mmio_req_enabled = vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid - || vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid; - - // schedule CCI read responses - std::list::iterator cci_rd_it(cci_reads_.end()); - for (auto it = cci_reads_.begin(), ie = cci_reads_.end(); it != ie; ++it) { - if (it->cycles_left > 0) - it->cycles_left -= 1; - if ((cci_rd_it == ie) && (it->cycles_left == 0)) { - cci_rd_it = it; - } + #ifndef NDEBUG + fflush(stdout); + #endif } - // schedule CCI write responses - std::list::iterator cci_wr_it(cci_writes_.end()); - for (auto it = cci_writes_.begin(), ie = cci_writes_.end(); it != ie; ++it) { - if (it->cycles_left > 0) - it->cycles_left -= 1; - if ((cci_wr_it == ie) && (it->cycles_left == 0)) { - cci_wr_it = it; - } - } - - // send CCI write response - vl_obj_->device->vcp2af_sRxPort_c1_rspValid = 0; - if (cci_wr_it != cci_writes_.end()) { - vl_obj_->device->vcp2af_sRxPort_c1_rspValid = 1; - 
vl_obj_->device->vcp2af_sRxPort_c1_hdr_resp_type = 0; - vl_obj_->device->vcp2af_sRxPort_c1_hdr_mdata = cci_wr_it->mdata; - cci_writes_.erase(cci_wr_it); - } - - // send CCI read response (ensure mmio disabled) - vl_obj_->device->vcp2af_sRxPort_c0_rspValid = 0; - if (!mmio_req_enabled - && (cci_rd_it != cci_reads_.end())) { - vl_obj_->device->vcp2af_sRxPort_c0_rspValid = 1; - vl_obj_->device->vcp2af_sRxPort_c0_hdr_resp_type = 0; - memcpy(vl_obj_->device->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE); - vl_obj_->device->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata; - /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata); - for (int i = 0; i < CACHE_BLOCK_SIZE; ++i) - printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]); - printf("\n");*/ - cci_reads_.erase(cci_rd_it); - } -} - -void opae_sim::sTxPort_bus() { - // process read requests - if (vl_obj_->device->af2cp_sTxPort_c0_valid) { - assert(!vl_obj_->device->vcp2af_sRxPort_c0_TxAlmFull); - cci_rd_req_t cci_req; - cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); - cci_req.addr = vl_obj_->device->af2cp_sTxPort_c0_hdr_address; - cci_req.mdata = vl_obj_->device->af2cp_sTxPort_c0_hdr_mdata; - auto host_ptr = (uint64_t*)(vl_obj_->device->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE); - memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE); - //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, vl_obj_->device->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); - cci_reads_.emplace_back(cci_req); - } - - // process write requests - if (vl_obj_->device->af2cp_sTxPort_c1_valid) { - assert(!vl_obj_->device->vcp2af_sRxPort_c1_TxAlmFull); - cci_wr_req_t cci_req; - cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); - cci_req.mdata = vl_obj_->device->af2cp_sTxPort_c1_hdr_mdata; - auto host_ptr = (uint64_t*)(vl_obj_->device->af2cp_sTxPort_c1_hdr_address * CACHE_BLOCK_SIZE); - memcpy(host_ptr, 
vl_obj_->device->af2cp_sTxPort_c1_data, CACHE_BLOCK_SIZE); - cci_writes_.emplace_back(cci_req); - } - - // check queues overflow - vl_obj_->device->vcp2af_sRxPort_c0_TxAlmFull = (cci_reads_.size() >= (CCI_RQ_SIZE-1)); - vl_obj_->device->vcp2af_sRxPort_c1_TxAlmFull = (cci_writes_.size() >= (CCI_WQ_SIZE-1)); -} - -void opae_sim::avs_bus() { - for (int b = 0; b < MEMORY_BANKS; ++b) { - // update memory responses schedule - for (auto& rsp : mem_reads_[b]) { - if (rsp.cycles_left > 0) - rsp.cycles_left -= 1; - } - - // schedule memory responses in FIFO order - std::list::iterator mem_rd_it(mem_reads_[b].end()); - if (!mem_reads_[b].empty() - && (0 == mem_reads_[b].begin()->cycles_left)) { - mem_rd_it = mem_reads_[b].begin(); - } - - // send memory response - vl_obj_->device->avs_readdatavalid[b] = 0; - if (mem_rd_it != mem_reads_[b].end()) { - vl_obj_->device->avs_readdatavalid[b] = 1; - memcpy(vl_obj_->device->avs_readdata[b], mem_rd_it->data.data(), MEM_BLOCK_SIZE); - uint32_t addr = mem_rd_it->addr; - mem_reads_[b].erase(mem_rd_it); - /*printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%x, pending={", timestamp, b, addr * MEM_BLOCK_SIZE); - for (auto& req : mem_reads_[b]) { - if (req.cycles_left != 0) - printf(" !%0x", req.addr * MEM_BLOCK_SIZE); - else - printf(" %0x", req.addr * MEM_BLOCK_SIZE); - } - printf("}\n");*/ - } - - // handle memory stalls - bool mem_stalled = false; - #ifdef ENABLE_MEM_STALLS - if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { - mem_stalled = true; - } else - if (mem_reads_[b].size() >= MEM_RQ_SIZE) { - mem_stalled = true; + void eval() { + device_->eval(); + #ifdef VCD_OUTPUT + if (sim_trace_enabled()) { + trace_->dump(timestamp); } #endif + ++timestamp; + } - // process memory requests - if (!mem_stalled) { - assert(!vl_obj_->device->avs_read[b] || !vl_obj_->device->avs_write[b]); - if (vl_obj_->device->avs_write[b]) { - uint64_t byteen = vl_obj_->device->avs_byteenable[b]; - unsigned base_addr = vl_obj_->device->avs_address[b] * 
MEM_BLOCK_SIZE; - uint8_t* data = (uint8_t*)(vl_obj_->device->avs_writedata[b]); + void sRxPort_bus() { + // check mmio request + bool mmio_req_enabled = device_->vcp2af_sRxPort_c0_mmioRdValid + || device_->vcp2af_sRxPort_c0_mmioWrValid; + + // schedule CCI read responses + std::list::iterator cci_rd_it(cci_reads_.end()); + for (auto it = cci_reads_.begin(), ie = cci_reads_.end(); it != ie; ++it) { + if (it->cycles_left > 0) + it->cycles_left -= 1; + if ((cci_rd_it == ie) && (it->cycles_left == 0)) { + cci_rd_it = it; + } + } + + // schedule CCI write responses + std::list::iterator cci_wr_it(cci_writes_.end()); + for (auto it = cci_writes_.begin(), ie = cci_writes_.end(); it != ie; ++it) { + if (it->cycles_left > 0) + it->cycles_left -= 1; + if ((cci_wr_it == ie) && (it->cycles_left == 0)) { + cci_wr_it = it; + } + } + + // send CCI write response + device_->vcp2af_sRxPort_c1_rspValid = 0; + if (cci_wr_it != cci_writes_.end()) { + device_->vcp2af_sRxPort_c1_rspValid = 1; + device_->vcp2af_sRxPort_c1_hdr_resp_type = 0; + device_->vcp2af_sRxPort_c1_hdr_mdata = cci_wr_it->mdata; + cci_writes_.erase(cci_wr_it); + } + + // send CCI read response (ensure mmio disabled) + device_->vcp2af_sRxPort_c0_rspValid = 0; + if (!mmio_req_enabled + && (cci_rd_it != cci_reads_.end())) { + device_->vcp2af_sRxPort_c0_rspValid = 1; + device_->vcp2af_sRxPort_c0_hdr_resp_type = 0; + memcpy(device_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE); + device_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata; + /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata); + for (int i = 0; i < CACHE_BLOCK_SIZE; ++i) + printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]); + printf("\n");*/ + cci_reads_.erase(cci_rd_it); + } + } + + void sTxPort_bus() { + // process read requests + if (device_->af2cp_sTxPort_c0_valid) { + assert(!device_->vcp2af_sRxPort_c0_TxAlmFull); + cci_rd_req_t cci_req; + cci_req.cycles_left = CCI_LATENCY + 
(timestamp % CCI_RAND_MOD); + cci_req.addr = device_->af2cp_sTxPort_c0_hdr_address; + cci_req.mdata = device_->af2cp_sTxPort_c0_hdr_mdata; + auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE); + memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE); + //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); + cci_reads_.emplace_back(cci_req); + } + + // process write requests + if (device_->af2cp_sTxPort_c1_valid) { + assert(!device_->vcp2af_sRxPort_c1_TxAlmFull); + cci_wr_req_t cci_req; + cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD); + cci_req.mdata = device_->af2cp_sTxPort_c1_hdr_mdata; + auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c1_hdr_address * CACHE_BLOCK_SIZE); + memcpy(host_ptr, device_->af2cp_sTxPort_c1_data, CACHE_BLOCK_SIZE); + cci_writes_.emplace_back(cci_req); + } + + // check queues overflow + device_->vcp2af_sRxPort_c0_TxAlmFull = (cci_reads_.size() >= (CCI_RQ_SIZE-1)); + device_->vcp2af_sRxPort_c1_TxAlmFull = (cci_writes_.size() >= (CCI_WQ_SIZE-1)); + } + + void avs_bus() { + for (int b = 0; b < MEMORY_BANKS; ++b) { + // process memory responses + device_->avs_readdatavalid[b] = 0; + if (!pending_mem_reqs_[b].empty() + && (*pending_mem_reqs_[b].begin())->ready) { + auto mem_rd_it = pending_mem_reqs_[b].begin(); + auto mem_req = *mem_rd_it; + device_->avs_readdatavalid[b] = 1; + memcpy(device_->avs_readdata[b], mem_req->data.data(), MEM_BLOCK_SIZE); + uint32_t addr = mem_req->addr; + pending_mem_reqs_[b].erase(mem_rd_it); + delete mem_req; + } + + // process memory requests + assert(!device_->avs_read[b] || !device_->avs_write[b]); + unsigned byte_addr = device_->avs_address[b] * MEM_BLOCK_SIZE; + if (device_->avs_write[b]) { + uint64_t byteen = device_->avs_byteenable[b]; + uint8_t* data = (uint8_t*)(device_->avs_writedata[b]); for (int i = 0; i < MEM_BLOCK_SIZE; i++) { if ((byteen >> i) & 0x1) { - (*ram_)[base_addr + i] = 
data[i]; + (*ram_)[byte_addr + i] = data[i]; } } - /*printf("%0ld: [sim] MEM Wr Req: bank=%d, addr=%x, data=", timestamp, b, base_addr); + + /*printf("%0ld: [sim] MEM Wr Req: bank=%d, addr=%x, data=", timestamp, b, byte_addr); for (int i = 0; i < MEM_BLOCK_SIZE; i++) { printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); } printf("\n");*/ + + // send dram request + ramulator::Request dram_req( + byte_addr, + ramulator::Request::Type::WRITE, + 0 + ); + dram_->send(dram_req); } - if (vl_obj_->device->avs_read[b]) { - mem_rd_req_t mem_req; - mem_req.addr = vl_obj_->device->avs_address[b]; - ram_->read(mem_req.data.data(), vl_obj_->device->avs_address[b] * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE); - mem_req.cycles_left = MEM_LATENCY; - for (auto& rsp : mem_reads_[b]) { - if (mem_req.addr == rsp.addr) { - // duplicate requests receive the same cycle delay - mem_req.cycles_left = rsp.cycles_left; - break; - } - } - mem_reads_[b].emplace_back(mem_req); + + if (device_->avs_read[b]) { + auto mem_req = new mem_rd_req_t(); + mem_req->addr = device_->avs_address[b]; + ram_->read(mem_req->data.data(), byte_addr, MEM_BLOCK_SIZE); + mem_req->ready = false; + pending_mem_reqs_[b].emplace_back(mem_req); + /*printf("%0ld: [sim] MEM Rd Req: bank=%d, addr=%x, pending={", timestamp, b, mem_req.addr * MEM_BLOCK_SIZE); - for (auto& req : mem_reads_[b]) { + for (auto& req : pending_mem_reqs_[b]) { if (req.cycles_left != 0) printf(" !%0x", req.addr * MEM_BLOCK_SIZE); else printf(" %0x", req.addr * MEM_BLOCK_SIZE); } printf("}\n");*/ - } - } - vl_obj_->device->avs_waitrequest[b] = mem_stalled; + // send dram request + ramulator::Request dram_req( + byte_addr, + ramulator::Request::Type::READ, + std::bind([](ramulator::Request& dram_req, mem_rd_req_t* mem_req) { + mem_req->ready = true; + }, placeholders::_1, mem_req), + 0 + ); + dram_->send(dram_req); + } + + device_->avs_waitrequest[b] = false; + } } + + typedef struct { + bool ready; + std::array data; + uint32_t addr; + } mem_rd_req_t; + + typedef struct 
{ + int cycles_left; + std::array data; + uint64_t addr; + uint32_t mdata; + } cci_rd_req_t; + + typedef struct { + int cycles_left; + uint32_t mdata; + } cci_wr_req_t; + + typedef struct { + uint64_t* data; + size_t size; + uint64_t ioaddr; + } host_buffer_t; + + std::future future_; + bool stop_; + + std::unordered_map host_buffers_; + int64_t host_buffer_ids_; + + std::list pending_mem_reqs_[MEMORY_BANKS]; + + std::list cci_reads_; + + std::list cci_writes_; + + std::mutex mutex_; + + RAM *ram_; + + ramulator::Gem5Wrapper* dram_; + + Vvortex_afu_shim *device_; +#ifdef VCD_OUTPUT + VerilatedVcdC *trace_; +#endif +}; + +/////////////////////////////////////////////////////////////////////////////// + +opae_sim::opae_sim() + : impl_(new Impl()) +{} + +opae_sim::~opae_sim() { + delete impl_; +} + +int opae_sim::prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) { + return impl_->prepare_buffer(len, buf_addr, wsid, flags); +} + +void opae_sim::release_buffer(uint64_t wsid) { + impl_->release_buffer(wsid); +} + +void opae_sim::get_io_address(uint64_t wsid, uint64_t *ioaddr) { + impl_->get_io_address(wsid, ioaddr); +} + +void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) { + impl_->write_mmio64(mmio_num, offset, value); +} + +void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) { + impl_->read_mmio64(mmio_num, offset, value); } \ No newline at end of file diff --git a/sim/vlsim/opae_sim.h b/sim/vlsim/opae_sim.h index aa19532f..21010b94 100644 --- a/sim/vlsim/opae_sim.h +++ b/sim/vlsim/opae_sim.h @@ -1,29 +1,8 @@ #pragma once -#include -#include - -#include -#include -#include -#include - -#ifndef MEMORY_BANKS - #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #else - #define MEMORY_BANKS 2 - #endif -#endif - -#undef MEM_BLOCK_SIZE -#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8) - -#define CACHE_BLOCK_SIZE 64 - +#include 
namespace vortex { -class VL_OBJ; class RAM; class opae_sim { @@ -44,57 +23,8 @@ public: private: - typedef struct { - int cycles_left; - std::array data; - uint32_t addr; - } mem_rd_req_t; - - typedef struct { - int cycles_left; - std::array data; - uint64_t addr; - uint32_t mdata; - } cci_rd_req_t; - - typedef struct { - int cycles_left; - uint32_t mdata; - } cci_wr_req_t; - - typedef struct { - uint64_t* data; - size_t size; - uint64_t ioaddr; - } host_buffer_t; - - void reset(); - - void eval(); - - void step(); - - void sRxPort_bus(); - void sTxPort_bus(); - void avs_bus(); - - std::future future_; - bool stop_; - - std::unordered_map host_buffers_; - int64_t host_buffer_ids_; - - std::list mem_reads_ [MEMORY_BANKS]; - - std::list cci_reads_; - - std::list cci_writes_; - - std::mutex mutex_; - - RAM *ram_; - - VL_OBJ* vl_obj_; + class Impl; + Impl* impl_; }; } \ No newline at end of file diff --git a/third_party/Makefile b/third_party/Makefile index 8a9ed890..26b730ff 100644 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -1,4 +1,4 @@ -all: fpnew cocogfx softfloat +all: fpnew cocogfx softfloat ramulator fpnew: @@ -8,8 +8,11 @@ cocogfx: softfloat: SPECIALIZE_TYPE=RISCV SOFTFLOAT_OPTS="-fPIC -DSOFTFLOAT_ROUND_ODD -DINLINE_LEVEL=5 -DSOFTFLOAT_FAST_DIV32TO16 -DSOFTFLOAT_FAST_DIV64TO32" $(MAKE) -C softfloat/build/Linux-x86_64-GCC +ramulator: + $(MAKE) -C ramulator libramulator.a + clean: $(MAKE) clean -C cocogfx $(MAKE) -C softfloat/build/Linux-x86_64-GCC clean -.PHONY: all fpnew cocogfx softfloat \ No newline at end of file +.PHONY: all fpnew cocogfx softfloat ramulator \ No newline at end of file From 30d9d3e956d5fe20d3c24961dbec7488e16c5cf0 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 6 Dec 2021 13:17:51 -0500 Subject: [PATCH 21/27] minor update --- miscs/patch/ramulator.patch | 46 +++++++++++++++++++++++++++++++++++++ third_party/Makefile | 4 +++- 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 miscs/patch/ramulator.patch 
diff --git a/miscs/patch/ramulator.patch b/miscs/patch/ramulator.patch new file mode 100644 index 00000000..e24b5d23 --- /dev/null +++ b/miscs/patch/ramulator.patch @@ -0,0 +1,46 @@ +diff --git a/Makefile b/Makefile +index ea340c8..d2aac5b 100644 +--- a/Makefile ++++ b/Makefile +@@ -7,16 +7,16 @@ OBJS := $(patsubst $(SRCDIR)/%.cpp, $(OBJDIR)/%.o, $(SRCS)) + + # Ramulator currently supports g++ 5.1+ or clang++ 3.4+. It will NOT work with + # g++ 4.x due to an internal compiler error when processing lambda functions. +-CXX := clang++ ++#CXX := clang++ + # CXX := g++-5 +-CXXFLAGS := -O3 -std=c++11 -g -Wall ++CXXFLAGS := -std=c++11 -O3 -g -Wall -fPIC + + .PHONY: all clean depend + + all: depend ramulator + + clean: +- rm -f ramulator ++ rm -f ramulator libramulator.a + rm -rf $(OBJDIR) + + depend: $(OBJDIR)/.depend +@@ -36,7 +36,7 @@ ramulator: $(MAIN) $(OBJS) $(SRCDIR)/*.h | depend + $(CXX) $(CXXFLAGS) -DRAMULATOR -o $@ $(MAIN) $(OBJS) + + libramulator.a: $(OBJS) $(OBJDIR)/Gem5Wrapper.o +- libtool -static -o $@ $(OBJS) $(OBJDIR)/Gem5Wrapper.o ++ $(AR) rcs $@ $^ + + $(OBJS): | $(OBJDIR) + +diff --git a/src/Request.h b/src/Request.h +index 57abd0d..a5ce061 100644 +--- a/src/Request.h ++++ b/src/Request.h +@@ -36,7 +36,7 @@ public: + + Request(long addr, Type type, int coreid = 0) + : is_first_command(true), addr(addr), coreid(coreid), type(type), +- callback([](Request& req){}) {} ++ callback([](Request&){}) {} + + Request(long addr, Type type, function callback, int coreid = 0) + : is_first_command(true), addr(addr), coreid(coreid), type(type), callback(callback) {} diff --git a/third_party/Makefile b/third_party/Makefile index 26b730ff..88a639c1 100644 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -9,10 +9,12 @@ softfloat: SPECIALIZE_TYPE=RISCV SOFTFLOAT_OPTS="-fPIC -DSOFTFLOAT_ROUND_ODD -DINLINE_LEVEL=5 -DSOFTFLOAT_FAST_DIV32TO16 -DSOFTFLOAT_FAST_DIV64TO32" $(MAKE) -C softfloat/build/Linux-x86_64-GCC ramulator: + cd ramulator && git apply -R 
../../miscs/patch/ramulator.patch 2> /dev/null $(MAKE) -C ramulator libramulator.a clean: - $(MAKE) clean -C cocogfx + $(MAKE) -C cocogfx clean $(MAKE) -C softfloat/build/Linux-x86_64-GCC clean + $(MAKE) -C ramulator clean .PHONY: all fpnew cocogfx softfloat ramulator \ No newline at end of file From 71ce58500a1bbc3b559eb0341c81690489709730 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 6 Dec 2021 14:09:31 -0500 Subject: [PATCH 22/27] minor update --- third_party/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/Makefile b/third_party/Makefile index 88a639c1..4cf70b2d 100644 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -9,7 +9,7 @@ softfloat: SPECIALIZE_TYPE=RISCV SOFTFLOAT_OPTS="-fPIC -DSOFTFLOAT_ROUND_ODD -DINLINE_LEVEL=5 -DSOFTFLOAT_FAST_DIV32TO16 -DSOFTFLOAT_FAST_DIV64TO32" $(MAKE) -C softfloat/build/Linux-x86_64-GCC ramulator: - cd ramulator && git apply -R ../../miscs/patch/ramulator.patch 2> /dev/null + cd ramulator && git apply ../../miscs/patch/ramulator.patch 2> /dev/null; true $(MAKE) -C ramulator libramulator.a clean: From 9811740ead8a15eefad953d557d5eadb2ebcd585 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 6 Dec 2021 14:34:20 -0500 Subject: [PATCH 23/27] minor update --- third_party/ramulator | 1 + 1 file changed, 1 insertion(+) create mode 160000 third_party/ramulator diff --git a/third_party/ramulator b/third_party/ramulator new file mode 160000 index 00000000..4edcb0d0 --- /dev/null +++ b/third_party/ramulator @@ -0,0 +1 @@ +Subproject commit 4edcb0d05aac9ec46b032a7bf59595c0418287f7 From a9ec1c08a706d889e271848f13fd0e803e72c3e4 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 6 Dec 2021 15:44:25 -0500 Subject: [PATCH 24/27] minor update --- tests/regression/tex/Makefile | 4 ++-- tests/regression/tex/utils.cpp | 4 ++-- tests/regression/tex/utils.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/regression/tex/Makefile b/tests/regression/tex/Makefile 
index fd9f195d..ff38c514 100644 --- a/tests/regression/tex/Makefile +++ b/tests/regression/tex/Makefile @@ -10,7 +10,7 @@ VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy VX_CFLAGS += -std=c++11 -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections -VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party/cocogfx/include +VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a @@ -19,7 +19,7 @@ VX_SRCS = kernel.c #CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors -CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party/cocogfx/include +CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex $(VORTEX_RT_PATH)/../third_party/cocogfx/libcocogfx.a -lz diff --git a/tests/regression/tex/utils.cpp b/tests/regression/tex/utils.cpp index e76b72f9..aee98293 100644 --- a/tests/regression/tex/utils.cpp +++ b/tests/regression/tex/utils.cpp @@ -3,8 +3,8 @@ #include #include #include -#include -#include +#include +#include using namespace cocogfx; diff --git a/tests/regression/tex/utils.h b/tests/regression/tex/utils.h index a3ffccae..25c4e3ad 100644 --- a/tests/regression/tex/utils.h +++ b/tests/regression/tex/utils.h @@ -1,8 +1,8 @@ #include #include -#include -#include #include +#include +#include int LoadImage(const char *filename, cocogfx::ePixelFormat format, From 
5825b7c15a3221c611ec395680705c06ecf5217a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 7 Dec 2021 22:44:06 -0500 Subject: [PATCH 25/27] dram simulator fix --- driver/rtlsim/vortex.cpp | 3 +- driver/simx/vortex.cpp | 49 ++++-------- sim/common/simobject.h | 50 +++++++----- sim/rtlsim/main.cpp | 10 +-- sim/rtlsim/processor.cpp | 106 +++++++++++++++---------- sim/rtlsim/processor.h | 4 +- sim/simx/cache.cpp | 109 +++++++++++++++++++------- sim/simx/cache.h | 5 +- sim/simx/constants.h | 12 ++- sim/simx/core.cpp | 132 ++++++++++++++++++------------- sim/simx/core.h | 20 ++--- sim/simx/debug.h | 12 +-- sim/simx/execute.cpp | 162 +++++++++++++++++++-------------------- sim/simx/exeunit.cpp | 91 ++++++++++++---------- sim/simx/exeunit.h | 26 ++++--- sim/simx/ibuffer.h | 5 ++ sim/simx/main.cpp | 37 ++++----- sim/simx/memsim.cpp | 62 +++++++++------ sim/simx/memsim.h | 6 +- sim/simx/pipeline.h | 28 ++++++- sim/simx/processor.cpp | 75 +++++++++--------- sim/simx/processor.h | 16 ++-- sim/simx/scoreboard.h | 13 +++- sim/simx/sharedmem.h | 6 +- sim/simx/tex_unit.cpp | 6 ++ sim/simx/tex_unit.h | 2 + sim/simx/types.h | 75 ++++++++---------- sim/simx/warp.cpp | 30 ++++++-- sim/simx/warp.h | 12 +-- sim/vlsim/opae_sim.cpp | 37 ++++++--- 30 files changed, 702 insertions(+), 499 deletions(-) diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index 52c290cd..85f7054c 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -124,8 +124,7 @@ public: future_.wait(); } // start new run - future_ = std::async(std::launch::async, [&]{ - processor_.reset(); + future_ = std::async(std::launch::async, [&]{ processor_.run(); }); return 0; diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 4b086d7e..e1897139 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -8,11 +8,17 @@ #include #include -#include -#include + #include + #include +#include +#include +#include +#include + + using namespace vortex; 
/////////////////////////////////////////////////////////////////////////////// @@ -59,13 +65,11 @@ public: vx_device() : arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS) , ram_(RAM_PAGE_SIZE) + , processor_(arch_) , mem_allocation_(ALLOC_BASE_ADDR) { - // setup memory simulator - memsim_ = MemSim::Create(MemSim::Config{ - DRAM_CHANNELS, - arch_.num_cores() - }); + // attach memory module + processor_.attach_ram(&ram_); } ~vx_device() { @@ -122,28 +126,7 @@ public: // start new run future_ = std::async(std::launch::async, [&]{ - if (processor_) { - // release current processor instance - processor_->MemReqPort.unbind(); - memsim_->MemRspPort.unbind(); - SimPlatform::instance().release_object(processor_); - } - - // create new processor instance - processor_ = Processor::Create(arch_); - processor_->MemReqPort.bind(&memsim_->MemReqPort); - memsim_->MemRspPort.bind(&processor_->MemRspPort); - - // attach memory object - processor_->attach_ram(&ram_); - - // run simulation - int exitcode; - for (;;) { - SimPlatform::instance().step(); - if (processor_->check_exit(&exitcode)) - break; - }; + processor_.run(); }); return 0; @@ -167,8 +150,7 @@ public: private: ArchDef arch_; RAM ram_; - MemSim::Ptr memsim_; - Processor::Ptr processor_; + Processor processor_; uint64_t mem_allocation_; std::future future_; }; @@ -207,9 +189,6 @@ extern int vx_dev_open(vx_device_h* hdevice) { if (nullptr == hdevice) return -1; - if (!SimPlatform::instance().initialize()) - return -1; - *hdevice = new vx_device(); #ifdef DUMP_PERF_STATS @@ -232,8 +211,6 @@ extern int vx_dev_close(vx_device_h hdevice) { delete device; - SimPlatform::instance().finalize(); - return 0; } diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 2830ea06..eb32302d 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -127,7 +127,7 @@ public: virtual ~SimEventBase() {} - virtual void fire() const = 0; + virtual void fire() const = 0; uint64_t time() const { return time_; 
@@ -219,15 +219,21 @@ public: const std::string& name() const { return name_; - } - - virtual void step(uint64_t cycle) = 0; + } protected: SimObjectBase(const SimContext& ctx, const char* name); +private: + + virtual void do_reset() = 0; + + virtual void do_tick() = 0; + std::string name_; + + friend class SimPlatform; }; /////////////////////////////////////////////////////////////////////////////// @@ -246,18 +252,22 @@ protected: : SimObjectBase(ctx, name) {} - void step(uint64_t cycle) override { - this->impl().step(cycle); - } - private: - const Impl& impl() const { - return static_cast(*this); + const Impl* impl() const { + return static_cast(this); } - Impl& impl() { - return static_cast(*this); + Impl* impl() { + return static_cast(this); + } + + void do_reset() override { + this->impl()->reset(); + } + + void do_tick() override { + this->impl()->tick(); } }; @@ -282,10 +292,6 @@ public: return true; } - void flush() { - instance().clear(); - } - void finalize() { instance().clear(); } @@ -310,7 +316,15 @@ public: events_.emplace_back(evt); } - void step() { + void reset() { + events_.clear(); + for (auto& object : objects_) { + object->do_reset(); + } + cycles_ = 0; + } + + void tick() { // evaluate events auto evt_it = events_.begin(); auto evt_it_end = events_.end(); @@ -325,7 +339,7 @@ public: } // evaluate components for (auto& object : objects_) { - object->step(cycles_); + object->do_tick(); } // advance clock ++cycles_; diff --git a/sim/rtlsim/main.cpp b/sim/rtlsim/main.cpp index c61fbec8..a3766604 100644 --- a/sim/rtlsim/main.cpp +++ b/sim/rtlsim/main.cpp @@ -49,12 +49,12 @@ int main(int argc, char **argv) { parse_args(argc, argv); - for (auto program : programs) { - std::cout << "Running " << program << "..." 
<< std::endl; + vortex::RAM ram(RAM_PAGE_SIZE); + vortex::Processor processor; + processor.attach_ram(&ram); - vortex::RAM ram(RAM_PAGE_SIZE); - vortex::Processor processor; - processor.attach_ram(&ram); + for (auto program : programs) { + std::cout << "Running " << program << "..." << std::endl; std::string program_ext(fileExtension(program)); if (program_ext == "bin") { diff --git a/sim/rtlsim/processor.cpp b/sim/rtlsim/processor.cpp index 7c20a442..284d599f 100644 --- a/sim/rtlsim/processor.cpp +++ b/sim/rtlsim/processor.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -39,7 +40,9 @@ #endif #endif -#define ENABLE_MEM_STALLS +#ifndef MEM_CYCLE_RATIO +#define MEM_CYCLE_RATIO -1 +#endif #ifndef TRACE_START_TIME #define TRACE_START_TIME 0ull @@ -126,12 +129,7 @@ public: } ~Impl() { - for (auto& buf : print_bufs_) { - auto str = buf.second.str(); - if (!str.empty()) { - std::cout << "#" << buf.first << ": " << str << std::endl; - } - } + this->cout_flush(); #ifdef VCD_OUTPUT trace_->close(); @@ -147,10 +145,46 @@ public: } } + void cout_flush() { + for (auto& buf : print_bufs_) { + auto str = buf.second.str(); + if (!str.empty()) { + std::cout << "#" << buf.first << ": " << str << std::endl; + } + } + } + void attach_ram(RAM* ram) { ram_ = ram; } + int run() { + int exitcode = 0; + + #ifndef NDEBUG + std::cout << std::dec << timestamp << ": [sim] run()" << std::endl; + #endif + + // reset device + this->reset(); + + // execute program + while (device_->busy) { + if (get_ebreak()) { + exitcode = get_last_wb_value(3); + break; + } + this->tick(); + } + + // wait 5 cycles to flush the pipeline + this->wait(5); + + return exitcode; + } + +private: + void reset() { print_bufs_.clear(); @@ -178,33 +212,11 @@ public: // Turn on assertion after reset Verilated::assertOn(true); + + this->cout_flush(); } - int run() { - int exitcode = 0; - - #ifndef NDEBUG - std::cout << std::dec << timestamp << ": [sim] run()" << std::endl; - #endif - - 
// execute program - while (device_->busy) { - if (get_ebreak()) { - exitcode = get_last_wb_value(3); - break; - } - this->step(); - } - - // wait 5 cycles to flush the pipeline - this->wait(5); - - return exitcode; - } - -private: - - void step() { + void tick() { device_->clk = 0; this->eval(); @@ -224,7 +236,19 @@ private: this->eval_avs_bus(1); #endif - dram_->tick(); + if (MEM_CYCLE_RATIO > 0) { + auto cycle = timestamp / 2; + if ((cycle % MEM_CYCLE_RATIO) == 0) + dram_->tick(); + } else { + for (int i = MEM_CYCLE_RATIO; i <= 0; ++i) + dram_->tick(); + } + + if (!dram_queue_.empty()) { + if (dram_->send(dram_queue_.front())) + dram_queue_.pop(); + } #ifndef NDEBUG fflush(stdout); @@ -372,7 +396,7 @@ private: ramulator::Request::Type::WRITE, 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } } else { // process reads @@ -393,7 +417,7 @@ private: }, placeholders::_1, mem_req), 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } } @@ -490,7 +514,7 @@ private: ramulator::Request::Type::WRITE, 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } } else { // process reads @@ -511,7 +535,7 @@ private: }, placeholders::_1, mem_req), 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } } @@ -522,7 +546,7 @@ private: void wait(uint32_t cycles) { for (int i = 0; i < cycles; ++i) { - this->step(); + this->tick(); } } @@ -574,6 +598,8 @@ private: RAM *ram_; ramulator::Gem5Wrapper* dram_; + + std::queue dram_queue_; }; /////////////////////////////////////////////////////////////////////////////// @@ -590,10 +616,6 @@ void Processor::attach_ram(RAM* mem) { impl_->attach_ram(mem); } -void Processor::reset() { - impl_->reset(); -} - int Processor::run() { return impl_->run(); } \ No newline at end of file diff --git a/sim/rtlsim/processor.h b/sim/rtlsim/processor.h index a877044f..5518990b 100644 --- a/sim/rtlsim/processor.h +++ b/sim/rtlsim/processor.h @@ -8,12 +8,10 @@ class Processor { public: Processor(); - virtual ~Processor(); + 
~Processor(); void attach_ram(RAM* ram); - void reset(); - int run(); private: diff --git a/sim/simx/cache.cpp b/sim/simx/cache.cpp index 36c03eb9..34c8903c 100644 --- a/sim/simx/cache.cpp +++ b/sim/simx/cache.cpp @@ -102,6 +102,12 @@ struct block_t { struct set_t { std::vector blocks; set_t(uint32_t size) : blocks(size) {} + + void clear() { + for (auto& block : blocks) { + block.valid = false; + } + } }; struct bank_req_info_t { @@ -117,6 +123,7 @@ struct bank_req_t { uint64_t tag; uint32_t set_id; uint32_t core_id; + uint64_t uuid; std::vector infos; bank_req_t(uint32_t size) @@ -126,6 +133,7 @@ struct bank_req_t { , tag(0) , set_id(0) , core_id(0) + , uuid(0) , infos(size) {} }; @@ -142,20 +150,20 @@ struct mshr_entry_t : public bank_req_t { class MSHR { private: std::vector entries_; - uint32_t capacity_; + uint32_t size_; public: MSHR(uint32_t size) : entries_(size) - , capacity_(0) + , size_(0) {} bool empty() const { - return (0 == capacity_); + return (0 == size_); } bool full() const { - return (capacity_ == entries_.size()); + return (size_ == entries_.size()); } int lookup(const bank_req_t& bank_req) { @@ -178,7 +186,7 @@ public: entry.valid = true; entry.mshr_replay = false; entry.block_id = block_id; - ++capacity_; + ++size_; return i; } } @@ -204,12 +212,21 @@ public: if (entry.valid && entry.mshr_replay) { *out = entry; entry.valid = false; - --capacity_; + --size_; return true; } } return false; } + + void clear() { + for (auto& entry : entries_) { + if (entry.valid && entry.mshr_replay) { + entry.valid = false; + } + } + size_ = 0; + } }; struct bank_t { @@ -221,6 +238,13 @@ struct bank_t { : sets(params.sets_per_bank, params.blocks_per_set) , mshr(config.mshr_size) {} + + void clear() { + mshr.clear(); + for (auto& set : sets) { + set.clear(); + } + } }; /////////////////////////////////////////////////////////////////////////////// @@ -235,11 +259,11 @@ private: Switch::Ptr bypass_switch_; std::vector> mem_req_ports_; std::vector> 
mem_rsp_ports_; + uint32_t flush_cycles_; PerfStats perf_stats_; uint64_t pending_read_reqs_; uint64_t pending_write_reqs_; - uint64_t pending_fill_reqs_; - uint32_t flush_cycles_; + uint64_t pending_fill_reqs_; public: Impl(Cache* simobject, const Config& config) @@ -249,9 +273,6 @@ public: , banks_(config.num_banks, {config, params_}) , mem_req_ports_(config.num_banks, simobject) , mem_rsp_ports_(config.num_banks, simobject) - , pending_read_reqs_(0) - , pending_write_reqs_(0) - , pending_fill_reqs_(0) { bypass_switch_ = Switch::Create("bypass_arb", ArbiterType::Priority, 2); bypass_switch_->ReqOut.bind(&simobject->MemReqPort); @@ -272,19 +293,28 @@ public: // calculate tag flush cycles flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set; - } - - const PerfStats& perf_stats() const { - return perf_stats_; } - void step(uint64_t cycle) { + void reset() { + for (auto& bank : banks_) { + bank.clear(); + } + perf_stats_ = PerfStats(); + pending_read_reqs_ = 0; + pending_write_reqs_ = 0; + pending_fill_reqs_ = 0; + } + + void tick() { // wait on flush cycles if (flush_cycles_ != 0) { --flush_cycles_; return; } + // per-bank pipeline request + std::vector pipeline_reqs(config_.num_banks, config_.ports_per_bank); + // calculate memory latency perf_stats_.mem_latency += pending_fill_reqs_; @@ -294,12 +324,11 @@ public: auto& mem_rsp = bypass_port.front(); uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1); uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; - MemRsp core_rsp{tag, mem_rsp.core_id}; + MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid}; simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); bypass_port.pop(); - } - - std::vector pipeline_reqs(config_.num_banks, config_.ports_per_bank); + } // handle MSHR replay for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { @@ -351,6 +380,7 @@ public: bank_req.tag = tag; bank_req.set_id = set_id; 
bank_req.core_id = core_req.core_id; + bank_req.uuid = core_req.uuid; bank_req.infos.at(port_id) = {true, req_id, core_req.tag}; auto& bank = banks_.at(bank_id); @@ -400,22 +430,31 @@ public: // remove request auto time = core_req_port.pop(); - perf_stats_.pipeline_stalls += (cycle - time); + perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time); } // process active request this->processBankRequest(pipeline_reqs); + } + + const PerfStats& perf_stats() const { + return perf_stats_; } + +private: void processIORequest(const MemReq& core_req, uint32_t req_id) { { MemReq mem_req(core_req); mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id; bypass_switch_->ReqIn.at(1).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); } if (core_req.write && config_.write_reponse) { - simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1); + MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid}; + simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1); + DT(3, simobject_->name() << "-" << core_rsp); } } @@ -442,8 +481,9 @@ public: if (pipeline_req.mshr_replay) { // send core response for (auto& info : pipeline_req.infos) { - MemRsp core_rsp{info.req_tag, pipeline_req.core_id}; - simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid}; + simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); } } else { bool hit = false; @@ -485,7 +525,9 @@ public: mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag); mem_req.write = true; mem_req.core_id = pipeline_req.core_id; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); } else { // mark block as dirty hit_block.dirty = true; @@ -494,8 +536,9 @@ public: // send core response if (!pipeline_req.write || config_.write_reponse) { 
for (auto& info : pipeline_req.infos) { - MemRsp core_rsp{info.req_tag, pipeline_req.core_id}; + MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid}; simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); } } } else { @@ -516,6 +559,7 @@ public: mem_req.write = true; mem_req.core_id = pipeline_req.core_id; mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); ++perf_stats_.evictions; } } @@ -527,13 +571,16 @@ public: mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.write = true; mem_req.core_id = pipeline_req.core_id; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); } // send core response if (config_.write_reponse) { for (auto& info : pipeline_req.infos) { - MemRsp core_rsp{info.req_tag, pipeline_req.core_id}; + MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid}; simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency); + DT(3, simobject_->name() << "-" << core_rsp); } } } else { @@ -550,7 +597,9 @@ public: mem_req.write = false; mem_req.tag = mshr_id; mem_req.core_id = pipeline_req.core_id; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); + DT(3, simobject_->name() << "-" << mem_req); ++pending_fill_reqs_; } } @@ -575,8 +624,12 @@ Cache::~Cache() { delete impl_; } -void Cache::step(uint64_t cycle) { - impl_->step(cycle); +void Cache::reset() { + impl_->reset(); +} + +void Cache::tick() { + impl_->tick(); } const Cache::PerfStats& Cache::perf_stats() const { diff --git a/sim/simx/cache.h b/sim/simx/cache.h index 8f4b3932..a335b483 100644 --- a/sim/simx/cache.h +++ b/sim/simx/cache.h @@ -22,6 +22,7 @@ public: uint16_t mshr_size; // MSHR buffer size uint8_t latency; // pipeline latency }; + struct PerfStats { uint64_t reads; uint64_t writes; @@ -54,7 +55,9 @@ 
public: Cache(const SimContext& ctx, const char* name, const Config& config); ~Cache(); - void step(uint64_t cycle); + void reset(); + + void tick(); const PerfStats& perf_stats() const; diff --git a/sim/simx/constants.h b/sim/simx/constants.h index a28bd806..109f29f4 100644 --- a/sim/simx/constants.h +++ b/sim/simx/constants.h @@ -1,10 +1,16 @@ #pragma once -#include "types.h" - +#ifndef RAM_PAGE_SIZE #define RAM_PAGE_SIZE 4096 +#endif -#define DRAM_CHANNELS 2 +#ifndef MEM_CYCLE_RATIO +#define MEM_CYCLE_RATIO -1 +#endif + +#ifndef MEMORY_BANKS +#define MEMORY_BANKS 2 +#endif namespace vortex { diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 0540151c..fd11befd 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -30,7 +30,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , ibuffers_(arch.num_warps(), IBUF_SIZE) , scoreboard_(arch_) , exe_units_((int)ExeType::MAX) - , icache_(Cache::Create("Icache", Cache::Config{ + , icache_(Cache::Create("icache", Cache::Config{ log2ceil(ICACHE_SIZE), // C log2ceil(L1_BLOCK_SIZE),// B 2, // W @@ -45,7 +45,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) NUM_WARPS, // mshr 2, // pipeline latency })) - , dcache_(Cache::Create("Dcache", Cache::Config{ + , dcache_(Cache::Create("dcache", Cache::Config{ log2ceil(DCACHE_SIZE), // C log2ceil(L1_BLOCK_SIZE),// B 2, // W @@ -72,15 +72,6 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) , fetch_latch_("fetch") , decode_latch_("decode") , pending_icache_(arch_.num_warps()) - , active_warps_(1) - , stalled_warps_(0) - , last_schedule_wid_(0) - , issued_instrs_(0) - , committed_instrs_(0) - , csr_tex_unit_(0) - , ecall_(false) - , ebreak_(false) - , perf_mem_pending_reads_(0) { for (int i = 0; i < arch_.num_warps(); ++i) { warps_.at(i) = std::make_shared(this, i); @@ -112,10 +103,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) #endif sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i)); 
dcache_->CoreRspPorts.at(i).bind(&sw->RspIn); - } - - // activate warp0 - warps_.at(0)->setTmask(0, true); + } // memory perf callbacks MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){ @@ -128,9 +116,62 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id) __unused (cycle); --perf_mem_pending_reads_; }); + + this->reset(); } Core::~Core() { + this->cout_flush(); +} + +void Core::reset() { + for (auto& warp : warps_) { + warp->clear(); + } + warps_.at(0)->setTmask(0, true); + active_warps_ = 1; + + for (auto& tex_unit : tex_units_) { + tex_unit.clear(); + } + + for ( auto& barrier : barriers_) { + barrier.reset(); + } + + for (auto& csr : csrs_) { + csr = 0; + } + + for (auto& fcsr : fcsrs_) { + fcsr = 0; + } + + for (auto& ibuf : ibuffers_) { + ibuf.clear(); + } + + scoreboard_.clear(); + fetch_latch_.clear(); + decode_latch_.clear(); + pending_icache_.clear(); + stalled_warps_.reset(); + last_schedule_wid_ = 0; + issued_instrs_ = 0; + committed_instrs_ = 0; + csr_tex_unit_ = 0; + ecall_ = false; + ebreak_ = false; + perf_mem_pending_reads_ = 0; + perf_stats_ = PerfStats(); +} + +void Core::attach_ram(RAM* ram) { + // bind RAM to memory unit + mmu_.attach(*ram, 0, 0xFFFFFFFF); +} + +void Core::cout_flush() { for (auto& buf : print_bufs_) { auto str = buf.second.str(); if (!str.empty()) { @@ -139,17 +180,12 @@ Core::~Core() { } } -void Core::attach_ram(RAM* ram) { - // bind RAM to memory unit - mmu_.attach(*ram, 0, 0xFFFFFFFF); -} - -void Core::step(uint64_t cycle) { - this->commit(cycle); - this->execute(cycle); - this->decode(cycle); - this->fetch(cycle); - this->schedule(cycle); +void Core::tick() { + this->commit(); + this->execute(); + this->decode(); + this->fetch(); + this->schedule(); // update perf counter perf_stats_.mem_latency += perf_mem_pending_reads_; @@ -157,9 +193,7 @@ void Core::step(uint64_t cycle) { DPN(2, std::flush); } -void Core::schedule(uint64_t cycle) { - __unused (cycle); - +void Core::schedule() { bool 
foundSchedule = false; int scheduled_warp = last_schedule_wid_; @@ -181,30 +215,27 @@ void Core::schedule(uint64_t cycle) { // suspend warp until decode stalled_warps_.set(scheduled_warp); - auto& warp = warps_.at(scheduled_warp); - uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_; auto trace = new pipeline_trace_t(uuid, arch_); + auto& warp = warps_.at(scheduled_warp); warp->eval(trace); - DT(3, cycle, "pipeline-schedule: " << *trace); + DT(3, "pipeline-schedule: " << *trace); // advance to fetch stage fetch_latch_.push(trace); } -void Core::fetch(uint64_t cycle) { - __unused (cycle); - +void Core::fetch() { // handle icache reponse auto& icache_rsp_port = icache_->CoreRspPorts.at(0); if (!icache_rsp_port.empty()){ auto& mem_rsp = icache_rsp_port.front(); auto trace = pending_icache_.at(mem_rsp.tag); decode_latch_.push(trace); - DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); + DT(3, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace); pending_icache_.release(mem_rsp.tag); icache_rsp_port.pop(); } @@ -216,16 +247,15 @@ void Core::fetch(uint64_t cycle) { mem_req.addr = trace->PC; mem_req.write = false; mem_req.tag = pending_icache_.allocate(trace); - mem_req.core_id = id_; - icache_->CoreReqPorts.at(0).send(mem_req, 1); - DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); + mem_req.core_id = trace->cid; + mem_req.uuid = trace->uuid; + icache_->CoreReqPorts.at(0).send(mem_req, 1); + DT(3, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); fetch_latch_.pop(); } } -void Core::decode(uint64_t cycle) { - __unused (cycle); - +void Core::decode() { if (decode_latch_.empty()) return; @@ -235,7 +265,7 @@ void Core::decode(uint64_t cycle) { auto& ibuffer = ibuffers_.at(trace->wid); if (ibuffer.full()) { if (!trace->suspend()) { - DT(3, cycle, "*** ibuffer-stall: " << 
*trace); + DT(3, "*** ibuffer-stall: " << *trace); } ++perf_stats_.ibuf_stalls; return; @@ -257,7 +287,7 @@ void Core::decode(uint64_t cycle) { if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH) perf_stats_.branches += active_threads; - DT(3, cycle, "pipeline-decode: " << *trace); + DT(3, "pipeline-decode: " << *trace); // insert to ibuffer ibuffer.push(trace); @@ -265,9 +295,7 @@ void Core::decode(uint64_t cycle) { decode_latch_.pop(); } -void Core::execute(uint64_t cycle) { - __unused (cycle); - +void Core::execute() { // issue ibuffer instructions for (auto& ibuffer : ibuffers_) { if (ibuffer.empty()) @@ -278,7 +306,7 @@ void Core::execute(uint64_t cycle) { // check scoreboard if (scoreboard_.in_use(trace)) { if (!trace->suspend()) { - DTH(3, cycle, "*** scoreboard-stall: dependents={"); + DTH(3, "*** scoreboard-stall: dependents={"); auto uses = scoreboard_.get_uses(trace); for (uint32_t i = 0, n = uses.size(); i < n; ++i) { auto& use = uses.at(i); @@ -297,7 +325,7 @@ void Core::execute(uint64_t cycle) { // update scoreboard scoreboard_.reserve(trace); - DT(3, cycle, "pipeline-issue: " << *trace); + DT(3, "pipeline-issue: " << *trace); // push to execute units auto& exe_unit = exe_units_.at((int)trace->exe_type); @@ -308,9 +336,7 @@ void Core::execute(uint64_t cycle) { } } -void Core::commit(uint64_t cycle) { - __unused (cycle); - +void Core::commit() { // commit completed instructions bool wb = false; for (auto& exe_unit : exe_units_) { @@ -323,7 +349,7 @@ void Core::commit(uint64_t cycle) { wb |= trace->wb; // advance to commit stage - DT(3, cycle, "pipeline-commit: " << *trace); + DT(3, "pipeline-commit: " << *trace); // update scoreboard scoreboard_.release(trace); diff --git a/sim/simx/core.h b/sim/simx/core.h index b9c01383..18c9beb3 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -75,16 +75,14 @@ public: bool running() const; - void step(uint64_t cycle); + void reset(); + + void tick(); Word id() const { return id_; } - 
Warp& warp(int i) { - return *warps_.at(i); - } - const Decoder& decoder() { return decoder_; } @@ -125,14 +123,16 @@ public: private: - void schedule(uint64_t cycle); - void fetch(uint64_t cycle); - void decode(uint64_t cycle); - void execute(uint64_t cycle); - void commit(uint64_t cycle); + void schedule(); + void fetch(); + void decode(); + void execute(); + void commit(); void writeToStdOut(Addr addr, Word data); + void cout_flush(); + Word id_; const ArchDef arch_; const Decoder decoder_; diff --git a/sim/simx/debug.h b/sim/simx/debug.h index 53d2d62a..688eded4 100644 --- a/sim/simx/debug.h +++ b/sim/simx/debug.h @@ -33,15 +33,15 @@ } \ } while(0) -#define DT(lvl, t, x) do { \ +#define DT(lvl, x) do { \ if ((lvl) <= DEBUG_LEVEL) { \ - std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x << std::endl; \ + std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x << std::endl; \ } \ } while(0) -#define DTH(lvl, t, x) do { \ +#define DTH(lvl, x) do { \ if ((lvl) <= DEBUG_LEVEL) { \ - std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x; \ + std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x; \ } \ } while(0) @@ -58,8 +58,8 @@ #define DPH(lvl, x) do {} while(0) #define DPN(lvl, x) do {} while(0) -#define DT(lvl, t, x) do {} while(0) -#define DTH(lvl, t, x) do {} while(0) +#define DT(lvl, x) do {} while(0) +#define DTH(lvl, x) do {} while(0) #define DTN(lvl, x) do {} while(0) #endif \ No newline at end of file diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index d1df2637..efc199d2 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -87,7 +87,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DPN(2, "-"); continue; } - rsdata[t][i] = iRegFile_.at(t)[reg]; + rsdata[t][i] = ireg_file_.at(t)[reg]; DPN(2, std::hex << rsdata[t][i]); } DPN(2, 
"}" << std::endl); @@ -100,7 +100,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DPN(2, "-"); continue; } - rsdata[t][i] = fRegFile_.at(t)[reg]; + rsdata[t][i] = freg_file_.at(t)[reg]; DPN(2, std::hex << rsdata[t][i]); } DPN(2, "}" << std::endl); @@ -460,7 +460,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew); DP(4, "dest: v" << rdest); DP(4, "width" << instr.getVlsWidth()); - auto &vd = vRegFile_.at(rdest); + auto &vd = vreg_file_.at(rdest); switch (instr.getVlsWidth()) { case 6: { // load word and unit strided (not checking for unit stride) @@ -517,7 +517,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { switch (instr.getVlsWidth()) { case 6: { // store word and unit strided (not checking for unit stride) - uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i); + uint32_t value = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i); core_->dcache_write(memAddr, value, 4); DP(4, "store: " << memAddr << " value:" << value); } break; @@ -784,7 +784,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { // predicate mode ThreadMask pred; for (int i = 0; i < num_threads; ++i) { - pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0; + pred[i] = tmask_.test(i) ? 
(ireg_file_.at(i).at(rsrc0) != 0) : 0; } if (pred.any()) { tmask_ &= pred; @@ -819,15 +819,15 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { trace->gpu.type = GpuType::SPLIT; trace->used_iregs.set(rsrc0); trace->fetch_stall = true; - if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) { + if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) { ThreadMask tmask; for (int i = 0; i < num_threads; ++i) { - tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0); + tmask[i] = tmask_.test(i) && !ireg_file_.at(i).at(rsrc0); } DomStackEntry e(tmask, nextPC); - domStack_.push(tmask_); - domStack_.push(e); + dom_stack_.push(tmask_); + dom_stack_.push(e); for (size_t i = 0; i < e.tmask.size(); ++i) { tmask_.set(i, !e.tmask.test(i) && tmask_.test(i)); } @@ -842,7 +842,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DP(3, "*** Unanimous pred"); DomStackEntry e(tmask_); e.unanimous = true; - domStack_.push(e); + dom_stack_.push(e); } } break; case 3: { @@ -850,25 +850,25 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { trace->exe_type = ExeType::GPU; trace->gpu.type = GpuType::JOIN; trace->fetch_stall = true; - if (!domStack_.empty() && domStack_.top().unanimous) { + if (!dom_stack_.empty() && dom_stack_.top().unanimous) { DP(3, "*** Uninimous branch at join"); - tmask_ = domStack_.top().tmask; + tmask_ = dom_stack_.top().tmask; active_ = tmask_.any(); - domStack_.pop(); + dom_stack_.pop(); } else { - if (!domStack_.top().fallThrough) { - nextPC = domStack_.top().PC; + if (!dom_stack_.top().fallThrough) { + nextPC = dom_stack_.top().PC; DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec); } - tmask_ = domStack_.top().tmask; + tmask_ = dom_stack_.top().tmask; active_ = tmask_.any(); DPH(3, "*** Join: New TM="); for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1)); DPN(3, "\n"); - domStack_.pop(); + dom_stack_.pop(); } } break; case 4: { @@ -946,10 +946,10 @@ void Warp::execute(const 
Instr &instr, pipeline_trace_t *trace) { case 0: // vector-vector switch (func6) { case 0: { - auto& vr1 = vRegFile_.at(rsrc0); - auto& vr2 = vRegFile_.at(rsrc1); - auto& vd = vRegFile_.at(rdest); - auto& mask = vRegFile_.at(0); + auto& vr1 = vreg_file_.at(rsrc0); + auto& vr2 = vreg_file_.at(rsrc1); + auto& vd = vreg_file_.at(rdest); + auto& mask = vreg_file_.at(0); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t emask = *(uint8_t *)(mask.data() + i); @@ -990,9 +990,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 24: { // vmseq - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1021,9 +1021,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 25: { // vmsne - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1052,9 +1052,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 26: { // vmsltu - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1083,9 +1083,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 27: { // vmslt - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = 
vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { int8_t first = *(int8_t *)(vr1.data() + i); @@ -1114,9 +1114,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 28: { // vmsleu - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1145,9 +1145,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 29: { // vmsle - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { int8_t first = *(int8_t *)(vr1.data() + i); @@ -1176,9 +1176,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 30: { // vmsgtu - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1207,9 +1207,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 31: { // vmsgt - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { int8_t first = *(int8_t *)(vr1.data() + i); @@ -1242,9 +1242,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { 
switch (func6) { case 24: { // vmandnot - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1288,9 +1288,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 25: { // vmand - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1334,9 +1334,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 26: { // vmor - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1380,9 +1380,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 27: { // vmxor - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1426,9 +1426,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 28: { // vmornot - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for 
(int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1472,9 +1472,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 29: { // vmnand - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1518,9 +1518,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 30: { // vmnor - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1564,9 +1564,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 31: { // vmxnor - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1610,9 +1610,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 37: { // vmul - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1650,9 +1650,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 45: { // vmacc - auto &vr1 = vRegFile_.at(rsrc0); - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = 
vRegFile_.at(rdest); + auto &vr1 = vreg_file_.at(rsrc0); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t first = *(uint8_t *)(vr1.data() + i); @@ -1693,8 +1693,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { case 6: { switch (func6) { case 0: { - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t second = *(uint8_t *)(vr2.data() + i); @@ -1729,8 +1729,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } break; case 37: { // vmul.vx - auto &vr2 = vRegFile_.at(rsrc1); - auto &vd = vRegFile_.at(rdest); + auto &vr2 = vreg_file_.at(rsrc1); + auto &vd = vreg_file_.at(rdest); if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { uint8_t second = *(uint8_t *)(vr2.data() + i); @@ -1805,7 +1805,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DPN(2, "-"); continue; } - iRegFile_.at(t)[rdest] = rddata[t]; + ireg_file_.at(t)[rdest] = rddata[t]; DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); @@ -1820,7 +1820,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { DPN(2, "-"); continue; } - fRegFile_.at(t)[rdest] = rddata[t]; + freg_file_.at(t)[rdest] = rddata[t]; DPN(2, "0x" << std::hex << rddata[t]); } DPN(2, "}" << std::endl); diff --git a/sim/simx/exeunit.cpp b/sim/simx/exeunit.cpp index 3b84ee8a..5a47dc06 100644 --- a/sim/simx/exeunit.cpp +++ b/sim/simx/exeunit.cpp @@ -12,7 +12,7 @@ using namespace vortex; NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {} -void NopUnit::step(uint64_t /*cycle*/) { +void NopUnit::tick() { if (Input.empty()) return; auto trace = Input.front(); @@ -25,26 +25,31 @@ void NopUnit::step(uint64_t /*cycle*/) { LsuUnit::LsuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, 
"LSU") , num_threads_(core->arch().num_threads()) - , pending_dcache_(LSUQ_SIZE) + , pending_rd_reqs_(LSUQ_SIZE) , fence_lock_(false) {} -void LsuUnit::step(uint64_t cycle) { +void LsuUnit::reset() { + pending_rd_reqs_.clear(); + fence_lock_ = false; +} + +void LsuUnit::tick() { // handle dcache response for (uint32_t t = 0; t < num_threads_; ++t) { auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0); if (dcache_rsp_port.empty()) continue; auto& mem_rsp = dcache_rsp_port.front(); - auto& entry = pending_dcache_.at(mem_rsp.tag); + auto& entry = pending_rd_reqs_.at(mem_rsp.tag); auto trace = entry.first; - DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type + DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); assert(entry.second); --entry.second; // track remaining blocks if (0 == entry.second) { Output.send(trace, 1); - pending_dcache_.release(mem_rsp.tag); + pending_rd_reqs_.release(mem_rsp.tag); } dcache_rsp_port.pop(); } @@ -55,26 +60,26 @@ void LsuUnit::step(uint64_t cycle) { if (smem_rsp_port.empty()) continue; auto& mem_rsp = smem_rsp_port.front(); - auto& entry = pending_dcache_.at(mem_rsp.tag); + auto& entry = pending_rd_reqs_.at(mem_rsp.tag); auto trace = entry.first; - DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type + DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); assert(entry.second); --entry.second; // track remaining blocks if (0 == entry.second) { Output.send(trace, 1); - pending_dcache_.release(mem_rsp.tag); + pending_rd_reqs_.release(mem_rsp.tag); } smem_rsp_port.pop(); } if (fence_lock_) { // wait for all pending memory operations to complete - if (!pending_dcache_.empty()) + if (!pending_rd_reqs_.empty()) return; Output.send(fence_state_, 1); fence_lock_ = false; - DT(3, cycle, "fence-unlock: " << fence_state_); + DT(3, "fence-unlock: " << fence_state_); } // 
check input queue @@ -87,17 +92,17 @@ void LsuUnit::step(uint64_t cycle) { // schedule fence lock fence_state_ = trace; fence_lock_ = true; - DT(3, cycle, "fence-lock: " << *trace); + DT(3, "fence-lock: " << *trace); // remove input auto time = Input.pop(); - core_->perf_stats_.lsu_stalls += (cycle - time); + core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time); return; } // check pending queue capacity - if (pending_dcache_.full()) { + if (pending_rd_reqs_.full()) { if (!trace->suspend()) { - DT(3, cycle, "*** lsu-queue-stall: " << *trace); + DT(3, "*** lsu-queue-stall: " << *trace); } return; } else { @@ -130,7 +135,7 @@ void LsuUnit::step(uint64_t cycle) { } } - auto tag = pending_dcache_.allocate({trace, valid_addrs}); + auto tag = pending_rd_reqs_.allocate({trace, valid_addrs}); for (uint32_t t = 0; t < num_threads_; ++t) { if (!trace->tmask.test(t)) @@ -145,15 +150,16 @@ void LsuUnit::step(uint64_t cycle) { mem_req.write = is_write; mem_req.non_cacheable = (type == AddrType::IO); mem_req.tag = tag; - mem_req.core_id = core_->id(); + mem_req.core_id = trace->cid; + mem_req.uuid = trace->uuid; if (type == AddrType::Shared) { core_->shared_mem_->Inputs.at(t).send(mem_req, 2); - DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace); } else { dcache_req_port.send(mem_req, 2); - DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag << ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace); } @@ -163,20 +169,20 @@ void LsuUnit::step(uint64_t cycle) { // do not wait on writes if (is_write) { - pending_dcache_.release(tag); + pending_rd_reqs_.release(tag); Output.send(trace, 1); } // remove input auto time = Input.pop(); - 
core_->perf_stats_.lsu_stalls += (cycle - time); + core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time); } /////////////////////////////////////////////////////////////////////////////// AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {} -void AluUnit::step(uint64_t cycle) { +void AluUnit::tick() { if (Input.empty()) return; auto trace = Input.front(); @@ -196,33 +202,33 @@ void AluUnit::step(uint64_t cycle) { default: std::abort(); } - DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace); + DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace); if (trace->fetch_stall) { core_->stalled_warps_.reset(trace->wid); } auto time = Input.pop(); - core_->perf_stats_.alu_stalls += (cycle - time); + core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time); } /////////////////////////////////////////////////////////////////////////////// CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {} -void CsrUnit::step(uint64_t cycle) { +void CsrUnit::tick() { if (Input.empty()) return; auto trace = Input.front(); Output.send(trace, 1); auto time = Input.pop(); - core_->perf_stats_.csr_stalls += (cycle - time); - DT(3, cycle, "pipeline-execute: op=CSR, " << *trace); + core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time); + DT(3, "pipeline-execute: op=CSR, " << *trace); } /////////////////////////////////////////////////////////////////////////////// FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {} -void FpuUnit::step(uint64_t cycle) { +void FpuUnit::tick() { if (Input.empty()) return; auto trace = Input.front(); @@ -245,9 +251,9 @@ void FpuUnit::step(uint64_t cycle) { default: std::abort(); } - DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace); + DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace); auto time = Input.pop(); - core_->perf_stats_.fpu_stalls 
+= (cycle - time); + core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time); } /////////////////////////////////////////////////////////////////////////////// @@ -257,8 +263,12 @@ GpuUnit::GpuUnit(const SimContext& ctx, Core* core) , num_threads_(core->arch().num_threads()) , pending_tex_reqs_(TEXQ_SIZE) {} + +void GpuUnit::reset() { + pending_tex_reqs_.clear(); +} -void GpuUnit::step(uint64_t cycle) { +void GpuUnit::tick() { #ifdef EXT_TEX_ENABLE // handle memory response for (uint32_t t = 0; t < num_threads_; ++t) { @@ -268,7 +278,7 @@ void GpuUnit::step(uint64_t cycle) { auto& mem_rsp = dcache_rsp_port.front(); auto& entry = pending_tex_reqs_.at(mem_rsp.tag); auto trace = entry.first; - DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace); + DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace); assert(entry.second); --entry.second; // track remaining blocks if (0 == entry.second) { @@ -312,7 +322,7 @@ void GpuUnit::step(uint64_t cycle) { issued = true; break; case GpuType::TEX: - if (this->processTexRequest(cycle, trace)) + if (this->processTexRequest(trace)) issued = true; break; default: @@ -320,22 +330,20 @@ void GpuUnit::step(uint64_t cycle) { } if (issued) { - DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace); + DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace); if (trace->fetch_stall) { core_->stalled_warps_.reset(trace->wid); } auto time = Input.pop(); - core_->perf_stats_.fpu_stalls += (cycle - time); + core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time); } } -bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { - __unused (cycle); - +bool GpuUnit::processTexRequest(pipeline_trace_t* trace) { // check pending queue capacity if (pending_tex_reqs_.full()) { if (!trace->suspend()) { - DT(3, cycle, "*** tex-queue-stall: " << *trace); + DT(3, "*** tex-queue-stall: " << *trace); } return false; } else { @@ 
-356,14 +364,15 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) { continue; auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1); - for (auto mem_addr : trace->mem_addrs.at(t)) { + for (auto& mem_addr : trace->mem_addrs.at(t)) { MemReq mem_req; mem_req.addr = mem_addr.addr; mem_req.write = (trace->lsu.type == LsuType::STORE); mem_req.tag = tag; mem_req.core_id = core_->id(); + mem_req.uuid = trace->uuid; dcache_req_port.send(mem_req, 3); - DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag + DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag << ", tid=" << t << ", "<< trace); ++ core_->perf_stats_.tex_reads; ++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size(); diff --git a/sim/simx/exeunit.h b/sim/simx/exeunit.h index bea714ea..78990369 100644 --- a/sim/simx/exeunit.h +++ b/sim/simx/exeunit.h @@ -18,10 +18,14 @@ public: , Input(this) , Output(this) , core_(core) - {} + {} virtual ~ExeUnit() {} + virtual void reset() {} + + virtual void tick() = 0; + protected: Core* core_; }; @@ -32,7 +36,7 @@ class NopUnit : public ExeUnit { public: NopUnit(const SimContext& ctx, Core*); - void step(uint64_t cycle); + void tick(); }; /////////////////////////////////////////////////////////////////////////////// @@ -40,14 +44,16 @@ public: class LsuUnit : public ExeUnit { private: uint32_t num_threads_; - HashTable> pending_dcache_; + HashTable> pending_rd_reqs_; pipeline_trace_t* fence_state_; bool fence_lock_; public: LsuUnit(const SimContext& ctx, Core*); - void step(uint64_t cycle); + void reset(); + + void tick(); }; /////////////////////////////////////////////////////////////////////////////// @@ -56,7 +62,7 @@ class AluUnit : public ExeUnit { public: AluUnit(const SimContext& ctx, Core*); - void step(uint64_t cycle); + void tick(); }; /////////////////////////////////////////////////////////////////////////////// @@ -65,7 +71,7 @@ class CsrUnit : public ExeUnit { public: 
CsrUnit(const SimContext& ctx, Core*); - void step(uint64_t cycle); + void tick(); }; /////////////////////////////////////////////////////////////////////////////// @@ -74,7 +80,7 @@ class FpuUnit : public ExeUnit { public: FpuUnit(const SimContext& ctx, Core*); - void step(uint64_t cycle); + void tick(); }; /////////////////////////////////////////////////////////////////////////////// @@ -84,12 +90,14 @@ private: uint32_t num_threads_; HashTable> pending_tex_reqs_; - bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace); + bool processTexRequest(pipeline_trace_t* trace); public: GpuUnit(const SimContext& ctx, Core*); + + void reset(); - void step(uint64_t cycle); + void tick(); }; } \ No newline at end of file diff --git a/sim/simx/ibuffer.h b/sim/simx/ibuffer.h index b4c6f51e..7362195f 100644 --- a/sim/simx/ibuffer.h +++ b/sim/simx/ibuffer.h @@ -34,6 +34,11 @@ public: void pop() { return entries_.pop(); } + + void clear() { + std::queue empty; + std::swap(entries_, empty ); + } }; } \ No newline at end of file diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index 159fdab6..89999c8f 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -6,6 +6,8 @@ #include #include #include "processor.h" +#include "archdef.h" +#include "mem.h" #include "constants.h" #include #include "args.h" @@ -50,11 +52,14 @@ int main(int argc, char **argv) { std::cout << "Running " << imgFileName << "..." 
<< std::endl; - if (!SimPlatform::instance().initialize()) - return -1; - { + // create processor configuation + ArchDef arch(archStr, num_cores, num_warps, num_threads); + + // create memory module RAM ram(RAM_PAGE_SIZE); + + // load program { std::string program_ext(fileExtension(imgFileName.c_str())); if (program_ext == "bin") { @@ -67,27 +72,15 @@ int main(int argc, char **argv) { } } - ArchDef arch(archStr, num_cores, num_warps, num_threads); - auto processor = Processor::Create(arch); - processor->attach_ram(&ram); - - // setup memory simulator - auto memsim = MemSim::Create(MemSim::Config{ - DRAM_CHANNELS, - arch.num_cores() - }); - processor->MemReqPort.bind(&memsim->MemReqPort); - memsim->MemRspPort.bind(&processor->MemRspPort); + // create processor + Processor processor(arch); + + // attach memory module + processor.attach_ram(&ram); // run simulation - for (;;) { - SimPlatform::instance().step(); - if (processor->check_exit(&exitcode)) - break; - }; - } - - SimPlatform::instance().finalize(); + processor.run(); + } if (riscv_test) { if (1 == exitcode) { diff --git a/sim/simx/memsim.cpp b/sim/simx/memsim.cpp index 74979bc8..a69df4b9 100644 --- a/sim/simx/memsim.cpp +++ b/sim/simx/memsim.cpp @@ -13,6 +13,7 @@ DISABLE_WARNING_POP #include "constants.h" #include "types.h" +#include "debug.h" using namespace vortex; @@ -51,37 +52,50 @@ public: return perf_stats_; } - void dram_callback(ramulator::Request& req, uint32_t tag) { - MemRsp mem_rsp{tag, (uint32_t)req.coreid}; + void dram_callback(ramulator::Request& req, uint32_t tag, uint64_t uuid) { + if (req.type == ramulator::Request::Type::WRITE) + return; + MemRsp mem_rsp{tag, (uint32_t)req.coreid, uuid}; simobject_->MemRspPort.send(mem_rsp, 1); + DT(3, simobject_->name() << "-" << mem_rsp); } - void step(uint64_t /*cycle*/) { - dram_->tick(); + void reset() { + perf_stats_ = PerfStats(); + } + + void tick() { + if (MEM_CYCLE_RATIO > 0) { + auto cycle = SimPlatform::instance().cycles(); + if ((cycle % 
MEM_CYCLE_RATIO) == 0) + dram_->tick(); + } else { + for (int i = MEM_CYCLE_RATIO; i <= 0; ++i) + dram_->tick(); + } if (simobject_->MemReqPort.empty()) return; auto& mem_req = simobject_->MemReqPort.front(); - if (mem_req.write) { - ramulator::Request dram_req( - mem_req.addr, - ramulator::Request::Type::WRITE, - mem_req.core_id - ); - dram_->send(dram_req); + ramulator::Request dram_req( + mem_req.addr, + mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ, + std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid), + mem_req.core_id + ); + + if (!dram_->send(dram_req)) + return; + + if (mem_req.write) { ++perf_stats_.writes; } else { - ramulator::Request dram_req( - mem_req.addr, - ramulator::Request::Type::READ, - std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag), - mem_req.core_id - ); - dram_->send(dram_req); ++perf_stats_.reads; } + + DT(3, simobject_->name() << "-" << mem_req); simobject_->MemReqPort.pop(); } @@ -89,8 +103,8 @@ public: /////////////////////////////////////////////////////////////////////////////// -MemSim::MemSim(const SimContext& ctx, const Config& config) - : SimObject(ctx, "MemSim") +MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config) + : SimObject(ctx, name) , MemReqPort(this) , MemRspPort(this) , impl_(new Impl(this, config)) @@ -100,6 +114,10 @@ MemSim::~MemSim() { delete impl_; } -void MemSim::step(uint64_t cycle) { - impl_->step(cycle); +void MemSim::reset() { + impl_->reset(); +} + +void MemSim::tick() { + impl_->tick(); } \ No newline at end of file diff --git a/sim/simx/memsim.h b/sim/simx/memsim.h index 24918a2e..26e21a34 100644 --- a/sim/simx/memsim.h +++ b/sim/simx/memsim.h @@ -26,10 +26,12 @@ public: SimPort MemReqPort; SimPort MemRspPort; - MemSim(const SimContext& ctx, const Config& config); + MemSim(const SimContext& ctx, const char* name, const Config& config); ~MemSim(); - void step(uint64_t cycle); + void reset(); + + 
void tick(); const PerfStats& perf_stats() const; diff --git a/sim/simx/pipeline.h b/sim/simx/pipeline.h index 9ac09352..18d54e21 100644 --- a/sim/simx/pipeline.h +++ b/sim/simx/pipeline.h @@ -98,14 +98,40 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) return os; } -class PipelineLatch : public Queue { +class PipelineLatch { protected: const char* name_; + std::queue queue_; public: PipelineLatch(const char* name = nullptr) : name_(name) {} + + bool empty() const { + return queue_.empty(); + } + + pipeline_trace_t* front() { + return queue_.front(); + } + + pipeline_trace_t* back() { + return queue_.back(); + } + + void push(pipeline_trace_t* value) { + queue_.push(value); + } + + void pop() { + queue_.pop(); + } + + void clear() { + std::queue empty; + std::swap(queue_, empty ); + } }; } \ No newline at end of file diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index bfda986e..a7314687 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -1,11 +1,11 @@ #include "processor.h" +#include "core.h" #include "constants.h" using namespace vortex; class Processor::Impl { private: - Processor* simobject_; std::vector cores_; std::vector l2caches_; std::vector::Ptr> l2_mem_switches_; @@ -13,12 +13,13 @@ private: Switch::Ptr l3_mem_switch_; public: - Impl(Processor* simobject, const ArchDef& arch) - : simobject_(simobject) - , cores_(arch.num_cores()) + Impl(const ArchDef& arch) + : cores_(arch.num_cores()) , l2caches_(NUM_CLUSTERS) , l2_mem_switches_(NUM_CLUSTERS) { + SimPlatform::instance().initialize(); + uint32_t num_cores = arch.num_cores(); uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; @@ -26,12 +27,15 @@ public: for (uint32_t i = 0; i < num_cores; ++i) { cores_.at(i) = Core::Create(arch, i); } - - std::vector*> mem_req_ports(1); - std::vector*> mem_rsp_ports(1); - mem_req_ports.at(0) = &simobject_->MemReqPort; - mem_rsp_ports.at(0) = &simobject_->MemRspPort; + // setup memory simulator + auto 
memsim = MemSim::Create("dram", MemSim::Config{ + MEMORY_BANKS, + arch.num_cores() + }); + + std::vector*> mem_req_ports(1, &memsim->MemReqPort); + std::vector*> mem_rsp_ports(1, &memsim->MemRspPort); if (L3_ENABLE) { l3cache_ = Cache::Create("l3cache", Cache::Config{ @@ -39,7 +43,7 @@ public: log2ceil(MEM_BLOCK_SIZE), // B 2, // W 0, // A - 32, // address bits + 32, // address bits L3_NUM_BANKS, // number of banks L3_NUM_PORTS, // number of ports NUM_CLUSTERS, // request size @@ -122,10 +126,8 @@ public: } } - ~Impl() {} - - void step(uint64_t cycle) { - __unused (cycle); + ~Impl() { + SimPlatform::instance().finalize(); } void attach_ram(RAM* ram) { @@ -134,28 +136,33 @@ public: } } - bool check_exit(int* exitcode) { - bool running = false; - for (auto& core : cores_) { - if (core->running()) { - running = true; + int run() { + SimPlatform::instance().reset(); + bool running; + int exitcode = 0; + do { + SimPlatform::instance().tick(); + running = false; + for (auto& core : cores_) { + if (core->running()) { + running = true; + } + if (core->check_exit()) { + exitcode = core->getIRegValue(3); + running = false; + break; + } } - if (core->check_exit()) { - *exitcode = core->getIRegValue(3); - return true; - } - } - return !running; + } while (running); + + return exitcode; } }; /////////////////////////////////////////////////////////////////////////////// -Processor::Processor(const SimContext& ctx, const ArchDef& arch) - : SimObject(ctx, "Vortex") - , MemReqPort(this) - , MemRspPort(this) - , impl_(new Impl(this, arch)) +Processor::Processor(const ArchDef& arch) + : impl_(new Impl(arch)) {} Processor::~Processor() { @@ -166,10 +173,6 @@ void Processor::attach_ram(RAM* mem) { impl_->attach_ram(mem); } -bool Processor::check_exit(int* exitcode) { - return impl_->check_exit(exitcode); -} - -void Processor::step(uint64_t cycle) { - impl_->step(cycle); +int Processor::run() { + return impl_->run(); } \ No newline at end of file diff --git a/sim/simx/processor.h 
b/sim/simx/processor.h index cfcde4da..46bcd735 100644 --- a/sim/simx/processor.h +++ b/sim/simx/processor.h @@ -1,22 +1,18 @@ #pragma once -#include "core.h" - namespace vortex { -class Processor : public SimObject { +class ArchDef; +class RAM; + +class Processor { public: - SimPort MemReqPort; - SimPort MemRspPort; - - Processor(const SimContext& ctx, const ArchDef& arch); + Processor(const ArchDef& arch); ~Processor(); void attach_ram(RAM* mem); - bool check_exit(int* exitcode); - - void step(uint64_t cycle); + int run(); private: class Impl; diff --git a/sim/simx/scoreboard.h b/sim/simx/scoreboard.h index b36d60b3..c468860d 100644 --- a/sim/simx/scoreboard.h +++ b/sim/simx/scoreboard.h @@ -24,11 +24,16 @@ public: , in_use_fregs_(arch.num_warps()) , in_use_vregs_(arch.num_warps()) { - for (int w = 0; w < arch.num_warps(); ++w) { - in_use_iregs_.at(w).reset(); - in_use_fregs_.at(w).reset(); - in_use_vregs_.at(w).reset(); + this->clear(); + } + + void clear() { + for (int i = 0, n = in_use_iregs_.size(); i < n; ++i) { + in_use_iregs_.at(i).reset(); + in_use_fregs_.at(i).reset(); + in_use_vregs_.at(i).reset(); } + owners_.clear(); } bool in_use(pipeline_trace_t* state) const { diff --git a/sim/simx/sharedmem.h b/sim/simx/sharedmem.h index 6106ad25..c76a29d3 100644 --- a/sim/simx/sharedmem.h +++ b/sim/simx/sharedmem.h @@ -45,7 +45,11 @@ public: virtual ~SharedMem() {} - void step(uint64_t /*cycle*/) { + void reset() { + perf_stats_ = PerfStats(); + } + + void tick() { std::vector in_used_banks(config_.num_banks); for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) { auto& core_req_port = this->Inputs.at(req_id); diff --git a/sim/simx/tex_unit.cpp b/sim/simx/tex_unit.cpp index 8dedef38..763f37a6 100644 --- a/sim/simx/tex_unit.cpp +++ b/sim/simx/tex_unit.cpp @@ -16,6 +16,12 @@ TexUnit::TexUnit(Core* core) : core_(core) {} TexUnit::~TexUnit() {} +void TexUnit::clear() { + for (auto& state : states_) { + state = 0; + } +} + uint32_t 
TexUnit::get_state(uint32_t state) { return states_.at(state); } diff --git a/sim/simx/tex_unit.h b/sim/simx/tex_unit.h index b41cd8c7..5bca8098 100644 --- a/sim/simx/tex_unit.h +++ b/sim/simx/tex_unit.h @@ -11,6 +11,8 @@ public: TexUnit(Core* core); ~TexUnit(); + void clear(); + uint32_t get_state(uint32_t state); void set_state(uint32_t state, uint32_t value); diff --git a/sim/simx/types.h b/sim/simx/types.h index 67a14b5d..9177dba4 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -213,67 +213,48 @@ struct MemReq { bool non_cacheable; uint32_t tag; uint32_t core_id; + uint64_t uuid; MemReq(uint64_t _addr = 0, bool _write = false, bool _non_cacheable = false, uint64_t _tag = 0, - uint32_t _core_id = 0 + uint32_t _core_id = 0, + uint64_t _uuid = 0 ) : addr(_addr) , write(_write) , non_cacheable(_non_cacheable) , tag(_tag) , core_id(_core_id) + , uuid(_uuid) {} }; +inline std::ostream &operator<<(std::ostream &os, const MemReq& req) { + os << "mem-" << (req.write ? "wr" : "rd") << ": "; + os << "addr=" << req.addr << ", tag=" << req.tag << ", core_id=" << req.core_id; + os << " (#" << std::dec << req.uuid << ")"; + return os; +} + +/////////////////////////////////////////////////////////////////////////////// + struct MemRsp { uint64_t tag; uint32_t core_id; - MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0) + uint64_t uuid; + MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0) : tag (_tag) , core_id(_core_id) + , uuid(_uuid) {} }; -/////////////////////////////////////////////////////////////////////////////// - -template -class Queue { -protected: - std::queue queue_; - -public: - Queue() {} - - bool empty() const { - return queue_.empty(); - } - - const T& front() const { - return queue_.front(); - } - - T& front() { - return queue_.front(); - } - - const T& back() const { - return queue_.back(); - } - - T& back() { - return queue_.back(); - } - - void push(const T& value) { - queue_.push(value); - } - - void pop() { - queue_.pop(); - } 
-}; +inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) { + os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id; + os << " (#" << std::dec << rsp.uuid << ")"; + return os; +} /////////////////////////////////////////////////////////////////////////////// @@ -337,6 +318,14 @@ public: entry.first = false; --size_; } + + void clear() { + for (uint32_t i = 0, n = entries_.size(); i < n; ++i) { + auto& entry = entries_.at(i); + entry.first = false; + } + size_ = 0; + } }; /////////////////////////////////////////////////////////////////////////////// @@ -376,7 +365,11 @@ public: } } - void step(uint64_t /*cycle*/) { + void reset() { + cursor_ = 0; + } + + void tick() { if (ReqIn.size() == 1) return; diff --git a/sim/simx/warp.cpp b/sim/simx/warp.cpp index df0c0e75..b05b1246 100644 --- a/sim/simx/warp.cpp +++ b/sim/simx/warp.cpp @@ -13,12 +13,28 @@ using namespace vortex; Warp::Warp(Core *core, Word id) : id_(id) , core_(core) - , active_(false) - , PC_(STARTUP_ADDR) - , tmask_(0) { - iRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); - fRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); - vRegFile_.resize(core_->arch().num_regs(), std::vector(core_->arch().vsize(), 0)); + , ireg_file_(core->arch().num_threads(), std::vector(core->arch().num_regs())) + , freg_file_(core->arch().num_threads(), std::vector(core->arch().num_regs())) + , vreg_file_(core->arch().num_threads(), std::vector(core->arch().vsize())) +{ + this->clear(); +} + +void Warp::clear() { + active_ = false; + PC_ = STARTUP_ADDR; + tmask_.reset(); + for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) { + for (auto& reg : ireg_file_.at(i)) { + reg = 0; + } + for (auto& reg : freg_file_.at(i)) { + reg = 0; + } + for (auto& reg : vreg_file_.at(i)) { + reg = 0; + } + } } void Warp::eval(pipeline_trace_t *trace) { @@ -55,7 +71,7 @@ void Warp::eval(pipeline_trace_t *trace) { for (int i = 0; i < 
core_->arch().num_regs(); ++i) { DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); for (int j = 0; j < core_->arch().num_threads(); ++j) { - DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_.at(j).at(i) << std::setfill(' ') << ' '); + DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' '); } DPN(4, std::endl); } diff --git a/sim/simx/warp.h b/sim/simx/warp.h index c5a54205..9e9970f3 100644 --- a/sim/simx/warp.h +++ b/sim/simx/warp.h @@ -41,6 +41,8 @@ struct vtype { class Warp { public: Warp(Core *core, Word id); + + void clear(); bool active() const { return active_; @@ -84,7 +86,7 @@ public: } Word getIRegValue(int reg) const { - return iRegFile_.at(0).at(reg); + return ireg_file_.at(0).at(reg); } void eval(pipeline_trace_t *); @@ -100,10 +102,10 @@ private: Word PC_; ThreadMask tmask_; - std::vector> iRegFile_; - std::vector> fRegFile_; - std::vector> vRegFile_; - std::stack domStack_; + std::vector> ireg_file_; + std::vector> freg_file_; + std::vector> vreg_file_; + std::stack dom_stack_; struct vtype vtype_; int vl_; diff --git a/sim/vlsim/opae_sim.cpp b/sim/vlsim/opae_sim.cpp index d165dba6..ff632bf4 100644 --- a/sim/vlsim/opae_sim.cpp +++ b/sim/vlsim/opae_sim.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #ifndef MEMORY_BANKS @@ -33,8 +34,12 @@ #endif #endif +#ifndef MEM_CYCLE_RATIO +#define MEM_CYCLE_RATIO -1 +#endif + #undef MEM_BLOCK_SIZE -#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8) +#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8) #define CACHE_BLOCK_SIZE 64 @@ -43,8 +48,6 @@ #define CCI_RQ_SIZE 16 #define CCI_WQ_SIZE 16 -#define ENABLE_MEM_STALLS - #ifndef TRACE_START_TIME #define TRACE_START_TIME 0ull #endif @@ -144,7 +147,7 @@ public: future_ = std::async(std::launch::async, [&]{ while (!stop_) { std::lock_guard guard(mutex_); - this->step(); + this->tick(); } }); } @@ -206,7 +209,7 @@ 
public: device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4; device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; - this->step(); + this->tick(); device_->vcp2af_sRxPort_c0_mmioRdValid = 0; assert(device_->af2cp_sTxPort_c2_mmioRdValid); *value = device_->af2cp_sTxPort_c2_data; @@ -220,7 +223,7 @@ public: device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1; device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0; memcpy(device_->vcp2af_sRxPort_c0_data, &value, 8); - this->step(); + this->tick(); device_->vcp2af_sRxPort_c0_mmioWrValid = 0; } @@ -257,17 +260,29 @@ private: Verilated::assertOn(true); } - void step() { + void tick() { this->sRxPort_bus(); this->sTxPort_bus(); this->avs_bus(); + + if (!dram_queue_.empty()) { + if (dram_->send(dram_queue_.front())) + dram_queue_.pop(); + } device_->clk = 0; this->eval(); device_->clk = 1; this->eval(); - dram_->tick(); + if (MEM_CYCLE_RATIO > 0) { + auto cycle = timestamp / 2; + if ((cycle % MEM_CYCLE_RATIO) == 0) + dram_->tick(); + } else { + for (int i = MEM_CYCLE_RATIO; i <= 0; ++i) + dram_->tick(); + } #ifndef NDEBUG fflush(stdout); @@ -403,7 +418,7 @@ private: ramulator::Request::Type::WRITE, 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } if (device_->avs_read[b]) { @@ -431,7 +446,7 @@ private: }, placeholders::_1, mem_req), 0 ); - dram_->send(dram_req); + dram_queue_.push(dram_req); } device_->avs_waitrequest[b] = false; @@ -480,6 +495,8 @@ private: ramulator::Gem5Wrapper* dram_; + std::queue dram_queue_; + Vvortex_afu_shim *device_; #ifdef VCD_OUTPUT VerilatedVcdC *trace_; From 0e2de4f13a154a28891c9a441beebea7d33df982 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 9 Dec 2021 04:54:10 -0500 Subject: [PATCH 26/27] prefetch test fixes --- sim/simx/decode.cpp | 1 + sim/simx/execute.cpp | 40 ++++++++++++++--------------- tests/regression/prefetch/Makefile | 2 +- tests/regression/prefetch/kernel.c | 41 ++++++++++++++++++++++-------- 
tests/regression/prefetch/main.cpp | 2 +- 5 files changed, 53 insertions(+), 33 deletions(-) diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp index f890d2f9..86e30266 100644 --- a/sim/simx/decode.cpp +++ b/sim/simx/decode.cpp @@ -195,6 +195,7 @@ static const char* op_string(const Instr &instr) { case 2: return "SPLIT"; case 3: return "JOIN"; case 4: return "BAR"; + case 5: return "PREFETCH"; default: std::abort(); } diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index efc199d2..5df72c6f 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -425,11 +425,11 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; - Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned + Word mem_addr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; - Word data_read = core_->dcache_read(memAddr, 4); - trace->mem_addrs.at(t).push_back({memAddr, 4}); - DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); + Word data_read = core_->dcache_read(mem_addr, 4); + trace->mem_addrs.at(t).push_back({mem_addr, 4}); + DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << data_read); switch (func3) { case 0: // LBI @@ -465,10 +465,10 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { case 6: { // load word and unit strided (not checking for unit stride) for (int i = 0; i < vl_; i++) { - Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); - DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr); - Word data_read = core_->dcache_read(memAddr, 4); - DP(4, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); + Word mem_addr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); + DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << mem_addr); + Word data_read = core_->dcache_read(mem_addr, 4); + DP(4, "Mem addr: " << 
std::hex << mem_addr << " Data read " << data_read); int *result_ptr = (int *)(vd.data() + i); *result_ptr = data_read; } @@ -490,21 +490,21 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; - Word memAddr = rsdata[t][0] + immsrc; - trace->mem_addrs.at(t).push_back({memAddr, (1u << func3)}); - DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + Word mem_addr = rsdata[t][0] + immsrc; + trace->mem_addrs.at(t).push_back({mem_addr, (1u << func3)}); + DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr); switch (func3) { case 0: // SB - core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1); + core_->dcache_write(mem_addr, rsdata[t][1] & 0x000000FF, 1); break; case 1: // SH - core_->dcache_write(memAddr, rsdata[t][1], 2); + core_->dcache_write(mem_addr, rsdata[t][1], 2); break; case 2: // SW - core_->dcache_write(memAddr, rsdata[t][1], 4); + core_->dcache_write(mem_addr, rsdata[t][1], 4); break; default: std::abort(); @@ -512,14 +512,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } } else { for (int i = 0; i < vl_; i++) { - Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8); - DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + Word mem_addr = rsdata[i][0] + (i * vtype_.vsew / 8); + DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr); switch (instr.getVlsWidth()) { case 6: { // store word and unit strided (not checking for unit stride) uint32_t value = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i); - core_->dcache_write(memAddr, value, 4); - DP(4, "store: " << memAddr << " value:" << value); + core_->dcache_write(mem_addr, value, 4); + DP(4, "store: " << mem_addr << " value:" << value); } break; default: std::abort(); @@ -888,8 +888,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; - int addr = rsdata[t][0]; - printf("*** PREFETCHED %d 
***\n", addr); + auto mem_addr = rsdata[t][0]; + trace->mem_addrs.at(t).push_back({mem_addr, 4}); } } break; default: diff --git a/tests/regression/prefetch/Makefile b/tests/regression/prefetch/Makefile index 0627bd36..af58821c 100644 --- a/tests/regression/prefetch/Makefile +++ b/tests/regression/prefetch/Makefile @@ -2,7 +2,7 @@ RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain VORTEX_DRV_PATH ?= $(realpath ../../../driver) VORTEX_RT_PATH ?= $(realpath ../../../runtime) -OPTS ?= -n64 +OPTS ?= -n32 VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/regression/prefetch/kernel.c b/tests/regression/prefetch/kernel.c index 9136592b..b852f582 100644 --- a/tests/regression/prefetch/kernel.c +++ b/tests/regression/prefetch/kernel.c @@ -1,24 +1,43 @@ #include #include #include +#include #include "common.h" -void kernel_body(int task_id, const kernel_arg_t* arg) { - uint32_t count = arg->task_size; - int32_t* src0_ptr = (int32_t*)arg->src0_ptr; - int32_t* src1_ptr = (int32_t*)arg->src1_ptr; - int32_t* dst_ptr = (int32_t*)arg->dst_ptr; - +#define BLOCK_SIZE 64 + +void kernel_body(int task_id, kernel_arg_t* arg) { + uint32_t count = arg->task_size; uint32_t offset = task_id * count; + uint32_t num_blocks = (count * 4 + BLOCK_SIZE-1) / BLOCK_SIZE; + + int32_t* src0_ptr = (int32_t*)arg->src0_ptr + offset; + int32_t* src1_ptr = (int32_t*)arg->src1_ptr + offset; + int32_t* dst_ptr = (int32_t*)arg->dst_ptr + offset; + + uint32_t src0_end = (uint32_t)(src0_ptr + count); + uint32_t src1_end = (uint32_t)(src1_ptr + count); for (uint32_t i = 0; i < count; ++i) { - dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i]; - vx_prefetch((uint32_t)(src0_ptr) + offset + i); - vx_prefetch((uint32_t)(src1_ptr) + offset + i); + dst_ptr[i] = src0_ptr[i] + src1_ptr[i]; + + uint32_t src0_mask = ((uint32_t)(src0_ptr + i)) % BLOCK_SIZE; + uint32_t src0_next = (uint32_t)(src0_ptr + i + BLOCK_SIZE/4); + if 
(src0_mask == 0 && src0_next < src0_end) { + //vx_printf("src0_next=%d\n", src0_next); + vx_prefetch(src0_next); + } + + uint32_t src1_mask = ((uint32_t)(src1_ptr + i)) % BLOCK_SIZE; + uint32_t src1_next = (uint32_t)(src1_ptr + i + BLOCK_SIZE/4); + if (src1_mask == 0 && src1_next < src1_end) { + //vx_printf("src1_next=%d\n", src1_next); + vx_prefetch(src1_next); + } } } void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_tasks, kernel_body, arg); + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); } \ No newline at end of file diff --git a/tests/regression/prefetch/main.cpp b/tests/regression/prefetch/main.cpp index 2961b517..8be0d2a4 100644 --- a/tests/regression/prefetch/main.cpp +++ b/tests/regression/prefetch/main.cpp @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); From d7737542e4c57fe564c01b76457defc1068ec78e Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 9 Dec 2021 20:43:22 -0500 Subject: [PATCH 27/27] cache uuid support --- hw/rtl/VX_alu_unit.sv | 6 +-- hw/rtl/VX_csr_data.sv | 5 ++- hw/rtl/VX_csr_unit.sv | 2 +- hw/rtl/VX_define.vh | 12 +++--- hw/rtl/VX_dispatch.sv | 10 ++--- hw/rtl/VX_fpu_unit.sv | 6 +-- hw/rtl/VX_gpu_unit.sv | 4 +- hw/rtl/VX_ibuffer.sv | 2 +- hw/rtl/VX_icache_stage.sv | 27 +++---------- hw/rtl/VX_lsu_unit.sv | 47 ++++++++--------------- hw/rtl/VX_muldiv.sv | 18 ++++----- hw/rtl/VX_warp_sched.sv | 6 +-- hw/rtl/cache/VX_bank.sv | 12 +++--- hw/rtl/cache/VX_cache_define.vh | 2 +- hw/rtl/cache/VX_data_access.sv | 4 +- 
hw/rtl/cache/VX_miss_resrv.sv | 12 +++--- hw/rtl/cache/VX_shared_mem.sv | 8 ++-- hw/rtl/cache/VX_tag_access.sv | 4 +- hw/rtl/interfaces/VX_alu_req_if.sv | 2 +- hw/rtl/interfaces/VX_commit_if.sv | 2 +- hw/rtl/interfaces/VX_csr_req_if.sv | 2 +- hw/rtl/interfaces/VX_decode_if.sv | 2 +- hw/rtl/interfaces/VX_fpu_req_if.sv | 2 +- hw/rtl/interfaces/VX_gpu_req_if.sv | 2 +- hw/rtl/interfaces/VX_ibuffer_if.sv | 2 +- hw/rtl/interfaces/VX_ifetch_req_if.sv | 2 +- hw/rtl/interfaces/VX_ifetch_rsp_if.sv | 2 +- hw/rtl/interfaces/VX_lsu_req_if.sv | 2 +- hw/rtl/interfaces/VX_tex_csr_if.sv | 7 +++- hw/rtl/interfaces/VX_tex_req_if.sv | 2 +- hw/rtl/interfaces/VX_tex_rsp_if.sv | 2 +- hw/rtl/interfaces/VX_writeback_if.sv | 2 +- hw/rtl/tex_unit/VX_tex_define.vh | 16 ++++++++ hw/rtl/tex_unit/VX_tex_mem.sv | 52 +++++++++++-------------- hw/rtl/tex_unit/VX_tex_unit.sv | 55 +++++++++------------------ hw/scripts/scope.json | 16 ++++---- 36 files changed, 159 insertions(+), 200 deletions(-) diff --git a/hw/rtl/VX_alu_unit.sv b/hw/rtl/VX_alu_unit.sv index da20eb6d..72d36184 100644 --- a/hw/rtl/VX_alu_unit.sv +++ b/hw/rtl/VX_alu_unit.sv @@ -96,7 +96,7 @@ module VX_alu_unit #( wire alu_ready_in; wire alu_valid_out; wire alu_ready_out; - wire [63:0] alu_uuid; + wire [`UUID_BITS-1:0] alu_uuid; wire [`NW_BITS-1:0] alu_wid; wire [`NUM_THREADS-1:0] alu_tmask; wire [31:0] alu_PC; @@ -113,7 +113,7 @@ module VX_alu_unit #( assign alu_ready_in = alu_ready_out || ~alu_valid_out; VX_pipe_register #( - .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32), .RESETW (1) ) pipe_reg ( .clk (clk), @@ -139,7 +139,7 @@ module VX_alu_unit #( wire mul_ready_in; wire mul_valid_out; wire mul_ready_out; - wire [63:0] mul_uuid; + wire [`UUID_BITS-1:0] mul_uuid; wire [`NW_BITS-1:0] mul_wid; wire [`NUM_THREADS-1:0] mul_tmask; wire 
[31:0] mul_PC; diff --git a/hw/rtl/VX_csr_data.sv b/hw/rtl/VX_csr_data.sv index d63d1b7d..6d4a82c9 100644 --- a/hw/rtl/VX_csr_data.sv +++ b/hw/rtl/VX_csr_data.sv @@ -25,13 +25,13 @@ module VX_csr_data #( `endif input wire read_enable, - input wire [63:0] read_uuid, + input wire [`UUID_BITS-1:0] read_uuid, input wire[`CSR_ADDR_BITS-1:0] read_addr, input wire[`NW_BITS-1:0] read_wid, output wire[31:0] read_data, input wire write_enable, - input wire [63:0] write_uuid, + input wire [`UUID_BITS-1:0] write_uuid, input wire[`CSR_ADDR_BITS-1:0] write_addr, input wire[`NW_BITS-1:0] write_wid, input wire[31:0] write_data, @@ -100,6 +100,7 @@ module VX_csr_data #( assign tex_csr_if.write_enable = write_enable; assign tex_csr_if.write_addr = write_addr; assign tex_csr_if.write_data = write_data; + assign tex_csr_if.write_uuid = write_uuid; `endif always @(posedge clk) begin diff --git a/hw/rtl/VX_csr_unit.sv b/hw/rtl/VX_csr_unit.sv index 6f7b35c9..9186586a 100644 --- a/hw/rtl/VX_csr_unit.sv +++ b/hw/rtl/VX_csr_unit.sv @@ -110,7 +110,7 @@ module VX_csr_unit #( wire stall_out = ~csr_commit_if.ready && csr_commit_if.valid; VX_pipe_register #( - .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32), .RESETW (1) ) pipe_reg ( .clk (clk), diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index d4cf83fa..2badf7f8 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -34,6 +34,8 @@ `define PERF_CTR_BITS 44 +`define UUID_BITS 44 + /////////////////////////////////////////////////////////////////////////////// `define EX_NOP 3'h0 @@ -239,10 +241,6 @@ /////////////////////////////////////////////////////////////////////////////// -// cache request identifier -`define DBG_CACHE_REQ_IDW 48 -`define DBG_CACHE_REQ_ID(type, ctr) {4'(type), {`DBG_CACHE_REQ_IDW-4{1'b0}}} + ctr - // non-cacheable tag bits `define NC_TAG_BIT 1 @@ -267,7 +265,7 
@@ `define ICACHE_CORE_TAG_ID_BITS `NW_BITS // Core request tag bits -`define ICACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_IDW + `ICACHE_CORE_TAG_ID_BITS) +`define ICACHE_CORE_TAG_WIDTH (`UUID_BITS + `ICACHE_CORE_TAG_ID_BITS) // Memory request data bits `define ICACHE_MEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8) @@ -293,13 +291,13 @@ `define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) `ifdef EXT_TEX_ENABLE `define LSU_TAG_ID_BITS `MAX(`LSUQ_ADDR_BITS, 2) -`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_IDW + `LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS) +`define LSU_TEX_DCACHE_TAG_BITS (`UUID_BITS + `LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS) `define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS + `TEX_TAG_BIT) `else `define LSU_TAG_ID_BITS `LSUQ_ADDR_BITS `define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS) `endif -`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_IDW + `DCACHE_CORE_TAG_ID_BITS) +`define DCACHE_CORE_TAG_WIDTH (`UUID_BITS + `DCACHE_CORE_TAG_ID_BITS) // Memory request data bits `define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8) diff --git a/hw/rtl/VX_dispatch.sv b/hw/rtl/VX_dispatch.sv index 5715d14b..9b8b88c8 100644 --- a/hw/rtl/VX_dispatch.sv +++ b/hw/rtl/VX_dispatch.sv @@ -42,7 +42,7 @@ module VX_dispatch ( wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), .OUT_REG (1) ) alu_buffer ( .clk (clk), @@ -63,7 +63,7 @@ module VX_dispatch ( wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod); VX_skid_buffer #( - .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 
`INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), .OUT_REG (1) ) lsu_buffer ( .clk (clk), @@ -85,7 +85,7 @@ module VX_dispatch ( wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid]; VX_skid_buffer #( - .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), .OUT_REG (1) ) csr_buffer ( .clk (clk), @@ -105,7 +105,7 @@ module VX_dispatch ( wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), .OUT_REG (1) ) fpu_buffer ( .clk (clk), @@ -127,7 +127,7 @@ module VX_dispatch ( wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)), .OUT_REG (1) ) gpu_buffer ( .clk (clk), diff --git a/hw/rtl/VX_fpu_unit.sv b/hw/rtl/VX_fpu_unit.sv index 84af116b..342bf36d 100644 --- a/hw/rtl/VX_fpu_unit.sv +++ b/hw/rtl/VX_fpu_unit.sv @@ -22,7 +22,7 @@ module VX_fpu_unit #( wire valid_out; wire ready_out; - wire [63:0] rsp_uuid; + wire [`UUID_BITS-1:0] rsp_uuid; wire [`NW_BITS-1:0] rsp_wid; wire [`NUM_THREADS-1:0] rsp_tmask; wire [31:0] rsp_PC; @@ -40,7 +40,7 @@ module VX_fpu_unit #( wire fpuq_pop = valid_out && ready_out; VX_index_buffer #( - .DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), + .DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), .SIZE (`FPUQ_SIZE) ) 
req_metadata ( .clk (clk), @@ -181,7 +181,7 @@ module VX_fpu_unit #( wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS), .RESETW (1) ) pipe_reg ( .clk (clk), diff --git a/hw/rtl/VX_gpu_unit.sv b/hw/rtl/VX_gpu_unit.sv index 6db637a2..b4047830 100644 --- a/hw/rtl/VX_gpu_unit.sv +++ b/hw/rtl/VX_gpu_unit.sv @@ -33,7 +33,7 @@ module VX_gpu_unit #( localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW); wire rsp_valid; - wire [63:0] rsp_uuid; + wire [`UUID_BITS-1:0] rsp_uuid; wire [`NW_BITS-1:0] rsp_wid; wire [`NUM_THREADS-1:0] rsp_tmask; wire [31:0] rsp_PC; @@ -187,7 +187,7 @@ module VX_gpu_unit #( assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1), .RESETW (1) ) pipe_reg ( .clk (clk), diff --git a/hw/rtl/VX_ibuffer.sv b/hw/rtl/VX_ibuffer.sv index 953f1426..6231ac5f 100644 --- a/hw/rtl/VX_ibuffer.sv +++ b/hw/rtl/VX_ibuffer.sv @@ -15,7 +15,7 @@ module VX_ibuffer #( `UNUSED_PARAM (CORE_ID) - localparam DATAW = 64 + `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1; + localparam DATAW = `UUID_BITS + `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1; localparam ADDRW = $clog2(`IBUF_SIZE+1); localparam NWARPSW = $clog2(`NUM_WARPS+1); diff --git a/hw/rtl/VX_icache_stage.sv b/hw/rtl/VX_icache_stage.sv index 77a20b47..be096c5f 100644 --- a/hw/rtl/VX_icache_stage.sv +++ b/hw/rtl/VX_icache_stage.sv @@ -24,24 +24,19 @@ module VX_icache_stage #( localparam OUT_REG = 0; - reg [`DBG_CACHE_REQ_IDW-1:0] req_id; - wire [`DBG_CACHE_REQ_IDW-1:0] 
rsp_id; wire [`NW_BITS-1:0] req_tag, rsp_tag; - `UNUSED_VAR (rsp_id) - wire icache_req_fire = icache_req_if.valid && icache_req_if.ready; assign req_tag = ifetch_req_if.wid; assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0]; - assign rsp_id = icache_rsp_if.tag[`NW_BITS +: `DBG_CACHE_REQ_IDW]; - wire [63:0] rsp_uuid; + wire [`UUID_BITS-1:0] rsp_uuid; wire [31:0] rsp_PC; wire [`NUM_THREADS-1:0] rsp_tmask; VX_dp_ram #( - .DATAW (32 + `NUM_THREADS + 64), + .DATAW (32 + `NUM_THREADS + `UUID_BITS), .SIZE (`NUM_WARPS), .LUTRAM (1) ) req_metadata ( @@ -59,17 +54,7 @@ module VX_icache_stage #( // Icache Request assign icache_req_if.valid = ifetch_req_if.valid; assign icache_req_if.addr = ifetch_req_if.PC[31:2]; - assign icache_req_if.tag = {req_id, req_tag}; - - always @(posedge clk) begin - if (reset) begin - req_id <= `DBG_CACHE_REQ_ID(0, 0); - end else begin - if (icache_req_fire) begin - req_id <= req_id + 1; - end - end - end + assign icache_req_if.tag = {ifetch_req_if.uuid, req_tag}; // Can accept new request? 
assign ifetch_req_if.ready = icache_req_if.ready; @@ -79,7 +64,7 @@ module VX_icache_stage #( wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid); VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32 + 64), + .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `UUID_BITS), .RESETW (1), .DEPTH (OUT_REG) ) pipe_reg ( @@ -106,10 +91,10 @@ module VX_icache_stage #( `ifdef DBG_TRACE_CORE_ICACHE always @(posedge clk) begin if (icache_req_fire) begin - dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h, req_id=%0h (#%0d)\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, req_id, ifetch_req_if.uuid); + dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h (#%0d)\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, ifetch_req_if.uuid); end if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin - dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, req_id=%0h, data=%0h (#%0d)\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, rsp_id, ifetch_rsp_if.data, ifetch_rsp_if.uuid); + dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h (#%0d)\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid); end end `endif diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv index ec8fca80..5116035f 100644 --- a/hw/rtl/VX_lsu_unit.sv +++ b/hw/rtl/VX_lsu_unit.sv @@ -28,7 +28,7 @@ module VX_lsu_unit #( `STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter")) wire req_valid; - wire [63:0] req_uuid; + wire [`UUID_BITS-1:0] req_uuid; wire [`NUM_THREADS-1:0] req_tmask; wire [`NUM_THREADS-1:0][31:0] req_addr; wire [`INST_LSU_BITS-1:0] req_type; @@ -82,7 +82,7 @@ module VX_lsu_unit #( wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 64 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 1 + 1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 
32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) req_pipe_reg ( .clk (clk), @@ -95,7 +95,7 @@ module VX_lsu_unit #( // Can accept new request? assign lsu_req_if.ready = ~stall_in && ~fence_wait; - wire [63:0] rsp_uuid; + wire [`UUID_BITS-1:0] rsp_uuid; wire [`NW_BITS-1:0] rsp_wid; wire [31:0] rsp_pc; wire [`NR_BITS-1:0] rsp_rd; @@ -108,8 +108,6 @@ module VX_lsu_unit #( wire [`NUM_THREADS-1:0] rsp_rem_mask_n; wire [`NUM_THREADS-1:0] rsp_tmask; - reg [`DBG_CACHE_REQ_IDW-1:0] req_id; - wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id; reg [`NUM_THREADS-1:0] req_sent_mask; reg is_req_start; @@ -118,7 +116,6 @@ module VX_lsu_unit #( `UNUSED_VAR (rsp_type) `UNUSED_VAR (rsp_is_prefetch) - `UNUSED_VAR (rsp_req_id) wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_offset, rsp_offset; for (genvar i = 0; i < `NUM_THREADS; i++) begin @@ -127,8 +124,6 @@ module VX_lsu_unit #( wire [`NUM_THREADS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready; - wire dcache_req_fire_any = (| dcache_req_fire); - wire dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready; wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1}; @@ -141,14 +136,13 @@ module VX_lsu_unit #( wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); assign mbuf_raddr = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: `LSUQ_ADDR_BITS]; - assign rsp_req_id = dcache_rsp_if.tag[(`CACHE_ADDR_TYPE_BITS + `LSU_TAG_ID_BITS) +: `DBG_CACHE_REQ_IDW]; `UNUSED_VAR (dcache_rsp_if.tag) // do not writeback from software prefetch wire req_wb2 = req_wb && ~req_is_prefetch; VX_index_buffer #( - .DATAW (64 + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), + .DATAW (`UUID_BITS + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), .SIZE (`LSUQ_SIZE) ) req_metadata ( .clk (clk), @@ -241,19 +235,9 @@ module 
VX_lsu_unit #( assign dcache_req_if.addr[i] = req_addr[i][31:2]; assign dcache_req_if.byteen[i] = mem_req_byteen; assign dcache_req_if.data[i] = mem_req_data; - assign dcache_req_if.tag[i] = {req_id, `LSU_TAG_ID_BITS'(req_tag), req_addr_type[i]}; + assign dcache_req_if.tag[i] = {req_uuid, `LSU_TAG_ID_BITS'(req_tag), req_addr_type[i]}; end - always @(posedge clk) begin - if (reset) begin - req_id <= `DBG_CACHE_REQ_ID(1, 0); - end else begin - if (dcache_req_fire_any) begin - req_id <= req_id + 1; - end - end - end - assign ready_in = req_dep_ready && dcache_req_ready; // send store commit @@ -298,7 +282,7 @@ module VX_lsu_unit #( wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid; VX_pipe_register #( - .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1), .RESETW (1) ) rsp_pipe_reg ( .clk (clk), @@ -325,7 +309,7 @@ module VX_lsu_unit #( `SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr); `ifndef SYNTHESIS - reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + 64 + 64 + 1)-1:0] pending_reqs; + reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + `UUID_BITS + 64 + 1)-1:0] pending_reqs; wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE)); always @(posedge clk) begin @@ -344,23 +328,24 @@ module VX_lsu_unit #( if (pending_reqs[i][0]) begin `ASSERT(($time - pending_reqs[i][1 +: 64]) < delay_timeout, ("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d (#%0d)", - $time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+64+32+`NR_BITS +: `NW_BITS], - pending_reqs[i][1+64+64+`NR_BITS +: 32], - pending_reqs[i][1+64+64 +: `NR_BITS], - pending_reqs[i][1+64 +: 64])); + $time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+`UUID_BITS+`NR_BITS+32 +: `NW_BITS], + pending_reqs[i][1+64+`UUID_BITS+`NR_BITS +: 32], + pending_reqs[i][1+64+`UUID_BITS +: `NR_BITS], + pending_reqs[i][1+64 +: `UUID_BITS])); end end end `endif 
`ifdef DBG_TRACE_CORE_DCACHE + wire dcache_req_fire_any = (| dcache_req_fire); always @(posedge clk) begin if (lsu_req_if.valid && fence_wait) begin dpi_trace("%d: *** D$%0d fence wait\n", $time, CORE_ID); end if (dcache_req_fire_any) begin if (dcache_req_if.rw[0]) begin - dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_id); + dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); @@ -368,7 +353,7 @@ module VX_lsu_unit #( `TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS); dpi_trace(", (#%0d)\n", req_uuid); end else begin - dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire, req_id); + dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); @@ -376,8 +361,8 @@ module VX_lsu_unit #( end end if (dcache_rsp_fire) begin - dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, rd=%0d, data=", - $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, rsp_req_id, mbuf_raddr, rsp_rd); + dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=", + $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd); `TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); dpi_trace(", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid); end diff --git a/hw/rtl/VX_muldiv.sv b/hw/rtl/VX_muldiv.sv index c4dda93b..ea992825 100644 --- a/hw/rtl/VX_muldiv.sv 
+++ b/hw/rtl/VX_muldiv.sv @@ -6,7 +6,7 @@ module VX_muldiv ( // Inputs input wire [`INST_MUL_BITS-1:0] alu_op, - input wire [63:0] uuid_in, + input wire [`UUID_BITS-1:0] uuid_in, input wire [`NW_BITS-1:0] wid_in, input wire [`NUM_THREADS-1:0] tmask_in, input wire [31:0] PC_in, @@ -16,7 +16,7 @@ module VX_muldiv ( input wire [`NUM_THREADS-1:0][31:0] alu_in2, // Outputs - output wire [63:0] uuid_out, + output wire [`UUID_BITS-1:0] uuid_out, output wire [`NW_BITS-1:0] wid_out, output wire [`NUM_THREADS-1:0] tmask_out, output wire [31:0] PC_out, @@ -34,7 +34,7 @@ module VX_muldiv ( wire is_div_op = `INST_MUL_IS_DIV(alu_op); wire [`NUM_THREADS-1:0][31:0] mul_result; - wire [63:0] mul_uuid_out; + wire [`UUID_BITS-1:0] mul_uuid_out; wire [`NW_BITS-1:0] mul_wid_out; wire [`NUM_THREADS-1:0] mul_tmask_out; wire [31:0] mul_PC_out; @@ -66,7 +66,7 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .DEPTH (`LATENCY_IMUL), .RESETW (1) ) mul_shift_reg ( @@ -106,7 +106,7 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1), .DEPTH (`LATENCY_IMUL), .RESETW (1) ) mul_shift_reg ( @@ -122,7 +122,7 @@ module VX_muldiv ( /////////////////////////////////////////////////////////////////////////// wire [`NUM_THREADS-1:0][31:0] div_result; - wire [63:0] div_uuid_out; + wire [`UUID_BITS-1:0] div_uuid_out; wire [`NW_BITS-1:0] div_wid_out; wire [`NUM_THREADS-1:0] div_tmask_out; wire [31:0] div_PC_out; @@ -151,7 +151,7 @@ module VX_muldiv ( end VX_shift_register #( - .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .DEPTH (`LATENCY_IMUL), .RESETW (1) ) 
div_shift_reg ( @@ -199,7 +199,7 @@ module VX_muldiv ( /////////////////////////////////////////////////////////////////////////// wire rsp_valid = mul_valid_out || div_valid_out; - wire [63:0] rsp_uuid = mul_valid_out ? mul_uuid_out : div_uuid_out; + wire [`UUID_BITS-1:0] rsp_uuid = mul_valid_out ? mul_uuid_out : div_uuid_out; wire [`NW_BITS-1:0] rsp_wid = mul_valid_out ? mul_wid_out : div_wid_out; wire [`NUM_THREADS-1:0] rsp_tmask = mul_valid_out ? mul_tmask_out : div_tmask_out; wire [31:0] rsp_PC = mul_valid_out ? mul_PC_out : div_PC_out; @@ -210,7 +210,7 @@ module VX_muldiv ( assign stall_out = ~ready_out && valid_out; VX_pipe_register #( - .DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) pipe_reg ( .clk (clk), diff --git a/hw/rtl/VX_warp_sched.sv b/hw/rtl/VX_warp_sched.sv index b8ec17bf..dda8600b 100644 --- a/hw/rtl/VX_warp_sched.sv +++ b/hw/rtl/VX_warp_sched.sv @@ -46,7 +46,7 @@ module VX_warp_sched #( wire schedule_valid; wire warp_scheduled; - reg [63:0] issued_instrs; + reg [`UUID_BITS-1:0] issued_instrs; wire ifetch_req_fire = ifetch_req_if.valid && ifetch_req_if.ready; @@ -228,10 +228,10 @@ module VX_warp_sched #( assign warp_scheduled = schedule_valid && ~stall_out; - wire [63:0] instr_uuid = (issued_instrs * `NUM_CORES * `NUM_CLUSTERS) + 64'(CORE_ID); + wire [`UUID_BITS-1:0] instr_uuid = (issued_instrs * `NUM_CORES * `NUM_CLUSTERS) + `UUID_BITS'(CORE_ID); VX_pipe_register #( - .DATAW (1 + 64 + `NUM_THREADS + 32 + `NW_BITS), + .DATAW (1 + `UUID_BITS + `NUM_THREADS + 32 + `NW_BITS), .RESETW (1) ) pipe_reg ( .clk (clk), diff --git a/hw/rtl/cache/VX_bank.sv b/hw/rtl/cache/VX_bank.sv index 9e1f3552..22e5887b 100644 --- a/hw/rtl/cache/VX_bank.sv +++ b/hw/rtl/cache/VX_bank.sv @@ -488,22 +488,22 @@ module VX_bank #( dpi_trace("%d: cache%0d:%0d fill-rsp: addr=%0h, id=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, 
`LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data); end if (mshr_fire) begin - dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, req_id_sel); + dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, req_id_sel); end if (creq_fire) begin if (creq_rw) - dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, req_id_sel); + dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, req_id_sel); else - dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, req_id_sel); + dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, req_id_sel); end if (crsq_fire) begin - dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, req_id_st1); + dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, req_id_st1); end if (mreq_push) begin if (is_write_st1) - dpi_trace("%d: cache%0d:%0d 
writeback: addr=%0h, data=%0h, byteen=%b, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, req_id_st1); + dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, req_id_st1); else - dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, req_id_st1); + dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, req_id_st1); end end `endif diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index b8f2fdbc..647ea0be 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -4,7 +4,7 @@ `include "VX_platform.vh" // cache request identifier -`define DBG_CACHE_REQ_IDW 48 +`define DBG_CACHE_REQ_IDW 44 `define REQS_BITS `LOG2UP(NUM_REQS) diff --git a/hw/rtl/cache/VX_data_access.sv b/hw/rtl/cache/VX_data_access.sv index 887b4095..f5809644 100644 --- a/hw/rtl/cache/VX_data_access.sv +++ b/hw/rtl/cache/VX_data_access.sv @@ -122,10 +122,10 @@ module VX_data_access #( dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data); end if (read && ~stall) begin - dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, req_id=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, line_addr, read_data); + dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, blk_addr=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, read_data, req_id); end if (write && ~stall) begin - dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, req_id=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, 
BANK_ID), req_id, byteen, line_addr, write_data); + dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, byteen=%b, blk_addr=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), byteen, line_addr, write_data, req_id); end end `endif diff --git a/hw/rtl/cache/VX_miss_resrv.sv b/hw/rtl/cache/VX_miss_resrv.sv index 08b76add..b9081fdd 100644 --- a/hw/rtl/cache/VX_miss_resrv.sv +++ b/hw/rtl/cache/VX_miss_resrv.sv @@ -201,22 +201,22 @@ module VX_miss_resrv #( always @(posedge clk) begin if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin if (allocate_fire) - dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, + dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_req_id); if (fill_valid) dpi_trace("%d: cache%0d:%0d mshr-fill: addr=%0h, id=%0d, addr=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id, `LINE_TO_BYTE_ADDR(fill_addr, BANK_ID)); if (dequeue_fire) - dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, + dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_id); if (lookup_replay) - dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d\n", $time, CACHE_ID, BANK_ID, - `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id); + dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, + `LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lkp_req_id); if (lookup_valid) - dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b, req_id=%0h\n", $time, CACHE_ID, BANK_ID, + dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b (#%0d)\n", $time, CACHE_ID, BANK_ID, 
`LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_req_id); if (release_valid) - dpi_trace("%d: cache%0d:%0d mshr-release id=%0d, req_id=%0h\n", $time, CACHE_ID, BANK_ID, release_id, rel_req_id); + dpi_trace("%d: cache%0d:%0d mshr-release id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, release_id, rel_req_id); dpi_trace("%d: cache%0d:%0d mshr-table", $time, CACHE_ID, BANK_ID); for (integer i = 0; i < MSHR_SIZE; ++i) begin if (valid_table[i]) begin diff --git a/hw/rtl/cache/VX_shared_mem.sv b/hw/rtl/cache/VX_shared_mem.sv index 971795e0..7d6eb275 100644 --- a/hw/rtl/cache/VX_shared_mem.sv +++ b/hw/rtl/cache/VX_shared_mem.sv @@ -306,10 +306,10 @@ module VX_shared_mem #( for (integer i = 0; i < NUM_BANKS; ++i) begin if (per_bank_core_req_valid_unqual[i]) begin if (per_bank_core_req_rw_unqual[i]) begin - dpi_trace("%d: smem%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, req_id=%0h\n", + dpi_trace("%d: smem%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h (#%0d)\n", $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], req_id_st0[i]); end else begin - dpi_trace("%d: smem%0d:%0d core-rd-req: addr=%0h, tag=%0h, req_id=%0h\n", + dpi_trace("%d: smem%0d:%0d core-rd-req: addr=%0h, tag=%0h (#%0d)\n", $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], req_id_st0[i]); end end @@ -319,10 +319,10 @@ module VX_shared_mem #( for (integer i = 0; i < NUM_BANKS; ++i) begin if (per_bank_core_req_valid[i]) begin if (per_bank_core_req_rw[i]) begin - dpi_trace("%d: smem%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, data=%0h, req_id=%0h\n", + dpi_trace("%d: smem%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, data=%0h (#%0d)\n", $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_data[i], req_id_st1[i]); end else begin - 
dpi_trace("%d: smem%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, data=%0h, req_id=%0h\n", + dpi_trace("%d: smem%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, data=%0h (#%0d)\n", $time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_rsp_data[i], req_id_st1[i]); end end diff --git a/hw/rtl/cache/VX_tag_access.sv b/hw/rtl/cache/VX_tag_access.sv index 808008d5..d8d2a4db 100644 --- a/hw/rtl/cache/VX_tag_access.sv +++ b/hw/rtl/cache/VX_tag_access.sv @@ -68,9 +68,9 @@ module VX_tag_access #( end if (lookup && ~stall) begin if (tag_match) begin - dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, req_id=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, line_addr, line_tag); + dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, blk_addr=%0d, tag_id=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag, req_id); end else begin - dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, req_id=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), req_id, line_addr, line_tag, read_tag); + dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag, read_tag, req_id); end end end diff --git a/hw/rtl/interfaces/VX_alu_req_if.sv b/hw/rtl/interfaces/VX_alu_req_if.sv index 35049542..f6818e7d 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.sv +++ b/hw/rtl/interfaces/VX_alu_req_if.sv @@ -6,7 +6,7 @@ interface VX_alu_req_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_commit_if.sv b/hw/rtl/interfaces/VX_commit_if.sv index e85d310f..ddbd9600 100644 --- a/hw/rtl/interfaces/VX_commit_if.sv +++ b/hw/rtl/interfaces/VX_commit_if.sv @@ -6,7 +6,7 @@ interface VX_commit_if (); 
wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_csr_req_if.sv b/hw/rtl/interfaces/VX_csr_req_if.sv index 0639f3aa..c8eef24a 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.sv +++ b/hw/rtl/interfaces/VX_csr_req_if.sv @@ -6,7 +6,7 @@ interface VX_csr_req_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_decode_if.sv b/hw/rtl/interfaces/VX_decode_if.sv index 23039847..5c00fb0f 100644 --- a/hw/rtl/interfaces/VX_decode_if.sv +++ b/hw/rtl/interfaces/VX_decode_if.sv @@ -6,7 +6,7 @@ interface VX_decode_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_fpu_req_if.sv b/hw/rtl/interfaces/VX_fpu_req_if.sv index 2b7d69f0..62ea9255 100644 --- a/hw/rtl/interfaces/VX_fpu_req_if.sv +++ b/hw/rtl/interfaces/VX_fpu_req_if.sv @@ -6,7 +6,7 @@ interface VX_fpu_req_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_gpu_req_if.sv b/hw/rtl/interfaces/VX_gpu_req_if.sv index 06ef6cc7..027f7a2b 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.sv +++ b/hw/rtl/interfaces/VX_gpu_req_if.sv @@ -6,7 +6,7 @@ interface VX_gpu_req_if(); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_ibuffer_if.sv b/hw/rtl/interfaces/VX_ibuffer_if.sv index a436ae7b..2f9c17b6 100644 --- a/hw/rtl/interfaces/VX_ibuffer_if.sv +++ b/hw/rtl/interfaces/VX_ibuffer_if.sv @@ -6,7 +6,7 @@ interface VX_ibuffer_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; 
wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_ifetch_req_if.sv b/hw/rtl/interfaces/VX_ifetch_req_if.sv index 4132f90b..95e88223 100644 --- a/hw/rtl/interfaces/VX_ifetch_req_if.sv +++ b/hw/rtl/interfaces/VX_ifetch_req_if.sv @@ -6,7 +6,7 @@ interface VX_ifetch_req_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_ifetch_rsp_if.sv b/hw/rtl/interfaces/VX_ifetch_rsp_if.sv index 350af081..f47e8749 100644 --- a/hw/rtl/interfaces/VX_ifetch_rsp_if.sv +++ b/hw/rtl/interfaces/VX_ifetch_rsp_if.sv @@ -6,7 +6,7 @@ interface VX_ifetch_rsp_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_lsu_req_if.sv b/hw/rtl/interfaces/VX_lsu_req_if.sv index 128b3c20..f52b22da 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.sv +++ b/hw/rtl/interfaces/VX_lsu_req_if.sv @@ -6,7 +6,7 @@ interface VX_lsu_req_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_tex_csr_if.sv b/hw/rtl/interfaces/VX_tex_csr_if.sv index a83c9479..e0c626a5 100644 --- a/hw/rtl/interfaces/VX_tex_csr_if.sv +++ b/hw/rtl/interfaces/VX_tex_csr_if.sv @@ -8,17 +8,20 @@ interface VX_tex_csr_if (); wire write_enable; wire [`CSR_ADDR_BITS-1:0] write_addr; wire [31:0] write_data; + wire [`UUID_BITS-1:0] write_uuid; modport master ( output write_enable, output write_addr, - output write_data + output write_data, + output write_uuid ); modport slave ( input write_enable, input write_addr, - input write_data + input write_data, + input write_uuid ); endinterface diff --git a/hw/rtl/interfaces/VX_tex_req_if.sv b/hw/rtl/interfaces/VX_tex_req_if.sv index 0059de59..a3fec613 100644 --- a/hw/rtl/interfaces/VX_tex_req_if.sv +++ b/hw/rtl/interfaces/VX_tex_req_if.sv @@ -6,7 +6,7 
@@ interface VX_tex_req_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_tex_rsp_if.sv b/hw/rtl/interfaces/VX_tex_rsp_if.sv index 5966124c..b6fe625a 100644 --- a/hw/rtl/interfaces/VX_tex_rsp_if.sv +++ b/hw/rtl/interfaces/VX_tex_rsp_if.sv @@ -6,7 +6,7 @@ interface VX_tex_rsp_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire [31:0] PC; diff --git a/hw/rtl/interfaces/VX_writeback_if.sv b/hw/rtl/interfaces/VX_writeback_if.sv index 00cab3b8..6b93a04f 100644 --- a/hw/rtl/interfaces/VX_writeback_if.sv +++ b/hw/rtl/interfaces/VX_writeback_if.sv @@ -6,7 +6,7 @@ interface VX_writeback_if (); wire valid; - wire [63:0] uuid; + wire [`UUID_BITS-1:0] uuid; wire [`NUM_THREADS-1:0] tmask; wire [`NW_BITS-1:0] wid; wire [31:0] PC; diff --git a/hw/rtl/tex_unit/VX_tex_define.vh b/hw/rtl/tex_unit/VX_tex_define.vh index a3e1a926..381069fc 100644 --- a/hw/rtl/tex_unit/VX_tex_define.vh +++ b/hw/rtl/tex_unit/VX_tex_define.vh @@ -32,4 +32,20 @@ `define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(5) `define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(6) +task trace_tex_state ( + input [`CSR_ADDR_BITS-1:0] state +); + case (state) + `CSR_TEX_ADDR: dpi_trace("ADDR"); + `CSR_TEX_WIDTH: dpi_trace("WIDTH"); + `CSR_TEX_HEIGHT: dpi_trace("HEIGHT"); + `CSR_TEX_FORMAT: dpi_trace("FORMAT"); + `CSR_TEX_FILTER: dpi_trace("FILTER"); + `CSR_TEX_WRAPU: dpi_trace("WRAPU"); + `CSR_TEX_WRAPV: dpi_trace("WRAPV"); + //`CSR_TEX_MIPOFF + default: dpi_trace("MIPOFF"); + endcase +endtask + `endif \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_mem.sv b/hw/rtl/tex_unit/VX_tex_mem.sv index dd9878a2..73f9367c 100644 --- a/hw/rtl/tex_unit/VX_tex_mem.sv +++ b/hw/rtl/tex_unit/VX_tex_mem.sv @@ -75,6 +75,9 @@ module VX_tex_mem #( wire [`TEX_LGSTRIDE_BITS-1:0] q_req_lgstride; wire [3:0][NUM_REQS-1:0][1:0] q_align_offs; wire [3:0] 
q_dup_reqs; + wire [`NW_BITS-1:0] q_req_wid; + wire [31:0] q_req_PC; + wire [`UUID_BITS-1:0] q_req_uuid; assign reqq_push = req_valid && req_ready; @@ -105,12 +108,8 @@ module VX_tex_mem #( wire sent_all_ready, last_texel_sent; wire req_texel_dup; wire [NUM_REQS-1:0][29:0] req_texel_addr; - reg [`DBG_CACHE_REQ_IDW-1:0] req_id; - wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id; reg [1:0] req_texel_idx; reg req_texels_done; - - `UNUSED_VAR (rsp_req_id) always @(posedge clk) begin if (reset || last_texel_sent) begin @@ -156,22 +155,16 @@ module VX_tex_mem #( wire [NUM_REQS-1:0] req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1}; + assign {q_req_wid, q_req_PC, q_req_uuid} = q_req_info[`NW_BITS+32+`UUID_BITS-1:0]; + `UNUSED_VAR (q_req_wid) + `UNUSED_VAR (q_req_PC) + assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask; assign dcache_req_if.rw = {NUM_REQS{1'b0}}; assign dcache_req_if.addr = req_texel_addr; assign dcache_req_if.byteen = {NUM_REQS{4'b0}}; assign dcache_req_if.data = 'x; - assign dcache_req_if.tag = {NUM_REQS{req_id, `LSU_TAG_ID_BITS'(req_texel_idx), `CACHE_ADDR_TYPE_BITS'(0)}}; - - always @(posedge clk) begin - if (reset) begin - req_id <= `DBG_CACHE_REQ_ID(2, 0); - end else begin - if (dcache_req_fire_any) begin - req_id <= req_id + 1; - end - end - end + assign dcache_req_if.tag = {NUM_REQS{q_req_uuid, `LSU_TAG_ID_BITS'(req_texel_idx), `CACHE_ADDR_TYPE_BITS'(0)}}; // Dcache Response @@ -188,7 +181,6 @@ module VX_tex_mem #( wire rsp_texel_dup; assign rsp_texel_idx = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: 2]; - assign rsp_req_id = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS + `LSU_TAG_ID_BITS +: `DBG_CACHE_REQ_IDW]; `UNUSED_VAR (dcache_rsp_if.tag) assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx]; @@ -285,25 +277,25 @@ module VX_tex_mem #( // Can accept new cache response? 
assign dcache_rsp_if.ready = ~(is_last_rsp && stall_out); -`ifdef DBG_TRACE_TEX - wire [`NW_BITS-1:0] q_req_wid, req_wid, rsp_wid; - wire [31:0] q_req_PC, req_PC, rsp_PC; - assign {q_req_wid, q_req_PC} = q_req_info[`NW_BITS+32-1:0]; - assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0]; - assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; +`ifdef DBG_TRACE_TEX + wire [`NW_BITS-1:0] req_wid, rsp_wid; + wire [31:0] req_PC, rsp_PC; + wire [`UUID_BITS-1:0] req_uuid, rsp_uuid; + assign {req_wid, req_PC, req_uuid} = req_info[`NW_BITS+32+`UUID_BITS-1:0]; + assign {rsp_wid, rsp_PC, rsp_uuid} = rsp_info[`NW_BITS+32+`UUID_BITS-1:0]; always @(posedge clk) begin if (dcache_req_fire_any) begin - dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, addr=", - $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_id, req_texel_idx); + dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=", + $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_texel_idx); `TRACE_ARRAY1D(req_texel_addr, NUM_REQS); - dpi_trace(", is_dup=%b\n", req_texel_dup); + dpi_trace(", is_dup=%b (#%0d)\n", req_texel_dup, q_req_uuid); end if (dcache_rsp_fire) begin - dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, data=", - $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_req_id, rsp_texel_idx); + dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=", + $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_texel_idx); `TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", q_req_uuid); end if (req_valid && req_ready) begin dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, lgstride=%0d, baseaddr=", @@ -311,13 +303,13 @@ module VX_tex_mem #( `TRACE_ARRAY1D(req_baseaddr, NUM_REQS); dpi_trace(", addr="); `TRACE_ARRAY2D(req_addr, 4, NUM_REQS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", req_uuid); end if 
(rsp_valid && rsp_ready) begin dpi_trace("%d: core%0d-tex-mem-rsp: wid=%0d, PC=%0h, tmask=%b, data=", $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask); `TRACE_ARRAY2D(rsp_data, 4, NUM_REQS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", rsp_uuid); end end `endif diff --git a/hw/rtl/tex_unit/VX_tex_unit.sv b/hw/rtl/tex_unit/VX_tex_unit.sv index c10cdf64..9045c5aa 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.sv +++ b/hw/rtl/tex_unit/VX_tex_unit.sv @@ -23,11 +23,11 @@ module VX_tex_unit #( VX_tex_rsp_if.master tex_rsp_if ); - localparam REQ_INFO_W = 64 + `NR_BITS + 1 + `NW_BITS + 32; + localparam REQ_INFO_W = `NR_BITS + 1 + `NW_BITS + 32 + `UUID_BITS; localparam BLEND_FRAC_W = (2 * `NUM_THREADS * `TEX_BLEND_FRAC); reg [$clog2(`NUM_TEX_UNITS)-1:0] csr_tex_unit; - reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][`TEX_LOD_MAX+1-1:0]; + reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(`TEX_LOD_MAX+1)-1:0]; reg [1:0][`TEX_LOD_BITS-1:0] tex_logdims [`NUM_TEX_UNITS-1:0]; reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0]; reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0]; @@ -36,9 +36,6 @@ module VX_tex_unit #( // CSRs programming - reg csrs_dirty [`NUM_TEX_UNITS-1:0]; - `UNUSED_VAR (csrs_dirty) - always @(posedge clk) begin if (tex_csr_if.write_enable) begin case (tex_csr_if.write_addr) @@ -47,50 +44,39 @@ module VX_tex_unit #( end `CSR_TEX_ADDR: begin tex_baddr[csr_tex_unit] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; - csrs_dirty[csr_tex_unit] <= 1; end `CSR_TEX_FORMAT: begin tex_format[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; - csrs_dirty[csr_tex_unit] <= 1; end `CSR_TEX_WRAPU: begin tex_wraps[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; - csrs_dirty[csr_tex_unit] <= 1; end `CSR_TEX_WRAPV: begin tex_wraps[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0]; - csrs_dirty[csr_tex_unit] <= 1; end `CSR_TEX_FILTER: begin - tex_filter[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; - 
csrs_dirty[csr_tex_unit] <= 1; + tex_filter[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; end `CSR_TEX_WIDTH: begin tex_logdims[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; - csrs_dirty[csr_tex_unit] <= 1; end `CSR_TEX_HEIGHT: begin tex_logdims[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0]; - csrs_dirty[csr_tex_unit] <= 1; end default: begin for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin `IGNORE_WARNINGS_BEGIN if (tex_csr_if.write_addr == `CSR_TEX_MIPOFF(j)) begin `IGNORE_WARNINGS_END - tex_mipoff[csr_tex_unit][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; - csrs_dirty[csr_tex_unit] <= 1; + tex_mipoff[csr_tex_unit][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; end end end endcase end - if (reset || (tex_req_if.valid && tex_req_if.ready)) begin - for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin - csrs_dirty[i] <= 0; - end - end end + wire [`UUID_BITS-1:0] write_uuid = tex_csr_if.write_uuid; + `UNUSED_VAR (write_uuid); // mipmap attributes @@ -136,7 +122,7 @@ module VX_tex_unit #( .mip_level (mip_level), .req_mipoff (sel_mipoff), .req_logdims(sel_logdims), - .req_info ({tex_format[tex_req_if.unit], tex_req_if.uuid, tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), + .req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC, tex_req_if.uuid}), .req_ready (tex_req_if.ready), .rsp_valid (mem_req_valid), @@ -211,9 +197,9 @@ module VX_tex_unit #( .rsp_valid (tex_rsp_if.valid), .rsp_tmask (tex_rsp_if.tmask), .rsp_data (tex_rsp_if.data), - .rsp_info ({tex_rsp_if.uuid, tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}), + .rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.uuid}), .rsp_ready (tex_rsp_if.ready) - ); + ); `ifdef PERF_ENABLE wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_req_per_cycle; @@ -255,31 +241,24 @@ module VX_tex_unit #( `ifdef DBG_TRACE_TEX always @(posedge clk) begin + if 
(tex_csr_if.write_enable) begin + dpi_trace("%d: core%0d-tex-csr: unit=%0d, state=", $time, CORE_ID, csr_tex_unit); + trace_tex_state(tex_csr_if.write_addr); + dpi_trace(", data=%0h (#%0d)\n", tex_csr_if.write_data, tex_csr_if.write_uuid); + end if (tex_req_if.valid && tex_req_if.ready) begin - for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin - if (csrs_dirty[i]) begin - dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_logwidth=%0h\n", $time, CORE_ID, i, tex_logdims[i][0]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_logheight=%0h\n", $time, CORE_ID, i, tex_logdims[i][1]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]); - dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", $time, CORE_ID, i, tex_filter[i]); - end - end - dpi_trace("%d: core%0d-tex-req: wid=%0d, PC=%0h, tmask=%b, unit=%0d, lod=%0h, u=", - $time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod); + $time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod); `TRACE_ARRAY1D(tex_req_if.coords[0], `NUM_THREADS); dpi_trace(", v="); `TRACE_ARRAY1D(tex_req_if.coords[1], `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", tex_req_if.uuid); end if (tex_rsp_if.valid && tex_rsp_if.ready) begin dpi_trace("%d: core%0d-tex-rsp: wid=%0d, PC=%0h, tmask=%b, data=", $time, CORE_ID, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.tmask); `TRACE_ARRAY1D(tex_rsp_if.data, `NUM_THREADS); - dpi_trace("\n"); + dpi_trace(" (#%0d)\n", tex_rsp_if.uuid); end end `endif diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json index c9c49ebe..d6cfd609 100644 --- a/hw/scripts/scope.json +++ b/hw/scripts/scope.json @@ -142,7 +142,7 @@ }, 
"afu/vortex/cluster/core/pipeline/fetch/warp_sched": { "?wsched_scheduled": 1, - "wsched_schedule_uuid": 64, + "wsched_schedule_uuid": "`UUID_BITS", "wsched_active_warps": "`NUM_WARPS", "wsched_stalled_warps": "`NUM_WARPS", "wsched_schedule_tmask": "`NUM_THREADS", @@ -151,17 +151,17 @@ }, "afu/vortex/cluster/core/pipeline/fetch/icache_stage": { "?icache_req_fire": 1, - "icache_req_uuid": 64, + "icache_req_uuid": "`UUID_BITS", "icache_req_addr": 32, "icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS", "?icache_rsp_fire": 1, - "icache_rsp_uuid": 64, + "icache_rsp_uuid": "`UUID_BITS", "icache_rsp_data": 32, "icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS" }, "afu/vortex/cluster/core/pipeline/issue": { "?issue_fire": 1, - "issue_uuid": 64, + "issue_uuid": "`UUID_BITS", "issue_tmask":"`NUM_THREADS", "issue_ex_type":"`EX_BITS", "issue_op_type":"`INST_OP_BITS", @@ -178,7 +178,7 @@ "gpr_rs2":"`NUM_THREADS * 32", "gpr_rs3":"`NUM_THREADS * 32", "?writeback_valid": 1, - "writeback_uuid": 64, + "writeback_uuid": "`UUID_BITS", "writeback_tmask":"`NUM_THREADS", "writeback_rd":"`NR_BITS", "writeback_data":"`NUM_THREADS * 32", @@ -188,20 +188,20 @@ }, "afu/vortex/cluster/core/pipeline/execute/lsu_unit": { "?dcache_req_fire":"`NUM_THREADS", - "dcache_req_uuid": 64, + "dcache_req_uuid": "`UUID_BITS", "dcache_req_addr":"`NUM_THREADS * 32", "dcache_req_rw": 1, "dcache_req_byteen":"`NUM_THREADS * 4", "dcache_req_data":"`NUM_THREADS * 32", "dcache_req_tag":"`LSUQ_ADDR_BITS", "?dcache_rsp_fire":"`NUM_THREADS", - "dcache_rsp_uuid": 64, + "dcache_rsp_uuid": "`UUID_BITS", "dcache_rsp_data":"`NUM_THREADS * 32", "dcache_rsp_tag":"`LSUQ_ADDR_BITS" }, "afu/vortex/cluster/core/pipeline/execute/gpu_unit": { "?gpu_rsp_valid": 1, - "gpu_rsp_uuid": 64, + "gpu_rsp_uuid": "`UUID_BITS", "gpu_rsp_tmc": 1, "gpu_rsp_wspawn": 1, "gpu_rsp_split": 1,