SimX timing simulation

This commit is contained in:
Blaise Tine
2021-11-13 01:41:12 -05:00
parent 009e897cab
commit c2721fd545
26 changed files with 3690 additions and 1639 deletions

427
sim/common/simobject.h Normal file
View File

@@ -0,0 +1,427 @@
#pragma once
#include <functional>
#include <iostream>
#include <memory>
#include <vector>
#include <list>
#include <assert.h>
namespace vortex {
class SimObjectBase;
// Base class of every deferred simulation event: a countdown (in ticks) plus
// a payload that derived classes run from fire().
class SimEventBase {
public:
  using Ptr = std::shared_ptr<SimEventBase>;

  virtual ~SimEventBase() {}

  // Runs the event's payload.
  virtual void fire() const = 0;

  // Consumes one tick of the countdown.
  // Returns true when the countdown has just reached zero.
  bool step() {
    --delay_;
    return (delay_ == 0);
  }

protected:
  // delay: number of step() calls before step() reports true.
  SimEventBase(uint64_t delay) : delay_(delay) {}

  uint64_t delay_;
};
///////////////////////////////////////////////////////////////////////////////
// Event that hands a stored packet to a one-argument callback when fired.
template <typename Pkt>
class SimSimpleEvent : public SimEventBase {
public:
  using Func = std::function<void (const Pkt&)>;

  // Builds a shared event object.
  // NOTE(review): the Args pack is neither used nor deducible from the
  // arguments; it looks like a leftover and is kept only to preserve the
  // declared interface.
  template <typename... Args>
  static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) {
    Ptr event = std::make_shared<SimSimpleEvent>(func, pkt, delay);
    return event;
  }

  // func:  callback invoked by fire()
  // pkt:   packet copied into the event and passed to func
  // delay: countdown forwarded to SimEventBase
  SimSimpleEvent(const Func& func, const Pkt& pkt, uint64_t delay)
    : SimEventBase(delay)
    , func_(func)
    , pkt_(pkt)
  {}

  // Invokes the callback with the stored packet.
  void fire() const override {
    const Pkt& payload = pkt_;
    func_(payload);
  }

protected:
  Func func_;
  Pkt  pkt_;
};
///////////////////////////////////////////////////////////////////////////////
// Event that hands a stored packet and a port id to a two-argument callback
// when fired.
template <typename Pkt>
class SimPortEvent : public SimEventBase {
public:
  using Func = std::function<void (const Pkt&, uint32_t)>;

  // Builds a shared event object.
  // NOTE(review): the Args pack is neither used nor deducible from the
  // arguments; it looks like a leftover and is kept only to preserve the
  // declared interface.
  template <typename... Args>
  static Ptr Create(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) {
    Ptr event = std::make_shared<SimPortEvent>(func, pkt, port_id, delay);
    return event;
  }

  // func:    callback invoked by fire()
  // pkt:     packet copied into the event
  // port_id: port identifier passed back to func
  // delay:   countdown forwarded to SimEventBase
  SimPortEvent(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay)
    : SimEventBase(delay)
    , func_(func)
    , pkt_(pkt)
    , port_id_(port_id)
  {}

  // Invokes the callback with the stored packet and port id.
  void fire() const override {
    const Pkt& payload = pkt_;
    func_(payload, port_id_);
  }

private:
  Func     func_;
  Pkt      pkt_;
  uint32_t port_id_;
};
///////////////////////////////////////////////////////////////////////////////
// Type-erased base of all ports. A port belongs to a module, carries the id
// the module handed out when the port was constructed, and may point to one
// peer port.
class SimPortBase {
public:
  typedef std::shared_ptr<SimPortBase> Ptr;

  virtual ~SimPortBase() {}

  // Module that owns this port.
  SimObjectBase* module() const {
    return module_;
  }

  // Identifier obtained from the owning module at construction.
  uint32_t port_id() const {
    return port_id_;
  }

  // Peer this port is linked to, or nullptr when unlinked.
  SimPortBase* peer() const {
    return peer_;
  }

  bool connected() const {
    return (peer_ != nullptr);
  }

  // Value of the is_slave flag given at construction.
  bool is_slave() const {
    return is_slave_;
  }

protected:
  SimPortBase(SimObjectBase* module, bool is_slave);

  // Links this port to `peer`. The port must not already be linked.
  void connect(SimPortBase* peer) {
    assert(peer_ == nullptr);
    peer_ = peer;
  }

  // Drops the link to the peer, if any.
  // The previous `assert(peer_ == nullptr)` was inverted: it fired exactly
  // when there was something to disconnect. No assertion is used instead,
  // because MasterPort::unbind() calls disconnect() on both ends while
  // MasterPort::bind() only links the master side, so the far end may
  // legitimately be unlinked already.
  void disconnect() {
    peer_ = nullptr;
  }

  SimObjectBase* module_;
  uint32_t       port_id_;
  bool           is_slave_;
  SimPortBase*   peer_;

  template <typename Pkt> friend class MasterPort;
};
///////////////////////////////////////////////////////////////////////////////
// Receiving end of a connection. A slave port either owns a callback that
// consumes packets, or (when built from another SlavePort) forwards packets
// to that peer.
template <typename Pkt>
class SlavePort : public SimPortBase {
public:
  // Fixed: the pointee was spelled SlavePort<Ptr>, which names a different
  // instantiation than the SlavePort<Pkt> objects Create() returns.
  typedef std::shared_ptr<SlavePort<Pkt>> Ptr;
  typedef std::function<void (const Pkt&, uint32_t)> Func;

  // Creates a shared slave port that delivers packets to `func`.
  static Ptr Create(SimObjectBase* module, const Func& func) {
    return std::make_shared<SlavePort<Pkt>>(module, func);
  }

  // Creates a shared slave port that delivers packets to `obj->*entry`.
  template <typename T>
  static Ptr Create(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) {
    return std::make_shared<SlavePort<Pkt>>(module, obj, entry);
  }

  // Slave port with a free-standing callback.
  SlavePort(SimObjectBase* module, const Func& func)
    : SimPortBase(module, true)
    , func_(func)
  {}

  // Slave port whose callback is a member function bound to `obj`.
  template <typename T>
  SlavePort(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t))
    : SimPortBase(module, true)
    , func_(std::bind(entry, obj, std::placeholders::_1, std::placeholders::_2))
  {}

  // Forwarding port: has no callback and is linked to `peer`.
  // NOTE(review): this registers with is_slave == false although the object is
  // a SlavePort, so senders will cast it to MasterPort; confirm this is the
  // intended way to mark a pass-through port.
  SlavePort(SimObjectBase* module, SlavePort* peer)
    : SimPortBase(module, false)
  {
    this->connect(peer);
  }

  // Delivers `pkt` after `delay` cycles (defined after SimPlatform).
  void send(const Pkt& pkt, uint64_t delay) const;

  // Callback attached to this port; empty for a forwarding port.
  const Func& func() const {
    return func_;
  }

protected:
  // Declared but not defined: copy assignment is not available.
  SlavePort& operator=(const SlavePort&);

  Func func_;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Pkt>
class MasterPort : public SimPortBase {
public:
typedef std::shared_ptr<MasterPort<Ptr>> Ptr;
typedef std::function<void (const Pkt&, uint32_t)> Func;
static Ptr Create() {
return std::make_shared<MasterPort<Ptr>>(module);
}
MasterPort(SimObjectBase* module) : SimPortBase(module, false) {}
MasterPort(SimObjectBase* module, MasterPort* peer)
: SimPortBase(module, false)
{
peer->connect(this);
}
void bind(SlavePort<Pkt>* peer) {
this->connect(peer);
}
void unbind() {
peer_->disconnect();
this->disconnect();
}
void send(const Pkt& pkt, uint64_t delay) const {
assert(peer_ != nullptr);
if (peer_->is_slave()) {
auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
slave->send(pkt, delay);
} else {
auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
master->send(pkt, delay);
}
}
private:
MasterPort& operator=(const MasterPort&);
};
///////////////////////////////////////////////////////////////////////////////
class SimContext;
// Common base of all simulated components: a named object that is stepped
// once per cycle and keeps a registry of the ports constructed on it.
class SimObjectBase {
public:
  using Ptr = std::shared_ptr<SimObjectBase>;

  virtual ~SimObjectBase() {}

  // Schedules a call to (obj->*entry)(pkt) after `delay` cycles.
  template <typename T, typename Pkt>
  void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay);

  // Advances the component; `cycle` is the platform's current cycle count.
  virtual void step(uint64_t cycle) = 0;

  const std::string& name() const {
    return name_;
  }

protected:
  SimObjectBase(const SimContext& ctx, const char* name);

  // Records `port` and returns its id, which is its position in the registry.
  uint32_t allocate_port(SimPortBase* port) {
    const uint32_t new_id = ports_.size();
    ports_.push_back(port);
    return new_id;
  }

private:
  std::string               name_;
  std::vector<SimPortBase*> ports_;

  friend class SimPlatform;
  friend class SimPortBase;
};
///////////////////////////////////////////////////////////////////////////////
// CRTP layer between SimObjectBase and a concrete component `Impl`: provides
// the Create() factory and routes the virtual step() to Impl::step().
template <typename Impl>
class SimObject : public SimObjectBase {
public:
  using Ptr = std::shared_ptr<Impl>;

  // Builds an Impl (defined after SimPlatform).
  template <typename... Args>
  static Ptr Create(Args&&... args);

protected:
  SimObject(const SimContext& ctx, const char* name)
    : SimObjectBase(ctx, name)
  {}

  // Forwards the per-cycle tick to the derived class.
  void step(uint64_t cycle) override {
    Impl& self = this->impl();
    self.step(cycle);
  }

private:
  // Downcasts to the concrete component type.
  const Impl& impl() const {
    return static_cast<const Impl&>(*this);
  }

  Impl& impl() {
    return static_cast<Impl&>(*this);
  }
};
// Construction token passed to every SimObjectBase constructor.
// Its only constructor is private and the sole friend is
// SimObject<Impl>::Create, so Create is the only place a SimContext can be
// made.
class SimContext {
private:
SimContext() {}
// Grants SimObject<Impl>::Create access to the private constructor.
template <typename Impl> template <typename... Args>
friend typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args);
};
///////////////////////////////////////////////////////////////////////////////
class SimPlatform {
public:
static SimPlatform& instance() {
static SimPlatform s_inst;
return s_inst;
}
bool initialize() {
//--
return true;
}
void finalize() {
instance().clear();
}
void register_object(const SimObjectBase::Ptr& obj) {
objects_.push_back(obj);
}
template <typename Pkt>
void schedule(const typename SimSimpleEvent<Pkt>::Func& callback,
const Pkt& pkt,
uint64_t delay) {
auto evt = SimSimpleEvent<Pkt>::Create(callback, pkt, delay);
assert(delay != 0);
events_.emplace_back(evt);
}
template <typename Pkt>
void schedule(const typename SimPortEvent<Pkt>::Func& callback,
const Pkt& pkt,
uint32_t port_id,
uint64_t delay) {
auto evt = SimPortEvent<Pkt>::Create(callback, pkt, port_id, delay);
assert(delay != 0);
events_.emplace_back(evt);
}
void step() {
// evaluate events
auto evt_it = events_.begin();
auto evt_it_end = events_.end();
while (evt_it != evt_it_end) {
auto& event = *evt_it;
if (event->step()) {
event->fire();
evt_it = events_.erase(evt_it);
} else {
++evt_it;
}
}
// evaluate components
for (auto& object : objects_) {
object->step(cycles_);
}
// advance clock
++cycles_;
}
uint64_t cycles() const {
return cycles_;
}
private:
SimPlatform() : cycles_(0) {}
virtual ~SimPlatform() {
this->clear();
}
void clear() {
objects_.clear();
events_.clear();
}
std::vector<SimObjectBase::Ptr> objects_;
std::list<SimEventBase::Ptr> events_;
uint64_t cycles_;
};
///////////////////////////////////////////////////////////////////////////////
// Creates an unlinked port on `module` and registers it there; the id the
// module returns becomes this port's port_id().
inline SimPortBase::SimPortBase(SimObjectBase* module, bool is_slave)
  : module_(module)
  , port_id_(0)
  , is_slave_(is_slave)
  , peer_(nullptr)
{
  port_id_ = module->allocate_port(this);
}
// Stores the component name. The SimContext argument is unused here; it can
// only be produced by SimObject<Impl>::Create (its constructor is private).
inline SimObjectBase::SimObjectBase(const SimContext&, const char* name)
: name_(name)
{}
// Factory for simulation components: constructs an Impl with a fresh
// SimContext followed by the caller's arguments, registers it with the
// platform so it is stepped every cycle, and returns the shared pointer.
template <typename Impl>
template <typename... Args>
typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
  Ptr instance = std::make_shared<Impl>(SimContext{}, std::forward<Args>(args)...);
  SimPlatform& platform = SimPlatform::instance();
  platform.register_object(instance);
  return instance;
}
// Delivers `pkt` after `delay` cycles. A port with a callback schedules it on
// the platform together with this port's id; a port without one relays the
// packet to its peer.
template <typename Pkt>
void SlavePort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
  if (func_) {
    // terminal port: queue the callback
    SimPlatform::instance().schedule(func_, pkt, port_id_, delay);
    return;
  }

  // forwarding port: hand the packet to the peer
  assert(peer_ != nullptr);
  if (!peer_->is_slave()) {
    auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
    master->send(pkt, delay);
    return;
  }
  auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
  slave->send(pkt, delay);
}
// Queues a call to (obj->*entry)(pkt) on the platform, to fire after `delay`
// cycles.
template <typename T, typename Pkt>
void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) {
  // wrap the member function and its target object into a one-argument callable
  auto callback = [obj, entry](const Pkt& packet) {
    (obj->*entry)(packet);
  };
  SimPlatform::instance().schedule(callback, pkt, delay);
}
}

View File

@@ -1,6 +1,7 @@
#pragma once
#include <cstdint>
#include <algorithm>
#include <assert.h>
template <typename... Args>
@@ -8,24 +9,83 @@ void unused(Args&&...) {}
#define __unused(...) unused(__VA_ARGS__)
constexpr bool ispow2(uint64_t value) {
// Number of zero bits above the most significant set bit of a 32-bit value.
// Returns 32 for zero, where __builtin_clz is not defined.
constexpr uint32_t count_leading_zeros(uint32_t value) {
  return (value == 0) ? 32 : __builtin_clz(value);
}
// Number of zero bits below the least significant set bit of a 32-bit value.
// Returns 32 for zero, where __builtin_ctz is not defined.
constexpr uint32_t count_trailing_zeros(uint32_t value) {
  return (value == 0) ? 32 : __builtin_ctz(value);
}
// True when exactly one bit of `value` is set (zero is not a power of two).
constexpr bool ispow2(uint32_t value) {
  return (value != 0) && ((value & (value - 1)) == 0);
}
constexpr unsigned log2ceil(uint32_t value) {
return 32 - __builtin_clz(value - 1);
// Ceiling of log2(value), computed as the bit width of (value - 1).
// log2ceil(1) is 0. For value == 0 the subtraction wraps to 0xffffffff and
// the result is 32.
constexpr uint32_t log2ceil(uint32_t value) {
return 32 - count_leading_zeros(value - 1);
}
inline uint64_t align_size(uint64_t size, uint64_t alignment) {
// Same as log2ceil(value) but never less than 1.
inline unsigned log2up(uint32_t value) {
  const uint32_t bits = log2ceil(value);
  return (bits > 1) ? bits : 1;
}
// Index of the most significant set bit of `value`, i.e. floor(log2(value)).
// For value == 0 count_leading_zeros yields 32, so the unsigned result wraps.
constexpr unsigned log2floor(uint32_t value) {
return 31 - count_leading_zeros(value);
}
// Number of bits needed to represent `value`: 32 minus its leading zero
// count. Yields 0 for value == 0.
constexpr unsigned ceil2(uint32_t value) {
return 32 - count_leading_zeros(value);
}
// Returns `bits` with bit `index` (0..63) cleared.
inline uint64_t bit_clr(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t mask = 1ull << index;
  return bits & ~mask;
}
// Returns `bits` with bit `index` (0..63) set.
inline uint64_t bit_set(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t mask = 1ull << index;
  return bits | mask;
}
// Returns true when bit `index` (0..63) of `bits` is set.
inline bool bit_get(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t shifted = bits >> index;
  return (shifted & 0x1) != 0;
}
// Returns `bits` with the inclusive bit range [start, end] cleared
// (start <= end <= 63).
inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
  assert(end >= start);
  assert(end <= 63);
  const uint32_t hi_gap = 63 - end;   // bits above the window
  // Build a mask of ones covering [start, end]: push ones up so only the
  // window width survives, then drop them back so the top lands on `end`.
  uint64_t window = 0xffffffffffffffff;
  window <<= (hi_gap + start);
  window >>= hi_gap;
  return bits & ~window;
}
// Returns `bits` with the inclusive bit range [start, end] replaced by the
// low (end - start + 1) bits of `value` (start <= end <= 63).
inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) {
  assert(end >= start);
  assert(end <= 63);
  const uint32_t hi_gap = 63 - end;   // bits above the window
  // Truncate `value` to the window width and position it at `start`.
  uint64_t field = value;
  field <<= (hi_gap + start);
  field >>= hi_gap;
  const uint64_t cleared = bit_clrw(bits, start, end);
  return cleared | field;
}
// Extracts the inclusive bit range [start, end] of `bits`, right-aligned
// (start <= end <= 63).
inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
  assert(end >= start);
  assert(end <= 63);
  const uint32_t hi_gap = 63 - end;   // bits above the window
  uint64_t field = bits;
  field <<= hi_gap;                   // discard everything above `end`
  field >>= (hi_gap + start);         // discard everything below `start`
  return field;
}
// Rounds `size` up to the next multiple of `alignment`.
// `alignment` must be a power of two.
inline uint64_t aligned_size(uint64_t size, uint32_t alignment) {
  assert(0 == (alignment & (alignment - 1)));
  // Widen before inverting: ~(alignment - 1) evaluated in 32 bits would be
  // zero-extended and wipe out the upper half of `size`.
  const uint64_t mask = static_cast<uint64_t>(alignment) - 1;
  return (size + mask) & ~mask;
}
// Apply integer sign extension
inline uint32_t signExt(uint32_t w, uint32_t bit, uint32_t mask) {
if (w >> (bit - 1))
w |= ~mask;
return w;
// Sign-extends the low `width` bits of `word` to 32 bits: when bit
// (width - 1) is set, every bit from `width` upwards is set as well;
// otherwise `word` is returned unchanged.
// `width` must be in 2..32.
inline uint32_t sext32(uint32_t word, uint32_t width) {
  assert(width > 1);
  assert(width <= 32);
  // Build the mask in 64 bits: `1 << width` on an int is undefined for
  // width == 32 and overflows for width == 31.
  const uint32_t mask = static_cast<uint32_t>((uint64_t(1) << width) - 1);
  const bool negative = ((word >> (width - 1)) & 0x1) != 0;
  return negative ? (word | ~mask) : word;
}
// return file extension

View File

@@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
TOP = vx_cache_sim
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += args.cpp pipeline.cpp warp.cpp core.cpp decode.cpp execute.cpp main.cpp
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp main.cpp
OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
VPATH := $(sort $(dir $(SRCS)))

View File

@@ -10,20 +10,30 @@
namespace vortex {
class ArchDef {
private:
int num_cores_;
int num_warps_;
int num_threads_;
int wsize_;
int vsize_;
int num_regs_;
int num_csrs_;
int num_barriers_;
public:
ArchDef(const std::string& /*arch*/,
int num_cores,
int num_warps,
int num_threads) {
wsize_ = 4;
vsize_ = 16;
num_regs_ = 32;
num_csrs_ = 4096;
num_barriers_= NUM_BARRIERS;
num_cores_ = num_cores;
num_warps_ = num_warps;
num_threads_ = num_threads;
}
int num_threads)
: num_cores_(num_cores)
, num_warps_(num_warps)
, num_threads_(num_threads)
, wsize_(4)
, vsize_(16)
, num_regs_(32)
, num_csrs_(4096)
, num_barriers_(NUM_BARRIERS)
{}
int wsize() const {
return wsize_;
@@ -56,17 +66,6 @@ public:
int num_cores() const {
return num_cores_;
}
private:
int wsize_;
int vsize_;
int num_regs_;
int num_csrs_;
int num_barriers_;
int num_threads_;
int num_warps_;
int num_cores_;
};
}

497
sim/simX/cache.cpp Normal file
View File

@@ -0,0 +1,497 @@
#include "cache.h"
#include "debug.h"
#include <util.h>
#include <unordered_map>
#include <vector>
#include <list>
#include <queue>
using namespace vortex;
// Geometry derived from a CacheConfig: per-bank set/way/word counts and the
// bit positions of the word, bank, set and tag fields within an address.
// Fields are laid out from low to high bits as
//   [word size W][word select][bank select][set select][tag].
// A field of width zero ends one bit below its start; the accessors detect
// this with `end >= start`.
struct params_t {
  uint32_t sets_per_bank;
  uint32_t blocks_per_set;
  uint32_t words_per_block;

  uint32_t word_select_addr_start;
  uint32_t word_select_addr_end;

  uint32_t bank_select_addr_start;
  uint32_t bank_select_addr_end;

  uint32_t set_select_addr_start;
  uint32_t set_select_addr_end;

  uint32_t tag_select_addr_start;
  uint32_t tag_select_addr_end;

  params_t(const CacheConfig& config) {
    uint32_t bank_bits      = log2ceil(config.num_banks);
    uint32_t offset_bits    = config.B - config.W;
    uint32_t log2_bank_size = config.C - bank_bits;
    // log2 of a way's footprint in a set: block size times associativity.
    // Fixed: this used `B << A`, which only equals `B + A` when A is 0.
    uint32_t log2_set_size  = uint32_t(config.B) + uint32_t(config.A);
    assert(log2_bank_size >= log2_set_size);
    uint32_t index_bits     = log2_bank_size - log2_set_size;

    this->words_per_block = 1 << offset_bits;
    this->blocks_per_set  = 1 << config.A;
    this->sets_per_bank   = 1 << index_bits;

    assert(config.ports_per_bank <= this->words_per_block);

    // Word select
    this->word_select_addr_start = config.W;
    this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);

    // Bank select
    this->bank_select_addr_start = (1+this->word_select_addr_end);
    this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);

    // Set select
    this->set_select_addr_start = (1+this->bank_select_addr_end);
    this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);

    // Tag select
    this->tag_select_addr_start = (1+this->set_select_addr_end);
    this->tag_select_addr_end = (config.addr_width-1);
  }

  // Bank field of `word_addr`, or 0 when the bank field is empty.
  uint32_t addr_bank_id(uint64_t word_addr) const {
    if (bank_select_addr_end >= bank_select_addr_start)
      return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
    else
      return 0;
  }

  // Set field of `word_addr`, or 0 when the set field is empty.
  uint32_t addr_set_id(uint64_t word_addr) const {
    if (set_select_addr_end >= set_select_addr_start)
      return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
    else
      return 0;
  }

  // Tag field of `word_addr`, or 0 when the tag field is empty.
  uint64_t addr_tag(uint64_t word_addr) const {
    if (tag_select_addr_end >= tag_select_addr_start)
      return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
    else
      return 0;
  }

  // Rebuilds an address from its bank, set and tag fields; the bits below the
  // bank field are left at zero.
  uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
    uint64_t addr(0);
    if (bank_select_addr_end >= bank_select_addr_start)
      addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
    if (set_select_addr_end >= set_select_addr_start)
      addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
    if (tag_select_addr_end >= tag_select_addr_start)
      addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
    return addr;
  }
};
// State kept for one cache block (one way of a set).
struct block_t {
bool valid;        // block holds a line
bool dirty;        // set on a write hit when the cache is not write-through
uint64_t tag;      // tag of the line held by the block
uint32_t lru_ctr;  // reset to 0 on a hit, incremented on a lookup that does not hit this block
};
// One cache set: `size` value-initialized blocks.
struct set_t {
std::vector<block_t> blocks;
set_t(uint32_t size) : blocks(size) {}
};
// Per-port bookkeeping attached to a bank request.
struct bank_req_info_t {
  bool     valid;    // slot is in use
  uint32_t req_id;   // index of the cache input the request came from
  uint32_t req_tag;  // tag supplied with the request, echoed in the response
};

// A request as seen by one bank: the targeted (set, tag) plus one info slot
// per bank port.
struct bank_req_t {
  bool     valid;
  bool     write;
  bool     mshr_replay;   // request is being replayed out of the MSHR
  uint64_t tag;
  uint32_t set_id;
  std::vector<bank_req_info_t> infos;

  // size: number of info slots (all start out cleared).
  bank_req_t(uint32_t size)
    : valid(false)
    , write(false)
    , mshr_replay(false)
    , tag(0)
    , set_id(0)
    , infos(size)
  {}
};

// A bank request parked in the MSHR together with the block it will fill.
struct mshr_entry_t : public bank_req_t {
  uint32_t block_id;

  mshr_entry_t(uint32_t size = 0)
    : bank_req_t(size)
    , block_id(0)
  {}
};

// Miss status holding registers: a fixed pool of entries for requests that
// are waiting on a memory fill.
class MSHR {
private:
  std::vector<mshr_entry_t> entries_;
  uint32_t capacity_;   // number of entries currently in use

  // True when both requests target the same set and tag.
  static bool same_line(const bank_req_t& lhs, const bank_req_t& rhs) {
    return lhs.set_id == rhs.set_id && lhs.tag == rhs.tag;
  }

public:
  MSHR(uint32_t size)
    : entries_(size)
    , capacity_(0)
  {}

  // No entry is in use.
  bool empty() const {
    return (capacity_ == 0);
  }

  // Every entry is in use.
  bool full() const {
    return (entries_.size() == capacity_);
  }

  // Index of the first in-use entry for the same (set, tag) as `bank_req`,
  // or -1 when there is none.
  int lookup(const bank_req_t& bank_req) {
    int index = 0;
    for (auto& entry : entries_) {
      if (entry.valid && same_line(entry, bank_req))
        return index;
      ++index;
    }
    return -1;
  }

  // Copies `bank_req` into the first free entry and tags it with `block_id`.
  // Returns the entry index, or -1 when no entry is free.
  int allocate(const bank_req_t& bank_req, uint32_t block_id) {
    const uint32_t count = entries_.size();
    for (uint32_t slot = 0; slot != count; ++slot) {
      mshr_entry_t& entry = entries_[slot];
      if (entry.valid)
        continue;
      // overwrite only the bank_req_t part of the entry
      static_cast<bank_req_t&>(entry) = bank_req;
      entry.valid       = true;
      entry.mshr_replay = false;
      entry.block_id    = block_id;
      ++capacity_;
      return slot;
    }
    return -1;
  }

  // Flags entry `id` and every other in-use entry for the same (set, tag) as
  // ready for replay. Returns entry `id`.
  mshr_entry_t& replay(uint32_t id) {
    mshr_entry_t& root_entry = entries_.at(id);
    assert(root_entry.valid);
    for (auto& entry : entries_) {
      if (!entry.valid || !same_line(entry, root_entry))
        continue;
      entry.mshr_replay = true;
    }
    return root_entry;
  }

  // Removes the first entry flagged for replay and copies its request part
  // into `*out`. Returns false when no entry is flagged.
  bool try_pop(bank_req_t* out) {
    for (auto& entry : entries_) {
      if (!(entry.valid && entry.mshr_replay))
        continue;
      *out = entry;
      entry.valid = false;
      --capacity_;
      return true;
    }
    return false;
  }
};
// State of one cache bank: its sets, its MSHR, the queue of requests that
// could not be accepted yet, and the request currently being processed.
struct bank_t {
  std::vector<set_t>     sets;
  MSHR                   mshr;
  std::queue<bank_req_t> stall_buffer;
  bank_req_t             active_req;

  // Sizes the bank from the derived geometry (sets, ways) and the
  // configuration (MSHR entries, ports per bank).
  bank_t(const CacheConfig& config, const params_t& params)
    : sets(params.sets_per_bank, set_t(params.blocks_per_set))
    , mshr(config.mshr_size)
    , stall_buffer()
    , active_req(config.ports_per_bank)
  {}
};
///////////////////////////////////////////////////////////////////////////////
// Timing model behind Cache. Requests and memory responses are latched by the
// port callbacks and consumed by step(), which runs once per cycle.
class Cache::Impl {
private:
  Cache* const simobject_;
  CacheConfig config_;
  params_t params_;
  std::vector<bank_t> banks_;
  std::vector<std::pair<bool, MemReq>> core_reqs_;   // per input: (pending flag, latched request)
  std::pair<bool, MemRsp> mem_rsp_;                  // (pending flag, latched memory response)
  std::vector<std::queue<uint32_t>> core_rsps_;      // per input: tags waiting to be sent back

  // Queues a core response for every port slot of `req` that is in use.
  // Unused slots are skipped: they are value-initialized (req_id 0, tag 0) and
  // would otherwise produce spurious responses on input 0.
  void sendCoreResponses(const bank_req_t& req) {
    for (auto& info : req.infos) {
      if (!info.valid)
        continue;
      core_rsps_.at(info.req_id).emplace(info.req_tag);
    }
  }

public:
  Impl(Cache* simobject, const CacheConfig& config)
    : simobject_(simobject)
    , config_(config)
    , params_(config)
    , banks_(config.num_banks, {config, params_})
    , core_reqs_(config.num_inputs)
    , core_rsps_(config.num_inputs)
  {}

  // Port callback: latches a memory response for the next step().
  void handleMemResponse(const MemRsp& response, uint32_t) {
    mem_rsp_ = {true, response};
  }

  // Port callback: latches a core request in the slot selected by `port_id`.
  void handleCoreRequest(const MemReq& request, uint32_t port_id) {
    core_reqs_.at(port_id) = {true, request};
  }

  // Runs one cache cycle.
  void step(uint64_t /*cycle*/) {
    // process core response: at most one response per input per cycle
    for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
      auto& core_rsp = core_rsps_.at(req_id);
      if (!core_rsp.empty()) {
        simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_rsp.front()}, config_.latency);
        core_rsp.pop();
      }
    }

    for (auto& bank : banks_) {
      auto& active_req = bank.active_req;

      // try schedule mshr replay
      if (!active_req.valid) {
        bank.mshr.try_pop(&active_req);
      }

      // try schedule stall replay
      if (!active_req.valid
       && !bank.stall_buffer.empty()) {
        active_req = bank.stall_buffer.front();
        bank.stall_buffer.pop();
      }
    }

    // handle memory fills
    if (mem_rsp_.first) {
      mem_rsp_.first = false;
      // the fill tag carries the bank id in bits 0-15 and the MSHR id in bits 16-31
      auto bank_id = bit_getw(mem_rsp_.second.tag, 0, 15);
      auto mshr_id = bit_getw(mem_rsp_.second.tag, 16, 31);
      this->processMemoryFill(bank_id, mshr_id);
    }

    // handle incoming core requests
    for (uint32_t i = 0, n = core_reqs_.size(); i < n; ++i) {
      auto& entry = core_reqs_.at(i);
      if (!entry.first)
        continue;

      entry.first = false;

      auto& core_req = entry.second;
      auto bank_id = params_.addr_bank_id(core_req.addr);
      auto set_id  = params_.addr_set_id(core_req.addr);
      auto tag     = params_.addr_tag(core_req.addr);
      auto port_id = i % config_.ports_per_bank;

      // create bank request
      bank_req_t bank_req(config_.ports_per_bank);
      bank_req.valid = true;
      bank_req.write = core_req.write;
      bank_req.mshr_replay = false;
      bank_req.tag = tag;
      bank_req.set_id = set_id;
      bank_req.infos.at(port_id) = {true, i, core_req.tag};

      auto& bank = banks_.at(bank_id);

      // check MSHR capacity
      if (bank.mshr.full()) {
        // add to stall buffer
        bank.stall_buffer.emplace(bank_req);
        continue;
      }

      auto& active_req = bank.active_req;

      // check pending MSHR request
      if (active_req.valid
       && active_req.mshr_replay) {
        // add to stall buffer
        bank.stall_buffer.emplace(bank_req);
        continue;
      }

      // check bank conflicts
      if (active_req.valid) {
        // check port conflict: only requests of the same kind to the same
        // line on a free port can share the active request
        if (active_req.write != core_req.write
         || active_req.set_id != set_id
         || active_req.tag != tag
         || active_req.infos[port_id].valid) {
          // add to stall buffer
          bank.stall_buffer.emplace(bank_req);
          continue;
        }
        // update pending request infos
        active_req.infos[port_id] = bank_req.infos[port_id];
      } else {
        // schedule new request
        active_req = bank_req;
      }
    }

    // process active request
    for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
      this->processBankRequest(bank_id);
    }
  }

  // Completes a fill: flags the matching MSHR entries for replay and installs
  // the line in the block recorded when the entry was allocated.
  void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
    // update block
    auto& bank = banks_.at(bank_id);
    auto& root_entry = bank.mshr.replay(mshr_id);
    auto& set   = bank.sets.at(root_entry.set_id);
    auto& block = set.blocks.at(root_entry.block_id);
    // NOTE(review): dirty and lru_ctr keep the values of the replaced line;
    // confirm whether they should be reset here.
    block.valid = true;
    block.tag   = root_entry.tag;
  }

  // Consumes the bank's active request, if any: answers replays and hits,
  // and starts miss handling otherwise.
  void processBankRequest(uint32_t bank_id) {
    auto& bank = banks_.at(bank_id);
    auto& active_req = bank.active_req;
    if (!active_req.valid)
      return;

    active_req.valid = false;

    auto& set = bank.sets.at(active_req.set_id);

    if (active_req.mshr_replay) {
      // the fill has completed: send core response
      this->sendCoreResponses(active_req);
    } else {
      bool hit = false;
      bool found_free_block = false;
      int hit_block_id = 0;
      int repl_block_id = 0;
      uint32_t max_cnt = 0;

      // tag lookup, LRU update and replacement candidate selection
      for (int i = 0, n = set.blocks.size(); i < n; ++i) {
        auto& block = set.blocks.at(i);
        if (!block.valid) {
          // a free block always wins over evicting a valid one
          if (!found_free_block) {
            found_free_block = true;
            repl_block_id = i;
          }
          continue;
        }
        if (block.tag == active_req.tag) {
          block.lru_ctr = 0;
          hit_block_id = i;
          hit = true;
        } else {
          ++block.lru_ctr;
        }
        // Fixed: an LRU candidate found after a free block used to override
        // it, evicting a valid line (without write-back) while a free way
        // existed.
        if (!found_free_block && max_cnt < block.lru_ctr) {
          max_cnt = block.lru_ctr;
          repl_block_id = i;
        }
      }

      if (hit) {
        //
        // HIT handling
        //
        if (active_req.write) {
          // handle write hit
          auto& hit_block = set.blocks.at(hit_block_id);
          if (config_.write_through) {
            // forward write request to memory
            MemReq mem_req;
            mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag);
            mem_req.write = true;
            mem_req.tag   = 0;
            simobject_->MemReqPort.send(mem_req, 1);
          } else {
            // mark block as dirty
            hit_block.dirty = true;
          }
        }
        // send core response
        this->sendCoreResponses(active_req);
      } else {
        //
        // MISS handling
        //
        if (!found_free_block && !config_.write_through) {
          // write back dirty block
          auto& repl_block = set.blocks.at(repl_block_id);
          if (repl_block.dirty) {
            MemReq mem_req;
            mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag);
            mem_req.write = true;
            mem_req.tag   = 0;   // was left uninitialized; other writes use 0
            simobject_->MemReqPort.send(mem_req, 1);
          }
        }

        if (active_req.write && config_.write_through) {
          // forward write request to memory
          {
            MemReq mem_req;
            mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
            mem_req.write = true;
            mem_req.tag   = 0;
            simobject_->MemReqPort.send(mem_req, 1);
          }
          // send core response
          this->sendCoreResponses(active_req);
        } else {
          // lookup: is a fill for this line already in flight?
          int pending = bank.mshr.lookup(active_req);

          // allocate MSHR
          // NOTE(review): allocate() returns -1 when the MSHR is full; requests
          // replayed from the stall buffer reach this point without a
          // capacity check - confirm that cannot happen.
          int mshr_id = bank.mshr.allocate(active_req, repl_block_id);

          // send fill request only for the first miss on the line
          if (pending == -1) {
            MemReq mem_req;
            mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
            mem_req.write = active_req.write;
            mem_req.tag   = bit_setw(0, 0, 15, bank_id);
            mem_req.tag   = bit_setw(mem_req.tag, 16, 31, mshr_id);
            simobject_->MemReqPort.send(mem_req, 1);
          }
        }
      }
    }
  }
};
///////////////////////////////////////////////////////////////////////////////
// Builds the cache model and its ports.
//
// The port vectors are filled element by element instead of with
// vector(n, prototype): a port obtains its id and registers its address with
// the module in its constructor, so copying one prototype n times gave every
// element the same id and left the module holding a pointer to the destroyed
// temporary. The core request ports are created first so that their ids are
// 0..num_inputs-1, which Impl::handleCoreRequest uses as the input index.
// reserve() keeps the element addresses stable while the vector is filled.
Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config)
  : SimObject<Cache>(ctx, name)
  , impl_(new Impl(this, config))
  , CoreReqPorts([this, &config]() -> std::vector<SlavePort<MemReq>> {
      std::vector<SlavePort<MemReq>> ports;
      ports.reserve(config.num_inputs);
      for (uint32_t i = 0; i < config.num_inputs; ++i) {
        ports.emplace_back(this, impl_, &Cache::Impl::handleCoreRequest);
      }
      return ports;
    }())
  , CoreRspPorts([this, &config]() -> std::vector<MasterPort<MemRsp>> {
      std::vector<MasterPort<MemRsp>> ports;
      ports.reserve(config.num_inputs);
      for (uint32_t i = 0; i < config.num_inputs; ++i) {
        ports.emplace_back(this);
      }
      return ports;
    }())
  , MemReqPort(this)
  , MemRspPort(this, impl_, &Impl::handleMemResponse)
{}
// Releases the implementation object allocated in the constructor.
Cache::~Cache() {
delete impl_;
}
// Per-cycle tick: delegates to the implementation object.
void Cache::step(uint64_t cycle) {
impl_->step(cycle);
}

40
sim/simX/cache.h Normal file
View File

@@ -0,0 +1,40 @@
#pragma once
#include <simobject.h>
#include "memsim.h"
namespace vortex {
// Static configuration of a Cache instance. C, B, W and A are log2 values;
// the remaining fields are plain counts or flags.
struct CacheConfig {
uint8_t C; // log2 cache size
uint8_t B; // log2 block size
uint8_t W; // log2 word size
uint8_t A; // log2 associativity
uint8_t addr_width; // word address bits
uint8_t num_banks; // number of banks
uint8_t ports_per_bank; // number of ports per bank
uint8_t num_inputs; // number of inputs
bool write_through; // is write-through cache
uint16_t victim_size; // victim cache size
uint16_t mshr_size; // MSHR buffer size
uint8_t latency; // pipeline latency
};
// Cache simulation component. The model itself lives in the private Impl
// class (pimpl); this class exposes the ports other components connect to.
class Cache : public SimObject<Cache> {
private:
  class Impl;
  Impl* impl_;   // owned; released in the destructor

public:
  Cache(const SimContext& ctx, const char* name, const CacheConfig& config);
  ~Cache();

  // Not copyable: impl_ is an owning raw pointer, so a copy would delete the
  // same Impl twice.
  Cache(const Cache&) = delete;
  Cache& operator=(const Cache&) = delete;

  // Advances the cache by one cycle.
  void step(uint64_t cycle);

  std::vector<SlavePort<MemReq>>  CoreReqPorts;   // one request port per input
  std::vector<MasterPort<MemRsp>> CoreRspPorts;   // one response port per input
  MasterPort<MemReq>              MemReqPort;     // requests towards memory
  SlavePort<MemRsp>               MemRspPort;     // responses from memory
};
}

21
sim/simX/constants.h Normal file
View File

@@ -0,0 +1,21 @@
#pragma once
#include "types.h"
#ifndef MEM_LATENCY
#define MEM_LATENCY 18
#endif
namespace vortex {
// Compile-time delay constants used by the timing model.
// NOTE(review): the unit is presumably simulation cycles - confirm against
// the call sites.
struct Constants {
// SM_ENABLE is a macro defined outside this file; it is added to the base delay of 1.
static constexpr uint32_t CORE_TO_DCACHE_DELAY = 1 + SM_ENABLE;
static constexpr uint32_t CORE_TO_ICACHE_DELAY = 1;
static constexpr uint32_t ICACHE_TO_MEM_DELAY = 2;
static constexpr uint32_t DCACHE_TO_MEM_DELAY = 2;
};
}

View File

@@ -12,34 +12,92 @@
using namespace vortex;
Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
: id_(id)
Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
: SimObject(ctx, "Core")
, id_(id)
, arch_(arch)
, decoder_(decoder)
, mem_(mem)
, shared_mem_(1, SMEM_SIZE)
, inst_in_schedule_("schedule")
, inst_in_fetch_("fetch")
, inst_in_decode_("decode")
, inst_in_issue_("issue")
, inst_in_execute_("execute")
, inst_in_writeback_("writeback") {
in_use_iregs_.resize(arch.num_warps(), 0);
in_use_fregs_.resize(arch.num_warps(), 0);
in_use_vregs_.reset();
csrs_.resize(arch_.num_csrs(), 0);
fcsrs_.resize(arch_.num_warps(), 0);
barriers_.resize(arch_.num_barriers(), 0);
warps_.resize(arch_.num_warps());
, warps_(arch.num_warps())
, barriers_(arch.num_barriers(), 0)
, csrs_(arch.num_csrs(), 0)
, fcsrs_(arch.num_warps(), 0)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, exe_units_((int)ExeType::MAX)
, icache_(Cache::Create("Icache", CacheConfig{
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_BLOCK_SIZE),// B
2, // W
0, // A
32, // address bits
1, // number of banks
1, // number of ports
1, // request size
true, // write-throught
0, // victim size
NUM_WARPS, // mshr
2, // pipeline latency
}))
, dcache_(Cache::Create("Dcache", CacheConfig{
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_BLOCK_SIZE),// B
2, // W
0, // A
32, // address bits
DCACHE_NUM_BANKS, // number of banks
DCACHE_NUM_PORTS, // number of ports
(uint8_t)arch.num_threads(), // request size
true, // write-throught
0, // victim size
DCACHE_MSHR_SIZE, // mshr
2, // pipeline latency
}))
, l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
, icache_rsp_port_(this, this, &Core::icache_handleCacheReponse)
, dcache_rsp_port_(arch.num_threads(), {this, reinterpret_cast<LsuUnit*>(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse})
, fetch_stage_("fetch")
, decode_stage_("decode")
, issue_stage_("issue")
, execute_stage_("execute")
, commit_stage_("writeback")
, pending_icache_(arch_.num_warps())
, stalled_warps_(0)
, last_schedule_wid_(0)
, pending_instrs_(0)
, ebreak_(false)
, stats_insts_(0)
, stats_loads_(0)
, stats_stores_(0)
, MemRspPort(this, &l1_mem_switch_->RspIn)
, MemReqPort(this, &l1_mem_switch_->ReqOut)
{
for (int i = 0; i < arch_.num_warps(); ++i) {
warps_[i] = std::make_shared<Warp>(this, i);
warps_.at(i) = std::make_shared<Warp>(this, i);
}
this->clear();
// register execute units
exe_units_.at((int)ExeType::ALU) = std::make_shared<AluUnit>(this);
exe_units_.at((int)ExeType::LSU) = std::make_shared<LsuUnit>(this);
exe_units_.at((int)ExeType::CSR) = std::make_shared<CsrUnit>(this);
exe_units_.at((int)ExeType::FPU) = std::make_shared<FpuUnit>(this);
exe_units_.at((int)ExeType::GPU) = std::make_shared<GpuUnit>(this);
// connect l1 caches
icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_);
for (int i = 0; i < arch_.num_threads(); ++i) {
dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i));
}
// connect l1 switch
icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]);
dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]);
l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort);
l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort);
// activate warp0
warps_.at(0)->setTmask(0, true);
}
Core::~Core() {
@@ -51,79 +109,41 @@ Core::~Core() {
}
}
void Core::clear() {
for (int w = 0; w < arch_.num_warps(); ++w) {
in_use_iregs_[w].reset();
in_use_fregs_[w].reset();
}
stalled_warps_.reset();
in_use_vregs_.reset();
for (auto& csr : csrs_) {
csr = 0;
void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) {
// advance to decode stage
uint32_t wid = response.tag;
pipeline_state_t state;
pending_icache_.remove(wid, &state);
auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
state.icache_latency = latency;
decode_stage_.push(state);
}
for (auto& fcsr : fcsrs_) {
fcsr = 0;
}
for (auto& barrier : barriers_) {
barrier.reset();
}
for (auto warp : warps_) {
warp->clear();
}
inst_in_schedule_.clear();
inst_in_fetch_.clear();
inst_in_decode_.clear();
inst_in_issue_.clear();
inst_in_execute_.clear();
inst_in_writeback_.clear();
print_bufs_.clear();
steps_ = 0;
insts_ = 0;
loads_ = 0;
stores_ = 0;
inst_in_schedule_.valid = true;
warps_[0]->setTmask(0, true);
ebreak_ = false;
}
void Core::step() {
void Core::step(uint64_t cycle) {
__unused (cycle);
D(2, "###########################################################");
D(2, std::dec << "Core" << id_ << ": cycle: " << cycle);
steps_++;
D(2, std::dec << "Core" << id_ << ": cycle: " << steps_);
this->writeback();
this->commit();
this->execute();
this->issue();
this->decode();
this->fetch();
this->schedule();
DPN(2, std::flush);
}
void Core::schedule() {
if (!inst_in_schedule_.enter(&inst_in_fetch_))
return;
void Core::warp_scheduler() {
bool foundSchedule = false;
int scheduled_warp = inst_in_schedule_.wid;
int scheduled_warp = last_schedule_wid_;
for (size_t wid = 0; wid < warps_.size(); ++wid) {
// round robin scheduling
for (size_t wid = 0; wid < warps_.size(); ++wid) {
scheduled_warp = (scheduled_warp + 1) % warps_.size();
bool is_active = warps_[scheduled_warp]->active();
bool stalled = stalled_warps_[scheduled_warp];
if (is_active && !stalled) {
bool warp_active = warps_.at(scheduled_warp)->active();
bool warp_stalled = stalled_warps_.test(scheduled_warp);
if (warp_active && !warp_stalled) {
last_schedule_wid_ = scheduled_warp;
foundSchedule = true;
break;
}
@@ -132,113 +152,121 @@ void Core::schedule() {
if (!foundSchedule)
return;
D(2, "Schedule: wid=" << scheduled_warp);
inst_in_schedule_.wid = scheduled_warp;
// suspend warp until decode
stalled_warps_.set(scheduled_warp);
// advance pipeline
inst_in_schedule_.next(&inst_in_fetch_);
auto& warp = warps_.at(scheduled_warp);
stats_insts_ += warp->getActiveThreads();
pipeline_state_t state;
warp->eval(&state);
D(4, state);
// advance to fetch stage
++pending_instrs_;
fetch_stage_.push(state);
}
void Core::fetch() {
if (!inst_in_fetch_.enter(&inst_in_issue_))
return;
int wid = inst_in_fetch_.wid;
auto active_threads_b = warps_[wid]->getActiveThreads();
warps_[wid]->step(&inst_in_fetch_);
auto active_threads_a = warps_[wid]->getActiveThreads();
insts_ += active_threads_b;
if (active_threads_b != active_threads_a) {
D(3, "*** warp#" << wid << " active threads changed to " << active_threads_a);
// schedule icache request
pipeline_state_t state;
if (fetch_stage_.try_pop(&state)) {
state.icache_latency = SimPlatform::instance().cycles();
MemReq mem_req;
mem_req.addr = state.PC;
mem_req.write = false;
mem_req.tag = pending_icache_.allocate(state);
icache_->CoreReqPorts.at(0).send(mem_req, 1);
}
if (inst_in_fetch_.stall_warp) {
D(3, "*** warp#" << wid << " fetch stalled");
stalled_warps_[wid] = true;
}
D(4, inst_in_fetch_);
// advance pipeline
inst_in_fetch_.next(&inst_in_issue_);
// schedule next warp
this->warp_scheduler();
}
void Core::decode() {
if (!inst_in_decode_.enter(&inst_in_issue_))
pipeline_state_t state;
if (!decode_stage_.try_pop(&state))
return;
// advance pipeline
inst_in_decode_.next(&inst_in_issue_);
if (state.stall_warp) {
D(3, "*** warp#" << state.wid << " fetch stalled");
} else {
// release warp
stalled_warps_.reset(state.wid);
}
// advance to issue stage
issue_stage_.push(state);
}
void Core::issue() {
if (!inst_in_issue_.enter(&inst_in_execute_))
return;
bool in_use_regs = (inst_in_issue_.used_iregs & in_use_iregs_[inst_in_issue_.wid]) != 0
|| (inst_in_issue_.used_fregs & in_use_fregs_[inst_in_issue_.wid]) != 0
|| (inst_in_issue_.used_vregs & in_use_vregs_) != 0;
if (in_use_regs) {
D(3, "*** Issue: registers not ready!");
inst_in_issue_.stalled = true;
return;
if (!issue_stage_.empty()) {
// insert to ibuffer
auto& state = issue_stage_.top();
auto& ibuffer = ibuffers_.at(state.wid);
if (!ibuffer.full()) {
ibuffer.push(state);
issue_stage_.pop();
}
}
switch (inst_in_issue_.rdest_type) {
case 1:
if (inst_in_issue_.rdest)
in_use_iregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1;
break;
case 2:
in_use_fregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1;
break;
case 3:
in_use_vregs_[inst_in_issue_.rdest] = 1;
break;
default:
// issue ibuffer instructions
for (auto& ibuffer : ibuffers_) {
if (ibuffer.empty())
continue;
auto& state = ibuffer.top();
// check scoreboard
if (scoreboard_.in_use(state))
continue;
// update scoreboard
scoreboard_.reserve(state);
// advance to execute stage
execute_stage_.push(state);
ibuffer.pop();
break;
}
// advance pipeline
inst_in_issue_.next(&inst_in_execute_);
}
void Core::execute() {
if (!inst_in_execute_.enter(&inst_in_writeback_))
// process stage inputs
if (!execute_stage_.empty()) {
auto& state = execute_stage_.top();
auto& exe_unit = exe_units_.at((int)state.exe_type);
exe_unit->push_input(state);
execute_stage_.pop();
}
// advance execute units
for (auto& exe_unit : exe_units_) {
exe_unit->step();
}
// commit completed instructions
for (auto& exe_unit : exe_units_) {
pipeline_state_t state;
if (exe_unit->pop_output(&state)) {
if (state.stall_warp) {
stalled_warps_.reset(state.wid);
}
// advance to commit stage
commit_stage_.push(state);
}
}
}
void Core::commit() {
pipeline_state_t state;
if (!commit_stage_.try_pop(&state))
return;
// advance pipeline
inst_in_execute_.next(&inst_in_writeback_);
}
void Core::writeback() {
if (!inst_in_writeback_.enter(NULL))
return;
switch (inst_in_writeback_.rdest_type) {
case 1:
in_use_iregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0;
break;
case 2:
in_use_fregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0;
break;
case 3:
in_use_vregs_[inst_in_writeback_.rdest] = 0;
break;
default:
break;
}
if (inst_in_writeback_.stall_warp) {
stalled_warps_[inst_in_writeback_.wid] = false;
D(3, "*** warp#" << inst_in_writeback_.wid << " fetch released");
}
// advance pipeline
inst_in_writeback_.next(NULL);
// update scoreboard
scoreboard_.release(state);
}
Word Core::get_csr(Addr addr, int tid, int wid) {
@@ -281,16 +309,16 @@ Word Core::get_csr(Addr addr, int tid, int wid) {
return arch_.num_cores();
} else if (addr == CSR_MINSTRET) {
// NumInsts
return insts_;
return stats_insts_;
} else if (addr == CSR_MINSTRET_H) {
// NumInsts
return (Word)(insts_ >> 32);
return (Word)(stats_insts_ >> 32);
} else if (addr == CSR_MCYCLE) {
// NumCycles
return (Word)steps_;
return (Word)SimPlatform::instance().cycles();
} else if (addr == CSR_MCYCLE_H) {
// NumCycles
return (Word)(steps_ >> 32);
return (Word)(SimPlatform::instance().cycles() >> 32);
} else {
return csrs_.at(addr);
}
@@ -328,7 +356,7 @@ Word Core::icache_fetch(Addr addr) {
}
Word Core::dcache_read(Addr addr, Size size) {
++loads_;
++stats_loads_;
Word data = 0;
#ifdef SM_ENABLE
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
@@ -342,7 +370,7 @@ Word Core::dcache_read(Addr addr, Size size) {
}
void Core::dcache_write(Addr addr, Word data, Size size) {
++stores_;
++stats_stores_;
#ifdef SM_ENABLE
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
&& ((addr + 3) < SMEM_BASE_ADDR)) {
@@ -359,23 +387,19 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
}
bool Core::running() const {
return inst_in_fetch_.valid
|| inst_in_decode_.valid
|| inst_in_issue_.valid
|| inst_in_execute_.valid
|| inst_in_writeback_.valid;
return pending_instrs_;
}
void Core::printStats() const {
std::cout << "Steps : " << steps_ << std::endl
<< "Insts : " << insts_ << std::endl
<< "Loads : " << loads_ << std::endl
<< "Stores: " << stores_ << std::endl;
std::cout << "Cycles: " << SimPlatform::instance().cycles() << std::endl
<< "Insts : " << stats_insts_ << std::endl
<< "Loads : " << stats_loads_ << std::endl
<< "Stores: " << stats_stores_ << std::endl;
}
void Core::writeToStdOut(Addr addr, Word data) {
uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
auto& ss_buf = print_bufs_[tid];
auto& ss_buf = print_bufs_.at(tid);
char c = (char)data;
ss_buf << c;
if (c == '\n') {

View File

@@ -4,10 +4,11 @@
#include <vector>
#include <list>
#include <stack>
#include <queue>
#include <unordered_map>
#include <memory>
#include <set>
#include <simobject.h>
#include "debug.h"
#include "types.h"
#include "archdef.h"
@@ -15,20 +16,21 @@
#include "mem.h"
#include "warp.h"
#include "pipeline.h"
#include "cache.h"
#include "ibuffer.h"
#include "scoreboard.h"
#include "exeunit.h"
namespace vortex {
class Core {
class Core : public SimObject<Core> {
public:
Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id);
Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id);
~Core();
void clear();
bool running() const;
void step();
void step(uint64_t cycle);
void printStats() const;
@@ -40,7 +42,7 @@ public:
return *warps_.at(i);
}
Decoder& decoder() {
const Decoder& decoder() {
return decoder_;
}
@@ -48,16 +50,12 @@ public:
return arch_;
}
unsigned long num_insts() const {
return insts_;
}
unsigned long num_steps() const {
return steps_;
unsigned long stats_insts() const {
return stats_insts_;
}
Word getIRegValue(int reg) const {
return warps_[0]->getIRegValue(reg);
return warps_.at(0)->getIRegValue(reg);
}
Word get_csr(Addr addr, int tid, int wid);
@@ -73,50 +71,66 @@ public:
void dcache_write(Addr, Word, Size);
void trigger_ebreak();
bool check_ebreak() const;
private:
void schedule();
void fetch();
void decode();
void issue();
void execute();
void writeback();
void commit();
void warp_scheduler();
void icache_handleCacheReponse(const MemRsp& response, uint32_t port_id);
void writeToStdOut(Addr addr, Word data);
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
RegMask in_use_vregs_;
WarpMask stalled_warps_;
std::vector<std::shared_ptr<Warp>> warps_;
std::vector<WarpMask> barriers_;
std::vector<Word> csrs_;
std::vector<Byte> fcsrs_;
std::unordered_map<int, std::stringstream> print_bufs_;
Word id_;
const ArchDef& arch_;
Decoder &decoder_;
const Decoder& decoder_;
MemoryUnit& mem_;
#ifdef SM_ENABLE
RAM shared_mem_;
#endif
std::vector<std::shared_ptr<Warp>> warps_;
std::vector<WarpMask> barriers_;
std::vector<Word> csrs_;
std::vector<Byte> fcsrs_;
std::vector<IBuffer> ibuffers_;
Scoreboard scoreboard_;
std::vector<ExeUnit::Ptr> exe_units_;
Cache::Ptr icache_;
Cache::Ptr dcache_;
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
SlavePort<MemRsp> icache_rsp_port_;
std::vector<SlavePort<MemRsp>> dcache_rsp_port_;
PipelineStage fetch_stage_;
PipelineStage decode_stage_;
PipelineStage issue_stage_;
PipelineStage execute_stage_;
PipelineStage commit_stage_;
HashTable<pipeline_state_t> pending_icache_;
WarpMask stalled_warps_;
uint32_t last_schedule_wid_;
uint32_t pending_instrs_;
bool ebreak_;
Pipeline inst_in_schedule_;
Pipeline inst_in_fetch_;
Pipeline inst_in_decode_;
Pipeline inst_in_issue_;
Pipeline inst_in_execute_;
Pipeline inst_in_writeback_;
std::unordered_map<int, std::stringstream> print_bufs_;
uint64_t stats_insts_;
uint64_t stats_loads_;
uint64_t stats_stores_;
uint64_t steps_;
uint64_t insts_;
uint64_t loads_;
uint64_t stores_;
friend class LsuUnit;
public:
SlavePort<MemRsp> MemRspPort;
MasterPort<MemReq> MemReqPort;
};
} // namespace vortex

View File

@@ -281,7 +281,7 @@ Decoder::Decoder(const ArchDef &arch) {
v_imm_mask_ = 0x7ff;
}
std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) {
std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {
auto instr = std::make_shared<Instr>();
Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_);
instr->setOpcode(op);
@@ -351,9 +351,9 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) {
instr->setFunc3(func3);
instr->setFunc7(func7);
if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) {
instr->setImm(signExt(rs2, 5, reg_mask_));
instr->setImm(sext32(rs2, 5));
} else {
instr->setImm(signExt(code >> shift_rs2_, 12, i_imm_mask_));
instr->setImm(sext32(code >> shift_rs2_, 12));
}
} break;
@@ -366,7 +366,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) {
}
instr->setFunc3(func3);
Word imeed = (func7 << reg_s_) | rd;
instr->setImm(signExt(imeed, 12, s_imm_mask_));
instr->setImm(sext32(imeed, 12));
} break;
case InstType::B_TYPE: {
@@ -378,12 +378,12 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) {
Word bit_10_5 = func7 & 0x3f;
Word bit_12 = func7 >> 6;
Word imeed = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12);
instr->setImm(signExt(imeed, 13, b_imm_mask_));
instr->setImm(sext32(imeed, 13));
} break;
case InstType::U_TYPE:
instr->setDestReg(rd);
instr->setImm(signExt(code >> shift_func3_, 20, u_imm_mask_));
instr->setImm(sext32(code >> shift_func3_, 20));
break;
case InstType::J_TYPE: {

View File

@@ -13,7 +13,7 @@ class Decoder {
public:
Decoder(const ArchDef &);
std::shared_ptr<Instr> decode(Word code, Word PC);
std::shared_ptr<Instr> decode(Word code, Word PC) const;
private:

File diff suppressed because it is too large Load Diff

152
sim/simX/exeunit.cpp Normal file
View File

@@ -0,0 +1,152 @@
#include "exeunit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
#include <assert.h>
#include <util.h>
#include "debug.h"
#include "core.h"
using namespace vortex;
// Builds the load/store unit attached to `core`.
// The thread count is read once from the core's architecture definition and the
// table of in-flight dcache requests is sized to LSUQ_SIZE entries.
LsuUnit::LsuUnit(Core* core)
: ExeUnit("LSU")
, core_(core)
, num_threads_(core->arch().num_threads())
, pending_dcache_(LSUQ_SIZE)
, fence_lock_(false) // no fence is pending at construction
{}
// Handles one dcache response belonging to an in-flight LSU request.
//
// response.tag identifies the entry in pending_dcache_; port_id is the
// per-thread cache port the response arrived on. Each entry carries a mask of
// the threads whose responses are still outstanding. Once the mask is empty the
// instruction is forwarded to the unit's output and the table slot is freed.
void LsuUnit::handleCacheReponse(const MemRsp& response, uint32_t port_id) {
  // Bind by reference: the mask update must be applied to the table entry
  // itself. Working on a copy would lose the cleared bit, so any request with
  // more than one active thread would never complete.
  auto& entry = pending_dcache_.at(response.tag);
  entry.second.reset(port_id); // track remaining blocks
  if (entry.second.any())
    return; // still waiting on other threads

  // Convert the stored issue timestamp into the observed latency.
  auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
  entry.first.dcache_latency = latency;
  // schedule_output copies the state, so releasing the slot afterwards is safe.
  this->schedule_output(entry.first, 1);
  pending_dcache_.release(response.tag);
}
// Advances the load/store unit by one call.
// Order of operations:
//  1. If a fence is pending, wait until no dcache request is in flight, then
//     emit the saved fence instruction and drop the lock.
//  2. Look at the oldest input; a fence is latched and removed from the queue.
//  3. Otherwise, if the pending table has room, allocate a tag and send one
//     dcache request per active thread; the input is removed only then, so a
//     full table leaves it queued for a later call.
void LsuUnit::step() {
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_dcache_.empty())
return;
this->schedule_output(fence_state_, 1);
fence_lock_ = false;
}
if (inputs_.empty())
return;
// working copy of the head entry (the queue element itself is not modified)
auto state = inputs_.top();
if (state.lsu.fence) {
// schedule fence lock
fence_state_ = state;
fence_lock_ = true;
inputs_.pop();
return;
}
// send dcache requests
if (!pending_dcache_.full()) {
// record the issue timestamp; the response handler turns it into a latency
state.dcache_latency = SimPlatform::instance().cycles();
// the thread mask stored with the entry tracks which responses are outstanding
auto tag = pending_dcache_.allocate({state, state.tmask});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!state.tmask.test(t))
continue;
MemReq mem_req;
mem_req.addr = state.mem_addrs.at(t);
mem_req.write = state.lsu.store;
mem_req.tag = tag;
// each thread uses its own cache request port
core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
}
inputs_.pop();
}
}
///////////////////////////////////////////////////////////////////////////////
AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
void AluUnit::step() {
pipeline_state_t state;
if (!inputs_.try_pop(&state))
return;
switch (state.alu.type) {
case AluType::ARITH:
this->schedule_output(state, 1);
break;
case AluType::BRANCH:
this->schedule_output(state, 1);
break;
case AluType::IMUL:
this->schedule_output(state, LATENCY_IMUL);
break;
case AluType::IDIV:
this->schedule_output(state, XLEN);
break;
}
}
///////////////////////////////////////////////////////////////////////////////
CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
void CsrUnit::step() {
pipeline_state_t state;
if (!inputs_.try_pop(&state))
return;
this->schedule_output(state, 1);
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
void FpuUnit::step() {
pipeline_state_t state;
if (!inputs_.try_pop(&state))
return;
switch (state.fpu.type) {
case FpuType::FNCP:
this->schedule_output(state, 1);
break;
case FpuType::FMA:
this->schedule_output(state, LATENCY_FMA);
break;
case FpuType::FDIV:
this->schedule_output(state, LATENCY_FDIV);
break;
case FpuType::FSQRT:
this->schedule_output(state, LATENCY_FSQRT);
break;
case FpuType::FCVT:
this->schedule_output(state, LATENCY_FCVT);
break;
}
}
///////////////////////////////////////////////////////////////////////////////
// The GPU unit keeps no per-core state, so the core pointer is ignored.
GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {}
// Consumes at most one queued instruction. Every operation except TEX is
// forwarded to the output with a latency of 1.
void GpuUnit::step() {
pipeline_state_t state;
if (!inputs_.try_pop(&state))
return;
switch (state.gpu.type) {
case GpuType::TMC:
case GpuType::WSPAWN:
case GpuType::SPLIT:
case GpuType::JOIN:
case GpuType::BAR:
this->schedule_output(state, 1);
break;
case GpuType::TEX:
// NOTE(review): the instruction has already been popped and no output is
// scheduled, so a TEX instruction is dropped here and never leaves this unit.
// Presumably downstream bookkeeping would wait for it forever - confirm
// before TEX instructions can reach this stage.
/* TODO */
break;
}
}

103
sim/simX/exeunit.h Normal file
View File

@@ -0,0 +1,103 @@
#pragma once
#include <simobject.h>
#include "pipeline.h"
#include "cache.h"
namespace vortex {
class Core;
// Abstract base of the execution units. Instructions are handed in with
// push_input(), processed by the unit-specific step(), and collected with
// pop_output() once they have completed.
class ExeUnit {
public:
  typedef std::shared_ptr<ExeUnit> Ptr;

  ExeUnit(const char* name) : name_(name) {}

  virtual ~ExeUnit() {}

  // Queues an instruction for execution.
  void push_input(const pipeline_state_t& state) {
    inputs_.push(state);
  }

  // Retrieves the oldest completed instruction; returns false when none is ready.
  bool pop_output(pipeline_state_t* state) {
    return outputs_.try_pop(state);
  }

  // Advances the unit; implemented by each concrete unit.
  virtual void step() = 0;

protected:
  // Makes `state` available on the output queue. A delay of 0 or 1 publishes it
  // immediately; a larger delay defers the push by (delay - 1) via the
  // simulation platform's scheduler.
  void schedule_output(const pipeline_state_t& state, uint32_t delay) {
    if (delay <= 1) {
      outputs_.push(state);
      return;
    }
    // The callback is stored by the scheduler and runs after this call has
    // returned, so it only captures `this`.
    SimPlatform::instance().schedule(
      [this](const pipeline_state_t& completed) {
        outputs_.push(completed);
      },
      state,
      (delay - 1)
    );
  }

  const char* name_;
  Queue<pipeline_state_t> inputs_;
  Queue<pipeline_state_t> outputs_;
};
///////////////////////////////////////////////////////////////////////////////
class LsuUnit : public ExeUnit {
private:
Core* core_;
uint32_t num_threads_;
HashTable<std::pair<pipeline_state_t, ThreadMask>> pending_dcache_;
pipeline_state_t fence_state_;
bool fence_lock_;
public:
LsuUnit(Core*);
void handleCacheReponse(const MemRsp& response, uint32_t port_id);
void step();
};
///////////////////////////////////////////////////////////////////////////////
class AluUnit : public ExeUnit {
public:
AluUnit(Core*);
void step();
};
///////////////////////////////////////////////////////////////////////////////
class CsrUnit : public ExeUnit {
public:
CsrUnit(Core*);
void step();
};
///////////////////////////////////////////////////////////////////////////////
class FpuUnit : public ExeUnit {
public:
FpuUnit(Core*);
void step();
};
///////////////////////////////////////////////////////////////////////////////
class GpuUnit : public ExeUnit {
public:
GpuUnit(Core*);
void step();
};
}

39
sim/simX/ibuffer.h Normal file
View File

@@ -0,0 +1,39 @@
#pragma once
#include "pipeline.h"
#include <queue>
namespace vortex {
class IBuffer {
private:
std::queue<pipeline_state_t> entries_;
uint32_t capacity_;
public:
IBuffer(uint32_t size)
: capacity_(size)
{}
bool empty() const {
return entries_.empty();
}
bool full() const {
return (entries_.size() == capacity_);
}
const pipeline_state_t& top() const {
return entries_.front();
}
void push(const pipeline_state_t& state) {
entries_.emplace(state);
}
void pop() {
return entries_.pop();
}
};
}

View File

@@ -113,15 +113,12 @@ private:
int num_rsrcs_;
bool has_imm_;
int rdest_type_;
int isrc_mask_;
int fsrc_mask_;
int vsrc_mask_;
Word imm_;
int rsrc_type_[MAX_REG_SOURCES];
int rsrc_[MAX_REG_SOURCES];
int rdest_;
Word func3_;
Word func7_;
Word func6_;
//Vector
Word vmask_;
@@ -132,7 +129,7 @@ private:
Word vlmul_;
Word vsew_;
Word vediv_;
Word func6_;
Word func7_;
friend std::ostream &operator<<(std::ostream &, const Instr&);
};

View File

@@ -5,28 +5,26 @@
#include <fstream>
#include <stdlib.h>
#include <sys/stat.h>
#include "debug.h"
#include "types.h"
#include "core.h"
#include "processor.h"
#include "args.h"
using namespace vortex;
int main(int argc, char **argv) {
int ret;
std::string archString("rv32imf");
std::string archStr("rv32imf");
std::string imgFileName;
int num_cores(NUM_CORES * NUM_CLUSTERS);
int num_warps(NUM_WARPS);
int num_threads(NUM_THREADS);
std::string imgFileName;
bool showHelp(false);
bool showStats(false);
bool riscv_test(false);
/* Read the command line arguments. */
CommandLineArgFlag fh("-h", "--help", "", showHelp);
CommandLineArgSetter<std::string> fa("-a", "--arch", "", archString);
CommandLineArgSetter<std::string> fa("-a", "--arch", "", archStr);
CommandLineArgSetter<std::string> fi("-i", "--image", "", imgFileName);
CommandLineArgSetter<int> fc("-c", "--cores", "", num_cores);
CommandLineArgSetter<int> fw("-w", "--warps", "", num_warps);
@@ -48,62 +46,18 @@ int main(int argc, char **argv) {
return 0;
}
ArchDef arch(archString, num_cores, num_warps, num_threads);
std::cout << "Running " << imgFileName << "..." << std::endl;
Decoder decoder(arch);
MemoryUnit mu(0, arch.wsize(), true);
RAM ram((1<<12), (1<<20));
std::string program_ext(fileExtension(imgFileName.c_str()));
if (program_ext == "bin") {
ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
} else if (program_ext == "hex") {
ram.loadHexImage(imgFileName.c_str());
} else {
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
if (!SimPlatform::instance().initialize())
return -1;
{
ArchDef arch(archStr, num_cores, num_warps, num_threads);
Processor processor(arch);
ret = processor.run(imgFileName, riscv_test, showStats);
}
mu.attach(ram, 0, 0xFFFFFFFF);
SimPlatform::instance().finalize();
struct stat hello;
fstat(0, &hello);
std::vector<std::shared_ptr<Core>> cores(num_cores);
for (int i = 0; i < num_cores; ++i) {
cores[i] = std::make_shared<Core>(arch, decoder, mu, i);
}
bool running;
int exitcode = 0;
do {
running = false;
for (auto& core : cores) {
core->step();
if (core->running()) {
running = true;
}
if (core->check_ebreak()) {
exitcode = core->getIRegValue(3);
running = false;
break;
}
}
} while (running);
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed." << std::endl;
exitcode = 0;
} else {
std::cout << "Failed." << std::endl;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
}
return exitcode;
return ret;
}

58
sim/simX/memsim.cpp Normal file
View File

@@ -0,0 +1,58 @@
#include "memsim.h"
#include <vector>
#include <queue>
#include "constants.h"
using namespace vortex;
// Private implementation of MemSim: one request FIFO per bank and a fixed
// response latency.
class MemSim::Impl {
public:
  Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency)
    : simobject_(simobject)
    , inputs_(num_banks)
    , latency_(latency)
  {}

  // Port callback: enqueues the request on the bank matching the port index.
  void handleMemRequest(const MemReq& mem_req, uint32_t port_id) {
    inputs_.at(port_id).push(mem_req);
  }

  // Retires at most one request per bank. Reads get a response carrying the
  // request tag after latency_; writes are consumed without a response.
  void step(uint64_t /*cycle*/) {
    const uint32_t num_banks = inputs_.size();
    for (uint32_t bank = 0; bank < num_banks; ++bank) {
      auto& pending = inputs_.at(bank);
      if (pending.empty())
        continue;
      const MemReq& head = pending.front();
      if (!head.write) {
        MemRsp mem_rsp;
        mem_rsp.tag = head.tag;
        simobject_->MemRspPorts.at(bank).send(mem_rsp, latency_);
      }
      pending.pop();
    }
  }

private:
  MemSim* simobject_;
  std::vector<std::queue<MemReq>> inputs_;
  uint32_t latency_;
};
///////////////////////////////////////////////////////////////////////////////
// Creates a memory model with `num_banks` request/response port pairs and a
// fixed response latency.
// impl_ is passed to the request ports as the callback target, so it has to be
// initialised before MemReqPorts.
MemSim::MemSim(const SimContext& ctx,
uint32_t num_banks,
uint32_t latency)
: SimObject<MemSim>(ctx, "MemSim")
, impl_(new Impl(this, num_banks, latency))
, MemReqPorts(num_banks, {this, impl_, &Impl::handleMemRequest})
, MemRspPorts(num_banks, this)
{}
// Releases the implementation object allocated by the constructor.
MemSim::~MemSim() {
delete impl_;
}
// Advances the memory model; forwards to the implementation object.
void MemSim::step(uint64_t cycle) {
impl_->step(cycle);
}

35
sim/simX/memsim.h Normal file
View File

@@ -0,0 +1,35 @@
#pragma once
#include <simobject.h>
#include <vector>
#include <list>
namespace vortex {
// Memory request packet.
struct MemReq {
uint64_t addr; // target address
uint32_t tag; // requester-chosen identifier, copied into the matching MemRsp
bool write; // true for a store, false for a load
};
// Memory response packet.
struct MemRsp {
uint32_t tag; // tag of the request this response answers
};
// Simple banked memory model: requests arrive on MemReqPorts and responses
// leave on the MemRspPorts entry with the same index.
class MemSim : public SimObject<MemSim>{
private:
  class Impl;
  Impl* impl_; // owned; released in the destructor

public:
  MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency);
  ~MemSim();

  // impl_ is a raw owning pointer, so an implicit copy would lead to a double
  // delete; copying is therefore disabled.
  MemSim(const MemSim&) = delete;
  MemSim& operator=(const MemSim&) = delete;

  void step(uint64_t cycle);

  std::vector<SlavePort<MemReq>> MemReqPorts;
  std::vector<MasterPort<MemRsp>> MemRspPorts;
};
};

View File

@@ -1,63 +0,0 @@
#include <iostream>
#include "pipeline.h"
using namespace vortex;
namespace vortex {
// Dumps a pipeline latch, one "<name>: field=value" line per field.
// Note: std::hex is applied before PC and is not reset, so the stream stays in
// hexadecimal mode after this call.
std::ostream &operator<<(std::ostream &os, const Pipeline& pipeline) {
os << pipeline.name_ << ": valid=" << pipeline.valid << std::endl;
os << pipeline.name_ << ": stalled=" << pipeline.stalled << std::endl;
os << pipeline.name_ << ": stall_warp=" << pipeline.stall_warp << std::endl;
os << pipeline.name_ << ": wid=" << pipeline.wid << std::endl;
os << pipeline.name_ << ": PC=" << std::hex << pipeline.PC << std::endl;
os << pipeline.name_ << ": used_iregs=" << pipeline.used_iregs << std::endl;
os << pipeline.name_ << ": used_fregs=" << pipeline.used_fregs << std::endl;
os << pipeline.name_ << ": used_vregs=" << pipeline.used_vregs << std::endl;
return os;
}
}
// Creates a named pipeline latch and puts it into the cleared state.
Pipeline::Pipeline(const char* name)
: name_(name) {
this->clear();
}
void Pipeline::clear() {
valid = false;
stalled = false;
stall_warp = false;
wid = 0;
PC = 0;
used_iregs.reset();
used_fregs.reset();
used_vregs.reset();
}
// Decides whether this stage may process its latch.
// With a downstream latch: if it is stalled, the stall is propagated to this
// stage and false is returned; otherwise the downstream latch is invalidated.
// The stage then proceeds only if it holds a valid entry.
bool Pipeline::enter(Pipeline *drain) {
  if (drain != nullptr) {
    if (drain->stalled) {
      stalled = true;
      return false;
    }
    drain->valid = false;
  }
  stalled = false;
  return valid;
}
// Copies this latch's contents into the downstream latch; a null `drain` makes
// the call a no-op.
void Pipeline::next(Pipeline *drain) {
  if (drain == nullptr)
    return;

  // control state
  drain->valid      = valid;
  drain->stalled    = stalled;
  drain->stall_warp = stall_warp;

  // instruction identity
  drain->wid = wid;
  drain->PC  = PC;

  // destination register
  drain->rdest      = rdest;
  drain->rdest_type = rdest_type;

  // source register usage
  drain->used_iregs = used_iregs;
  drain->used_fregs = used_fregs;
  drain->used_vregs = used_vregs;
}

View File

@@ -2,47 +2,75 @@
#pragma once
#include <memory>
#include <iostream>
#include <util.h>
#include "types.h"
#include "debug.h"
namespace vortex {
class Instr;
class Pipeline {
public:
Pipeline(const char* name);
void clear();
bool enter(Pipeline* drain);
void next(Pipeline* drain);
//--
bool valid;
//--
bool stalled;
bool stall_warp;
struct pipeline_state_t {
//--
int wid;
ThreadMask tmask;
Word PC;
//--
bool stall_warp;
int rdest_type;
int rdest;
RegMask used_iregs;
RegMask used_fregs;
RegMask used_vregs;
private:
//-
ExeType exe_type;
std::vector<uint64_t> mem_addrs;
const char* name_;
friend std::ostream &operator<<(std::ostream &, const Pipeline&);
//--
union {
struct {
uint8_t load : 1;
uint8_t store: 1;
uint8_t fence : 1;
uint8_t prefetch: 1;
} lsu;
struct {
AluType type;
} alu;
struct {
FpuType type;
} fpu;
struct {
GpuType type;
} gpu;
};
// stats
uint64_t icache_latency;
uint64_t dcache_latency;
};
class PipelineStage : public Queue<pipeline_state_t> {
protected:
const char* name_;
friend std::ostream &operator<<(std::ostream &, const pipeline_state_t&);
public:
PipelineStage(const char* name = nullptr)
: name_(name)
{}
};
// Writes a one-line summary of an instruction's pipeline state, terminated by
// std::endl (which also flushes the stream).
// NOTE(review): std::hex is applied before PC and never reset, so the stream is
// left in hexadecimal mode for later integer output - confirm this is intended.
inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
os << "stall_warp=" << state.stall_warp;
os << ", wid=" << state.wid;
os << ", PC=" << std::hex << state.PC;
os << ", used_iregs=" << state.used_iregs;
os << ", used_fregs=" << state.used_fregs;
os << ", used_vregs=" << state.used_vregs;
os << std::endl;
return os;
}
}

189
sim/simX/processor.h Normal file
View File

@@ -0,0 +1,189 @@
#pragma once
#include "constants.h"
#include "debug.h"
#include "types.h"
#include "core.h"
namespace vortex {
// Top-level simulated processor: owns the cores, the optional L2/L3 caches or
// arbitration switches, and the memory model, and wires their request/response
// ports together. run() loads a program image and steps the simulation until
// the cores are idle or one of them reports an ebreak.
class Processor {
private:
  ArchDef arch_;
  Decoder decoder_;
  MemoryUnit mu_;
  RAM ram_;
  std::vector<Core::Ptr> cores_;
  std::vector<Cache::Ptr> l2caches_;
  std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
  Cache::Ptr l3cache_;
  Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
  MemSim::Ptr memsim_;

public:
  // Builds the hierarchy: memsim <- (L3 cache | L3 switch)? <- per cluster
  // (L2 cache | L2 switch)? <- cores.
  Processor(const ArchDef& arch)
    : arch_(arch)
    , decoder_(arch)
    , mu_(0, arch.wsize(), true)
    , ram_((1<<12), (1<<20))
    , cores_(arch.num_cores())
    , l2caches_(NUM_CLUSTERS)
    , l2_mem_switches_(NUM_CLUSTERS)
  {
    uint32_t num_cores = arch.num_cores();
    uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;

    // bind RAM to memory unit
    mu_.attach(ram_, 0, 0xFFFFFFFF);

    // create cores
    for (uint32_t i = 0; i < num_cores; ++i) {
      cores_.at(i) = Core::Create(arch, decoder_, mu_, i);
    }

    // connect memory sub-system
    memsim_ = MemSim::Create(1, MEM_LATENCY);

    // Cluster-facing ports of the level directly above the clusters.
    // Starts as the single memsim port and is widened to one port per cluster
    // when an L3 cache or an L3 switch is inserted.
    std::vector<SlavePort<MemReq>*> mem_req_ports(1);
    std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
    mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
    mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);

    if (L3_ENABLE) {
      l3cache_ = Cache::Create("l3cache", CacheConfig{
        log2ceil(L3_CACHE_SIZE),  // C
        log2ceil(MEM_BLOCK_SIZE), // B
        2,                        // W
        0,                        // A
        32,                       // address bits
        L3_NUM_BANKS,             // number of banks
        L3_NUM_PORTS,             // number of ports
        NUM_CLUSTERS,             // request size
        true,                     // write-through
        0,                        // victim size
        L3_MSHR_SIZE,             // mshr
        2,                        // pipeline latency
      });
      mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
      l3cache_->MemReqPort.bind(mem_req_ports.at(0));
      mem_req_ports.resize(NUM_CLUSTERS);
      mem_rsp_ports.resize(NUM_CLUSTERS);
      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
        mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
        mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
      }
    } else if (NUM_CLUSTERS > 1) {
      l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
      mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
      l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));
      mem_req_ports.resize(NUM_CLUSTERS);
      mem_rsp_ports.resize(NUM_CLUSTERS);
      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
        mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
        mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
      }
    }

    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
      // Core-facing ports of THIS cluster. They are kept in cluster-local
      // vectors so that wiring one cluster never overwrites the upstream ports
      // that the following clusters still have to bind to. The initial value
      // (the cluster's own upstream port) is what a single-core cluster without
      // an L2 connects to directly.
      std::vector<SlavePort<MemReq>*> cluster_req_ports(1, mem_req_ports.at(i));
      std::vector<MasterPort<MemRsp>*> cluster_rsp_ports(1, mem_rsp_ports.at(i));

      if (L2_ENABLE) {
        auto& l2cache = l2caches_.at(i);
        // NOTE(review): the request size is NUM_CORES while the loop below
        // indexes up to cores_per_cluster; these differ when the core count is
        // overridden at run time - confirm.
        l2cache = Cache::Create("l2cache", CacheConfig{
          log2ceil(L2_CACHE_SIZE),  // C
          log2ceil(MEM_BLOCK_SIZE), // B
          2,                        // W
          0,                        // A
          32,                       // address bits
          L2_NUM_BANKS,             // number of banks
          L2_NUM_PORTS,             // number of ports
          NUM_CORES,                // request size
          true,                     // write-through
          0,                        // victim size
          L2_MSHR_SIZE,             // mshr
          2,                        // pipeline latency
        });
        mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
        l2cache->MemReqPort.bind(mem_req_ports.at(i));
        cluster_req_ports.resize(cores_per_cluster);
        cluster_rsp_ports.resize(cores_per_cluster);
        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
          cluster_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
          cluster_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
        }
      } else if (cores_per_cluster > 1) {
        auto& l2_mem_switch = l2_mem_switches_.at(i);
        // NOTE(review): same NUM_CORES vs cores_per_cluster question as above.
        l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
        mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
        l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
        cluster_req_ports.resize(cores_per_cluster);
        cluster_rsp_ports.resize(cores_per_cluster);
        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
          cluster_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
          cluster_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
        }
      }

      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
        // Cores are numbered cluster by cluster, and each core uses the
        // request/response port pair with its own in-cluster index j.
        auto& core = cores_.at((i * cores_per_cluster) + j);
        cluster_rsp_ports.at(j)->bind(&core->MemRspPort);
        core->MemReqPort.bind(cluster_req_ports.at(j));
      }
    }
  }

  ~Processor() {}

  // Loads `program` (*.bin at STARTUP_ADDR, or *.hex) and runs the simulation.
  // Returns -1 for an unsupported image extension. Otherwise returns the value
  // of integer register 3 of the core that hit an ebreak (0 if none did); in
  // riscv_test mode a value of 1 means "Passed." and is mapped to 0.
  int run(const std::string& program, bool riscv_test, bool /*showStats*/) {
    {
      std::string program_ext(fileExtension(program.c_str()));
      if (program_ext == "bin") {
        ram_.loadBinImage(program.c_str(), STARTUP_ADDR);
      } else if (program_ext == "hex") {
        ram_.loadHexImage(program.c_str());
      } else {
        std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
        return -1;
      }
    }

    bool running;
    int exitcode = 0;
    do {
      SimPlatform::instance().step();
      running = false;
      for (auto& core : cores_) {
        if (core->running()) {
          running = true;
        }
        // an ebreak on any core terminates the whole simulation
        if (core->check_ebreak()) {
          exitcode = core->getIRegValue(3);
          running = false;
          break;
        }
      }
    } while (running);

    // get error status
    if (riscv_test) {
      if (1 == exitcode) {
        std::cout << "Passed." << std::endl;
        exitcode = 0;
      } else {
        std::cout << "Failed." << std::endl;
      }
    } else {
      if (exitcode != 0) {
        std::cout << "*** error: exitcode=" << exitcode << std::endl;
      }
    }
    return exitcode;
  }
};
}

71
sim/simX/scoreboard.h Normal file
View File

@@ -0,0 +1,71 @@
#pragma once
#include "pipeline.h"
#include <queue>
namespace vortex {
// Per-warp register scoreboard. A destination register is marked in-use when an
// instruction is issued (reserve) and cleared when it completes (release);
// in_use() reports whether any source register of an instruction is still
// pending.
class Scoreboard {
private:
  std::vector<RegMask> in_use_iregs_;
  std::vector<RegMask> in_use_fregs_;
  std::vector<RegMask> in_use_vregs_;

public:
  // One mask per warp and register file. std::bitset default-constructs to all
  // zeros, so every register starts out free without an explicit reset.
  Scoreboard(const ArchDef &arch)
    : in_use_iregs_(arch.num_warps())
    , in_use_fregs_(arch.num_warps())
    , in_use_vregs_(arch.num_warps())
  {}

  // True if any register read by `state` is still reserved for its warp.
  bool in_use(const pipeline_state_t& state) const {
    return (state.used_iregs & in_use_iregs_.at(state.wid)) != 0
        || (state.used_fregs & in_use_fregs_.at(state.wid)) != 0
        || (state.used_vregs & in_use_vregs_.at(state.wid)) != 0;
  }

  // Marks the destination register of `state` as pending.
  // rdest_type selects the register file (1: integer, 2: float, 3: vector);
  // any other value means there is nothing to track.
  void reserve(const pipeline_state_t& state) {
    switch (state.rdest_type) {
    case 1:
      // Only integer register 0 is exempt from tracking (presumably because x0
      // is hard-wired to zero); register 0 of the float and vector files is an
      // ordinary destination and must be tracked.
      if (state.rdest)
        in_use_iregs_.at(state.wid).set(state.rdest);
      break;
    case 2:
      in_use_fregs_.at(state.wid).set(state.rdest);
      break;
    case 3:
      in_use_vregs_.at(state.wid).set(state.rdest);
      break;
    default:
      break;
    }
  }

  // Clears the pending mark set by reserve() for the same instruction.
  void release(const pipeline_state_t& state) {
    switch (state.rdest_type) {
    case 1:
      if (state.rdest)
        in_use_iregs_.at(state.wid).reset(state.rdest);
      break;
    case 2:
      in_use_fregs_.at(state.wid).reset(state.rdest);
      break;
    case 3:
      in_use_vregs_.at(state.wid).reset(state.rdest);
      break;
    default:
      break;
    }
  }
};
}

View File

@@ -2,7 +2,10 @@
#include <stdint.h>
#include <bitset>
#include <queue>
#include <unordered_map>
#include <VX_config.h>
#include <simobject.h>
namespace vortex {
@@ -14,9 +17,242 @@ typedef uint32_t Addr;
typedef uint32_t Size;
typedef std::bitset<32> RegMask;
typedef std::bitset<32> ThreadMask;
typedef std::bitset<32> WarpMask;
// Execution unit an instruction is dispatched to.
enum class ExeType {
ALU,
LSU,
CSR,
FPU,
GPU,
MAX, // presumably a count sentinel rather than a real unit - confirm
};
// ALU operation class of an instruction.
enum class AluType {
ARITH,
BRANCH,
IMUL,
IDIV,
};
// FPU operation class of an instruction.
enum class FpuType {
FNCP,
FMA,
FDIV,
FSQRT,
FCVT,
};
// GPU-specific operation class of an instruction.
enum class GpuType {
TMC,
WSPAWN,
SPLIT,
JOIN,
BAR,
TEX,
};
// Arbitration policy selector passed to Switch.
enum class ArbiterType {
Priority,
RoundRobin
};
///////////////////////////////////////////////////////////////////////////////
// Thin FIFO wrapper over std::queue that adds a non-throwing try_pop().
template <typename T>
class Queue {
public:
  Queue() {}

  // True when the queue holds no elements.
  bool empty() const {
    return queue_.empty();
  }

  // Oldest element; must not be called on an empty queue.
  const T& top() const {
    return queue_.front();
  }

  // Appends a copy of `value` at the back.
  void push(const T& value) {
    queue_.push(value);
  }

  // Discards the oldest element; must not be called on an empty queue.
  void pop() {
    queue_.pop();
  }

  // Moves the oldest element into *value and removes it.
  // Returns false, leaving *value untouched, when the queue is empty.
  bool try_pop(T* value) {
    if (this->empty())
      return false;
    *value = this->top();
    this->pop();
    return true;
  }

protected:
  std::queue<T> queue_;
};
///////////////////////////////////////////////////////////////////////////////
// Fixed-size slot table: allocate() stores a value in the first free slot and
// returns its index, which is then used as a handle for at(), release() and
// remove().
template <typename T>
class HashTable {
private:
  std::vector<std::pair<bool, T>> entries_; // (occupied flag, value) per slot
  uint32_t size_;                           // number of occupied slots

public:
  // Creates a table with `size` slots, all free.
  HashTable(uint32_t size)
    : entries_(size)
    , size_(0)
  {}

  // True when no slot is occupied.
  bool empty() const {
    return (0 == size_);
  }

  // True when every slot is occupied.
  bool full() const {
    return (size_ == entries_.size());
  }

  // True when the slot at `index` is occupied. Throws std::out_of_range for an
  // index outside the table.
  bool contains(uint32_t index) const {
    return entries_.at(index).first;
  }

  // Value stored at `index`; the slot must be occupied.
  const T& at(uint32_t index) const {
    auto& entry = entries_.at(index);
    assert(entry.first);
    return entry.second;
  }

  // Mutable access to the value stored at `index`; the slot must be occupied.
  T& at(uint32_t index) {
    auto& entry = entries_.at(index);
    assert(entry.first);
    return entry.second;
  }

  // Stores `value` in the first free slot and returns its index.
  // Returns uint32_t(-1) when the table is full; check full() beforehand.
  uint32_t allocate(const T& value) {
    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
      auto& entry = entries_.at(i);
      if (!entry.first) {
        entry.first = true;
        entry.second = value;
        ++size_;
        return i;
      }
    }
    return static_cast<uint32_t>(-1);
  }

  // Frees the occupied slot at `index`.
  void release(uint32_t index) {
    auto& entry = entries_.at(index);
    assert(entry.first);
    entry.first = false;
    // keep the occupancy count in step, otherwise empty()/full() go stale
    --size_;
  }

  // Copies the value at `index` into *value and frees the slot.
  void remove(uint32_t index, T* value) {
    auto& entry = entries_.at(index);
    assert(entry.first);
    *value = entry.second;
    entry.first = false;
    --size_;
  }
};
///////////////////////////////////////////////////////////////////////////////
// N-to-1 request switch with response routing.
//
// Requests arriving on the ReqIn ports are collected into per-step batches;
// step() forwards at most one request per call on ReqOut and remembers which
// input each request tag came from. Responses arriving on RspIn are queued
// and forwarded, at most one per step(), to the RspOut port recorded for
// their tag.
//
// Req and Rsp must expose a `tag` member usable as an unordered_map key of
// type uint32_t. MaxInputs bounds the number of input ports.
template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
class Switch : public SimObject<Switch<Req, Rsp, MaxInputs>> {
  // The base must name the full specialization (including MaxInputs): it is
  // the type the constructor initializes, and with the defaulted
  // Switch<Req, Rsp> any MaxInputs != 32 would name a non-base class.
private:
  // One batch of pending requests: slot i holds the request received on
  // input port i, valid[i] tells whether that slot still has to be sent.
  struct req_t {
    std::vector<Req> data;
    std::bitset<MaxInputs> valid;
    req_t() {}
    req_t(uint32_t size) : data(size) {}
  };

  // ReqIn callback: latch the request into the batch being assembled.
  // A second request on the same port before the next step() overwrites
  // the first.
  void handleIncomingRequest(const Req& req, uint32_t port_id) {
    cur_req_.data.at(port_id) = req;
    cur_req_.valid.set(port_id);
  }

  // RspIn callback: queue the response for routing in step().
  void handleIncomingResponse(const Rsp& rsp, uint32_t) {
    rsps_.push(rsp);
  }

  ArbiterType type_;
  std::queue<req_t> reqs_;   // batches waiting to be drained, oldest first
  std::queue<Rsp> rsps_;     // responses waiting to be routed back
  req_t cur_req_;            // batch currently being assembled
  uint32_t delay_;           // delay passed to ReqOut.send()
  uint32_t cursor_;          // input index at which the arbitration scan starts
  // request tag -> input port that issued it.
  // NOTE(review): entries are never erased here, so the table only shrinks
  // through tag reuse -- confirm this is intended.
  std::unordered_map<uint32_t, uint32_t> addr_table_;

public:
  // ctx/name are forwarded to SimObject. `num_inputs` is the number of
  // ReqIn/RspOut port pairs (asserted <= MaxInputs); `delay` is the request
  // forwarding delay and must be non-zero (asserted).
  Switch(
    const SimContext& ctx,
    const char* name,
    ArbiterType type,
    uint32_t num_inputs,
    uint32_t delay = 1
  )
    : SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)
    , type_(type)
    , cur_req_(num_inputs)
    , delay_(delay)
    , cursor_(0)
    , ReqIn(num_inputs, {this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingRequest})
    , ReqOut(this)
    , RspIn(this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingResponse)
    , RspOut(num_inputs, this)
  {
    assert(delay_ != 0);
    assert(num_inputs <= MaxInputs);
  }

  // Advances the switch by one step.
  void step(uint64_t /*cycle*/) {
    // Close the batch assembled since the previous step, if it got anything.
    if (cur_req_.valid.any()) {
      reqs_.push(cur_req_);
      cur_req_.valid.reset();
    }

    // Forward one request from the oldest batch that still has a valid
    // slot; fully drained batches are popped along the way.
    while (!reqs_.empty()) {
      auto& entry = reqs_.front();
      bool found = false;
      for (uint32_t i = 0, n = entry.data.size(); i < n; ++i) {
        // Scan starts at cursor_ and wraps around the inputs.
        auto j = (cursor_ + i) % n;
        if (entry.valid.test(j)) {
          auto& req = entry.data.at(j);
          addr_table_[req.tag] = j;
          ReqOut.send(req, delay_);
          entry.valid.reset(j);
          this->update_cursor(j);
          found = true;
          break;
        }
      }
      if (found)
        break;
      reqs_.pop();
    }

    // Route one queued response back to the input that issued its tag.
    // unordered_map::at throws std::out_of_range for an unknown tag.
    if (!rsps_.empty()) {
      auto& rsp = rsps_.front();
      auto port_id = addr_table_.at(rsp.tag);
      RspOut.at(port_id).send(rsp, 1);
      rsps_.pop();
    }
  }

  // Under RoundRobin the next scan starts just after the granted input
  // (the modulo in step() handles the wrap); otherwise the cursor is left
  // untouched.
  void update_cursor(uint32_t grant) {
    if (type_ == ArbiterType::RoundRobin) {
      cursor_ = grant + 1;
    }
  }

  std::vector<SlavePort<Req>> ReqIn;    // one request input per client
  MasterPort<Req> ReqOut;               // single merged request output
  SlavePort<Rsp> RspIn;                 // single response input
  std::vector<MasterPort<Rsp>> RspOut;  // one response output per client
};
}

View File

@@ -12,25 +12,21 @@ using namespace vortex;
// Builds warp `id` of `core`. The warp starts inactive, at STARTUP_ADDR and
// with an empty thread mask; this is the same state clear() would set, so no
// separate clear() call is needed.
Warp::Warp(Core *core, Word id)
    : id_(id)
    , core_(core)
    , active_(false)
    , PC_(STARTUP_ADDR)
    , tmask_(0) {
  // Integer and float register files: one zero-filled row of num_regs()
  // words per thread.
  iRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
  fRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
  // Vector register file: one zero-filled row of vsize() bytes per register.
  vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0));
}
// Puts the warp back in its start-up state: inactive, no thread enabled,
// program counter at STARTUP_ADDR.
void Warp::clear() {
  active_ = false;
  tmask_.reset();
  PC_ = STARTUP_ADDR;
}
void Warp::step(Pipeline *pipeline) {
void Warp::eval(pipeline_state_t *pipeline_state) {
assert(tmask_.any());
DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask=");
for (int i = 0, n = core_->arch().num_threads(); i < n; ++i)
DPN(2, tmask_[n-i-1]);
DPN(2, tmask_.test(n-i-1));
DPN(2, "\n");
/* Fetch and decode. */
@@ -38,55 +34,24 @@ void Warp::step(Pipeline *pipeline) {
Word fetched = core_->icache_fetch(PC_);
auto instr = core_->decoder().decode(fetched, PC_);
// Update pipeline
pipeline->valid = true;
pipeline->PC = PC_;
pipeline->rdest = instr->getRDest();
pipeline->rdest_type = instr->getRDType();
pipeline->used_iregs.reset();
pipeline->used_fregs.reset();
pipeline->used_vregs.reset();
switch (pipeline->rdest_type) {
case 1:
pipeline->used_iregs[pipeline->rdest] = 1;
break;
case 2:
pipeline->used_fregs[pipeline->rdest] = 1;
break;
case 3:
pipeline->used_vregs[pipeline->rdest] = 1;
break;
default:
break;
}
for (int i = 0; i < instr->getNRSrc(); ++i) {
int type = instr->getRSType(i);
int reg = instr->getRSrc(i);
switch (type) {
case 1:
pipeline->used_iregs[reg] = 1;
break;
case 2:
pipeline->used_fregs[reg] = 1;
break;
case 3:
pipeline->used_vregs[reg] = 1;
break;
default:
break;
}
}
// Update state
pipeline_state->wid = id_;
pipeline_state->PC = PC_;
pipeline_state->tmask = tmask_;
pipeline_state->rdest = instr->getRDest();
pipeline_state->rdest_type = instr->getRDType();
pipeline_state->used_iregs.reset();
pipeline_state->used_fregs.reset();
pipeline_state->used_vregs.reset();
// Execute
this->execute(*instr, pipeline);
this->execute(*instr, pipeline_state);
D(4, "Register state:");
for (int i = 0; i < core_->arch().num_regs(); ++i) {
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
for (int j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_[j][i] << std::setfill(' ') << ' ');
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, std::endl);
}

View File

@@ -9,7 +9,7 @@ namespace vortex {
class Core;
class Instr;
class Pipeline;
class pipeline_state_t;
struct DomStackEntry {
DomStackEntry(const ThreadMask &tmask, Word PC)
: tmask(tmask)
@@ -42,8 +42,6 @@ class Warp {
public:
Warp(Core *core, Word id);
void clear();
// Returns the warp's active flag.
bool active() const {
return active_;
}
@@ -71,7 +69,7 @@ public:
}
// Sets or clears the mask bit of thread `index`, then recomputes the active
// flag: the warp is active while at least one thread bit is set.
// bitset::set is range-checked (throws std::out_of_range on a bad index), so
// the redundant unchecked operator[] write that preceded it is dropped.
void setTmask(size_t index, bool value) {
  tmask_.set(index, value);
  active_ = tmask_.any();
}
@@ -82,18 +80,18 @@ public:
}
// Reads integer register `reg` of thread 0. Uses bounds-checked access
// (throws std::out_of_range on a bad index); the unchecked duplicate return
// that made this statement unreachable is removed.
Word getIRegValue(int reg) const {
  return iRegFile_.at(0).at(reg);
}
void step(Pipeline *);
void eval(pipeline_state_t *);
private:
void execute(const Instr &instr, Pipeline *);
void execute(const Instr &instr, pipeline_state_t *pipeline_state);
Word id_;
bool active_;
Core *core_;
bool active_;
Word PC_;
ThreadMask tmask_;