fixes: texture unit mem access sometimes going to smem, bilinear texture filtering; new: cache req_id,
This commit is contained in:
79
sim/common/bitmanip.h
Normal file
79
sim/common/bitmanip.h
Normal file
@@ -0,0 +1,79 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
|
||||
/// Returns the number of zero bits above the most significant set bit of
/// `value`; yields 32 when `value` is zero (the builtin is not called then).
constexpr uint32_t count_leading_zeros(uint32_t value) {
  return (value == 0) ? 32u : static_cast<uint32_t>(__builtin_clz(value));
}
|
||||
|
||||
/// Returns the number of zero bits below the least significant set bit of
/// `value`; yields 32 when `value` is zero (the builtin is not called then).
constexpr uint32_t count_trailing_zeros(uint32_t value) {
  return (value == 0) ? 32u : static_cast<uint32_t>(__builtin_ctz(value));
}
|
||||
|
||||
/// Returns true when exactly one bit of `value` is set (zero is rejected).
constexpr bool ispow2(uint32_t value) {
  // Clearing the lowest set bit leaves zero only for single-bit values.
  return (value != 0) && ((value & (value - 1)) == 0);
}
|
||||
|
||||
constexpr uint32_t log2ceil(uint32_t value) {
|
||||
return 32 - count_leading_zeros(value - 1);
|
||||
}
|
||||
|
||||
inline unsigned log2up(uint32_t value) {
|
||||
return std::max<uint32_t>(1, log2ceil(value));
|
||||
}
|
||||
|
||||
constexpr unsigned log2floor(uint32_t value) {
|
||||
return 31 - count_leading_zeros(value);
|
||||
}
|
||||
|
||||
constexpr unsigned ceil2(uint32_t value) {
|
||||
return 32 - count_leading_zeros(value);
|
||||
}
|
||||
|
||||
/// Returns `bits` with bit `index` (0..63) cleared.
inline uint64_t bit_clr(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t selected = 1ull << index;
  return bits & ~selected;
}
|
||||
|
||||
/// Returns `bits` with bit `index` (0..63) set.
inline uint64_t bit_set(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t selected = 1ull << index;
  return bits | selected;
}
|
||||
|
||||
/// Returns true when bit `index` (0..63) of `bits` is set.
inline bool bit_get(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t shifted = bits >> index;
  return (shifted & 0x1) != 0;
}
|
||||
|
||||
/// Returns `bits` with the inclusive bit window [start, end] cleared.
/// Requires start <= end <= 63.
inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
  assert(end >= start);
  assert(end <= 63);
  const uint32_t top_gap = 63 - end;
  // Push all-ones left to drop the low bits, then right to drop the high ones,
  // which leaves ones exactly in [start, end].
  const uint64_t pushed = 0xffffffffffffffff << (top_gap + start);
  const uint64_t window = pushed >> top_gap;
  return bits & ~window;
}
|
||||
|
||||
inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) {
|
||||
assert(end >= start);
|
||||
assert(end <= 63);
|
||||
uint32_t shift = 63 - end;
|
||||
uint64_t dirty = (value << (shift + start)) >> shift;
|
||||
return bit_clrw(bits, start, end) | dirty;
|
||||
}
|
||||
|
||||
/// Extracts the inclusive bit window [start, end] of `bits`, right-aligned.
/// Requires start <= end <= 63.
inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
  assert(end >= start);
  assert(end <= 63);
  const uint32_t top_gap = 63 - end;
  const uint64_t high_trimmed = bits << top_gap;  // drops bits above `end`
  return high_trimmed >> (top_gap + start);       // drops bits below `start`
}
|
||||
|
||||
// Apply integer sign extension.
// Treats bit (width - 1) of `word` as the sign bit and, when it is set, fills
// every bit at position `width` and above with ones. When the sign bit is
// clear `word` is returned unchanged. Requires 1 < width <= 32.
inline uint32_t sext32(uint32_t word, uint32_t width) {
  assert(width > 1);
  assert(width <= 32);
  // Shifting a 32-bit value by 32 is undefined, so the full-width mask is
  // produced explicitly; unsigned arithmetic also avoids overflowing `1 << 31`.
  const uint32_t mask = (width < 32) ? ((uint32_t{1} << width) - 1) : ~uint32_t{0};
  const bool negative = ((word >> (width - 1)) & 0x1) != 0;
  return negative ? (word | ~mask) : word;
}
|
||||
419
sim/common/fixed.h
Normal file
419
sim/common/fixed.h
Normal file
@@ -0,0 +1,419 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <assert.h>
|
||||
|
||||
/// Fixed-point number with F fractional bits stored in the integer type T.
/// The raw representation is `value * 2^F`; use make() / data() to access it.
template <uint32_t F, typename T = int32_t>
class Fixed {
private:

  /// Converts the raw storage of a Fixed<F2, T2> into this type's raw storage.
  template <uint32_t F2, typename T2>
  struct Cast {
  private:
    // First flag: the source storage type is wider than T.
    // Second flag: the source has more fractional bits than F.
    template <bool isT2Bigger, bool isF2Bigger> struct Tag {};

    // Source not wider, fewer-or-equal fraction bits: widen first, then scale up.
    inline static T Convert(T2 value, Tag<false, false>) {
      return static_cast<T>(value) << (F - F2);
    }

    // Source not wider, more fraction bits: widen first, then drop the extra bits.
    inline static T Convert(T2 value, Tag<false, true>) {
      return static_cast<T>(value) >> (F2 - F);
    }

    // Source wider, fewer-or-equal fraction bits: scale in the wide type, then narrow.
    inline static T Convert(T2 value, Tag<true, false>) {
      return static_cast<T>(value << (F - F2));
    }

    // Source wider, more fraction bits: drop the extra bits in the wide type, then narrow.
    inline static T Convert(T2 value, Tag<true, true>) {
      return static_cast<T>(value >> (F2 - F));
    }

  public:
    inline static T Convert(T2 value) {
      return Convert(value, Tag<(sizeof(T2) > sizeof(T)), (F2 > F)>{});
    }
  };

public:
  using data_type = T;

  static constexpr uint32_t FRAC = F;                     // number of fractional bits
  static constexpr uint32_t INT = sizeof(T) * 8 - FRAC;   // remaining (integer) bits
  static constexpr uint32_t HFRAC = FRAC >> 1;            // half of the fractional bits
  static constexpr T ONE = static_cast<T>(1) << FRAC;     // raw encoding of 1.0
  static constexpr T MASK = ONE - 1;                      // selects the fractional bits
  static constexpr T IMASK = ~MASK;                       // selects the integer bits
  static constexpr T HALF = ONE >> 1;                     // raw encoding of 0.5
  static constexpr T TWO = ONE << 1;                      // raw encoding of 2.0

  // Zero-initialised so a default-constructed value is never indeterminate.
  Fixed() : data_(0) {}

  // Integer constructors: the value is scaled by 2^FRAC; the assert catches
  // values that do not fit the storage type.
  explicit Fixed(int64_t rhs)
    : data_(static_cast<T>(rhs << FRAC)) {
    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
  }

  explicit Fixed(uint64_t rhs)
    : data_(static_cast<T>(rhs << FRAC)) {
    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
  }

  explicit Fixed(int32_t rhs)
    : data_(static_cast<T>(rhs << FRAC)) {
    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
  }

  explicit Fixed(uint32_t rhs)
    : data_(static_cast<T>(rhs << FRAC)) {
    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
  }

  explicit Fixed(int16_t rhs)
    : data_(static_cast<T>(rhs << FRAC)) {
    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
  }

  explicit Fixed(uint16_t rhs)
    : data_(static_cast<T>(rhs << FRAC)) {
    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
  }

  explicit Fixed(int8_t rhs)
    : data_(static_cast<T>(rhs << FRAC)) {
    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
  }

  explicit Fixed(uint8_t rhs)
    : data_(static_cast<T>(rhs << FRAC)) {
    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
  }

  // Conversion from a fixed-point value with a different format.
  template <uint32_t F2, typename T2>
  explicit Fixed(Fixed<F2, T2> rhs)
    : data_(Cast<F2, T2>::Convert(rhs.data()))
  {}

  // Conversion from float: scales by ONE and truncates toward zero.
  explicit Fixed(float rhs)
    : data_(static_cast<T>(rhs * ONE)) {
    assert(data_ == static_cast<T>(rhs * ONE));
  }

  bool operator==(Fixed rhs) const {
    return (data_ == rhs.data_);
  }

  bool operator!=(Fixed rhs) const {
    return (data_ != rhs.data_);
  }

  bool operator<(Fixed rhs) const {
    return (data_ < rhs.data_);
  }

  bool operator<=(Fixed rhs) const {
    return (data_ <= rhs.data_);
  }

  bool operator>(Fixed rhs) const {
    return (data_ > rhs.data_);
  }

  bool operator>=(Fixed rhs) const {
    return (data_ >= rhs.data_);
  }

  Fixed operator-() const {
    return make(-data_);
  }

  Fixed operator+=(Fixed rhs) {
    *this = (*this) + rhs;
    return *this;
  }

  Fixed operator-=(Fixed rhs) {
    *this = (*this) - rhs;
    return *this;
  }

  Fixed operator*=(Fixed rhs) {
    *this = (*this) * rhs;
    return *this;
  }

  Fixed operator/=(Fixed rhs) {
    *this = (*this) / rhs;
    return *this;
  }

  template <uint32_t F2, typename T2>
  Fixed operator*=(Fixed<F2, T2> rhs) {
    *this = (*this) * rhs;
    return *this;
  }

  template <uint32_t F2, typename T2>
  Fixed operator/=(Fixed<F2, T2> rhs) {
    *this = (*this) / rhs;
    return *this;
  }

  Fixed operator*=(int32_t rhs) {
    *this = (*this) * rhs;
    return *this;
  }

  Fixed operator*=(uint32_t rhs) {
    *this = (*this) * rhs;
    return *this;
  }

  Fixed operator*=(float rhs) {
    *this = (*this) * rhs;
    return *this;
  }

  Fixed operator/=(int32_t rhs) {
    *this = (*this) / rhs;
    return *this;
  }

  Fixed operator/=(uint32_t rhs) {
    *this = (*this) / rhs;
    return *this;
  }

  Fixed operator/=(float rhs) {
    *this = (*this) / rhs;
    return *this;
  }

  friend Fixed operator+(Fixed lhs, Fixed rhs) {
    assert((static_cast<int64_t>(lhs.data_) + rhs.data_) ==
           (lhs.data_ + rhs.data_));
    return Fixed::make(lhs.data_ + rhs.data_);
  }

  friend Fixed operator-(Fixed lhs, Fixed rhs) {
    assert((static_cast<int64_t>(lhs.data_) - rhs.data_) ==
           (lhs.data_ - rhs.data_));
    return Fixed::make(lhs.data_ - rhs.data_);
  }

  // The product of two raw values carries 2*FRAC fraction bits; shift back to FRAC.
  friend Fixed operator*(Fixed lhs, Fixed rhs) {
    return Fixed::make(static_cast<T>((static_cast<int64_t>(lhs.data_) * rhs.data_) >> FRAC));
  }

  // Mixed-format product: the result keeps the left operand's format.
  template <uint32_t F2, typename T2>
  friend Fixed operator*(Fixed lhs, Fixed<F2, T2> rhs) {
    return Fixed::make(static_cast<T>((static_cast<int64_t>(lhs.data_) * rhs.data()) >> F2));
  }

  // The dividend is pre-scaled so the quotient keeps FRAC fraction bits.
  friend Fixed operator/(Fixed lhs, Fixed rhs) {
    assert(rhs.data_ != 0);
    return Fixed::make(static_cast<T>((static_cast<int64_t>(lhs.data_) << FRAC) / rhs.data_));
  }

  // Mixed-format quotient: the result keeps the left operand's format.
  template <uint32_t F2, typename T2>
  friend Fixed operator/(Fixed lhs, Fixed<F2, T2> rhs) {
    assert(rhs.data() != 0);
    return Fixed::make(static_cast<T>((static_cast<int64_t>(lhs.data_) << F2) / rhs.data()));
  }

  // Float arithmetic is carried out in float and converted back explicitly
  // (the float constructor is explicit, so a bare float cannot be returned).
  friend Fixed operator*(Fixed lhs, float rhs) {
    return Fixed(static_cast<float>(lhs) * rhs);
  }

  friend Fixed operator*(float lhs, Fixed rhs) {
    return Fixed(lhs * static_cast<float>(rhs));
  }

  friend Fixed operator/(Fixed lhs, float rhs) {
    return Fixed(static_cast<float>(lhs) / rhs);
  }

  friend Fixed operator/(float lhs, Fixed rhs) {
    return Fixed(lhs / static_cast<float>(rhs));
  }

  friend Fixed operator*(Fixed lhs, char rhs) {
    return lhs * static_cast<int32_t>(rhs);
  }

  friend Fixed operator*(char lhs, Fixed rhs) {
    return rhs * lhs;
  }

  friend Fixed operator/(Fixed lhs, char rhs) {
    return lhs / static_cast<int32_t>(rhs);
  }

  // Division is not commutative: promote the scalar and divide it by rhs.
  friend Fixed operator/(char lhs, Fixed rhs) {
    return Fixed(static_cast<int32_t>(lhs)) / rhs;
  }

  friend Fixed operator*(Fixed lhs, uint8_t rhs) {
    return lhs * static_cast<int32_t>(rhs);
  }

  friend Fixed operator*(uint8_t lhs, Fixed rhs) {
    return rhs * lhs;
  }

  friend Fixed operator/(Fixed lhs, uint8_t rhs) {
    return lhs / static_cast<int32_t>(rhs);
  }

  friend Fixed operator/(uint8_t lhs, Fixed rhs) {
    return Fixed(static_cast<int32_t>(lhs)) / rhs;
  }

  friend Fixed operator*(Fixed lhs, short rhs) {
    return lhs * static_cast<int32_t>(rhs);
  }

  friend Fixed operator*(short lhs, Fixed rhs) {
    return rhs * lhs;
  }

  friend Fixed operator/(Fixed lhs, short rhs) {
    return lhs / static_cast<int32_t>(rhs);
  }

  friend Fixed operator/(short lhs, Fixed rhs) {
    return Fixed(static_cast<int32_t>(lhs)) / rhs;
  }

  friend Fixed operator*(Fixed lhs, uint16_t rhs) {
    return lhs * static_cast<int32_t>(rhs);
  }

  friend Fixed operator*(uint16_t lhs, Fixed rhs) {
    return rhs * lhs;
  }

  friend Fixed operator/(Fixed lhs, uint16_t rhs) {
    return lhs / static_cast<int32_t>(rhs);
  }

  friend Fixed operator/(uint16_t lhs, Fixed rhs) {
    return Fixed(static_cast<int32_t>(lhs)) / rhs;
  }

  // Scaling by an integer multiplies the raw value directly; the product is
  // formed in 64 bits so the overflow assert itself cannot overflow.
  friend Fixed operator*(Fixed lhs, int32_t rhs) {
    const int64_t wide = static_cast<int64_t>(lhs.data_) * static_cast<int64_t>(rhs);
    auto value = static_cast<T>(wide);
    assert(wide == value);
    return Fixed::make(value);
  }

  friend Fixed operator*(int32_t lhs, Fixed rhs) {
    return rhs * lhs;
  }

  friend Fixed operator/(Fixed lhs, int32_t rhs) {
    assert(rhs);
    auto value = static_cast<T>(lhs.data_ / rhs);
    return Fixed::make(value);
  }

  friend Fixed operator/(int32_t lhs, Fixed rhs) {
    return Fixed(lhs) / rhs;
  }

  // Multiplies by an unsigned integer (previously this shifted left by rhs).
  friend Fixed operator*(Fixed lhs, uint32_t rhs) {
    const int64_t wide = static_cast<int64_t>(lhs.data_) * static_cast<int64_t>(rhs);
    auto value = static_cast<T>(wide);
    assert(wide == value);
    return Fixed::make(value);
  }

  friend Fixed operator*(uint32_t lhs, Fixed rhs) {
    return rhs * lhs;
  }

  // Divide in signed 64-bit arithmetic: a plain `data_ / rhs` would convert a
  // negative raw value to unsigned before dividing.
  friend Fixed operator/(Fixed lhs, uint32_t rhs) {
    assert(rhs);
    auto value = static_cast<T>(static_cast<int64_t>(lhs.data_) / static_cast<int64_t>(rhs));
    return Fixed::make(value);
  }

  friend Fixed operator/(uint32_t lhs, Fixed rhs) {
    return Fixed(lhs) / rhs;
  }

  friend Fixed operator<<(Fixed lhs, int32_t rhs) {
    auto value = static_cast<T>(lhs.data_ << rhs);
    assert((lhs.data_ << static_cast<int64_t>(rhs)) == value);
    return Fixed::make(value);
  }

  friend Fixed operator>>(Fixed lhs, int32_t rhs) {
    auto value = static_cast<T>(lhs.data_ >> rhs);
    return Fixed::make(value);
  }

  friend Fixed operator<<(Fixed lhs, uint32_t rhs) {
    auto value = static_cast<T>(lhs.data_ << rhs);
    assert((lhs.data_ << static_cast<int64_t>(rhs)) == value);
    return Fixed::make(value);
  }

  friend Fixed operator>>(Fixed lhs, uint32_t rhs) {
    auto value = static_cast<T>(lhs.data_ >> rhs);
    return Fixed::make(value);
  }

  /// Builds a Fixed directly from a raw (already scaled) representation.
  static Fixed make(T value) {
    Fixed ret;
    ret.data_ = value;
    return ret;
  }

  // Integer conversions drop the fractional bits with a right shift of the raw value.
  explicit operator int64_t() const {
    return static_cast<int64_t>(data_ >> F);
  }

  explicit operator uint64_t() const {
    return static_cast<uint64_t>(data_ >> F);
  }

  explicit operator int32_t() const {
    return static_cast<int32_t>(data_ >> F);
  }

  explicit operator uint32_t() const {
    return static_cast<uint32_t>(data_ >> F);
  }

  explicit operator int16_t() const {
    return static_cast<int16_t>(data_ >> F);
  }

  explicit operator uint16_t() const {
    return static_cast<uint16_t>(data_ >> F);
  }

  explicit operator int8_t() const {
    return static_cast<int8_t>(data_ >> F);
  }

  explicit operator uint8_t() const {
    return static_cast<uint8_t>(data_ >> F);
  }

  template <uint32_t F2, typename T2>
  explicit operator Fixed<F2, T2>() const {
    return Fixed<F2, T2>(*this);
  }

  explicit operator float() const {
    return static_cast<float>(data_) / (static_cast<T>(1) << F);
  }

  /// Returns the raw (scaled) representation.
  T data() const {
    return data_;
  }

private:
  T data_;
};
|
||||
@@ -5,10 +5,9 @@
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <queue>
|
||||
#include <assert.h>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class SimObjectBase;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@@ -59,32 +58,44 @@ protected:
|
||||
template <typename Pkt>
|
||||
class SimPort : public SimPortBase {
|
||||
public:
|
||||
void send(const Pkt& pkt, uint64_t delay) const;
|
||||
void send(const Pkt& pkt, uint64_t delay) const;
|
||||
|
||||
bool read(Pkt* out) {
|
||||
if (!valid_)
|
||||
return false;
|
||||
*out = data_;
|
||||
valid_ = false;
|
||||
return true;
|
||||
void bind(SimPort<Pkt>* peer) {
|
||||
this->connect(peer);
|
||||
}
|
||||
|
||||
void unbind() {
|
||||
this->disconnect();
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return queue_.empty();
|
||||
}
|
||||
|
||||
const Pkt& top() const {
|
||||
return queue_.front();
|
||||
}
|
||||
|
||||
Pkt& top() {
|
||||
return queue_.front();
|
||||
}
|
||||
|
||||
void pop() {
|
||||
queue_.pop();
|
||||
}
|
||||
|
||||
protected:
|
||||
SimPort(SimObjectBase* module)
|
||||
: SimPortBase(module)
|
||||
, valid_(false)
|
||||
{}
|
||||
|
||||
void write(const Pkt& data) {
|
||||
assert(!valid_);
|
||||
data_ = data;
|
||||
valid_ = true;
|
||||
void push(const Pkt& data) {
|
||||
queue_.push(data);
|
||||
}
|
||||
|
||||
SimPort& operator=(const SimPort&) = delete;
|
||||
|
||||
Pkt data_;
|
||||
bool valid_;
|
||||
std::queue<Pkt> queue_;
|
||||
|
||||
template <typename U> friend class SimPortEvent;
|
||||
};
|
||||
@@ -94,15 +105,7 @@ protected:
|
||||
template <typename Pkt>
|
||||
class SlavePort : public SimPort<Pkt> {
|
||||
public:
|
||||
SlavePort(SimObjectBase* module) : SimPort<Pkt>(module) {}
|
||||
|
||||
void bind(SlavePort<Pkt>* peer) {
|
||||
this->connect(peer);
|
||||
}
|
||||
|
||||
void unbind() {
|
||||
this->disconnect();
|
||||
}
|
||||
SlavePort(SimObjectBase* module) : SimPort<Pkt>(module) {}
|
||||
|
||||
protected:
|
||||
SlavePort& operator=(const SlavePort&) = delete;
|
||||
@@ -115,18 +118,6 @@ class MasterPort : public SimPort<Pkt> {
|
||||
public:
|
||||
MasterPort(SimObjectBase* module) : SimPort<Pkt>(module) {}
|
||||
|
||||
void bind(SlavePort<Pkt>* peer) {
|
||||
this->connect(peer);
|
||||
}
|
||||
|
||||
void bind(MasterPort<Pkt>* peer) {
|
||||
this->connect(peer);
|
||||
}
|
||||
|
||||
void unbind() {
|
||||
this->disconnect();
|
||||
}
|
||||
|
||||
protected:
|
||||
MasterPort& operator=(const MasterPort&) = delete;
|
||||
};
|
||||
@@ -194,7 +185,7 @@ public:
|
||||
{}
|
||||
|
||||
void fire() const override {
|
||||
const_cast<SimPort<Pkt>*>(port_)->write(pkt_);
|
||||
const_cast<SimPort<Pkt>*>(port_)->push(pkt_);
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -382,6 +373,4 @@ template <typename T, typename Pkt>
|
||||
void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) {
|
||||
auto callback = std::bind(entry, obj, std::placeholders::_1);
|
||||
SimPlatform::instance().schedule(callback, pkt, delay);
|
||||
}
|
||||
|
||||
}
|
||||
221
sim/common/texturing.h
Normal file
221
sim/common/texturing.h
Normal file
@@ -0,0 +1,221 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <fixed.h>
|
||||
#include <bitmanip.h>
|
||||
|
||||
/// Selects how Clamp() maps a texture coordinate back into range.
enum class WrapMode {
  Clamp  = 0,  // saturate the coordinate
  Repeat = 1,  // keep only the fractional bits
  Mirror = 2,  // invert the coordinate on odd repetitions
};
|
||||
|
||||
/// Texel storage formats understood by the texturing helpers.
enum class TexFormat {
  R8G8B8A8 = 0,
  R5G6B5   = 1,
  R4G4B4A4 = 2,
  L8A8     = 3,
  L8       = 4,
  A8       = 5,
};
|
||||
|
||||
template <uint32_t F, typename T = int32_t>
|
||||
T Clamp(Fixed<F,T> fx, WrapMode mode) {
|
||||
switch (mode) {
|
||||
case WrapMode::Clamp: return (fx.data() < 0) ? 0 : ((fx.data() > Fixed<F,T>::MASK) ? Fixed<F,T>::MASK : fx.data());
|
||||
case WrapMode::Repeat: return (fx.data() & Fixed<F,T>::MASK);
|
||||
case WrapMode::Mirror: return (bit_get(fx.data(), Fixed<F,T>::FRAC) ? ~fx.data() : fx.data());
|
||||
default:
|
||||
std::abort();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the per-texel stride for `format`: 4 for R8G8B8A8, 2 for the
/// R5G6B5 / R4G4B4A4 / L8A8 formats and 1 for L8 / A8 (presumably bytes per
/// texel, confirm against callers). Unknown formats abort.
inline uint32_t Stride(TexFormat format) {
  if (format == TexFormat::R8G8B8A8)
    return 4;
  if (format == TexFormat::R5G6B5
   || format == TexFormat::R4G4B4A4
   || format == TexFormat::L8A8)
    return 2;
  if (format == TexFormat::L8 || format == TexFormat::A8)
    return 1;
  std::abort();
  return 0;
}
|
||||
|
||||
/// Splits `texel` into two words written to `*lo` and `*hi`.
/// For R8G8B8A8, `*lo` receives bytes 0 and 2 and `*hi` bytes 1 and 3, each
/// spread over a 0x00ff00ff layout. All other formats write 0 to `*hi`.
/// Unknown formats abort without writing the outputs.
/// NOTE(review): R5G6B5 / R4G4B4A4 pass the texel through unexpanded, while
/// the L8 and A8 cases use masks (0x07e0f81f, 0x0f0f0f0f) that look like
/// 5-6-5 and 4-4-4-4 spreading masks; verify the intended per-format mapping.
inline void Unpack8888(TexFormat format,
                       uint32_t texel,
                       uint32_t* lo,
                       uint32_t* hi) {
  uint32_t low = 0;
  uint32_t high = 0;
  switch (format) {
  case TexFormat::R8G8B8A8:
    low  = texel & 0x00ff00ff;
    high = (texel >> 8) & 0x00ff00ff;
    break;
  case TexFormat::R5G6B5:
  case TexFormat::R4G4B4A4:
    low = texel;
    break;
  case TexFormat::L8A8:
    low = (texel | (texel << 8)) & 0x00ff00ff;
    break;
  case TexFormat::L8:
    low = (texel | (texel << 16)) & 0x07e0f81f;
    break;
  case TexFormat::A8:
    low = (texel | (texel << 12)) & 0x0f0f0f0f;
    break;
  default:
    std::abort();
  }
  *lo = low;
  *hi = high;
}
|
||||
|
||||
/// Recombines the two words produced by Unpack8888() into a single value.
/// R8G8B8A8 interleaves `hi` (shifted up by 8) with `lo`; R5G6B5 and R4G4B4A4
/// return `lo` unchanged; the remaining formats fold the upper part of `lo`
/// onto its low 16 bits. `hi` is only used for R8G8B8A8. Unknown formats abort.
inline uint32_t Pack8888(TexFormat format, uint32_t lo, uint32_t hi) {
  if (format == TexFormat::R8G8B8A8)
    return (hi << 8) | lo;
  if (format == TexFormat::R5G6B5 || format == TexFormat::R4G4B4A4)
    return lo;

  // Remaining formats differ only in how far `lo` is folded down.
  uint32_t fold = 0;
  if (format == TexFormat::L8A8) {
    fold = 8;
  } else if (format == TexFormat::L8) {
    fold = 16;
  } else if (format == TexFormat::A8) {
    fold = 12;
  } else {
    std::abort();
    return 0;
  }
  return (lo | (lo >> fold)) & 0xffff;
}
|
||||
|
||||
/// Interpolates from (al, ah) towards (bl, bh) by `frac` / 256 and writes the
/// results to `*lo` / `*hi`. Each word is treated as two 8-bit channels held
/// in a 0x00ff00ff layout; the result is masked back to that layout.
inline void Lerp8888(uint32_t al,
                     uint32_t ah,
                     uint32_t bl,
                     uint32_t bh,
                     uint32_t frac,
                     uint32_t* lo,
                     uint32_t* hi) {
  const auto blend = [frac](uint32_t from, uint32_t to) -> uint32_t {
    // Unsigned wrap-around makes the difference behave like a signed delta.
    const uint32_t step = ((to - from) * frac) >> 8;
    return (from + step) & 0x00ff00ff;
  };
  *lo = blend(al, bl);
  *hi = blend(ah, bh);
}
|
||||
|
||||
template <uint32_t F, typename T = int32_t>
|
||||
void TexAddressLinear(Fixed<F,T> fu,
|
||||
Fixed<F,T> fv,
|
||||
uint32_t log_width,
|
||||
uint32_t log_height,
|
||||
WrapMode wrapu,
|
||||
WrapMode wrapv,
|
||||
uint32_t* addr00,
|
||||
uint32_t* addr01,
|
||||
uint32_t* addr10,
|
||||
uint32_t* addr11,
|
||||
uint32_t* alpha,
|
||||
uint32_t* beta
|
||||
) {
|
||||
auto delta_x = Fixed<F,T>::make(Fixed<F,T>::HALF >> log_width);
|
||||
auto delta_y = Fixed<F,T>::make(Fixed<F,T>::HALF >> log_height);
|
||||
|
||||
uint32_t u0 = Clamp(fu - delta_x, wrapu);
|
||||
uint32_t u1 = Clamp(fu + delta_x, wrapu);
|
||||
uint32_t v0 = Clamp(fv - delta_y, wrapv);
|
||||
uint32_t v1 = Clamp(fv + delta_y, wrapv);
|
||||
|
||||
uint32_t shift_u = (Fixed<F,T>::FRAC - log_width);
|
||||
uint32_t shift_v = (Fixed<F,T>::FRAC - log_height);
|
||||
|
||||
uint32_t x0s = (u0 << 8) >> shift_u;
|
||||
uint32_t y0s = (v0 << 8) >> shift_v;
|
||||
|
||||
uint32_t x0 = x0s >> 8;
|
||||
uint32_t y0 = y0s >> 8;
|
||||
uint32_t x1 = u1 >> shift_u;
|
||||
uint32_t y1 = v1 >> shift_v;
|
||||
|
||||
*addr00 = x0 + (y0 << log_width);
|
||||
*addr01 = x1 + (y0 << log_width);
|
||||
*addr10 = x0 + (y1 << log_width);
|
||||
*addr11 = x1 + (y1 << log_width);
|
||||
|
||||
*alpha = x0s & 0xff;
|
||||
*beta = y0s & 0xff;
|
||||
|
||||
//printf("*** fu=0x%x, fv=0x%x, u0=0x%x, u1=0x%x, v0=0x%x, v1=0x%x, x0=0x%x, x1=0x%x, y0=0x%x, y1=0x%x, addr00=0x%x, addr01=0x%x, addr10=0x%x, addr11=0x%x\n", fu.data(), fv.data(), u0, u1, v0, v1, x0, x1, y0, y1, *addr00, *addr01, *addr10, *addr11);
|
||||
}
|
||||
|
||||
template <uint32_t F, typename T = int32_t>
|
||||
void TexAddressPoint(Fixed<F,T> fu,
|
||||
Fixed<F,T> fv,
|
||||
uint32_t log_width,
|
||||
uint32_t log_height,
|
||||
WrapMode wrapu,
|
||||
WrapMode wrapv,
|
||||
uint32_t* addr
|
||||
) {
|
||||
uint32_t u = Clamp(fu, wrapu);
|
||||
uint32_t v = Clamp(fv, wrapv);
|
||||
|
||||
uint32_t x = u >> (Fixed<F,T>::FRAC - log_width);
|
||||
uint32_t y = v >> (Fixed<F,T>::FRAC - log_height);
|
||||
|
||||
*addr = x + (y << log_width);
|
||||
|
||||
//printf("*** fu=0x%x, fv=0x%x, u=0x%x, v=0x%x, x=0x%x, y=0x%x, addr=0x%x\n", fu.data(), fv.data(), u, v, x, y, *addr);
|
||||
}
|
||||
|
||||
/// Bilinearly blends four texels of the given format.
/// texel00/texel01 and texel10/texel11 are each blended with `alpha`, the two
/// results are then blended with `beta`, and the outcome is packed with the
/// R8G8B8A8 layout regardless of the source format.
inline uint32_t TexFilterLinear(
  TexFormat format,
  uint32_t texel00,
  uint32_t texel01,
  uint32_t texel10,
  uint32_t texel11,
  uint32_t alpha,
  uint32_t beta
) {
  // Unpack all four corners up front.
  uint32_t t00_lo, t00_hi;
  uint32_t t01_lo, t01_hi;
  uint32_t t10_lo, t10_hi;
  uint32_t t11_lo, t11_hi;
  Unpack8888(format, texel00, &t00_lo, &t00_hi);
  Unpack8888(format, texel01, &t01_lo, &t01_hi);
  Unpack8888(format, texel10, &t10_lo, &t10_hi);
  Unpack8888(format, texel11, &t11_lo, &t11_hi);

  // Blend within each row using alpha.
  uint32_t top_lo, top_hi;
  uint32_t bottom_lo, bottom_hi;
  Lerp8888(t00_lo, t00_hi, t01_lo, t01_hi, alpha, &top_lo, &top_hi);
  Lerp8888(t10_lo, t10_hi, t11_lo, t11_hi, alpha, &bottom_lo, &bottom_hi);

  // Blend the two rows using beta.
  uint32_t out_lo, out_hi;
  Lerp8888(top_lo, top_hi, bottom_lo, bottom_hi, beta, &out_lo, &out_hi);

  return Pack8888(TexFormat::R8G8B8A8, out_lo, out_hi);
}
|
||||
|
||||
/// Converts a single texel of the given format by unpacking it and
/// re-packing the result with the R8G8B8A8 layout.
inline uint32_t TexFilterPoint(TexFormat format, uint32_t texel) {
  uint32_t low;
  uint32_t high;
  Unpack8888(format, texel, &low, &high);
  return Pack8888(TexFormat::R8G8B8A8, low, high);
}
|
||||
@@ -3,85 +3,12 @@
|
||||
#include <cstdint>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <bitmanip.h>
|
||||
|
||||
/// Accepts any number of arguments and does nothing with them; used to
/// silence unused-variable warnings.
template <typename... Ts>
void unused(Ts&&...) {
}
|
||||
|
||||
#define __unused(...) unused(__VA_ARGS__)
|
||||
|
||||
/// Returns the number of zero bits above the most significant set bit of
/// `value`; yields 32 when `value` is zero (the builtin is not called then).
constexpr uint32_t count_leading_zeros(uint32_t value) {
  return (value == 0) ? 32u : static_cast<uint32_t>(__builtin_clz(value));
}
|
||||
|
||||
/// Returns the number of zero bits below the least significant set bit of
/// `value`; yields 32 when `value` is zero (the builtin is not called then).
constexpr uint32_t count_trailing_zeros(uint32_t value) {
  return (value == 0) ? 32u : static_cast<uint32_t>(__builtin_ctz(value));
}
|
||||
|
||||
/// Returns true when exactly one bit of `value` is set (zero is rejected).
constexpr bool ispow2(uint32_t value) {
  // Clearing the lowest set bit leaves zero only for single-bit values.
  return (value != 0) && ((value & (value - 1)) == 0);
}
|
||||
|
||||
constexpr uint32_t log2ceil(uint32_t value) {
|
||||
return 32 - count_leading_zeros(value - 1);
|
||||
}
|
||||
|
||||
inline unsigned log2up(uint32_t value) {
|
||||
return std::max<uint32_t>(1, log2ceil(value));
|
||||
}
|
||||
|
||||
constexpr unsigned log2floor(uint32_t value) {
|
||||
return 31 - count_leading_zeros(value);
|
||||
}
|
||||
|
||||
constexpr unsigned ceil2(uint32_t value) {
|
||||
return 32 - count_leading_zeros(value);
|
||||
}
|
||||
|
||||
/// Returns `bits` with bit `index` (0..63) cleared.
inline uint64_t bit_clr(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t selected = 1ull << index;
  return bits & ~selected;
}
|
||||
|
||||
/// Returns `bits` with bit `index` (0..63) set.
inline uint64_t bit_set(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t selected = 1ull << index;
  return bits | selected;
}
|
||||
|
||||
/// Returns true when bit `index` (0..63) of `bits` is set.
inline bool bit_get(uint64_t bits, uint32_t index) {
  assert(index <= 63);
  const uint64_t shifted = bits >> index;
  return (shifted & 0x1) != 0;
}
|
||||
|
||||
/// Returns `bits` with the inclusive bit window [start, end] cleared.
/// Requires start <= end <= 63.
inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
  assert(end >= start);
  assert(end <= 63);
  const uint32_t top_gap = 63 - end;
  // Push all-ones left to drop the low bits, then right to drop the high ones,
  // which leaves ones exactly in [start, end].
  const uint64_t pushed = 0xffffffffffffffff << (top_gap + start);
  const uint64_t window = pushed >> top_gap;
  return bits & ~window;
}
|
||||
|
||||
inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) {
|
||||
assert(end >= start);
|
||||
assert(end <= 63);
|
||||
uint32_t shift = 63 - end;
|
||||
uint64_t dirty = (value << (shift + start)) >> shift;
|
||||
return bit_clrw(bits, start, end) | dirty;
|
||||
}
|
||||
|
||||
/// Extracts the inclusive bit window [start, end] of `bits`, right-aligned.
/// Requires start <= end <= 63.
inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
  assert(end >= start);
  assert(end <= 63);
  const uint32_t top_gap = 63 - end;
  const uint64_t high_trimmed = bits << top_gap;  // drops bits above `end`
  return high_trimmed >> (top_gap + start);       // drops bits below `start`
}
|
||||
|
||||
// Apply integer sign extension.
// Treats bit (width - 1) of `word` as the sign bit and, when it is set, fills
// every bit at position `width` and above with ones. When the sign bit is
// clear `word` is returned unchanged. Requires 1 < width <= 32.
inline uint32_t sext32(uint32_t word, uint32_t width) {
  assert(width > 1);
  assert(width <= 32);
  // Shifting a 32-bit value by 32 is undefined, so the full-width mask is
  // produced explicitly; unsigned arithmetic also avoids overflowing `1 << 31`.
  const uint32_t mask = (width < 32) ? ((uint32_t{1} << width) - 1) : ~uint32_t{0};
  const bool negative = ((word >> (width - 1)) & 0x1) != 0;
  return negative ? (word | ~mask) : word;
}
|
||||
|
||||
// return file extension
|
||||
const char* fileExtension(const char* filepath);
|
||||
@@ -23,8 +23,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
|
||||
|
||||
DBG_FLAGS += $(DBG_TRACE_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
DBG_FLAGS += -DVCD_OUTPUT
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
|
||||
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
|
||||
@@ -51,10 +49,17 @@ VL_FLAGS += $(RTL_INCLUDE)
|
||||
VL_FLAGS += $(CONFIGS)
|
||||
CXXFLAGS += $(CONFIGS)
|
||||
|
||||
# Enable Verilator multithreaded simulation
|
||||
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
|
||||
#VL_FLAGS += --threads $(THREADS)
|
||||
|
||||
# Enable VCD trace
|
||||
VCD_TRACE = -DVCD_OUTPUT
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS)
|
||||
CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
|
||||
VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS)
|
||||
CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS)
|
||||
else
|
||||
VL_FLAGS += -DNDEBUG
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
|
||||
@@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
|
||||
TOP = vx_cache_sim
|
||||
|
||||
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
|
||||
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp
|
||||
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp main.cpp
|
||||
|
||||
OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
|
||||
VPATH := $(sort $(dir $(SRCS)))
|
||||
|
||||
@@ -13,6 +13,7 @@ struct params_t {
|
||||
uint32_t sets_per_bank;
|
||||
uint32_t blocks_per_set;
|
||||
uint32_t words_per_block;
|
||||
uint32_t log2_num_inputs;
|
||||
|
||||
uint32_t word_select_addr_start;
|
||||
uint32_t word_select_addr_end;
|
||||
@@ -31,8 +32,10 @@ struct params_t {
|
||||
uint32_t offset_bits = config.B - config.W;
|
||||
uint32_t log2_bank_size = config.C - bank_bits;
|
||||
uint32_t index_bits = log2_bank_size - (config.B << config.A);
|
||||
assert(log2_bank_size >= config.B);
|
||||
|
||||
assert(log2_bank_size >= config.B);
|
||||
|
||||
this->log2_num_inputs = log2ceil(config.num_inputs);
|
||||
|
||||
this->words_per_block = 1 << offset_bits;
|
||||
this->blocks_per_set = 1 << config.A;
|
||||
this->sets_per_bank = 1 << index_bits;
|
||||
@@ -104,7 +107,7 @@ struct set_t {
|
||||
struct bank_req_info_t {
|
||||
bool valid;
|
||||
uint32_t req_id;
|
||||
uint32_t req_tag;
|
||||
uint64_t req_tag;
|
||||
};
|
||||
|
||||
struct bank_req_t {
|
||||
@@ -194,7 +197,7 @@ public:
|
||||
return root_entry;
|
||||
}
|
||||
|
||||
bool try_pop(bank_req_t* out) {
|
||||
bool pop(bank_req_t* out) {
|
||||
for (auto& entry : entries_) {
|
||||
if (entry.valid && entry.mshr_replay) {
|
||||
*out = entry;
|
||||
@@ -208,16 +211,13 @@ public:
|
||||
};
|
||||
|
||||
struct bank_t {
|
||||
std::vector<set_t> sets;
|
||||
MSHR mshr;
|
||||
std::queue<bank_req_t> stall_buffer;
|
||||
bank_req_t active_req;
|
||||
std::vector<set_t> sets;
|
||||
MSHR mshr;
|
||||
|
||||
bank_t(const CacheConfig& config,
|
||||
const params_t& params)
|
||||
: sets(params.sets_per_bank, params.blocks_per_set)
|
||||
, mshr(config.mshr_size)
|
||||
, active_req(config.ports_per_bank)
|
||||
{}
|
||||
};
|
||||
|
||||
@@ -229,8 +229,8 @@ private:
|
||||
CacheConfig config_;
|
||||
params_t params_;
|
||||
std::vector<bank_t> banks_;
|
||||
std::vector<std::queue<uint32_t>> core_rsps_;
|
||||
Switch<MemReq, MemRsp>::Ptr mem_switch_;
|
||||
Switch<MemReq, MemRsp>::Ptr mem_switch_;
|
||||
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
|
||||
std::vector<MasterPort<MemReq>> mem_req_ports_;
|
||||
std::vector<SlavePort<MemRsp>> mem_rsp_ports_;
|
||||
|
||||
@@ -240,241 +240,270 @@ public:
|
||||
, config_(config)
|
||||
, params_(config)
|
||||
, banks_(config.num_banks, {config, params_})
|
||||
, core_rsps_(config.num_inputs)
|
||||
, mem_req_ports_(config.num_banks, simobject)
|
||||
, mem_rsp_ports_(config.num_banks, simobject)
|
||||
{
|
||||
bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
|
||||
bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
|
||||
simobject->MemRspPort.bind(&bypass_switch_->RspIn);
|
||||
|
||||
if (config.num_banks > 1) {
|
||||
mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
|
||||
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
|
||||
mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
|
||||
mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
|
||||
}
|
||||
mem_switch_->ReqOut.bind(&simobject->MemReqPort);
|
||||
simobject->MemRspPort.bind(&mem_switch_->RspIn);
|
||||
mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
|
||||
bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
|
||||
} else {
|
||||
mem_req_ports_.at(0).bind(&simobject->MemReqPort);
|
||||
simobject->MemRspPort.bind(&mem_rsp_ports_.at(0));
|
||||
mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
|
||||
bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
|
||||
}
|
||||
}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
// process core response
|
||||
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
|
||||
auto& core_rsp = core_rsps_.at(req_id);
|
||||
if (!core_rsp.empty()) {
|
||||
simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_rsp.front()}, config_.latency);
|
||||
core_rsp.pop();
|
||||
}
|
||||
// handle bypass responses
|
||||
auto& bypass_port = bypass_switch_->RspOut.at(1);
|
||||
if (!bypass_port.empty()) {
|
||||
auto& mem_rsp = bypass_port.top();
|
||||
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
|
||||
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
|
||||
MemRsp core_rsp(tag);
|
||||
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
|
||||
bypass_port.pop();
|
||||
}
|
||||
|
||||
for (auto& bank : banks_) {
|
||||
auto& active_req = bank.active_req;
|
||||
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
|
||||
|
||||
// try chedule mshr replay
|
||||
if (!active_req.valid) {
|
||||
bank.mshr.try_pop(&active_req);
|
||||
}
|
||||
|
||||
// try schedule stall queue if MSHR has space
|
||||
if (!active_req.valid
|
||||
&& !bank.stall_buffer.empty()
|
||||
&& !bank.mshr.full()) {
|
||||
active_req = bank.stall_buffer.front();
|
||||
bank.stall_buffer.pop();
|
||||
}
|
||||
}
|
||||
// handle MSHR replay
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& pipeline_req = pipeline_reqs.at(bank_id);
|
||||
bank.mshr.pop(&pipeline_req);
|
||||
}
|
||||
|
||||
// handle memory fills
|
||||
for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) {
|
||||
MemRsp mem_rsp;
|
||||
if (mem_rsp_ports_.at(i).read(&mem_rsp)) {
|
||||
this->processMemoryFill(i, mem_rsp.tag);
|
||||
std::vector<bool> pending_fill_req(config_.num_banks, false);
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
|
||||
if (!mem_rsp_port.empty()) {
|
||||
auto& mem_rsp = mem_rsp_port.top();
|
||||
this->processMemoryFill(bank_id, mem_rsp.tag);
|
||||
pending_fill_req.at(bank_id) = true;
|
||||
mem_rsp_port.pop();
|
||||
}
|
||||
}
|
||||
|
||||
// handle incoming core requests
|
||||
for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) {
|
||||
MemReq core_req;
|
||||
if (!simobject_->CoreReqPorts.at(i).read(&core_req))
|
||||
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
|
||||
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
|
||||
if (core_req_port.empty())
|
||||
continue;
|
||||
|
||||
auto bank_id = params_.addr_bank_id(core_req.addr);
|
||||
auto set_id = params_.addr_set_id(core_req.addr);
|
||||
auto tag = params_.addr_tag(core_req.addr);
|
||||
auto port_id = i % config_.ports_per_bank;
|
||||
auto& core_req = core_req_port.top();
|
||||
|
||||
// check cache bypassing
|
||||
if (core_req.is_io) {
|
||||
// send IO request
|
||||
this->processIORequest(core_req, req_id);
|
||||
|
||||
// remove request
|
||||
core_req_port.pop();
|
||||
continue;
|
||||
}
|
||||
|
||||
auto bank_id = params_.addr_bank_id(core_req.addr);
|
||||
auto set_id = params_.addr_set_id(core_req.addr);
|
||||
auto tag = params_.addr_tag(core_req.addr);
|
||||
auto port_id = req_id % config_.ports_per_bank;
|
||||
|
||||
// create abnk request
|
||||
// create bank request
|
||||
bank_req_t bank_req(config_.ports_per_bank);
|
||||
bank_req.valid = true;
|
||||
bank_req.write = core_req.write;
|
||||
bank_req.mshr_replay = false;
|
||||
bank_req.tag = tag;
|
||||
bank_req.set_id = set_id;
|
||||
bank_req.infos.at(port_id) = {true, i, core_req.tag};
|
||||
bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
|
||||
|
||||
auto& bank = banks_.at(bank_id);
|
||||
|
||||
// check MSHR capacity
|
||||
if (bank.mshr.full()) {
|
||||
// add to stall buffer
|
||||
bank.stall_buffer.emplace(bank_req);
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& pipeline_req = pipeline_reqs.at(bank_id);
|
||||
|
||||
// check pending MSHR replay
|
||||
if (pipeline_req.valid
|
||||
&& pipeline_req.mshr_replay) {
|
||||
// stall
|
||||
continue;
|
||||
}
|
||||
|
||||
// check pending fill request
|
||||
if (pending_fill_req.at(bank_id)) {
|
||||
// stall
|
||||
continue;
|
||||
}
|
||||
|
||||
auto& active_req = bank.active_req;
|
||||
|
||||
// check pending MSHR request
|
||||
if (active_req.valid
|
||||
&& active_req.mshr_replay) {
|
||||
// add to stall buffer
|
||||
bank.stall_buffer.emplace(bank_req);
|
||||
|
||||
// check MSHR capacity if read or writeback
|
||||
if ((!core_req.write || !config_.write_through)
|
||||
&& bank.mshr.full()) {
|
||||
// stall
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// check bank conflicts
|
||||
if (active_req.valid) {
|
||||
if (pipeline_req.valid) {
|
||||
// check port conflict
|
||||
if (active_req.write != core_req.write
|
||||
|| active_req.set_id != set_id
|
||||
|| active_req.tag != tag
|
||||
|| active_req.infos[port_id].valid) {
|
||||
// add to stall buffer
|
||||
bank.stall_buffer.emplace(bank_req);
|
||||
if (pipeline_req.write != core_req.write
|
||||
|| pipeline_req.set_id != set_id
|
||||
|| pipeline_req.tag != tag
|
||||
|| pipeline_req.infos[port_id].valid) {
|
||||
// stall
|
||||
continue;
|
||||
}
|
||||
// update pending request infos
|
||||
active_req.infos[port_id] = bank_req.infos[port_id];
|
||||
pipeline_req.infos[port_id] = bank_req.infos[port_id];
|
||||
} else {
|
||||
// schedule new request
|
||||
active_req = bank_req;
|
||||
pipeline_req = bank_req;
|
||||
}
|
||||
// remove request
|
||||
core_req_port.pop();
|
||||
}
|
||||
|
||||
// process active request
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
this->processBankRequest(bank_id);
|
||||
// process active request
|
||||
this->processBankRequest(pipeline_reqs);
|
||||
}
|
||||
|
||||
void processIORequest(const MemReq& core_req, uint32_t req_id) {
|
||||
{
|
||||
MemReq mem_req(core_req);
|
||||
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
|
||||
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
|
||||
}
|
||||
|
||||
if (core_req.write && config_.write_reponse) {
|
||||
simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
|
||||
// update block
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& root_entry = bank.mshr.replay(mshr_id);
|
||||
auto& set = bank.sets.at(root_entry.set_id);
|
||||
auto& block = set.blocks.at(root_entry.block_id);
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& entry = bank.mshr.replay(mshr_id);
|
||||
auto& set = bank.sets.at(entry.set_id);
|
||||
auto& block = set.blocks.at(entry.block_id);
|
||||
block.valid = true;
|
||||
block.tag = root_entry.tag;
|
||||
block.tag = entry.tag;
|
||||
}
|
||||
|
||||
void processBankRequest(uint32_t bank_id) {
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& active_req = bank.active_req;
|
||||
if (!active_req.valid)
|
||||
return;
|
||||
void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
auto& pipeline_req = pipeline_reqs.at(bank_id);
|
||||
if (!pipeline_req.valid)
|
||||
continue;
|
||||
|
||||
active_req.valid = false;
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& set = bank.sets.at(pipeline_req.set_id);
|
||||
|
||||
auto& set = bank.sets.at(active_req.set_id);
|
||||
|
||||
if (active_req.mshr_replay) {
|
||||
// send core response
|
||||
for (auto& info : active_req.infos) {
|
||||
core_rsps_.at(info.req_id).emplace(info.req_tag);
|
||||
}
|
||||
} else {
|
||||
bool hit = false;
|
||||
bool found_free_block = false;
|
||||
int hit_block_id = 0;
|
||||
int repl_block_id = 0;
|
||||
uint32_t max_cnt = 0;
|
||||
|
||||
for (int i = 0, n = set.blocks.size(); i < n; ++i) {
|
||||
auto& block = set.blocks.at(i);
|
||||
if (block.valid) {
|
||||
if (block.tag == active_req.tag) {
|
||||
block.lru_ctr = 0;
|
||||
hit_block_id = i;
|
||||
hit = true;
|
||||
} else {
|
||||
++block.lru_ctr;
|
||||
}
|
||||
if (max_cnt < block.lru_ctr) {
|
||||
max_cnt = block.lru_ctr;
|
||||
if (pipeline_req.mshr_replay) {
|
||||
// send core response
|
||||
for (auto& info : pipeline_req.infos) {
|
||||
simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
|
||||
}
|
||||
} else {
|
||||
bool hit = false;
|
||||
bool found_free_block = false;
|
||||
int hit_block_id = 0;
|
||||
int repl_block_id = 0;
|
||||
uint32_t max_cnt = 0;
|
||||
|
||||
for (int i = 0, n = set.blocks.size(); i < n; ++i) {
|
||||
auto& block = set.blocks.at(i);
|
||||
if (block.valid) {
|
||||
if (block.tag == pipeline_req.tag) {
|
||||
block.lru_ctr = 0;
|
||||
hit_block_id = i;
|
||||
hit = true;
|
||||
} else {
|
||||
++block.lru_ctr;
|
||||
}
|
||||
if (max_cnt < block.lru_ctr) {
|
||||
max_cnt = block.lru_ctr;
|
||||
repl_block_id = i;
|
||||
}
|
||||
} else {
|
||||
found_free_block = true;
|
||||
repl_block_id = i;
|
||||
}
|
||||
} else {
|
||||
found_free_block = true;
|
||||
repl_block_id = i;
|
||||
}
|
||||
}
|
||||
|
||||
if (hit) {
|
||||
//
|
||||
// MISS handling
|
||||
//
|
||||
if (active_req.write) {
|
||||
// handle write hit
|
||||
auto& hit_block = set.blocks.at(hit_block_id);
|
||||
if (config_.write_through) {
|
||||
// forward write request to memory
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.tag = 0;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
} else {
|
||||
// mark block as dirty
|
||||
hit_block.dirty = true;
|
||||
}
|
||||
}
|
||||
// send core response
|
||||
for (auto& info : active_req.infos) {
|
||||
core_rsps_.at(info.req_id).emplace(info.req_tag);
|
||||
}
|
||||
} else {
|
||||
//
|
||||
// MISS handling
|
||||
//
|
||||
if (!found_free_block && !config_.write_through) {
|
||||
// write back dirty block
|
||||
auto& repl_block = set.blocks.at(repl_block_id);
|
||||
if (repl_block.dirty) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.tag = 0;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
}
|
||||
}
|
||||
|
||||
if (active_req.write && config_.write_through) {
|
||||
// forward write request to memory
|
||||
{
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.tag = 0;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
if (hit) {
|
||||
//
|
||||
// HIT handling
|
||||
//
|
||||
if (pipeline_req.write) {
|
||||
// handle write hit
|
||||
auto& hit_block = set.blocks.at(hit_block_id);
|
||||
if (config_.write_through) {
|
||||
// forward write request to memory
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
|
||||
mem_req.write = true;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
} else {
|
||||
// mark block as dirty
|
||||
hit_block.dirty = true;
|
||||
}
|
||||
}
|
||||
// send core response
|
||||
for (auto& info : active_req.infos) {
|
||||
core_rsps_.at(info.req_id).emplace(info.req_tag);
|
||||
if (!pipeline_req.write || config_.write_reponse) {
|
||||
for (auto& info : pipeline_req.infos) {
|
||||
simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//
|
||||
// MISS handling
|
||||
//
|
||||
if (!found_free_block && !config_.write_through) {
|
||||
// write back dirty block
|
||||
auto& repl_block = set.blocks.at(repl_block_id);
|
||||
if (repl_block.dirty) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
|
||||
mem_req.write = true;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// lookup
|
||||
int pending = bank.mshr.lookup(active_req);
|
||||
|
||||
// allocate MSHR
|
||||
int mshr_id = bank.mshr.allocate(active_req, repl_block_id);
|
||||
|
||||
// send fill request
|
||||
if (pending == -1) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
|
||||
mem_req.write = active_req.write;
|
||||
mem_req.tag = mshr_id;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
if (pipeline_req.write && config_.write_through) {
|
||||
// forward write request to memory
|
||||
{
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = true;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
}
|
||||
// send core response
|
||||
if (config_.write_reponse) {
|
||||
for (auto& info : pipeline_req.infos) {
|
||||
simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// MSHR lookup
|
||||
int pending = bank.mshr.lookup(pipeline_req);
|
||||
|
||||
// allocate MSHR
|
||||
int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
|
||||
|
||||
// send fill request
|
||||
if (pending == -1) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = pipeline_req.write;
|
||||
mem_req.tag = mshr_id;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,8 @@ struct CacheConfig {
|
||||
uint8_t num_banks; // number of banks
|
||||
uint8_t ports_per_bank; // number of ports per bank
|
||||
uint8_t num_inputs; // number of inputs
|
||||
bool write_through; // is write-through cache
|
||||
bool write_through; // is write-through
|
||||
bool write_reponse; // enable write response
|
||||
uint16_t victim_size; // victim cache size
|
||||
uint16_t mshr_size; // MSHR buffer size
|
||||
uint8_t latency; // pipeline latency
|
||||
|
||||
@@ -10,11 +10,7 @@ namespace vortex {
|
||||
|
||||
struct Constants {
|
||||
|
||||
static constexpr uint32_t CORE_TO_DCACHE_DELAY = 1 + SM_ENABLE;
|
||||
static constexpr uint32_t CORE_TO_ICACHE_DELAY = 1;
|
||||
|
||||
static constexpr uint32_t ICACHE_TO_MEM_DELAY = 2;
|
||||
static constexpr uint32_t DCACHE_TO_MEM_DELAY = 2;
|
||||
static constexpr uint32_t SMEM_DELAY = 1 + SM_ENABLE;
|
||||
|
||||
};
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
||||
, decoder_(arch)
|
||||
, mmu_(0, arch.wsize(), true)
|
||||
, shared_mem_(4096)
|
||||
, tex_units_(NUM_TEX_UNITS, this)
|
||||
, warps_(arch.num_warps())
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, csrs_(arch.num_csrs(), 0)
|
||||
@@ -35,7 +36,8 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
||||
1, // number of banks
|
||||
1, // number of ports
|
||||
1, // request size
|
||||
true, // write-throught
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
NUM_WARPS, // mshr
|
||||
2, // pipeline latency
|
||||
@@ -49,12 +51,14 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
||||
DCACHE_NUM_BANKS, // number of banks
|
||||
DCACHE_NUM_PORTS, // number of ports
|
||||
(uint8_t)arch.num_threads(), // request size
|
||||
true, // write-throught
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
DCACHE_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
}))
|
||||
, l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
|
||||
, l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
|
||||
, dcache_switch_(arch.num_threads())
|
||||
, fetch_stage_("fetch")
|
||||
, decode_stage_("decode")
|
||||
, issue_stage_("issue")
|
||||
@@ -65,10 +69,9 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
||||
, last_schedule_wid_(0)
|
||||
, issued_instrs_(0)
|
||||
, committed_instrs_(0)
|
||||
, ecall_(false)
|
||||
, ebreak_(false)
|
||||
, stats_insts_(0)
|
||||
, stats_loads_(0)
|
||||
, stats_stores_(0)
|
||||
, MemRspPort(this)
|
||||
, MemReqPort(this)
|
||||
{
|
||||
@@ -92,6 +95,18 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
||||
this->MemRspPort.bind(&l1_mem_switch_->RspIn);
|
||||
l1_mem_switch_->ReqOut.bind(&this->MemReqPort);
|
||||
|
||||
// lsu/tex switch
|
||||
for (uint32_t i = 0, n = arch.num_threads(); i < n; ++i) {
|
||||
auto& sw = dcache_switch_.at(i);
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
sw = Switch<MemReq, MemRsp>::Create("lsu_arb", ArbiterType::Priority, 2);
|
||||
#else
|
||||
sw = Switch<MemReq, MemRsp>::Create("lsu_arb", ArbiterType::Priority, 1);
|
||||
#endif
|
||||
sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i));
|
||||
dcache_->CoreRspPorts.at(i).bind(&sw->RspIn);
|
||||
}
|
||||
|
||||
// activate warp0
|
||||
warps_.at(0)->setTmask(0, true);
|
||||
}
|
||||
@@ -147,44 +162,41 @@ void Core::warp_scheduler(uint64_t cycle) {
|
||||
auto& warp = warps_.at(scheduled_warp);
|
||||
stats_insts_ += warp->getActiveThreads();
|
||||
|
||||
pipeline_state_t state;
|
||||
state.clear();
|
||||
state.id = (issued_instrs_++ * arch_.num_cores()) + id_;
|
||||
auto trace = new pipeline_trace_t((issued_instrs_++ * arch_.num_cores()) + id_, arch_);
|
||||
|
||||
warp->eval(&state);
|
||||
warp->eval(trace);
|
||||
|
||||
DT(3, cycle, "pipeline-schedule: " << state);
|
||||
DT(3, cycle, "pipeline-schedule: " << *trace);
|
||||
|
||||
// advance to fetch stage
|
||||
fetch_stage_.push(state);
|
||||
fetch_stage_.push(trace);
|
||||
}
|
||||
|
||||
void Core::fetch(uint64_t cycle) {
|
||||
// handle icache response
|
||||
{
|
||||
MemRsp mem_rsp;
|
||||
if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){
|
||||
pipeline_state_t state;
|
||||
pending_icache_.remove(mem_rsp.tag, &state);
|
||||
auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
|
||||
state.icache_latency = latency;
|
||||
decode_stage_.push(state);
|
||||
DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state);
|
||||
}
|
||||
auto& icache_rsp_port = icache_->CoreRspPorts.at(0);
|
||||
if (!icache_rsp_port.empty()){
|
||||
auto& mem_rsp = icache_rsp_port.top();
|
||||
auto trace = pending_icache_.at(mem_rsp.tag);
|
||||
auto latency = (SimPlatform::instance().cycles() - trace->icache_latency);
|
||||
trace->icache_latency = latency;
|
||||
decode_stage_.push(trace);
|
||||
DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
|
||||
pending_icache_.release(mem_rsp.tag);
|
||||
icache_rsp_port.pop();
|
||||
}
|
||||
|
||||
// send icache request
|
||||
{
|
||||
pipeline_state_t state;
|
||||
if (fetch_stage_.try_pop(&state)) {
|
||||
state.icache_latency = SimPlatform::instance().cycles();
|
||||
MemReq mem_req;
|
||||
mem_req.addr = state.PC;
|
||||
mem_req.write = false;
|
||||
mem_req.tag = pending_icache_.allocate(state);
|
||||
icache_->CoreReqPorts.at(0).send(mem_req, 1);
|
||||
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state);
|
||||
}
|
||||
if (!fetch_stage_.empty()) {
|
||||
auto trace = fetch_stage_.top();
|
||||
trace->icache_latency = SimPlatform::instance().cycles();
|
||||
MemReq mem_req;
|
||||
mem_req.addr = trace->PC;
|
||||
mem_req.write = false;
|
||||
mem_req.tag = pending_icache_.allocate(trace);
|
||||
icache_->CoreReqPorts.at(0).send(mem_req, 1);
|
||||
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
||||
fetch_stage_.pop();
|
||||
}
|
||||
|
||||
// schedule next warp
|
||||
@@ -194,19 +206,21 @@ void Core::fetch(uint64_t cycle) {
|
||||
void Core::decode(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
pipeline_state_t state;
|
||||
if (!decode_stage_.try_pop(&state))
|
||||
return;
|
||||
if (decode_stage_.empty())
|
||||
return;
|
||||
|
||||
auto trace = decode_stage_.top();
|
||||
|
||||
// release warp
|
||||
if (!state.stall_warp) {
|
||||
stalled_warps_.reset(state.wid);
|
||||
if (!trace->fetch_stall) {
|
||||
stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
|
||||
DT(3, cycle, "pipeline-decode: " << state);
|
||||
DT(3, cycle, "pipeline-decode: " << *trace);
|
||||
|
||||
// advance to issue stage
|
||||
issue_stage_.push(state);
|
||||
issue_stage_.push(trace);
|
||||
decode_stage_.pop();
|
||||
}
|
||||
|
||||
void Core::issue(uint64_t cycle) {
|
||||
@@ -214,12 +228,13 @@ void Core::issue(uint64_t cycle) {
|
||||
|
||||
if (!issue_stage_.empty()) {
|
||||
// insert to ibuffer
|
||||
auto& state = issue_stage_.top();
|
||||
auto& ibuffer = ibuffers_.at(state.wid);
|
||||
if (ibuffer.full()) {
|
||||
DT(3, cycle, "*** ibuffer-stall: " << state);
|
||||
} else {
|
||||
ibuffer.push(state);
|
||||
auto trace = issue_stage_.top();
|
||||
auto& ibuffer = ibuffers_.at(trace->wid);
|
||||
if (!trace->check_stalled(ibuffer.full())) {
|
||||
DT(3, cycle, "*** ibuffer-stall: " << *trace);
|
||||
}
|
||||
if (!ibuffer.full()) {
|
||||
ibuffer.push(trace);
|
||||
issue_stage_.pop();
|
||||
}
|
||||
}
|
||||
@@ -229,27 +244,30 @@ void Core::issue(uint64_t cycle) {
|
||||
if (ibuffer.empty())
|
||||
continue;
|
||||
|
||||
auto& state = ibuffer.top();
|
||||
auto trace = ibuffer.top();
|
||||
|
||||
// check scoreboard
|
||||
if (scoreboard_.in_use(state)) {
|
||||
if (!trace->check_stalled(scoreboard_.in_use(trace))) {
|
||||
DTH(3, cycle, "*** scoreboard-stall: dependents={");
|
||||
auto owners = scoreboard_.owners(state);
|
||||
for (uint32_t i = 0, n = owners.size(); i < n; ++i) {
|
||||
if (i) DTN(3, ", ");
|
||||
DTN(3, "#" << owners.at(i));
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
|
||||
auto& use = uses.at(i);
|
||||
__unused(use);
|
||||
if (i) DTN(3, ", ");
|
||||
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
|
||||
}
|
||||
DTN(3, "}, " << state << std::endl);
|
||||
continue;
|
||||
DTN(3, "}, " << *trace << std::endl);
|
||||
}
|
||||
if (scoreboard_.in_use(trace))
|
||||
continue;
|
||||
|
||||
DT(3, cycle, "pipeline-issue: " << state);
|
||||
DT(3, cycle, "pipeline-issue: " << *trace);
|
||||
|
||||
// update scoreboard
|
||||
scoreboard_.reserve(state);
|
||||
scoreboard_.reserve(trace);
|
||||
|
||||
// advance to execute stage
|
||||
execute_stage_.push(state);
|
||||
execute_stage_.push(trace);
|
||||
|
||||
ibuffer.pop();
|
||||
break;
|
||||
@@ -259,11 +277,11 @@ void Core::issue(uint64_t cycle) {
|
||||
void Core::execute(uint64_t cycle) {
|
||||
// process stage inputs
|
||||
if (!execute_stage_.empty()) {
|
||||
auto& state = execute_stage_.top();
|
||||
auto& exe_unit = exe_units_.at((int)state.exe_type);
|
||||
exe_unit->push_input(state);
|
||||
auto trace = execute_stage_.top();
|
||||
auto& exe_unit = exe_units_.at((int)trace->exe_type);
|
||||
exe_unit->push(trace);
|
||||
DT(3, cycle, "pipeline-execute: " << *trace);
|
||||
execute_stage_.pop();
|
||||
DT(3, cycle, "pipeline-execute: " << state);
|
||||
}
|
||||
|
||||
// advance execute units
|
||||
@@ -273,13 +291,14 @@ void Core::execute(uint64_t cycle) {
|
||||
|
||||
// commit completed instructions
|
||||
for (auto& exe_unit : exe_units_) {
|
||||
pipeline_state_t state;
|
||||
if (exe_unit->pop_output(&state)) {
|
||||
if (state.stall_warp) {
|
||||
stalled_warps_.reset(state.wid);
|
||||
if (!exe_unit->empty()) {
|
||||
auto trace = exe_unit->top();
|
||||
if (trace->fetch_stall) {
|
||||
stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
// advance to commit stage
|
||||
commit_stage_.push(state);
|
||||
commit_stage_.push(trace);
|
||||
exe_unit->pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -287,21 +306,28 @@ void Core::execute(uint64_t cycle) {
|
||||
void Core::commit(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
pipeline_state_t state;
|
||||
if (!commit_stage_.try_pop(&state))
|
||||
if (commit_stage_.empty())
|
||||
return;
|
||||
|
||||
DT(3, cycle, "pipeline-commit: " << state);
|
||||
auto trace = commit_stage_.top();
|
||||
|
||||
DT(3, cycle, "pipeline-commit: " << *trace);
|
||||
|
||||
// update scoreboard
|
||||
scoreboard_.release(state);
|
||||
scoreboard_.release(trace);
|
||||
|
||||
assert(committed_instrs_ <= issued_instrs_);
|
||||
++committed_instrs_;
|
||||
|
||||
commit_stage_.pop();
|
||||
|
||||
// delete the trace
|
||||
delete trace;
|
||||
}
|
||||
|
||||
bool Core::running() const {
|
||||
return (committed_instrs_ != issued_instrs_);
|
||||
bool is_running = (committed_instrs_ != issued_instrs_);
|
||||
return is_running;
|
||||
}
|
||||
|
||||
Word Core::get_csr(Addr addr, int tid, int wid) {
|
||||
@@ -355,6 +381,12 @@ Word Core::get_csr(Addr addr, int tid, int wid) {
|
||||
// NumCycles
|
||||
return (Word)(SimPlatform::instance().cycles() >> 32);
|
||||
} else {
|
||||
if (addr >= CSR_TEX(0,0)
|
||||
&& addr < CSR_TEX(NUM_TEX_UNITS,0)) {
|
||||
uint32_t unit = CSR_TEX_UNIT(addr);
|
||||
uint32_t state = CSR_TEX_STATE(addr);
|
||||
return tex_units_.at(unit).get_state(state);
|
||||
}
|
||||
return csrs_.at(addr);
|
||||
}
|
||||
}
|
||||
@@ -367,6 +399,13 @@ void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) {
|
||||
} else if (addr == CSR_FCSR) {
|
||||
fcsrs_.at(wid) = value & 0xff;
|
||||
} else {
|
||||
if (addr >= CSR_TEX(0,0)
|
||||
&& addr < CSR_TEX(NUM_TEX_UNITS,0)) {
|
||||
uint32_t unit = CSR_TEX_UNIT(addr);
|
||||
uint32_t state = CSR_TEX_STATE(addr);
|
||||
tex_units_.at(unit).set_state(state, value);
|
||||
return;
|
||||
}
|
||||
csrs_.at(addr) = value;
|
||||
}
|
||||
}
|
||||
@@ -390,29 +429,27 @@ Word Core::icache_read(Addr addr, Size size) {
|
||||
return data;
|
||||
}
|
||||
|
||||
Word Core::dcache_read(Addr addr, Size size) {
|
||||
++stats_loads_;
|
||||
Word Core::dcache_read(Addr addr, Size size) {
|
||||
Word data = 0;
|
||||
#ifdef SM_ENABLE
|
||||
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
|
||||
&& ((addr + 3) < SMEM_BASE_ADDR)) {
|
||||
shared_mem_.read(&data, addr & (SMEM_SIZE-1), size);
|
||||
return data;
|
||||
if (SM_ENABLE) {
|
||||
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
|
||||
&& ((addr + 3) < SMEM_BASE_ADDR)) {
|
||||
shared_mem_.read(&data, addr & (SMEM_SIZE-1), size);
|
||||
return data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
mmu_.read(&data, addr, size, 0);
|
||||
return data;
|
||||
}
|
||||
|
||||
void Core::dcache_write(Addr addr, Word data, Size size) {
|
||||
++stats_stores_;
|
||||
#ifdef SM_ENABLE
|
||||
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
|
||||
&& ((addr + 3) < SMEM_BASE_ADDR)) {
|
||||
shared_mem_.write(&data, addr & (SMEM_SIZE-1), size);
|
||||
return;
|
||||
void Core::dcache_write(Addr addr, Word data, Size size) {
|
||||
if (SM_ENABLE) {
|
||||
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
|
||||
&& ((addr + 3) < SMEM_BASE_ADDR)) {
|
||||
shared_mem_.write(&data, addr & (SMEM_SIZE-1), size);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (addr >= IO_COUT_ADDR
|
||||
&& addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {
|
||||
this->writeToStdOut(addr, data);
|
||||
@@ -421,11 +458,8 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
|
||||
mmu_.write(&data, addr, size, 0);
|
||||
}
|
||||
|
||||
void Core::printStats() const {
|
||||
std::cout << "Cycles: " << SimPlatform::instance().cycles() << std::endl
|
||||
<< "Insts : " << stats_insts_ << std::endl
|
||||
<< "Loads : " << stats_loads_ << std::endl
|
||||
<< "Stores: " << stats_stores_ << std::endl;
|
||||
// Reads from texture unit `unit` by delegating to that unit's read(), passing
// the arguments on in the order (u, v, lod, mem_addrs). The unit index is
// bounds-checked by tex_units_.at().
// NOTE(review): the declaration in the class header names these parameters
// (unit, lod, u, v, mem_addrs), whereas this definition names them
// (unit, u, v, lod, mem_addrs). All three are Word, so a caller following the
// header's order compiles cleanly but would swap the values -- confirm which
// order is intended and make the two agree.
Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<uint64_t>* mem_addrs) {
  auto& tex_unit = tex_units_.at(unit);
  return tex_unit.read(u, v, lod, mem_addrs);
}
|
||||
|
||||
void Core::writeToStdOut(Addr addr, Word data) {
|
||||
@@ -439,10 +473,14 @@ void Core::writeToStdOut(Addr addr, Word data) {
|
||||
}
|
||||
}
|
||||
|
||||
void Core::trigger_ecall() {
|
||||
ecall_ = true;
|
||||
}
|
||||
|
||||
void Core::trigger_ebreak() {
|
||||
ebreak_ = true;
|
||||
}
|
||||
|
||||
bool Core::check_ebreak() const {
|
||||
return ebreak_;
|
||||
bool Core::check_exit() const {
|
||||
return ebreak_ || ecall_;
|
||||
}
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "ibuffer.h"
|
||||
#include "scoreboard.h"
|
||||
#include "exeunit.h"
|
||||
#include "tex_unit.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
@@ -34,8 +35,6 @@ public:
|
||||
|
||||
void step(uint64_t cycle);
|
||||
|
||||
void printStats() const;
|
||||
|
||||
Word id() const {
|
||||
return id_;
|
||||
}
|
||||
@@ -72,9 +71,13 @@ public:
|
||||
|
||||
void dcache_write(Addr, Word, Size);
|
||||
|
||||
Word tex_read(uint32_t unit, Word lod, Word u, Word v, std::vector<uint64_t>* mem_addrs);
|
||||
|
||||
void trigger_ecall();
|
||||
|
||||
void trigger_ebreak();
|
||||
|
||||
bool check_ebreak() const;
|
||||
bool check_exit() const;
|
||||
|
||||
private:
|
||||
|
||||
@@ -92,10 +95,8 @@ private:
|
||||
const ArchDef arch_;
|
||||
const Decoder decoder_;
|
||||
MemoryUnit mmu_;
|
||||
|
||||
#ifdef SM_ENABLE
|
||||
RAM shared_mem_;
|
||||
#endif
|
||||
std::vector<TexUnit> tex_units_;
|
||||
|
||||
std::vector<std::shared_ptr<Warp>> warps_;
|
||||
std::vector<WarpMask> barriers_;
|
||||
@@ -107,6 +108,7 @@ private:
|
||||
Cache::Ptr icache_;
|
||||
Cache::Ptr dcache_;
|
||||
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
|
||||
std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
|
||||
|
||||
PipelineStage fetch_stage_;
|
||||
PipelineStage decode_stage_;
|
||||
@@ -114,20 +116,20 @@ private:
|
||||
PipelineStage execute_stage_;
|
||||
PipelineStage commit_stage_;
|
||||
|
||||
HashTable<pipeline_state_t> pending_icache_;
|
||||
HashTable<pipeline_trace_t*> pending_icache_;
|
||||
WarpMask stalled_warps_;
|
||||
uint32_t last_schedule_wid_;
|
||||
uint32_t issued_instrs_;
|
||||
uint32_t committed_instrs_;
|
||||
bool ecall_;
|
||||
bool ebreak_;
|
||||
|
||||
std::unordered_map<int, std::stringstream> print_bufs_;
|
||||
|
||||
uint64_t stats_insts_;
|
||||
uint64_t stats_loads_;
|
||||
uint64_t stats_stores_;
|
||||
|
||||
friend class LsuUnit;
|
||||
friend class GpuUnit;
|
||||
|
||||
public:
|
||||
SlavePort<MemRsp> MemRspPort;
|
||||
|
||||
@@ -41,14 +41,18 @@ static const std::unordered_map<int, struct InstTableEntry_t> sc_instTable = {
|
||||
{Opcode::FMNMSUB, {false, InstType::R4_TYPE}},
|
||||
{Opcode::VSET, {false, InstType::V_TYPE}},
|
||||
{Opcode::GPGPU, {false, InstType::R_TYPE}},
|
||||
{Opcode::GPU, {false, InstType::R4_TYPE}},
|
||||
};
|
||||
|
||||
static const char* op_string(const Instr &instr) {
|
||||
Word func3 = instr.getFunc3();
|
||||
Word func7 = instr.getFunc7();
|
||||
Word rs2 = instr.getRSrc(1);
|
||||
Word imm = instr.getImm();
|
||||
switch (instr.getOpcode()) {
|
||||
static const char* op_string(const Instr &instr) {
|
||||
auto opcode = instr.getOpcode();
|
||||
Word func2 = instr.getFunc2();
|
||||
Word func3 = instr.getFunc3();
|
||||
Word func7 = instr.getFunc7();
|
||||
Word rs2 = instr.getRSrc(1);
|
||||
Word imm = instr.getImm();
|
||||
|
||||
switch (opcode) {
|
||||
case Opcode::NOP: return "NOP";
|
||||
case Opcode::LUI_INST: return "LUI";
|
||||
case Opcode::AUIPC_INST: return "AUIPC";
|
||||
@@ -120,7 +124,16 @@ static const char* op_string(const Instr &instr) {
|
||||
}
|
||||
case Opcode::SYS_INST:
|
||||
switch (func3) {
|
||||
case 0: return imm ? "EBREAK" : "ECALL";
|
||||
case 0:
|
||||
switch (imm) {
|
||||
case 0x000: return "ECALL";
|
||||
case 0x001: return "EBREAK";
|
||||
case 0x002: return "URET";
|
||||
case 0x102: return "SRET";
|
||||
case 0x302: return "MRET";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case 1: return "CSRRW";
|
||||
case 2: return "CSRRS";
|
||||
case 3: return "CSRRC";
|
||||
@@ -181,29 +194,43 @@ static const char* op_string(const Instr &instr) {
|
||||
// Mnemonic table for the warp-control sub-operations; the enclosing case label and
// switch expression lie above this hunk (presumably Opcode::GPGPU switched on func3 — confirm).
case 1: return "WSPAWN";
|
||||
case 2: return "SPLIT";
|
||||
case 3: return "JOIN";
|
||||
case 4: return "BAR";
|
||||
// NOTE(review): Warp::execute in this same diff lists PREFETCH under both `case 6` and
// `case 5`; confirm which selector value is current and keep this table in sync with it.
case 6: return "PREFETCH";
|
||||
case 4: return "BAR";
|
||||
default:
|
||||
// Any selector without a mnemonic terminates the process.
std::abort();
|
||||
}
|
||||
// Opcode::GPU mnemonics: func3 picks the operation, and for func3 == 1 the
// func2 field picks the variant. Every unlisted encoding aborts.
case Opcode::GPU:
|
||||
switch (func3) {
|
||||
case 0: return "TEX";
|
||||
case 1: {
|
||||
// Second-level selection on func2; only variant 0 (CMOV) has a mnemonic.
switch (func2) {
|
||||
case 0: return "CMOV";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace vortex {
|
||||
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
|
||||
os << op_string(instr) << ": ";
|
||||
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
|
||||
auto opcode = instr.getOpcode();
|
||||
Word func2 = instr.getFunc2();
|
||||
Word func3 = instr.getFunc3();
|
||||
|
||||
os << op_string(instr) << ": ";
|
||||
|
||||
if (opcode == S_INST
|
||||
|| opcode == FS
|
||||
|| opcode == VS) {
|
||||
|| opcode == FS) {
|
||||
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
|
||||
os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
|
||||
} else
|
||||
if (opcode == L_INST
|
||||
|| opcode == FL
|
||||
|| opcode == VL) {
|
||||
|| opcode == FL) {
|
||||
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
|
||||
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
|
||||
} else {
|
||||
@@ -219,8 +246,10 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
|
||||
if (i) os << ", ";
|
||||
os << "imm=0x" << std::hex << instr.getImm();
|
||||
}
|
||||
}
|
||||
|
||||
// GPU opcode with func3 == 0 is the encoding op_string names "TEX"; for it the
// func2 field is appended to the operand list as a decimal "unit=" value.
if (opcode == GPU && func3 == 0) {
|
||||
os << ", unit=" << std::dec << func2;
|
||||
}
|
||||
}
|
||||
return os;
|
||||
}
|
||||
}
|
||||
@@ -239,6 +268,7 @@ Decoder::Decoder(const ArchDef &arch) {
|
||||
shift_func3_ = shift_rd_ + reg_s_;
|
||||
shift_rs1_ = shift_func3_ + func3_s_;
|
||||
shift_rs2_ = shift_rs1_ + reg_s_;
|
||||
shift_func2_ = shift_rs2_ + reg_s_;
|
||||
shift_func7_ = shift_rs2_ + reg_s_;
|
||||
shift_rs3_ = shift_func7_ + func2_s_;
|
||||
shift_vmop_ = shift_func7_ + vmask_s_;
|
||||
@@ -247,7 +277,7 @@ Decoder::Decoder(const ArchDef &arch) {
|
||||
shift_vset_ = shift_func7_ + 6;
|
||||
|
||||
reg_mask_ = 0x1f;
|
||||
func2_mask_ = 0x2;
|
||||
func2_mask_ = 0x3;
|
||||
func3_mask_ = 0x7;
|
||||
func6_mask_ = 0x3f;
|
||||
func7_mask_ = 0x7f;
|
||||
@@ -265,6 +295,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
|
||||
Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_);
|
||||
instr->setOpcode(op);
|
||||
|
||||
Word func2 = (code >> shift_func2_) & func2_mask_;
|
||||
Word func3 = (code >> shift_func3_) & func3_mask_;
|
||||
Word func6 = (code >> shift_func6_) & func6_mask_;
|
||||
Word func7 = (code >> shift_func7_) & func7_mask_;
|
||||
@@ -403,7 +434,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
|
||||
}
|
||||
} break;
|
||||
|
||||
case Opcode::VL:
|
||||
case Opcode::FL:
|
||||
instr->setDestVReg(rd);
|
||||
instr->setSrcVReg(rs1);
|
||||
instr->setVlsWidth(func3);
|
||||
@@ -413,7 +444,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
|
||||
instr->setVnf((code >> shift_vnf_) & func3_mask_);
|
||||
break;
|
||||
|
||||
case Opcode::VS:
|
||||
case Opcode::FS:
|
||||
instr->setVs3(rd);
|
||||
instr->setSrcVReg(rs1);
|
||||
instr->setVlsWidth(func3);
|
||||
@@ -428,10 +459,18 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
|
||||
}
|
||||
break;
|
||||
case R4_TYPE:
|
||||
instr->setDestFReg(rd);
|
||||
instr->setSrcFReg(rs1);
|
||||
instr->setSrcFReg(rs2);
|
||||
instr->setSrcFReg(rs3);
|
||||
// R4-type operand registration. Opcode::GPU goes through the plain
// setDestReg/setSrcReg setters, every other R4 opcode through the F-register setters
// (presumably integer vs. floating-point register files — confirm against Instr).
if (op == Opcode::GPU) {
|
||||
instr->setDestReg(rd);
|
||||
instr->setSrcReg(rs1);
|
||||
instr->setSrcReg(rs2);
|
||||
instr->setSrcReg(rs3);
|
||||
} else {
|
||||
instr->setDestFReg(rd);
|
||||
instr->setSrcFReg(rs1);
|
||||
instr->setSrcFReg(rs2);
|
||||
instr->setSrcFReg(rs3);
|
||||
}
|
||||
// Both selector fields are stored: op_string and Warp::execute switch on func3 and
// then func2 when handling Opcode::GPU.
instr->setFunc2(func2);
|
||||
instr->setFunc3(func3);
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -49,11 +49,12 @@ inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid)
|
||||
}
|
||||
}
|
||||
|
||||
void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
||||
assert(tmask_.any());
|
||||
|
||||
Word nextPC = PC_ + core_->arch().wsize();
|
||||
|
||||
Word func2 = instr.getFunc2();
|
||||
Word func3 = instr.getFunc3();
|
||||
Word func6 = instr.getFunc6();
|
||||
Word func7 = instr.getFunc7();
|
||||
@@ -117,8 +118,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
case NOP:
|
||||
break;
|
||||
case LUI_INST:
|
||||
pipeline_state->exe_type = ExeType::ALU;
|
||||
pipeline_state->alu.type = AluType::ARITH;
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu.type = AluType::ARITH;
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
@@ -127,8 +128,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
rd_write = true;
|
||||
break;
|
||||
case AUIPC_INST:
|
||||
pipeline_state->exe_type = ExeType::ALU;
|
||||
pipeline_state->alu.type = AluType::ARITH;
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu.type = AluType::ARITH;
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
@@ -137,10 +138,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
rd_write = true;
|
||||
break;
|
||||
case R_INST:
|
||||
pipeline_state->exe_type = ExeType::ALU;
|
||||
pipeline_state->alu.type = AluType::ARITH;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu.type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
@@ -149,7 +150,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
case 0:
|
||||
// MUL
|
||||
rddata[t] = ((WordI)rsdata[t][0]) * ((WordI)rsdata[t][1]);
|
||||
pipeline_state->alu.type = AluType::IMUL;
|
||||
trace->alu.type = AluType::IMUL;
|
||||
break;
|
||||
case 1: {
|
||||
// MULH
|
||||
@@ -163,7 +164,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
uint64_t result = first * second;
|
||||
rddata[t] = (result >> 32) & 0xFFFFFFFF;
|
||||
pipeline_state->alu.type = AluType::IMUL;
|
||||
trace->alu.type = AluType::IMUL;
|
||||
} break;
|
||||
case 2: {
|
||||
// MULHSU
|
||||
@@ -173,14 +174,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
int64_t second = (int64_t)rsdata[t][1];
|
||||
rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF;
|
||||
pipeline_state->alu.type = AluType::IMUL;
|
||||
trace->alu.type = AluType::IMUL;
|
||||
} break;
|
||||
case 3: {
|
||||
// MULHU
|
||||
uint64_t first = (uint64_t)rsdata[t][0];
|
||||
uint64_t second = (uint64_t)rsdata[t][1];
|
||||
rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF;
|
||||
pipeline_state->alu.type = AluType::IMUL;
|
||||
trace->alu.type = AluType::IMUL;
|
||||
} break;
|
||||
case 4: {
|
||||
// DIV
|
||||
@@ -193,7 +194,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
} else {
|
||||
rddata[t] = dividen / divisor;
|
||||
}
|
||||
pipeline_state->alu.type = AluType::IDIV;
|
||||
trace->alu.type = AluType::IDIV;
|
||||
} break;
|
||||
case 5: {
|
||||
// DIVU
|
||||
@@ -204,7 +205,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
} else {
|
||||
rddata[t] = dividen / divisor;
|
||||
}
|
||||
pipeline_state->alu.type = AluType::IDIV;
|
||||
trace->alu.type = AluType::IDIV;
|
||||
} break;
|
||||
case 6: {
|
||||
// REM
|
||||
@@ -217,7 +218,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
} else {
|
||||
rddata[t] = dividen % divisor;
|
||||
}
|
||||
pipeline_state->alu.type = AluType::IDIV;
|
||||
trace->alu.type = AluType::IDIV;
|
||||
} break;
|
||||
case 7: {
|
||||
// REMU
|
||||
@@ -228,7 +229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
} else {
|
||||
rddata[t] = dividen % divisor;
|
||||
}
|
||||
pipeline_state->alu.type = AluType::IDIV;
|
||||
trace->alu.type = AluType::IDIV;
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
@@ -285,9 +286,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
rd_write = true;
|
||||
break;
|
||||
case I_INST:
|
||||
pipeline_state->exe_type = ExeType::ALU;
|
||||
pipeline_state->alu.type = AluType::ARITH;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu.type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
@@ -336,10 +337,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
rd_write = true;
|
||||
break;
|
||||
case B_INST:
|
||||
pipeline_state->exe_type = ExeType::ALU;
|
||||
pipeline_state->alu.type = AluType::BRANCH;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu.type = AluType::BRANCH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
@@ -385,107 +386,149 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
break; // runonce
|
||||
}
|
||||
pipeline_state->stall_warp = true;
|
||||
trace->fetch_stall = true;
|
||||
break;
|
||||
case JAL_INST:
|
||||
pipeline_state->exe_type = ExeType::ALU;
|
||||
pipeline_state->alu.type = AluType::BRANCH;
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu.type = AluType::BRANCH;
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
rddata[t] = nextPC;
|
||||
nextPC = PC_ + immsrc;
|
||||
pipeline_state->stall_warp = true;
|
||||
trace->fetch_stall = true;
|
||||
break; // runonce
|
||||
}
|
||||
rd_write = true;
|
||||
break;
|
||||
case JALR_INST:
|
||||
pipeline_state->exe_type = ExeType::ALU;
|
||||
pipeline_state->alu.type = AluType::BRANCH;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu.type = AluType::BRANCH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
rddata[t] = nextPC;
|
||||
nextPC = rsdata[t][0] + immsrc;
|
||||
pipeline_state->stall_warp = true;
|
||||
trace->fetch_stall = true;
|
||||
break; // runOnce
|
||||
}
|
||||
rd_write = true;
|
||||
break;
|
||||
case L_INST:
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.type = LsuType::LOAD;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->mem_addrs.resize(num_threads);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
|
||||
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
|
||||
Word data_read = core_->dcache_read(memAddr, 4);
|
||||
pipeline_state->mem_addrs.at(t) = memAddr;
|
||||
DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
|
||||
switch (func3) {
|
||||
case 0:
|
||||
// LBI
|
||||
rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8);
|
||||
break;
|
||||
case 1:
|
||||
// LHI
|
||||
rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16);
|
||||
break;
|
||||
case 2:
|
||||
// LW
|
||||
rddata[t] = data_read;
|
||||
break;
|
||||
case 4:
|
||||
// LBU
|
||||
rddata[t] = Word((data_read >> shift_by) & 0xFF);
|
||||
break;
|
||||
case 5:
|
||||
// LHU
|
||||
rddata[t] = Word((data_read >> shift_by) & 0xFFFF);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
case FL:
|
||||
trace->exe_type = ExeType::LSU;
|
||||
trace->lsu.type = LsuType::LOAD;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
if (opcode == L_INST
|
||||
|| (opcode == FL && func3 == 2)) {
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
|
||||
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
|
||||
Word data_read = core_->dcache_read(memAddr, 4);
|
||||
trace->mem_addrs.at(t).push_back(memAddr);
|
||||
DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
|
||||
switch (func3) {
|
||||
case 0:
|
||||
// LBI
|
||||
rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8);
|
||||
break;
|
||||
case 1:
|
||||
// LHI
|
||||
rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16);
|
||||
break;
|
||||
case 2:
|
||||
// LW
|
||||
rddata[t] = data_read;
|
||||
break;
|
||||
case 4:
|
||||
// LBU
|
||||
rddata[t] = Word((data_read >> shift_by) & 0xFF);
|
||||
break;
|
||||
case 5:
|
||||
// LHU
|
||||
rddata[t] = Word((data_read >> shift_by) & 0xFFFF);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
rd_write = true;
|
||||
break;
|
||||
case S_INST:
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.type = LsuType::STORE;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
pipeline_state->mem_addrs.resize(num_threads);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
Word memAddr = rsdata[t][0] + immsrc;
|
||||
pipeline_state->mem_addrs.at(t) = memAddr;
|
||||
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
switch (func3) {
|
||||
case 0:
|
||||
// SB
|
||||
core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1);
|
||||
break;
|
||||
case 1:
|
||||
// SH
|
||||
core_->dcache_write(memAddr, rsdata[t][1], 2);
|
||||
break;
|
||||
case 2:
|
||||
// SW
|
||||
core_->dcache_write(memAddr, rsdata[t][1], 4);
|
||||
break;
|
||||
} else {
|
||||
DP(4, "Executing vector load");
|
||||
DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
|
||||
DP(4, "dest: v" << rdest);
|
||||
DP(4, "width" << instr.getVlsWidth());
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
switch (instr.getVlsWidth()) {
|
||||
case 6: {
|
||||
// load word and unit strided (not checking for unit stride)
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
|
||||
DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
Word data_read = core_->dcache_read(memAddr, 4);
|
||||
DP(4, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
|
||||
int *result_ptr = (int *)(vd.data() + i);
|
||||
*result_ptr = data_read;
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
rd_write = true;
|
||||
break;
|
||||
case S_INST:
|
||||
case FS:
|
||||
trace->exe_type = ExeType::LSU;
|
||||
trace->lsu.type = LsuType::STORE;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
if (opcode == S_INST
|
||||
|| (opcode == FS && func3 == 2)) {
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
Word memAddr = rsdata[t][0] + immsrc;
|
||||
trace->mem_addrs.at(t).push_back(memAddr);
|
||||
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
switch (func3) {
|
||||
case 0:
|
||||
// SB
|
||||
core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1);
|
||||
break;
|
||||
case 1:
|
||||
// SH
|
||||
core_->dcache_write(memAddr, rsdata[t][1], 2);
|
||||
break;
|
||||
case 2:
|
||||
// SW
|
||||
core_->dcache_write(memAddr, rsdata[t][1], 4);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
|
||||
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
switch (instr.getVlsWidth()) {
|
||||
case 6: {
|
||||
// store word and unit strided (not checking for unit stride)
|
||||
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
|
||||
core_->dcache_write(memAddr, value, 4);
|
||||
DP(4, "store: " << memAddr << " value:" << value);
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case SYS_INST:
|
||||
pipeline_state->exe_type = ExeType::CSR;
|
||||
trace->exe_type = ExeType::CSR;
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
@@ -493,30 +536,40 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
Word csr_value = core_->get_csr(csr_addr, t, id_);
|
||||
switch (func3) {
|
||||
case 0:
|
||||
if (csr_addr < 2) {
|
||||
// ECALL/EBREAK
|
||||
switch (csr_addr) {
|
||||
case 0: // ECALL
|
||||
core_->trigger_ecall();
|
||||
break;
|
||||
case 1: // EBREAK
|
||||
core_->trigger_ebreak();
|
||||
}
|
||||
break;
|
||||
case 0x002: // URET
|
||||
case 0x102: // SRET
|
||||
case 0x302: // MRET
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
// CSRRW
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, rsdata[t][0], t, id_);
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 2:
|
||||
// CSRRS
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_);
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 3:
|
||||
// CSRRC
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_);
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 5:
|
||||
@@ -543,88 +596,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
break;
|
||||
case FENCE:
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.type = LsuType::FENCE;
|
||||
pipeline_state->stall_warp = true;
|
||||
break;
|
||||
case (FL | VL):
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.type = LsuType::LOAD;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
if (func3 == 0x2) {
|
||||
pipeline_state->mem_addrs.resize(num_threads);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
Word memAddr = rsdata[t][0] + immsrc;
|
||||
pipeline_state->mem_addrs.at(t) = memAddr;
|
||||
Word data_read = core_->dcache_read(memAddr, 4);
|
||||
DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
|
||||
rddata[t] = data_read;
|
||||
}
|
||||
} else {
|
||||
DP(3, "Executing vector load");
|
||||
DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
|
||||
DP(3, "dest: v" << rdest);
|
||||
DP(3, "width" << instr.getVlsWidth());
|
||||
pipeline_state->mem_addrs.resize(vl_);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
switch (instr.getVlsWidth()) {
|
||||
case 6: {
|
||||
// load word and unit strided (not checking for unit stride)
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
|
||||
pipeline_state->mem_addrs.at(i) = memAddr;
|
||||
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
Word data_read = core_->dcache_read(memAddr, 4);
|
||||
DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
|
||||
int *result_ptr = (int *)(vd.data() + i);
|
||||
*result_ptr = data_read;
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
break;
|
||||
}
|
||||
rd_write = true;
|
||||
break;
|
||||
case (FS | VS):
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.type = LsuType::STORE;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
if (func3 == 0x2) {
|
||||
pipeline_state->mem_addrs.resize(num_threads);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
Word memAddr = rsdata[t][0] + immsrc;
|
||||
pipeline_state->mem_addrs.at(t) = memAddr;
|
||||
core_->dcache_write(memAddr, rsdata[t][1], 4);
|
||||
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
}
|
||||
} else {
|
||||
pipeline_state->mem_addrs.resize(vl_);
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
|
||||
pipeline_state->mem_addrs.at(i) = memAddr;
|
||||
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
switch (instr.getVlsWidth()) {
|
||||
case 6: {
|
||||
//store word and unit strided (not checking for unit stride)
|
||||
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
|
||||
core_->dcache_write(memAddr, value, 4);
|
||||
DP(3, "store: " << memAddr << " value:" << value);
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
trace->exe_type = ExeType::LSU;
|
||||
trace->lsu.type = LsuType::FENCE;
|
||||
trace->fetch_stall = true;
|
||||
break;
|
||||
case FCI:
|
||||
pipeline_state->exe_type = ExeType::FPU;
|
||||
trace->exe_type = ExeType::FPU;
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
@@ -633,32 +610,32 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
switch (func7) {
|
||||
case 0x00: //FADD
|
||||
rddata[t] = rv_fadd(rsdata[t][0], rsdata[t][1], frm, &fflags);
|
||||
pipeline_state->fpu.type = FpuType::FMA;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
pipeline_state->used_fregs[rsrc1] = 1;
|
||||
trace->fpu.type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
break;
|
||||
case 0x04: //FSUB
|
||||
rddata[t] = rv_fsub(rsdata[t][0], rsdata[t][1], frm, &fflags);
|
||||
pipeline_state->fpu.type = FpuType::FMA;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
pipeline_state->used_fregs[rsrc1] = 1;
|
||||
trace->fpu.type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
break;
|
||||
case 0x08: //FMUL
|
||||
rddata[t] = rv_fmul(rsdata[t][0], rsdata[t][1], frm, &fflags);
|
||||
pipeline_state->fpu.type = FpuType::FMA;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
pipeline_state->used_fregs[rsrc1] = 1;
|
||||
trace->fpu.type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
break;
|
||||
case 0x0c: //FDIV
|
||||
rddata[t] = rv_fdiv(rsdata[t][0], rsdata[t][1], frm, &fflags);
|
||||
pipeline_state->fpu.type = FpuType::FDIV;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
pipeline_state->used_fregs[rsrc1] = 1;
|
||||
trace->fpu.type = FpuType::FDIV;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
break;
|
||||
case 0x2c: //FSQRT
|
||||
rddata[t] = rv_fsqrt(rsdata[t][0], frm, &fflags);
|
||||
pipeline_state->fpu.type = FpuType::FSQRT;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
trace->fpu.type = FpuType::FSQRT;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
break;
|
||||
case 0x10:
|
||||
switch (func3) {
|
||||
@@ -672,9 +649,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
rddata[t] = rv_fsgnjx(rsdata[t][0], rsdata[t][1]);
|
||||
break;
|
||||
}
|
||||
pipeline_state->fpu.type = FpuType::FNCP;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
pipeline_state->used_fregs[rsrc1] = 1;
|
||||
trace->fpu.type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
break;
|
||||
case 0x14:
|
||||
if (func3) {
|
||||
@@ -684,9 +661,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
// FMIN.S
|
||||
rddata[t] = rv_fmin(rsdata[t][0], rsdata[t][1], &fflags);
|
||||
}
|
||||
pipeline_state->fpu.type = FpuType::FNCP;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
pipeline_state->used_fregs[rsrc1] = 1;
|
||||
trace->fpu.type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
break;
|
||||
case 0x60:
|
||||
if (rsrc1 == 0) {
|
||||
@@ -696,8 +673,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
// FCVT.WU.S
|
||||
rddata[t] = rv_ftou(rsdata[t][0], frm, &fflags);
|
||||
}
|
||||
pipeline_state->fpu.type = FpuType::FCVT;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
trace->fpu.type = FpuType::FCVT;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
break;
|
||||
case 0x70:
|
||||
if (func3) {
|
||||
@@ -707,8 +684,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
// FMV.X.W
|
||||
rddata[t] = rsdata[t][0];
|
||||
}
|
||||
pipeline_state->fpu.type = FpuType::FNCP;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
trace->fpu.type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
break;
|
||||
case 0x50:
|
||||
switch(func3) {
|
||||
@@ -725,9 +702,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
rddata[t] = rv_feq(rsdata[t][0], rsdata[t][1], &fflags);
|
||||
break;
|
||||
}
|
||||
pipeline_state->fpu.type = FpuType::FNCP;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
pipeline_state->used_fregs[rsrc1] = 1;
|
||||
trace->fpu.type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
break;
|
||||
case 0x68:
|
||||
if (rsrc1) {
|
||||
@@ -737,14 +714,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
// FCVT.S.W:
|
||||
rddata[t] = rv_itof(rsdata[t][0], frm, &fflags);
|
||||
}
|
||||
pipeline_state->fpu.type = FpuType::FCVT;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
trace->fpu.type = FpuType::FCVT;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
break;
|
||||
case 0x78:
|
||||
// FMV.W.X
|
||||
rddata[t] = rsdata[t][0];
|
||||
pipeline_state->fpu.type = FpuType::FNCP;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
trace->fpu.type = FpuType::FNCP;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
break;
|
||||
}
|
||||
update_fcrs(fflags, core_, t, id_);
|
||||
@@ -755,10 +732,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
case FMSUB:
|
||||
case FMNMADD:
|
||||
case FMNMSUB:
|
||||
pipeline_state->fpu.type = FpuType::FMA;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
pipeline_state->used_fregs[rsrc1] = 1;
|
||||
pipeline_state->used_fregs[rsrc2] = 1;
|
||||
trace->fpu.type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->used_fregs.set(rsrc2);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
@@ -784,8 +761,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
rd_write = true;
|
||||
break;
|
||||
case GPGPU: {
|
||||
pipeline_state->exe_type = ExeType::GPU;
|
||||
case GPGPU: {
|
||||
int ts = 0;
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (tmask_.test(t)) {
|
||||
@@ -795,10 +771,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
switch (func3) {
|
||||
case 0: {
|
||||
// TMC
|
||||
pipeline_state->gpu.type = GpuType::TMC;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
// TMC
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu.type = GpuType::TMC;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->fetch_stall = true;
|
||||
if (rsrc1) {
|
||||
// predicate mode
|
||||
ThreadMask pred;
|
||||
@@ -823,10 +800,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
} break;
|
||||
case 1: {
|
||||
// WSPAWN
|
||||
pipeline_state->gpu.type = GpuType::WSPAWN;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu.type = GpuType::WSPAWN;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps());
|
||||
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]);
|
||||
for (int i = 1; i < active_warps; ++i) {
|
||||
@@ -837,9 +815,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
} break;
|
||||
case 2: {
|
||||
// SPLIT
|
||||
pipeline_state->gpu.type = GpuType::SPLIT;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu.type = GpuType::SPLIT;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->fetch_stall = true;
|
||||
if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {
|
||||
ThreadMask tmask;
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
@@ -868,8 +847,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
} break;
|
||||
case 3: {
|
||||
// JOIN
|
||||
pipeline_state->gpu.type = GpuType::JOIN;
|
||||
pipeline_state->stall_warp = true;
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu.type = GpuType::JOIN;
|
||||
trace->fetch_stall = true;
|
||||
if (!domStack_.empty() && domStack_.top().unanimous) {
|
||||
DP(3, "*** Uninimous branch at join");
|
||||
tmask_ = domStack_.top().tmask;
|
||||
@@ -893,18 +873,19 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
} break;
|
||||
case 4: {
|
||||
// BAR
|
||||
pipeline_state->gpu.type = GpuType::BAR;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu.type = GpuType::BAR;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
active_ = false;
|
||||
core_->barrier(rsdata[ts][0], rsdata[ts][1], id_);
|
||||
} break;
|
||||
case 6: {
|
||||
case 5: {
|
||||
// PREFETCH
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.type = LsuType::PREFETCH;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
trace->exe_type = ExeType::LSU;
|
||||
trace->lsu.type = LsuType::PREFETCH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
@@ -915,7 +896,50 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
} break;
|
||||
} break;
|
||||
// Opcode GPU: func3 selects the operation (0 = TEX, 1 = func2-selected group).
// Unrecognised func3/func2 values abort.
case GPU: {
|
||||
switch (func3) {
|
||||
case 0: { // TEX
|
||||
// Traced as a GPU-unit operation of type TEX that reads all three source registers.
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu.type = GpuType::TEX;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->used_iregs.set(rsrc2);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
// Threads masked off in tmask_ are skipped.
if (!tmask_.test(t))
|
||||
continue;
|
||||
// func2 is passed as the unit argument; the three source operands are passed as u, v and lod.
auto unit = func2;
|
||||
auto u = rsdata[t][0];
|
||||
auto v = rsdata[t][1];
|
||||
auto lod = rsdata[t][2];
|
||||
// tex_read also receives this thread's mem_addrs entry by pointer
// (presumably so it can record the addresses it touches — confirm in Core::tex_read).
auto color = core_->tex_read(unit, u, v, lod, &trace->mem_addrs.at(t));
|
||||
rddata[t] = color;
|
||||
}
|
||||
rd_write = true;
|
||||
} break;
|
||||
case 1:
|
||||
switch (func2) {
|
||||
case 0: { // CMOV
|
||||
// Traced as an ALU operation of type CMOV that reads all three source registers.
trace->exe_type = ExeType::ALU;
|
||||
trace->alu.type = AluType::CMOV;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->used_iregs.set(rsrc2);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
// Conditional move: a non-zero first operand selects the second operand, otherwise the third.
rddata[t] = rsdata[t][0] ? rsdata[t][1] : rsdata[t][2];
|
||||
}
|
||||
rd_write = true;
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
} break;
|
||||
case VSET: {
|
||||
int VLEN = core_->arch().vsize() * 8;
|
||||
int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
|
||||
@@ -966,7 +990,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 24: {
|
||||
//vmseq
|
||||
// vmseq
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -997,7 +1021,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 25: {
|
||||
//vmsne
|
||||
// vmsne
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1028,7 +1052,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 26: {
|
||||
//vmsltu
|
||||
// vmsltu
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1059,7 +1083,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 27: {
|
||||
//vmslt
|
||||
// vmslt
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1090,7 +1114,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 28: {
|
||||
//vmsleu
|
||||
// vmsleu
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1121,7 +1145,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 29: {
|
||||
//vmsle
|
||||
// vmsle
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1152,7 +1176,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 30: {
|
||||
//vmsgtu
|
||||
// vmsgtu
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1183,7 +1207,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 31: {
|
||||
//vmsgt
|
||||
// vmsgt
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1356,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 27: {
|
||||
//vmxor
|
||||
// vmxor
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1402,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 28: {
|
||||
//vmornot
|
||||
// vmornot
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1448,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 29: {
|
||||
//vmnand
|
||||
// vmnand
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1494,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 30: {
|
||||
//vmnor
|
||||
// vmnor
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1540,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 31: {
|
||||
//vmxnor
|
||||
// vmxnor
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1586,7 +1610,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
} break;
|
||||
case 37: {
|
||||
//vmul
|
||||
// vmul
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
@@ -1769,7 +1793,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
}
|
||||
|
||||
if (rd_write) {
|
||||
pipeline_state->wb = true;
|
||||
trace->wb = true;
|
||||
DPH(2, "Dest Reg: ");
|
||||
auto rdt = instr.getRDType();
|
||||
switch (rdt) {
|
||||
@@ -1786,7 +1810,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
DPN(2, "0x" << std::hex << rddata[t]);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
pipeline_state->used_iregs[rdest] = 1;
|
||||
trace->used_iregs[rdest] = 1;
|
||||
}
|
||||
break;
|
||||
case RegType::Float:
|
||||
@@ -1801,7 +1825,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
||||
DPN(2, "0x" << std::hex << rddata[t]);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
pipeline_state->used_fregs[rdest] = 1;
|
||||
trace->used_fregs[rdest] = 1;
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
|
||||
@@ -6,16 +6,18 @@
|
||||
#include <util.h>
|
||||
#include "debug.h"
|
||||
#include "core.h"
|
||||
#include "constants.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
NopUnit::NopUnit(Core*) : ExeUnit("NOP") {}
|
||||
|
||||
void NopUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
this->schedule_output(state, 1);
|
||||
auto trace = inputs_.top();
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@@ -33,19 +35,23 @@ void LsuUnit::step(uint64_t cycle) {
|
||||
|
||||
// handle dcache response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
MemRsp mem_rsp;
|
||||
if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp))
|
||||
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first);
|
||||
assert(entry.second.test(t));
|
||||
entry.second.reset(t); // track remaining blocks
|
||||
if (!entry.second.any()) {
|
||||
auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
|
||||
entry.first.dcache_latency = latency;
|
||||
this->schedule_output(entry.first, 1);
|
||||
auto& mem_rsp = dcache_rsp_port.top();
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
|
||||
<< ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
|
||||
trace->dcache_latency = latency;
|
||||
this->schedule_output(trace, 1);
|
||||
pending_dcache_.release(mem_rsp.tag);
|
||||
}
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
}
|
||||
|
||||
if (fence_lock_) {
|
||||
@@ -61,36 +67,83 @@ void LsuUnit::step(uint64_t cycle) {
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
|
||||
auto state = inputs_.top();
|
||||
auto trace = inputs_.top();
|
||||
|
||||
if (state.lsu.type == LsuType::FENCE) {
|
||||
if (trace->lsu.type == LsuType::FENCE) {
|
||||
// schedule fence lock
|
||||
fence_state_ = state;
|
||||
fence_lock_ = true;
|
||||
inputs_.pop();
|
||||
DT(3, cycle, "fence-lock: " << state);
|
||||
fence_state_ = trace;
|
||||
fence_lock_ = true;
|
||||
DT(3, cycle, "fence-lock: " << *trace);
|
||||
// remove input
|
||||
inputs_.pop();
|
||||
return;
|
||||
}
|
||||
|
||||
// check pending queue capacity
|
||||
if (pending_dcache_.full()) {
|
||||
DT(3, cycle, "*** lsu-queue-stall: " << state);
|
||||
if (!trace->check_stalled(pending_dcache_.full())) {
|
||||
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
|
||||
}
|
||||
if (pending_dcache_.full())
|
||||
return;
|
||||
|
||||
// send memory request
|
||||
|
||||
bool has_shared_memory = false;
|
||||
bool mem_rsp_pending = false;
|
||||
bool is_write = (trace->lsu.type == LsuType::STORE);
|
||||
|
||||
uint32_t valid_addrs = 0;
|
||||
for (auto& mem_addr : trace->mem_addrs) {
|
||||
valid_addrs += mem_addr.size();
|
||||
}
|
||||
|
||||
trace->dcache_latency = SimPlatform::instance().cycles();
|
||||
auto tag = pending_dcache_.allocate({trace, valid_addrs});
|
||||
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
|
||||
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
|
||||
for (auto mem_addr : trace->mem_addrs.at(t)) {
|
||||
// check shared memory address
|
||||
if (SM_ENABLE) {
|
||||
if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE))
|
||||
&& (mem_addr < SMEM_BASE_ADDR)) {
|
||||
DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
|
||||
has_shared_memory = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_io = (mem_addr >= IO_BASE_ADDR);
|
||||
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr;
|
||||
mem_req.write = is_write;
|
||||
mem_req.tag = tag;
|
||||
mem_req.is_io = is_io;
|
||||
dcache_req_port.send(mem_req, 1);
|
||||
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace);
|
||||
// do not wait on writes
|
||||
mem_rsp_pending = !is_write;
|
||||
}
|
||||
}
|
||||
|
||||
// send dcache request
|
||||
state.dcache_latency = SimPlatform::instance().cycles();
|
||||
auto tag = pending_dcache_.allocate({state, state.tmask});
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!state.tmask.test(t))
|
||||
continue;
|
||||
MemReq mem_req;
|
||||
mem_req.addr = state.mem_addrs.at(t);
|
||||
mem_req.write = (state.lsu.type == LsuType::STORE);
|
||||
mem_req.tag = tag;
|
||||
core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
|
||||
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state);
|
||||
}
|
||||
// do not wait
|
||||
if (!mem_rsp_pending) {
|
||||
pending_dcache_.release(tag);
|
||||
uint32_t delay = 1;
|
||||
if (has_shared_memory) {
|
||||
// all threads accessed shared memory
|
||||
delay += Constants::SMEM_DELAY;
|
||||
}
|
||||
this->schedule_output(trace, delay);
|
||||
}
|
||||
|
||||
// remove input
|
||||
inputs_.pop();
|
||||
}
|
||||
|
||||
@@ -98,23 +151,27 @@ void LsuUnit::step(uint64_t cycle) {
|
||||
|
||||
AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
|
||||
|
||||
void AluUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
void AluUnit::step(uint64_t /*cycle*/) {
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
switch (state.alu.type) {
|
||||
case AluType::ARITH:
|
||||
this->schedule_output(state, 1);
|
||||
break;
|
||||
auto trace = inputs_.top();
|
||||
switch (trace->alu.type) {
|
||||
case AluType::ARITH:
|
||||
case AluType::BRANCH:
|
||||
this->schedule_output(state, 1);
|
||||
case AluType::CMOV:
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case AluType::IMUL:
|
||||
this->schedule_output(state, LATENCY_IMUL);
|
||||
this->schedule_output(trace, LATENCY_IMUL);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case AluType::IDIV:
|
||||
this->schedule_output(state, XLEN);
|
||||
this->schedule_output(trace, XLEN);
|
||||
inputs_.pop();
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,10 +180,11 @@ void AluUnit::step(uint64_t /*cycle*/) {
|
||||
CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
|
||||
|
||||
void CsrUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
this->schedule_output(state, 1);
|
||||
auto trace = inputs_.top();
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@@ -134,46 +192,127 @@ void CsrUnit::step(uint64_t /*cycle*/) {
|
||||
FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
|
||||
|
||||
void FpuUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
switch (state.fpu.type) {
|
||||
auto trace = inputs_.top();
|
||||
switch (trace->fpu.type) {
|
||||
case FpuType::FNCP:
|
||||
this->schedule_output(state, 1);
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case FpuType::FMA:
|
||||
this->schedule_output(state, LATENCY_FMA);
|
||||
this->schedule_output(trace, LATENCY_FMA);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case FpuType::FDIV:
|
||||
this->schedule_output(state, LATENCY_FDIV);
|
||||
this->schedule_output(trace, LATENCY_FDIV);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case FpuType::FSQRT:
|
||||
this->schedule_output(state, LATENCY_FSQRT);
|
||||
this->schedule_output(trace, LATENCY_FSQRT);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case FpuType::FCVT:
|
||||
this->schedule_output(state, LATENCY_FCVT);
|
||||
this->schedule_output(trace, LATENCY_FCVT);
|
||||
inputs_.pop();
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {}
|
||||
GpuUnit::GpuUnit(Core* core)
|
||||
: ExeUnit("GPU")
|
||||
, core_(core)
|
||||
, num_threads_(core->arch().num_threads())
|
||||
, pending_tex_reqs_(TEXQ_SIZE)
|
||||
{}
|
||||
|
||||
void GpuUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
void GpuUnit::step(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// handle memory response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = dcache_rsp_port.top();
|
||||
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
|
||||
trace->dcache_latency = latency;
|
||||
this->schedule_output(trace, 1);
|
||||
pending_tex_reqs_.release(mem_rsp.tag);
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
}
|
||||
#endif
|
||||
|
||||
// check input queue
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
switch (state.gpu.type) {
|
||||
|
||||
auto trace = inputs_.top();
|
||||
|
||||
switch (trace->gpu.type) {
|
||||
case GpuType::TMC:
|
||||
case GpuType::WSPAWN:
|
||||
case GpuType::SPLIT:
|
||||
case GpuType::JOIN:
|
||||
case GpuType::BAR:
|
||||
this->schedule_output(state, 1);
|
||||
break;
|
||||
case GpuType::TEX:
|
||||
/* TODO */
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
break;
|
||||
case GpuType::TEX: {
|
||||
if (this->processTexRequest(cycle, trace))
|
||||
inputs_.pop();
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
|
||||
bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
|
||||
__unused (cycle);
|
||||
|
||||
// check pending queue capacity
|
||||
if (!trace->check_stalled(pending_tex_reqs_.full())) {
|
||||
DT(3, cycle, "*** tex-queue-stall: " << *trace);
|
||||
}
|
||||
if (pending_tex_reqs_.full())
|
||||
return false;
|
||||
|
||||
// send memory request
|
||||
|
||||
uint32_t valid_addrs = 0;
|
||||
for (auto& mem_addr : trace->mem_addrs) {
|
||||
valid_addrs += mem_addr.size();
|
||||
}
|
||||
|
||||
trace->tex_latency = SimPlatform::instance().cycles();
|
||||
auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
|
||||
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
|
||||
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
|
||||
for (auto mem_addr : trace->mem_addrs.at(t)) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr;
|
||||
mem_req.write = (trace->lsu.type == LsuType::STORE);
|
||||
mem_req.tag = tag;
|
||||
dcache_req_port.send(mem_req, 1);
|
||||
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag
|
||||
<< ", tid=" << t << ", "<< trace);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -11,36 +11,43 @@ class Core;
|
||||
class ExeUnit {
|
||||
protected:
|
||||
const char* name_;
|
||||
Queue<pipeline_state_t> inputs_;
|
||||
Queue<pipeline_state_t> outputs_;
|
||||
Queue<pipeline_trace_t*> inputs_;
|
||||
Queue<pipeline_trace_t*> outputs_;
|
||||
|
||||
void schedule_output(const pipeline_state_t& state, uint32_t delay) {
|
||||
void schedule_output(pipeline_trace_t* trace, uint32_t delay) {
|
||||
if (delay > 1) {
|
||||
SimPlatform::instance().schedule(
|
||||
[&](const pipeline_state_t& req) {
|
||||
[&](pipeline_trace_t* req) {
|
||||
outputs_.push(req);
|
||||
},
|
||||
state,
|
||||
trace,
|
||||
(delay - 1)
|
||||
);
|
||||
} else {
|
||||
outputs_.push(state);
|
||||
outputs_.push(trace);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
typedef std::shared_ptr<ExeUnit> Ptr;
|
||||
|
||||
ExeUnit(const char* name) : name_(name) {}
|
||||
|
||||
ExeUnit(const char* name) : name_(name) {}
|
||||
virtual ~ExeUnit() {}
|
||||
|
||||
void push_input(const pipeline_state_t& state) {
|
||||
inputs_.push(state);
|
||||
void push(pipeline_trace_t* trace) {
|
||||
inputs_.push(trace);
|
||||
}
|
||||
|
||||
bool pop_output(pipeline_state_t* state) {
|
||||
return outputs_.try_pop(state);
|
||||
bool empty() const {
|
||||
return outputs_.empty();
|
||||
}
|
||||
|
||||
pipeline_trace_t* top() const {
|
||||
return outputs_.top();
|
||||
}
|
||||
|
||||
void pop() {
|
||||
outputs_.pop();
|
||||
}
|
||||
|
||||
virtual void step(uint64_t cycle) = 0;
|
||||
@@ -61,8 +68,8 @@ class LsuUnit : public ExeUnit {
|
||||
private:
|
||||
Core* core_;
|
||||
uint32_t num_threads_;
|
||||
HashTable<std::pair<pipeline_state_t, ThreadMask>> pending_dcache_;
|
||||
pipeline_state_t fence_state_;
|
||||
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_dcache_;
|
||||
pipeline_trace_t* fence_state_;
|
||||
bool fence_lock_;
|
||||
|
||||
public:
|
||||
@@ -101,6 +108,13 @@ public:
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class GpuUnit : public ExeUnit {
|
||||
private:
|
||||
Core* core_;
|
||||
uint32_t num_threads_;
|
||||
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
|
||||
|
||||
bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace);
|
||||
|
||||
public:
|
||||
GpuUnit(Core*);
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ namespace vortex {
|
||||
|
||||
class IBuffer {
|
||||
private:
|
||||
std::queue<pipeline_state_t> entries_;
|
||||
std::queue<pipeline_trace_t*> entries_;
|
||||
uint32_t capacity_;
|
||||
|
||||
public:
|
||||
@@ -23,12 +23,12 @@ public:
|
||||
return (entries_.size() == capacity_);
|
||||
}
|
||||
|
||||
const pipeline_state_t& top() const {
|
||||
pipeline_trace_t* top() const {
|
||||
return entries_.front();
|
||||
}
|
||||
|
||||
void push(const pipeline_state_t& state) {
|
||||
entries_.emplace(state);
|
||||
void push(pipeline_trace_t* trace) {
|
||||
entries_.emplace(trace);
|
||||
}
|
||||
|
||||
void pop() {
|
||||
|
||||
@@ -29,10 +29,9 @@ enum Opcode {
|
||||
FMNMADD = 0x4f,
|
||||
// Vector Extension
|
||||
VSET = 0x57,
|
||||
VL = 0x7,
|
||||
VS = 0x27,
|
||||
// GPGPU Extension
|
||||
GPGPU = 0x6b,
|
||||
GPU = 0x5b,
|
||||
};
|
||||
|
||||
enum InstType {
|
||||
@@ -70,6 +69,7 @@ public:
|
||||
void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
|
||||
void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setFunc2(Word func2) { func2_ = func2; }
|
||||
void setFunc3(Word func3) { func3_ = func3; }
|
||||
void setFunc7(Word func7) { func7_ = func7; }
|
||||
void setImm(Word imm) { has_imm_ = true; imm_ = imm; }
|
||||
@@ -85,6 +85,7 @@ public:
|
||||
|
||||
/* Getters used by encoders. */
|
||||
Opcode getOpcode() const { return opcode_; }
|
||||
Word getFunc2() const { return func2_; }
|
||||
Word getFunc3() const { return func3_; }
|
||||
Word getFunc6() const { return func6_; }
|
||||
Word getFunc7() const { return func7_; }
|
||||
@@ -118,6 +119,7 @@ private:
|
||||
RegType rsrc_type_[MAX_REG_SOURCES];
|
||||
int rsrc_[MAX_REG_SOURCES];
|
||||
int rdest_;
|
||||
Word func2_;
|
||||
Word func3_;
|
||||
Word func6_;
|
||||
|
||||
|
||||
@@ -20,14 +20,16 @@ public:
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
for (uint32_t i = 0, n = num_banks_; i < n; ++i) {
|
||||
MemReq mem_req;
|
||||
if (!simobject_->MemReqPorts.at(i).read(&mem_req))
|
||||
auto& mem_req_port = simobject_->MemReqPorts.at(i);
|
||||
if (mem_req_port.empty())
|
||||
continue;
|
||||
auto& mem_req = mem_req_port.top();
|
||||
if (!mem_req.write) {
|
||||
MemRsp mem_rsp;
|
||||
mem_rsp.tag = mem_req.tag;
|
||||
simobject_->MemRspPorts.at(i).send(mem_rsp, latency_);
|
||||
}
|
||||
mem_req_port.pop();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -10,10 +10,22 @@ struct MemReq {
|
||||
uint64_t addr;
|
||||
uint32_t tag;
|
||||
bool write;
|
||||
bool is_io;
|
||||
|
||||
MemReq(uint64_t _addr = 0,
|
||||
uint64_t _tag = 0,
|
||||
bool _write = false,
|
||||
bool _is_io = false
|
||||
) : addr(_addr)
|
||||
, tag(_tag)
|
||||
, write(_write)
|
||||
, is_io(_is_io)
|
||||
{}
|
||||
};
|
||||
|
||||
struct MemRsp {
|
||||
uint32_t tag;
|
||||
uint64_t tag;
|
||||
MemRsp(uint64_t _tag = 0) : tag (_tag) {}
|
||||
};
|
||||
|
||||
class MemSim : public SimObject<MemSim>{
|
||||
|
||||
@@ -5,11 +5,12 @@
|
||||
#include <iostream>
|
||||
#include <util.h>
|
||||
#include "types.h"
|
||||
#include "archdef.h"
|
||||
#include "debug.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
struct pipeline_state_t {
|
||||
struct pipeline_trace_t {
|
||||
//--
|
||||
uint64_t id;
|
||||
|
||||
@@ -20,17 +21,24 @@ struct pipeline_state_t {
|
||||
Word PC;
|
||||
|
||||
//--
|
||||
bool stall_warp;
|
||||
bool fetch_stall;
|
||||
bool pipeline_stall;
|
||||
|
||||
//--
|
||||
bool wb;
|
||||
RegType rdest_type;
|
||||
int rdest;
|
||||
|
||||
//--
|
||||
RegMask used_iregs;
|
||||
RegMask used_fregs;
|
||||
RegMask used_vregs;
|
||||
|
||||
//-
|
||||
ExeType exe_type;
|
||||
std::vector<uint64_t> mem_addrs;
|
||||
|
||||
//--
|
||||
std::vector<std::vector<uint64_t>> mem_addrs;
|
||||
|
||||
//--
|
||||
union {
|
||||
@@ -51,27 +59,37 @@ struct pipeline_state_t {
|
||||
// stats
|
||||
uint64_t icache_latency;
|
||||
uint64_t dcache_latency;
|
||||
uint64_t tex_latency;
|
||||
|
||||
void clear() {
|
||||
pipeline_trace_t(uint64_t id_, const ArchDef& arch) {
|
||||
id = id_;
|
||||
cid = 0;
|
||||
wid = 0;
|
||||
tmask.reset();
|
||||
PC = 0;
|
||||
stall_warp = false;
|
||||
wb = false;
|
||||
PC = 0;
|
||||
fetch_stall = false;
|
||||
pipeline_stall = false;
|
||||
wb = false;
|
||||
rdest = 0;
|
||||
rdest_type = RegType::None;
|
||||
used_iregs.reset();
|
||||
used_fregs.reset();
|
||||
used_vregs.reset();
|
||||
exe_type = ExeType::NOP;
|
||||
mem_addrs.clear();
|
||||
mem_addrs.resize(arch.num_threads());
|
||||
icache_latency = 0;
|
||||
dcache_latency = 0;
|
||||
tex_latency = 0;
|
||||
}
|
||||
|
||||
// Records the current stall condition in pipeline_stall and reports whether
// it was already known: returns false only when `stall` is true and the
// previously recorded value was false (i.e. a stall is seen for the first
// time); returns true in every other case.
bool check_stalled(bool stall) {
  const bool was_stalled = pipeline_stall;
  pipeline_stall = stall;
  if (!stall)
    return true;
  return was_stalled;
}
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
|
||||
inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
|
||||
os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
|
||||
os << ", wb=" << state.wb;
|
||||
if (state.wb) {
|
||||
@@ -82,10 +100,9 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state)
|
||||
return os;
|
||||
}
|
||||
|
||||
class PipelineStage : public Queue<pipeline_state_t> {
|
||||
class PipelineStage : public Queue<pipeline_trace_t*> {
|
||||
protected:
|
||||
const char* name_;
|
||||
friend std::ostream &operator<<(std::ostream &, const pipeline_state_t&);
|
||||
|
||||
public:
|
||||
PipelineStage(const char* name = nullptr)
|
||||
|
||||
@@ -33,7 +33,8 @@ Processor::Processor(const ArchDef& arch)
|
||||
L3_NUM_BANKS, // number of banks
|
||||
L3_NUM_PORTS, // number of ports
|
||||
NUM_CLUSTERS, // request size
|
||||
true, // write-throught
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
L3_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
@@ -74,7 +75,8 @@ Processor::Processor(const ArchDef& arch)
|
||||
L2_NUM_BANKS, // number of banks
|
||||
L2_NUM_PORTS, // number of ports
|
||||
NUM_CORES, // request size
|
||||
true, // write-throught
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
L2_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
@@ -129,7 +131,7 @@ int Processor::run() {
|
||||
if (core->running()) {
|
||||
running = true;
|
||||
}
|
||||
if (core->check_ebreak()) {
|
||||
if (core->check_exit()) {
|
||||
exitcode = core->getIRegValue(3);
|
||||
running = false;
|
||||
break;
|
||||
@@ -137,5 +139,7 @@ int Processor::run() {
|
||||
}
|
||||
} while (running);
|
||||
|
||||
std::cout << std::flush;
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
@@ -7,6 +7,12 @@ namespace vortex {
|
||||
|
||||
class Scoreboard {
|
||||
private:
|
||||
struct reg_use_t {
|
||||
RegType type;
|
||||
uint32_t reg;
|
||||
uint64_t owner;
|
||||
};
|
||||
|
||||
std::vector<RegMask> in_use_iregs_;
|
||||
std::vector<RegMask> in_use_fregs_;
|
||||
std::vector<RegMask> in_use_vregs_;
|
||||
@@ -25,21 +31,21 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
bool in_use(const pipeline_state_t& state) const {
|
||||
return (state.used_iregs & in_use_iregs_.at(state.wid)) != 0
|
||||
|| (state.used_fregs & in_use_fregs_.at(state.wid)) != 0
|
||||
|| (state.used_vregs & in_use_vregs_.at(state.wid)) != 0;
|
||||
bool in_use(pipeline_trace_t* state) const {
|
||||
return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0
|
||||
|| (state->used_fregs & in_use_fregs_.at(state->wid)) != 0
|
||||
|| (state->used_vregs & in_use_vregs_.at(state->wid)) != 0;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> owners(const pipeline_state_t& state) const {
|
||||
std::vector<uint64_t> out;
|
||||
std::vector<reg_use_t> get_uses(pipeline_trace_t* state) const {
|
||||
std::vector<reg_use_t> out;
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_iregs = state.used_iregs & in_use_iregs_.at(state.wid);
|
||||
auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid);
|
||||
while (used_iregs.any()) {
|
||||
if (used_iregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Integer;
|
||||
out.push_back(owners_.at(tag));
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer;
|
||||
out.push_back({RegType::Integer, r, owners_.at(tag)});
|
||||
}
|
||||
used_iregs >>= 1;
|
||||
++r;
|
||||
@@ -47,11 +53,11 @@ public:
|
||||
}
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_fregs = state.used_fregs & in_use_fregs_.at(state.wid);
|
||||
auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid);
|
||||
while (used_fregs.any()) {
|
||||
if (used_fregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Float;
|
||||
out.push_back(owners_.at(tag));
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float;
|
||||
out.push_back({RegType::Float, r, owners_.at(tag)});
|
||||
}
|
||||
used_fregs >>= 1;
|
||||
++r;
|
||||
@@ -59,11 +65,11 @@ public:
|
||||
}
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_vregs = state.used_vregs & in_use_vregs_.at(state.wid);
|
||||
auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid);
|
||||
while (used_vregs.any()) {
|
||||
if (used_vregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Vector;
|
||||
out.push_back(owners_.at(tag));
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector;
|
||||
out.push_back({RegType::Vector, r, owners_.at(tag)});
|
||||
}
|
||||
used_vregs >>= 1;
|
||||
++r;
|
||||
@@ -72,44 +78,44 @@ public:
|
||||
return std::move(out);
|
||||
}
|
||||
|
||||
void reserve(const pipeline_state_t& state) {
|
||||
if (!state.wb)
|
||||
void reserve(pipeline_trace_t* state) {
|
||||
if (!state->wb)
|
||||
return;
|
||||
switch (state.rdest_type) {
|
||||
switch (state->rdest_type) {
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(state.wid).set(state.rdest);
|
||||
in_use_iregs_.at(state->wid).set(state->rdest);
|
||||
break;
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(state.wid).set(state.rdest);
|
||||
in_use_fregs_.at(state->wid).set(state->rdest);
|
||||
break;
|
||||
case RegType::Vector:
|
||||
in_use_vregs_.at(state.wid).set(state.rdest);
|
||||
in_use_vregs_.at(state->wid).set(state->rdest);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
|
||||
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
|
||||
assert(owners_.count(tag) == 0);
|
||||
owners_[tag] = state.id;
|
||||
owners_[tag] = state->id;
|
||||
}
|
||||
|
||||
void release(const pipeline_state_t& state) {
|
||||
if (!state.wb)
|
||||
void release(pipeline_trace_t* state) {
|
||||
if (!state->wb)
|
||||
return;
|
||||
switch (state.rdest_type) {
|
||||
switch (state->rdest_type) {
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(state.wid).reset(state.rdest);
|
||||
in_use_iregs_.at(state->wid).reset(state->rdest);
|
||||
break;
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(state.wid).reset(state.rdest);
|
||||
in_use_fregs_.at(state->wid).reset(state->rdest);
|
||||
break;
|
||||
case RegType::Vector:
|
||||
in_use_vregs_.at(state.wid).reset(state.rdest);
|
||||
in_use_vregs_.at(state->wid).reset(state->rdest);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
|
||||
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
|
||||
owners_.erase(tag);
|
||||
}
|
||||
};
|
||||
|
||||
91
sim/simX/tex_unit.cpp
Normal file
91
sim/simX/tex_unit.cpp
Normal file
@@ -0,0 +1,91 @@
|
||||
#include "tex_unit.h"
|
||||
#include "core.h"
|
||||
#include <texturing.h>
|
||||
#include <VX_config.h>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
// Texture filtering mode. TexUnit::read obtains this by casting the raw
// TEX_STATE_FILTER state value, so the numeric value of each enumerator is
// spelled out explicitly.
enum class FilterMode {
  Point     = 0,
  Bilinear  = 1,
  Trilinear = 2,
};
|
||||
|
||||
TexUnit::TexUnit(Core* core) : core_(core) {}
|
||||
|
||||
// Nothing to release: the unit only holds a state array and a raw Core pointer.
TexUnit::~TexUnit() = default;
|
||||
|
||||
uint32_t TexUnit::get_state(uint32_t state) {
|
||||
return states_.at(state);
|
||||
}
|
||||
|
||||
// Stores `value` into state slot `state`.
// The lookup goes through at(), so an out-of-range index is reported by the
// container rather than written silently.
void TexUnit::set_state(uint32_t state, uint32_t value) {
  uint32_t& slot = states_.at(state);
  slot = value;
}
|
||||
|
||||
uint32_t TexUnit::read(int32_t u,
|
||||
int32_t v,
|
||||
int32_t lod,
|
||||
std::vector<uint64_t>* mem_addrs) {
|
||||
//--
|
||||
auto xu = Fixed<TEX_FXD_FRAC>::make(u);
|
||||
auto xv = Fixed<TEX_FXD_FRAC>::make(v);
|
||||
uint32_t base_addr = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
|
||||
uint32_t log_width = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
|
||||
uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
|
||||
auto format = (TexFormat)states_.at(TEX_STATE_FORMAT);
|
||||
auto filter = (FilterMode)states_.at(TEX_STATE_FILTER);
|
||||
auto wrapu = (WrapMode)states_.at(TEX_STATE_WRAPU);
|
||||
auto wrapv = (WrapMode)states_.at(TEX_STATE_WRAPV);
|
||||
|
||||
auto stride = Stride(format);
|
||||
|
||||
switch (filter) {
|
||||
case FilterMode::Bilinear: {
|
||||
// addressing
|
||||
uint32_t offset00, offset01, offset10, offset11;
|
||||
uint32_t alpha, beta;
|
||||
TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
|
||||
&offset00, &offset01, &offset10, &offset11, &alpha, &beta);
|
||||
|
||||
uint32_t addr00 = base_addr + offset00 * stride;
|
||||
uint32_t addr01 = base_addr + offset01 * stride;
|
||||
uint32_t addr10 = base_addr + offset10 * stride;
|
||||
uint32_t addr11 = base_addr + offset11 * stride;
|
||||
|
||||
// memory lookup
|
||||
uint32_t texel00 = core_->dcache_read(addr00, stride);
|
||||
uint32_t texel01 = core_->dcache_read(addr01, stride);
|
||||
uint32_t texel10 = core_->dcache_read(addr10, stride);
|
||||
uint32_t texel11 = core_->dcache_read(addr11, stride);
|
||||
|
||||
mem_addrs->push_back(addr00);
|
||||
mem_addrs->push_back(addr01);
|
||||
mem_addrs->push_back(addr10);
|
||||
mem_addrs->push_back(addr11);
|
||||
|
||||
// filtering
|
||||
auto color = TexFilterLinear(
|
||||
format, texel00, texel01, texel10, texel11, alpha, beta);
|
||||
return color;
|
||||
}
|
||||
case FilterMode::Point: {
|
||||
// addressing
|
||||
uint32_t offset;
|
||||
TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
|
||||
|
||||
uint32_t addr = base_addr + offset * stride;
|
||||
|
||||
// memory lookup
|
||||
uint32_t texel = core_->dcache_read(addr, stride);
|
||||
mem_addrs->push_back(addr);
|
||||
|
||||
// filtering
|
||||
auto color = TexFilterPoint(format, texel);
|
||||
return color;
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
26
sim/simX/tex_unit.h
Normal file
26
sim/simX/tex_unit.h
Normal file
@@ -0,0 +1,26 @@
|
||||
#pragma once
|
||||
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Core;
|
||||
|
||||
class TexUnit {
|
||||
public:
|
||||
TexUnit(Core* core);
|
||||
~TexUnit();
|
||||
|
||||
uint32_t get_state(uint32_t state);
|
||||
|
||||
void set_state(uint32_t state, uint32_t value);
|
||||
|
||||
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<uint64_t>* mem_addrs);
|
||||
|
||||
private:
|
||||
|
||||
std::array<uint32_t, NUM_TEX_STATES> states_;
|
||||
Core* core_;
|
||||
};
|
||||
|
||||
}
|
||||
108
sim/simX/types.h
108
sim/simX/types.h
@@ -66,6 +66,7 @@ enum class AluType {
|
||||
BRANCH,
|
||||
IMUL,
|
||||
IDIV,
|
||||
CMOV,
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
|
||||
@@ -74,6 +75,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
|
||||
case AluType::BRANCH: os << "BRANCH"; break;
|
||||
case AluType::IMUL: os << "IMUL"; break;
|
||||
case AluType::IDIV: os << "IDIV"; break;
|
||||
case AluType::CMOV: os << "CMOV"; break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
@@ -155,8 +157,6 @@ class Queue {
|
||||
protected:
|
||||
std::queue<T> queue_;
|
||||
|
||||
uint32_t count;
|
||||
|
||||
public:
|
||||
Queue() {}
|
||||
|
||||
@@ -168,21 +168,16 @@ public:
|
||||
return queue_.front();
|
||||
}
|
||||
|
||||
void push(const T& value) {
|
||||
++count;
|
||||
queue_.push(value);
|
||||
T& top() {
|
||||
return queue_.front();
|
||||
}
|
||||
|
||||
void pop() {
|
||||
queue_.pop();
|
||||
}
|
||||
|
||||
bool try_pop(T* value) {
|
||||
if (queue_.empty())
|
||||
return false;
|
||||
*value = queue_.front();
|
||||
queue_.pop();
|
||||
return true;
|
||||
void push(const T& value) {
|
||||
queue_.push(value);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -244,14 +239,6 @@ public:
|
||||
entry.first = false;
|
||||
--capacity_;
|
||||
}
|
||||
|
||||
void remove(uint32_t index, T* value) {
|
||||
auto& entry = entries_.at(index);
|
||||
assert(entry.first);
|
||||
*value = entry.second;
|
||||
entry.first = false;
|
||||
--capacity_;
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@@ -259,18 +246,7 @@ public:
|
||||
template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
|
||||
class Switch : public SimObject<Switch<Req, Rsp>> {
|
||||
private:
|
||||
struct req_batch_t {
|
||||
std::vector<Req> data;
|
||||
std::bitset<MaxInputs> valid;
|
||||
req_batch_t() {}
|
||||
req_batch_t(uint32_t size)
|
||||
: data(size)
|
||||
, valid(0)
|
||||
{}
|
||||
};
|
||||
|
||||
ArbiterType type_;
|
||||
std::queue<req_batch_t> reqq_;
|
||||
uint32_t delay_;
|
||||
uint32_t cursor_;
|
||||
uint32_t tag_shift_;
|
||||
@@ -295,55 +271,43 @@ public:
|
||||
{
|
||||
assert(delay_ != 0);
|
||||
assert(num_inputs <= MaxInputs);
|
||||
if (num_inputs == 1) {
|
||||
// bypass
|
||||
ReqIn.at(0).bind(&ReqOut);
|
||||
RspIn.bind(&RspOut.at(0));
|
||||
}
|
||||
}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
// process incoming requests
|
||||
{
|
||||
req_batch_t req_batch(ReqIn.size());
|
||||
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
|
||||
Req req;
|
||||
if (ReqIn.at(i).read(&req)) {
|
||||
req_batch.data.at(i) = req;
|
||||
req_batch.valid.set(i);
|
||||
void step(uint64_t /*cycle*/) {
|
||||
if (ReqIn.size() == 1)
|
||||
return;
|
||||
|
||||
// process incoming requests
|
||||
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
|
||||
uint32_t j = (cursor_ + i) % n;
|
||||
auto& req_in = ReqIn.at(j);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.top();
|
||||
if (tag_shift_) {
|
||||
req.tag = (req.tag << tag_shift_) | j;
|
||||
}
|
||||
ReqOut.send(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_cursor(j);
|
||||
break;
|
||||
}
|
||||
if (req_batch.valid.any()) {
|
||||
reqq_.push(req_batch);
|
||||
}
|
||||
}
|
||||
|
||||
// apply arbitration
|
||||
if (!reqq_.empty()) {
|
||||
auto& req_batch = reqq_.front();
|
||||
for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) {
|
||||
auto j = (cursor_ + i) % n;
|
||||
if (req_batch.valid.test(j)) {
|
||||
auto& req = req_batch.data.at(j);
|
||||
if (tag_shift_) {
|
||||
req.tag = (req.tag << tag_shift_) | j;
|
||||
}
|
||||
ReqOut.send(req, delay_);
|
||||
req_batch.valid.reset(j);
|
||||
this->update_cursor(j);
|
||||
if (!req_batch.valid.any())
|
||||
reqq_.pop(); // pop when empty
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// process incoming responses
|
||||
{
|
||||
Rsp rsp;
|
||||
if (RspIn.read(&rsp)) {
|
||||
uint32_t port_id = 0;
|
||||
if (tag_shift_) {
|
||||
port_id = rsp.tag & ((1 << tag_shift_)-1);
|
||||
rsp.tag >>= tag_shift_;
|
||||
}
|
||||
RspOut.at(port_id).send(rsp, 1);
|
||||
}
|
||||
if (!RspIn.empty()) {
|
||||
auto& rsp = RspIn.top();
|
||||
uint32_t port_id = 0;
|
||||
if (tag_shift_) {
|
||||
port_id = rsp.tag & ((1 << tag_shift_)-1);
|
||||
rsp.tag >>= tag_shift_;
|
||||
}
|
||||
RspOut.at(port_id).send(rsp, 1);
|
||||
RspIn.pop();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ Warp::Warp(Core *core, Word id)
|
||||
vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0));
|
||||
}
|
||||
|
||||
void Warp::eval(pipeline_state_t *pipeline_state) {
|
||||
void Warp::eval(pipeline_trace_t *trace) {
|
||||
assert(tmask_.any());
|
||||
|
||||
DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
|
||||
@@ -38,18 +38,18 @@ void Warp::eval(pipeline_state_t *pipeline_state) {
|
||||
std::abort();
|
||||
}
|
||||
|
||||
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
|
||||
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr << " (#" << trace->id << ")");
|
||||
|
||||
// Update state
|
||||
pipeline_state->cid = core_->id();
|
||||
pipeline_state->wid = id_;
|
||||
pipeline_state->PC = PC_;
|
||||
pipeline_state->tmask = tmask_;
|
||||
pipeline_state->rdest = instr->getRDest();
|
||||
pipeline_state->rdest_type = instr->getRDType();
|
||||
// Update trace
|
||||
trace->cid = core_->id();
|
||||
trace->wid = id_;
|
||||
trace->PC = PC_;
|
||||
trace->tmask = tmask_;
|
||||
trace->rdest = instr->getRDest();
|
||||
trace->rdest_type = instr->getRDType();
|
||||
|
||||
// Execute
|
||||
this->execute(*instr, pipeline_state);
|
||||
this->execute(*instr, trace);
|
||||
|
||||
DP(4, "Register state:");
|
||||
for (int i = 0; i < core_->arch().num_regs(); ++i) {
|
||||
|
||||
@@ -9,7 +9,7 @@ namespace vortex {
|
||||
|
||||
class Core;
|
||||
class Instr;
|
||||
class pipeline_state_t;
|
||||
class pipeline_trace_t;
|
||||
struct DomStackEntry {
|
||||
DomStackEntry(const ThreadMask &tmask, Word PC)
|
||||
: tmask(tmask)
|
||||
@@ -83,11 +83,11 @@ public:
|
||||
return iRegFile_.at(0).at(reg);
|
||||
}
|
||||
|
||||
void eval(pipeline_state_t *);
|
||||
void eval(pipeline_trace_t *);
|
||||
|
||||
private:
|
||||
|
||||
void execute(const Instr &instr, pipeline_state_t *pipeline_state);
|
||||
void execute(const Instr &instr, pipeline_trace_t *trace);
|
||||
|
||||
Word id_;
|
||||
Core *core_;
|
||||
|
||||
@@ -24,7 +24,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
|
||||
|
||||
DBG_FLAGS += $(DBG_TRACE_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
|
||||
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
|
||||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
@@ -51,10 +50,13 @@ CXXFLAGS += $(CONFIGS)
|
||||
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
|
||||
#VL_FLAGS += --threads $(THREADS)
|
||||
|
||||
# Enable VCD trace
|
||||
#VCD_TRACE = -DVCD_OUTPUT
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS)
|
||||
CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
|
||||
VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS)
|
||||
CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS)
|
||||
else
|
||||
VL_FLAGS += -DNDEBUG
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
|
||||
Reference in New Issue
Block a user