Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

View File

@@ -1,9 +1,9 @@
all:
$(MAKE) -C simx
$(MAKE) -C rtlsim
$(MAKE) -C vlsim
$(MAKE) -C opaesim
clean:
$(MAKE) -C simx clean
$(MAKE) -C rtlsim clean
$(MAKE) -C vlsim clean
$(MAKE) -C opaesim clean

View File

@@ -1,7 +1,19 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <algorithm>
#include <assert.h>
constexpr uint32_t count_leading_zeros(uint32_t value) {
@@ -77,5 +89,15 @@ T sext(const T& word, uint32_t width) {
if (width == (sizeof(T) * 8))
return word;
T mask((static_cast<T>(1) << width) - 1);
return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : word;
}
return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : (word & mask);
}
template <typename T = uint32_t>
T zext(const T& word, uint32_t width) {
assert(width > 1);
assert(width <= (sizeof(T) * 8));
if (width == (sizeof(T) * 8))
return word;
T mask((static_cast<T>(1) << width) - 1);
return word & mask;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mem.h"
#include <vector>
#include <iostream>
@@ -20,8 +33,9 @@ RamMemDevice::RamMemDevice(const char *filename, uint32_t wordSize)
contents_.push_back(input.get());
} while (input);
while (contents_.size() & (wordSize-1))
while (contents_.size() & (wordSize-1)) {
contents_.push_back(0x00);
}
}
RamMemDevice::RamMemDevice(uint64_t size, uint32_t wordSize)
@@ -29,7 +43,7 @@ RamMemDevice::RamMemDevice(uint64_t size, uint32_t wordSize)
, wordSize_(wordSize)
{}
void RamMemDevice::read(void *data, uint64_t addr, uint64_t size) {
void RamMemDevice::read(void* data, uint64_t addr, uint64_t size) {
auto addr_end = addr + size;
if ((addr & (wordSize_-1))
|| (addr_end & (wordSize_-1))
@@ -44,7 +58,7 @@ void RamMemDevice::read(void *data, uint64_t addr, uint64_t size) {
}
}
void RamMemDevice::write(const void *data, uint64_t addr, uint64_t size) {
void RamMemDevice::write(const void* data, uint64_t addr, uint64_t size) {
auto addr_end = addr + size;
if ((addr & (wordSize_-1))
|| (addr_end & (wordSize_-1))
@@ -68,26 +82,26 @@ void RomMemDevice::write(const void* /*data*/, uint64_t /*addr*/, uint64_t /*siz
///////////////////////////////////////////////////////////////////////////////
bool MemoryUnit::ADecoder::lookup(uint64_t a, uint32_t wordSize, mem_accessor_t* ma) {
uint64_t e = a + (wordSize - 1);
assert(e >= a);
bool MemoryUnit::ADecoder::lookup(uint64_t addr, uint32_t wordSize, mem_accessor_t* ma) {
uint64_t end = addr + (wordSize - 1);
assert(end >= addr);
for (auto iter = entries_.rbegin(), iterE = entries_.rend(); iter != iterE; ++iter) {
if (a >= iter->start && e <= iter->end) {
if (addr >= iter->start && end <= iter->end) {
ma->md = iter->md;
ma->addr = a - iter->start;
ma->addr = addr - iter->start;
return true;
}
}
return false;
}
void MemoryUnit::ADecoder::map(uint64_t a, uint64_t e, MemDevice &m) {
assert(e >= a);
entry_t entry{&m, a, e};
void MemoryUnit::ADecoder::map(uint64_t start, uint64_t end, MemDevice &md) {
assert(end >= start);
entry_t entry{&md, start, end};
entries_.emplace_back(entry);
}
void MemoryUnit::ADecoder::read(void *data, uint64_t addr, uint64_t size) {
void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) {
mem_accessor_t ma;
if (!this->lookup(addr, size, &ma)) {
std::cout << "lookup of 0x" << std::hex << addr << " failed.\n";
@@ -96,7 +110,7 @@ void MemoryUnit::ADecoder::read(void *data, uint64_t addr, uint64_t size) {
ma.md->read(data, ma.addr, size);
}
void MemoryUnit::ADecoder::write(const void *data, uint64_t addr, uint64_t size) {
void MemoryUnit::ADecoder::write(const void* data, uint64_t addr, uint64_t size) {
mem_accessor_t ma;
if (!this->lookup(addr, size, &ma)) {
std::cout << "lookup of 0x" << std::hex << addr << " failed.\n";
@@ -107,11 +121,11 @@ void MemoryUnit::ADecoder::write(const void *data, uint64_t addr, uint64_t size)
///////////////////////////////////////////////////////////////////////////////
MemoryUnit::MemoryUnit(uint64_t pageSize, uint64_t addrBytes, bool disableVm)
MemoryUnit::MemoryUnit(uint64_t pageSize)
: pageSize_(pageSize)
, addrBytes_(addrBytes)
, disableVM_(disableVm) {
if (!disableVm) {
, enableVM_(pageSize != 0)
, amo_reservation_({0x0, false}) {
if (pageSize != 0) {
tlb_[0] = TLBEntry(0, 077);
}
}
@@ -133,30 +147,38 @@ MemoryUnit::TLBEntry MemoryUnit::tlbLookup(uint64_t vAddr, uint32_t flagMask) {
}
}
void MemoryUnit::read(void *data, uint64_t addr, uint64_t size, bool sup) {
uint64_t MemoryUnit::toPhyAddr(uint64_t addr, uint32_t flagMask) {
uint64_t pAddr;
if (disableVM_) {
pAddr = addr;
} else {
uint32_t flagMask = sup ? 8 : 1;
if (enableVM_) {
TLBEntry t = this->tlbLookup(addr, flagMask);
pAddr = t.pfn * pageSize_ + addr % pageSize_;
} else {
pAddr = addr;
}
return pAddr;
}
void MemoryUnit::read(void* data, uint64_t addr, uint64_t size, bool sup) {
uint64_t pAddr = this->toPhyAddr(addr, sup ? 8 : 1);
return decoder_.read(data, pAddr, size);
}
void MemoryUnit::write(const void *data, uint64_t addr, uint64_t size, bool sup) {
uint64_t pAddr;
if (disableVM_) {
pAddr = addr;
} else {
uint32_t flagMask = sup ? 16 : 2;
TLBEntry t = tlbLookup(addr, flagMask);
pAddr = t.pfn * pageSize_ + addr % pageSize_;
}
void MemoryUnit::write(const void* data, uint64_t addr, uint64_t size, bool sup) {
uint64_t pAddr = this->toPhyAddr(addr, sup ? 16 : 1);
decoder_.write(data, pAddr, size);
amo_reservation_.valid = false;
}
void MemoryUnit::amo_reserve(uint64_t addr) {
uint64_t pAddr = this->toPhyAddr(addr, 1);
amo_reservation_.addr = pAddr;
amo_reservation_.valid = true;
}
bool MemoryUnit::amo_check(uint64_t addr) {
uint64_t pAddr = this->toPhyAddr(addr, 1);
return amo_reservation_.valid && (amo_reservation_.addr == pAddr);
}
void MemoryUnit::tlbAdd(uint64_t virt, uint64_t phys, uint32_t flags) {
tlb_[virt / pageSize_] = TLBEntry(phys / pageSize_, flags);
}
@@ -168,12 +190,14 @@ void MemoryUnit::tlbRm(uint64_t va) {
///////////////////////////////////////////////////////////////////////////////
RAM::RAM(uint32_t page_size)
: size_(0)
RAM::RAM(uint32_t page_size, uint64_t capacity)
: capacity_(capacity)
, page_bits_(log2ceil(page_size))
, last_page_(nullptr)
, last_page_index_(0) {
assert(ispow2(page_size));
assert(0 == capacity || ispow2(capacity));
assert(0 == (capacity % page_size));
}
RAM::~RAM() {
@@ -191,6 +215,9 @@ uint64_t RAM::size() const {
}
uint8_t *RAM::get(uint64_t address) const {
if (capacity_ != 0 && address >= capacity_) {
throw OutOfRange();
}
uint32_t page_size = 1 << page_bits_;
uint32_t page_offset = address & (page_size - 1);
uint64_t page_index = address >> page_bits_;
@@ -218,14 +245,14 @@ uint8_t *RAM::get(uint64_t address) const {
return page + page_offset;
}
void RAM::read(void *data, uint64_t addr, uint64_t size) {
void RAM::read(void* data, uint64_t addr, uint64_t size) {
uint8_t* d = (uint8_t*)data;
for (uint64_t i = 0; i < size; i++) {
d[i] = *this->get(addr + i);
}
}
void RAM::write(const void *data, uint64_t addr, uint64_t size) {
void RAM::write(const void* data, uint64_t addr, uint64_t size) {
const uint8_t* d = (const uint8_t*)data;
for (uint64_t i = 0; i < size; i++) {
*this->get(addr + i) = d[i];
@@ -236,6 +263,7 @@ void RAM::loadBinImage(const char* filename, uint64_t destination) {
std::ifstream ifs(filename);
if (!ifs) {
std::cout << "error: " << filename << " not found" << std::endl;
std::abort();
}
ifs.seekg(0, ifs.end);
@@ -268,6 +296,7 @@ void RAM::loadHexImage(const char* filename) {
std::ifstream ifs(filename);
if (!ifs) {
std::cout << "error: " << filename << " not found" << std::endl;
std::abort();
}
ifs.seekg(0, ifs.end);
@@ -313,4 +342,4 @@ void RAM::loadHexImage(const char* filename) {
++line;
--size;
}
}
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
@@ -7,13 +20,14 @@
namespace vortex {
struct BadAddress {};
struct OutOfRange {};
class MemDevice {
public:
virtual ~MemDevice() {}
virtual uint64_t size() const = 0;
virtual void read(void *data, uint64_t addr, uint64_t size) = 0;
virtual void write(const void *data, uint64_t addr, uint64_t size) = 0;
virtual void read(void* data, uint64_t addr, uint64_t size) = 0;
virtual void write(const void* data, uint64_t addr, uint64_t size) = 0;
};
///////////////////////////////////////////////////////////////////////////////
@@ -21,11 +35,11 @@ public:
class RamMemDevice : public MemDevice {
public:
RamMemDevice(uint64_t size, uint32_t wordSize);
RamMemDevice(const char *filename, uint32_t wordSize);
RamMemDevice(const char* filename, uint32_t wordSize);
~RamMemDevice() {}
void read(void *data, uint64_t addr, uint64_t size) override;
void write(const void *data, uint64_t addr, uint64_t size) override;
void read(void* data, uint64_t addr, uint64_t size) override;
void write(const void* data, uint64_t addr, uint64_t size) override;
virtual uint64_t size() const {
return contents_.size();
@@ -50,7 +64,7 @@ public:
~RomMemDevice();
void write(const void *data, uint64_t addr, uint64_t size) override;
void write(const void* data, uint64_t addr, uint64_t size) override;
};
///////////////////////////////////////////////////////////////////////////////
@@ -63,47 +77,56 @@ public:
: faultAddr(a)
, notFound(nf)
{}
uint64_t faultAddr;
bool notFound;
uint64_t faultAddr;
bool notFound;
};
MemoryUnit(uint64_t pageSize, uint64_t addrBytes, bool disableVm = false);
MemoryUnit(uint64_t pageSize = 0);
void attach(MemDevice &m, uint64_t start, uint64_t end);
void read(void *data, uint64_t addr, uint64_t size, bool sup);
void write(const void *data, uint64_t addr, uint64_t size, bool sup);
void read(void* data, uint64_t addr, uint64_t size, bool sup);
void write(const void* data, uint64_t addr, uint64_t size, bool sup);
void amo_reserve(uint64_t addr);
bool amo_check(uint64_t addr);
void tlbAdd(uint64_t virt, uint64_t phys, uint32_t flags);
void tlbRm(uint64_t va);
void tlbRm(uint64_t vaddr);
void tlbFlush() {
tlb_.clear();
}
private:
struct amo_reservation_t {
uint64_t addr;
bool valid;
};
class ADecoder {
public:
ADecoder() {}
void read(void *data, uint64_t addr, uint64_t size);
void write(const void *data, uint64_t addr, uint64_t size);
void read(void* data, uint64_t addr, uint64_t size);
void write(const void* data, uint64_t addr, uint64_t size);
void map(uint64_t start, uint64_t end, MemDevice &md);
private:
struct mem_accessor_t {
MemDevice* md;
uint64_t addr;
MemDevice* md;
uint64_t addr;
};
struct entry_t {
MemDevice *md;
uint64_t start;
uint64_t end;
MemDevice* md;
uint64_t start;
uint64_t end;
};
bool lookup(uint64_t a, uint32_t wordSize, mem_accessor_t*);
bool lookup(uint64_t addr, uint32_t wordSize, mem_accessor_t*);
std::vector<entry_t> entries_;
};
@@ -120,11 +143,14 @@ private:
TLBEntry tlbLookup(uint64_t vAddr, uint32_t flagMask);
uint64_t toPhyAddr(uint64_t vAddr, uint32_t flagMask);
std::unordered_map<uint64_t, TLBEntry> tlb_;
uint64_t pageSize_;
uint64_t addrBytes_;
ADecoder decoder_;
bool disableVM_;
uint64_t pageSize_;
ADecoder decoder_;
bool enableVM_;
amo_reservation_t amo_reservation_;
};
///////////////////////////////////////////////////////////////////////////////
@@ -132,15 +158,15 @@ private:
class RAM : public MemDevice {
public:
RAM(uint32_t page_size);
RAM(uint32_t page_size, uint64_t capacity = 0);
~RAM();
void clear();
uint64_t size() const override;
void read(void *data, uint64_t addr, uint64_t size) override;
void write(const void *data, uint64_t addr, uint64_t size) override;
void read(void* data, uint64_t addr, uint64_t size) override;
void write(const void* data, uint64_t addr, uint64_t size) override;
void loadBinImage(const char* filename, uint64_t destination);
void loadHexImage(const char* filename);
@@ -157,11 +183,11 @@ private:
uint8_t *get(uint64_t address) const;
uint64_t size_;
uint64_t capacity_;
uint32_t page_bits_;
mutable std::unordered_map<uint64_t, uint8_t*> pages_;
mutable uint8_t* last_page_;
mutable uint64_t last_page_index_;
};
} // namespace vortex
} // namespace vortex

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stack>
@@ -18,8 +31,9 @@ public:
void* allocate() {
void* mem;
if (!free_list_.empty()) {
mem = static_cast<void*>(free_list_.top());
auto entry = free_list_.top();
free_list_.pop();
mem = static_cast<void*>(entry);
} else {
mem = ::operator new(sizeof(T));
}
@@ -36,12 +50,13 @@ public:
void flush() {
while (!free_list_.empty()) {
::operator delete(free_list_.top());
auto entry = free_list_.top();
free_list_.pop();
::operator delete(entry);
}
}
private:
std::stack<void*> free_list_;
std::stack<T*> free_list_;
uint32_t max_size_;
};
};

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "rvfloats.h"
#include <stdio.h>
@@ -16,12 +29,9 @@ inline float64_t to_float64_t(uint64_t x) { return float64_t{x}; }
inline uint32_t from_float32_t(float32_t x) { return uint32_t(x.v); }
inline uint64_t from_float64_t(float64_t x) { return uint64_t(x.v); }
inline uint32_t get_fflags() {
uint32_t fflags = softfloat_exceptionFlags;
if (fflags) {
softfloat_exceptionFlags = 0;
}
return fflags;
inline void rv_init(uint32_t frm) {
softfloat_exceptionFlags = 0;
softfloat_roundingMode = frm;
}
#ifdef __cplusplus
@@ -29,289 +39,296 @@ extern "C" {
#endif
uint32_t rv_fadd_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_add(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fadd_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_add(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fsub_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_sub(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fsub_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_sub(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fmul_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_mul(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fmul_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_mul(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_mulAdd(to_float32_t(a), to_float32_t(b), to_float32_t(c));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_mulAdd(to_float64_t(a), to_float64_t(b), to_float64_t(c));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto c_neg = c ^ F32_SIGN;
auto r = f32_mulAdd(to_float32_t(a), to_float32_t(b), to_float32_t(c_neg));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto c_neg = c ^ F64_SIGN;
auto r = f64_mulAdd(to_float64_t(a), to_float64_t(b), to_float64_t(c_neg));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fnmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto a_neg = a ^ F32_SIGN;
auto c_neg = c ^ F32_SIGN;
auto r = f32_mulAdd(to_float32_t(a_neg), to_float32_t(b), to_float32_t(c_neg));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fnmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto a_neg = a ^ F64_SIGN;
auto c_neg = c ^ F64_SIGN;
auto r = f64_mulAdd(to_float64_t(a_neg), to_float64_t(b), to_float64_t(c_neg));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fnmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto a_neg = a ^ F32_SIGN;
auto r = f32_mulAdd(to_float32_t(a_neg), to_float32_t(b), to_float32_t(c));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fnmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto a_neg = a ^ F64_SIGN;
auto r = f64_mulAdd(to_float64_t(a_neg), to_float64_t(b), to_float64_t(c));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fdiv_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_div(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_div(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_sqrt(to_float32_t(a));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fsqrt_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_sqrt(to_float64_t(a));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_ftoi_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_to_i32(to_float32_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_ftoi_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_to_i32(to_float64_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_ftou_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_to_ui32(to_float32_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_ftou_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_to_ui32(to_float64_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_ftol_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_to_i64(to_float32_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_ftol_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_to_i64(to_float64_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_ftolu_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_to_ui64(to_float32_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_ftolu_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_to_ui64(to_float64_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_itof_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = i32_to_f32(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_itof_d(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = i32_to_f64(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_utof_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = ui32_to_f32(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_utof_d(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = ui32_to_f64(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_ltof_s(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = i64_to_f32(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_ltof_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = i64_to_f64(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_lutof_s(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = ui64_to_f32(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_lutof_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = ui64_to_f64(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
bool rv_flt_s(uint32_t a, uint32_t b, uint32_t* fflags) {
rv_init(0);
auto r = f32_lt(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
bool rv_flt_d(uint64_t a, uint64_t b, uint32_t* fflags) {
rv_init(0);
auto r = f64_lt(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
bool rv_fle_s(uint32_t a, uint32_t b, uint32_t* fflags) {
rv_init(0);
auto r = f32_le(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
bool rv_fle_d(uint64_t a, uint64_t b, uint32_t* fflags) {
rv_init(0);
auto r = f64_le(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
bool rv_feq_s(uint32_t a, uint32_t b, uint32_t* fflags) {
rv_init(0);
auto r = f32_eq(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
bool rv_feq_d(uint64_t a, uint64_t b, uint32_t* fflags) {
rv_init(0);
auto r = f64_eq(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) {
uint32_t r;
rv_init(0);
if (isNaNF32UI(a) && isNaNF32UI(b)) {
r = defaultNaNF32UI;
} else {
@@ -324,12 +341,13 @@ uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) {
r = b;
}
}
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags) {
uint64_t r;
rv_init(0);
if (isNaNF64UI(a) && isNaNF64UI(b)) {
r = defaultNaNF64UI;
} else {
@@ -342,12 +360,13 @@ uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags) {
r = b;
}
}
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_fmax_s(uint32_t a, uint32_t b, uint32_t* fflags) {
uint32_t r;
rv_init(0);
if (isNaNF32UI(a) && isNaNF32UI(b)) {
r = defaultNaNF32UI;
} else {
@@ -360,12 +379,13 @@ uint32_t rv_fmax_s(uint32_t a, uint32_t b, uint32_t* fflags) {
r = b;
}
}
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags) {
uint64_t r;
rv_init(0);
if (isNaNF64UI(a) && isNaNF64UI(b)) {
r = defaultNaNF64UI;
} else {
@@ -378,7 +398,7 @@ uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags) {
r = b;
}
}
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}

View File

@@ -1,5 +1,17 @@
#ifndef RVFLOATS_H
#define RVFLOATS_H
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
@@ -78,5 +90,3 @@ uint64_t rv_ftod(uint32_t a);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
@@ -84,33 +97,39 @@ public:
}
uint64_t pop() {
auto cycle = queue_.front().cycle;
auto cycles = queue_.front().cycles;
queue_.pop();
return cycle;
return cycles;
}
void tx_callback(const TxCallback& callback) {
tx_cb_ = callback;
}
uint64_t arrival_time() const {
if (queue_.empty())
return 0;
return queue_.front().cycles;
}
protected:
struct timed_pkt_t {
Pkt pkt;
uint64_t cycle;
uint64_t cycles;
};
std::queue<timed_pkt_t> queue_;
SimPort* peer_;
TxCallback tx_cb_;
void push(const Pkt& data, uint64_t cycle) {
void push(const Pkt& data, uint64_t cycles) {
if (tx_cb_) {
tx_cb_(data, cycle);
tx_cb_(data, cycles);
}
if (peer_) {
peer_->push(data, cycle);
peer_->push(data, cycles);
} else {
queue_.push({data, cycle});
queue_.push({data, cycles});
}
}
@@ -129,14 +148,14 @@ public:
virtual void fire() const = 0;
uint64_t time() const {
return time_;
uint64_t cycles() const {
return cycles_;
}
protected:
SimEventBase(uint64_t time) : time_(time) {}
SimEventBase(uint64_t cycles) : cycles_(cycles) {}
uint64_t time_;
uint64_t cycles_;
};
///////////////////////////////////////////////////////////////////////////////
@@ -150,8 +169,8 @@ public:
typedef std::function<void (const Pkt&)> Func;
SimCallEvent(const Func& func, const Pkt& pkt, uint64_t time)
: SimEventBase(time)
SimCallEvent(const Func& func, const Pkt& pkt, uint64_t cycles)
: SimEventBase(cycles)
, func_(func)
, pkt_(pkt)
{}
@@ -180,11 +199,11 @@ template <typename Pkt>
class SimPortEvent : public SimEventBase {
public:
void fire() const override {
const_cast<SimPort<Pkt>*>(port_)->push(pkt_, time_);
const_cast<SimPort<Pkt>*>(port_)->push(pkt_, cycles_);
}
SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t time)
: SimEventBase(time)
SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t cycles)
: SimEventBase(cycles)
, port_(port)
, pkt_(pkt)
{}
@@ -330,7 +349,7 @@ public:
auto evt_it_end = events_.end();
while (evt_it != evt_it_end) {
auto& event = *evt_it;
if (cycles_ >= event->time()) {
if (cycles_ >= event->cycles()) {
event->fire();
evt_it = events_.erase(evt_it);
} else {
@@ -395,5 +414,5 @@ void SimPort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
reinterpret_cast<const SimPort<Pkt>*>(peer_)->send(pkt, delay);
} else {
SimPlatform::instance().schedule(this, pkt, delay);
}
}
}
}

78
sim/common/stringutil.h Normal file
View File

@@ -0,0 +1,78 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <iomanip>
class ByteStream : public std::istream {
public:
ByteStream(const void *buf, std::size_t size) : buf_(buf), size_(size) {}
friend std::ostream& operator<<(std::ostream& os, const ByteStream& obj) {
auto oldflags = os.flags();
auto oldwidth = os.width();
auto oldfill = os.fill();
for (std::size_t i = 0, n = obj.size_; i < n; ++i) {
int byte = *((uint8_t*)obj.buf_ + (n - 1 - i));
os << std::hex << std::setw(2) << std::setfill('0') << byte;
}
os.fill(oldfill);
os.width(oldwidth);
os.flags(oldflags);
return os;
}
private:
const void *buf_;
std::size_t size_;
};
class IndentStream : public std::streambuf {
public:
explicit IndentStream(std::streambuf* dest, int indent = 4)
: dest_(dest)
, isBeginLine_(true)
, indent_(indent, ' ')
, owner_(nullptr)
{}
explicit IndentStream(std::ostream& dest, int indent = 4)
: dest_(dest.rdbuf())
, isBeginLine_(true)
, indent_(indent, ' ')
, owner_(&dest) {
owner_->rdbuf(this);
}
virtual ~IndentStream() {
if (owner_)
owner_->rdbuf(dest_);
}
protected:
virtual int overflow(int ch) {
if (isBeginLine_ && ch != '\n') {
dest_->sputn(indent_.data(), indent_.size());
}
isBeginLine_ = ch == '\n';
return dest_->sputc(ch);
}
private:
std::streambuf* dest_;
bool isBeginLine_;
std::string indent_;
std::ostream* owner_;
};

View File

@@ -1,237 +0,0 @@
#pragma once
#include <cstdint>
#include <cocogfx/include/fixed.h>
#include <bitmanip.h>
using namespace cocogfx;
enum class WrapMode {
Clamp,
Repeat,
Mirror,
};
enum class TexFormat {
A8R8G8B8,
R5G6B5,
A1R5G5B5,
A4R4G4B4,
A8L8,
L8,
A8,
};
template <uint32_t F, typename T = int32_t>
T Clamp(Fixed<F,T> fx, WrapMode mode) {
switch (mode) {
case WrapMode::Clamp: return (fx.data() < 0) ? 0 : ((fx.data() > Fixed<F,T>::MASK) ? Fixed<F,T>::MASK : fx.data());
case WrapMode::Repeat: return (fx.data() & Fixed<F,T>::MASK);
case WrapMode::Mirror: return (bit_get(fx.data(), Fixed<F,T>::FRAC) ? ~fx.data() : fx.data());
default:
std::abort();
return 0;
}
}
inline uint32_t Stride(TexFormat format) {
switch (format) {
case TexFormat::A8R8G8B8:
return 4;
case TexFormat::R5G6B5:
case TexFormat::A1R5G5B5:
case TexFormat::A4R4G4B4:
case TexFormat::A8L8:
return 2;
case TexFormat::L8:
case TexFormat::A8:
return 1;
default:
std::abort();
return 0;
}
}
inline void Unpack8888(TexFormat format,
uint32_t texel,
uint32_t* lo,
uint32_t* hi) {
uint32_t r, g, b, a;
switch (format) {
case TexFormat::A8R8G8B8:
r = (texel >> 16) & 0xff;
g = (texel >> 8) & 0xff;
b = texel & 0xff;
a = texel >> 24;
break;
case TexFormat::R5G6B5:
r = ((texel >> 11) << 3) | (texel >> 13);
g = ((texel >> 3) & 0xfc) | ((texel >> 9) & 0x3);
b = ((texel & 0x1f) << 3) | ((texel & 0x1c) >> 2);
a = 0xff;
break;
case TexFormat::A1R5G5B5:
r = ((texel >> 7) & 0xf8) | ((texel << 1) >> 13);
g = ((texel >> 2) & 0xf8) | ((texel >> 7) & 7);
b = ((texel & 0x1f) << 3) | ((texel & 0x1c) >> 2);
a = 0xff * (texel >> 15);
break;
case TexFormat::A4R4G4B4:
r = ((texel >> 4) & 0xf0) | ((texel >> 8) & 0x0f);
g = ((texel & 0xf0) >> 0) | ((texel & 0xf0) >> 4);
b = ((texel & 0x0f) << 4) | ((texel & 0x0f) >> 0);
a = ((texel >> 8) & 0xf0) | (texel >> 12);
break;
case TexFormat::A8L8:
r = texel & 0xff;
g = r;
b = r;
a = texel >> 8;
break;
case TexFormat::L8:
r = texel & 0xff;
g = r;
b = r;
a = 0xff;
break;
case TexFormat::A8:
r = 0xff;
g = 0xff;
b = 0xff;
a = texel & 0xff;
break;
default:
std::abort();
}
*lo = (r << 16) + b;
*hi = (a << 16) + g;
}
inline void Unpack8888(uint32_t texel, uint32_t* lo, uint32_t* hi) {
*lo = texel & 0x00ff00ff;
*hi = (texel >> 8) & 0x00ff00ff;
}
inline uint32_t Pack8888(uint32_t lo, uint32_t hi) {
return (hi << 8) | lo;
}
inline uint32_t Lerp8888(uint32_t a, uint32_t b, uint32_t f) {
return (a + (((b - a) * f) >> 8)) & 0x00ff00ff;
}
template <uint32_t F, typename T = int32_t>
void TexAddressLinear(Fixed<F,T> fu,
Fixed<F,T> fv,
uint32_t log_width,
uint32_t log_height,
WrapMode wrapu,
WrapMode wrapv,
uint32_t* addr00,
uint32_t* addr01,
uint32_t* addr10,
uint32_t* addr11,
uint32_t* alpha,
uint32_t* beta
) {
auto delta_x = Fixed<F,T>::make(Fixed<F,T>::HALF >> log_width);
auto delta_y = Fixed<F,T>::make(Fixed<F,T>::HALF >> log_height);
uint32_t u0 = Clamp(fu - delta_x, wrapu);
uint32_t u1 = Clamp(fu + delta_x, wrapu);
uint32_t v0 = Clamp(fv - delta_y, wrapv);
uint32_t v1 = Clamp(fv + delta_y, wrapv);
uint32_t shift_u = (Fixed<F,T>::FRAC - log_width);
uint32_t shift_v = (Fixed<F,T>::FRAC - log_height);
uint32_t x0s = (u0 << 8) >> shift_u;
uint32_t y0s = (v0 << 8) >> shift_v;
uint32_t x0 = x0s >> 8;
uint32_t y0 = y0s >> 8;
uint32_t x1 = u1 >> shift_u;
uint32_t y1 = v1 >> shift_v;
*addr00 = x0 + (y0 << log_width);
*addr01 = x1 + (y0 << log_width);
*addr10 = x0 + (y1 << log_width);
*addr11 = x1 + (y1 << log_width);
*alpha = x0s & 0xff;
*beta = y0s & 0xff;
//printf("*** fu=0x%x, fv=0x%x, u0=0x%x, u1=0x%x, v0=0x%x, v1=0x%x, x0=0x%x, x1=0x%x, y0=0x%x, y1=0x%x, addr00=0x%x, addr01=0x%x, addr10=0x%x, addr11=0x%x\n", fu.data(), fv.data(), u0, u1, v0, v1, x0, x1, y0, y1, *addr00, *addr01, *addr10, *addr11);
}
template <uint32_t F, typename T = int32_t>
void TexAddressPoint(Fixed<F,T> fu,
Fixed<F,T> fv,
uint32_t log_width,
uint32_t log_height,
WrapMode wrapu,
WrapMode wrapv,
uint32_t* addr
) {
uint32_t u = Clamp(fu, wrapu);
uint32_t v = Clamp(fv, wrapv);
uint32_t x = u >> (Fixed<F,T>::FRAC - log_width);
uint32_t y = v >> (Fixed<F,T>::FRAC - log_height);
*addr = x + (y << log_width);
//printf("*** fu=0x%x, fv=0x%x, u=0x%x, v=0x%x, x=0x%x, y=0x%x, addr=0x%x\n", fu.data(), fv.data(), u, v, x, y, *addr);
}
inline uint32_t TexFilterLinear(
TexFormat format,
uint32_t texel00,
uint32_t texel01,
uint32_t texel10,
uint32_t texel11,
uint32_t alpha,
uint32_t beta
) {
uint32_t c01l, c01h;
{
uint32_t c0l, c0h, c1l, c1h;
Unpack8888(format, texel00, &c0l, &c0h);
Unpack8888(format, texel01, &c1l, &c1h);
c01l = Lerp8888(c0l, c1l, alpha);
c01h = Lerp8888(c0h, c1h, alpha);
}
uint32_t c23l, c23h;
{
uint32_t c2l, c2h, c3l, c3h;
Unpack8888(format, texel10, &c2l, &c2h);
Unpack8888(format, texel11, &c3l, &c3h);
c23l = Lerp8888(c2l, c3l, alpha);
c23h = Lerp8888(c2h, c3h, alpha);
}
uint32_t color;
{
uint32_t cl = Lerp8888(c01l, c23l, beta);
uint32_t ch = Lerp8888(c01h, c23h, beta);
color = Pack8888(cl, ch);
}
//printf("*** texel00=0x%x, texel01=0x%x, texel10=0x%x, texel11=0x%x, color=0x%x\n", texel00, texel01, texel10, texel11, color);
return color;
}
inline uint32_t TexFilterPoint(TexFormat format, uint32_t texel) {
uint32_t color;
{
uint32_t cl, ch;
Unpack8888(format, texel, &cl, &ch);
color = Pack8888(cl, ch);
}
//printf("*** texel=0x%x, color=0x%x\n", texel, color);
return color;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "util.h"
#include <string.h>
@@ -7,4 +20,20 @@ const char* fileExtension(const char* filepath) {
if (ext == NULL || ext == filepath)
return "";
return ext + 1;
}
void* aligned_malloc(size_t size, size_t alignment) {
// reserve margin for alignment and storing of unaligned address
assert((alignment & (alignment - 1)) == 0); // Power of 2 alignment.
size_t margin = (alignment-1) + sizeof(void*);
void *unaligned_addr = malloc(size + margin);
void **aligned_addr = (void**)((uintptr_t)(((uint8_t*)unaligned_addr) + margin) & ~(alignment-1));
aligned_addr[-1] = unaligned_addr;
return aligned_addr;
}
void aligned_free(void *ptr) {
// retreive the stored unaligned address and use it to free the allocation
void* unaligned_addr = ((void**)ptr)[-1];
free(unaligned_addr);
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
@@ -49,4 +62,7 @@ const char* fileExtension(const char* filepath);
#define DISABLE_WARNING_UNUSED_PARAMETER
#define DISABLE_WARNING_UNREFERENCED_FUNCTION
#define DISABLE_WARNING_ANONYMOUS_STRUCT
#endif
#endif
void *aligned_malloc(size_t size, size_t alignment);
void aligned_free(void *ptr);

55
sim/common/uuid_gen.h Normal file
View File

@@ -0,0 +1,55 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
namespace vortex {
class UUIDGenerator {
public:
UUIDGenerator() : ids_(0) {}
virtual ~UUIDGenerator() {}
uint32_t get_uuid(uint64_t PC) {
uint32_t id;
uint32_t ref;
auto it = uuid_map_.find(PC);
if (it != uuid_map_.end()) {
uint64_t value = it->second;
id = value & 0xffff;
ref = value >> 16;
} else {
id = ids_++;
ref = -1;
}
++ref;
uint64_t ret = (uint64_t(ref) << 16) | id;
uuid_map_[PC] = ret;
return ret;
}
void reset() {
uuid_map_.clear();
ids_ = 0;
}
private:
std::unordered_map<uint64_t, uint32_t> uuid_map_;
uint32_t ids_;
};
}

138
sim/opaesim/Makefile Normal file
View File

@@ -0,0 +1,138 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
AFU_DIR = $(RTL_DIR)/afu/opae
SCRIPT_DIR = ../../hw/scripts
THIRD_PARTY_DIR = ../../third_party
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I.. -I../../../hw -I../../common -I$(abspath $(DESTDIR))
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread
# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_MEM
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_TRACE_FLAGS += -DDBG_TRACE_RASTER
DBG_TRACE_FLAGS += -DDBG_TRACE_ROP
DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR
# Control logic analyzer monitors
DBG_SCOPE_FLAGS += -DDBG_SCOPE_AFU
DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
DBG_SCOPE_FLAGS += -DDBG_SCOPE_RASTER
DBG_SCOPE_FLAGS += -DDBG_SCOPE_MSCHED
# AFU parameters
CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=26
endif
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=512
endif
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH=4
endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += fpga.cpp opae_sim.cpp
RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv
RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
FPU_INCLUDE = -I$(RTL_DIR)/fpu
ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
RTL_PKGS += $(THIRD_PARTY_DIR)/fpnew/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/fpnew/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv
FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)
RTL_INCLUDE += -I$(AFU_DIR) -I$(AFU_DIR)/ccip
TOP = vortex_afu_shim
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DXLEN_$(XLEN)
VL_FLAGS += $(CONFIGS)
VL_FLAGS += verilator.vlt
VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += $(RTL_PKGS)
VL_FLAGS += $(DBG_SCOPE_FLAGS)
CXXFLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())')
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugigng
ifdef DEBUG
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CXXFLAGS += -O3 -DNDEBUG
endif
# Enable scope analyzer
ifdef SCOPE
VL_FLAGS += -DSCOPE
CXXFLAGS += -DSCOPE
SCOPE_JSON = $(DESTDIR)/scope.json
endif
# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CXXFLAGS += -DPERF_ENABLE
endif
# use our OPAE shim
VL_FLAGS += -DNOPAE
CXXFLAGS += -DNOPAE
PROJECT = libopae-c-sim.so
all: $(PROJECT)
$(DESTDIR)/vortex.xml:
verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $(DESTDIR)/vortex.xml
$(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml
$(SCRIPT_DIR)/scope.py $(DESTDIR)/vortex.xml -o $(DESTDIR)/scope.json
$(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh
$(SCRIPT_DIR)/gen_config.py -i $(AFU_DIR)/vortex_afu.vh -o $(DESTDIR)/vortex_afu.h
$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON)
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)
clean:
rm -rf obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <iostream>
#include <stdio.h>
@@ -8,10 +21,61 @@
#include "fpga.h"
#include "opae_sim.h"
#include <VX_config.h>
#include <util.h>
using namespace vortex;
#ifdef __cplusplus
extern "C" {
#endif
extern fpga_result fpgaGetProperties(fpga_token token, fpga_properties *prop) {
__unused (token, prop);
return FPGA_OK;
}
extern fpga_result fpgaPropertiesSetObjectType(fpga_properties prop, fpga_objtype objtype) {
__unused (prop, objtype);
return FPGA_OK;
}
extern fpga_result fpgaPropertiesSetGUID(fpga_properties prop, fpga_guid guid) {
__unused (prop, guid);
return FPGA_OK;
}
extern fpga_result fpgaDestroyProperties(fpga_properties *prop) {
__unused (prop);
return FPGA_OK;
}
extern fpga_result fpgaEnumerate(const fpga_properties *filters, uint32_t num_filters, fpga_token *tokens, uint32_t max_tokens, uint32_t *num_matches) {
__unused (filters, num_filters, num_filters, tokens, max_tokens);
if (num_matches) {
*num_matches = 1;
}
return FPGA_OK;
}
extern fpga_result fpgaDestroyToken(fpga_token *token) {
__unused (token);
return FPGA_OK;
}
extern fpga_result fpgaPropertiesGetLocalMemorySize(const fpga_properties *filters, uint64_t* lms) {
__unused (filters);
if (lms) {
#if (XLEN == 64)
*lms = 0x200000000; // 8 GB
#else
*lms = 0x100000000; // 4 GB
#endif
}
return FPGA_OK;
}
extern fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, int flags) {
__unused (token);
if (NULL == handle || flags != 0)
return FPGA_INVALID_PARAM;
auto sim = new opae_sim();
@@ -83,4 +147,8 @@ extern fpga_result fpgaReadMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_
extern const char *fpgaErrStr(fpga_result e) {
return "";
}
}
#ifdef __cplusplus
}
#endif

View File

@@ -1,7 +1,20 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __FPGA_H__
#define __FPGA_H__
#include <stdio.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
@@ -21,28 +34,21 @@ typedef enum {
FPGA_RECONF_ERROR /**< Error while reconfiguring FPGA */
} fpga_result;
typedef enum {
FPGA_DEVICE = 0,
FPGA_ACCELERATOR
} fpga_objtype;
typedef void *fpga_handle;
typedef void *fpga_token;
fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, int flags);
typedef void *fpga_properties;
fpga_result fpgaClose(fpga_handle handle);
fpga_result fpgaPrepareBuffer(fpga_handle handle, uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
fpga_result fpgaReleaseBuffer(fpga_handle handle, uint64_t wsid);
fpga_result fpgaGetIOAddress(fpga_handle handle, uint64_t wsid, uint64_t *ioaddr);
fpga_result fpgaWriteMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t value);
fpga_result fpgaReadMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t *value);
const char *fpgaErrStr(fpga_result e);
typedef uint8_t fpga_guid[16];
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
}
#endif
#endif // __FPGA_H__

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "opae_sim.h"
#include <verilated.h>
@@ -25,6 +38,7 @@
#include <list>
#include <queue>
#include <unordered_map>
#include <util.h>
#ifndef MEMORY_BANKS
#ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
@@ -62,6 +76,8 @@
#define RAM_PAGE_SIZE 4096
#define CPU_GPU_LATENCY 200
using namespace vortex;
static uint64_t timestamp = 0;
@@ -70,23 +86,6 @@ double sc_time_stamp() {
return timestamp;
}
static void *__aligned_malloc(size_t alignment, size_t size) {
// reserve margin for alignment and storing of unaligned address
size_t margin = (alignment-1) + sizeof(void*);
void *unaligned_addr = malloc(size + margin);
void **aligned_addr = (void**)((uintptr_t)(((uint8_t*)unaligned_addr) + margin) & ~(alignment-1));
aligned_addr[-1] = unaligned_addr;
return aligned_addr;
}
static void __aligned_free(void *ptr) {
// retreive the stored unaligned address and use it to free the allocation
void* unaligned_addr = ((void**)ptr)[-1];
free(unaligned_addr);
}
///////////////////////////////////////////////////////////////////////////////
static bool trace_enabled = false;
static uint64_t trace_start_time = TRACE_START_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;
@@ -158,7 +157,7 @@ public:
future_.wait();
}
for (auto& buffer : host_buffers_) {
__aligned_free(buffer.second.data);
aligned_free(buffer.second.data);
}
#ifdef VCD_OUTPUT
trace_->close();
@@ -176,9 +175,13 @@ public:
}
int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
auto alloc = __aligned_malloc(CACHE_BLOCK_SIZE, len);
auto alloc = aligned_malloc(len, CACHE_BLOCK_SIZE);
if (alloc == NULL)
return -1;
// set uninitialized data to "baadf00d"
for (uint32_t i = 0; i < len; ++i) {
((uint8_t*)alloc)[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff;
}
host_buffer_t buffer;
buffer.data = (uint64_t*)alloc;
buffer.size = len;
@@ -193,7 +196,7 @@ public:
void release_buffer(uint64_t wsid) {
auto it = host_buffers_.find(wsid);
if (it != host_buffers_.end()) {
__aligned_free(it->second.data);
aligned_free(it->second.data);
host_buffers_.erase(it);
}
}
@@ -205,6 +208,11 @@ public:
void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
std::lock_guard<std::mutex> guard(mutex_);
// simulate CPU-GPU latency
for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i)
this->tick();
// simulate mmio request
device_->vcp2af_sRxPort_c0_mmioRdValid = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
@@ -217,7 +225,12 @@ public:
void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) {
std::lock_guard<std::mutex> guard(mutex_);
// simulate CPU-GPU latency
for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i)
this->tick();
// simulate mmio request
device_->vcp2af_sRxPort_c0_mmioWrValid = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
@@ -254,7 +267,14 @@ private:
this->eval();
}
device_->reset = 0;
device_->reset = 0;
for (int i = 0; i < RESET_DELAY; ++i) {
device_->clk = 0;
this->eval();
device_->clk = 1;
this->eval();
}
// Turn on assertion after reset
Verilated::assertOn(true);
@@ -289,7 +309,7 @@ private:
#endif
}
void eval() {
void eval() {
device_->eval();
#ifdef VCD_OUTPUT
if (sim_trace_enabled()) {
@@ -396,10 +416,10 @@ private:
// process memory requests
assert(!device_->avs_read[b] || !device_->avs_write[b]);
unsigned byte_addr = device_->avs_address[b] * MEM_BLOCK_SIZE;
unsigned byte_addr = (device_->avs_address[b] * MEMORY_BANKS + b) * MEM_BLOCK_SIZE;
if (device_->avs_write[b]) {
uint64_t byteen = device_->avs_byteenable[b];
uint8_t* data = (uint8_t*)device_->avs_writedata[b].data();
uint8_t* data = (uint8_t*)(device_->avs_writedata[b].data());
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
(*ram_)[byte_addr + i] = data[i];
@@ -419,8 +439,7 @@ private:
0
);
dram_queue_.push(dram_req);
}
} else
if (device_->avs_read[b]) {
auto mem_req = new mem_rd_req_t();
mem_req->addr = device_->avs_address[b];
@@ -491,7 +510,7 @@ private:
std::mutex mutex_;
RAM *ram_;
RAM* ram_;
ramulator::Gem5Wrapper* dram_;
@@ -531,4 +550,4 @@ void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value)
void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
impl_->read_mmio64(mmio_num, offset, value);
}
}

43
sim/opaesim/opae_sim.h Normal file
View File

@@ -0,0 +1,43 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace vortex {
class RAM;
class opae_sim {
public:
opae_sim();
virtual ~opae_sim();
int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
void release_buffer(uint64_t wsid);
void get_io_address(uint64_t wsid, uint64_t *ioaddr);
void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value);
void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value);
private:
class Impl;
Impl* impl_;
};
}

View File

@@ -0,0 +1,8 @@
`verilator_config
lint_off -rule BLKANDNBLK -file "*/fpnew/src/*"
lint_off -rule UNOPTFLAT -file "*/fpnew/src/*"
lint_off -file "*/fpnew/src/*"
lint_off -file "*/afu/opae/ccip/ccip_if_pkg.sv"
lint_off -file "*/afu/opae/local_mem_cfg_pkg.sv"

View File

@@ -1,16 +1,24 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`IGNORE_WARNINGS_BEGIN
`include "vortex_afu.vh"
`IGNORE_WARNINGS_END
/* verilator lint_off IMPORTSTAR */
import ccip_if_pkg::*;
import local_mem_cfg_pkg::*;
/* verilator lint_on IMPORTSTAR */
`include "VX_define.vh"
module vortex_afu_shim (
module vortex_afu_shim import local_mem_cfg_pkg::*; import ccip_if_pkg::*; (
// global signals
input clk,
input reset,
@@ -167,4 +175,4 @@ assign af2cp_sTxPort_c2_hdr_tid = af2cp_sTxPort.c2.hdr.tid;
assign af2cp_sTxPort_c2_mmioRdValid = af2cp_sTxPort.c2.mmioRdValid;
assign af2cp_sTxPort_c2_data = af2cp_sTxPort.c2.data;
endmodule
endmodule

View File

@@ -1,3 +1,4 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
@@ -8,6 +9,7 @@ CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../../../hw -I../../common
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator
@@ -24,12 +26,20 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_TRACE_FLAGS += -DDBG_TRACE_RASTER
DBG_TRACE_FLAGS += -DDBG_TRACE_ROP
DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR
DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) $(TEX_INCLUDE)
RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
FPU_INCLUDE = -I$(RTL_DIR)/fpu
ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
RTL_PKGS += $(THIRD_PARTY_DIR)/fpnew/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/fpnew/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv
FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
@@ -42,14 +52,18 @@ else
TOP = Vortex
endif
VL_FLAGS = --exe --cc $(TOP) --top-module $(TOP)
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO -Wno-EOFNEWLINE
VL_FLAGS = --exe
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DXLEN_$(XLEN)
VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += $(RTL_PKGS)
VL_FLAGS += --cc $(TOP) --top-module $(TOP)
CXXFLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
@@ -59,8 +73,8 @@ VL_FLAGS += -j $(THREADS)
# Debugigng
ifdef DEBUG
VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS)
CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CXXFLAGS += -O2 -DNDEBUG
@@ -72,20 +86,12 @@ ifdef PERF
CXXFLAGS += -DPERF_ENABLE
endif
# ALU backend
VL_FLAGS += -DIMUL_DPI
VL_FLAGS += -DIDIV_DPI
# FPU backend
FPU_CORE ?= FPU_DPI
VL_FLAGS += -D$(FPU_CORE)
PROJECT = rtlsim
all: $(PROJECT)
$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$@
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' -o ../$@
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' -o ../$@

View File

@@ -1,11 +1,24 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <fstream>
#include <iomanip>
#include <unistd.h>
#include <unistd.h>
#include <util.h>
#include <mem.h>
#include <VX_config.h>
#include <VX_types.h>
#include "processor.h"
#define RAM_PAGE_SIZE 4096
@@ -13,11 +26,11 @@
using namespace vortex;
static void show_usage() {
std::cout << "Usage: [-r] [-h: help] programs.." << std::endl;
std::cout << "Usage: [-r: riscv-test] [-h: help] <program>" << std::endl;
}
bool riscv_test = false;
std::vector<const char*> programs;
const char* program = nullptr;
static void parse_args(int argc, char **argv) {
int c;
@@ -35,56 +48,68 @@ static void parse_args(int argc, char **argv) {
show_usage();
exit(-1);
}
}
}
for (int i = optind; i < argc; ++i) {
programs.push_back(argv[i]);
if (optind < argc) {
program = argv[optind];
std::cout << "Running " << program << "..." << std::endl;
} else {
show_usage();
exit(-1);
}
}
int main(int argc, char **argv) {
int exitcode = 0;
bool failed = false;
parse_args(argc, argv);
parse_args(argc, argv);
// create memory module
vortex::RAM ram(RAM_PAGE_SIZE);
// create processor
vortex::Processor processor;
// attach memory module
processor.attach_ram(&ram);
for (auto program : programs) {
std::cout << "Running " << program << "..." << std::endl;
// setup base DCRs
const uint64_t startup_addr(STARTUP_ADDR);
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
#if (XLEN == 64)
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
#endif
processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
// load program
{
std::string program_ext(fileExtension(program));
if (program_ext == "bin") {
ram.loadBinImage(program, STARTUP_ADDR);
ram.loadBinImage(program, startup_addr);
} else if (program_ext == "hex") {
ram.loadHexImage(program);
} else {
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
return -1;
}
exitcode = processor.run();
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed" << std::endl;
} else {
std::cout << "Failed: exitcode=" << exitcode << std::endl;
failed = true;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
failed = true;
}
}
if (failed)
break;
}
return failed ? exitcode : 0;
// run simulation
exitcode = processor.run();
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed" << std::endl;
exitcode = 0;
} else {
std::cout << "Failed" << std::endl;
exitcode = 1;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
}
return exitcode;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "processor.h"
#include <verilated.h>
@@ -56,6 +69,14 @@
#define VERILATOR_RESET_VALUE 2
#endif
#if (XLEN == 32)
typedef uint32_t Word;
#elif (XLEN == 64)
typedef uint64_t Word;
#else
#error unsupported XLEN
#endif
#define VL_WDATA_GETW(lwp, i, n, w) \
VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w)
@@ -71,7 +92,7 @@ double sc_time_stamp() {
static bool trace_enabled = false;
static uint64_t trace_start_time = TRACE_START_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;
bool sim_trace_enabled() {
if (timestamp >= trace_start_time
@@ -126,6 +147,9 @@ public:
// reset the device
this->reset();
// Turn on assertion after reset
Verilated::assertOn(true);
}
~Impl() {
@@ -165,27 +189,46 @@ public:
std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
#endif
// reset device
this->reset();
// start execution
running_ = true;
device_->reset = 0;
// execute program
// wait on device to go busy
while (!device_->busy) {
this->tick();
}
// wait on device to go idle
while (device_->busy) {
if (get_ebreak()) {
exitcode = get_last_wb_value(3);
exitcode = (int)get_last_wb_value(3);
break;
}
this->tick();
}
// reset device
this->reset();
// wait 5 cycles to flush the pipeline
this->wait(5);
this->cout_flush();
return exitcode;
}
void write_dcr(uint32_t addr, uint32_t value) {
device_->dcr_wr_valid = 1;
device_->dcr_wr_addr = addr;
device_->dcr_wr_data = value;
while (device_->dcr_wr_valid) {
this->tick();
}
}
private:
void reset() {
void reset() {
running_ = false;
print_bufs_.clear();
pending_mem_reqs_.clear();
@@ -199,6 +242,8 @@ private:
this->reset_avs_bus();
#endif
this->reset_dcr_bus();
device_->reset = 1;
for (int i = 0; i < RESET_DELAY; ++i) {
@@ -206,14 +251,7 @@ private:
this->eval();
device_->clk = 1;
this->eval();
}
device_->reset = 0;
// Turn on assertion after reset
Verilated::assertOn(true);
this->cout_flush();
}
}
void tick() {
@@ -226,6 +264,7 @@ private:
#else
this->eval_avs_bus(0);
#endif
this->eval_dcr_bus(0);
device_->clk = 1;
this->eval();
@@ -235,6 +274,7 @@ private:
#else
this->eval_avs_bus(1);
#endif
this->eval_dcr_bus(1);
if (MEM_CYCLE_RATIO > 0) {
auto cycle = timestamp / 2;
@@ -260,6 +300,8 @@ private:
#ifdef VCD_OUTPUT
if (sim_trace_enabled()) {
trace_->dump(timestamp);
} else {
exit(-1);
}
#endif
++timestamp;
@@ -268,30 +310,30 @@ private:
#ifdef AXI_BUS
void reset_axi_bus() {
device_->m_axi_wready = 0;
device_->m_axi_awready = 0;
device_->m_axi_arready = 0;
device_->m_axi_rvalid = 0;
device_->m_axi_bvalid = 0;
device_->m_axi_wready[0] = 0;
device_->m_axi_awready[0] = 0;
device_->m_axi_arready[0] = 0;
device_->m_axi_rvalid[0] = 0;
device_->m_axi_bvalid[0] = 0;
}
void eval_axi_bus(bool clk) {
if (!clk) {
mem_rd_rsp_ready_ = device_->m_axi_rready;
mem_wr_rsp_ready_ = device_->m_axi_bready;
mem_rd_rsp_ready_ = device_->m_axi_rready[0];
mem_wr_rsp_ready_ = device_->m_axi_bready[0];
return;
}
if (ram_ == nullptr) {
device_->m_axi_wready = 0;
device_->m_axi_awready = 0;
device_->m_axi_arready = 0;
device_->m_axi_wready[0] = 0;
device_->m_axi_awready[0] = 0;
device_->m_axi_arready[0] = 0;
return;
}
// process memory responses
if (mem_rd_rsp_active_
&& device_->m_axi_rvalid && mem_rd_rsp_ready_) {
&& device_->m_axi_rvalid[0] && mem_rd_rsp_ready_) {
mem_rd_rsp_active_ = false;
}
if (!mem_rd_rsp_active_) {
@@ -299,30 +341,30 @@ private:
&& (*pending_mem_reqs_.begin())->ready
&& !(*pending_mem_reqs_.begin())->write) {
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_req = *mem_rsp_it;
auto mem_rsp = *mem_rsp_it;
/*
printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr);
printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp->addr);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]);
printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");
*/
device_->m_axi_rvalid = 1;
device_->m_axi_rid = mem_req->tag;
device_->m_axi_rresp = 0;
device_->m_axi_rlast = 1;
memcpy((uint8_t*)device_->m_axi_rdata, mem_req->block.data(), MEM_BLOCK_SIZE);
device_->m_axi_rvalid[0] = 1;
device_->m_axi_rid[0] = mem_rsp->tag;
device_->m_axi_rresp[0] = 0;
device_->m_axi_rlast[0] = 1;
memcpy(device_->m_axi_rdata[0].data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
pending_mem_reqs_.erase(mem_rsp_it);
mem_rd_rsp_active_ = true;
delete mem_req;
delete mem_rsp;
} else {
device_->m_axi_rvalid = 0;
device_->m_axi_rvalid[0] = 0;
}
}
// send memory write response
if (mem_wr_rsp_active_
&& device_->m_axi_bvalid && mem_wr_rsp_ready_) {
&& device_->m_axi_bvalid[0] && mem_wr_rsp_ready_) {
mem_wr_rsp_active_ = false;
}
if (!mem_wr_rsp_active_) {
@@ -330,34 +372,34 @@ private:
&& (*pending_mem_reqs_.begin())->ready
&& (*pending_mem_reqs_.begin())->write) {
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_req = *mem_rsp_it;
auto mem_rsp = *mem_rsp_it;
/*
printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_req->addr);
printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_rsp->addr);
*/
device_->m_axi_bvalid = 1;
device_->m_axi_bid = mem_req->tag;
device_->m_axi_bresp = 0;
device_->m_axi_bvalid[0] = 1;
device_->m_axi_bid[0] = mem_rsp->tag;
device_->m_axi_bresp[0] = 0;
pending_mem_reqs_.erase(mem_rsp_it);
mem_wr_rsp_active_ = true;
delete mem_req;
delete mem_rsp;
} else {
device_->m_axi_bvalid = 0;
device_->m_axi_bvalid[0] = 0;
}
}
// select the memory bank
uint32_t req_addr = device_->m_axi_wvalid ? device_->m_axi_awaddr : device_->m_axi_araddr;
uint32_t req_addr = device_->m_axi_wvalid[0] ? device_->m_axi_awaddr[0] : device_->m_axi_araddr[0];
// process memory requests
if (device_->m_axi_wvalid || device_->m_axi_arvalid) {
if (device_->m_axi_wvalid) {
uint64_t byteen = device_->m_axi_wstrb;
unsigned base_addr = device_->m_axi_awaddr;
uint8_t* data = (uint8_t*)(device_->m_axi_wdata);
if ((device_->m_axi_wvalid[0] || device_->m_axi_arvalid[0]) && running_) {
if (device_->m_axi_wvalid[0]) {
uint64_t byteen = device_->m_axi_wstrb[0];
uint64_t base_addr = device_->m_axi_awaddr[0];
uint8_t* data = (uint8_t*)device_->m_axi_wdata[0].data();
// check console output
if (base_addr >= IO_COUT_ADDR
&& base_addr < (IO_COUT_ADDR + IO_COUT_SIZE)) {
if (base_addr >= uint64_t(IO_COUT_ADDR)
&& base_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
auto& ss_buf = print_bufs_[i];
@@ -384,15 +426,15 @@ private:
}
auto mem_req = new mem_req_t();
mem_req->tag = device_->m_axi_awid;
mem_req->addr = device_->m_axi_awaddr;
mem_req->tag = device_->m_axi_awid[0];
mem_req->addr = device_->m_axi_awaddr[0];
mem_req->write = true;
mem_req->ready = true;
pending_mem_reqs_.emplace_back(mem_req);
// send dram request
ramulator::Request dram_req(
device_->m_axi_awaddr,
device_->m_axi_awaddr[0],
ramulator::Request::Type::WRITE,
0
);
@@ -401,18 +443,18 @@ private:
} else {
// process reads
auto mem_req = new mem_req_t();
mem_req->tag = device_->m_axi_arid;
mem_req->addr = device_->m_axi_araddr;
ram_->read(mem_req->block.data(), device_->m_axi_araddr, MEM_BLOCK_SIZE);
mem_req->tag = device_->m_axi_arid[0];
mem_req->addr = device_->m_axi_araddr[0];
ram_->read(mem_req->block.data(), device_->m_axi_araddr[0], MEM_BLOCK_SIZE);
mem_req->write = false;
mem_req->ready = false;
pending_mem_reqs_.emplace_back(mem_req);
// send dram request
ramulator::Request dram_req(
device_->m_axi_araddr,
device_->m_axi_araddr[0],
ramulator::Request::Type::READ,
std::bind([](ramulator::Request& dram_req, mem_req_t* mem_req) {
std::bind([&](ramulator::Request& dram_req, mem_req_t* mem_req) {
mem_req->ready = true;
}, placeholders::_1, mem_req),
0
@@ -421,9 +463,9 @@ private:
}
}
device_->m_axi_wready = 1;
device_->m_axi_awready = 1;
device_->m_axi_arready = 1;
device_->m_axi_wready[0] = running_;
device_->m_axi_awready[0] = running_;
device_->m_axi_arready[0] = running_;
}
#else
@@ -454,35 +496,35 @@ private:
&& (*pending_mem_reqs_.begin())->ready) {
device_->mem_rsp_valid = 1;
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_req = *mem_rsp_it;
auto mem_rsp = *mem_rsp_it;
/*
printf("%0ld: [sim] MEM Rd: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr);
printf("%0ld: [sim] MEM Rd: bank=%d, tag=%0lx, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp->tag, mem_rsp->addr);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]);
printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");
*/
memcpy(device_->mem_rsp_data.data(), mem_req->block.data(), MEM_BLOCK_SIZE);
device_->mem_rsp_tag = mem_req->tag;
memcpy(device_->mem_rsp_data.data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
device_->mem_rsp_tag = mem_rsp->tag;
pending_mem_reqs_.erase(mem_rsp_it);
mem_rd_rsp_active_ = true;
delete mem_req;
delete mem_rsp;
} else {
device_->mem_rsp_valid = 0;
}
}
// process memory requests
if (device_->mem_req_valid) {
uint32_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
if (device_->mem_req_valid && running_) {
uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
if (device_->mem_req_rw) {
// process writes
uint64_t byteen = device_->mem_req_byteen;
uint8_t* data = (uint8_t*)device_->mem_req_data.data();
uint8_t* data = (uint8_t*)(device_->mem_req_data.data());
// check console output
if (byte_addr >= IO_COUT_ADDR
&& byte_addr < (IO_COUT_ADDR + IO_COUT_SIZE)) {
if (byte_addr >= uint64_t(IO_COUT_ADDR)
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
for (int i = 0; i < IO_COUT_SIZE; i++) {
if ((byteen >> i) & 0x1) {
auto& ss_buf = print_bufs_[i];
@@ -496,7 +538,7 @@ private:
}
} else {
/*
printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, byte_addr, byteen);
printf("%0ld: [sim] MEM Wr: tag=%0lx, addr=%0x, byteen=%0lx, data=", timestamp, device_->mem_req_tag, byte_addr, byteen);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
}
@@ -515,7 +557,7 @@ private:
0
);
dram_queue_.push(dram_req);
}
}
} else {
// process reads
auto mem_req = new mem_req_t();
@@ -526,11 +568,13 @@ private:
ram_->read(mem_req->block.data(), byte_addr, MEM_BLOCK_SIZE);
pending_mem_reqs_.emplace_back(mem_req);
//printf("%0ld: [sim] MEM Rd Req: addr=%0x, tag=%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
// send dram request
ramulator::Request dram_req(
byte_addr,
ramulator::Request::Type::READ,
std::bind([](ramulator::Request& dram_req, mem_req_t* mem_req) {
std::bind([&](ramulator::Request& dram_req, mem_req_t* mem_req) {
mem_req->ready = true;
}, placeholders::_1, mem_req),
0
@@ -539,11 +583,24 @@ private:
}
}
device_->mem_req_ready = 1;
device_->mem_req_ready = running_;
}
#endif
void reset_dcr_bus() {
device_->dcr_wr_valid = 0;
}
void eval_dcr_bus(bool clk) {
if (!clk) {
return;
}
if (device_->dcr_wr_valid) {
device_->dcr_wr_valid = 0;
}
}
void wait(uint32_t cycles) {
for (int i = 0; i < cycles; ++i) {
this->tick();
@@ -552,17 +609,17 @@ private:
bool get_ebreak() const {
#ifdef AXI_BUS
return (bool)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
return (bool)device_->Vortex_axi->vortex->sim_ebreak;
#else
return (bool)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
return (bool)device_->Vortex->sim_ebreak;
#endif
}
int get_last_wb_value(int reg) const {
uint64_t get_last_wb_value(int reg) const {
#ifdef AXI_BUS
return (int)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
return ((Word*)device_->Vortex_axi->vortex->sim_wb_value.data())[reg];
#else
return (int)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
return ((Word*)device_->Vortex->sim_wb_value.data())[reg];
#endif
}
@@ -600,6 +657,8 @@ private:
ramulator::Gem5Wrapper* dram_;
std::queue<ramulator::Request> dram_queue_;
bool running_;
};
///////////////////////////////////////////////////////////////////////////////
@@ -618,4 +677,8 @@ void Processor::attach_ram(RAM* mem) {
int Processor::run() {
return impl_->run();
}
void Processor::write_dcr(uint32_t addr, uint32_t value) {
return impl_->write_dcr(addr, value);
}

View File

@@ -1,5 +1,20 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace vortex {
class RAM;
@@ -14,6 +29,8 @@ public:
int run();
void write_dcr(uint32_t addr, uint32_t value);
private:
class Impl;

View File

@@ -1,10 +1,5 @@
`verilator_config
lint_off -rule BLKANDNBLK -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule UNOPTFLAT -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule WIDTH -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule UNUSED -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule LITENDIAN -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule IMPORTSTAR -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule PINCONNECTEMPTY -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule BLKANDNBLK -file "*/fpnew/src/*"
lint_off -rule UNOPTFLAT -file "*/fpnew/src/*"
lint_off -file "*/fpnew/src/*"

View File

@@ -1,45 +1,36 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../hw/rtl
THIRD_PARTY_DIR = ../../third_party
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I. -I../common -I../../hw
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
CXXFLAGS += $(CONFIGS)
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp
OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
VPATH := $(sort $(dir $(SRCS)))
#$(info OBJS is $(OBJS))
#$(info VPATH is $(VPATH))
SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
#CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer
else
CXXFLAGS += -O2 -DNDEBUG
endif
# XLEN parameterization
ifdef XLEN
CXXFLAGS += -DXLEN=$(XLEN)
endif
PROJECT = simx
all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
$(CXX) $(CXXFLAGS) -DSTARTUP_ADDR=0x80000000 $^ $(LDFLAGS) -o $@
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -o $@
@@ -48,4 +39,4 @@ $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
rm -rf $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so

87
sim/simx/arch.h Normal file
View File

@@ -0,0 +1,87 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <sstream>
#include <cstdlib>
#include <stdio.h>
#include "types.h"
namespace vortex {
class Arch {
private:
uint16_t num_threads_;
uint16_t num_warps_;
uint16_t num_cores_;
uint16_t num_clusters_;
uint16_t vsize_;
uint16_t num_regs_;
uint16_t num_csrs_;
uint16_t num_barriers_;
uint16_t ipdom_size_;
public:
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)
: num_threads_(num_threads)
, num_warps_(num_warps)
, num_cores_(num_cores)
, num_clusters_(num_clusters)
, vsize_(16)
, num_regs_(32)
, num_csrs_(4096)
, num_barriers_(NUM_BARRIERS)
, ipdom_size_((num_threads-1) * 2)
{}
uint16_t vsize() const {
return vsize_;
}
uint16_t num_regs() const {
return num_regs_;
}
uint16_t num_csrs() const {
return num_csrs_;
}
uint16_t num_barriers() const {
return num_barriers_;
}
uint16_t ipdom_size() const {
return ipdom_size_;
}
uint16_t num_threads() const {
return num_threads_;
}
uint16_t num_warps() const {
return num_warps_;
}
uint16_t num_cores() const {
return num_cores_;
}
uint16_t num_clusters() const {
return num_clusters_;
}
};
}

View File

@@ -1,70 +0,0 @@
#pragma once
#include <string>
#include <sstream>
#include <cstdlib>
#include <stdio.h>
#include "types.h"
namespace vortex {
class ArchDef {
private:
uint16_t num_cores_;
uint16_t num_warps_;
uint16_t num_threads_;
uint16_t wsize_;
uint16_t vsize_;
uint16_t num_regs_;
uint16_t num_csrs_;
uint16_t num_barriers_;
public:
ArchDef(uint16_t num_cores,
uint16_t num_warps,
uint16_t num_threads)
: num_cores_(num_cores)
, num_warps_(num_warps)
, num_threads_(num_threads)
, wsize_(4)
, vsize_(16)
, num_regs_(32)
, num_csrs_(4096)
, num_barriers_(NUM_BARRIERS)
{}
uint16_t wsize() const {
return wsize_;
}
uint16_t vsize() const {
return vsize_;
}
uint16_t num_regs() const {
return num_regs_;
}
uint16_t num_csrs() const {
return num_csrs_;
}
uint16_t num_barriers() const {
return num_barriers_;
}
uint16_t num_threads() const {
return num_threads_;
}
uint16_t num_warps() const {
return num_warps_;
}
uint16_t num_cores() const {
return num_cores_;
}
};
}

View File

@@ -1,47 +0,0 @@
#include <iostream>
#include <string>
#include "args.h"
using namespace vortex;
using std::string;
std::string CommandLineArg::helpString_;
std::unordered_map<string, CommandLineArg *> CommandLineArg::longArgs_;
std::unordered_map<string, CommandLineArg *> CommandLineArg::shortArgs_;
CommandLineArg::CommandLineArg(string s, string l, const char *helpText) {
helpString_ += helpText;
longArgs_[l] = this;
shortArgs_[s] = this;
}
CommandLineArg::CommandLineArg(string l, const char *helpText) {
helpString_ += helpText;
longArgs_[l] = this;
}
void CommandLineArg::readArgs(int argc, char **argv) {
for (int i = 0; i < argc; i++) {
std::unordered_map<string, CommandLineArg *>::iterator
s = shortArgs_.find(std::string(argv[i])),
l = longArgs_.find(std::string(argv[i]));
if (s != shortArgs_.end()) {
i += s->second->read(argc - i, &argv[i]);
} else if (l != longArgs_.end()) {
i += l->second->read(argc - i, &argv[i]);
} else {
throw BadArg(string(argv[i]));
}
}
}
void CommandLineArg::clearArgs() {
shortArgs_.clear();
longArgs_.clear();
helpString_ = "";
}
void CommandLineArg::showHelp(std::ostream &os) {
os << helpString_;
}

View File

@@ -1,64 +0,0 @@
#pragma once
#include <iostream>
#include <string>
#include <sstream>
#include <unordered_map>
#include <util.h>
namespace vortex {
struct BadArg { BadArg(std::string s) : arg(s) {} std::string arg; };
class CommandLineArg {
public:
CommandLineArg(std::string s, std::string l, const char *helpText);
CommandLineArg(std::string l, const char *helpText);
virtual int read(int argc, char** argv) = 0;
static void readArgs(int argc, char **argv);
static void clearArgs();
static void showHelp(std::ostream &os);
private:
static std::string helpString_;
static std::unordered_map<std::string, CommandLineArg *> longArgs_;
static std::unordered_map<std::string, CommandLineArg *> shortArgs_;
};
template <typename T> class CommandLineArgSetter : public CommandLineArg {
public:
CommandLineArgSetter(std::string s, std::string l, const char *ht, T &x) :
CommandLineArg(s, l, ht), arg_(x) {}
CommandLineArgSetter(std::string l, const char *ht, T &x) :
CommandLineArg(l, ht), arg_(x) {}
int read(int argc, char **argv) {
__unused (argc);
std::istringstream iss(argv[1]);
iss >> arg_;
return 1;
}
private:
T &arg_;
};
class CommandLineArgFlag : public CommandLineArg {
public:
CommandLineArgFlag(std::string s, std::string l, const char *ht, bool &x) :
CommandLineArg(s, l, ht), arg_(x) { arg_ = false; }
CommandLineArgFlag(std::string l, const char *ht, bool &x) :
CommandLineArg(l, ht), arg_(x) { arg_ = false; }
int read(int argc, char **argv) {
__unused (argc, argv);
arg_ = true;
return 0;
}
private:
bool &arg_;
};
}

View File

@@ -1,637 +0,0 @@
#include "cache.h"
#include "debug.h"
#include "types.h"
#include <util.h>
#include <unordered_map>
#include <vector>
#include <list>
#include <queue>
using namespace vortex;
struct params_t {
uint32_t sets_per_bank;
uint32_t blocks_per_set;
uint32_t words_per_block;
uint32_t log2_num_inputs;
uint32_t word_select_addr_start;
uint32_t word_select_addr_end;
uint32_t bank_select_addr_start;
uint32_t bank_select_addr_end;
uint32_t set_select_addr_start;
uint32_t set_select_addr_end;
uint32_t tag_select_addr_start;
uint32_t tag_select_addr_end;
params_t(const Cache::Config& config) {
uint32_t bank_bits = log2ceil(config.num_banks);
uint32_t offset_bits = config.B - config.W;
uint32_t log2_bank_size = config.C - bank_bits;
uint32_t index_bits = log2_bank_size - (config.B << config.A);
assert(log2_bank_size >= config.B);
this->log2_num_inputs = log2ceil(config.num_inputs);
this->words_per_block = 1 << offset_bits;
this->blocks_per_set = 1 << config.A;
this->sets_per_bank = 1 << index_bits;
assert(config.ports_per_bank <= this->words_per_block);
// Word select
this->word_select_addr_start = config.W;
this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
// Bank select
this->bank_select_addr_start = (1+this->word_select_addr_end);
this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
// Set select
this->set_select_addr_start = (1+this->bank_select_addr_end);
this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
// Tag select
this->tag_select_addr_start = (1+this->set_select_addr_end);
this->tag_select_addr_end = (config.addr_width-1);
}
uint32_t addr_bank_id(uint64_t word_addr) const {
if (bank_select_addr_end >= bank_select_addr_start)
return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
else
return 0;
}
uint32_t addr_set_id(uint64_t word_addr) const {
if (set_select_addr_end >= set_select_addr_start)
return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
else
return 0;
}
uint64_t addr_tag(uint64_t word_addr) const {
if (tag_select_addr_end >= tag_select_addr_start)
return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
else
return 0;
}
uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
uint64_t addr(0);
if (bank_select_addr_end >= bank_select_addr_start)
addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
if (set_select_addr_end >= set_select_addr_start)
addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
if (tag_select_addr_end >= tag_select_addr_start)
addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
return addr;
}
};
struct block_t {
bool valid;
bool dirty;
uint64_t tag;
uint32_t lru_ctr;
};
struct set_t {
std::vector<block_t> blocks;
set_t(uint32_t size) : blocks(size) {}
void clear() {
for (auto& block : blocks) {
block.valid = false;
}
}
};
struct bank_req_info_t {
bool valid;
uint32_t req_id;
uint64_t req_tag;
};
struct bank_req_t {
bool valid;
bool write;
bool mshr_replay;
uint64_t tag;
uint32_t set_id;
uint32_t core_id;
uint64_t uuid;
std::vector<bank_req_info_t> infos;
bank_req_t(uint32_t size)
: valid(false)
, write(false)
, mshr_replay(false)
, tag(0)
, set_id(0)
, core_id(0)
, uuid(0)
, infos(size)
{}
};
struct mshr_entry_t : public bank_req_t {
uint32_t block_id;
mshr_entry_t(uint32_t size = 0)
: bank_req_t(size)
, block_id(0)
{}
};
class MSHR {
private:
std::vector<mshr_entry_t> entries_;
uint32_t size_;
public:
MSHR(uint32_t size)
: entries_(size)
, size_(0)
{}
bool empty() const {
return (0 == size_);
}
bool full() const {
return (size_ == entries_.size());
}
int lookup(const bank_req_t& bank_req) {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
if (entry.valid
&& entry.set_id == bank_req.set_id
&& entry.tag == bank_req.tag) {
return i;
}
}
return -1;
}
int allocate(const bank_req_t& bank_req, uint32_t block_id) {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
if (!entry.valid) {
*(bank_req_t*)&entry = bank_req;
entry.valid = true;
entry.mshr_replay = false;
entry.block_id = block_id;
++size_;
return i;
}
}
return -1;
}
mshr_entry_t& replay(uint32_t id) {
auto& root_entry = entries_.at(id);
assert(root_entry.valid);
// make all related mshr entries for replay
for (auto& entry : entries_) {
if (entry.valid
&& entry.set_id == root_entry.set_id
&& entry.tag == root_entry.tag) {
entry.mshr_replay = true;
}
}
return root_entry;
}
bool pop(bank_req_t* out) {
for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) {
*out = entry;
entry.valid = false;
--size_;
return true;
}
}
return false;
}
void clear() {
for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) {
entry.valid = false;
}
}
size_ = 0;
}
};
struct bank_t {
std::vector<set_t> sets;
MSHR mshr;
bank_t(const Cache::Config& config,
const params_t& params)
: sets(params.sets_per_bank, params.blocks_per_set)
, mshr(config.mshr_size)
{}
void clear() {
mshr.clear();
for (auto& set : sets) {
set.clear();
}
}
};
///////////////////////////////////////////////////////////////////////////////
class Cache::Impl {
private:
Cache* const simobject_;
Config config_;
params_t params_;
std::vector<bank_t> banks_;
Switch<MemReq, MemRsp>::Ptr mem_switch_;
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
uint32_t flush_cycles_;
PerfStats perf_stats_;
uint64_t pending_read_reqs_;
uint64_t pending_write_reqs_;
uint64_t pending_fill_reqs_;
public:
Impl(Cache* simobject, const Config& config)
: simobject_(simobject)
, config_(config)
, params_(config)
, banks_(config.num_banks, {config, params_})
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
{
bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspIn);
if (config.num_banks > 1) {
mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
}
mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
} else {
mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
}
// calculate tag flush cycles
flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
}
void reset() {
for (auto& bank : banks_) {
bank.clear();
}
perf_stats_ = PerfStats();
pending_read_reqs_ = 0;
pending_write_reqs_ = 0;
pending_fill_reqs_ = 0;
}
void tick() {
// wait on flush cycles
if (flush_cycles_ != 0) {
--flush_cycles_;
return;
}
// per-bank pipeline request
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
// calculate memory latency
perf_stats_.mem_latency += pending_fill_reqs_;
// handle bypasss responses
auto& bypass_port = bypass_switch_->RspOut.at(1);
if (!bypass_port.empty()) {
auto& mem_rsp = bypass_port.front();
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
bypass_port.pop();
}
// handle MSHR replay
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs.at(bank_id);
bank.mshr.pop(&pipeline_req);
}
// handle memory fills
std::vector<bool> pending_fill_req(config_.num_banks, false);
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
if (!mem_rsp_port.empty()) {
auto& mem_rsp = mem_rsp_port.front();
this->processMemoryFill(bank_id, mem_rsp.tag);
pending_fill_req.at(bank_id) = true;
mem_rsp_port.pop();
}
}
// handle incoming core requests
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
// check cache bypassing
if (core_req.non_cacheable) {
// send IO request
this->processIORequest(core_req, req_id);
// remove request
core_req_port.pop();
continue;
}
auto bank_id = params_.addr_bank_id(core_req.addr);
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
// create bank request
bank_req_t bank_req(config_.ports_per_bank);
bank_req.valid = true;
bank_req.write = core_req.write;
bank_req.mshr_replay = false;
bank_req.tag = tag;
bank_req.set_id = set_id;
bank_req.core_id = core_req.core_id;
bank_req.uuid = core_req.uuid;
bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs.at(bank_id);
// check pending MSHR replay
if (pipeline_req.valid
&& pipeline_req.mshr_replay) {
// stall
continue;
}
// check pending fill request
if (pending_fill_req.at(bank_id)) {
// stall
continue;
}
// check MSHR capacity if read or writeback
if ((!core_req.write || !config_.write_through)
&& bank.mshr.full()) {
++perf_stats_.mshr_stalls;
continue;
}
// check bank conflicts
if (pipeline_req.valid) {
// check port conflict
if (pipeline_req.write != core_req.write
|| pipeline_req.set_id != set_id
|| pipeline_req.tag != tag
|| pipeline_req.infos[port_id].valid) {
++perf_stats_.bank_stalls;
continue;
}
// update pending request infos
pipeline_req.infos[port_id] = bank_req.infos[port_id];
} else {
// schedule new request
pipeline_req = bank_req;
}
if (core_req.write)
++perf_stats_.writes;
else
++perf_stats_.reads;
// remove request
auto time = core_req_port.pop();
perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
}
// process active request
this->processBankRequest(pipeline_reqs);
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
private:
void processIORequest(const MemReq& core_req, uint32_t req_id) {
{
MemReq mem_req(core_req);
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
}
if (core_req.write && config_.write_reponse) {
MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
// update block
auto& bank = banks_.at(bank_id);
auto& entry = bank.mshr.replay(mshr_id);
auto& set = bank.sets.at(entry.set_id);
auto& block = set.blocks.at(entry.block_id);
block.valid = true;
block.tag = entry.tag;
--pending_fill_reqs_;
}
void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& pipeline_req = pipeline_reqs.at(bank_id);
if (!pipeline_req.valid)
continue;
auto& bank = banks_.at(bank_id);
auto& set = bank.sets.at(pipeline_req.set_id);
if (pipeline_req.mshr_replay) {
// send core response
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
} else {
bool hit = false;
bool found_free_block = false;
uint32_t hit_block_id = 0;
uint32_t repl_block_id = 0;
uint32_t max_cnt = 0;
for (uint32_t i = 0, n = set.blocks.size(); i < n; ++i) {
auto& block = set.blocks.at(i);
if (block.valid) {
if (block.tag == pipeline_req.tag) {
block.lru_ctr = 0;
hit_block_id = i;
hit = true;
} else {
++block.lru_ctr;
}
if (max_cnt < block.lru_ctr) {
max_cnt = block.lru_ctr;
repl_block_id = i;
}
} else {
found_free_block = true;
repl_block_id = i;
}
}
if (hit) {
//
// Hit handling
//
if (pipeline_req.write) {
// handle write hit
auto& hit_block = set.blocks.at(hit_block_id);
if (config_.write_through) {
// forward write request to memory
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
} else {
// mark block as dirty
hit_block.dirty = true;
}
}
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
} else {
//
// Miss handling
//
if (pipeline_req.write)
++perf_stats_.write_misses;
else
++perf_stats_.read_misses;
if (!found_free_block && !config_.write_through) {
// write back dirty block
auto& repl_block = set.blocks.at(repl_block_id);
if (repl_block.dirty) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
++perf_stats_.evictions;
}
}
if (pipeline_req.write && config_.write_through) {
// forward write request to memory
{
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
}
// send core response
if (config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
} else {
// MSHR lookup
int pending = bank.mshr.lookup(pipeline_req);
// allocate MSHR
int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
// send fill request
if (pending == -1) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
++pending_fill_reqs_;
}
}
}
}
}
}
};
///////////////////////////////////////////////////////////////////////////////
Cache::Cache(const SimContext& ctx, const char* name, const Config& config)
: SimObject<Cache>(ctx, name)
, CoreReqPorts(config.num_inputs, this)
, CoreRspPorts(config.num_inputs, this)
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, config))
{}
Cache::~Cache() {
delete impl_;
}
void Cache::reset() {
impl_->reset();
}
void Cache::tick() {
impl_->tick();
}
const Cache::PerfStats& Cache::perf_stats() const {
return impl_->perf_stats();
}

106
sim/simx/cache_cluster.h Normal file
View File

@@ -0,0 +1,106 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "cache_sim.h"
namespace vortex {
class CacheCluster : public SimObject<CacheCluster> {
public:
std::vector<std::vector<SimPort<MemReq>>> CoreReqPorts;
std::vector<std::vector<SimPort<MemRsp>>> CoreRspPorts;
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
CacheCluster(const SimContext& ctx,
const char* name,
uint32_t num_units,
uint32_t num_caches,
uint32_t num_requests,
const CacheSim::Config& config)
: SimObject(ctx, name)
, CoreReqPorts(num_units, std::vector<SimPort<MemReq>>(num_requests, this))
, CoreRspPorts(num_units, std::vector<SimPort<MemRsp>>(num_requests, this))
, MemReqPort(this)
, MemRspPort(this)
, caches_(MAX(num_caches, 0x1)) {
CacheSim::Config config2(config);
if (0 == num_caches) {
num_caches = 1;
config2.bypass = true;
}
char sname[100];
std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
for (uint32_t u = 0; u < num_units; ++u) {
snprintf(sname, 100, "%s-unit-arb-%d", name, u);
unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
for (uint32_t i = 0; i < num_requests; ++i) {
this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
}
}
std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
for (uint32_t i = 0; i < config.num_inputs; ++i) {
snprintf(sname, 100, "%s-mem-arb-%d", name, i);
mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
for (uint32_t u = 0; u < num_units; ++u) {
unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
}
}
snprintf(sname, 100, "%s-cache-arb", name);
auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
for (uint32_t i = 0; i < num_caches; ++i) {
snprintf(sname, 100, "%s-cache%d", name, i);
caches_.at(i) = CacheSim::Create(sname, config2);
for (uint32_t j = 0; j < config.num_inputs; ++j) {
mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j));
caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
}
caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i));
cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort);
}
cache_arb->ReqOut.at(0).bind(&this->MemReqPort);
this->MemRspPort.bind(&cache_arb->RspOut.at(0));
}
~CacheCluster() {}
void reset() {}
void tick() {}
CacheSim::PerfStats perf_stats() const {
CacheSim::PerfStats perf;
for (auto cache : caches_) {
perf += cache->perf_stats();
}
return perf;
}
private:
std::vector<CacheSim::Ptr> caches_;
};
}

707
sim/simx/cache_sim.cpp Normal file
View File

@@ -0,0 +1,707 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cache_sim.h"
#include "debug.h"
#include "types.h"
#include <util.h>
#include <unordered_map>
#include <vector>
#include <list>
#include <queue>
using namespace vortex;
struct params_t {
uint32_t sets_per_bank;
uint32_t lines_per_set;
uint32_t words_per_line;
uint32_t log2_num_inputs;
uint32_t word_select_addr_start;
uint32_t word_select_addr_end;
uint32_t bank_select_addr_start;
uint32_t bank_select_addr_end;
uint32_t set_select_addr_start;
uint32_t set_select_addr_end;
uint32_t tag_select_addr_start;
uint32_t tag_select_addr_end;
params_t(const CacheSim::Config& config) {
int32_t bank_bits = log2ceil(config.num_banks);
int32_t offset_bits = config.B - config.W;
int32_t log2_bank_size = config.C - bank_bits;
int32_t index_bits = log2_bank_size - (config.B + config.A);
assert(log2_bank_size > 0);
assert(offset_bits >= 0);
assert(index_bits >= 0);
this->log2_num_inputs = log2ceil(config.num_inputs);
this->words_per_line = 1 << offset_bits;
this->lines_per_set = 1 << config.A;
this->sets_per_bank = 1 << index_bits;
assert(config.ports_per_bank <= this->words_per_line);
// Word select
this->word_select_addr_start = config.W;
this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
// Bank select
this->bank_select_addr_start = (1+this->word_select_addr_end);
this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
// Set select
this->set_select_addr_start = (1+this->bank_select_addr_end);
this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
// Tag select
this->tag_select_addr_start = (1+this->set_select_addr_end);
this->tag_select_addr_end = (config.addr_width-1);
}
uint32_t addr_bank_id(uint64_t word_addr) const {
if (bank_select_addr_end >= bank_select_addr_start)
return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
else
return 0;
}
uint32_t addr_set_id(uint64_t word_addr) const {
if (set_select_addr_end >= set_select_addr_start)
return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
else
return 0;
}
uint64_t addr_tag(uint64_t word_addr) const {
if (tag_select_addr_end >= tag_select_addr_start)
return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
else
return 0;
}
uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
uint64_t addr(0);
if (bank_select_addr_end >= bank_select_addr_start)
addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
if (set_select_addr_end >= set_select_addr_start)
addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
if (tag_select_addr_end >= tag_select_addr_start)
addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
return addr;
}
};
struct line_t {
uint64_t tag;
uint32_t lru_ctr;
bool valid;
bool dirty;
void clear() {
valid = false;
dirty = false;
}
};
struct set_t {
std::vector<line_t> lines;
set_t(uint32_t num_ways)
: lines(num_ways)
{}
void clear() {
for (auto& line : lines) {
line.clear();
}
}
};
struct bank_req_port_t {
uint32_t req_id;
uint64_t req_tag;
bool valid;
void clear() {
valid = false;
}
};
struct bank_req_t {
enum ReqType {
None = 0,
Fill = 1,
Replay = 2,
Core = 3
};
std::vector<bank_req_port_t> ports;
uint64_t tag;
uint32_t set_id;
uint32_t cid;
uint64_t uuid;
ReqType type;
bool write;
bank_req_t(uint32_t num_ports)
: ports(num_ports)
{}
void clear() {
for (auto& port : ports) {
port.clear();
}
type = ReqType::None;
}
};
struct mshr_entry_t {
bank_req_t bank_req;
uint32_t line_id;
mshr_entry_t(uint32_t num_ports)
: bank_req(num_ports)
{}
void clear() {
bank_req.clear();
}
};
class MSHR {
private:
std::vector<mshr_entry_t> entries_;
uint32_t size_;
public:
MSHR(uint32_t size, uint32_t num_ports)
: entries_(size, num_ports)
, size_(0)
{}
bool empty() const {
return (0 == size_);
}
bool full() const {
return (size_ == entries_.size());
}
bool lookup(const bank_req_t& bank_req) {
for (auto& entry : entries_) {;
if (entry.bank_req.type != bank_req_t::None
&& entry.bank_req.set_id == bank_req.set_id
&& entry.bank_req.tag == bank_req.tag) {
return true;
}
}
return false;
}
int allocate(const bank_req_t& bank_req, uint32_t line_id) {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
if (entry.bank_req.type == bank_req_t::None) {
entry.bank_req = bank_req;
entry.line_id = line_id;
++size_;
return i;
}
}
return -1;
}
mshr_entry_t& replay(uint32_t id) {
auto& root_entry = entries_.at(id);
assert(root_entry.bank_req.type == bank_req_t::Core);
// mark all related mshr entries for replay
for (auto& entry : entries_) {
if (entry.bank_req.type == bank_req_t::Core
&& entry.bank_req.set_id == root_entry.bank_req.set_id
&& entry.bank_req.tag == root_entry.bank_req.tag) {
entry.bank_req.type = bank_req_t::Replay;
}
}
return root_entry;
}
bool pop(bank_req_t* out) {
for (auto& entry : entries_) {
if (entry.bank_req.type == bank_req_t::Replay) {
*out = entry.bank_req;
entry.bank_req.type = bank_req_t::None;
--size_;
return true;
}
}
return false;
}
void clear() {
for (auto& entry : entries_) {
entry.clear();
}
size_ = 0;
}
};
struct bank_t {
std::vector<set_t> sets;
MSHR mshr;
bank_t(const CacheSim::Config& config,
const params_t& params)
: sets(params.sets_per_bank, params.lines_per_set)
, mshr(config.mshr_size, config.ports_per_bank)
{}
void clear() {
for (auto& set : sets) {
set.clear();
}
mshr.clear();
}
};
///////////////////////////////////////////////////////////////////////////////
class CacheSim::Impl {
private:
CacheSim* const simobject_;
Config config_;
params_t params_;
std::vector<bank_t> banks_;
Switch<MemReq, MemRsp>::Ptr bank_switch_;
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
std::vector<bank_req_t> pipeline_reqs_;
uint32_t init_cycles_;
PerfStats perf_stats_;
uint64_t pending_read_reqs_;
uint64_t pending_write_reqs_;
uint64_t pending_fill_reqs_;
public:
Impl(CacheSim* simobject, const Config& config)
: simobject_(simobject)
, config_(config)
, params_(config)
, banks_(config.num_banks, {config, params_})
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
, pipeline_reqs_(config.num_banks, config.ports_per_bank)
{
char sname[100];
snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());
if (config_.bypass) {
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
for (uint32_t i = 0; i < config_.num_inputs; ++i) {
simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
}
bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
return;
}
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
if (config.num_banks > 1) {
snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
}
bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0));
} else {
mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0));
}
// calculate cache initialization cycles
init_cycles_ = params_.sets_per_bank * params_.lines_per_set;
}
void reset() {
if (config_.bypass)
return;
for (auto& bank : banks_) {
bank.clear();
}
perf_stats_ = PerfStats();
pending_read_reqs_ = 0;
pending_write_reqs_ = 0;
pending_fill_reqs_ = 0;
}
void tick() {
if (config_.bypass)
return;
// wait on cache initialization cycles
if (init_cycles_ != 0) {
--init_cycles_;
return;
}
// handle cache bypasss responses
{
auto& bypass_port = bypass_switch_->RspIn.at(1);
if (!bypass_port.empty()) {
auto& mem_rsp = bypass_port.front();
this->processBypassResponse(mem_rsp);
bypass_port.pop();
}
}
// initialize pipeline request
for (auto& pipeline_req : pipeline_reqs_) {
pipeline_req.clear();
}
// schedule MSHR replay
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
bank.mshr.pop(&pipeline_req);
}
// schedule memory fill
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
if (mem_rsp_port.empty())
continue;
auto& pipeline_req = pipeline_reqs_.at(bank_id);
if (pipeline_req.type != bank_req_t::None)
continue;
auto& mem_rsp = mem_rsp_port.front();
DT(3, simobject_->name() << "-dram-" << mem_rsp);
pipeline_req.type = bank_req_t::Fill;
pipeline_req.tag = mem_rsp.tag;
mem_rsp_port.pop();
}
// schedule core requests
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
// check cache bypassing
if (core_req.type == AddrType::IO) {
// send bypass request
this->processBypassRequest(core_req, req_id);
// remove request
core_req_port.pop();
continue;
}
auto bank_id = params_.addr_bank_id(core_req.addr);
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
// check MSHR capacity
if ((!core_req.write || !config_.write_through)
&& bank.mshr.full()) {
++perf_stats_.mshr_stalls;
++perf_stats_.bank_stalls;
continue;
}
// check bank conflicts
if (pipeline_req.type == bank_req_t::Core) {
// check port conflict
if (pipeline_req.write != core_req.write
|| pipeline_req.set_id != set_id
|| pipeline_req.tag != tag
|| pipeline_req.ports.at(port_id).valid) {
++perf_stats_.bank_stalls;
continue;
}
// extend request ports
pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
} else if (pipeline_req.type == bank_req_t::None) {
// schedule new request
bank_req_t bank_req(config_.ports_per_bank);
bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
bank_req.tag = tag;
bank_req.set_id = set_id;
bank_req.cid = core_req.cid;
bank_req.uuid = core_req.uuid;
bank_req.type = bank_req_t::Core;
bank_req.write = core_req.write;
pipeline_req = bank_req;
} else {
// bank in use
++perf_stats_.bank_stalls;
continue;
}
if (core_req.write)
++perf_stats_.writes;
else
++perf_stats_.reads;
// remove request
DT(3, simobject_->name() << "-core-" << core_req);
auto time = core_req_port.pop();
perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
}
// process active request
this->processBankRequests();
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
private:
void processBypassResponse(const MemRsp& mem_rsp) {
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
MemRsp core_rsp{tag, mem_rsp.cid, mem_rsp.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
void processBypassRequest(const MemReq& core_req, uint32_t req_id) {
DT(3, simobject_->name() << "-core-" << core_req);
{
MemReq mem_req(core_req);
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
}
if (core_req.write && config_.write_reponse) {
MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
void processBankRequests() {
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto pipeline_req = pipeline_reqs_.at(bank_id);
switch (pipeline_req.type) {
case bank_req_t::None:
break;
case bank_req_t::Fill: {
// update cache line
auto& bank = banks_.at(bank_id);
auto& entry = bank.mshr.replay(pipeline_req.tag);
auto& set = bank.sets.at(entry.bank_req.set_id);
auto& line = set.lines.at(entry.line_id);
line.valid = true;
line.tag = entry.bank_req.tag;
--pending_fill_reqs_;
} break;
case bank_req_t::Replay: {
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.ports) {
if (!info.valid)
continue;
MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} break;
case bank_req_t::Core: {
bool hit = false;
bool found_free_line = false;
uint32_t hit_line_id = 0;
uint32_t repl_line_id = 0;
uint32_t max_cnt = 0;
auto& set = bank.sets.at(pipeline_req.set_id);
// tag lookup
for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
auto& line = set.lines.at(i);
if (line.valid) {
if (line.tag == pipeline_req.tag) {
line.lru_ctr = 0;
hit_line_id = i;
hit = true;
} else {
++line.lru_ctr;
}
if (max_cnt < line.lru_ctr) {
max_cnt = line.lru_ctr;
repl_line_id = i;
}
} else {
found_free_line = true;
repl_line_id = i;
}
}
if (hit) {
//
// Hit handling
//
if (pipeline_req.write) {
// handle write hit
auto& hit_line = set.lines.at(hit_line_id);
if (config_.write_through) {
// forward write request to memory
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
} else {
// mark line as dirty
hit_line.dirty = true;
}
}
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.ports) {
if (!info.valid)
continue;
MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} else {
//
// Miss handling
//
if (pipeline_req.write)
++perf_stats_.write_misses;
else
++perf_stats_.read_misses;
if (!found_free_line && !config_.write_through) {
// write back dirty line
auto& repl_line = set.lines.at(repl_line_id);
if (repl_line.dirty) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
++perf_stats_.evictions;
}
}
if (pipeline_req.write && config_.write_through) {
// forward write request to memory
{
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
}
// send core response
if (config_.write_reponse) {
for (auto& info : pipeline_req.ports) {
if (!info.valid)
continue;
MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} else {
// MSHR lookup
auto mshr_pending = bank.mshr.lookup(pipeline_req);
// allocate MSHR
auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
// send fill request
if (!mshr_pending) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
++pending_fill_reqs_;
}
}
}
} break;
}
}
// calculate memory latency
perf_stats_.mem_latency += pending_fill_reqs_;
}
};
///////////////////////////////////////////////////////////////////////////////
CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config)
: SimObject<CacheSim>(ctx, name)
, CoreReqPorts(config.num_inputs, this)
, CoreRspPorts(config.num_inputs, this)
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, config))
{}
CacheSim::~CacheSim() {
delete impl_;
}
void CacheSim::reset() {
impl_->reset();
}
void CacheSim::tick() {
impl_->tick();
}
const CacheSim::PerfStats& CacheSim::perf_stats() const {
return impl_->perf_stats();
}

View File

@@ -1,13 +1,27 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "memsim.h"
#include "mem_sim.h"
namespace vortex {
class Cache : public SimObject<Cache> {
class CacheSim : public SimObject<CacheSim> {
public:
struct Config {
bool bypass; // cache bypass
uint8_t C; // log2 cache size
uint8_t B; // log2 block size
uint8_t W; // log2 word size
@@ -45,6 +59,19 @@ public:
, mshr_stalls(0)
, mem_latency(0)
{}
PerfStats& operator+=(const PerfStats& rhs) {
this->reads += rhs.reads;
this->writes += rhs.writes;
this->read_misses += rhs.read_misses;
this->write_misses += rhs.write_misses;
this->evictions += rhs.evictions;
this->pipeline_stalls += rhs.pipeline_stalls;
this->bank_stalls += rhs.bank_stalls;
this->mshr_stalls += rhs.mshr_stalls;
this->mem_latency += rhs.mem_latency;
return *this;
}
};
std::vector<SimPort<MemReq>> CoreReqPorts;
@@ -52,8 +79,8 @@ public:
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
Cache(const SimContext& ctx, const char* name, const Config& config);
~Cache();
CacheSim(const SimContext& ctx, const char* name, const Config& config);
~CacheSim();
void reset();

222
sim/simx/cluster.cpp Normal file
View File

@@ -0,0 +1,222 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cluster.h"
using namespace vortex;
Cluster::Cluster(const SimContext& ctx,
uint32_t cluster_id,
ProcessorImpl* processor,
const Arch &arch, const
DCRS &dcrs)
: SimObject(ctx, "cluster")
, mem_req_port(this)
, mem_rsp_port(this)
, cluster_id_(cluster_id)
, cores_(arch.num_cores())
, barriers_(arch.num_barriers(), 0)
, sharedmems_(arch.num_cores())
, processor_(processor)
{
auto num_cores = arch.num_cores();
char sname[100];
snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
!L2_ENABLED,
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(L2_NUM_WAYS), // W
0, // A
XLEN, // address bits
L2_NUM_BANKS, // number of banks
1, // number of ports
5, // request size
true, // write-through
false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
l2cache_->MemReqPort.bind(&this->mem_req_port);
this->mem_rsp_port.bind(&l2cache_->MemRspPort);
snprintf(sname, 100, "cluster%d-icaches", cluster_id);
icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
!ICACHE_ENABLED,
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(sizeof(uint32_t)), // W
log2ceil(ICACHE_NUM_WAYS),// A
XLEN, // address bits
1, // number of banks
1, // number of ports
1, // number of inputs
true, // write-through
false, // write response
0, // victim size
(uint8_t)arch.num_warps(), // mshr
2, // pipeline latency
});
icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
!DCACHE_ENABLED,
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(sizeof(Word)), // W
log2ceil(DCACHE_NUM_WAYS),// A
XLEN, // address bits
DCACHE_NUM_BANKS, // number of banks
1, // number of ports
DCACHE_NUM_BANKS, // number of inputs
true, // write-through
false, // write response
0, // victim size
DCACHE_MSHR_SIZE, // mshr
4, // pipeline latency
});
dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
///////////////////////////////////////////////////////////////////////////
// create shared memory blocks
for (uint32_t i = 0; i < num_cores; ++i) {
snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
false
});
}
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
uint32_t core_id = cluster_id * num_cores + i;
cores_.at(i) = Core::Create(core_id,
this,
arch,
dcrs,
sharedmems_.at(i));
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
auto smem_demux = SMemDemux::Create(sname);
cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
}
}
}
Cluster::~Cluster() {
//--
}
void Cluster::reset() {
for (auto& barrier : barriers_) {
barrier.reset();
}
}
void Cluster::tick() {
//--
}
void Cluster::attach_ram(RAM* ram) {
for (auto core : cores_) {
core->attach_ram(ram);
}
}
bool Cluster::running() const {
for (auto& core : cores_) {
if (core->running())
return true;
}
return false;
}
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
bool done = true;
Word exitcode_ = 0;
for (auto& core : cores_) {
Word ec;
if (core->check_exit(&ec, riscv_test)) {
exitcode_ |= ec;
} else {
done = false;
}
}
*exitcode = exitcode_;
return done;
}
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
auto& barrier = barriers_.at(bar_id);
uint32_t local_core_id = core_id % cores_.size();
barrier.set(local_core_id);
DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
if (barrier.count() == (size_t)count) {
// resume all suspended cores
for (uint32_t i = 0; i < cores_.size(); ++i) {
if (barrier.test(i)) {
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
cores_.at(i)->resume();
}
}
barrier.reset();
}
}
ProcessorImpl* Cluster::processor() const {
return processor_;
}
Cluster::PerfStats Cluster::perf_stats() const {
Cluster::PerfStats perf;
perf.icache = icaches_->perf_stats();
perf.dcache = dcaches_->perf_stats();
perf.tcache = tcaches_->perf_stats();
perf.ocache = ocaches_->perf_stats();
perf.rcache = rcaches_->perf_stats();
perf.l2cache = l2cache_->perf_stats();
for (auto sharedmem : sharedmems_) {
perf.sharedmem += sharedmem->perf_stats();
}
return perf;
}

92
sim/simx/cluster.h Normal file
View File

@@ -0,0 +1,92 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "dcrs.h"
#include "arch.h"
#include "cache_cluster.h"
#include "shared_mem.h"
#include "core.h"
#include "constants.h"
namespace vortex {
class ProcessorImpl;
class Cluster : public SimObject<Cluster> {
public:
struct PerfStats {
CacheSim::PerfStats icache;
CacheSim::PerfStats dcache;
SharedMem::PerfStats sharedmem;
CacheSim::PerfStats l2cache;
CacheSim::PerfStats tcache;
CacheSim::PerfStats ocache;
CacheSim::PerfStats rcache;
PerfStats& operator+=(const PerfStats& rhs) {
this->icache += rhs.icache;
this->dcache += rhs.dcache;
this->sharedmem += rhs.sharedmem;
this->l2cache += rhs.l2cache;
this->tcache += rhs.tcache;
this->ocache += rhs.ocache;
this->rcache += rhs.rcache;
return *this;
}
};
SimPort<MemReq> mem_req_port;
SimPort<MemRsp> mem_rsp_port;
Cluster(const SimContext& ctx,
uint32_t cluster_id,
ProcessorImpl* processor,
const Arch &arch,
const DCRS &dcrs);
~Cluster();
void reset();
void tick();
void attach_ram(RAM* ram);
bool running() const;
bool check_exit(Word* exitcode, bool riscv_test) const;
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
ProcessorImpl* processor() const;
Cluster::PerfStats perf_stats() const;
private:
uint32_t cluster_id_;
std::vector<Core::Ptr> cores_;
std::vector<CoreMask> barriers_;
CacheSim::Ptr l2cache_;
CacheCluster::Ptr icaches_;
CacheCluster::Ptr dcaches_;
std::vector<SharedMem::Ptr> sharedmems_;
CacheCluster::Ptr tcaches_;
CacheCluster::Ptr ocaches_;
CacheCluster::Ptr rcaches_;
ProcessorImpl* processor_;
};
} // namespace vortex

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef RAM_PAGE_SIZE
@@ -10,14 +23,4 @@
#ifndef MEMORY_BANKS
#define MEMORY_BANKS 2
#endif
namespace vortex {
enum Constants {
SMEM_BANK_OFFSET = log2ceil(sizeof(uint32_t)) + log2ceil(STACK_SIZE / sizeof(uint32_t)),
};
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
@@ -11,101 +24,104 @@
#include <simobject.h>
#include "debug.h"
#include "types.h"
#include "archdef.h"
#include "arch.h"
#include "decode.h"
#include "mem.h"
#include "warp.h"
#include "pipeline.h"
#include "cache.h"
#include "sharedmem.h"
#include "cache_sim.h"
#include "shared_mem.h"
#include "ibuffer.h"
#include "scoreboard.h"
#include "exeunit.h"
#include "tex_unit.h"
#include "operand.h"
#include "dispatcher.h"
#include "exe_unit.h"
#include "dcrs.h"
namespace vortex {
class Cluster;
class Core : public SimObject<Core> {
public:
struct PerfStats {
uint64_t cycles;
uint64_t instrs;
uint64_t ibuf_stalls;
uint64_t scrb_stalls;
uint64_t alu_stalls;
uint64_t lsu_stalls;
uint64_t csr_stalls;
uint64_t fpu_stalls;
uint64_t gpu_stalls;
uint64_t sfu_stalls;
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
uint64_t branches;
uint64_t mem_reads;
uint64_t mem_writes;
uint64_t mem_latency;
uint64_t tex_reads;
uint64_t tex_latency;
uint64_t ifetch_latency;
uint64_t load_latency;
PerfStats()
: instrs(0)
: cycles(0)
, instrs(0)
, ibuf_stalls(0)
, scrb_stalls(0)
, alu_stalls(0)
, lsu_stalls(0)
, csr_stalls(0)
, fpu_stalls(0)
, gpu_stalls(0)
, sfu_stalls(0)
, ifetches(0)
, loads(0)
, stores(0)
, branches(0)
, mem_reads(0)
, mem_writes(0)
, mem_latency(0)
, tex_reads(0)
, tex_latency(0)
, ifetch_latency(0)
, load_latency(0)
{}
};
SimPort<MemRsp> MemRspPort;
SimPort<MemReq> MemReqPort;
std::vector<SimPort<MemReq>> icache_req_ports;
std::vector<SimPort<MemRsp>> icache_rsp_ports;
std::vector<SimPort<MemReq>> dcache_req_ports;
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
Core(const SimContext& ctx,
uint32_t core_id,
Cluster* cluster,
const Arch &arch,
const DCRS &dcrs,
SharedMem::Ptr sharedmem);
Core(const SimContext& ctx, const ArchDef &arch, uint32_t id);
~Core();
void attach_ram(RAM* ram);
bool running() const;
void reset();
void tick();
void attach_ram(RAM* ram);
bool running() const;
void resume();
uint32_t id() const {
return id_;
return core_id_;
}
const Decoder& decoder() {
return decoder_;
}
const ArchDef& arch() const {
const Arch& arch() const {
return arch_;
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
uint32_t getIRegValue(int reg) const {
return warps_.at(0)->getIRegValue(reg);
const DCRS& dcrs() const {
return dcrs_;
}
uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);
void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);
WarpMask wspawn(uint32_t num_warps, uint32_t nextPC);
void wspawn(uint32_t num_warps, Word nextPC);
WarpMask barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
void barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
AddrType get_addr_type(uint64_t addr);
void icache_read(void* data, uint64_t addr, uint32_t size);
@@ -113,19 +129,22 @@ public:
void dcache_write(const void* data, uint64_t addr, uint32_t size);
uint32_t tex_read(uint32_t unit, uint32_t lod, uint32_t u, uint32_t v, std::vector<mem_addr_size_t>* mem_addrs);
void dcache_amo_reserve(uint64_t addr);
bool dcache_amo_check(uint64_t addr);
void trigger_ecall();
void trigger_ebreak();
bool check_exit() const;
bool check_exit(Word* exitcode, bool riscv_test) const;
private:
void schedule();
void fetch();
void decode();
void issue();
void execute();
void commit();
@@ -133,49 +152,55 @@ private:
void cout_flush();
uint32_t id_;
const ArchDef arch_;
uint32_t core_id_;
const Arch& arch_;
const DCRS &dcrs_;
const Decoder decoder_;
MemoryUnit mmu_;
RAM smem_;
std::vector<TexUnit> tex_units_;
std::vector<std::shared_ptr<Warp>> warps_;
std::vector<WarpMask> barriers_;
std::vector<uint32_t> csrs_;
std::vector<WarpMask> barriers_;
std::vector<Byte> fcsrs_;
std::vector<IBuffer> ibuffers_;
Scoreboard scoreboard_;
std::vector<Operand::Ptr> operands_;
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<ExeUnit::Ptr> exe_units_;
Cache::Ptr icache_;
Cache::Ptr dcache_;
SharedMem::Ptr shared_mem_;
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
SharedMem::Ptr sharedmem_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;
HashTable<pipeline_trace_t*> pending_icache_;
std::vector<pipeline_trace_t*> committed_traces_;
WarpMask active_warps_;
WarpMask stalled_warps_;
uint32_t last_schedule_wid_;
uint64_t issued_instrs_;
uint64_t committed_instrs_;
uint32_t csr_tex_unit_;
bool ecall_;
bool ebreak_;
bool exited_;
uint64_t pending_ifetches_;
std::unordered_map<int, std::stringstream> print_bufs_;
std::vector<std::vector<CSRs>> csrs_;
PerfStats perf_stats_;
uint64_t perf_mem_pending_reads_;
Cluster* cluster_;
uint32_t commit_exe_;
friend class Warp;
friend class LsuUnit;
friend class AluUnit;
friend class CsrUnit;
friend class FpuUnit;
friend class GpuUnit;
friend class SfuUnit;
friend class TexUnit;
friend class RasterAgent;
friend class RopAgent;
friend class TexAgent;
};
} // namespace vortex
} // namespace vortex

28
sim/simx/dcrs.cpp Normal file
View File

@@ -0,0 +1,28 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dcrs.h"
#include <iostream>
using namespace vortex;
void DCRS::write(uint32_t addr, uint32_t value) {
if (addr >= VX_DCR_BASE_STATE_BEGIN
&& addr < VX_DCR_BASE_STATE_END) {
base_dcrs.write(addr, value);
return;
}
std::cout << std::hex << "Error: invalid global DCR addr=0x" << addr << std::endl;
std::abort();
}

45
sim/simx/dcrs.h Normal file
View File

@@ -0,0 +1,45 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <util.h>
#include <VX_types.h>
#include <array>
namespace vortex {
class BaseDCRS {
public:
uint32_t read(uint32_t addr) const {
uint32_t state = VX_DCR_BASE_STATE(addr);
return states_.at(state);
}
void write(uint32_t addr, uint32_t value) {
uint32_t state = VX_DCR_BASE_STATE(addr);
states_.at(state) = value;
}
private:
std::array<uint32_t, VX_DCR_BASE_STATE_COUNT> states_;
};
class DCRS {
public:
void write(uint32_t addr, uint32_t value);
BaseDCRS base_dcrs;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef DEBUG_LEVEL

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <string>
#include <stdlib.h>
@@ -9,41 +22,36 @@
#include "debug.h"
#include "types.h"
#include "decode.h"
#include "archdef.h"
#include "arch.h"
#include "instr.h"
using namespace vortex;
struct InstTableEntry_t {
bool controlFlow;
InstType iType;
};
static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable = {
{Opcode::NOP, {false, InstType::N_TYPE}},
{Opcode::R_INST, {false, InstType::R_TYPE}},
{Opcode::L_INST, {false, InstType::I_TYPE}},
{Opcode::I_INST, {false, InstType::I_TYPE}},
{Opcode::S_INST, {false, InstType::S_TYPE}},
{Opcode::B_INST, {true , InstType::B_TYPE}},
{Opcode::LUI_INST, {false, InstType::U_TYPE}},
{Opcode::AUIPC_INST, {false, InstType::U_TYPE}},
{Opcode::JAL_INST, {true , InstType::J_TYPE}},
{Opcode::JALR_INST, {true , InstType::I_TYPE}},
{Opcode::SYS_INST, {true , InstType::I_TYPE}},
{Opcode::FENCE, {true , InstType::I_TYPE}},
{Opcode::FL, {false, InstType::I_TYPE}},
{Opcode::FS, {false, InstType::S_TYPE}},
{Opcode::FCI, {false, InstType::R_TYPE}},
{Opcode::FMADD, {false, InstType::R4_TYPE}},
{Opcode::FMSUB, {false, InstType::R4_TYPE}},
{Opcode::FMNMADD, {false, InstType::R4_TYPE}},
{Opcode::FMNMSUB, {false, InstType::R4_TYPE}},
{Opcode::VSET, {false, InstType::V_TYPE}},
{Opcode::GPGPU, {false, InstType::R_TYPE}},
{Opcode::GPU, {false, InstType::R4_TYPE}},
{Opcode::R_INST_W, {false, InstType::R_TYPE}},
{Opcode::I_INST_W, {false, InstType::I_TYPE}},
static const std::unordered_map<Opcode, InstType> sc_instTable = {
{Opcode::R_INST, InstType::R_TYPE},
{Opcode::L_INST, InstType::I_TYPE},
{Opcode::I_INST, InstType::I_TYPE},
{Opcode::S_INST, InstType::S_TYPE},
{Opcode::B_INST, InstType::B_TYPE},
{Opcode::LUI_INST, InstType::U_TYPE},
{Opcode::AUIPC_INST, InstType::U_TYPE},
{Opcode::JAL_INST, InstType::J_TYPE},
{Opcode::JALR_INST, InstType::I_TYPE},
{Opcode::SYS_INST, InstType::I_TYPE},
{Opcode::FENCE, InstType::I_TYPE},
{Opcode::AMO, InstType::R_TYPE},
{Opcode::FL, InstType::I_TYPE},
{Opcode::FS, InstType::S_TYPE},
{Opcode::FCI, InstType::R_TYPE},
{Opcode::FMADD, InstType::R4_TYPE},
{Opcode::FMSUB, InstType::R4_TYPE},
{Opcode::FMNMADD, InstType::R4_TYPE},
{Opcode::FMNMSUB, InstType::R4_TYPE},
{Opcode::VSET, InstType::V_TYPE},
{Opcode::EXT1, InstType::R_TYPE},
{Opcode::EXT2, InstType::R4_TYPE},
{Opcode::R_INST_W, InstType::R_TYPE},
{Opcode::I_INST_W, InstType::I_TYPE},
};
enum Constants {
@@ -58,6 +66,8 @@ enum Constants {
width_i_imm = 12,
width_j_imm = 20,
width_v_imm = 11,
width_aq = 1,
width_rl = 1,
shift_opcode= 0,
shift_rd = width_opcode,
@@ -72,15 +82,15 @@ enum Constants {
shift_func6 = shift_func7 + width_vmask,
shift_vset = shift_func7 + width_func6,
mask_opcode = (1<<width_opcode)-1,
mask_reg = (1<<width_reg)-1,
mask_func2 = (1<<width_func2)-1,
mask_func3 = (1<<width_func3)-1,
mask_func6 = (1<<width_func6)-1,
mask_func7 = (1<<width_func7)-1,
mask_i_imm = (1<<width_i_imm)-1,
mask_j_imm = (1<<width_j_imm)-1,
mask_v_imm = (1<<width_v_imm)-1,
mask_opcode = (1 << width_opcode) - 1,
mask_reg = (1 << width_reg) - 1,
mask_func2 = (1 << width_func2) - 1,
mask_func3 = (1 << width_func3) - 1,
mask_func6 = (1 << width_func6) - 1,
mask_func7 = (1 << width_func7) - 1,
mask_i_imm = (1 << width_i_imm) - 1,
mask_j_imm = (1 << width_j_imm) - 1,
mask_v_imm = (1 << width_v_imm) - 1,
};
static const char* op_string(const Instr &instr) {
@@ -92,7 +102,6 @@ static const char* op_string(const Instr &instr) {
auto imm = instr.getImm();
switch (opcode) {
case Opcode::NOP: return "NOP";
case Opcode::LUI_INST: return "LUI";
case Opcode::AUIPC_INST: return "AUIPC";
case Opcode::R_INST:
@@ -116,7 +125,7 @@ static const char* op_string(const Instr &instr) {
case 2: return "SLT";
case 3: return "SLTU";
case 4: return "XOR";
case 5: return func7 ? "SRA" : "SRL";
case 5: return (func7 & 0x20) ? "SRA" : "SRL";
case 6: return "OR";
case 7: return "AND";
default:
@@ -130,7 +139,7 @@ static const char* op_string(const Instr &instr) {
case 2: return "SLTI";
case 3: return "SLTIU";
case 4: return "XORI";
case 5: return func7 ? "SRAI" : "SRLI";
case 5: return (func7 & 0x20) ? "SRAI" : "SRLI";
case 6: return "ORI";
case 7: return "ANDI";
default:
@@ -151,8 +160,8 @@ static const char* op_string(const Instr &instr) {
case Opcode::JALR_INST: return "JALR";
case Opcode::L_INST:
switch (func3) {
case 0: return "LBI";
case 1: return "LHI";
case 0: return "LB";
case 1: return "LH";
case 2: return "LW";
case 3: return "LD";
case 4: return "LBU";
@@ -192,11 +201,11 @@ static const char* op_string(const Instr &instr) {
}
case Opcode::I_INST_W:
switch (func3) {
case 0: return "ADDIW";
case 1: return "SLLIW";
case 5: return func7 ? "SRAIW" : "SRLIW";
default:
std::abort();
case 0: return "ADDIW";
case 1: return "SLLIW";
case 5: return func7 ? "SRAIW" : "SRLIW";
default:
std::abort();
}
case Opcode::SYS_INST:
switch (func3) {
@@ -222,20 +231,59 @@ static const char* op_string(const Instr &instr) {
case Opcode::FENCE: return "FENCE";
case Opcode::FL:
switch (func3) {
case 0x1: return "VL";
case 0x2: return "FLW";
case 0x3: return "FLD";
default:
std::abort();
case 0x1: return "VL";
case 0x2: return "FLW";
case 0x3: return "FLD";
default:
std::abort();
}
case Opcode::FS:
switch (func3) {
case 0x1: return "VS";
case 0x2: return "FSW";
case 0x3: return "FSD";
case 0x1: return "VS";
case 0x2: return "FSW";
case 0x3: return "FSD";
default:
std::abort();
}
case Opcode::AMO: {
auto amo_type = func7 >> 2;
switch (func3) {
case 0x2:
switch (amo_type) {
case 0x00: return "AMOADD.W";
case 0x01: return "AMOSWAP.W";
case 0x02: return "LR.W";
case 0x03: return "SC.W";
case 0x04: return "AMOXOR.W";
case 0x08: return "AMOOR.W";
case 0x0c: return "AMOAND.W";
case 0x10: return "AMOMIN.W";
case 0x14: return "AMOMAX.W";
case 0x18: return "AMOMINU.W";
case 0x1c: return "AMOMAXU.W";
default:
std::abort();
}
case 0x3:
switch (amo_type) {
case 0x00: return "AMOADD.D";
case 0x01: return "AMOSWAP.D";
case 0x02: return "LR.D";
case 0x03: return "SC.D";
case 0x04: return "AMOXOR.D";
case 0x08: return "AMOOR.D";
case 0x0c: return "AMOAND.D";
case 0x10: return "AMOMIN.D";
case 0x14: return "AMOMAX.D";
case 0x18: return "AMOMINU.D";
case 0x1c: return "AMOMAXU.D";
default:
std::abort();
}
default:
std::abort();
}
}
case Opcode::FCI:
switch (func7) {
case 0x00: return "FADD.S";
@@ -332,9 +380,9 @@ static const char* op_string(const Instr &instr) {
default:
std::abort();
}
case 0x70: return func3 ? "FCLASS.S" : "FMV.X.W";
case 0x70: return func3 ? "FCLASS.S" : "FMV.X.S";
case 0x71: return func3 ? "FCLASS.D" : "FMV.X.D";
case 0x78: return "FMV.W.X";
case 0x78: return "FMV.S.X";
case 0x79: return "FMV.D.X";
default:
std::abort();
@@ -344,23 +392,36 @@ static const char* op_string(const Instr &instr) {
case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
case Opcode::VSET: return "VSET";
case Opcode::GPGPU:
switch (func3) {
case 0: return "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PREFETCH";
case Opcode::EXT1:
switch (func7) {
case 0:
switch (func3) {
case 0: return "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PRED";
default:
std::abort();
}
case 1:
switch (func3) {
case 0: return "RASTER";
default:
std::abort();
}
default:
std::abort();
}
case Opcode::GPU:
case Opcode::EXT2:
switch (func3) {
case 0: return "TEX";
case 0:
return "TEX";
case 1: {
switch (func2) {
case 0: return "CMOV";
case 1: return "ROP";
default:
std::abort();
}
@@ -375,43 +436,36 @@ static const char* op_string(const Instr &instr) {
namespace vortex {
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
auto opcode = instr.getOpcode();
auto func2 = instr.getFunc2();
auto opcode = instr.getOpcode();
auto func3 = instr.getFunc3();
os << op_string(instr) << ": ";
if (opcode == S_INST
|| opcode == FS) {
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
} else
if (opcode == L_INST
|| opcode == FL) {
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
} else {
if (instr.getRDType() != RegType::None) {
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
}
uint32_t i = 0;
for (; i < instr.getNRSrc(); ++i) {
if (i) os << ", ";
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
}
if (instr.hasImm()) {
if (i) os << ", ";
os << "imm=0x" << std::hex << instr.getImm();
}
if (opcode == GPU && func3 == 0) {
os << ", unit=" << std::dec << func2;
}
os << op_string(instr);
int sep = 0;
if (instr.getRDType() != RegType::None) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << instr.getRDType() << std::dec << instr.getRDest();
}
for (uint32_t i = 0; i < instr.getNRSrc(); ++i) {
if (instr.getRSType(i) == RegType::None)
continue;
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
}
if (instr.hasImm()) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << "0x" << std::hex << instr.getImm();
}
if (opcode == Opcode::SYS_INST && func3 >= 5) {
// CSRs with immediate values
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << "0x" << std::hex << instr.getRSrc(0);
}
return os;
}
}
Decoder::Decoder(const ArchDef&) {}
Decoder::Decoder(const Arch&) {}
std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
auto instr = std::make_shared<Instr>();
@@ -434,7 +488,7 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
return nullptr;
}
auto iType = op_it->second.iType;
auto iType = op_it->second;
if (op == Opcode::FL || op == Opcode::FS) {
if (func3 != 0x2 && func3 != 0x3) {
iType = InstType::V_TYPE;
@@ -442,57 +496,97 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
switch (iType) {
case InstType::N_TYPE:
break;
case InstType::R_TYPE:
if (op == Opcode::FCI) {
switch (func7) {
switch (op) {
case Opcode::FCI:
switch (func7) {
case 0x2c: // FSQRT.S
case 0x2d: // FSQRT.D
instr->setDestReg(rd, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
break;
case 0x50: // FLE.S, FLT.S, FEQ.S
case 0x51: // FLE.D, FLT.D, FEQ.D
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
break;
case 0x60: // FCVT.W.D, FCVT.WU.D, FCVT.L.D, FCVT.LU.D
case 0x61: // FCVT.WU.S, FCVT.W.S, FCVT.L.S, FCVT.LU.S
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::None);
break;
case 0x68: // FCVT.S.W, FCVT.S.WU, FCVT.S.L, FCVT.S.LU
case 0x69: // FCVT.D.W, FCVT.D.WU, FCVT.D.L, FCVT.D.LU
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::None);
break;
case 0x70: // FCLASS.S, FMV.X.W
case 0x70: // FCLASS.S, FMV.X.S
case 0x71: // FCLASS.D, FMV.X.D
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
break;
case 0x78: // FMV.W.X
case 0x78: // FMV.S.X
case 0x79: // FMV.D.X
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
break;
default:
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
break;
}
} else {
break;
case Opcode::EXT1:
switch (func7) {
case 0:
switch (func3) {
case 0: // TMC
case 3: // JOIN
instr->addSrcReg(rs1, RegType::Integer);
break;
case 1: // WSPAWN
case 4: // BAR
case 5: // PRED
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
break;
case 2: // SPLIT
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
break;
default:
std::abort();
}
break;
case 1:
switch (func3) {
case 0: // RASTER
instr->setDestReg(rd, RegType::Integer);
break;
default:
std::abort();
}
break;
default:
std::abort();
}
break;
default:
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
break;
}
instr->setFunc3(func3);
instr->setFunc7(func7);
break;
case InstType::I_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
if (op == Opcode::FL) {
instr->setDestReg(rd, RegType::Float);
} else {
@@ -503,15 +597,23 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
switch (op) {
case Opcode::SYS_INST:
if (func3 != 0) {
// RV32I: CSR*
instr->setDestReg(rd, RegType::Integer);
}
// RV32I: CSR
if (func3 >= 5) {
// rs1 holds zimm
instr->setSrcReg(0, rs1, RegType::None);
}
} else {
instr->setDestReg(rd, RegType::None);
instr->setSrcReg(0, rs1, RegType::None);
}
// uint12
instr->setImm(code >> shift_rs2);
break;
case Opcode::FENCE:
// uint12
instr->setImm(code >> shift_rs2);
instr->setDestReg(rd, RegType::None);
instr->setSrcReg(0, rs1, RegType::None);
break;
case Opcode::I_INST:
case Opcode::I_INST_W:
@@ -538,11 +640,11 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
} break;
case InstType::S_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
if (op == Opcode::FS) {
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
} else {
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
}
instr->setFunc3(func3);
auto imm = (func7 << width_reg) | rd;
@@ -550,8 +652,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
} break;
case InstType::B_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
instr->setFunc3(func3);
auto bit_11 = rd & 0x1;
auto bits_4_1 = rd >> 1;
@@ -581,8 +683,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
case InstType::V_TYPE:
switch (op) {
case Opcode::VSET: {
instr->setDestVReg(rd);
instr->setSrcVReg(rs1);
instr->setDestReg(rd, RegType::Vector);
instr->addSrcReg(rs1, RegType::Vector);
instr->setFunc3(func3);
if (func3 == 7) {
instr->setImm(!(code >> shift_vset));
@@ -593,20 +695,20 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
instr->setVediv((immed >> 4) & 0x3);
instr->setVsew((immed >> 2) & 0x3);
} else {
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
}
} else {
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask((code >> shift_func7) & 0x1);
instr->setFunc6(func6);
}
} break;
case Opcode::FL:
instr->setDestVReg(rd);
instr->setSrcVReg(rs1);
instr->setDestReg(rd, RegType::Vector);
instr->addSrcReg(rs1, RegType::Vector);
instr->setVlsWidth(func3);
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask(code >> shift_func7);
instr->setVmop((code >> shift_vmop) & mask_func3);
instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -614,9 +716,9 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
case Opcode::FS:
instr->setVs3(rd);
instr->setSrcVReg(rs1);
instr->addSrcReg(rs1, RegType::Vector);
instr->setVlsWidth(func3);
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask(code >> shift_func7);
instr->setVmop((code >> shift_vmop) & mask_func3);
instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -627,16 +729,28 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
break;
case R4_TYPE:
if (op == Opcode::GPU) {
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->setSrcReg(rs3, RegType::Integer);
if (op == Opcode::EXT2) {
switch (func3) {
case 1:
switch (func2) {
case 0: // CMOV
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs3, RegType::Integer);
break;
default:
std::abort();
}
break;
default:
std::abort();
}
} else {
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->setSrcReg(rs3, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs3, RegType::Float);
}
instr->setFunc2(func2);
instr->setFunc3(func3);

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
@@ -5,12 +18,12 @@
namespace vortex {
class ArchDef;
class Arch;
class Instr;
class Decoder {
public:
Decoder(const ArchDef &);
Decoder(const Arch &);
std::shared_ptr<Instr> decode(uint32_t code) const;
};

141
sim/simx/dispatcher.h Normal file
View File

@@ -0,0 +1,141 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
#include <queue>
namespace vortex {
class Dispatcher : public SimObject<Dispatcher> {
public:
std::vector<SimPort<pipeline_trace_t*>> Outputs;
Dispatcher(const SimContext& ctx, const Arch& arch, uint32_t buf_size, uint32_t block_size, uint32_t num_lanes)
: SimObject<Dispatcher>(ctx, "Dispatcher")
, Outputs(ISSUE_WIDTH, this)
, Inputs_(ISSUE_WIDTH, this)
, arch_(arch)
, queues_(ISSUE_WIDTH, std::queue<pipeline_trace_t*>())
, buf_size_(buf_size)
, block_size_(block_size)
, num_lanes_(num_lanes)
, batch_count_(ISSUE_WIDTH / block_size)
, pid_count_(arch.num_threads() / num_lanes)
, batch_idx_(0)
, start_p_(block_size, 0)
{}
virtual ~Dispatcher() {}
virtual void reset() {
batch_idx_ = 0;
for (uint32_t b = 0; b < block_size_; ++b) {
start_p_.at(b) = 0;
}
}
virtual void tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& queue = queues_.at(i);
if (queue.empty())
continue;
auto trace = queue.front();
Inputs_.at(i).send(trace, 1);
queue.pop();
}
uint32_t block_sent = 0;
for (uint32_t b = 0; b < block_size_; ++b) {
uint32_t i = batch_idx_ * block_size_ + b;
auto& input = Inputs_.at(i);
if (input.empty()) {
++block_sent;
continue;
}
auto& output = Outputs.at(i);
auto trace = input.front();
if (pid_count_ != 1) {
auto start_p = start_p_.at(b);
if (start_p == -1) {
++block_sent;
continue;
}
int start(-1), end(-1);
for (uint32_t j = start_p * num_lanes_, n = arch_.num_threads(); j < n; ++j) {
if (!trace->tmask.test(j))
continue;
if (start == -1)
start = j;
end = j;
}
start /= num_lanes_;
end /= num_lanes_;
auto new_trace = new pipeline_trace_t(*trace);
new_trace->tmask.reset();
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
new_trace->tmask[j] = trace->tmask[j];
}
new_trace->pid = start;
new_trace->sop = (start_p == 0);
if (start == end) {
new_trace->eop = 1;
start_p_.at(b) = -1;
input.pop();
++block_sent;
delete trace;
} else {
new_trace->eop = 0;
start_p_.at(b) = start + 1;
}
output.send(new_trace, 1);
DT(3, "pipeline-dispatch: " << *new_trace);
} else {
trace->pid = 0;
input.pop();
output.send(trace, 1);
DT(3, "pipeline-dispatch: " << *trace);
++block_sent;
}
}
if (block_sent == block_size_) {
batch_idx_ = (batch_idx_ + 1) % batch_count_;
for (uint32_t b = 0; b < block_size_; ++b) {
start_p_.at(b) = 0;
}
}
};
bool push(uint32_t issue_index, pipeline_trace_t* trace) {
auto& queue = queues_.at(issue_index);
if (queue.size() >= buf_size_)
return false;
queue.push(trace);
return true;
}
private:
std::vector<SimPort<pipeline_trace_t*>> Inputs_;
const Arch& arch_;
std::vector<std::queue<pipeline_trace_t*>> queues_;
uint32_t buf_size_;
uint32_t block_size_;
uint32_t num_lanes_;
uint32_t batch_count_;
uint32_t pid_count_;
uint32_t batch_idx_;
std::vector<int> start_p_;
};
}

341
sim/simx/exe_unit.cpp Normal file
View File

@@ -0,0 +1,341 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "exe_unit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
#include <assert.h>
#include <util.h>
#include "debug.h"
#include "core.h"
#include "constants.h"
#include "cache_sim.h"
using namespace vortex;
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
void AluUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->alu_type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::SYSCALL:
case AluType::IMUL:
output.send(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
output.send(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
}
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
void FpuUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->fpu_type) {
case FpuType::FNCP:
output.send(trace, 2);
break;
case FpuType::FMA:
output.send(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
output.send(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
output.send(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
output.send(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
auto time = input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
}
///////////////////////////////////////////////////////////////////////////////
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, pending_rd_reqs_(LSUQ_SIZE)
, num_lanes_(NUM_LSU_LANES)
, pending_loads_(0)
, fence_lock_(false)
, input_idx_(0)
{}
void LsuUnit::reset() {
pending_rd_reqs_.clear();
pending_loads_ = 0;
fence_lock_ = false;
}
void LsuUnit::tick() {
core_->perf_stats_.load_latency += pending_loads_;
// handle dcache response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type
<< ", tid=" << t << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
--pending_loads_;
}
// handle shared memory response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", tid=" << t << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
--pending_loads_;
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_rd_reqs_.empty())
return;
int iw = fence_state_->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(fence_state_, 1);
fence_lock_ = false;
DT(3, "fence-unlock: " << fence_state_);
}
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
auto t0 = trace->pid * num_lanes_;
if (trace->lsu_type == LsuType::FENCE) {
// schedule fence lock
fence_state_ = trace;
fence_lock_ = true;
DT(3, "fence-lock: " << *trace);
// remove input
auto time = input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
break;
}
// check pending queue capacity
if (pending_rd_reqs_.full()) {
if (!trace->log_once(true)) {
DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
}
break;
} else {
trace->log_once(false);
}
bool is_write = (trace->lsu_type == LsuType::STORE);
// duplicates detection
bool is_dup = false;
if (trace->tmask.test(t0)) {
uint64_t addr_mask = sizeof(uint32_t)-1;
uint32_t addr0 = trace_data->mem_addrs.at(0).addr & ~addr_mask;
uint32_t matches = 1;
for (uint32_t t = 1; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
is_dup = (matches == trace->tmask.count());
}
uint32_t addr_count;
if (is_dup) {
addr_count = 1;
} else {
addr_count = trace->tmask.count();
}
auto tag = pending_rd_reqs_.allocate({trace, addr_count});
for (uint32_t t = 0; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto& dcache_req_port = core_->dcache_req_ports.at(t);
auto mem_addr = trace_data->mem_addrs.at(t);
auto type = core_->get_addr_type(mem_addr.addr);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.type = type;
mem_req.tag = tag;
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.send(mem_req, 2);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
++pending_loads_;
++core_->perf_stats_.loads;
if (is_dup)
break;
}
// do not wait on writes
if (is_write) {
pending_rd_reqs_.release(tag);
output.send(trace, 1);
++core_->perf_stats_.stores;
}
// remove input
auto time = input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
break; // single block
}
++input_idx_;
}
///////////////////////////////////////////////////////////////////////////////
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "SFU")
{}
void SfuUnit::tick() {
// handle pending responses
for (auto pending_rsp : pending_rsps_) {
if (pending_rsp->empty())
continue;
auto trace = pending_rsp->front();
if (trace->cid != core_->id())
continue;
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.send(trace, 1);
pending_rsp->pop();
}
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto sfu_type = trace->sfu_type;
bool release_warp = trace->fetch_stall;
switch (sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::PRED:
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC:
output.send(trace, 1);
break;
case SfuType::BAR: {
output.send(trace, 1);
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
if (trace->eop) {
core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
}
release_warp = false;
} break;
case SfuType::CMOV:
output.send(trace, 3);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
auto stalls = (SimPlatform::instance().cycles() - time);
core_->perf_stats_.sfu_stalls += stalls;
break; // single block
}
++input_idx_;
}

View File

@@ -1,8 +1,21 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "pipeline.h"
#include "cache.h"
#include "cache_sim.h"
namespace vortex {
@@ -10,13 +23,13 @@ class Core;
class ExeUnit : public SimObject<ExeUnit> {
public:
SimPort<pipeline_trace_t*> Input;
SimPort<pipeline_trace_t*> Output;
std::vector<SimPort<pipeline_trace_t*>> Inputs;
std::vector<SimPort<pipeline_trace_t*>> Outputs;
ExeUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<ExeUnit>(ctx, name)
, Input(this)
, Output(this)
, Inputs(ISSUE_WIDTH, this)
, Outputs(ISSUE_WIDTH, this)
, core_(core)
{}
@@ -32,28 +45,25 @@ protected:
///////////////////////////////////////////////////////////////////////////////
class NopUnit : public ExeUnit {
public:
NopUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class LsuUnit : public ExeUnit {
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_rd_reqs_;
pipeline_trace_t* fence_state_;
bool fence_lock_;
public:
LsuUnit(const SimContext& ctx, Core*);
void reset();
void tick();
private:
struct pending_req_t {
pipeline_trace_t* trace;
uint32_t count;
};
HashTable<pending_req_t> pending_rd_reqs_;
uint32_t num_lanes_;
pipeline_trace_t* fence_state_;
uint64_t pending_loads_;
bool fence_lock_;
uint32_t input_idx_;
};
///////////////////////////////////////////////////////////////////////////////
@@ -67,15 +77,6 @@ public:
///////////////////////////////////////////////////////////////////////////////
class CsrUnit : public ExeUnit {
public:
CsrUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class FpuUnit : public ExeUnit {
public:
FpuUnit(const SimContext& ctx, Core*);
@@ -85,19 +86,15 @@ public:
///////////////////////////////////////////////////////////////////////////////
class GpuUnit : public ExeUnit {
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
bool processTexRequest(pipeline_trace_t* trace);
class SfuUnit : public ExeUnit {
public:
GpuUnit(const SimContext& ctx, Core*);
void reset();
SfuUnit(const SimContext& ctx, Core*);
void tick();
private:
std::vector<SimPort<pipeline_trace_t*>*> pending_rsps_;
uint32_t input_idx_;
};
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,383 +0,0 @@
#include "exeunit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
#include <assert.h>
#include <util.h>
#include "debug.h"
#include "core.h"
#include "constants.h"
using namespace vortex;
NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
void NopUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
Output.send(trace, 1);
Input.pop();
}
///////////////////////////////////////////////////////////////////////////////
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, num_threads_(core->arch().num_threads())
, pending_rd_reqs_(LSUQ_SIZE)
, fence_lock_(false)
{}
void LsuUnit::reset() {
pending_rd_reqs_.clear();
fence_lock_ = false;
}
void LsuUnit::tick() {
// handle dcache response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
// handle shared memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_rd_reqs_.empty())
return;
Output.send(fence_state_, 1);
fence_lock_ = false;
DT(3, "fence-unlock: " << fence_state_);
}
// check input queue
if (Input.empty())
return;
auto trace = Input.front();
if (trace->lsu.type == LsuType::FENCE) {
// schedule fence lock
fence_state_ = trace;
fence_lock_ = true;
DT(3, "fence-lock: " << *trace);
// remove input
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
return;
}
// check pending queue capacity
if (pending_rd_reqs_.full()) {
if (!trace->suspend()) {
DT(3, "*** lsu-queue-stall: " << *trace);
}
return;
} else {
trace->resume();
}
bool is_write = (trace->lsu.type == LsuType::STORE);
// duplicates detection
bool is_dup = false;
if (trace->tmask.test(0)) {
uint64_t addr_mask = sizeof(uint32_t)-1;
uint32_t addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
uint32_t matches = 1;
for (uint32_t t = 1; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
is_dup = (matches == trace->tmask.count());
}
uint32_t valid_addrs = 0;
if (is_dup) {
valid_addrs = 1;
} else {
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
}
auto tag = pending_rd_reqs_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
auto mem_addr = trace->mem_addrs.at(t).at(0);
auto type = get_addr_type(mem_addr.addr, mem_addr.size);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.non_cacheable = (type == AddrType::IO);
mem_req.tag = tag;
mem_req.core_id = trace->cid;
mem_req.uuid = trace->uuid;
if (type == AddrType::Shared) {
core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
} else {
dcache_req_port.send(mem_req, 2);
DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
}
if (is_dup)
break;
}
// do not wait on writes
if (is_write) {
pending_rd_reqs_.release(tag);
Output.send(trace, 1);
}
// remove input
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
void AluUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
switch (trace->alu.type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::SYSCALL:
case AluType::CMOV:
Output.send(trace, 1);
break;
case AluType::IMUL:
Output.send(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
Output.send(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
void CsrUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
Output.send(trace, 1);
auto time = Input.pop();
core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time);
DT(3, "pipeline-execute: op=CSR, " << *trace);
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
void FpuUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
switch (trace->fpu.type) {
case FpuType::FNCP:
Output.send(trace, 2);
break;
case FpuType::FMA:
Output.send(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
Output.send(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
Output.send(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
Output.send(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
auto time = Input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "GPU")
, num_threads_(core->arch().num_threads())
, pending_tex_reqs_(TEXQ_SIZE)
{}
void GpuUnit::reset() {
pending_tex_reqs_.clear();
}
void GpuUnit::tick() {
#ifdef EXT_TEX_ENABLE
// handle memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_tex_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
#endif
// check input queue
if (Input.empty())
return;
auto trace = Input.front();
bool issued = false;
switch (trace->gpu.type) {
case GpuType::TMC:
Output.send(trace, 1);
core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
issued = true;
break;
case GpuType::WSPAWN:
Output.send(trace, 1);
core_->active_warps_ = trace->gpu.active_warps;
issued = true;
break;
case GpuType::SPLIT:
case GpuType::JOIN:
Output.send(trace, 1);
issued = true;
break;
case GpuType::BAR:
Output.send(trace, 1);
if (trace->gpu.active_warps != 0)
core_->active_warps_ |= trace->gpu.active_warps;
else
core_->active_warps_.reset(trace->wid);
issued = true;
break;
case GpuType::TEX:
if (this->processTexRequest(trace))
issued = true;
break;
default:
std::abort();
}
if (issued) {
DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
}
bool GpuUnit::processTexRequest(pipeline_trace_t* trace) {
// check pending queue capacity
if (pending_tex_reqs_.full()) {
if (!trace->suspend()) {
DT(3, "*** tex-queue-stall: " << *trace);
}
return false;
} else {
trace->resume();
}
// send memory request
uint32_t valid_addrs = 0;
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
for (auto& mem_addr : trace->mem_addrs.at(t)) {
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = (trace->lsu.type == LsuType::STORE);
mem_req.tag = tag;
mem_req.core_id = core_->id();
mem_req.uuid = trace->uuid;
dcache_req_port.send(mem_req, 3);
DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", tid=" << t << ", "<< trace);
++ core_->perf_stats_.tex_reads;
++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
}
}
return true;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
@@ -6,10 +19,6 @@
namespace vortex {
class IBuffer {
private:
std::queue<pipeline_trace_t*> entries_;
uint32_t capacity_;
public:
IBuffer(uint32_t size)
: capacity_(size)
@@ -39,6 +48,10 @@ public:
std::queue<pipeline_trace_t*> empty;
std::swap(entries_, empty );
}
private:
std::queue<pipeline_trace_t*> entries_;
uint32_t capacity_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "types.h"
@@ -7,7 +20,7 @@ namespace vortex {
class Warp;
enum Opcode {
NOP = 0,
NONE = 0,
R_INST = 0x33,
L_INST = 0x3,
I_INST = 0x13,
@@ -19,6 +32,7 @@ enum Opcode {
JALR_INST = 0x67,
SYS_INST = 0x73,
FENCE = 0x0f,
AMO = 0x2f,
// F Extension
FL = 0x7,
FS = 0x27,
@@ -26,19 +40,20 @@ enum Opcode {
FMADD = 0x43,
FMSUB = 0x47,
FMNMSUB = 0x4b,
FMNMADD = 0x4f,
// Vector Extension
VSET = 0x57,
// GPGPU Extension
GPGPU = 0x6b,
GPU = 0x5b,
// RV64 Standard Extensions
FMNMADD = 0x4f,
// RV64 Standard Extension
R_INST_W = 0x3b,
I_INST_W = 0x1b,
// Vector Extension
VSET = 0x57,
// Custom Extensions
EXT1 = 0x0b,
EXT2 = 0x2b,
EXT3 = 0x5b,
EXT4 = 0x7b
};
enum InstType {
N_TYPE,
enum InstType {
R_TYPE,
I_TYPE,
S_TYPE,
@@ -52,25 +67,45 @@ enum InstType {
class Instr {
public:
Instr()
: opcode_(Opcode::NOP)
: opcode_(Opcode::NONE)
, num_rsrcs_(0)
, has_imm_(false)
, rdest_type_(RegType::None)
, imm_(0)
, rdest_(0)
, func2_(0)
, func3_(0)
, func6_(0)
, func7_(0) {
, func7_(0)
, vmask_(0)
, vlsWidth_(0)
, vMop_(0)
, vNf_(0)
, vs3_(0)
, vlmul_(0)
, vsew_(0)
, vediv_(0) {
for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
rsrc_type_[i] = RegType::None;
rsrc_[i] = 0;
}
}
void setOpcode(Opcode opcode) { opcode_ = opcode; }
void setDestReg(uint32_t destReg, RegType type) { rdest_type_ = type; rdest_ = destReg; }
void setSrcReg(uint32_t srcReg, RegType type) { rsrc_type_[num_rsrcs_] = type; rsrc_[num_rsrcs_++] = srcReg; }
void setDestVReg(uint32_t destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
void setSrcVReg(uint32_t srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; }
void setDestReg(uint32_t destReg, RegType type) {
rdest_type_ = type;
rdest_ = destReg;
}
void addSrcReg(uint32_t srcReg, RegType type) {
rsrc_type_[num_rsrcs_] = type;
rsrc_[num_rsrcs_] = srcReg;
++num_rsrcs_;
}
void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) {
rsrc_type_[index] = type;
rsrc_[index] = srcReg;
num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, index+1);
}
void setFunc2(uint32_t func2) { func2_ = func2; }
void setFunc3(uint32_t func3) { func3_ = func3; }
void setFunc7(uint32_t func7) { func7_ = func7; }
@@ -85,17 +120,17 @@ public:
void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; }
void setFunc6(uint32_t func6) { func6_ = func6; }
Opcode getOpcode() const { return opcode_; }
Opcode getOpcode() const { return opcode_; }
uint32_t getFunc2() const { return func2_; }
uint32_t getFunc3() const { return func3_; }
uint32_t getFunc6() const { return func6_; }
uint32_t getFunc7() const { return func7_; }
uint32_t getNRSrc() const { return num_rsrcs_; }
uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }
RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
uint32_t getRDest() const { return rdest_; }
RegType getRDType() const { return rdest_type_; }
bool hasImm() const { return has_imm_; }
RegType getRDType() const { return rdest_type_; }
bool hasImm() const { return has_imm_; }
uint32_t getImm() const { return imm_; }
uint32_t getVlsWidth() const { return vlsWidth_; }
uint32_t getVmop() const { return vMop_; }

View File

@@ -1,98 +1,132 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <fstream>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#include "processor.h"
#include "archdef.h"
#include "mem.h"
#include "constants.h"
#include <util.h>
#include "args.h"
#include "core.h"
using namespace vortex;
static void show_usage() {
std::cout << "Usage: [-c <cores>] [-w <warps>] [-t <threads>] [-r: riscv-test] [-s: stats] [-h: help] <program>" << std::endl;
}
uint32_t num_threads = NUM_THREADS;
uint32_t num_warps = NUM_WARPS;
uint32_t num_cores = NUM_CORES;
uint32_t num_clusters = NUM_CLUSTERS;
bool showStats = false;;
bool riscv_test = false;
const char* program = nullptr;
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
switch (c) {
case 't':
num_threads = atoi(optarg);
break;
case 'w':
num_warps = atoi(optarg);
break;
case 'c':
num_cores = atoi(optarg);
break;
case 'g':
num_clusters = atoi(optarg);
break;
case 'r':
riscv_test = true;
break;
case 's':
showStats = true;
break;
case 'h':
case '?':
show_usage();
exit(0);
break;
default:
show_usage();
exit(-1);
}
}
if (optind < argc) {
program = argv[optind];
std::cout << "Running " << program << "..." << std::endl;
} else {
show_usage();
exit(-1);
}
}
int main(int argc, char **argv) {
int exitcode = 0;
std::string imgFileName;
int num_cores(NUM_CORES * NUM_CLUSTERS);
int num_warps(NUM_WARPS);
int num_threads(NUM_THREADS);
bool showHelp(false);
bool showStats(false);
bool riscv_test(false);
parse_args(argc, argv);
// parse the command line arguments
CommandLineArgFlag fh("-h", "--help", "show command line options", showHelp);
CommandLineArgSetter<std::string> fi("-i", "--image", "program binary", imgFileName);
CommandLineArgSetter<int> fc("-c", "--cores", "number of cores", num_cores);
CommandLineArgSetter<int> fw("-w", "--warps", "number of warps", num_warps);
CommandLineArgSetter<int> ft("-t", "--threads", "number of threads", num_threads);
CommandLineArgFlag fr("-r", "--riscv", "enable riscv tests", riscv_test);
CommandLineArgFlag fs("-s", "--stats", "show stats", showStats);
CommandLineArg::readArgs(argc - 1, argv + 1);
if (showHelp || imgFileName.empty()) {
std::cout << "Vortex emulator command line arguments:\n"
" -i, --image <filename> Program RAM image\n"
" -c, --cores <num> Number of cores\n"
" -w, --warps <num> Number of warps\n"
" -t, --threads <num> Number of threads\n"
" -r, --riscv riscv test\n"
" -s, --stats Print stats on exit.\n";
return 0;
}
std::cout << "Running " << imgFileName << "..." << std::endl;
{
// create processor configuation
ArchDef arch(num_cores, num_warps, num_threads);
Arch arch(num_threads, num_warps, num_cores, num_clusters);
// create memory module
RAM ram(RAM_PAGE_SIZE);
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
// setup base DCRs
const uint64_t startup_addr(STARTUP_ADDR);
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
#if (XLEN == 64)
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
#endif
processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
// load program
{
std::string program_ext(fileExtension(imgFileName.c_str()));
{
std::string program_ext(fileExtension(program));
if (program_ext == "bin") {
ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
ram.loadBinImage(program, startup_addr);
} else if (program_ext == "hex") {
ram.loadHexImage(imgFileName.c_str());
ram.loadHexImage(program);
} else {
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
return -1;
}
}
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
// run simulation
exitcode = processor.run();
exitcode = processor.run(riscv_test);
}
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed." << std::endl;
exitcode = 0;
} else {
std::cout << "Failed." << std::endl;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
}
return exitcode;
}

View File

@@ -1,4 +1,17 @@
#include "memsim.h"
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mem_sim.h"
#include <vector>
#include <queue>
#include <stdlib.h>
@@ -83,7 +96,7 @@ public:
mem_req.addr,
mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ,
std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid),
mem_req.core_id
mem_req.cid
);
if (!dram_->send(dram_req))

View File

@@ -1,8 +1,20 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "types.h"
#include <vector>
namespace vortex {

61
sim/simx/operand.h Normal file
View File

@@ -0,0 +1,61 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
#include <queue>
namespace vortex {
class Operand : public SimObject<Operand> {
public:
SimPort<pipeline_trace_t*> Input;
SimPort<pipeline_trace_t*> Output;
Operand(const SimContext& ctx)
: SimObject<Operand>(ctx, "Operand")
, Input(this)
, Output(this)
{}
virtual ~Operand() {}
virtual void reset() {}
virtual void tick() {
if (Input.empty())
return;
auto trace = Input.front();
int delay = 1;
for (int i = 0; i < MAX_NUM_REGS; ++i) {
bool is_iregs = trace->used_iregs.test(i);
bool is_fregs = trace->used_fregs.test(i);
bool is_vregs = trace->used_vregs.test(i);
if (is_iregs || is_fregs || is_vregs) {
if (is_iregs && i == 0)
continue;
++delay;
}
}
Output.send(trace, delay);
DT(3, "pipeline-operands: " << *trace);
Input.pop();
};
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
@@ -5,14 +18,38 @@
#include <iostream>
#include <util.h>
#include "types.h"
#include "archdef.h"
#include "arch.h"
#include "debug.h"
namespace vortex {
class ITraceData {
public:
using Ptr = std::shared_ptr<ITraceData>;
ITraceData() {}
virtual ~ITraceData() {}
};
struct LsuTraceData : public ITraceData {
using Ptr = std::shared_ptr<LsuTraceData>;
std::vector<mem_addr_size_t> mem_addrs;
LsuTraceData(uint32_t num_threads) : mem_addrs(num_threads) {}
};
struct SFUTraceData : public ITraceData {
using Ptr = std::shared_ptr<SFUTraceData>;
struct {
uint32_t id;
uint32_t count;
} bar;
SFUTraceData(uint32_t bar_id, uint32_t bar_count) : bar{bar_id, bar_count} {}
};
struct pipeline_trace_t {
public:
//--
uint64_t uuid;
const uint64_t uuid;
const Arch& arch;
//--
uint32_t cid;
@@ -21,12 +58,9 @@ struct pipeline_trace_t {
Word PC;
//--
bool fetch_stall;
//--
bool wb;
RegType rdest_type;
uint32_t rdest;
RegType rdest_type;
bool wb;
//--
RegMask used_iregs;
@@ -36,73 +70,104 @@ struct pipeline_trace_t {
//-
ExeType exe_type;
//--
std::vector<std::vector<mem_addr_size_t>> mem_addrs;
//--
union {
struct {
LsuType type;
} lsu;
struct {
AluType type;
} alu;
struct {
FpuType type;
} fpu;
struct {
GpuType type;
WarpMask active_warps;
} gpu;
uint32_t unit_type;
LsuType lsu_type;
AluType alu_type;
FpuType fpu_type;
SfuType sfu_type;
};
bool stalled;
ITraceData::Ptr data;
pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) {
uuid = uuid_;
cid = 0;
wid = 0;
tmask.reset();
PC = 0;
fetch_stall = false;
wb = false;
rdest = 0;
rdest_type = RegType::None;
used_iregs.reset();
used_fregs.reset();
used_vregs.reset();
exe_type = ExeType::NOP;
mem_addrs.resize(arch.num_threads());
stalled = false;
}
int pid;
bool sop;
bool eop;
bool suspend() {
bool old = stalled;
stalled = true;
bool fetch_stall;
pipeline_trace_t(uint64_t uuid, const Arch& arch)
: uuid(uuid)
, arch(arch)
, cid(0)
, wid(0)
, tmask(0)
, PC(0)
, rdest(0)
, rdest_type(RegType::None)
, wb(false)
, used_iregs(0)
, used_fregs(0)
, used_vregs(0)
, exe_type(ExeType::ALU)
, unit_type(0)
, data(nullptr)
, pid(-1)
, sop(true)
, eop(true)
, fetch_stall(false)
, log_once_(false)
{}
pipeline_trace_t(const pipeline_trace_t& rhs)
: uuid(rhs.uuid)
, arch(rhs.arch)
, cid(rhs.cid)
, wid(rhs.wid)
, tmask(rhs.tmask)
, PC(rhs.PC)
, rdest(rhs.rdest)
, rdest_type(rhs.rdest_type)
, wb(rhs.wb)
, used_iregs(rhs.used_iregs)
, used_fregs(rhs.used_fregs)
, used_vregs(rhs.used_vregs)
, exe_type(rhs.exe_type)
, unit_type(rhs.unit_type)
, data(rhs.data)
, pid(rhs.pid)
, sop(rhs.sop)
, eop(rhs.eop)
, fetch_stall(rhs.fetch_stall)
, log_once_(false)
{}
~pipeline_trace_t() {}
bool log_once(bool enable) {
bool old = log_once_;
log_once_ = enable;
return old;
}
void resume() {
stalled = false;
}
private:
bool log_once_;
};
inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
os << "cid=" << state.cid;
os << ", wid=" << state.wid;
os << ", tmask=";
for (uint32_t i = 0, n = state.arch.num_threads(); i < n; ++i) {
os << state.tmask.test(i);
}
os << ", PC=0x" << std::hex << state.PC;
os << ", wb=" << state.wb;
if (state.wb) {
os << ", rd=" << state.rdest_type << std::dec << state.rdest;
}
os << ", ex=" << state.exe_type;
if (state.pid != -1) {
os << ", pid=" << state.pid;
os << ", sop=" << state.sop;
os << ", eop=" << state.eop;
}
os << " (#" << std::dec << state.uuid << ")";
return os;
}
class PipelineLatch {
protected:
const char* name_;
std::queue<pipeline_trace_t*> queue_;
public:
PipelineLatch(const char* name = nullptr)
: name_(name)
@@ -132,6 +197,10 @@ public:
std::queue<pipeline_trace_t*> empty;
std::swap(queue_, empty );
}
protected:
const char* name_;
std::queue<pipeline_trace_t*> queue_;
};
}

View File

@@ -1,168 +1,141 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "processor.h"
#include "core.h"
#include "constants.h"
#include "processor_impl.h"
using namespace vortex;
class Processor::Impl {
private:
std::vector<Core::Ptr> cores_;
std::vector<Cache::Ptr> l2caches_;
std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
Cache::Ptr l3cache_;
Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
ProcessorImpl::ProcessorImpl(const Arch& arch)
: arch_(arch)
, clusters_(arch.num_clusters())
{
SimPlatform::instance().initialize();
public:
Impl(const ArchDef& arch)
: cores_(arch.num_cores())
, l2caches_(NUM_CLUSTERS)
, l2_mem_switches_(NUM_CLUSTERS)
{
SimPlatform::instance().initialize();
// create memory simulator
memsim_ = MemSim::Create("dram", MemSim::Config{
MEMORY_BANKS,
uint32_t(arch.num_cores()) * arch.num_clusters()
});
uint32_t num_cores = arch.num_cores();
uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
cores_.at(i) = Core::Create(arch, i);
// create L3 cache
l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
!L3_ENABLED,
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(L3_NUM_WAYS), // W
0, // A
XLEN, // address bits
L3_NUM_BANKS, // number of banks
1, // number of ports
uint8_t(arch.num_clusters()), // request size
true, // write-through
false, // write response
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
}
);
// connect L3 memory ports
l3cache_->MemReqPort.bind(&memsim_->MemReqPort);
memsim_->MemRspPort.bind(&l3cache_->MemRspPort);
// setup memory simulator
auto memsim = MemSim::Create("dram", MemSim::Config{
MEMORY_BANKS,
arch.num_cores()
});
std::vector<SimPort<MemReq>*> mem_req_ports(1, &memsim->MemReqPort);
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1, &memsim->MemRspPort);
if (L3_ENABLE) {
l3cache_ = Cache::Create("l3cache", Cache::Config{
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
L3_NUM_BANKS, // number of banks
L3_NUM_PORTS, // number of ports
NUM_CLUSTERS, // request size
true, // write-through
false, // write response
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
}
);
l3cache_->MemReqPort.bind(mem_req_ports.at(0));
mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
}
} else if (NUM_CLUSTERS > 1) {
l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));
mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
}
}
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
std::vector<SimPort<MemReq>*> cluster_mem_req_ports(cores_per_cluster);
std::vector<SimPort<MemRsp>*> cluster_mem_rsp_ports(cores_per_cluster);
if (L2_ENABLE) {
auto& l2cache = l2caches_.at(i);
l2cache = Cache::Create("l2cache", Cache::Config{
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
L2_NUM_BANKS, // number of banks
L2_NUM_PORTS, // number of ports
(uint8_t)cores_per_cluster, // request size
true, // write-through
false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
l2cache->MemReqPort.bind(mem_req_ports.at(i));
mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
}
} else {
auto& l2_mem_switch = l2_mem_switches_.at(i);
l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster);
l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
}
}
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
auto& core = cores_.at((i * cores_per_cluster) + j);
core->MemReqPort.bind(cluster_mem_req_ports.at(j));
cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort);
}
}
// create clusters
for (uint32_t i = 0; i < arch.num_clusters(); ++i) {
clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_);
// connect L3 core ports
clusters_.at(i)->mem_req_port.bind(&l3cache_->CoreReqPorts.at(i));
l3cache_->CoreRspPorts.at(i).bind(&clusters_.at(i)->mem_rsp_port);
}
~Impl() {
SimPlatform::instance().finalize();
}
// set up memory perf recording
memsim_->MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
__unused (cycle);
perf_mem_reads_ += !req.write;
perf_mem_writes_ += req.write;
perf_mem_pending_reads_ += !req.write;
});
memsim_->MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){
__unused (cycle);
--perf_mem_pending_reads_;
});
void attach_ram(RAM* ram) {
for (auto core : cores_) {
core->attach_ram(ram);
}
}
this->reset();
}
int run() {
SimPlatform::instance().reset();
bool running;
int exitcode = 0;
do {
SimPlatform::instance().tick();
running = false;
for (auto& core : cores_) {
if (core->running()) {
running = true;
}
if (core->check_exit()) {
exitcode = core->getIRegValue(3);
running = false;
break;
ProcessorImpl::~ProcessorImpl() {
SimPlatform::instance().finalize();
}
void ProcessorImpl::attach_ram(RAM* ram) {
for (auto cluster : clusters_) {
cluster->attach_ram(ram);
}
}
int ProcessorImpl::run(bool riscv_test) {
SimPlatform::instance().reset();
this->reset();
bool done;
Word exitcode = 0;
do {
SimPlatform::instance().tick();
done = true;
for (auto cluster : clusters_) {
if (cluster->running()) {
Word ec;
if (cluster->check_exit(&ec, riscv_test)) {
exitcode |= ec;
} else {
done = false;
}
}
} while (running);
}
perf_mem_latency_ += perf_mem_pending_reads_;
} while (!done);
return exitcode;
}
};
return exitcode;
}
void ProcessorImpl::reset() {
perf_mem_reads_ = 0;
perf_mem_writes_ = 0;
perf_mem_latency_ = 0;
perf_mem_pending_reads_ = 0;
}
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
dcrs_.write(addr, value);
}
ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
ProcessorImpl::PerfStats perf;
perf.mem_reads = perf_mem_reads_;
perf.mem_writes = perf_mem_writes_;
perf.mem_latency = perf_mem_latency_;
perf.l3cache = l3cache_->perf_stats();
for (auto cluster : clusters_) {
perf.clusters += cluster->perf_stats();
}
return perf;
}
///////////////////////////////////////////////////////////////////////////////
Processor::Processor(const ArchDef& arch)
: impl_(new Impl(arch))
Processor::Processor(const Arch& arch)
: impl_(new ProcessorImpl(arch))
{}
Processor::~Processor() {
@@ -173,6 +146,10 @@ void Processor::attach_ram(RAM* mem) {
impl_->attach_ram(mem);
}
int Processor::run() {
return impl_->run();
int Processor::run(bool riscv_test) {
return impl_->run(riscv_test);
}
void Processor::write_dcr(uint32_t addr, uint32_t value) {
return impl_->write_dcr(addr, value);
}

View File

@@ -1,22 +1,39 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace vortex {
class ArchDef;
class Arch;
class RAM;
class ProcessorImpl;
class Processor {
public:
Processor(const ArchDef& arch);
Processor(const Arch& arch);
~Processor();
void attach_ram(RAM* mem);
int run();
int run(bool riscv_test);
void write_dcr(uint32_t addr, uint32_t value);
private:
class Impl;
Impl* impl_;
ProcessorImpl* impl_;
};
}
}

66
sim/simx/processor_impl.h Normal file
View File

@@ -0,0 +1,66 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "mem_sim.h"
#include "cache_sim.h"
#include "constants.h"
#include "dcrs.h"
#include "cluster.h"
namespace vortex {
class ProcessorImpl {
public:
struct PerfStats {
uint64_t mem_reads;
uint64_t mem_writes;
uint64_t mem_latency;
CacheSim::PerfStats l3cache;
Cluster::PerfStats clusters;
PerfStats()
: mem_reads(0)
, mem_writes(0)
, mem_latency(0)
{}
};
ProcessorImpl(const Arch& arch);
~ProcessorImpl();
void attach_ram(RAM* mem);
int run(bool riscv_test);
void write_dcr(uint32_t addr, uint32_t value);
ProcessorImpl::PerfStats perf_stats() const;
private:
void reset();
const Arch& arch_;
std::vector<std::shared_ptr<Cluster>> clusters_;
DCRS dcrs_;
MemSim::Ptr memsim_;
CacheSim::Ptr l3cache_;
uint64_t perf_mem_reads_;
uint64_t perf_mem_writes_;
uint64_t perf_mem_latency_;
uint64_t perf_mem_pending_reads_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
@@ -6,20 +19,15 @@
namespace vortex {
class Scoreboard {
private:
public:
struct reg_use_t {
RegType type;
uint32_t reg;
uint64_t owner;
};
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
public:
Scoreboard(const ArchDef &arch)
Scoreboard(const Arch &arch)
: in_use_iregs_(arch.num_warps())
, in_use_fregs_(arch.num_warps())
, in_use_vregs_(arch.num_warps())
@@ -84,8 +92,7 @@ public:
}
void reserve(pipeline_trace_t* state) {
if (!state->wb)
return;
assert(state->wb);
switch (state->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).set(state->rdest);
@@ -105,8 +112,7 @@ public:
}
void release(pipeline_trace_t* state) {
if (!state->wb)
return;
assert(state->wb);
switch (state->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).reset(state->rdest);
@@ -123,6 +129,13 @@ public:
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
owners_.erase(tag);
}
private:
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
};
}

138
sim/simx/shared_mem.cpp Normal file
View File

@@ -0,0 +1,138 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "shared_mem.h"
#include "core.h"
#include <bitmanip.h>
#include <vector>
#include "types.h"
using namespace vortex;
class SharedMem::Impl {
protected:
SharedMem* simobject_;
Config config_;
RAM ram_;
uint32_t bank_sel_addr_start_;
uint32_t bank_sel_addr_end_;
PerfStats perf_stats_;
uint64_t to_local_addr(uint64_t addr) {
uint32_t total_lines = config_.capacity / config_.line_size;
uint32_t line_bits = log2ceil(total_lines);
uint32_t offset = bit_getw(addr, 0, line_bits-1);
return offset;
}
public:
Impl(SharedMem* simobject, const Config& config)
: simobject_(simobject)
, config_(config)
, ram_(config.capacity, config.capacity)
, bank_sel_addr_start_(0)
, bank_sel_addr_end_(0 + log2ceil(config.num_banks)-1)
{}
virtual ~Impl() {}
void reset() {
perf_stats_ = PerfStats();
}
void read(void* data, uint64_t addr, uint32_t size) {
auto s_addr = to_local_addr(addr);
DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
ram_.read(data, s_addr, size);
}
void write(const void* data, uint64_t addr, uint32_t size) {
auto s_addr = to_local_addr(addr);
DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
ram_.write(data, s_addr, size);
}
void tick() {
std::vector<bool> in_used_banks(config_.num_banks);
for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
auto& core_req_port = simobject_->Inputs.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
uint32_t bank_id = 0;
if (bank_sel_addr_start_ <= bank_sel_addr_end_) {
bank_id = (uint32_t)bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
}
// bank conflict check
if (in_used_banks.at(bank_id)) {
++perf_stats_.bank_stalls;
continue;
}
in_used_banks.at(bank_id) = true;
if (!core_req.write || config_.write_reponse) {
// send response
MemRsp core_rsp{core_req.tag, core_req.cid};
simobject_->Outputs.at(req_id).send(core_rsp, 1);
}
// update perf counters
perf_stats_.reads += !core_req.write;
perf_stats_.writes += core_req.write;
// remove input
core_req_port.pop();
}
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
};
///////////////////////////////////////////////////////////////////////////////
SharedMem::SharedMem(const SimContext& ctx, const char* name, const Config& config)
: SimObject<SharedMem>(ctx, name)
, Inputs(config.num_reqs, this)
, Outputs(config.num_reqs, this)
, impl_(new Impl(this, config))
{}
SharedMem::~SharedMem() {
delete impl_;
}
void SharedMem::reset() {
impl_->reset();
}
void SharedMem::read(void* data, uint64_t addr, uint32_t size) {
impl_->read(data, addr, size);
}
void SharedMem::write(const void* data, uint64_t addr, uint32_t size) {
impl_->write(data, addr, size);
}
void SharedMem::tick() {
impl_->tick();
}
const SharedMem::PerfStats& SharedMem::perf_stats() const {
return impl_->perf_stats();
}

72
sim/simx/shared_mem.h Normal file
View File

@@ -0,0 +1,72 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "types.h"
namespace vortex {
class SharedMem : public SimObject<SharedMem> {
public:
struct Config {
uint32_t capacity;
uint32_t line_size;
uint32_t num_reqs;
uint32_t num_banks;
bool write_reponse;
};
struct PerfStats {
uint64_t reads;
uint64_t writes;
uint64_t bank_stalls;
PerfStats()
: reads(0)
, writes(0)
, bank_stalls(0)
{}
PerfStats& operator+=(const PerfStats& rhs) {
this->reads += rhs.reads;
this->writes += rhs.writes;
this->bank_stalls += rhs.bank_stalls;
return *this;
}
};
std::vector<SimPort<MemReq>> Inputs;
std::vector<SimPort<MemRsp>> Outputs;
SharedMem(const SimContext& ctx, const char* name, const Config& config);
virtual ~SharedMem();
void reset();
void read(void* data, uint64_t addr, uint32_t size);
void write(const void* data, uint64_t addr, uint32_t size);
void tick();
const PerfStats& perf_stats() const;
protected:
class Impl;
Impl* impl_;
};
}

View File

@@ -1,96 +0,0 @@
#pragma once
#include <simobject.h>
#include <bitmanip.h>
#include <vector>
#include "types.h"
namespace vortex {
class Core;
class SharedMem : public SimObject<SharedMem> {
public:
struct Config {
uint32_t num_reqs;
uint32_t num_banks;
uint32_t bank_offset;
uint32_t latency;
bool write_reponse;
};
struct PerfStats {
uint64_t reads;
uint64_t writes;
uint64_t bank_stalls;
PerfStats()
: reads(0)
, writes(0)
, bank_stalls(0)
{}
};
std::vector<SimPort<MemReq>> Inputs;
std::vector<SimPort<MemRsp>> Outputs;
SharedMem(const SimContext& ctx, const char* name, const Config& config)
: SimObject<SharedMem>(ctx, name)
, Inputs(config.num_reqs, this)
, Outputs(config.num_reqs, this)
, config_(config)
, bank_sel_addr_start_(config.bank_offset)
, bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1)
{}
virtual ~SharedMem() {}
void reset() {
perf_stats_ = PerfStats();
}
void tick() {
std::vector<bool> in_used_banks(config_.num_banks);
for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
auto& core_req_port = this->Inputs.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
uint32_t bank_id = (uint32_t)bit_getw(
core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
// bank conflict check
if (in_used_banks.at(bank_id))
continue;
in_used_banks.at(bank_id) = true;
if (!core_req.write || config_.write_reponse) {
// send response
MemRsp core_rsp{core_req.tag, core_req.core_id};
this->Outputs.at(req_id).send(core_rsp, 1);
}
// update perf counters
perf_stats_.reads += !core_req.write;
perf_stats_.writes += core_req.write;
// remove input
core_req_port.pop();
}
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
protected:
Config config_;
uint32_t bank_sel_addr_start_;
uint32_t bank_sel_addr_end_;
PerfStats perf_stats_;
};
}

View File

@@ -1,100 +0,0 @@
#include "tex_unit.h"
#include "core.h"
#include <texturing.h>
#include <VX_config.h>
using namespace vortex;
using namespace cocogfx;
enum class FilterMode {
Point,
Bilinear,
Trilinear,
};
TexUnit::TexUnit(Core* core) : core_(core) {}
TexUnit::~TexUnit() {}
void TexUnit::clear() {
for (auto& state : states_) {
state = 0;
}
}
uint32_t TexUnit::get_state(uint32_t state) {
return states_.at(state);
}
void TexUnit::set_state(uint32_t state, uint32_t value) {
states_.at(state) = value;
}
uint32_t TexUnit::read(int32_t u,
int32_t v,
int32_t lod,
std::vector<mem_addr_size_t>* mem_addrs) {
//--
auto xu = Fixed<TEX_FXD_FRAC>::make(u);
auto xv = Fixed<TEX_FXD_FRAC>::make(v);
uint32_t base_addr = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
uint32_t log_width = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
auto format = (TexFormat)states_.at(TEX_STATE_FORMAT);
auto filter = (FilterMode)states_.at(TEX_STATE_FILTER);
auto wrapu = (WrapMode)states_.at(TEX_STATE_WRAPU);
auto wrapv = (WrapMode)states_.at(TEX_STATE_WRAPV);
auto stride = Stride(format);
switch (filter) {
case FilterMode::Bilinear: {
// addressing
uint32_t offset00, offset01, offset10, offset11;
uint32_t alpha, beta;
TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
&offset00, &offset01, &offset10, &offset11, &alpha, &beta);
uint32_t addr00 = base_addr + offset00 * stride;
uint32_t addr01 = base_addr + offset01 * stride;
uint32_t addr10 = base_addr + offset10 * stride;
uint32_t addr11 = base_addr + offset11 * stride;
// memory lookup
uint32_t texel00(0), texel01(0), texel10(0), texel11(0);
core_->dcache_read(&texel00, addr00, stride);
core_->dcache_read(&texel01, addr01, stride);
core_->dcache_read(&texel10, addr10, stride);
core_->dcache_read(&texel11, addr11, stride);
mem_addrs->push_back({addr00, stride});
mem_addrs->push_back({addr01, stride});
mem_addrs->push_back({addr10, stride});
mem_addrs->push_back({addr11, stride});
// filtering
auto color = TexFilterLinear(
format, texel00, texel01, texel10, texel11, alpha, beta);
return color;
}
case FilterMode::Point: {
// addressing
uint32_t offset;
TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
uint32_t addr = base_addr + offset * stride;
// memory lookup
uint32_t texel(0);
core_->dcache_read(&texel, addr, stride);
mem_addrs->push_back({addr, stride});
// filtering
auto color = TexFilterPoint(format, texel);
return color;
}
default:
std::abort();
return 0;
}
}

View File

@@ -1,28 +0,0 @@
#pragma once
#include "types.h"
namespace vortex {
class Core;
class TexUnit {
public:
TexUnit(Core* core);
~TexUnit();
void clear();
uint32_t get_state(uint32_t state);
void set_state(uint32_t state, uint32_t value);
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<mem_addr_size_t>* mem_addrs);
private:
std::array<uint32_t, NUM_TEX_STATES> states_;
Core* core_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
@@ -5,31 +18,42 @@
#include <queue>
#include <unordered_map>
#include <util.h>
#include <stringutil.h>
#include <VX_config.h>
#include <simobject.h>
#include "uuid_gen.h"
#include "debug.h"
namespace vortex {
typedef uint8_t Byte;
#if XLEN == 32
#if (XLEN == 32)
typedef uint32_t Word;
typedef int32_t WordI;
typedef uint64_t DWord;
typedef int64_t DWordI;
#elif XLEN == 64
typedef uint32_t WordF;
#elif (XLEN == 64)
typedef uint64_t Word;
typedef int64_t WordI;
typedef __uint128_t DWord;
typedef __int128_t DWordI;
typedef uint64_t WordF;
#else
#error unsupported XLEN
#endif
typedef uint64_t FWord;
#define MAX_NUM_CORES 1024
#define MAX_NUM_THREADS 32
#define MAX_NUM_WARPS 32
#define MAX_NUM_REGS 32
typedef std::bitset<32> RegMask;
typedef std::bitset<32> ThreadMask;
typedef std::bitset<32> WarpMask;
typedef std::bitset<MAX_NUM_CORES> CoreMask;
typedef std::bitset<MAX_NUM_REGS> RegMask;
typedef std::bitset<MAX_NUM_THREADS> ThreadMask;
typedef std::bitset<MAX_NUM_WARPS> WarpMask;
typedef std::unordered_map<uint32_t, uint32_t> CSRs;
///////////////////////////////////////////////////////////////////////////////
@@ -40,8 +64,8 @@ enum class RegType {
Vector
};
inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
switch (clss) {
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
switch (type) {
case RegType::None: break;
case RegType::Integer: os << "x"; break;
case RegType::Float: os << "f"; break;
@@ -53,23 +77,19 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
///////////////////////////////////////////////////////////////////////////////
enum class ExeType {
NOP,
ALU,
LSU,
CSR,
FPU,
GPU,
SFU,
MAX,
};
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
switch (type) {
case ExeType::NOP: os << "NOP"; break;
case ExeType::ALU: os << "ALU"; break;
case ExeType::LSU: os << "LSU"; break;
case ExeType::CSR: os << "CSR"; break;
case ExeType::FPU: os << "FPU"; break;
case ExeType::GPU: os << "GPU"; break;
case ExeType::SFU: os << "SFU"; break;
case ExeType::MAX: break;
}
return os;
@@ -82,8 +102,7 @@ enum class AluType {
BRANCH,
SYSCALL,
IMUL,
IDIV,
CMOV,
IDIV
};
inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
@@ -93,7 +112,6 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
case AluType::SYSCALL: os << "SYSCALL"; break;
case AluType::IMUL: os << "IMUL"; break;
case AluType::IDIV: os << "IDIV"; break;
case AluType::CMOV: os << "CMOV"; break;
}
return os;
}
@@ -103,16 +121,14 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
enum class LsuType {
LOAD,
STORE,
FENCE,
PREFETCH,
FENCE
};
inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
switch (type) {
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
case LsuType::PREFETCH: os << "PREFETCH"; break;
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
}
return os;
}
@@ -141,21 +157,6 @@ struct mem_addr_size_t {
uint32_t size;
};
inline AddrType get_addr_type(Word addr, uint32_t size) {
__unused (size);
if (SM_ENABLE) {
if (addr >= (SMEM_BASE_ADDR - SMEM_SIZE)
&& addr < SMEM_BASE_ADDR) {
assert((addr + size) <= SMEM_BASE_ADDR);
return AddrType::Shared;
}
}
if (addr >= IO_BASE_ADDR) {
return AddrType::IO;
}
return AddrType::Global;
}
///////////////////////////////////////////////////////////////////////////////
enum class FpuType {
@@ -179,23 +180,37 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
///////////////////////////////////////////////////////////////////////////////
enum class GpuType {
enum class SfuType {
TMC,
WSPAWN,
SPLIT,
JOIN,
BAR,
PRED,
CSRRW,
CSRRS,
CSRRC,
TEX,
RASTER,
ROP,
CMOV
};
inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
switch (type) {
case GpuType::TMC: os << "TMC"; break;
case GpuType::WSPAWN: os << "WSPAWN"; break;
case GpuType::SPLIT: os << "SPLIT"; break;
case GpuType::JOIN: os << "JOIN"; break;
case GpuType::BAR: os << "BAR"; break;
case GpuType::TEX: os << "TEX"; break;
case SfuType::TMC: os << "TMC"; break;
case SfuType::WSPAWN: os << "WSPAWN"; break;
case SfuType::SPLIT: os << "SPLIT"; break;
case SfuType::JOIN: os << "JOIN"; break;
case SfuType::BAR: os << "BAR"; break;
case SfuType::PRED: os << "PRED"; break;
case SfuType::CSRRW: os << "CSRRW"; break;
case SfuType::CSRRS: os << "CSRRS"; break;
case SfuType::CSRRC: os << "CSRRC"; break;
case SfuType::TEX: os << "TEX"; break;
case SfuType::RASTER: os << "RASTER"; break;
case SfuType::ROP: os << "ROP"; break;
case SfuType::CMOV: os << "CMOV"; break;
}
return os;
}
@@ -218,31 +233,32 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
///////////////////////////////////////////////////////////////////////////////
struct MemReq {
uint64_t addr;
bool write;
bool non_cacheable;
uint32_t tag;
uint32_t core_id;
uint64_t uuid;
uint64_t addr;
bool write;
AddrType type;
uint32_t tag;
uint32_t cid;
uint64_t uuid;
MemReq(uint64_t _addr = 0,
bool _write = false,
bool _non_cacheable = false,
uint64_t _tag = 0,
uint32_t _core_id = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, non_cacheable(_non_cacheable)
, tag(_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
MemReq(uint64_t _addr = 0,
bool _write = false,
AddrType _type = AddrType::Global,
uint64_t _tag = 0,
uint32_t _cid = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, type(_type)
, tag(_tag)
, cid(_cid)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
os << "mem-" << (req.write ? "wr" : "rd") << ": ";
os << "addr=" << std::hex << req.addr << std::dec << ", tag=" << req.tag << ", core_id=" << req.core_id;
os << "addr=0x" << std::hex << req.addr << ", type=" << req.type;
os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid;
os << " (#" << std::dec << req.uuid << ")";
return os;
}
@@ -250,18 +266,19 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
///////////////////////////////////////////////////////////////////////////////
struct MemRsp {
uint64_t tag;
uint32_t core_id;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
: tag (_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
uint64_t tag;
uint32_t cid;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0)
: tag (_tag)
, cid(_cid)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id;
os << "mem-rsp: tag=" << rsp.tag << ", cid=" << rsp.cid;
os << " (#" << std::dec << rsp.uuid << ")";
return os;
}
@@ -270,10 +287,6 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
template <typename T>
class HashTable {
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t size_;
public:
HashTable(uint32_t capacity)
: entries_(capacity)
@@ -336,92 +349,180 @@ public:
}
size_ = 0;
}
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t size_;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
template <typename Req, typename Rsp>
class Switch : public SimObject<Switch<Req, Rsp>> {
private:
ArbiterType type_;
uint32_t delay_;
uint32_t cursor_;
uint32_t tag_shift_;
public:
std::vector<SimPort<Req>> ReqIn;
std::vector<SimPort<Rsp>> RspIn;
std::vector<SimPort<Req>> ReqOut;
std::vector<SimPort<Rsp>> RspOut;
Switch(
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs,
uint32_t num_inputs = 1,
uint32_t num_outputs = 1,
uint32_t delay = 1
)
: SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)
: SimObject<Switch<Req, Rsp>>(ctx, name)
, ReqIn(num_inputs, this)
, RspIn(num_inputs, this)
, ReqOut(num_outputs, this)
, RspOut(num_outputs, this)
, type_(type)
, delay_(delay)
, cursor_(0)
, tag_shift_(log2ceil(num_inputs))
, ReqIn(num_inputs, this)
, ReqOut(this)
, RspIn(this)
, RspOut(num_inputs, this)
, cursors_(num_outputs, 0)
, lg_num_reqs_(log2ceil(num_inputs / num_outputs))
{
assert(delay_ != 0);
assert(num_inputs <= MaxInputs);
if (num_inputs == 1) {
// bypass
ReqIn.at(0).bind(&ReqOut);
RspIn.bind(&RspOut.at(0));
assert(delay != 0);
assert(num_inputs <= 32);
assert(num_outputs <= 32);
assert(num_inputs >= num_outputs);
if (num_inputs == num_outputs) {
// bypass mode
for (uint32_t i = 0; i < num_inputs; ++i) {
ReqIn.at(i).bind(&ReqOut.at(i));
RspOut.at(i).bind(&RspIn.at(i));
}
}
}
void reset() {
cursor_ = 0;
for (auto& cursor : cursors_) {
cursor = 0;
}
}
void tick() {
if (ReqIn.size() == 1)
void tick() {
uint32_t I = ReqIn.size();
uint32_t O = ReqOut.size();
uint32_t R = 1 << lg_num_reqs_;
// skip bypass mode
if (I == O)
return;
// process incomming requests
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
uint32_t j = (cursor_ + i) % n;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (tag_shift_) {
req.tag = (req.tag << tag_shift_) | j;
// process incomming requests
for (uint32_t o = 0; o < O; ++o) {
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (cursors_.at(o) + r) & (R-1);
uint32_t j = o * R + i;
if (j >= I)
continue;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (lg_num_reqs_ != 0) {
req.tag = (req.tag << lg_num_reqs_) | i;
}
DT(4, this->name() << "-" << req);
ReqOut.at(o).send(req, delay_);
req_in.pop();
this->update_cursor(o, i);
break;
}
ReqOut.send(req, delay_);
req_in.pop();
this->update_cursor(j);
break;
}
}
// process incoming reponses
if (!RspIn.empty()) {
auto& rsp = RspIn.front();
uint32_t port_id = 0;
if (tag_shift_) {
port_id = rsp.tag & ((1 << tag_shift_)-1);
rsp.tag >>= tag_shift_;
}
RspOut.at(port_id).send(rsp, 1);
RspIn.pop();
// process incoming reponses
if (!RspOut.at(o).empty()) {
auto& rsp = RspOut.at(o).front();
uint32_t i = 0;
if (lg_num_reqs_ != 0) {
i = rsp.tag & (R-1);
rsp.tag >>= lg_num_reqs_;
}
DT(4, this->name() << "-" << rsp);
uint32_t j = o * R + i;
RspIn.at(j).send(rsp, 1);
RspOut.at(o).pop();
}
}
}
void update_cursor(uint32_t grant) {
void update_cursor(uint32_t index, uint32_t grant) {
if (type_ == ArbiterType::RoundRobin) {
cursor_ = grant + 1;
cursors_.at(index) = grant + 1;
}
}
std::vector<SimPort<Req>> ReqIn;
SimPort<Req> ReqOut;
SimPort<Rsp> RspIn;
std::vector<SimPort<Rsp>> RspOut;
private:
ArbiterType type_;
uint32_t delay_;
std::vector<uint32_t> cursors_;
uint32_t lg_num_reqs_;
};
///////////////////////////////////////////////////////////////////////////////
class SMemDemux : public SimObject<SMemDemux> {
public:
SimPort<MemReq> ReqIn;
SimPort<MemRsp> RspIn;
SimPort<MemReq> ReqSm;
SimPort<MemRsp> RspSm;
SimPort<MemReq> ReqDc;
SimPort<MemRsp> RspDc;
SMemDemux(
const SimContext& ctx,
const char* name,
uint32_t delay = 1
) : SimObject<SMemDemux>(ctx, name)
, ReqIn(this)
, RspIn(this)
, ReqSm(this)
, RspSm(this)
, ReqDc(this)
, RspDc(this)
, delay_(delay)
{}
void reset() {}
void tick() {
// process incomming requests
if (!ReqIn.empty()) {
auto& req = ReqIn.front();
DT(4, this->name() << "-" << req);
if (req.type == AddrType::Shared) {
ReqSm.send(req, delay_);
} else {
ReqDc.send(req, delay_);
}
ReqIn.pop();
}
// process incoming reponses
if (!RspSm.empty()) {
auto& rsp = RspSm.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspSm.pop();
}
if (!RspDc.empty()) {
auto& rsp = RspDc.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspDc.pop();
}
}
private:
uint32_t delay_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
@@ -10,21 +23,25 @@
using namespace vortex;
Warp::Warp(Core *core, uint32_t id)
: id_(id)
Warp::Warp(Core *core, uint32_t warp_id)
: warp_id_(warp_id)
, arch_(core->arch())
, core_(core)
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<FWord>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<uint64_t>(core->arch().num_regs()))
, vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
{
this->clear();
this->reset();
}
void Warp::clear() {
active_ = false;
PC_ = STARTUP_ADDR;
void Warp::reset() {
PC_ = core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
#if (XLEN == 64)
PC_ = (uint64_t(core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | PC_;
#endif
tmask_.reset();
for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i) {
issued_instrs_ = 0;
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i) {
for (auto& reg : ireg_file_.at(i)) {
reg = 0;
}
@@ -35,31 +52,44 @@ void Warp::clear() {
reg = 0;
}
}
uui_gen_.reset();
}
void Warp::eval(pipeline_trace_t *trace) {
pipeline_trace_t* Warp::eval() {
assert(tmask_.any());
DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i)
DPN(2, tmask_.test(n-i-1));
DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl);
/* Fetch and decode. */
#ifndef NDEBUG
uint32_t instr_uuid = uui_gen_.get_uuid(PC_);
uint32_t g_wid = core_->id() * arch_.num_warps() + warp_id_;
uint32_t instr_id = instr_uuid & 0xffff;
uint32_t instr_ref = instr_uuid >> 16;
uint64_t uuid = (uint64_t(instr_ref) << 32) | (g_wid << 16) | instr_id;
#else
uint64_t uuid = 0;
#endif
DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
DPN(1, tmask_.test(i));
DPN(1, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << uuid << ")" << std::endl);
// Fetch
uint32_t instr_code = 0;
core_->icache_read(&instr_code, PC_, sizeof(uint32_t));
auto instr = core_->decoder().decode(instr_code);
// Decode
auto instr = core_->decoder_.decode(instr_code);
if (!instr) {
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << PC_ << " (#" << std::dec << uuid << ")" << std::endl;
std::abort();
}
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);
// Update trace
// Create trace
auto trace = new pipeline_trace_t(uuid, arch_);
trace->cid = core_->id();
trace->wid = id_;
trace->wid = warp_id_;
trace->PC = PC_;
trace->tmask = tmask_;
trace->rdest = instr->getRDest();
@@ -68,18 +98,20 @@ void Warp::eval(pipeline_trace_t *trace) {
// Execute
this->execute(*instr, trace);
DP(4, "Register state:");
for (uint32_t i = 0; i < core_->arch().num_regs(); ++i) {
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
DP(5, "Register state:");
for (uint32_t i = 0; i < arch_.num_regs(); ++i) {
DPN(5, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
// Integer register file
for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, '|');
DPN(5, '|');
// Floating point register file
for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, std::endl);
DPN(5, std::endl);
}
return trace;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __WARP_H
#define __WARP_H
@@ -7,28 +20,26 @@
namespace vortex {
class Arch;
class Core;
class Instr;
class pipeline_trace_t;
struct DomStackEntry {
DomStackEntry(const ThreadMask &tmask, Word PC)
: tmask(tmask)
, PC(PC)
, fallThrough(false)
, unanimous(false)
, fallthrough(false)
{}
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, PC(0)
, fallThrough(true)
, unanimous(false)
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, fallthrough(true)
{}
ThreadMask tmask;
Word PC;
bool fallThrough;
bool unanimous;
bool fallthrough;
};
struct vtype {
@@ -40,72 +51,58 @@ struct vtype {
class Warp {
public:
Warp(Core *core, uint32_t id);
Warp(Core *core, uint32_t warp_id);
void clear();
bool active() const {
return active_;
}
void suspend() {
active_ = false;
}
void activate() {
active_ = true;
}
std::size_t getActiveThreads() const {
if (active_)
return tmask_.count();
return 0;
}
void reset();
uint32_t id() const {
return id_;
return warp_id_;
}
uint32_t getPC() const {
Word getPC() const {
return PC_;
}
void setPC(uint32_t PC) {
void setPC(Word PC) {
PC_ = PC;
}
void setTmask(size_t index, bool value) {
tmask_.set(index, value);
active_ = tmask_.any();
}
uint32_t getTmask() const {
if (active_)
return tmask_.to_ulong();
return 0;
uint64_t getTmask() const {
return tmask_.to_ulong();
}
uint32_t getIRegValue(uint32_t reg) const {
Word getIRegValue(uint32_t reg) const {
return ireg_file_.at(0).at(reg);
}
void eval(pipeline_trace_t *);
uint64_t incr_instrs() {
return issued_instrs_++;
}
pipeline_trace_t* eval();
private:
void execute(const Instr &instr, pipeline_trace_t *trace);
UUIDGenerator uui_gen_;
uint32_t id_;
uint32_t warp_id_;
const Arch& arch_;
Core *core_;
bool active_;
uint64_t issued_instrs_;
Word PC_;
ThreadMask tmask_;
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<FWord>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
std::stack<DomStackEntry> dom_stack_;
ThreadMask tmask_;
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<uint64_t>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
std::stack<DomStackEntry> ipdom_stack_;
struct vtype vtype_;
uint32_t vl_;

View File

@@ -1,101 +0,0 @@
DESTDIR ?= .
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
SCRIPT_DIR = ../../hw/scripts
THIRD_PARTY_DIR = ../../third_party
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I.. -I../../../hw -I../../common
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR)
LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator
# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_MEM
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_FLAGS += $(DBG_TRACE_FLAGS)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += fpga.cpp opae_sim.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
TOP = vortex_afu_shim
VL_FLAGS = --exe --cc $(TOP) --top-module $(TOP)
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO -Wno-EOFNEWLINE
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += $(CONFIGS)
CXXFLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())')
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugigng
ifdef DEBUG
VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS)
CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CXXFLAGS += -O2 -DNDEBUG
endif
# Enable scope analyzer
ifdef SCOPE
VL_FLAGS += -DSCOPE
CXXFLAGS += -DSCOPE
endif
# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CXXFLAGS += -DPERF_ENABLE
endif
# use our OPAE shim
VL_FLAGS += -DNOPAE
CXXFLAGS += -DNOPAE
# ALU backend
VL_FLAGS += -DIMUL_DPI
VL_FLAGS += -DIDIV_DPI
# FPU backend
FPU_CORE ?= FPU_DPI
VL_FLAGS += -D$(FPU_CORE)
PROJECT = libopae-c-vlsim.so
all: $(PROJECT)
vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h
$(DESTDIR)/$(PROJECT): $(SRCS) vortex_afu.h
verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)
clean:
rm -rf obj_dir $(DESTDIR)/$(PROJECT)

View File

@@ -1,30 +0,0 @@
#pragma once
#include <stdint.h>
namespace vortex {
class RAM;
class opae_sim {
public:
opae_sim();
virtual ~opae_sim();
int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
void release_buffer(uint64_t wsid);
void get_io_address(uint64_t wsid, uint64_t *ioaddr);
void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value);
void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value);
private:
class Impl;
Impl* impl_;
};
}

View File

@@ -1,10 +0,0 @@
`verilator_config
lint_off -rule BLKANDNBLK -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule UNOPTFLAT -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule WIDTH -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule UNUSED -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule LITENDIAN -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule IMPORTSTAR -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule PINCONNECTEMPTY -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -file "../rtl/fp_cores/fpnew/*"