Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactoring

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

View File

@@ -1,9 +1,9 @@
all:
$(MAKE) -C simx
$(MAKE) -C rtlsim
$(MAKE) -C vlsim
$(MAKE) -C opaesim
clean:
$(MAKE) -C simx clean
$(MAKE) -C rtlsim clean
$(MAKE) -C vlsim clean
$(MAKE) -C opaesim clean

View File

@@ -1,7 +1,19 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <algorithm>
#include <assert.h>
constexpr uint32_t count_leading_zeros(uint32_t value) {
@@ -77,5 +89,15 @@ T sext(const T& word, uint32_t width) {
if (width == (sizeof(T) * 8))
return word;
T mask((static_cast<T>(1) << width) - 1);
return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : word;
}
return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : (word & mask);
}
// Zero-extends the low `width` bits of `word`: every bit at position `width`
// and above is cleared. `width` must be greater than 1 and no larger than the
// bit size of T; when it equals the bit size of T the value is returned as is.
template <typename T = uint32_t>
T zext(const T& word, uint32_t width) {
  constexpr uint32_t kTypeBits = sizeof(T) * 8;
  assert(width > 1);
  assert(width <= kTypeBits);
  if (width != kTypeBits) {
    // The shift is only evaluated for width < kTypeBits, so it cannot overflow T.
    const T low_bits = (static_cast<T>(1) << width) - 1;
    return word & low_bits;
  }
  return word;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mem.h"
#include <vector>
#include <iostream>
@@ -20,8 +33,9 @@ RamMemDevice::RamMemDevice(const char *filename, uint32_t wordSize)
contents_.push_back(input.get());
} while (input);
while (contents_.size() & (wordSize-1))
while (contents_.size() & (wordSize-1)) {
contents_.push_back(0x00);
}
}
RamMemDevice::RamMemDevice(uint64_t size, uint32_t wordSize)
@@ -29,7 +43,7 @@ RamMemDevice::RamMemDevice(uint64_t size, uint32_t wordSize)
, wordSize_(wordSize)
{}
void RamMemDevice::read(void *data, uint64_t addr, uint64_t size) {
void RamMemDevice::read(void* data, uint64_t addr, uint64_t size) {
auto addr_end = addr + size;
if ((addr & (wordSize_-1))
|| (addr_end & (wordSize_-1))
@@ -44,7 +58,7 @@ void RamMemDevice::read(void *data, uint64_t addr, uint64_t size) {
}
}
void RamMemDevice::write(const void *data, uint64_t addr, uint64_t size) {
void RamMemDevice::write(const void* data, uint64_t addr, uint64_t size) {
auto addr_end = addr + size;
if ((addr & (wordSize_-1))
|| (addr_end & (wordSize_-1))
@@ -68,26 +82,26 @@ void RomMemDevice::write(const void* /*data*/, uint64_t /*addr*/, uint64_t /*siz
///////////////////////////////////////////////////////////////////////////////
bool MemoryUnit::ADecoder::lookup(uint64_t a, uint32_t wordSize, mem_accessor_t* ma) {
uint64_t e = a + (wordSize - 1);
assert(e >= a);
bool MemoryUnit::ADecoder::lookup(uint64_t addr, uint32_t wordSize, mem_accessor_t* ma) {
uint64_t end = addr + (wordSize - 1);
assert(end >= addr);
for (auto iter = entries_.rbegin(), iterE = entries_.rend(); iter != iterE; ++iter) {
if (a >= iter->start && e <= iter->end) {
if (addr >= iter->start && end <= iter->end) {
ma->md = iter->md;
ma->addr = a - iter->start;
ma->addr = addr - iter->start;
return true;
}
}
return false;
}
void MemoryUnit::ADecoder::map(uint64_t a, uint64_t e, MemDevice &m) {
assert(e >= a);
entry_t entry{&m, a, e};
void MemoryUnit::ADecoder::map(uint64_t start, uint64_t end, MemDevice &md) {
assert(end >= start);
entry_t entry{&md, start, end};
entries_.emplace_back(entry);
}
void MemoryUnit::ADecoder::read(void *data, uint64_t addr, uint64_t size) {
void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) {
mem_accessor_t ma;
if (!this->lookup(addr, size, &ma)) {
std::cout << "lookup of 0x" << std::hex << addr << " failed.\n";
@@ -96,7 +110,7 @@ void MemoryUnit::ADecoder::read(void *data, uint64_t addr, uint64_t size) {
ma.md->read(data, ma.addr, size);
}
void MemoryUnit::ADecoder::write(const void *data, uint64_t addr, uint64_t size) {
void MemoryUnit::ADecoder::write(const void* data, uint64_t addr, uint64_t size) {
mem_accessor_t ma;
if (!this->lookup(addr, size, &ma)) {
std::cout << "lookup of 0x" << std::hex << addr << " failed.\n";
@@ -107,11 +121,11 @@ void MemoryUnit::ADecoder::write(const void *data, uint64_t addr, uint64_t size)
///////////////////////////////////////////////////////////////////////////////
MemoryUnit::MemoryUnit(uint64_t pageSize, uint64_t addrBytes, bool disableVm)
MemoryUnit::MemoryUnit(uint64_t pageSize)
: pageSize_(pageSize)
, addrBytes_(addrBytes)
, disableVM_(disableVm) {
if (!disableVm) {
, enableVM_(pageSize != 0)
, amo_reservation_({0x0, false}) {
if (pageSize != 0) {
tlb_[0] = TLBEntry(0, 077);
}
}
@@ -133,30 +147,38 @@ MemoryUnit::TLBEntry MemoryUnit::tlbLookup(uint64_t vAddr, uint32_t flagMask) {
}
}
void MemoryUnit::read(void *data, uint64_t addr, uint64_t size, bool sup) {
uint64_t MemoryUnit::toPhyAddr(uint64_t addr, uint32_t flagMask) {
uint64_t pAddr;
if (disableVM_) {
pAddr = addr;
} else {
uint32_t flagMask = sup ? 8 : 1;
if (enableVM_) {
TLBEntry t = this->tlbLookup(addr, flagMask);
pAddr = t.pfn * pageSize_ + addr % pageSize_;
} else {
pAddr = addr;
}
return pAddr;
}
// Reads `size` bytes at virtual address `addr` into `data`.
// The address is first translated with toPhyAddr(), using permission mask 8
// for supervisor accesses (`sup` set) and 1 otherwise; the translated address
// is then handed to the address decoder.
void MemoryUnit::read(void* data, uint64_t addr, uint64_t size, bool sup) {
  const uint32_t access_mask = sup ? 8 : 1;
  const uint64_t phys_addr = this->toPhyAddr(addr, access_mask);
  decoder_.read(data, phys_addr, size);
}
void MemoryUnit::write(const void *data, uint64_t addr, uint64_t size, bool sup) {
uint64_t pAddr;
if (disableVM_) {
pAddr = addr;
} else {
uint32_t flagMask = sup ? 16 : 2;
TLBEntry t = tlbLookup(addr, flagMask);
pAddr = t.pfn * pageSize_ + addr % pageSize_;
}
// Writes `size` bytes from `data` to virtual address `addr`.
// The address is translated with toPhyAddr() using a permission mask picked by
// privilege: 16 for supervisor accesses (`sup` set) and 2 for user accesses.
// The read path uses 8 / 1, so passing 1 here would test the same bit as a
// user read instead of a distinct write bit.
// NOTE(review): bit meanings (1/2 = user read/write, 8/16 = supervisor
// read/write) are inferred from the read/write call sites — confirm against
// the TLB flag layout.
// Every write also invalidates the outstanding AMO reservation.
void MemoryUnit::write(const void* data, uint64_t addr, uint64_t size, bool sup) {
  const uint32_t write_mask = sup ? 16 : 2;
  const uint64_t pAddr = this->toPhyAddr(addr, write_mask);
  decoder_.write(data, pAddr, size);
  amo_reservation_.valid = false;
}
// Records a reservation on `addr`: the address is translated with
// toPhyAddr() (permission mask 1) and stored, and the reservation is marked
// valid. Replaces whatever reservation was held before.
void MemoryUnit::amo_reserve(uint64_t addr) {
  const uint64_t reserved_phys = this->toPhyAddr(addr, 1);
  amo_reservation_.valid = true;
  amo_reservation_.addr = reserved_phys;
}
// Returns true when a valid reservation exists and its stored address equals
// the translation of `addr` (toPhyAddr() with permission mask 1).
// The translation is performed unconditionally, even if no reservation is held.
bool MemoryUnit::amo_check(uint64_t addr) {
  const uint64_t probe_phys = this->toPhyAddr(addr, 1);
  if (!amo_reservation_.valid) {
    return false;
  }
  return probe_phys == amo_reservation_.addr;
}
// Installs (or overwrites) the TLB entry for the page containing `virt`.
// Both addresses are reduced to page numbers by dividing by pageSize_; the
// entry maps the virtual page number to the physical one with `flags`.
void MemoryUnit::tlbAdd(uint64_t virt, uint64_t phys, uint32_t flags) {
  const uint64_t virt_page = virt / pageSize_;
  const uint64_t phys_page = phys / pageSize_;
  tlb_[virt_page] = TLBEntry(phys_page, flags);
}
@@ -168,12 +190,14 @@ void MemoryUnit::tlbRm(uint64_t va) {
///////////////////////////////////////////////////////////////////////////////
RAM::RAM(uint32_t page_size)
: size_(0)
RAM::RAM(uint32_t page_size, uint64_t capacity)
: capacity_(capacity)
, page_bits_(log2ceil(page_size))
, last_page_(nullptr)
, last_page_index_(0) {
assert(ispow2(page_size));
assert(0 == capacity || ispow2(capacity));
assert(0 == (capacity % page_size));
}
RAM::~RAM() {
@@ -191,6 +215,9 @@ uint64_t RAM::size() const {
}
uint8_t *RAM::get(uint64_t address) const {
if (capacity_ != 0 && address >= capacity_) {
throw OutOfRange();
}
uint32_t page_size = 1 << page_bits_;
uint32_t page_offset = address & (page_size - 1);
uint64_t page_index = address >> page_bits_;
@@ -218,14 +245,14 @@ uint8_t *RAM::get(uint64_t address) const {
return page + page_offset;
}
void RAM::read(void *data, uint64_t addr, uint64_t size) {
void RAM::read(void* data, uint64_t addr, uint64_t size) {
uint8_t* d = (uint8_t*)data;
for (uint64_t i = 0; i < size; i++) {
d[i] = *this->get(addr + i);
}
}
void RAM::write(const void *data, uint64_t addr, uint64_t size) {
void RAM::write(const void* data, uint64_t addr, uint64_t size) {
const uint8_t* d = (const uint8_t*)data;
for (uint64_t i = 0; i < size; i++) {
*this->get(addr + i) = d[i];
@@ -236,6 +263,7 @@ void RAM::loadBinImage(const char* filename, uint64_t destination) {
std::ifstream ifs(filename);
if (!ifs) {
std::cout << "error: " << filename << " not found" << std::endl;
std::abort();
}
ifs.seekg(0, ifs.end);
@@ -268,6 +296,7 @@ void RAM::loadHexImage(const char* filename) {
std::ifstream ifs(filename);
if (!ifs) {
std::cout << "error: " << filename << " not found" << std::endl;
std::abort();
}
ifs.seekg(0, ifs.end);
@@ -313,4 +342,4 @@ void RAM::loadHexImage(const char* filename) {
++line;
--size;
}
}
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
@@ -7,13 +20,14 @@
namespace vortex {
struct BadAddress {};
struct OutOfRange {};
class MemDevice {
public:
virtual ~MemDevice() {}
virtual uint64_t size() const = 0;
virtual void read(void *data, uint64_t addr, uint64_t size) = 0;
virtual void write(const void *data, uint64_t addr, uint64_t size) = 0;
virtual void read(void* data, uint64_t addr, uint64_t size) = 0;
virtual void write(const void* data, uint64_t addr, uint64_t size) = 0;
};
///////////////////////////////////////////////////////////////////////////////
@@ -21,11 +35,11 @@ public:
class RamMemDevice : public MemDevice {
public:
RamMemDevice(uint64_t size, uint32_t wordSize);
RamMemDevice(const char *filename, uint32_t wordSize);
RamMemDevice(const char* filename, uint32_t wordSize);
~RamMemDevice() {}
void read(void *data, uint64_t addr, uint64_t size) override;
void write(const void *data, uint64_t addr, uint64_t size) override;
void read(void* data, uint64_t addr, uint64_t size) override;
void write(const void* data, uint64_t addr, uint64_t size) override;
virtual uint64_t size() const {
return contents_.size();
@@ -50,7 +64,7 @@ public:
~RomMemDevice();
void write(const void *data, uint64_t addr, uint64_t size) override;
void write(const void* data, uint64_t addr, uint64_t size) override;
};
///////////////////////////////////////////////////////////////////////////////
@@ -63,47 +77,56 @@ public:
: faultAddr(a)
, notFound(nf)
{}
uint64_t faultAddr;
bool notFound;
uint64_t faultAddr;
bool notFound;
};
MemoryUnit(uint64_t pageSize, uint64_t addrBytes, bool disableVm = false);
MemoryUnit(uint64_t pageSize = 0);
void attach(MemDevice &m, uint64_t start, uint64_t end);
void read(void *data, uint64_t addr, uint64_t size, bool sup);
void write(const void *data, uint64_t addr, uint64_t size, bool sup);
void read(void* data, uint64_t addr, uint64_t size, bool sup);
void write(const void* data, uint64_t addr, uint64_t size, bool sup);
void amo_reserve(uint64_t addr);
bool amo_check(uint64_t addr);
void tlbAdd(uint64_t virt, uint64_t phys, uint32_t flags);
void tlbRm(uint64_t va);
void tlbRm(uint64_t vaddr);
void tlbFlush() {
tlb_.clear();
}
private:
struct amo_reservation_t {
uint64_t addr;
bool valid;
};
class ADecoder {
public:
ADecoder() {}
void read(void *data, uint64_t addr, uint64_t size);
void write(const void *data, uint64_t addr, uint64_t size);
void read(void* data, uint64_t addr, uint64_t size);
void write(const void* data, uint64_t addr, uint64_t size);
void map(uint64_t start, uint64_t end, MemDevice &md);
private:
struct mem_accessor_t {
MemDevice* md;
uint64_t addr;
MemDevice* md;
uint64_t addr;
};
struct entry_t {
MemDevice *md;
uint64_t start;
uint64_t end;
MemDevice* md;
uint64_t start;
uint64_t end;
};
bool lookup(uint64_t a, uint32_t wordSize, mem_accessor_t*);
bool lookup(uint64_t addr, uint32_t wordSize, mem_accessor_t*);
std::vector<entry_t> entries_;
};
@@ -120,11 +143,14 @@ private:
TLBEntry tlbLookup(uint64_t vAddr, uint32_t flagMask);
uint64_t toPhyAddr(uint64_t vAddr, uint32_t flagMask);
std::unordered_map<uint64_t, TLBEntry> tlb_;
uint64_t pageSize_;
uint64_t addrBytes_;
ADecoder decoder_;
bool disableVM_;
uint64_t pageSize_;
ADecoder decoder_;
bool enableVM_;
amo_reservation_t amo_reservation_;
};
///////////////////////////////////////////////////////////////////////////////
@@ -132,15 +158,15 @@ private:
class RAM : public MemDevice {
public:
RAM(uint32_t page_size);
RAM(uint32_t page_size, uint64_t capacity = 0);
~RAM();
void clear();
uint64_t size() const override;
void read(void *data, uint64_t addr, uint64_t size) override;
void write(const void *data, uint64_t addr, uint64_t size) override;
void read(void* data, uint64_t addr, uint64_t size) override;
void write(const void* data, uint64_t addr, uint64_t size) override;
void loadBinImage(const char* filename, uint64_t destination);
void loadHexImage(const char* filename);
@@ -157,11 +183,11 @@ private:
uint8_t *get(uint64_t address) const;
uint64_t size_;
uint64_t capacity_;
uint32_t page_bits_;
mutable std::unordered_map<uint64_t, uint8_t*> pages_;
mutable uint8_t* last_page_;
mutable uint64_t last_page_index_;
};
} // namespace vortex
} // namespace vortex

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stack>
@@ -18,8 +31,9 @@ public:
void* allocate() {
void* mem;
if (!free_list_.empty()) {
mem = static_cast<void*>(free_list_.top());
auto entry = free_list_.top();
free_list_.pop();
mem = static_cast<void*>(entry);
} else {
mem = ::operator new(sizeof(T));
}
@@ -36,12 +50,13 @@ public:
void flush() {
while (!free_list_.empty()) {
::operator delete(free_list_.top());
auto entry = free_list_.top();
free_list_.pop();
::operator delete(entry);
}
}
private:
std::stack<void*> free_list_;
std::stack<T*> free_list_;
uint32_t max_size_;
};
};

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "rvfloats.h"
#include <stdio.h>
@@ -16,12 +29,9 @@ inline float64_t to_float64_t(uint64_t x) { return float64_t{x}; }
inline uint32_t from_float32_t(float32_t x) { return uint32_t(x.v); }
inline uint64_t from_float64_t(float64_t x) { return uint64_t(x.v); }
inline uint32_t get_fflags() {
uint32_t fflags = softfloat_exceptionFlags;
if (fflags) {
softfloat_exceptionFlags = 0;
}
return fflags;
// Prepares the softfloat state for one operation: selects rounding mode `frm`
// and clears the accumulated exception flags, so that the flags read after the
// operation reflect only that operation.
inline void rv_init(uint32_t frm) {
  softfloat_roundingMode = frm;
  softfloat_exceptionFlags = 0;
}
#ifdef __cplusplus
@@ -29,289 +39,296 @@ extern "C" {
#endif
uint32_t rv_fadd_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_add(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fadd_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_add(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fsub_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_sub(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fsub_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_sub(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fmul_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_mul(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fmul_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_mul(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_mulAdd(to_float32_t(a), to_float32_t(b), to_float32_t(c));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_mulAdd(to_float64_t(a), to_float64_t(b), to_float64_t(c));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto c_neg = c ^ F32_SIGN;
auto r = f32_mulAdd(to_float32_t(a), to_float32_t(b), to_float32_t(c_neg));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto c_neg = c ^ F64_SIGN;
auto r = f64_mulAdd(to_float64_t(a), to_float64_t(b), to_float64_t(c_neg));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fnmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto a_neg = a ^ F32_SIGN;
auto c_neg = c ^ F32_SIGN;
auto r = f32_mulAdd(to_float32_t(a_neg), to_float32_t(b), to_float32_t(c_neg));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fnmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto a_neg = a ^ F64_SIGN;
auto c_neg = c ^ F64_SIGN;
auto r = f64_mulAdd(to_float64_t(a_neg), to_float64_t(b), to_float64_t(c_neg));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fnmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto a_neg = a ^ F32_SIGN;
auto r = f32_mulAdd(to_float32_t(a_neg), to_float32_t(b), to_float32_t(c));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fnmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto a_neg = a ^ F64_SIGN;
auto r = f64_mulAdd(to_float64_t(a_neg), to_float64_t(b), to_float64_t(c));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fdiv_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_div(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_div(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_sqrt(to_float32_t(a));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_fsqrt_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_sqrt(to_float64_t(a));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_ftoi_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_to_i32(to_float32_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_ftoi_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_to_i32(to_float64_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_ftou_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_to_ui32(to_float32_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_ftou_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_to_ui32(to_float64_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_ftol_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_to_i64(to_float32_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_ftol_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_to_i64(to_float64_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_ftolu_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f32_to_ui64(to_float32_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_ftolu_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = f64_to_ui64(to_float64_t(a), frm, true);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_itof_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = i32_to_f32(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_itof_d(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = i32_to_f64(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_utof_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = ui32_to_f32(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_utof_d(uint32_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = ui32_to_f64(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_ltof_s(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = i64_to_f32(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_ltof_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = i64_to_f64(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
uint32_t rv_lutof_s(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = ui64_to_f32(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float32_t(r);
}
uint64_t rv_lutof_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
softfloat_roundingMode = frm;
rv_init(frm);
auto r = ui64_to_f64(a);
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return from_float64_t(r);
}
bool rv_flt_s(uint32_t a, uint32_t b, uint32_t* fflags) {
rv_init(0);
auto r = f32_lt(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
bool rv_flt_d(uint64_t a, uint64_t b, uint32_t* fflags) {
rv_init(0);
auto r = f64_lt(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
bool rv_fle_s(uint32_t a, uint32_t b, uint32_t* fflags) {
rv_init(0);
auto r = f32_le(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
bool rv_fle_d(uint64_t a, uint64_t b, uint32_t* fflags) {
rv_init(0);
auto r = f64_le(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
bool rv_feq_s(uint32_t a, uint32_t b, uint32_t* fflags) {
rv_init(0);
auto r = f32_eq(to_float32_t(a), to_float32_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
bool rv_feq_d(uint64_t a, uint64_t b, uint32_t* fflags) {
rv_init(0);
auto r = f64_eq(to_float64_t(a), to_float64_t(b));
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) {
uint32_t r;
rv_init(0);
if (isNaNF32UI(a) && isNaNF32UI(b)) {
r = defaultNaNF32UI;
} else {
@@ -324,12 +341,13 @@ uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) {
r = b;
}
}
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags) {
uint64_t r;
rv_init(0);
if (isNaNF64UI(a) && isNaNF64UI(b)) {
r = defaultNaNF64UI;
} else {
@@ -342,12 +360,13 @@ uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags) {
r = b;
}
}
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint32_t rv_fmax_s(uint32_t a, uint32_t b, uint32_t* fflags) {
uint32_t r;
rv_init(0);
if (isNaNF32UI(a) && isNaNF32UI(b)) {
r = defaultNaNF32UI;
} else {
@@ -360,12 +379,13 @@ uint32_t rv_fmax_s(uint32_t a, uint32_t b, uint32_t* fflags) {
r = b;
}
}
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}
uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags) {
uint64_t r;
rv_init(0);
if (isNaNF64UI(a) && isNaNF64UI(b)) {
r = defaultNaNF64UI;
} else {
@@ -378,7 +398,7 @@ uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags) {
r = b;
}
}
if (fflags) { *fflags = get_fflags(); }
if (fflags) { *fflags = softfloat_exceptionFlags; }
return r;
}

View File

@@ -1,5 +1,17 @@
#ifndef RVFLOATS_H
#define RVFLOATS_H
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
@@ -78,5 +90,3 @@ uint64_t rv_ftod(uint32_t a);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
@@ -84,33 +97,39 @@ public:
}
uint64_t pop() {
auto cycle = queue_.front().cycle;
auto cycles = queue_.front().cycles;
queue_.pop();
return cycle;
return cycles;
}
void tx_callback(const TxCallback& callback) {
tx_cb_ = callback;
}
// Returns the `cycles` value stored with the packet at the front of the
// queue, or 0 when the queue is empty.
uint64_t arrival_time() const {
  return queue_.empty() ? uint64_t(0) : queue_.front().cycles;
}
protected:
struct timed_pkt_t {
Pkt pkt;
uint64_t cycle;
uint64_t cycles;
};
std::queue<timed_pkt_t> queue_;
SimPort* peer_;
TxCallback tx_cb_;
void push(const Pkt& data, uint64_t cycle) {
void push(const Pkt& data, uint64_t cycles) {
if (tx_cb_) {
tx_cb_(data, cycle);
tx_cb_(data, cycles);
}
if (peer_) {
peer_->push(data, cycle);
peer_->push(data, cycles);
} else {
queue_.push({data, cycle});
queue_.push({data, cycles});
}
}
@@ -129,14 +148,14 @@ public:
virtual void fire() const = 0;
uint64_t time() const {
return time_;
uint64_t cycles() const {
return cycles_;
}
protected:
SimEventBase(uint64_t time) : time_(time) {}
SimEventBase(uint64_t cycles) : cycles_(cycles) {}
uint64_t time_;
uint64_t cycles_;
};
///////////////////////////////////////////////////////////////////////////////
@@ -150,8 +169,8 @@ public:
typedef std::function<void (const Pkt&)> Func;
SimCallEvent(const Func& func, const Pkt& pkt, uint64_t time)
: SimEventBase(time)
SimCallEvent(const Func& func, const Pkt& pkt, uint64_t cycles)
: SimEventBase(cycles)
, func_(func)
, pkt_(pkt)
{}
@@ -180,11 +199,11 @@ template <typename Pkt>
class SimPortEvent : public SimEventBase {
public:
void fire() const override {
const_cast<SimPort<Pkt>*>(port_)->push(pkt_, time_);
const_cast<SimPort<Pkt>*>(port_)->push(pkt_, cycles_);
}
SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t time)
: SimEventBase(time)
SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t cycles)
: SimEventBase(cycles)
, port_(port)
, pkt_(pkt)
{}
@@ -330,7 +349,7 @@ public:
auto evt_it_end = events_.end();
while (evt_it != evt_it_end) {
auto& event = *evt_it;
if (cycles_ >= event->time()) {
if (cycles_ >= event->cycles()) {
event->fire();
evt_it = events_.erase(evt_it);
} else {
@@ -395,5 +414,5 @@ void SimPort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
reinterpret_cast<const SimPort<Pkt>*>(peer_)->send(pkt, delay);
} else {
SimPlatform::instance().schedule(this, pkt, delay);
}
}
}
}

78
sim/common/stringutil.h Normal file
View File

@@ -0,0 +1,78 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <iomanip>
// Non-owning view over a raw byte buffer that can be written to an
// std::ostream as a hexadecimal string.
// The bytes are emitted from the highest address down to the lowest, two hex
// digits per byte, with no separator.
class ByteStream : public std::istream {
public:
  // buf/size describe caller-owned memory; the buffer is not copied, so it
  // must remain valid for as long as this object is streamed.
  ByteStream(const void *buf, std::size_t size)
    : std::istream(nullptr) // the standard istream has no default constructor
    , buf_(buf)
    , size_(size)
  {}

  // Writes the buffer as hex and restores the stream's flags, width and fill
  // character afterwards so the caller's formatting state is left untouched.
  friend std::ostream& operator<<(std::ostream& os, const ByteStream& obj) {
    const auto oldflags = os.flags();
    const auto oldwidth = os.width();
    const auto oldfill  = os.fill();
    const auto* bytes = static_cast<const unsigned char*>(obj.buf_);
    os << std::hex << std::setfill('0');
    for (std::size_t i = obj.size_; i-- > 0;) {
      // the field width is reset by every insertion, so set it per byte
      os << std::setw(2) << static_cast<int>(bytes[i]);
    }
    os.fill(oldfill);
    os.width(oldwidth);
    os.flags(oldflags);
    return os;
  }

private:
  const void *buf_;
  std::size_t size_;
};
// Stream buffer filter that prefixes every non-empty line with a fixed number
// of spaces before forwarding the characters to a destination buffer.
// When constructed from an std::ostream it installs itself as that stream's
// buffer and restores the original buffer on destruction.
class IndentStream : public std::streambuf {
public:
  // Filters into `dest` directly; the caller is responsible for routing
  // output through this buffer.
  explicit IndentStream(std::streambuf* dest, int indent = 4)
    : dest_(dest)
    , isBeginLine_(true)
    , indent_(indent, ' ')
    , owner_(nullptr)
  {}

  // Hooks `dest` so that everything written to it is indented until this
  // object is destroyed.
  explicit IndentStream(std::ostream& dest, int indent = 4)
    : dest_(dest.rdbuf())
    , isBeginLine_(true)
    , indent_(indent, ' ')
    , owner_(&dest) {
    owner_->rdbuf(this);
  }

  virtual ~IndentStream() {
    // give the hooked stream its original buffer back
    if (owner_)
      owner_->rdbuf(dest_);
  }

protected:
  // Called for every character since no put area is set up.
  int overflow(int ch) override {
    // overflow(eof) only asks for a flush: report success without emitting a
    // character (forwarding it would write a stray 0xFF byte).
    if (traits_type::eq_int_type(ch, traits_type::eof()))
      return traits_type::not_eof(ch);
    if (isBeginLine_ && ch != '\n') {
      // empty lines are left unindented
      const std::streamsize count = static_cast<std::streamsize>(indent_.size());
      if (dest_->sputn(indent_.data(), count) != count)
        return traits_type::eof();
    }
    isBeginLine_ = (ch == '\n');
    return dest_->sputc(traits_type::to_char_type(ch));
  }

private:
  std::streambuf* dest_;   // where the filtered characters go
  bool isBeginLine_;       // true when the next character starts a line
  std::string indent_;     // the indentation prefix
  std::ostream* owner_;    // hooked stream, or nullptr
};

View File

@@ -1,237 +0,0 @@
#pragma once
#include <cstdint>
#include <cocogfx/include/fixed.h>
#include <bitmanip.h>
using namespace cocogfx;
// Selects how Clamp() treats a texture coordinate that falls outside the
// range covered by Fixed<F,T>::MASK.
enum class WrapMode {
Clamp,
Repeat,
Mirror,
};
// Texel storage formats understood by Stride() and Unpack8888().
// Per Stride(): A8R8G8B8 takes 4 bytes, the 16-bit formats (R5G6B5, A1R5G5B5,
// A4R4G4B4, A8L8) take 2 bytes, and L8/A8 take 1 byte.
enum class TexFormat {
A8R8G8B8,
R5G6B5,
A1R5G5B5,
A4R4G4B4,
A8L8,
L8,
A8,
};
// Applies a wrap mode to a fixed-point texture coordinate and returns the
// resulting raw value (derived from fx.data()).
//   Clamp  : negative values become 0, values above Fixed<F,T>::MASK become MASK.
//   Repeat : keeps only the bits selected by Fixed<F,T>::MASK.
//   Mirror : returns the bitwise complement of the raw value when
//            bit_get(data, FRAC) is set, otherwise the raw value unchanged.
// Any other mode aborts the program.
// NOTE(review): the Mirror result is not masked with MASK here, so bits above
// the fraction may remain set — confirm that callers expect this.
template <uint32_t F, typename T = int32_t>
T Clamp(Fixed<F,T> fx, WrapMode mode) {
switch (mode) {
case WrapMode::Clamp: return (fx.data() < 0) ? 0 : ((fx.data() > Fixed<F,T>::MASK) ? Fixed<F,T>::MASK : fx.data());
case WrapMode::Repeat: return (fx.data() & Fixed<F,T>::MASK);
case WrapMode::Mirror: return (bit_get(fx.data(), Fixed<F,T>::FRAC) ? ~fx.data() : fx.data());
default:
std::abort();
// not reached; keeps compilers that do not treat abort() as noreturn quiet
return 0;
}
}
// Returns the size in bytes of one texel stored in the given format.
// Aborts the program when the format is not one of the known enumerators.
inline uint32_t Stride(TexFormat format) {
  // 32-bit format
  if (format == TexFormat::A8R8G8B8)
    return 4;

  // 16-bit formats
  const bool is_two_bytes = (format == TexFormat::R5G6B5)
                         || (format == TexFormat::A1R5G5B5)
                         || (format == TexFormat::A4R4G4B4)
                         || (format == TexFormat::A8L8);
  if (is_two_bytes)
    return 2;

  // 8-bit formats
  if (format == TexFormat::L8 || format == TexFormat::A8)
    return 1;

  std::abort();
  return 0;
}
// Expands a texel stored in `format` to four 8-bit channels and returns them
// as two packed words:
//   *lo = (red   << 16) + blue
//   *hi = (alpha << 16) + green
// Channels narrower than 8 bits are widened by replicating their top bits into
// the low bits. Formats without alpha yield alpha = 0xff; formats without
// colour (A8) yield colour = 0xff. 16-bit and 8-bit texels are expected in the
// low bits of `texel` with the upper bits clear.
// Aborts the program on an unknown format.
inline void Unpack8888(TexFormat format,
                       uint32_t texel,
                       uint32_t* lo,
                       uint32_t* hi) {
  uint32_t r, g, b, a;
  switch (format) {
  case TexFormat::A8R8G8B8:
    r = (texel >> 16) & 0xff;
    g = (texel >> 8) & 0xff;
    b = texel & 0xff;
    a = texel >> 24;
    break;
  case TexFormat::R5G6B5:
    r = ((texel >> 11) << 3) | (texel >> 13);
    g = ((texel >> 3) & 0xfc) | ((texel >> 9) & 0x3);
    b = ((texel & 0x1f) << 3) | ((texel & 0x1c) >> 2);
    a = 0xff;
    break;
  case TexFormat::A1R5G5B5:
    // Red occupies bits 14..10. Its top three bits (14..12) are replicated
    // into the low bits; the mask keeps the alpha bit (15) out of red, which
    // the previous `(texel << 1) >> 13` did not do on a 32-bit value.
    r = ((texel >> 7) & 0xf8) | ((texel >> 12) & 0x7);
    g = ((texel >> 2) & 0xf8) | ((texel >> 7) & 7);
    b = ((texel & 0x1f) << 3) | ((texel & 0x1c) >> 2);
    a = 0xff * (texel >> 15);
    break;
  case TexFormat::A4R4G4B4:
    r = ((texel >> 4) & 0xf0) | ((texel >> 8) & 0x0f);
    g = ((texel & 0xf0) >> 0) | ((texel & 0xf0) >> 4);
    b = ((texel & 0x0f) << 4) | ((texel & 0x0f) >> 0);
    a = ((texel >> 8) & 0xf0) | (texel >> 12);
    break;
  case TexFormat::A8L8:
    // luminance drives all three colour channels
    r = texel & 0xff;
    g = r;
    b = r;
    a = texel >> 8;
    break;
  case TexFormat::L8:
    r = texel & 0xff;
    g = r;
    b = r;
    a = 0xff;
    break;
  case TexFormat::A8:
    r = 0xff;
    g = 0xff;
    b = 0xff;
    a = texel & 0xff;
    break;
  default:
    std::abort();
  }
  *lo = (r << 16) + b;
  *hi = (a << 16) + g;
}
// Splits a packed 4x8-bit texel into two words that each hold two channels in
// bits 23..16 and 7..0: *lo receives bytes 2 and 0, *hi receives bytes 3 and 1.
inline void Unpack8888(uint32_t texel, uint32_t* lo, uint32_t* hi) {
  const uint32_t kLaneMask = 0x00ff00ff;
  const uint32_t odd_bytes = texel >> 8; // moves bytes 3 and 1 into the lanes
  *lo = texel & kLaneMask;
  *hi = odd_bytes & kLaneMask;
}
// Recombines two two-lane words into one packed word: `hi` is shifted up by
// one byte and OR-ed with `lo`.
inline uint32_t Pack8888(uint32_t lo, uint32_t hi) {
  const uint32_t odd_bytes = hi << 8;
  return odd_bytes | lo;
}
// Computes a + ((b - a) * f >> 8) on the whole 32-bit word and keeps only the
// two 8-bit lanes at bits 23..16 and 7..0. With f in 0..256 this blends from
// `a` (f = 0) towards `b` (f = 256) in both lanes at once.
inline uint32_t Lerp8888(uint32_t a, uint32_t b, uint32_t f) {
  const uint32_t delta = b - a;            // unsigned wrap-around is intended
  const uint32_t step = (delta * f) >> 8;
  const uint32_t blended = a + step;
  return blended & 0x00ff00ff;
}
// Computes the four texel indices and the two blend weights needed for a
// bilinear fetch at fixed-point coordinate (fu, fv).
//   log_width / log_height : log2 of the texture dimensions in texels.
//   wrapu / wrapv          : wrap mode applied to each axis via Clamp().
//   addr00..addr11         : receive x + (y << log_width) for the four
//                            (x0|x1, y0|y1) combinations; addrYX naming, i.e.
//                            addr01 is (x1, y0) and addr10 is (x0, y1).
//   alpha / beta           : receive the 8-bit horizontal / vertical blend
//                            fraction taken from the (x0, y0) sample position.
// The returned values are texel indices, not byte offsets.
template <uint32_t F, typename T = int32_t>
void TexAddressLinear(Fixed<F,T> fu,
Fixed<F,T> fv,
uint32_t log_width,
uint32_t log_height,
WrapMode wrapu,
WrapMode wrapv,
uint32_t* addr00,
uint32_t* addr01,
uint32_t* addr10,
uint32_t* addr11,
uint32_t* alpha,
uint32_t* beta
) {
// offset of HALF scaled down by the texture size — presumably half a texel; confirm against Fixed<F,T>::HALF
auto delta_x = Fixed<F,T>::make(Fixed<F,T>::HALF >> log_width);
auto delta_y = Fixed<F,T>::make(Fixed<F,T>::HALF >> log_height);
// wrapped coordinates of the two sample positions on each axis
uint32_t u0 = Clamp(fu - delta_x, wrapu);
uint32_t u1 = Clamp(fu + delta_x, wrapu);
uint32_t v0 = Clamp(fv - delta_y, wrapv);
uint32_t v1 = Clamp(fv + delta_y, wrapv);
// number of fraction bits left below the texel index
uint32_t shift_u = (Fixed<F,T>::FRAC - log_width);
uint32_t shift_v = (Fixed<F,T>::FRAC - log_height);
// x0s / y0s keep 8 extra low bits so the blend weights can be read from them
uint32_t x0s = (u0 << 8) >> shift_u;
uint32_t y0s = (v0 << 8) >> shift_v;
uint32_t x0 = x0s >> 8;
uint32_t y0 = y0s >> 8;
uint32_t x1 = u1 >> shift_u;
uint32_t y1 = v1 >> shift_v;
*addr00 = x0 + (y0 << log_width);
*addr01 = x1 + (y0 << log_width);
*addr10 = x0 + (y1 << log_width);
*addr11 = x1 + (y1 << log_width);
*alpha = x0s & 0xff;
*beta = y0s & 0xff;
//printf("*** fu=0x%x, fv=0x%x, u0=0x%x, u1=0x%x, v0=0x%x, v1=0x%x, x0=0x%x, x1=0x%x, y0=0x%x, y1=0x%x, addr00=0x%x, addr01=0x%x, addr10=0x%x, addr11=0x%x\n", fu.data(), fv.data(), u0, u1, v0, v1, x0, x1, y0, y1, *addr00, *addr01, *addr10, *addr11);
}
// Computes the texel index for a point-sampled (nearest) fetch at fixed-point
// coordinate (fu, fv).
//   log_width / log_height : log2 of the texture dimensions in texels.
//   wrapu / wrapv          : wrap mode applied to each axis via Clamp().
//   addr                   : receives x + (y << log_width), a texel index
//                            (not a byte offset).
template <uint32_t F, typename T = int32_t>
void TexAddressPoint(Fixed<F,T> fu,
Fixed<F,T> fv,
uint32_t log_width,
uint32_t log_height,
WrapMode wrapu,
WrapMode wrapv,
uint32_t* addr
) {
uint32_t u = Clamp(fu, wrapu);
uint32_t v = Clamp(fv, wrapv);
// drop the fraction bits that lie below the texel index
uint32_t x = u >> (Fixed<F,T>::FRAC - log_width);
uint32_t y = v >> (Fixed<F,T>::FRAC - log_height);
*addr = x + (y << log_width);
//printf("*** fu=0x%x, fv=0x%x, u=0x%x, v=0x%x, x=0x%x, y=0x%x, addr=0x%x\n", fu.data(), fv.data(), u, v, x, y, *addr);
}
// Bilinearly blends four texels of the given format into one packed colour.
// texel00/texel01 are blended with `alpha`, texel10/texel11 are blended with
// `alpha`, and the two intermediate results are blended with `beta`.
// Each texel is expanded with Unpack8888(format, ...) first, and the result is
// recombined with Pack8888().
inline uint32_t TexFilterLinear(
  TexFormat format,
  uint32_t texel00,
  uint32_t texel01,
  uint32_t texel10,
  uint32_t texel11,
  uint32_t alpha,
  uint32_t beta
) {
  // expand all four texels into their lo/hi lane pairs
  const uint32_t texels[4] = {texel00, texel01, texel10, texel11};
  uint32_t lanes_lo[4];
  uint32_t lanes_hi[4];
  for (int i = 0; i < 4; ++i) {
    Unpack8888(format, texels[i], &lanes_lo[i], &lanes_hi[i]);
  }

  // first pass: blend each pair with alpha
  const uint32_t first_lo  = Lerp8888(lanes_lo[0], lanes_lo[1], alpha);
  const uint32_t first_hi  = Lerp8888(lanes_hi[0], lanes_hi[1], alpha);
  const uint32_t second_lo = Lerp8888(lanes_lo[2], lanes_lo[3], alpha);
  const uint32_t second_hi = Lerp8888(lanes_hi[2], lanes_hi[3], alpha);

  // second pass: blend the two intermediate results with beta
  const uint32_t final_lo = Lerp8888(first_lo, second_lo, beta);
  const uint32_t final_hi = Lerp8888(first_hi, second_hi, beta);

  return Pack8888(final_lo, final_hi);
}
// Converts a single texel of the given format to a packed colour by expanding
// it with Unpack8888(format, ...) and recombining the lanes with Pack8888().
inline uint32_t TexFilterPoint(TexFormat format, uint32_t texel) {
  uint32_t lanes_lo;
  uint32_t lanes_hi;
  Unpack8888(format, texel, &lanes_lo, &lanes_hi);
  return Pack8888(lanes_lo, lanes_hi);
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "util.h"
#include <string.h>
@@ -7,4 +20,20 @@ const char* fileExtension(const char* filepath) {
if (ext == NULL || ext == filepath)
return "";
return ext + 1;
}
// Allocates `size` bytes whose address is a multiple of `alignment`.
// `alignment` must be a non-zero power of two.
// Returns nullptr when the request cannot be satisfied (allocation failure or
// size overflow). The result must be released with aligned_free(), never with
// free(): the pointer returned by malloc() is stored in the slot immediately
// before the aligned address.
void* aligned_malloc(size_t size, size_t alignment) {
  // the mask arithmetic below only works for a non-zero power of two
  assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
  // reserve margin for alignment and storing of unaligned address
  const size_t margin = (alignment - 1) + sizeof(void*);
  if (size + margin < size)
    return nullptr; // size + margin would wrap around
  void* unaligned_addr = malloc(size + margin);
  if (unaligned_addr == nullptr)
    return nullptr; // out of memory: do not touch the bookkeeping slot
  void** aligned_addr = (void**)((uintptr_t)(((uint8_t*)unaligned_addr) + margin) & ~(alignment - 1));
  aligned_addr[-1] = unaligned_addr;
  return aligned_addr;
}
// Releases a block obtained from aligned_malloc().
// Passing nullptr is a no-op, mirroring free().
void aligned_free(void *ptr) {
  if (ptr == nullptr)
    return;
  // retrieve the stored unaligned address and use it to free the allocation
  void* unaligned_addr = ((void**)ptr)[-1];
  free(unaligned_addr);
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
@@ -49,4 +62,7 @@ const char* fileExtension(const char* filepath);
#define DISABLE_WARNING_UNUSED_PARAMETER
#define DISABLE_WARNING_UNREFERENCED_FUNCTION
#define DISABLE_WARNING_ANONYMOUS_STRUCT
#endif
#endif
void *aligned_malloc(size_t size, size_t alignment);
void aligned_free(void *ptr);

55
sim/common/uuid_gen.h Normal file
View File

@@ -0,0 +1,55 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
namespace vortex {

// Produces a 32-bit identifier for every request made with a given PC.
// The low 16 bits hold a number assigned to the PC the first time it is seen
// (taken from a running counter); the high 16 bits hold how many times that PC
// has been requested before, starting at 0.
// Note: the first-seen number is not masked when it is assigned, so once the
// running counter exceeds 0xffff its upper bits land in the high half.
class UUIDGenerator {
public:
  UUIDGenerator() : ids_(0) {}

  virtual ~UUIDGenerator() {}

  // Returns the identifier for this request of `PC` and records it so the next
  // request for the same PC gets the following occurrence number.
  uint32_t get_uuid(uint64_t PC) {
    uint32_t slot;
    uint32_t occurrence;
    const auto entry = uuid_map_.find(PC);
    if (entry == uuid_map_.end()) {
      // first request for this PC: take the next number, occurrence 0
      slot = ids_++;
      occurrence = 0;
    } else {
      const uint32_t previous = entry->second;
      slot = previous & 0xffff;
      occurrence = (previous >> 16) + 1;
    }
    // an occurrence of 0x10000 shifts out of the 32-bit word and wraps to 0
    const uint32_t uuid = (occurrence << 16) | slot;
    uuid_map_[PC] = uuid;
    return uuid;
  }

  // Forgets every PC seen so far and restarts the numbering at 0.
  void reset() {
    uuid_map_.clear();
    ids_ = 0;
  }

private:
  std::unordered_map<uint64_t, uint32_t> uuid_map_; // PC -> last identifier returned
  uint32_t ids_;                                     // next first-seen number
};

}

134
sim/opaesim/Makefile Normal file
View File

@@ -0,0 +1,134 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
AFU_DIR = $(RTL_DIR)/afu/opae
SCRIPT_DIR = ../../hw/scripts
THIRD_PARTY_DIR = ../../third_party
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I.. -I../../../hw -I../../common -I$(abspath $(DESTDIR))
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread
# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_MEM
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR
# Control logic analyzer monitors
DBG_SCOPE_FLAGS += -DDBG_SCOPE_AFU
DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
DBG_SCOPE_FLAGS += -DDBG_SCOPE_MSCHED
# AFU parameters
CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=26
endif
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=512
endif
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH=4
endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += fpga.cpp opae_sim.cpp
RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv
RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
FPU_INCLUDE = -I$(RTL_DIR)/fpu
ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
RTL_PKGS += $(THIRD_PARTY_DIR)/fpnew/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/fpnew/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv
FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)
RTL_INCLUDE += -I$(AFU_DIR) -I$(AFU_DIR)/ccip
TOP = vortex_afu_shim
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DXLEN_$(XLEN)
VL_FLAGS += $(CONFIGS)
VL_FLAGS += verilator.vlt
VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += $(RTL_PKGS)
VL_FLAGS += $(DBG_SCOPE_FLAGS)
CXXFLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())')
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugging
ifdef DEBUG
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CXXFLAGS += -O3 -DNDEBUG
endif
# Enable scope analyzer
ifdef SCOPE
VL_FLAGS += -DSCOPE
CXXFLAGS += -DSCOPE
SCOPE_JSON = $(DESTDIR)/scope.json
endif
# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CXXFLAGS += -DPERF_ENABLE
endif
# use our OPAE shim
VL_FLAGS += -DNOPAE
CXXFLAGS += -DNOPAE
PROJECT = libopae-c-sim.so
all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/vortex.xml:
verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $(DESTDIR)/vortex.xml
$(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml
$(SCRIPT_DIR)/scope.py $(DESTDIR)/vortex.xml -o $(DESTDIR)/scope.json
$(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh
$(SCRIPT_DIR)/gen_config.py -i $(AFU_DIR)/vortex_afu.vh -o $(DESTDIR)/vortex_afu.h
$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON)
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)
clean:
rm -rf obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <iostream>
#include <stdio.h>
@@ -8,10 +21,61 @@
#include "fpga.h"
#include "opae_sim.h"
#include <VX_config.h>
#include <util.h>
using namespace vortex;
#ifdef __cplusplus
extern "C" {
#endif
// Simulation stub: ignores both arguments (nothing is written through `prop`)
// and always returns FPGA_OK.
extern fpga_result fpgaGetProperties(fpga_token token, fpga_properties *prop) {
__unused (token, prop);
return FPGA_OK;
}
// Simulation stub: the requested object type is not stored anywhere; always
// returns FPGA_OK.
extern fpga_result fpgaPropertiesSetObjectType(fpga_properties prop, fpga_objtype objtype) {
__unused (prop, objtype);
return FPGA_OK;
}
// Simulation stub: the GUID is not stored anywhere; always returns FPGA_OK.
extern fpga_result fpgaPropertiesSetGUID(fpga_properties prop, fpga_guid guid) {
__unused (prop, guid);
return FPGA_OK;
}
// Simulation stub: nothing is released and `*prop` is left untouched; always
// returns FPGA_OK.
extern fpga_result fpgaDestroyProperties(fpga_properties *prop) {
__unused (prop);
return FPGA_OK;
}
// Simulation stub for device enumeration: always reports exactly one match.
// `filters`, `num_filters`, `tokens` and `max_tokens` are ignored; in
// particular no token is written to `tokens`. `num_matches` may be null, in
// which case nothing is reported. Always returns FPGA_OK.
extern fpga_result fpgaEnumerate(const fpga_properties *filters, uint32_t num_filters, fpga_token *tokens, uint32_t max_tokens, uint32_t *num_matches) {
// NOTE(review): num_filters is listed twice below; looks like a harmless copy-paste slip
__unused (filters, num_filters, num_filters, tokens, max_tokens);
if (num_matches) {
*num_matches = 1;
}
return FPGA_OK;
}
// Simulation stub: nothing is released and `*token` is left untouched; always
// returns FPGA_OK.
extern fpga_result fpgaDestroyToken(fpga_token *token) {
__unused (token);
return FPGA_OK;
}
// Reports a fixed local memory size for the simulated device through `lms`
// (skipped when `lms` is null): 0x200000000 bytes when XLEN is 64, otherwise
// 0x100000000 bytes. The properties argument is ignored. Always returns
// FPGA_OK.
// NOTE(review): XLEN is not defined in the visible part of this file —
// presumably it comes from VX_config.h; confirm.
extern fpga_result fpgaPropertiesGetLocalMemorySize(const fpga_properties *filters, uint64_t* lms) {
__unused (filters);
if (lms) {
#if (XLEN == 64)
*lms = 0x200000000; // 8 GB
#else
*lms = 0x100000000; // 4 GB
#endif
}
return FPGA_OK;
}
extern fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, int flags) {
__unused (token);
if (NULL == handle || flags != 0)
return FPGA_INVALID_PARAM;
auto sim = new opae_sim();
@@ -83,4 +147,8 @@ extern fpga_result fpgaReadMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_
extern const char *fpgaErrStr(fpga_result e) {
return "";
}
}
#ifdef __cplusplus
}
#endif

View File

@@ -1,7 +1,20 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __FPGA_H__
#define __FPGA_H__
#include <stdio.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
@@ -21,28 +34,21 @@ typedef enum {
FPGA_RECONF_ERROR /**< Error while reconfiguring FPGA */
} fpga_result;
typedef enum {
FPGA_DEVICE = 0,
FPGA_ACCELERATOR
} fpga_objtype;
typedef void *fpga_handle;
typedef void *fpga_token;
fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, int flags);
typedef void *fpga_properties;
fpga_result fpgaClose(fpga_handle handle);
fpga_result fpgaPrepareBuffer(fpga_handle handle, uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
fpga_result fpgaReleaseBuffer(fpga_handle handle, uint64_t wsid);
fpga_result fpgaGetIOAddress(fpga_handle handle, uint64_t wsid, uint64_t *ioaddr);
fpga_result fpgaWriteMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t value);
fpga_result fpgaReadMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t *value);
const char *fpgaErrStr(fpga_result e);
typedef uint8_t fpga_guid[16];
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
}
#endif
#endif // __FPGA_H__

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "opae_sim.h"
#include <verilated.h>
@@ -25,6 +38,7 @@
#include <list>
#include <queue>
#include <unordered_map>
#include <util.h>
#ifndef MEMORY_BANKS
#ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
@@ -62,6 +76,8 @@
#define RAM_PAGE_SIZE 4096
#define CPU_GPU_LATENCY 200
using namespace vortex;
static uint64_t timestamp = 0;
@@ -70,23 +86,6 @@ double sc_time_stamp() {
return timestamp;
}
static void *__aligned_malloc(size_t alignment, size_t size) {
// reserve margin for alignment and storing of unaligned address
size_t margin = (alignment-1) + sizeof(void*);
void *unaligned_addr = malloc(size + margin);
void **aligned_addr = (void**)((uintptr_t)(((uint8_t*)unaligned_addr) + margin) & ~(alignment-1));
aligned_addr[-1] = unaligned_addr;
return aligned_addr;
}
static void __aligned_free(void *ptr) {
// retreive the stored unaligned address and use it to free the allocation
void* unaligned_addr = ((void**)ptr)[-1];
free(unaligned_addr);
}
///////////////////////////////////////////////////////////////////////////////
static bool trace_enabled = false;
static uint64_t trace_start_time = TRACE_START_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;
@@ -158,7 +157,7 @@ public:
future_.wait();
}
for (auto& buffer : host_buffers_) {
__aligned_free(buffer.second.data);
aligned_free(buffer.second.data);
}
#ifdef VCD_OUTPUT
trace_->close();
@@ -176,9 +175,13 @@ public:
}
int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
auto alloc = __aligned_malloc(CACHE_BLOCK_SIZE, len);
auto alloc = aligned_malloc(len, CACHE_BLOCK_SIZE);
if (alloc == NULL)
return -1;
// set uninitialized data to "baadf00d"
for (uint32_t i = 0; i < len; ++i) {
((uint8_t*)alloc)[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff;
}
host_buffer_t buffer;
buffer.data = (uint64_t*)alloc;
buffer.size = len;
@@ -193,7 +196,7 @@ public:
void release_buffer(uint64_t wsid) {
auto it = host_buffers_.find(wsid);
if (it != host_buffers_.end()) {
__aligned_free(it->second.data);
aligned_free(it->second.data);
host_buffers_.erase(it);
}
}
@@ -205,6 +208,11 @@ public:
void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
std::lock_guard<std::mutex> guard(mutex_);
// simulate CPU-GPU latency
for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i)
this->tick();
// simulate mmio request
device_->vcp2af_sRxPort_c0_mmioRdValid = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
@@ -217,7 +225,12 @@ public:
void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) {
std::lock_guard<std::mutex> guard(mutex_);
// simulate CPU-GPU latency
for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i)
this->tick();
// simulate mmio request
device_->vcp2af_sRxPort_c0_mmioWrValid = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
@@ -254,7 +267,14 @@ private:
this->eval();
}
device_->reset = 0;
device_->reset = 0;
for (int i = 0; i < RESET_DELAY; ++i) {
device_->clk = 0;
this->eval();
device_->clk = 1;
this->eval();
}
// Turn on assertion after reset
Verilated::assertOn(true);
@@ -289,7 +309,7 @@ private:
#endif
}
void eval() {
void eval() {
device_->eval();
#ifdef VCD_OUTPUT
if (sim_trace_enabled()) {
@@ -396,10 +416,10 @@ private:
// process memory requests
assert(!device_->avs_read[b] || !device_->avs_write[b]);
unsigned byte_addr = device_->avs_address[b] * MEM_BLOCK_SIZE;
unsigned byte_addr = (device_->avs_address[b] * MEMORY_BANKS + b) * MEM_BLOCK_SIZE;
if (device_->avs_write[b]) {
uint64_t byteen = device_->avs_byteenable[b];
uint8_t* data = (uint8_t*)device_->avs_writedata[b].data();
uint8_t* data = (uint8_t*)(device_->avs_writedata[b].data());
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
(*ram_)[byte_addr + i] = data[i];
@@ -419,8 +439,7 @@ private:
0
);
dram_queue_.push(dram_req);
}
} else
if (device_->avs_read[b]) {
auto mem_req = new mem_rd_req_t();
mem_req->addr = device_->avs_address[b];
@@ -491,7 +510,7 @@ private:
std::mutex mutex_;
RAM *ram_;
RAM* ram_;
ramulator::Gem5Wrapper* dram_;
@@ -531,4 +550,4 @@ void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value)
void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
impl_->read_mmio64(mmio_num, offset, value);
}
}

43
sim/opaesim/opae_sim.h Normal file
View File

@@ -0,0 +1,43 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace vortex {

class RAM;

// Public interface of the OPAE simulator. The implementation lives in a
// private Impl class that is only reachable through impl_.
class opae_sim {
public:

  opae_sim();
  virtual ~opae_sim();

  // The object holds a raw Impl pointer; a member-wise copy would leave two
  // objects sharing the same implementation, so copying is disabled.
  opae_sim(const opae_sim&) = delete;
  opae_sim& operator=(const opae_sim&) = delete;

  // Prepares a host buffer of `len` bytes; outputs go through `buf_addr` and
  // `wsid`. Returns an int status code.
  int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags);

  // Releases the buffer identified by `wsid`.
  void release_buffer(uint64_t wsid);

  // Looks up the I/O address associated with `wsid` and stores it in `*ioaddr`.
  void get_io_address(uint64_t wsid, uint64_t *ioaddr);

  // Issues a 64-bit MMIO write of `value` at `offset`.
  void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value);

  // Issues a 64-bit MMIO read at `offset`; the result is stored in `*value`.
  void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value);

private:

  class Impl;
  Impl* impl_;
};

}

View File

@@ -0,0 +1,8 @@
`verilator_config
lint_off -rule BLKANDNBLK -file "*/fpnew/src/*"
lint_off -rule UNOPTFLAT -file "*/fpnew/src/*"
lint_off -file "*/fpnew/src/*"
lint_off -file "*/afu/opae/ccip/ccip_if_pkg.sv"
lint_off -file "*/afu/opae/local_mem_cfg_pkg.sv"

View File

@@ -1,16 +1,24 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`IGNORE_WARNINGS_BEGIN
`include "vortex_afu.vh"
`IGNORE_WARNINGS_END
/* verilator lint_off IMPORTSTAR */
import ccip_if_pkg::*;
import local_mem_cfg_pkg::*;
/* verilator lint_on IMPORTSTAR */
`include "VX_define.vh"
module vortex_afu_shim (
module vortex_afu_shim import local_mem_cfg_pkg::*; import ccip_if_pkg::*; (
// global signals
input clk,
input reset,
@@ -167,4 +175,4 @@ assign af2cp_sTxPort_c2_hdr_tid = af2cp_sTxPort.c2.hdr.tid;
assign af2cp_sTxPort_c2_mmioRdValid = af2cp_sTxPort.c2.mmioRdValid;
assign af2cp_sTxPort_c2_data = af2cp_sTxPort.c2.data;
endmodule
endmodule

View File

@@ -1,3 +1,4 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
@@ -8,6 +9,7 @@ CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../../../hw -I../../common
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator
@@ -23,13 +25,18 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR
DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) $(TEX_INCLUDE)
RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
FPU_INCLUDE = -I$(RTL_DIR)/fpu
ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
RTL_PKGS += $(THIRD_PARTY_DIR)/fpnew/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/fpnew/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv
FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
@@ -42,14 +49,18 @@ else
TOP = Vortex
endif
VL_FLAGS = --exe --cc $(TOP) --top-module $(TOP)
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO -Wno-EOFNEWLINE
VL_FLAGS = --exe
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DXLEN_$(XLEN)
VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += $(RTL_PKGS)
VL_FLAGS += --cc $(TOP) --top-module $(TOP)
CXXFLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
@@ -59,8 +70,8 @@ VL_FLAGS += -j $(THREADS)
# Debugging
ifdef DEBUG
VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS)
CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CXXFLAGS += -O2 -DNDEBUG
@@ -72,20 +83,12 @@ ifdef PERF
CXXFLAGS += -DPERF_ENABLE
endif
# ALU backend
VL_FLAGS += -DIMUL_DPI
VL_FLAGS += -DIDIV_DPI
# FPU backend
FPU_CORE ?= FPU_DPI
VL_FLAGS += -D$(FPU_CORE)
PROJECT = rtlsim
all: $(PROJECT)
all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$@
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' -o ../$@
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' -o ../$@

View File

@@ -1,11 +1,24 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <fstream>
#include <iomanip>
#include <unistd.h>
#include <unistd.h>
#include <util.h>
#include <mem.h>
#include <VX_config.h>
#include <VX_types.h>
#include "processor.h"
#define RAM_PAGE_SIZE 4096
@@ -13,11 +26,11 @@
using namespace vortex;
static void show_usage() {
std::cout << "Usage: [-r] [-h: help] programs.." << std::endl;
std::cout << "Usage: [-r: riscv-test] [-h: help] <program>" << std::endl;
}
bool riscv_test = false;
std::vector<const char*> programs;
const char* program = nullptr;
static void parse_args(int argc, char **argv) {
int c;
@@ -35,56 +48,68 @@ static void parse_args(int argc, char **argv) {
show_usage();
exit(-1);
}
}
}
for (int i = optind; i < argc; ++i) {
programs.push_back(argv[i]);
if (optind < argc) {
program = argv[optind];
std::cout << "Running " << program << "..." << std::endl;
} else {
show_usage();
exit(-1);
}
}
int main(int argc, char **argv) {
int exitcode = 0;
bool failed = false;
parse_args(argc, argv);
parse_args(argc, argv);
// create memory module
vortex::RAM ram(RAM_PAGE_SIZE);
// create processor
vortex::Processor processor;
// attach memory module
processor.attach_ram(&ram);
for (auto program : programs) {
std::cout << "Running " << program << "..." << std::endl;
// setup base DCRs
const uint64_t startup_addr(STARTUP_ADDR);
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
#if (XLEN == 64)
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
#endif
processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
// load program
{
std::string program_ext(fileExtension(program));
if (program_ext == "bin") {
ram.loadBinImage(program, STARTUP_ADDR);
ram.loadBinImage(program, startup_addr);
} else if (program_ext == "hex") {
ram.loadHexImage(program);
} else {
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
return -1;
}
exitcode = processor.run();
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed" << std::endl;
} else {
std::cout << "Failed: exitcode=" << exitcode << std::endl;
failed = true;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
failed = true;
}
}
if (failed)
break;
}
return failed ? exitcode : 0;
// run simulation
exitcode = processor.run();
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed" << std::endl;
exitcode = 0;
} else {
std::cout << "Failed" << std::endl;
exitcode = 1;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
}
return exitcode;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "processor.h"
#include <verilated.h>
@@ -56,6 +69,14 @@
#define VERILATOR_RESET_VALUE 2
#endif
#if (XLEN == 32)
typedef uint32_t Word;
#elif (XLEN == 64)
typedef uint64_t Word;
#else
#error unsupported XLEN
#endif
#define VL_WDATA_GETW(lwp, i, n, w) \
VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w)
@@ -71,7 +92,7 @@ double sc_time_stamp() {
static bool trace_enabled = false;
static uint64_t trace_start_time = TRACE_START_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;
bool sim_trace_enabled() {
if (timestamp >= trace_start_time
@@ -126,6 +147,9 @@ public:
// reset the device
this->reset();
// Turn on assertion after reset
Verilated::assertOn(true);
}
~Impl() {
@@ -165,27 +189,46 @@ public:
std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
#endif
// reset device
this->reset();
// start execution
running_ = true;
device_->reset = 0;
// execute program
// wait on device to go busy
while (!device_->busy) {
this->tick();
}
// wait on device to go idle
while (device_->busy) {
if (get_ebreak()) {
exitcode = get_last_wb_value(3);
exitcode = (int)get_last_wb_value(3);
break;
}
this->tick();
}
// reset device
this->reset();
// wait 5 cycles to flush the pipeline
this->wait(5);
this->cout_flush();
return exitcode;
}
// Issues one write on the device's dcr_wr_* ports.
//   addr  - value driven on dcr_wr_addr
//   value - value driven on dcr_wr_data
void write_dcr(uint32_t addr, uint32_t value) {
// raise the write request together with its address and data
device_->dcr_wr_valid = 1;
device_->dcr_wr_addr = addr;
device_->dcr_wr_data = value;
// keep clocking the device for as long as dcr_wr_valid still reads non-zero
while (device_->dcr_wr_valid) {
this->tick();
}
}
private:
void reset() {
void reset() {
running_ = false;
print_bufs_.clear();
pending_mem_reqs_.clear();
@@ -199,6 +242,8 @@ private:
this->reset_avs_bus();
#endif
this->reset_dcr_bus();
device_->reset = 1;
for (int i = 0; i < RESET_DELAY; ++i) {
@@ -206,14 +251,7 @@ private:
this->eval();
device_->clk = 1;
this->eval();
}
device_->reset = 0;
// Turn on assertion after reset
Verilated::assertOn(true);
this->cout_flush();
}
}
void tick() {
@@ -226,6 +264,7 @@ private:
#else
this->eval_avs_bus(0);
#endif
this->eval_dcr_bus(0);
device_->clk = 1;
this->eval();
@@ -235,6 +274,7 @@ private:
#else
this->eval_avs_bus(1);
#endif
this->eval_dcr_bus(1);
if (MEM_CYCLE_RATIO > 0) {
auto cycle = timestamp / 2;
@@ -260,6 +300,8 @@ private:
#ifdef VCD_OUTPUT
if (sim_trace_enabled()) {
trace_->dump(timestamp);
} else {
exit(-1);
}
#endif
++timestamp;
@@ -268,30 +310,30 @@ private:
#ifdef AXI_BUS
void reset_axi_bus() {
device_->m_axi_wready = 0;
device_->m_axi_awready = 0;
device_->m_axi_arready = 0;
device_->m_axi_rvalid = 0;
device_->m_axi_bvalid = 0;
device_->m_axi_wready[0] = 0;
device_->m_axi_awready[0] = 0;
device_->m_axi_arready[0] = 0;
device_->m_axi_rvalid[0] = 0;
device_->m_axi_bvalid[0] = 0;
}
void eval_axi_bus(bool clk) {
if (!clk) {
mem_rd_rsp_ready_ = device_->m_axi_rready;
mem_wr_rsp_ready_ = device_->m_axi_bready;
mem_rd_rsp_ready_ = device_->m_axi_rready[0];
mem_wr_rsp_ready_ = device_->m_axi_bready[0];
return;
}
if (ram_ == nullptr) {
device_->m_axi_wready = 0;
device_->m_axi_awready = 0;
device_->m_axi_arready = 0;
device_->m_axi_wready[0] = 0;
device_->m_axi_awready[0] = 0;
device_->m_axi_arready[0] = 0;
return;
}
// process memory responses
if (mem_rd_rsp_active_
&& device_->m_axi_rvalid && mem_rd_rsp_ready_) {
&& device_->m_axi_rvalid[0] && mem_rd_rsp_ready_) {
mem_rd_rsp_active_ = false;
}
if (!mem_rd_rsp_active_) {
@@ -299,30 +341,30 @@ private:
&& (*pending_mem_reqs_.begin())->ready
&& !(*pending_mem_reqs_.begin())->write) {
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_req = *mem_rsp_it;
auto mem_rsp = *mem_rsp_it;
/*
printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr);
printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp->addr);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]);
printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");
*/
device_->m_axi_rvalid = 1;
device_->m_axi_rid = mem_req->tag;
device_->m_axi_rresp = 0;
device_->m_axi_rlast = 1;
memcpy((uint8_t*)device_->m_axi_rdata, mem_req->block.data(), MEM_BLOCK_SIZE);
device_->m_axi_rvalid[0] = 1;
device_->m_axi_rid[0] = mem_rsp->tag;
device_->m_axi_rresp[0] = 0;
device_->m_axi_rlast[0] = 1;
memcpy(device_->m_axi_rdata[0].data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
pending_mem_reqs_.erase(mem_rsp_it);
mem_rd_rsp_active_ = true;
delete mem_req;
delete mem_rsp;
} else {
device_->m_axi_rvalid = 0;
device_->m_axi_rvalid[0] = 0;
}
}
// send memory write response
if (mem_wr_rsp_active_
&& device_->m_axi_bvalid && mem_wr_rsp_ready_) {
&& device_->m_axi_bvalid[0] && mem_wr_rsp_ready_) {
mem_wr_rsp_active_ = false;
}
if (!mem_wr_rsp_active_) {
@@ -330,34 +372,34 @@ private:
&& (*pending_mem_reqs_.begin())->ready
&& (*pending_mem_reqs_.begin())->write) {
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_req = *mem_rsp_it;
auto mem_rsp = *mem_rsp_it;
/*
printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_req->addr);
printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_rsp->addr);
*/
device_->m_axi_bvalid = 1;
device_->m_axi_bid = mem_req->tag;
device_->m_axi_bresp = 0;
device_->m_axi_bvalid[0] = 1;
device_->m_axi_bid[0] = mem_rsp->tag;
device_->m_axi_bresp[0] = 0;
pending_mem_reqs_.erase(mem_rsp_it);
mem_wr_rsp_active_ = true;
delete mem_req;
delete mem_rsp;
} else {
device_->m_axi_bvalid = 0;
device_->m_axi_bvalid[0] = 0;
}
}
// select the memory bank
uint32_t req_addr = device_->m_axi_wvalid ? device_->m_axi_awaddr : device_->m_axi_araddr;
uint32_t req_addr = device_->m_axi_wvalid[0] ? device_->m_axi_awaddr[0] : device_->m_axi_araddr[0];
// process memory requests
if (device_->m_axi_wvalid || device_->m_axi_arvalid) {
if (device_->m_axi_wvalid) {
uint64_t byteen = device_->m_axi_wstrb;
unsigned base_addr = device_->m_axi_awaddr;
uint8_t* data = (uint8_t*)(device_->m_axi_wdata);
if ((device_->m_axi_wvalid[0] || device_->m_axi_arvalid[0]) && running_) {
if (device_->m_axi_wvalid[0]) {
uint64_t byteen = device_->m_axi_wstrb[0];
uint64_t base_addr = device_->m_axi_awaddr[0];
uint8_t* data = (uint8_t*)device_->m_axi_wdata[0].data();
// check console output
if (base_addr >= IO_COUT_ADDR
&& base_addr < (IO_COUT_ADDR + IO_COUT_SIZE)) {
if (base_addr >= uint64_t(IO_COUT_ADDR)
&& base_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
auto& ss_buf = print_bufs_[i];
@@ -384,15 +426,15 @@ private:
}
auto mem_req = new mem_req_t();
mem_req->tag = device_->m_axi_awid;
mem_req->addr = device_->m_axi_awaddr;
mem_req->tag = device_->m_axi_awid[0];
mem_req->addr = device_->m_axi_awaddr[0];
mem_req->write = true;
mem_req->ready = true;
pending_mem_reqs_.emplace_back(mem_req);
// send dram request
ramulator::Request dram_req(
device_->m_axi_awaddr,
device_->m_axi_awaddr[0],
ramulator::Request::Type::WRITE,
0
);
@@ -401,18 +443,18 @@ private:
} else {
// process reads
auto mem_req = new mem_req_t();
mem_req->tag = device_->m_axi_arid;
mem_req->addr = device_->m_axi_araddr;
ram_->read(mem_req->block.data(), device_->m_axi_araddr, MEM_BLOCK_SIZE);
mem_req->tag = device_->m_axi_arid[0];
mem_req->addr = device_->m_axi_araddr[0];
ram_->read(mem_req->block.data(), device_->m_axi_araddr[0], MEM_BLOCK_SIZE);
mem_req->write = false;
mem_req->ready = false;
pending_mem_reqs_.emplace_back(mem_req);
// send dram request
ramulator::Request dram_req(
device_->m_axi_araddr,
device_->m_axi_araddr[0],
ramulator::Request::Type::READ,
std::bind([](ramulator::Request& dram_req, mem_req_t* mem_req) {
std::bind([&](ramulator::Request& dram_req, mem_req_t* mem_req) {
mem_req->ready = true;
}, placeholders::_1, mem_req),
0
@@ -421,9 +463,9 @@ private:
}
}
device_->m_axi_wready = 1;
device_->m_axi_awready = 1;
device_->m_axi_arready = 1;
device_->m_axi_wready[0] = running_;
device_->m_axi_awready[0] = running_;
device_->m_axi_arready[0] = running_;
}
#else
@@ -454,35 +496,35 @@ private:
&& (*pending_mem_reqs_.begin())->ready) {
device_->mem_rsp_valid = 1;
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_req = *mem_rsp_it;
auto mem_rsp = *mem_rsp_it;
/*
printf("%0ld: [sim] MEM Rd: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr);
printf("%0ld: [sim] MEM Rd: bank=%d, tag=%0lx, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp->tag, mem_rsp->addr);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]);
printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");
*/
memcpy(device_->mem_rsp_data.data(), mem_req->block.data(), MEM_BLOCK_SIZE);
device_->mem_rsp_tag = mem_req->tag;
memcpy(device_->mem_rsp_data.data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
device_->mem_rsp_tag = mem_rsp->tag;
pending_mem_reqs_.erase(mem_rsp_it);
mem_rd_rsp_active_ = true;
delete mem_req;
delete mem_rsp;
} else {
device_->mem_rsp_valid = 0;
}
}
// process memory requests
if (device_->mem_req_valid) {
uint32_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
if (device_->mem_req_valid && running_) {
uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
if (device_->mem_req_rw) {
// process writes
uint64_t byteen = device_->mem_req_byteen;
uint8_t* data = (uint8_t*)device_->mem_req_data.data();
uint8_t* data = (uint8_t*)(device_->mem_req_data.data());
// check console output
if (byte_addr >= IO_COUT_ADDR
&& byte_addr < (IO_COUT_ADDR + IO_COUT_SIZE)) {
if (byte_addr >= uint64_t(IO_COUT_ADDR)
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
for (int i = 0; i < IO_COUT_SIZE; i++) {
if ((byteen >> i) & 0x1) {
auto& ss_buf = print_bufs_[i];
@@ -496,7 +538,7 @@ private:
}
} else {
/*
printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, byte_addr, byteen);
printf("%0ld: [sim] MEM Wr: tag=%0lx, addr=%0x, byteen=%0lx, data=", timestamp, device_->mem_req_tag, byte_addr, byteen);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
}
@@ -515,7 +557,7 @@ private:
0
);
dram_queue_.push(dram_req);
}
}
} else {
// process reads
auto mem_req = new mem_req_t();
@@ -526,11 +568,13 @@ private:
ram_->read(mem_req->block.data(), byte_addr, MEM_BLOCK_SIZE);
pending_mem_reqs_.emplace_back(mem_req);
//printf("%0ld: [sim] MEM Rd Req: addr=%0x, tag=%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
// send dram request
ramulator::Request dram_req(
byte_addr,
ramulator::Request::Type::READ,
std::bind([](ramulator::Request& dram_req, mem_req_t* mem_req) {
std::bind([&](ramulator::Request& dram_req, mem_req_t* mem_req) {
mem_req->ready = true;
}, placeholders::_1, mem_req),
0
@@ -539,11 +583,24 @@ private:
}
}
device_->mem_req_ready = 1;
device_->mem_req_ready = running_;
}
#endif
// Puts the DCR write port into its idle state by clearing dcr_wr_valid.
void reset_dcr_bus() {
device_->dcr_wr_valid = 0;
}
// Per-clock-phase handler for the DCR write port.
//   clk - clock phase being evaluated; nothing happens when it is false.
// When clk is true, a dcr_wr_valid that is currently set is cleared.
void eval_dcr_bus(bool clk) {
  if (clk && device_->dcr_wr_valid) {
    device_->dcr_wr_valid = 0;
  }
}
void wait(uint32_t cycles) {
for (int i = 0; i < cycles; ++i) {
this->tick();
@@ -552,17 +609,17 @@ private:
bool get_ebreak() const {
#ifdef AXI_BUS
return (bool)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
return (bool)device_->Vortex_axi->vortex->sim_ebreak;
#else
return (bool)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
return (bool)device_->Vortex->sim_ebreak;
#endif
}
int get_last_wb_value(int reg) const {
uint64_t get_last_wb_value(int reg) const {
#ifdef AXI_BUS
return (int)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
return ((Word*)device_->Vortex_axi->vortex->sim_wb_value.data())[reg];
#else
return (int)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
return ((Word*)device_->Vortex->sim_wb_value.data())[reg];
#endif
}
@@ -600,6 +657,8 @@ private:
ramulator::Gem5Wrapper* dram_;
std::queue<ramulator::Request> dram_queue_;
bool running_;
};
///////////////////////////////////////////////////////////////////////////////
@@ -618,4 +677,8 @@ void Processor::attach_ram(RAM* mem) {
int Processor::run() {
return impl_->run();
}
// Forwards a DCR write (address + value) to the implementation object.
void Processor::write_dcr(uint32_t addr, uint32_t value) {
  impl_->write_dcr(addr, value);
}

View File

@@ -1,5 +1,20 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace vortex {
class RAM;
@@ -14,6 +29,8 @@ public:
int run();
void write_dcr(uint32_t addr, uint32_t value);
private:
class Impl;

View File

@@ -1,10 +1,5 @@
`verilator_config
lint_off -rule BLKANDNBLK -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule UNOPTFLAT -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule WIDTH -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule UNUSED -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule LITENDIAN -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule IMPORTSTAR -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule PINCONNECTEMPTY -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -file "../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule BLKANDNBLK -file "*/fpnew/src/*"
lint_off -rule UNOPTFLAT -file "*/fpnew/src/*"
lint_off -file "*/fpnew/src/*"

View File

@@ -1,45 +1,36 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../hw/rtl
THIRD_PARTY_DIR = ../../third_party
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I. -I../common -I../../hw
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
CXXFLAGS += $(CONFIGS)
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp
OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
VPATH := $(sort $(dir $(SRCS)))
#$(info OBJS is $(OBJS))
#$(info VPATH is $(VPATH))
SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
#CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer
else
CXXFLAGS += -O2 -DNDEBUG
endif
# XLEN parameterization
ifdef XLEN
CXXFLAGS += -DXLEN=$(XLEN)
endif
PROJECT = simx
all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
$(CXX) $(CXXFLAGS) -DSTARTUP_ADDR=0x80000000 $^ $(LDFLAGS) -o $@
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -o $@
@@ -48,4 +39,4 @@ $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
rm -rf $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so

87
sim/simx/arch.h Normal file
View File

@@ -0,0 +1,87 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <sstream>
#include <cstdlib>
#include <stdio.h>
#include "types.h"
namespace vortex {
// Read-only bundle of architectural parameters.
//
// Thread, warp, core and cluster counts are supplied by the caller. The
// remaining values are fixed: vsize = 16, num_regs = 32, num_csrs = 4096,
// num_barriers = NUM_BARRIERS. ipdom_size is computed from the thread count
// as (num_threads - 1) * 2.
class Arch {
public:
  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)
    : num_threads_(num_threads)
    , num_warps_(num_warps)
    , num_cores_(num_cores)
    , num_clusters_(num_clusters)
    , ipdom_size_((num_threads-1) * 2)
  {}

  // --- fixed parameters ---
  uint16_t vsize() const { return vsize_; }
  uint16_t num_regs() const { return num_regs_; }
  uint16_t num_csrs() const { return num_csrs_; }
  uint16_t num_barriers() const { return num_barriers_; }

  // --- derived parameter ---
  uint16_t ipdom_size() const { return ipdom_size_; }

  // --- caller-supplied parameters ---
  uint16_t num_threads() const { return num_threads_; }
  uint16_t num_warps() const { return num_warps_; }
  uint16_t num_cores() const { return num_cores_; }
  uint16_t num_clusters() const { return num_clusters_; }

private:
  uint16_t num_threads_;
  uint16_t num_warps_;
  uint16_t num_cores_;
  uint16_t num_clusters_;
  uint16_t vsize_ = 16;
  uint16_t num_regs_ = 32;
  uint16_t num_csrs_ = 4096;
  uint16_t num_barriers_ = NUM_BARRIERS;
  uint16_t ipdom_size_;
};
}

View File

@@ -1,70 +0,0 @@
#pragma once
#include <string>
#include <sstream>
#include <cstdlib>
#include <stdio.h>
#include "types.h"
namespace vortex {
// Read-only bundle of architectural parameters.
//
// Core, warp and thread counts are supplied by the caller. The remaining
// values are fixed: wsize = 4, vsize = 16, num_regs = 32, num_csrs = 4096,
// num_barriers = NUM_BARRIERS.
class ArchDef {
public:
  ArchDef(uint16_t num_cores,
          uint16_t num_warps,
          uint16_t num_threads)
    : num_cores_(num_cores)
    , num_warps_(num_warps)
    , num_threads_(num_threads)
  {}

  // --- fixed parameters ---
  uint16_t wsize() const { return wsize_; }
  uint16_t vsize() const { return vsize_; }
  uint16_t num_regs() const { return num_regs_; }
  uint16_t num_csrs() const { return num_csrs_; }
  uint16_t num_barriers() const { return num_barriers_; }

  // --- caller-supplied parameters ---
  uint16_t num_threads() const { return num_threads_; }
  uint16_t num_warps() const { return num_warps_; }
  uint16_t num_cores() const { return num_cores_; }

private:
  uint16_t num_cores_;
  uint16_t num_warps_;
  uint16_t num_threads_;
  uint16_t wsize_ = 4;
  uint16_t vsize_ = 16;
  uint16_t num_regs_ = 32;
  uint16_t num_csrs_ = 4096;
  uint16_t num_barriers_ = NUM_BARRIERS;
};
}

View File

@@ -1,47 +0,0 @@
#include <iostream>
#include <string>
#include "args.h"
using namespace vortex;
using std::string;
// Definitions of CommandLineArg's class-wide state.

// Help text accumulated from every constructed argument.
std::string CommandLineArg::helpString_;

// Constructed arguments keyed by long name.
std::unordered_map<std::string, CommandLineArg *> CommandLineArg::longArgs_;

// Constructed arguments keyed by short name.
std::unordered_map<std::string, CommandLineArg *> CommandLineArg::shortArgs_;
// Registers this argument under both a short name and a long name.
//   s        - short name used as key in shortArgs_
//   l        - long name used as key in longArgs_
//   helpText - text appended to the shared help string
CommandLineArg::CommandLineArg(string s, string l, const char *helpText) {
  helpString_.append(helpText);
  shortArgs_[s] = this;
  longArgs_[l] = this;
}
// Registers this argument under a long name only.
//   l        - long name used as key in longArgs_
//   helpText - text appended to the shared help string
CommandLineArg::CommandLineArg(string l, const char *helpText) {
  helpString_.append(helpText);
  longArgs_[l] = this;
}
void CommandLineArg::readArgs(int argc, char **argv) {
for (int i = 0; i < argc; i++) {
std::unordered_map<string, CommandLineArg *>::iterator
s = shortArgs_.find(std::string(argv[i])),
l = longArgs_.find(std::string(argv[i]));
if (s != shortArgs_.end()) {
i += s->second->read(argc - i, &argv[i]);
} else if (l != longArgs_.end()) {
i += l->second->read(argc - i, &argv[i]);
} else {
throw BadArg(string(argv[i]));
}
}
}
// Forgets every registered argument and empties the shared help text.
void CommandLineArg::clearArgs() {
  helpString_.clear();
  longArgs_.clear();
  shortArgs_.clear();
}
// Writes the help text accumulated from all constructed arguments to `os`.
void CommandLineArg::showHelp(std::ostream &os) {
os << helpString_;
}

View File

@@ -1,64 +0,0 @@
#pragma once
#include <iostream>
#include <string>
#include <sstream>
#include <unordered_map>
#include <util.h>
namespace vortex {
// Thrown when a command-line entry cannot be handled; `arg` carries the
// offending string.
struct BadArg {
  std::string arg;

  BadArg(std::string s) : arg(s) {}
};
// Base class of all command-line arguments.
//
// Every constructed argument registers itself in class-wide lookup tables
// (by short and/or long name) and contributes its help text to a shared help
// string. Derived classes implement read() to consume their value.
class CommandLineArg {
public:
  // Registers the argument under short name `s` and long name `l`.
  CommandLineArg(std::string s, std::string l, const char *helpText);

  // Registers the argument under long name `l` only.
  CommandLineArg(std::string l, const char *helpText);

  // Polymorphic base: a virtual destructor makes destroying a derived
  // argument through a CommandLineArg pointer well-defined.
  virtual ~CommandLineArg() = default;

  // Consumes this argument's value.
  //   argc/argv - remaining command-line entries; argv[0] is the entry that
  //               matched this argument.
  // Returns the number of entries consumed in addition to argv[0].
  virtual int read(int argc, char** argv) = 0;

  // Dispatches each entry of argv to the matching registered argument.
  static void readArgs(int argc, char **argv);

  // Drops all registrations and the accumulated help text.
  static void clearArgs();

  // Writes the accumulated help text to `os`.
  static void showHelp(std::ostream &os);

private:
  static std::string helpString_;
  static std::unordered_map<std::string, CommandLineArg *> longArgs_;
  static std::unordered_map<std::string, CommandLineArg *> shortArgs_;
};
template <typename T> class CommandLineArgSetter : public CommandLineArg {
public:
CommandLineArgSetter(std::string s, std::string l, const char *ht, T &x) :
CommandLineArg(s, l, ht), arg_(x) {}
CommandLineArgSetter(std::string l, const char *ht, T &x) :
CommandLineArg(l, ht), arg_(x) {}
int read(int argc, char **argv) {
__unused (argc);
std::istringstream iss(argv[1]);
iss >> arg_;
return 1;
}
private:
T &arg_;
};
// Boolean command-line switch bound to a caller-owned bool.
// Construction resets the bound variable to false; seeing the switch on the
// command line sets it to true.
class CommandLineArgFlag : public CommandLineArg {
public:
  // s/l - short and long names; ht - help text; x - bound variable.
  CommandLineArgFlag(std::string s, std::string l, const char *ht, bool &x)
    : CommandLineArg(s, l, ht)
    , arg_(x)
  {
    x = false;
  }

  // l - long name; ht - help text; x - bound variable.
  CommandLineArgFlag(std::string l, const char *ht, bool &x)
    : CommandLineArg(l, ht)
    , arg_(x)
  {
    x = false;
  }

  // Marks the switch as present. Consumes no extra entries (returns 0).
  int read(int argc, char **argv) {
    __unused (argc, argv);
    arg_ = true;
    return 0;
  }

private:
  bool &arg_;
};
}

View File

@@ -1,637 +0,0 @@
#include "cache.h"
#include "debug.h"
#include "types.h"
#include <util.h>
#include <unordered_map>
#include <vector>
#include <list>
#include <queue>
using namespace vortex;
// Address-decoding parameters derived from a cache configuration.
//
// An address is split, from least to most significant bit, into word-select,
// bank-select, set-select and tag fields. The *_start / *_end members hold
// the inclusive bit positions of each field; a field is treated as absent by
// the accessors below when its end is below its start.
struct params_t {
  uint32_t sets_per_bank;
  uint32_t blocks_per_set;
  uint32_t words_per_block;
  uint32_t log2_num_inputs;

  uint32_t word_select_addr_start;
  uint32_t word_select_addr_end;

  uint32_t bank_select_addr_start;
  uint32_t bank_select_addr_end;

  uint32_t set_select_addr_start;
  uint32_t set_select_addr_end;

  uint32_t tag_select_addr_start;
  uint32_t tag_select_addr_end;

  // Derives all parameters from `config`. C, B, W and A are used as log2
  // quantities (they appear only as shift amounts / bit counts below).
  params_t(const Cache::Config& config) {
    uint32_t bank_bits = log2ceil(config.num_banks);
    uint32_t offset_bits = config.B - config.W;
    uint32_t log2_bank_size = config.C - bank_bits;
    // log2 of the bytes occupied by one set: block size times associativity.
    // These are log2 values, so they must be ADDED; the previous
    // `config.B << config.A` only gave the right result for A == 0.
    uint32_t log2_set_size = uint32_t(config.B) + uint32_t(config.A);
    assert(log2_bank_size >= config.B);
    // a bank must hold at least one complete set, otherwise index_bits wraps
    assert(log2_bank_size >= log2_set_size);
    uint32_t index_bits = log2_bank_size - log2_set_size;

    this->log2_num_inputs = log2ceil(config.num_inputs);

    this->words_per_block = 1 << offset_bits;
    this->blocks_per_set = 1 << config.A;
    this->sets_per_bank = 1 << index_bits;

    assert(config.ports_per_bank <= this->words_per_block);

    // Word select
    this->word_select_addr_start = config.W;
    this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);

    // Bank select
    this->bank_select_addr_start = (1+this->word_select_addr_end);
    this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);

    // Set select
    this->set_select_addr_start = (1+this->bank_select_addr_end);
    this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);

    // Tag select
    this->tag_select_addr_start = (1+this->set_select_addr_end);
    this->tag_select_addr_end = (config.addr_width-1);
  }

  // Returns the bank-select field of `word_addr`, or 0 if the field is absent.
  uint32_t addr_bank_id(uint64_t word_addr) const {
    if (bank_select_addr_end >= bank_select_addr_start)
      return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
    else
      return 0;
  }

  // Returns the set-select field of `word_addr`, or 0 if the field is absent.
  uint32_t addr_set_id(uint64_t word_addr) const {
    if (set_select_addr_end >= set_select_addr_start)
      return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
    else
      return 0;
  }

  // Returns the tag field of `word_addr`, or 0 if the field is absent.
  uint64_t addr_tag(uint64_t word_addr) const {
    if (tag_select_addr_end >= tag_select_addr_start)
      return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
    else
      return 0;
  }

  // Rebuilds an address from its bank, set and tag fields; the word-select
  // bits (and anything below them) are left at zero.
  uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
    uint64_t addr(0);
    if (bank_select_addr_end >= bank_select_addr_start)
      addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
    if (set_select_addr_end >= set_select_addr_start)
      addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
    if (tag_select_addr_end >= tag_select_addr_start)
      addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
    return addr;
  }
};
// Bookkeeping for one cache block inside a set.
struct block_t {
// entry is in use; reset to false when the owning set is cleared
bool valid;
// NOTE(review): presumably marks a modified block awaiting write-back — usage not visible in this chunk
bool dirty;
// address tag of the block stored in this entry
uint64_t tag;
// NOTE(review): presumably the LRU replacement counter — usage not visible in this chunk
uint32_t lru_ctr;
};
struct set_t {
std::vector<block_t> blocks;
set_t(uint32_t size) : blocks(size) {}
void clear() {
for (auto& block : blocks) {
block.valid = false;
}
}
};
// Per-port slot of a bank request.
struct bank_req_info_t {
// slot carries a request
bool valid;
// id of the originating request
uint32_t req_id;
// tag of the originating request
uint64_t req_tag;
};
// A request travelling through one cache bank.
// All scalar fields start out false/zero; `infos` holds one slot per port.
struct bank_req_t {
  bool valid = false;
  bool write = false;
  bool mshr_replay = false;
  uint64_t tag = 0;
  uint32_t set_id = 0;
  uint32_t core_id = 0;
  uint64_t uuid = 0;
  std::vector<bank_req_info_t> infos;

  // Creates an empty (invalid) request with `size` per-port info slots.
  bank_req_t(uint32_t size) : infos(size) {}
};
// A bank request parked in the MSHR, extended with a block id.
struct mshr_entry_t : public bank_req_t {
  uint32_t block_id = 0;

  // Creates an empty entry with `size` per-port info slots (none by default).
  mshr_entry_t(uint32_t size = 0) : bank_req_t(size) {}
};
class MSHR {
private:
std::vector<mshr_entry_t> entries_;
uint32_t size_;
public:
MSHR(uint32_t size)
: entries_(size)
, size_(0)
{}
bool empty() const {
return (0 == size_);
}
bool full() const {
return (size_ == entries_.size());
}
int lookup(const bank_req_t& bank_req) {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
if (entry.valid
&& entry.set_id == bank_req.set_id
&& entry.tag == bank_req.tag) {
return i;
}
}
return -1;
}
int allocate(const bank_req_t& bank_req, uint32_t block_id) {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
if (!entry.valid) {
*(bank_req_t*)&entry = bank_req;
entry.valid = true;
entry.mshr_replay = false;
entry.block_id = block_id;
++size_;
return i;
}
}
return -1;
}
mshr_entry_t& replay(uint32_t id) {
auto& root_entry = entries_.at(id);
assert(root_entry.valid);
// make all related mshr entries for replay
for (auto& entry : entries_) {
if (entry.valid
&& entry.set_id == root_entry.set_id
&& entry.tag == root_entry.tag) {
entry.mshr_replay = true;
}
}
return root_entry;
}
bool pop(bank_req_t* out) {
for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) {
*out = entry;
entry.valid = false;
--size_;
return true;
}
}
return false;
}
void clear() {
for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) {
entry.valid = false;
}
}
size_ = 0;
}
};
struct bank_t {
std::vector<set_t> sets;
MSHR mshr;
bank_t(const Cache::Config& config,
const params_t& params)
: sets(params.sets_per_bank, params.blocks_per_set)
, mshr(config.mshr_size)
{}
void clear() {
mshr.clear();
for (auto& set : sets) {
set.clear();
}
}
};
///////////////////////////////////////////////////////////////////////////////
/// Timing model behind `Cache`: a multi-banked, set-associative cache with a
/// per-bank MSHR and a separate path for non-cacheable requests.
///
/// Each tick() performs, in order:
///   1. idle while the initial tag-flush countdown is running;
///   2. forward at most one bypass (non-cacheable) response to the core;
///   3. let every bank pop one MSHR replay into its pipeline slot;
///   4. apply at most one memory fill per bank;
///   5. arbitrate core requests onto the per-bank pipeline slots;
///   6. process the request scheduled on every bank.
class Cache::Impl {
private:
  Cache* const simobject_;
  Config config_;
  params_t params_;
  std::vector<bank_t> banks_;
  Switch<MemReq, MemRsp>::Ptr mem_switch_;
  Switch<MemReq, MemRsp>::Ptr bypass_switch_;
  std::vector<SimPort<MemReq>> mem_req_ports_;
  std::vector<SimPort<MemRsp>> mem_rsp_ports_;
  // Counters get explicit defaults so they are well defined even if a caller
  // queries the model before the first reset().
  uint32_t flush_cycles_ = 0;
  PerfStats perf_stats_;
  uint64_t pending_read_reqs_ = 0;
  uint64_t pending_write_reqs_ = 0;
  uint64_t pending_fill_reqs_ = 0;

public:
  /// Creates the banks and wires the memory side of the cache.
  ///
  /// Bank memory ports feed input 0 of a 2-input priority "bypass" switch,
  /// through a round-robin bank arbiter when there is more than one bank.
  /// Input 1 of the bypass switch carries non-cacheable requests.
  ///
  /// @param simobject owning Cache object whose ports are bound here.
  /// @param config    cache configuration (copied).
  Impl(Cache* simobject, const Config& config)
    : simobject_(simobject)
    , config_(config)
    , params_(config)
    , banks_(config.num_banks, {config, params_})
    , mem_req_ports_(config.num_banks, simobject)
    , mem_rsp_ports_(config.num_banks, simobject)
  {
    bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
    bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
    simobject->MemRspPort.bind(&bypass_switch_->RspIn);

    if (config.num_banks > 1) {
      mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
      for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
        mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
        mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
      }
      mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
      bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
    } else {
      mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
      bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
    }

    // calculate tag flush cycles
    flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
  }

  /// Invalidates every bank and zeroes the statistics and pending counters.
  void reset() {
    for (auto& bank : banks_) {
      bank.clear();
    }
    perf_stats_ = PerfStats();
    pending_read_reqs_ = 0;
    pending_write_reqs_ = 0;
    pending_fill_reqs_ = 0;
  }

  /// Advances the cache model by one cycle.
  void tick() {
    // wait on flush cycles
    if (flush_cycles_ != 0) {
      --flush_cycles_;
      return;
    }

    // per-bank pipeline request
    std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);

    // calculate memory latency: one unit per in-flight fill per cycle
    perf_stats_.mem_latency += pending_fill_reqs_;

    // handle bypass responses
    auto& bypass_port = bypass_switch_->RspOut.at(1);
    if (!bypass_port.empty()) {
      auto& mem_rsp = bypass_port.front();
      // the low bits of the tag carry the originating input (see processIORequest)
      uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
      uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
      MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid};
      simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
      DT(3, simobject_->name() << "-" << core_rsp);
      bypass_port.pop();
    }

    // handle MSHR replay
    for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
      auto& bank = banks_.at(bank_id);
      auto& pipeline_req = pipeline_reqs.at(bank_id);
      bank.mshr.pop(&pipeline_req);
    }

    // handle memory fills
    std::vector<bool> pending_fill_req(config_.num_banks, false);
    for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
      auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
      if (!mem_rsp_port.empty()) {
        auto& mem_rsp = mem_rsp_port.front();
        this->processMemoryFill(bank_id, mem_rsp.tag);
        pending_fill_req.at(bank_id) = true;
        mem_rsp_port.pop();
      }
    }

    // handle incoming core requests
    for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
      auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
      if (core_req_port.empty())
        continue;

      auto& core_req = core_req_port.front();

      // check cache bypassing
      if (core_req.non_cacheable) {
        // send IO request
        this->processIORequest(core_req, req_id);
        // remove request
        core_req_port.pop();
        continue;
      }

      auto bank_id = params_.addr_bank_id(core_req.addr);
      auto set_id = params_.addr_set_id(core_req.addr);
      auto tag = params_.addr_tag(core_req.addr);
      auto port_id = req_id % config_.ports_per_bank;

      // create bank request
      bank_req_t bank_req(config_.ports_per_bank);
      bank_req.valid = true;
      bank_req.write = core_req.write;
      bank_req.mshr_replay = false;
      bank_req.tag = tag;
      bank_req.set_id = set_id;
      bank_req.core_id = core_req.core_id;
      bank_req.uuid = core_req.uuid;
      bank_req.infos.at(port_id) = {true, req_id, core_req.tag};

      auto& bank = banks_.at(bank_id);
      auto& pipeline_req = pipeline_reqs.at(bank_id);

      // check pending MSHR replay
      if (pipeline_req.valid
       && pipeline_req.mshr_replay) {
        // stall
        continue;
      }

      // check pending fill request
      if (pending_fill_req.at(bank_id)) {
        // stall
        continue;
      }

      // check MSHR capacity if read or writeback
      if ((!core_req.write || !config_.write_through)
       && bank.mshr.full()) {
        ++perf_stats_.mshr_stalls;
        continue;
      }

      // check bank conflicts
      if (pipeline_req.valid) {
        // check port conflict: only same-line, same-direction requests on a
        // free port may be merged into the already scheduled request
        if (pipeline_req.write != core_req.write
         || pipeline_req.set_id != set_id
         || pipeline_req.tag != tag
         || pipeline_req.infos[port_id].valid) {
          ++perf_stats_.bank_stalls;
          continue;
        }
        // update pending request infos
        pipeline_req.infos[port_id] = bank_req.infos[port_id];
      } else {
        // schedule new request
        pipeline_req = bank_req;
      }

      if (core_req.write)
        ++perf_stats_.writes;
      else
        ++perf_stats_.reads;

      // remove request
      auto time = core_req_port.pop();
      perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
    }

    // process active request
    this->processBankRequest(pipeline_reqs);
  }

  /// @return the statistics accumulated since the last reset().
  const PerfStats& perf_stats() const {
    return perf_stats_;
  }

private:
  /// Sends one core response for every *valid* port slot of `bank_req`.
  /// Unused slots carry no requester, so they must not produce a response.
  void sendCoreResponses(const bank_req_t& bank_req) {
    for (const auto& info : bank_req.infos) {
      if (!info.valid)
        continue;
      MemRsp core_rsp{info.req_tag, bank_req.core_id, bank_req.uuid};
      simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
      DT(3, simobject_->name() << "-" << core_rsp);
    }
  }

  /// Forwards a non-cacheable request through input 1 of the bypass switch.
  /// The input index is packed into the low tag bits so the response can be
  /// routed back; writes are acknowledged immediately when `write_reponse` is set.
  void processIORequest(const MemReq& core_req, uint32_t req_id) {
    {
      MemReq mem_req(core_req);
      mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
      bypass_switch_->ReqIn.at(1).send(mem_req, 1);
      DT(3, simobject_->name() << "-" << mem_req);
    }

    if (core_req.write && config_.write_reponse) {
      MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid};
      simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
      DT(3, simobject_->name() << "-" << core_rsp);
    }
  }

  /// Installs the block a fill response belongs to and marks the matching
  /// MSHR entries for replay. `mshr_id` is the tag echoed by the memory response.
  void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
    // update block
    auto& bank = banks_.at(bank_id);
    auto& entry = bank.mshr.replay(mshr_id);
    auto& set = bank.sets.at(entry.set_id);
    auto& block = set.blocks.at(entry.block_id);
    block.valid = true;
    block.tag = entry.tag;
    --pending_fill_reqs_;
  }

  /// Executes the request scheduled on each bank: replays answer the core,
  /// fresh requests go through tag lookup and hit/miss handling.
  void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
    for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
      auto& pipeline_req = pipeline_reqs.at(bank_id);
      if (!pipeline_req.valid)
        continue;

      auto& bank = banks_.at(bank_id);
      auto& set = bank.sets.at(pipeline_req.set_id);

      if (pipeline_req.mshr_replay) {
        // send core response; writes only respond when the core expects it,
        // matching the hit path below
        if (!pipeline_req.write || config_.write_reponse) {
          this->sendCoreResponses(pipeline_req);
        }
      } else {
        bool hit = false;
        bool found_free_block = false;
        uint32_t hit_block_id = 0;
        uint32_t free_block_id = 0;
        uint32_t lru_block_id = 0;
        uint32_t max_cnt = 0;

        // tag lookup; ages every valid non-matching block
        for (uint32_t i = 0, num_blocks = set.blocks.size(); i < num_blocks; ++i) {
          auto& block = set.blocks.at(i);
          if (block.valid) {
            if (block.tag == pipeline_req.tag) {
              block.lru_ctr = 0;
              hit_block_id = i;
              hit = true;
            } else {
              ++block.lru_ctr;
            }
            if (max_cnt < block.lru_ctr) {
              max_cnt = block.lru_ctr;
              lru_block_id = i;
            }
          } else {
            found_free_block = true;
            free_block_id = i;
          }
        }

        // A free block always wins over the oldest valid one; otherwise a
        // valid (possibly dirty) block could be replaced without write-back.
        const uint32_t repl_block_id = found_free_block ? free_block_id : lru_block_id;

        if (hit) {
          //
          // Hit handling
          //
          if (pipeline_req.write) {
            // handle write hit
            auto& hit_block = set.blocks.at(hit_block_id);
            if (config_.write_through) {
              // forward write request to memory
              MemReq mem_req;
              mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
              mem_req.write = true;
              mem_req.core_id = pipeline_req.core_id;
              mem_req.uuid = pipeline_req.uuid;
              mem_req_ports_.at(bank_id).send(mem_req, 1);
              DT(3, simobject_->name() << "-" << mem_req);
            } else {
              // mark block as dirty
              hit_block.dirty = true;
            }
          }
          // send core response
          if (!pipeline_req.write || config_.write_reponse) {
            this->sendCoreResponses(pipeline_req);
          }
        } else {
          //
          // Miss handling
          //
          if (pipeline_req.write)
            ++perf_stats_.write_misses;
          else
            ++perf_stats_.read_misses;

          if (!found_free_block && !config_.write_through) {
            // write back dirty block
            auto& repl_block = set.blocks.at(repl_block_id);
            if (repl_block.dirty) {
              MemReq mem_req;
              mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
              mem_req.write = true;
              mem_req.core_id = pipeline_req.core_id;
              mem_req_ports_.at(bank_id).send(mem_req, 1);
              DT(3, simobject_->name() << "-" << mem_req);
              ++perf_stats_.evictions;
            }
          }

          if (pipeline_req.write && config_.write_through) {
            // forward write request to memory
            {
              MemReq mem_req;
              mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
              mem_req.write = true;
              mem_req.core_id = pipeline_req.core_id;
              mem_req.uuid = pipeline_req.uuid;
              mem_req_ports_.at(bank_id).send(mem_req, 1);
              DT(3, simobject_->name() << "-" << mem_req);
            }
            // send core response
            if (config_.write_reponse) {
              this->sendCoreResponses(pipeline_req);
            }
          } else {
            // MSHR lookup (must happen before allocate, which adds a matching entry)
            int pending = bank.mshr.lookup(pipeline_req);
            // allocate MSHR
            int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
            // send fill request only for the first miss on this block
            if (pending == -1) {
              MemReq mem_req;
              mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
              mem_req.write = false;
              mem_req.tag = mshr_id;
              mem_req.core_id = pipeline_req.core_id;
              mem_req.uuid = pipeline_req.uuid;
              mem_req_ports_.at(bank_id).send(mem_req, 1);
              DT(3, simobject_->name() << "-" << mem_req);
              ++pending_fill_reqs_;
            }
          }
        }
      }
    }
  }
};
///////////////////////////////////////////////////////////////////////////////
// Constructs the cache sim object: one core request port and one core response
// port per configured input (config.num_inputs), a single memory request /
// response port pair, and the private implementation object that holds all
// of the model state.
Cache::Cache(const SimContext& ctx, const char* name, const Config& config)
: SimObject<Cache>(ctx, name)
, CoreReqPorts(config.num_inputs, this)
, CoreRspPorts(config.num_inputs, this)
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, config))
{}
// Destroys the implementation object owned through the raw impl_ pointer.
Cache::~Cache() {
delete impl_;
}
// Resets the cache model; all work is delegated to the implementation object.
void Cache::reset() {
impl_->reset();
}
// Advances the cache model by one step; delegated to the implementation object.
void Cache::tick() {
impl_->tick();
}
// Returns a reference to the statistics kept by the implementation object.
// The reference refers to storage inside impl_, so it is only usable while
// this Cache is alive.
const Cache::PerfStats& Cache::perf_stats() const {
return impl_->perf_stats();
}

106
sim/simx/cache_cluster.h Normal file
View File

@@ -0,0 +1,106 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "cache_sim.h"
namespace vortex {
/// A group of `num_caches` identical caches shared by `num_units` requesters.
///
/// Request path built by the constructor (responses follow the reverse path):
///
///   CoreReqPorts[u][r] -> unit arbiter u -> memory arbiter i -> cache c input i
///   cache c MemReqPort -> cache arbiter  -> MemReqPort
///
/// where r < num_requests, i < config.num_inputs and c < num_caches.
/// When `num_caches` is 0 a single cache is still instantiated, but with
/// `bypass` forced on in its configuration.
class CacheCluster : public SimObject<CacheCluster> {
public:
  std::vector<std::vector<SimPort<MemReq>>> CoreReqPorts;
  std::vector<std::vector<SimPort<MemRsp>>> CoreRspPorts;
  SimPort<MemReq> MemReqPort;
  SimPort<MemRsp> MemRspPort;

  /// @param ctx          simulation context forwarded to SimObject.
  /// @param name         cluster name, used as prefix for all child object names.
  /// @param num_units    number of requesters (outer dimension of the core ports).
  /// @param num_caches   number of cache instances; 0 selects bypass mode.
  /// @param num_requests number of request lanes per requester.
  /// @param config       configuration applied to every cache instance.
  CacheCluster(const SimContext& ctx,
               const char* name,
               uint32_t num_units,
               uint32_t num_caches,
               uint32_t num_requests,
               const CacheSim::Config& config)
    : SimObject(ctx, name)
    , CoreReqPorts(num_units, std::vector<SimPort<MemReq>>(num_requests, this))
    , CoreRspPorts(num_units, std::vector<SimPort<MemRsp>>(num_requests, this))
    , MemReqPort(this)
    , MemRspPort(this)
    , caches_(MAX(num_caches, 0x1)) {

    CacheSim::Config config2(config);
    if (0 == num_caches) {
      // no real cache requested: keep one instance as a pass-through
      num_caches = 1;
      config2.bypass = true;
    }

    // scratch buffer for child object names; all indices are unsigned, hence %u
    char sname[100];

    // per-requester arbiters: num_requests lanes -> config.num_inputs cache inputs
    std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
    for (uint32_t u = 0; u < num_units; ++u) {
      snprintf(sname, sizeof(sname), "%s-unit-arb-%u", name, u);
      unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
      for (uint32_t i = 0; i < num_requests; ++i) {
        this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
        unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
      }
    }

    // per-cache-input arbiters: num_units requesters -> num_caches caches
    std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
    for (uint32_t i = 0; i < config.num_inputs; ++i) {
      snprintf(sname, sizeof(sname), "%s-mem-arb-%u", name, i);
      mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
      for (uint32_t u = 0; u < num_units; ++u) {
        unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
        mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
      }
    }

    // memory-side arbiter: num_caches caches -> single cluster memory port
    snprintf(sname, sizeof(sname), "%s-cache-arb", name);
    auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);

    for (uint32_t i = 0; i < num_caches; ++i) {
      snprintf(sname, sizeof(sname), "%s-cache%u", name, i);
      caches_.at(i) = CacheSim::Create(sname, config2);

      for (uint32_t j = 0; j < config.num_inputs; ++j) {
        mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j));
        caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
      }

      caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i));
      cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort);
    }

    cache_arb->ReqOut.at(0).bind(&this->MemReqPort);
    this->MemRspPort.bind(&cache_arb->RspOut.at(0));
  }

  ~CacheCluster() {}

  /// No cluster-level state to reset; this body is intentionally empty.
  void reset() {}

  /// No cluster-level per-cycle work; this body is intentionally empty.
  void tick() {}

  /// @return the sum of the statistics of every cache in the cluster.
  CacheSim::PerfStats perf_stats() const {
    CacheSim::PerfStats perf;
    // iterate by reference: copying the handle per iteration is unnecessary
    for (const auto& cache : caches_) {
      perf += cache->perf_stats();
    }
    return perf;
  }

private:
  std::vector<CacheSim::Ptr> caches_;
};
}

707
sim/simx/cache_sim.cpp Normal file
View File

@@ -0,0 +1,707 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cache_sim.h"
#include "debug.h"
#include "types.h"
#include <util.h>
#include <unordered_map>
#include <vector>
#include <list>
#include <queue>
using namespace vortex;
/// Geometry of one cache derived from CacheSim::Config, plus the address
/// bit-field layout used to split a word address into (bank, set, tag).
///
/// From least to most significant bit the layout is:
///   [word size W] [word select] [bank select] [set select] [tag]
/// A field whose width is zero ends one bit below where it starts; the
/// accessors detect that (end < start) and treat the field as 0.
struct params_t {
  uint32_t sets_per_bank;
  uint32_t lines_per_set;
  uint32_t words_per_line;
  uint32_t log2_num_inputs;

  uint32_t word_select_addr_start;
  uint32_t word_select_addr_end;

  uint32_t bank_select_addr_start;
  uint32_t bank_select_addr_end;

  uint32_t set_select_addr_start;
  uint32_t set_select_addr_end;

  uint32_t tag_select_addr_start;
  uint32_t tag_select_addr_end;

  /// Computes every derived quantity from the log2-encoded configuration.
  params_t(const CacheSim::Config& config) {
    int32_t num_bank_bits   = log2ceil(config.num_banks);
    int32_t num_offset_bits = config.B - config.W;
    int32_t bank_size_log2  = config.C - num_bank_bits;
    int32_t num_index_bits  = bank_size_log2 - (config.B + config.A);

    // sanity-check the configuration before the widths are used as shifts
    assert(bank_size_log2 > 0);
    assert(num_offset_bits >= 0);
    assert(num_index_bits >= 0);

    log2_num_inputs = log2ceil(config.num_inputs);
    words_per_line  = 1 << num_offset_bits;
    lines_per_set   = 1 << config.A;
    sets_per_bank   = 1 << num_index_bits;

    assert(config.ports_per_bank <= words_per_line);

    // each field begins right after the previous one ends
    word_select_addr_start = config.W;
    word_select_addr_end   = word_select_addr_start + num_offset_bits - 1;

    bank_select_addr_start = 1 + word_select_addr_end;
    bank_select_addr_end   = bank_select_addr_start + num_bank_bits - 1;

    set_select_addr_start  = 1 + bank_select_addr_end;
    set_select_addr_end    = set_select_addr_start + num_index_bits - 1;

    tag_select_addr_start  = 1 + set_select_addr_end;
    tag_select_addr_end    = config.addr_width - 1;
  }

  /// @return the bank field of `word_addr`, or 0 when the field is empty.
  uint32_t addr_bank_id(uint64_t word_addr) const {
    if (bank_select_addr_end < bank_select_addr_start)
      return 0;
    return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
  }

  /// @return the set field of `word_addr`, or 0 when the field is empty.
  uint32_t addr_set_id(uint64_t word_addr) const {
    if (set_select_addr_end < set_select_addr_start)
      return 0;
    return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
  }

  /// @return the tag field of `word_addr`, or 0 when the field is empty.
  uint64_t addr_tag(uint64_t word_addr) const {
    if (tag_select_addr_end < tag_select_addr_start)
      return 0;
    return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
  }

  /// Rebuilds an address from its (bank, set, tag) fields; the bits below
  /// the bank field are left at zero.
  uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
    uint64_t result(0);
    const bool has_bank_field = (bank_select_addr_end >= bank_select_addr_start);
    const bool has_set_field  = (set_select_addr_end >= set_select_addr_start);
    const bool has_tag_field  = (tag_select_addr_end >= tag_select_addr_start);
    if (has_bank_field) {
      result = bit_setw(result, bank_select_addr_start, bank_select_addr_end, bank_id);
    }
    if (has_set_field) {
      result = bit_setw(result, set_select_addr_start, set_select_addr_end, set_id);
    }
    if (has_tag_field) {
      result = bit_setw(result, tag_select_addr_start, tag_select_addr_end, tag);
    }
    return result;
  }
};
/// State of one cache line as tracked by the timing model (no data payload).
struct line_t {
  uint64_t tag = 0;      // tag of the cached line; meaningful only while valid
  uint32_t lru_ctr = 0;  // age counter used for victim selection (larger = older)
  bool valid = false;    // line currently holds a cached line
  bool dirty = false;    // line was written and not yet written back

  /// Returns the line to its pristine state.
  /// The tag and age are reset as well, so replacement decisions taken after
  /// a cache reset do not depend on what happened before it.
  void clear() {
    tag = 0;
    lru_ctr = 0;
    valid = false;
    dirty = false;
  }
};
struct set_t {
std::vector<line_t> lines;
set_t(uint32_t num_ways)
: lines(num_ways)
{}
void clear() {
for (auto& line : lines) {
line.clear();
}
}
};
/// One port slot of a bank request: identifies the core-side requester that
/// occupies the slot. Kept as a plain aggregate so it can be brace-initialized
/// in declaration order: {req_id, req_tag, valid}.
struct bank_req_port_t {
  uint32_t req_id;   // index of the originating core request port
  uint64_t req_tag;  // tag of the core request, echoed in the response
  bool valid;        // slot is in use

  /// Marks the slot as unused; req_id and req_tag are left untouched.
  void clear() { this->valid = false; }
};
/// A request travelling through one bank's pipeline slot or parked in an
/// MSHR entry.
struct bank_req_t {
  /// What the request represents; `None` marks an empty slot / free entry.
  enum ReqType {
    None   = 0,
    Fill   = 1,
    Replay = 2,
    Core   = 3
  };

  std::vector<bank_req_port_t> ports;  // one slot per bank port
  uint64_t tag;
  uint32_t set_id;
  uint32_t cid;
  uint64_t uuid;
  ReqType  type;
  bool     write;

  /// Creates an empty request with `num_ports` port slots.
  /// Every scalar member is initialized: objects of this type are copied
  /// (e.g. when a vector is filled from one prototype) and `type` is inspected
  /// before any clear() call, so none of them may be left indeterminate.
  /// Deliberately not `explicit`: containers are built by passing the port
  /// count where a bank_req_t is expected.
  bank_req_t(uint32_t num_ports)
    : ports(num_ports)
    , tag(0)
    , set_id(0)
    , cid(0)
    , uuid(0)
    , type(ReqType::None)
    , write(false)
  {}

  /// Invalidates every port slot and marks the request as empty.
  /// The remaining fields are left as they are.
  void clear() {
    for (auto& port : ports) {
      port.clear();
    }
    type = ReqType::None;
  }
};
/// One MSHR entry: the parked request plus the line chosen to receive the fill.
struct mshr_entry_t {
  bank_req_t bank_req;
  uint32_t line_id;  // line index recorded at allocation time

  /// Creates a free entry with room for `num_ports` port slots.
  /// `line_id` is zero-initialized because entries are copied when the MSHR
  /// table is built, and copying an indeterminate value must be avoided.
  /// Deliberately not `explicit`: the table is built by passing the port
  /// count where an entry is expected.
  mshr_entry_t(uint32_t num_ports)
    : bank_req(num_ports)
    , line_id(0)
  {}

  /// Frees the entry by clearing the parked request.
  void clear() {
    bank_req.clear();
  }
};
/// Miss status holding registers of one bank: a fixed table of parked
/// requests. An entry is free while its request type is `None`, waiting for
/// memory while it is `Core`, and ready to be re-issued once it is `Replay`.
class MSHR {
private:
  std::vector<mshr_entry_t> entries_;
  uint32_t size_;  // number of occupied entries

public:
  /// Builds a table of `size` free entries, each with `num_ports` port slots.
  MSHR(uint32_t size, uint32_t num_ports)
    : entries_(size, num_ports)
    , size_(0)
  {}

  /// @return true when no entry is occupied.
  bool empty() const {
    return (size_ == 0);
  }

  /// @return true when every entry is occupied.
  bool full() const {
    return (entries_.size() == size_);
  }

  /// @return true when an occupied entry already targets the same set and
  ///         tag as `bank_req`.
  bool lookup(const bank_req_t& bank_req) {
    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
      auto& parked = entries_.at(i).bank_req;
      if (parked.type == bank_req_t::None)
        continue;
      if (parked.set_id != bank_req.set_id)
        continue;
      if (parked.tag != bank_req.tag)
        continue;
      return true;
    }
    return false;
  }

  /// Parks `bank_req` in the first free entry and records `line_id` with it.
  /// @return the index of the entry used, or -1 when the table is full.
  int allocate(const bank_req_t& bank_req, uint32_t line_id) {
    uint32_t slot_id = 0;
    for (auto& entry : entries_) {
      if (entry.bank_req.type != bank_req_t::None) {
        ++slot_id;
        continue;
      }
      entry.bank_req = bank_req;
      entry.line_id = line_id;
      ++size_;
      return slot_id;
    }
    return -1;
  }

  /// Marks entry `id` and every other waiting entry for the same set and tag
  /// as ready for replay.
  /// @return the entry at index `id`.
  mshr_entry_t& replay(uint32_t id) {
    auto& root = entries_.at(id);
    assert(root.bank_req.type == bank_req_t::Core);

    // snapshot the key: only `type` changes below, never set_id/tag
    const auto root_set_id = root.bank_req.set_id;
    const auto root_tag = root.bank_req.tag;

    // mark all related mshr entries for replay
    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
      auto& parked = entries_.at(i).bank_req;
      const bool waiting = (parked.type == bank_req_t::Core);
      if (waiting && parked.set_id == root_set_id && parked.tag == root_tag) {
        parked.type = bank_req_t::Replay;
      }
    }
    return root;
  }

  /// Copies the first replay-ready request into `*out` and frees its entry.
  /// @return false (leaving `*out` untouched) when nothing is ready.
  bool pop(bank_req_t* out) {
    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
      auto& parked = entries_.at(i).bank_req;
      if (parked.type != bank_req_t::Replay)
        continue;
      *out = parked;
      parked.type = bank_req_t::None;
      --size_;
      return true;
    }
    return false;
  }

  /// Frees every entry.
  void clear() {
    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
      entries_.at(i).clear();
    }
    size_ = 0;
  }
};
struct bank_t {
std::vector<set_t> sets;
MSHR mshr;
bank_t(const CacheSim::Config& config,
const params_t& params)
: sets(params.sets_per_bank, params.lines_per_set)
, mshr(config.mshr_size, config.ports_per_bank)
{}
void clear() {
for (auto& set : sets) {
set.clear();
}
mshr.clear();
}
};
///////////////////////////////////////////////////////////////////////////////
/// Timing model behind `CacheSim`: a multi-banked, set-associative cache with
/// a per-bank MSHR, a path for IO (uncached) requests, and a pure pass-through
/// mode selected by `Config::bypass`.
///
/// Outside bypass mode each tick() performs, in order:
///   1. idle while the initialization countdown is running;
///   2. forward at most one IO response to the core;
///   3. clear the per-bank pipeline slots;
///   4. let every bank pop one MSHR replay into its slot;
///   5. schedule one memory fill per bank whose slot is still empty;
///   6. arbitrate core requests onto the slots;
///   7. process the request scheduled on every bank.
class CacheSim::Impl {
private:
  CacheSim* const simobject_;
  Config config_;
  params_t params_;
  std::vector<bank_t> banks_;
  Switch<MemReq, MemRsp>::Ptr bank_switch_;
  Switch<MemReq, MemRsp>::Ptr bypass_switch_;
  std::vector<SimPort<MemReq>> mem_req_ports_;
  std::vector<SimPort<MemRsp>> mem_rsp_ports_;
  std::vector<bank_req_t> pipeline_reqs_;
  // Explicit defaults: in bypass mode neither the constructor nor reset()
  // assigns these, and they must still hold defined values.
  uint32_t init_cycles_ = 0;
  PerfStats perf_stats_;
  uint64_t pending_read_reqs_ = 0;
  uint64_t pending_write_reqs_ = 0;
  uint64_t pending_fill_reqs_ = 0;

public:
  /// Creates the banks and wires the cache ports.
  ///
  /// In bypass mode the core ports are connected straight to the memory ports
  /// through a round-robin switch and nothing else is set up. Otherwise bank
  /// memory ports feed input 0 of a 2-input priority switch (through a
  /// round-robin bank arbiter when there is more than one bank) and input 1
  /// of that switch carries IO requests.
  ///
  /// @param simobject owning CacheSim object whose ports are bound here.
  /// @param config    cache configuration (copied).
  Impl(CacheSim* simobject, const Config& config)
    : simobject_(simobject)
    , config_(config)
    , params_(config)
    , banks_(config.num_banks, {config, params_})
    , mem_req_ports_(config.num_banks, simobject)
    , mem_rsp_ports_(config.num_banks, simobject)
    , pipeline_reqs_(config.num_banks, config.ports_per_bank)
  {
    char sname[100];
    snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());

    if (config_.bypass) {
      bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
      for (uint32_t i = 0; i < config_.num_inputs; ++i) {
        simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
        bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
      }
      bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
      simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
      return;
    }

    bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
    bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
    simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));

    if (config.num_banks > 1) {
      snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
      bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
      for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
        mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
        bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
      }
      bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0));
      bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0));
    } else {
      mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
      bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0));
    }

    // calculate cache initialization cycles
    init_cycles_ = params_.sets_per_bank * params_.lines_per_set;
  }

  /// Invalidates every bank and zeroes the statistics and pending counters.
  /// Does nothing in bypass mode.
  void reset() {
    if (config_.bypass)
      return;

    for (auto& bank : banks_) {
      bank.clear();
    }
    perf_stats_ = PerfStats();
    pending_read_reqs_ = 0;
    pending_write_reqs_ = 0;
    pending_fill_reqs_ = 0;
  }

  /// Advances the cache model by one cycle. Does nothing in bypass mode.
  void tick() {
    if (config_.bypass)
      return;

    // wait on cache initialization cycles
    if (init_cycles_ != 0) {
      --init_cycles_;
      return;
    }

    // handle cache bypass responses
    {
      auto& bypass_port = bypass_switch_->RspIn.at(1);
      if (!bypass_port.empty()) {
        auto& mem_rsp = bypass_port.front();
        this->processBypassResponse(mem_rsp);
        bypass_port.pop();
      }
    }

    // initialize pipeline request
    for (auto& pipeline_req : pipeline_reqs_) {
      pipeline_req.clear();
    }

    // schedule MSHR replay
    for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
      auto& bank = banks_.at(bank_id);
      auto& pipeline_req = pipeline_reqs_.at(bank_id);
      bank.mshr.pop(&pipeline_req);
    }

    // schedule memory fill
    for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
      auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
      if (mem_rsp_port.empty())
        continue;

      // a replay scheduled above has priority; the fill waits in its port
      auto& pipeline_req = pipeline_reqs_.at(bank_id);
      if (pipeline_req.type != bank_req_t::None)
        continue;

      auto& mem_rsp = mem_rsp_port.front();
      DT(3, simobject_->name() << "-dram-" << mem_rsp);
      pipeline_req.type = bank_req_t::Fill;
      // for a fill, `tag` carries the MSHR id echoed by the memory response
      pipeline_req.tag = mem_rsp.tag;
      mem_rsp_port.pop();
    }

    // schedule core requests
    for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
      auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
      if (core_req_port.empty())
        continue;

      auto& core_req = core_req_port.front();

      // check cache bypassing
      if (core_req.type == AddrType::IO) {
        // send bypass request
        this->processBypassRequest(core_req, req_id);
        // remove request
        core_req_port.pop();
        continue;
      }

      auto bank_id = params_.addr_bank_id(core_req.addr);
      auto set_id = params_.addr_set_id(core_req.addr);
      auto tag = params_.addr_tag(core_req.addr);
      auto port_id = req_id % config_.ports_per_bank;

      auto& bank = banks_.at(bank_id);
      auto& pipeline_req = pipeline_reqs_.at(bank_id);

      // check MSHR capacity
      if ((!core_req.write || !config_.write_through)
       && bank.mshr.full()) {
        ++perf_stats_.mshr_stalls;
        ++perf_stats_.bank_stalls;
        continue;
      }

      // check bank conflicts
      if (pipeline_req.type == bank_req_t::Core) {
        // check port conflict: only same-line, same-direction requests on a
        // free port may be merged into the already scheduled request
        if (pipeline_req.write != core_req.write
         || pipeline_req.set_id != set_id
         || pipeline_req.tag != tag
         || pipeline_req.ports.at(port_id).valid) {
          ++perf_stats_.bank_stalls;
          continue;
        }
        // extend request ports
        pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
      } else if (pipeline_req.type == bank_req_t::None) {
        // schedule new request
        bank_req_t bank_req(config_.ports_per_bank);
        bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
        bank_req.tag = tag;
        bank_req.set_id = set_id;
        bank_req.cid = core_req.cid;
        bank_req.uuid = core_req.uuid;
        bank_req.type = bank_req_t::Core;
        bank_req.write = core_req.write;
        pipeline_req = bank_req;
      } else {
        // bank in use
        ++perf_stats_.bank_stalls;
        continue;
      }

      if (core_req.write)
        ++perf_stats_.writes;
      else
        ++perf_stats_.reads;

      // remove request
      DT(3, simobject_->name() << "-core-" << core_req);
      auto time = core_req_port.pop();
      perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
    }

    // process active request
    this->processBankRequests();
  }

  /// @return the statistics accumulated since the last reset().
  const PerfStats& perf_stats() const {
    return perf_stats_;
  }

private:
  /// Sends one core response for every valid port slot of `bank_req`.
  void sendCoreResponses(const bank_req_t& bank_req) {
    for (const auto& port : bank_req.ports) {
      if (!port.valid)
        continue;
      MemRsp core_rsp{port.req_tag, bank_req.cid, bank_req.uuid};
      simobject_->CoreRspPorts.at(port.req_id).send(core_rsp, config_.latency);
      DT(3, simobject_->name() << "-core-" << core_rsp);
    }
  }

  /// Routes an IO response back to the core input encoded in its low tag bits
  /// (the inverse of the packing done in processBypassRequest).
  void processBypassResponse(const MemRsp& mem_rsp) {
    uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
    uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
    MemRsp core_rsp{tag, mem_rsp.cid, mem_rsp.uuid};
    simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
    DT(3, simobject_->name() << "-core-" << core_rsp);
  }

  /// Forwards an IO request through input 1 of the bypass switch, packing the
  /// core input index into the low tag bits. Writes are acknowledged
  /// immediately when `write_reponse` is set.
  void processBypassRequest(const MemReq& core_req, uint32_t req_id) {
    DT(3, simobject_->name() << "-core-" << core_req);

    {
      MemReq mem_req(core_req);
      mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
      bypass_switch_->ReqIn.at(1).send(mem_req, 1);
      DT(3, simobject_->name() << "-dram-" << mem_req);
    }

    if (core_req.write && config_.write_reponse) {
      MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
      simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
      DT(3, simobject_->name() << "-core-" << core_rsp);
    }
  }

  /// Executes the request scheduled on each bank and then accounts one cycle
  /// of memory latency for every fill still in flight.
  void processBankRequests() {
    for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
      auto& bank = banks_.at(bank_id);
      // bound by reference: the slot is only read here, and copying it would
      // duplicate its port vector for every bank on every cycle
      const auto& pipeline_req = pipeline_reqs_.at(bank_id);

      switch (pipeline_req.type) {
      case bank_req_t::None:
        break;
      case bank_req_t::Fill: {
        // update cache line; this also flags the matching MSHR entries for replay
        auto& entry = bank.mshr.replay(pipeline_req.tag);
        auto& set = bank.sets.at(entry.bank_req.set_id);
        auto& line = set.lines.at(entry.line_id);
        line.valid = true;
        line.tag = entry.bank_req.tag;
        --pending_fill_reqs_;
      } break;
      case bank_req_t::Replay: {
        // send core response
        if (!pipeline_req.write || config_.write_reponse) {
          this->sendCoreResponses(pipeline_req);
        }
      } break;
      case bank_req_t::Core: {
        bool hit = false;
        bool found_free_line = false;
        uint32_t hit_line_id = 0;
        uint32_t free_line_id = 0;
        uint32_t lru_line_id = 0;
        uint32_t max_cnt = 0;

        auto& set = bank.sets.at(pipeline_req.set_id);

        // tag lookup; ages every valid non-matching line
        for (uint32_t i = 0, num_lines = set.lines.size(); i < num_lines; ++i) {
          auto& line = set.lines.at(i);
          if (line.valid) {
            if (line.tag == pipeline_req.tag) {
              line.lru_ctr = 0;
              hit_line_id = i;
              hit = true;
            } else {
              ++line.lru_ctr;
            }
            if (max_cnt < line.lru_ctr) {
              max_cnt = line.lru_ctr;
              lru_line_id = i;
            }
          } else {
            found_free_line = true;
            free_line_id = i;
          }
        }

        // A free line always wins over the oldest valid one; otherwise a
        // valid (possibly dirty) line could be replaced without write-back.
        const uint32_t repl_line_id = found_free_line ? free_line_id : lru_line_id;

        if (hit) {
          //
          // Hit handling
          //
          if (pipeline_req.write) {
            // handle write hit
            auto& hit_line = set.lines.at(hit_line_id);
            if (config_.write_through) {
              // forward write request to memory
              MemReq mem_req;
              mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
              mem_req.write = true;
              mem_req.cid = pipeline_req.cid;
              mem_req.uuid = pipeline_req.uuid;
              mem_req_ports_.at(bank_id).send(mem_req, 1);
              DT(3, simobject_->name() << "-dram-" << mem_req);
            } else {
              // mark line as dirty
              hit_line.dirty = true;
            }
          }
          // send core response
          if (!pipeline_req.write || config_.write_reponse) {
            this->sendCoreResponses(pipeline_req);
          }
        } else {
          //
          // Miss handling
          //
          if (pipeline_req.write)
            ++perf_stats_.write_misses;
          else
            ++perf_stats_.read_misses;

          if (!found_free_line && !config_.write_through) {
            // write back dirty line
            auto& repl_line = set.lines.at(repl_line_id);
            if (repl_line.dirty) {
              MemReq mem_req;
              mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
              mem_req.write = true;
              mem_req.cid = pipeline_req.cid;
              mem_req_ports_.at(bank_id).send(mem_req, 1);
              DT(3, simobject_->name() << "-dram-" << mem_req);
              ++perf_stats_.evictions;
            }
          }

          if (pipeline_req.write && config_.write_through) {
            // forward write request to memory
            {
              MemReq mem_req;
              mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
              mem_req.write = true;
              mem_req.cid = pipeline_req.cid;
              mem_req.uuid = pipeline_req.uuid;
              mem_req_ports_.at(bank_id).send(mem_req, 1);
              DT(3, simobject_->name() << "-dram-" << mem_req);
            }
            // send core response
            if (config_.write_reponse) {
              this->sendCoreResponses(pipeline_req);
            }
          } else {
            // MSHR lookup (must happen before allocate, which adds a matching entry)
            auto mshr_pending = bank.mshr.lookup(pipeline_req);

            // allocate MSHR
            auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);

            // send fill request only for the first miss on this line
            if (!mshr_pending) {
              MemReq mem_req;
              mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
              mem_req.write = false;
              mem_req.tag = mshr_id;
              mem_req.cid = pipeline_req.cid;
              mem_req.uuid = pipeline_req.uuid;
              mem_req_ports_.at(bank_id).send(mem_req, 1);
              DT(3, simobject_->name() << "-dram-" << mem_req);
              ++pending_fill_reqs_;
            }
          }
        }
      } break;
      }
    }

    // calculate memory latency
    perf_stats_.mem_latency += pending_fill_reqs_;
  }
};
///////////////////////////////////////////////////////////////////////////////
// Constructs the cache sim object: one core request port and one core response
// port per configured input (config.num_inputs), a single memory request /
// response port pair, and the private implementation object that holds all
// of the model state.
CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config)
: SimObject<CacheSim>(ctx, name)
, CoreReqPorts(config.num_inputs, this)
, CoreRspPorts(config.num_inputs, this)
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, config))
{}
// Destroys the implementation object owned through the raw impl_ pointer.
CacheSim::~CacheSim() {
delete impl_;
}
// Resets the cache model; all work is delegated to the implementation object.
void CacheSim::reset() {
impl_->reset();
}
// Advances the cache model by one step; delegated to the implementation object.
void CacheSim::tick() {
impl_->tick();
}
// Returns a reference to the statistics kept by the implementation object.
// The reference refers to storage inside impl_, so it is only usable while
// this CacheSim is alive.
const CacheSim::PerfStats& CacheSim::perf_stats() const {
return impl_->perf_stats();
}

View File

@@ -1,13 +1,27 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "memsim.h"
#include "mem_sim.h"
namespace vortex {
class Cache : public SimObject<Cache> {
class CacheSim : public SimObject<CacheSim> {
public:
struct Config {
bool bypass; // cache bypass
uint8_t C; // log2 cache size
uint8_t B; // log2 block size
uint8_t W; // log2 word size
@@ -45,6 +59,19 @@ public:
, mshr_stalls(0)
, mem_latency(0)
{}
/// Accumulates `rhs` into this object, counter by counter.
/// @return *this, so several statistics objects can be summed in a chain.
PerfStats& operator+=(const PerfStats& rhs) {
  // access counters
  reads           += rhs.reads;
  writes          += rhs.writes;
  // miss / eviction counters
  read_misses     += rhs.read_misses;
  write_misses    += rhs.write_misses;
  evictions       += rhs.evictions;
  // stall and latency counters
  pipeline_stalls += rhs.pipeline_stalls;
  bank_stalls     += rhs.bank_stalls;
  mshr_stalls     += rhs.mshr_stalls;
  mem_latency     += rhs.mem_latency;
  return *this;
}
};
std::vector<SimPort<MemReq>> CoreReqPorts;
@@ -52,8 +79,8 @@ public:
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
Cache(const SimContext& ctx, const char* name, const Config& config);
~Cache();
CacheSim(const SimContext& ctx, const char* name, const Config& config);
~CacheSim();
void reset();

219
sim/simx/cluster.cpp Normal file
View File

@@ -0,0 +1,219 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cluster.h"
using namespace vortex;
// Builds one cluster: an L2 cache, an instruction-cache cluster and a
// data-cache cluster that both feed the L2, one shared-memory block per core,
// and the cores themselves; then binds all request/response ports together.
//   ctx        - simulation context forwarded to the SimObject base
//   cluster_id - index of this cluster; used in component names and to derive core ids
//   processor  - owning processor, stored as-is and returned by processor()
//   arch       - supplies num_cores(), num_barriers() and num_warps()
//   dcrs       - forwarded unchanged to every Core::Create call
Cluster::Cluster(const SimContext& ctx,
uint32_t cluster_id,
ProcessorImpl* processor,
const Arch &arch, const
DCRS &dcrs)
: SimObject(ctx, "cluster")
, mem_req_port(this)
, mem_rsp_port(this)
, cluster_id_(cluster_id)
, cores_(arch.num_cores())
, barriers_(arch.num_barriers(), 0)
, sharedmems_(arch.num_cores())
, processor_(processor)
{
auto num_cores = arch.num_cores();
// Scratch buffer reused for every component name built below.
// NOTE(review): cluster_id and the loop indices are uint32_t but are formatted
// with %d throughout this function; %u would match the argument type.
char sname[100];
snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
// L2 cache: its memory side is wired straight to the cluster's own memory ports.
// NOTE(review): here the number of ways is passed in the slot labelled W and 0
// in the slot labelled A, whereas the icache/dcache lists below pass the word
// size as W and the number of ways as A - confirm this is intended.
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
!L2_ENABLED,
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(L2_NUM_WAYS), // W
0, // A
XLEN, // address bits
L2_NUM_BANKS, // number of banks
1, // number of ports
5, // request size
true, // write-through
false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
l2cache_->MemReqPort.bind(&this->mem_req_port);
this->mem_rsp_port.bind(&l2cache_->MemRspPort);
// Instruction caches: one request lane per core; the cluster's memory side
// uses L2 core port 0.
snprintf(sname, 100, "cluster%d-icaches", cluster_id);
icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
!ICACHE_ENABLED,
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(sizeof(uint32_t)), // W
log2ceil(ICACHE_NUM_WAYS),// A
XLEN, // address bits
1, // number of banks
1, // number of ports
1, // number of inputs
true, // write-through
false, // write response
0, // victim size
(uint8_t)arch.num_warps(), // mshr
2, // pipeline latency
});
icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
// Data caches: NUM_LSU_LANES request lanes per core; the cluster's memory side
// uses L2 core port 1.
snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
!DCACHE_ENABLED,
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(sizeof(Word)), // W
log2ceil(DCACHE_NUM_WAYS),// A
XLEN, // address bits
DCACHE_NUM_BANKS, // number of banks
1, // number of ports
DCACHE_NUM_BANKS, // number of inputs
true, // write-through
false, // write response
0, // victim size
DCACHE_MSHR_SIZE, // mshr
4, // pipeline latency
});
dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
///////////////////////////////////////////////////////////////////////////
// create shared memory blocks
// (one SharedMem per core, created before the cores because each Core::Create
// call below receives its block as an argument)
for (uint32_t i = 0; i < num_cores; ++i) {
snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
false
});
}
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
// Core ids are numbered consecutively across clusters.
uint32_t core_id = cluster_id * num_cores + i;
cores_.at(i) = Core::Create(core_id,
this,
arch,
dcrs,
sharedmems_.at(i));
// Instruction fetch path: core port 0 <-> lane 0 of this core's icache slot.
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
// Data path: each LSU lane goes through its own SMemDemux, which is wired to
// both the data-cache cluster (ReqDc/RspDc) and the core's shared memory
// (ReqSm/RspSm).
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
auto smem_demux = SMemDemux::Create(sname);
cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
}
}
}
// Destructor: performs no explicit work; members are released by their own
// destructors.
Cluster::~Cluster() {
//--
}
void Cluster::reset() {
for (auto& barrier : barriers_) {
barrier.reset();
}
}
// Per-cycle hook for the cluster object; the body is intentionally empty.
void Cluster::tick() {
//--
}
// Attaches the given RAM to every core in this cluster.
//   ram - forwarded unchanged to attach_ram() on each entry of cores_.
void Cluster::attach_ram(RAM* ram) {
  // Bind by const reference so the Core::Ptr handle is not copied on each
  // iteration; only the pointee is used.
  for (const auto& core : cores_) {
    core->attach_ram(ram);
  }
}
bool Cluster::running() const {
for (auto& core : cores_) {
if (core->running())
return true;
}
return false;
}
// Polls every core for its exit status.
//   exitcode   - out: bitwise OR of the exit codes of all cores whose
//                check_exit() returned true (0 if none did)
//   riscv_test - forwarded unchanged to each core's check_exit()
// Returns true only when every core's check_exit() returned true.
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
  Word merged_code = 0;
  bool all_exited = true;
  for (size_t idx = 0; idx < cores_.size(); ++idx) {
    Word core_code;
    if (!cores_[idx]->check_exit(&core_code, riscv_test)) {
      // core_code is only meaningful when the core reports an exit
      all_exited = false;
      continue;
    }
    merged_code |= core_code;
  }
  *exitcode = merged_code;
  return all_exited;
}
// Registers a core's arrival at a cluster-level barrier and, once enough cores
// have arrived, resumes all of them and clears the barrier.
//   bar_id  - index into barriers_ (bounds-checked by at())
//   count   - number of arrivals required to release the barrier
//   core_id - id of the arriving core; reduced modulo the number of cores in
//             this cluster to obtain its bit position in the mask
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
  auto& arrivals = barriers_.at(bar_id);

  const uint32_t slot = core_id % cores_.size();
  arrivals.set(slot);
  DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);

  // not everyone has arrived yet: leave the mask as is
  if (arrivals.count() != static_cast<size_t>(count))
    return;

  // resume all suspended cores
  for (uint32_t idx = 0; idx < cores_.size(); ++idx) {
    if (!arrivals.test(idx))
      continue;
    DP(3, "*** Resume core #" << idx << " at barrier #" << bar_id);
    cores_.at(idx)->resume();
  }
  arrivals.reset();
}
// Returns the ProcessorImpl pointer that was passed to the constructor.
ProcessorImpl* Cluster::processor() const {
return processor_;
}
// Collects the performance counters of the cluster's memory components.
// The icache, dcache and l2cache entries are copied from the respective
// component; the sharedmem entry is the sum over all per-core shared memories.
Cluster::PerfStats Cluster::perf_stats() const {
  Cluster::PerfStats perf;
  perf.icache = icaches_->perf_stats();
  perf.dcache = dcaches_->perf_stats();
  perf.l2cache = l2cache_->perf_stats();
  // Bind by const reference so the SharedMem::Ptr handle is not copied on
  // each iteration; only the pointee is read.
  for (const auto& sharedmem : sharedmems_) {
    perf.sharedmem += sharedmem->perf_stats();
  }
  return perf;
}

86
sim/simx/cluster.h Normal file
View File

@@ -0,0 +1,86 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "dcrs.h"
#include "arch.h"
#include "cache_cluster.h"
#include "shared_mem.h"
#include "core.h"
#include "constants.h"
namespace vortex {
class ProcessorImpl;
// Simulation object grouping a set of cores with their instruction/data cache
// clusters, per-core shared memories and an L2 cache. Memory traffic leaves
// and enters the cluster through mem_req_port / mem_rsp_port.
class Cluster : public SimObject<Cluster> {
public:
// Aggregated performance counters of the cluster's memory components.
struct PerfStats {
CacheSim::PerfStats icache;
CacheSim::PerfStats dcache;
SharedMem::PerfStats sharedmem;
CacheSim::PerfStats l2cache;
// Adds rhs to this object component by component and returns *this.
PerfStats& operator+=(const PerfStats& rhs) {
this->icache += rhs.icache;
this->dcache += rhs.dcache;
this->sharedmem += rhs.sharedmem;
this->l2cache += rhs.l2cache;
return *this;
}
};
// Cluster-level memory request/response ports.
SimPort<MemReq> mem_req_port;
SimPort<MemRsp> mem_rsp_port;
// cluster_id: index of this cluster; processor: owning processor (not owned
// by the cluster); arch / dcrs: configuration passed on to the cores.
Cluster(const SimContext& ctx,
uint32_t cluster_id,
ProcessorImpl* processor,
const Arch &arch,
const DCRS &dcrs);
~Cluster();
// Clears all barrier masks.
void reset();
// Per-cycle hook.
void tick();
// Attaches the RAM to every core.
void attach_ram(RAM* ram);
// True while at least one core is running.
bool running() const;
// True when every core reports an exit; *exitcode receives the OR of the
// exit codes of the cores that did.
bool check_exit(Word* exitcode, bool riscv_test) const;
// Records core_id's arrival at barrier bar_id; releases the waiting cores
// once `count` cores have arrived.
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
ProcessorImpl* processor() const;
// Snapshot of the cache / shared-memory counters.
Cluster::PerfStats perf_stats() const;
private:
uint32_t cluster_id_;
std::vector<Core::Ptr> cores_;
// One arrival mask per barrier.
std::vector<CoreMask> barriers_;
CacheSim::Ptr l2cache_;
CacheCluster::Ptr icaches_;
CacheCluster::Ptr dcaches_;
// One shared-memory block per core.
std::vector<SharedMem::Ptr> sharedmems_;
// NOTE(review): the three members below are not referenced by the
// constructor or any method in cluster.cpp - confirm whether they are still needed.
CacheCluster::Ptr tcaches_;
CacheCluster::Ptr ocaches_;
CacheCluster::Ptr rcaches_;
ProcessorImpl* processor_;
};
} // namespace vortex

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef RAM_PAGE_SIZE
@@ -10,14 +23,4 @@
#ifndef MEMORY_BANKS
#define MEMORY_BANKS 2
#endif
namespace vortex {
enum Constants {
SMEM_BANK_OFFSET = log2ceil(sizeof(uint32_t)) + log2ceil(STACK_SIZE / sizeof(uint32_t)),
};
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
@@ -11,101 +24,104 @@
#include <simobject.h>
#include "debug.h"
#include "types.h"
#include "archdef.h"
#include "arch.h"
#include "decode.h"
#include "mem.h"
#include "warp.h"
#include "pipeline.h"
#include "cache.h"
#include "sharedmem.h"
#include "cache_sim.h"
#include "shared_mem.h"
#include "ibuffer.h"
#include "scoreboard.h"
#include "exeunit.h"
#include "tex_unit.h"
#include "operand.h"
#include "dispatcher.h"
#include "exe_unit.h"
#include "dcrs.h"
namespace vortex {
class Cluster;
class Core : public SimObject<Core> {
public:
struct PerfStats {
uint64_t cycles;
uint64_t instrs;
uint64_t ibuf_stalls;
uint64_t scrb_stalls;
uint64_t alu_stalls;
uint64_t lsu_stalls;
uint64_t csr_stalls;
uint64_t fpu_stalls;
uint64_t gpu_stalls;
uint64_t sfu_stalls;
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
uint64_t branches;
uint64_t mem_reads;
uint64_t mem_writes;
uint64_t mem_latency;
uint64_t tex_reads;
uint64_t tex_latency;
uint64_t ifetch_latency;
uint64_t load_latency;
PerfStats()
: instrs(0)
: cycles(0)
, instrs(0)
, ibuf_stalls(0)
, scrb_stalls(0)
, alu_stalls(0)
, lsu_stalls(0)
, csr_stalls(0)
, fpu_stalls(0)
, gpu_stalls(0)
, sfu_stalls(0)
, ifetches(0)
, loads(0)
, stores(0)
, branches(0)
, mem_reads(0)
, mem_writes(0)
, mem_latency(0)
, tex_reads(0)
, tex_latency(0)
, ifetch_latency(0)
, load_latency(0)
{}
};
SimPort<MemRsp> MemRspPort;
SimPort<MemReq> MemReqPort;
std::vector<SimPort<MemReq>> icache_req_ports;
std::vector<SimPort<MemRsp>> icache_rsp_ports;
std::vector<SimPort<MemReq>> dcache_req_ports;
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
Core(const SimContext& ctx,
uint32_t core_id,
Cluster* cluster,
const Arch &arch,
const DCRS &dcrs,
SharedMem::Ptr sharedmem);
Core(const SimContext& ctx, const ArchDef &arch, uint32_t id);
~Core();
void attach_ram(RAM* ram);
bool running() const;
void reset();
void tick();
void attach_ram(RAM* ram);
bool running() const;
void resume();
uint32_t id() const {
return id_;
return core_id_;
}
const Decoder& decoder() {
return decoder_;
}
const ArchDef& arch() const {
const Arch& arch() const {
return arch_;
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
uint32_t getIRegValue(int reg) const {
return warps_.at(0)->getIRegValue(reg);
const DCRS& dcrs() const {
return dcrs_;
}
uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);
void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);
WarpMask wspawn(uint32_t num_warps, uint32_t nextPC);
void wspawn(uint32_t num_warps, Word nextPC);
WarpMask barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
void barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
AddrType get_addr_type(uint64_t addr);
void icache_read(void* data, uint64_t addr, uint32_t size);
@@ -113,19 +129,22 @@ public:
void dcache_write(const void* data, uint64_t addr, uint32_t size);
uint32_t tex_read(uint32_t unit, uint32_t lod, uint32_t u, uint32_t v, std::vector<mem_addr_size_t>* mem_addrs);
void dcache_amo_reserve(uint64_t addr);
bool dcache_amo_check(uint64_t addr);
void trigger_ecall();
void trigger_ebreak();
bool check_exit() const;
bool check_exit(Word* exitcode, bool riscv_test) const;
private:
void schedule();
void fetch();
void decode();
void issue();
void execute();
void commit();
@@ -133,49 +152,51 @@ private:
void cout_flush();
uint32_t id_;
const ArchDef arch_;
uint32_t core_id_;
const Arch& arch_;
const DCRS &dcrs_;
const Decoder decoder_;
MemoryUnit mmu_;
RAM smem_;
std::vector<TexUnit> tex_units_;
std::vector<std::shared_ptr<Warp>> warps_;
std::vector<WarpMask> barriers_;
std::vector<uint32_t> csrs_;
std::vector<WarpMask> barriers_;
std::vector<Byte> fcsrs_;
std::vector<IBuffer> ibuffers_;
Scoreboard scoreboard_;
std::vector<Operand::Ptr> operands_;
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<ExeUnit::Ptr> exe_units_;
Cache::Ptr icache_;
Cache::Ptr dcache_;
SharedMem::Ptr shared_mem_;
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
SharedMem::Ptr sharedmem_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;
HashTable<pipeline_trace_t*> pending_icache_;
std::vector<pipeline_trace_t*> committed_traces_;
WarpMask active_warps_;
WarpMask stalled_warps_;
uint32_t last_schedule_wid_;
uint64_t issued_instrs_;
uint64_t committed_instrs_;
uint32_t csr_tex_unit_;
bool ecall_;
bool ebreak_;
bool exited_;
uint64_t pending_ifetches_;
std::unordered_map<int, std::stringstream> print_bufs_;
std::vector<std::vector<CSRs>> csrs_;
PerfStats perf_stats_;
uint64_t perf_mem_pending_reads_;
Cluster* cluster_;
uint32_t commit_exe_;
friend class Warp;
friend class LsuUnit;
friend class AluUnit;
friend class CsrUnit;
friend class FpuUnit;
friend class GpuUnit;
friend class SfuUnit;
};
} // namespace vortex
} // namespace vortex

28
sim/simx/dcrs.cpp Normal file
View File

@@ -0,0 +1,28 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dcrs.h"
#include <iostream>
using namespace vortex;
// Writes a device configuration register.
//   addr  - register address; must lie in
//           [VX_DCR_BASE_STATE_BEGIN, VX_DCR_BASE_STATE_END)
//   value - value to store
// Any other address is reported on stdout and the process is aborted.
void DCRS::write(uint32_t addr, uint32_t value) {
  const bool in_base_range = (addr >= VX_DCR_BASE_STATE_BEGIN)
                          && (addr < VX_DCR_BASE_STATE_END);
  if (!in_base_range) {
    std::cout << std::hex << "Error: invalid global DCR addr=0x" << addr << std::endl;
    std::abort();
  }
  base_dcrs.write(addr, value);
}

45
sim/simx/dcrs.h Normal file
View File

@@ -0,0 +1,45 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <util.h>
#include <VX_types.h>
#include <array>
namespace vortex {
// Storage for the base device configuration registers (DCRs).
// A register address is translated to a slot index with VX_DCR_BASE_STATE().
class BaseDCRS {
public:
  // Returns the value stored for `addr`.
  // The slot lookup uses at(), so an index outside the table throws
  // std::out_of_range instead of reading out of bounds.
  uint32_t read(uint32_t addr) const {
    uint32_t state = VX_DCR_BASE_STATE(addr);
    return states_.at(state);
  }

  // Stores `value` in the slot for `addr` (bounds-checked like read()).
  void write(uint32_t addr, uint32_t value) {
    uint32_t state = VX_DCR_BASE_STATE(addr);
    states_.at(state) = value;
  }

private:
  // Value-initialized so that a read() issued before the first write() to a
  // register returns 0 rather than an indeterminate value.
  std::array<uint32_t, VX_DCR_BASE_STATE_COUNT> states_{};
};
// Top-level device configuration register file; currently holds only the base
// register group.
class DCRS {
public:
// Routes a register write to the group that owns `addr` (defined in dcrs.cpp).
void write(uint32_t addr, uint32_t value);
// Base register group, publicly accessible for reads.
BaseDCRS base_dcrs;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifndef DEBUG_LEVEL

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <string>
#include <stdlib.h>
@@ -9,41 +22,36 @@
#include "debug.h"
#include "types.h"
#include "decode.h"
#include "archdef.h"
#include "arch.h"
#include "instr.h"
using namespace vortex;
struct InstTableEntry_t {
bool controlFlow;
InstType iType;
};
static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable = {
{Opcode::NOP, {false, InstType::N_TYPE}},
{Opcode::R_INST, {false, InstType::R_TYPE}},
{Opcode::L_INST, {false, InstType::I_TYPE}},
{Opcode::I_INST, {false, InstType::I_TYPE}},
{Opcode::S_INST, {false, InstType::S_TYPE}},
{Opcode::B_INST, {true , InstType::B_TYPE}},
{Opcode::LUI_INST, {false, InstType::U_TYPE}},
{Opcode::AUIPC_INST, {false, InstType::U_TYPE}},
{Opcode::JAL_INST, {true , InstType::J_TYPE}},
{Opcode::JALR_INST, {true , InstType::I_TYPE}},
{Opcode::SYS_INST, {true , InstType::I_TYPE}},
{Opcode::FENCE, {true , InstType::I_TYPE}},
{Opcode::FL, {false, InstType::I_TYPE}},
{Opcode::FS, {false, InstType::S_TYPE}},
{Opcode::FCI, {false, InstType::R_TYPE}},
{Opcode::FMADD, {false, InstType::R4_TYPE}},
{Opcode::FMSUB, {false, InstType::R4_TYPE}},
{Opcode::FMNMADD, {false, InstType::R4_TYPE}},
{Opcode::FMNMSUB, {false, InstType::R4_TYPE}},
{Opcode::VSET, {false, InstType::V_TYPE}},
{Opcode::GPGPU, {false, InstType::R_TYPE}},
{Opcode::GPU, {false, InstType::R4_TYPE}},
{Opcode::R_INST_W, {false, InstType::R_TYPE}},
{Opcode::I_INST_W, {false, InstType::I_TYPE}},
static const std::unordered_map<Opcode, InstType> sc_instTable = {
{Opcode::R_INST, InstType::R_TYPE},
{Opcode::L_INST, InstType::I_TYPE},
{Opcode::I_INST, InstType::I_TYPE},
{Opcode::S_INST, InstType::S_TYPE},
{Opcode::B_INST, InstType::B_TYPE},
{Opcode::LUI_INST, InstType::U_TYPE},
{Opcode::AUIPC_INST, InstType::U_TYPE},
{Opcode::JAL_INST, InstType::J_TYPE},
{Opcode::JALR_INST, InstType::I_TYPE},
{Opcode::SYS_INST, InstType::I_TYPE},
{Opcode::FENCE, InstType::I_TYPE},
{Opcode::AMO, InstType::R_TYPE},
{Opcode::FL, InstType::I_TYPE},
{Opcode::FS, InstType::S_TYPE},
{Opcode::FCI, InstType::R_TYPE},
{Opcode::FMADD, InstType::R4_TYPE},
{Opcode::FMSUB, InstType::R4_TYPE},
{Opcode::FMNMADD, InstType::R4_TYPE},
{Opcode::FMNMSUB, InstType::R4_TYPE},
{Opcode::VSET, InstType::V_TYPE},
{Opcode::EXT1, InstType::R_TYPE},
{Opcode::EXT2, InstType::R4_TYPE},
{Opcode::R_INST_W, InstType::R_TYPE},
{Opcode::I_INST_W, InstType::I_TYPE},
};
enum Constants {
@@ -58,6 +66,8 @@ enum Constants {
width_i_imm = 12,
width_j_imm = 20,
width_v_imm = 11,
width_aq = 1,
width_rl = 1,
shift_opcode= 0,
shift_rd = width_opcode,
@@ -72,15 +82,15 @@ enum Constants {
shift_func6 = shift_func7 + width_vmask,
shift_vset = shift_func7 + width_func6,
mask_opcode = (1<<width_opcode)-1,
mask_reg = (1<<width_reg)-1,
mask_func2 = (1<<width_func2)-1,
mask_func3 = (1<<width_func3)-1,
mask_func6 = (1<<width_func6)-1,
mask_func7 = (1<<width_func7)-1,
mask_i_imm = (1<<width_i_imm)-1,
mask_j_imm = (1<<width_j_imm)-1,
mask_v_imm = (1<<width_v_imm)-1,
mask_opcode = (1 << width_opcode) - 1,
mask_reg = (1 << width_reg) - 1,
mask_func2 = (1 << width_func2) - 1,
mask_func3 = (1 << width_func3) - 1,
mask_func6 = (1 << width_func6) - 1,
mask_func7 = (1 << width_func7) - 1,
mask_i_imm = (1 << width_i_imm) - 1,
mask_j_imm = (1 << width_j_imm) - 1,
mask_v_imm = (1 << width_v_imm) - 1,
};
static const char* op_string(const Instr &instr) {
@@ -92,7 +102,6 @@ static const char* op_string(const Instr &instr) {
auto imm = instr.getImm();
switch (opcode) {
case Opcode::NOP: return "NOP";
case Opcode::LUI_INST: return "LUI";
case Opcode::AUIPC_INST: return "AUIPC";
case Opcode::R_INST:
@@ -116,7 +125,7 @@ static const char* op_string(const Instr &instr) {
case 2: return "SLT";
case 3: return "SLTU";
case 4: return "XOR";
case 5: return func7 ? "SRA" : "SRL";
case 5: return (func7 & 0x20) ? "SRA" : "SRL";
case 6: return "OR";
case 7: return "AND";
default:
@@ -130,7 +139,7 @@ static const char* op_string(const Instr &instr) {
case 2: return "SLTI";
case 3: return "SLTIU";
case 4: return "XORI";
case 5: return func7 ? "SRAI" : "SRLI";
case 5: return (func7 & 0x20) ? "SRAI" : "SRLI";
case 6: return "ORI";
case 7: return "ANDI";
default:
@@ -151,8 +160,8 @@ static const char* op_string(const Instr &instr) {
case Opcode::JALR_INST: return "JALR";
case Opcode::L_INST:
switch (func3) {
case 0: return "LBI";
case 1: return "LHI";
case 0: return "LB";
case 1: return "LH";
case 2: return "LW";
case 3: return "LD";
case 4: return "LBU";
@@ -192,11 +201,11 @@ static const char* op_string(const Instr &instr) {
}
case Opcode::I_INST_W:
switch (func3) {
case 0: return "ADDIW";
case 1: return "SLLIW";
case 5: return func7 ? "SRAIW" : "SRLIW";
default:
std::abort();
case 0: return "ADDIW";
case 1: return "SLLIW";
case 5: return func7 ? "SRAIW" : "SRLIW";
default:
std::abort();
}
case Opcode::SYS_INST:
switch (func3) {
@@ -222,20 +231,59 @@ static const char* op_string(const Instr &instr) {
case Opcode::FENCE: return "FENCE";
case Opcode::FL:
switch (func3) {
case 0x1: return "VL";
case 0x2: return "FLW";
case 0x3: return "FLD";
default:
std::abort();
case 0x1: return "VL";
case 0x2: return "FLW";
case 0x3: return "FLD";
default:
std::abort();
}
case Opcode::FS:
switch (func3) {
case 0x1: return "VS";
case 0x2: return "FSW";
case 0x3: return "FSD";
case 0x1: return "VS";
case 0x2: return "FSW";
case 0x3: return "FSD";
default:
std::abort();
}
case Opcode::AMO: {
auto amo_type = func7 >> 2;
switch (func3) {
case 0x2:
switch (amo_type) {
case 0x00: return "AMOADD.W";
case 0x01: return "AMOSWAP.W";
case 0x02: return "LR.W";
case 0x03: return "SC.W";
case 0x04: return "AMOXOR.W";
case 0x08: return "AMOOR.W";
case 0x0c: return "AMOAND.W";
case 0x10: return "AMOMIN.W";
case 0x14: return "AMOMAX.W";
case 0x18: return "AMOMINU.W";
case 0x1c: return "AMOMAXU.W";
default:
std::abort();
}
case 0x3:
switch (amo_type) {
case 0x00: return "AMOADD.D";
case 0x01: return "AMOSWAP.D";
case 0x02: return "LR.D";
case 0x03: return "SC.D";
case 0x04: return "AMOXOR.D";
case 0x08: return "AMOOR.D";
case 0x0c: return "AMOAND.D";
case 0x10: return "AMOMIN.D";
case 0x14: return "AMOMAX.D";
case 0x18: return "AMOMINU.D";
case 0x1c: return "AMOMAXU.D";
default:
std::abort();
}
default:
std::abort();
}
}
case Opcode::FCI:
switch (func7) {
case 0x00: return "FADD.S";
@@ -332,9 +380,9 @@ static const char* op_string(const Instr &instr) {
default:
std::abort();
}
case 0x70: return func3 ? "FCLASS.S" : "FMV.X.W";
case 0x70: return func3 ? "FCLASS.S" : "FMV.X.S";
case 0x71: return func3 ? "FCLASS.D" : "FMV.X.D";
case 0x78: return "FMV.W.X";
case 0x78: return "FMV.S.X";
case 0x79: return "FMV.D.X";
default:
std::abort();
@@ -344,23 +392,27 @@ static const char* op_string(const Instr &instr) {
case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
case Opcode::VSET: return "VSET";
case Opcode::GPGPU:
switch (func3) {
case 0: return "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PREFETCH";
case Opcode::EXT1:
switch (func7) {
case 0:
switch (func3) {
case 0: return "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PRED";
default:
std::abort();
}
default:
std::abort();
}
case Opcode::GPU:
case Opcode::EXT2:
switch (func3) {
case 0: return "TEX";
case 1: {
switch (func2) {
case 0: return "CMOV";
case 0: return "CMOV";
default:
std::abort();
}
@@ -375,43 +427,36 @@ static const char* op_string(const Instr &instr) {
namespace vortex {
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
auto opcode = instr.getOpcode();
auto func2 = instr.getFunc2();
auto opcode = instr.getOpcode();
auto func3 = instr.getFunc3();
os << op_string(instr) << ": ";
if (opcode == S_INST
|| opcode == FS) {
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
} else
if (opcode == L_INST
|| opcode == FL) {
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
} else {
if (instr.getRDType() != RegType::None) {
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
}
uint32_t i = 0;
for (; i < instr.getNRSrc(); ++i) {
if (i) os << ", ";
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
}
if (instr.hasImm()) {
if (i) os << ", ";
os << "imm=0x" << std::hex << instr.getImm();
}
if (opcode == GPU && func3 == 0) {
os << ", unit=" << std::dec << func2;
}
os << op_string(instr);
int sep = 0;
if (instr.getRDType() != RegType::None) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << instr.getRDType() << std::dec << instr.getRDest();
}
for (uint32_t i = 0; i < instr.getNRSrc(); ++i) {
if (instr.getRSType(i) == RegType::None)
continue;
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
}
if (instr.hasImm()) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << "0x" << std::hex << instr.getImm();
}
if (opcode == Opcode::SYS_INST && func3 >= 5) {
// CSRs with immediate values
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << "0x" << std::hex << instr.getRSrc(0);
}
return os;
}
}
Decoder::Decoder(const ArchDef&) {}
Decoder::Decoder(const Arch&) {}
std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
auto instr = std::make_shared<Instr>();
@@ -434,7 +479,7 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
return nullptr;
}
auto iType = op_it->second.iType;
auto iType = op_it->second;
if (op == Opcode::FL || op == Opcode::FS) {
if (func3 != 0x2 && func3 != 0x3) {
iType = InstType::V_TYPE;
@@ -442,57 +487,88 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
switch (iType) {
case InstType::N_TYPE:
break;
case InstType::R_TYPE:
if (op == Opcode::FCI) {
switch (func7) {
switch (op) {
case Opcode::FCI:
switch (func7) {
case 0x2c: // FSQRT.S
case 0x2d: // FSQRT.D
instr->setDestReg(rd, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
break;
case 0x50: // FLE.S, FLT.S, FEQ.S
case 0x51: // FLE.D, FLT.D, FEQ.D
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
break;
case 0x60: // FCVT.W.D, FCVT.WU.D, FCVT.L.D, FCVT.LU.D
case 0x61: // FCVT.WU.S, FCVT.W.S, FCVT.L.S, FCVT.LU.S
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::None);
break;
case 0x68: // FCVT.S.W, FCVT.S.WU, FCVT.S.L, FCVT.S.LU
case 0x69: // FCVT.D.W, FCVT.D.WU, FCVT.D.L, FCVT.D.LU
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::None);
break;
case 0x70: // FCLASS.S, FMV.X.W
case 0x70: // FCLASS.S, FMV.X.S
case 0x71: // FCLASS.D, FMV.X.D
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
break;
case 0x78: // FMV.W.X
case 0x78: // FMV.S.X
case 0x79: // FMV.D.X
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
break;
default:
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
break;
}
} else {
break;
case Opcode::EXT1:
switch (func7) {
case 0:
switch (func3) {
case 0: // TMC
case 3: // JOIN
instr->addSrcReg(rs1, RegType::Integer);
break;
case 1: // WSPAWN
case 4: // BAR
case 5: // PRED
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
break;
case 2: // SPLIT
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
break;
default:
std::abort();
}
break;
default:
std::abort();
}
break;
default:
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
break;
}
instr->setFunc3(func3);
instr->setFunc7(func7);
break;
case InstType::I_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
if (op == Opcode::FL) {
instr->setDestReg(rd, RegType::Float);
} else {
@@ -503,15 +579,23 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
switch (op) {
case Opcode::SYS_INST:
if (func3 != 0) {
// RV32I: CSR*
instr->setDestReg(rd, RegType::Integer);
}
// RV32I: CSR
if (func3 >= 5) {
// rs1 holds zimm
instr->setSrcReg(0, rs1, RegType::None);
}
} else {
instr->setDestReg(rd, RegType::None);
instr->setSrcReg(0, rs1, RegType::None);
}
// uint12
instr->setImm(code >> shift_rs2);
break;
case Opcode::FENCE:
// uint12
instr->setImm(code >> shift_rs2);
instr->setDestReg(rd, RegType::None);
instr->setSrcReg(0, rs1, RegType::None);
break;
case Opcode::I_INST:
case Opcode::I_INST_W:
@@ -538,11 +622,11 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
} break;
case InstType::S_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
if (op == Opcode::FS) {
instr->setSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
} else {
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
}
instr->setFunc3(func3);
auto imm = (func7 << width_reg) | rd;
@@ -550,8 +634,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
} break;
case InstType::B_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
instr->setFunc3(func3);
auto bit_11 = rd & 0x1;
auto bits_4_1 = rd >> 1;
@@ -581,8 +665,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
case InstType::V_TYPE:
switch (op) {
case Opcode::VSET: {
instr->setDestVReg(rd);
instr->setSrcVReg(rs1);
instr->setDestReg(rd, RegType::Vector);
instr->addSrcReg(rs1, RegType::Vector);
instr->setFunc3(func3);
if (func3 == 7) {
instr->setImm(!(code >> shift_vset));
@@ -593,20 +677,20 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
instr->setVediv((immed >> 4) & 0x3);
instr->setVsew((immed >> 2) & 0x3);
} else {
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
}
} else {
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask((code >> shift_func7) & 0x1);
instr->setFunc6(func6);
}
} break;
case Opcode::FL:
instr->setDestVReg(rd);
instr->setSrcVReg(rs1);
instr->setDestReg(rd, RegType::Vector);
instr->addSrcReg(rs1, RegType::Vector);
instr->setVlsWidth(func3);
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask(code >> shift_func7);
instr->setVmop((code >> shift_vmop) & mask_func3);
instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -614,9 +698,9 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
case Opcode::FS:
instr->setVs3(rd);
instr->setSrcVReg(rs1);
instr->addSrcReg(rs1, RegType::Vector);
instr->setVlsWidth(func3);
instr->setSrcVReg(rs2);
instr->addSrcReg(rs2, RegType::Vector);
instr->setVmask(code >> shift_func7);
instr->setVmop((code >> shift_vmop) & mask_func3);
instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -627,16 +711,28 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
break;
case R4_TYPE:
if (op == Opcode::GPU) {
instr->setDestReg(rd, RegType::Integer);
instr->setSrcReg(rs1, RegType::Integer);
instr->setSrcReg(rs2, RegType::Integer);
instr->setSrcReg(rs3, RegType::Integer);
if (op == Opcode::EXT2) {
switch (func3) {
case 1:
switch (func2) {
case 0: // CMOV
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
instr->addSrcReg(rs3, RegType::Integer);
break;
default:
std::abort();
}
break;
default:
std::abort();
}
} else {
instr->setDestReg(rd, RegType::Float);
instr->setSrcReg(rs1, RegType::Float);
instr->setSrcReg(rs2, RegType::Float);
instr->setSrcReg(rs3, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs3, RegType::Float);
}
instr->setFunc2(func2);
instr->setFunc3(func3);

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
@@ -5,12 +18,12 @@
namespace vortex {
class ArchDef;
class Arch;
class Instr;
class Decoder {
public:
Decoder(const ArchDef &);
Decoder(const Arch &);
std::shared_ptr<Instr> decode(uint32_t code) const;
};

141
sim/simx/dispatcher.h Normal file
View File

@@ -0,0 +1,141 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
#include <queue>
namespace vortex {
// Dispatcher: forwards pipeline traces that were queued with push() to the
// per-issue-slot Outputs ports. Each tick services one batch of block_size_
// issue slots. When pid_count_ != 1 (num_threads / num_lanes), a trace is
// emitted as a sequence of packets, each carrying the thread-mask bits of one
// group of num_lanes_ threads.
class Dispatcher : public SimObject<Dispatcher> {
public:
// One output port per issue slot.
std::vector<SimPort<pipeline_trace_t*>> Outputs;
// buf_size   : maximum number of traces held per staging queue (see push()).
// block_size : number of issue slots serviced together in one batch.
// num_lanes  : number of threads covered by one emitted packet.
// NOTE(review): block_size and num_lanes are used as divisors below; zero
// values are not guarded against here.
Dispatcher(const SimContext& ctx, const Arch& arch, uint32_t buf_size, uint32_t block_size, uint32_t num_lanes)
: SimObject<Dispatcher>(ctx, "Dispatcher")
, Outputs(ISSUE_WIDTH, this)
, Inputs_(ISSUE_WIDTH, this)
, arch_(arch)
, queues_(ISSUE_WIDTH, std::queue<pipeline_trace_t*>())
, buf_size_(buf_size)
, block_size_(block_size)
, num_lanes_(num_lanes)
, batch_count_(ISSUE_WIDTH / block_size)
, pid_count_(arch.num_threads() / num_lanes)
, batch_idx_(0)
, start_p_(block_size, 0)
{}
virtual ~Dispatcher() {}
// Restarts batching from batch 0 with every slot at packet 0.
// The staging queues and ports are not touched here.
virtual void reset() {
batch_idx_ = 0;
for (uint32_t b = 0; b < block_size_; ++b) {
start_p_.at(b) = 0;
}
}
virtual void tick() {
// Step 1: move the head of every non-empty staging queue into its
// internal input port (sent with a delay argument of 1).
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& queue = queues_.at(i);
if (queue.empty())
continue;
auto trace = queue.front();
Inputs_.at(i).send(trace, 1);
queue.pop();
}
// Step 2: service the slots of the current batch. block_sent counts the
// slots that need no further work in this batch (empty, already finished,
// or fully forwarded during this tick).
uint32_t block_sent = 0;
for (uint32_t b = 0; b < block_size_; ++b) {
uint32_t i = batch_idx_ * block_size_ + b;
auto& input = Inputs_.at(i);
if (input.empty()) {
++block_sent;
continue;
}
auto& output = Outputs.at(i);
auto trace = input.front();
if (pid_count_ != 1) {
// Multi-packet path: start_p is the first packet index still to be
// emitted for this slot; -1 marks a slot whose trace was completed
// earlier in the current batch.
auto start_p = start_p_.at(b);
if (start_p == -1) {
++block_sent;
continue;
}
// Find the first and last active thread at or after packet start_p.
int start(-1), end(-1);
for (uint32_t j = start_p * num_lanes_, n = arch_.num_threads(); j < n; ++j) {
if (!trace->tmask.test(j))
continue;
if (start == -1)
start = j;
end = j;
}
// Convert thread indices into packet (lane-group) indices.
// NOTE(review): if no thread is active in the scanned range, start/end
// are still -1 here and the division mixes int with uint32_t; confirm
// that callers never dispatch a trace with no remaining active threads.
start /= num_lanes_;
end /= num_lanes_;
// Emit a copy of the trace restricted to the selected lane group.
auto new_trace = new pipeline_trace_t(*trace);
new_trace->tmask.reset();
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
new_trace->tmask[j] = trace->tmask[j];
}
new_trace->pid = start;
// sop is set only when scanning began at packet 0.
new_trace->sop = (start_p == 0);
if (start == end) {
// Last packet: retire the original trace and mark the slot done.
new_trace->eop = 1;
start_p_.at(b) = -1;
input.pop();
++block_sent;
delete trace;
} else {
// More packets remain: keep the original at the head of the input
// port and resume from the following packet on the next tick.
new_trace->eop = 0;
start_p_.at(b) = start + 1;
}
output.send(new_trace, 1);
DT(3, "pipeline-dispatch: " << *new_trace);
} else {
// Single-packet path: forward the original trace unchanged as packet 0.
trace->pid = 0;
input.pop();
output.send(trace, 1);
DT(3, "pipeline-dispatch: " << *trace);
++block_sent;
}
}
// Advance to the next batch only once every slot of this one is done, and
// re-arm the per-slot packet cursors.
if (block_sent == block_size_) {
batch_idx_ = (batch_idx_ + 1) % batch_count_;
for (uint32_t b = 0; b < block_size_; ++b) {
start_p_.at(b) = 0;
}
}
};
// Queues a trace for the given issue slot. Returns false, without queuing,
// when that slot's staging queue already holds buf_size_ entries.
bool push(uint32_t issue_index, pipeline_trace_t* trace) {
auto& queue = queues_.at(issue_index);
if (queue.size() >= buf_size_)
return false;
queue.push(trace);
return true;
}
private:
std::vector<SimPort<pipeline_trace_t*>> Inputs_; // internal ports fed from queues_
const Arch& arch_;
std::vector<std::queue<pipeline_trace_t*>> queues_; // per-slot staging queues filled by push()
uint32_t buf_size_;
uint32_t block_size_;
uint32_t num_lanes_;
uint32_t batch_count_; // ISSUE_WIDTH / block_size
uint32_t pid_count_; // num_threads / num_lanes
uint32_t batch_idx_; // batch currently being serviced
std::vector<int> start_p_; // per-slot next packet index, -1 when finished
};
}

329
sim/simx/exe_unit.cpp Normal file
View File

@@ -0,0 +1,329 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "exe_unit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
#include <assert.h>
#include <util.h>
#include "debug.h"
#include "core.h"
#include "constants.h"
#include "cache_sim.h"
using namespace vortex;
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}

// Services every issue slot once per tick: takes the trace at the head of the
// slot's input port, forwards it to the matching output port with a delay that
// depends on the ALU operation class, releases a fetch-stalled warp on the
// final packet, and accounts the time the trace waited as ALU stall cycles.
void AluUnit::tick() {
    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
        auto& input = Inputs.at(i);
        if (input.empty())
            continue;
        auto& output = Outputs.at(i);
        auto trace = input.front();
        switch (trace->alu_type) {
        case AluType::ARITH:
        case AluType::BRANCH:
        case AluType::SYSCALL:
            // Simple operations complete with a delay of 1; they must not
            // fall through and inherit the multiplier latency.
            output.send(trace, 1);
            break;
        case AluType::IMUL:
            output.send(trace, LATENCY_IMUL+1);
            break;
        case AluType::IDIV:
            output.send(trace, XLEN+1);
            break;
        default:
            std::abort();
        }
        DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
        // Only the last packet of an instruction (eop) unblocks the warp.
        if (trace->eop && trace->fetch_stall) {
            assert(core_->stalled_warps_.test(trace->wid));
            core_->stalled_warps_.reset(trace->wid);
        }
        // pop() returns a timestamp; the difference to the current cycle is
        // accumulated as stall time.
        auto time = input.pop();
        core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
    }
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
void FpuUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->fpu_type) {
case FpuType::FNCP:
output.send(trace, 2);
break;
case FpuType::FMA:
output.send(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
output.send(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
output.send(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
output.send(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
auto time = input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
}
///////////////////////////////////////////////////////////////////////////////
// Builds the load/store unit. The pending-request table is constructed with
// LSUQ_SIZE and the lane count comes from NUM_LSU_LANES.
// NOTE(review): fence_state_ is not initialised here; it is only read in
// tick() while fence_lock_ is true, which tick() sets after assigning it.
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, pending_rd_reqs_(LSUQ_SIZE)
, num_lanes_(NUM_LSU_LANES)
, pending_loads_(0)
, fence_lock_(false)
, input_idx_(0)
{}
// Drops all pending requests, zeroes the outstanding-load counter and clears
// the fence lock. input_idx_ and fence_state_ are left as they are.
void LsuUnit::reset() {
pending_rd_reqs_.clear();
pending_loads_ = 0;
fence_lock_ = false;
}
// One simulation step of the load/store unit:
//  1. accumulate the outstanding-request count into the load-latency counter;
//  2. drain one response per lane from the dcache and shared-memory ports,
//     completing a trace once all of its addresses have answered;
//  3. if a fence is active, hold all new requests until the pending table is
//     empty, then release the fence trace;
//  4. accept at most one trace from the issue slots (starting at a rotating
//     slot index) and issue its per-lane memory requests.
void LsuUnit::tick() {
    core_->perf_stats_.load_latency += pending_loads_;

    // handle dcache response
    for (uint32_t t = 0; t < num_lanes_; ++t) {
        auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
        if (dcache_rsp_port.empty())
            continue;
        auto& mem_rsp = dcache_rsp_port.front();
        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
        auto trace = entry.trace;
        DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type
            << ", tid=" << t << ", " << *trace);
        assert(entry.count);
        --entry.count; // track remaining addresses
        if (0 == entry.count) {
            int iw = trace->wid % ISSUE_WIDTH;
            auto& output = Outputs.at(iw);
            output.send(trace, 1);
            pending_rd_reqs_.release(mem_rsp.tag);
        }
        dcache_rsp_port.pop();
        --pending_loads_;
    }

    // handle shared memory response
    for (uint32_t t = 0; t < num_lanes_; ++t) {
        auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
        if (smem_rsp_port.empty())
            continue;
        auto& mem_rsp = smem_rsp_port.front();
        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
        auto trace = entry.trace;
        DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", tid=" << t << ", " << *trace);
        assert(entry.count);
        --entry.count; // track remaining addresses
        if (0 == entry.count) {
            int iw = trace->wid % ISSUE_WIDTH;
            auto& output = Outputs.at(iw);
            output.send(trace, 1);
            pending_rd_reqs_.release(mem_rsp.tag);
        }
        smem_rsp_port.pop();
        --pending_loads_;
    }

    if (fence_lock_) {
        // wait for all pending memory operations to complete
        if (!pending_rd_reqs_.empty())
            return;
        int iw = fence_state_->wid % ISSUE_WIDTH;
        auto& output = Outputs.at(iw);
        output.send(fence_state_, 1);
        fence_lock_ = false;
        // log the trace contents (not the pointer value), matching "fence-lock"
        DT(3, "fence-unlock: " << *fence_state_);
    }

    // check input queue
    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
        int iw = (input_idx_ + i) % ISSUE_WIDTH;
        auto& input = Inputs.at(iw);
        if (input.empty())
            continue;
        auto& output = Outputs.at(iw);
        auto trace = input.front();
        auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);

        // first thread index covered by this packet
        auto t0 = trace->pid * num_lanes_;

        if (trace->lsu_type == LsuType::FENCE) {
            // schedule fence lock
            fence_state_ = trace;
            fence_lock_ = true;
            DT(3, "fence-lock: " << *trace);
            // remove input
            auto time = input.pop();
            core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
            break;
        }

        // check pending queue capacity
        if (pending_rd_reqs_.full()) {
            if (!trace->log_once(true)) {
                DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
            }
            break;
        } else {
            trace->log_once(false);
        }

        bool is_write = (trace->lsu_type == LsuType::STORE);

        // duplicates detection: all active lanes hit the same 4-byte-aligned
        // address as lane 0, so a single request is enough
        bool is_dup = false;
        if (trace->tmask.test(t0)) {
            uint64_t addr_mask = sizeof(uint32_t)-1;
            uint32_t addr0 = trace_data->mem_addrs.at(0).addr & ~addr_mask;
            uint32_t matches = 1;
            for (uint32_t t = 1; t < num_lanes_; ++t) {
                if (!trace->tmask.test(t0 + t))
                    continue;
                auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
                matches += (addr0 == mem_addr);
            }
            is_dup = (matches == trace->tmask.count());
        }

        uint32_t addr_count;
        if (is_dup) {
            addr_count = 1;
        } else {
            addr_count = trace->tmask.count();
        }

        auto tag = pending_rd_reqs_.allocate({trace, addr_count});

        for (uint32_t t = 0; t < num_lanes_; ++t) {
            if (!trace->tmask.test(t0 + t))
                continue;

            auto& dcache_req_port = core_->dcache_req_ports.at(t);
            auto mem_addr = trace_data->mem_addrs.at(t);
            auto type = core_->get_addr_type(mem_addr.addr);

            MemReq mem_req;
            mem_req.addr  = mem_addr.addr;
            mem_req.write = is_write;
            mem_req.type  = type;
            mem_req.tag   = tag;
            mem_req.cid   = trace->cid;
            mem_req.uuid  = trace->uuid;

            dcache_req_port.send(mem_req, 2);
            DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
                << ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);

            // NOTE(review): both counters are bumped for stores as well as
            // loads; confirm whether writes ever produce a response that
            // decrements pending_loads_.
            ++pending_loads_;
            ++core_->perf_stats_.loads;

            if (is_dup)
                break;
        }

        // do not wait on writes
        if (is_write) {
            pending_rd_reqs_.release(tag);
            output.send(trace, 1);
            ++core_->perf_stats_.stores;
        }

        // remove input
        auto time = input.pop();
        core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);

        break; // single block
    }
    // rotate the slot that gets first pick on the next tick
    ++input_idx_;
}
///////////////////////////////////////////////////////////////////////////////
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "SFU")
, input_idx_(0)
{}
void SfuUnit::tick() {
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto sfu_type = trace->sfu_type;
bool release_warp = trace->fetch_stall;
switch (sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::PRED:
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC:
output.send(trace, 1);
break;
case SfuType::BAR: {
output.send(trace, 1);
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
if (trace->eop) {
core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
}
release_warp = false;
} break;
case SfuType::CMOV:
output.send(trace, 3);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
auto stalls = (SimPlatform::instance().cycles() - time);
core_->perf_stats_.sfu_stalls += stalls;
break; // single block
}
++input_idx_;
}

View File

@@ -1,8 +1,21 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "pipeline.h"
#include "cache.h"
#include "cache_sim.h"
namespace vortex {
@@ -10,13 +23,13 @@ class Core;
class ExeUnit : public SimObject<ExeUnit> {
public:
SimPort<pipeline_trace_t*> Input;
SimPort<pipeline_trace_t*> Output;
std::vector<SimPort<pipeline_trace_t*>> Inputs;
std::vector<SimPort<pipeline_trace_t*>> Outputs;
ExeUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<ExeUnit>(ctx, name)
, Input(this)
, Output(this)
, Inputs(ISSUE_WIDTH, this)
, Outputs(ISSUE_WIDTH, this)
, core_(core)
{}
@@ -32,32 +45,6 @@ protected:
///////////////////////////////////////////////////////////////////////////////
class NopUnit : public ExeUnit {
public:
NopUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class LsuUnit : public ExeUnit {
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_rd_reqs_;
pipeline_trace_t* fence_state_;
bool fence_lock_;
public:
LsuUnit(const SimContext& ctx, Core*);
void reset();
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class AluUnit : public ExeUnit {
public:
AluUnit(const SimContext& ctx, Core*);
@@ -67,15 +54,6 @@ public:
///////////////////////////////////////////////////////////////////////////////
class CsrUnit : public ExeUnit {
public:
CsrUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class FpuUnit : public ExeUnit {
public:
FpuUnit(const SimContext& ctx, Core*);
@@ -85,19 +63,37 @@ public:
///////////////////////////////////////////////////////////////////////////////
class GpuUnit : public ExeUnit {
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
bool processTexRequest(pipeline_trace_t* trace);
class LsuUnit : public ExeUnit {
public:
GpuUnit(const SimContext& ctx, Core*);
LsuUnit(const SimContext& ctx, Core*);
void reset();
void tick();
private:
struct pending_req_t {
pipeline_trace_t* trace;
uint32_t count;
};
HashTable<pending_req_t> pending_rd_reqs_;
uint32_t num_lanes_;
pipeline_trace_t* fence_state_;
uint64_t pending_loads_;
bool fence_lock_;
uint32_t input_idx_;
};
///////////////////////////////////////////////////////////////////////////////
class SfuUnit : public ExeUnit {
public:
SfuUnit(const SimContext& ctx, Core*);
void tick();
private:
uint32_t input_idx_;
};
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,383 +0,0 @@
#include "exeunit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
#include <assert.h>
#include <util.h>
#include "debug.h"
#include "core.h"
#include "constants.h"
using namespace vortex;
NopUnit::NopUnit(const SimContext& ctx, Core* core)
    : ExeUnit(ctx, core, "NOP")
{}

// Pass-through unit: forwards the trace at the head of Input to Output with
// a delay argument of 1, if there is one.
void NopUnit::tick() {
    if (!Input.empty()) {
        auto pending = Input.front();
        Output.send(pending, 1);
        Input.pop();
    }
}
///////////////////////////////////////////////////////////////////////////////
// Builds the load/store unit. The thread count is read from the core's
// architecture and the pending-request table is constructed with LSUQ_SIZE.
// NOTE(review): fence_state_ is not initialised here; tick() assigns it
// before setting fence_lock_.
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, num_threads_(core->arch().num_threads())
, pending_rd_reqs_(LSUQ_SIZE)
, fence_lock_(false)
{}
// Drops all pending requests and clears the fence lock.
void LsuUnit::reset() {
pending_rd_reqs_.clear();
fence_lock_ = false;
}
// One simulation step of the load/store unit: drain one response per thread
// from the dcache switch and from shared memory, honour an active fence, then
// accept at most one trace from Input and issue its memory requests.
// Pending entries are (trace, remaining-response-count) pairs.
void LsuUnit::tick() {
// handle dcache response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
// the trace completes once every expected response has arrived
if (0 == entry.second) {
Output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
// handle shared memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_rd_reqs_.empty())
return;
Output.send(fence_state_, 1);
fence_lock_ = false;
// NOTE(review): this streams the pointer fence_state_, whereas the
// "fence-lock" message below streams *trace; confirm which is intended.
DT(3, "fence-unlock: " << fence_state_);
}
// check input queue
if (Input.empty())
return;
auto trace = Input.front();
if (trace->lsu.type == LsuType::FENCE) {
// schedule fence lock
fence_state_ = trace;
fence_lock_ = true;
DT(3, "fence-lock: " << *trace);
// remove input
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
return;
}
// check pending queue capacity
if (pending_rd_reqs_.full()) {
// suspend()'s return value gates the message; presumably it reports
// whether the trace was already suspended - TODO confirm
if (!trace->suspend()) {
DT(3, "*** lsu-queue-stall: " << *trace);
}
return;
} else {
trace->resume();
}
bool is_write = (trace->lsu.type == LsuType::STORE);
// duplicates detection
// (every active thread targets the same 4-byte-aligned address as thread 0)
bool is_dup = false;
if (trace->tmask.test(0)) {
uint64_t addr_mask = sizeof(uint32_t)-1;
uint32_t addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
uint32_t matches = 1;
for (uint32_t t = 1; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
is_dup = (matches == trace->tmask.count());
}
// number of responses to wait for: one when deduplicated, otherwise the
// total number of recorded addresses across all threads
uint32_t valid_addrs = 0;
if (is_dup) {
valid_addrs = 1;
} else {
for (auto& mem_addr : trace->mem_addrs) {
valid_addrs += mem_addr.size();
}
}
auto tag = pending_rd_reqs_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
// only the first recorded address of each thread is issued
auto mem_addr = trace->mem_addrs.at(t).at(0);
auto type = get_addr_type(mem_addr.addr, mem_addr.size);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.non_cacheable = (type == AddrType::IO);
mem_req.tag = tag;
mem_req.core_id = trace->cid;
mem_req.uuid = trace->uuid;
// shared-memory addresses bypass the dcache switch
if (type == AddrType::Shared) {
core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
} else {
dcache_req_port.send(mem_req, 2);
DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
}
// a deduplicated access needs a single request only
if (is_dup)
break;
}
// do not wait on writes
if (is_write) {
pending_rd_reqs_.release(tag);
Output.send(trace, 1);
}
// remove input
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
AluUnit::AluUnit(const SimContext& ctx, Core* core)
    : ExeUnit(ctx, core, "ALU")
{}

// Forwards the trace at the head of Input to Output with a delay chosen by
// the ALU operation class, clears the warp's stall bit when the trace had
// stalled fetch, and records how long the trace waited.
void AluUnit::tick() {
    if (!Input.empty()) {
        auto op = Input.front();

        switch (op->alu.type) {
        case AluType::IDIV:
            Output.send(op, XLEN+1);
            break;
        case AluType::IMUL:
            Output.send(op, LATENCY_IMUL+1);
            break;
        case AluType::ARITH:
        case AluType::BRANCH:
        case AluType::SYSCALL:
        case AluType::CMOV:
            Output.send(op, 1);
            break;
        default:
            std::abort();
        }

        DT(3, "pipeline-execute: op=" << op->alu.type << ", " << *op);

        if (op->fetch_stall) {
            core_->stalled_warps_.reset(op->wid);
        }

        auto queued_at = Input.pop();
        core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - queued_at);
    }
}
///////////////////////////////////////////////////////////////////////////////
CsrUnit::CsrUnit(const SimContext& ctx, Core* core)
    : ExeUnit(ctx, core, "CSR")
{}

// Forwards the trace at the head of Input to Output with a delay argument of
// 1 and adds the time it waited to the CSR stall counter.
void CsrUnit::tick() {
    if (!Input.empty()) {
        auto op = Input.front();
        Output.send(op, 1);

        auto queued_at = Input.pop();
        core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - queued_at);

        DT(3, "pipeline-execute: op=CSR, " << *op);
    }
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core)
    : ExeUnit(ctx, core, "FPU")
{}

// Forwards the trace at the head of Input to Output with a delay selected by
// the FPU operation class (unknown classes abort) and records how long the
// trace waited as FPU stall time.
void FpuUnit::tick() {
    if (!Input.empty()) {
        auto op = Input.front();

        switch (op->fpu.type) {
        case FpuType::FNCP:  Output.send(op, 2);               break;
        case FpuType::FMA:   Output.send(op, LATENCY_FMA+1);   break;
        case FpuType::FDIV:  Output.send(op, LATENCY_FDIV+1);  break;
        case FpuType::FSQRT: Output.send(op, LATENCY_FSQRT+1); break;
        case FpuType::FCVT:  Output.send(op, LATENCY_FCVT+1);  break;
        default:
            std::abort();
        }

        DT(3, "pipeline-execute: op=" << op->fpu.type << ", " << *op);

        auto queued_at = Input.pop();
        core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - queued_at);
    }
}
///////////////////////////////////////////////////////////////////////////////
// Builds the GPU unit. The thread count is read from the core's architecture
// and the pending texture-request table is constructed with TEXQ_SIZE.
GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "GPU")
, num_threads_(core->arch().num_threads())
, pending_tex_reqs_(TEXQ_SIZE)
{}
// Drops all pending texture requests.
void GpuUnit::reset() {
pending_tex_reqs_.clear();
}
// One simulation step of the GPU unit. With EXT_TEX_ENABLE defined, texture
// responses are drained first (one per thread, from response port 1 of the
// dcache switch). Then the trace at the head of Input is executed according
// to its GPU operation type; it is removed from Input only when it could be
// issued.
void GpuUnit::tick() {
#ifdef EXT_TEX_ENABLE
// handle memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
// the trace completes once every expected response has arrived
if (0 == entry.second) {
Output.send(trace, 1);
pending_tex_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
#endif
// check input queue
if (Input.empty())
return;
auto trace = Input.front();
// set when the trace was forwarded or its requests were sent this tick
bool issued = false;
switch (trace->gpu.type) {
case GpuType::TMC:
// update only this warp's active bit from the trace's warp mask
Output.send(trace, 1);
core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
issued = true;
break;
case GpuType::WSPAWN:
// replace the whole active-warp mask
Output.send(trace, 1);
core_->active_warps_ = trace->gpu.active_warps;
issued = true;
break;
case GpuType::SPLIT:
case GpuType::JOIN:
Output.send(trace, 1);
issued = true;
break;
case GpuType::BAR:
// a non-zero mask re-activates the listed warps; a zero mask
// deactivates the issuing warp
Output.send(trace, 1);
if (trace->gpu.active_warps != 0)
core_->active_warps_ |= trace->gpu.active_warps;
else
core_->active_warps_.reset(trace->wid);
issued = true;
break;
case GpuType::TEX:
// may fail when the texture request table is full; retried next tick
if (this->processTexRequest(trace))
issued = true;
break;
default:
std::abort();
}
if (issued) {
DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
// NOTE(review): the waiting time is charged to fpu_stalls although this is
// the GPU unit - confirm whether a dedicated counter was intended.
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
}
// Issues the memory requests of a texture trace.
// Returns false, without sending anything, when the pending texture-request
// table is full; otherwise allocates one table entry that expects one
// response per recorded address, sends a request for every address of every
// active thread through request port 1 of the dcache switch, and returns true.
bool GpuUnit::processTexRequest(pipeline_trace_t* trace) {
    // check pending queue capacity
    if (pending_tex_reqs_.full()) {
        if (!trace->suspend()) {
            DT(3, "*** tex-queue-stall: " << *trace);
        }
        return false;
    } else {
        trace->resume();
    }

    // send memory request
    uint32_t valid_addrs = 0;
    for (auto& mem_addr : trace->mem_addrs) {
        valid_addrs += mem_addr.size();
    }

    auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});

    for (uint32_t t = 0; t < num_threads_; ++t) {
        if (!trace->tmask.test(t))
            continue;

        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
        for (auto& mem_addr : trace->mem_addrs.at(t)) {
            MemReq mem_req;
            mem_req.addr = mem_addr.addr;
            // NOTE(review): the write flag is derived from the LSU type of a
            // texture trace - confirm this is intended.
            mem_req.write = (trace->lsu.type == LsuType::STORE);
            mem_req.tag = tag;
            mem_req.core_id = core_->id();
            mem_req.uuid = trace->uuid;
            dcache_req_port.send(mem_req, 3);
            // log the trace contents rather than the pointer value
            DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
                << ", tid=" << t << ", " << *trace);
            ++core_->perf_stats_.tex_reads;
            // accumulate the current queue occupancy only (the stray
            // pre-increment previously added one extra per request)
            core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
        }
    }

    return true;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
@@ -6,10 +19,6 @@
namespace vortex {
class IBuffer {
private:
std::queue<pipeline_trace_t*> entries_;
uint32_t capacity_;
public:
IBuffer(uint32_t size)
: capacity_(size)
@@ -39,6 +48,10 @@ public:
std::queue<pipeline_trace_t*> empty;
std::swap(entries_, empty );
}
private:
std::queue<pipeline_trace_t*> entries_;
uint32_t capacity_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "types.h"
@@ -7,7 +20,7 @@ namespace vortex {
class Warp;
enum Opcode {
NOP = 0,
NONE = 0,
R_INST = 0x33,
L_INST = 0x3,
I_INST = 0x13,
@@ -19,6 +32,7 @@ enum Opcode {
JALR_INST = 0x67,
SYS_INST = 0x73,
FENCE = 0x0f,
AMO = 0x2f,
// F Extension
FL = 0x7,
FS = 0x27,
@@ -26,19 +40,20 @@ enum Opcode {
FMADD = 0x43,
FMSUB = 0x47,
FMNMSUB = 0x4b,
FMNMADD = 0x4f,
// Vector Extension
VSET = 0x57,
// GPGPU Extension
GPGPU = 0x6b,
GPU = 0x5b,
// RV64 Standard Extensions
FMNMADD = 0x4f,
// RV64 Standard Extension
R_INST_W = 0x3b,
I_INST_W = 0x1b,
// Vector Extension
VSET = 0x57,
// Custom Extensions
EXT1 = 0x0b,
EXT2 = 0x2b,
EXT3 = 0x5b,
EXT4 = 0x7b
};
enum InstType {
N_TYPE,
enum InstType {
R_TYPE,
I_TYPE,
S_TYPE,
@@ -52,25 +67,45 @@ enum InstType {
class Instr {
public:
Instr()
: opcode_(Opcode::NOP)
: opcode_(Opcode::NONE)
, num_rsrcs_(0)
, has_imm_(false)
, rdest_type_(RegType::None)
, imm_(0)
, rdest_(0)
, func2_(0)
, func3_(0)
, func6_(0)
, func7_(0) {
, func7_(0)
, vmask_(0)
, vlsWidth_(0)
, vMop_(0)
, vNf_(0)
, vs3_(0)
, vlmul_(0)
, vsew_(0)
, vediv_(0) {
for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
rsrc_type_[i] = RegType::None;
rsrc_[i] = 0;
}
}
void setOpcode(Opcode opcode) { opcode_ = opcode; }
void setDestReg(uint32_t destReg, RegType type) { rdest_type_ = type; rdest_ = destReg; }
void setSrcReg(uint32_t srcReg, RegType type) { rsrc_type_[num_rsrcs_] = type; rsrc_[num_rsrcs_++] = srcReg; }
void setDestVReg(uint32_t destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
void setSrcVReg(uint32_t srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; }
// Records the destination register index together with its register type.
void setDestReg(uint32_t destReg, RegType type) {
rdest_type_ = type;
rdest_ = destReg;
}
// Appends a source register in the next free slot and bumps the source count.
// NOTE(review): no bounds check against MAX_REG_SOURCES (the size the
// constructor loop uses for rsrc_/rsrc_type_) - callers must not add more.
void addSrcReg(uint32_t srcReg, RegType type) {
rsrc_type_[num_rsrcs_] = type;
rsrc_[num_rsrcs_] = srcReg;
++num_rsrcs_;
}
// Writes a source register at an explicit slot and grows the source count so
// that it covers that slot (it never shrinks).
// NOTE(review): index is not checked against MAX_REG_SOURCES.
void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) {
rsrc_type_[index] = type;
rsrc_[index] = srcReg;
num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, index+1);
}
void setFunc2(uint32_t func2) { func2_ = func2; }
void setFunc3(uint32_t func3) { func3_ = func3; }
void setFunc7(uint32_t func7) { func7_ = func7; }
@@ -85,17 +120,17 @@ public:
void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; }
void setFunc6(uint32_t func6) { func6_ = func6; }
Opcode getOpcode() const { return opcode_; }
Opcode getOpcode() const { return opcode_; }
uint32_t getFunc2() const { return func2_; }
uint32_t getFunc3() const { return func3_; }
uint32_t getFunc6() const { return func6_; }
uint32_t getFunc7() const { return func7_; }
uint32_t getNRSrc() const { return num_rsrcs_; }
uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }
RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
uint32_t getRDest() const { return rdest_; }
RegType getRDType() const { return rdest_type_; }
bool hasImm() const { return has_imm_; }
RegType getRDType() const { return rdest_type_; }
bool hasImm() const { return has_imm_; }
uint32_t getImm() const { return imm_; }
uint32_t getVlsWidth() const { return vlsWidth_; }
uint32_t getVmop() const { return vMop_; }

View File

@@ -1,98 +1,132 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <fstream>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#include "processor.h"
#include "archdef.h"
#include "mem.h"
#include "constants.h"
#include <util.h>
#include "args.h"
#include "core.h"
using namespace vortex;
// Prints the command-line synopsis to stdout. The list mirrors the options
// accepted by parse_args(), including -g for the cluster count.
static void show_usage() {
  std::cout << "Usage: [-c <cores>] [-g <clusters>] [-w <warps>] [-t <threads>] [-r: riscv-test] [-s: stats] [-h: help] <program>" << std::endl;
}
// Run-time configuration; defaults come from the build-time macros and may be
// overridden by parse_args().
uint32_t num_threads = NUM_THREADS;
uint32_t num_warps = NUM_WARPS;
uint32_t num_cores = NUM_CORES;
uint32_t num_clusters = NUM_CLUSTERS;
bool showStats = false;
bool riscv_test = false;
const char* program = nullptr; // first non-option argument
// Parses the simulator command line into the file-level configuration
// variables.
//   -t / -w / -c / -g <n>  thread, warp, core and cluster counts (via atoi)
//   -r, -s                 enable riscv-test mode / stats display
// When getopt() returns 'h' or '?', the usage line is printed and the process
// exits with status 0; any other unexpected getopt() result exits with -1.
// The first non-option argument is stored in `program`; if there is none the
// usage line is printed and the process exits with status -1.
static void parse_args(int argc, char **argv) {
  static const char* const kOptions = "t:w:c:g:rsh?";

  for (int opt = getopt(argc, argv, kOptions);
       opt != -1;
       opt = getopt(argc, argv, kOptions)) {
    if (opt == 't') {
      num_threads = atoi(optarg);
    } else if (opt == 'w') {
      num_warps = atoi(optarg);
    } else if (opt == 'c') {
      num_cores = atoi(optarg);
    } else if (opt == 'g') {
      num_clusters = atoi(optarg);
    } else if (opt == 'r') {
      riscv_test = true;
    } else if (opt == 's') {
      showStats = true;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
  }

  // A program image is mandatory.
  if (optind >= argc) {
    show_usage();
    exit(-1);
  }

  program = argv[optind];
  std::cout << "Running " << program << "..." << std::endl;
}
int main(int argc, char **argv) {
int exitcode = 0;
std::string imgFileName;
int num_cores(NUM_CORES * NUM_CLUSTERS);
int num_warps(NUM_WARPS);
int num_threads(NUM_THREADS);
bool showHelp(false);
bool showStats(false);
bool riscv_test(false);
parse_args(argc, argv);
// parse the command line arguments
CommandLineArgFlag fh("-h", "--help", "show command line options", showHelp);
CommandLineArgSetter<std::string> fi("-i", "--image", "program binary", imgFileName);
CommandLineArgSetter<int> fc("-c", "--cores", "number of cores", num_cores);
CommandLineArgSetter<int> fw("-w", "--warps", "number of warps", num_warps);
CommandLineArgSetter<int> ft("-t", "--threads", "number of threads", num_threads);
CommandLineArgFlag fr("-r", "--riscv", "enable riscv tests", riscv_test);
CommandLineArgFlag fs("-s", "--stats", "show stats", showStats);
CommandLineArg::readArgs(argc - 1, argv + 1);
if (showHelp || imgFileName.empty()) {
std::cout << "Vortex emulator command line arguments:\n"
" -i, --image <filename> Program RAM image\n"
" -c, --cores <num> Number of cores\n"
" -w, --warps <num> Number of warps\n"
" -t, --threads <num> Number of threads\n"
" -r, --riscv riscv test\n"
" -s, --stats Print stats on exit.\n";
return 0;
}
std::cout << "Running " << imgFileName << "..." << std::endl;
{
// create processor configuration
ArchDef arch(num_cores, num_warps, num_threads);
Arch arch(num_threads, num_warps, num_cores, num_clusters);
// create memory module
RAM ram(RAM_PAGE_SIZE);
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
// setup base DCRs
const uint64_t startup_addr(STARTUP_ADDR);
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
#if (XLEN == 64)
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
#endif
processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
// load program
{
std::string program_ext(fileExtension(imgFileName.c_str()));
{
std::string program_ext(fileExtension(program));
if (program_ext == "bin") {
ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
ram.loadBinImage(program, startup_addr);
} else if (program_ext == "hex") {
ram.loadHexImage(imgFileName.c_str());
ram.loadHexImage(program);
} else {
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
return -1;
}
}
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
// run simulation
exitcode = processor.run();
exitcode = processor.run(riscv_test);
}
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed." << std::endl;
exitcode = 0;
} else {
std::cout << "Failed." << std::endl;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
}
return exitcode;
}

View File

@@ -1,4 +1,17 @@
#include "memsim.h"
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mem_sim.h"
#include <vector>
#include <queue>
#include <stdlib.h>
@@ -83,7 +96,7 @@ public:
mem_req.addr,
mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ,
std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid),
mem_req.core_id
mem_req.cid
);
if (!dram_->send(dram_req))

View File

@@ -1,8 +1,20 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "types.h"
#include <vector>
namespace vortex {

61
sim/simx/operand.h Normal file
View File

@@ -0,0 +1,61 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
#include <queue>
namespace vortex {
// Pipeline stage that forwards at most one trace per tick from Input to
// Output. The delay value handed to Output.send() is 1 plus one for every
// register index flagged in any of the trace's used-register masks; index 0
// is not counted when it is flagged in the integer mask.
class Operand : public SimObject<Operand> {
public:
  SimPort<pipeline_trace_t*> Input;
  SimPort<pipeline_trace_t*> Output;

  Operand(const SimContext& ctx)
    : SimObject<Operand>(ctx, "Operand")
    , Input(this)
    , Output(this)
  {}

  virtual ~Operand() {}

  // No internal state to clear.
  virtual void reset() {}

  virtual void tick() {
    if (!Input.empty()) {
      auto trace = Input.front();

      int delay = 1;
      for (int reg = 0; reg < MAX_NUM_REGS; ++reg) {
        const bool in_iregs = trace->used_iregs.test(reg);
        const bool in_fregs = trace->used_fregs.test(reg);
        const bool in_vregs = trace->used_vregs.test(reg);

        const bool referenced = in_iregs || in_fregs || in_vregs;
        // Index 0 flagged in the integer mask never adds to the delay.
        const bool skipped = in_iregs && (reg == 0);
        if (referenced && !skipped) {
          ++delay;
        }
      }

      Output.send(trace, delay);

      DT(3, "pipeline-operands: " << *trace);

      Input.pop();
    }
  }
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
@@ -5,14 +18,38 @@
#include <iostream>
#include <util.h>
#include "types.h"
#include "archdef.h"
#include "arch.h"
#include "debug.h"
namespace vortex {
// Polymorphic base for the optional payload attached to a pipeline trace
// (pipeline_trace_t::data holds an ITraceData::Ptr). The virtual destructor
// lets derived payloads be destroyed through the base shared_ptr.
class ITraceData {
public:
// Shared-ownership handle used to store payloads.
using Ptr = std::shared_ptr<ITraceData>;
ITraceData() {}
virtual ~ITraceData() {}
};
struct LsuTraceData : public ITraceData {
using Ptr = std::shared_ptr<LsuTraceData>;
std::vector<mem_addr_size_t> mem_addrs;
LsuTraceData(uint32_t num_threads) : mem_addrs(num_threads) {}
};
// SFU trace payload: carries the id and count of a barrier.
struct SFUTraceData : public ITraceData {
  using Ptr = std::shared_ptr<SFUTraceData>;

  struct {
    uint32_t id;
    uint32_t count;
  } bar;

  // Stores the barrier id and count as given.
  SFUTraceData(uint32_t bar_id, uint32_t bar_count) {
    bar.id = bar_id;
    bar.count = bar_count;
  }
};
struct pipeline_trace_t {
public:
//--
uint64_t uuid;
const uint64_t uuid;
const Arch& arch;
//--
uint32_t cid;
@@ -21,12 +58,9 @@ struct pipeline_trace_t {
Word PC;
//--
bool fetch_stall;
//--
bool wb;
RegType rdest_type;
uint32_t rdest;
RegType rdest_type;
bool wb;
//--
RegMask used_iregs;
@@ -36,73 +70,104 @@ struct pipeline_trace_t {
//-
ExeType exe_type;
//--
std::vector<std::vector<mem_addr_size_t>> mem_addrs;
//--
union {
struct {
LsuType type;
} lsu;
struct {
AluType type;
} alu;
struct {
FpuType type;
} fpu;
struct {
GpuType type;
WarpMask active_warps;
} gpu;
uint32_t unit_type;
LsuType lsu_type;
AluType alu_type;
FpuType fpu_type;
SfuType sfu_type;
};
bool stalled;
ITraceData::Ptr data;
pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) {
uuid = uuid_;
cid = 0;
wid = 0;
tmask.reset();
PC = 0;
fetch_stall = false;
wb = false;
rdest = 0;
rdest_type = RegType::None;
used_iregs.reset();
used_fregs.reset();
used_vregs.reset();
exe_type = ExeType::NOP;
mem_addrs.resize(arch.num_threads());
stalled = false;
}
int pid;
bool sop;
bool eop;
bool suspend() {
bool old = stalled;
stalled = true;
bool fetch_stall;
pipeline_trace_t(uint64_t uuid, const Arch& arch)
: uuid(uuid)
, arch(arch)
, cid(0)
, wid(0)
, tmask(0)
, PC(0)
, rdest(0)
, rdest_type(RegType::None)
, wb(false)
, used_iregs(0)
, used_fregs(0)
, used_vregs(0)
, exe_type(ExeType::ALU)
, unit_type(0)
, data(nullptr)
, pid(-1)
, sop(true)
, eop(true)
, fetch_stall(false)
, log_once_(false)
{}
pipeline_trace_t(const pipeline_trace_t& rhs)
: uuid(rhs.uuid)
, arch(rhs.arch)
, cid(rhs.cid)
, wid(rhs.wid)
, tmask(rhs.tmask)
, PC(rhs.PC)
, rdest(rhs.rdest)
, rdest_type(rhs.rdest_type)
, wb(rhs.wb)
, used_iregs(rhs.used_iregs)
, used_fregs(rhs.used_fregs)
, used_vregs(rhs.used_vregs)
, exe_type(rhs.exe_type)
, unit_type(rhs.unit_type)
, data(rhs.data)
, pid(rhs.pid)
, sop(rhs.sop)
, eop(rhs.eop)
, fetch_stall(rhs.fetch_stall)
, log_once_(false)
{}
~pipeline_trace_t() {}
bool log_once(bool enable) {
bool old = log_once_;
log_once_ = enable;
return old;
}
void resume() {
stalled = false;
}
private:
bool log_once_;
};
inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
os << "cid=" << state.cid;
os << ", wid=" << state.wid;
os << ", tmask=";
for (uint32_t i = 0, n = state.arch.num_threads(); i < n; ++i) {
os << state.tmask.test(i);
}
os << ", PC=0x" << std::hex << state.PC;
os << ", wb=" << state.wb;
if (state.wb) {
os << ", rd=" << state.rdest_type << std::dec << state.rdest;
}
os << ", ex=" << state.exe_type;
if (state.pid != -1) {
os << ", pid=" << state.pid;
os << ", sop=" << state.sop;
os << ", eop=" << state.eop;
}
os << " (#" << std::dec << state.uuid << ")";
return os;
}
class PipelineLatch {
protected:
const char* name_;
std::queue<pipeline_trace_t*> queue_;
public:
PipelineLatch(const char* name = nullptr)
: name_(name)
@@ -132,6 +197,10 @@ public:
std::queue<pipeline_trace_t*> empty;
std::swap(queue_, empty );
}
protected:
const char* name_;
std::queue<pipeline_trace_t*> queue_;
};
}

View File

@@ -1,168 +1,141 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "processor.h"
#include "core.h"
#include "constants.h"
#include "processor_impl.h"
using namespace vortex;
class Processor::Impl {
private:
std::vector<Core::Ptr> cores_;
std::vector<Cache::Ptr> l2caches_;
std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
Cache::Ptr l3cache_;
Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
ProcessorImpl::ProcessorImpl(const Arch& arch)
: arch_(arch)
, clusters_(arch.num_clusters())
{
SimPlatform::instance().initialize();
public:
Impl(const ArchDef& arch)
: cores_(arch.num_cores())
, l2caches_(NUM_CLUSTERS)
, l2_mem_switches_(NUM_CLUSTERS)
{
SimPlatform::instance().initialize();
// create memory simulator
memsim_ = MemSim::Create("dram", MemSim::Config{
MEMORY_BANKS,
uint32_t(arch.num_cores()) * arch.num_clusters()
});
uint32_t num_cores = arch.num_cores();
uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
cores_.at(i) = Core::Create(arch, i);
// create L3 cache
l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
!L3_ENABLED,
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(L3_NUM_WAYS), // W
0, // A
XLEN, // address bits
L3_NUM_BANKS, // number of banks
1, // number of ports
uint8_t(arch.num_clusters()), // request size
true, // write-through
false, // write response
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
}
);
// connect L3 memory ports
l3cache_->MemReqPort.bind(&memsim_->MemReqPort);
memsim_->MemRspPort.bind(&l3cache_->MemRspPort);
// setup memory simulator
auto memsim = MemSim::Create("dram", MemSim::Config{
MEMORY_BANKS,
arch.num_cores()
});
std::vector<SimPort<MemReq>*> mem_req_ports(1, &memsim->MemReqPort);
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1, &memsim->MemRspPort);
if (L3_ENABLE) {
l3cache_ = Cache::Create("l3cache", Cache::Config{
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
L3_NUM_BANKS, // number of banks
L3_NUM_PORTS, // number of ports
NUM_CLUSTERS, // request size
true, // write-through
false, // write response
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
}
);
l3cache_->MemReqPort.bind(mem_req_ports.at(0));
mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
}
} else if (NUM_CLUSTERS > 1) {
l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));
mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
}
}
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
std::vector<SimPort<MemReq>*> cluster_mem_req_ports(cores_per_cluster);
std::vector<SimPort<MemRsp>*> cluster_mem_rsp_ports(cores_per_cluster);
if (L2_ENABLE) {
auto& l2cache = l2caches_.at(i);
l2cache = Cache::Create("l2cache", Cache::Config{
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
L2_NUM_BANKS, // number of banks
L2_NUM_PORTS, // number of ports
(uint8_t)cores_per_cluster, // request size
true, // write-through
false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
l2cache->MemReqPort.bind(mem_req_ports.at(i));
mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
}
} else {
auto& l2_mem_switch = l2_mem_switches_.at(i);
l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster);
l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
}
}
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
auto& core = cores_.at((i * cores_per_cluster) + j);
core->MemReqPort.bind(cluster_mem_req_ports.at(j));
cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort);
}
}
// create clusters
for (uint32_t i = 0; i < arch.num_clusters(); ++i) {
clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_);
// connect L3 core ports
clusters_.at(i)->mem_req_port.bind(&l3cache_->CoreReqPorts.at(i));
l3cache_->CoreRspPorts.at(i).bind(&clusters_.at(i)->mem_rsp_port);
}
~Impl() {
SimPlatform::instance().finalize();
}
// set up memory perf recording
memsim_->MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
__unused (cycle);
perf_mem_reads_ += !req.write;
perf_mem_writes_ += req.write;
perf_mem_pending_reads_ += !req.write;
});
memsim_->MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){
__unused (cycle);
--perf_mem_pending_reads_;
});
void attach_ram(RAM* ram) {
for (auto core : cores_) {
core->attach_ram(ram);
}
}
this->reset();
}
int run() {
SimPlatform::instance().reset();
bool running;
int exitcode = 0;
do {
SimPlatform::instance().tick();
running = false;
for (auto& core : cores_) {
if (core->running()) {
running = true;
}
if (core->check_exit()) {
exitcode = core->getIRegValue(3);
running = false;
break;
// Finalizes the global SimPlatform instance.
ProcessorImpl::~ProcessorImpl() {
SimPlatform::instance().finalize();
}
// Hands the given RAM instance to every cluster.
void ProcessorImpl::attach_ram(RAM* ram) {
  for (const auto& cluster : clusters_) {
    cluster->attach_ram(ram);
  }
}
int ProcessorImpl::run(bool riscv_test) {
SimPlatform::instance().reset();
this->reset();
bool done;
Word exitcode = 0;
do {
SimPlatform::instance().tick();
done = true;
for (auto cluster : clusters_) {
if (cluster->running()) {
Word ec;
if (cluster->check_exit(&ec, riscv_test)) {
exitcode |= ec;
} else {
done = false;
}
}
} while (running);
}
perf_mem_latency_ += perf_mem_pending_reads_;
} while (!done);
return exitcode;
}
};
return exitcode;
}
// Zeroes all memory performance counters.
void ProcessorImpl::reset() {
  perf_mem_reads_ = perf_mem_writes_ = perf_mem_latency_ = perf_mem_pending_reads_ = 0;
}
// Writes `value` to the DCR at `addr` by forwarding to dcrs_.write().
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
dcrs_.write(addr, value);
}
// Builds a snapshot of the performance counters: the processor-level memory
// counters, the L3 cache statistics, and the sum of all cluster statistics.
ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
  PerfStats stats;

  stats.mem_reads   = perf_mem_reads_;
  stats.mem_writes  = perf_mem_writes_;
  stats.mem_latency = perf_mem_latency_;
  stats.l3cache     = l3cache_->perf_stats();

  // Accumulate per-cluster statistics.
  for (const auto& cluster : clusters_) {
    stats.clusters += cluster->perf_stats();
  }

  return stats;
}
///////////////////////////////////////////////////////////////////////////////
Processor::Processor(const ArchDef& arch)
: impl_(new Impl(arch))
Processor::Processor(const Arch& arch)
: impl_(new ProcessorImpl(arch))
{}
Processor::~Processor() {
@@ -173,6 +146,10 @@ void Processor::attach_ram(RAM* mem) {
impl_->attach_ram(mem);
}
int Processor::run() {
return impl_->run();
int Processor::run(bool riscv_test) {
return impl_->run(riscv_test);
}
void Processor::write_dcr(uint32_t addr, uint32_t value) {
return impl_->write_dcr(addr, value);
}

View File

@@ -1,22 +1,39 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace vortex {
class ArchDef;
class Arch;
class RAM;
class ProcessorImpl;
class Processor {
public:
Processor(const ArchDef& arch);
Processor(const Arch& arch);
~Processor();
void attach_ram(RAM* mem);
int run();
int run(bool riscv_test);
void write_dcr(uint32_t addr, uint32_t value);
private:
class Impl;
Impl* impl_;
ProcessorImpl* impl_;
};
}
}

66
sim/simx/processor_impl.h Normal file
View File

@@ -0,0 +1,66 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "mem_sim.h"
#include "cache_sim.h"
#include "constants.h"
#include "dcrs.h"
#include "cluster.h"
namespace vortex {
// Implementation object behind Processor: owns the clusters, the DCR block,
// the memory simulator and the L3 cache, and keeps memory performance counters.
class ProcessorImpl {
public:
  // Snapshot of performance counters returned by perf_stats().
  struct PerfStats {
    uint64_t mem_reads = 0;
    uint64_t mem_writes = 0;
    uint64_t mem_latency = 0;
    CacheSim::PerfStats l3cache;
    Cluster::PerfStats clusters;

    PerfStats() {}
  };

  ProcessorImpl(const Arch& arch);
  ~ProcessorImpl();

  // Attaches the RAM instance to every cluster.
  void attach_ram(RAM* mem);

  // Runs the simulation and returns the exit code.
  int run(bool riscv_test);

  // Writes a device configuration register.
  void write_dcr(uint32_t addr, uint32_t value);

  ProcessorImpl::PerfStats perf_stats() const;

private:
  // Zeroes the perf_mem_* counters.
  void reset();

  const Arch& arch_;
  std::vector<std::shared_ptr<Cluster>> clusters_;
  DCRS dcrs_;
  MemSim::Ptr memsim_;
  CacheSim::Ptr l3cache_;

  // Memory performance counters; they are set to zero by reset().
  uint64_t perf_mem_reads_;
  uint64_t perf_mem_writes_;
  uint64_t perf_mem_latency_;
  uint64_t perf_mem_pending_reads_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pipeline.h"
@@ -6,20 +19,15 @@
namespace vortex {
class Scoreboard {
private:
public:
struct reg_use_t {
RegType type;
uint32_t reg;
uint64_t owner;
};
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
public:
Scoreboard(const ArchDef &arch)
Scoreboard(const Arch &arch)
: in_use_iregs_(arch.num_warps())
, in_use_fregs_(arch.num_warps())
, in_use_vregs_(arch.num_warps())
@@ -84,8 +92,7 @@ public:
}
void reserve(pipeline_trace_t* state) {
if (!state->wb)
return;
assert(state->wb);
switch (state->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).set(state->rdest);
@@ -105,8 +112,7 @@ public:
}
void release(pipeline_trace_t* state) {
if (!state->wb)
return;
assert(state->wb);
switch (state->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).reset(state->rdest);
@@ -123,6 +129,13 @@ public:
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
owners_.erase(tag);
}
private:
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
};
}

138
sim/simx/shared_mem.cpp Normal file
View File

@@ -0,0 +1,138 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "shared_mem.h"
#include "core.h"
#include <bitmanip.h>
#include <vector>
#include "types.h"
using namespace vortex;
// Implementation of SharedMem: a RAM-backed store plus a per-tick bank
// arbiter that services the owning SimObject's request ports.
class SharedMem::Impl {
protected:
  SharedMem* simobject_;          // owner; provides the Inputs/Outputs ports
  Config     config_;
  RAM        ram_;                // backing storage
  uint32_t   bank_sel_addr_start_; // lowest address bit used for bank selection
  uint32_t   bank_sel_addr_end_;   // highest address bit used for bank selection
  PerfStats  perf_stats_;

  // Maps an address to an offset into ram_ by keeping only its low
  // log2ceil(capacity / line_size) bits.
  // NOTE(review): the kept width is derived from the number of lines, not
  // from the byte capacity, and the result passes through a 32-bit local --
  // confirm this is the intended mapping.
  uint64_t to_local_addr(uint64_t addr) {
    uint32_t total_lines = config_.capacity / config_.line_size;
    uint32_t line_bits = log2ceil(total_lines);
    uint32_t offset = bit_getw(addr, 0, line_bits-1);
    return offset;
  }

public:
  Impl(SharedMem* simobject, const Config& config)
    : simobject_(simobject)
    , config_(config)
    , ram_(config.capacity, config.capacity)
    , bank_sel_addr_start_(0)
    // When log2ceil(num_banks) is 0 this unsigned expression wraps around
    // instead of going negative, so tick() must not rely on comparing
    // start/end to detect the "no bank-select bits" case.
    , bank_sel_addr_end_(0 + log2ceil(config.num_banks)-1)
  {}

  virtual ~Impl() {}

  // Clears the performance counters.
  void reset() {
    perf_stats_ = PerfStats();
  }

  // Reads `size` bytes into `data` from the translated address.
  void read(void* data, uint64_t addr, uint32_t size) {
    auto s_addr = to_local_addr(addr);
    DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
    ram_.read(data, s_addr, size);
  }

  // Writes `size` bytes from `data` to the translated address.
  void write(const void* data, uint64_t addr, uint32_t size) {
    auto s_addr = to_local_addr(addr);
    DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
    ram_.write(data, s_addr, size);
  }

  // Services the request ports once. Each bank accepts at most one request
  // per call; a request that hits an already-claimed bank stays queued and is
  // counted as a bank stall.
  void tick() {
    std::vector<bool> in_used_banks(config_.num_banks);
    for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
      auto& core_req_port = simobject_->Inputs.at(req_id);
      if (core_req_port.empty())
        continue;

      auto& core_req = core_req_port.front();

      // Bank selection. bank_sel_addr_start_ is 0 and both bounds are
      // unsigned, so "start <= end" can never be false; with a single bank
      // the end bound has wrapped and must not be passed to bit_getw().
      // Test the bank count directly instead.
      uint32_t bank_id = 0;
      if (config_.num_banks > 1) {
        bank_id = static_cast<uint32_t>(
          bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_));
      }

      // bank conflict check
      if (in_used_banks.at(bank_id)) {
        ++perf_stats_.bank_stalls;
        continue;
      }

      in_used_banks.at(bank_id) = true;

      // Reads always get a response; writes only when write_reponse is set.
      if (!core_req.write || config_.write_reponse) {
        // send response
        MemRsp core_rsp{core_req.tag, core_req.cid};
        simobject_->Outputs.at(req_id).send(core_rsp, 1);
      }

      // update perf counters
      perf_stats_.reads += !core_req.write;
      perf_stats_.writes += core_req.write;

      // remove input
      core_req_port.pop();
    }
  }

  const PerfStats& perf_stats() const {
    return perf_stats_;
  }
};
///////////////////////////////////////////////////////////////////////////////
// Creates config.num_reqs input and output ports and allocates the
// implementation object (released in the destructor).
SharedMem::SharedMem(const SimContext& ctx, const char* name, const Config& config)
: SimObject<SharedMem>(ctx, name)
, Inputs(config.num_reqs, this)
, Outputs(config.num_reqs, this)
, impl_(new Impl(this, config))
{}
// Releases the implementation object allocated by the constructor.
SharedMem::~SharedMem() {
delete impl_;
}
// Forwards to Impl::reset().
void SharedMem::reset() {
impl_->reset();
}
// Reads `size` bytes at `addr` into `data`; forwards to Impl::read().
void SharedMem::read(void* data, uint64_t addr, uint32_t size) {
impl_->read(data, addr, size);
}
// Writes `size` bytes from `data` at `addr`; forwards to Impl::write().
void SharedMem::write(const void* data, uint64_t addr, uint32_t size) {
impl_->write(data, addr, size);
}
// Forwards to Impl::tick().
void SharedMem::tick() {
impl_->tick();
}
// Returns a reference to the counters held by the implementation object.
const SharedMem::PerfStats& SharedMem::perf_stats() const {
return impl_->perf_stats();
}

72
sim/simx/shared_mem.h Normal file
View File

@@ -0,0 +1,72 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "types.h"
namespace vortex {
// Shared-memory simulation object. Exposes one request port and one response
// port per requester and delegates all behavior to a private Impl object.
class SharedMem : public SimObject<SharedMem> {
public:
  // Construction parameters.
  struct Config {
    uint32_t capacity;
    uint32_t line_size;
    uint32_t num_reqs;      // number of request/response port pairs
    uint32_t num_banks;
    bool     write_reponse; // when set, writes also produce a response
  };

  // Access counters; all start at zero.
  struct PerfStats {
    uint64_t reads = 0;
    uint64_t writes = 0;
    uint64_t bank_stalls = 0;

    PerfStats() {}

    // Adds rhs's counters to this object's, field by field.
    PerfStats& operator+=(const PerfStats& rhs) {
      reads += rhs.reads;
      writes += rhs.writes;
      bank_stalls += rhs.bank_stalls;
      return *this;
    }
  };

  std::vector<SimPort<MemReq>> Inputs;
  std::vector<SimPort<MemRsp>> Outputs;

  SharedMem(const SimContext& ctx, const char* name, const Config& config);
  virtual ~SharedMem();

  void reset();

  void read(void* data, uint64_t addr, uint32_t size);

  void write(const void* data, uint64_t addr, uint32_t size);

  void tick();

  const PerfStats& perf_stats() const;

protected:
  class Impl;
  Impl* impl_;
};
}

View File

@@ -1,96 +0,0 @@
#pragma once
#include <simobject.h>
#include <bitmanip.h>
#include <vector>
#include "types.h"
namespace vortex {
class Core;
// Header-only shared-memory model: arbitrates requests from num_reqs input
// ports across num_banks banks, one request per bank per tick.
class SharedMem : public SimObject<SharedMem> {
public:
// Construction parameters.
struct Config {
uint32_t num_reqs;
uint32_t num_banks;
uint32_t bank_offset;
uint32_t latency;
bool write_reponse;
};
// Access counters; all start at zero. Nothing in this class increments
// bank_stalls.
struct PerfStats {
uint64_t reads;
uint64_t writes;
uint64_t bank_stalls;
PerfStats()
: reads(0)
, writes(0)
, bank_stalls(0)
{}
};
std::vector<SimPort<MemReq>> Inputs;
std::vector<SimPort<MemRsp>> Outputs;
// Creates num_reqs input/output ports and derives the address bit range
// [bank_offset, bank_offset + log2up(num_banks) - 1] used to select a bank.
// NOTE(review): if log2up(num_banks) is 0 the upper bound ends up below the
// lower bound (or wraps when bank_offset is 0) -- confirm single-bank
// configurations are never used with this class.
SharedMem(const SimContext& ctx, const char* name, const Config& config)
: SimObject<SharedMem>(ctx, name)
, Inputs(config.num_reqs, this)
, Outputs(config.num_reqs, this)
, config_(config)
, bank_sel_addr_start_(config.bank_offset)
, bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1)
{}
virtual ~SharedMem() {}
// Clears the performance counters.
void reset() {
perf_stats_ = PerfStats();
}
// Services the input ports once. A request whose bank was already claimed in
// this call is left at the front of its port.
void tick() {
std::vector<bool> in_used_banks(config_.num_banks);
for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
auto& core_req_port = this->Inputs.at(req_id);
if (core_req_port.empty())
continue;
auto& core_req = core_req_port.front();
// select the bank from the configured address bit range
uint32_t bank_id = (uint32_t)bit_getw(
core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
// bank conflict check
if (in_used_banks.at(bank_id))
continue;
in_used_banks.at(bank_id) = true;
// reads always get a response; writes only when write_reponse is set
if (!core_req.write || config_.write_reponse) {
// send response
MemRsp core_rsp{core_req.tag, core_req.core_id};
this->Outputs.at(req_id).send(core_rsp, 1);
}
// update perf counters
perf_stats_.reads += !core_req.write;
perf_stats_.writes += core_req.write;
// remove input
core_req_port.pop();
}
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
protected:
Config config_;
uint32_t bank_sel_addr_start_;
uint32_t bank_sel_addr_end_;
PerfStats perf_stats_;
};
}

View File

@@ -1,100 +0,0 @@
#include "tex_unit.h"
#include "core.h"
#include <texturing.h>
#include <VX_config.h>
using namespace vortex;
using namespace cocogfx;
// Texture filtering modes. TexUnit::read() casts the TEX_STATE_FILTER state
// value to this enum, so the enumerator order defines the encoding.
// TexUnit::read() handles Point and Bilinear; any other value aborts.
enum class FilterMode {
Point,
Bilinear,
Trilinear,
};
// Stores the core pointer used for texel reads; states_ is not initialized
// here (clear() zeroes it).
TexUnit::TexUnit(Core* core) : core_(core) {}
// Nothing to release; core_ is not deleted.
TexUnit::~TexUnit() {}
void TexUnit::clear() {
for (auto& state : states_) {
state = 0;
}
}
// Returns the value of state entry `state`; at() bounds-checks the index.
uint32_t TexUnit::get_state(uint32_t state) {
return states_.at(state);
}
// Stores `value` in state entry `state`; at() bounds-checks the index.
void TexUnit::set_state(uint32_t state, uint32_t value) {
states_.at(state) = value;
}
// Samples the texture described by the current state entries and returns the
// filtered color.
//   u, v      - texture coordinates, converted to Fixed<TEX_FXD_FRAC>
//   lod       - mip level; selects the mip offset state and is subtracted
//               from the log2 width/height (clamped at 0)
//   mem_addrs - must be non-null; every texel address read is appended to it
//               together with the texel stride
// Bilinear filtering reads four texels, Point filtering reads one. Any other
// filter value calls std::abort().
uint32_t TexUnit::read(int32_t u,
int32_t v,
int32_t lod,
std::vector<mem_addr_size_t>* mem_addrs) {
//--
auto xu = Fixed<TEX_FXD_FRAC>::make(u);
auto xv = Fixed<TEX_FXD_FRAC>::make(v);
// base address of the selected mip level
uint32_t base_addr = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
uint32_t log_width = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
auto format = (TexFormat)states_.at(TEX_STATE_FORMAT);
auto filter = (FilterMode)states_.at(TEX_STATE_FILTER);
auto wrapu = (WrapMode)states_.at(TEX_STATE_WRAPU);
auto wrapv = (WrapMode)states_.at(TEX_STATE_WRAPV);
// stride is used both as the texel offset multiplier and as the read size
auto stride = Stride(format);
switch (filter) {
case FilterMode::Bilinear: {
// addressing
uint32_t offset00, offset01, offset10, offset11;
uint32_t alpha, beta;
TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
&offset00, &offset01, &offset10, &offset11, &alpha, &beta);
uint32_t addr00 = base_addr + offset00 * stride;
uint32_t addr01 = base_addr + offset01 * stride;
uint32_t addr10 = base_addr + offset10 * stride;
uint32_t addr11 = base_addr + offset11 * stride;
// memory lookup
uint32_t texel00(0), texel01(0), texel10(0), texel11(0);
core_->dcache_read(&texel00, addr00, stride);
core_->dcache_read(&texel01, addr01, stride);
core_->dcache_read(&texel10, addr10, stride);
core_->dcache_read(&texel11, addr11, stride);
// record the accessed addresses for the caller
mem_addrs->push_back({addr00, stride});
mem_addrs->push_back({addr01, stride});
mem_addrs->push_back({addr10, stride});
mem_addrs->push_back({addr11, stride});
// filtering
auto color = TexFilterLinear(
format, texel00, texel01, texel10, texel11, alpha, beta);
return color;
}
case FilterMode::Point: {
// addressing
uint32_t offset;
TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
uint32_t addr = base_addr + offset * stride;
// memory lookup
uint32_t texel(0);
core_->dcache_read(&texel, addr, stride);
// record the accessed address for the caller
mem_addrs->push_back({addr, stride});
// filtering
auto color = TexFilterPoint(format, texel);
return color;
}
default:
// unsupported filter mode
std::abort();
return 0;
}
}

View File

@@ -1,28 +0,0 @@
#pragma once
#include "types.h"
namespace vortex {
// Forward declaration; TexUnit only stores a pointer to the core.
class Core;
// Texture unit: keeps a table of texture state values and samples texels
// through the core's data cache (see read()).
class TexUnit {
public:
// Stores `core` for later data-cache reads.
TexUnit(Core* core);
~TexUnit();
// Sets every texture state entry to zero.
void clear();
// Returns the value of texture state `state` (bounds-checked lookup).
uint32_t get_state(uint32_t state);
// Writes `value` into texture state `state` (bounds-checked lookup).
void set_state(uint32_t state, uint32_t value);
// Samples the texture at (u, v) for mip level `lod` and returns the filtered
// color; every texel fetch is appended to `mem_addrs` as {address, size}.
// Only point and bilinear filtering are handled; other modes abort.
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<mem_addr_size_t>* mem_addrs);
private:
// Texture state values, indexed by state id.
std::array<uint32_t, NUM_TEX_STATES> states_;
// Core used for data-cache reads; not deleted by this class.
Core* core_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
@@ -5,31 +18,42 @@
#include <queue>
#include <unordered_map>
#include <util.h>
#include <stringutil.h>
#include <VX_config.h>
#include <simobject.h>
#include "uuid_gen.h"
#include "debug.h"
namespace vortex {
typedef uint8_t Byte;
#if XLEN == 32
#if (XLEN == 32)
typedef uint32_t Word;
typedef int32_t WordI;
typedef uint64_t DWord;
typedef int64_t DWordI;
#elif XLEN == 64
typedef uint32_t WordF;
#elif (XLEN == 64)
typedef uint64_t Word;
typedef int64_t WordI;
typedef __uint128_t DWord;
typedef __int128_t DWordI;
typedef uint64_t WordF;
#else
#error unsupported XLEN
#endif
typedef uint64_t FWord;
#define MAX_NUM_CORES 1024
#define MAX_NUM_THREADS 32
#define MAX_NUM_WARPS 32
#define MAX_NUM_REGS 32
typedef std::bitset<32> RegMask;
typedef std::bitset<32> ThreadMask;
typedef std::bitset<32> WarpMask;
typedef std::bitset<MAX_NUM_CORES> CoreMask;
typedef std::bitset<MAX_NUM_REGS> RegMask;
typedef std::bitset<MAX_NUM_THREADS> ThreadMask;
typedef std::bitset<MAX_NUM_WARPS> WarpMask;
typedef std::unordered_map<uint32_t, uint32_t> CSRs;
///////////////////////////////////////////////////////////////////////////////
@@ -40,8 +64,8 @@ enum class RegType {
Vector
};
inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
switch (clss) {
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
switch (type) {
case RegType::None: break;
case RegType::Integer: os << "x"; break;
case RegType::Float: os << "f"; break;
@@ -53,23 +77,19 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
///////////////////////////////////////////////////////////////////////////////
enum class ExeType {
NOP,
ALU,
LSU,
CSR,
FPU,
GPU,
SFU,
MAX,
};
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
switch (type) {
case ExeType::NOP: os << "NOP"; break;
case ExeType::ALU: os << "ALU"; break;
case ExeType::LSU: os << "LSU"; break;
case ExeType::CSR: os << "CSR"; break;
case ExeType::FPU: os << "FPU"; break;
case ExeType::GPU: os << "GPU"; break;
case ExeType::SFU: os << "SFU"; break;
case ExeType::MAX: break;
}
return os;
@@ -82,8 +102,7 @@ enum class AluType {
BRANCH,
SYSCALL,
IMUL,
IDIV,
CMOV,
IDIV
};
inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
@@ -93,7 +112,6 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
case AluType::SYSCALL: os << "SYSCALL"; break;
case AluType::IMUL: os << "IMUL"; break;
case AluType::IDIV: os << "IDIV"; break;
case AluType::CMOV: os << "CMOV"; break;
}
return os;
}
@@ -103,16 +121,14 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
enum class LsuType {
LOAD,
STORE,
FENCE,
PREFETCH,
FENCE
};
inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
switch (type) {
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
case LsuType::PREFETCH: os << "PREFETCH"; break;
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
}
return os;
}
@@ -141,21 +157,6 @@ struct mem_addr_size_t {
uint32_t size;
};
// Classifies a memory access by address range:
//  - Shared when shared memory is enabled and `addr` lies in the window
//    [SMEM_BASE_ADDR - SMEM_SIZE, SMEM_BASE_ADDR); the access must not
//    run past SMEM_BASE_ADDR (checked by assert),
//  - IO when `addr` is at or above IO_BASE_ADDR,
//  - Global otherwise.
inline AddrType get_addr_type(Word addr, uint32_t size) {
  __unused (size);
  if (SM_ENABLE
   && addr >= (SMEM_BASE_ADDR - SMEM_SIZE)
   && addr < SMEM_BASE_ADDR) {
    assert((addr + size) <= SMEM_BASE_ADDR);
    return AddrType::Shared;
  }
  return (addr >= IO_BASE_ADDR) ? AddrType::IO : AddrType::Global;
}
///////////////////////////////////////////////////////////////////////////////
enum class FpuType {
@@ -179,23 +180,31 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
///////////////////////////////////////////////////////////////////////////////
enum class GpuType {
enum class SfuType {
TMC,
WSPAWN,
SPLIT,
JOIN,
BAR,
TEX,
PRED,
CSRRW,
CSRRS,
CSRRC,
CMOV
};
inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
switch (type) {
case GpuType::TMC: os << "TMC"; break;
case GpuType::WSPAWN: os << "WSPAWN"; break;
case GpuType::SPLIT: os << "SPLIT"; break;
case GpuType::JOIN: os << "JOIN"; break;
case GpuType::BAR: os << "BAR"; break;
case GpuType::TEX: os << "TEX"; break;
case SfuType::TMC: os << "TMC"; break;
case SfuType::WSPAWN: os << "WSPAWN"; break;
case SfuType::SPLIT: os << "SPLIT"; break;
case SfuType::JOIN: os << "JOIN"; break;
case SfuType::BAR: os << "BAR"; break;
case SfuType::PRED: os << "PRED"; break;
case SfuType::CSRRW: os << "CSRRW"; break;
case SfuType::CSRRS: os << "CSRRS"; break;
case SfuType::CSRRC: os << "CSRRC"; break;
case SfuType::CMOV: os << "CMOV"; break;
}
return os;
}
@@ -218,31 +227,32 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
///////////////////////////////////////////////////////////////////////////////
struct MemReq {
uint64_t addr;
bool write;
bool non_cacheable;
uint32_t tag;
uint32_t core_id;
uint64_t uuid;
uint64_t addr;
bool write;
AddrType type;
uint32_t tag;
uint32_t cid;
uint64_t uuid;
MemReq(uint64_t _addr = 0,
bool _write = false,
bool _non_cacheable = false,
uint64_t _tag = 0,
uint32_t _core_id = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, non_cacheable(_non_cacheable)
, tag(_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
MemReq(uint64_t _addr = 0,
bool _write = false,
AddrType _type = AddrType::Global,
uint64_t _tag = 0,
uint32_t _cid = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, type(_type)
, tag(_tag)
, cid(_cid)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
os << "mem-" << (req.write ? "wr" : "rd") << ": ";
os << "addr=" << std::hex << req.addr << std::dec << ", tag=" << req.tag << ", core_id=" << req.core_id;
os << "addr=0x" << std::hex << req.addr << ", type=" << req.type;
os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid;
os << " (#" << std::dec << req.uuid << ")";
return os;
}
@@ -250,18 +260,19 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
///////////////////////////////////////////////////////////////////////////////
struct MemRsp {
uint64_t tag;
uint32_t core_id;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
: tag (_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
uint64_t tag;
uint32_t cid;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0)
: tag (_tag)
, cid(_cid)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id;
os << "mem-rsp: tag=" << rsp.tag << ", cid=" << rsp.cid;
os << " (#" << std::dec << rsp.uuid << ")";
return os;
}
@@ -270,10 +281,6 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
template <typename T>
class HashTable {
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t size_;
public:
HashTable(uint32_t capacity)
: entries_(capacity)
@@ -336,92 +343,180 @@ public:
}
size_ = 0;
}
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t size_;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
template <typename Req, typename Rsp>
class Switch : public SimObject<Switch<Req, Rsp>> {
private:
ArbiterType type_;
uint32_t delay_;
uint32_t cursor_;
uint32_t tag_shift_;
public:
std::vector<SimPort<Req>> ReqIn;
std::vector<SimPort<Rsp>> RspIn;
std::vector<SimPort<Req>> ReqOut;
std::vector<SimPort<Rsp>> RspOut;
Switch(
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs,
uint32_t num_inputs = 1,
uint32_t num_outputs = 1,
uint32_t delay = 1
)
: SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)
: SimObject<Switch<Req, Rsp>>(ctx, name)
, ReqIn(num_inputs, this)
, RspIn(num_inputs, this)
, ReqOut(num_outputs, this)
, RspOut(num_outputs, this)
, type_(type)
, delay_(delay)
, cursor_(0)
, tag_shift_(log2ceil(num_inputs))
, ReqIn(num_inputs, this)
, ReqOut(this)
, RspIn(this)
, RspOut(num_inputs, this)
, cursors_(num_outputs, 0)
, lg_num_reqs_(log2ceil(num_inputs / num_outputs))
{
assert(delay_ != 0);
assert(num_inputs <= MaxInputs);
if (num_inputs == 1) {
// bypass
ReqIn.at(0).bind(&ReqOut);
RspIn.bind(&RspOut.at(0));
assert(delay != 0);
assert(num_inputs <= 32);
assert(num_outputs <= 32);
assert(num_inputs >= num_outputs);
if (num_inputs == num_outputs) {
// bypass mode
for (uint32_t i = 0; i < num_inputs; ++i) {
ReqIn.at(i).bind(&ReqOut.at(i));
RspOut.at(i).bind(&RspIn.at(i));
}
}
}
void reset() {
cursor_ = 0;
for (auto& cursor : cursors_) {
cursor = 0;
}
}
void tick() {
if (ReqIn.size() == 1)
void tick() {
uint32_t I = ReqIn.size();
uint32_t O = ReqOut.size();
uint32_t R = 1 << lg_num_reqs_;
// skip bypass mode
if (I == O)
return;
// process incoming requests
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
uint32_t j = (cursor_ + i) % n;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (tag_shift_) {
req.tag = (req.tag << tag_shift_) | j;
// process incoming requests
for (uint32_t o = 0; o < O; ++o) {
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (cursors_.at(o) + r) & (R-1);
uint32_t j = o * R + i;
if (j >= I)
continue;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (lg_num_reqs_ != 0) {
req.tag = (req.tag << lg_num_reqs_) | i;
}
DT(4, this->name() << "-" << req);
ReqOut.at(o).send(req, delay_);
req_in.pop();
this->update_cursor(o, i);
break;
}
ReqOut.send(req, delay_);
req_in.pop();
this->update_cursor(j);
break;
}
}
// process incoming responses
if (!RspIn.empty()) {
auto& rsp = RspIn.front();
uint32_t port_id = 0;
if (tag_shift_) {
port_id = rsp.tag & ((1 << tag_shift_)-1);
rsp.tag >>= tag_shift_;
}
RspOut.at(port_id).send(rsp, 1);
RspIn.pop();
// process incoming responses
if (!RspOut.at(o).empty()) {
auto& rsp = RspOut.at(o).front();
uint32_t i = 0;
if (lg_num_reqs_ != 0) {
i = rsp.tag & (R-1);
rsp.tag >>= lg_num_reqs_;
}
DT(4, this->name() << "-" << rsp);
uint32_t j = o * R + i;
RspIn.at(j).send(rsp, 1);
RspOut.at(o).pop();
}
}
}
void update_cursor(uint32_t grant) {
void update_cursor(uint32_t index, uint32_t grant) {
if (type_ == ArbiterType::RoundRobin) {
cursor_ = grant + 1;
cursors_.at(index) = grant + 1;
}
}
std::vector<SimPort<Req>> ReqIn;
SimPort<Req> ReqOut;
SimPort<Rsp> RspIn;
std::vector<SimPort<Rsp>> RspOut;
private:
ArbiterType type_;
uint32_t delay_;
std::vector<uint32_t> cursors_;
uint32_t lg_num_reqs_;
};
///////////////////////////////////////////////////////////////////////////////
// Request/response demultiplexer with one input side and two output sides.
// A request whose type is AddrType::Shared leaves through ReqSm; every other
// request leaves through ReqDc. Responses arriving on RspSm or RspDc are
// forwarded to RspIn.
class SMemDemux : public SimObject<SMemDemux> {
public:
  SimPort<MemReq> ReqIn;
  SimPort<MemRsp> RspIn;

  SimPort<MemReq> ReqSm;
  SimPort<MemRsp> RspSm;

  SimPort<MemReq> ReqDc;
  SimPort<MemRsp> RspDc;

  // `delay` is the latency applied to forwarded requests.
  SMemDemux(const SimContext& ctx, const char* name, uint32_t delay = 1)
    : SimObject<SMemDemux>(ctx, name)
    , ReqIn(this)
    , RspIn(this)
    , ReqSm(this)
    , RspSm(this)
    , ReqDc(this)
    , RspDc(this)
    , delay_(delay)
  {}

  void reset() {}

  void tick() {
    // process incoming requests: at most one per tick
    if (!ReqIn.empty()) {
      auto& request = ReqIn.front();
      DT(4, this->name() << "-" << request);
      auto& target = (request.type == AddrType::Shared) ? ReqSm : ReqDc;
      target.send(request, delay_);
      ReqIn.pop();
    }

    // process incoming responses: shared-memory side first, then the other
    this->forward_response(RspSm);
    this->forward_response(RspDc);
  }

private:
  // Moves one pending response (if any) from `source` to RspIn.
  void forward_response(SimPort<MemRsp>& source) {
    if (source.empty())
      return;
    auto& response = source.front();
    DT(4, this->name() << "-" << response);
    RspIn.send(response, 1);
    source.pop();
  }

  uint32_t delay_;
};
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
@@ -10,21 +23,25 @@
using namespace vortex;
Warp::Warp(Core *core, uint32_t id)
: id_(id)
Warp::Warp(Core *core, uint32_t warp_id)
: warp_id_(warp_id)
, arch_(core->arch())
, core_(core)
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<FWord>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<uint64_t>(core->arch().num_regs()))
, vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
{
this->clear();
this->reset();
}
void Warp::clear() {
active_ = false;
PC_ = STARTUP_ADDR;
void Warp::reset() {
PC_ = core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
#if (XLEN == 64)
PC_ = (uint64_t(core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | PC_;
#endif
tmask_.reset();
for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i) {
issued_instrs_ = 0;
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i) {
for (auto& reg : ireg_file_.at(i)) {
reg = 0;
}
@@ -35,31 +52,44 @@ void Warp::clear() {
reg = 0;
}
}
uui_gen_.reset();
}
void Warp::eval(pipeline_trace_t *trace) {
pipeline_trace_t* Warp::eval() {
assert(tmask_.any());
DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i)
DPN(2, tmask_.test(n-i-1));
DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl);
/* Fetch and decode. */
#ifndef NDEBUG
uint32_t instr_uuid = uui_gen_.get_uuid(PC_);
uint32_t g_wid = core_->id() * arch_.num_warps() + warp_id_;
uint32_t instr_id = instr_uuid & 0xffff;
uint32_t instr_ref = instr_uuid >> 16;
uint64_t uuid = (uint64_t(instr_ref) << 32) | (g_wid << 16) | instr_id;
#else
uint64_t uuid = 0;
#endif
DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
DPN(1, tmask_.test(i));
DPN(1, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << uuid << ")" << std::endl);
// Fetch
uint32_t instr_code = 0;
core_->icache_read(&instr_code, PC_, sizeof(uint32_t));
auto instr = core_->decoder().decode(instr_code);
// Decode
auto instr = core_->decoder_.decode(instr_code);
if (!instr) {
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << PC_ << " (#" << std::dec << uuid << ")" << std::endl;
std::abort();
}
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);
// Update trace
// Create trace
auto trace = new pipeline_trace_t(uuid, arch_);
trace->cid = core_->id();
trace->wid = id_;
trace->wid = warp_id_;
trace->PC = PC_;
trace->tmask = tmask_;
trace->rdest = instr->getRDest();
@@ -68,18 +98,20 @@ void Warp::eval(pipeline_trace_t *trace) {
// Execute
this->execute(*instr, trace);
DP(4, "Register state:");
for (uint32_t i = 0; i < core_->arch().num_regs(); ++i) {
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
DP(5, "Register state:");
for (uint32_t i = 0; i < arch_.num_regs(); ++i) {
DPN(5, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
// Integer register file
for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, '|');
DPN(5, '|');
// Floating point register file
for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, std::endl);
DPN(5, std::endl);
}
return trace;
}

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __WARP_H
#define __WARP_H
@@ -7,28 +20,26 @@
namespace vortex {
class Arch;
class Core;
class Instr;
class pipeline_trace_t;
struct DomStackEntry {
DomStackEntry(const ThreadMask &tmask, Word PC)
: tmask(tmask)
, PC(PC)
, fallThrough(false)
, unanimous(false)
, fallthrough(false)
{}
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, PC(0)
, fallThrough(true)
, unanimous(false)
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, fallthrough(true)
{}
ThreadMask tmask;
Word PC;
bool fallThrough;
bool unanimous;
bool fallthrough;
};
struct vtype {
@@ -40,72 +51,58 @@ struct vtype {
class Warp {
public:
Warp(Core *core, uint32_t id);
Warp(Core *core, uint32_t warp_id);
void clear();
bool active() const {
return active_;
}
void suspend() {
active_ = false;
}
void activate() {
active_ = true;
}
std::size_t getActiveThreads() const {
if (active_)
return tmask_.count();
return 0;
}
void reset();
uint32_t id() const {
return id_;
return warp_id_;
}
uint32_t getPC() const {
Word getPC() const {
return PC_;
}
void setPC(uint32_t PC) {
void setPC(Word PC) {
PC_ = PC;
}
void setTmask(size_t index, bool value) {
tmask_.set(index, value);
active_ = tmask_.any();
}
uint32_t getTmask() const {
if (active_)
return tmask_.to_ulong();
return 0;
uint64_t getTmask() const {
return tmask_.to_ulong();
}
uint32_t getIRegValue(uint32_t reg) const {
Word getIRegValue(uint32_t reg) const {
return ireg_file_.at(0).at(reg);
}
void eval(pipeline_trace_t *);
uint64_t incr_instrs() {
return issued_instrs_++;
}
pipeline_trace_t* eval();
private:
void execute(const Instr &instr, pipeline_trace_t *trace);
UUIDGenerator uui_gen_;
uint32_t id_;
uint32_t warp_id_;
const Arch& arch_;
Core *core_;
bool active_;
uint64_t issued_instrs_;
Word PC_;
ThreadMask tmask_;
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<FWord>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
std::stack<DomStackEntry> dom_stack_;
ThreadMask tmask_;
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<uint64_t>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
std::stack<DomStackEntry> ipdom_stack_;
struct vtype vtype_;
uint32_t vl_;

View File

@@ -1,101 +0,0 @@
# Builds the Verilator-based OPAE simulation shim as a shared library
# ($(PROJECT)) placed in $(DESTDIR).

DESTDIR ?= .
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
SCRIPT_DIR = ../../hw/scripts
THIRD_PARTY_DIR = ../../third_party

# C++ flags; the extra "../" prefixes account for paths being resolved from
# a sub-directory during the Verilator build (the output path below uses the
# same convention).
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I.. -I../../../hw -I../../common
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR)

LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator

# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_MEM
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX

DBG_FLAGS += $(DBG_TRACE_FLAGS)

SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += fpga.cpp opae_sim.cpp

FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip

TOP = vortex_afu_shim

VL_FLAGS = --exe --cc $(TOP) --top-module $(TOP)
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO -Wno-EOFNEWLINE
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += $(CONFIGS)

CXXFLAGS += $(CONFIGS)

# Enable Verilator multithreaded simulation
THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())')
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)

# Debugging
ifdef DEBUG
VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS)
CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CXXFLAGS += -O2 -DNDEBUG
endif

# Enable scope analyzer
ifdef SCOPE
VL_FLAGS += -DSCOPE
CXXFLAGS += -DSCOPE
endif

# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CXXFLAGS += -DPERF_ENABLE
endif

# use our OPAE shim
VL_FLAGS += -DNOPAE
CXXFLAGS += -DNOPAE

# ALU backend
VL_FLAGS += -DIMUL_DPI
VL_FLAGS += -DIDIV_DPI

# FPU backend
FPU_CORE ?= FPU_DPI
VL_FLAGS += -D$(FPU_CORE)

PROJECT = libopae-c-vlsim.so

# 'all' and 'clean' are commands, not files
.PHONY: all clean

# Depend on the same path the build rule below produces, so that 'make all'
# also works when DESTDIR is overridden.
all: $(DESTDIR)/$(PROJECT)

# Generate the C header from the Verilog AFU definitions
vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh
	$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h

$(DESTDIR)/$(PROJECT): $(SRCS) vortex_afu.h
	verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)

clean:
	rm -rf obj_dir $(DESTDIR)/$(PROJECT)

View File

@@ -1,30 +0,0 @@
#pragma once
#include <stdint.h>
namespace vortex {
// Forward declaration; not used by the declarations in this header.
class RAM;
// Simulator front-end exposing buffer-management and 64-bit MMIO entry
// points. All state lives behind the private Impl pointer.
// NOTE(review): the method names suggest this mirrors the OPAE C API
// (prepare/release buffer, IO address, MMIO) - confirm against the
// implementation file.
class opae_sim {
public:
opae_sim();
virtual ~opae_sim();
// Requests a buffer of `len` bytes; outputs are returned through `buf_addr`
// and `wsid`. NOTE(review): meaning of the int return value and of `flags`
// is not visible here - confirm.
int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
// Releases the buffer identified by `wsid`.
void release_buffer(uint64_t wsid);
// Writes the IO address associated with `wsid` into `ioaddr`.
void get_io_address(uint64_t wsid, uint64_t *ioaddr);
// Writes a 64-bit `value` to MMIO space `mmio_num` at `offset`.
void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value);
// Reads a 64-bit value from MMIO space `mmio_num` at `offset` into `value`.
void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value);
private:
// Opaque implementation type, defined outside this header.
class Impl;
// NOTE(review): presumably owned by this object (pimpl); if so, copying an
// opae_sim would share the pointer - confirm and consider deleting copies.
Impl* impl_;
};
}

View File

@@ -1,10 +0,0 @@
`verilator_config
lint_off -rule BLKANDNBLK -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule UNOPTFLAT -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule WIDTH -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule UNUSED -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule LITENDIAN -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule IMPORTSTAR -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -rule PINCONNECTEMPTY -file "../../../hw/rtl/fp_cores/fpnew/*"
lint_off -file "../rtl/fp_cores/fpnew/*"