adding sockets support to simx and cache subsystem refactoring

minor update

minor update

minor updates
This commit is contained in:
Blaise Tine
2023-12-20 11:57:44 -08:00
parent 914b680aed
commit c7a81d1493
24 changed files with 541 additions and 388 deletions

View File

@@ -15,7 +15,7 @@ LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
# Debugging
ifdef DEBUG

View File

@@ -28,6 +28,7 @@ private:
uint16_t num_warps_;
uint16_t num_cores_;
uint16_t num_clusters_;
uint16_t socket_size_;
uint16_t vsize_;
uint16_t num_regs_;
uint16_t num_csrs_;
@@ -35,11 +36,12 @@ private:
uint16_t ipdom_size_;
public:
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
: num_threads_(num_threads)
, num_warps_(num_warps)
, num_cores_(num_cores)
, num_clusters_(num_clusters)
, num_clusters_(NUM_CLUSTERS)
, socket_size_(SOCKET_SIZE)
, vsize_(16)
, num_regs_(32)
, num_csrs_(4096)
@@ -82,6 +84,10 @@ public:
uint16_t num_clusters() const {
return num_clusters_;
}
uint16_t socket_size() const {
return socket_size_;
}
};
}

View File

@@ -24,14 +24,38 @@ Cluster::Cluster(const SimContext& ctx,
, mem_req_port(this)
, mem_rsp_port(this)
, cluster_id_(cluster_id)
, cores_(arch.num_cores())
, sockets_(NUM_SOCKETS)
, barriers_(arch.num_barriers(), 0)
, sharedmems_(arch.num_cores())
, processor_(processor)
, cores_per_socket_(arch.socket_size())
{
auto num_cores = arch.num_cores();
char sname[100];
auto sockets_per_cluster = sockets_.size();
// create sockets
snprintf(sname, 100, "cluster%d-icache-arb", cluster_id);
auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id);
auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
uint32_t socket_id = cluster_id * sockets_per_cluster + i;
auto socket = Socket::Create(socket_id, this, arch, dcrs);
socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i));
dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port);
sockets_.at(i) = socket;
}
// Create l2cache
snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
!L2_ENABLED,
@@ -42,7 +66,7 @@ Cluster::Cluster(const SimContext& ctx,
log2ceil(L2_NUM_BANKS), // B
XLEN, // address bits
1, // number of ports
5, // request size
2, // request size
true, // write-through
false, // write response
L2_MSHR_SIZE, // mshr
@@ -52,87 +76,11 @@ Cluster::Cluster(const SimContext& ctx,
l2cache_->MemReqPort.bind(&this->mem_req_port);
this->mem_rsp_port.bind(&l2cache_->MemRspPort);
snprintf(sname, 100, "cluster%d-icaches", cluster_id);
icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
!ICACHE_ENABLED,
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // L
log2ceil(sizeof(uint32_t)), // W
log2ceil(ICACHE_NUM_WAYS),// A
1, // B
XLEN, // address bits
1, // number of ports
1, // number of inputs
true, // write-through
false, // write response
(uint8_t)arch.num_warps(), // mshr
2, // pipeline latency
});
icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0));
l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0));
icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
!DCACHE_ENABLED,
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // L
log2ceil(sizeof(Word)), // W
log2ceil(DCACHE_NUM_WAYS),// A
log2ceil(DCACHE_NUM_BANKS), // B
XLEN, // address bits
1, // number of ports
DCACHE_NUM_BANKS, // number of inputs
true, // write-through
false, // write response
DCACHE_MSHR_SIZE, // mshr
4, // pipeline latency
});
dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
///////////////////////////////////////////////////////////////////////////
// create shared memory blocks
for (uint32_t i = 0; i < num_cores; ++i) {
snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
false
});
}
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
uint32_t core_id = cluster_id * num_cores + i;
cores_.at(i) = Core::Create(core_id,
this,
arch,
dcrs,
sharedmems_.at(i));
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
auto smem_demux = SMemDemux::Create(sname);
cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
smem_demux->ReqDC.bind(&dcaches_->CoreReqPorts.at(i).at(j));
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDC);
smem_demux->ReqSM.bind(&sharedmems_.at(i)->Inputs.at(j));
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSM);
}
}
dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1));
l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0));
}
Cluster::~Cluster() {
@@ -150,14 +98,14 @@ void Cluster::tick() {
}
void Cluster::attach_ram(RAM* ram) {
for (auto core : cores_) {
core->attach_ram(ram);
for (auto& socket : sockets_) {
socket->attach_ram(ram);
}
}
bool Cluster::running() const {
for (auto& core : cores_) {
if (core->running())
for (auto& socket : sockets_) {
if (socket->running())
return true;
}
return false;
@@ -166,9 +114,9 @@ bool Cluster::running() const {
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
bool done = true;
Word exitcode_ = 0;
for (auto& core : cores_) {
for (auto& socket : sockets_) {
Word ec;
if (core->check_exit(&ec, riscv_test)) {
if (socket->check_exit(&ec, riscv_test)) {
exitcode_ |= ec;
} else {
done = false;
@@ -181,36 +129,32 @@ bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
auto& barrier = barriers_.at(bar_id);
uint32_t local_core_id = core_id % cores_.size();
auto sockets_per_cluster = sockets_.size();
auto cores_per_socket = cores_per_socket_;
uint32_t cores_per_cluster = sockets_per_cluster * cores_per_socket;
uint32_t local_core_id = core_id % cores_per_cluster;
barrier.set(local_core_id);
DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
if (barrier.count() == (size_t)count) {
// resume all suspended cores
for (uint32_t i = 0; i < cores_.size(); ++i) {
if (barrier.test(i)) {
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
cores_.at(i)->resume();
for (uint32_t s = 0; s < sockets_per_cluster; ++s) {
for (uint32_t c = 0; c < cores_per_socket; ++c) {
uint32_t i = s * cores_per_socket + c;
if (barrier.test(i)) {
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
sockets_.at(s)->resume(c);
}
}
}
barrier.reset();
}
}
ProcessorImpl* Cluster::processor() const {
return processor_;
}
Cluster::PerfStats Cluster::perf_stats() const {
Cluster::PerfStats perf;
perf.icache = icaches_->perf_stats();
perf.dcache = dcaches_->perf_stats();
perf.l2cache = l2cache_->perf_stats();
for (auto sharedmem : sharedmems_) {
perf.sharedmem += sharedmem->perf_stats();
}
return perf;
}

View File

@@ -17,8 +17,8 @@
#include "dcrs.h"
#include "arch.h"
#include "cache_cluster.h"
#include "shared_mem.h"
#include "core.h"
#include "socket.h"
#include "constants.h"
namespace vortex {
@@ -27,17 +27,11 @@ class ProcessorImpl;
class Cluster : public SimObject<Cluster> {
public:
struct PerfStats {
CacheSim::PerfStats icache;
CacheSim::PerfStats dcache;
SharedMem::PerfStats sharedmem;
CacheSim::PerfStats l2cache;
struct PerfStats {
CacheSim::PerfStats l2cache;
PerfStats& operator+=(const PerfStats& rhs) {
this->icache += rhs.icache;
this->dcache += rhs.dcache;
this->sharedmem += rhs.sharedmem;
this->l2cache += rhs.l2cache;
this->l2cache += rhs.l2cache;
return *this;
}
};
@@ -53,6 +47,14 @@ public:
~Cluster();
uint32_t id() const {
return cluster_id_;
}
ProcessorImpl* processor() const {
return processor_;
}
void reset();
void tick();
@@ -65,22 +67,15 @@ public:
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
ProcessorImpl* processor() const;
Cluster::PerfStats perf_stats() const;
private:
uint32_t cluster_id_;
std::vector<Core::Ptr> cores_;
std::vector<CoreMask> barriers_;
CacheSim::Ptr l2cache_;
CacheCluster::Ptr icaches_;
CacheCluster::Ptr dcaches_;
std::vector<SharedMem::Ptr> sharedmems_;
CacheCluster::Ptr tcaches_;
CacheCluster::Ptr ocaches_;
CacheCluster::Ptr rcaches_;
ProcessorImpl* processor_;
uint32_t cluster_id_;
std::vector<Socket::Ptr> sockets_;
std::vector<CoreMask> barriers_;
CacheSim::Ptr l2cache_;
ProcessorImpl* processor_;
uint32_t cores_per_socket_;
};
} // namespace vortex

View File

@@ -21,18 +21,14 @@
#include "mem.h"
#include "decode.h"
#include "core.h"
#include "socket.h"
#include "debug.h"
#include "constants.h"
#include "processor_impl.h"
using namespace vortex;
Core::Core(const SimContext& ctx,
uint32_t core_id,
Cluster* cluster,
const Arch &arch,
const DCRS &dcrs,
SharedMem::Ptr sharedmem)
Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs)
: SimObject(ctx, "core")
, icache_req_ports(1, this)
, icache_rsp_ports(1, this)
@@ -50,12 +46,12 @@ Core::Core(const SimContext& ctx,
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)ExeType::ExeTypeCount)
, exe_units_((uint32_t)ExeType::ExeTypeCount)
, sharedmem_(sharedmem)
, smem_demuxs_(NUM_LSU_LANES)
, fetch_latch_("fetch")
, decode_latch_("decode")
, pending_icache_(arch_.num_warps())
, csrs_(arch.num_warps())
, cluster_(cluster)
, socket_(socket)
, commit_arbs_(ISSUE_WIDTH)
{
char sname[100];
@@ -72,6 +68,27 @@ Core::Core(const SimContext& ctx,
operands_.at(i) = SimPlatform::instance().create_object<Operand>();
}
// initialize shared memory
shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
false
});
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
snprintf(sname, 100, "smem_demux%d_%d", core_id, i);
auto smem_demux = SMemDemux::Create(sname);
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
smem_demuxs_.at(i) = smem_demux;
}
// initialize dispatchers
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
@@ -241,13 +258,6 @@ void Core::decode() {
stalled_warps_.reset(trace->wid);
}
// update perf counters
uint32_t active_threads = trace->tmask.count();
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::LOAD)
perf_stats_.loads += active_threads;
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE)
perf_stats_.stores += active_threads;
DT(3, "pipeline-decode: " << *trace);
// insert to ibuffer
@@ -394,7 +404,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
if (is_global) {
// global barrier handling
if (barrier.count() == active_warps_.count()) {
cluster_->barrier(bar_idx, count, core_id_);
socket_->barrier(bar_idx, count, core_id_);
barrier.reset();
}
} else {
@@ -431,7 +441,7 @@ AddrType Core::get_addr_type(uint64_t addr) {
void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {
auto type = this->get_addr_type(addr);
if (type == AddrType::Shared) {
sharedmem_->read(data, addr, size);
shared_mem_->read(data, addr, size);
} else {
mmu_.read(data, addr, size, 0);
}
@@ -446,7 +456,7 @@ void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) {
this->writeToStdOut(data, addr, size);
} else {
if (type == AddrType::Shared) {
sharedmem_->write(data, addr, size);
shared_mem_->write(data, addr, size);
} else {
mmu_.write(data, addr, size, 0);
}
@@ -554,16 +564,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
case VX_CSR_MPM_ALU_ST: return perf_stats_.alu_stalls & 0xffffffff;
case VX_CSR_MPM_ALU_ST_H: return perf_stats_.alu_stalls >> 32;
case VX_CSR_MPM_LSU_ST: return perf_stats_.lsu_stalls & 0xffffffff;
case VX_CSR_MPM_LSU_ST_H: return perf_stats_.lsu_stalls >> 32;
case VX_CSR_MPM_FPU_ST: return perf_stats_.fpu_stalls & 0xffffffff;
case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32;
case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff;
case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32;
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff;
case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff;
@@ -572,7 +574,6 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
@@ -586,27 +587,29 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
}
} break;
case VX_DCR_MPM_CLASS_MEM: {
auto proc_perf = cluster_->processor()->perf_stats();
auto proc_perf = socket_->cluster()->processor()->perf_stats();
auto socket_perf = socket_->perf_stats();
auto smem_perf = shared_mem_->perf_stats();
switch (addr) {
case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32;
case VX_CSR_MPM_ICACHE_MSHR_ST: return proc_perf.clusters.icache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return proc_perf.clusters.icache.mshr_stalls >> 32;
case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff;
case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32;
case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff;
case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32;
case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff;
case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32;
case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff;
case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32;
case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32;
case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32;
case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_BANK_ST_H: return proc_perf.clusters.dcache.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return proc_perf.clusters.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff;
case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32;
case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff;
case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32;
case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32;
case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32;
case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
@@ -641,12 +644,12 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff;
case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32;
case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
case VX_CSR_MPM_SMEM_BANK_ST_H: return proc_perf.clusters.sharedmem.bank_stalls >> 32;
case VX_CSR_MPM_SMEM_READS: return smem_perf.reads & 0xffffffff;
case VX_CSR_MPM_SMEM_READS_H: return smem_perf.reads >> 32;
case VX_CSR_MPM_SMEM_WRITES: return smem_perf.writes & 0xffffffff;
case VX_CSR_MPM_SMEM_WRITES_H: return smem_perf.writes >> 32;
case VX_CSR_MPM_SMEM_BANK_ST: return smem_perf.bank_stalls & 0xffffffff;
case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32;
}
} break;
}

View File

@@ -40,7 +40,7 @@
namespace vortex {
class Cluster;
class Socket;
using TraceSwitch = Mux<pipeline_trace_t*>;
@@ -53,10 +53,6 @@ public:
uint64_t sched_stalls;
uint64_t ibuf_stalls;
uint64_t scrb_stalls;
uint64_t alu_stalls;
uint64_t lsu_stalls;
uint64_t fpu_stalls;
uint64_t sfu_stalls;
uint64_t scrb_alu;
uint64_t scrb_fpu;
uint64_t scrb_lsu;
@@ -74,10 +70,6 @@ public:
, sched_stalls(0)
, ibuf_stalls(0)
, scrb_stalls(0)
, alu_stalls(0)
, lsu_stalls(0)
, fpu_stalls(0)
, sfu_stalls(0)
, scrb_alu(0)
, scrb_fpu(0)
, scrb_lsu(0)
@@ -96,12 +88,7 @@ public:
std::vector<SimPort<MemReq>> dcache_req_ports;
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
Core(const SimContext& ctx,
uint32_t core_id,
Cluster* cluster,
const Arch &arch,
const DCRS &dcrs,
SharedMem::Ptr sharedmem);
Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs);
~Core();
@@ -119,6 +106,10 @@ public:
return core_id_;
}
Socket* socket() const {
return socket_;
}
const Arch& arch() const {
return arch_;
}
@@ -181,7 +172,8 @@ private:
std::vector<Operand::Ptr> operands_;
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<ExeUnit::Ptr> exe_units_;
SharedMem::Ptr sharedmem_;
SharedMem::Ptr shared_mem_;
std::vector<SMemDemux::Ptr> smem_demuxs_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;
@@ -201,7 +193,7 @@ private:
PerfStats perf_stats_;
Cluster* cluster_;
Socket* socket_;
std::vector<TraceSwitch::Ptr> commit_arbs_;

View File

@@ -51,8 +51,7 @@ void AluUnit::tick() {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
input.pop();
}
}
@@ -87,8 +86,7 @@ void FpuUnit::tick() {
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
auto time = input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
input.pop();
}
}
@@ -114,7 +112,7 @@ void LsuUnit::tick() {
// handle dcache response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
auto& dcache_rsp_port = core_->smem_demuxs_.at(t)->RspIn;
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
@@ -136,7 +134,7 @@ void LsuUnit::tick() {
// handle shared memory response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
@@ -184,8 +182,7 @@ void LsuUnit::tick() {
fence_lock_ = true;
DT(3, "fence-lock: " << *trace);
// remove input
auto time = input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
input.pop();
break;
}
@@ -213,7 +210,9 @@ void LsuUnit::tick() {
auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
#ifdef LSU_DUP_ENABLE
is_dup = (matches == trace->tmask.count());
#endif
}
uint32_t addr_count;
@@ -229,7 +228,7 @@ void LsuUnit::tick() {
if (!trace->tmask.test(t0 + t))
continue;
auto& dcache_req_port = core_->dcache_req_ports.at(t);
auto& dcache_req_port = core_->smem_demuxs_.at(t)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t);
auto type = core_->get_addr_type(mem_addr.addr);
@@ -241,12 +240,16 @@ void LsuUnit::tick() {
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.send(mem_req, 2);
dcache_req_port.send(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
++pending_loads_;
++core_->perf_stats_.loads;
if (is_write) {
++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
if (is_dup)
break;
}
@@ -254,13 +257,11 @@ void LsuUnit::tick() {
// do not wait on writes
if (is_write) {
pending_rd_reqs_.release(tag);
output.send(trace, 1);
++core_->perf_stats_.stores;
output.send(trace, 1);
}
// remove input
auto time = input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
input.pop();
break; // single block
}
@@ -318,10 +319,7 @@ void SfuUnit::tick() {
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
auto stalls = (SimPlatform::instance().cycles() - time);
core_->perf_stats_.sfu_stalls += stalls;
input.pop();
break; // single block
}

View File

@@ -34,14 +34,13 @@ static void show_usage() {
uint32_t num_threads = NUM_THREADS;
uint32_t num_warps = NUM_WARPS;
uint32_t num_cores = NUM_CORES;
uint32_t num_clusters = NUM_CLUSTERS;
bool showStats = false;;
bool riscv_test = false;
const char* program = nullptr;
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
while ((c = getopt(argc, argv, "t:w:c:rsh?")) != -1) {
switch (c) {
case 't':
num_threads = atoi(optarg);
@@ -51,9 +50,6 @@ static void parse_args(int argc, char **argv) {
break;
case 'c':
num_cores = atoi(optarg);
break;
case 'g':
num_clusters = atoi(optarg);
break;
case 'r':
riscv_test = true;
@@ -88,7 +84,7 @@ int main(int argc, char **argv) {
{
// create processor configuration
Arch arch(num_threads, num_warps, num_cores, num_clusters);
Arch arch(num_threads, num_warps, num_cores);
// create memory module
RAM ram(RAM_PAGE_SIZE);

146
sim/simx/socket.cpp Normal file
View File

@@ -0,0 +1,146 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "socket.h"
#include "cluster.h"
using namespace vortex;
// Builds a socket: one instruction-cache cluster, one data-cache cluster and
// arch.socket_size() cores. The memory side of each cache cluster is bound to
// this socket's public ports; the core side is bound to the cores' cache ports.
Socket::Socket(const SimContext& ctx,
               uint32_t socket_id,
               Cluster* cluster,
               const Arch& arch,
               const DCRS& dcrs)
  : SimObject(ctx, "socket")
  , icache_mem_req_port(this)
  , icache_mem_rsp_port(this)
  , dcache_mem_req_port(this)
  , dcache_mem_rsp_port(this)
  , socket_id_(socket_id)
  , cores_(arch.socket_size())
  , cluster_(cluster)
{
  auto core_count = cores_.size();
  char name_buf[100];

  // ---- instruction caches -------------------------------------------------
  snprintf(name_buf, sizeof(name_buf), "socket%d-icaches", socket_id);
  icaches_ = CacheCluster::Create(name_buf, core_count, NUM_ICACHES, 1, CacheSim::Config{
    !ICACHE_ENABLED,
    log2ceil(ICACHE_SIZE),      // C: log2 of ICACHE_SIZE
    log2ceil(L1_LINE_SIZE),     // L: log2 of L1_LINE_SIZE
    log2ceil(sizeof(uint32_t)), // W: log2 of the 32-bit word size
    log2ceil(ICACHE_NUM_WAYS),  // A: log2 of ICACHE_NUM_WAYS
    1,                          // B
    XLEN,                       // address bits
    1,                          // number of ports
    1,                          // number of inputs
    true,                       // write-through
    false,                      // write response
    (uint8_t)arch.num_warps(),  // mshr
    2,                          // pipeline latency
  });

  // Expose the icache cluster's memory side through this socket's ports.
  icaches_->MemReqPort.bind(&icache_mem_req_port);
  icache_mem_rsp_port.bind(&icaches_->MemRspPort);

  // ---- data caches --------------------------------------------------------
  snprintf(name_buf, sizeof(name_buf), "socket%d-dcaches", socket_id);
  dcaches_ = CacheCluster::Create(name_buf, core_count, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
    !DCACHE_ENABLED,
    log2ceil(DCACHE_SIZE),      // C: log2 of DCACHE_SIZE
    log2ceil(L1_LINE_SIZE),     // L: log2 of L1_LINE_SIZE
    log2ceil(sizeof(Word)),     // W: log2 of sizeof(Word)
    log2ceil(DCACHE_NUM_WAYS),  // A: log2 of DCACHE_NUM_WAYS
    log2ceil(DCACHE_NUM_BANKS), // B: log2 of DCACHE_NUM_BANKS
    XLEN,                       // address bits
    1,                          // number of ports
    DCACHE_NUM_BANKS,           // number of inputs
    true,                       // write-through
    false,                      // write response
    DCACHE_MSHR_SIZE,           // mshr
    2,                          // pipeline latency
  });

  // Expose the dcache cluster's memory side through this socket's ports.
  dcaches_->MemReqPort.bind(&dcache_mem_req_port);
  dcache_mem_rsp_port.bind(&dcaches_->MemRspPort);

  // ---- cores --------------------------------------------------------------
  for (uint32_t i = 0; i < core_count; ++i) {
    // Core ids are numbered contiguously across sockets.
    uint32_t core_id = socket_id * core_count + i;

    auto& core = cores_.at(i);
    core = Core::Create(core_id, this, arch, dcrs);

    // One instruction-fetch port pair per core.
    core->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
    icaches_->CoreRspPorts.at(i).at(0).bind(&core->icache_rsp_ports.at(0));

    // One data-cache port pair per LSU lane.
    for (uint32_t lane = 0; lane < NUM_LSU_LANES; ++lane) {
      core->dcache_req_ports.at(lane).bind(&dcaches_->CoreReqPorts.at(i).at(lane));
      dcaches_->CoreRspPorts.at(i).at(lane).bind(&core->dcache_rsp_ports.at(lane));
    }
  }
}
// Destructor: performs no explicit teardown of its own.
Socket::~Socket() {
  // intentionally empty
}
// Reset hook: the socket itself does no work on reset.
void Socket::reset() {
  // intentionally empty
}
// Per-cycle hook: the socket itself does no work on a tick.
void Socket::tick() {
  // intentionally empty
}
// Forwards the RAM instance to every core owned by this socket.
//
// ram: memory object handed unchanged to each core's attach_ram().
void Socket::attach_ram(RAM* ram) {
  // Iterate by reference so the Core::Ptr handle is not copied on every
  // iteration; this also matches the loops in running() and check_exit().
  for (auto& core : cores_) {
    core->attach_ram(ram);
  }
}
bool Socket::running() const {
for (auto& core : cores_) {
if (core->running())
return true;
}
return false;
}
// Polls every core for exit status.
//
// exitcode:   receives the bitwise OR of the exit codes of all cores whose
//             check_exit() returned true (0 if none did).
// riscv_test: passed through unchanged to each core's check_exit().
// Returns true only when every core's check_exit() returned true.
bool Socket::check_exit(Word* exitcode, bool riscv_test) const {
  Word combined_code = 0;
  bool all_exited = true;

  // Every core is queried; there is deliberately no early exit.
  for (const auto& core : cores_) {
    Word core_code;
    const bool exited = core->check_exit(&core_code, riscv_test);
    if (!exited) {
      all_exited = false;
      continue;
    }
    combined_code |= core_code;
  }

  *exitcode = combined_code;
  return all_exited;
}
// Forwards a barrier arrival from one of this socket's cores to the cluster.
//
// bar_id:  barrier identifier, passed through unchanged.
// count:   expected arrival count, passed through unchanged.
// core_id: id of the arriving core. Core::barrier passes the core's own id,
//          which this socket's constructor assigns as
//          socket_id * cores_per_socket + i, i.e. it already contains the
//          socket offset. A socket-local index is accepted as well.
//
// The id is first reduced to a socket-local index so that the socket offset
// is applied exactly once. Adding the offset to an id that already contains
// it would double-count it and, after the cluster's modulo, mark the wrong
// core in the barrier mask for every socket but the first in a cluster.
void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
  const uint32_t cores_per_socket = static_cast<uint32_t>(cores_.size());
  // Guard the modulo; a socket without cores has no caller for this method.
  const uint32_t local_core = (cores_per_socket != 0) ? (core_id % cores_per_socket) : core_id;
  cluster_->barrier(bar_id, count, socket_id_ * cores_per_socket + local_core);
}
// Resumes the core stored at position core_index in this socket.
// cores_.at() is bounds-checked, so an out-of-range index throws.
void Socket::resume(uint32_t core_index) {
  auto& target_core = cores_.at(core_index);
  target_core->resume();
}
// Snapshot of this socket's cache statistics: the instruction-cache cluster
// fills `icache`, the data-cache cluster fills `dcache`.
Socket::PerfStats Socket::perf_stats() const {
  Socket::PerfStats stats;
  stats.icache = icaches_->perf_stats();
  stats.dcache = dcaches_->perf_stats();
  return stats;
}

87
sim/simx/socket.h Normal file
View File

@@ -0,0 +1,87 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "dcrs.h"
#include "arch.h"
#include "cache_cluster.h"
#include "shared_mem.h"
#include "core.h"
#include "constants.h"
namespace vortex {
class Cluster;
// A socket groups a fixed number of cores together with the instruction-cache
// and data-cache clusters they share, and exposes the caches' memory side
// through four public ports.
class Socket : public SimObject<Socket> {
public:
  // Cache statistics reported by a socket.
  struct PerfStats {
    CacheSim::PerfStats icache;
    CacheSim::PerfStats dcache;

    // Accumulates another socket's statistics into this one, field by field.
    PerfStats& operator+=(const PerfStats& rhs) {
      this->icache += rhs.icache;
      this->dcache += rhs.dcache;
      return *this;
    }
  };

  // Memory-side ports of the instruction-cache cluster.
  SimPort<MemReq> icache_mem_req_port;
  SimPort<MemRsp> icache_mem_rsp_port;

  // Memory-side ports of the data-cache cluster.
  SimPort<MemReq> dcache_mem_req_port;
  SimPort<MemRsp> dcache_mem_rsp_port;

  // ctx:       simulation context forwarded to SimObject.
  // socket_id: identifier of this socket; also used to derive core ids.
  // cluster:   owning cluster (not owned by the socket).
  // arch:      architecture parameters; socket_size() gives the core count.
  // dcrs:      passed through to each created core.
  Socket(const SimContext& ctx,
         uint32_t socket_id,
         Cluster* cluster,
         const Arch &arch,
         const DCRS &dcrs);

  ~Socket();

  // Identifier given at construction.
  uint32_t id() const {
    return socket_id_;
  }

  // Cluster this socket belongs to.
  Cluster* cluster() const {
    return cluster_;
  }

  void reset();

  void tick();

  // Forwards `ram` to every core in the socket.
  void attach_ram(RAM* ram);

  // True if any core in the socket is running.
  bool running() const;

  // True when every core has exited; *exitcode receives the OR of their codes.
  bool check_exit(Word* exitcode, bool riscv_test) const;

  // Forwards a barrier arrival from one of this socket's cores to the cluster.
  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);

  // Resumes the core at socket-local position `core_index`.
  // The parameter is an index into this socket's cores, not a global core id,
  // so it is named to match the definition.
  void resume(uint32_t core_index);

  // Current instruction-cache and data-cache statistics for this socket.
  Socket::PerfStats perf_stats() const;

private:
  uint32_t                socket_id_;
  std::vector<Core::Ptr>  cores_;     // sized from arch.socket_size()
  CacheCluster::Ptr       icaches_;
  CacheCluster::Ptr       dcaches_;
  Cluster*                cluster_;   // non-owning back-pointer
};
} // namespace vortex

View File

@@ -70,6 +70,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
case RegType::Integer: os << "x"; break;
case RegType::Float: os << "f"; break;
case RegType::Vector: os << "v"; break;
default: assert(false);
}
return os;
}
@@ -112,6 +113,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
case AluType::SYSCALL: os << "SYSCALL"; break;
case AluType::IMUL: os << "IMUL"; break;
case AluType::IDIV: os << "IDIV"; break;
default: assert(false);
}
return os;
}
@@ -129,6 +131,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
default: assert(false);
}
return os;
}
@@ -146,6 +149,7 @@ inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
case AddrType::Global: os << "Global"; break;
case AddrType::Shared: os << "Shared"; break;
case AddrType::IO: os << "IO"; break;
default: assert(false);
}
return os;
}
@@ -174,6 +178,7 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
case FpuType::FDIV: os << "FDIV"; break;
case FpuType::FSQRT: os << "FSQRT"; break;
case FpuType::FCVT: os << "FCVT"; break;
default: assert(false);
}
return os;
}
@@ -205,6 +210,7 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
case SfuType::CSRRS: os << "CSRRS"; break;
case SfuType::CSRRC: os << "CSRRC"; break;
case SfuType::CMOV: os << "CMOV"; break;
default: assert(false);
}
return os;
}
@@ -220,6 +226,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
switch (type) {
case ArbiterType::Priority: os << "Priority"; break;
case ArbiterType::RoundRobin: os << "RoundRobin"; break;
default: assert(false);
}
return os;
}