adding tracking for SFU stalls

This commit is contained in:
Blaise Tine
2023-12-28 12:12:11 -08:00
parent c7a81d1493
commit e217bc2c23
27 changed files with 1266 additions and 1166 deletions

View File

@@ -18,20 +18,20 @@ using namespace vortex;
Cluster::Cluster(const SimContext& ctx,
uint32_t cluster_id,
ProcessorImpl* processor,
const Arch &arch, const
DCRS &dcrs)
const Arch &arch,
const DCRS &dcrs)
: SimObject(ctx, "cluster")
, mem_req_port(this)
, mem_rsp_port(this)
, cluster_id_(cluster_id)
, sockets_(NUM_SOCKETS)
, barriers_(arch.num_barriers(), 0)
, processor_(processor)
, sockets_(NUM_SOCKETS)
, barriers_(arch.num_barriers(), 0)
, cores_per_socket_(arch.socket_size())
{
char sname[100];
auto sockets_per_cluster = sockets_.size();
uint32_t sockets_per_cluster = sockets_.size();
// create sockets
@@ -43,7 +43,10 @@ Cluster::Cluster(const SimContext& ctx,
for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
uint32_t socket_id = cluster_id * sockets_per_cluster + i;
auto socket = Socket::Create(socket_id, this, arch, dcrs);
auto socket = Socket::Create(socket_id,
this,
arch,
dcrs);
socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
@@ -154,7 +157,7 @@ void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
}
// Returns a snapshot of this cluster's performance counters.
// The only field populated here is `l2cache`, copied from the cluster's
// L2 cache object via its own perf_stats() accessor.
Cluster::PerfStats Cluster::perf_stats() const {
  // A single result object: the earlier text carried two back-to-back
  // bodies, the second of which was unreachable after the first return.
  // The local is named `stats` so it does not shadow this method's name.
  PerfStats stats;
  stats.l2cache = l2cache_->perf_stats();
  return stats;
}

View File

@@ -17,6 +17,7 @@
#include "dcrs.h"
#include "arch.h"
#include "cache_cluster.h"
#include "shared_mem.h"
#include "core.h"
#include "socket.h"
#include "constants.h"
@@ -27,13 +28,8 @@ class ProcessorImpl;
class Cluster : public SimObject<Cluster> {
public:
struct PerfStats {
struct PerfStats {
CacheSim::PerfStats l2cache;
PerfStats& operator+=(const PerfStats& rhs) {
this->l2cache += rhs.l2cache;
return *this;
}
};
SimPort<MemReq> mem_req_port;
@@ -67,15 +63,15 @@ public:
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
Cluster::PerfStats perf_stats() const;
PerfStats perf_stats() const;
private:
uint32_t cluster_id_;
std::vector<Socket::Ptr> sockets_;
std::vector<CoreMask> barriers_;
CacheSim::Ptr l2cache_;
ProcessorImpl* processor_;
uint32_t cores_per_socket_;
uint32_t cluster_id_;
ProcessorImpl* processor_;
std::vector<Socket::Ptr> sockets_;
std::vector<CoreMask> barriers_;
CacheSim::Ptr l2cache_;
uint32_t cores_per_socket_;
};
} // namespace vortex

View File

@@ -28,13 +28,18 @@
using namespace vortex;
Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs)
Core::Core(const SimContext& ctx,
uint32_t core_id,
Socket* socket,
const Arch &arch,
const DCRS &dcrs)
: SimObject(ctx, "core")
, icache_req_ports(1, this)
, icache_rsp_ports(1, this)
, dcache_req_ports(NUM_LSU_LANES, this)
, dcache_rsp_ports(NUM_LSU_LANES, this)
, core_id_(core_id)
, socket_(socket)
, arch_(arch)
, dcrs_(dcrs)
, decoder_(arch)
@@ -42,7 +47,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
, barriers_(arch.num_barriers(), 0)
, fcsrs_(arch.num_warps(), 0)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, scoreboard_(arch_)
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)ExeType::ExeTypeCount)
, exe_units_((uint32_t)ExeType::ExeTypeCount)
@@ -50,8 +55,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
, fetch_latch_("fetch")
, decode_latch_("decode")
, pending_icache_(arch_.num_warps())
, csrs_(arch.num_warps())
, socket_(socket)
, csrs_(arch.num_warps())
, commit_arbs_(ISSUE_WIDTH)
{
char sname[100];
@@ -69,6 +73,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
}
// initialize shared memory
snprintf(sname, 100, "core%d-shared_mem", core_id);
shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE),
sizeof(Word),
@@ -77,17 +82,17 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
false
});
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
snprintf(sname, 100, "smem_demux%d_%d", core_id, i);
auto smem_demux = SMemDemux::Create(sname);
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
snprintf(sname, 100, "core%d-smem_demux%d", core_id, i);
auto smem_demux = SMemDemux::Create(sname);
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
smem_demuxs_.at(i) = smem_demux;
}
smem_demuxs_.at(i) = smem_demux;
}
// initialize dispatchers
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
@@ -103,7 +108,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
// bind commit arbiters
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
snprintf(sname, 100, "commit-arb%d", i);
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
@@ -128,7 +133,7 @@ void Core::reset() {
for (auto& exe_unit : exe_units_) {
exe_unit->reset();
}
for (auto& commit_arb : commit_arbs_) {
commit_arb->reset();
}
@@ -184,7 +189,7 @@ void Core::schedule() {
}
}
if (scheduled_warp == -1) {
++perf_stats_.sched_idles;
++perf_stats_.sched_idle;
return;
}
@@ -229,7 +234,7 @@ void Core::fetch() {
mem_req.uuid = trace->uuid;
icache_req_ports.at(0).send(mem_req, 2);
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop();
fetch_latch_.pop();
++perf_stats_.ifetches;
++pending_ifetches_;
}
@@ -311,7 +316,21 @@ void Core::issue() {
case ExeType::ALU: ++perf_stats_.scrb_alu; break;
case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
case ExeType::LSU: ++perf_stats_.scrb_lsu; break;
case ExeType::SFU: ++perf_stats_.scrb_sfu; break;
case ExeType::SFU: {
++perf_stats_.scrb_sfu;
switch (use.sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::BAR:
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
default: assert(false);
}
} break;
default: assert(false);
}
}
@@ -356,7 +375,6 @@ void Core::commit() {
auto& commit_arb = commit_arbs_.at(i);
if (commit_arb->Outputs.at(0).empty())
continue;
auto trace = commit_arb->Outputs.at(0).front();
// advance to commit stage
@@ -558,8 +576,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
break;
case VX_DCR_MPM_CLASS_CORE: {
switch (addr) {
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idles & 0xffffffff;
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idles >> 32;
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idle & 0xffffffff;
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32;
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
@@ -574,6 +592,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff;
case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32;
case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff;
case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32;
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
@@ -588,6 +610,7 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
} break;
case VX_DCR_MPM_CLASS_MEM: {
auto proc_perf = socket_->cluster()->processor()->perf_stats();
auto cluster_perf = socket_->cluster()->perf_stats();
auto socket_perf = socket_->perf_stats();
auto smem_perf = shared_mem_->perf_stats();
switch (addr) {
@@ -611,18 +634,18 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
case VX_CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff;
case VX_CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32;
case VX_CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32;
case VX_CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32;
case VX_CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32;
case VX_CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32;
case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
@@ -638,7 +661,7 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
@@ -652,6 +675,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32;
}
} break;
default: {
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
std::abort();
} break;
}
} else {
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;

View File

@@ -49,7 +49,7 @@ public:
struct PerfStats {
uint64_t cycles;
uint64_t instrs;
uint64_t sched_idles;
uint64_t sched_idle;
uint64_t sched_stalls;
uint64_t ibuf_stalls;
uint64_t scrb_stalls;
@@ -57,6 +57,8 @@ public:
uint64_t scrb_fpu;
uint64_t scrb_lsu;
uint64_t scrb_sfu;
uint64_t scrb_wctl;
uint64_t scrb_csrs;
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
@@ -66,7 +68,7 @@ public:
PerfStats()
: cycles(0)
, instrs(0)
, sched_idles(0)
, sched_idle(0)
, sched_stalls(0)
, ibuf_stalls(0)
, scrb_stalls(0)
@@ -74,6 +76,8 @@ public:
, scrb_fpu(0)
, scrb_lsu(0)
, scrb_sfu(0)
, scrb_wctl(0)
, scrb_csrs(0)
, ifetches(0)
, loads(0)
, stores(0)
@@ -88,7 +92,11 @@ public:
std::vector<SimPort<MemReq>> dcache_req_ports;
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs);
Core(const SimContext& ctx,
uint32_t core_id,
Socket* socket,
const Arch &arch,
const DCRS &dcrs);
~Core();
@@ -158,6 +166,7 @@ private:
void cout_flush();
uint32_t core_id_;
Socket* socket_;
const Arch& arch_;
const DCRS &dcrs_;
@@ -193,10 +202,9 @@ private:
PerfStats perf_stats_;
Socket* socket_;
std::vector<TraceSwitch::Ptr> commit_arbs_;
uint32_t commit_exe_;
uint32_t ibuffer_idx_;
friend class Warp;

View File

@@ -113,6 +113,7 @@ void ProcessorImpl::reset() {
perf_mem_writes_ = 0;
perf_mem_latency_ = 0;
perf_mem_pending_reads_ = 0;
}
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
@@ -125,9 +126,6 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
perf.mem_writes = perf_mem_writes_;
perf.mem_latency = perf_mem_latency_;
perf.l3cache = l3cache_->perf_stats();
for (auto cluster : clusters_) {
perf.clusters += cluster->perf_stats();
}
return perf;
}

View File

@@ -24,17 +24,10 @@ namespace vortex {
class ProcessorImpl {
public:
struct PerfStats {
CacheSim::PerfStats l3cache;
uint64_t mem_reads;
uint64_t mem_writes;
uint64_t mem_latency;
CacheSim::PerfStats l3cache;
Cluster::PerfStats clusters;
PerfStats()
: mem_reads(0)
, mem_writes(0)
, mem_latency(0)
{}
};
ProcessorImpl(const Arch& arch);
@@ -46,7 +39,7 @@ public:
void write_dcr(uint32_t addr, uint32_t value);
ProcessorImpl::PerfStats perf_stats() const;
PerfStats perf_stats() const;
private:
@@ -55,7 +48,7 @@ private:
const Arch& arch_;
std::vector<std::shared_ptr<Cluster>> clusters_;
DCRS dcrs_;
MemSim::Ptr memsim_;
MemSim::Ptr memsim_;
CacheSim::Ptr l3cache_;
uint64_t perf_mem_reads_;
uint64_t perf_mem_writes_;

View File

@@ -25,6 +25,7 @@ public:
RegType reg_type;
uint32_t reg_id;
ExeType exe_type;
SfuType sfu_type;
uint64_t uuid;
};
@@ -62,7 +63,7 @@ public:
if (used_iregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
auto owner = owners_.at(tag);
out.push_back({RegType::Integer, r, owner->exe_type, owner->uuid});
out.push_back({RegType::Integer, r, owner->exe_type, owner->sfu_type, owner->uuid});
}
}
@@ -70,7 +71,7 @@ public:
if (used_fregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
auto owner = owners_.at(tag);
out.push_back({RegType::Float, r, owner->exe_type, owner->uuid});
out.push_back({RegType::Float, r, owner->exe_type, owner->sfu_type, owner->uuid});
}
}
@@ -78,7 +79,7 @@ public:
if (used_vregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector;
auto owner = owners_.at(tag);
out.push_back({RegType::Vector, r, owner->exe_type, owner->uuid});
out.push_back({RegType::Vector, r, owner->exe_type, owner->sfu_type, owner->uuid});
}
}

View File

@@ -19,16 +19,16 @@ using namespace vortex;
Socket::Socket(const SimContext& ctx,
uint32_t socket_id,
Cluster* cluster,
const Arch &arch, const
DCRS &dcrs)
const Arch &arch,
const DCRS &dcrs)
: SimObject(ctx, "socket")
, icache_mem_req_port(this)
, icache_mem_rsp_port(this)
, dcache_mem_req_port(this)
, dcache_mem_rsp_port(this)
, socket_id_(socket_id)
, cores_(arch.socket_size())
, cluster_(cluster)
, cores_(arch.socket_size())
{
auto cores_per_socket = cores_.size();
@@ -77,7 +77,10 @@ Socket::Socket(const SimContext& ctx,
for (uint32_t i = 0; i < cores_per_socket; ++i) {
uint32_t core_id = socket_id * cores_per_socket + i;
cores_.at(i) = Core::Create(core_id, this, arch, dcrs);
cores_.at(i) = Core::Create(core_id,
this,
arch,
dcrs);
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
@@ -139,8 +142,8 @@ void Socket::resume(uint32_t core_index) {
}
// Returns a snapshot of this socket's performance counters.
// `icache` and `dcache` are copied from the socket's instruction-cache and
// data-cache objects via their perf_stats() accessors.
Socket::PerfStats Socket::perf_stats() const {
  // A single result object: the earlier text carried two back-to-back
  // bodies, the second of which was unreachable after the first return.
  // The local is named `stats` so it does not shadow this method's name.
  PerfStats stats;
  stats.icache = icaches_->perf_stats();
  stats.dcache = dcaches_->perf_stats();
  return stats;
}

View File

@@ -30,12 +30,6 @@ public:
struct PerfStats {
CacheSim::PerfStats icache;
CacheSim::PerfStats dcache;
PerfStats& operator+=(const PerfStats& rhs) {
this->icache += rhs.icache;
this->dcache += rhs.dcache;
return *this;
}
};
SimPort<MemReq> icache_mem_req_port;
@@ -74,14 +68,14 @@ public:
void resume(uint32_t core_id);
Socket::PerfStats perf_stats() const;
PerfStats perf_stats() const;
private:
uint32_t socket_id_;
uint32_t socket_id_;
Cluster* cluster_;
std::vector<Core::Ptr> cores_;
CacheCluster::Ptr icaches_;
CacheCluster::Ptr dcaches_;
Cluster* cluster_;
};
} // namespace vortex