vortex/sim/simx/exeunit.cpp

#include "exeunit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
#include <assert.h>
#include <util.h>
#include "debug.h"
#include "core.h"
#include "constants.h"

using namespace vortex;

NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}

void NopUnit::step(uint64_t /*cycle*/) {
    if (Input.empty())
        return;
    auto trace = Input.front();
    Output.send(trace, 1);
    Input.pop();
}

///////////////////////////////////////////////////////////////////////////////

LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
    : ExeUnit(ctx, core, "LSU")
    , num_threads_(core->arch().num_threads())
    , pending_dcache_(LSUQ_SIZE)
    , fence_lock_(false)
{}

void LsuUnit::step(uint64_t cycle) {
    // handle dcache response
    for (uint32_t t = 0; t < num_threads_; ++t) {
        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
        if (dcache_rsp_port.empty())
            continue;
        auto& mem_rsp = dcache_rsp_port.front();
        auto& entry = pending_dcache_.at(mem_rsp.tag);
        auto trace = entry.first;
        DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
            << ", tid=" << t << ", " << *trace);
        assert(entry.second);
        --entry.second; // track remaining blocks
        if (0 == entry.second) {
            Output.send(trace, 1);
            pending_dcache_.release(mem_rsp.tag);
        }
        dcache_rsp_port.pop();
    }

    // handle shared memory response
    for (uint32_t t = 0; t < num_threads_; ++t) {
        auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
        if (smem_rsp_port.empty())
            continue;
        auto& mem_rsp = smem_rsp_port.front();
        auto& entry = pending_dcache_.at(mem_rsp.tag);
        auto trace = entry.first;
        DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
            << ", tid=" << t << ", " << *trace);
        assert(entry.second);
        --entry.second; // track remaining blocks
        if (0 == entry.second) {
            Output.send(trace, 1);
            pending_dcache_.release(mem_rsp.tag);
        }
        smem_rsp_port.pop();
    }

    if (fence_lock_) {
        // wait for all pending memory operations to complete
        if (!pending_dcache_.empty())
            return;
        Output.send(fence_state_, 1);
        fence_lock_ = false;
        DT(3, cycle, "fence-unlock: " << fence_state_);
    }

    // check input queue
    if (Input.empty())
        return;

    auto trace = Input.front();

    if (trace->lsu.type == LsuType::FENCE) {
        // schedule fence lock
        fence_state_ = trace;
        fence_lock_ = true;
        DT(3, cycle, "fence-lock: " << *trace);
        // remove input
        auto time = Input.pop();
        core_->perf_stats_.lsu_stalls += (cycle - time);
        return;
    }

    // check pending queue capacity
    if (pending_dcache_.full()) {
        if (!trace->suspend()) {
            DT(3, cycle, "*** lsu-queue-stall: " << *trace);
        }
        return;
    } else {
        trace->resume();
    }

    bool is_write = (trace->lsu.type == LsuType::STORE);

    // duplicates detection
    bool is_dup = false;
    if (trace->tmask.test(0)) {
        uint64_t addr_mask = sizeof(Word)-1;
        Word addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
        uint32_t matches = 1;
        for (uint32_t t = 1; t < num_threads_; ++t) {
            if (!trace->tmask.test(t))
                continue;
            auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
            matches += (addr0 == mem_addr);
        }
        is_dup = (matches == trace->tmask.count());
    }

    uint32_t valid_addrs = 0;
    if (is_dup) {
        valid_addrs = 1;
    } else {
        for (auto& mem_addr : trace->mem_addrs) {
            valid_addrs += mem_addr.size();
        }
    }

    auto tag = pending_dcache_.allocate({trace, valid_addrs});

    for (uint32_t t = 0; t < num_threads_; ++t) {
        if (!trace->tmask.test(t))
            continue;

        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
        auto mem_addr = trace->mem_addrs.at(t).at(0);
        auto type = get_addr_type(mem_addr.addr, mem_addr.size);

        MemReq mem_req;
        mem_req.addr  = mem_addr.addr;
        mem_req.write = is_write;
        mem_req.non_cacheable = (type == AddrType::IO);
        mem_req.tag   = tag;
        mem_req.core_id = core_->id();

        if (type == AddrType::Shared) {
            core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
            DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
                << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
        } else {
            dcache_req_port.send(mem_req, 2);
            DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
                << ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
        }

        if (is_dup)
            break;
    }

    // do not wait on writes
    if (is_write) {
        pending_dcache_.release(tag);
        Output.send(trace, 1);
    }

    // remove input
    auto time = Input.pop();
    core_->perf_stats_.lsu_stalls += (cycle - time);
}

///////////////////////////////////////////////////////////////////////////////

AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}

void AluUnit::step(uint64_t cycle) {
    if (Input.empty())
        return;
    auto trace = Input.front();
    switch (trace->alu.type) {
    case AluType::ARITH:
    case AluType::BRANCH:
    case AluType::SYSCALL:
    case AluType::CMOV:
        Output.send(trace, 1);
        break;
    case AluType::IMUL:
        Output.send(trace, LATENCY_IMUL+1);
        break;
    case AluType::IDIV:
        Output.send(trace, XLEN+1);
        break;
    default:
        std::abort();
    }
    DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
    if (trace->fetch_stall) {
        core_->stalled_warps_.reset(trace->wid);
    }
    auto time = Input.pop();
    core_->perf_stats_.alu_stalls += (cycle - time);
}

///////////////////////////////////////////////////////////////////////////////

CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}

void CsrUnit::step(uint64_t cycle) {
    if (Input.empty())
        return;
    auto trace = Input.front();
    Output.send(trace, 1);
    auto time = Input.pop();
    core_->perf_stats_.csr_stalls += (cycle - time);
    DT(3, cycle, "pipeline-execute: op=CSR, " << *trace);
}

///////////////////////////////////////////////////////////////////////////////

FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}

void FpuUnit::step(uint64_t cycle) {
    if (Input.empty())
        return;
    auto trace = Input.front();
    switch (trace->fpu.type) {
    case FpuType::FNCP:
        Output.send(trace, 2);
        break;
    case FpuType::FMA:
        Output.send(trace, LATENCY_FMA+1);
        break;
    case FpuType::FDIV:
        Output.send(trace, LATENCY_FDIV+1);
        break;
    case FpuType::FSQRT:
        Output.send(trace, LATENCY_FSQRT+1);
        break;
    case FpuType::FCVT:
        Output.send(trace, LATENCY_FCVT+1);
        break;
    default:
        std::abort();
    }
    DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
    auto time = Input.pop();
    core_->perf_stats_.fpu_stalls += (cycle - time);
}

///////////////////////////////////////////////////////////////////////////////

GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
    : ExeUnit(ctx, core, "GPU")
    , num_threads_(core->arch().num_threads())
    , pending_tex_reqs_(TEXQ_SIZE)
{}

void GpuUnit::step(uint64_t cycle) {
#ifdef EXT_TEX_ENABLE
    // handle memory response
    for (uint32_t t = 0; t < num_threads_; ++t) {
        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
        if (dcache_rsp_port.empty())
            continue;
        auto& mem_rsp = dcache_rsp_port.front();
        auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
        auto trace = entry.first;
        DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
        assert(entry.second);
        --entry.second; // track remaining blocks
        if (0 == entry.second) {
            Output.send(trace, 1);
            pending_tex_reqs_.release(mem_rsp.tag);
        }
        dcache_rsp_port.pop();
    }
#endif

    // check input queue
    if (Input.empty())
        return;

    auto trace = Input.front();

    bool issued = false;

    switch  (trace->gpu.type) {
    case GpuType::TMC:
        Output.send(trace, 1);
        core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
        issued = true;
        break;
    case GpuType::WSPAWN:
        Output.send(trace, 1);
        core_->active_warps_ = trace->gpu.active_warps;
        issued = true;
        break;
    case GpuType::SPLIT:
    case GpuType::JOIN:
        Output.send(trace, 1);
        issued = true;
        break;
    case GpuType::BAR:
        Output.send(trace, 1);
        if (trace->gpu.active_warps != 0)
            core_->active_warps_ |= trace->gpu.active_warps;
        else
            core_->active_warps_.reset(trace->wid);
        issued = true;
        break;
    case GpuType::TEX:
        if (this->processTexRequest(cycle, trace))
           issued = true;
        break;
    default:
        std::abort();
    }

    if (issued) {
        DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
        if (trace->fetch_stall)  {
            core_->stalled_warps_.reset(trace->wid);
        }
        auto time = Input.pop();
        core_->perf_stats_.fpu_stalls += (cycle - time);
    }
}

bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
    __unused (cycle);

    // check pending queue capacity
    if (pending_tex_reqs_.full()) {
        if (!trace->suspend()) {
            DT(3, cycle, "*** tex-queue-stall: " << *trace);
        }
        return false;
    } else {
        trace->resume();
    }

    // send memory request

    uint32_t valid_addrs = 0;
    for (auto& mem_addr : trace->mem_addrs) {
        valid_addrs += mem_addr.size();
    }

    auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});

    for (uint32_t t = 0; t < num_threads_; ++t) {
        if (!trace->tmask.test(t))
            continue;

        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
        for (auto mem_addr : trace->mem_addrs.at(t)) {
            MemReq mem_req;
            mem_req.addr  = mem_addr.addr;
            mem_req.write = (trace->lsu.type == LsuType::STORE);
            mem_req.tag   = tag;
            mem_req.core_id = core_->id();
            dcache_req_port.send(mem_req, 3);
            DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
                << ", tid=" << t << ", "<< trace);
            ++ core_->perf_stats_.tex_reads;
            ++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
        }
    }

    return true;
}