Files
kernels/sim/simx/execute.cpp
Blaise Tine c1e168fdbe Vortex 2.0 changes:
+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactory

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor udpate

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
2023-11-10 02:47:05 -08:00

2361 lines
81 KiB
C++

// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <bitset>
#include <climits>
#include <sys/types.h>
#include <sys/stat.h>
#include <assert.h>
#include <util.h>
#include <rvfloats.h>
#include "warp.h"
#include "instr.h"
#include "core.h"
using namespace vortex;
union reg_data_t {
Word u;
WordI i;
WordF f;
float f32;
double f64;
uint32_t u32;
uint64_t u64;
int32_t i32;
int64_t i64;
};
inline uint32_t get_fpu_rm(uint32_t func3, Core* core, uint32_t tid, uint32_t wid) {
return (func3 == 0x7) ? core->get_csr(VX_CSR_FRM, tid, wid) : func3;
}
inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid) {
if (fflags) {
core->set_csr(VX_CSR_FCSR, core->get_csr(VX_CSR_FCSR, tid, wid) | fflags, tid, wid);
core->set_csr(VX_CSR_FFLAGS, core->get_csr(VX_CSR_FFLAGS, tid, wid) | fflags, tid, wid);
}
}
inline uint64_t nan_box(uint32_t value) {
uint64_t mask = 0xffffffff00000000;
return value | mask;
}
inline bool is_nan_boxed(uint64_t value) {
return (uint32_t(value >> 32) == 0xffffffff);
}
inline int64_t check_boxing(int64_t a) {
if (is_nan_boxed(a))
return a;
return nan_box(0x7fc00000); // NaN
}
void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
assert(tmask_.any());
auto next_pc = PC_ + 4;
auto next_tmask = tmask_;
auto func2 = instr.getFunc2();
auto func3 = instr.getFunc3();
auto func6 = instr.getFunc6();
auto func7 = instr.getFunc7();
auto opcode = instr.getOpcode();
auto rdest = instr.getRDest();
auto rsrc0 = instr.getRSrc(0);
auto rsrc1 = instr.getRSrc(1);
auto rsrc2 = instr.getRSrc(2);
auto immsrc = sext((Word)instr.getImm(), 32);
auto vmask = instr.getVmask();
auto num_threads = arch_.num_threads();
uint32_t thread_start = 0;
for (; thread_start < num_threads; ++thread_start) {
if (tmask_.test(thread_start))
break;
}
std::vector<reg_data_t[3]> rsdata(num_threads);
std::vector<reg_data_t> rddata(num_threads);
auto num_rsrcs = instr.getNRSrc();
if (num_rsrcs) {
for (uint32_t i = 0; i < num_rsrcs; ++i) {
auto type = instr.getRSType(i);
auto reg = instr.getRSrc(i);
switch (type) {
case RegType::Integer:
DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
rsdata[t][i].u = ireg_file_.at(t)[reg];
DPN(2, "0x" << std::hex << rsdata[t][i].i);
}
DPN(2, "}" << std::endl);
break;
case RegType::Float:
DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
rsdata[t][i].u64 = freg_file_.at(t)[reg];
DPN(2, "0x" << std::hex << rsdata[t][i].f);
}
DPN(2, "}" << std::endl);
break;
case RegType::Vector:
// TODO:
break;
case RegType::None:
break;
}
}
}
bool rd_write = false;
switch (opcode) {
case LUI_INST: {
// RV32I: LUI
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = immsrc << 12;
}
rd_write = true;
break;
}
case AUIPC_INST: {
// RV32I: AUIPC
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = (immsrc << 12) + PC_;
}
rd_write = true;
break;
}
case R_INST: {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
if (func7 & 0x1) {
switch (func3) {
case 0: {
// RV32M: MUL
rddata[t].i = rsdata[t][0].i * rsdata[t][1].i;
trace->alu_type = AluType::IMUL;
break;
}
case 1: {
// RV32M: MULH
auto first = static_cast<DWordI>(rsdata[t][0].i);
auto second = static_cast<DWordI>(rsdata[t][1].i);
rddata[t].i = (first * second) >> XLEN;
trace->alu_type = AluType::IMUL;
break;
}
case 2: {
// RV32M: MULHSU
auto first = static_cast<DWordI>(rsdata[t][0].i);
auto second = static_cast<DWord>(rsdata[t][1].u);
rddata[t].i = (first * second) >> XLEN;
trace->alu_type = AluType::IMUL;
break;
}
case 3: {
// RV32M: MULHU
auto first = static_cast<DWord>(rsdata[t][0].u);
auto second = static_cast<DWord>(rsdata[t][1].u);
rddata[t].i = (first * second) >> XLEN;
trace->alu_type = AluType::IMUL;
break;
}
case 4: {
// RV32M: DIV
auto dividen = rsdata[t][0].i;
auto divisor = rsdata[t][1].i;
auto largest_negative = WordI(1) << (XLEN-1);
if (divisor == 0) {
rddata[t].i = -1;
} else if (dividen == largest_negative && divisor == -1) {
rddata[t].i = dividen;
} else {
rddata[t].i = dividen / divisor;
}
trace->alu_type = AluType::IDIV;
break;
}
case 5: {
// RV32M: DIVU
auto dividen = rsdata[t][0].u;
auto divisor = rsdata[t][1].u;
if (divisor == 0) {
rddata[t].i = -1;
} else {
rddata[t].i = dividen / divisor;
}
trace->alu_type = AluType::IDIV;
break;
}
case 6: {
// RV32M: REM
auto dividen = rsdata[t][0].i;
auto divisor = rsdata[t][1].i;
auto largest_negative = WordI(1) << (XLEN-1);
if (rsdata[t][1].i == 0) {
rddata[t].i = dividen;
} else if (dividen == largest_negative && divisor == -1) {
rddata[t].i = 0;
} else {
rddata[t].i = dividen % divisor;
}
trace->alu_type = AluType::IDIV;
break;
}
case 7: {
// RV32M: REMU
auto dividen = rsdata[t][0].u;
auto divisor = rsdata[t][1].u;
if (rsdata[t][1].i == 0) {
rddata[t].i = dividen;
} else {
rddata[t].i = dividen % divisor;
}
trace->alu_type = AluType::IDIV;
break;
}
default:
std::abort();
}
} else {
switch (func3) {
case 0: {
if (func7) {
// RV32I: SUB
rddata[t].i = rsdata[t][0].i - rsdata[t][1].i;
} else {
// RV32I: ADD
rddata[t].i = rsdata[t][0].i + rsdata[t][1].i;
}
break;
}
case 1: {
// RV32I: SLL
Word shamt_mask = (Word(1) << log2up(XLEN)) - 1;
Word shamt = rsdata[t][1].i & shamt_mask;
rddata[t].i = rsdata[t][0].i << shamt;
break;
}
case 2: {
// RV32I: SLT
rddata[t].i = rsdata[t][0].i < rsdata[t][1].i;
break;
}
case 3: {
// RV32I: SLTU
rddata[t].i = rsdata[t][0].u < rsdata[t][1].u;
break;
}
case 4: {
// RV32I: XOR
rddata[t].i = rsdata[t][0].i ^ rsdata[t][1].i;
break;
}
case 5: {
Word shamt_mask = ((Word)1 << log2up(XLEN)) - 1;
Word shamt = rsdata[t][1].i & shamt_mask;
if (func7) {
// RV32I: SRA
rddata[t].i = rsdata[t][0].i >> shamt;
} else {
// RV32I: SRL
rddata[t].i = rsdata[t][0].u >> shamt;
}
break;
}
case 6: {
// RV32I: OR
rddata[t].i = rsdata[t][0].i | rsdata[t][1].i;
break;
}
case 7: {
// RV32I: AND
rddata[t].i = rsdata[t][0].i & rsdata[t][1].i;
break;
}
default:
std::abort();
}
}
}
rd_write = true;
break;
}
case I_INST: {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
switch (func3) {
case 0: {
// RV32I: ADDI
rddata[t].i = rsdata[t][0].i + immsrc;
break;
}
case 1: {
// RV64I: SLLI
rddata[t].i = rsdata[t][0].i << immsrc;
break;
}
case 2: {
// RV32I: SLTI
rddata[t].i = rsdata[t][0].i < WordI(immsrc);
break;
}
case 3: {
// RV32I: SLTIU
rddata[t].i = rsdata[t][0].u < immsrc;
break;
}
case 4: {
// RV32I: XORI
rddata[t].i = rsdata[t][0].i ^ immsrc;
break;
}
case 5: {
if (func7) {
// RV64I: SRAI
Word result = rsdata[t][0].i >> immsrc;
rddata[t].i = result;
} else {
// RV64I: SRLI
Word result = rsdata[t][0].u >> immsrc;
rddata[t].i = result;
}
break;
}
case 6: {
// RV32I: ORI
rddata[t].i = rsdata[t][0].i | immsrc;
break;
}
case 7: {
// RV32I: ANDI
rddata[t].i = rsdata[t][0].i & immsrc;
break;
}
}
}
rd_write = true;
break;
}
case R_INST_W: {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
if (func7 & 0x1) {
switch (func3) {
case 0: {
// RV64M: MULW
int32_t product = (int32_t)rsdata[t][0].i * (int32_t)rsdata[t][1].i;
rddata[t].i = sext((uint64_t)product, 32);
trace->alu_type = AluType::IMUL;
break;
}
case 4: {
// RV64M: DIVW
int32_t dividen = (int32_t)rsdata[t][0].i;
int32_t divisor = (int32_t)rsdata[t][1].i;
int32_t quotient;
int32_t largest_negative = 0x80000000;
if (divisor == 0){
quotient = -1;
} else if (dividen == largest_negative && divisor == -1) {
quotient = dividen;
} else {
quotient = dividen / divisor;
}
rddata[t].i = sext((uint64_t)quotient, 32);
trace->alu_type = AluType::IDIV;
break;
}
case 5: {
// RV64M: DIVUW
uint32_t dividen = (uint32_t)rsdata[t][0].i;
uint32_t divisor = (uint32_t)rsdata[t][1].i;
uint32_t quotient;
if (divisor == 0){
quotient = -1;
} else {
quotient = dividen / divisor;
}
rddata[t].i = sext((uint64_t)quotient, 32);
trace->alu_type = AluType::IDIV;
break;
}
case 6: {
// RV64M: REMW
int32_t dividen = (uint32_t)rsdata[t][0].i;
int32_t divisor = (uint32_t)rsdata[t][1].i;
int32_t remainder;
int32_t largest_negative = 0x80000000;
if (divisor == 0){
remainder = dividen;
} else if (dividen == largest_negative && divisor == -1) {
remainder = 0;
} else {
remainder = dividen % divisor;
}
rddata[t].i = sext((uint64_t)remainder, 32);
trace->alu_type = AluType::IDIV;
break;
}
case 7: {
// RV64M: REMUW
uint32_t dividen = (uint32_t)rsdata[t][0].i;
uint32_t divisor = (uint32_t)rsdata[t][1].i;
uint32_t remainder;
if (divisor == 0){
remainder = dividen;
} else {
remainder = dividen % divisor;
}
rddata[t].i = sext((uint64_t)remainder, 32);
trace->alu_type = AluType::IDIV;
break;
}
default:
std::abort();
}
} else {
switch (func3) {
case 0: {
if (func7){
// RV64I: SUBW
uint32_t result = (uint32_t)rsdata[t][0].i - (uint32_t)rsdata[t][1].i;
rddata[t].i = sext((uint64_t)result, 32);
}
else{
// RV64I: ADDW
uint32_t result = (uint32_t)rsdata[t][0].i + (uint32_t)rsdata[t][1].i;
rddata[t].i = sext((uint64_t)result, 32);
}
break;
}
case 1: {
// RV64I: SLLW
uint32_t shamt_mask = 0x1F;
uint32_t shamt = rsdata[t][1].i & shamt_mask;
uint32_t result = (uint32_t)rsdata[t][0].i << shamt;
rddata[t].i = sext((uint64_t)result, 32);
break;
}
case 5: {
uint32_t shamt_mask = 0x1F;
uint32_t shamt = rsdata[t][1].i & shamt_mask;
uint32_t result;
if (func7) {
// RV64I: SRAW
result = (int32_t)rsdata[t][0].i >> shamt;
} else {
// RV64I: SRLW
result = (uint32_t)rsdata[t][0].i >> shamt;
}
rddata[t].i = sext((uint64_t)result, 32);
break;
}
default:
std::abort();
}
}
}
rd_write = true;
break;
}
case I_INST_W: {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
switch (func3) {
case 0: {
// RV64I: ADDIW
uint32_t result = (uint32_t)rsdata[t][0].i + (uint32_t)immsrc;
rddata[t].i = sext((uint64_t)result, 32);
break;
}
case 1: {
// RV64I: SLLIW
uint32_t shamt_mask = 0x1F;
uint32_t shamt = immsrc & shamt_mask;
uint32_t result = rsdata[t][0].i << shamt;
rddata[t].i = sext((uint64_t)result, 32);
break;
}
case 5: {
uint32_t shamt_mask = 0x1F;
uint32_t shamt = immsrc & shamt_mask;
uint32_t result;
if (func7) {
// RV64I: SRAIW
result = (int32_t)rsdata[t][0].i >> shamt;
} else {
// RV64I: SRLIW
result = (uint32_t)rsdata[t][0].i >> shamt;
}
rddata[t].i = sext((uint64_t)result, 32);
break;
}
default:
std::abort();
}
}
rd_write = true;
break;
}
case B_INST: {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
switch (func3) {
case 0: {
// RV32I: BEQ
if (rsdata[t][0].i == rsdata[t][1].i) {
next_pc = PC_ + immsrc;
}
break;
}
case 1: {
// RV32I: BNE
if (rsdata[t][0].i != rsdata[t][1].i) {
next_pc = PC_ + immsrc;
}
break;
}
case 4: {
// RV32I: BLT
if (rsdata[t][0].i < rsdata[t][1].i) {
next_pc = PC_ + immsrc;
}
break;
}
case 5: {
// RV32I: BGE
if (rsdata[t][0].i >= rsdata[t][1].i) {
next_pc = PC_ + immsrc;
}
break;
}
case 6: {
// RV32I: BLTU
if (rsdata[t][0].u < rsdata[t][1].u) {
next_pc = PC_ + immsrc;
}
break;
}
case 7: {
// RV32I: BGEU
if (rsdata[t][0].u >= rsdata[t][1].u) {
next_pc = PC_ + immsrc;
}
break;
}
default:
std::abort();
}
break; // runonce
}
trace->fetch_stall = true;
break;
}
case JAL_INST: {
// RV32I: JAL
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::BRANCH;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = next_pc;
}
next_pc = PC_ + immsrc;
trace->fetch_stall = true;
rd_write = true;
break;
}
case JALR_INST: {
// RV32I: JALR
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = next_pc;
}
next_pc = rsdata[thread_start][0].i + immsrc;
trace->fetch_stall = true;
rd_write = true;
break;
}
case L_INST:
case FL: {
trace->exe_type = ExeType::LSU;
trace->lsu_type = LsuType::LOAD;
trace->used_iregs.set(rsrc0);
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
trace->data = trace_data;
if ((opcode == L_INST )
|| (opcode == FL && func3 == 2)
|| (opcode == FL && func3 == 3)) {
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
uint64_t read_data = 0;
core_->dcache_read(&read_data, mem_addr, data_bytes);
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
switch (func3) {
case 0: // RV32I: LB
case 1: // RV32I: LH
rddata[t].i = sext((Word)read_data, data_width);
break;
case 2:
if (opcode == L_INST) {
// RV32I: LW
rddata[t].i = sext((Word)read_data, data_width);
} else {
// RV32F: FLW
rddata[t].u64 = nan_box((uint32_t)read_data);
}
break;
case 3: // RV64I: LD
// RV32D: FLD
case 4: // RV32I: LBU
case 5: // RV32I: LHU
case 6: // RV64I: LWU
rddata[t].u64 = read_data;
break;
default:
std::abort();
}
}
} else {
auto &vd = vreg_file_.at(rdest);
switch (instr.getVlsWidth()) {
case 6: {
for (uint32_t i = 0; i < vl_; i++) {
Word mem_addr = ((rsdata[i][0].i) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
Word mem_data = 0;
core_->dcache_read(&mem_data, mem_addr, 4);
Word *result_ptr = (Word *)(vd.data() + i);
*result_ptr = mem_data;
}
break;
}
default:
std::abort();
}
}
rd_write = true;
break;
}
case S_INST:
case FS: {
trace->exe_type = ExeType::LSU;
trace->lsu_type = LsuType::STORE;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
trace->data = trace_data;
if ((opcode == S_INST)
|| (opcode == FS && func3 == 2)
|| (opcode == FS && func3 == 3)) {
uint32_t data_bytes = 1 << (func3 & 0x3);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
uint64_t write_data = rsdata[t][1].u64;
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
switch (func3) {
case 0:
case 1:
case 2:
case 3:
core_->dcache_write(&write_data, mem_addr, data_bytes);
break;
default:
std::abort();
}
}
} else {
for (uint32_t i = 0; i < vl_; i++) {
uint64_t mem_addr = rsdata[i][0].i + (i * vtype_.vsew / 8);
switch (instr.getVlsWidth()) {
case 6: {
// store word and unit strided (not checking for unit stride)
uint32_t mem_data = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i);
core_->dcache_write(&mem_data, mem_addr, 4);
break;
}
default:
std::abort();
}
}
}
break;
}
case AMO: {
trace->exe_type = ExeType::LSU;
trace->lsu_type = LsuType::LOAD;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
trace->data = trace_data;
auto amo_type = func7 >> 2;
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].u;
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
if (amo_type == 0x02) { // LR
uint64_t read_data = 0;
core_->dcache_read(&read_data, mem_addr, data_bytes);
core_->dcache_amo_reserve(mem_addr);
rddata[t].i = sext((Word)read_data, data_width);
} else
if (amo_type == 0x03) { // SC
if (core_->dcache_amo_check(mem_addr)) {
core_->dcache_write(&rsdata[t][1].u64, mem_addr, data_bytes);
rddata[t].i = 0;
} else {
rddata[t].i = 1;
}
} else {
uint64_t read_data = 0;
core_->dcache_read(&read_data, mem_addr, data_bytes);
auto read_data_i = sext((WordI)read_data, data_width);
auto rs1_data_i = sext((WordI)rsdata[t][1].u64, data_width);
auto read_data_u = zext((Word)read_data, data_width);
auto rs1_data_u = zext((Word)rsdata[t][1].u64, data_width);
uint64_t result;
switch (amo_type) {
case 0x00: // AMOADD
result = read_data_i + rs1_data_i;
break;
case 0x01: // AMOSWAP
result = rs1_data_u;
break;
case 0x04: // AMOXOR
result = read_data_u ^ rs1_data_u;
break;
case 0x08: // AMOOR
result = read_data_u | rs1_data_u;
break;
case 0x0c: // AMOAND
result = read_data_u & rs1_data_u;
break;
case 0x10: // AMOMIN
result = std::min(read_data_i, rs1_data_i);
break;
case 0x14: // AMOMAX
result = std::max(read_data_i, rs1_data_i);
break;
case 0x18: // AMOMINU
result = std::min(read_data_u, rs1_data_u);
break;
case 0x1c: // AMOMAXU
result = std::max(read_data_u, rs1_data_u);
break;
default:
std::abort();
}
core_->dcache_write(&result, mem_addr, data_bytes);
rddata[t].i = read_data_i;
}
}
rd_write = true;
break;
}
case SYS_INST: {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint32_t csr_addr = immsrc;
uint32_t csr_value;
if (func3 == 0) {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::SYSCALL;
trace->fetch_stall = true;
switch (csr_addr) {
case 0:
// RV32I: ECALL
core_->trigger_ecall();
break;
case 1:
// RV32I: EBREAK
core_->trigger_ebreak();
break;
case 0x002: // URET
case 0x102: // SRET
case 0x302: // MRET
break;
default:
std::abort();
}
} else {
trace->exe_type = ExeType::SFU;
trace->fetch_stall = true;
csr_value = core_->get_csr(csr_addr, t, warp_id_);
switch (func3) {
case 1: {
// RV32I: CSRRW
rddata[t].i = csr_value;
core_->set_csr(csr_addr, rsdata[t][0].i, t, warp_id_);
trace->used_iregs.set(rsrc0);
trace->sfu_type = SfuType::CSRRW;
rd_write = true;
break;
}
case 2: {
// RV32I: CSRRS
rddata[t].i = csr_value;
if (rsdata[t][0].i != 0) {
core_->set_csr(csr_addr, csr_value | rsdata[t][0].i, t, warp_id_);
}
trace->used_iregs.set(rsrc0);
trace->sfu_type = SfuType::CSRRS;
rd_write = true;
break;
}
case 3: {
// RV32I: CSRRC
rddata[t].i = csr_value;
if (rsdata[t][0].i != 0) {
core_->set_csr(csr_addr, csr_value & ~rsdata[t][0].i, t, warp_id_);
}
trace->used_iregs.set(rsrc0);
trace->sfu_type = SfuType::CSRRC;
rd_write = true;
break;
}
case 5: {
// RV32I: CSRRWI
rddata[t].i = csr_value;
core_->set_csr(csr_addr, rsrc0, t, warp_id_);
trace->sfu_type = SfuType::CSRRW;
rd_write = true;
break;
}
case 6: {
// RV32I: CSRRSI;
rddata[t].i = csr_value;
if (rsrc0 != 0) {
core_->set_csr(csr_addr, csr_value | rsrc0, t, warp_id_);
}
trace->sfu_type = SfuType::CSRRS;
rd_write = true;
break;
}
case 7: {
// RV32I: CSRRCI
rddata[t].i = csr_value;
if (rsrc0 != 0) {
core_->set_csr(csr_addr, csr_value & ~rsrc0, t, warp_id_);
}
trace->sfu_type = SfuType::CSRRC;
rd_write = true;
break;
}
default:
break;
}
}
}
break;
}
case FENCE: {
// RV32I: FENCE
trace->exe_type = ExeType::LSU;
trace->lsu_type = LsuType::FENCE;
break;
}
case FCI: {
trace->exe_type = ExeType::FPU;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
uint32_t fflags = 0;
switch (func7) {
case 0x00: { // RV32F: FADD.S
rddata[t].u64 = nan_box(rv_fadd_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x01: { // RV32D: FADD.D
rddata[t].u64 = rv_fadd_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x04: { // RV32F: FSUB.S
rddata[t].u64 = nan_box(rv_fsub_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x05: { // RV32D: FSUB.D
rddata[t].u64 = rv_fsub_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x08: { // RV32F: FMUL.S
rddata[t].u64 = nan_box(rv_fmul_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x09: { // RV32D: FMUL.D
rddata[t].u64 = rv_fmul_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x0c: { // RV32F: FDIV.S
rddata[t].u64 = nan_box(rv_fdiv_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
trace->fpu_type = FpuType::FDIV;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x0d: { // RV32D: FDIV.D
rddata[t].u64 = rv_fdiv_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
trace->fpu_type = FpuType::FDIV;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x10: {
switch (func3) {
case 0: // RV32F: FSGNJ.S
rddata[t].u64 = nan_box(rv_fsgnj_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64)));
break;
case 1: // RV32F: FSGNJN.S
rddata[t].u64 = nan_box(rv_fsgnjn_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64)));
break;
case 2: // RV32F: FSGNJX.S
rddata[t].u64 = nan_box(rv_fsgnjx_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64)));
break;
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x11: {
switch (func3) {
case 0: // RV32D: FSGNJ.D
rddata[t].u64 = rv_fsgnj_d(rsdata[t][0].u64, rsdata[t][1].u64);
break;
case 1: // RV32D: FSGNJN.D
rddata[t].u64 = rv_fsgnjn_d(rsdata[t][0].u64, rsdata[t][1].u64);
break;
case 2: // RV32D: FSGNJX.D
rddata[t].u64 = rv_fsgnjx_d(rsdata[t][0].u64, rsdata[t][1].u64);
break;
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x14: {
if (func3) {
// RV32F: FMAX.S
rddata[t].u64 = nan_box(rv_fmax_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags));
} else {
// RV32F: FMIN.S
rddata[t].u64 = nan_box(rv_fmin_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags));
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x15: {
if (func3) {
// RV32D: FMAX.D
rddata[t].u64 = rv_fmax_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
} else {
// RV32D: FMIN.D
rddata[t].u64 = rv_fmin_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x20: {
// RV32D: FCVT.S.D
rddata[t].u64 = nan_box(rv_dtof(rsdata[t][0].u64));
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x21: {
// RV32D: FCVT.D.S
rddata[t].u64 = rv_ftod(check_boxing(rsdata[t][0].u64));
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x2c: { // RV32F: FSQRT.S
rddata[t].u64 = nan_box(rv_fsqrt_s(check_boxing(rsdata[t][0].u64), frm, &fflags));
trace->fpu_type = FpuType::FSQRT;
trace->used_fregs.set(rsrc0);
break;
}
case 0x2d: { // RV32D: FSQRT.D
rddata[t].u64 = rv_fsqrt_d(rsdata[t][0].u64, frm, &fflags);
trace->fpu_type = FpuType::FSQRT;
trace->used_fregs.set(rsrc0);
break;
}
case 0x50: {
switch (func3) {
case 0:
// RV32F: FLE.S
rddata[t].i = rv_fle_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags);
break;
case 1:
// RV32F: FLT.S
rddata[t].i = rv_flt_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags);
break;
case 2:
// RV32F: FEQ.S
rddata[t].i = rv_feq_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags);
break;
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x51: {
switch (func3) {
case 0:
// RV32D: FLE.D
rddata[t].i = rv_fle_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
break;
case 1:
// RV32D: FLT.D
rddata[t].i = rv_flt_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
break;
case 2:
// RV32D: FEQ.D
rddata[t].i = rv_feq_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
break;
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x60: {
switch (rsrc1) {
case 0:
// RV32F: FCVT.W.S
rddata[t].i = sext((uint64_t)rv_ftoi_s(check_boxing(rsdata[t][0].u64), frm, &fflags), 32);
break;
case 1:
// RV32F: FCVT.WU.S
rddata[t].i = sext((uint64_t)rv_ftou_s(check_boxing(rsdata[t][0].u64), frm, &fflags), 32);
break;
case 2:
// RV64F: FCVT.L.S
rddata[t].i = rv_ftol_s(check_boxing(rsdata[t][0].u64), frm, &fflags);
break;
case 3:
// RV64F: FCVT.LU.S
rddata[t].i = rv_ftolu_s(check_boxing(rsdata[t][0].u64), frm, &fflags);
break;
}
trace->fpu_type = FpuType::FCVT;
trace->used_fregs.set(rsrc0);
break;
}
case 0x61: {
switch (rsrc1) {
case 0:
// RV32D: FCVT.W.D
rddata[t].i = sext((uint64_t)rv_ftoi_d(rsdata[t][0].u64, frm, &fflags), 32);
break;
case 1:
// RV32D: FCVT.WU.D
rddata[t].i = sext((uint64_t)rv_ftou_d(rsdata[t][0].u64, frm, &fflags), 32);
break;
case 2:
// RV64D: FCVT.L.D
rddata[t].i = rv_ftol_d(rsdata[t][0].u64, frm, &fflags);
break;
case 3:
// RV64D: FCVT.LU.D
rddata[t].i = rv_ftolu_d(rsdata[t][0].u64, frm, &fflags);
break;
}
trace->fpu_type = FpuType::FCVT;
trace->used_fregs.set(rsrc0);
break;
}
case 0x68: {
switch (rsrc1) {
case 0:
// RV32F: FCVT.S.W
rddata[t].u64 = nan_box(rv_itof_s(rsdata[t][0].i, frm, &fflags));
break;
case 1:
// RV32F: FCVT.S.WU
rddata[t].u64 = nan_box(rv_utof_s(rsdata[t][0].i, frm, &fflags));
break;
case 2:
// RV64F: FCVT.S.L
rddata[t].u64 = nan_box(rv_ltof_s(rsdata[t][0].i, frm, &fflags));
break;
case 3:
// RV64F: FCVT.S.LU
rddata[t].u64 = nan_box(rv_lutof_s(rsdata[t][0].i, frm, &fflags));
break;
}
trace->fpu_type = FpuType::FCVT;
trace->used_iregs.set(rsrc0);
break;
}
case 0x69: {
switch (rsrc1) {
case 0:
// RV32D: FCVT.D.W
rddata[t].u64 = rv_itof_d(rsdata[t][0].i, frm, &fflags);
break;
case 1:
// RV32D: FCVT.D.WU
rddata[t].u64 = rv_utof_d(rsdata[t][0].i, frm, &fflags);
break;
case 2:
// RV64D: FCVT.D.L
rddata[t].u64 = rv_ltof_d(rsdata[t][0].i, frm, &fflags);
break;
case 3:
// RV64D: FCVT.D.LU
rddata[t].u64 = rv_lutof_d(rsdata[t][0].i, frm, &fflags);
break;
}
trace->fpu_type = FpuType::FCVT;
trace->used_iregs.set(rsrc0);
break;
}
case 0x70: {
if (func3) {
// RV32F: FCLASS.S
rddata[t].i = rv_fclss_s(check_boxing(rsdata[t][0].u64));
} else {
// RV32F: FMV.X.S
uint32_t result = (uint32_t)rsdata[t][0].u64;
rddata[t].i = sext((uint64_t)result, 32);
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
break;
}
case 0x71: {
if (func3) {
// RV32D: FCLASS.D
rddata[t].i = rv_fclss_d(rsdata[t][0].u64);
} else {
// RV64D: FMV.X.D
rddata[t].i = rsdata[t][0].u64;
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
break;
}
case 0x78: { // RV32F: FMV.S.X
rddata[t].u64 = nan_box((uint32_t)rsdata[t][0].i);
trace->fpu_type = FpuType::FNCP;
trace->used_iregs.set(rsrc0);
break;
}
case 0x79: { // RV64D: FMV.D.X
rddata[t].u64 = rsdata[t][0].i;
trace->fpu_type = FpuType::FNCP;
trace->used_iregs.set(rsrc0);
break;
}
}
update_fcrs(fflags, core_, t, warp_id_);
}
rd_write = true;
break;
}
case FMADD:
case FMSUB:
case FMNMADD:
case FMNMSUB: {
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
trace->used_fregs.set(rsrc2);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
uint32_t fflags = 0;
switch (opcode) {
case FMADD:
if (func2)
// RV32D: FMADD.D
rddata[t].u64 = rv_fmadd_d(rsdata[t][0].u64, rsdata[t][1].u64, rsdata[t][2].u64, frm, &fflags);
else
// RV32F: FMADD.S
rddata[t].u64 = nan_box(rv_fmadd_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), check_boxing(rsdata[t][2].u64), frm, &fflags));
break;
case FMSUB:
if (func2)
// RV32D: FMSUB.D
rddata[t].u64 = rv_fmsub_d(rsdata[t][0].u64, rsdata[t][1].u64, rsdata[t][2].u64, frm, &fflags);
else
// RV32F: FMSUB.S
rddata[t].u64 = nan_box(rv_fmsub_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), check_boxing(rsdata[t][2].u64), frm, &fflags));
break;
case FMNMADD:
if (func2)
// RV32D: FNMADD.D
rddata[t].u64 = rv_fnmadd_d(rsdata[t][0].u64, rsdata[t][1].u64, rsdata[t][2].u64, frm, &fflags);
else
// RV32F: FNMADD.S
rddata[t].u64 = nan_box(rv_fnmadd_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), check_boxing(rsdata[t][2].u64), frm, &fflags));
break;
case FMNMSUB:
if (func2)
// RV32D: FNMSUB.D
rddata[t].u64 = rv_fnmsub_d(rsdata[t][0].u64, rsdata[t][1].u64, rsdata[t][2].u64, frm, &fflags);
else
// RV32F: FNMSUB.S
rddata[t].u64 = nan_box(rv_fnmsub_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), check_boxing(rsdata[t][2].u64), frm, &fflags));
break;
default:
break;
}
update_fcrs(fflags, core_, t, warp_id_);
}
rd_write = true;
break;
}
case EXT1: {
switch (func7) {
case 0: {
switch (func3) {
case 0: {
// TMC
trace->exe_type = ExeType::SFU;
trace->sfu_type = SfuType::TMC;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
next_tmask.reset();
for (uint32_t t = 0; t < num_threads; ++t) {
next_tmask.set(t, rsdata.at(thread_start)[0].i & (1 << t));
}
} break;
case 1: {
// WSPAWN
trace->exe_type = ExeType::SFU;
trace->sfu_type = SfuType::WSPAWN;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
core_->wspawn(rsdata.at(thread_start)[0].i, rsdata.at(thread_start)[1].i);
} break;
case 2: {
// SPLIT
trace->exe_type = ExeType::SFU;
trace->sfu_type = SfuType::SPLIT;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
ThreadMask then_tmask, else_tmask;
for (uint32_t t = 0; t < num_threads; ++t) {
auto cond = ireg_file_.at(t).at(rsrc0);
then_tmask[t] = tmask_.test(t) && cond;
else_tmask[t] = tmask_.test(t) && !cond;
}
bool is_divergent = then_tmask.any() && else_tmask.any();
if (is_divergent) {
if (ipdom_stack_.size() == arch_.ipdom_size()) {
std::cout << "IPDOM stack is full! size=" << std::dec << ipdom_stack_.size() << ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")\n" << std::dec << std::flush;
std::abort();
}
// set new thread mask
next_tmask = then_tmask;
// push reconvergence thread mask onto the stack
ipdom_stack_.emplace(tmask_);
// push else's thread mask onto the stack
ipdom_stack_.emplace(else_tmask, next_pc);
}
// return divergent state
for (uint32_t t = thread_start; t < num_threads; ++t) {
rddata[t].i = is_divergent;
}
rd_write = true;
} break;
case 3: {
// JOIN
trace->exe_type = ExeType::SFU;
trace->sfu_type = SfuType::JOIN;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
int is_divergent = ireg_file_.at(thread_start).at(rsrc0);
if (is_divergent != 0) {
if (ipdom_stack_.empty()) {
std::cout << "IPDOM stack is empty!\n" << std::flush;
std::abort();
}
next_tmask = ipdom_stack_.top().tmask;
if (!ipdom_stack_.top().fallthrough) {
next_pc = ipdom_stack_.top().PC;
}
ipdom_stack_.pop();
}
} break;
case 4: {
// BAR
trace->exe_type = ExeType::SFU;
trace->sfu_type = SfuType::BAR;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
trace->data = std::make_shared<SFUTraceData>(rsdata[thread_start][0].i, rsdata[thread_start][1].i);
} break;
case 5: {
// PRED
trace->exe_type = ExeType::SFU;
trace->sfu_type = SfuType::PRED;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
ThreadMask pred;
for (uint32_t t = 0; t < num_threads; ++t) {
pred[t] = tmask_.test(t) && (ireg_file_.at(t).at(rsrc0) & 0x1);
}
if (pred.any()) {
next_tmask &= pred;
} else {
next_tmask = ireg_file_.at(thread_start).at(rsrc1);
}
} break;
default:
std::abort();
}
} break;
default:
std::abort();
}
} break;
case EXT2: {
switch (func3) {
case 1:
switch (func2) {
case 0: { // CMOV
trace->exe_type = ExeType::SFU;
trace->sfu_type = SfuType::CMOV;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->used_iregs.set(rsrc2);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = rsdata[t][0].i ? rsdata[t][1].i : rsdata[t][2].i;
}
rd_write = true;
} break;
default:
std::abort();
}
break;
default:
std::abort();
}
} break;
case VSET: {
uint32_t VLEN = arch_.vsize() * 8;
uint32_t VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
switch (func3) {
case 0: // vector-vector
switch (func6) {
case 0: {
auto& vr1 = vreg_file_.at(rsrc0);
auto& vr2 = vreg_file_.at(rsrc1);
auto& vd = vreg_file_.at(rdest);
auto& mask = vreg_file_.at(0);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t emask = *(uint8_t *)(mask.data() + i);
uint8_t value = emask & 0x1;
if (vmask || (!vmask && value)) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = first + second;
DP(3, "Adding " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t emask = *(uint16_t *)(mask.data() + i);
uint16_t value = emask & 0x1;
if (vmask || (!vmask && value)) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = first + second;
DP(3, "Adding " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t emask = *(uint32_t *)(mask.data() + i);
uint32_t value = emask & 0x1;
if (vmask || (!vmask && value)) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = first + second;
DP(3, "Adding " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
}
} break;
case 24: {
// vmseq
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first == second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first == second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first == second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
} break;
case 25: {
// vmsne
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first != second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first != second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first != second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
} break;
case 26: {
// vmsltu
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first < second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first < second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first < second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
} break;
case 27: {
// vmslt
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
int8_t first = *(int8_t *)(vr1.data() + i);
int8_t second = *(int8_t *)(vr2.data() + i);
int8_t result = (first < second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
int16_t first = *(int16_t *)(vr1.data() + i);
int16_t second = *(int16_t *)(vr2.data() + i);
int16_t result = (first < second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
int32_t first = *(int32_t *)(vr1.data() + i);
int32_t second = *(int32_t *)(vr2.data() + i);
int32_t result = (first < second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int32_t *)(vd.data() + i) = result;
}
}
} break;
case 28: {
// vmsleu
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first <= second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first <= second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first <= second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
} break;
case 29: {
// vmsle
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
int8_t first = *(int8_t *)(vr1.data() + i);
int8_t second = *(int8_t *)(vr2.data() + i);
int8_t result = (first <= second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
int16_t first = *(int16_t *)(vr1.data() + i);
int16_t second = *(int16_t *)(vr2.data() + i);
int16_t result = (first <= second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
int32_t first = *(int32_t *)(vr1.data() + i);
int32_t second = *(int32_t *)(vr2.data() + i);
int32_t result = (first <= second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int32_t *)(vd.data() + i) = result;
}
}
} break;
case 30: {
// vmsgtu
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first > second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first > second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first > second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
} break;
case 31: {
// vmsgt
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
int8_t first = *(int8_t *)(vr1.data() + i);
int8_t second = *(int8_t *)(vr2.data() + i);
int8_t result = (first > second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
int16_t first = *(int16_t *)(vr1.data() + i);
int16_t second = *(int16_t *)(vr2.data() + i);
int16_t result = (first > second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
int32_t first = *(int32_t *)(vr1.data() + i);
int32_t second = *(int32_t *)(vr2.data() + i);
int32_t result = (first > second) ? 1 : 0;
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int32_t *)(vd.data() + i) = result;
}
}
} break;
}
break;
case 2: {
switch (func6) {
case 24: {
// vmandnot
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = (first_value & !second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = (first_value & !second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = (first_value & !second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
case 25: {
// vmand
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = (first_value & second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = (first_value & second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = (first_value & second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
case 26: {
// vmor
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = (first_value | second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = (first_value | second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = (first_value | second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
case 27: {
// vmxor
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = (first_value ^ second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = (first_value ^ second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = (first_value ^ second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
case 28: {
// vmornot
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = (first_value | !second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = (first_value | !second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = (first_value | !second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
case 29: {
// vmnand
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = !(first_value & second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = !(first_value & second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = !(first_value & second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
case 30: {
// vmnor
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = !(first_value | second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = !(first_value | second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = !(first_value | second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
case 31: {
// vmxnor
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = !(first_value ^ second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = !(first_value ^ second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = !(first_value ^ second_value);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
case 37: {
// vmul
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first * second);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first * second);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first * second);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
case 45: {
// vmacc
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first * second);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) += result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first * second);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) += result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first * second);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) += result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
}
} break;
case 6: {
switch (func6) {
case 0: {
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (rsdata[i][0].i + second);
DP(3, "Comparing " << rsdata[i][0].i << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (rsdata[i][0].i + second);
DP(3, "Comparing " << rsdata[i][0].i << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (rsdata[i][0].i + second);
DP(3, "Comparing " << rsdata[i][0].i << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
case 37: {
// vmul.vx
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (uint32_t i = 0; i < vl_; i++) {
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (rsdata[i][0].i * second);
DP(3, "Comparing " << rsdata[i][0].i << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint8_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 16) {
for (uint32_t i = 0; i < vl_; i++) {
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (rsdata[i][0].i * second);
DP(3, "Comparing " << rsdata[i][0].i << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint16_t *)(vd.data() + i) = 0;
}
} else if (vtype_.vsew == 32) {
for (uint32_t i = 0; i < vl_; i++) {
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (rsdata[i][0].i * second);
DP(3, "Comparing " << rsdata[i][0].i << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (uint32_t i = vl_; i < VLMAX; i++) {
*(uint32_t *)(vd.data() + i) = 0;
}
}
} break;
}
} break;
case 7: {
vtype_.vill = 0;
vtype_.vediv = instr.getVediv();
vtype_.vsew = instr.getVsew();
vtype_.vlmul = instr.getVlmul();
DP(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0].i << "VLMAX" << VLMAX);
auto s0 = rsdata[0][0].u;
if (s0 <= VLMAX) {
vl_ = s0;
} else if (s0 < (2 * VLMAX)) {
vl_ = (uint32_t)ceil((s0 * 1.0) / 2.0);
} else if (s0 >= (2 * VLMAX)) {
vl_ = VLMAX;
}
rddata[0].i = vl_;
} break;
default:
std::abort();
}
} break;
default:
std::abort();
}
if (rd_write) {
trace->wb = true;
auto type = instr.getRDType();
switch (type) {
case RegType::Integer:
if (rdest) {
DPH(2, "Dest Reg: " << type << std::dec << rdest << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
ireg_file_.at(t)[rdest] = rddata[t].i;
DPN(2, "0x" << std::hex << rddata[t].i);
}
DPN(2, "}" << std::endl);
trace->used_iregs[rdest] = 1;
assert(rdest != 0);
} else {
// disable writes to x0
trace->wb = false;
}
break;
case RegType::Float:
DPH(2, "Dest Reg: " << type << std::dec << rdest << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
freg_file_.at(t)[rdest] = rddata[t].u64;
DPN(2, "0x" << std::hex << rddata[t].f);
}
DPN(2, "}" << std::endl);
trace->used_fregs[rdest] = 1;
break;
default:
std::abort();
break;
}
}
PC_ += 4;
if (PC_ != next_pc) {
DP(3, "*** Next PC=0x" << std::hex << next_pc << std::dec);
PC_ = next_pc;
}
if (tmask_ != next_tmask) {
DPH(3, "*** New Tmask=");
for (uint32_t i = 0; i < num_threads; ++i)
DPN(3, next_tmask.test(i));
DPN(3, std::endl);
tmask_ = next_tmask;
if (!next_tmask.any()) {
core_->active_warps_.reset(warp_id_);
}
}
}