From 2b8435471a05fd9630c53e4a43c913eb029c4d73 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 6 Jan 2021 18:44:06 -0800 Subject: [PATCH] speeding up simulation using dedicated full dpi-based FPU core --- driver/opae/vlsim/Makefile | 5 +- driver/opae/vlsim/opae_sim.cpp | 11 +- driver/rtlsim/Makefile | 5 +- hw/dpi/float_dpi.cpp | 264 ++++++++++++++++++ hw/dpi/float_dpi.vh | 31 +++ hw/dpi/util_dpi.cpp | 84 ++++++ hw/dpi/util_dpi.vh | 7 + hw/rtl/VX_cluster.v | 19 +- hw/rtl/VX_config.vh | 8 +- hw/rtl/VX_fpu_unit.v | 48 +++- hw/rtl/VX_lsu_unit.v | 2 +- hw/rtl/VX_mem_unit.v | 26 +- hw/rtl/Vortex.v | 16 +- hw/rtl/afu/vortex_afu.sv | 2 +- hw/rtl/fp_cores/VX_fp_div.v | 24 +- hw/rtl/fp_cores/VX_fp_dpi.v | 415 ++++++++++++++++++++++++++++ hw/rtl/fp_cores/VX_fp_fma.v | 14 - hw/rtl/fp_cores/VX_fp_fpga.v | 113 ++++---- hw/rtl/fp_cores/VX_fp_ncomp.v | 8 +- hw/rtl/fp_cores/VX_fp_sqrt.v | 26 +- hw/rtl/fp_cores/VX_fpnew.v | 14 +- hw/rtl/fp_cores/svdpi/float_dpi.cpp | 239 ---------------- hw/rtl/fp_cores/svdpi/float_dpi.vh | 20 -- hw/rtl/libs/VX_lzc.v | 2 +- hw/simulate/Makefile | 5 +- hw/simulate/simulator.cpp | 12 +- 26 files changed, 990 insertions(+), 430 deletions(-) create mode 100644 hw/dpi/float_dpi.cpp create mode 100644 hw/dpi/float_dpi.vh create mode 100644 hw/dpi/util_dpi.cpp create mode 100644 hw/dpi/util_dpi.vh create mode 100644 hw/rtl/fp_cores/VX_fp_dpi.v delete mode 100644 hw/rtl/fp_cores/svdpi/float_dpi.cpp delete mode 100644 hw/rtl/fp_cores/svdpi/float_dpi.vh diff --git a/driver/opae/vlsim/Makefile b/driver/opae/vlsim/Makefile index 709cc17d..749ae8a8 100644 --- a/driver/opae/vlsim/Makefile +++ b/driver/opae/vlsim/Makefile @@ -39,11 +39,12 @@ LDFLAGS += -shared -pthread TOP = vortex_afu_shim RTL_DIR=../../../hw/rtl +DPI_DIR=../../../hw/dpi SRCS = fpga.cpp opae_sim.cpp -SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp +SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp -FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi 
-I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src +FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip diff --git a/driver/opae/vlsim/opae_sim.cpp b/driver/opae/vlsim/opae_sim.cpp index b0391efd..b08aa1c1 100644 --- a/driver/opae/vlsim/opae_sim.cpp +++ b/driver/opae/vlsim/opae_sim.cpp @@ -8,7 +8,7 @@ #define CCI_RQ_SIZE 16 #define CCI_WQ_SIZE 16 -#define RESET_DELAY 2 +#define RESET_DELAY 4 #define ENABLE_DRAM_STALLS #define DRAM_LATENCY 24 @@ -135,19 +135,14 @@ void opae_sim::reset() { vortex_afu_->reset = 1; - vortex_afu_->clk = 0; - this->eval(); - vortex_afu_->clk = 1; - this->eval(); - - vortex_afu_->reset = 0; - for (int i = 0; i < RESET_DELAY; ++i) { vortex_afu_->clk = 0; this->eval(); vortex_afu_->clk = 1; this->eval(); } + + vortex_afu_->reset = 0; // Turn on assertion after reset Verilated::assertOn(true); diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 50bd7a4b..414616d8 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -39,11 +39,12 @@ LDFLAGS += -shared -pthread TOP = Vortex RTL_DIR = ../../hw/rtl +DPI_DIR = ../../hw/dpi SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp -SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp +SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp -FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src 
+FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS) diff --git a/hw/dpi/float_dpi.cpp b/hw/dpi/float_dpi.cpp new file mode 100644 index 00000000..f7d9a85c --- /dev/null +++ b/hw/dpi/float_dpi.cpp @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include +#include +#include "svdpi.h" +#include "verilated_vpi.h" +#include "VX_config.h" + +extern "C" { + void dpi_fadd(int a, int b, int frm, int* result, int* fflags); + void dpi_fsub(int a, int b, int frm, int* result, int* fflags); + void dpi_fmul(int a, int b, int frm, int* result, int* fflags); + void dpi_fmadd(int a, int b, int c, int frm, int* result, int* fflags); + void dpi_fmsub(int a, int b, int c, int frm, int* result, int* fflags); + void dpi_fnmadd(int a, int b, int c, int frm, int* result, int* fflags); + void dpi_fnmsub(int a, int b, int c, int frm, int* result, int* fflags); + + void dpi_fdiv(int a, int b, int frm, int* result, int* fflags); + void dpi_fsqrt(int a, int frm, int* result, int* fflags); + + void dpi_ftoi(int a, int frm, int* result, int* fflags); + void dpi_ftou(int a, int frm, int* result, int* fflags); + void dpi_itof(int a, int frm, int* result, int* fflags); + void dpi_utof(int a, int frm, int* result, int* fflags); + + void dpi_fclss(int a, int* result); + void dpi_fsgnj(int a, int* result); + void dpi_fsgnjn(int a, int* result); + void dpi_fsgnjx(int a, int* result); + + void dpi_flt(int a, int b, int* result, int* fflags); + void dpi_fle(int a, int b, int* result, int* fflags); + void dpi_feq(int a, int b, int* result, int* fflags); + void dpi_fmin(int a, int b, int* result, int* fflags); + void dpi_fmax(int a, int b, 
int* result, int* fflags);
+}
+
+union Float_t { // one 32-bit word viewed as a float (f), as raw bits (i), or as man/exp/sign bit-fields
+    float f;
+    int i;
+    struct {
+        uint32_t man : 23;
+        uint32_t exp : 8;
+        uint32_t sign : 1;
+    } parts;
+};
+
+void dpi_fadd(int a, int b, int frm, int* result, int* fflags) { // result = a + b on raw float bits; frm is not read
+    Float_t fa, fb, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fr.f = fa.f + fb.f;
+
+    *result = fr.i;
+    *fflags = 0; // exception flags are not modeled; always reported as clear
+}
+
+void dpi_fsub(int a, int b, int frm, int* result, int* fflags) { // result = a - b; frm is not read
+    Float_t fa, fb, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fr.f = fa.f - fb.f;
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fmul(int a, int b, int frm, int* result, int* fflags) { // result = a * b; frm is not read
+    Float_t fa, fb, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fr.f = fa.f * fb.f;
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fmadd(int a, int b, int c, int frm, int* result, int* fflags) { // result = (a * b) + c, fused
+    Float_t fa, fb, fc, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fc.i = c;
+    fr.f = fmaf(fa.f, fb.f, fc.f); // single rounding; a separate multiply and add would round twice
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fmsub(int a, int b, int c, int frm, int* result, int* fflags) { // result = (a * b) - c, fused
+    Float_t fa, fb, fc, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fc.i = c;
+    fr.f = fmaf(fa.f, fb.f, -fc.f); // subtraction expressed as adding the negated addend
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fnmadd(int a, int b, int c, int frm, int* result, int* fflags) { // result = -(a * b) - c, fused
+    Float_t fa, fb, fc, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fc.i = c;
+    fr.f = fmaf(-fa.f, fb.f, -fc.f); // negate the operands, not the result, so an exact zero keeps the right sign
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fnmsub(int a, int b, int c, int frm, int* result, int* fflags) { // result = -(a * b) + c, fused
+    Float_t fa, fb, fc, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fc.i = c;
+    fr.f = fmaf(-fa.f, fb.f, fc.f); // negate the product operand, not the result (see dpi_fnmadd)
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fdiv(int a, int b, int frm, int* result, int* fflags) { // result = a / b; frm is not read
+    Float_t fa, fb, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fr.f = fa.f / fb.f;
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fsqrt(int a, int frm, int* result, int* fflags) { // result = sqrt(a); frm is not read
+    Float_t fa, fr;
+
+    fa.i = a;
+    fr.f = sqrtf(fa.f);
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_ftoi(int a, int frm, int* result, int* fflags) {
+    Float_t fa, fr;
+
+    fa.i = a;
+    fr.i = int(fa.f);
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_ftou(int a, int frm, int* result, int* fflags) { // float bits -> unsigned integer (truncating); frm is not read
+    Float_t fa, fr;
+
+    fa.i = a;
+    fr.i = unsigned(fa.f); // NOTE(review): negative, NaN or >= 2^32 inputs are undefined behaviour in C++ here
+
+    *result = fr.i;
+    *fflags = 0; // exception flags are not modeled; always reported as clear
+}
+
+void dpi_itof(int a, int frm, int* result, int* fflags) { // signed integer -> float bits; frm is not read
+    Float_t fr;
+
+    fr.f = (float)a;
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_utof(int a, int frm, int* result, int* fflags) { // unsigned integer -> float bits; frm is not read
+    Float_t fr;
+
+    unsigned ua = a; // reinterpret the 32-bit argument as unsigned before converting
+    fr.f = (float)ua;
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_flt(int a, int b, int* result, int* fflags) { // result = 1 if a < b, else 0
+    Float_t fa, fb, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fr.i = (fa.f < fb.f); // integer 0/1; storing through .f would return the bit pattern of 1.0f
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fle(int a, int b, int* result, int* fflags) { // result = 1 if a <= b, else 0
+    Float_t fa, fb, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fr.i = (fa.f <= fb.f); // integer 0/1, not a float
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_feq(int a, int b, int* result, int* fflags) { // result = 1 if a == b, else 0
+    Float_t fa, fb, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fr.i = (fa.f == fb.f); // integer 0/1, not a float
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fmin(int a, int b, int* result, int* fflags) { // result = smaller of a and b, as float bits
+    Float_t fa, fb, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fr.f = fminf(fa.f, fb.f); // returns the numeric operand when exactly one input is NaN
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fmax(int a, int b, int* result, int* fflags) { // result = larger of a and b, as float bits
+    Float_t fa, fb, fr;
+
+    fa.i = a;
+    fb.i = b;
+    fr.f = fmaxf(fa.f, fb.f); // returns the numeric operand when exactly one input is NaN
+
+    *result = fr.i;
+    *fflags = 0;
+}
+
+void dpi_fclss(int a, int* result) {
+    // TODO: classification is not implemented yet; always returns 0
+    *result = 0;
+}
+
+void dpi_fsgnj(int a, int* result) {
+    // TODO: sign injection is not implemented yet (only one operand is passed in); always returns 0
+    *result = 0;
+}
+
+void dpi_fsgnjn(int a, int* result) {
+    // TODO: negated sign injection is not implemented yet; always returns 0
+    *result = 0;
+}
+
+void dpi_fsgnjx(int a, int* result) {
+    // TODO: xor sign injection is not implemented yet; always returns 0
+    *result = 0;
+}
\ No newline at end of file
diff --git a/hw/dpi/float_dpi.vh b/hw/dpi/float_dpi.vh
new file mode 100644
index 00000000..62f45432
--- /dev/null
+++ b/hw/dpi/float_dpi.vh
@@ -0,0 +1,31 @@
+`ifndef FLOAT_DPI
+`define FLOAT_DPI
+
+import "DPI-C" context function void dpi_fadd(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fsub(input
int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags); +import "DPI-C" context function void dpi_fmul(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags); +import "DPI-C" context function void dpi_fmadd(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags); +import "DPI-C" context function void dpi_fmsub(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags); +import "DPI-C" context function void dpi_fnmadd(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags); +import "DPI-C" context function void dpi_fnmsub(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags); + +import "DPI-C" context function void dpi_fdiv(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags); +import "DPI-C" context function void dpi_fsqrt(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags); + +import "DPI-C" context function void dpi_ftoi(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags); +import "DPI-C" context function void dpi_ftou(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags); +import "DPI-C" context function void dpi_itof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags); +import "DPI-C" context function void dpi_utof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags); + +import "DPI-C" context function void dpi_fclss(input int a, output int result); +import "DPI-C" context function void dpi_fsgnj(input int a, output int result); +import "DPI-C" context function void dpi_fsgnjn(input int a, output int result); +import "DPI-C" context function void dpi_fsgnjx(input int a, output int result); + +import "DPI-C" context function void dpi_flt(input int a, input int b, output 
int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fle(input int a, input int b, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_feq(input int a, input int b, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fmin(input int a, input int b, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fmax(input int a, input int b, output int result, output bit[4:0] fflags);
+
+`endif
\ No newline at end of file
diff --git a/hw/dpi/util_dpi.cpp b/hw/dpi/util_dpi.cpp
new file mode 100644
index 00000000..e9baa90c
--- /dev/null
+++ b/hw/dpi/util_dpi.cpp
@@ -0,0 +1,84 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include "svdpi.h"
+#include "verilated_vpi.h"
+#include "VX_config.h"
+
+extern "C" {
+    int dpi_register();
+    void dpi_assert(int inst, bool cond, int delay);
+}
+
+class ShiftRegister { // fixed-depth FIFO of ints: push() enters at the back, top() reads the front
+public:
+    ShiftRegister() : init_(false), depth_(0) {}
+
+    void ensure_init(int depth) { // sizes the buffer on the first call only; later calls are no-ops
+        if (!init_) {
+            depth_ = (depth > 0) ? depth : 1; // clamp: zero or negative depth would wrap depth_-1 below
+            buffer_.resize(depth_); // new slots are value-initialized to 0
+            init_ = true;
+        }
+    }
+
+    void push(int value, bool enable) { // shifts every slot one step toward the front, then stores value last
+        if (!enable || depth_ == 0) // depth_ == 0 means ensure_init() has not run yet
+            return;
+        for (unsigned i = 0; i + 1 < depth_; ++i) {
+            buffer_[i] = buffer_[i+1];
+        }
+        buffer_[depth_-1] = value;
+    }
+
+    int top() const { // oldest value still held; 0 if the register was never initialized
+        return buffer_.empty() ? 0 : buffer_[0];
+    }
+
+private:
+
+    std::vector<int> buffer_;
+    bool init_;
+    unsigned depth_;
+};
+
+class Instances { // registry of ShiftRegisters addressed by the index allocate() handed out
+public:
+    ShiftRegister& get(int inst) { // throws std::out_of_range for an index that was never allocated
+        return instances_.at(inst);
+    }
+
+    int allocate() { // appends a fresh ShiftRegister and returns its index
+        std::lock_guard<std::mutex> guard(mutex_); // released on every exit path, including a throwing resize
+        int inst = static_cast<int>(instances_.size());
+        instances_.resize(inst + 1);
+        // no explicit unlock: guard's destructor releases mutex_
+        return inst;
+    }
+
+private:
+    std::vector<ShiftRegister> instances_;
+    std::mutex mutex_;
+};
+
+Instances instances;
+
+int dpi_register() { // DPI entry point: reserves one ShiftRegister and returns its handle
+    return instances.allocate();
+}
+
+void dpi_assert(int inst, bool cond, int delay) {
+    ShiftRegister& sr = instances.get(inst);
+
+    sr.ensure_init(delay);
+    sr.push(!cond, 1); // record a failure flag (1) whenever cond is false
+
+    auto status = sr.top();
+    if (status) {
+ 
printf("delayed assertion at %s!\n", svGetNameFromScope(svGetScope())); + std::abort(); + } +} \ No newline at end of file diff --git a/hw/dpi/util_dpi.vh b/hw/dpi/util_dpi.vh new file mode 100644 index 00000000..77294974 --- /dev/null +++ b/hw/dpi/util_dpi.vh @@ -0,0 +1,7 @@ +`ifndef UTIL_DPI +`define UTIL_DPI + +import "DPI-C" context function int dpi_register(); +import "DPI-C" context function void dpi_assert(int inst, input logic cond, input int delay); + +`endif \ No newline at end of file diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 9f3fc8c7..800e23ac 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -40,7 +40,8 @@ module VX_cluster #( // Status output wire busy, output wire ebreak -); +); + wire [`NUM_CORES-1:0] per_core_dram_req_valid; wire [`NUM_CORES-1:0] per_core_dram_req_rw; wire [`NUM_CORES-1:0][`DDRAM_BYTEEN_WIDTH-1:0] per_core_dram_req_byteen; @@ -70,15 +71,13 @@ module VX_cluster #( for (genvar i = 0; i < `NUM_CORES; i++) begin wire core_reset; - if (`NUM_CORES > 1) begin - reg core_reset_r; - always @(posedge clk) begin - core_reset_r <= reset; - end - assign core_reset = core_reset_r; - end else begin - assign core_reset = reset; - end + VX_reset_relay #( + .PASSTHRU (`NUM_CORES == 1) + ) reset_relay ( + .clk (clk), + .reset (reset), + .reset_out (core_reset) + ); VX_core #( .CORE_ID(i + (CLUSTER_ID * `NUM_CORES)) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index e1c6241b..2a63b73d 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -92,8 +92,8 @@ `define LATENCY_IMUL 3 `endif -`ifndef LATENCY_FNCOMP -`define LATENCY_FNCOMP 2 +`ifndef LATENCY_FNCP +`define LATENCY_FNCP 2 `endif `ifndef LATENCY_FMA @@ -128,8 +128,8 @@ `define LATENCY_FDIVSQRT 32 `endif -`ifndef LATENCY_FCONV -`define LATENCY_FCONV 4 +`ifndef LATENCY_FCVT +`define LATENCY_FCVT 4 `endif // CSR Addresses ////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index 
1ca5c66c..ba320873 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -63,11 +63,11 @@ module VX_fpu_unit #( // resolve dynamic FRM from CSR assign fpu_to_csr_if.read_wid = fpu_req_if.wid; - wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod; + wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod; `ifdef FPU_FAST - VX_fp_fpga #( + VX_fp_dpi #( .TAGW (FPUQ_BITS) ) fp_core ( .clk (clk), @@ -91,21 +91,51 @@ module VX_fpu_unit #( .tag_out (tag_out), + .ready_out (ready_out), + .valid_out (valid_out) + ); + +`elsif FPU_FPNEW + + VX_fpnew #( + .FMULADD (1), + .FDIVSQRT (1), + .FNONCOMP (1), + .FCONV (1), + .TAGW (FPUQ_BITS) + ) fp_core ( + .clk (clk), + .reset (reset), + + .valid_in (valid_in), + .ready_in (ready_in), + + .tag_in (tag_in), + + .op_type (fpu_req_if.op_type), + .frm (fpu_frm), + + .dataa (fpu_req_if.rs1_data), + .datab (fpu_req_if.rs2_data), + .datac (fpu_req_if.rs3_data), + .result (result), + + .has_fflags (has_fflags), + .fflags (fflags), + + .tag_out (tag_out), + .ready_out (ready_out), .valid_out (valid_out) ); `else - VX_fpnew #( - .FMULADD (1), - .FDIVSQRT (1), - .FNONCOMP (1), - .FCONV (1), - .TAGW (FPUQ_BITS) + VX_fp_fpga #( + .TAGW (FPUQ_BITS) ) fp_core ( .clk (clk), - .reset (reset), + .reset (fpu_reset), .valid_in (valid_in), .ready_in (ready_in), diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 34476b11..ef2f56d0 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -64,7 +64,7 @@ module VX_lsu_unit #( assign mem_req_addr[i] = full_address[i][31:2]; assign mem_req_offset[i] = full_address[i][1:0]; assign mem_req_byteen[i] = wmask << full_address[i][1:0]; - assign mem_req_data[i] = lsu_req_if.store_data[i] << {mem_req_offset[i], 3'b0}; + assign mem_req_data[i] = lsu_req_if.store_data[i] << {full_address[i][1:0], 3'b0}; end `IGNORE_WARNINGS_BEGIN diff --git a/hw/rtl/VX_mem_unit.v 
b/hw/rtl/VX_mem_unit.v index eab11f0b..a04e4fcf 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -79,7 +79,17 @@ module VX_mem_unit # ( .cache_rsp_if (dcache_rsp_if), .smem_rsp_if (smem_rsp_if), .core_rsp_if (core_dcache_rsp_if) - ); + ); + + wire icache_reset, dcache_reset; + + VX_reset_relay #( + .NUM_NODES (2) + ) reset_relay ( + .clk (clk), + .reset (reset), + .reset_out ({dcache_reset, icache_reset}) + ); VX_cache #( .CACHE_ID (`ICACHE_ID), @@ -102,7 +112,7 @@ module VX_mem_unit # ( `SCOPE_BIND_VX_mem_unit_icache .clk (clk), - .reset (reset), + .reset (icache_reset), // Core request .core_req_valid (core_icache_req_if.valid), @@ -160,7 +170,7 @@ module VX_mem_unit # ( `SCOPE_BIND_VX_mem_unit_dcache .clk (clk), - .reset (reset), + .reset (dcache_reset), // Core req .core_req_valid (dcache_req_if.valid), @@ -199,6 +209,14 @@ module VX_mem_unit # ( if (`SM_ENABLE) begin + wire scache_reset; + + VX_reset_relay reset_relay ( + .clk (clk), + .reset (reset), + .reset_out (scache_reset) + ); + VX_cache #( .CACHE_ID (`SCACHE_ID), .CACHE_SIZE (`SMEM_SIZE), @@ -220,7 +238,7 @@ module VX_mem_unit # ( `SCOPE_BIND_VX_mem_unit_smem .clk (clk), - .reset (reset), + .reset (scache_reset), // Core request .core_req_valid (smem_req_if.valid), diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 31db4a99..b8fb6658 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -72,15 +72,13 @@ module Vortex ( for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin wire cluster_reset; - if (`NUM_CLUSTERS > 1) begin - reg cluster_reset_r; - always @(posedge clk) begin - cluster_reset_r <= reset; - end - assign cluster_reset = cluster_reset_r; - end else begin - assign cluster_reset = reset; - end + VX_reset_relay #( + .PASSTHRU (`NUM_CLUSTERS == 1) + ) reset_relay ( + .clk (clk), + .reset (reset), + .reset_out (cluster_reset) + ); VX_cluster #( .CLUSTER_ID(i) diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index a39b8135..efec4550 100644 --- a/hw/rtl/afu/vortex_afu.sv 
+++ b/hw/rtl/afu/vortex_afu.sv @@ -37,7 +37,7 @@ module vortex_afu #( output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select ); -localparam RESET_DELAY = 2; +localparam RESET_DELAY = 3; localparam DRAM_ADDR_WIDTH = $bits(t_local_mem_addr); localparam DRAM_LINE_WIDTH = $bits(t_local_mem_data); diff --git a/hw/rtl/fp_cores/VX_fp_div.v b/hw/rtl/fp_cores/VX_fp_div.v index 48218671..be06b7e2 100644 --- a/hw/rtl/fp_cores/VX_fp_div.v +++ b/hw/rtl/fp_cores/VX_fp_div.v @@ -1,9 +1,5 @@ `include "VX_define.vh" -`ifndef SYNTHESIS -`include "float_dpi.vh" -`endif - module VX_fp_div #( parameter TAGW = 1, parameter LANES = 1 @@ -32,26 +28,24 @@ module VX_fp_div #( ); wire stall = ~ready_out && valid_out; wire enable = ~stall; + + wire _reset; + + VX_reset_relay reset_relay ( + .clk (clk), + .reset (reset), + .reset_out (_reset) + ); for (genvar i = 0; i < LANES; i++) begin - `ifdef QUARTUS acl_fdiv fdiv ( .clk (clk), - .areset (reset), + .areset (_reset), .en (enable), .a (dataa[i]), .b (datab[i]), .q (result[i]) ); - `else - integer fdiv_h; - initial begin - fdiv_h = dpi_register(); - end - always @(posedge clk) begin - dpi_fdiv (fdiv_h, enable, dataa[i], datab[i], `LATENCY_FDIV, result[i]); - end - `endif end VX_shift_register #( diff --git a/hw/rtl/fp_cores/VX_fp_dpi.v b/hw/rtl/fp_cores/VX_fp_dpi.v new file mode 100644 index 00000000..6f0ffb28 --- /dev/null +++ b/hw/rtl/fp_cores/VX_fp_dpi.v @@ -0,0 +1,415 @@ +`ifndef SYNTHESIS + +`include "VX_define.vh" +`include "float_dpi.vh" + +module VX_fp_dpi #( + parameter TAGW = 1 +) ( + input wire clk, + input wire reset, + + input wire valid_in, + output wire ready_in, + + input wire [TAGW-1:0] tag_in, + + input wire [`FPU_BITS-1:0] op_type, + input wire [`MOD_BITS-1:0] frm, + + input wire [`NUM_THREADS-1:0][31:0] dataa, + input wire [`NUM_THREADS-1:0][31:0] datab, + input wire [`NUM_THREADS-1:0][31:0] datac, + output wire [`NUM_THREADS-1:0][31:0] result, + + output wire has_fflags, + output fflags_t [`NUM_THREADS-1:0] 
fflags, + + output wire [TAGW-1:0] tag_out, + + input wire ready_out, + output wire valid_out +); + localparam FPU_FMA = 0; + localparam FPU_DIV = 1; + localparam FPU_SQRT = 2; + localparam FPU_CVT = 3; + localparam FPU_NCP = 4; + localparam NUM_FPC = 5; + localparam FPC_BITS = `LOG2UP(NUM_FPC); + + wire [NUM_FPC-1:0] per_core_ready_in; + wire [NUM_FPC-1:0][`NUM_THREADS-1:0][31:0] per_core_result; + wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out; + reg [NUM_FPC-1:0] per_core_ready_out; + wire [NUM_FPC-1:0] per_core_valid_out; + + wire [NUM_FPC-1:0] per_core_has_fflags; + fflags_t [NUM_FPC-1:0][`NUM_THREADS-1:0] per_core_fflags; + + reg [FPC_BITS-1:0] core_select; + + reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub; + reg is_itof, is_utof, is_ftoi, is_ftou; + reg is_fclss, is_flt, is_fle, is_feq, is_fmin, is_fmax, is_fsgnj, is_fsgnjn, is_fsgnjx; + + always @(*) begin + is_fadd = 0; + is_fsub = 0; + is_fmul = 0; + is_fmadd = 0; + is_fmsub = 0; + is_fnmadd = 0; + is_fnmsub = 0; + is_itof = 0; + is_utof = 0; + is_ftoi = 0; + is_ftou = 0; + is_fclss = 0; + is_flt = 0; + is_fle = 0; + is_feq = 0; + is_fmin = 0; + is_fmax = 0; + is_fsgnj = 0; + is_fsgnjn = 0; + is_fsgnjx = 0; + + case (op_type) + `FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end + `FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; end + `FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end + `FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end + `FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end + `FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end + `FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end + `FPU_DIV: begin core_select = FPU_DIV; end + `FPU_SQRT: begin core_select = FPU_SQRT; end + `FPU_CVTWS: begin core_select = FPU_CVT; is_ftoi = 1; end + `FPU_CVTWUS:begin core_select = FPU_CVT; is_ftou = 1; end + `FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; end + `FPU_CVTSWU:begin core_select = FPU_CVT; is_utof = 1; end + `FPU_CLASS: begin 
core_select = FPU_NCP; is_fclss = 1; end + `FPU_CMP: begin core_select = FPU_NCP; + is_fle = (frm == 0); + is_flt = (frm == 1); + is_feq = (frm == 2); + end + default: begin core_select = FPU_NCP; + is_fsgnj = (frm == 0); + is_fsgnjn = (frm == 1); + is_fsgnjx = (frm == 2); + is_fmin = (frm == 3); + is_fmax = (frm == 4); + end + endcase + end + + generate + begin : fma + + wire [`NUM_THREADS-1:0][31:0] result_fma; + wire [`NUM_THREADS-1:0][31:0] result_fadd; + wire [`NUM_THREADS-1:0][31:0] result_fsub; + wire [`NUM_THREADS-1:0][31:0] result_fmul; + wire [`NUM_THREADS-1:0][31:0] result_fmadd; + wire [`NUM_THREADS-1:0][31:0] result_fmsub; + wire [`NUM_THREADS-1:0][31:0] result_fnmadd; + wire [`NUM_THREADS-1:0][31:0] result_fnmsub; + + fflags_t [`NUM_THREADS-1:0] fflags_fma; + fflags_t [`NUM_THREADS-1:0] fflags_fadd; + fflags_t [`NUM_THREADS-1:0] fflags_fsub; + fflags_t [`NUM_THREADS-1:0] fflags_fmul; + fflags_t [`NUM_THREADS-1:0] fflags_fmadd; + fflags_t [`NUM_THREADS-1:0] fflags_fmsub; + fflags_t [`NUM_THREADS-1:0] fflags_fnmadd; + fflags_t [`NUM_THREADS-1:0] fflags_fnmsub; + + always @(*) begin + for (integer i = 0; i < `NUM_THREADS; i++) begin + dpi_fadd (dataa[i], datab[i], frm, result_fadd[i], fflags_fadd[i]); + dpi_fsub (dataa[i], datab[i], frm, result_fsub[i], fflags_fsub[i]); + dpi_fmul (dataa[i], datab[i], frm, result_fmul[i], fflags_fmul[i]); + dpi_fmadd (dataa[i], datab[i], datac[i], frm, result_fmadd[i], fflags_fmadd[i]); + dpi_fmsub (dataa[i], datab[i], datac[i], frm, result_fmsub[i], fflags_fmsub[i]); + dpi_fnmadd (dataa[i], datab[i], datac[i], frm, result_fnmadd[i], fflags_fnmadd[i]); + dpi_fnmsub (dataa[i], datab[i], datac[i], frm, result_fnmsub[i], fflags_fnmsub[i]); + end + end + + assign result_fma = is_fadd ? result_fadd : + is_fsub ? result_fsub : + is_fmul ? result_fmul : + is_fmadd ? result_fmadd : + is_fmsub ? result_fmsub : + is_fnmadd ? result_fnmadd : + is_fnmsub ? result_fnmsub : + 0; + + assign fflags_fma = is_fadd ? 
fflags_fadd : + is_fsub ? fflags_fsub : + is_fmul ? fflags_fmul : + is_fmadd ? fflags_fmadd : + is_fmsub ? fflags_fmsub : + is_fnmadd ? fflags_fnmadd : + is_fnmsub ? fflags_fnmsub : + 0; + + wire enable = per_core_ready_out[FPU_FMA] || ~per_core_valid_out[FPU_FMA]; + wire valid = (valid_in && core_select == FPU_FMA); + + VX_shift_register #( + .DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))), + .DEPTH (`LATENCY_FMA), + .RESETW (1) + ) shift_reg ( + .clk (clk), + .reset (reset), + .enable (enable), + .data_in ({valid, tag_in, result_fma, fflags_fma}), + .data_out ({per_core_valid_out[FPU_FMA], per_core_tag_out[FPU_FMA], per_core_result[FPU_FMA], per_core_fflags[FPU_FMA]}) + ); + + assign per_core_has_fflags[FPU_FMA] = 1; + assign per_core_ready_in[FPU_FMA] = enable; + + end + endgenerate + + generate + begin : fdiv + + wire [`NUM_THREADS-1:0][31:0] result_fdiv; + fflags_t [`NUM_THREADS-1:0] fflags_fdiv; + + always @(*) begin + for (integer i = 0; i < `NUM_THREADS; i++) begin + dpi_fdiv (dataa[i], datab[i], frm, result_fdiv[i], fflags_fdiv[i]); + end + end + + wire enable = per_core_ready_out[FPU_DIV] || ~per_core_valid_out[FPU_DIV]; + wire valid = (valid_in && core_select == FPU_DIV); + + VX_shift_register #( + .DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))), + .DEPTH (`LATENCY_FDIV), + .RESETW (1) + ) shift_reg ( + .clk (clk), + .reset (reset), + .enable (enable), + .data_in ({valid, tag_in, result_fdiv, fflags_fdiv}), + .data_out ({per_core_valid_out[FPU_DIV], per_core_tag_out[FPU_DIV], per_core_result[FPU_DIV], per_core_fflags[FPU_DIV]}) + ); + + assign per_core_has_fflags[FPU_DIV] = 1; + assign per_core_ready_in[FPU_DIV] = enable; + + end + endgenerate + + generate + begin : fsqrt + + wire [`NUM_THREADS-1:0][31:0] result_fsqrt; + fflags_t [`NUM_THREADS-1:0] fflags_fsqrt; + + always @(*) begin + for (integer i = 0; i < `NUM_THREADS; i++) begin + dpi_fsqrt (dataa[i], frm, result_fsqrt[i], fflags_fsqrt[i]); + end + end + + wire enable = 
per_core_ready_out[FPU_SQRT] || ~per_core_valid_out[FPU_SQRT]; + wire valid = (valid_in && core_select == FPU_SQRT); + + VX_shift_register #( + .DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))), + .DEPTH (`LATENCY_FSQRT), + .RESETW (1) + ) shift_reg ( + .clk (clk), + .reset (reset), + .enable (enable), + .data_in ({valid, tag_in, result_fsqrt, fflags_fsqrt}), + .data_out ({per_core_valid_out[FPU_SQRT], per_core_tag_out[FPU_SQRT], per_core_result[FPU_SQRT], per_core_fflags[FPU_SQRT]}) + ); + + assign per_core_has_fflags[FPU_SQRT] = 1; + assign per_core_ready_in[FPU_SQRT] = enable; + + end + endgenerate + + generate + begin : fcvt + + wire [`NUM_THREADS-1:0][31:0] result_fcvt; + wire [`NUM_THREADS-1:0][31:0] result_itof; + wire [`NUM_THREADS-1:0][31:0] result_utof; + wire [`NUM_THREADS-1:0][31:0] result_ftoi; + wire [`NUM_THREADS-1:0][31:0] result_ftou; + + fflags_t [`NUM_THREADS-1:0] fflags_fcvt; + fflags_t [`NUM_THREADS-1:0] fflags_itof; + fflags_t [`NUM_THREADS-1:0] fflags_utof; + fflags_t [`NUM_THREADS-1:0] fflags_ftoi; + fflags_t [`NUM_THREADS-1:0] fflags_ftou; + + always @(*) begin + for (integer i = 0; i < `NUM_THREADS; i++) begin + dpi_itof (dataa[i], frm, result_itof[i], fflags_itof[i]); + dpi_utof (dataa[i], frm, result_utof[i], fflags_utof[i]); + dpi_ftoi (dataa[i], frm, result_ftoi[i], fflags_ftoi[i]); + dpi_ftou (dataa[i], frm, result_ftou[i], fflags_ftou[i]); + end + end + + assign result_fcvt = is_itof ? result_itof : + is_utof ? result_utof : + is_ftoi ? result_ftoi : + is_ftou ? result_ftou : + 0; + + assign fflags_fcvt = is_itof ? fflags_itof : + is_utof ? fflags_utof : + is_ftoi ? fflags_ftoi : + is_ftou ? 
fflags_ftou : + 0; + + wire enable = per_core_ready_out[FPU_CVT] || ~per_core_valid_out[FPU_CVT]; + wire valid = (valid_in && core_select == FPU_CVT); + + VX_shift_register #( + .DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))), + .DEPTH (`LATENCY_FCVT), + .RESETW (1) + ) shift_reg ( + .clk (clk), + .reset (reset), + .enable (enable), + .data_in ({valid, tag_in, result_fcvt, fflags_fcvt}), + .data_out ({per_core_valid_out[FPU_CVT], per_core_tag_out[FPU_CVT], per_core_result[FPU_CVT], per_core_fflags[FPU_CVT]}) + ); + + assign per_core_has_fflags[FPU_CVT] = 1; + assign per_core_ready_in[FPU_CVT] = enable; + + end + endgenerate + + generate + begin : fncp + + wire [`NUM_THREADS-1:0][31:0] result_fncp; + wire [`NUM_THREADS-1:0][31:0] result_fclss; + wire [`NUM_THREADS-1:0][31:0] result_flt; + wire [`NUM_THREADS-1:0][31:0] result_fle; + wire [`NUM_THREADS-1:0][31:0] result_feq; + wire [`NUM_THREADS-1:0][31:0] result_fmin; + wire [`NUM_THREADS-1:0][31:0] result_fmax; + wire [`NUM_THREADS-1:0][31:0] result_fsgnj; + wire [`NUM_THREADS-1:0][31:0] result_fsgnjn; + wire [`NUM_THREADS-1:0][31:0] result_fsgnjx; + reg [`NUM_THREADS-1:0][31:0] result_fmv; + + fflags_t [`NUM_THREADS-1:0] fflags_fncp; + fflags_t [`NUM_THREADS-1:0] fflags_flt; + fflags_t [`NUM_THREADS-1:0] fflags_fle; + fflags_t [`NUM_THREADS-1:0] fflags_feq; + fflags_t [`NUM_THREADS-1:0] fflags_fmin; + fflags_t [`NUM_THREADS-1:0] fflags_fmax; + + always @(*) begin + for (integer i = 0; i < `NUM_THREADS; i++) begin + dpi_fclss (dataa[i], result_fclss[i]); + dpi_flt (dataa[i], datab[i], result_flt[i], fflags_flt[i]); + dpi_fle (dataa[i], datab[i], result_fle[i], fflags_fle[i]); + dpi_feq (dataa[i], datab[i], result_feq[i], fflags_feq[i]); + dpi_fmin (dataa[i], datab[i], result_fmin[i], fflags_fmin[i]); + dpi_fmax (dataa[i], datab[i], result_fmax[i], fflags_fmax[i]); + dpi_fsgnj (dataa[i], result_fsgnj[i]); + dpi_fsgnjn (dataa[i], result_fsgnjn[i]); + dpi_fsgnjx (dataa[i], result_fsgnjx[i]); + result_fmv[i] = 
dataa[i]; + end + end + + assign result_fncp = is_fclss ? result_fclss : + is_flt ? result_flt : + is_fle ? result_fle : + is_feq ? result_feq : + is_fmin ? result_fmin : + is_fmax ? result_fmax : + is_fsgnj ? result_fsgnj : + is_fsgnjn ? result_fsgnjn : + is_fsgnjx ? result_fsgnjx : + result_fmv; + + wire has_fflags_fncp = (is_flt || is_fle || is_feq || is_fmin || is_fmax); + + assign fflags_fncp = is_flt ? fflags_flt : + is_fle ? fflags_fle : + is_feq ? fflags_feq : + is_fmin ? fflags_fmin : + is_fmax ? fflags_fmax : + 0; + + wire enable = per_core_ready_out[FPU_NCP] || ~per_core_valid_out[FPU_NCP]; + wire valid = (valid_in && core_select == FPU_NCP); + + VX_shift_register #( + .DATAW (1 + TAGW + 1 + `NUM_THREADS * (32 + $bits(fflags_t))), + .DEPTH (`LATENCY_FNCP), + .RESETW (1) + ) shift_reg ( + .clk (clk), + .reset (reset), + .enable (enable), + .data_in ({valid, tag_in, has_fflags_fncp, result_fncp, fflags_fncp}), + .data_out ({per_core_valid_out[FPU_NCP], per_core_tag_out[FPU_NCP], per_core_has_fflags[FPU_NCP], per_core_result[FPU_NCP], per_core_fflags[FPU_NCP]}) + ); + + assign per_core_ready_in[FPU_NCP] = enable; + + end + endgenerate + + /////////////////////////////////////////////////////////////////////////// + + reg has_fflags_n; + fflags_t [`NUM_THREADS-1:0] fflags_n; + reg [`NUM_THREADS-1:0][31:0] result_n; + reg [TAGW-1:0] tag_out_n; + + always @(*) begin + per_core_ready_out = 0; + has_fflags_n = 'x; + fflags_n = 'x; + result_n = 'x; + tag_out_n = 'x; + for (integer i = 0; i < NUM_FPC; i++) begin + if (per_core_valid_out[i]) begin + has_fflags_n = per_core_has_fflags[i]; + fflags_n = per_core_fflags[i]; + result_n = per_core_result[i]; + tag_out_n = per_core_tag_out[i]; + per_core_ready_out[i] = ready_out; + break; + end + end + end + + assign valid_out = (| per_core_valid_out); + assign has_fflags = has_fflags_n; + assign tag_out = tag_out_n; + assign result = result_n; + assign fflags = fflags_n; + + assign ready_in = 
per_core_ready_in[core_select]; + +endmodule + +`endif \ No newline at end of file diff --git a/hw/rtl/fp_cores/VX_fp_fma.v b/hw/rtl/fp_cores/VX_fp_fma.v index 1029c2c2..3cd1b2e5 100644 --- a/hw/rtl/fp_cores/VX_fp_fma.v +++ b/hw/rtl/fp_cores/VX_fp_fma.v @@ -1,9 +1,5 @@ `include "VX_define.vh" -`ifndef SYNTHESIS -`include "float_dpi.vh" -`endif - module VX_fp_fma #( parameter TAGW = 1, parameter LANES = 1 @@ -63,7 +59,6 @@ module VX_fp_fma #( end end - `ifdef QUARTUS acl_fmadd fmadd ( .clk (clk), .areset (reset), @@ -73,15 +68,6 @@ module VX_fp_fma #( .c (c), .q (result[i]) ); - `else - integer fmadd_h; - initial begin - fmadd_h = dpi_register(); - end - always @(posedge clk) begin - dpi_fmadd (fmadd_h, enable, a, b, c, `LATENCY_FMA, result[i]); - end - `endif end VX_shift_register #( diff --git a/hw/rtl/fp_cores/VX_fp_fpga.v b/hw/rtl/fp_cores/VX_fp_fpga.v index 85553c69..c96399b4 100644 --- a/hw/rtl/fp_cores/VX_fp_fpga.v +++ b/hw/rtl/fp_cores/VX_fp_fpga.v @@ -27,6 +27,11 @@ module VX_fp_fpga #( input wire ready_out, output wire valid_out ); + localparam FPU_FMA = 0; + localparam FPU_DIV = 1; + localparam FPU_SQRT = 2; + localparam FPU_CVT = 3; + localparam FPU_NCP = 4; localparam NUM_FPC = 5; localparam FPC_BITS = `LOG2UP(NUM_FPC); @@ -49,20 +54,20 @@ module VX_fp_fpga #( is_itof = 'x; is_signed = 'x; case (op_type) - `FPU_ADD: begin core_select = 0; do_madd = 0; do_sub = 0; do_neg = 0; end - `FPU_SUB: begin core_select = 0; do_madd = 0; do_sub = 1; do_neg = 0; end - `FPU_MUL: begin core_select = 0; do_madd = 0; do_sub = 0; do_neg = 1; end - `FPU_MADD: begin core_select = 0; do_madd = 1; do_sub = 0; do_neg = 0; end - `FPU_MSUB: begin core_select = 0; do_madd = 1; do_sub = 1; do_neg = 0; end - `FPU_NMADD: begin core_select = 0; do_madd = 1; do_sub = 0; do_neg = 1; end - `FPU_NMSUB: begin core_select = 0; do_madd = 1; do_sub = 1; do_neg = 1; end - `FPU_DIV: begin core_select = 1; end - `FPU_SQRT: begin core_select = 2; end - `FPU_CVTWS: begin core_select = 3; is_itof 
= 0; is_signed = 1; end - `FPU_CVTWUS: begin core_select = 3; is_itof = 0; is_signed = 0; end - `FPU_CVTSW: begin core_select = 3; is_itof = 1; is_signed = 1; end - `FPU_CVTSWU: begin core_select = 3; is_itof = 1; is_signed = 0; end - default: begin core_select = 4; end + `FPU_ADD: begin core_select = FPU_FMA; do_madd = 0; do_sub = 0; do_neg = 0; end + `FPU_SUB: begin core_select = FPU_FMA; do_madd = 0; do_sub = 1; do_neg = 0; end + `FPU_MUL: begin core_select = FPU_FMA; do_madd = 0; do_sub = 0; do_neg = 1; end + `FPU_MADD: begin core_select = FPU_FMA; do_madd = 1; do_sub = 0; do_neg = 0; end + `FPU_MSUB: begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 0; end + `FPU_NMADD: begin core_select = FPU_FMA; do_madd = 1; do_sub = 0; do_neg = 1; end + `FPU_NMSUB: begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 1; end + `FPU_DIV: begin core_select = FPU_DIV; end + `FPU_SQRT: begin core_select = FPU_SQRT; end + `FPU_CVTWS: begin core_select = FPU_CVT; is_itof = 0; is_signed = 1; end + `FPU_CVTWUS: begin core_select = FPU_CVT; is_itof = 0; is_signed = 0; end + `FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end + `FPU_CVTSWU: begin core_select = FPU_CVT; is_itof = 1; is_signed = 0; end + default: begin core_select = FPU_NCP; end endcase end @@ -72,8 +77,8 @@ module VX_fp_fpga #( ) fp_fma ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 0)), - .ready_in (per_core_ready_in[0]), + .valid_in (valid_in && (core_select == FPU_FMA)), + .ready_in (per_core_ready_in[FPU_FMA]), .tag_in (tag_in), .frm (frm), .do_madd (do_madd), @@ -82,12 +87,12 @@ module VX_fp_fpga #( .dataa (dataa), .datab (datab), .datac (datac), - .has_fflags (per_core_has_fflags[0]), - .fflags (per_core_fflags[0]), - .result (per_core_result[0]), - .tag_out (per_core_tag_out[0]), - .ready_out (per_core_ready_out[0]), - .valid_out (per_core_valid_out[0]) + .has_fflags (per_core_has_fflags[FPU_FMA]), + .fflags (per_core_fflags[FPU_FMA]), + 
.result (per_core_result[FPU_FMA]), + .tag_out (per_core_tag_out[FPU_FMA]), + .ready_out (per_core_ready_out[FPU_FMA]), + .valid_out (per_core_valid_out[FPU_FMA]) ); VX_fp_div #( @@ -96,18 +101,18 @@ module VX_fp_fpga #( ) fp_div ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 1)), - .ready_in (per_core_ready_in[1]), + .valid_in (valid_in && (core_select == FPU_DIV)), + .ready_in (per_core_ready_in[FPU_DIV]), .tag_in (tag_in), .frm (frm), .dataa (dataa), .datab (datab), - .has_fflags (per_core_has_fflags[1]), - .fflags (per_core_fflags[1]), - .result (per_core_result[1]), - .tag_out (per_core_tag_out[1]), - .ready_out (per_core_ready_out[1]), - .valid_out (per_core_valid_out[1]) + .has_fflags (per_core_has_fflags[FPU_DIV]), + .fflags (per_core_fflags[FPU_DIV]), + .result (per_core_result[FPU_DIV]), + .tag_out (per_core_tag_out[FPU_DIV]), + .ready_out (per_core_ready_out[FPU_DIV]), + .valid_out (per_core_valid_out[FPU_DIV]) ); VX_fp_sqrt #( @@ -116,17 +121,17 @@ module VX_fp_fpga #( ) fp_sqrt ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 2)), - .ready_in (per_core_ready_in[2]), + .valid_in (valid_in && (core_select == FPU_SQRT)), + .ready_in (per_core_ready_in[FPU_SQRT]), .tag_in (tag_in), .frm (frm), .dataa (dataa), - .has_fflags (per_core_has_fflags[2]), - .fflags (per_core_fflags[2]), - .result (per_core_result[2]), - .tag_out (per_core_tag_out[2]), - .ready_out (per_core_ready_out[2]), - .valid_out (per_core_valid_out[2]) + .has_fflags (per_core_has_fflags[FPU_SQRT]), + .fflags (per_core_fflags[FPU_SQRT]), + .result (per_core_result[FPU_SQRT]), + .tag_out (per_core_tag_out[FPU_SQRT]), + .ready_out (per_core_ready_out[FPU_SQRT]), + .valid_out (per_core_valid_out[FPU_SQRT]) ); VX_fp_cvt #( @@ -135,19 +140,19 @@ module VX_fp_fpga #( ) fp_cvt ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 3)), - .ready_in (per_core_ready_in[3]), + .valid_in (valid_in && (core_select == FPU_CVT)), + .ready_in 
(per_core_ready_in[FPU_CVT]), .tag_in (tag_in), .frm (frm), .is_itof (is_itof), .is_signed (is_signed), .dataa (dataa), - .has_fflags (per_core_has_fflags[3]), - .fflags (per_core_fflags[3]), - .result (per_core_result[3]), - .tag_out (per_core_tag_out[3]), - .ready_out (per_core_ready_out[3]), - .valid_out (per_core_valid_out[3]) + .has_fflags (per_core_has_fflags[FPU_CVT]), + .fflags (per_core_fflags[FPU_CVT]), + .result (per_core_result[FPU_CVT]), + .tag_out (per_core_tag_out[FPU_CVT]), + .ready_out (per_core_ready_out[FPU_CVT]), + .valid_out (per_core_valid_out[FPU_CVT]) ); VX_fp_ncomp #( @@ -156,19 +161,19 @@ module VX_fp_fpga #( ) fp_ncomp ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 4)), - .ready_in (per_core_ready_in[4]), + .valid_in (valid_in && (core_select == FPU_NCP)), + .ready_in (per_core_ready_in[FPU_NCP]), .tag_in (tag_in), .op_type (op_type), .frm (frm), .dataa (dataa), .datab (datab), - .result (per_core_result[4]), - .has_fflags (per_core_has_fflags[4]), - .fflags (per_core_fflags[4]), - .tag_out (per_core_tag_out[4]), - .ready_out (per_core_ready_out[4]), - .valid_out (per_core_valid_out[4]) + .result (per_core_result[FPU_NCP]), + .has_fflags (per_core_has_fflags[FPU_NCP]), + .fflags (per_core_fflags[FPU_NCP]), + .tag_out (per_core_tag_out[FPU_NCP]), + .ready_out (per_core_ready_out[FPU_NCP]), + .valid_out (per_core_valid_out[FPU_NCP]) ); reg has_fflags_n; diff --git a/hw/rtl/fp_cores/VX_fp_ncomp.v b/hw/rtl/fp_cores/VX_fp_ncomp.v index a804f1a7..57792d15 100644 --- a/hw/rtl/fp_cores/VX_fp_ncomp.v +++ b/hw/rtl/fp_cores/VX_fp_ncomp.v @@ -160,7 +160,7 @@ module VX_fp_ncomp #( for (genvar i = 0; i < LANES; i++) begin always @(*) begin case (frm_s0) - `FRM_RNE: begin + `FRM_RNE: begin // LE fcmp_fflags[i] = 5'h0; if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin fcmp_res[i] = 32'h0; @@ -169,7 +169,7 @@ module VX_fp_ncomp #( fcmp_res[i] = {31'h0, (a_smaller_s0[i] | ab_equal_s0[i])}; end end - `FRM_RTZ: begin + `FRM_RTZ: 
begin // LS fcmp_fflags[i] = 5'h0; if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin fcmp_res[i] = 32'h0; @@ -178,7 +178,7 @@ module VX_fp_ncomp #( fcmp_res[i] = {31'h0, (a_smaller_s0[i] & ~ab_equal_s0[i])}; end end - `FRM_RDN: begin + `FRM_RDN: begin // EQ fcmp_fflags[i] = 5'h0; if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin fcmp_res[i] = 32'h0; @@ -223,7 +223,7 @@ module VX_fp_ncomp #( tmp_fflags[i] = 0; tmp_fflags[i].NV = a_type_s0[i].is_signaling | b_type_s0[i].is_signaling; end - //5,6,7: + //5,6,7: MOVE default: begin tmp_result[i] = dataa[i]; tmp_fflags[i] = 'x; diff --git a/hw/rtl/fp_cores/VX_fp_sqrt.v b/hw/rtl/fp_cores/VX_fp_sqrt.v index 07dbd042..132319f4 100644 --- a/hw/rtl/fp_cores/VX_fp_sqrt.v +++ b/hw/rtl/fp_cores/VX_fp_sqrt.v @@ -1,9 +1,5 @@ `include "VX_define.vh" -`ifndef SYNTHESIS -`include "float_dpi.vh" -`endif - module VX_fp_sqrt #( parameter TAGW = 1, parameter LANES = 1 @@ -30,26 +26,24 @@ module VX_fp_sqrt #( output wire valid_out ); wire stall = ~ready_out && valid_out; - wire enable = ~stall; + wire enable = ~stall; + + wire _reset; + + VX_reset_relay reset_relay ( + .clk (clk), + .reset (reset), + .reset_out (_reset) + ); for (genvar i = 0; i < LANES; i++) begin - `ifdef QUARTUS acl_fsqrt fsqrt ( .clk (clk), - .areset (reset), + .areset (_reset), .en (enable), .a (dataa[i]), .q (result[i]) ); - `else - integer fsqrt_h; - initial begin - fsqrt_h = dpi_register(); - end - always @(posedge clk) begin - dpi_fsqrt (fsqrt_h, enable, dataa[i], `LATENCY_FSQRT, result[i]); - end - `endif end VX_shift_register #( diff --git a/hw/rtl/fp_cores/VX_fpnew.v b/hw/rtl/fp_cores/VX_fpnew.v index a04f431d..a9f349ab 100644 --- a/hw/rtl/fp_cores/VX_fpnew.v +++ b/hw/rtl/fp_cores/VX_fpnew.v @@ -56,13 +56,13 @@ module VX_fpnew localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{ PipeRegs:'{'{`LATENCY_FMA, 0, 0, 0, 0}, // ADDMUL - '{default: `LATENCY_FDIVSQRT}, // DIVSQRT - '{default: `LATENCY_FNCOMP}, // NONCOMP - '{default: 
`LATENCY_FCONV}}, // CONV - UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL - '{default: UNIT_FDIVSQRT}, // DIVSQRT - '{default: UNIT_FNONCOMP}, // NONCOMP - '{default: UNIT_FCONV}}, // CONV + '{default: `LATENCY_FDIVSQRT}, // DIVSQRT + '{default: `LATENCY_FNCP}, // NONCOMP + '{default: `LATENCY_FCVT}}, // CONV + UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL + '{default: UNIT_FDIVSQRT}, // DIVSQRT + '{default: UNIT_FNONCOMP}, // NONCOMP + '{default: UNIT_FCONV}}, // CONV PipeConfig: fpnew_pkg::DISTRIBUTED }; diff --git a/hw/rtl/fp_cores/svdpi/float_dpi.cpp b/hw/rtl/fp_cores/svdpi/float_dpi.cpp deleted file mode 100644 index 0bc99c2c..00000000 --- a/hw/rtl/fp_cores/svdpi/float_dpi.cpp +++ /dev/null @@ -1,239 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "svdpi.h" -#include "verilated_vpi.h" -#include "VX_config.h" - -extern "C" { - int dpi_register(); - void dpi_fadd(int inst, bool enable, int a, int b, int delay, int* result); - void dpi_fsub(int inst, bool enable, int a, int b, int delay, int* result); - void dpi_fmul(int inst, bool enable, int a, int b, int delay, int* result); - void dpi_fmadd(int inst, bool enable, int a, int b, int c, int delay, int* result); - void dpi_fdiv(int inst, bool enable, int a, int b, int delay, int* result); - void dpi_fsqrt(int inst, bool enable, int a, int delay, int* result); - void dpi_ftoi(int inst, bool enable, int a, int delay, int* result); - void dpi_ftou(int inst, bool enable, int a, int delay, int* result); - void dpi_itof(int inst, bool enable, int a, int delay, int* result); - void dpi_utof(int inst, bool enable, int a, int delay, int* result); - void dpi_assert(int inst, bool cond, int delay); -} - -class ShiftRegister { -public: - ShiftRegister() : init_(false), depth_(0) {} - - void ensure_init(int depth) { - if (!init_) { - buffer_.resize(depth); - init_ = true; - depth_ = depth; - } - } - - void push(int value, bool enable) { - if (!enable) - return; - for (unsigned i = 0; i < 
depth_-1; ++i) { - buffer_[i] = buffer_[i+1]; - } - buffer_[depth_-1] = value; - } - - int top() const { - return buffer_[0]; - } - -private: - - std::vector buffer_; - bool init_; - unsigned depth_; -}; - -union Float_t { - float f; - int i; - struct { - uint32_t man : 23; - uint32_t exp : 8; - uint32_t sign : 1; - } parts; -}; - -class Instances { -public: - ShiftRegister& get(int inst) { - return instances_.at(inst); - } - - int allocate() { - mutex_.lock(); - int inst = instances_.size(); - instances_.resize(inst + 1); - mutex_.unlock(); - return inst; - } - -private: - std::vector instances_; - std::mutex mutex_; -}; - -Instances instances; - -int dpi_register() { - return instances.allocate(); -} - -void dpi_fadd(int inst, bool enable, int a, int b, int delay, int* result) { - ShiftRegister& sr = instances.get(inst); - - Float_t fa, fb, fr; - - fa.i = a; - fb.i = b; - fr.f = fa.f + fb.f; - - sr.ensure_init(delay); - sr.push(fr.i, enable); - *result = sr.top(); -} - -void dpi_fsub(int inst, bool enable, int a, int b, int delay, int* result) { - ShiftRegister& sr = instances.get(inst); - - Float_t fa, fb, fr; - - fa.i = a; - fb.i = b; - fr.f = fa.f - fb.f; - - sr.ensure_init(delay); - sr.push(fr.i, enable); - *result = sr.top(); -} - -void dpi_fmul(int inst, bool enable, int a, int b, int delay, int* result) { - ShiftRegister& sr = instances.get(inst); - - Float_t fa, fb, fr; - - fa.i = a; - fb.i = b; - fr.f = fa.f * fb.f; - - sr.ensure_init(delay); - sr.push(fr.i, enable); - *result = sr.top(); -} - -void dpi_fmadd(int inst, bool enable, int a, int b, int c, int delay, int* result) { - ShiftRegister& sr = instances.get(inst); - - Float_t fa, fb, fc, fr; - - fa.i = a; - fb.i = b; - fc.i = c; - fr.f = fa.f * fb.f + fc.f; - - sr.ensure_init(delay); - sr.push(fr.i, enable); - *result = sr.top(); -} - -void dpi_fdiv(int inst, bool enable, int a, int b, int delay, int* result) { - ShiftRegister& sr = instances.get(inst); - - Float_t fa, fb, fr; - - fa.i = a; - fb.i 
= b; - fr.f = fa.f / fb.f; - - sr.ensure_init(delay); - sr.push(fr.i, enable); - *result = sr.top(); -} - -void dpi_fsqrt(int inst, bool enable, int a, int delay, int* result) { - ShiftRegister& sr = instances.get(inst); - - Float_t fa, fr; - - fa.i = a; - fr.f = sqrtf(fa.f); - - sr.ensure_init(delay); - sr.push(fr.i, enable); - *result = sr.top(); -} - -void dpi_ftoi(int inst, bool enable, int a, int delay, int* result) { - ShiftRegister& sr = instances.get(inst); - - Float_t fa, fr; - - fa.i = a; - fr.i = int(fa.f); - - sr.ensure_init(delay); - sr.push(fr.i, enable); - *result = sr.top(); -} - -void dpi_ftou(int inst, bool enable, int a, int delay, int* result) { - ShiftRegister& sr = instances.get(inst); - - Float_t fa, fr; - - fa.i = a; - fr.i = unsigned(fa.f); - - sr.ensure_init(delay); - sr.push(fr.i, enable); - *result = sr.top(); -} - -void dpi_itof(int inst, bool enable, int a, int delay, int* result) { - ShiftRegister& sr = instances.get(inst); - - Float_t fa, fr; - - fr.f = (float)a; - - sr.ensure_init(delay); - sr.push(fr.i, enable); - *result = sr.top(); -} - -void dpi_utof(int inst, bool enable, int a, int delay, int* result) { - ShiftRegister& sr = instances.get(inst); - - Float_t fa, fr; - - unsigned ua = a; - fr.f = (float)ua; - - sr.ensure_init(delay); - sr.push(fr.i, enable); - *result = sr.top(); -} - -void dpi_assert(int inst, bool cond, int delay) { - ShiftRegister& sr = instances.get(inst); - - sr.ensure_init(delay); - sr.push(!cond, 1); - - auto status = sr.top(); - if (status) { - printf("delayed assertion at %s!\n", svGetNameFromScope(svGetScope())); - std::abort(); - } -} \ No newline at end of file diff --git a/hw/rtl/fp_cores/svdpi/float_dpi.vh b/hw/rtl/fp_cores/svdpi/float_dpi.vh deleted file mode 100644 index d4500cf9..00000000 --- a/hw/rtl/fp_cores/svdpi/float_dpi.vh +++ /dev/null @@ -1,20 +0,0 @@ -`ifndef FLOAT_DPI -`define FLOAT_DPI - -import "DPI-C" context function int dpi_register(); - -import "DPI-C" context function void 
dpi_fadd(int inst, input logic enable, input int a, input int b, input int delay, output int result); -import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input int a, input int b, input int delay, output int result); -import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input int a, input int b, input int delay, output int result); -import "DPI-C" context function void dpi_fmadd(int inst, input logic enable, input int a, input int b, input int c, input int delay, output int result); -import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input int a, input int b, input int c, input int delay, output int result); -import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input int a, input int b, input int delay, output int result); -import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input int a, input int delay, output int result); -import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input int a, input int delay, output int result); -import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input int a, input int delay, output int result); -import "DPI-C" context function void dpi_itof(int inst, input logic enable, input int a, input int delay, output int result); -import "DPI-C" context function void dpi_utof(int inst, input logic enable, input int a, input int delay, output int result); - -import "DPI-C" context function void dpi_assert(int inst, input logic cond, input int delay); - -`endif \ No newline at end of file diff --git a/hw/rtl/libs/VX_lzc.v b/hw/rtl/libs/VX_lzc.v index 0baceb17..469c587c 100644 --- a/hw/rtl/libs/VX_lzc.v +++ b/hw/rtl/libs/VX_lzc.v @@ -1,7 +1,7 @@ `include "VX_platform.vh" module VX_lzc #( - parameter DATAW = 1, + parameter DATAW = 32, parameter LDATAW = `LOG2UP(DATAW) ) ( input wire [DATAW-1:0] data_in, diff --git a/hw/simulate/Makefile b/hw/simulate/Makefile index 
32e4adb6..287271ca 100644 --- a/hw/simulate/Makefile +++ b/hw/simulate/Makefile @@ -31,12 +31,13 @@ MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 TOP = Vortex RTL_DIR=../rtl +DPI_DIR=../dpi -FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src +FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src RTL_INCLUDE = -I$(RTL_DIR)/ -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) SRCS = simulator.cpp testbench.cpp -SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp +SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic VL_FLAGS += -Wno-DECLFILENAME diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index d16aa406..16722a29 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -3,7 +3,7 @@ #include #include -#define RESET_DELAY 2 +#define RESET_DELAY 4 #define ENABLE_DRAM_STALLS #define DRAM_LATENCY 24 @@ -75,13 +75,6 @@ void Simulator::reset() { vortex_->csr_rsp_ready = 0; vortex_->reset = 1; - - vortex_->clk = 0; - this->eval(); - vortex_->clk = 1; - this->eval(); - - vortex_->reset = 0; for (int i = 0; i < RESET_DELAY; ++i) { vortex_->clk = 0; @@ -89,8 +82,11 @@ void Simulator::reset() { vortex_->clk = 1; this->eval(); } + + vortex_->reset = 0; // Turn on assertion after reset + printf("*** enabling assertion at tick: %ld", timestamp); Verilated::assertOn(true); }