From 42e3b6c45d70b65a38a6264687253b62d0816e0a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 4 Sep 2020 07:51:46 -0700 Subject: [PATCH] fixed lmp_mult parameters, ram init filepath --- driver/tests/dogfood/testcases.h | 10 +- hw/opae/Makefile | 8 +- hw/opae/README | 2 +- hw/opae/sources_1c.txt | 1 + hw/rtl/VX_commit.v | 2 +- hw/rtl/VX_config.vh | 18 +- hw/rtl/VX_define.vh | 2 +- hw/rtl/VX_mul_unit.v | 25 +-- hw/rtl/cache/VX_cache_config.vh | 6 + hw/rtl/fp_cores/VX_fp_addmul.v | 187 ++++++++++++++++++ hw/rtl/fp_cores/VX_fp_div.v | 11 +- hw/rtl/fp_cores/VX_fp_fpga.v | 126 ++++++------ hw/rtl/fp_cores/VX_fp_ftoi.v | 17 +- hw/rtl/fp_cores/VX_fp_itof.v | 17 +- hw/rtl/fp_cores/VX_fp_madd.v | 174 ++-------------- hw/rtl/fp_cores/VX_fp_nmadd.v | 36 ++-- hw/rtl/fp_cores/VX_fp_sqrt.v | 11 +- .../altera/{acl_fp_div.sv => acl_fdiv.sv} | 12 +- ...fdiv_memoryC0_uid112_invTables_lutmem.hex} | 0 ...fdiv_memoryC1_uid115_invTables_lutmem.hex} | 0 ...fdiv_memoryC2_uid118_invTables_lutmem.hex} | 0 .../altera/{acl_fp_sqrt.sv => acl_fsqrt.sv} | 12 +- ...sqrt_memoryC0_uid62_sqrtTables_lutmem.hex} | 0 ...sqrt_memoryC1_uid65_sqrtTables_lutmem.hex} | 0 ...sqrt_memoryC2_uid68_sqrtTables_lutmem.hex} | 0 .../altera/{acl_fp_ftoi.sv => acl_ftoi.sv} | 6 +- .../altera/{acl_fp_ftou.sv => acl_ftou.sv} | 6 +- hw/rtl/fp_cores/altera/acl_gen.log | 169 ++++++++++++++++ hw/rtl/fp_cores/altera/acl_gen.sh | 25 +++ .../altera/{acl_fp_itof.sv => acl_itof.sv} | 6 +- .../altera/{acl_fp_utof.sv => acl_utof.sv} | 6 +- hw/rtl/fp_cores/altera/generate.sh | 25 --- hw/rtl/fp_cores/svdpi/float_dpi.cpp | 185 +++++++++-------- hw/rtl/fp_cores/svdpi/float_dpi.vh | 22 +-- hw/rtl/libs/VX_divide.v | 51 ++--- hw/rtl/libs/VX_multiplier.v | 55 +++--- 36 files changed, 738 insertions(+), 495 deletions(-) create mode 100644 hw/rtl/fp_cores/VX_fp_addmul.v rename hw/rtl/fp_cores/altera/{acl_fp_div.sv => acl_fdiv.sv} (99%) rename hw/rtl/fp_cores/altera/{acl_fp_div_memoryC0_uid112_invTables_lutmem.hex => 
acl_fdiv_memoryC0_uid112_invTables_lutmem.hex} (100%) rename hw/rtl/fp_cores/altera/{acl_fp_div_memoryC1_uid115_invTables_lutmem.hex => acl_fdiv_memoryC1_uid115_invTables_lutmem.hex} (100%) rename hw/rtl/fp_cores/altera/{acl_fp_div_memoryC2_uid118_invTables_lutmem.hex => acl_fdiv_memoryC2_uid118_invTables_lutmem.hex} (100%) rename hw/rtl/fp_cores/altera/{acl_fp_sqrt.sv => acl_fsqrt.sv} (99%) rename hw/rtl/fp_cores/altera/{acl_fp_sqrt_memoryC0_uid62_sqrtTables_lutmem.hex => acl_fsqrt_memoryC0_uid62_sqrtTables_lutmem.hex} (100%) rename hw/rtl/fp_cores/altera/{acl_fp_sqrt_memoryC1_uid65_sqrtTables_lutmem.hex => acl_fsqrt_memoryC1_uid65_sqrtTables_lutmem.hex} (100%) rename hw/rtl/fp_cores/altera/{acl_fp_sqrt_memoryC2_uid68_sqrtTables_lutmem.hex => acl_fsqrt_memoryC2_uid68_sqrtTables_lutmem.hex} (100%) rename hw/rtl/fp_cores/altera/{acl_fp_ftoi.sv => acl_ftoi.sv} (99%) rename hw/rtl/fp_cores/altera/{acl_fp_ftou.sv => acl_ftou.sv} (99%) create mode 100644 hw/rtl/fp_cores/altera/acl_gen.log create mode 100755 hw/rtl/fp_cores/altera/acl_gen.sh rename hw/rtl/fp_cores/altera/{acl_fp_itof.sv => acl_itof.sv} (99%) rename hw/rtl/fp_cores/altera/{acl_fp_utof.sv => acl_utof.sv} (99%) delete mode 100755 hw/rtl/fp_cores/altera/generate.sh diff --git a/driver/tests/dogfood/testcases.h b/driver/tests/dogfood/testcases.h index fdbb727b..7e015b18 100644 --- a/driver/tests/dogfood/testcases.h +++ b/driver/tests/dogfood/testcases.h @@ -5,12 +5,12 @@ #include union Float_t { - float f; - int32_t i; + float f; + int i; struct { - uint32_t mantissa : 23; - uint32_t exponent : 8; - uint32_t sign : 1; + uint32_t man : 23; + uint32_t exp : 8; + uint32_t sign : 1; } parts; }; diff --git a/hw/opae/Makefile b/hw/opae/Makefile index f1637d0d..57d242fc 100644 --- a/hw/opae/Makefile +++ b/hw/opae/Makefile @@ -4,19 +4,22 @@ FPGA_BUILD_DIR=build_fpga all: ase-1c -sources.txt: +sources.txt: ./gen_sources.sh > sources.txt gen_sources: sources.txt ase-1c: setup-ase-1c gen_sources make -C 
$(ASE_BUILD_DIR)_1c + cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_1c/work ase-2c: setup-ase-2c gen_sources make -C $(ASE_BUILD_DIR)_2c + cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_2c/work ase-4c: setup-ase-4c gen_sources make -C $(ASE_BUILD_DIR)_4c + cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_4c/work setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile @@ -35,12 +38,15 @@ $(ASE_BUILD_DIR)_4c/Makefile: sources.txt fpga-1c: setup-fpga-1c gen_sources cd $(FPGA_BUILD_DIR)_1c && qsub-synth + cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_1c fpga-2c: setup-fpga-2c gen_sources cd $(FPGA_BUILD_DIR)_2c && qsub-synth + cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_2c fpga-4c: setup-fpga-4c gen_sources cd $(FPGA_BUILD_DIR)_4c && qsub-synth + cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_4c setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf diff --git a/hw/opae/README b/hw/opae/README index 1cd7b8ef..54038744 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -62,7 +62,7 @@ make ase # tests ./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -n 256 ./run_ase.sh build_ase_1c ../../driver/tests/demo/demo -n 16 -./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n 1 -s4 -e4 +./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n1 -s4 -e4 ./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd # modify "vsim_run.tcl" to dump VCD trace diff --git a/hw/opae/sources_1c.txt b/hw/opae/sources_1c.txt index f3e7c840..f3b46403 100644 --- a/hw/opae/sources_1c.txt +++ b/hw/opae/sources_1c.txt @@ -13,6 +13,7 @@ #+define+DBG_PRINT_DRAM #+define+DBG_PRINT_PIPELINE #+define+DBG_PRINT_OPAE +#+define+DBG_CORE_REQ_INFO #+define+DBG_PRINT_SCOPE vortex_afu.json diff --git a/hw/rtl/VX_commit.v b/hw/rtl/VX_commit.v index 2afb1f4a..b09b82f7 100644 --- a/hw/rtl/VX_commit.v +++ b/hw/rtl/VX_commit.v @@ -62,7 +62,7 @@ module VX_commit #( fflags_r <= fflags; has_fflags_r <= fpu_commit_if.valid && fpu_commit_if.has_fflags; wid_r <= fpu_commit_if.wid; - 
num_commits_r <= num_commits; + num_commits_r <= (num_commits << $clog2(`NUM_THREADS)); end assign cmt_to_csr_if.valid = csr_update_r; diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 0cd84da3..5d3db338 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -59,6 +59,8 @@ `define EXT_F_ENABLE `endif +`define FPU_FAST + // Device identification `define VENDOR_ID 0 `define ARCHITECTURE_ID 0 @@ -74,12 +76,12 @@ `define LATENCY_FNONCOMP 1 `endif -`ifndef LATENCY_FMADD -`define LATENCY_FMADD 1 +`ifndef LATENCY_FADDMUL +`define LATENCY_FADDMUL 3 `endif -`ifndef LATENCY_FNMADD -`define LATENCY_FNMADD 2 +`ifndef LATENCY_FMADD +`define LATENCY_FMADD 4 `endif `ifndef LATENCY_FDIV @@ -98,16 +100,12 @@ `define LATENCY_FTOI 3 `endif -`ifndef LATENCY_FADDMUL -`define LATENCY_FADDMUL 2 -`endif - `ifndef LATENCY_FDIVSQRT -`define LATENCY_FDIVSQRT 2 +`define LATENCY_FDIVSQRT 10 `endif `ifndef LATENCY_FCONV -`define LATENCY_FCONV 2 +`define LATENCY_FCONV 3 `endif // CSR Addresses ////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 298d500e..dcb7e8b7 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -385,7 +385,7 @@ `define VX_CORE_TAG_WIDTH `L3CORE_TAG_WIDTH `define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES) -`define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)} +`define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)} `include "VX_types.vh" diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index 50f07644..9fa861ee 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -51,32 +51,33 @@ module VX_mul_unit #( /////////////////////////////////////////////////////////////////////////// wire [`NUM_THREADS-1:0][31:0] mul_result; - wire is_mulw = (alu_op == `MUL_MUL); - wire is_mulw_out; + wire is_mul_in = (alu_op == `MUL_MUL); + wire is_mul_out; wire stall_mul; for (genvar i = 0; i < `NUM_THREADS; i++) begin wire [32:0] mul_in1 = {(alu_op != `MUL_MULHU) & 
alu_in1[i][31], alu_in1[i]}; wire [32:0] mul_in2 = {(alu_op != `MUL_MULHU && alu_op != `MUL_MULHSU) & alu_in2[i][31], alu_in2[i]}; - wire [63:0] mul_result_tmp; + `IGNORE_WARNINGS_BEGIN + wire [65:0] mul_result_tmp; + `IGNORE_WARNINGS_END VX_multiplier #( .WIDTHA(33), .WIDTHB(33), - .WIDTHP(64), + .WIDTHP(66), .SIGNED(1), - .PIPELINE(`LATENCY_IMUL) + .LATENCY(`LATENCY_IMUL) ) multiplier ( .clk(clk), - .reset(reset), - .clk_en(~stall_mul), + .enable(~stall_mul), .dataa(mul_in1), .datab(mul_in2), .result(mul_result_tmp) ); - assign mul_result[i] = is_mulw_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32]; + assign mul_result[i] = is_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32]; end wire [MULQ_BITS-1:0] mul_tag; @@ -91,17 +92,17 @@ module VX_mul_unit #( .clk(clk), .reset(reset), .enable(~stall_mul), - .in({mul_fire, tag_in, is_mulw}), - .out({mul_valid_out, mul_tag, is_mulw_out}) + .in({mul_fire, tag_in, is_mul_in}), + .out({mul_valid_out, mul_tag, is_mul_out}) ); /////////////////////////////////////////////////////////////////////////// wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp; - wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU); + wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU); wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM); - wire div_valid_in = mul_req_if.valid && is_div_op; + wire div_valid_in = mul_req_if.valid && is_div_op; wire div_ready_in; wire div_ready_out; wire div_valid_out; diff --git a/hw/rtl/cache/VX_cache_config.vh b/hw/rtl/cache/VX_cache_config.vh index cb6a1a69..1237c1c6 100644 --- a/hw/rtl/cache/VX_cache_config.vh +++ b/hw/rtl/cache/VX_cache_config.vh @@ -4,6 +4,10 @@ `include "VX_platform.vh" `include "VX_scope.vh" +`ifdef DBG_CORE_REQ_INFO +`include "VX_define.vh" +`endif + `define REQ_TAG_WIDTH `MAX(CORE_TAG_WIDTH, SNP_REQ_TAG_WIDTH) `define REQS_BITS `LOG2UP(NUM_REQUESTS) @@ -77,4 +81,6 @@ `define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << 
(32-$bits(x)-`BANK_SELECT_BITS))} +`define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)} + `endif diff --git a/hw/rtl/fp_cores/VX_fp_addmul.v b/hw/rtl/fp_cores/VX_fp_addmul.v new file mode 100644 index 00000000..b1053e95 --- /dev/null +++ b/hw/rtl/fp_cores/VX_fp_addmul.v @@ -0,0 +1,187 @@ +`include "VX_define.vh" + +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + +module VX_fp_addmul #( + parameter TAGW = 1, + parameter LANES = 1 +) ( + input wire clk, + input wire reset, + + output wire ready_in, + input wire valid_in, + + input wire [TAGW-1:0] tag_in, + + input wire do_sub, + input wire do_mul, + + input wire [LANES-1:0][31:0] dataa, + input wire [LANES-1:0][31:0] datab, + output wire [LANES-1:0][31:0] result, + + output wire [TAGW-1:0] tag_out, + + input wire ready_out, + output wire valid_out +); + + wire stall = ~ready_out && valid_out; + wire enable = ~stall; + + reg do_sub_r, do_mul_r; + + for (genvar i = 0; i < LANES; i++) begin + + wire [31:0] result_add; + wire [31:0] result_sub; + wire [31:0] result_mul; + + `ifdef QUARTUS + twentynm_fp_mac mac_fp_add ( + // inputs + .accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(dataa[i]), + .ay(datab[i]), + .az(), + .clk({2'b00,clk}), + .ena({2'b11,enable}), + .aclr(2'b00), + .chainin(), + // outputs + .overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result_add), + .chainout() + ); + defparam mac_fp_add.operation_mode = "sp_add"; + defparam mac_fp_add.use_chainin = "false"; + defparam mac_fp_add.adder_subtract = "false"; + defparam mac_fp_add.ax_clock = "0"; + defparam mac_fp_add.ay_clock = "0"; + defparam mac_fp_add.az_clock = "none"; + defparam mac_fp_add.output_clock = "0"; + defparam mac_fp_add.accumulate_clock = "none"; + defparam mac_fp_add.ax_chainin_pl_clock = "none"; + defparam mac_fp_add.accum_pipeline_clock = "none"; + defparam 
mac_fp_add.mult_pipeline_clock = "none"; + defparam mac_fp_add.adder_input_clock = "0"; + defparam mac_fp_add.accum_adder_clock = "none"; + + twentynm_fp_mac mac_fp_sub ( + // inputs + .accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(dataa[i]), + .ay(datab[i]), + .az(), + .clk({2'b00,clk}), + .ena({2'b11,enable}), + .aclr(2'b00), + .chainin(), + // outputs + .overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result_sub), + .chainout() + ); + defparam mac_fp_sub.operation_mode = "sp_add"; + defparam mac_fp_sub.use_chainin = "false"; + defparam mac_fp_sub.adder_subtract = "true"; + defparam mac_fp_sub.ax_clock = "0"; + defparam mac_fp_sub.ay_clock = "0"; + defparam mac_fp_sub.az_clock = "none"; + defparam mac_fp_sub.output_clock = "0"; + defparam mac_fp_sub.accumulate_clock = "none"; + defparam mac_fp_sub.ax_chainin_pl_clock = "none"; + defparam mac_fp_sub.accum_pipeline_clock = "none"; + defparam mac_fp_sub.mult_pipeline_clock = "none"; + defparam mac_fp_sub.adder_input_clock = "0"; + defparam mac_fp_sub.accum_adder_clock = "none"; + + twentynm_fp_mac mac_fp_mul ( + // inputs + .accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(), + .ay(datab[i]), + .az(dataa[i]), + .clk({2'b00,clk}), + .ena({2'b11,enable}), + .aclr(2'b00), + .chainin(), + // outputs + .overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result_mul), + .chainout() + ); + defparam mac_fp_mul.operation_mode = "sp_mult"; + defparam mac_fp_mul.use_chainin = "false"; + defparam mac_fp_mul.adder_subtract = "false"; + defparam mac_fp_mul.ax_clock = "none"; + defparam mac_fp_mul.ay_clock = "0"; + defparam mac_fp_mul.az_clock = "0"; + defparam mac_fp_mul.output_clock = 
"0"; + defparam mac_fp_mul.accumulate_clock = "none"; + defparam mac_fp_mul.ax_chainin_pl_clock = "none"; + defparam mac_fp_mul.accum_pipeline_clock = "none"; + defparam mac_fp_mul.mult_pipeline_clock = "0"; + defparam mac_fp_mul.adder_input_clock = "none"; + defparam mac_fp_mul.accum_adder_clock = "none"; + `else + always @(posedge clk) begin + dpi_fadd(0*LANES+i, enable, dataa[i], datab[i], result_add); + dpi_fsub(1*LANES+i, enable, dataa[i], datab[i], result_sub); + dpi_fmul(2*LANES+i, enable, dataa[i], datab[i], result_mul); + end + `endif + + assign result[i] = do_mul_r ? result_mul : (do_sub_r ? result_sub : result_add); + end + + VX_shift_register #( + .DATAW(TAGW + 1 + 1 + 1), + .DEPTH(`LATENCY_FADDMUL) + ) shift_reg ( + .clk(clk), + .reset(reset), + .enable(enable), + .in({tag_in, valid_in, do_sub, do_mul}), + .out({tag_out, valid_out, do_sub_r, do_mul_r}) + ); + + assign ready_in = enable; + +endmodule diff --git a/hw/rtl/fp_cores/VX_fp_div.v b/hw/rtl/fp_cores/VX_fp_div.v index b6d9409d..f5e624ef 100644 --- a/hw/rtl/fp_cores/VX_fp_div.v +++ b/hw/rtl/fp_cores/VX_fp_div.v @@ -26,20 +26,21 @@ module VX_fp_div #( output wire valid_out ); wire stall = ~ready_out && valid_out; + wire enable = ~stall; for (genvar i = 0; i < LANES; i++) begin `ifdef QUARTUS - acl_fp_div fdiv ( + acl_fdiv fdiv ( .clk (clk), .areset (1'b0), - .en (~stall), + .en (enable), .a (dataa[i]), .b (datab[i]), .q (result[i]) ); `else always @(posedge clk) begin - dpi_fdiv(8*LANES+i, ~stall, valid_in, dataa[i], datab[i], result[i]); + dpi_fdiv(8*LANES+i, enable, dataa[i], datab[i], result[i]); end `endif end @@ -50,11 +51,11 @@ module VX_fp_div #( ) shift_reg ( .clk(clk), .reset(reset), - .enable(~stall), + .enable(enable), .in ({tag_in, valid_in}), .out({tag_out, valid_out}) ); - assign ready_in = ~stall; + assign ready_in = enable; endmodule diff --git a/hw/rtl/fp_cores/VX_fp_fpga.v b/hw/rtl/fp_cores/VX_fp_fpga.v index 243f0bc1..bb4cb412 100644 --- a/hw/rtl/fp_cores/VX_fp_fpga.v +++ 
b/hw/rtl/fp_cores/VX_fp_fpga.v @@ -27,7 +27,7 @@ module VX_fp_fpga #( input wire ready_out, output wire valid_out ); - localparam NUM_FPC = 7; + localparam NUM_FPC = 8; localparam FPC_BITS = `LOG2UP(NUM_FPC); wire [NUM_FPC-1:0] per_core_ready_in; @@ -40,29 +40,28 @@ module VX_fp_fpga #( fflags_t [`NUM_THREADS-1:0] fpnew_fflags; reg [FPC_BITS-1:0] core_select; - reg do_add, do_sub, do_mul; + reg do_sub, do_mul; reg is_signed; always @(*) begin core_select = 'x; - do_add = 'x; do_sub = 'x; do_mul = 'x; is_signed = 'x; case (op_type) - `FPU_ADD: begin core_select = 1; do_mul = 0; do_add = 1; do_sub = 0; end - `FPU_SUB: begin core_select = 1; do_mul = 0; do_add = 0; do_sub = 1; end - `FPU_MUL: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 0; end - `FPU_MADD: begin core_select = 1; do_mul = 1; do_add = 1; do_sub = 0; end - `FPU_MSUB: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 1; end - `FPU_NMSUB: begin core_select = 2; do_sub = 1; end - `FPU_NMADD: begin core_select = 2; do_sub = 0; end - `FPU_DIV: begin core_select = 3; end - `FPU_SQRT: begin core_select = 4; end - `FPU_CVTWS: begin core_select = 5; is_signed = 1; end - `FPU_CVTWUS: begin core_select = 5; is_signed = 0; end - `FPU_CVTSW: begin core_select = 6; is_signed = 1; end - `FPU_CVTSWU: begin core_select = 6; is_signed = 0; end + `FPU_ADD: begin core_select = 1; do_mul = 0; do_sub = 0; end + `FPU_SUB: begin core_select = 1; do_mul = 0; do_sub = 1; end + `FPU_MUL: begin core_select = 1; do_mul = 1; do_sub = 0; end + `FPU_MADD: begin core_select = 2; do_sub = 0; end + `FPU_MSUB: begin core_select = 2; do_sub = 1; end + `FPU_NMADD: begin core_select = 3; do_sub = 0; end + `FPU_NMSUB: begin core_select = 3; do_sub = 1; end + `FPU_DIV: begin core_select = 4; end + `FPU_SQRT: begin core_select = 5; end + `FPU_CVTWS: begin core_select = 6; is_signed = 1; end + `FPU_CVTWUS: begin core_select = 6; is_signed = 0; end + `FPU_CVTSW: begin core_select = 7; is_signed = 1; end + `FPU_CVTSWU: begin 
core_select = 7; is_signed = 0; end default: begin core_select = 0; end endcase end @@ -88,25 +87,42 @@ module VX_fp_fpga #( .valid_out (per_core_valid_out[0]) ); + VX_fp_addmul #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_addmul ( + .clk (clk), + .reset (reset), + .valid_in (valid_in && (core_select == 1)), + .ready_in (per_core_ready_in[1]), + .tag_in (tag_in), + .do_sub (do_sub), + .do_mul (do_mul), + .dataa (dataa), + .datab (datab), + .result (per_core_result[1]), + .tag_out (per_core_tag_out[1]), + .ready_out (per_core_ready_out[1]), + .valid_out (per_core_valid_out[1]) + ); + VX_fp_madd #( .TAGW (TAGW), .LANES(`NUM_THREADS) ) fp_madd ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 1)), - .ready_in (per_core_ready_in[1]), - .tag_in (tag_in), - .do_add (do_add), + .valid_in (valid_in && (core_select == 2)), + .ready_in (per_core_ready_in[2]), + .tag_in (tag_in), .do_sub (do_sub), - .do_mul (do_mul), .dataa (dataa), .datab (datab), .datac (datac), - .result (per_core_result[1]), - .tag_out (per_core_tag_out[1]), - .ready_out (per_core_ready_out[1]), - .valid_out (per_core_valid_out[1]) + .result (per_core_result[2]), + .tag_out (per_core_tag_out[2]), + .ready_out (per_core_ready_out[2]), + .valid_out (per_core_valid_out[2]) ); VX_fp_nmadd #( @@ -115,17 +131,17 @@ module VX_fp_fpga #( ) fp_nmadd ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 2)), - .ready_in (per_core_ready_in[2]), + .valid_in (valid_in && (core_select == 3)), + .ready_in (per_core_ready_in[3]), .tag_in (tag_in), .do_sub (do_sub), .dataa (dataa), .datab (datab), .datac (datac), - .result (per_core_result[2]), - .tag_out (per_core_tag_out[2]), - .ready_out (per_core_ready_out[2]), - .valid_out (per_core_valid_out[2]) + .result (per_core_result[3]), + .tag_out (per_core_tag_out[3]), + .ready_out (per_core_ready_out[3]), + .valid_out (per_core_valid_out[3]) ); VX_fp_div #( @@ -134,15 +150,15 @@ module VX_fp_fpga #( ) fp_div ( .clk (clk), .reset 
(reset), - .valid_in (valid_in && (core_select == 3)), - .ready_in (per_core_ready_in[3]), + .valid_in (valid_in && (core_select == 4)), + .ready_in (per_core_ready_in[4]), .tag_in (tag_in), .dataa (dataa), .datab (datab), - .result (per_core_result[3]), - .tag_out (per_core_tag_out[3]), - .ready_out (per_core_ready_out[3]), - .valid_out (per_core_valid_out[3]) + .result (per_core_result[4]), + .tag_out (per_core_tag_out[4]), + .ready_out (per_core_ready_out[4]), + .valid_out (per_core_valid_out[4]) ); VX_fp_sqrt #( @@ -151,14 +167,14 @@ module VX_fp_fpga #( ) fp_sqrt ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 4)), - .ready_in (per_core_ready_in[4]), + .valid_in (valid_in && (core_select == 5)), + .ready_in (per_core_ready_in[5]), .tag_in (tag_in), .dataa (dataa), - .result (per_core_result[4]), - .tag_out (per_core_tag_out[4]), - .ready_out (per_core_ready_out[4]), - .valid_out (per_core_valid_out[4]) + .result (per_core_result[5]), + .tag_out (per_core_tag_out[5]), + .ready_out (per_core_ready_out[5]), + .valid_out (per_core_valid_out[5]) ); VX_fp_ftoi #( @@ -167,15 +183,15 @@ module VX_fp_fpga #( ) fp_ftoi ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 5)), - .ready_in (per_core_ready_in[5]), + .valid_in (valid_in && (core_select == 6)), + .ready_in (per_core_ready_in[6]), .tag_in (tag_in), .is_signed (is_signed), .dataa (dataa), - .result (per_core_result[5]), - .tag_out (per_core_tag_out[5]), - .ready_out (per_core_ready_out[5]), - .valid_out (per_core_valid_out[5]) + .result (per_core_result[6]), + .tag_out (per_core_tag_out[6]), + .ready_out (per_core_ready_out[6]), + .valid_out (per_core_valid_out[6]) ); VX_fp_itof #( @@ -184,15 +200,15 @@ module VX_fp_fpga #( ) fp_itof ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 6)), - .ready_in (per_core_ready_in[6]), + .valid_in (valid_in && (core_select == 7)), + .ready_in (per_core_ready_in[7]), .tag_in (tag_in), .is_signed (is_signed), 
.dataa (dataa), - .result (per_core_result[6]), - .tag_out (per_core_tag_out[6]), - .ready_out (per_core_ready_out[6]), - .valid_out (per_core_valid_out[6]) + .result (per_core_result[7]), + .tag_out (per_core_tag_out[7]), + .ready_out (per_core_ready_out[7]), + .valid_out (per_core_valid_out[7]) ); reg valid_out_n; diff --git a/hw/rtl/fp_cores/VX_fp_ftoi.v b/hw/rtl/fp_cores/VX_fp_ftoi.v index 4fea6cb5..e9e3f720 100644 --- a/hw/rtl/fp_cores/VX_fp_ftoi.v +++ b/hw/rtl/fp_cores/VX_fp_ftoi.v @@ -27,6 +27,7 @@ module VX_fp_ftoi #( output wire valid_out ); wire stall = ~ready_out && valid_out; + wire enable = ~stall; reg is_signed_r; @@ -36,25 +37,25 @@ module VX_fp_ftoi #( wire [31:0] result_u; `ifdef QUARTUS - acl_fp_ftoi ftoi ( + acl_ftoi ftoi ( .clk (clk), .areset (1'b0), - .en (~stall), + .en (enable), .a (dataa[i]), .q (result_s) ); - acl_fp_ftou ftou ( + acl_ftou ftou ( .clk (clk), .areset (1'b0), - .en (~stall), + .en (enable), .a (dataa[i]), .q (result_u) ); `else always @(posedge clk) begin - dpi_ftoi(10*LANES+i, ~stall, valid_in, dataa[i], result_s); - dpi_ftou(11*LANES+i, ~stall, valid_in, dataa[i], result_u); + dpi_ftoi(10*LANES+i, enable, dataa[i], result_s); + dpi_ftou(11*LANES+i, enable, dataa[i], result_u); end `endif @@ -67,11 +68,11 @@ module VX_fp_ftoi #( ) shift_reg ( .clk(clk), .reset(reset), - .enable(~stall), + .enable(enable), .in ({tag_in, valid_in, is_signed}), .out({tag_out, valid_out, is_signed_r}) ); - assign ready_in = ~stall; + assign ready_in = enable; endmodule diff --git a/hw/rtl/fp_cores/VX_fp_itof.v b/hw/rtl/fp_cores/VX_fp_itof.v index a62315cb..348bc673 100644 --- a/hw/rtl/fp_cores/VX_fp_itof.v +++ b/hw/rtl/fp_cores/VX_fp_itof.v @@ -27,6 +27,7 @@ module VX_fp_itof #( output wire valid_out ); wire stall = ~ready_out && valid_out; + wire enable = ~stall; reg is_signed_r; @@ -36,25 +37,25 @@ module VX_fp_itof #( wire [31:0] result_u; `ifdef QUARTUS - acl_fp_itof itof ( + acl_itof itof ( .clk (clk), .areset (1'b0), - .en (~stall), + .en 
(enable), .a (dataa[i]), .q (result_s) ); - acl_fp_utof utof ( + acl_utof utof ( .clk (clk), .areset (1'b0), - .en (~stall), + .en (enable), .a (dataa[i]), .q (result_u) ); `else always @(posedge clk) begin - dpi_itof(12*LANES+i, ~stall, valid_in, dataa[i], result_s); - dpi_utof(13*LANES+i, ~stall, valid_in, dataa[i], result_u); + dpi_itof(12*LANES+i, enable, dataa[i], result_s); + dpi_utof(13*LANES+i, enable, dataa[i], result_u); end `endif @@ -67,11 +68,11 @@ module VX_fp_itof #( ) shift_reg ( .clk(clk), .reset(reset), - .enable(~stall), + .enable(enable), .in ({tag_in, valid_in, is_signed}), .out({tag_out, valid_out, is_signed_r}) ); - assign ready_in = ~stall; + assign ready_in = enable; endmodule diff --git a/hw/rtl/fp_cores/VX_fp_madd.v b/hw/rtl/fp_cores/VX_fp_madd.v index 0d86f509..5b589dd2 100644 --- a/hw/rtl/fp_cores/VX_fp_madd.v +++ b/hw/rtl/fp_cores/VX_fp_madd.v @@ -16,9 +16,7 @@ module VX_fp_madd #( input wire [TAGW-1:0] tag_in, - input wire do_add, - input wire do_sub, - input wire do_mul, + input wire do_sub, input wire [LANES-1:0][31:0] dataa, input wire [LANES-1:0][31:0] datab, @@ -32,138 +30,16 @@ module VX_fp_madd #( ); wire stall = ~ready_out && valid_out; + wire enable = ~stall; - reg do_add_r, do_sub_r, do_mul_r; + reg do_sub_r; for (genvar i = 0; i < LANES; i++) begin - wire [31:0] result_add; - wire [31:0] result_sub; - wire [31:0] result_mul; wire [31:0] result_madd; wire [31:0] result_msub; `ifdef QUARTUS - twentynm_fp_mac mac_fp_add ( - // inputs - .accumulate(), - .chainin_overflow(), - .chainin_invalid(), - .chainin_underflow(), - .chainin_inexact(), - .ax(dataa[i]), - .ay(datab[i]), - .az(), - .clk({2'b00,clk}), - .ena({2'b11,~stall}), - .aclr(2'b00), - .chainin(), - // outputs - .overflow(), - .invalid(), - .underflow(), - .inexact(), - .chainout_overflow(), - .chainout_invalid(), - .chainout_underflow(), - .chainout_inexact(), - .resulta(result_add), - .chainout() - ); - defparam mac_fp_add.operation_mode = "sp_add"; - defparam 
mac_fp_add.use_chainin = "false"; - defparam mac_fp_add.adder_subtract = "false"; - defparam mac_fp_add.ax_clock = "0"; - defparam mac_fp_add.ay_clock = "0"; - defparam mac_fp_add.az_clock = "none"; - defparam mac_fp_add.output_clock = "0"; - defparam mac_fp_add.accumulate_clock = "none"; - defparam mac_fp_add.ax_chainin_pl_clock = "none"; - defparam mac_fp_add.accum_pipeline_clock = "none"; - defparam mac_fp_add.mult_pipeline_clock = "none"; - defparam mac_fp_add.adder_input_clock = "0"; - defparam mac_fp_add.accum_adder_clock = "none"; - - twentynm_fp_mac mac_fp_sub ( - // inputs - .accumulate(), - .chainin_overflow(), - .chainin_invalid(), - .chainin_underflow(), - .chainin_inexact(), - .ax(dataa[i]), - .ay(datab[i]), - .az(), - .clk({2'b00,clk}), - .ena({2'b11,~stall}), - .aclr(2'b00), - .chainin(), - // outputs - .overflow(), - .invalid(), - .underflow(), - .inexact(), - .chainout_overflow(), - .chainout_invalid(), - .chainout_underflow(), - .chainout_inexact(), - .resulta(result_sub), - .chainout() - ); - defparam mac_fp_sub.operation_mode = "sp_add"; - defparam mac_fp_sub.use_chainin = "false"; - defparam mac_fp_sub.adder_subtract = "true"; - defparam mac_fp_sub.ax_clock = "0"; - defparam mac_fp_sub.ay_clock = "0"; - defparam mac_fp_sub.az_clock = "none"; - defparam mac_fp_sub.output_clock = "0"; - defparam mac_fp_sub.accumulate_clock = "none"; - defparam mac_fp_sub.ax_chainin_pl_clock = "none"; - defparam mac_fp_sub.accum_pipeline_clock = "none"; - defparam mac_fp_sub.mult_pipeline_clock = "none"; - defparam mac_fp_sub.adder_input_clock = "0"; - defparam mac_fp_sub.accum_adder_clock = "none"; - - twentynm_fp_mac mac_fp_mul ( - // inputs - .accumulate(), - .chainin_overflow(), - .chainin_invalid(), - .chainin_underflow(), - .chainin_inexact(), - .ax(), - .ay(datab[i]), - .az(dataa[i]), - .clk({2'b00,clk}), - .ena({2'b11,~stall}), - .aclr(2'b00), - .chainin(), - // outputs - .overflow(), - .invalid(), - .underflow(), - .inexact(), - .chainout_overflow(), - 
.chainout_invalid(), - .chainout_underflow(), - .chainout_inexact(), - .resulta(result_mul), - .chainout() - ); - defparam mac_fp_mul.operation_mode = "sp_mult"; - defparam mac_fp_mul.use_chainin = "false"; - defparam mac_fp_mul.adder_subtract = "false"; - defparam mac_fp_mul.ax_clock = "none"; - defparam mac_fp_mul.ay_clock = "0"; - defparam mac_fp_mul.az_clock = "0"; - defparam mac_fp_mul.output_clock = "0"; - defparam mac_fp_mul.accumulate_clock = "none"; - defparam mac_fp_mul.ax_chainin_pl_clock = "none"; - defparam mac_fp_mul.accum_pipeline_clock = "none"; - defparam mac_fp_mul.mult_pipeline_clock = "0"; - defparam mac_fp_mul.adder_input_clock = "none"; - defparam mac_fp_mul.accum_adder_clock = "none"; - twentynm_fp_mac mac_fp_madd ( // inputs .accumulate(), @@ -175,7 +51,7 @@ module VX_fp_madd #( .ay(datab[i]), .az(dataa[i]), .clk({2'b00,clk}), - .ena({2'b11,~stall}), + .ena({2'b11,enable}), .aclr(2'b00), .chainin(), // outputs @@ -215,7 +91,7 @@ module VX_fp_madd #( .ay(datab[i]), .az(dataa[i]), .clk({2'b00,clk}), - .ena({2'b11,~stall}), + .ena({2'b11,enable}), .aclr(2'b00), .chainin(), // outputs @@ -245,47 +121,25 @@ module VX_fp_madd #( defparam mac_fp_msub.accum_adder_clock = "none"; `else always @(posedge clk) begin - dpi_fadd(0*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_add); - dpi_fsub(1*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_sub); - dpi_fmul(2*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_mul); - dpi_fmadd(3*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_madd); - dpi_fmsub(4*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_msub); + dpi_fmadd(3*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd); + dpi_fmsub(4*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub); end `endif - reg [31:0] result_r; - - always @(*) begin - result_r = 'x; - if (do_mul_r) begin - if (do_add_r) - result_r = result_madd; - else if (do_sub_r) - result_r = result_msub; - else - result_r = 
result_mul; - end else begin - if (do_add_r) - result_r = result_add; - else if (do_sub_r) - result_r = result_sub; - end - end - - assign result[i] = result_r; + assign result[i] = do_sub_r ? result_msub : result_madd; end VX_shift_register #( - .DATAW(TAGW + 1 + 1 + 1 + 1), + .DATAW(TAGW + 1 + 1), .DEPTH(`LATENCY_FMADD) - ) shift_reg1 ( + ) shift_reg ( .clk(clk), .reset(reset), - .enable(~stall), - .in({tag_in, valid_in, do_add, do_sub, do_mul}), - .out({tag_out, valid_out, do_add_r, do_sub_r, do_mul_r}) + .enable(enable), + .in({tag_in, valid_in, do_sub}), + .out({tag_out, valid_out, do_sub_r}) ); - assign ready_in = ~stall; + assign ready_in = enable; endmodule diff --git a/hw/rtl/fp_cores/VX_fp_nmadd.v b/hw/rtl/fp_cores/VX_fp_nmadd.v index 2236ef69..95350bc2 100644 --- a/hw/rtl/fp_cores/VX_fp_nmadd.v +++ b/hw/rtl/fp_cores/VX_fp_nmadd.v @@ -30,13 +30,14 @@ module VX_fp_nmadd #( ); wire stall = ~ready_out && valid_out; + wire enable = ~stall; reg do_sub_r; for (genvar i = 0; i < LANES; i++) begin wire [31:0] result_madd; - wire [31:0] result_msub; + wire [31:0] result_msub; wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd; @@ -52,7 +53,7 @@ module VX_fp_nmadd #( .ay(datab[i]), .az(dataa[i]), .clk({2'b00,clk}), - .ena({2'b11,~stall}), + .ena({2'b11,enable}), .aclr(2'b00), .chainin(), // outputs @@ -161,33 +162,36 @@ module VX_fp_nmadd #( defparam mac_fp_neg.adder_input_clock = "0"; defparam mac_fp_neg.accum_adder_clock = "none"; `else - reg valid_in_st0; always @(posedge clk) begin - valid_in_st0 <= reset ? 
0 : valid_in; - dpi_fmadd(5*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_madd); - dpi_fmsub(6*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_msub); - dpi_fsub(7*LANES+i, ~stall, valid_in_st0, 32'b0, result_st0, result[i]); + dpi_fmadd(5*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd); + dpi_fmsub(6*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub); + dpi_fsub(7*LANES+i, enable, 32'b0, result_st0, result[i]); end `endif - end - - always @(posedge clk) begin - if (~stall) begin - do_sub_r <= do_sub; - end end + VX_shift_register #( + .DATAW(1), + .DEPTH(`LATENCY_FMADD) + ) shift_reg0 ( + .clk(clk), + .reset(reset), + .enable(enable), + .in({do_sub}), + .out({do_sub_r}) + ); + VX_shift_register #( .DATAW(TAGW + 1), - .DEPTH(`LATENCY_FNMADD) + .DEPTH(`LATENCY_FMADD + `LATENCY_FADDMUL) ) shift_reg1 ( .clk(clk), .reset(reset), - .enable(~stall), + .enable(enable), .in({tag_in, valid_in}), .out({tag_out, valid_out}) ); - assign ready_in = ~stall; + assign ready_in = enable; endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/VX_fp_sqrt.v b/hw/rtl/fp_cores/VX_fp_sqrt.v index 15f41da0..780fa041 100644 --- a/hw/rtl/fp_cores/VX_fp_sqrt.v +++ b/hw/rtl/fp_cores/VX_fp_sqrt.v @@ -25,19 +25,20 @@ module VX_fp_sqrt #( output wire valid_out ); wire stall = ~ready_out && valid_out; + wire enable = ~stall; for (genvar i = 0; i < LANES; i++) begin `ifdef QUARTUS - acl_fp_sqrt fsqrt ( + acl_fsqrt fsqrt ( .clk (clk), .areset (1'b0), - .en (~stall), + .en (enable), .a (dataa[i]), .q (result[i]) ); `else always @(posedge clk) begin - dpi_fsqrt(9*LANES+i, ~stall, valid_in, dataa[i], result[i]); + dpi_fsqrt(9*LANES+i, enable, dataa[i], result[i]); end `endif end @@ -48,11 +49,11 @@ module VX_fp_sqrt #( ) shift_reg ( .clk(clk), .reset(reset), - .enable(~stall), + .enable(enable), .in ({tag_in, valid_in}), .out({tag_out, valid_out}) ); - assign ready_in = ~stall; + assign ready_in = enable; endmodule diff --git 
a/hw/rtl/fp_cores/altera/acl_fp_div.sv b/hw/rtl/fp_cores/altera/acl_fdiv.sv similarity index 99% rename from hw/rtl/fp_cores/altera/acl_fp_div.sv rename to hw/rtl/fp_cores/altera/acl_fdiv.sv index b0fb6b43..c6c73008 100644 --- a/hw/rtl/fp_cores/altera/acl_fp_div.sv +++ b/hw/rtl/fp_cores/altera/acl_fdiv.sv @@ -15,12 +15,12 @@ // applicable agreement for further details. // --------------------------------------------------------------------------- -// SystemVerilog created from acl_fp_div -// SystemVerilog created on Mon Aug 31 06:15:17 2020 +// SystemVerilog created from acl_fdiv +// SystemVerilog created on Wed Sep 2 07:11:09 2020 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *) -module acl_fp_div ( +module acl_fdiv ( input wire [31:0] a, input wire [31:0] b, input wire [0:0] en, @@ -623,7 +623,7 @@ module acl_fp_div ( .outdata_aclr_a("CLEAR0"), .clock_enable_input_a("NORMAL"), .power_up_uninitialized("FALSE"), - .init_file("acl_fp_div_memoryC2_uid118_invTables_lutmem.hex"), + .init_file("acl_fdiv_memoryC2_uid118_invTables_lutmem.hex"), .init_file_layout("PORT_A"), .intended_device_family("Arria 10") ) memoryC2_uid118_invTables_lutmem_dmem ( @@ -755,7 +755,7 @@ module acl_fp_div ( .outdata_aclr_a("CLEAR0"), .clock_enable_input_a("NORMAL"), .power_up_uninitialized("FALSE"), - .init_file("acl_fp_div_memoryC1_uid115_invTables_lutmem.hex"), + .init_file("acl_fdiv_memoryC1_uid115_invTables_lutmem.hex"), .init_file_layout("PORT_A"), .intended_device_family("Arria 10") ) memoryC1_uid115_invTables_lutmem_dmem ( @@ -1060,7 
+1060,7 @@ module acl_fp_div ( .outdata_aclr_a("CLEAR0"), .clock_enable_input_a("NORMAL"), .power_up_uninitialized("FALSE"), - .init_file("acl_fp_div_memoryC0_uid112_invTables_lutmem.hex"), + .init_file("acl_fdiv_memoryC0_uid112_invTables_lutmem.hex"), .init_file_layout("PORT_A"), .intended_device_family("Arria 10") ) memoryC0_uid112_invTables_lutmem_dmem ( diff --git a/hw/rtl/fp_cores/altera/acl_fp_div_memoryC0_uid112_invTables_lutmem.hex b/hw/rtl/fp_cores/altera/acl_fdiv_memoryC0_uid112_invTables_lutmem.hex similarity index 100% rename from hw/rtl/fp_cores/altera/acl_fp_div_memoryC0_uid112_invTables_lutmem.hex rename to hw/rtl/fp_cores/altera/acl_fdiv_memoryC0_uid112_invTables_lutmem.hex diff --git a/hw/rtl/fp_cores/altera/acl_fp_div_memoryC1_uid115_invTables_lutmem.hex b/hw/rtl/fp_cores/altera/acl_fdiv_memoryC1_uid115_invTables_lutmem.hex similarity index 100% rename from hw/rtl/fp_cores/altera/acl_fp_div_memoryC1_uid115_invTables_lutmem.hex rename to hw/rtl/fp_cores/altera/acl_fdiv_memoryC1_uid115_invTables_lutmem.hex diff --git a/hw/rtl/fp_cores/altera/acl_fp_div_memoryC2_uid118_invTables_lutmem.hex b/hw/rtl/fp_cores/altera/acl_fdiv_memoryC2_uid118_invTables_lutmem.hex similarity index 100% rename from hw/rtl/fp_cores/altera/acl_fp_div_memoryC2_uid118_invTables_lutmem.hex rename to hw/rtl/fp_cores/altera/acl_fdiv_memoryC2_uid118_invTables_lutmem.hex diff --git a/hw/rtl/fp_cores/altera/acl_fp_sqrt.sv b/hw/rtl/fp_cores/altera/acl_fsqrt.sv similarity index 99% rename from hw/rtl/fp_cores/altera/acl_fp_sqrt.sv rename to hw/rtl/fp_cores/altera/acl_fsqrt.sv index d18ac12a..165effd6 100644 --- a/hw/rtl/fp_cores/altera/acl_fp_sqrt.sv +++ b/hw/rtl/fp_cores/altera/acl_fsqrt.sv @@ -15,12 +15,12 @@ // applicable agreement for further details. 
// --------------------------------------------------------------------------- -// SystemVerilog created from acl_fp_sqrt -// SystemVerilog created on Mon Aug 31 06:15:18 2020 +// SystemVerilog created from acl_fsqrt +// SystemVerilog created on Wed Sep 2 07:11:09 2020 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *) -module acl_fp_sqrt ( +module acl_fsqrt ( input wire [31:0] a, input wire [0:0] en, output wire [31:0] q, @@ -279,7 +279,7 @@ module acl_fp_sqrt ( .outdata_aclr_a("CLEAR0"), .clock_enable_input_a("NORMAL"), .power_up_uninitialized("FALSE"), - .init_file("acl_fp_sqrt_memoryC2_uid68_sqrtTables_lutmem.hex"), + .init_file("acl_fsqrt_memoryC2_uid68_sqrtTables_lutmem.hex"), .init_file_layout("PORT_A"), .intended_device_family("Arria 10") ) memoryC2_uid68_sqrtTables_lutmem_dmem ( @@ -412,7 +412,7 @@ module acl_fp_sqrt ( .outdata_aclr_a("CLEAR0"), .clock_enable_input_a("NORMAL"), .power_up_uninitialized("FALSE"), - .init_file("acl_fp_sqrt_memoryC1_uid65_sqrtTables_lutmem.hex"), + .init_file("acl_fsqrt_memoryC1_uid65_sqrtTables_lutmem.hex"), .init_file_layout("PORT_A"), .intended_device_family("Arria 10") ) memoryC1_uid65_sqrtTables_lutmem_dmem ( @@ -723,7 +723,7 @@ module acl_fp_sqrt ( .outdata_aclr_a("CLEAR0"), .clock_enable_input_a("NORMAL"), .power_up_uninitialized("FALSE"), - .init_file("acl_fp_sqrt_memoryC0_uid62_sqrtTables_lutmem.hex"), + .init_file("acl_fsqrt_memoryC0_uid62_sqrtTables_lutmem.hex"), .init_file_layout("PORT_A"), .intended_device_family("Arria 10") ) memoryC0_uid62_sqrtTables_lutmem_dmem ( 
diff --git a/hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC0_uid62_sqrtTables_lutmem.hex b/hw/rtl/fp_cores/altera/acl_fsqrt_memoryC0_uid62_sqrtTables_lutmem.hex similarity index 100% rename from hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC0_uid62_sqrtTables_lutmem.hex rename to hw/rtl/fp_cores/altera/acl_fsqrt_memoryC0_uid62_sqrtTables_lutmem.hex diff --git a/hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC1_uid65_sqrtTables_lutmem.hex b/hw/rtl/fp_cores/altera/acl_fsqrt_memoryC1_uid65_sqrtTables_lutmem.hex similarity index 100% rename from hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC1_uid65_sqrtTables_lutmem.hex rename to hw/rtl/fp_cores/altera/acl_fsqrt_memoryC1_uid65_sqrtTables_lutmem.hex diff --git a/hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC2_uid68_sqrtTables_lutmem.hex b/hw/rtl/fp_cores/altera/acl_fsqrt_memoryC2_uid68_sqrtTables_lutmem.hex similarity index 100% rename from hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC2_uid68_sqrtTables_lutmem.hex rename to hw/rtl/fp_cores/altera/acl_fsqrt_memoryC2_uid68_sqrtTables_lutmem.hex diff --git a/hw/rtl/fp_cores/altera/acl_fp_ftoi.sv b/hw/rtl/fp_cores/altera/acl_ftoi.sv similarity index 99% rename from hw/rtl/fp_cores/altera/acl_fp_ftoi.sv rename to hw/rtl/fp_cores/altera/acl_ftoi.sv index d96b28ea..e800b01d 100644 --- a/hw/rtl/fp_cores/altera/acl_fp_ftoi.sv +++ b/hw/rtl/fp_cores/altera/acl_ftoi.sv @@ -15,12 +15,12 @@ // applicable agreement for further details. 
// --------------------------------------------------------------------------- -// SystemVerilog created from acl_fp_ftoi -// SystemVerilog created on Mon Aug 31 06:15:18 2020 +// SystemVerilog created from acl_ftoi +// SystemVerilog created on Wed Sep 2 07:11:09 2020 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *) -module acl_fp_ftoi ( +module acl_ftoi ( input wire [31:0] a, input wire [0:0] en, output wire [31:0] q, diff --git a/hw/rtl/fp_cores/altera/acl_fp_ftou.sv b/hw/rtl/fp_cores/altera/acl_ftou.sv similarity index 99% rename from hw/rtl/fp_cores/altera/acl_fp_ftou.sv rename to hw/rtl/fp_cores/altera/acl_ftou.sv index d7c84b34..cd5de555 100644 --- a/hw/rtl/fp_cores/altera/acl_fp_ftou.sv +++ b/hw/rtl/fp_cores/altera/acl_ftou.sv @@ -15,12 +15,12 @@ // applicable agreement for further details. 
// --------------------------------------------------------------------------- -// SystemVerilog created from acl_fp_ftou -// SystemVerilog created on Mon Aug 31 06:15:18 2020 +// SystemVerilog created from acl_ftou +// SystemVerilog created on Wed Sep 2 07:11:09 2020 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *) -module acl_fp_ftou ( +module acl_ftou ( input wire [31:0] a, input wire [0:0] en, output wire [31:0] q, diff --git a/hw/rtl/fp_cores/altera/acl_gen.log b/hw/rtl/fp_cores/altera/acl_gen.log new file mode 100644 index 00000000..3e694b39 --- /dev/null +++ b/hw/rtl/fp_cores/altera/acl_gen.log @@ -0,0 +1,169 @@ +starting execution ... +build model options ... +argc=21 +Generation context: + HardFP is enabled enabling set to true + Faithful rounding constraint detected + Will not generate valid and channel signals + The new component name is acl_fdiv + Frequency 250MHz + Deployment FPGA Arria10 +Estimated resources LUTs 539, DSPs 5, RAMBits 32768, RAMBlocks 3 +The pipeline depth of the block is 15 cycle(s) +@@start +@name FPDiv@ +@latency 15@ +@LUT 539@ +@DSP 5@ +@RAMBits 32768@ +@RAMBlockUsage 3@ +@enable 1@ +@subnormals 0@ +@error 1.00@ +@rounding NA@ +@method polynomial approximation@ +@inPort 0 fpieee 8 23@ +@inPort 1 fpieee 8 23@ +@outPort 0 fpieee 8 23@ +@nochanvalid 1@ +@@end +starting execution ... +build model options ... 
+argc=20 +Generation context: + HardFP is enabled enabling set to true + Faithful rounding constraint detected + Will not generate valid and channel signals + The new component name is acl_fsqrt + Frequency 250MHz + Deployment FPGA Arria10 +Estimated resources LUTs 271, DSPs 3, RAMBits 15872, RAMBlocks 3 +The pipeline depth of the block is 10 cycle(s) +@@start +@name FPSqrt@ +@latency 10@ +@LUT 271@ +@DSP 3@ +@RAMBits 15872@ +@RAMBlockUsage 3@ +@enable 1@ +@subnormals 0@ +@error 1.00@ +@rounding NA@ +@method polynomial approximation@ +@inPort 0 fpieee 8 23@ +@outPort 0 fpieee 8 23@ +@nochanvalid 1@ +@@end +starting execution ... +build model options ... +argc=23 +Generation context: + HardFP is enabled enabling set to true + Faithful rounding constraint detected + Will not generate valid and channel signals + The new component name is acl_ftoi + Frequency 250MHz + Deployment FPGA Arria10 +Estimated resources LUTs 327, DSPs 0, RAMBits 0, RAMBlocks 0 +The pipeline depth of the block is 3 cycle(s) +@@start +@name FPToFXP@ +@latency 3@ +@LUT 327@ +@DSP 0@ +@RAMBits 0@ +@RAMBlockUsage 0@ +@enable 1@ +@subnormals 0@ +@error 1.00@ +@rounding NA@ +@method default@ +@inPort 0 fpieee 8 23@ +@outPort 0 fxp 32 0 1@ +@nochanvalid 1@ +@@end +starting execution ... +build model options ... +argc=23 +Generation context: + HardFP is enabled enabling set to true + Faithful rounding constraint detected + Will not generate valid and channel signals + The new component name is acl_ftou + Frequency 250MHz + Deployment FPGA Arria10 +Estimated resources LUTs 287, DSPs 0, RAMBits 0, RAMBlocks 0 +The pipeline depth of the block is 3 cycle(s) +@@start +@name FPToFXP@ +@latency 3@ +@LUT 287@ +@DSP 0@ +@RAMBits 0@ +@RAMBlockUsage 0@ +@enable 1@ +@subnormals 0@ +@error 1.00@ +@rounding NA@ +@method default@ +@inPort 0 fpieee 8 23@ +@outPort 0 fxp 32 0 0@ +@nochanvalid 1@ +@@end +starting execution ... +build model options ... 
+argc=23 +Generation context: + HardFP is enabled enabling set to true + Faithful rounding constraint detected + Will not generate valid and channel signals + The new component name is acl_itof + Frequency 250MHz + Deployment FPGA Arria10 +Estimated resources LUTs 397, DSPs 0, RAMBits 0, RAMBlocks 0 +The pipeline depth of the block is 7 cycle(s) +@@start +@name FXPToFP@ +@latency 7@ +@LUT 397@ +@DSP 0@ +@RAMBits 0@ +@RAMBlockUsage 0@ +@enable 1@ +@subnormals 0@ +@error 1.00@ +@rounding NA@ +@method default@ +@inPort 0 fxp 32 0 1@ +@outPort 0 fpieee 8 23@ +@nochanvalid 1@ +@@end +starting execution ... +build model options ... +argc=23 +Generation context: + HardFP is enabled enabling set to true + Faithful rounding constraint detected + Will not generate valid and channel signals + The new component name is acl_utof + Frequency 300MHz + Deployment FPGA Arria10 +Estimated resources LUTs 363, DSPs 0, RAMBits 0, RAMBlocks 0 +The pipeline depth of the block is 7 cycle(s) +@@start +@name FXPToFP@ +@latency 7@ +@LUT 363@ +@DSP 0@ +@RAMBits 0@ +@RAMBlockUsage 0@ +@enable 1@ +@subnormals 0@ +@error 1.00@ +@rounding NA@ +@method default@ +@inPort 0 fxp 32 0 0@ +@outPort 0 fpieee 8 23@ +@nochanvalid 1@ +@@end diff --git a/hw/rtl/fp_cores/altera/acl_gen.sh b/hw/rtl/fp_cores/altera/acl_gen.sh new file mode 100755 index 00000000..f26058eb --- /dev/null +++ b/hw/rtl/fp_cores/altera/acl_gen.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +CMD_POLY_EVAL_PATH=$QUARTUS_HOME/dspba/backend/linux64 + +OPTIONS="-target Arria10 -lang verilog -enableHardFP 1 -printMachineReadable -faithfulRounding -noChanValid -enable -speedgrade 2" + +export LD_LIBRARY_PATH=$CMD_POLY_EVAL_PATH:$LD_LIBRARY_PATH + +CMD="$CMD_POLY_EVAL_PATH/cmdPolyEval $OPTIONS" + +EXP_BITS=8 +MAN_BITS=23 +FBITS="f$(($EXP_BITS + $MAN_BITS + 1))" + +echo Generating IP cores for $FBITS +{ + $CMD -name acl_fdiv -frequency 250 FPDiv $EXP_BITS $MAN_BITS 0 + $CMD -name acl_fsqrt -frequency 250 FPSqrt $EXP_BITS $MAN_BITS + $CMD -name acl_ftoi 
-frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 1 + $CMD -name acl_ftou -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 0 + $CMD -name acl_itof -frequency 250 FXPToFP 32 0 1 $EXP_BITS $MAN_BITS + $CMD -name acl_utof -frequency 300 FXPToFP 32 0 0 $EXP_BITS $MAN_BITS +} > acl_gen.log 2>&1 + +#cp $QUARTUS_HOME/dspba/backend/Libraries/sv/base/dspba_library_ver.sv . \ No newline at end of file diff --git a/hw/rtl/fp_cores/altera/acl_fp_itof.sv b/hw/rtl/fp_cores/altera/acl_itof.sv similarity index 99% rename from hw/rtl/fp_cores/altera/acl_fp_itof.sv rename to hw/rtl/fp_cores/altera/acl_itof.sv index 4a7ce08b..5e6a74aa 100644 --- a/hw/rtl/fp_cores/altera/acl_fp_itof.sv +++ b/hw/rtl/fp_cores/altera/acl_itof.sv @@ -15,12 +15,12 @@ // applicable agreement for further details. // --------------------------------------------------------------------------- -// SystemVerilog created from acl_fp_itof -// SystemVerilog created on Mon Aug 31 06:15:18 2020 +// SystemVerilog created from acl_itof +// SystemVerilog created on Wed Sep 2 07:11:09 2020 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *) -module acl_fp_itof ( +module acl_itof ( input wire [31:0] a, input wire [0:0] en, output wire [31:0] q, diff --git a/hw/rtl/fp_cores/altera/acl_fp_utof.sv b/hw/rtl/fp_cores/altera/acl_utof.sv similarity index 99% rename from hw/rtl/fp_cores/altera/acl_fp_utof.sv rename to hw/rtl/fp_cores/altera/acl_utof.sv index 946ba8e9..4b404625 100644 --- a/hw/rtl/fp_cores/altera/acl_fp_utof.sv +++ b/hw/rtl/fp_cores/altera/acl_utof.sv @@ -15,12 +15,12 @@ 
// applicable agreement for further details. // --------------------------------------------------------------------------- -// SystemVerilog created from acl_fp_utof -// SystemVerilog created on Mon Aug 31 06:15:18 2020 +// SystemVerilog created from acl_utof +// SystemVerilog created on Wed Sep 2 07:11:09 2020 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *) -module acl_fp_utof ( +module acl_utof ( input wire [31:0] a, input wire [0:0] en, output wire [31:0] q, diff --git a/hw/rtl/fp_cores/altera/generate.sh b/hw/rtl/fp_cores/altera/generate.sh deleted file mode 100755 index 84a84a2c..00000000 --- a/hw/rtl/fp_cores/altera/generate.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -CMD_POLY_EVAL_PATH=$QUARTUS_HOME/dspba/backend/linux64 - -OPTIONS="-target Arria10 -lang verilog -enableHardFP 1 -printMachineReadable -faithfulRounding -noChanValid -enable -speedgrade 2" - -export LD_LIBRARY_PATH=$CMD_POLY_EVAL_PATH:$LD_LIBRARY_PATH - -CMD="$CMD_POLY_EVAL_PATH/cmdPolyEval $OPTIONS" - -EXP_BITS=8 -MAN_BITS=23 -FBITS="f$(($EXP_BITS + $MAN_BITS + 1))" - -echo Generating IP cores for $FBITS -{ - $CMD -name acl_fp_div -frequency 250 FPDiv $EXP_BITS $MAN_BITS 0 - $CMD -name acl_fp_sqrt -frequency 250 FPSqrt $EXP_BITS $MAN_BITS - $CMD -name acl_fp_ftoi -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 1 - $CMD -name acl_fp_ftou -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 0 - $CMD -name acl_fp_itof -frequency 250 FXPToFP 32 0 1 $EXP_BITS $MAN_BITS - $CMD -name acl_fp_utof -frequency 300 FXPToFP 32 0 0 $EXP_BITS $MAN_BITS -} > 
log.txt 2>&1 - -cp $QUARTUS_HOME/dspba/backend/Libraries/sv/base/dspba_library_ver.sv . \ No newline at end of file diff --git a/hw/rtl/fp_cores/svdpi/float_dpi.cpp b/hw/rtl/fp_cores/svdpi/float_dpi.cpp index 0cacb814..dc5d1bcd 100644 --- a/hw/rtl/fp_cores/svdpi/float_dpi.cpp +++ b/hw/rtl/fp_cores/svdpi/float_dpi.cpp @@ -8,21 +8,19 @@ #include "VX_config.h" extern "C" { - void dpi_fadd(int inst, bool enable, bool valid, int a, int b, int* result); - void dpi_fsub(int inst, bool enable, bool valid, int a, int b, int* result); - void dpi_fmul(int inst, bool enable, bool valid, int a, int b, int* result); - void dpi_fmadd(int inst, bool enable, bool valid, int a, int b, int c, int* result); - void dpi_fmsub(int inst, bool enable, bool valid, int a, int b, int c, int* result); - void dpi_fdiv(int inst, bool enable, bool valid, int a, int b, int* result); - void dpi_fsqrt(int inst, bool enable, bool valid, int a, int* result); - void dpi_ftoi(int inst, bool enable, bool valid, int a, int* result); - void dpi_ftou(int inst, bool enable, bool valid, int a, int* result); - void dpi_itof(int inst, bool enable, bool valid, int a, int* result); - void dpi_utof(int inst, bool enable, bool valid, int a, int* result); + void dpi_fadd(int inst, bool enable, int a, int b, int* result); + void dpi_fsub(int inst, bool enable, int a, int b, int* result); + void dpi_fmul(int inst, bool enable, int a, int b, int* result); + void dpi_fmadd(int inst, bool enable, int a, int b, int c, int* result); + void dpi_fmsub(int inst, bool enable, int a, int b, int c, int* result); + void dpi_fdiv(int inst, bool enable, int a, int b, int* result); + void dpi_fsqrt(int inst, bool enable, int a, int* result); + void dpi_ftoi(int inst, bool enable, int a, int* result); + void dpi_ftou(int inst, bool enable, int a, int* result); + void dpi_itof(int inst, bool enable, int a, int* result); + void dpi_utof(int inst, bool enable, int a, int* result); } -extern double sc_time_stamp(); - class ShiftRegister 
{ public: ShiftRegister() : init_(false), depth_(0) {} @@ -35,37 +33,36 @@ public: } } - void push(int value, bool enable, bool valid) { + void push(int value, bool enable) { if (!enable) return; for (unsigned i = 0; i < depth_-1; ++i) { buffer_[i] = buffer_[i+1]; } - buffer_[depth_-1].value = value; - buffer_[depth_-1].valid = valid; + buffer_[depth_-1] = value; } int top() const { - return buffer_[0].value; - } - - bool valid() const { - return buffer_[0].valid; + return buffer_[0]; } private: - struct entry_t { - int value; - bool valid; - }; - - std::vector buffer_; - int top_; - unsigned depth_; + std::vector buffer_; bool init_; + unsigned depth_; }; +union Float_t { + float f; + int i; + struct { + uint32_t man : 23; + uint32_t exp : 8; + uint32_t sign : 1; + } parts; +}; + class Instances { public: ShiftRegister& get(int inst) { @@ -82,130 +79,152 @@ private: Instances instances; -void dpi_fadd(int inst, bool enable, bool valid, int a, int b, int* result) { +void dpi_fadd(int inst, bool enable, int a, int b, int* result) { ShiftRegister& sr = instances.get(inst); - float fa = *(float*)&a; - float fb = *(float*)&b; - float fr = fa + fb; + Float_t fa, fb, fr; - sr.ensure_init(LATENCY_FMADD); - sr.push(*(int*)&fr, enable, valid); + fa.i = a; + fb.i = b; + fr.f = fa.f + fb.f; + + sr.ensure_init(LATENCY_FADDMUL); + sr.push(fr.i, enable); *result = sr.top(); } -void dpi_fsub(int inst, bool enable, bool valid, int a, int b, int* result) { +void dpi_fsub(int inst, bool enable, int a, int b, int* result) { ShiftRegister& sr = instances.get(inst); - float fa = *(float*)&a; - float fb = *(float*)&b; - float fr = fa - fb; + Float_t fa, fb, fr; - sr.ensure_init(LATENCY_FMADD); - sr.push(*(int*)&fr, enable, valid); + fa.i = a; + fb.i = b; + fr.f = fa.f - fb.f; + + sr.ensure_init(LATENCY_FADDMUL); + sr.push(fr.i, enable); *result = sr.top(); } -void dpi_fmul(int inst, bool enable, bool valid, int a, int b, int* result) { +void dpi_fmul(int inst, bool enable, int a, int b, 
int* result) { ShiftRegister& sr = instances.get(inst); - float fa = *(float*)&a; - float fb = *(float*)&b; - float fr = fa * fb; + Float_t fa, fb, fr; - sr.ensure_init(LATENCY_FMADD); - sr.push(*(int*)&fr, enable, valid); + fa.i = a; + fb.i = b; + fr.f = fa.f * fb.f; + + sr.ensure_init(LATENCY_FADDMUL); + sr.push(fr.i, enable); *result = sr.top(); } -void dpi_fmadd(int inst, bool enable, bool valid, int a, int b, int c, int* result) { +void dpi_fmadd(int inst, bool enable, int a, int b, int c, int* result) { ShiftRegister& sr = instances.get(inst); - float fa = *(float*)&a; - float fb = *(float*)&b; - float fc = *(float*)&c; - float fr = fa * fb + fc; + Float_t fa, fb, fc, fr; + + fa.i = a; + fb.i = b; + fc.i = c; + fr.f = fa.f * fb.f + fc.f; sr.ensure_init(LATENCY_FMADD); - sr.push(*(int*)&fr, enable, valid); + sr.push(fr.i, enable); *result = sr.top(); } -void dpi_fmsub(int inst, bool enable, bool valid, int a, int b, int c, int* result) { +void dpi_fmsub(int inst, bool enable, int a, int b, int c, int* result) { ShiftRegister& sr = instances.get(inst); - float fa = *(float*)&a; - float fb = *(float*)&b; - float fc = *(float*)&c; - float fr = fa * fb - fc; + Float_t fa, fb, fc, fr; + + fa.i = a; + fb.i = b; + fc.i = c; + fr.f = fa.f * fb.f - fc.f; sr.ensure_init(LATENCY_FMADD); - sr.push(*(int*)&fr, enable, valid); + sr.push(fr.i, enable); *result = sr.top(); } -void dpi_fdiv(int inst, bool enable, bool valid, int a, int b, int* result) { +void dpi_fdiv(int inst, bool enable, int a, int b, int* result) { ShiftRegister& sr = instances.get(inst); - float fa = *(float*)&a; - float fb = *(float*)&b; - float fr = fa / fb; + Float_t fa, fb, fr; + + fa.i = a; + fb.i = b; + fr.f = fa.f / fb.f; sr.ensure_init(LATENCY_FDIV); - sr.push(*(int*)&fr, enable, valid); + sr.push(fr.i, enable); *result = sr.top(); } -void dpi_fsqrt(int inst, bool enable, bool valid, int a, int* result) { +void dpi_fsqrt(int inst, bool enable, int a, int* result) { ShiftRegister& sr = 
instances.get(inst); - float fa = *(float*)&a; - float fr = sqrtf(fa); + Float_t fa, fr; + + fa.i = a; + fr.f = sqrtf(fa.f); sr.ensure_init(LATENCY_FSQRT); - sr.push(*(int*)&fr, enable, valid); + sr.push(fr.i, enable); *result = sr.top(); } -void dpi_ftoi(int inst, bool enable, bool valid, int a, int* result) { +void dpi_ftoi(int inst, bool enable, int a, int* result) { ShiftRegister& sr = instances.get(inst); - float fa = *(float*)&a; - int ir = int(fa); + Float_t fa, fr; + + fa.i = a; + fr.i = int(fa.f); sr.ensure_init(LATENCY_FTOI); - sr.push(ir, enable, valid); + sr.push(fr.i, enable); *result = sr.top(); } -void dpi_ftou(int inst, bool enable, bool valid, int a, int* result) { +void dpi_ftou(int inst, bool enable, int a, int* result) { ShiftRegister& sr = instances.get(inst); - float fa = *(float*)&a; - unsigned ir = unsigned(fa); + Float_t fa, fr; + + fa.i = a; + fr.i = unsigned(fa.f); sr.ensure_init(LATENCY_FTOI); - sr.push(ir, enable, valid); + sr.push(fr.i, enable); *result = sr.top(); } -void dpi_itof(int inst, bool enable, bool valid, int a, int* result) { +void dpi_itof(int inst, bool enable, int a, int* result) { ShiftRegister& sr = instances.get(inst); - float fr = (float)a; + Float_t fa, fr; + + fr.f = (float)a; sr.ensure_init(LATENCY_ITOF); - sr.push(*(int*)&fr, enable, valid); + sr.push(fr.i, enable); *result = sr.top(); } -void dpi_utof(int inst, bool enable, bool valid, int a, int* result) { +void dpi_utof(int inst, bool enable, int a, int* result) { ShiftRegister& sr = instances.get(inst); - unsigned ua = *(unsigned*)&a; - float fr = (float)ua; + Float_t fa, fr; + + unsigned ua = a; + fr.f = (float)ua; sr.ensure_init(LATENCY_ITOF); - sr.push(*(int*)&fr, enable, valid); + sr.push(fr.i, enable); *result = sr.top(); } \ No newline at end of file diff --git a/hw/rtl/fp_cores/svdpi/float_dpi.vh b/hw/rtl/fp_cores/svdpi/float_dpi.vh index aae84c89..a94dc34c 100644 --- a/hw/rtl/fp_cores/svdpi/float_dpi.vh +++ b/hw/rtl/fp_cores/svdpi/float_dpi.vh @@ 
-1,16 +1,16 @@ `ifndef FLOAT_DPI `define FLOAT_DPI -import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input logic valid, input int a, input int b, output int result); -import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input logic valid, input int a, input int b, output int result); -import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input logic valid, input int a, input int b, output int result); -import "DPI-C" context function void dpi_fmadd(int inst, input logic enable, input logic valid, input int a, input int b, input int c, output int result); -import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input logic valid, input int a, input int b, input int c, output int result); -import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input logic valid, input int a, input int b, output int result); -import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input logic valid, input int a, output int result); -import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input logic valid, input int a, output int result); -import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input logic valid, input int a, output int result); -import "DPI-C" context function void dpi_itof(int inst, input logic enable, input logic valid, input int a, output int result); -import "DPI-C" context function void dpi_utof(int inst, input logic enable, input logic valid, input int a, output int result); +import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input int a, input int b, output int result); +import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input int a, input int b, output int result); +import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input int a, input int b, output int result); +import "DPI-C" context function void dpi_fmadd(int 
inst, input logic enable, input int a, input int b, input int c, output int result); +import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input int a, input int b, input int c, output int result); +import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input int a, input int b, output int result); +import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input int a, output int result); +import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input int a, output int result); +import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input int a, output int result); +import "DPI-C" context function void dpi_itof(int inst, input logic enable, input int a, output int result); +import "DPI-C" context function void dpi_utof(int inst, input logic enable, input int a, output int result); `endif \ No newline at end of file diff --git a/hw/rtl/libs/VX_divide.v b/hw/rtl/libs/VX_divide.v index 9cdf6848..f960ba52 100644 --- a/hw/rtl/libs/VX_divide.v +++ b/hw/rtl/libs/VX_divide.v @@ -1,21 +1,18 @@ `include "VX_platform.vh" module VX_divide #( - parameter WIDTHN = 1, - parameter WIDTHD = 1, - parameter WIDTHQ = 1, - parameter WIDTHR = 1, + parameter WIDTHN = 1, + parameter WIDTHD = 1, + parameter WIDTHQ = 1, + parameter WIDTHR = 1, parameter NSIGNED = 0, parameter DSIGNED = 0, - parameter PIPELINE = 0 + parameter LATENCY = 0 ) ( input wire clk, - input wire reset, - - input wire clk_en, + input wire enable, input wire [WIDTHN-1:0] numer, input wire [WIDTHD-1:0] denom, - output wire [WIDTHQ-1:0] quotient, output wire [WIDTHR-1:0] remainder ); @@ -27,11 +24,11 @@ module VX_divide #( lpm_divide divide ( .clock (clk), + .clken (enable), .numer (numer), .denom (denom), .quotient (quotient_unqual), - .remain (remainder_unqual), - .clken (clk_en) + .remain (remainder_unqual) ); defparam @@ -41,7 +38,7 @@ module VX_divide #( divide.lpm_nrepresentation = NSIGNED ? 
"SIGNED" : "UNSIGNED", divide.lpm_drepresentation = DSIGNED ? "SIGNED" : "UNSIGNED", divide.lpm_hint = "MAXIMIZE_SPEED=6,LPM_REMAINDERPOSITIVE=FALSE", - divide.lpm_pipeline = PIPELINE; + divide.lpm_pipeline = LATENCY; assign quotient = quotient_unqual [WIDTHQ-1:0]; assign remainder = remainder_unqual [WIDTHR-1:0]; @@ -72,34 +69,24 @@ module VX_divide #( end end - if (PIPELINE == 0) begin + if (LATENCY == 0) begin assign quotient = quotient_unqual [WIDTHQ-1:0]; assign remainder = remainder_unqual [WIDTHR-1:0]; end else begin - reg [WIDTHN-1:0] quotient_pipe [0:PIPELINE-1]; - reg [WIDTHD-1:0] remainder_pipe [0:PIPELINE-1]; + reg [WIDTHN-1:0] quotient_pipe [0:LATENCY-1]; + reg [WIDTHD-1:0] remainder_pipe [0:LATENCY-1]; - for (genvar i = 0; i < PIPELINE; i++) begin - always @(posedge clk) begin - if (reset) begin - quotient_pipe[i] <= 0; - remainder_pipe[i] <= 0; - end else begin - if (clk_en) begin - if (i == 0) begin - quotient_pipe[i] <= quotient_unqual; - remainder_pipe[i] <= remainder_unqual; - end else begin - quotient_pipe[i] <= quotient_pipe[i-1]; - remainder_pipe[i] <= remainder_pipe[i-1]; - end - end + for (genvar i = 0; i < LATENCY; i++) begin + always @(posedge clk) begin + if (enable) begin + quotient_pipe[i] <= (0 == i) ? quotient_unqual : quotient_pipe[i-1]; + remainder_pipe[i] <= (0 == i) ? 
remainder_unqual : remainder_pipe[i-1]; end end end - assign quotient = quotient_pipe[PIPELINE-1][WIDTHQ-1:0]; - assign remainder = remainder_pipe[PIPELINE-1][WIDTHR-1:0]; + assign quotient = quotient_pipe[LATENCY-1][WIDTHQ-1:0]; + assign remainder = remainder_pipe[LATENCY-1][WIDTHR-1:0]; end `endif diff --git a/hw/rtl/libs/VX_multiplier.v b/hw/rtl/libs/VX_multiplier.v index 5822f31b..ed7c5946 100644 --- a/hw/rtl/libs/VX_multiplier.v +++ b/hw/rtl/libs/VX_multiplier.v @@ -1,16 +1,14 @@ `include "VX_platform.vh" module VX_multiplier #( - parameter WIDTHA = 1, - parameter WIDTHB = 1, - parameter WIDTHP = 1, - parameter SIGNED = 0, - parameter PIPELINE = 0 + parameter WIDTHA = 1, + parameter WIDTHB = 1, + parameter WIDTHP = 1, + parameter SIGNED = 0, + parameter LATENCY = 0 ) ( - input wire clk, - input wire reset, - - input wire clk_en, + input wire clk, + input wire enable, input wire [WIDTHA-1:0] dataa, input wire [WIDTHB-1:0] datab, output wire [WIDTHP-1:0] result @@ -20,20 +18,22 @@ module VX_multiplier #( lpm_mult mult ( .clock (clk), + .clken (enable), .dataa (dataa), .datab (datab), - .result (result), - .clken (clk_en), + .result (result), + .aclr (1'b0), + .sclr (1'b0), .sum (1'b0) ); - defparam mult.lpm_type = "LPM_MULT", + defparam mult.lpm_type = "LPM_MULT", mult.lpm_widtha = WIDTHA, mult.lpm_widthb = WIDTHB, mult.lpm_widthp = WIDTHP, mult.lpm_representation = SIGNED ? 
"SIGNED" : "UNSIGNED", - mult.lpm_pipeline = PIPELINE, - mult.lpm_hint = "MAXIMIZE_SPEED=9,DEDICATED_MULTIPLIER_CIRCUITRY=YES"; + mult.lpm_pipeline = LATENCY, + mult.lpm_hint = "DEDICATED_MULTIPLIER_CIRCUITRY=YES,MAXIMIZE_SPEED=9"; `else wire [WIDTHP-1:0] result_unqual; @@ -44,29 +44,20 @@ module VX_multiplier #( assign result_unqual = dataa * datab; end - if (PIPELINE == 0) begin + if (LATENCY == 0) begin assign result = result_unqual; - end else begin - - reg [WIDTHP-1:0] result_pipe [0:PIPELINE-1]; + end else begin + reg [WIDTHP-1:0] result_pipe [0:LATENCY-1]; - for (genvar i = 0; i < PIPELINE; i++) begin + for (genvar i = 0; i < LATENCY; i++) begin always @(posedge clk) begin - if (reset) begin - result_pipe[i] <= 0; - end else begin - if (clk_en) begin - if (i == 0) begin - result_pipe[i] <= result_unqual; - end else begin - result_pipe[i] <= result_pipe[i-1]; - end - end + if (enable) begin + result_pipe[i] <= (0 == i) ? result_unqual : result_pipe[i-1]; end end - end - - assign result = result_pipe[PIPELINE-1]; + end + + assign result = result_pipe[LATENCY-1]; end `endif