diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 8eeaa44e..73e64d64 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -34,9 +34,10 @@ LDFLAGS += -shared -pthread TOP = Vortex SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp +SRCS += ../../hw/rtl/fp_cores/svdpi/float_dpi.cpp -FPU_INCLUDE = -I../../hw/rtl/fp_cores/fpnew/src/common_cells/include -I../../hw/rtl/fp_cores/fpnew/src/common_cells/src -I../../hw/rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../hw/rtl/fp_cores/fpnew/src -RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/fp_cores $(FPU_INCLUDE) +FPU_INCLUDE = -I../../hw/rtl/fp_cores -I../../hw/rtl/fp_cores/svdpi -I../../hw/rtl/fp_cores/fpnew/src/common_cells/include -I../../hw/rtl/fp_cores/fpnew/src/common_cells/src -I../../hw/rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../hw/rtl/fp_cores/fpnew/src +RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache $(FPU_INCLUDE) VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS) VL_FLAGS += -Wno-DECLFILENAME diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index ab4d4742..82e67786 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -51,15 +51,55 @@ `define L3_ENABLE (`NUM_CLUSTERS > 1) `endif +`ifndef EXT_M_DISABLE `define EXT_M_ENABLE +`endif +`ifndef EXT_F_DISABLE `define EXT_F_ENABLE +`endif + +`define FPNEW_ENABLE // Device identification `define VENDOR_ID 0 `define ARCHITECTURE_ID 0 `define IMPLEMENTATION_ID 0 +/////////////////////////////////////////////////////////////////////////////// + +`ifndef LATENCY_IMUL +`define LATENCY_IMUL 3 +`endif + +`ifndef LATENCY_FNONCOMP +`define LATENCY_FNONCOMP 1 +`endif + +`ifndef LATENCY_FMADD +`define LATENCY_FMADD 1 +`endif + +`ifndef LATENCY_FNMADD +`define LATENCY_FNMADD 2 +`endif + +`ifndef LATENCY_FDIV +`define LATENCY_FDIV 15 +`endif + +`ifndef LATENCY_FSQRT +`define 
LATENCY_FSQRT 9 +`endif + +`ifndef LATENCY_ITOF +`define LATENCY_ITOF 7 +`endif + +`ifndef LATENCY_FTOI +`define LATENCY_FTOI 3 +`endif + // CSR Addresses ////////////////////////////////////////////////////////////// `define CSR_FFLAGS 12'h001 diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 66017c93..298d500e 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -35,22 +35,6 @@ /////////////////////////////////////////////////////////////////////////////// -`define LATENCY_IMUL 3 - -`define LATENCY_FDIV 16 -`define LATENCY_FSQRT 10 -`define LATENCY_FTOI 5 -`define LATENCY_FTOU 4 -`define LATENCY_ITOF 8 -`define LATENCY_UTOF 7 - -`define LATENCY_FMULADD 2 -`define LATENCY_FDIVSQRT 2 -`define LATENCY_FCONV 2 -`define LATENCY_FNONCOMP 1 - -/////////////////////////////////////////////////////////////////////////////// - `define INST_LUI 7'b0110111 `define INST_AUIPC 7'b0010111 `define INST_JAL 7'b1101111 diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index bdbcd7bf..5cf7aaa4 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -56,7 +56,7 @@ module VX_fpu_unit #( // can accept new request? 
assign fpu_req_if.ready = ready_in && ~fpuq_full; -`ifdef SYNTHESIS +`ifndef FPNEW_ENABLE VX_fp_fpga #( .TAGW (FPUQ_BITS) diff --git a/hw/rtl/fp_cores/altera/VX_fp_div.v b/hw/rtl/fp_cores/VX_fp_div.v similarity index 73% rename from hw/rtl/fp_cores/altera/VX_fp_div.v rename to hw/rtl/fp_cores/VX_fp_div.v index 698782d9..a91784d7 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_div.v +++ b/hw/rtl/fp_cores/VX_fp_div.v @@ -1,5 +1,9 @@ `include "VX_define.vh" +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + module VX_fp_div #( parameter TAGW = 1, parameter LANES = 1 @@ -21,19 +25,23 @@ module VX_fp_div #( input wire ready_out, output wire valid_out ); - wire stall = ~ready_out && valid_out; - wire enable = ~stall; - assign ready_in = enable; - + wire stall = ~ready_out && valid_out; + for (genvar i = 0; i < LANES; i++) begin + `ifdef QUARTUS acl_fp_div fdiv ( .clk (clk), .areset (1'b0), - .en (enable), + .en (~stall), .a (dataa[i]), .b (datab[i]), .q (result[i]) ); + `else + always @(posedge clk) begin + dpi_fdiv(clk, ~stall, dataa[i], datab[i], result[i]); + end + `endif end VX_shift_register #( @@ -42,9 +50,11 @@ module VX_fp_div #( ) shift_reg ( .clk(clk), .reset(reset), - .enable(enable), + .enable(~stall), .in ({tag_in, valid_in}), .out({tag_out, valid_out}) ); + assign ready_in = ~stall; + endmodule diff --git a/hw/rtl/fp_cores/VX_fp_fpga.v b/hw/rtl/fp_cores/VX_fp_fpga.v index 2097ffd1..243f0bc1 100644 --- a/hw/rtl/fp_cores/VX_fp_fpga.v +++ b/hw/rtl/fp_cores/VX_fp_fpga.v @@ -1,5 +1,4 @@ `include "VX_define.vh" -`include "dspba_library_ver.sv" module VX_fp_fpga #( parameter TAGW = 1 @@ -28,7 +27,7 @@ module VX_fp_fpga #( input wire ready_out, output wire valid_out ); - localparam NUM_FPC = 12; + localparam NUM_FPC = 7; localparam FPC_BITS = `LOG2UP(NUM_FPC); wire [NUM_FPC-1:0] per_core_ready_in; @@ -41,26 +40,30 @@ module VX_fp_fpga #( fflags_t [`NUM_THREADS-1:0] fpnew_fflags; reg [FPC_BITS-1:0] core_select; - reg fmadd_negate; + reg do_add, do_sub, do_mul; + reg 
is_signed; always @(*) begin - core_select = 0; - fmadd_negate = 0; + core_select = 'x; + do_add = 'x; + do_sub = 'x; + do_mul = 'x; + is_signed = 'x; case (op_type) - `FPU_ADD: core_select = 1; - `FPU_SUB: core_select = 2; - `FPU_MUL: core_select = 3; - `FPU_MADD: core_select = 4; - `FPU_MSUB: core_select = 5; - `FPU_NMSUB: begin core_select = 4; fmadd_negate = 1; end - `FPU_NMADD: begin core_select = 5; fmadd_negate = 1; end - `FPU_DIV: core_select = 6; - `FPU_SQRT: core_select = 7; - `FPU_CVTWS: core_select = 8; - `FPU_CVTWUS: core_select = 9; - `FPU_CVTSW: core_select = 10; - `FPU_CVTSWU: core_select = 11; - default:; + `FPU_ADD: begin core_select = 1; do_mul = 0; do_add = 1; do_sub = 0; end + `FPU_SUB: begin core_select = 1; do_mul = 0; do_add = 0; do_sub = 1; end + `FPU_MUL: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 0; end + `FPU_MADD: begin core_select = 1; do_mul = 1; do_add = 1; do_sub = 0; end + `FPU_MSUB: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 1; end + `FPU_NMSUB: begin core_select = 2; do_sub = 1; end + `FPU_NMADD: begin core_select = 2; do_sub = 0; end + `FPU_DIV: begin core_select = 3; end + `FPU_SQRT: begin core_select = 4; end + `FPU_CVTWS: begin core_select = 5; is_signed = 1; end + `FPU_CVTWUS: begin core_select = 5; is_signed = 0; end + `FPU_CVTSW: begin core_select = 6; is_signed = 1; end + `FPU_CVTSWU: begin core_select = 6; is_signed = 0; end + default: begin core_select = 0; end endcase end @@ -76,7 +79,7 @@ module VX_fp_fpga #( .op_type (op_type), .frm (frm), .dataa (dataa), - .datab (datab), + .datab (datab), .result (per_core_result[0]), .has_fflags (fpnew_has_fflags), .fflags (fpnew_fflags), @@ -85,44 +88,50 @@ module VX_fp_fpga #( .valid_out (per_core_valid_out[0]) ); - VX_fp_add #( + VX_fp_madd #( .TAGW (TAGW), .LANES(`NUM_THREADS) - ) fp_add ( + ) fp_madd ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 1)), .ready_in (per_core_ready_in[1]), .tag_in (tag_in), + .do_add (do_add), + 
.do_sub (do_sub), + .do_mul (do_mul), .dataa (dataa), - .datab (datab), + .datab (datab), + .datac (datac), .result (per_core_result[1]), .tag_out (per_core_tag_out[1]), .ready_out (per_core_ready_out[1]), .valid_out (per_core_valid_out[1]) ); - VX_fp_sub #( + VX_fp_nmadd #( .TAGW (TAGW), .LANES(`NUM_THREADS) - ) fp_sub ( + ) fp_nmadd ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 2)), .ready_in (per_core_ready_in[2]), - .tag_in (tag_in), + .tag_in (tag_in), + .do_sub (do_sub), .dataa (dataa), - .datab (datab), + .datab (datab), + .datac (datac), .result (per_core_result[2]), .tag_out (per_core_tag_out[2]), .ready_out (per_core_ready_out[2]), .valid_out (per_core_valid_out[2]) ); - VX_fp_mul #( + VX_fp_div #( .TAGW (TAGW), .LANES(`NUM_THREADS) - ) fp_mul ( + ) fp_div ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 3)), @@ -136,75 +145,20 @@ module VX_fp_fpga #( .valid_out (per_core_valid_out[3]) ); - VX_fp_madd #( - .TAGW (TAGW), - .LANES(`NUM_THREADS) - ) fp_madd ( - .clk (clk), - .reset (reset), - .valid_in (valid_in && (core_select == 4)), - .ready_in (per_core_ready_in[4]), - .tag_in (tag_in), - .negate (fmadd_negate), - .dataa (dataa), - .datab (datab), - .datac (datac), - .result (per_core_result[4]), - .tag_out (per_core_tag_out[4]), - .ready_out (per_core_ready_out[4]), - .valid_out (per_core_valid_out[4]) - ); - - VX_fp_msub #( - .TAGW (TAGW), - .LANES(`NUM_THREADS) - ) fp_msub ( - .clk (clk), - .reset (reset), - .valid_in (valid_in && (core_select == 5)), - .ready_in (per_core_ready_in[5]), - .tag_in (tag_in), - .negate (fmadd_negate), - .dataa (dataa), - .datab (datab), - .datac (datac), - .result (per_core_result[5]), - .tag_out (per_core_tag_out[5]), - .ready_out (per_core_ready_out[5]), - .valid_out (per_core_valid_out[5]) - ); - - VX_fp_div #( - .TAGW (TAGW), - .LANES(`NUM_THREADS) - ) fp_div ( - .clk (clk), - .reset (reset), - .valid_in (valid_in && (core_select == 6)), - .ready_in (per_core_ready_in[6]), - 
.tag_in (tag_in), - .dataa (dataa), - .datab (datab), - .result (per_core_result[6]), - .tag_out (per_core_tag_out[6]), - .ready_out (per_core_ready_out[6]), - .valid_out (per_core_valid_out[6]) - ); - VX_fp_sqrt #( .TAGW (TAGW), .LANES(`NUM_THREADS) ) fp_sqrt ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 7)), - .ready_in (per_core_ready_in[7]), + .valid_in (valid_in && (core_select == 4)), + .ready_in (per_core_ready_in[4]), .tag_in (tag_in), .dataa (dataa), - .result (per_core_result[7]), - .tag_out (per_core_tag_out[7]), - .ready_out (per_core_ready_out[7]), - .valid_out (per_core_valid_out[7]) + .result (per_core_result[4]), + .tag_out (per_core_tag_out[4]), + .ready_out (per_core_ready_out[4]), + .valid_out (per_core_valid_out[4]) ); VX_fp_ftoi #( @@ -213,30 +167,15 @@ module VX_fp_fpga #( ) fp_ftoi ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 8)), - .ready_in (per_core_ready_in[8]), - .tag_in (tag_in), + .valid_in (valid_in && (core_select == 5)), + .ready_in (per_core_ready_in[5]), + .tag_in (tag_in), + .is_signed (is_signed), .dataa (dataa), - .result (per_core_result[8]), - .tag_out (per_core_tag_out[8]), - .ready_out (per_core_ready_out[8]), - .valid_out (per_core_valid_out[8]) - ); - - VX_fp_ftou #( - .TAGW (TAGW), - .LANES(`NUM_THREADS) - ) fp_ftou ( - .clk (clk), - .reset (reset), - .valid_in (valid_in && (core_select == 9)), - .ready_in (per_core_ready_in[9]), - .tag_in (tag_in), - .dataa (dataa), - .result (per_core_result[9]), - .tag_out (per_core_tag_out[9]), - .ready_out (per_core_ready_out[9]), - .valid_out (per_core_valid_out[9]) + .result (per_core_result[5]), + .tag_out (per_core_tag_out[5]), + .ready_out (per_core_ready_out[5]), + .valid_out (per_core_valid_out[5]) ); VX_fp_itof #( @@ -245,60 +184,45 @@ module VX_fp_fpga #( ) fp_itof ( .clk (clk), .reset (reset), - .valid_in (valid_in && (core_select == 10)), - .ready_in (per_core_ready_in[10]), - .tag_in (tag_in), + .valid_in (valid_in && 
(core_select == 6)), + .ready_in (per_core_ready_in[6]), + .tag_in (tag_in), + .is_signed (is_signed), .dataa (dataa), - .result (per_core_result[10]), - .tag_out (per_core_tag_out[10]), - .ready_out (per_core_ready_out[10]), - .valid_out (per_core_valid_out[10]) + .result (per_core_result[6]), + .tag_out (per_core_tag_out[6]), + .ready_out (per_core_ready_out[6]), + .valid_out (per_core_valid_out[6]) ); - VX_fp_utof #( - .TAGW (TAGW), - .LANES(`NUM_THREADS) - ) fp_utof ( - .clk (clk), - .reset (reset), - .valid_in (valid_in && (core_select == 11)), - .ready_in (per_core_ready_in[11]), - .tag_in (tag_in), - .dataa (dataa), - .result (per_core_result[11]), - .tag_out (per_core_tag_out[11]), - .ready_out (per_core_ready_out[11]), - .valid_out (per_core_valid_out[11]) - ); - - reg valid_out_r; - reg has_fflags_r; - reg [`NUM_THREADS-1:0][31:0] result_r; - reg [TAGW-1:0] tag_out_r; + reg valid_out_n; + reg has_fflags_n; + reg [`NUM_THREADS-1:0][31:0] result_n; + reg [TAGW-1:0] tag_out_n; always @(*) begin per_core_ready_out = 0; - valid_out_r = 0; - has_fflags_r = 'x; - result_r = 'x; - tag_out_r = 'x; + valid_out_n = 0; + has_fflags_n = 'x; + result_n = 'x; + tag_out_n = 'x; for (integer i = 0; i < NUM_FPC; i++) begin if (per_core_valid_out[i]) begin per_core_ready_out[i] = ready_out; - valid_out_r = 1; - has_fflags_r = fpnew_has_fflags && (i == 0); - result_r = per_core_result[i]; - tag_out_r = per_core_tag_out[i]; + valid_out_n = 1; + has_fflags_n = fpnew_has_fflags && (i == 0); + result_n = per_core_result[i]; + tag_out_n = per_core_tag_out[i]; break; end end end assign ready_in = (& per_core_ready_in); - assign valid_out = valid_out_r; - assign has_fflags = has_fflags_r; - assign tag_out = tag_out_r; - assign result = result_r; + assign valid_out = valid_out_n; + assign has_fflags = has_fflags_n; + assign tag_out = tag_out_n; + assign result = result_n; assign fflags = fpnew_fflags; endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/VX_fp_ftoi.v 
b/hw/rtl/fp_cores/VX_fp_ftoi.v new file mode 100644 index 00000000..35bbdb1d --- /dev/null +++ b/hw/rtl/fp_cores/VX_fp_ftoi.v @@ -0,0 +1,77 @@ +`include "VX_define.vh" + +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + +module VX_fp_ftoi #( + parameter TAGW = 1, + parameter LANES = 1 +) ( + input wire clk, + input wire reset, + + output wire ready_in, + input wire valid_in, + + input wire [TAGW-1:0] tag_in, + + input wire is_signed, + + input wire [LANES-1:0][31:0] dataa, + output wire [LANES-1:0][31:0] result, + + output wire [TAGW-1:0] tag_out, + + input wire ready_out, + output wire valid_out +); + wire stall = ~ready_out && valid_out; + + reg is_signed_r; + + for (genvar i = 0; i < LANES; i++) begin + + wire [31:0] result_s; + wire [31:0] result_u; + + `ifdef QUARTUS + acl_fp_ftoi ftoi ( + .clk (clk), + .areset (1'b0), + .en (~stall), + .a (dataa[i]), + .q (result_s) + ); + + acl_fp_ftou ftou ( + .clk (clk), + .areset (1'b0), + .en (~stall), + .a (dataa[i]), + .q (result_u) + ); + `else + always @(posedge clk) begin + dpi_ftoi(clk, ~stall, dataa[i], result_s); + dpi_ftou(clk, ~stall, dataa[i], result_u); + end + `endif + + assign result[i] = is_signed_r ? 
result_s : result_u; + end + + VX_shift_register #( + .DATAW(TAGW + 1 + 1), + .DEPTH(`LATENCY_FTOI) + ) shift_reg ( + .clk(clk), + .reset(reset), + .enable(~stall), + .in ({tag_in, valid_in, is_signed}), + .out({tag_out, valid_out, is_signed_r}) + ); + + assign ready_in = ~stall; + +endmodule diff --git a/hw/rtl/fp_cores/VX_fp_itof.v b/hw/rtl/fp_cores/VX_fp_itof.v new file mode 100644 index 00000000..222b1589 --- /dev/null +++ b/hw/rtl/fp_cores/VX_fp_itof.v @@ -0,0 +1,77 @@ +`include "VX_define.vh" + +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + +module VX_fp_itof #( + parameter TAGW = 1, + parameter LANES = 1 +) ( + input wire clk, + input wire reset, + + output wire ready_in, + input wire valid_in, + + input wire [TAGW-1:0] tag_in, + + input wire is_signed, + + input wire [LANES-1:0][31:0] dataa, + output wire [LANES-1:0][31:0] result, + + output wire [TAGW-1:0] tag_out, + + input wire ready_out, + output wire valid_out +); + wire stall = ~ready_out && valid_out; + + reg is_signed_r; + + for (genvar i = 0; i < LANES; i++) begin + + wire [31:0] result_s; + wire [31:0] result_u; + + `ifdef QUARTUS + acl_fp_itof itof ( + .clk (clk), + .areset (1'b0), + .en (~stall), + .a (dataa[i]), + .q (result_s) + ); + + acl_fp_utof utof ( + .clk (clk), + .areset (1'b0), + .en (~stall), + .a (dataa[i]), + .q (result_u) + ); + `else + always @(posedge clk) begin + dpi_itof(clk, ~stall, dataa[i], result_s); + dpi_utof(clk, ~stall, dataa[i], result_u); + end + `endif + + assign result[i] = is_signed_r ? 
result_s : result_u; + end + + VX_shift_register #( + .DATAW(TAGW + 1 + 1), + .DEPTH(`LATENCY_ITOF) /* tag/valid/is_signed delay uses the int-to-float latency macro (was LATENCY_FTOI) */ + ) shift_reg ( + .clk(clk), + .reset(reset), + .enable(~stall), + .in ({tag_in, valid_in, is_signed}), + .out({tag_out, valid_out, is_signed_r}) + ); + + assign ready_in = ~stall; + +endmodule diff --git a/hw/rtl/fp_cores/VX_fp_madd.v b/hw/rtl/fp_cores/VX_fp_madd.v new file mode 100644 index 00000000..a94fdfac --- /dev/null +++ b/hw/rtl/fp_cores/VX_fp_madd.v @@ -0,0 +1,291 @@ +`include "VX_define.vh" + +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + +module VX_fp_madd #( + parameter TAGW = 1, + parameter LANES = 1 +) ( + input wire clk, + input wire reset, + + output wire ready_in, + input wire valid_in, + + input wire [TAGW-1:0] tag_in, + + input wire do_add, + input wire do_sub, + input wire do_mul, + + input wire [LANES-1:0][31:0] dataa, + input wire [LANES-1:0][31:0] datab, + input wire [LANES-1:0][31:0] datac, + output wire [LANES-1:0][31:0] result, + + output wire [TAGW-1:0] tag_out, + + input wire ready_out, + output wire valid_out +); + + wire stall = ~ready_out && valid_out; + + reg do_add_r, do_sub_r, do_mul_r; + + for (genvar i = 0; i < LANES; i++) begin + + wire [31:0] result_add; + wire [31:0] result_sub; + wire [31:0] result_mul; + wire [31:0] result_madd; + wire [31:0] result_msub; + + `ifdef QUARTUS + twentynm_fp_mac mac_fp_add ( + // inputs + .accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(), + .ay(datab[i]), + .az(dataa[i]), + .clk({2'b00,clk}), + .ena({2'b11,~stall}), + .aclr(2'b00), + .chainin(), + // outputs + .overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result_add), + .chainout() + ); + defparam mac_fp_add.operation_mode = "sp_add"; + defparam mac_fp_add.use_chainin = "false"; + defparam mac_fp_add.adder_subtract = "false"; + defparam mac_fp_add.ax_clock =
"0"; + defparam mac_fp_add.ay_clock = "0"; + defparam mac_fp_add.az_clock = "0"; + defparam mac_fp_add.output_clock = "0"; + defparam mac_fp_add.accumulate_clock = "none"; + defparam mac_fp_add.ax_chainin_pl_clock = "0"; + defparam mac_fp_add.accum_pipeline_clock = "none"; + defparam mac_fp_add.mult_pipeline_clock = "0"; + defparam mac_fp_add.adder_input_clock = "0"; + defparam mac_fp_add.accum_adder_clock = "none"; + + twentynm_fp_mac mac_fp_sub ( + // inputs + .accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(), + .ay(datab[i]), + .az(dataa[i]), + .clk({2'b00,clk}), + .ena({2'b11,~stall}), + .aclr(2'b00), + .chainin(), + // outputs + .overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result_sub), + .chainout() + ); + defparam mac_fp_sub.operation_mode = "sp_add"; + defparam mac_fp_sub.use_chainin = "false"; + defparam mac_fp_sub.adder_subtract = "true"; + defparam mac_fp_sub.ax_clock = "0"; + defparam mac_fp_sub.ay_clock = "0"; + defparam mac_fp_sub.az_clock = "none"; + defparam mac_fp_sub.output_clock = "0"; + defparam mac_fp_sub.accumulate_clock = "none"; + defparam mac_fp_sub.ax_chainin_pl_clock = "none"; + defparam mac_fp_sub.accum_pipeline_clock = "none"; + defparam mac_fp_sub.mult_pipeline_clock = "none"; + defparam mac_fp_sub.adder_input_clock = "0"; + defparam mac_fp_sub.accum_adder_clock = "none"; + + twentynm_fp_mac mac_fp_mul ( + // inputs + .accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(), + .ay(datab[i]), + .az(dataa[i]), + .clk({2'b00,clk}), + .ena({2'b11,~stall}), + .aclr(2'b00), + .chainin(), + // outputs + .overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result_mul), + .chainout() + ); + defparam 
mac_fp_mul.operation_mode = "sp_mult"; + defparam mac_fp_mul.use_chainin = "false"; + defparam mac_fp_mul.adder_subtract = "false"; + defparam mac_fp_mul.ax_clock = "none"; + defparam mac_fp_mul.ay_clock = "0"; + defparam mac_fp_mul.az_clock = "0"; + defparam mac_fp_mul.output_clock = "0"; + defparam mac_fp_mul.accumulate_clock = "none"; + defparam mac_fp_mul.ax_chainin_pl_clock = "none"; + defparam mac_fp_mul.accum_pipeline_clock = "none"; + defparam mac_fp_mul.mult_pipeline_clock = "0"; + defparam mac_fp_mul.adder_input_clock = "none"; + defparam mac_fp_mul.accum_adder_clock = "none"; + + twentynm_fp_mac mac_fp_madd ( + // inputs + .accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(datac[i]), + .ay(datab[i]), + .az(dataa[i]), + .clk({2'b00,clk}), + .ena({2'b11,~stall}), + .aclr(2'b00), + .chainin(), + // outputs + .overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result_madd), + .chainout() + ); + defparam mac_fp_madd.operation_mode = "sp_mult_add"; + defparam mac_fp_madd.use_chainin = "false"; + defparam mac_fp_madd.adder_subtract = "false"; + defparam mac_fp_madd.ax_clock = "0"; + defparam mac_fp_madd.ay_clock = "0"; + defparam mac_fp_madd.az_clock = "0"; + defparam mac_fp_madd.output_clock = "0"; + defparam mac_fp_madd.accumulate_clock = "none"; + defparam mac_fp_madd.ax_chainin_pl_clock = "0"; + defparam mac_fp_madd.accum_pipeline_clock = "none"; + defparam mac_fp_madd.mult_pipeline_clock = "0"; + defparam mac_fp_madd.adder_input_clock = "0"; + defparam mac_fp_madd.accum_adder_clock = "none"; + + twentynm_fp_mac mac_fp_msub ( + // inputs + .accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(datac[i]), + .ay(datab[i]), + .az(dataa[i]), + .clk({2'b00,clk}), + .ena({2'b11,~stall}), + .aclr(2'b00), + .chainin(), + // outputs + 
.overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result_msub), + .chainout() + ); + defparam mac_fp_msub.operation_mode = "sp_mult_add"; + defparam mac_fp_msub.use_chainin = "false"; + defparam mac_fp_msub.adder_subtract = "true"; + defparam mac_fp_msub.ax_clock = "0"; + defparam mac_fp_msub.ay_clock = "0"; + defparam mac_fp_msub.az_clock = "0"; + defparam mac_fp_msub.output_clock = "0"; + defparam mac_fp_msub.accumulate_clock = "none"; + defparam mac_fp_msub.ax_chainin_pl_clock = "0"; + defparam mac_fp_msub.accum_pipeline_clock = "none"; + defparam mac_fp_msub.mult_pipeline_clock = "0"; + defparam mac_fp_msub.adder_input_clock = "0"; + defparam mac_fp_msub.accum_adder_clock = "none"; + `else + always @(posedge clk) begin + dpi_fadd(clk, ~stall, dataa[i], datab[i], result_add); + dpi_fsub(clk, ~stall, dataa[i], datab[i], result_sub); + dpi_fmul(clk, ~stall, dataa[i], datab[i], result_mul); + dpi_fmadd(clk, ~stall, dataa[i], datab[i], datac[i], result_madd); + dpi_fmsub(clk, ~stall, dataa[i], datab[i], datac[i], result_msub); + end + `endif + + reg [31:0] result_r; + + always @(*) begin + result_r = 'x; + if (do_mul_r) begin + if (do_add_r) + result_r = result_madd; + else if (do_sub_r) + result_r = result_msub; + else + result_r = result_mul; + end else begin + if (do_add_r) + result_r = result_add; + else if (do_sub_r) + result_r = result_sub; + end + end + + assign result[i] = result_r; + end + + VX_shift_register #( + .DATAW(TAGW + 1 + 1 + 1 + 1), + .DEPTH(`LATENCY_FMADD) + ) shift_reg1 ( + .clk(clk), + .reset(reset), + .enable(~stall), + .in({tag_in, valid_in, do_add, do_sub, do_mul}), + .out({tag_out, valid_out, do_add_r, do_sub_r, do_mul_r}) + ); + + assign ready_in = ~stall; + +endmodule diff --git a/hw/rtl/fp_cores/VX_fp_nmadd.v b/hw/rtl/fp_cores/VX_fp_nmadd.v new file mode 100644 index 00000000..f80b09d9 --- /dev/null +++ 
b/hw/rtl/fp_cores/VX_fp_nmadd.v @@ -0,0 +1,191 @@ +`include "VX_define.vh" + +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + +module VX_fp_nmadd #( + parameter TAGW = 1, + parameter LANES = 1 +) ( + input wire clk, + input wire reset, + + output wire ready_in, + input wire valid_in, + + input wire [TAGW-1:0] tag_in, + + input wire do_sub, + + input wire [LANES-1:0][31:0] dataa, + input wire [LANES-1:0][31:0] datab, + input wire [LANES-1:0][31:0] datac, + output wire [LANES-1:0][31:0] result, + + output wire [TAGW-1:0] tag_out, + + input wire ready_out, + output wire valid_out +); + + wire stall = ~ready_out && valid_out; + + reg do_sub_r; + + for (genvar i = 0; i < LANES; i++) begin + + wire [31:0] result_madd; + wire [31:0] result_msub; + + wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd; + + `ifdef QUARTUS + twentynm_fp_mac mac_fp_madd ( + // inputs + .accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(datac[i]), + .ay(datab[i]), + .az(dataa[i]), + .clk({2'b00,clk}), + .ena({2'b11,~stall}), /* concatenation closed; gated by ~stall like the tag shift register */ + .aclr(2'b00), + .chainin(), + // outputs + .overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result_madd), + .chainout() + ); + defparam mac_fp_madd.operation_mode = "sp_mult_add"; + defparam mac_fp_madd.use_chainin = "false"; + defparam mac_fp_madd.adder_subtract = "false"; + defparam mac_fp_madd.ax_clock = "0"; + defparam mac_fp_madd.ay_clock = "0"; + defparam mac_fp_madd.az_clock = "0"; + defparam mac_fp_madd.output_clock = "0"; + defparam mac_fp_madd.accumulate_clock = "none"; + defparam mac_fp_madd.ax_chainin_pl_clock = "0"; + defparam mac_fp_madd.accum_pipeline_clock = "none"; + defparam mac_fp_madd.mult_pipeline_clock = "0"; + defparam mac_fp_madd.adder_input_clock = "0"; + defparam mac_fp_madd.accum_adder_clock = "none"; + + twentynm_fp_mac mac_fp_msub ( + // inputs +
.accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(datac[i]), + .ay(datab[i]), + .az(dataa[i]), + .clk({2'b00,clk}), + .ena({2'b11,~stall}), /* was undeclared enable0; use the module-wide stall gate */ + .aclr(2'b00), + .chainin(), + // outputs + .overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result_msub), + .chainout() + ); + defparam mac_fp_msub.operation_mode = "sp_mult_add"; + defparam mac_fp_msub.use_chainin = "false"; + defparam mac_fp_msub.adder_subtract = "true"; + defparam mac_fp_msub.ax_clock = "0"; + defparam mac_fp_msub.ay_clock = "0"; + defparam mac_fp_msub.az_clock = "0"; + defparam mac_fp_msub.output_clock = "0"; + defparam mac_fp_msub.accumulate_clock = "none"; + defparam mac_fp_msub.ax_chainin_pl_clock = "0"; + defparam mac_fp_msub.accum_pipeline_clock = "none"; + defparam mac_fp_msub.mult_pipeline_clock = "0"; + defparam mac_fp_msub.adder_input_clock = "0"; + defparam mac_fp_msub.accum_adder_clock = "none"; + + twentynm_fp_mac mac_fp_neg ( + // inputs + .accumulate(), + .chainin_overflow(), + .chainin_invalid(), + .chainin_underflow(), + .chainin_inexact(), + .ax(32'h0), + .ay(result_st0), + .az(), + .clk({2'b00,clk}), + .ena({2'b11,~stall}), /* was undeclared enable1; use the module-wide stall gate */ + .aclr(2'b00), + .chainin(), + // outputs + .overflow(), + .invalid(), + .underflow(), + .inexact(), + .chainout_overflow(), + .chainout_invalid(), + .chainout_underflow(), + .chainout_inexact(), + .resulta(result[i]), + .chainout() + ); + defparam mac_fp_neg.operation_mode = "sp_add"; + defparam mac_fp_neg.use_chainin = "false"; + defparam mac_fp_neg.adder_subtract = "true"; + defparam mac_fp_neg.ax_clock = "0"; + defparam mac_fp_neg.ay_clock = "0"; + defparam mac_fp_neg.az_clock = "none"; + defparam mac_fp_neg.output_clock = "0"; + defparam mac_fp_neg.accumulate_clock = "none"; + defparam mac_fp_neg.ax_chainin_pl_clock = "none"; + defparam mac_fp_neg.accum_pipeline_clock = "none"; +
defparam mac_fp_neg.mult_pipeline_clock = "none"; + defparam mac_fp_neg.adder_input_clock = "0"; + defparam mac_fp_neg.accum_adder_clock = "none"; + `else + always @(posedge clk) begin + dpi_fmadd(clk, ~stall, dataa[i], datab[i], datac[i], result_madd); + dpi_fmsub(clk, ~stall, dataa[i], datab[i], datac[i], result_msub); + dpi_fsub(clk, ~stall, 32'b0, result_st0, result[i]); + end + `endif + end + + always @(posedge clk) begin + if (~stall) begin + do_sub_r <= do_sub; + end + end + + VX_shift_register #( + .DATAW(TAGW + 1), + .DEPTH(`LATENCY_FNMADD) + ) shift_reg1 ( + .clk(clk), + .reset(reset), + .enable(~stall), + .in({tag_in, valid_in}), + .out({tag_out, valid_out}) + ); + + assign ready_in = ~stall; + +endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/VX_fp_noncomp.v b/hw/rtl/fp_cores/VX_fp_noncomp.v index ac30b1b6..fb354bc8 100644 --- a/hw/rtl/fp_cores/VX_fp_noncomp.v +++ b/hw/rtl/fp_cores/VX_fp_noncomp.v @@ -45,8 +45,8 @@ module VX_fp_noncomp #( reg [LANES-1:0][31:0] datab_r; reg [LANES-1:0] a_sign, b_sign; - reg [LANES-1:0][7:0] a_exponent, b_exponent; - reg [LANES-1:0][22:0] a_mantissa, b_mantissa; + reg [LANES-1:0][7:0] a_exponent; + reg [LANES-1:0][22:0] a_mantissa; fp_type_t [LANES-1:0] a_type, b_type; reg [LANES-1:0] a_smaller, ab_equal; @@ -60,12 +60,12 @@ module VX_fp_noncomp #( // Setup for (genvar i = 0; i < LANES; i++) begin - wire tmp_a_sign = dataa[i][31]; - wire [7:0] tmp_a_exponent = dataa[i][30:23]; + wire tmp_a_sign = dataa[i][31]; + wire [7:0] tmp_a_exponent = dataa[i][30:23]; wire [22:0] tmp_a_mantissa = dataa[i][22:0]; - wire tmp_b_sign = datab[i][31]; - wire [7:0] tmp_b_exponent = datab[i][30:23]; + wire tmp_b_sign = datab[i][31]; + wire [7:0] tmp_b_exponent = datab[i][30:23]; wire [22:0] tmp_b_mantissa = datab[i][22:0]; fp_type_t tmp_a_type, tmp_b_type; @@ -86,14 +86,14 @@ module VX_fp_noncomp #( wire tmp_ab_equal = (dataa[i] == datab[i]) | (tmp_a_type[4] & tmp_b_type[4]); VX_generic_register #( - .N(1 + 1 + 8 + 8 + 23 + 
23 + $bits(fp_type_t) + $bits(fp_type_t) + 1 + 1) + .N(1 + 1 + 8 + 23 + $bits(fp_type_t) + $bits(fp_type_t) + 1 + 1) ) fnc1_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (1'b0), - .in ({tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_b_exponent, tmp_a_mantissa, tmp_b_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}), - .out ({a_sign[i], b_sign[i], a_exponent[i], b_exponent[i], a_mantissa[i], b_mantissa[i], a_type[i], b_type[i], a_smaller[i], ab_equal[i]}) + .in ({tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_a_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}), + .out ({a_sign[i], b_sign[i], a_exponent[i], a_mantissa[i], a_type[i], b_type[i], a_smaller[i], ab_equal[i]}) ); end @@ -213,8 +213,6 @@ module VX_fp_noncomp #( for (genvar i = 0; i < LANES; i++) begin always @(*) begin - tmp_result[i] = 32'hdeadbeaf; - {tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0; case (op_type_r) `FPU_CLASS: begin tmp_result[i] = fclass_mask[i]; @@ -224,7 +222,8 @@ module VX_fp_noncomp #( tmp_result[i] = fcmp_res[i]; {tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = fcmp_excp[i]; end - `FPU_MISC: begin + //`FPU_MISC: + default: begin case (frm) 0,1,2: begin tmp_result[i] = fsgnj_res[i]; @@ -234,7 +233,8 @@ module VX_fp_noncomp #( tmp_result[i] = fminmax_res[i]; {tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = {a_type[i][0] | b_type[i][0], 4'h0}; end - 5,6: begin + //5,6,7: + default: begin tmp_result[i] = dataa[i]; {tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0; end diff --git a/hw/rtl/fp_cores/altera/VX_fp_sqrt.v b/hw/rtl/fp_cores/VX_fp_sqrt.v similarity index 72% rename from hw/rtl/fp_cores/altera/VX_fp_sqrt.v rename to hw/rtl/fp_cores/VX_fp_sqrt.v index 784f5cb4..5f506094 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_sqrt.v +++ b/hw/rtl/fp_cores/VX_fp_sqrt.v @@ -1,5 +1,9 
@@ `include "VX_define.vh" +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + module VX_fp_sqrt #( parameter TAGW = 1, parameter LANES = 1 @@ -20,18 +24,22 @@ module VX_fp_sqrt #( input wire ready_out, output wire valid_out ); - wire stall = ~ready_out && valid_out; - wire enable = ~stall; - assign ready_in = enable; - + wire stall = ~ready_out && valid_out; + for (genvar i = 0; i < LANES; i++) begin + `ifdef QUARTUS acl_fp_sqrt fsqrt ( .clk (clk), .areset (1'b0), - .en (enable), + .en (~stall), .a (dataa[i]), .q (result[i]) ); + `else + always @(posedge clk) begin + dpi_fsqrt(clk, ~stall, dataa[i], result[i]); + end + `endif end VX_shift_register #( @@ -40,9 +48,11 @@ module VX_fp_sqrt #( ) shift_reg ( .clk(clk), .reset(reset), - .enable(enable), + .enable(~stall), .in ({tag_in, valid_in}), .out({tag_out, valid_out}) ); + assign ready_in = ~stall; + endmodule diff --git a/hw/rtl/fp_cores/VX_fp_type.v b/hw/rtl/fp_cores/VX_fp_type.v index 5e5aeb50..f94e286e 100644 --- a/hw/rtl/fp_cores/VX_fp_type.v +++ b/hw/rtl/fp_cores/VX_fp_type.v @@ -8,12 +8,20 @@ module VX_fp_type ( // outputs output fp_type_t o_type ); - assign o_type.is_normal = (exponent != 8'd0) && (exponent != 8'hff); - assign o_type.is_zero = (exponent == 8'd0) && (mantissa == 23'd0); - assign o_type.is_subnormal = (exponent == 8'd0) && !o_type.is_zero; - assign o_type.is_inf = ((exponent == 8'hff) && (mantissa == 23'd0)); - assign o_type.is_nan = ((exponent == 8'hff) && (mantissa != 23'd0)); - assign o_type.is_signaling = o_type.is_nan && (mantissa[22] == 1'b0); - assign o_type.is_quiet = o_type.is_nan && !o_type.is_signaling; + wire is_normal = (exponent != 8'd0) && (exponent != 8'hff); + wire is_zero = (exponent == 8'd0) && (mantissa == 23'd0); + wire is_subnormal = (exponent == 8'd0) && !is_zero; + wire is_inf = (exponent == 8'hff) && (mantissa == 23'd0); + wire is_nan = (exponent == 8'hff) && (mantissa != 23'd0); + wire is_signaling = is_nan && (mantissa[22] == 1'b0); + wire is_quiet = is_nan && 
!is_signaling; + + assign o_type.is_normal = is_normal; + assign o_type.is_zero = is_zero; + assign o_type.is_subnormal = is_subnormal; + assign o_type.is_inf = is_inf; + assign o_type.is_nan = is_nan; + assign o_type.is_signaling = is_signaling; + assign o_type.is_quiet = is_quiet; endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/VX_fpnew.v b/hw/rtl/fp_cores/VX_fpnew.v index fb792446..8d306b17 100644 --- a/hw/rtl/fp_cores/VX_fpnew.v +++ b/hw/rtl/fp_cores/VX_fpnew.v @@ -53,10 +53,10 @@ module VX_fpnew #( }; localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{ - PipeRegs:'{'{`LATENCY_FMULADD, 0, 0, 0, 0}, // ADDMUL - '{default: `LATENCY_FDIVSQRT}, // DIVSQRT - '{default: `LATENCY_FNONCOMP}, // NONCOMP - '{default: `LATENCY_FCONV}}, // CONV + PipeRegs:'{'{`LATENCY_FMADD, 0, 0, 0, 0}, // ADDMUL + '{default: `LATENCY_FDIV}, // DIVSQRT + '{default: `LATENCY_FNONCOMP}, // NONCOMP + '{default: `LATENCY_ITOF}}, // CONV UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL '{default: UNIT_FDIVSQRT}, // DIVSQRT '{default: UNIT_FNONCOMP}, // NONCOMP diff --git a/hw/rtl/fp_cores/altera/VX_fp_add.v b/hw/rtl/fp_cores/altera/VX_fp_add.v deleted file mode 100644 index eeb94556..00000000 --- a/hw/rtl/fp_cores/altera/VX_fp_add.v +++ /dev/null @@ -1,81 +0,0 @@ -`include "VX_define.vh" - -module VX_fp_add #( - parameter TAGW = 1, - parameter LANES = 1 -) ( - input wire clk, - input wire reset, - - output wire ready_in, - input wire valid_in, - - input wire [TAGW-1:0] tag_in, - - input wire [LANES-1:0][31:0] dataa, - input wire [LANES-1:0][31:0] datab, - output wire [LANES-1:0][31:0] result, - - output wire [TAGW-1:0] tag_out, - - input wire ready_out, - output wire valid_out -); - wire stall = ~ready_out && valid_out; - wire enable = ~stall; - assign ready_in = enable; - - for (genvar i = 0; i < LANES; i++) begin - twentynm_fp_mac mac_fp_wys ( - // inputs - .accumulate(), - .chainin_overflow(), - .chainin_invalid(), - .chainin_underflow(), - 
.chainin_inexact(), - .ax(dataa[i]), - .ay(datab[i]), - .az(), - .clk({2'b00,clk}), - .ena({2'b11,enable}), - .aclr(2'b00), - .chainin(), - // outputs - .overflow(), - .invalid(), - .underflow(), - .inexact(), - .chainout_overflow(), - .chainout_invalid(), - .chainout_underflow(), - .chainout_inexact(), - .resulta(result[i]), - .chainout() - ); - defparam mac_fp_wys.operation_mode = "sp_add"; - defparam mac_fp_wys.use_chainin = "false"; - defparam mac_fp_wys.adder_subtract = "false"; - defparam mac_fp_wys.ax_clock = "0"; - defparam mac_fp_wys.ay_clock = "0"; - defparam mac_fp_wys.az_clock = "none"; - defparam mac_fp_wys.output_clock = "0"; - defparam mac_fp_wys.accumulate_clock = "none"; - defparam mac_fp_wys.ax_chainin_pl_clock = "none"; - defparam mac_fp_wys.accum_pipeline_clock = "none"; - defparam mac_fp_wys.mult_pipeline_clock = "none"; - defparam mac_fp_wys.adder_input_clock = "0"; - defparam mac_fp_wys.accum_adder_clock = "none"; - end - - VX_shift_register #( - .DATAW(TAGW + 1), - .DEPTH(1) - ) shift_reg ( - .clk(clk), - .reset(reset), - .enable(enable), - .in ({tag_in, valid_in}), - .out({tag_out, valid_out}) - ); - -endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_ftoi.v b/hw/rtl/fp_cores/altera/VX_fp_ftoi.v deleted file mode 100644 index 6c2aa613..00000000 --- a/hw/rtl/fp_cores/altera/VX_fp_ftoi.v +++ /dev/null @@ -1,48 +0,0 @@ -`include "VX_define.vh" - -module VX_fp_ftoi #( - parameter TAGW = 1, - parameter LANES = 1 -) ( - input wire clk, - input wire reset, - - output wire ready_in, - input wire valid_in, - - input wire [TAGW-1:0] tag_in, - - input wire [LANES-1:0][31:0] dataa, - output wire [LANES-1:0][31:0] result, - - output wire [TAGW-1:0] tag_out, - - input wire ready_out, - output wire valid_out -); - wire stall = ~ready_out && valid_out; - wire enable = ~stall; - assign ready_in = enable; - - for (genvar i = 0; i < LANES; i++) begin - acl_fp_ftoi ftoi ( - .clk (clk), - .areset (1'b0), - .en (enable), - .a (dataa[i]), - .q (result[i]) - ); - 
end - - VX_shift_register #( - .DATAW(TAGW + 1), - .DEPTH(`LATENCY_FTOI) - ) shift_reg ( - .clk(clk), - .reset(reset), - .enable(enable), - .in ({tag_in, valid_in}), - .out({tag_out, valid_out}) - ); - -endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_ftou.v b/hw/rtl/fp_cores/altera/VX_fp_ftou.v deleted file mode 100644 index 71460515..00000000 --- a/hw/rtl/fp_cores/altera/VX_fp_ftou.v +++ /dev/null @@ -1,48 +0,0 @@ -`include "VX_define.vh" - -module VX_fp_ftou #( - parameter TAGW = 1, - parameter LANES = 1 -) ( - input wire clk, - input wire reset, - - output wire ready_in, - input wire valid_in, - - input wire [TAGW-1:0] tag_in, - - input wire [LANES-1:0][31:0] dataa, - output wire [LANES-1:0][31:0] result, - - output wire [TAGW-1:0] tag_out, - - input wire ready_out, - output wire valid_out -); - wire stall = ~ready_out && valid_out; - wire enable = ~stall; - assign ready_in = enable; - - for (genvar i = 0; i < LANES; i++) begin - acl_fp_ftou ftou ( - .clk (clk), - .areset (1'b0), - .en (enable), - .a (dataa[i]), - .q (result[i]) - ); - end - - VX_shift_register #( - .DATAW(TAGW + 1), - .DEPTH(`LATENCY_FTOU) - ) shift_reg ( - .clk(clk), - .reset(reset), - .enable(enable), - .in ({tag_in, valid_in}), - .out({tag_out, valid_out}) - ); - -endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_itof.v b/hw/rtl/fp_cores/altera/VX_fp_itof.v deleted file mode 100644 index 4a08ab01..00000000 --- a/hw/rtl/fp_cores/altera/VX_fp_itof.v +++ /dev/null @@ -1,48 +0,0 @@ -`include "VX_define.vh" - -module VX_fp_itof #( - parameter TAGW = 1, - parameter LANES = 1 -) ( - input wire clk, - input wire reset, - - output wire ready_in, - input wire valid_in, - - input wire [TAGW-1:0] tag_in, - - input wire [LANES-1:0][31:0] dataa, - output wire [LANES-1:0][31:0] result, - - output wire [TAGW-1:0] tag_out, - - input wire ready_out, - output wire valid_out -); - wire stall = ~ready_out && valid_out; - wire enable = ~stall; - assign ready_in = enable; - - for (genvar i = 0; i < LANES; 
i++) begin - acl_fp_itof itof ( - .clk (clk), - .areset (1'b0), - .en (enable), - .a (dataa[i]), - .q (result[i]) - ); - end - - VX_shift_register #( - .DATAW(TAGW + 1), - .DEPTH(`LATENCY_ITOF) - ) shift_reg ( - .clk(clk), - .reset(reset), - .enable(enable), - .in ({tag_in, valid_in}), - .out({tag_out, valid_out}) - ); - -endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_madd.v b/hw/rtl/fp_cores/altera/VX_fp_madd.v deleted file mode 100644 index 4841eeed..00000000 --- a/hw/rtl/fp_cores/altera/VX_fp_madd.v +++ /dev/null @@ -1,146 +0,0 @@ -`include "VX_define.vh" - -module VX_fp_madd #( - parameter TAGW = 1, - parameter LANES = 1 -) ( - input wire clk, - input wire reset, - - output wire ready_in, - input wire valid_in, - - input wire [TAGW-1:0] tag_in, - - input wire [LANES-1:0][31:0] dataa, - input wire [LANES-1:0][31:0] datab, - input wire [LANES-1:0][31:0] datac, - output wire [LANES-1:0][31:0] result, - - input wire negate, - - output wire [TAGW-1:0] tag_out, - - input wire ready_out, - output wire valid_out -); - wire enable0, enable1; - assign ready_in = enable0 && enable1; - - wire [LANES-1:0][31:0] result_st0, result_st1; - wire [TAGW-1:0] out_tag_st0, out_tag_st1; - wire in_valid_st0, out_valid_st0, out_valid_st1; - - for (genvar i = 0; i < LANES; i++) begin - twentynm_fp_mac mac_fp_wys0 ( - // inputs - .accumulate(), - .chainin_overflow(), - .chainin_invalid(), - .chainin_underflow(), - .chainin_inexact(), - .ax(datac[i]), - .ay(datab[i]), - .az(dataa[i]), - .clk({2'b00,clk}), - .ena({2'b11,enable0}), - .aclr(2'b00), - .chainin(), - // outputs - .overflow(), - .invalid(), - .underflow(), - .inexact(), - .chainout_overflow(), - .chainout_invalid(), - .chainout_underflow(), - .chainout_inexact(), - .resulta(result_st0[i]), - .chainout() - ); - defparam mac_fp_wys0.operation_mode = "sp_mult_add"; - defparam mac_fp_wys0.use_chainin = "false"; - defparam mac_fp_wys0.adder_subtract = "false"; - defparam mac_fp_wys0.ax_clock = "0"; - defparam 
mac_fp_wys0.ay_clock = "0"; - defparam mac_fp_wys0.az_clock = "0"; - defparam mac_fp_wys0.output_clock = "0"; - defparam mac_fp_wys0.accumulate_clock = "none"; - defparam mac_fp_wys0.ax_chainin_pl_clock = "0"; - defparam mac_fp_wys0.accum_pipeline_clock = "none"; - defparam mac_fp_wys0.mult_pipeline_clock = "0"; - defparam mac_fp_wys0.adder_input_clock = "0"; - defparam mac_fp_wys0.accum_adder_clock = "none"; - - twentynm_fp_mac mac_fp_wys1 ( - // inputs - .accumulate(), - .chainin_overflow(), - .chainin_invalid(), - .chainin_underflow(), - .chainin_inexact(), - .ax(32'h0), - .ay(result_st0[i]), - .az(), - .clk({2'b00,clk}), - .ena({2'b11,enable1}), - .aclr(2'b00), - .chainin(), - // outputs - .overflow(), - .invalid(), - .underflow(), - .inexact(), - .chainout_overflow(), - .chainout_invalid(), - .chainout_underflow(), - .chainout_inexact(), - .resulta(result_st1[i]), - .chainout() - ); - defparam mac_fp_wys1.operation_mode = "sp_add"; - defparam mac_fp_wys1.use_chainin = "false"; - defparam mac_fp_wys1.adder_subtract = "true"; - defparam mac_fp_wys1.ax_clock = "0"; - defparam mac_fp_wys1.ay_clock = "0"; - defparam mac_fp_wys1.az_clock = "none"; - defparam mac_fp_wys1.output_clock = "0"; - defparam mac_fp_wys1.accumulate_clock = "none"; - defparam mac_fp_wys1.ax_chainin_pl_clock = "none"; - defparam mac_fp_wys1.accum_pipeline_clock = "none"; - defparam mac_fp_wys1.mult_pipeline_clock = "none"; - defparam mac_fp_wys1.adder_input_clock = "0"; - defparam mac_fp_wys1.accum_adder_clock = "none"; - end - - VX_shift_register #( - .DATAW(TAGW + 1 + 1), - .DEPTH(1) - ) shift_reg0 ( - .clk(clk), - .reset(reset), - .enable(enable0), - .in ({tag_in, (valid_in && ~negate), (valid_in && negate)}), - .out({out_tag_st0, out_valid_st0, in_valid_st0}) - ); - - VX_shift_register #( - .DATAW(TAGW + 1), - .DEPTH(1) - ) shift_reg1 ( - .clk(clk), - .reset(reset), - .enable(enable1), - .in({out_tag_st0, in_valid_st0}), - .out({out_tag_st1, out_valid_st1}) - ); - - wire out_stall = 
~ready_out && valid_out; - assign enable0 = ~out_stall; - assign enable1 = ~out_stall && ~(out_valid_st0 && out_valid_st1); // stall the negate stage if dual outputs - - assign result = out_valid_st0 ? result_st0 : result_st1; - assign tag_out = out_valid_st0 ? out_tag_st0 : out_tag_st1; - assign valid_out = out_valid_st0 || out_valid_st1; - -endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_msub.v b/hw/rtl/fp_cores/altera/VX_fp_msub.v deleted file mode 100644 index 85da1bce..00000000 --- a/hw/rtl/fp_cores/altera/VX_fp_msub.v +++ /dev/null @@ -1,146 +0,0 @@ -`include "VX_define.vh" - -module VX_fp_msub #( - parameter TAGW = 1, - parameter LANES = 1 -) ( - input wire clk, - input wire reset, - - output wire ready_in, - input wire valid_in, - - input wire [TAGW-1:0] tag_in, - - input wire [LANES-1:0][31:0] dataa, - input wire [LANES-1:0][31:0] datab, - input wire [LANES-1:0][31:0] datac, - output wire [LANES-1:0][31:0] result, - - input wire negate, - - output wire [TAGW-1:0] tag_out, - - input wire ready_out, - output wire valid_out -); - wire enable0, enable1; - assign ready_in = enable0 && enable1; - - wire [LANES-1:0][31:0] result_st0, result_st1; - wire [TAGW-1:0] out_tag_st0, out_tag_st1; - wire in_valid_st0, out_valid_st0, out_valid_st1; - - for (genvar i = 0; i < LANES; i++) begin - twentynm_fp_mac mac_fp_wys0 ( - // inputs - .accumulate(), - .chainin_overflow(), - .chainin_invalid(), - .chainin_underflow(), - .chainin_inexact(), - .ax(datac[i]), - .ay(datab[i]), - .az(dataa[i]), - .clk({2'b00,clk}), - .ena({2'b11,enable0}), - .aclr(2'b00), - .chainin(), - // outputs - .overflow(), - .invalid(), - .underflow(), - .inexact(), - .chainout_overflow(), - .chainout_invalid(), - .chainout_underflow(), - .chainout_inexact(), - .resulta(result_st0[i]), - .chainout() - ); - defparam mac_fp_wys0.operation_mode = "sp_mult_add"; - defparam mac_fp_wys0.use_chainin = "false"; - defparam mac_fp_wys0.adder_subtract = "true"; - defparam mac_fp_wys0.ax_clock = "0"; - 
defparam mac_fp_wys0.ay_clock = "0"; - defparam mac_fp_wys0.az_clock = "0"; - defparam mac_fp_wys0.output_clock = "0"; - defparam mac_fp_wys0.accumulate_clock = "none"; - defparam mac_fp_wys0.ax_chainin_pl_clock = "0"; - defparam mac_fp_wys0.accum_pipeline_clock = "none"; - defparam mac_fp_wys0.mult_pipeline_clock = "0"; - defparam mac_fp_wys0.adder_input_clock = "0"; - defparam mac_fp_wys0.accum_adder_clock = "none"; - - twentynm_fp_mac mac_fp_wys1 ( - // inputs - .accumulate(), - .chainin_overflow(), - .chainin_invalid(), - .chainin_underflow(), - .chainin_inexact(), - .ax(32'h0), - .ay(result_st0[i]), - .az(), - .clk({2'b00,clk}), - .ena({2'b11,enable1}), - .aclr(2'b00), - .chainin(), - // outputs - .overflow(), - .invalid(), - .underflow(), - .inexact(), - .chainout_overflow(), - .chainout_invalid(), - .chainout_underflow(), - .chainout_inexact(), - .resulta(result_st1[i]), - .chainout() - ); - defparam mac_fp_wys1.operation_mode = "sp_add"; - defparam mac_fp_wys1.use_chainin = "false"; - defparam mac_fp_wys1.adder_subtract = "true"; - defparam mac_fp_wys1.ax_clock = "0"; - defparam mac_fp_wys1.ay_clock = "0"; - defparam mac_fp_wys1.az_clock = "none"; - defparam mac_fp_wys1.output_clock = "0"; - defparam mac_fp_wys1.accumulate_clock = "none"; - defparam mac_fp_wys1.ax_chainin_pl_clock = "none"; - defparam mac_fp_wys1.accum_pipeline_clock = "none"; - defparam mac_fp_wys1.mult_pipeline_clock = "none"; - defparam mac_fp_wys1.adder_input_clock = "0"; - defparam mac_fp_wys1.accum_adder_clock = "none"; - end - - VX_shift_register #( - .DATAW(TAGW + 1 + 1), - .DEPTH(1) - ) shift_reg0 ( - .clk(clk), - .reset(reset), - .enable(enable0), - .in ({tag_in, (valid_in && ~negate), (valid_in && negate)}), - .out({out_tag_st0, out_valid_st0, in_valid_st0}) - ); - - VX_shift_register #( - .DATAW(TAGW + 1), - .DEPTH(1) - ) shift_reg1 ( - .clk(clk), - .reset(reset), - .enable(enable1), - .in({out_tag_st0, in_valid_st0}), - .out({out_tag_st1, out_valid_st1}) - ); - - wire out_stall 
= ~ready_out && valid_out; - assign enable0 = ~out_stall; - assign enable1 = ~out_stall && ~(out_valid_st0 && out_valid_st1); // stall the negate stage if dual outputs - - assign result = out_valid_st0 ? result_st0 : result_st1; - assign tag_out = out_valid_st0 ? out_tag_st0 : out_tag_st1; - assign valid_out = out_valid_st0 || out_valid_st1; - -endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/altera/VX_fp_mul.v b/hw/rtl/fp_cores/altera/VX_fp_mul.v deleted file mode 100644 index e2d00457..00000000 --- a/hw/rtl/fp_cores/altera/VX_fp_mul.v +++ /dev/null @@ -1,81 +0,0 @@ -`include "VX_define.vh" - -module VX_fp_mul #( - parameter TAGW = 1, - parameter LANES = 1 -) ( - input wire clk, - input wire reset, - - output wire ready_in, - input wire valid_in, - - input wire [TAGW-1:0] tag_in, - - input wire [LANES-1:0][31:0] dataa, - input wire [LANES-1:0][31:0] datab, - output wire [LANES-1:0][31:0] result, - - output wire [TAGW-1:0] tag_out, - - input wire ready_out, - output wire valid_out -); - wire stall = ~ready_out && valid_out; - wire enable = ~stall; - assign ready_in = enable; - - for (genvar i = 0; i < LANES; i++) begin - twentynm_fp_mac mac_fp_wys ( - // inputs - .accumulate(), - .chainin_overflow(), - .chainin_invalid(), - .chainin_underflow(), - .chainin_inexact(), - .ax(), - .ay(datab[i]), - .az(dataa[i]), - .clk({2'b00,clk}), - .ena({2'b11,enable}), - .aclr(2'b00), - .chainin(), - // outputs - .overflow(), - .invalid(), - .underflow(), - .inexact(), - .chainout_overflow(), - .chainout_invalid(), - .chainout_underflow(), - .chainout_inexact(), - .resulta(result[i]), - .chainout() - ); - defparam mac_fp_wys.operation_mode = "sp_mult"; - defparam mac_fp_wys.use_chainin = "false"; - defparam mac_fp_wys.adder_subtract = "false"; - defparam mac_fp_wys.ax_clock = "none"; - defparam mac_fp_wys.ay_clock = "0"; - defparam mac_fp_wys.az_clock = "0"; - defparam mac_fp_wys.output_clock = "0"; - defparam mac_fp_wys.accumulate_clock = "none"; - defparam 
mac_fp_wys.ax_chainin_pl_clock = "none"; - defparam mac_fp_wys.accum_pipeline_clock = "none"; - defparam mac_fp_wys.mult_pipeline_clock = "0"; - defparam mac_fp_wys.adder_input_clock = "none"; - defparam mac_fp_wys.accum_adder_clock = "none"; - end - - VX_shift_register #( - .DATAW(TAGW + 1), - .DEPTH(1) - ) shift_reg ( - .clk(clk), - .reset(reset), - .enable(enable), - .in ({tag_in, valid_in}), - .out({tag_out, valid_out}) - ); - -endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/altera/VX_fp_sub.v b/hw/rtl/fp_cores/altera/VX_fp_sub.v deleted file mode 100644 index f1c8ed26..00000000 --- a/hw/rtl/fp_cores/altera/VX_fp_sub.v +++ /dev/null @@ -1,81 +0,0 @@ -`include "VX_define.vh" - -module VX_fp_sub #( - parameter TAGW = 1, - parameter LANES = 1 -) ( - input wire clk, - input wire reset, - - output wire ready_in, - input wire valid_in, - - input wire [TAGW-1:0] tag_in, - - input wire [LANES-1:0][31:0] dataa, - input wire [LANES-1:0][31:0] datab, - output wire [LANES-1:0][31:0] result, - - output wire [TAGW-1:0] tag_out, - - input wire ready_out, - output wire valid_out -); - wire stall = ~ready_out && valid_out; - wire enable = ~stall; - assign ready_in = enable; - - for (genvar i = 0; i < LANES; i++) begin - twentynm_fp_mac mac_fp_wys ( - // inputs - .accumulate(), - .chainin_overflow(), - .chainin_invalid(), - .chainin_underflow(), - .chainin_inexact(), - .ax(dataa[i]), - .ay(datab[i]), - .az(), - .clk({2'b00,clk}), - .ena({2'b11,enable}), - .aclr(2'b00), - .chainin(), - // outputs - .overflow(), - .invalid(), - .underflow(), - .inexact(), - .chainout_overflow(), - .chainout_invalid(), - .chainout_underflow(), - .chainout_inexact(), - .resulta(result[i]), - .chainout() - ); - defparam mac_fp_wys.operation_mode = "sp_add"; - defparam mac_fp_wys.use_chainin = "false"; - defparam mac_fp_wys.adder_subtract = "true"; - defparam mac_fp_wys.ax_clock = "0"; - defparam mac_fp_wys.ay_clock = "0"; - defparam mac_fp_wys.az_clock = "none"; - defparam 
mac_fp_wys.output_clock = "0"; - defparam mac_fp_wys.accumulate_clock = "none"; - defparam mac_fp_wys.ax_chainin_pl_clock = "none"; - defparam mac_fp_wys.accum_pipeline_clock = "none"; - defparam mac_fp_wys.mult_pipeline_clock = "none"; - defparam mac_fp_wys.adder_input_clock = "0"; - defparam mac_fp_wys.accum_adder_clock = "none"; - end - - VX_shift_register #( - .DATAW(TAGW + 1), - .DEPTH(1) - ) shift_reg ( - .clk(clk), - .reset(reset), - .enable(enable), - .in ({tag_in, valid_in}), - .out({tag_out, valid_out}) - ); - -endmodule diff --git a/hw/rtl/fp_cores/altera/VX_fp_utof.v b/hw/rtl/fp_cores/altera/VX_fp_utof.v deleted file mode 100644 index 935a44fb..00000000 --- a/hw/rtl/fp_cores/altera/VX_fp_utof.v +++ /dev/null @@ -1,48 +0,0 @@ -`include "VX_define.vh" - -module VX_fp_utof #( - parameter TAGW = 1, - parameter LANES = 1 -) ( - input wire clk, - input wire reset, - - output wire ready_in, - input wire valid_in, - - input wire [TAGW-1:0] tag_in, - - input wire [LANES-1:0][31:0] dataa, - output wire [LANES-1:0][31:0] result, - - output wire [TAGW-1:0] tag_out, - - input wire ready_out, - output wire valid_out -); - wire stall = ~ready_out && valid_out; - wire enable = ~stall; - assign ready_in = enable; - - for (genvar i = 0; i < LANES; i++) begin - acl_fp_utof utof ( - .clk (clk), - .areset (1'b0), - .en (enable), - .a (dataa[i]), - .q (result[i]) - ); - end - - VX_shift_register #( - .DATAW(TAGW + 1), - .DEPTH(`LATENCY_UTOF) - ) shift_reg ( - .clk(clk), - .reset(reset), - .enable(enable), - .in ({tag_in, valid_in}), - .out({tag_out, valid_out}) - ); - -endmodule diff --git a/hw/rtl/fp_cores/altera/dspba_delay_ver.sv b/hw/rtl/fp_cores/altera/dspba_delay_ver.sv new file mode 100644 index 00000000..526de10a --- /dev/null +++ b/hw/rtl/fp_cores/altera/dspba_delay_ver.sv @@ -0,0 +1,95 @@ +// Legal Notice: Copyright 2017 Intel Corporation. All rights reserved. 
+// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing device programming or simulation files), and +// any associated documentation or information are expressly subject to the +// terms and conditions of the Intel FPGA Software License Agreement, +// Intel MegaCore Function License Agreement, or other applicable license +// agreement, including, without limitation, that your use is for the sole +// purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +module dspba_delay_ver +#( + parameter width = 8, + parameter depth = 1, + parameter reset_high = 1'b1, + parameter reset_kind = "ASYNC" +) ( + input clk, + input aclr, + input ena, + input [width-1:0] xin, + output [width-1:0] xout +); + + wire reset; + reg [width-1:0] delays [depth-1:0]; + + assign reset = aclr ^ reset_high; + + generate + if (depth > 0) + begin + genvar i; + for (i = 0; i < depth; ++i) + begin : delay_block + if (reset_kind == "ASYNC") + begin : sync_reset + always @ (posedge clk or negedge reset) + begin: a + if (!reset) begin + delays[i] <= 0; + end else begin + if (ena) begin + if (i > 0) begin + delays[i] <= delays[i - 1]; + end else begin + delays[i] <= xin; + end + end + end + end + end + + if (reset_kind == "SYNC") + begin : async_reset + always @ (posedge clk) + begin: a + if (!reset) begin + delays[i] <= 0; + end else begin + if (ena) begin + if (i > 0) begin + delays[i] <= delays[i - 1]; + end else begin + delays[i] <= xin; + end + end + end + end + end + + if (reset_kind == "NONE") + begin : no_reset + always @ (posedge clk) + begin: a + if (ena) begin + if (i > 0) begin + delays[i] <= delays[i - 1]; + end else begin + delays[i] <= xin; + end + end + end + end + end + + assign xout = delays[depth - 1]; + end else begin + assign xout = 
xin; + end + endgenerate + +endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/altera/dspba_library_ver.sv b/hw/rtl/fp_cores/altera/dspba_library_ver.sv deleted file mode 100644 index 16367ad0..00000000 --- a/hw/rtl/fp_cores/altera/dspba_library_ver.sv +++ /dev/null @@ -1,392 +0,0 @@ -// Legal Notice: Copyright 2017 Intel Corporation. All rights reserved. -// Your use of Intel Corporation's design tools, logic functions and other -// software and tools, and its AMPP partner logic functions, and any output -// files any of the foregoing device programming or simulation files), and -// any associated documentation or information are expressly subject to the -// terms and conditions of the Intel FPGA Software License Agreement, -// Intel MegaCore Function License Agreement, or other applicable license -// agreement, including, without limitation, that your use is for the sole -// purpose of programming logic devices manufactured by Intel and sold by -// Intel or its authorized distributors. Please refer to the applicable -// agreement for further details. 
- - -module dspba_delay_ver -#( - parameter width = 8, - parameter depth = 1, - parameter reset_high = 1'b1, - parameter reset_kind = "ASYNC" -) ( - input clk, - input aclr, - input ena, - input [width-1:0] xin, - output [width-1:0] xout -); - - wire reset; - reg [width-1:0] delays [depth-1:0]; - - assign reset = aclr ^ reset_high; - - generate - if (depth > 0) - begin - genvar i; - for (i = 0; i < depth; ++i) - begin : delay_block - if (reset_kind == "ASYNC") - begin : sync_reset - always @ (posedge clk or negedge reset) - begin: a - if (!reset) begin - delays[i] <= 0; - end else begin - if (ena) begin - if (i > 0) begin - delays[i] <= delays[i - 1]; - end else begin - delays[i] <= xin; - end - end - end - end - end - - if (reset_kind == "SYNC") - begin : async_reset - always @ (posedge clk) - begin: a - if (!reset) begin - delays[i] <= 0; - end else begin - if (ena) begin - if (i > 0) begin - delays[i] <= delays[i - 1]; - end else begin - delays[i] <= xin; - end - end - end - end - end - - if (reset_kind == "NONE") - begin : no_reset - always @ (posedge clk) - begin: a - if (ena) begin - if (i > 0) begin - delays[i] <= delays[i - 1]; - end else begin - delays[i] <= xin; - end - end - end - end - end - - assign xout = delays[depth - 1]; - end else begin - assign xout = xin; - end - endgenerate - -endmodule - -//------------------------------------------------------------------------------ - -module dspba_sync_reg_ver -#( - parameter width1 = 8, - parameter width2 = 8, - parameter depth = 2, - parameter pulse_multiplier = 1, - parameter counter_width = 8, - parameter init_value = 0, - parameter reset1_high = 1'b1, - parameter reset2_high = 1'b1, - parameter reset_kind = "ASYNC" -) ( - input clk1, - input aclr1, - input [0 : 0] ena, - input [width1-1 : 0] xin, - output [width1-1 : 0] xout, - input clk2, - input aclr2, - output [width2-1 : 0] sxout -); -wire [width1-1 : 0] init_value_internal; - -wire reset1; -wire reset2; - -reg iclk_enable; -reg [width1-1 : 0] 
iclk_data; -reg [width2-1 : 0] oclk_data; - -// For Synthesis this means: preserve this registers and do not merge any other flip-flops with synchronizer flip-flops -// For TimeQuest this means: identify these flip-flops as synchronizer to enable automatic MTBF analysis -(* altera_attribute = {"-name ADV_NETLIST_OPT_ALLOWED NEVER_ALLOW; -name SYNCHRONIZER_IDENTIFICATION FORCED; -name DONT_MERGE_REGISTER ON; -name PRESERVE_REGISTER ON"} *) reg [depth-1 : 0] sync_regs; - -wire oclk_enable; - -wire ena_internal; -reg [counter_width-1 : 0] counter; - -assign init_value_internal = init_value; - -assign reset1 = aclr1 ^ reset1_high; -assign reset2 = aclr2 ^ reset2_high; - -generate - if (pulse_multiplier == 1) - begin: no_multiplication - assign ena_internal = ena[0]; - end -endgenerate - -generate - if (pulse_multiplier > 1) - begin: multiplu_ena_pulse - if (reset_kind == "ASYNC") - begin: async_reset - always @ (posedge clk1 or negedge reset1) - begin - if (reset1 == 1'b0) begin - counter <= 0; - end else begin - if (counter > 0) begin - if (counter == pulse_multiplier - 1) begin - counter <= 0; - end else begin - counter <= counter + 2'd1; - end - end else begin - if (ena[0] == 1'b1) begin - counter <= 1; - end - end - end - end - end - if (reset_kind == "SYNC") - begin: sync_reset - always @ (posedge clk1) - begin - if (reset1 == 1'b0) begin - counter <= 0; - end else begin - if (counter > 0) begin - if (counter == pulse_multiplier - 1) begin - counter <= 0; - end else begin - counter <= counter + 2'd1; - end - end else begin - if (ena[0] == 1'b1) begin - counter <= 1; - end - end - end - end - end - if (reset_kind == "NONE") - begin: no_reset - always @ (posedge clk1) - begin - if (counter > 0) begin - if (counter == pulse_multiplier - 1) begin - counter <= 0; - end else begin - counter <= counter + 2'd1; - end - end else begin - if (ena[0] == 1'b1) begin - counter <= 1; - end - end - end - end - - assign ena_internal = counter > 0 ? 
1'b1 : ena[0]; - end -endgenerate - -assign oclk_enable = sync_regs[depth - 1]; - -generate - if (reset_kind == "ASYNC") - begin: iclk_async_reset - always @ (posedge clk1 or negedge reset1) - begin - if (reset1 == 1'b0) begin - iclk_data <= init_value_internal; - iclk_enable <= 1'b0; - end else begin - iclk_enable <= ena_internal; - if (ena[0] == 1'b1) begin - iclk_data <= xin; - end - end - end - end - if (reset_kind == "SYNC") - begin: iclk_sync_reset - always @ (posedge clk1) - begin - if (reset1 == 1'b0) begin - iclk_data <= init_value_internal; - iclk_enable <= 1'b0; - end else begin - iclk_enable <= ena_internal; - if (ena[0] == 1'b1) begin - iclk_data <= xin; - end - end - end - end - if (reset_kind == "NONE") - begin: iclk_no_reset - always @ (posedge clk1) - begin - iclk_enable <= ena_internal; - if (ena[0] == 1'b1) begin - iclk_data <= xin; - end - end - end -endgenerate - -generate - genvar i; - for (i = 0; i < depth; ++i) - begin: sync_regs_block - if (reset_kind == "ASYNC") - begin: sync_reg_async_reset - always @ (posedge clk2 or negedge reset2) begin - if (reset2 == 1'b0) begin - sync_regs[i] <= 1'b0; - end else begin - if (i > 0) begin - sync_regs[i] <= sync_regs[i - 1]; - end else begin - sync_regs[i] <= iclk_enable; - end - end - end - end - if (reset_kind == "SYNC") - begin: sync_reg_sync_reset - always @ (posedge clk2) begin - if (reset2 == 1'b0) begin - sync_regs[i] <= 1'b0; - end else begin - if (i > 0) begin - sync_regs[i] <= sync_regs[i - 1]; - end else begin - sync_regs[i] <= iclk_enable; - end - end - end - end - if (reset_kind == "NONE") - begin: sync_reg_no_reset - always @ (posedge clk2) begin - if (i > 0) begin - sync_regs[i] <= sync_regs[i - 1]; - end else begin - sync_regs[i] <= iclk_enable; - end - end - end - end -endgenerate - -generate - if (reset_kind == "ASYNC") - begin: oclk_async_reset - always @ (posedge clk2 or negedge reset2) - begin - if (reset2 == 1'b0) begin - oclk_data <= init_value_internal[width2-1 : 0]; - end else 
begin - if (oclk_enable == 1'b1) begin - oclk_data <= iclk_data[width2-1 : 0]; - end - end - end - end - if (reset_kind == "SYNC") - begin: oclk_sync_reset - always @ (posedge clk2) - begin - if (reset2 == 1'b0) begin - oclk_data <= init_value_internal[width2-1 : 0]; - end else begin - if (oclk_enable == 1'b1) begin - oclk_data <= iclk_data[width2-1 : 0]; - end - end - end - end - if (reset_kind == "NONE") - begin: oclk_no_reset - always @ (posedge clk2) - begin - if (oclk_enable == 1'b1) begin - oclk_data <= iclk_data[width2-1 : 0]; - end - end - end -endgenerate - -assign xout = iclk_data; -assign sxout = oclk_data; - -endmodule - -//------------------------------------------------------------------------------ - -module dspba_pipe -#( - parameter num_bits = 8, - parameter num_stages = 0, - parameter init_value = 1'bx -) ( - input clk, - input [num_bits-1:0] d, - output [num_bits-1:0] q -); - logic [num_bits-1:0] init_stage = { num_bits { init_value } }; - - generate - if (num_stages > 0) - begin - reg [num_bits-1:0] stage_array[num_stages-1:0]; - - genvar i; - for (i = 0; i < num_stages; ++i) - begin : g_pipe - always @ (posedge clk) begin - if (i>0) begin - stage_array[i] <= stage_array[i-1]; - end else begin - stage_array[i] <= d; - end - end - end - initial begin - stage_array = '{ num_stages { init_stage } }; - end - - assign q = stage_array[num_stages-1]; - - end else begin - assign q = d; - end - endgenerate - -endmodule
diff --git a/hw/rtl/fp_cores/svdpi/float_dpi.cpp b/hw/rtl/fp_cores/svdpi/float_dpi.cpp
new file mode 100644
index 00000000..7e9c8709
--- /dev/null
+++ b/hw/rtl/fp_cores/svdpi/float_dpi.cpp
@@ -0,0 +1,191 @@
+#include <cmath>
+#include <cstring>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+#include "svdpi.h"
+#include "VX_config.h"
+
+// DPI-C entry points imported by float_dpi.vh. Every function computes its
+// single-precision result on the host and returns it through a delay line
+// looked up by the calling SystemVerilog scope.
+extern "C" {
+    void dpi_fadd(bool clk, bool enable, int a, int b, int* result);
+    void dpi_fsub(bool clk, bool enable, int a, int b, int* result);
+    void dpi_fmul(bool clk, bool enable, int a, int b, int* result);
+    void dpi_fmadd(bool clk, bool enable, int a, int b, int c, int* result);
+    void dpi_fmsub(bool clk, bool enable, int a, int b, int c, int* result);
+    void dpi_fdiv(bool clk, bool enable, int a, int b, int* result);
+    void dpi_fsqrt(bool clk, bool enable, int a, int* result);
+    void dpi_ftoi(bool clk, bool enable, int a, int* result);
+    void dpi_ftou(bool clk, bool enable, int a, int* result);
+    void dpi_itof(bool clk, bool enable, int a, int* result);
+    void dpi_utof(bool clk, bool enable, int a, int* result);
+}
+
+static_assert(sizeof(int) == sizeof(float), "bit casts assume int and float have the same size");
+
+// Reinterprets the bits of an int as a float. memcpy replaces the pointer
+// cast because reading an int object through a float* breaks strict aliasing.
+static inline float bits_to_float(int bits) {
+    float value;
+    std::memcpy(&value, &bits, sizeof(value));
+    return value;
+}
+
+// Inverse of bits_to_float().
+static inline int float_to_bits(float value) {
+    int bits;
+    std::memcpy(&bits, &value, sizeof(bits));
+    return bits;
+}
+
+// Truncating float -> signed conversion. A plain cast is undefined for NaN
+// and out-of-range inputs, so those are saturated explicitly.
+// NOTE(review): saturation values follow the RISC-V FCVT.W.S convention
+// (NaN and positive overflow -> max, negative overflow -> min); confirm
+// against the RTL this models.
+static inline int float_to_int_sat(float value) {
+    if (std::isnan(value) || value >= 2147483648.0f)
+        return 0x7fffffff;
+    if (value < -2147483648.0f)
+        return -0x7fffffff - 1;
+    return int(value);
+}
+
+// Truncating float -> unsigned conversion with the same treatment of NaN and
+// out-of-range inputs; values at or below -1 clamp to 0.
+// NOTE(review): confirm the clamp values against the RTL this models.
+static inline unsigned float_to_uint_sat(float value) {
+    if (std::isnan(value) || value >= 4294967296.0f)
+        return 0xffffffffu;
+    if (value <= -1.0f)
+        return 0u;
+    return unsigned(value);
+}
+
+// Fixed-depth delay line of int values. New values enter at the back and
+// top() reads the front, so a value becomes visible depth-1 accepted pushes
+// after it was pushed (in the same call when depth is 1).
+class ShiftRegister {
+public:
+    // Initializers are listed in member declaration order.
+    ShiftRegister() : depth_(0), init_(false) {}
+
+    // Sizes the delay line on the first call; later calls are ignored. A
+    // non-positive depth is clamped to one stage so that push() and top()
+    // always have an element to work on.
+    void ensure_init(int depth) {
+        if (init_)
+            return;
+        depth_ = (depth > 0) ? unsigned(depth) : 1u;
+        buffer_.assign(depth_, 0);
+        init_ = true;
+    }
+
+    // Shifts every stage one slot toward the front and stores value at the
+    // back. Does nothing while clk is high, enable is low, or before
+    // ensure_init() has run.
+    void push(int value, bool clk, bool enable) {
+        if (clk || !enable || buffer_.empty())
+            return;
+        // "i + 1 < depth_" cannot wrap around the way "depth_ - 1" does at 0.
+        for (unsigned i = 0; i + 1 < depth_; ++i) {
+            buffer_[i] = buffer_[i + 1];
+        }
+        buffer_[depth_ - 1] = value;
+    }
+
+    // Oldest stored value, or 0 if ensure_init() has not run yet.
+    int top() const {
+        return buffer_.empty() ? 0 : buffer_[0];
+    }
+
+private:
+    std::vector<int> buffer_;
+    unsigned depth_;
+    bool init_;
+};
+
+// Maps each SystemVerilog scope handle to its own ShiftRegister.
+// NOTE(review): the key is the scope alone, so different dpi_* functions
+// called from one scope would share a delay line sized by the first call.
+class Instances {
+public:
+    // Returns the register for scope, creating it on first use. Only the map
+    // access is serialized; the returned register itself is not locked.
+    ShiftRegister& get(svScope scope) {
+        std::lock_guard<std::mutex> guard(mutex_);
+        return instances_[scope];
+    }
+
+private:
+    std::unordered_map<svScope, ShiftRegister> instances_;
+    std::mutex mutex_;
+};
+
+Instances instances;
+
+// Feeds value into the calling scope's delay line and writes the line's
+// current output to *result.
+static void delay_result(int latency, int value, bool clk, bool enable, int* result) {
+    ShiftRegister& inst = instances.get(svGetScope());
+    inst.ensure_init(latency);
+    inst.push(value, clk, enable);
+    *result = inst.top();
+}
+
+void dpi_fadd(bool clk, bool enable, int a, int b, int* result) {
+    float fr = bits_to_float(a) + bits_to_float(b);
+    delay_result(LATENCY_FMADD, float_to_bits(fr), clk, enable, result);
+}
+
+void dpi_fsub(bool clk, bool enable, int a, int b, int* result) {
+    float fr = bits_to_float(a) - bits_to_float(b);
+    delay_result(LATENCY_FMADD, float_to_bits(fr), clk, enable, result);
+}
+
+void dpi_fmul(bool clk, bool enable, int a, int b, int* result) {
+    float fr = bits_to_float(a) * bits_to_float(b);
+    delay_result(LATENCY_FMADD, float_to_bits(fr), clk, enable, result);
+}
+
+void dpi_fmadd(bool clk, bool enable, int a, int b, int c, int* result) {
+    float fr = bits_to_float(a) * bits_to_float(b) + bits_to_float(c);
+    delay_result(LATENCY_FMADD, float_to_bits(fr), clk, enable, result);
+}
+
+void dpi_fmsub(bool clk, bool enable, int a, int b, int c, int* result) {
+    float fr = bits_to_float(a) * bits_to_float(b) - bits_to_float(c);
+    delay_result(LATENCY_FMADD, float_to_bits(fr), clk, enable, result);
+}
+
+void dpi_fdiv(bool clk, bool enable, int a, int b, int* result) {
+    float fr = bits_to_float(a) / bits_to_float(b);
+    delay_result(LATENCY_FDIV, float_to_bits(fr), clk, enable, result);
+}
+
+void dpi_fsqrt(bool clk, bool enable, int a, int* result) {
+    float fr = std::sqrt(bits_to_float(a));
+    delay_result(LATENCY_FSQRT, float_to_bits(fr), clk, enable, result);
+}
+
+void dpi_ftoi(bool clk, bool enable, int a, int* result) {
+    int ir = float_to_int_sat(bits_to_float(a));
+    delay_result(LATENCY_FTOI, ir, clk, enable, result);
+}
+
+void dpi_ftou(bool clk, bool enable, int a, int* result) {
+    unsigned ur = float_to_uint_sat(bits_to_float(a));
+    delay_result(LATENCY_FTOI, static_cast<int>(ur), clk, enable, result);
+}
+
+void dpi_itof(bool clk, bool enable, int a, int* result) {
+    float fr = float(a);
+    delay_result(LATENCY_ITOF, float_to_bits(fr), clk, enable, result);
+}
+
+void dpi_utof(bool clk, bool enable, int a, int* result) {
+    float fr = float(static_cast<unsigned>(a));
+    delay_result(LATENCY_ITOF, float_to_bits(fr), clk, enable, result);
+}
\ No newline at end of file
diff --git a/hw/rtl/fp_cores/svdpi/float_dpi.vh b/hw/rtl/fp_cores/svdpi/float_dpi.vh new file mode 100644 index 00000000..fff1ccf8 --- /dev/null +++ b/hw/rtl/fp_cores/svdpi/float_dpi.vh @@ -0,0 +1,16 @@ +`ifndef FLOAT_DPI +`define FLOAT_DPI + +import "DPI-C" context function void dpi_fadd(input logic clk, input logic enable, input int a, input int b, output int result); +import "DPI-C" context function void dpi_fsub(input logic clk, input logic enable, input int a, input int b, output int result); +import "DPI-C" context function void
dpi_fmul(input logic clk, input logic enable, input int a, input int b, output int result); +import "DPI-C" context function void dpi_fmadd(input logic clk, input logic enable, input int a, input int b, input int c, output int result); +import "DPI-C" context function void dpi_fmsub(input logic clk, input logic enable, input int a, input int b, input int c, output int result); +import "DPI-C" context function void dpi_fdiv(input logic clk, input logic enable, input int a, input int b, output int result); +import "DPI-C" context function void dpi_fsqrt(input logic clk, input logic enable, input int a, output int result); +import "DPI-C" context function void dpi_ftoi(input logic clk, input logic enable, input int a, output int result); +import "DPI-C" context function void dpi_ftou(input logic clk, input logic enable, input int a, output int result); +import "DPI-C" context function void dpi_itof(input logic clk, input logic enable, input int a, output int result); +import "DPI-C" context function void dpi_utof(input logic clk, input logic enable, input int a, output int result); + +`endif \ No newline at end of file diff --git a/hw/scripts/gen_config.py b/hw/scripts/gen_config.py index 0f80b093..05ecb8bb 100755 --- a/hw/scripts/gen_config.py +++ b/hw/scripts/gen_config.py @@ -94,6 +94,7 @@ if args.outc != 'none': // Translated from VX_config.vh: '''[1:].format(date=datetime.now()), file=f) with open(path.join(script_dir, '../rtl/VX_config.vh'), 'r') as r: + lineno = 0 for line in r: if in_expansion: f.write(post_process_line(line)) @@ -107,7 +108,8 @@ if args.outc != 'none': f.write(post_process_line(pat.sub(repl, line))) break else: - raise ValueError('failed to find rule for: ' + line) + raise ValueError('failed to find rule for: "' + line + '" (' + str(lineno) + ')') + lineno = lineno + 1 print(''' // Misc diff --git a/hw/simulate/Makefile b/hw/simulate/Makefile index ea6d87a1..1a31fd2d 100644 --- a/hw/simulate/Makefile +++ b/hw/simulate/Makefile @@ -17,10 
+17,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO -FPU_INCLUDE = -I../rtl/fp_cores/fpnew/src/common_cells/include -I../rtl/fp_cores/fpnew/src/common_cells/src -I../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../rtl/fp_cores/fpnew/src -INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/fp_cores -I../rtl/simulate $(FPU_INCLUDE) +FPU_INCLUDE = -I../rtl/fp_cores -I../rtl/fp_cores/svdpi -I../rtl/fp_cores/fpnew/src/common_cells/include -I../rtl/fp_cores/fpnew/src/common_cells/src -I../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../rtl/fp_cores/fpnew/src +INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/simulate $(FPU_INCLUDE) SRCS = simulator.cpp testbench.cpp +SRCS += ../rtl/fp_cores/svdpi/float_dpi.cpp all: build-s