From 7e93d253f250184ca6d5331e4786a2d6bcb3fd8f Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@iam-ssh1.research.intel-research.net>
Date: Sun, 10 Jan 2021 22:03:23 -0800
Subject: [PATCH] minor update

---
 hw/rtl/fp_cores/VX_fp_cvt.v      | 13 +++--
 hw/rtl/fp_cores/VX_fp_div.v      | 22 ++++++++
 hw/rtl/fp_cores/VX_fp_fma.v      | 22 ++++++++
 hw/rtl/fp_cores/VX_fp_ncomp.v    |  6 ++-
 hw/rtl/fp_cores/VX_fp_rounding.v | 39 +++++++-------
 hw/rtl/fp_cores/VX_fp_sqrt.v     | 22 ++++++++
 hw/rtl/libs/VX_lzc.v             | 89 ++++++++++++++++++++++++++------
 7 files changed, 174 insertions(+), 39 deletions(-)

diff --git a/hw/rtl/fp_cores/VX_fp_cvt.v b/hw/rtl/fp_cores/VX_fp_cvt.v
index 43fd6dd3..8d58cca3 100644
--- a/hw/rtl/fp_cores/VX_fp_cvt.v
+++ b/hw/rtl/fp_cores/VX_fp_cvt.v
@@ -1,5 +1,8 @@
 `include "VX_define.vh"
 
+/// Modified port of the cast module from the fpnew library
+/// reference: https://github.com/pulp-platform/fpnew
+
 `ifndef SYNTHESIS
 `include "float_dpi.vh"
 `endif
@@ -91,14 +94,14 @@ module VX_fp_cvt #(
     wire [LANES-1:0] mant_is_zero;                       // for integer zeroes
 
     for (genvar i = 0; i < LANES; ++i) begin
-        // Leading zero counter for cancellations
         wire mant_is_nonzero;
         VX_lzc #(
-            .DATAW (INT_MAN_WIDTH)
+            .WIDTH (INT_MAN_WIDTH),
+            .MODE  (1)
         ) lzc (
-            .data_in   (encoded_mant[i]),
-            .data_out  (renorm_shamt[i]),
-            .valid_out (mant_is_nonzero)
+            .in_i    (encoded_mant[i]),
+            .cnt_o   (renorm_shamt[i]),
+            .valid_o (mant_is_nonzero)
         );
         assign mant_is_zero[i] = ~mant_is_nonzero;
     end
diff --git a/hw/rtl/fp_cores/VX_fp_div.v b/hw/rtl/fp_cores/VX_fp_div.v
index be06b7e2..5d3eaafe 100644
--- a/hw/rtl/fp_cores/VX_fp_div.v
+++ b/hw/rtl/fp_cores/VX_fp_div.v
@@ -38,6 +38,27 @@ module VX_fp_div #(
     );
     
     for (genvar i = 0; i < LANES; i++) begin
+    `ifdef VERILATOR
+        reg [31:0] r;
+        fflags_t f;
+
+        always @(*) begin        
+            dpi_fdiv (dataa[i], datab[i], frm, r, f);
+        end
+        `UNUSED_VAR (f)
+
+        VX_shift_register #(
+            .DATAW  (32),
+            .DEPTH  (`LATENCY_FDIV),
+            .RESETW (1)
+        ) shift_req_dpi (
+            .clk      (clk),
+            .reset    (_reset),
+            .enable   (enable),
+            .data_in  (r),
+            .data_out (result[i])
+        );
+    `else
         acl_fdiv fdiv (
             .clk    (clk),
             .areset (_reset),
@@ -46,6 +67,7 @@ module VX_fp_div #(
             .b      (datab[i]),
             .q      (result[i])
         );
+    `endif
     end
 
     VX_shift_register #(
diff --git a/hw/rtl/fp_cores/VX_fp_fma.v b/hw/rtl/fp_cores/VX_fp_fma.v
index 3cd1b2e5..ce7efb24 100644
--- a/hw/rtl/fp_cores/VX_fp_fma.v
+++ b/hw/rtl/fp_cores/VX_fp_fma.v
@@ -59,6 +59,27 @@ module VX_fp_fma #(
             end    
         end
 
+    `ifdef VERILATOR
+        reg [31:0] r;
+        fflags_t f;
+
+        always @(*) begin        
+            dpi_fmadd (a, b, c, frm, r, f);
+        end
+        `UNUSED_VAR (f)
+
+        VX_shift_register #(
+            .DATAW  (32),
+            .DEPTH  (`LATENCY_FMA),
+            .RESETW (1)
+        ) shift_req_dpi (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (enable),
+            .data_in  (r),
+            .data_out (result[i])
+        );
+    `else
         acl_fmadd fmadd (
             .clk    (clk),
             .areset (reset),
@@ -68,6 +89,7 @@ module VX_fp_fma #(
             .c      (c),
             .q      (result[i])
         );
+    `endif
     end
     
     VX_shift_register #(
diff --git a/hw/rtl/fp_cores/VX_fp_ncomp.v b/hw/rtl/fp_cores/VX_fp_ncomp.v
index 57792d15..7f4406ce 100644
--- a/hw/rtl/fp_cores/VX_fp_ncomp.v
+++ b/hw/rtl/fp_cores/VX_fp_ncomp.v
@@ -1,5 +1,8 @@
 `include "VX_define.vh"
 
+/// Modified port of the noncomp module from the fpnew library
+/// reference: https://github.com/pulp-platform/fpnew
+
 module VX_fp_ncomp #( 
     parameter TAGW = 1,
     parameter LANES = 1
@@ -87,7 +90,8 @@ module VX_fp_ncomp #(
 
     VX_pipe_register #(
         .DATAW  (1 + TAGW + `FPU_BITS + `FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_type_t) + 1 + 1)),
-        .RESETW (1)
+        .RESETW (1),
+        .DEPTH  (0)
     ) pipe_reg0 (
         .clk      (clk),
         .reset    (reset),
diff --git a/hw/rtl/fp_cores/VX_fp_rounding.v b/hw/rtl/fp_cores/VX_fp_rounding.v
index d899c3d1..9e544e44 100644
--- a/hw/rtl/fp_cores/VX_fp_rounding.v
+++ b/hw/rtl/fp_cores/VX_fp_rounding.v
@@ -1,6 +1,9 @@
 
 `include "VX_define.vh"
 
+/// Modified port of the rounding module from the fpnew library
+/// reference: https://github.com/pulp-platform/fpnew
+
 module VX_fp_rounding #(
     parameter DAT_WIDTH = 2 // Width of the abolute value, without sign bit
 ) (
@@ -17,17 +20,17 @@ module VX_fp_rounding #(
     output wire                 exact_zero_o             // output is an exact zero
 );
 
-  reg round_up; // Rounding decision
+    reg round_up; // Rounding decision
 
-  // Take the rounding decision according to RISC-V spec
-  // RoundMode | Mnemonic | Meaning
-  // :--------:|:--------:|:-------
-  //    000    |   RNE    | Round to Nearest, ties to Even
-  //    001    |   RTZ    | Round towards Zero
-  //    010    |   RDN    | Round Down (towards -\infty)
-  //    011    |   RUP    | Round Up (towards \infty)
-  //    100    |   RMM    | Round to Nearest, ties to Max Magnitude
-  //  others   |          | *invalid*
+    // Take the rounding decision according to RISC-V spec
+    // RoundMode | Mnemonic | Meaning
+    // :--------:|:--------:|:-------
+    //    000    |   RNE    | Round to Nearest, ties to Even
+    //    001    |   RTZ    | Round towards Zero
+    //    010    |   RDN    | Round Down (towards -\infty)
+    //    011    |   RUP    | Round Up (towards \infty)
+    //    100    |   RMM    | Round to Nearest, ties to Max Magnitude
+    //  others   |          | *invalid*
 
     always @(*) begin
         case (rnd_mode_i)
@@ -47,15 +50,15 @@ module VX_fp_rounding #(
         endcase
     end
 
-  // Perform the rounding, exponent change and overflow to inf happens automagically
-  assign abs_rounded_o = abs_value_i + DAT_WIDTH'(round_up);
+    // Perform the rounding, exponent change and overflow to inf happens automagically
+    assign abs_rounded_o = abs_value_i + DAT_WIDTH'(round_up);
 
-  // True zero result is a zero result without dirty round/sticky bits
-  assign exact_zero_o = (abs_value_i == 0) && (round_sticky_bits_i == 0);
+    // True zero result is a zero result without dirty round/sticky bits
+    assign exact_zero_o = (abs_value_i == 0) && (round_sticky_bits_i == 0);
 
-  // In case of effective subtraction (thus signs of addition operands must have differed) and a
-  // true zero result, the result sign is '-' in case of RDN and '+' for other modes.
-  assign sign_o = (exact_zero_o && effective_subtraction_i) ? (rnd_mode_i == `FRM_RDN)
-                                                            : sign_i;
+    // In case of effective subtraction (thus signs of addition operands must have differed) and a
+    // true zero result, the result sign is '-' in case of RDN and '+' for other modes.
+    assign sign_o = (exact_zero_o && effective_subtraction_i) ? (rnd_mode_i == `FRM_RDN)
+                                                              : sign_i;
 
 endmodule
\ No newline at end of file
diff --git a/hw/rtl/fp_cores/VX_fp_sqrt.v b/hw/rtl/fp_cores/VX_fp_sqrt.v
index 132319f4..a00a9a37 100644
--- a/hw/rtl/fp_cores/VX_fp_sqrt.v
+++ b/hw/rtl/fp_cores/VX_fp_sqrt.v
@@ -37,6 +37,27 @@ module VX_fp_sqrt #(
     );  
     
     for (genvar i = 0; i < LANES; i++) begin
+    `ifdef VERILATOR
+        reg [31:0] r;
+        fflags_t f;
+
+        always @(*) begin        
+            dpi_fsqrt  (dataa[i], frm, r, f);
+        end
+        `UNUSED_VAR (f)
+
+        VX_shift_register #(
+            .DATAW  (32),
+            .DEPTH  (`LATENCY_FSQRT),
+            .RESETW (1)
+        ) shift_req_dpi (
+            .clk      (clk),
+            .reset    (_reset),
+            .enable   (enable),
+            .data_in  (r),
+            .data_out (result[i])
+        );
+    `else
         acl_fsqrt fsqrt (
             .clk    (clk),
             .areset (_reset),
@@ -44,6 +65,7 @@ module VX_fp_sqrt #(
             .a      (dataa[i]),
             .q      (result[i])
         );
+    `endif
     end
 
     VX_shift_register #(
diff --git a/hw/rtl/libs/VX_lzc.v b/hw/rtl/libs/VX_lzc.v
index 469c587c..0ee0737a 100644
--- a/hw/rtl/libs/VX_lzc.v
+++ b/hw/rtl/libs/VX_lzc.v
@@ -1,27 +1,86 @@
 `include "VX_platform.vh"
 
+/// Modified port of the lzc module from the fpnew library
+/// reference: https://github.com/pulp-platform/fpnew
+/// A trailing zero counter / leading zero counter.
+/// Set MODE to 0 for trailing zero counter => cnt_o is the number of trailing zeros (from the LSB)
+/// Set MODE to 1 for leading zero counter  => cnt_o is the number of leading zeros  (from the MSB)
+/// `valid_o` is asserted when the input has at least one set bit. When the input is all zeros,
+/// `valid_o` is low and `cnt_o` is not a meaningful count. For example, with a 7-bit input:
+///   in_i = 000_0000, valid_o = 0, cnt_o = 6 (mode = 0)
+///   in_i = 000_0001, valid_o = 1, cnt_o = 0 (mode = 0)
+///   in_i = 000_1000, valid_o = 1, cnt_o = 3 (mode = 0)
+/// Note: the upstream description mentions a faster Verilator-only (simulation) implementation;
+/// this port contains only the generic tree-based implementation below.
+
 module VX_lzc #(
-    parameter DATAW  = 32,
-    parameter LDATAW = `LOG2UP(DATAW)
+    /// The width of the input vector.
+    parameter int unsigned WIDTH = 2,
+    parameter bit          MODE  = 1'b0 // 0 -> trailing zero, 1 -> leading zero
 ) (
-    input wire  [DATAW-1:0]  data_in,
-    output wire [LDATAW-1:0] data_out,
-    output wire              valid_out
-); 
+    input  logic [WIDTH-1:0]         in_i,
+    output logic [$clog2(WIDTH)-1:0] cnt_o,
+    output logic                     valid_o
+);
+`IGNORE_WARNINGS_BEGIN
 
-    reg [LDATAW-1:0] data_out_r;
+    localparam int unsigned NUM_LEVELS = $clog2(WIDTH);
 
-    always @(*) begin
-        data_out_r = 'x;
-        for (integer i = DATAW-1; i >= 0; --i) begin
-            if (data_in[i]) begin
-                data_out_r = LDATAW'(DATAW-1-i);
-                break;
+    // pragma translate_off
+    initial begin
+        assert(WIDTH > 0) else $fatal("input must be at least one bit wide");
+    end
+    // pragma translate_on
+
+    logic [WIDTH-1:0][NUM_LEVELS-1:0]          index_lut;
+    logic [2**NUM_LEVELS-1:0]                  sel_nodes;
+    logic [2**NUM_LEVELS-1:0][NUM_LEVELS-1:0]  index_nodes;
+
+    logic [WIDTH-1:0] in_tmp;
+
+    // reverse vector if required
+    always_comb begin : flip_vector
+        for (int unsigned i = 0; i < WIDTH; i++) begin
+            in_tmp[i] = (MODE) ? in_i[WIDTH-1-i] : in_i[i];
+        end
+    end
+
+    for (genvar j = 0; unsigned'(j) < WIDTH; j++) begin : g_index_lut
+        assign index_lut[j] = NUM_LEVELS'(unsigned'(j));
+    end
+
+    for (genvar level = 0; unsigned'(level) < NUM_LEVELS; level++) begin : g_levels
+        if (unsigned'(level) == NUM_LEVELS-1) begin : g_last_level
+            for (genvar k = 0; k < 2**level; k++) begin : g_level
+                // if two successive indices are still in the vector...
+                if (unsigned'(k) * 2 < WIDTH-1) begin
+                    assign sel_nodes[2**level-1+k]   = in_tmp[k*2] | in_tmp[k*2+1];
+                    assign index_nodes[2**level-1+k] = (in_tmp[k*2] == 1'b1) ? index_lut[k*2] :
+                                                                               index_lut[k*2+1];
+                end
+                // if only the first index is still in the vector...
+                if (unsigned'(k) * 2 == WIDTH-1) begin
+                    assign sel_nodes[2**level-1+k]   = in_tmp[k*2];
+                    assign index_nodes[2**level-1+k] = index_lut[k*2];
+                end
+                // if index is out of range
+                if (unsigned'(k) * 2 > WIDTH-1) begin
+                    assign sel_nodes[2**level-1+k]   = 1'b0;
+                    assign index_nodes[2**level-1+k] = '0;
+                end
+            end
+        end else begin
+            for (genvar l = 0; l < 2**level; l++) begin : g_level
+                assign sel_nodes[2**level-1+l]   = sel_nodes[2**(level+1)-1+l*2] | sel_nodes[2**(level+1)-1+l*2+1];
+                assign index_nodes[2**level-1+l] = (sel_nodes[2**(level+1)-1+l*2] == 1'b1) ? index_nodes[2**(level+1)-1+l*2] :
+                                                                                             index_nodes[2**(level+1)-1+l*2+1];
             end
         end
     end
 
-    assign data_out  = data_out_r;
-    assign valid_out = (| data_in);
+    assign cnt_o   = NUM_LEVELS > unsigned'(0) ? index_nodes[0] : $clog2(WIDTH)'(0);
+    assign valid_o = NUM_LEVELS > unsigned'(0) ? sel_nodes[0]  : (|in_i);
+
+`IGNORE_WARNINGS_END
   
 endmodule
\ No newline at end of file