From 1c9445745f2f20f89eef68facee9d253b99a2c16 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 23 Aug 2020 16:53:28 -0700 Subject: [PATCH] fp_noncomp fixes --- hw/rtl/VX_config.vh | 2 + hw/rtl/VX_lsu_unit.v | 34 ++++++------ hw/rtl/VX_scoreboard.v | 2 +- hw/rtl/fp_cores/VX_fp_fpga.v | 1 + hw/rtl/fp_cores/VX_fp_noncomp.v | 94 ++++++++++++++++++++++----------- hw/rtl/interfaces/VX_issue_if.v | 38 +++++++++++++ hw/rtl/libs/VX_bypass_buffer.v | 47 +++++++++++++++++ hw/syn/quartus/project.sdc | 2 +- 8 files changed, 170 insertions(+), 50 deletions(-) create mode 100644 hw/rtl/interfaces/VX_issue_if.v create mode 100644 hw/rtl/libs/VX_bypass_buffer.v diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index ab4d4742..4abfd8fd 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -55,6 +55,8 @@ `define EXT_F_ENABLE +`define IBUF_ENABLE + // Device identification `define VENDOR_ID 0 `define ARCHITECTURE_ID 0 diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 2463bd67..a34eacce 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -111,16 +111,16 @@ module VX_lsu_unit #( .DATAW (`NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 2) + 2), .SIZE (`LSUQ_SIZE) ) lsu_queue ( - .clk (clk), - .reset (reset), - .write_addr (req_tag), - .acquire_slot (lsuq_push), - .read_addr (rsp_tag), - .write_data ({req_wid, req_curr_PC, req_rd, req_wb, req_offset, req_sext}), - .read_data ({rsp_wid, rsp_curr_PC, rsp_rd, rsp_wb, rsp_offset, rsp_sext}), - .release_addr (rsp_tag), - .release_slot (lsuq_pop), - .full (lsuq_full) + .clk (clk), + .reset (reset), + .write_addr (req_tag), + .acquire_slot (lsuq_push), + .read_addr (rsp_tag), + .write_data ({req_wid, req_curr_PC, req_rd, req_wb, req_offset, req_sext}), + .read_data ({rsp_wid, rsp_curr_PC, rsp_rd, rsp_wb, rsp_offset, rsp_sext}), + .release_addr (rsp_tag), + .release_slot (lsuq_pop), + .full (lsuq_full) ); always @(posedge clk) begin @@ -170,12 +170,12 @@ module VX_lsu_unit #( wire stall_out = 
~lsu_commit_if.ready && lsu_commit_if.valid; wire mem_rsp_stall = is_load_rsp && is_store_req; // arbitration prioritizes stores - wire arb_valid = is_store_req || is_load_rsp; - wire [`NW_BITS-1:0] arb_wid = is_store_req ? req_wid : rsp_wid; - wire [`NUM_THREADS-1:0] arb_thread_mask = is_store_req ? req_thread_mask : dcache_rsp_if.valid; - wire [31:0] arb_curr_PC = is_store_req ? req_curr_PC : rsp_curr_PC; - wire [`NR_BITS-1:0] arb_rd = is_store_req ? 0 : rsp_rd; - wire arb_wb = is_store_req ? 0 : rsp_wb; + wire arb_valid = is_store_req || is_load_rsp; + wire [`NW_BITS-1:0] arb_wid = is_store_req ? req_wid : rsp_wid; + wire [`NUM_THREADS-1:0] arb_tmask = is_store_req ? req_thread_mask : dcache_rsp_if.valid; + wire [31:0] arb_curr_PC = is_store_req ? req_curr_PC : rsp_curr_PC; + wire [`NR_BITS-1:0] arb_rd = is_store_req ? 0 : rsp_rd; + wire arb_wb = is_store_req ? 0 : rsp_wb; VX_generic_register #( .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)) @@ -184,7 +184,7 @@ module VX_lsu_unit #( .reset (reset), .stall (stall_out), .flush (1'b0), - .in ({arb_valid, arb_wid, arb_thread_mask, arb_curr_PC, arb_rd, arb_wb, rsp_data}), + .in ({arb_valid, arb_wid, arb_tmask, arb_curr_PC, arb_rd, arb_wb, rsp_data}), .out ({lsu_commit_if.valid, lsu_commit_if.wid, lsu_commit_if.thread_mask, lsu_commit_if.curr_PC, lsu_commit_if.rd, lsu_commit_if.wb, lsu_commit_if.data}) ); diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index 1f681db8..039843e7 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -14,7 +14,7 @@ module VX_scoreboard #( output wire delay ); reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0]; - reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0]; + reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0]; wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs; diff --git a/hw/rtl/fp_cores/VX_fp_fpga.v b/hw/rtl/fp_cores/VX_fp_fpga.v index 75f237f5..44a1e19b 100644 --- 
a/hw/rtl/fp_cores/VX_fp_fpga.v +++ b/hw/rtl/fp_cores/VX_fp_fpga.v @@ -286,6 +286,7 @@ module VX_fp_fpga #( assign per_core_ready_out[i] = ready_out && (i == fp_index); end + assign ready_in = (& per_core_ready_in); /* AND-reduction: ready only when every bit of per_core_ready_in is set */ assign valid_out = fp_valid; assign tag_out = per_core_tag_out[fp_index]; assign result = per_core_result[fp_index]; diff --git a/hw/rtl/fp_cores/VX_fp_noncomp.v b/hw/rtl/fp_cores/VX_fp_noncomp.v index 16c44dd4..b321a819 100644 --- a/hw/rtl/fp_cores/VX_fp_noncomp.v +++ b/hw/rtl/fp_cores/VX_fp_noncomp.v @@ -38,12 +38,17 @@ module VX_fp_noncomp #( SIG_NAN = 32'h00000100, QUT_NAN = 32'h00000200; - wire [LANES-1:0] a_sign, b_sign; - wire [LANES-1:0][7:0] a_exponent, b_exponent; - wire [LANES-1:0][22:0] a_mantissa, b_mantissa; - fp_type_t [LANES-1:0] a_type, b_type; + reg [`FPU_BITS-1:0] op_r; /* op_r, frm_r, dataa_r, datab_r hold op, frm, dataa, datab as captured on the clock edge when stall is low */ + reg [`FRM_BITS-1:0] frm_r; - wire [LANES-1:0] a_smaller, ab_equal; + reg [LANES-1:0][31:0] dataa_r; + reg [LANES-1:0][31:0] datab_r; + + reg [LANES-1:0] a_sign, b_sign; + reg [LANES-1:0][7:0] a_exponent, b_exponent; + reg [LANES-1:0][22:0] a_mantissa, b_mantissa; + fp_type_t [LANES-1:0] a_type, b_type; + reg [LANES-1:0] a_smaller, ab_equal; reg [LANES-1:0][31:0] fclass_mask; // generate a 10-bit mask for integer reg reg [LANES-1:0][31:0] fminmax_res; // result of fmin/fmax @@ -51,32 +56,60 @@ module VX_fp_noncomp #( reg [LANES-1:0][31:0] fcmp_res; // result of comparison reg [LANES-1:0][ 4:0] fcmp_excp; // exception of comparison + wire stall = ~ready_out && valid_out; /* high while a valid output is pending and ready_out is low */ + // Setup for (genvar i = 0; i < LANES; i++) begin - assign a_sign[i] = dataa[i][31]; - assign a_exponent[i] = dataa[i][30:23]; - assign a_mantissa[i] = dataa[i][22:0]; + wire tmp_a_sign = dataa[i][31]; + wire [7:0] tmp_a_exponent = dataa[i][30:23]; + wire [22:0] tmp_a_mantissa = dataa[i][22:0]; - assign b_sign[i] = datab[i][31]; - assign b_exponent[i] = datab[i][30:23]; - assign b_mantissa[i] = datab[i][22:0]; + wire tmp_b_sign = datab[i][31]; + wire [7:0] tmp_b_exponent = datab[i][30:23]; + wire
[22:0] tmp_b_mantissa = datab[i][22:0]; - assign a_smaller[i] = (dataa[i] < datab[i]) ^ (a_sign[i] || b_sign[i]); - assign ab_equal[i] = (dataa[i] == datab[i]) | (a_type[i][4] & b_type[i][4]); + fp_type_t tmp_a_type, tmp_b_type; /* classification outputs of the two VX_fp_type instances below, one pair per lane */ VX_fp_type fp_type_a ( - .exponent(a_exponent[i]), - .mantissa(a_mantissa[i]), - .o_type(a_type[i]) + .exponent(tmp_a_exponent), /* tmp_* nets are declared per lane inside this generate loop, so they are connected whole rather than bit-selected with [i] */ + .mantissa(tmp_a_mantissa), + .o_type(tmp_a_type) ); VX_fp_type fp_type_b ( - .exponent(b_exponent[i]), - .mantissa(b_mantissa[i]), - .o_type(b_type[i]) + .exponent(tmp_b_exponent), + .mantissa(tmp_b_mantissa), + .o_type(tmp_b_type) ); + + wire tmp_a_smaller = (dataa[i] < datab[i]) ^ (tmp_a_sign || tmp_b_sign); + wire tmp_ab_equal = (dataa[i] == datab[i]) | (tmp_a_type[4] & tmp_b_type[4]); + + always @(posedge clk) begin + if (~stall) begin /* registered per-lane fields keep their value while stall is high */ + a_sign[i] <= tmp_a_sign; + b_sign[i] <= tmp_b_sign; + a_exponent[i] <= tmp_a_exponent; + b_exponent[i] <= tmp_b_exponent; + a_mantissa[i] <= tmp_a_mantissa; + b_mantissa[i] <= tmp_b_mantissa; + a_type[i] <= tmp_a_type; + b_type[i] <= tmp_b_type; + a_smaller[i] <= tmp_a_smaller; + ab_equal[i] <= tmp_ab_equal; + end + end end + always @(posedge clk) begin + if (~stall) begin + op_r <= op; + frm_r <= frm; + dataa_r <= dataa; + datab_r <= datab; + end + end + // FCLASS for (genvar i = 0; i < LANES; i++) begin always @(*) begin @@ -107,13 +140,13 @@ module VX_fp_noncomp #( if (a_type[i].is_nan && b_type[i].is_nan) fminmax_res[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN else if (a_type[i].is_nan) - fminmax_res[i] = datab[i]; + fminmax_res[i] = datab_r[i]; else if (b_type[i].is_nan) - fminmax_res[i] = dataa[i]; + fminmax_res[i] = dataa_r[i]; else begin - case (op) // use LSB to distinguish MIN and MAX - `FPU_MIN: fminmax_res[i] = a_smaller[i] ? dataa[i] : datab[i]; - `FPU_MAX: fminmax_res[i] = a_smaller[i] ? datab[i] : dataa[i]; + case (op_r) // use LSB to distinguish MIN and MAX + `FPU_MIN: fminmax_res[i] = a_smaller[i] ?
dataa_r[i] : datab_r[i]; + `FPU_MAX: fminmax_res[i] = a_smaller[i] ? datab_r[i] : dataa_r[i]; default: fminmax_res[i] = 32'hdeadbeaf; // don't care value endcase end @@ -123,7 +156,7 @@ module VX_fp_noncomp #( // Sign Injection for (genvar i = 0; i < LANES; i++) begin always @(*) begin - case (op) + case (op_r) `FPU_SGNJ: fsgnj_res[i] = { b_sign[i], a_exponent[i], a_mantissa[i]}; `FPU_SGNJN: fsgnj_res[i] = {~b_sign[i], a_exponent[i], a_mantissa[i]}; `FPU_SGNJX: fsgnj_res[i] = { a_sign[i] ^ b_sign[i], a_exponent[i], a_mantissa[i]}; @@ -135,7 +168,7 @@ module VX_fp_noncomp #( // Comparison for (genvar i = 0; i < LANES; i++) begin always @(*) begin - case (frm) + case (frm_r) `FRM_RNE: begin if (a_type[i].is_nan || b_type[i].is_nan) begin fcmp_res[i] = 32'h0; // result is 0 when either operand is NaN @@ -183,7 +216,7 @@ module VX_fp_noncomp #( reg [LANES-1:0][31:0] tmp_result; always @(*) begin - case (op) + case (op_r) `FPU_SGNJ: tmp_has_fflags = 0; `FPU_SGNJN: tmp_has_fflags = 0; `FPU_SGNJX: tmp_has_fflags = 0; @@ -197,7 +230,7 @@ module VX_fp_noncomp #( for (genvar i = 0; i < LANES; i++) begin always @(*) begin tmp_valid = 1'b1; - case (op) + case (op_r) `FPU_CLASS: begin tmp_result[i] = fclass_mask[i]; {tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0; @@ -227,9 +260,6 @@ module VX_fp_noncomp #( end end - wire stall = ~ready_out && valid_out; - assign ready_in = ~stall; - VX_generic_register #( .N(1 + TAGW + (LANES * 32) + 1 + (LANES * `FFG_BITS)) ) nc_reg ( @@ -241,4 +271,6 @@ module VX_fp_noncomp #( .out ({valid_out, tag_out, result, has_fflags, fflags}) ); + assign ready_in = ~stall; /* a new input is accepted whenever stall is low */ + endmodule \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_issue_if.v b/hw/rtl/interfaces/VX_issue_if.v new file mode 100644 index 00000000..6018465b --- /dev/null +++ b/hw/rtl/interfaces/VX_issue_if.v @@ -0,0 +1,38 @@ +`ifndef VX_ISSUE_IF +`define VX_ISSUE_IF + +`include "VX_define.vh" + +interface VX_issue_if (); /* plain bundle of the wires declared below; it declares no modports and contains no logic */ + +
wire valid; + + wire [`ITAG_BITS-1:0] issue_tag; + wire [`NW_BITS-1:0] wid; + wire [`NUM_THREADS-1:0] thread_mask; + wire [31:0] curr_PC; + + wire [`EX_BITS-1:0] ex_type; + wire [`OP_BITS-1:0] ex_op; + + wire [`FRM_BITS-1:0] frm; + + wire wb; + + wire [`NR_BITS-1:0] rd; + + wire [`NUM_THREADS-1:0][31:0] rs1_data; + wire [`NUM_THREADS-1:0][31:0] rs2_data; + wire [`NUM_THREADS-1:0][31:0] rs3_data; + + wire [`NR_BITS-1:0] rs1; + wire [31:0] imm; + + wire rs1_is_PC; + wire rs2_is_imm; + + wire [`NT_BITS-1:0] tid; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/libs/VX_bypass_buffer.v b/hw/rtl/libs/VX_bypass_buffer.v new file mode 100644 index 00000000..1a002d10 --- /dev/null +++ b/hw/rtl/libs/VX_bypass_buffer.v @@ -0,0 +1,47 @@ +`include "VX_platform.vh" + +module VX_bypass_buffer #( /* valid/ready stage with one holding register; when PASSTHRU is nonzero the input handshake and data are wired straight to the output */ + parameter DATAW = 1, + parameter PASSTHRU = 0 +) ( + input wire clk, + input wire reset, + input wire valid_in, + output wire ready_in, + input wire [DATAW-1:0] data_in, + output wire [DATAW-1:0] data_out, + input wire ready_out, + output wire valid_out +); + if (PASSTHRU) begin + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + assign ready_in = ready_out; + assign valid_out = valid_in; + assign data_out = data_in; + end else begin + reg [DATAW-1:0] buffer; + reg buffer_valid; + + always @(posedge clk) begin + if (reset) begin + buffer_valid <= 0; + buffer <= 0; + end else begin + if (ready_out) begin /* ready_out high clears the held entry */ + buffer_valid <= 0; + end + if (valid_in && ~ready_out) begin /* input offered while the output is not accepting: capture it */ + assert(!buffer_valid); /* capturing while an entry is already held would overwrite it */ + buffer <= data_in; + buffer_valid <= 1; + end + end + end + + assign ready_in = ready_out || !buffer_valid; /* accept input when the output is ready or the holding register is free */ + assign data_out = buffer_valid ?
buffer : data_in; /* held data takes priority over the live input */ + assign valid_out = valid_in || buffer_valid; /* output is valid when either live input or held data is present */ + end + +endmodule \ No newline at end of file diff --git a/hw/syn/quartus/project.sdc b/hw/syn/quartus/project.sdc index e06c4389..59686a41 100644 --- a/hw/syn/quartus/project.sdc +++ b/hw/syn/quartus/project.sdc @@ -1,6 +1,6 @@ set_time_format -unit ns -decimal_places 3 -create_clock -name {clk} -period "240 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] +create_clock -name {clk} -period "200 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] derive_pll_clocks -create_base_clocks derive_clock_uncertainty