From 5a2bc88d20c8e15394efe0e4470f7e83c555cf61 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 15 Dec 2023 14:09:51 -0800 Subject: [PATCH 1/2] operands optimization minor updates minor updates --- hw/rtl/VX_gpu_pkg.sv | 50 +++++---- hw/rtl/core/VX_dispatch_unit.sv | 10 +- hw/rtl/core/VX_gather_unit.sv | 14 +-- hw/rtl/core/VX_operands.sv | 184 +++++++++++++++----------------- hw/rtl/core/VX_scoreboard.sv | 41 +++---- 5 files changed, 142 insertions(+), 157 deletions(-) diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index 4ece6c9c..668b53ee 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -190,42 +190,46 @@ package VX_gpu_pkg; /////////////////////////////// Issue parameters ////////////////////////// - localparam ISSUE_IDX_W = `LOG2UP(`ISSUE_WIDTH); + localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH); + localparam ISSUE_ISW_W = `UP(ISSUE_ISW); localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH; - localparam ISSUE_WIS_W = `LOG2UP(ISSUE_RATIO); - localparam ISSUE_ADDRW = `LOG2UP(`NUM_REGS * (ISSUE_RATIO)); - + localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO); + localparam ISSUE_WIS_W = `UP(ISSUE_WIS); + `IGNORE_UNUSED_BEGIN - function logic [ISSUE_IDX_W-1:0] wid_to_isw( + function logic [`NW_WIDTH-1:0] wis_to_wid( + input logic [ISSUE_WIS_W-1:0] wis, + input logic [ISSUE_ISW_W-1:0] isw + ); + if (ISSUE_WIS == 0) begin + wis_to_wid = `NW_WIDTH'(isw); + end else if (ISSUE_ISW == 0) begin + wis_to_wid = `NW_WIDTH'(wis); + end else begin + wis_to_wid = `NW_WIDTH'({wis, isw}); + end + endfunction + + function logic [ISSUE_ISW_W-1:0] wid_to_isw( input logic [`NW_WIDTH-1:0] wid ); - if (`ISSUE_WIDTH > 1) begin - wid_to_isw = ISSUE_IDX_W'(wid); + if (ISSUE_ISW != 0) begin + wid_to_isw = wid[ISSUE_ISW_W-1:0]; end else begin wid_to_isw = 0; end endfunction -`IGNORE_UNUSED_END - - function logic [`NW_WIDTH-1:0] wis_to_wid( - input logic [ISSUE_WIS_W-1:0] wis, - input logic [ISSUE_IDX_W-1:0] isw - ); - wis_to_wid = `NW_WIDTH'({wis, isw} >> (ISSUE_IDX_W-`CLOG2(`ISSUE_WIDTH))); - endfunction function logic [ISSUE_WIS_W-1:0] wid_to_wis( input logic [`NW_WIDTH-1:0] wid ); - wid_to_wis = ISSUE_WIS_W'({1'b0, wid} >> `CLOG2(`ISSUE_WIDTH)); - endfunction - - function logic [ISSUE_ADDRW-1:0] wis_to_addr( - input logic [`NR_BITS-1:0] rid, - input logic [ISSUE_WIS_W-1:0] wis - ); - wis_to_addr = ISSUE_ADDRW'({rid, wis} >> (ISSUE_WIS_W-`CLOG2(ISSUE_RATIO))); + if (ISSUE_WIS != 0) begin + wid_to_wis = ISSUE_WIS_W'(wid >> ISSUE_ISW); + end else begin + wid_to_wis = 0; + end endfunction +`IGNORE_UNUSED_END endpackage diff --git a/hw/rtl/core/VX_dispatch_unit.sv b/hw/rtl/core/VX_dispatch_unit.sv index 586acc0b..6e36a33b 100644 --- a/hw/rtl/core/VX_dispatch_unit.sv +++ b/hw/rtl/core/VX_dispatch_unit.sv @@ -203,20 +203,20 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( assign block_done[block_idx] = ~valid_p || ready_p; end - wire [ISSUE_IDX_W-1:0] wsi; + wire [ISSUE_ISW_W-1:0] isw; if (BATCH_COUNT != 1) begin if (BLOCK_SIZE != 1) begin - assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)}; + assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)}; end else begin - assign wsi = batch_idx; + assign isw = batch_idx; end end else begin - assign wsi = block_idx; + assign isw = block_idx; end `RESET_RELAY(buf_out_reset, reset); - wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi); + wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw); VX_elastic_buffer #( .DATAW (OUT_DATAW), diff --git a/hw/rtl/core/VX_gather_unit.sv b/hw/rtl/core/VX_gather_unit.sv index e3dc935d..21ae4485 100644 --- a/hw/rtl/core/VX_gather_unit.sv +++ b/hw/rtl/core/VX_gather_unit.sv @@ -37,7 +37,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #( wire [BLOCK_SIZE-1:0] commit_in_valid; wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data; wire [BLOCK_SIZE-1:0] commit_in_ready; - wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] commit_in_wsi; + wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw; for (genvar i = 0; i < BLOCK_SIZE; ++i) begin assign commit_in_valid[i] = commit_in_if[i].valid; @@ -45,12 +45,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #( assign commit_in_if[i].ready = commit_in_ready[i]; if (BLOCK_SIZE != `ISSUE_WIDTH) begin if (BLOCK_SIZE != 1) begin - assign commit_in_wsi[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)}; + assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)}; end else begin - assign commit_in_wsi[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_IDX_W]; + assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W]; end end else begin - assign commit_in_wsi[i] = BLOCK_SIZE_W'(i); + assign commit_in_isw[i] = BLOCK_SIZE_W'(i); end end @@ -64,12 +64,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #( commit_out_data[i] = 'x; end for (integer i = 0; i < BLOCK_SIZE; ++i) begin - commit_out_valid[commit_in_wsi[i]] = commit_in_valid[i]; - commit_out_data[commit_in_wsi[i]] = commit_in_data[i]; + commit_out_valid[commit_in_isw[i]] = commit_in_valid[i]; + commit_out_data[commit_in_isw[i]] = commit_in_data[i]; end end for (genvar i = 0; i < BLOCK_SIZE; ++i) begin - assign commit_in_ready[i] = commit_out_ready[commit_in_wsi[i]]; + assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]]; end for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 3ff5df46..ee0c493b 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -26,6 +26,7 @@ module VX_operands import VX_gpu_pkg::*; #( ); `UNUSED_PARAM (CORE_ID) localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS; + localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO); localparam STATE_IDLE = 2'd0; localparam STATE_FETCH1 = 2'd1; @@ -46,9 +47,11 @@ module VX_operands import VX_gpu_pkg::*; #( reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0]; reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n; + reg valid_out_r; + reg [DATAW-1:0] data_out_r; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n; - reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n; + reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n; reg [STATE_BITS-1:0] state, state_n; reg [`NR_BITS-1:0] rs2, rs2_n; @@ -57,11 +60,11 @@ module VX_operands import VX_gpu_pkg::*; #( reg rs3_ready, rs3_ready_n; reg data_ready, data_ready_n; + wire ready_out = operands_if[i].ready; + wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0); wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0); - wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0); - - VX_operands_if staging_if(); + wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0); always @(*) begin state_n = state; @@ -82,7 +85,7 @@ module VX_operands import VX_gpu_pkg::*; #( case (state) STATE_IDLE: begin - if (staging_if.valid && staging_if.ready) begin + if (valid_out_r && ready_out) begin data_ready_n = 0; end if (scoreboard_if[i].valid && data_ready_n == 0) begin @@ -170,33 +173,86 @@ module VX_operands import VX_gpu_pkg::*; #( end always @(posedge clk) begin - if (reset) begin + if (reset) begin state <= STATE_IDLE; - gpr_rd_rid <= '0; - gpr_rd_wis <= '0; cache_eop <= {ISSUE_RATIO{1'b1}}; data_ready <= 0; + valid_out_r <= 0; end else begin state <= state_n; - rs2 <= rs2_n; - rs3 <= rs3_n; - rs2_ready <= rs2_ready_n; - rs3_ready <= rs3_ready_n; - rs1_data <= rs1_data_n; - rs2_data <= rs2_data_n; - rs3_data <= rs3_data_n; - gpr_rd_rid <= gpr_rd_rid_n; - gpr_rd_wis <= gpr_rd_wis_n; - cache_data <= cache_data_n; - cache_reg <= cache_reg_n; - cache_tmask <= cache_tmask_n; cache_eop <= cache_eop_n; - data_ready <= data_ready_n; + data_ready <= data_ready_n; + if (~valid_out_r) begin + valid_out_r <= scoreboard_if[i].valid && data_ready; + end else if (ready_out) begin + valid_out_r <= 0; + end end - end + + if (~valid_out_r) begin + data_out_r <= {scoreboard_if[i].data.uuid, + scoreboard_if[i].data.wis, + scoreboard_if[i].data.tmask, + scoreboard_if[i].data.PC, + scoreboard_if[i].data.wb, + scoreboard_if[i].data.ex_type, + scoreboard_if[i].data.op_type, + scoreboard_if[i].data.op_mod, + scoreboard_if[i].data.use_PC, + scoreboard_if[i].data.use_imm, + scoreboard_if[i].data.imm, + scoreboard_if[i].data.rd}; + end + + gpr_rd_rid <= gpr_rd_rid_n; + gpr_rd_wis <= gpr_rd_wis_n; + rs2_ready <= rs2_ready_n; + rs3_ready <= rs3_ready_n; + rs2 <= rs2_n; + rs3 <= rs3_n; + rs1_data <= rs1_data_n; + rs2_data <= rs2_data_n; + rs3_data <= rs3_data_n; + cache_data <= cache_data_n; + cache_reg <= cache_reg_n; + cache_tmask <= cache_tmask_n; + end + + assign operands_if[i].valid = valid_out_r; + assign {operands_if[i].data.uuid, + operands_if[i].data.wis, + operands_if[i].data.tmask, + operands_if[i].data.PC, + operands_if[i].data.wb, + operands_if[i].data.ex_type, + operands_if[i].data.op_type, + operands_if[i].data.op_mod, + operands_if[i].data.use_PC, + operands_if[i].data.use_imm, + operands_if[i].data.imm, + operands_if[i].data.rd} = data_out_r; + assign operands_if[i].data.rs1_data = rs1_data; + assign operands_if[i].data.rs2_data = rs2_data; + assign operands_if[i].data.rs3_data = rs3_data; + + assign scoreboard_if[i].ready = ~valid_out_r && data_ready; // GPR banks + reg [RAM_ADDRW-1:0] gpr_rd_addr; + wire [RAM_ADDRW-1:0] gpr_wr_addr; + if (ISSUE_WIS != 0) begin + assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd}; + always @(posedge clk) begin + gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n}; + end + end else begin + assign gpr_wr_addr = writeback_if[i].data.rd; + always @(posedge clk) begin + gpr_rd_addr <= gpr_rd_rid_n; + end + end + `ifdef GPR_RESET reg wr_enabled = 0; always @(posedge clk) begin @@ -204,10 +260,8 @@ module VX_operands import VX_gpu_pkg::*; #( wr_enabled <= 1; end end - `else - wire wr_enabled = 1; `endif - + for (genvar j = 0; j < `NUM_THREADS; ++j) begin VX_dp_ram #( .DATAW (`XLEN), @@ -221,81 +275,17 @@ module VX_operands import VX_gpu_pkg::*; #( .clk (clk), .read (1'b1), `UNUSED_PIN (wren), - .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), - .waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)), + `ifdef GPR_RESET + .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), + `else + .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]), + `endif + .waddr (gpr_wr_addr), .wdata (writeback_if[i].data.data[j]), - .raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)), + .raddr (gpr_rd_addr), .rdata (gpr_rd_data[j]) ); end - - // staging buffer - - `RESET_RELAY (stg_buf_reset, reset); - - VX_elastic_buffer #( - .DATAW (DATAW) - ) stg_buf ( - .clk (clk), - .reset (stg_buf_reset), - .valid_in (scoreboard_if[i].valid), - .ready_in (scoreboard_if[i].ready), - .data_in ({ - scoreboard_if[i].data.uuid, - scoreboard_if[i].data.wis, - scoreboard_if[i].data.tmask, - scoreboard_if[i].data.PC, - scoreboard_if[i].data.wb, - scoreboard_if[i].data.ex_type, - scoreboard_if[i].data.op_type, - scoreboard_if[i].data.op_mod, - scoreboard_if[i].data.use_PC, - scoreboard_if[i].data.use_imm, - scoreboard_if[i].data.imm, - scoreboard_if[i].data.rd}), - .data_out ({ - staging_if.data.uuid, - staging_if.data.wis, - staging_if.data.tmask, - staging_if.data.PC, - staging_if.data.wb, - staging_if.data.ex_type, - staging_if.data.op_type, - staging_if.data.op_mod, - staging_if.data.use_PC, - staging_if.data.use_imm, - staging_if.data.imm, - staging_if.data.rd}), - .valid_out (staging_if.valid), - .ready_out (staging_if.ready) - ); - - assign staging_if.data.rs1_data = rs1_data; - assign staging_if.data.rs2_data = rs2_data; - assign staging_if.data.rs3_data = rs3_data; - - // output buffer - - wire valid_stg, ready_stg; - assign valid_stg = staging_if.valid && data_ready; - assign staging_if.ready = ready_stg && data_ready; - - `RESET_RELAY (out_buf_reset, reset); - - VX_elastic_buffer #( - .DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)), - .SIZE (2), - .OUT_REG (2) - ) out_buf ( - .clk (clk), - .reset (out_buf_reset), - .valid_in (valid_stg), - .ready_in (ready_stg), - .data_in (staging_if.data), - .data_out (operands_if[i].data), - .valid_out (operands_if[i].valid), - .ready_out (operands_if[i].ready) - ); - end + end endmodule diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 2206df25..1c5f3676 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -51,7 +51,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #( for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs; - VX_ibuffer_if staging_if(); wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; @@ -84,10 +83,17 @@ module VX_scoreboard import VX_gpu_pkg::*; #( reg [DATAW-1:0] data_out_r; reg valid_out_r; + wire ready_out; wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; wire deps_ready = (& ready_masks); + wire valid_in = ibuffer_if[i].valid && deps_ready; + wire ready_in = ~valid_out_r && deps_ready; + wire [DATAW-1:0] data_in = ibuffer_if[i].data; + + assign ready_out = scoreboard_if[i].ready; + always @(posedge clk) begin if (reset) begin valid_out_r <= 0; @@ -97,40 +103,25 @@ module VX_scoreboard import VX_gpu_pkg::*; #( inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0; end if (~valid_out_r) begin - valid_out_r <= ibuffer_if[i].valid && deps_ready; - end else if (staging_if.ready) begin - if (staging_if.data.wb) begin - inuse_regs[staging_if.data.wis][staging_if.data.rd] <= 1; + valid_out_r <= valid_in; + end else if (ready_out) begin + if (scoreboard_if[i].data.wb) begin + inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1; `ifdef PERF_ENABLE - inuse_units[staging_if.data.wis][staging_if.data.rd] <= staging_if.data.ex_type; + inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type; `endif end valid_out_r <= 0; end end if (~valid_out_r) begin - data_out_r <= ibuffer_if[i].data; + data_out_r <= data_in; end end - assign ibuffer_if[i].ready = ~valid_out_r && deps_ready; - assign staging_if.valid = valid_out_r; - assign staging_if.data = data_out_r; - - VX_elastic_buffer #( - .DATAW (DATAW), - .SIZE (0), - .OUT_REG (2) - ) out_buf ( - .clk (clk), - .reset (reset), - .valid_in (staging_if.valid), - .ready_in (staging_if.ready), - .data_in (staging_if.data), - .data_out (scoreboard_if[i].data), - .valid_out (scoreboard_if[i].valid), - .ready_out (scoreboard_if[i].ready) - ); + assign ibuffer_if[i].ready = ready_in; + assign scoreboard_if[i].valid = valid_out_r; + assign scoreboard_if[i].data = data_out_r; `ifdef SIMULATION reg [31:0] timeout_ctr; From 39e6f95c2b26738a6a60b76ca2b6c5e3fa45bff6 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 15 Dec 2023 14:09:51 -0800 Subject: [PATCH 2/2] operands optimization minor updates minor updates minor update --- hw/rtl/VX_gpu_pkg.sv | 50 +++++---- hw/rtl/core/VX_dispatch_unit.sv | 10 +- hw/rtl/core/VX_gather_unit.sv | 14 +-- hw/rtl/core/VX_lsu_unit.sv | 2 +- hw/rtl/core/VX_operands.sv | 184 +++++++++++++++----------------- hw/rtl/core/VX_scoreboard.sv | 41 +++---- 6 files changed, 143 insertions(+), 158 deletions(-) diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index 4ece6c9c..668b53ee 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -190,42 +190,46 @@ package VX_gpu_pkg; /////////////////////////////// Issue parameters ////////////////////////// - localparam ISSUE_IDX_W = `LOG2UP(`ISSUE_WIDTH); + localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH); + localparam ISSUE_ISW_W = `UP(ISSUE_ISW); localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH; - localparam ISSUE_WIS_W = `LOG2UP(ISSUE_RATIO); - localparam ISSUE_ADDRW = `LOG2UP(`NUM_REGS * (ISSUE_RATIO)); - + localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO); + localparam ISSUE_WIS_W = `UP(ISSUE_WIS); + `IGNORE_UNUSED_BEGIN - function logic [ISSUE_IDX_W-1:0] wid_to_isw( + function logic [`NW_WIDTH-1:0] wis_to_wid( + input logic [ISSUE_WIS_W-1:0] wis, + input logic [ISSUE_ISW_W-1:0] isw + ); + if (ISSUE_WIS == 0) begin + wis_to_wid = `NW_WIDTH'(isw); + end else if (ISSUE_ISW == 0) begin + wis_to_wid = `NW_WIDTH'(wis); + end else begin + wis_to_wid = `NW_WIDTH'({wis, isw}); + end + endfunction + + function logic [ISSUE_ISW_W-1:0] wid_to_isw( input logic [`NW_WIDTH-1:0] wid ); - if (`ISSUE_WIDTH > 1) begin - wid_to_isw = ISSUE_IDX_W'(wid); + if (ISSUE_ISW != 0) begin + wid_to_isw = wid[ISSUE_ISW_W-1:0]; end else begin wid_to_isw = 0; end endfunction -`IGNORE_UNUSED_END - - function logic [`NW_WIDTH-1:0] wis_to_wid( - input logic [ISSUE_WIS_W-1:0] wis, - input logic [ISSUE_IDX_W-1:0] isw - ); - wis_to_wid = `NW_WIDTH'({wis, isw} >> (ISSUE_IDX_W-`CLOG2(`ISSUE_WIDTH))); - endfunction function logic [ISSUE_WIS_W-1:0] wid_to_wis( input logic [`NW_WIDTH-1:0] wid ); - wid_to_wis = ISSUE_WIS_W'({1'b0, wid} >> `CLOG2(`ISSUE_WIDTH)); - endfunction - - function logic [ISSUE_ADDRW-1:0] wis_to_addr( - input logic [`NR_BITS-1:0] rid, - input logic [ISSUE_WIS_W-1:0] wis - ); - wis_to_addr = ISSUE_ADDRW'({rid, wis} >> (ISSUE_WIS_W-`CLOG2(ISSUE_RATIO))); + if (ISSUE_WIS != 0) begin + wid_to_wis = ISSUE_WIS_W'(wid >> ISSUE_ISW); + end else begin + wid_to_wis = 0; + end endfunction +`IGNORE_UNUSED_END endpackage diff --git a/hw/rtl/core/VX_dispatch_unit.sv b/hw/rtl/core/VX_dispatch_unit.sv index 586acc0b..6e36a33b 100644 --- a/hw/rtl/core/VX_dispatch_unit.sv +++ b/hw/rtl/core/VX_dispatch_unit.sv @@ -203,20 +203,20 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( assign block_done[block_idx] = ~valid_p || ready_p; end - wire [ISSUE_IDX_W-1:0] wsi; + wire [ISSUE_ISW_W-1:0] isw; if (BATCH_COUNT != 1) begin if (BLOCK_SIZE != 1) begin - assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)}; + assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)}; end else begin - assign wsi = batch_idx; + assign isw = batch_idx; end end else begin - assign wsi = block_idx; + assign isw = block_idx; end `RESET_RELAY(buf_out_reset, reset); - wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi); + wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw); VX_elastic_buffer #( .DATAW (OUT_DATAW), diff --git a/hw/rtl/core/VX_gather_unit.sv b/hw/rtl/core/VX_gather_unit.sv index e3dc935d..21ae4485 100644 --- a/hw/rtl/core/VX_gather_unit.sv +++ b/hw/rtl/core/VX_gather_unit.sv @@ -37,7 +37,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #( wire [BLOCK_SIZE-1:0] commit_in_valid; wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data; wire [BLOCK_SIZE-1:0] commit_in_ready; - wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] commit_in_wsi; + wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw; for (genvar i = 0; i < BLOCK_SIZE; ++i) begin assign commit_in_valid[i] = commit_in_if[i].valid; @@ -45,12 +45,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #( assign commit_in_if[i].ready = commit_in_ready[i]; if (BLOCK_SIZE != `ISSUE_WIDTH) begin if (BLOCK_SIZE != 1) begin - assign commit_in_wsi[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)}; + assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)}; end else begin - assign commit_in_wsi[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_IDX_W]; + assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W]; end end else begin - assign commit_in_wsi[i] = BLOCK_SIZE_W'(i); + assign commit_in_isw[i] = BLOCK_SIZE_W'(i); end end @@ -64,12 +64,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #( commit_out_data[i] = 'x; end for (integer i = 0; i < BLOCK_SIZE; ++i) begin - commit_out_valid[commit_in_wsi[i]] = commit_in_valid[i]; - commit_out_data[commit_in_wsi[i]] = commit_in_data[i]; + commit_out_valid[commit_in_isw[i]] = commit_in_valid[i]; + commit_out_data[commit_in_isw[i]] = commit_in_data[i]; end end for (genvar i = 0; i < BLOCK_SIZE; ++i) begin - assign commit_in_ready[i] = commit_out_ready[commit_in_wsi[i]]; + assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]]; end for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index 3383f70f..1e0a09b8 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( VX_stream_arb #( .NUM_INPUTS (2), .DATAW (RSP_ARB_DATAW), - .OUT_REG (2) + .OUT_REG (3) ) rsp_arb ( .clk (clk), .reset (commit_reset), diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 3ff5df46..ee0c493b 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -26,6 +26,7 @@ module VX_operands import VX_gpu_pkg::*; #( ); `UNUSED_PARAM (CORE_ID) localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS; + localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO); localparam STATE_IDLE = 2'd0; localparam STATE_FETCH1 = 2'd1; @@ -46,9 +47,11 @@ module VX_operands import VX_gpu_pkg::*; #( reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0]; reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n; + reg valid_out_r; + reg [DATAW-1:0] data_out_r; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n; - reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n; + reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n; reg [STATE_BITS-1:0] state, state_n; reg [`NR_BITS-1:0] rs2, rs2_n; @@ -57,11 +60,11 @@ module VX_operands import VX_gpu_pkg::*; #( reg rs3_ready, rs3_ready_n; reg data_ready, data_ready_n; + wire ready_out = operands_if[i].ready; + wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0); wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0); - wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0); - - VX_operands_if staging_if(); + wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0); always @(*) begin state_n = state; @@ -82,7 +85,7 @@ module VX_operands import VX_gpu_pkg::*; #( case (state) STATE_IDLE: begin - if (staging_if.valid && staging_if.ready) begin + if (valid_out_r && ready_out) begin data_ready_n = 0; end if (scoreboard_if[i].valid && data_ready_n == 0) begin @@ -170,33 +173,86 @@ module VX_operands import VX_gpu_pkg::*; #( end always @(posedge clk) begin - if (reset) begin + if (reset) begin state <= STATE_IDLE; - gpr_rd_rid <= '0; - gpr_rd_wis <= '0; cache_eop <= {ISSUE_RATIO{1'b1}}; data_ready <= 0; + valid_out_r <= 0; end else begin state <= state_n; - rs2 <= rs2_n; - rs3 <= rs3_n; - rs2_ready <= rs2_ready_n; - rs3_ready <= rs3_ready_n; - rs1_data <= rs1_data_n; - rs2_data <= rs2_data_n; - rs3_data <= rs3_data_n; - gpr_rd_rid <= gpr_rd_rid_n; - gpr_rd_wis <= gpr_rd_wis_n; - cache_data <= cache_data_n; - cache_reg <= cache_reg_n; - cache_tmask <= cache_tmask_n; cache_eop <= cache_eop_n; - data_ready <= data_ready_n; + data_ready <= data_ready_n; + if (~valid_out_r) begin + valid_out_r <= scoreboard_if[i].valid && data_ready; + end else if (ready_out) begin + valid_out_r <= 0; + end end - end + + if (~valid_out_r) begin + data_out_r <= {scoreboard_if[i].data.uuid, + scoreboard_if[i].data.wis, + scoreboard_if[i].data.tmask, + scoreboard_if[i].data.PC, + scoreboard_if[i].data.wb, + scoreboard_if[i].data.ex_type, + scoreboard_if[i].data.op_type, + scoreboard_if[i].data.op_mod, + scoreboard_if[i].data.use_PC, + scoreboard_if[i].data.use_imm, + scoreboard_if[i].data.imm, + scoreboard_if[i].data.rd}; + end + + gpr_rd_rid <= gpr_rd_rid_n; + gpr_rd_wis <= gpr_rd_wis_n; + rs2_ready <= rs2_ready_n; + rs3_ready <= rs3_ready_n; + rs2 <= rs2_n; + rs3 <= rs3_n; + rs1_data <= rs1_data_n; + rs2_data <= rs2_data_n; + rs3_data <= rs3_data_n; + cache_data <= cache_data_n; + cache_reg <= cache_reg_n; + cache_tmask <= cache_tmask_n; + end + + assign operands_if[i].valid = valid_out_r; + assign {operands_if[i].data.uuid, + operands_if[i].data.wis, + operands_if[i].data.tmask, + operands_if[i].data.PC, + operands_if[i].data.wb, + operands_if[i].data.ex_type, + operands_if[i].data.op_type, + operands_if[i].data.op_mod, + operands_if[i].data.use_PC, + operands_if[i].data.use_imm, + operands_if[i].data.imm, + operands_if[i].data.rd} = data_out_r; + assign operands_if[i].data.rs1_data = rs1_data; + assign operands_if[i].data.rs2_data = rs2_data; + assign operands_if[i].data.rs3_data = rs3_data; + + assign scoreboard_if[i].ready = ~valid_out_r && data_ready; // GPR banks + reg [RAM_ADDRW-1:0] gpr_rd_addr; + wire [RAM_ADDRW-1:0] gpr_wr_addr; + if (ISSUE_WIS != 0) begin + assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd}; + always @(posedge clk) begin + gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n}; + end + end else begin + assign gpr_wr_addr = writeback_if[i].data.rd; + always @(posedge clk) begin + gpr_rd_addr <= gpr_rd_rid_n; + end + end + `ifdef GPR_RESET reg wr_enabled = 0; always @(posedge clk) begin @@ -204,10 +260,8 @@ module VX_operands import VX_gpu_pkg::*; #( wr_enabled <= 1; end end - `else - wire wr_enabled = 1; `endif - + for (genvar j = 0; j < `NUM_THREADS; ++j) begin VX_dp_ram #( .DATAW (`XLEN), @@ -221,81 +275,17 @@ module VX_operands import VX_gpu_pkg::*; #( .clk (clk), .read (1'b1), `UNUSED_PIN (wren), - .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), - .waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)), + `ifdef GPR_RESET + .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), + `else + .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]), + `endif + .waddr (gpr_wr_addr), .wdata (writeback_if[i].data.data[j]), - .raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)), + .raddr (gpr_rd_addr), .rdata (gpr_rd_data[j]) ); end - - // staging buffer - - `RESET_RELAY (stg_buf_reset, reset); - - VX_elastic_buffer #( - .DATAW (DATAW) - ) stg_buf ( - .clk (clk), - .reset (stg_buf_reset), - .valid_in (scoreboard_if[i].valid), - .ready_in (scoreboard_if[i].ready), - .data_in ({ - scoreboard_if[i].data.uuid, - scoreboard_if[i].data.wis, - scoreboard_if[i].data.tmask, - scoreboard_if[i].data.PC, - scoreboard_if[i].data.wb, - scoreboard_if[i].data.ex_type, - scoreboard_if[i].data.op_type, - scoreboard_if[i].data.op_mod, - scoreboard_if[i].data.use_PC, - scoreboard_if[i].data.use_imm, - scoreboard_if[i].data.imm, - scoreboard_if[i].data.rd}), - .data_out ({ - staging_if.data.uuid, - staging_if.data.wis, - staging_if.data.tmask, - staging_if.data.PC, - staging_if.data.wb, - staging_if.data.ex_type, - staging_if.data.op_type, - staging_if.data.op_mod, - staging_if.data.use_PC, - staging_if.data.use_imm, - staging_if.data.imm, - staging_if.data.rd}), - .valid_out (staging_if.valid), - .ready_out (staging_if.ready) - ); - - assign staging_if.data.rs1_data = rs1_data; - assign staging_if.data.rs2_data = rs2_data; - assign staging_if.data.rs3_data = rs3_data; - - // output buffer - - wire valid_stg, ready_stg; - assign valid_stg = staging_if.valid && data_ready; - assign staging_if.ready = ready_stg && data_ready; - - `RESET_RELAY (out_buf_reset, reset); - - VX_elastic_buffer #( - .DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)), - .SIZE (2), - .OUT_REG (2) - ) out_buf ( - .clk (clk), - .reset (out_buf_reset), - .valid_in (valid_stg), - .ready_in (ready_stg), - .data_in (staging_if.data), - .data_out (operands_if[i].data), - .valid_out (operands_if[i].valid), - .ready_out (operands_if[i].ready) - ); - end + end endmodule diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 2206df25..1c5f3676 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -51,7 +51,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #( for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs; - VX_ibuffer_if staging_if(); wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; @@ -84,10 +83,17 @@ module VX_scoreboard import VX_gpu_pkg::*; #( reg [DATAW-1:0] data_out_r; reg valid_out_r; + wire ready_out; wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; wire deps_ready = (& ready_masks); + wire valid_in = ibuffer_if[i].valid && deps_ready; + wire ready_in = ~valid_out_r && deps_ready; + wire [DATAW-1:0] data_in = ibuffer_if[i].data; + + assign ready_out = scoreboard_if[i].ready; + always @(posedge clk) begin if (reset) begin valid_out_r <= 0; @@ -97,40 +103,25 @@ module VX_scoreboard import VX_gpu_pkg::*; #( inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0; end if (~valid_out_r) begin - valid_out_r <= ibuffer_if[i].valid && deps_ready; - end else if (staging_if.ready) begin - if (staging_if.data.wb) begin - inuse_regs[staging_if.data.wis][staging_if.data.rd] <= 1; + valid_out_r <= valid_in; + end else if (ready_out) begin + if (scoreboard_if[i].data.wb) begin + inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1; `ifdef PERF_ENABLE - inuse_units[staging_if.data.wis][staging_if.data.rd] <= staging_if.data.ex_type; + inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type; `endif end valid_out_r <= 0; end end if (~valid_out_r) begin - data_out_r <= ibuffer_if[i].data; + data_out_r <= data_in; end end - assign ibuffer_if[i].ready = ~valid_out_r && deps_ready; - assign staging_if.valid = valid_out_r; - assign staging_if.data = data_out_r; - - VX_elastic_buffer #( - .DATAW (DATAW), - .SIZE (0), - .OUT_REG (2) - ) out_buf ( - .clk (clk), - .reset (reset), - .valid_in (staging_if.valid), - .ready_in (staging_if.ready), - .data_in (staging_if.data), - .data_out (scoreboard_if[i].data), - .valid_out (scoreboard_if[i].valid), - .ready_out (scoreboard_if[i].ready) - ); + assign ibuffer_if[i].ready = ready_in; + assign scoreboard_if[i].valid = valid_out_r; + assign scoreboard_if[i].data = data_out_r; `ifdef SIMULATION reg [31:0] timeout_ctr;