From 3f8c28c7d6d270925ed4eb97b0d17ddd685f9c34 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Thu, 5 Sep 2024 16:49:01 -0700 Subject: [PATCH] sync rf, x0 fix --- hw/VX_config.h | 2 +- hw/rtl/VX_config.vh | 4 +- hw/rtl/VX_platform.vh | 4 +- hw/rtl/core/VX_operands.sv | 15 ++- hw/rtl/core/VX_operands_dup.sv | 116 ++++++++++++++----- hw/rtl/libs/VX_dp_ram.sv | 200 +++++++++++++++++---------------- 6 files changed, 203 insertions(+), 138 deletions(-) diff --git a/hw/VX_config.h b/hw/VX_config.h index f809ab82..c5d0caec 100644 --- a/hw/VX_config.h +++ b/hw/VX_config.h @@ -84,7 +84,7 @@ #endif #ifndef NUM_CORES -#define NUM_CORES 4 +#define NUM_CORES 8 #endif #ifndef NUM_WARPS diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 69594848..a9ff2742 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -83,7 +83,7 @@ `endif `ifndef NUM_CORES -`define NUM_CORES 4 +`define NUM_CORES 8 `endif `ifndef NUM_WARPS @@ -179,7 +179,7 @@ `endif `ifndef SMEM_LOG_SIZE -`define SMEM_LOG_SIZE 17 +`define SMEM_LOG_SIZE 19 `endif `ifndef IO_BASE_ADDR diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index ce28a9c0..46765eb0 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -33,7 +33,7 @@ `ifdef SYNTHESIS `define NUM_BARRIERS 8 -`define NUM_CORES 4 +`define NUM_CORES 8 `define NUM_THREADS 8 `define NUM_WARPS 8 @@ -60,6 +60,8 @@ `endif `ifdef SYNTHESIS +`define TRACE(level, args) $write args +`define TRACE_STARTTIME 32'd10 `define TRACING_ON `define TRACING_OFF `ifndef NDEBUG diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 8c7504a4..fba3f861 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -53,7 +53,7 @@ module VX_operands import VX_gpu_pkg::*; #( reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n; - reg [STATE_BITS-1:0] state, state_n; + reg [STATE_BITS-1:0] state, state_n, state_p; reg [`NR_BITS-1:0] rs2, rs2_n; reg [`NR_BITS-1:0] rs3, 
rs3_n; reg rs2_ready, rs2_ready_n; @@ -175,10 +175,12 @@ module VX_operands import VX_gpu_pkg::*; #( always @(posedge clk) begin if (reset) begin state <= STATE_IDLE; + state_p <= STATE_IDLE; cache_eop <= {ISSUE_RATIO{1'b1}}; data_ready <= 0; end else begin state <= state_n; + state_p <= state; cache_eop <= cache_eop_n; data_ready <= data_ready_n; end @@ -190,7 +192,7 @@ module VX_operands import VX_gpu_pkg::*; #( rs3 <= rs3_n; rs1_data <= rs1_data_n; rs2_data <= rs2_data_n; - rs3_data <= rs3_data_n; + rs3_data <= rs3_data_n; cache_data <= cache_data_n; cache_reg <= cache_reg_n; cache_tmask <= cache_tmask_n; @@ -242,9 +244,9 @@ module VX_operands import VX_gpu_pkg::*; #( .ready_out (operands_if[i].ready) ); - assign operands_if[i].data.rs1_data = rs1_data; - assign operands_if[i].data.rs2_data = rs2_data; - assign operands_if[i].data.rs3_data = rs3_data; + assign operands_if[i].data.rs1_data = (state_p == STATE_FETCH1) ? gpr_rd_data : rs1_data; + assign operands_if[i].data.rs2_data = (state_p == STATE_FETCH2) ? gpr_rd_data : rs2_data; + assign operands_if[i].data.rs3_data = (state_p == STATE_FETCH3) ? 
gpr_rd_data : rs3_data; // GPR banks @@ -279,7 +281,8 @@ module VX_operands import VX_gpu_pkg::*; #( .INIT_ENABLE (1), .INIT_VALUE (0), `endif - .NO_RWCHECK (1) + .NO_RWCHECK (1), + .OUT_REG (1) ) gpr_ram ( .clk (clk), .read (1'b1), diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv index a90efab3..6e2af3d7 100644 --- a/hw/rtl/core/VX_operands_dup.sv +++ b/hw/rtl/core/VX_operands_dup.sv @@ -35,18 +35,26 @@ module VX_operands_dup import VX_gpu_pkg::*; #( logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_write_per_warp; `endif + logic [`ISSUE_WIDTH-1:0][DATAW-1:0] scoreboard_if_stored; + logic [`ISSUE_WIDTH-1:0] scoreboard_if_stored_valid; + logic [`ISSUE_WIDTH-1:0] full1; + logic [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] full2; + logic [`ISSUE_WIDTH-1:0] empty1; + logic [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] empty2; + logic [`ISSUE_WIDTH-1:0][2:0] size1; + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - VX_stream_buffer #( - .DATAW (DATAW) - ) staging_buffer ( - .clk (clk), - .reset (reset), - .valid_in (scoreboard_if[i].valid), - .data_in ({ + + always @(posedge clk) begin + if (reset) begin + scoreboard_if_stored[i] <= '0; + scoreboard_if_stored_valid[i] <= '0; + end else begin + scoreboard_if_stored[i] <= { scoreboard_if[i].data.uuid, scoreboard_if[i].data.wis, scoreboard_if[i].data.tmask, - scoreboard_if[i].data.PC, + scoreboard_if[i].data.PC, scoreboard_if[i].data.wb, scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type, @@ -55,14 +63,27 @@ module VX_operands_dup import VX_gpu_pkg::*; #( scoreboard_if[i].data.use_imm, scoreboard_if[i].data.imm, scoreboard_if[i].data.rd - }), - .ready_in (scoreboard_if[i].ready), - .valid_out (operands_if[i].valid), - .data_out ({ + }; + scoreboard_if_stored_valid[i] <= scoreboard_if[i].valid && scoreboard_if[i].ready; + end + end + + VX_fifo_queue #( + .DATAW (DATAW), + .DEPTH (4), // could be 3 but limited by power of 2 + .OUT_REG (0), + .LUTRAM (0) + ) fifo_queue ( + .clk (clk), + .reset 
(reset), + .push (scoreboard_if_stored_valid[i]), + .pop (operands_if[i].ready && ~empty1[i]), + .data_in (scoreboard_if_stored[i]), + .data_out ({ operands_if[i].data.uuid, operands_if[i].data.wis, operands_if[i].data.tmask, - operands_if[i].data.PC, + operands_if[i].data.PC, operands_if[i].data.wb, operands_if[i].data.ex_type, operands_if[i].data.op_type, @@ -72,31 +93,52 @@ module VX_operands_dup import VX_gpu_pkg::*; #( operands_if[i].data.imm, operands_if[i].data.rd }), - .ready_out (operands_if[i].ready) + .empty (empty1[i]), + .full (full1[i]), + `UNUSED_PIN (alm_empty), + `UNUSED_PIN (alm_full), + .size (size1[i]) ); + assign operands_if[i].valid = ~empty1[i]; + assign scoreboard_if[i].ready = (size1[i] < 2'd2); + + // assert (full1[i] == full2[i]); + // assert (empty1[i] == empty2[i]); wire [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data; wire [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data; wire [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data; + reg [RAM_ADDRW-1:0] gpr_rd_addr_rs1_stored; + reg [RAM_ADDRW-1:0] gpr_rd_addr_rs2_stored; + reg [RAM_ADDRW-1:0] gpr_rd_addr_rs3_stored; + for (genvar j = 0; j < `NUM_THREADS; ++j) begin - VX_stream_buffer #( - .DATAW (`XLEN + `XLEN + `XLEN) - ) staging_data_buffer ( - .clk (clk), - .reset (reset), - .valid_in (scoreboard_if[i].valid), - .data_in ({ - rs1_data[j], rs2_data[j], rs3_data[j] + VX_fifo_queue #( + .DATAW (`XLEN + `XLEN + `XLEN), + .DEPTH (4), + .OUT_REG (0), + .LUTRAM (0) + ) fifo_queue ( + .clk (clk), + .reset (reset), + .push (scoreboard_if_stored_valid[i]), + .pop (operands_if[i].ready && ~empty2[i][0]), + .data_in ({ /* x0 reads as zero: test only the register-index bits so a {wis, rs} address matches for every warp */ + (gpr_rd_addr_rs1_stored[`NR_BITS-1:0] == '0) ? 32'd0 : rs1_data[j], + (gpr_rd_addr_rs2_stored[`NR_BITS-1:0] == '0) ? 32'd0 : rs2_data[j], + (gpr_rd_addr_rs3_stored[`NR_BITS-1:0] == '0) ? 
32'd0 : rs3_data[j] }), - `UNUSED_PIN (ready_in), - `UNUSED_PIN (valid_out), - .data_out ({ - operands_if[i].data.rs1_data[j], - operands_if[i].data.rs2_data[j], - operands_if[i].data.rs3_data[j] + .data_out ({ + operands_if[i].data.rs1_data[j], + operands_if[i].data.rs2_data[j], + operands_if[i].data.rs3_data[j] }), - .ready_out (operands_if[i].ready) + .empty (empty2[i][j]), + .full (full2[i][j]), + `UNUSED_PIN (alm_empty), + `UNUSED_PIN (alm_full), + `UNUSED_PIN (size) ); end @@ -106,6 +148,19 @@ module VX_operands_dup import VX_gpu_pkg::*; #( wire [RAM_ADDRW-1:0] gpr_rd_addr_rs2; wire [RAM_ADDRW-1:0] gpr_rd_addr_rs3; wire [RAM_ADDRW-1:0] gpr_wr_addr; + + always @(posedge clk) begin + if (reset) begin + gpr_rd_addr_rs1_stored <= '0; + gpr_rd_addr_rs2_stored <= '0; + gpr_rd_addr_rs3_stored <= '0; + end else begin + gpr_rd_addr_rs1_stored <= gpr_rd_addr_rs1; + gpr_rd_addr_rs2_stored <= gpr_rd_addr_rs2; + gpr_rd_addr_rs3_stored <= gpr_rd_addr_rs3; + end + end + if (ISSUE_WIS != 0) begin assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd}; assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1}; @@ -165,6 +220,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( VX_dp_ram #( .DATAW (`XLEN), .SIZE (`NUM_REGS * ISSUE_RATIO), + .OUT_REG (1), `ifdef GPR_RESET .INIT_ENABLE (1), .INIT_VALUE (0), @@ -188,6 +244,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( VX_dp_ram #( .DATAW (`XLEN), .SIZE (`NUM_REGS * ISSUE_RATIO), + .OUT_REG (1), `ifdef GPR_RESET .INIT_ENABLE (1), .INIT_VALUE (0), @@ -211,6 +268,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( VX_dp_ram #( .DATAW (`XLEN), .SIZE (`NUM_REGS * ISSUE_RATIO), + .OUT_REG (1), `ifdef GPR_RESET .INIT_ENABLE (1), .INIT_VALUE (0), diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index 020fc5f1..49f35cc8 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -161,75 +161,76 @@ module VX_dp_ram #( end end else begin `ifndef FIRESIM - if (DATAW 
== 1024 && SIZE == 16) begin // dcache data - (* dont_touch = "yes" *) dcache_data ram ( + // if (DATAW == 1024 && SIZE == 16) begin // dcache data + // (* dont_touch = "yes" *) dcache_data ram ( + // .R0_addr(raddr), + // .R0_clk(clk), + // .R0_data(/*rdata*/), + // .R0_en(read), + // .W0_addr(waddr), + // .W0_clk(clk), + // .W0_data(wdata), + // .W0_en(write), + // .W0_mask(wren) + // ); + // end else if (DATAW == 305 && SIZE == 8) begin // mshr + // (* dont_touch = "yes" *) cache_mshr ram ( + // .R0_addr(raddr), + // .R0_clk(clk), + // .R0_data(/*rdata*/), + // .R0_en(read), + // .W0_addr(waddr), + // .W0_clk(clk), + // .W0_data(wdata), + // .W0_en(write) + // ); + // end else if (DATAW == 24 && SIZE == 16) begin // dcache tags + // (* dont_touch = "yes" *) dcache_tags ram ( + // .R0_addr(raddr), + // .R0_clk(clk), + // .R0_data(/*rdata*/), + // .R0_en(read), + // .W0_addr(waddr), + // .W0_clk(clk), + // .W0_data(wdata), + // .W0_en(write) + // ); + // end else if (DATAW == 1024 && SIZE == 128) begin // icache data + // (* dont_touch = "yes" *) icache_data ram ( + // .R0_addr(raddr), + // .R0_clk(clk), + // .R0_data(/*rdata*/), + // .R0_en(read), + // .W0_addr(waddr), + // .W0_clk(clk), + // .W0_data(wdata), + // .W0_en(write), + // .W0_mask(wren) + // ); + // end else if (DATAW == 21 && SIZE == 128) begin // icache tags + // (* dont_touch = "yes" *) icache_tags ram ( + // .R0_addr(raddr), + // .R0_clk(clk), + // .R0_data(/*rdata*/), + // .R0_en(read), + // .W0_addr(waddr), + // .W0_clk(clk), + // .W0_data(wdata), + // .W0_en(write) + // ); + // end else if (DATAW == 32 && SIZE == 64) begin // register file + if (DATAW == 32 && SIZE == 64) begin // register file + rf_bank ram ( .R0_addr(raddr), .R0_clk(clk), - .R0_data(/*rdata*/), - .R0_en(read), - .W0_addr(waddr), - .W0_clk(clk), - .W0_data(wdata), - .W0_en(write), - .W0_mask(wren) - ); - end else if (DATAW == 305 && SIZE == 8) begin // mshr - (* dont_touch = "yes" *) cache_mshr ram ( - .R0_addr(raddr), - 
.R0_clk(clk), - .R0_data(/*rdata*/), + .R0_data(rdata), .R0_en(read), .W0_addr(waddr), .W0_clk(clk), .W0_data(wdata), .W0_en(write) ); - end else if (DATAW == 24 && SIZE == 16) begin // dcache tags - (* dont_touch = "yes" *) dcache_tags ram ( - .R0_addr(raddr), - .R0_clk(clk), - .R0_data(/*rdata*/), - .R0_en(read), - .W0_addr(waddr), - .W0_clk(clk), - .W0_data(wdata), - .W0_en(write) - ); - end else if (DATAW == 1024 && SIZE == 128) begin // icache data - (* dont_touch = "yes" *) icache_data ram ( - .R0_addr(raddr), - .R0_clk(clk), - .R0_data(/*rdata*/), - .R0_en(read), - .W0_addr(waddr), - .W0_clk(clk), - .W0_data(wdata), - .W0_en(write), - .W0_mask(wren) - ); - end else if (DATAW == 21 && SIZE == 128) begin // icache tags - (* dont_touch = "yes" *) icache_tags ram ( - .R0_addr(raddr), - .R0_clk(clk), - .R0_data(/*rdata*/), - .R0_en(read), - .W0_addr(waddr), - .W0_clk(clk), - .W0_data(wdata), - .W0_en(write) - ); - end else if (DATAW == 32 && SIZE == 64) begin // register file - (* dont_touch = "yes" *) rf_bank ram ( - .R0_addr(raddr), - .R0_clk(clk), - .R0_data(/*rdata*/), - .R0_en(read), - .W0_addr(waddr), - .W0_clk(clk), - .W0_data(wdata), - .W0_en(write) - ); - end // else begin + end else begin `endif if (OUT_REG != 0) begin reg [DATAW-1:0] ram [SIZE-1:0]; @@ -275,7 +276,7 @@ module VX_dp_ram #( end end `ifndef FIRESIM - // end + end `endif end `endif @@ -304,51 +305,52 @@ module VX_dp_ram #( assign rdata = ram[raddr]; end end else begin - if (DATAW == 305 && SIZE == 8) begin // mshr - (* dont_touch = "yes" *) cache_mshr ram ( + // if (DATAW == 305 && SIZE == 8) begin // mshr + // (* dont_touch = "yes" *) cache_mshr ram ( + // .R0_addr(raddr), + // .R0_clk(clk), + // .R0_data(/*rdata*/), + // .R0_en(read), + // .W0_addr(waddr), + // .W0_clk(clk), + // .W0_data(wdata), + // .W0_en(write) + // ); + // end else if (DATAW == 24 && SIZE == 16) begin // dcache tags + // (* dont_touch = "yes" *) dcache_tags ram ( + // .R0_addr(raddr), + // .R0_clk(clk), + // 
.R0_data(/*rdata*/), + // .R0_en(read), + // .W0_addr(waddr), + // .W0_clk(clk), + // .W0_data(wdata), + // .W0_en(write) + // ); + // end else if (DATAW == 21 && SIZE == 128) begin // icache tags + // (* dont_touch = "yes" *) icache_tags ram ( + // .R0_addr(raddr), + // .R0_clk(clk), + // .R0_data(/*rdata*/), + // .R0_en(read), + // .W0_addr(waddr), + // .W0_clk(clk), + // .W0_data(wdata), + // .W0_en(write) + // ); + // end else if (DATAW == 32 && SIZE == 64) begin // register file + if (DATAW == 32 && SIZE == 64) begin // register file + rf_bank ram ( .R0_addr(raddr), .R0_clk(clk), - .R0_data(/*rdata*/), + .R0_data(rdata), .R0_en(read), .W0_addr(waddr), .W0_clk(clk), .W0_data(wdata), .W0_en(write) ); - end else if (DATAW == 24 && SIZE == 16) begin // dcache tags - (* dont_touch = "yes" *) dcache_tags ram ( - .R0_addr(raddr), - .R0_clk(clk), - .R0_data(/*rdata*/), - .R0_en(read), - .W0_addr(waddr), - .W0_clk(clk), - .W0_data(wdata), - .W0_en(write) - ); - end else if (DATAW == 21 && SIZE == 128) begin // icache tags - (* dont_touch = "yes" *) icache_tags ram ( - .R0_addr(raddr), - .R0_clk(clk), - .R0_data(/*rdata*/), - .R0_en(read), - .W0_addr(waddr), - .W0_clk(clk), - .W0_data(wdata), - .W0_en(write) - ); - end else if (DATAW == 32 && SIZE == 64) begin // register file - (* dont_touch = "yes" *) rf_bank ram ( - .R0_addr(raddr), - .R0_clk(clk), - .R0_data(/*rdata*/), - .R0_en(read), - .W0_addr(waddr), - .W0_clk(clk), - .W0_data(wdata), - .W0_en(write) - ); - end // else begin + end else if (OUT_REG != 0) begin reg [DATAW-1:0] ram [SIZE-1:0]; reg [DATAW-1:0] rdata_r;