diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 19ff353b..3e59f28d 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -42,7 +42,8 @@ module VX_instr_demux ( wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), + .OUT_REG (1) ) alu_buffer ( .clk (clk), .reset (reset), @@ -61,7 +62,8 @@ module VX_instr_demux ( wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)), + .OUT_REG (1) ) lsu_buffer ( .clk (clk), .reset (reset), @@ -82,7 +84,8 @@ module VX_instr_demux ( wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid]; VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32), + .OUT_REG (1) ) csr_buffer ( .clk (clk), .reset (reset), @@ -101,7 +104,8 @@ module VX_instr_demux ( wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), + .OUT_REG (1) ) fpu_buffer ( .clk (clk), .reset (reset), @@ -123,7 +127,8 @@ module VX_instr_demux ( wire [31:0] gpu_rs2_data = gpr_rsp_if.rs2_data[tid]; VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS 
+ 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)), + .OUT_REG (1) ) gpu_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 06065d5d..a889216e 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -207,7 +207,7 @@ module VX_mem_unit # ( .DATA_SIZE (4), .TAG_IN_WIDTH (`DCORE_TAG_WIDTH), .TYPE ("P"), - .BUFFERED_REQ (1), + .BUFFERED_REQ (2), .BUFFERED_RSP (1) ) smem_arb ( .clk (clk), @@ -319,7 +319,7 @@ module VX_mem_unit # ( .TYPE ("R"), .TAG_SEL_IDX (1), // Skip 0 for NC flag .BUFFERED_REQ (1), - .BUFFERED_RSP (1) + .BUFFERED_RSP (2) ) mem_arb ( .clk (clk), .reset (mem_arb_reset), diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index 6e728b06..1ef81393 100644 --- a/hw/rtl/afu/vortex_afu.sv +++ b/hw/rtl/afu/vortex_afu.sv @@ -880,7 +880,7 @@ assign cci_mem_req_tag = cci_mem_req_rw ? 
cci_mem_wr_req_ctr : cci_mem_rd_req_ assign cmd_run_done = !vx_busy; -Vortex #() vortex ( +Vortex vortex ( `SCOPE_BIND_afu_vortex .clk (clk), diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index f1ece0e3..3577d3e6 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -148,7 +148,7 @@ module VX_bank #( wire [NUM_PORTS-1:0][`REQS_BITS-1:0] req_tid_st0, req_tid_st1; wire [NUM_PORTS-1:0] pmask_st0, pmask_st1; wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] tag_st0, tag_st1; - wire [`CACHE_LINE_WIDTH-1:0] rdata_st1; + wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] rdata_st1; wire [`CACHE_LINE_WIDTH-1:0] wdata_st0, wdata_st1; wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1; wire valid_st0, valid_st1; @@ -305,46 +305,15 @@ module VX_bank #( wire mreq_push_st1 = (read_st1 && miss_st1 && !mshr_pending_st1) || write_st1; - wire [`CACHE_LINE_WIDTH-1:0] line_wdata_st1; - wire [CACHE_LINE_SIZE-1:0] line_byteen_st1; - wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] creq_data_st1 = wdata_st1[0 +: NUM_PORTS * `WORD_WIDTH]; - if (`WORDS_PER_LINE > 1) begin - reg [`CACHE_LINE_WIDTH-1:0] line_wdata_r; - reg [CACHE_LINE_SIZE-1:0] line_byteen_r; - if (NUM_PORTS > 1) begin - always @(*) begin - line_wdata_r = 'x; - line_byteen_r = 0; - for (integer i = 0; i < NUM_PORTS; ++i) begin - if (pmask_st1[i]) begin - line_wdata_r[wsel_st1[i] * `WORD_WIDTH +: `WORD_WIDTH] = creq_data_st1[i]; - line_byteen_r[wsel_st1[i] * WORD_SIZE +: WORD_SIZE] = byteen_st1[i]; - end - end - end - end else begin - always @(*) begin - line_wdata_r = {`WORDS_PER_LINE{creq_data_st1}}; - line_byteen_r = 0; - line_byteen_r[wsel_st1 * WORD_SIZE +: WORD_SIZE] = byteen_st1; - end - end - assign line_wdata_st1 = line_wdata_r; - assign line_byteen_st1 = line_byteen_r; - end else begin - `UNUSED_VAR (wsel_st1) - assign line_wdata_st1 = creq_data_st1; - assign line_byteen_st1 = byteen_st1; - end - VX_data_access #( .BANK_ID (BANK_ID), .CACHE_ID (CACHE_ID), .CACHE_SIZE (CACHE_SIZE), .CACHE_LINE_SIZE(CACHE_LINE_SIZE), 
.NUM_BANKS (NUM_BANKS), + .NUM_PORTS (NUM_PORTS), .WORD_SIZE (WORD_SIZE), .WRITE_ENABLE (WRITE_ENABLE) ) data_access ( @@ -359,6 +328,8 @@ module VX_bank #( .stall (crsq_stall), .addr (addr_st1), + .wsel (wsel_st1), + .pmask (pmask_st1), // reading .readen (valid_st1 && read_st1), @@ -367,8 +338,8 @@ module VX_bank #( // writing .writeen (valid_st1 && writeen_st1), .is_fill (is_fill_st1), - .byteen (line_byteen_st1), - .write_data (line_wdata_st1), + .byteen (byteen_st1), + .write_data (creq_data_st1), .fill_data (wdata_st1) ); @@ -454,16 +425,9 @@ module VX_bank #( assign crsq_pmask = pmask_st1; assign crsq_tid = req_tid_st1; + assign crsq_data = rdata_st1; assign crsq_tag = tag_st1; - if (`WORDS_PER_LINE > 1) begin - for (genvar i = 0; i < NUM_PORTS; ++i) begin - assign crsq_data[i] = rdata_st1[wsel_st1[i] * `WORD_WIDTH +: `WORD_WIDTH]; - end - end else begin - assign crsq_data = rdata_st1; - end - VX_elastic_buffer #( .DATAW (NUM_PORTS * (CORE_TAG_WIDTH + 1 + `WORD_WIDTH + `REQS_BITS)), .SIZE (CRSQ_SIZE), diff --git a/hw/rtl/cache/VX_data_access.v b/hw/rtl/cache/VX_data_access.v index 36f33938..a504078a 100644 --- a/hw/rtl/cache/VX_data_access.v +++ b/hw/rtl/cache/VX_data_access.v @@ -9,10 +9,14 @@ module VX_data_access #( parameter CACHE_LINE_SIZE = 1, // Number of banks parameter NUM_BANKS = 1, + // Number of ports per banks + parameter NUM_PORTS = 1, // Size of a word in bytes parameter WORD_SIZE = 1, // Enable cache writeable - parameter WRITE_ENABLE = 1 + parameter WRITE_ENABLE = 1, + + localparam WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS) ) ( input wire clk, input wire reset, @@ -30,15 +34,18 @@ module VX_data_access #( input wire[`LINE_ADDR_WIDTH-1:0] addr, `IGNORE_UNUSED_END + input wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] wsel, + input wire [NUM_PORTS-1:0] pmask, + // reading input wire readen, - output wire [`CACHE_LINE_WIDTH-1:0] read_data, + output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] read_data, // writing input wire writeen, input wire is_fill, - 
input wire [CACHE_LINE_SIZE-1:0] byteen, - input wire [`CACHE_LINE_WIDTH-1:0] write_data, + input wire [NUM_PORTS-1:0][WORD_SIZE-1:0] byteen, /* one WORD_SIZE-bit byte-enable mask per port, indexed as byteen[i] in the write-merge loop */ + input wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] write_data, input wire [`CACHE_LINE_WIDTH-1:0] fill_data ); @@ -50,25 +57,58 @@ module VX_data_access #( localparam BYTEENW = WRITE_ENABLE ? CACHE_LINE_SIZE : 1; - wire [`LINE_SELECT_BITS-1:0] line_addr; + wire [`CACHE_LINE_WIDTH-1:0] rdata; wire [`CACHE_LINE_WIDTH-1:0] wdata; wire [BYTEENW-1:0] wren; - - assign line_addr = addr[`LINE_SELECT_BITS-1:0]; + + wire [`LINE_SELECT_BITS-1:0] line_addr = addr[`LINE_SELECT_BITS-1:0]; if (WRITE_ENABLE) begin - assign wren = is_fill ? {BYTEENW{writeen}} : (byteen & {BYTEENW{writeen}}); - assign wdata = is_fill ? fill_data : write_data; - end else begin + wire [`CACHE_LINE_WIDTH-1:0] line_wdata; + wire [CACHE_LINE_SIZE-1:0] line_byteen; + if (`WORDS_PER_LINE > 1) begin + reg [`CACHE_LINE_WIDTH-1:0] line_wdata_r; + reg [CACHE_LINE_SIZE-1:0] line_byteen_r; + if (NUM_PORTS > 1) begin + always @(*) begin + line_wdata_r = 'x; + line_byteen_r = 0; + for (integer i = 0; i < NUM_PORTS; ++i) begin + if (pmask[i]) begin + line_wdata_r[wsel[i] * `WORD_WIDTH +: `WORD_WIDTH] = write_data[i]; + line_byteen_r[wsel[i] * WORD_SIZE +: WORD_SIZE] = byteen[i]; + end + end + end + end else begin + `UNUSED_VAR (pmask) + always @(*) begin + line_wdata_r = {`WORDS_PER_LINE{write_data}}; + line_byteen_r = 0; + line_byteen_r[wsel * WORD_SIZE +: WORD_SIZE] = byteen; + end + end + assign line_wdata = line_wdata_r; + assign line_byteen = line_byteen_r; + end else begin + `UNUSED_VAR (wsel) + `UNUSED_VAR (pmask) + assign line_wdata = write_data; + assign line_byteen = byteen; + end + assign wren = is_fill ? {BYTEENW{writeen}} : ({BYTEENW{writeen}} & line_byteen); + assign wdata = is_fill ? 
fill_data : line_wdata; + end else begin `UNUSED_VAR (is_fill) - `UNUSED_VAR (byteen) + `UNUSED_VAR (byteen) + `UNUSED_VAR (pmask) `UNUSED_VAR (write_data) assign wren = writeen; assign wdata = fill_data; end VX_sp_ram #( - .DATAW (CACHE_LINE_SIZE * 8), + .DATAW (`CACHE_LINE_WIDTH), .SIZE (`LINES_PER_BANK), .BYTEENW (BYTEENW), .NO_RWCHECK (1) @@ -78,9 +118,17 @@ module VX_data_access #( .wren (wren), .wdata (wdata), .rden (1'b1), - .rdata (read_data) + .rdata (rdata) ); + if (`WORDS_PER_LINE > 1) begin + for (genvar i = 0; i < NUM_PORTS; ++i) begin + assign read_data[i] = rdata[wsel[i] * `WORD_WIDTH +: `WORD_WIDTH]; /* port i returns the word of the line selected by wsel[i]; indexing avoids every iteration driving the whole read_data bus */ + end + end else begin + assign read_data = rdata; + end + `UNUSED_VAR (stall) `ifdef DBG_PRINT_CACHE_DATA diff --git a/hw/rtl/fp_cores/VX_fp_ncomp.v b/hw/rtl/fp_cores/VX_fp_ncomp.v index 49f0da77..df6c6b38 100644 --- a/hw/rtl/fp_cores/VX_fp_ncomp.v +++ b/hw/rtl/fp_cores/VX_fp_ncomp.v @@ -100,7 +100,7 @@ module VX_fp_ncomp #( VX_pipe_register #( .DATAW (1 + TAGW + `INST_FPU_BITS + `INST_FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_class_t) + 1 + 1)), .RESETW (1), - .DEPTH (1) + .DEPTH (0) ) pipe_reg0 ( .clk (clk), .reset (reset), diff --git a/hw/rtl/libs/VX_stream_arbiter.v b/hw/rtl/libs/VX_stream_arbiter.v index 0f23b25d..1c89bb4f 100644 --- a/hw/rtl/libs/VX_stream_arbiter.v +++ b/hw/rtl/libs/VX_stream_arbiter.v @@ -98,31 +98,13 @@ module VX_stream_arbiter #( if (LANES > 1) begin wire [NUM_REQS-1:0][(LANES * (1 + DATAW))-1:0] valid_data_in; - for (genvar i = 0; i < NUM_REQS; i++) begin assign valid_data_in[i] = {valid_in[i], data_in[i]}; end - - VX_mux #( - .DATAW (LANES * (1 + DATAW)), - .N (NUM_REQS) - ) data_in_mux ( - .data_in (valid_data_in), - .sel_in (sel_index), - .data_out 
(data_in_sel) - ); - + assign data_in_sel = data_in[sel_index]; assign valid_in_sel = sel_valid; end