From 57143f5889e24e07f27c5da95050b9c1ae24f2ca Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 17 Jun 2021 16:43:43 -0700 Subject: [PATCH] synthesis optimizations --- hw/rtl/VX_ibuffer.v | 37 ++++--- hw/rtl/VX_instr_demux.v | 15 +-- hw/rtl/VX_scoreboard.v | 16 ++- hw/rtl/VX_smem_arb.v | 4 +- hw/rtl/afu/vortex_afu.sv | 129 ++++++++++++++----------- hw/rtl/cache/VX_bank.v | 3 +- hw/rtl/cache/VX_cache_core_rsp_merge.v | 6 +- hw/rtl/libs/VX_priority_encoder.v | 2 +- hw/rtl/libs/VX_rr_arbiter.v | 55 ++++++++++- hw/rtl/libs/VX_skid_buffer.v | 2 +- hw/syn/opae/Makefile | 8 +- hw/syn/opae/setup16.cfg | 7 -- hw/syn/opae/setup8.cfg | 7 -- hw/syn/opae/vortex_afu.json | 4 +- hw/syn/opae/vortex_afu16.json | 53 ---------- hw/syn/opae/vortex_afu8.json | 54 ----------- 16 files changed, 173 insertions(+), 229 deletions(-) delete mode 100644 hw/syn/opae/setup16.cfg delete mode 100644 hw/syn/opae/setup8.cfg delete mode 100644 hw/syn/opae/vortex_afu16.json delete mode 100644 hw/syn/opae/vortex_afu8.json diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index 8a7bb4aa..8c6a4d08 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -36,19 +36,16 @@ module VX_ibuffer #( wire writing = enq_fire && (i == ibuf_enq_if.wid); wire reading = deq_fire && (i == ibuf_deq_if.wid); - wire is_slot0 = empty_r[i] || (alm_empty_r[i] && reading); - - wire push = writing && !is_slot0; - wire pop = reading && !alm_empty_r[i]; + wire is_head_ptr = empty_r[i] || (alm_empty_r[i] && reading); VX_skid_buffer #( .DATAW (DATAW) ) queue ( .clk (clk), .reset (reset), - .valid_in (push), + .valid_in (writing && !is_head_ptr), .data_in (q_data_in), - .ready_out(pop), + .ready_out(reading), .data_out (q_data_prev[i]), `UNUSED_PIN (ready_in), `UNUSED_PIN (valid_out) @@ -79,9 +76,9 @@ module VX_ibuffer #( used_r[i] <= used_r[i] + ADDRW'($signed(2'(writing) - 2'(reading))); end - if (writing && is_slot0) begin + if (writing && is_head_ptr) begin q_data_out[i] <= q_data_in; - end else if (pop) begin + 
end else if (reading) begin q_data_out[i] <= q_data_prev[i]; end end @@ -111,26 +108,17 @@ module VX_ibuffer #( end // schedule the next instruction to issue - // do round-robin when multiple warps are active - always @(*) begin - deq_valid_n = 0; - deq_wid_n = 'x; - deq_instr_n = 'x; - schedule_table_n = 'x; - + always @(*) begin + deq_valid_n = 1; if (num_warps > 1) begin - deq_valid_n = (| schedule_table); - schedule_table_n = schedule_table; for (integer i = 0; i < `NUM_WARPS; i++) begin if (schedule_table[i]) begin deq_wid_n = `NW_BITS'(i); deq_instr_n = q_data_out[i]; - schedule_table_n[i] = 0; break; end end end else if (1 == num_warps && !(deq_fire && q_alm_empty[deq_wid])) begin - deq_valid_n = 1; deq_wid_n = deq_wid; deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid]; end else begin @@ -139,6 +127,17 @@ module VX_ibuffer #( deq_instr_n = q_data_in; end end + + // do round-robin with multiple active warps + always @(*) begin + schedule_table_n = schedule_table; + for (integer i = 0; i < `NUM_WARPS; i++) begin + if (schedule_table[i]) begin + schedule_table_n[i] = 0; + break; + end + end + end wire warp_added = enq_fire && q_empty[ibuf_enq_if.wid]; wire warp_removed = deq_fire && ~(enq_fire && ibuf_enq_if.wid == deq_wid) && q_alm_empty[deq_wid]; diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 1bf8711b..36164b94 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -38,8 +38,7 @@ module VX_instr_demux ( wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), - .USE_FASTREG (1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)) ) alu_buffer ( .clk (clk), .reset (reset), @@ -56,8 +55,7 @@ module VX_instr_demux ( wire lsu_req_valid = execute_if.valid && 
(execute_if.ex_type == `EX_LSU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)), - .USE_FASTREG (1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)) ) lsu_buffer ( .clk (clk), .reset (reset), @@ -74,8 +72,7 @@ module VX_instr_demux ( wire csr_req_valid = execute_if.valid && (execute_if.ex_type == `EX_CSR); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32), - .USE_FASTREG (1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32) ) csr_buffer ( .clk (clk), .reset (reset), @@ -93,8 +90,7 @@ module VX_instr_demux ( wire fpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_FPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), - .USE_FASTREG (1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) ) fpu_buffer ( .clk (clk), .reset (reset), @@ -115,8 +111,7 @@ module VX_instr_demux ( wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32)), - .USE_FASTREG (1) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32)) ) gpu_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index 139f0aa1..6b161a1f 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -11,11 +11,17 @@ module VX_scoreboard #( output wire delay ); reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs; - wire [`NUM_REGS-1:0] deq_inuse_regs; - - assign deq_inuse_regs = inuse_regs[ibuf_deq_if.wid] & ibuf_deq_if.used_regs; - assign delay = (| deq_inuse_regs); + reg is_reg_busy; + always @(*) begin + is_reg_busy = 0; + 
for (integer i = 0; i < `NUM_WARPS; ++i) begin + if (ibuf_deq_if.wid == `NW_BITS'(i)) begin + is_reg_busy = | (inuse_regs[i] & ibuf_deq_if.used_regs); + end + end + end + assign delay = is_reg_busy; wire reserve_reg = ibuf_deq_if.valid && ibuf_deq_if.ready && (ibuf_deq_if.wb != 0); @@ -37,6 +43,8 @@ module VX_scoreboard #( end end + wire [`NUM_REGS-1:0] deq_inuse_regs = inuse_regs[ibuf_deq_if.wid]; + `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin diff --git a/hw/rtl/VX_smem_arb.v b/hw/rtl/VX_smem_arb.v index 4e6cdee2..48eb1680 100644 --- a/hw/rtl/VX_smem_arb.v +++ b/hw/rtl/VX_smem_arb.v @@ -32,8 +32,8 @@ module VX_smem_arb ( VX_stream_demux #( .NUM_REQS (2), .DATAW (REQ_DATAW), - .BUFFERED (0) - ) rsp_demux ( + .BUFFERED (1) + ) req_demux ( .clk (clk), .reset (reset), .sel (core_req_if.tag[i][0]), diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index fb49bbe4..1d42af92 100644 --- a/hw/rtl/afu/vortex_afu.sv +++ b/hw/rtl/afu/vortex_afu.sv @@ -53,7 +53,6 @@ localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, CCI_ADDR_WIDTH + $clog2(LME localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI); localparam CCI_RD_WINDOW_SIZE = 8; -localparam CCI_RD_QUEUE_SIZE = 2 * CCI_RD_WINDOW_SIZE; localparam CCI_RW_PENDING_SIZE= 256; localparam AFU_ID_L = 16'h0002; // AFU ID Lower @@ -78,15 +77,15 @@ localparam MMIO_SCOPE_WRITE = `AFU_IMAGE_MMIO_SCOPE_WRITE; localparam MMIO_DEV_CAPS = `AFU_IMAGE_MMIO_DEV_CAPS; -localparam CCI_RD_RQ_TAGW = $clog2(CCI_RD_WINDOW_SIZE); -localparam CCI_RD_RQ_DATAW = CCI_LINE_WIDTH + CCI_RD_RQ_TAGW; +localparam CCI_RD_QUEUE_SIZE = 2 * CCI_RD_WINDOW_SIZE; +localparam CCI_RD_QUEUE_TAGW = $clog2(CCI_RD_WINDOW_SIZE); +localparam CCI_RD_QUEUE_DATAW = CCI_LINE_WIDTH + CCI_ADDR_WIDTH; localparam STATE_IDLE = 0; -localparam STATE_READ = 1; -localparam STATE_WRITE = 2; +localparam STATE_WRITE = 1; +localparam STATE_READ = 2; localparam STATE_START = 3; -localparam STATE_RUN = 4; 
-localparam STATE_MAX_VALUE = 5; +localparam STATE_MAX_VALUE = 4; localparam STATE_WIDTH = $clog2(STATE_MAX_VALUE); `ifdef SCOPE @@ -114,11 +113,9 @@ wire [`VX_MEM_LINE_WIDTH-1:0] vx_mem_rsp_data; wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag; wire vx_mem_rsp_ready; +reg vx_reset; wire vx_busy; -reg vx_reset; -reg vx_mem_en; - // CMD variables ////////////////////////////////////////////////////////////// t_ccip_clAddr cmd_io_addr; @@ -292,8 +289,9 @@ end // COMMAND FSM //////////////////////////////////////////////////////////////// wire cmd_read_done; -wire cmd_write_done; +reg cmd_write_done; wire cmd_run_done; +reg vx_started; reg [$clog2(RESET_DELAY+1)-1:0] vx_reset_ctr; always @(posedge clk) begin @@ -306,9 +304,9 @@ end always @(posedge clk) begin if (reset) begin - state <= STATE_IDLE; + state <= STATE_IDLE; + vx_started <= 0; vx_reset <= 0; - vx_mem_en <= 0; end else begin case (state) STATE_IDLE: begin @@ -358,21 +356,20 @@ always @(posedge clk) begin STATE_START: begin // vortex reset cycles - if (vx_reset_ctr == $bits(vx_reset_ctr)'(RESET_DELAY)) begin - vx_reset <= 0; - vx_mem_en <= 1; - state <= STATE_RUN; - end - end - - STATE_RUN: begin - if (cmd_run_done) begin - vx_mem_en <= 0; - state <= STATE_IDLE; - `ifdef DBG_PRINT_OPAE - $display("%t: STATE IDLE", $time); - `endif - end + if (vx_started) begin + if (cmd_run_done) begin + vx_started <= 0; + state <= STATE_IDLE; + `ifdef DBG_PRINT_OPAE + $display("%t: STATE IDLE", $time); + `endif + end + end else begin + if (vx_reset_ctr == $bits(vx_reset_ctr)'(RESET_DELAY)) begin + vx_started <= 1; + vx_reset <= 0; + end + end end default: begin @@ -387,11 +384,12 @@ end wire cci_mem_rd_req_valid; wire cci_mem_wr_req_valid; -wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_dout; +wire [CCI_RD_QUEUE_DATAW-1:0] cci_rdq_dout; wire cci_mem_req_valid; wire cci_mem_req_rw; wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_addr; +wire [CCI_LINE_WIDTH-1:0] cci_mem_req_data; wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_tag; wire cci_mem_req_ready; @@ 
-430,7 +428,7 @@ VX_to_mem #( .mem_req_addr_in (cci_mem_req_addr), .mem_req_rw_in (cci_mem_req_rw), .mem_req_byteen_in ({CCI_LINE_SIZE{1'b1}}), - .mem_req_data_in (cci_rdq_dout[CCI_RD_RQ_DATAW-1:CCI_RD_RQ_TAGW]), + .mem_req_data_in (cci_mem_req_data), .mem_req_tag_in (cci_mem_req_tag), .mem_req_ready_in (cci_mem_req_ready), @@ -473,7 +471,7 @@ wire vx_mem_req_valid_qual; wire vx_mem_req_ready_qual; assign vx_mem_req_valid_qual = vx_mem_req_valid - && vx_mem_en + && vx_started && ~vx_mem_is_cout; assign vx_mem_req_ready = vx_mem_is_cout ? ~cout_q_full : vx_mem_req_ready_qual; @@ -617,19 +615,20 @@ VX_avs_wrapper #( reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_ctr; wire [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr; -reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr_unqual; -reg [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr; -wire [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr_next; -wire [CCI_RD_RQ_TAGW-1:0] cci_rd_req_tag; -wire [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_tag; -reg [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_ctr; +reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr_base; wire cci_rd_req_fire; t_ccip_clAddr cci_rd_req_addr; reg cci_rd_req_valid, cci_rd_req_wait; +reg [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr; +wire [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr_next; +wire [CCI_RD_QUEUE_TAGW-1:0] cci_rd_req_tag; + +wire [CCI_RD_QUEUE_TAGW-1:0] cci_rd_rsp_tag; +reg [CCI_RD_QUEUE_TAGW-1:0] cci_rd_rsp_ctr; wire cci_rdq_push, cci_rdq_pop; -wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_din; +wire [CCI_RD_QUEUE_DATAW-1:0] cci_rdq_din; wire cci_rdq_empty; always @(*) begin @@ -641,16 +640,15 @@ end wire cci_mem_wr_req_fire = cci_mem_wr_req_valid && cci_mem_req_ready; -wire cci_rd_rsp_fire = (STATE_WRITE == state) - && cp2af_sRxPort.c0.rspValid +wire cci_rd_rsp_fire = cp2af_sRxPort.c0.rspValid && (cp2af_sRxPort.c0.hdr.resp_type == eRSP_RDLINE); -assign cci_rd_req_tag = CCI_RD_RQ_TAGW'(cci_rd_req_ctr); -assign cci_rd_rsp_tag = CCI_RD_RQ_TAGW'(cp2af_sRxPort.c0.hdr.mdata); +assign cci_rd_req_tag = CCI_RD_QUEUE_TAGW'(cci_rd_req_ctr); +assign 
cci_rd_rsp_tag = CCI_RD_QUEUE_TAGW'(cp2af_sRxPort.c0.hdr.mdata); assign cci_rdq_push = cci_rd_rsp_fire; assign cci_rdq_pop = cci_mem_wr_req_fire; -assign cci_rdq_din = {cp2af_sRxPort.c0.data, cci_rd_rsp_tag}; +assign cci_rdq_din = {cp2af_sRxPort.c0.data, cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(cci_rd_rsp_tag)}; wire [$clog2(CCI_RD_QUEUE_SIZE+1)-1:0] cci_pending_reads; wire cci_pending_reads_full; @@ -673,9 +671,7 @@ assign cci_rd_req_fire = cci_rd_req_valid && !(cci_rd_req_wait || cci_pending_re assign cci_mem_wr_req_valid = !cci_rdq_empty; -assign cci_mem_wr_req_addr = cci_mem_wr_req_addr_unqual + (CCI_ADDR_WIDTH'(CCI_RD_RQ_TAGW'(cci_rdq_dout))); - -assign cmd_write_done = (cci_mem_wr_req_ctr == cmd_data_size); +assign cci_mem_wr_req_addr = cci_rdq_dout[CCI_ADDR_WIDTH-1:0]; // Send read requests to CCI always @(posedge clk) begin @@ -693,11 +689,11 @@ always @(posedge clk) begin && (cci_rd_req_ctr_next != cmd_data_size) && !cp2af_sRxPort.c0TxAlmFull; - if (cci_rd_req_fire && (cci_rd_req_tag == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin + if (cci_rd_req_fire && (cci_rd_req_tag == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin cci_rd_req_wait <= 1; // end current request batch end - if (cci_rd_rsp_fire && (cci_rd_rsp_ctr == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin + if (cci_rd_rsp_fire && (cci_rd_rsp_ctr == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin cci_rd_req_wait <= 0; // begin new request batch end end @@ -708,7 +704,8 @@ always @(posedge clk) begin cci_rd_req_ctr <= 0; cci_rd_rsp_ctr <= 0; cci_mem_wr_req_ctr <= 0; - cci_mem_wr_req_addr_unqual <= cmd_mem_addr; + cci_mem_wr_req_addr_base <= cmd_mem_addr; + cmd_write_done <= 0; end if (cci_rd_req_fire) begin @@ -720,7 +717,7 @@ always @(posedge clk) begin end if (cci_rd_rsp_fire) begin - cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_RQ_TAGW'(1); + cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_QUEUE_TAGW'(1); `ifdef DBG_PRINT_OPAE $display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h", $time, 
cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data); `endif @@ -733,13 +730,18 @@ always @(posedge clk) begin end if (cci_mem_wr_req_fire) begin - cci_mem_wr_req_addr_unqual <= cci_mem_wr_req_addr_unqual + ((CCI_RD_RQ_TAGW'(cci_mem_wr_req_ctr) == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) ? CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : CCI_ADDR_WIDTH'(0)); - cci_mem_wr_req_ctr <= cci_mem_wr_req_ctr + CCI_ADDR_WIDTH'(1); + if (CCI_RD_QUEUE_TAGW'(cci_mem_wr_req_ctr) == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin + cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE); + end + cci_mem_wr_req_ctr <= cci_mem_wr_req_ctr + CCI_ADDR_WIDTH'(1); + if (cci_mem_wr_req_ctr == (cmd_data_size-1)) begin + cmd_write_done <= 1; + end end end VX_fifo_queue #( - .DATAW (CCI_RD_RQ_DATAW), + .DATAW (CCI_RD_QUEUE_DATAW), .SIZE (CCI_RD_QUEUE_SIZE) ) cci_rd_req_queue ( .clk (clk), @@ -779,11 +781,13 @@ VX_fifo_queue #( reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_ctr; reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_addr; -reg [CCI_ADDR_WIDTH-1:0] cci_wr_req_ctr; +reg cci_mem_rd_req_done; +reg [CCI_ADDR_WIDTH-1:0] cci_wr_req_ctr; reg cci_wr_req_fire; t_ccip_clAddr cci_wr_req_addr; t_ccip_clData cci_wr_req_data; +reg cci_wr_req_done; always @(*) begin af2cp_sTxPort.c1.valid = cci_wr_req_fire; @@ -818,12 +822,12 @@ VX_pending_size #( `UNUSED_VAR (cci_pending_writes) assign cci_mem_rd_req_valid = (STATE_READ == state) - && (cci_mem_rd_req_ctr != cmd_data_size); + && !cci_mem_rd_req_done; assign cci_mem_rsp_ready = !cp2af_sRxPort.c1TxAlmFull && !cci_pending_writes_full; -assign cmd_read_done = (0 == cci_wr_req_ctr) +assign cmd_read_done = cci_wr_req_done && cci_pending_writes_empty; // Send write requests to CCI @@ -839,12 +843,17 @@ begin && (CMD_MEM_READ == cmd_type)) begin cci_mem_rd_req_ctr <= 0; cci_mem_rd_req_addr <= cmd_mem_addr; + cci_mem_rd_req_done <= 0; cci_wr_req_ctr <= cmd_data_size; + cci_wr_req_done <= 0; end if (cci_mem_rd_req_fire) begin cci_mem_rd_req_addr 
<= cci_mem_rd_req_addr + CCI_ADDR_WIDTH'(1); cci_mem_rd_req_ctr <= cci_mem_rd_req_ctr + CCI_ADDR_WIDTH'(1); + if (cci_mem_rd_req_ctr == (cmd_data_size-1)) begin + cci_mem_rd_req_done <= 1; + end end cci_wr_req_addr <= cmd_io_addr + t_ccip_clAddr'(cci_mem_rsp_tag); @@ -853,6 +862,9 @@ begin if (cci_wr_req_fire) begin assert(cci_wr_req_ctr != 0); cci_wr_req_ctr <= cci_wr_req_ctr - CCI_ADDR_WIDTH'(1); + if (cci_wr_req_ctr == CCI_ADDR_WIDTH'(1)) begin + cci_wr_req_done <= 1; + end `ifdef DBG_PRINT_OPAE $display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data); `endif @@ -867,9 +879,10 @@ end //-- -assign cci_mem_req_rw = (CMD_MEM_WRITE == state); +assign cci_mem_req_rw = state[0]; // STATE_READ=10, STATE_WRITE=01 assign cci_mem_req_valid = cci_mem_req_rw ? cci_mem_wr_req_valid : cci_mem_rd_req_valid; assign cci_mem_req_addr = cci_mem_req_rw ? cci_mem_wr_req_addr : cci_mem_rd_req_addr; +assign cci_mem_req_data = cci_rdq_dout[CCI_RD_QUEUE_DATAW-1:CCI_ADDR_WIDTH]; assign cci_mem_req_tag = cci_mem_req_rw ? 
cci_mem_wr_req_ctr : cci_mem_rd_req_ctr; // Vortex ///////////////////////////////////////////////////////////////////// @@ -920,7 +933,7 @@ assign cout_char = vx_mem_req_data_ar[cout_tid]; assign vx_mem_is_cout = (vx_mem_req_addr == `VX_MEM_ADDR_WIDTH'(`IO_COUT_ADDR >> (32 - `VX_MEM_ADDR_WIDTH))); wire cout_q_push = vx_mem_req_valid - && vx_mem_en + && vx_started && vx_mem_is_cout && ~cout_q_full; diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 5f1b80d3..cc953f8c 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -475,8 +475,7 @@ module VX_bank #( end VX_skid_buffer #( - .DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS), - .USE_FASTREG (NUM_BANKS == 1) + .DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS) ) core_rsp_req ( .clk (clk), .reset (reset), diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index 1ffbaa23..71a8b85e 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -106,8 +106,7 @@ module VX_cache_core_rsp_merge #( wire core_rsp_valid_any = (| per_bank_core_rsp_valid); VX_skid_buffer #( - .DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)), - .USE_FASTREG (1) + .DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)) ) pipe_reg ( .clk (clk), .reset (reset), @@ -155,8 +154,7 @@ module VX_cache_core_rsp_merge #( for (genvar i = 0; i < NUM_REQS; i++) begin VX_skid_buffer #( - .DATAW (CORE_TAG_WIDTH + `WORD_WIDTH), - .USE_FASTREG (1) + .DATAW (CORE_TAG_WIDTH + `WORD_WIDTH) ) pipe_reg ( .clk (clk), .reset (reset), diff --git a/hw/rtl/libs/VX_priority_encoder.v b/hw/rtl/libs/VX_priority_encoder.v index 4810fe0e..57e6ff69 100644 --- a/hw/rtl/libs/VX_priority_encoder.v +++ b/hw/rtl/libs/VX_priority_encoder.v @@ -48,7 +48,7 @@ module VX_priority_encoder #( VX_onehot_encoder #( .N (N), .REVERSE (REVERSE) - ) b ( + ) onehot_encoder ( .data_in (onehot), .data_out (index), `UNUSED_PIN (valid) 
diff --git a/hw/rtl/libs/VX_rr_arbiter.v b/hw/rtl/libs/VX_rr_arbiter.v index 69406011..c7398ba2 100644 --- a/hw/rtl/libs/VX_rr_arbiter.v +++ b/hw/rtl/libs/VX_rr_arbiter.v @@ -3,7 +3,8 @@ module VX_rr_arbiter #( parameter NUM_REQS = 1, parameter LOCK_ENABLE = 0, - parameter LOG_NUM_REQS = $clog2(NUM_REQS) + parameter LOG_NUM_REQS = $clog2(NUM_REQS), + parameter FAST = 1 ) ( input wire clk, input wire reset, @@ -23,6 +24,58 @@ module VX_rr_arbiter #( assign grant_onehot = requests; assign grant_valid = requests[0]; + end else if (FAST == 1) begin + + wire [NUM_REQS-1:0] req_masked; + wire [NUM_REQS-1:0] grant, grant_masked, grant_unmasked; + /* verilator lint_off UNOPTFLAT */ + wire [NUM_REQS-1:0] mask_higher_pri_reqs; + /* verilator lint_off UNOPTFLAT */ + wire [NUM_REQS-1:0] unmask_higher_pri_reqs; + wire no_req_masked; + reg [NUM_REQS-1:0] pointer_reg; + + // Simple priority arbitration for masked portion + assign req_masked = requests & pointer_reg; + assign mask_higher_pri_reqs[NUM_REQS-1:1] = mask_higher_pri_reqs[NUM_REQS-2:0] | req_masked[NUM_REQS-2:0]; + assign mask_higher_pri_reqs[0] = 1'b0; + assign grant_masked[NUM_REQS-1:0] = req_masked[NUM_REQS-1:0] & ~mask_higher_pri_reqs[NUM_REQS-1:0]; + + // Simple priority arbitration for unmasked portion + assign unmask_higher_pri_reqs[NUM_REQS-1:1] = unmask_higher_pri_reqs[NUM_REQS-2:0] | requests[NUM_REQS-2:0]; + assign unmask_higher_pri_reqs[0] = 1'b0; + assign grant_unmasked[NUM_REQS-1:0] = requests[NUM_REQS-1:0] & ~unmask_higher_pri_reqs[NUM_REQS-1:0]; + + // Use grant_masked if there is any there, otherwise use grant_unmasked. 
+ assign no_req_masked = ~(| req_masked); + assign grant = ({NUM_REQS{no_req_masked}} & grant_unmasked) | grant_masked; + + // Generate arbiter pointer update + wire mask_ptr_sel = (| req_masked) & (!LOCK_ENABLE || enable); + wire unmask_ptr_sel = (| requests) & (!LOCK_ENABLE || enable); + + // Pointer update + always @(posedge clk) begin + if (reset) begin + pointer_reg <= {NUM_REQS{1'b1}}; + end else if (mask_ptr_sel) begin // select if masked arbiter used + pointer_reg <= mask_higher_pri_reqs; + end else if (unmask_ptr_sel) begin // select if unmasked arbiter used + pointer_reg <= unmask_higher_pri_reqs; + end + end + + VX_onehot_encoder #( + .N (NUM_REQS) + ) onehot_encoder ( + .data_in (grant), + .data_out (grant_index), + `UNUSED_PIN (valid) + ); + + assign grant_onehot = grant; + assign grant_valid = (| requests); + end else begin reg [LOG_NUM_REQS-1:0] grant_table [NUM_REQS-1:0]; diff --git a/hw/rtl/libs/VX_skid_buffer.v b/hw/rtl/libs/VX_skid_buffer.v index e770a852..c31a4ea8 100644 --- a/hw/rtl/libs/VX_skid_buffer.v +++ b/hw/rtl/libs/VX_skid_buffer.v @@ -83,7 +83,7 @@ module VX_skid_buffer #( end if (pop && !use_buffer) begin data_out_r <= data_in; - end else if (pop) begin + end else if (ready_out) begin data_out_r <= buffer; end end diff --git a/hw/syn/opae/Makefile b/hw/syn/opae/Makefile index 10034c00..4a35b54d 100644 --- a/hw/syn/opae/Makefile +++ b/hw/syn/opae/Makefile @@ -76,16 +76,16 @@ $(FPGA_BUILD_DIR)_4c/build/dcp.qpf: afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_4c $(FPGA_BUILD_DIR)_8c/build/dcp.qpf: - afu_synth_setup -s setup8.cfg $(FPGA_BUILD_DIR)_8c + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_8c $(FPGA_BUILD_DIR)_16c/build/dcp.qpf: - afu_synth_setup -s setup16.cfg $(FPGA_BUILD_DIR)_16c + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_16c $(FPGA_BUILD_DIR)_32c/build/dcp.qpf: - afu_synth_setup -s setup16.cfg $(FPGA_BUILD_DIR)_32c + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_32c $(FPGA_BUILD_DIR)_64c/build/dcp.qpf: - afu_synth_setup 
-s setup16.cfg $(FPGA_BUILD_DIR)_64c + afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_64c gen-sources-1c: ./gen_sources.sh $(CFLAGS) $(CONFIG1) > sources.txt diff --git a/hw/syn/opae/setup16.cfg b/hw/syn/opae/setup16.cfg deleted file mode 100644 index e29575ee..00000000 --- a/hw/syn/opae/setup16.cfg +++ /dev/null @@ -1,7 +0,0 @@ -+define+SYNTHESIS -+define+QUARTUS - -vortex_afu16.json -QI:vortex_afu.qsf - -C:sources.txt \ No newline at end of file diff --git a/hw/syn/opae/setup8.cfg b/hw/syn/opae/setup8.cfg deleted file mode 100644 index 36c42484..00000000 --- a/hw/syn/opae/setup8.cfg +++ /dev/null @@ -1,7 +0,0 @@ -+define+SYNTHESIS -+define+QUARTUS - -vortex_afu8.json -QI:vortex_afu.qsf - -C:sources.txt \ No newline at end of file diff --git a/hw/syn/opae/vortex_afu.json b/hw/syn/opae/vortex_afu.json index 1d49bf51..29bc25a9 100644 --- a/hw/syn/opae/vortex_afu.json +++ b/hw/syn/opae/vortex_afu.json @@ -2,8 +2,8 @@ "version": 1, "afu-image": { "power": 0, - "clock-frequency-high": "auto-220", - "clock-frequency-low": "auto-220", + "clock-frequency-high": "auto-210", + "clock-frequency-low": "auto-210", "cmd-mem-read": 1, "cmd-mem-write": 2, diff --git a/hw/syn/opae/vortex_afu16.json b/hw/syn/opae/vortex_afu16.json deleted file mode 100644 index 0e6dd4c9..00000000 --- a/hw/syn/opae/vortex_afu16.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "version": 1, - "afu-image": { - "power": 0, - "clock-frequency-high": "auto-200", - "clock-frequency-low": "auto-200", - - "cmd-mem-read": 1, - "cmd-mem-write": 2, - "cmd-run": 3, - "cmd-csr-read": 4, - "cmd-csr-write": 5, - - "mmio-cmd-type": 10, - "mmio-io-addr": 12, - "mmio-mem-addr": 14, - "mmio-data-size": 16, - "mmio-status": 18, - "mmio-scope-read": 20, - "mmio-scope-write": 22, - "mmio-dev-caps": 24, - - "afu-top-interface": - { - "class": "ccip_std_afu_avalon_mm", - "module-ports" : - [ - { - "class": "cci-p", - "params": - { - "clock": "uClk_usr" - } - }, - { - "class": "local-memory", - "params": - { - "clock": "uClk_usr" - 
} - } - ] - }, - "accelerator-clusters": - [ - { - "name": "vortex_afu", - "total-contexts": 1, - "accelerator-type-uuid": "35f9452b-25c2-434c-93d5-6f8c60db361c" - } - ] - } -} diff --git a/hw/syn/opae/vortex_afu8.json b/hw/syn/opae/vortex_afu8.json deleted file mode 100644 index e4583a37..00000000 --- a/hw/syn/opae/vortex_afu8.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "version": 1, - "afu-image": { - "power": 0, - "clock-frequency-high": "auto-210", - "clock-frequency-low": "auto-210", - - "cmd-mem-read": 1, - "cmd-mem-write": 2, - "cmd-run": 3, - "cmd-csr-read": 4, - "cmd-csr-write": 5, - - "mmio-cmd-type": 10, - "mmio-io-addr": 12, - "mmio-mem-addr": 14, - "mmio-data-size": 16, - "mmio-status": 18, - "mmio-scope-read": 20, - "mmio-scope-write": 22, - "mmio-dev-caps": 24, - - "afu-top-interface": - { - "class": "ccip_std_afu_avalon_mm", - "module-ports" : - [ - { - "class": "cci-p", - "params": - { - "clock": "uClk_usr" - } - }, - { - "class": "local-memory", - "params": - { - "clock": "uClk_usr" - } - } - ] - }, - "accelerator-clusters": - [ - { - "name": "vortex_afu", - "total-contexts": 1, - "accelerator-type-uuid": "35f9452b-25c2-434c-93d5-6f8c60db361c" - } - ] - } - } - \ No newline at end of file