From b5569dd5253b21e5fbef75d8c19d4a297db34543 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 20 May 2020 12:08:10 -0700 Subject: [PATCH] OPAE rtl fixes Gate the Vortex-side pops of the AVS read-tag and read-data queues on a completed DRAM response handshake (vx_dram_rsp_valid && vx_dram_rsp_ready) instead of on valid alone; throttle AVS read requests on avs_pending_reads + avs_rdq_size, using a new size output on VX_generic_queue; print AVS/CCI log addresses as byte addresses via DRAM_TO_BYTE_ADDR; rename the cache debug-print defines to DBG_PRINT_CACHE_BANK and DBG_PRINT_CACHE_SNP and add a DBG_PRINT_CACHE_MSRQ trace to VX_cache_miss_resrv; rename the cache_core_req_bank_sell instance to cache_core_req_bank_sel; enable NUM_CORES=2 and L2_ENABLE=0 in hw/opae/sources.txt. --- driver/rtlsim/Makefile | 9 +++-- hw/opae/sources.txt | 9 +++-- hw/opae/vortex_afu.sv | 49 ++++++++++++++--------- hw/rtl/VX_scheduler.v | 10 ++--- hw/rtl/cache/VX_bank.v | 48 +++++++++++----------- hw/rtl/cache/VX_cache.v | 2 +- hw/rtl/cache/VX_cache_core_req_bank_sel.v | 31 +++++++------- hw/rtl/cache/VX_cache_miss_resrv.v | 19 +++++++++ hw/rtl/cache/VX_snp_forwarder.v | 2 +- hw/rtl/libs/VX_generic_queue.v | 11 +++-- 10 files changed, 112 insertions(+), 78 deletions(-) diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 85243412..e5de9fd2 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -5,10 +5,11 @@ CFLAGS += -I../../include -I../../../hw/simulate -I../../../runtime # control RTL debug print states DBG_PRINT = -DDBG_PRINT_CORE_ICACHE \ - -DDBG_PRINT_CORE_DCACHE \ - -DDBG_PRINT_BANK \ - -DDBG_PRINT_DRAM \ - -DDBG_PRINT_SNP_FWD + -DDBG_PRINT_CORE_DCACHE \ + -DDBG_PRINT_CACHE_BANK \ + -DDBG_PRINT_CACHE_SNP \ + -DDBG_PRINT_CACHE_MSRQ \ + -DDBG_PRINT_DRAM #MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=2 #MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=2 diff --git a/hw/opae/sources.txt b/hw/opae/sources.txt index 2d57397f..0cfd9e19 100644 --- a/hw/opae/sources.txt +++ b/hw/opae/sources.txt @@ -2,14 +2,15 @@ vortex_afu.json +define+GLOBAL_BLOCK_SIZE=64 -#+define+NUM_CORES=2 -#+define+L2_ENABLE=0 ++define+NUM_CORES=2 ++define+L2_ENABLE=0 #+define+DBG_PRINT_CORE_ICACHE #+define+DBG_PRINT_CORE_DCACHE -#+define+DBG_PRINT_BANK +#+define+DBG_PRINT_CACHE_BANK +#+define+DBG_PRINT_CACHE_SNP +#+define+DBG_PRINT_CACHE_MSRQ #+define+DBG_PRINT_DRAM -#+define+DBG_PRINT_SNP_FWD +incdir+. 
+incdir+../rtl diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 0aa17cce..2faeeb36 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -3,6 +3,8 @@ import local_mem_cfg_pkg::*; `include "afu_json_info.vh" `include "VX_define.vh" +`define DRAM_TO_BYTE_ADDR(x) {x, 6'b0} + module vortex_afu #( parameter NUM_LOCAL_MEM_BANKS = 2 ) ( @@ -110,6 +112,7 @@ logic avs_rdq_pop; t_local_mem_data avs_rdq_dout; logic avs_rdq_empty; logic avs_rdq_full; +logic [`LOG2UP(AVS_RD_QUEUE_SIZE+1)-1:0] avs_rdq_size; // CSR variables ////////////////////////////////////////////////////////////// @@ -149,11 +152,11 @@ begin case (mmioHdr.address) MMIO_CSR_IO_ADDR: begin csr_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data); - $display("%t: CSR_IO_ADDR: 0x%h", $time, t_ccip_clAddr'(cp2af_sRxPort.c0.data)); + $display("%t: CSR_IO_ADDR: 0x%0h", $time, t_ccip_clAddr'(cp2af_sRxPort.c0.data)); end MMIO_CSR_MEM_ADDR: begin csr_mem_addr <= t_local_mem_addr'(cp2af_sRxPort.c0.data); - $display("%t: CSR_MEM_ADDR: 0x%h", $time, t_local_mem_addr'(cp2af_sRxPort.c0.data)); + $display("%t: CSR_MEM_ADDR: 0x%0h", $time, t_local_mem_addr'(cp2af_sRxPort.c0.data)); end MMIO_CSR_DATA_SIZE: begin csr_data_size <= $bits(csr_data_size)'(cp2af_sRxPort.c0.data); @@ -235,11 +238,11 @@ begin STATE_IDLE: begin case (csr_cmd) CMD_TYPE_READ: begin - $display("%t: STATE READ: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); + $display("%t: STATE READ: ia=%0h da=%0h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); state <= STATE_READ; end CMD_TYPE_WRITE: begin - $display("%t: STATE WRITE: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); + $display("%t: STATE WRITE: ia=%0h da=%0h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); state <= STATE_WRITE; end CMD_TYPE_RUN: begin @@ -248,7 +251,7 @@ begin state <= STATE_START; end CMD_TYPE_CLFLUSH: begin - $display("%t: STATE CFLUSH: da=%h sz=%0d", $time, csr_mem_addr, csr_data_size); + 
$display("%t: STATE CFLUSH: da=%0h sz=%0d", $time, csr_mem_addr, csr_data_size); state <= STATE_CLFLUSH; end endcase @@ -296,7 +299,8 @@ logic cci_dram_req_read_fire; logic cci_dram_req_write_fire; logic vx_dram_req_read_fire; logic vx_dram_req_write_fire; -logic [`LOG2UP(AVS_RD_QUEUE_SIZE):0] avs_pending_reads, avs_pending_rds_next; +logic vx_dram_rsp_fire; +logic [`LOG2UP(AVS_RD_QUEUE_SIZE+1)-1:0] avs_pending_reads, avs_pending_rds_next; t_ccip_clAddr next_avs_address; always_comb @@ -311,7 +315,7 @@ begin && avs_write_ctr < csr_data_size); cci_dram_req_read_fire = (state == STATE_READ) - && (avs_pending_reads < AVS_RD_QUEUE_SIZE) + && ((avs_pending_reads + avs_rdq_size) < AVS_RD_QUEUE_SIZE) && !avs_waitrequest && avs_read_ctr < csr_data_size; @@ -322,6 +326,8 @@ begin vx_dram_req_write_fire = vx_dram_req_write && vx_dram_req_ready; + vx_dram_rsp_fire = vx_dram_rsp_valid && vx_dram_rsp_ready; + if ((cci_dram_req_read_fire || vx_dram_req_read_fire) && ~avs_readdatavalid) begin avs_pending_rds_next = avs_pending_reads + 1; @@ -363,7 +369,7 @@ begin avs_address <= csr_mem_addr + avs_read_ctr; avs_read_ctr <= avs_read_ctr + 1; avs_read <= 1; - $display("%t: AVS Rd Req: addr=%h, pending=%0d", $time, (csr_mem_addr + avs_read_ctr), avs_pending_reads); + $display("%t: AVS Rd Req: addr=%0h, pending=%0d", $time, `DRAM_TO_BYTE_ADDR(csr_mem_addr + avs_read_ctr), avs_pending_reads); end if (cci_dram_req_write_fire) begin @@ -371,20 +377,20 @@ begin avs_address <= next_avs_address; avs_write_ctr <= avs_write_ctr + 1; avs_write <= 1; - $display("%t: AVS Wr Req: addr=%h (%0d/%0d)", $time, next_avs_address, avs_write_ctr + 1, csr_data_size); + $display("%t: AVS Wr Req: addr=%0h (%0d/%0d)", $time, `DRAM_TO_BYTE_ADDR(next_avs_address), avs_write_ctr + 1, csr_data_size); end if (vx_dram_req_read_fire) begin avs_address <= vx_dram_req_addr; avs_read <= 1; - $display("%t: AVS Rd Req: addr=%h, pending=%0d", $time, vx_dram_req_addr, avs_pending_reads); + $display("%t: AVS Rd Req: 
addr=%0h, pending=%0d", $time, `DRAM_TO_BYTE_ADDR(vx_dram_req_addr), avs_pending_reads); end if (vx_dram_req_write_fire) begin avs_address <= vx_dram_req_addr; avs_writedata <= vx_dram_req_data; avs_write <= 1; - $display("%t: AVS Wr Req: addr=%h", $time, vx_dram_req_addr); + $display("%t: AVS Wr Req: addr=%0h", $time, `DRAM_TO_BYTE_ADDR(vx_dram_req_addr)); end if (avs_readdatavalid) begin @@ -399,7 +405,9 @@ end always_comb begin - vx_dram_req_ready = vortex_enabled && !avs_waitrequest && (avs_pending_reads < AVS_RD_QUEUE_SIZE); + vx_dram_req_ready = vortex_enabled + && !avs_waitrequest + && ((avs_pending_reads + avs_rdq_size) < AVS_RD_QUEUE_SIZE); end // Vortex DRAM fill response @@ -419,7 +427,7 @@ always_comb begin avs_rtq_push = vx_dram_req_read_fire; avs_rtq_din = vx_dram_req_tag; - avs_rtq_pop = vx_dram_rsp_valid; + avs_rtq_pop = vx_dram_rsp_fire; end VX_generic_queue #( @@ -442,7 +450,7 @@ always_comb begin avs_rdq_push = avs_readdatavalid; avs_rdq_din = avs_readdata; - avs_rdq_pop = vx_dram_rsp_valid || cci_wr_req; + avs_rdq_pop = vx_dram_rsp_fire || cci_wr_req; end VX_generic_queue #( @@ -456,7 +464,8 @@ VX_generic_queue #( .pop (avs_rdq_pop), .data_out (avs_rdq_dout), .empty (avs_rdq_empty), - .full (avs_rdq_full) + .full (avs_rdq_full), + .size (avs_rdq_size) ); // CCI Read Request /////////////////////////////////////////////////////////// @@ -513,7 +522,7 @@ begin if (t_cci_rdq_tag'(cci_read_ctr) == (CCI_RD_WINDOW_SIZE-1)) begin cci_read_wait <= 1; // end current request batch end - $display("%t: CCI Rd Req: addr=%h, ctr=%0d", $time, cci_read_hdr.address, cci_read_ctr); + $display("%t: CCI Rd Req: addr=%0h, ctr=%0d", $time, `DRAM_TO_BYTE_ADDR(cci_read_hdr.address), cci_read_ctr); end if (cci_rdq_push) begin @@ -591,7 +600,7 @@ begin af2cp_sTxPort.c1.data <= t_ccip_clData'(avs_rdq_dout); af2cp_sTxPort.c1.valid <= 1; cci_write_ctr <= cci_write_ctr + 1; - $display("%t: CCI Wr Req: addr=%h (%0d/%0d)", $time, cci_write_hdr.address, cci_write_ctr + 1, 
csr_data_size); + $display("%t: CCI Wr Req: addr=%0h (%0d/%0d)", $time, `DRAM_TO_BYTE_ADDR(cci_write_hdr.address), cci_write_ctr + 1, csr_data_size); end if (cp2af_sRxPort.c1.rspValid) begin @@ -607,9 +616,12 @@ end logic [DRAM_ADDR_WIDTH-1:0] snp_req_ctr; logic [DRAM_ADDR_WIDTH-1:0] snp_rsp_ctr; +logic vx_snp_rsp_fire; + always_comb begin cmd_clflush_done = (snp_rsp_ctr >= csr_data_size); + vx_snp_rsp_fire = vx_snp_rsp_valid && vx_snp_rsp_ready; end always_ff @(posedge clk) @@ -642,8 +654,7 @@ begin if ((STATE_CLFLUSH == state) && (snp_rsp_ctr < csr_data_size) - && vx_snp_rsp_valid - && vx_snp_rsp_ready) begin + && vx_snp_rsp_fire) begin snp_rsp_ctr <= snp_rsp_ctr + 1; end end diff --git a/hw/rtl/VX_scheduler.v b/hw/rtl/VX_scheduler.v index 54a6d584..f20456e4 100644 --- a/hw/rtl/VX_scheduler.v +++ b/hw/rtl/VX_scheduler.v @@ -42,11 +42,11 @@ module VX_scheduler ( wire rename_valid = rs1_rename_qual || rs2_rename_qual || rd_rename_qual; - assign schedule_delay = (|bckE_req_if.valid) && - ((rename_valid ) || - (memory_delay && is_mem) || - (gpr_stage_delay && (is_mem || is_exec)) || - (exec_delay && is_exec)); + assign schedule_delay = (| bckE_req_if.valid) + && ((rename_valid ) + || (memory_delay && is_mem) + || (gpr_stage_delay && (is_mem || is_exec)) + || (exec_delay && is_exec)); integer i, w; diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index e53133c7..d5c2a8de 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -105,35 +105,33 @@ module VX_bank #( ); `DEBUG_BEGIN - - wire[31:0] debug_use_pc_st0; - wire[1:0] debug_wb_st0; - wire[4:0] debug_rd_st0; - wire[`NW_BITS-1:0] debug_warp_num_st0; - wire[2:0] debug_mem_read_st0; - wire[2:0] debug_mem_write_st0; - wire[`REQS_BITS-1:0] debug_tid_st0; + wire[31:0] debug_use_pc_st0; + wire[1:0] debug_wb_st0; + wire[4:0] debug_rd_st0; + wire[`NW_BITS-1:0] debug_warp_num_st0; + wire[2:0] debug_mem_read_st0; + wire[2:0] debug_mem_write_st0; + wire[`REQS_BITS-1:0] debug_tid_st0; - wire[31:0] 
debug_use_pc_st1e; - wire[1:0] debug_wb_st1e; - wire[4:0] debug_rd_st1e; - wire[`NW_BITS-1:0] debug_warp_num_st1e; - wire[2:0] debug_mem_read_st1e; - wire[2:0] debug_mem_write_st1e; - wire[`REQS_BITS-1:0] debug_tid_st1e; + wire[31:0] debug_use_pc_st1e; + wire[1:0] debug_wb_st1e; + wire[4:0] debug_rd_st1e; + wire[`NW_BITS-1:0] debug_warp_num_st1e; + wire[2:0] debug_mem_read_st1e; + wire[2:0] debug_mem_write_st1e; + wire[`REQS_BITS-1:0] debug_tid_st1e; - wire[31:0] debug_use_pc_st2; - wire[1:0] debug_wb_st2; - wire[4:0] debug_rd_st2; - wire[`NW_BITS-1:0] debug_warp_num_st2; - wire[2:0] debug_mem_read_st2; - wire[2:0] debug_mem_write_st2; - wire[`REQS_BITS-1:0] debug_tid_st2; + wire[31:0] debug_use_pc_st2; + wire[1:0] debug_wb_st2; + wire[4:0] debug_rd_st2; + wire[`NW_BITS-1:0] debug_warp_num_st2; + wire[2:0] debug_mem_read_st2; + wire[2:0] debug_mem_write_st2; + wire[`REQS_BITS-1:0] debug_tid_st2; `DEBUG_END - wire snrq_pop; wire snrq_empty; wire snrq_full; @@ -505,6 +503,8 @@ module VX_bank #( assign mrvq_init_ready_state_st2 = mrvq_init_ready_state_unqual_st2 || mrvq_init_ready_state_hazard_st0_st1 || mrvq_init_ready_state_hazard_st1e_st1; VX_cache_miss_resrv #( + .BANK_ID (BANK_ID), + .CACHE_ID (CACHE_ID), .BANK_LINE_SIZE (BANK_LINE_SIZE), .NUM_BANKS (NUM_BANKS), .WORD_SIZE (WORD_SIZE), @@ -681,7 +681,7 @@ module VX_bank #( || msrq_push_stall || dram_fill_req_stall; -`ifdef DBG_PRINT_BANK +`ifdef DBG_PRINT_CACHE_BANK always_ff @(posedge clk) begin if (dram_fill_req_valid && dram_fill_req_ready) begin $display("%t: bank%02d:%01d dram_fill req: addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_req_addr, BANK_ID)); diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index f929f731..b2190b1e 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -228,7 +228,7 @@ module VX_cache #( .NUM_BANKS (NUM_BANKS), .WORD_SIZE (WORD_SIZE), .NUM_REQUESTS (NUM_REQUESTS) - ) cache_core_req_bank_sell ( + ) cache_core_req_bank_sel ( 
.core_req_valid (core_req_valid), .core_req_addr (core_req_addr), .per_bank_valids (per_bank_valids) diff --git a/hw/rtl/cache/VX_cache_core_req_bank_sel.v b/hw/rtl/cache/VX_cache_core_req_bank_sel.v index 06fde46f..a332d921 100644 --- a/hw/rtl/cache/VX_cache_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_cache_core_req_bank_sel.v @@ -11,25 +11,22 @@ module VX_cache_core_req_bank_sel #( // Number of Word requests per cycle {1, 2, 4, 8, ...} parameter NUM_REQUESTS = 0 ) ( - input wire [NUM_REQUESTS-1:0] core_req_valid, - input wire [NUM_REQUESTS-1:0][31:0] core_req_addr, + input wire [NUM_REQUESTS-1:0] core_req_valid, + input wire [NUM_REQUESTS-1:0][31:0] core_req_addr, + output reg [NUM_BANKS-1:0][NUM_REQUESTS-1:0] per_bank_valids +); + integer i; - output reg [NUM_BANKS-1:0][NUM_REQUESTS-1:0] per_bank_valids -); - - generate - integer i; - always @(*) begin - per_bank_valids = 0; - for (i = 0; i < NUM_REQUESTS; i++) begin - if (NUM_BANKS == 1) begin - // If there is only one bank, then only map requests to that bank - per_bank_valids[0][i] = core_req_valid[i]; - end else begin - per_bank_valids[core_req_addr[i][`BANK_SELECT_ADDR_RNG]][i] = core_req_valid[i]; - end + always @(*) begin + per_bank_valids = 0; + for (i = 0; i < NUM_REQUESTS; i++) begin + if (NUM_BANKS == 1) begin + // If there is only one bank, then only map requests to that bank + per_bank_valids[0][i] = core_req_valid[i]; + end else begin + per_bank_valids[core_req_addr[i][`BANK_SELECT_ADDR_RNG]][i] = core_req_valid[i]; end end - endgenerate + end endmodule \ No newline at end of file diff --git a/hw/rtl/cache/VX_cache_miss_resrv.v b/hw/rtl/cache/VX_cache_miss_resrv.v index 390a85ec..0e8a4c87 100644 --- a/hw/rtl/cache/VX_cache_miss_resrv.v +++ b/hw/rtl/cache/VX_cache_miss_resrv.v @@ -1,6 +1,8 @@ `include "VX_cache_config.vh" module VX_cache_miss_resrv #( + parameter CACHE_ID = 0, + parameter BANK_ID = 0, // Size of line inside a bank in bytes parameter BANK_LINE_SIZE = 0, // Number of banks {1, 2, 4, 
8,...} @@ -141,4 +143,21 @@ module VX_cache_miss_resrv #( end end +`ifdef DBG_PRINT_CACHE_MSRQ + always_ff @(posedge clk) begin + if (mrvq_push || mrvq_pop) begin + $write("%t: bank%02d:%01d msrq: push=%b pop=%b", $time, CACHE_ID, BANK_ID, mrvq_push, mrvq_pop); + for (int i = 0; i < MRVQ_SIZE; i++) begin + if (valid_table[i]) begin + $write(" "); + if (i == head_ptr) $write("*"); + if (~ready_table[i]) $write("!"); + $write("addr%0d=%0h", i, `LINE_TO_BYTE_ADDR(addr_table[i], BANK_ID)); + end + end + $write("\n"); + end + end +`endif + endmodule \ No newline at end of file diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index d0d65584..270078b2 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -112,7 +112,7 @@ module VX_snp_forwarder #( assign snp_fwdin_ready[i] = fwdin_ready && (fwdin_sel == `REQS_BITS'(i)); end -`ifdef DBG_PRINT_SNP_FWD +`ifdef DBG_PRINT_CACHE_SNP always_ff @(posedge clk) begin if (snp_req_valid && snp_req_ready) begin $display("%t: snp req: addr=%0h, tag=%0h", $time, snp_req_addr, snp_req_tag); diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index 24c25c00..8a89c666 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -13,13 +13,15 @@ module VX_generic_queue #( output wire full, `IGNORE_WARNINGS_END input wire [DATAW-1:0] data_in, - output wire [DATAW-1:0] data_out + output wire [DATAW-1:0] data_out, + output wire [`LOG2UP(SIZE+1)-1:0] size ); if (SIZE == 0) begin assign empty = 1; assign data_out = data_in; assign full = 0; + assign size = 0; end else begin // (SIZE > 0) @@ -59,6 +61,8 @@ module VX_generic_queue #( assign data_out = head_r; assign empty = (size_r == 0); assign full = (size_r != 0); + assign size = size_r; + end else begin // (SIZE > 1) reg [DATAW-1:0] curr_r; @@ -131,8 +135,9 @@ module VX_generic_queue #( end assign data_out = bypass_r ? 
curr_r : head_r; - assign empty = empty_r; - assign full = full_r; + assign empty = empty_r; + assign full = full_r; + assign size = size_r; end end