diff --git a/hw/Makefile b/hw/Makefile index 65d194fa..1547e8d2 100644 --- a/hw/Makefile +++ b/hw/Makefile @@ -34,7 +34,7 @@ gen-singlecore-t: build_config verilator $(VF) -cc $(SINGLE_CORE) -CFLAGS '$(CF) -DNDEBUG -O2' --threads $(THREADS) gen-singlecore-d: build_config - verilator $(VF) -cc $(SINGLE_CORE) -CFLAGS '$(CF) -DVCD_OUTPUT' $(DBG) + verilator $(VF) -cc $(SINGLE_CORE) -CFLAGS '$(CF) -g -O0 -DVCD_OUTPUT' $(DBG) gen-multicore: build_config verilator $(VF) -DNDEBUG -cc $(MULTI_CORE) -CFLAGS '$(CF) -DNDEBUG -DUSE_MULTICORE' @@ -43,7 +43,7 @@ gen-multicore-t: build_config verilator $(VF) -DNDEBUG -cc $(MULTI_CORE) -CFLAGS '$(CF) -DNDEBUG -O2 -DUSE_MULTICORE' --threads $(THREADS) gen-multicore-d: build_config - verilator $(VF) -cc $(MULTI_CORE) -CFLAGS '$(CF) -DVCD_OUTPUT -DUSE_MULTICORE' $(DBG) + verilator $(VF) -cc $(MULTI_CORE) -CFLAGS '$(CF) -g -O0 -DVCD_OUTPUT -DUSE_MULTICORE' $(DBG) singlecore: gen-singlecore (cd obj_dir && make -j -f VVortex.mk) diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 0bc5c813..d253621d 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -347,15 +347,15 @@ logic vortex_enabled; always_comb begin - vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state); - vx_dram_req_full = !vortex_enabled || avs_waitrequest || avs_raq_full || avs_rdq_full; + vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state); + vx_dram_req_ready = vortex_enabled && !avs_waitrequest && !avs_raq_full && !avs_rdq_full; end // Vortex DRAM fill response always_comb begin - vx_dram_rsp_valid = vortex_enabled && !avs_rdq_empty && vx_dram_rsp_ready; + vx_dram_rsp_valid = vortex_enabled && !avs_rdq_empty && vx_dram_rsp_ready; vx_dram_rsp_addr = (avs_raq_dout << 6); {>>{vx_dram_rsp_data}} = avs_rdq_dout; end @@ -531,7 +531,7 @@ begin if ((STATE_CLFLUSH == state) && vx_snoop_ctr < csr_data_size - && !vx_snp_req_full) + && vx_snp_req_ready) begin vx_snp_req_addr <= (csr_mem_addr + vx_snoop_ctr) << 6; vx_snp_req <= 1; @@ -556,7 +556,7 @@ Vortex_Socket #() vx_socket ( .dram_req_read (vx_dram_req_read), .dram_req_addr (vx_dram_req_addr), .dram_req_data (vx_dram_req_data), - .dram_req_full (vx_dram_req_full), + .dram_req_ready (vx_dram_req_ready), // DRAM Rsp .out_dram_rsp_ready (vx_dram_rsp_ready), @@ -567,7 +567,7 @@ Vortex_Socket #() vx_socket ( // Cache Snooping Req .llc_snp_req_valid (vx_snp_req), .llc_snp_req_addr (vx_snp_req_addr), - .llc_snp_req_full (vx_snp_req_full), + .llc_snp_req_ready (vx_snp_req_ready), // program exit signal .out_ebreak (vx_ebreak) diff --git a/hw/rtl/VX_back_end.v b/hw/rtl/VX_back_end.v index 394f4823..76736ea6 100644 --- a/hw/rtl/VX_back_end.v +++ b/hw/rtl/VX_back_end.v @@ -32,53 +32,53 @@ assign writeback_if.wb_pc = writeback_temp_if.wb_pc; // assign VX_writeback_if(writeback_temp_if); -wire no_slot_mem; -wire no_slot_exec; +wire no_slot_mem; +wire no_slot_exec; // LSU input + output -VX_lsu_req_if lsu_req_if(); -VX_inst_mem_wb_if mem_wb_if(); +VX_lsu_req_if lsu_req_if(); +VX_inst_mem_wb_if mem_wb_if(); // Exec unit input + output -VX_exec_unit_req_if exec_unit_req_if(); -VX_inst_exec_wb_if inst_exec_wb_if(); +VX_exec_unit_req_if exec_unit_req_if(); +VX_inst_exec_wb_if inst_exec_wb_if(); // GPU unit input -VX_gpu_inst_req_if gpu_inst_req_if(); +VX_gpu_inst_req_if gpu_inst_req_if(); // CSR unit inputs -VX_csr_req_if csr_req_if(); -VX_csr_wb_if csr_wb_if(); -wire no_slot_csr; -wire stall_gpr_csr; +VX_csr_req_if csr_req_if(); +VX_csr_wb_if csr_wb_if(); +wire no_slot_csr; +wire stall_gpr_csr; VX_gpr_stage gpr_stage ( - .clk (clk), - .reset (reset), - .schedule_delay (schedule_delay), - .writeback_if (writeback_temp_if), - .bckE_req_if (bckE_req_if), + .clk (clk), + .reset (reset), + .schedule_delay (schedule_delay), + .writeback_if (writeback_temp_if), + .bckE_req_if (bckE_req_if), // New - .exec_unit_req_if(exec_unit_req_if), - .lsu_req_if (lsu_req_if), - .gpu_inst_req_if (gpu_inst_req_if), - .csr_req_if (csr_req_if), - .stall_gpr_csr (stall_gpr_csr), + .exec_unit_req_if (exec_unit_req_if), + .lsu_req_if (lsu_req_if), + .gpu_inst_req_if (gpu_inst_req_if), + .csr_req_if (csr_req_if), + .stall_gpr_csr (stall_gpr_csr), // End new - .memory_delay (out_mem_delay), - .exec_delay (out_exec_delay), - .gpr_stage_delay (gpr_stage_delay) + .memory_delay (out_mem_delay), + .exec_delay (out_exec_delay), + .gpr_stage_delay (gpr_stage_delay) ); VX_lsu load_store_unit ( - .clk (clk), - .reset (reset), - .lsu_req_if (lsu_req_if), - .mem_wb_if (mem_wb_if), - .dcache_rsp_if(dcache_rsp_if), - .dcache_req_if(dcache_req_if), - .out_delay (out_mem_delay), - .no_slot_mem (no_slot_mem) + .clk (clk), + .reset (reset), + .lsu_req_if (lsu_req_if), + .mem_wb_if (mem_wb_if), + .dcache_rsp_if (dcache_rsp_if), + .dcache_req_if (dcache_req_if), + .out_delay (out_mem_delay), + .no_slot_mem (no_slot_mem) ); VX_execute_unit execUnit ( @@ -97,11 +97,6 @@ VX_gpgpu_inst gpgpu_inst ( .warp_ctl_if (warp_ctl_if) ); -// VX_csr_wrapper csr_wrapper( -// .csr_req_if(csr_req_if), -// .csr_wb_if (csr_wb_if) -// ); - VX_csr_pipe #( .CORE_ID(CORE_ID) ) csr_pipe ( diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 2ed92322..9e299ef7 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -23,8 +23,6 @@ `define NUM_BARRIERS 4 `endif -// `define SINGLE_CORE_BENCH - `ifndef GLOBAL_BLOCK_SIZE_BYTES `define GLOBAL_BLOCK_SIZE_BYTES 16 `endif diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index a3a79257..457ab175 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -11,16 +11,15 @@ module VX_decode( VX_join_if join_if, output wire terminate_sim - ); - wire[31:0] in_instruction = fd_inst_meta_de.instruction; - wire[31:0] in_curr_PC = fd_inst_meta_de.inst_pc; - wire[`NW_BITS-1:0] in_warp_num = fd_inst_meta_de.warp_num; + wire[31:0] in_instruction = fd_inst_meta_de.instruction; + wire[31:0] in_curr_PC = fd_inst_meta_de.inst_pc; + wire[`NW_BITS-1:0] in_warp_num = fd_inst_meta_de.warp_num; - assign frE_to_bckE_req_if.curr_PC = in_curr_PC; + assign frE_to_bckE_req_if.curr_PC = in_curr_PC; - wire[`NUM_THREADS-1:0] in_valid = fd_inst_meta_de.valid; + wire[`NUM_THREADS-1:0] in_valid = fd_inst_meta_de.valid; wire[6:0] curr_opcode; @@ -122,28 +121,22 @@ module VX_decode( assign is_split = is_gpgpu && (func3 == 2); // Goes to BE assign is_join = is_gpgpu && (func3 == 3); // Doesn't go to BE - assign join_if.is_join = is_join; assign join_if.join_warp_num = in_warp_num; - assign frE_to_bckE_req_if.is_wspawn = is_wspawn; assign frE_to_bckE_req_if.is_tmc = is_tmc; assign frE_to_bckE_req_if.is_split = is_split; assign frE_to_bckE_req_if.is_barrier = is_barrier; - - assign frE_to_bckE_req_if.csr_immed = is_csr_immed; assign frE_to_bckE_req_if.is_csr = is_csr; - assign frE_to_bckE_req_if.wb = (is_jal || is_jalr || is_e_inst) ? `WB_JAL : is_linst ? `WB_MEM : (is_itype || is_rtype || is_lui || is_auipc || is_csr) ? `WB_ALU : `NO_WB; - assign frE_to_bckE_req_if.rs2_src = (is_itype || is_stype) ? `RS2_IMMED : `RS2_REG; // MEM signals @@ -161,7 +154,6 @@ module VX_decode( assign frE_to_bckE_req_if.upper_immed = temp_upper_immed; - assign jal_b_19_to_12 = in_instruction[19:12]; assign jal_b_11 = in_instruction[20]; assign jal_b_10_to_1 = in_instruction[30:21]; @@ -170,11 +162,9 @@ module VX_decode( assign jal_unsigned_offset = {jal_b_20, jal_b_19_to_12, jal_b_11, jal_b_10_to_1, jal_b_0}; assign jal_1_offset = {{11{jal_b_20}}, jal_unsigned_offset}; - assign jalr_immed = {func7, frE_to_bckE_req_if.rs2}; assign jal_2_offset = {{20{jalr_immed[11]}}, jalr_immed}; - assign jal_sys_cond1 = func3 == 3'h0; assign jal_sys_cond2 = u_12 < 12'h2; @@ -214,13 +204,11 @@ module VX_decode( // wire is_ebreak; - // assign is_ebreak = is_e_inst; wire ebreak = (curr_opcode == `SYS_INST) && (jal_sys_jal && (|in_valid)); assign frE_to_bckE_req_if.ebreak = ebreak; assign terminate_sim = is_e_inst; - // CSR assign csr_cond1 = func3 != 3'h0; @@ -228,13 +216,11 @@ module VX_decode( assign frE_to_bckE_req_if.csr_address = (csr_cond1 && csr_cond2) ? u_12 : 12'h55; - // ITYPE IMEED assign alu_shift_i = (func3 == 3'h1) || (func3 == 3'h5); assign alu_shift_i_immed = {{7{1'b0}}, frE_to_bckE_req_if.rs2}; assign alu_tempp = alu_shift_i ? alu_shift_i_immed : u_12; - always @(*) begin case(curr_opcode) `ALU_INST: temp_itype_immed = {{20{alu_tempp[11]}}, alu_tempp}; @@ -331,11 +317,11 @@ module VX_decode( wire[4:0] temp_final_alu; assign temp_final_alu = is_btype ? ((frE_to_bckE_req_if.branch_type < `BLTU) ? `SUB : `SUBU) : - is_lui ? `LUI_ALU : - is_auipc ? `AUIPC_ALU : - is_csr ? csr_alu : - (is_stype || is_linst) ? `ADD : - alu_op; + is_lui ? `LUI_ALU : + is_auipc ? `AUIPC_ALU : + is_csr ? csr_alu : + (is_stype || is_linst) ? `ADD : + alu_op; assign frE_to_bckE_req_if.alu_op = ((func7[0] == 1'b1) && is_rtype) ? mul_alu : temp_final_alu; diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index df74dd70..e0614dfc 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -135,6 +135,9 @@ `define ZERO_REG 5'h0 +// IO BUS +`define IO_BUS_ADDR 32'h00010000 + // ======================= Dcache Configurable Knobs ========================== // Function ID diff --git a/hw/rtl/VX_dmem_controller.v b/hw/rtl/VX_dmem_controller.v index ea842699..5391562a 100644 --- a/hw/rtl/VX_dmem_controller.v +++ b/hw/rtl/VX_dmem_controller.v @@ -23,28 +23,29 @@ module VX_dmem_controller ( VX_gpu_dcache_req_if icache_req_if ); - VX_gpu_dcache_rsp_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_rsp_smem_if(); VX_gpu_dcache_req_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_req_smem_if(); - - VX_gpu_dcache_rsp_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_rsp_dcache_if(); + VX_gpu_dcache_rsp_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_rsp_smem_if(); + VX_gpu_dcache_req_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_req_dcache_if(); + VX_gpu_dcache_rsp_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_rsp_dcache_if(); wire to_shm = dcache_req_if.core_req_addr[0][31:24] == 8'hFF; wire dcache_wants_wb = (|dcache_rsp_dcache_if.core_wb_valid); // Dcache Request assign dcache_req_dcache_if.core_req_valid = dcache_req_if.core_req_valid & {`NUM_THREADS{~to_shm}}; - assign dcache_req_dcache_if.core_req_addr = dcache_req_if.core_req_addr; - assign dcache_req_dcache_if.core_req_writedata = dcache_req_if.core_req_writedata; assign dcache_req_dcache_if.core_req_mem_read = dcache_req_if.core_req_mem_read; assign dcache_req_dcache_if.core_req_mem_write = dcache_req_if.core_req_mem_write; - assign dcache_req_dcache_if.core_req_rd = dcache_req_if.core_req_rd; + assign dcache_req_dcache_if.core_req_addr = dcache_req_if.core_req_addr; + assign dcache_req_dcache_if.core_req_writedata = dcache_req_if.core_req_writedata; + assign dcache_req_dcache_if.core_req_rd = dcache_req_if.core_req_rd; assign dcache_req_dcache_if.core_req_wb = dcache_req_if.core_req_wb; assign dcache_req_dcache_if.core_req_warp_num = dcache_req_if.core_req_warp_num; assign dcache_req_dcache_if.core_req_pc = dcache_req_if.core_req_pc; - assign dcache_req_dcache_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot; - // Shred Memory Request + assign dcache_req_dcache_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot; + + // Shared Memory Request assign dcache_req_smem_if.core_req_valid = dcache_req_if.core_req_valid & {`NUM_THREADS{to_shm}}; assign dcache_req_smem_if.core_req_addr = dcache_req_if.core_req_addr; assign dcache_req_smem_if.core_req_writedata = dcache_req_if.core_req_writedata; @@ -54,17 +55,18 @@ module VX_dmem_controller ( assign dcache_req_smem_if.core_req_wb = dcache_req_if.core_req_wb; assign dcache_req_smem_if.core_req_warp_num = dcache_req_if.core_req_warp_num; assign dcache_req_smem_if.core_req_pc = dcache_req_if.core_req_pc; - assign dcache_req_smem_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot || dcache_wants_wb; - // Dcache Response + assign dcache_req_smem_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot || dcache_wants_wb; + + // Dcache Response assign dcache_rsp_if.core_wb_valid = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_valid : dcache_rsp_smem_if.core_wb_valid; assign dcache_rsp_if.core_wb_req_rd = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_req_rd : dcache_rsp_smem_if.core_wb_req_rd; assign dcache_rsp_if.core_wb_req_wb = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_req_wb : dcache_rsp_smem_if.core_wb_req_wb; - assign dcache_rsp_if.core_wb_warp_num = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_warp_num : dcache_rsp_smem_if.core_wb_warp_num; + assign dcache_rsp_if.core_wb_pc = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_pc : dcache_rsp_smem_if.core_wb_pc; assign dcache_rsp_if.core_wb_readdata = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_readdata : dcache_rsp_smem_if.core_wb_readdata; - assign dcache_rsp_if.core_wb_pc = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_pc : dcache_rsp_smem_if.core_wb_pc; + assign dcache_rsp_if.core_wb_warp_num = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_warp_num : dcache_rsp_smem_if.core_wb_warp_num; - assign dcache_rsp_if.delay_req = to_shm ? dcache_rsp_smem_if.delay_req : dcache_rsp_dcache_if.delay_req; + assign dcache_rsp_if.core_req_ready = to_shm ? dcache_rsp_smem_if.core_req_ready : dcache_rsp_dcache_if.core_req_ready; VX_gpu_dcache_dram_req_if #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) gpu_smem_dram_req_if(); VX_gpu_dcache_dram_rsp_if #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) gpu_smem_dram_res_if(); @@ -105,8 +107,8 @@ module VX_dmem_controller ( .core_req_warp_num (dcache_req_smem_if.core_req_warp_num), .core_req_pc (dcache_req_smem_if.core_req_pc), - // Delay Core Req - .delay_req (dcache_rsp_smem_if.delay_req), + // Can submit core Req + .core_req_ready (dcache_rsp_smem_if.core_req_ready), // Core Cache Can't WB .core_no_wb_slot (dcache_req_smem_if.core_no_wb_slot), @@ -135,7 +137,7 @@ module VX_dmem_controller ( .dram_req_write (gpu_smem_dram_req_if.dram_req_write), .dram_req_addr (gpu_smem_dram_req_if.dram_req_addr), .dram_req_data (gpu_smem_dram_req_if.dram_req_data), - .dram_req_full (1), + .dram_req_ready (0), // Snoop Request .snp_req_valid (0), @@ -188,8 +190,8 @@ module VX_dmem_controller ( .core_req_warp_num (dcache_req_dcache_if.core_req_warp_num), .core_req_pc (dcache_req_dcache_if.core_req_pc), - // Delay Core Req - .delay_req (dcache_rsp_dcache_if.delay_req), + // Can submit core Req + .core_req_ready (dcache_rsp_dcache_if.core_req_ready), // Core Cache Can't WB .core_no_wb_slot (dcache_req_dcache_if.core_no_wb_slot), @@ -218,7 +220,7 @@ module VX_dmem_controller ( .dram_req_write (gpu_dcache_dram_req_if.dram_req_write), .dram_req_addr (gpu_dcache_dram_req_if.dram_req_addr), .dram_req_data (gpu_dcache_dram_req_if.dram_req_data), - .dram_req_full (gpu_dcache_dram_req_if.dram_req_full), + .dram_req_ready (gpu_dcache_dram_req_if.dram_req_ready), // Snoop Request .snp_req_valid (gpu_dcache_snp_req_if.snp_req_valid), @@ -269,8 +271,8 @@ module VX_dmem_controller ( .core_req_warp_num (icache_req_if.core_req_warp_num), .core_req_pc (icache_req_if.core_req_pc), - // Delay Core Req - .delay_req (icache_rsp_if.delay_req), + // Can submit core Req + .core_req_ready (icache_rsp_if.core_req_ready), // Core Cache Can't WB .core_no_wb_slot (icache_req_if.core_no_wb_slot), @@ -299,7 +301,7 @@ module VX_dmem_controller ( .dram_req_write (gpu_icache_dram_req_if.dram_req_write), .dram_req_addr (gpu_icache_dram_req_if.dram_req_addr), .dram_req_data (gpu_icache_dram_req_if.dram_req_data), - .dram_req_full (gpu_icache_dram_req_if.dram_req_full), + .dram_req_ready (gpu_icache_dram_req_if.dram_req_ready), // Snoop Request .snp_req_valid (gpu_icache_snp_req_if.snp_req_valid), diff --git a/hw/rtl/VX_front_end.v b/hw/rtl/VX_front_end.v index 06b82e70..6ac70286 100644 --- a/hw/rtl/VX_front_end.v +++ b/hw/rtl/VX_front_end.v @@ -1,22 +1,22 @@ `include "VX_define.vh" module VX_front_end ( - input wire clk, - input wire reset, + input wire clk, + input wire reset, - input wire schedule_delay, + input wire schedule_delay, - VX_warp_ctl_if warp_ctl_if, + VX_warp_ctl_if warp_ctl_if, - VX_gpu_dcache_rsp_if icache_rsp_if, - VX_gpu_dcache_req_if icache_req_if, + VX_gpu_dcache_rsp_if icache_rsp_if, + VX_gpu_dcache_req_if icache_req_if, - VX_jal_response_if jal_rsp_if, - VX_branch_response_if branch_rsp_if, + VX_jal_response_if jal_rsp_if, + VX_branch_response_if branch_rsp_if, - VX_frE_to_bckE_req_if bckE_req_if, + VX_frE_to_bckE_req_if bckE_req_if, - output wire fetch_ebreak + output wire fetch_ebreak ); VX_inst_meta_if fe_inst_meta_fi(); @@ -35,16 +35,7 @@ module VX_front_end ( wire[`NW_BITS-1:0] icache_stage_wid; wire[`NUM_THREADS-1:0] icache_stage_valids; - reg old_ebreak; // This should be eventually removed - always @(posedge clk) begin - if (reset) begin - old_ebreak <= 0; - end else begin - old_ebreak <= old_ebreak || fetch_ebreak; - end - end - - assign fetch_ebreak = vortex_ebreak || terminate_sim || old_ebreak; + assign fetch_ebreak = vortex_ebreak || terminate_sim; VX_wstall_if wstall_if(); VX_join_if join_if(); diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index ef40a490..aeb170dc 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -39,7 +39,7 @@ module VX_icache_stage ( assign icache_stage_valids = fe_inst_meta_id.valid & {`NUM_THREADS{!icache_stage_delay}}; // Cache can't accept request - assign icache_stage_delay = icache_rsp_if.delay_req; + assign icache_stage_delay = ~icache_rsp_if.core_req_ready; // Core can't accept response assign icache_req_if.core_no_wb_slot = total_freeze; diff --git a/hw/rtl/VX_lsu.v b/hw/rtl/VX_lsu.v index 51249e26..928aa8eb 100644 --- a/hw/rtl/VX_lsu.v +++ b/hw/rtl/VX_lsu.v @@ -59,7 +59,7 @@ module VX_lsu ( assign dcache_req_if.core_no_wb_slot = no_slot_mem; // Cache can't accept request - assign out_delay = dcache_rsp_if.delay_req; + assign out_delay = ~dcache_rsp_if.core_req_ready; // Core Response assign mem_wb_if.rd = dcache_rsp_if.core_wb_req_rd; diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 413c7634..5169f809 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -1,26 +1,23 @@ `include "VX_define.vh" `include "VX_cache_config.vh" -module Vortex - #( - parameter CORE_ID = 0 - ) ( -`ifdef SINGLE_CORE_BENCH - +module Vortex #( + parameter CORE_ID = 0 +) ( // Clock - input wire clk, - input wire reset, + input wire clk, + input wire reset, // IO - output wire io_valid, - output wire [31:0] io_data, + output wire io_valid, + output wire [31:0] io_data, // DRAM Dcache Req output wire dram_req_read, output wire dram_req_write, output wire [31:0] dram_req_addr, output wire [`DBANK_LINE_SIZE-1:0] dram_req_data, - input wire dram_req_full, + input wire dram_req_ready, // DRAM Dcache Rsp input wire dram_rsp_valid, @@ -33,7 +30,7 @@ module Vortex output wire I_dram_req_write, output wire [31:0] I_dram_req_addr, output wire [`IBANK_LINE_SIZE-1:0] I_dram_req_data, - input wire I_dram_req_full, + input wire I_dram_req_ready, // DRAM Icache Rsp input wire I_dram_rsp_valid, @@ -42,52 +39,11 @@ module Vortex output wire I_dram_rsp_ready, // LLC Snooping - input wire snp_req_valid, - input wire [31:0] snp_req_addr, - output wire snp_req_full, + input wire llc_snp_req_valid, + input wire [31:0] llc_snp_req_addr, + output wire llc_snp_req_full, output wire out_ebreak - -`else - - input wire clk, - input wire reset, - // IO - output wire io_valid, - output wire[31:0] io_data, - - // DRAM Dcache Req - output wire dram_req_read, - output wire dram_req_write, - output wire [31:0] dram_req_addr, - output wire [`DBANK_LINE_SIZE-1:0] dram_req_data, - input wire dram_req_full, - - // DRAM Dcache Rsp - input wire dram_rsp_valid, - input wire [31:0] dram_rsp_addr, - input wire [`DBANK_LINE_SIZE-1:0] dram_rsp_data, - output wire dram_rsp_ready, - - // DRAM Icache Req - output wire I_dram_req_read, - output wire I_dram_req_write, - output wire [31:0] I_dram_req_addr, - output wire [`IBANK_LINE_SIZE-1:0] I_dram_req_data, - input wire I_dram_req_full, - - // DRAM Icache Rsp - output wire I_dram_rsp_ready, - input wire I_dram_rsp_valid, - input wire [31:0] I_dram_rsp_addr, - input wire [`IBANK_LINE_SIZE-1:0] I_dram_rsp_data, - - input wire snp_req_valid, - input wire [31:0] snp_req_addr, - output wire snp_req_full, - - output wire out_ebreak -`endif ); `DEBUG_BEGIN wire scheduler_empty; @@ -114,36 +70,37 @@ module Vortex assign dram_req_addr = gpu_dcache_dram_req_if.dram_req_addr; assign dram_rsp_ready = gpu_dcache_dram_req_if.dram_rsp_ready; - assign gpu_dcache_dram_req_if.dram_req_full = dram_req_full; + assign gpu_dcache_dram_req_if.dram_req_ready = dram_req_ready; genvar i; generate for (i = 0; i < `DBANK_LINE_WORDS; i=i+1) begin assign gpu_dcache_dram_res_if.dram_rsp_data[i] = dram_rsp_data[i * 32 +: 32]; - assign dram_req_data[i * 32 +: 32] = gpu_dcache_dram_req_if.dram_req_data[i]; + assign dram_req_data[i * 32 +: 32] = gpu_dcache_dram_req_if.dram_req_data[i]; end endgenerate wire temp_io_valid = (!memory_delay) && (|dcache_req_if.core_req_valid) && (dcache_req_if.core_req_mem_write[0] != `NO_MEM_WRITE) - && (dcache_req_if.core_req_addr[0] == 32'h00010000); + && (dcache_req_if.core_req_addr[0] == `IO_BUS_ADDR); - wire[31:0] temp_io_data = dcache_req_if.core_req_writedata[0]; - assign io_valid = temp_io_valid; - assign io_data = temp_io_data; + wire [31:0] temp_io_data = dcache_req_if.core_req_writedata[0]; + assign io_valid = temp_io_valid; + assign io_data = temp_io_data; - assign dcache_req_qual_if.core_req_valid = dcache_req_if.core_req_valid & {`NUM_THREADS{~io_valid}}; - assign dcache_req_qual_if.core_req_addr = dcache_req_if.core_req_addr; - assign dcache_req_qual_if.core_req_writedata = dcache_req_if.core_req_writedata; + assign dcache_req_qual_if.core_req_valid = dcache_req_if.core_req_valid & {`NUM_THREADS{~io_valid}}; assign dcache_req_qual_if.core_req_mem_read = dcache_req_if.core_req_mem_read; assign dcache_req_qual_if.core_req_mem_write = dcache_req_if.core_req_mem_write; + assign dcache_req_qual_if.core_req_addr = dcache_req_if.core_req_addr; + assign dcache_req_qual_if.core_req_writedata = dcache_req_if.core_req_writedata; assign dcache_req_qual_if.core_req_rd = dcache_req_if.core_req_rd; assign dcache_req_qual_if.core_req_wb = dcache_req_if.core_req_wb; assign dcache_req_qual_if.core_req_warp_num = dcache_req_if.core_req_warp_num; assign dcache_req_qual_if.core_req_pc = dcache_req_if.core_req_pc; - assign dcache_req_qual_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot; + assign dcache_req_qual_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot; + VX_gpu_dcache_rsp_if #(.NUM_REQUESTS(`INUM_REQUESTS)) icache_rsp_if(); VX_gpu_dcache_req_if #(.NUM_REQUESTS(`INUM_REQUESTS)) icache_req_if(); @@ -158,7 +115,7 @@ module Vortex assign I_dram_req_addr = gpu_icache_dram_req_if.dram_req_addr; assign I_dram_rsp_ready = gpu_icache_dram_req_if.dram_rsp_ready; - assign gpu_icache_dram_req_if.dram_req_full = I_dram_req_full; + assign gpu_icache_dram_req_if.dram_req_ready = I_dram_req_ready; genvar j; generate @@ -168,42 +125,41 @@ module Vortex end endgenerate -///////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// // Front-end to Back-end -VX_frE_to_bckE_req_if bckE_req_if(); // New instruction request to EXE/MEM +VX_frE_to_bckE_req_if bckE_req_if(); // New instruction request to EXE/MEM // Back-end to Front-end -VX_wb_if writeback_if(); // Writeback to GPRs -VX_branch_response_if branch_rsp_if(); // Branch Resolution to Fetch -VX_jal_response_if jal_rsp_if(); // Jump resolution to Fetch - -// CSR Buses -// VX_csr_write_request_if csr_w_req_if(); +VX_wb_if writeback_if(); // Writeback to GPRs +VX_branch_response_if branch_rsp_if(); // Branch Resolution to Fetch +VX_jal_response_if jal_rsp_if(); // Jump resolution to Fetch +// Warp controls VX_warp_ctl_if warp_ctl_if(); + +// Cache snooping VX_gpu_snp_req_rsp_if gpu_icache_snp_req_if(); VX_gpu_snp_req_rsp_if gpu_dcache_snp_req_if(); +assign gpu_dcache_snp_req_if.snp_req_valid = llc_snp_req_valid; +assign gpu_dcache_snp_req_if.snp_req_addr = llc_snp_req_addr; +assign llc_snp_req_full = gpu_dcache_snp_req_if.snp_req_full; -assign gpu_dcache_snp_req_if.snp_req_valid = snp_req_valid; -assign gpu_dcache_snp_req_if.snp_req_addr = snp_req_addr; -assign snp_req_full = gpu_dcache_snp_req_if.snp_req_full; - -VX_front_end front_end( - .clk (clk), - .reset (reset), - .warp_ctl_if (warp_ctl_if), - .bckE_req_if (bckE_req_if), - .schedule_delay (schedule_delay), - .icache_rsp_if (icache_rsp_if), - .icache_req_if (icache_req_if), - .jal_rsp_if (jal_rsp_if), - .branch_rsp_if (branch_rsp_if), - .fetch_ebreak (out_ebreak) +VX_front_end front_end ( + .clk (clk), + .reset (reset), + .warp_ctl_if (warp_ctl_if), + .bckE_req_if (bckE_req_if), + .schedule_delay (schedule_delay), + .icache_rsp_if (icache_rsp_if), + .icache_req_if (icache_req_if), + .jal_rsp_if (jal_rsp_if), + .branch_rsp_if (branch_rsp_if), + .fetch_ebreak (out_ebreak) ); -VX_scheduler schedule( - .clk (clk), +VX_scheduler schedule ( + .clk (clk), .reset (reset), .memory_delay (memory_delay), .exec_delay (exec_delay), @@ -214,7 +170,9 @@ VX_scheduler schedule( .is_empty (scheduler_empty) ); -VX_back_end #(.CORE_ID(CORE_ID)) back_end( +VX_back_end #( + .CORE_ID(CORE_ID) +) back_end ( .clk (clk), .reset (reset), .schedule_delay (schedule_delay), @@ -230,7 +188,7 @@ VX_back_end #(.CORE_ID(CORE_ID)) back_end( .gpr_stage_delay (gpr_stage_delay) ); -VX_dmem_controller dmem_controller( +VX_dmem_controller dmem_controller ( .clk (clk), .reset (reset), @@ -253,14 +211,6 @@ VX_dmem_controller dmem_controller( .dcache_rsp_if (dcache_rsp_if) ); -// VX_csr_handler csr_handler( -// .clk (clk), -// .in_decode_csr_address(decode_csr_address), -// .csr_w_req_if (csr_w_req_if), -// .in_wb_valid (writeback_if.wb_valid[0]), -// .out_decode_csr_data (csr_decode_csr_data) -// ); - endmodule // Vortex diff --git a/hw/rtl/Vortex_Cluster.v b/hw/rtl/Vortex_Cluster.v index fa8d7e9a..43c1cb36 100644 --- a/hw/rtl/Vortex_Cluster.v +++ b/hw/rtl/Vortex_Cluster.v @@ -1,14 +1,12 @@ `include "VX_define.vh" `include "VX_cache_config.vh" -module Vortex_Cluster - #( - parameter CLUSTER_ID = 0 - ) ( - +module Vortex_Cluster #( + parameter CLUSTER_ID = 0 +) ( // Clock - input wire clk, - input wire reset, + input wire clk, + input wire reset, // IO output wire[`NUM_CORES_PER_CLUSTER-1:0] io_valid, @@ -19,7 +17,7 @@ module Vortex_Cluster output wire dram_req_write, output wire [31:0] dram_req_addr, output wire [`DBANK_LINE_SIZE-1:0] dram_req_data, - input wire dram_req_full, + input wire dram_req_ready, // DRAM Rsp input wire dram_rsp_valid, @@ -28,11 +26,11 @@ module Vortex_Cluster output wire dram_rsp_ready, // LLC Snooping - input wire llc_snp_req_valid, - input wire[31:0] llc_snp_req_addr, - output wire llc_snp_req_full, + input wire llc_snp_req_valid, + input wire[31:0] llc_snp_req_addr, + output wire llc_snp_req_full, - output wire out_ebreak + output wire out_ebreak ); // DRAM Dcache Req wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_dram_req_read; @@ -64,7 +62,7 @@ module Vortex_Cluster wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_io_valid; wire[`NUM_CORES_PER_CLUSTER-1:0][31:0] per_core_io_data; - wire l2c_core_accept; + wire l2c_core_req_ready; wire snp_fwd_valid; wire[31:0] snp_fwd_addr; @@ -94,7 +92,7 @@ module Vortex_Cluster .dram_req_write (per_core_dram_req_write [curr_core]), .dram_req_addr (per_core_dram_req_addr [curr_core]), .dram_req_data (curr_core_dram_req_data ), - .dram_req_full (l2c_core_accept ), + .dram_req_ready (l2c_core_req_ready ), .dram_rsp_valid (per_core_dram_rsp_valid [curr_core]), .dram_rsp_addr (per_core_dram_rsp_addr [curr_core]), .dram_rsp_data (per_core_dram_rsp_data [curr_core]), @@ -103,14 +101,14 @@ module Vortex_Cluster .I_dram_req_write (per_core_I_dram_req_write [curr_core]), .I_dram_req_addr (per_core_I_dram_req_addr [curr_core]), .I_dram_req_data (curr_core_I_dram_req_data ), - .I_dram_req_full (l2c_core_accept ), + .I_dram_req_ready (l2c_core_req_ready ), .I_dram_rsp_valid (per_core_I_dram_rsp_valid [curr_core]), .I_dram_rsp_addr (per_core_I_dram_rsp_addr [curr_core]), .I_dram_rsp_data (per_core_I_dram_rsp_data [curr_core]), .I_dram_rsp_ready (per_core_I_dram_rsp_ready [curr_core]), - .snp_req_valid (snp_fwd_valid), - .snp_req_addr (snp_fwd_addr), - .snp_req_full (snp_fwd_full [curr_core]), + .llc_snp_req_valid (snp_fwd_valid), + .llc_snp_req_addr (snp_fwd_addr), + .llc_snp_req_full (snp_fwd_full [curr_core]), .out_ebreak (per_core_out_ebreak [curr_core]) ); @@ -220,7 +218,7 @@ module Vortex_Cluster .core_req_pc (0), // L2 can't accept Core Request - .delay_req (l2c_core_accept), + .core_req_ready (l2c_core_req_ready), // Core can't accept L2 Request .core_no_wb_slot (|l2c_core_no_wb_slot), @@ -249,7 +247,7 @@ module Vortex_Cluster .dram_req_write (dram_req_write), .dram_req_addr (dram_req_addr), .dram_req_data ({dram_req_data_port}), - .dram_req_full (dram_req_full), + .dram_req_ready (dram_req_ready), // Snoop Request .snp_req_valid (llc_snp_req_valid), diff --git a/hw/rtl/Vortex_Socket.v b/hw/rtl/Vortex_Socket.v index b3b31f94..000a17fc 100644 --- a/hw/rtl/Vortex_Socket.v +++ b/hw/rtl/Vortex_Socket.v @@ -2,21 +2,20 @@ `include "VX_cache_config.vh" module Vortex_Socket ( - // Clock - input wire clk, - input wire reset, + input wire clk, + input wire reset, // IO - output wire io_valid[`NUM_CORES-1:0], - output wire[31:0] io_data [`NUM_CORES-1:0], + output wire io_valid[`NUM_CORES-1:0], + output wire[31:0] io_data [`NUM_CORES-1:0], // DRAM Req output wire dram_req_read, output wire dram_req_write, output wire [31:0] dram_req_addr, output wire [`DBANK_LINE_SIZE-1:0] dram_req_data, - input wire dram_req_full, + input wire dram_req_ready, // DRAM Rsp input wire dram_rsp_valid, @@ -25,11 +24,11 @@ module Vortex_Socket ( output wire dram_rsp_ready, // LLC Snooping - input wire llc_snp_req_valid, - input wire[31:0] llc_snp_req_addr, - output wire llc_snp_req_full, + input wire llc_snp_req_valid, + input wire[31:0] llc_snp_req_addr, + output wire llc_snp_req_full, - output wire out_ebreak + output wire out_ebreak ); if (`NUM_CLUSTERS == 1) begin @@ -53,7 +52,7 @@ module Vortex_Socket ( .dram_req_write (dram_req_write), .dram_req_addr (dram_req_addr), .dram_req_data (dram_req_data), - .dram_req_full (dram_req_full), + .dram_req_ready (dram_req_ready), .dram_rsp_valid (dram_rsp_valid), .dram_rsp_addr (dram_rsp_addr), @@ -85,7 +84,7 @@ module Vortex_Socket ( wire[`NUM_CLUSTERS-1:0][`DBANK_LINE_WORDS-1:0][31:0] per_cluster_dram_req_data; wire[31:0] per_cluster_dram_req_data_up[`NUM_CLUSTERS-1:0][`DBANK_LINE_WORDS-1:0]; - wire l3c_core_req_full; + wire l3c_core_req_ready; // // DRAM Dcache Rsp wire[`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_ready; @@ -113,7 +112,9 @@ module Vortex_Socket ( genvar curr_cluster; for (curr_cluster = 0; curr_cluster < `NUM_CLUSTERS; curr_cluster=curr_cluster+1) begin - Vortex_Cluster #(.CLUSTER_ID(curr_cluster)) Vortex_Cluster( + Vortex_Cluster #( + .CLUSTER_ID(curr_cluster) + ) Vortex_Cluster( .clk (clk), .reset (reset), .io_valid (per_cluster_io_valid [curr_cluster]), @@ -123,7 +124,7 @@ module Vortex_Socket ( .dram_req_read (per_cluster_dram_req_read [curr_cluster]), .dram_req_addr (per_cluster_dram_req_addr [curr_cluster]), .dram_req_data (per_cluster_dram_req_data_up [curr_cluster]), - .dram_req_full (l3c_core_req_full), + .dram_req_ready (l3c_core_req_ready), .dram_rsp_valid (per_cluster_dram_rsp_valid [curr_cluster]), .dram_rsp_addr (per_cluster_dram_rsp_addr [curr_cluster]), @@ -139,6 +140,7 @@ module Vortex_Socket ( end //////////////////// L3 Cache //////////////////// + wire[`L3NUM_REQUESTS-1:0] l3c_core_req_valid; wire[`L3NUM_REQUESTS-1:0][2:0] l3c_core_req_mem_write; wire[`L3NUM_REQUESTS-1:0][2:0] l3c_core_req_mem_read; @@ -161,25 +163,24 @@ module Vortex_Socket ( assign dram_rsp_data_port[llb_index] = dram_rsp_data[llb_index]; end - // genvar l3c_curr_cluster; - for (l3c_curr_cluster = 0; l3c_curr_cluster < `L3NUM_REQUESTS; l3c_curr_cluster=l3c_curr_cluster+1) begin - // Core Request - assign l3c_core_req_valid [l3c_curr_cluster] = per_cluster_dram_req_valid[l3c_curr_cluster]; - assign l3c_core_req_mem_read [l3c_curr_cluster] = per_cluster_dram_req_read [l3c_curr_cluster] ? `LW_MEM_READ : `NO_MEM_READ; - assign l3c_core_req_mem_write [l3c_curr_cluster] = per_cluster_dram_req_write[l3c_curr_cluster] ? `SW_MEM_WRITE : `NO_MEM_WRITE; - assign l3c_core_req_wb [l3c_curr_cluster] = per_cluster_dram_req_read [l3c_curr_cluster] ? 1 : 0; - assign l3c_core_req_addr [l3c_curr_cluster] = per_cluster_dram_req_addr [l3c_curr_cluster]; - assign l3c_core_req_data [l3c_curr_cluster] = per_cluster_dram_req_data [l3c_curr_cluster]; + for (l3c_curr_cluster = 0; l3c_curr_cluster < `L3NUM_REQUESTS; l3c_curr_cluster=l3c_curr_cluster+1) begin + // Core Request + assign l3c_core_req_valid [l3c_curr_cluster] = per_cluster_dram_req_valid[l3c_curr_cluster]; + assign l3c_core_req_mem_read [l3c_curr_cluster] = per_cluster_dram_req_read [l3c_curr_cluster] ? `LW_MEM_READ : `NO_MEM_READ; + assign l3c_core_req_mem_write [l3c_curr_cluster] = per_cluster_dram_req_write[l3c_curr_cluster] ? `SW_MEM_WRITE : `NO_MEM_WRITE; + assign l3c_core_req_wb [l3c_curr_cluster] = per_cluster_dram_req_read [l3c_curr_cluster] ? 1 : 0; + assign l3c_core_req_addr [l3c_curr_cluster] = per_cluster_dram_req_addr [l3c_curr_cluster]; + assign l3c_core_req_data [l3c_curr_cluster] = per_cluster_dram_req_data [l3c_curr_cluster]; - // Core can't accept Response - assign l3c_core_no_wb_slot [l3c_curr_cluster] = ~per_cluster_dram_rsp_ready[l3c_curr_cluster]; + // Core can't accept Response + assign l3c_core_no_wb_slot [l3c_curr_cluster] = ~per_cluster_dram_rsp_ready[l3c_curr_cluster]; - // Cache Fill Response - assign per_cluster_dram_rsp_valid [l3c_curr_cluster] = l3c_wb [l3c_curr_cluster]; - assign per_cluster_dram_rsp_data [l3c_curr_cluster] = l3c_wb_data [l3c_curr_cluster]; - assign per_cluster_dram_rsp_addr [l3c_curr_cluster] = l3c_wb_addr [l3c_curr_cluster]; - end + // Cache Fill Response + assign per_cluster_dram_rsp_valid [l3c_curr_cluster] = l3c_wb [l3c_curr_cluster]; + assign per_cluster_dram_rsp_data [l3c_curr_cluster] = l3c_wb_data [l3c_curr_cluster]; + assign per_cluster_dram_rsp_addr [l3c_curr_cluster] = l3c_wb_addr [l3c_curr_cluster]; + end VX_cache #( .CACHE_SIZE_BYTES (`L3CACHE_SIZE_BYTES), @@ -203,8 +204,8 @@ module Vortex_Socket ( .FILL_INVALIDAOR_SIZE (`L3FILL_INVALIDAOR_SIZE), .SIMULATED_DRAM_LATENCY_CYCLES(`L3SIMULATED_DRAM_LATENCY_CYCLES) ) gpu_l3cache ( - .clk (clk), - .reset (reset), + .clk (clk), + .reset (reset), // Core Req (DRAM Fills/WB) To L2 Request .core_req_valid (l3c_core_req_valid), @@ -218,7 +219,7 @@ module Vortex_Socket ( .core_req_pc (0), // L2 can't accept Core Request - .delay_req (l3c_core_req_full), + .core_req_ready (l3c_core_req_ready), // Core can't accept L2 Request .core_no_wb_slot (|l3c_core_no_wb_slot), @@ -247,7 +248,7 @@ module Vortex_Socket ( .dram_req_read (dram_req_read), .dram_req_addr (dram_req_addr), .dram_req_data ({dram_req_data_port}), - .dram_req_full (dram_req_full), + .dram_req_ready (dram_req_ready), // Snoop Request .snp_req_valid (llc_snp_req_valid), diff --git a/hw/rtl/generic_cache/VX_bank.v b/hw/rtl/generic_cache/VX_bank.v index 621f947d..6d4e4c72 100644 --- a/hw/rtl/generic_cache/VX_bank.v +++ b/hw/rtl/generic_cache/VX_bank.v @@ -48,7 +48,7 @@ module VX_bank #( input wire reset, // Input Core Request - input wire delay_req, + input wire req_ready, input wire [NUM_REQUESTS-1:0] bank_valids, input wire [NUM_REQUESTS-1:0][31:0] bank_addr, input wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] bank_writedata, @@ -168,7 +168,7 @@ module VX_bank #( wire [2:0] reqq_req_mem_write_st0; wire [31:0] reqq_req_pc_st0; - assign reqq_push = !delay_req && (|bank_valids); + assign reqq_push = req_ready && (|bank_valids); VX_cache_req_queue #( .CACHE_SIZE_BYTES (CACHE_SIZE_BYTES), diff --git a/hw/rtl/generic_cache/VX_cache.v b/hw/rtl/generic_cache/VX_cache.v index a818c5bc..463056a3 100644 --- a/hw/rtl/generic_cache/VX_cache.v +++ b/hw/rtl/generic_cache/VX_cache.v @@ -52,44 +52,46 @@ module VX_cache #( input wire clk, input wire reset, - // Req Info + // Core request input wire [NUM_REQUESTS-1:0] core_req_valid, - input wire [NUM_REQUESTS-1:0][31:0] core_req_addr, - input wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_req_writedata, input wire [NUM_REQUESTS-1:0][2:0] core_req_mem_read, input wire [NUM_REQUESTS-1:0][2:0] core_req_mem_write, + input wire [NUM_REQUESTS-1:0][31:0] core_req_addr, + input wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_req_writedata, + output wire core_req_ready, - // Req meta + // Core request meta data input wire [4:0] core_req_rd, input wire [NUM_REQUESTS-1:0][1:0] core_req_wb, input wire [`NW_BITS-1:0] core_req_warp_num, input wire [31:0] core_req_pc, - output wire delay_req, + - // Core Writeback - input wire core_no_wb_slot, + // Core response output wire [NUM_REQUESTS-1:0] core_wb_valid, output wire [4:0] core_wb_req_rd, output wire [1:0] core_wb_req_wb, - output wire [`NW_BITS-1:0] core_wb_warp_num, - output wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_wb_readdata, - output wire [NUM_REQUESTS-1:0][31:0] core_wb_pc, output wire [NUM_REQUESTS-1:0][31:0] core_wb_address, + output wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_wb_readdata, + input wire core_no_wb_slot, - // Dram Fill Response + // Core response meta data + output wire [`NW_BITS-1:0] core_wb_warp_num, + output wire [NUM_REQUESTS-1:0][31:0] core_wb_pc, + + // DRAM request + output wire dram_req_read, + output wire dram_req_write, + output wire [31:0] dram_req_addr, + output wire [`IBANK_LINE_WORDS-1:0][31:0] dram_req_data, + input wire dram_req_ready, + + // DRAM response input wire dram_rsp_valid, input wire [31:0] dram_rsp_addr, input wire [`IBANK_LINE_WORDS-1:0][31:0] dram_rsp_data, output wire dram_rsp_ready, - // Dram request - output wire dram_req_read, - output wire dram_req_write, - output wire [31:0] dram_req_addr, - output wire [`IBANK_LINE_WORDS-1:0][31:0] dram_req_data, - input wire dram_req_full, - - // Snoop Req input wire snp_req_valid, input wire [31:0] snp_req_addr, @@ -132,7 +134,7 @@ module VX_cache #( wire [NUM_BANKS-1:0][31:0] per_bank_snp_fwd_addr; wire [NUM_BANKS-1:0] per_bank_snp_fwd_pop; - assign delay_req = (|per_bank_reqq_full); + assign core_req_ready = ~(|per_bank_reqq_full); assign snp_req_full = (|per_bank_snrq_full); // assign dram_rsp_ready = (NUM_BANKS == 1) ? per_bank_dram_rsp_ready[0] : per_bank_dram_rsp_ready[dram_rsp_addr[`BANK_SELECT_ADDR_RNG]]; @@ -171,7 +173,7 @@ module VX_cache #( .dram_req_write (dram_req_write), .dram_req_addr (dram_req_addr), .dram_req_data (dram_req_data), - .dram_req_full (dram_req_full) + .dram_req_ready (dram_req_ready) ); VX_cache_core_req_bank_sel #( @@ -372,7 +374,7 @@ module VX_cache #( .clk (clk), .reset (reset), // Core req - .delay_req (delay_req), + .req_ready (core_req_ready), .bank_valids (curr_bank_valids), .bank_addr (curr_bank_addr), .bank_writedata (curr_bank_writedata), diff --git a/hw/rtl/generic_cache/VX_cache_dram_req_arb.v b/hw/rtl/generic_cache/VX_cache_dram_req_arb.v index 176b4dda..5471aac3 100644 --- a/hw/rtl/generic_cache/VX_cache_dram_req_arb.v +++ b/hw/rtl/generic_cache/VX_cache_dram_req_arb.v @@ -50,14 +50,14 @@ module VX_cache_dram_req_arb #( // Fill Request output wire dfqq_full, - input wire[NUM_BANKS-1:0] per_bank_dram_fill_req_valid, - input wire[NUM_BANKS-1:0][31:0] per_bank_dram_fill_req_addr, + input wire [NUM_BANKS-1:0] per_bank_dram_fill_req_valid, + input wire [NUM_BANKS-1:0][31:0] per_bank_dram_fill_req_addr, // DFQ Request - output wire[NUM_BANKS-1:0] per_bank_dram_wb_queue_pop, - input wire[NUM_BANKS-1:0] per_bank_dram_wb_req_valid, - input wire[NUM_BANKS-1:0][31:0] per_bank_dram_wb_req_addr, - input wire[NUM_BANKS-1:0][`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] per_bank_dram_wb_req_data, + output wire [NUM_BANKS-1:0] per_bank_dram_wb_queue_pop, + input wire [NUM_BANKS-1:0] per_bank_dram_wb_req_valid, + input wire [NUM_BANKS-1:0][31:0] per_bank_dram_wb_req_addr, + input wire [NUM_BANKS-1:0][`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] per_bank_dram_wb_req_data, // real Dram request output wire dram_req_read, @@ -65,7 +65,7 @@ module VX_cache_dram_req_arb #( output wire [31:0] dram_req_addr, output wire [`IBANK_LINE_WORDS-1:0][31:0] dram_req_data, - input wire dram_req_full + input wire dram_req_ready ); wire pref_pop; @@ -75,7 +75,8 @@ module VX_cache_dram_req_arb #( wire dwb_valid; wire dfqq_req; - assign pref_pop = !dwb_valid && !dfqq_req && !dram_req_full && pref_valid; + assign pref_pop = !dwb_valid && !dfqq_req && dram_req_ready && pref_valid; + VX_prefetcher #( .PRFQ_SIZE (PRFQ_SIZE), .PRFQ_STRIDE (PRFQ_STRIDE), @@ -99,7 +100,7 @@ module VX_cache_dram_req_arb #( wire dfqq_empty; `DEBUG_END - wire dfqq_pop = !dwb_valid && dfqq_req && !dram_req_full; // If no dwb, and dfqq has valids, then pop + wire dfqq_pop = !dwb_valid && dfqq_req && dram_req_ready; // If no dwb, and dfqq has valids, then pop wire dfqq_push = (|per_bank_dram_fill_req_valid); VX_cache_dfq_queue cache_dfq_queue( @@ -115,9 +116,9 @@ module VX_cache_dram_req_arb #( .dfqq_full (dfqq_full) ); - wire[`LOG2UP(NUM_BANKS)-1:0] dwb_bank; + wire [`LOG2UP(NUM_BANKS)-1:0] dwb_bank; - wire[NUM_BANKS-1:0] use_wb_valid = per_bank_dram_wb_req_valid; + wire [NUM_BANKS-1:0] use_wb_valid = per_bank_dram_wb_req_valid; VX_generic_priority_encoder #( .N(NUM_BANKS) @@ -127,7 +128,7 @@ module VX_cache_dram_req_arb #( .found (dwb_valid) ); - assign per_bank_dram_wb_queue_pop = dram_req_full ? 0 : use_wb_valid & ((1 << dwb_bank)); + assign per_bank_dram_wb_queue_pop = dram_req_ready ? (use_wb_valid & ((1 << dwb_bank))) : 0; wire dram_req = dwb_valid || dfqq_req || pref_pop; assign dram_req_read = ((dfqq_req && !dwb_valid) || pref_pop) && dram_req; diff --git a/hw/rtl/interfaces/VX_gpu_dcache_dram_req_if.v b/hw/rtl/interfaces/VX_gpu_dcache_dram_req_if.v index 07cac070..6a66923b 100644 --- a/hw/rtl/interfaces/VX_gpu_dcache_dram_req_if.v +++ b/hw/rtl/interfaces/VX_gpu_dcache_dram_req_if.v @@ -12,7 +12,7 @@ interface VX_gpu_dcache_dram_req_if #( wire dram_req_read; wire [31:0] dram_req_addr; wire [BANK_LINE_WORDS-1:0][31:0] dram_req_data; - wire dram_req_full; + wire dram_req_ready; wire dram_rsp_ready; diff --git a/hw/rtl/interfaces/VX_gpu_dcache_req_if.v b/hw/rtl/interfaces/VX_gpu_dcache_req_if.v index 98521ab6..877aa918 100644 --- a/hw/rtl/interfaces/VX_gpu_dcache_req_if.v +++ b/hw/rtl/interfaces/VX_gpu_dcache_req_if.v @@ -7,21 +7,21 @@ interface VX_gpu_dcache_req_if #( parameter NUM_REQUESTS = 32 ) (); - // Core Request + // Core request wire [NUM_REQUESTS-1:0] core_req_valid; - wire [NUM_REQUESTS-1:0][31:0] core_req_addr; - wire [NUM_REQUESTS-1:0][31:0] core_req_writedata; wire [NUM_REQUESTS-1:0][2:0] core_req_mem_read; wire [NUM_REQUESTS-1:0][2:0] core_req_mem_write; + wire [NUM_REQUESTS-1:0][31:0] core_req_addr; + wire [NUM_REQUESTS-1:0][31:0] core_req_writedata; + + // Core request Meta data wire [4:0] core_req_rd; wire [NUM_REQUESTS-1:0][1:0] core_req_wb; wire [`NW_BITS-1:0] core_req_warp_num; wire [31:0] core_req_pc; - // Can't WB - wire core_no_wb_slot; + wire core_no_wb_slot; endinterface - `endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_gpu_dcache_rsp_if.v b/hw/rtl/interfaces/VX_gpu_dcache_rsp_if.v index 80383ce2..a4e0dc36 100644 --- a/hw/rtl/interfaces/VX_gpu_dcache_rsp_if.v +++ b/hw/rtl/interfaces/VX_gpu_dcache_rsp_if.v @@ -7,18 +7,19 @@ interface VX_gpu_dcache_rsp_if #( parameter NUM_REQUESTS = 32 ) (); - // Cache WB + // Core response wire [NUM_REQUESTS-1:0] core_wb_valid; `IGNORE_WARNINGS_BEGIN wire [4:0] core_wb_req_rd; wire [1:0] core_wb_req_wb; -`IGNORE_WARNINGS_END - wire [`NW_BITS-1:0] core_wb_warp_num; +`IGNORE_WARNINGS_END + wire [NUM_REQUESTS-1:0][31:0] core_wb_pc; wire [NUM_REQUESTS-1:0][31:0] core_wb_readdata; - wire [NUM_REQUESTS-1:0][31:0] core_wb_pc; + + // Core response meta data + wire [`NW_BITS-1:0] core_wb_warp_num; - // Cache Full - wire delay_req; + wire core_req_ready; endinterface diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 0503c81b..3645cfad 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -34,6 +34,77 @@ void Simulator::print_stats(std::ostream& out) { out << std::setw(24) << "# of total cycles:" << std::dec << total_cycles_ << std::endl; } +void Simulator::dbus_driver() { + // Iterate through each element, and get pop index + int dequeue_index = -1; + bool dequeue_valid = false; + for (int i = 0; i < dram_req_vec_.size(); i++) { + if (dram_req_vec_[i].cycles_left > 0) { + dram_req_vec_[i].cycles_left -= 1; + } + + if ((dram_req_vec_[i].cycles_left == 0) && (!dequeue_valid)) { + dequeue_index = i; + dequeue_valid = true; + } + } + +#ifdef ENABLE_DRAM_STALLS + dram_stalled_ = false; + if (0 == (total_cycles_ % DRAM_STALLS_MODULO)) { + dram_stalled_ = true; + } else + if (dram_req_vec_.size() >= DRAM_RQ_SIZE) { + dram_stalled_ = true; + } +#endif + + if (!dram_stalled_) { + if (vortex_->dram_req_read) { + // Need to add an element + dram_req_t dram_req; + dram_req.cycles_left = DRAM_LATENCY; + dram_req.base_addr = vortex_->dram_req_addr; + dram_req.data = (unsigned *)malloc(GLOBAL_BLOCK_SIZE_BYTES); + + for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) { + unsigned curr_addr = dram_req.base_addr + (i * 4); + unsigned data_rd; + ram_->getWord(curr_addr, &data_rd); + dram_req.data[i] = data_rd; + } + dram_req_vec_.push_back(dram_req); + } + + if (vortex_->dram_req_write) { + unsigned base_addr = vortex_->dram_req_addr; + + for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) { + unsigned curr_addr = base_addr + (i * 4); + unsigned data_wr = vortex_->dram_req_data[i]; + ram_->writeWord(curr_addr, &data_wr); + } + } + } + + if (vortex_->dram_rsp_ready && dequeue_valid) { + vortex_->dram_rsp_valid = 1; + vortex_->dram_rsp_addr = dram_req_vec_[dequeue_index].base_addr; + + for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) { + vortex_->dram_rsp_data[i] = dram_req_vec_[dequeue_index].data[i]; + } + free(dram_req_vec_[dequeue_index].data); + + dram_req_vec_.erase(dram_req_vec_.begin() + dequeue_index); + } else { + vortex_->dram_rsp_valid = 0; + vortex_->dram_rsp_addr = 0; + } + + vortex_->dram_req_ready = ~dram_stalled_; +} + #ifndef USE_MULTICORE void Simulator::ibus_driver() { @@ -51,6 +122,16 @@ void Simulator::ibus_driver() { } } +#ifdef ENABLE_DRAM_STALLS + I_dram_stalled_ = false; + if (0 == (total_cycles_ % DRAM_STALLS_MODULO)) { + I_dram_stalled_ = true; + } else + if (I_dram_req_vec_.size() >= DRAM_RQ_SIZE) { + I_dram_stalled_ = true; + } +#endif + if (!I_dram_stalled_) { // std::cout << "Icache Dram Request received!\n"; if (vortex_->I_dram_req_read) { @@ -100,135 +181,11 @@ void Simulator::ibus_driver() { vortex_->I_dram_rsp_addr = 0; } -// #ifdef ENABLE_DRAM_STALLS -// I_dram_stalled_ = false; -// if (0 == (total_cycles_ % DRAM_STALLS_MODULO)) { -// I_dram_stalled_ = true; -// } else -// if (I_dram_req_vec_.size() >= DRAM_RQ_SIZE) { -// I_dram_stalled_ = true; -// } -// #endif - -// vortex_->dram_req_delay = I_dram_stalled_; + vortex_->I_dram_req_ready = ~I_dram_stalled_; } #endif -void Simulator::dbus_driver() { - // Iterate through each element, and get pop index - int dequeue_index = -1; - bool dequeue_valid = false; - for (int i = 0; i < dram_req_vec_.size(); i++) { - if (dram_req_vec_[i].cycles_left > 0) { - dram_req_vec_[i].cycles_left -= 1; - } - - if ((dram_req_vec_[i].cycles_left == 0) && (!dequeue_valid)) { - dequeue_index = i; - dequeue_valid = true; - } - } - -#ifdef USE_MULTICORE - - if (!dram_stalled_) { - if (vortex_->dram_req_read) { - // Need to add an element - dram_req_t dram_req; - dram_req.cycles_left = DRAM_LATENCY; - dram_req.base_addr = vortex_->dram_req_addr; - dram_req.data = (unsigned *)malloc(GLOBAL_BLOCK_SIZE_BYTES); - - for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) { - unsigned curr_addr = dram_req.base_addr + (i * 4); - unsigned data_rd; - ram_->getWord(curr_addr, &data_rd); - dram_req.data[i] = data_rd; - } - dram_req_vec_.push_back(dram_req); - } - - if (vortex_->dram_req_write) { - unsigned base_addr = vortex_->dram_req_addr; - - for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) { - unsigned curr_addr = base_addr + (i * 4); - unsigned data_wr = vortex_->dram_req_data[i]; - ram_->writeWord(curr_addr, &data_wr); - } - } - } - - if (vortex_->dram_rsp_ready && dequeue_valid) { - vortex_->dram_rsp_valid = 1; - vortex_->dram_rsp_addr = dram_req_vec_[dequeue_index].base_addr; - - for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) { - vortex_->dram_rsp_data[i] = dram_req_vec_[dequeue_index].data[i]; - } - free(dram_req_vec_[dequeue_index].data); - - dram_req_vec_.erase(dram_req_vec_.begin() + dequeue_index); - } else { - vortex_->dram_rsp_valid = 0; - vortex_->dram_rsp_addr = 0; - } - -#else - - if (!dram_stalled_) { - if (vortex_->dram_req_read) { - // Need to add an element - dram_req_t dram_req; - dram_req.cycles_left = DRAM_LATENCY; - dram_req.base_addr = vortex_->dram_req_addr; - dram_req.data = (unsigned *)malloc(GLOBAL_BLOCK_SIZE_BYTES); - - for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) { - unsigned curr_addr = dram_req.base_addr + (i * 4); - unsigned data_rd; - ram_->getWord(curr_addr, &data_rd); - dram_req.data[i] = data_rd; - } - dram_req_vec_.push_back(dram_req); - } - - if (vortex_->dram_req_write) { - unsigned base_addr = vortex_->dram_req_addr; - - for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) { - unsigned curr_addr = base_addr + (i * 4); - unsigned data_wr = vortex_->dram_req_data[i]; - ram_->writeWord(curr_addr, &data_wr); - } - } - } - - if (vortex_->dram_rsp_ready && dequeue_valid) { - vortex_->dram_rsp_valid = 1; - vortex_->dram_rsp_addr = dram_req_vec_[dequeue_index].base_addr; - - for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) { - vortex_->dram_rsp_data[i] = dram_req_vec_[dequeue_index].data[i]; - } - free(dram_req_vec_[dequeue_index].data); - - dram_req_vec_.erase(dram_req_vec_.begin() + dequeue_index); - } else { - vortex_->dram_rsp_valid = 0; - vortex_->dram_rsp_addr = 0; - } - -#endif - -#ifdef USE_MULTICORE - vortex_->dram_req_full = dram_stalled_; -#else - vortex_->dram_req_full = dram_stalled_; -#endif -} - void Simulator::io_handler() { #ifdef USE_MULTICORE bool io_valid = false; @@ -309,7 +266,6 @@ void Simulator::send_snoops(uint32_t mem_addr, uint32_t size) { auto aligned_addr_start = GLOBAL_BLOCK_SIZE_BYTES * (mem_addr / GLOBAL_BLOCK_SIZE_BYTES); auto aligned_addr_end = GLOBAL_BLOCK_SIZE_BYTES * ((mem_addr + size + GLOBAL_BLOCK_SIZE_BYTES - 1) / GLOBAL_BLOCK_SIZE_BYTES); -#ifdef USE_MULTICORE // submit snoop requests for the needed blocks vortex_->llc_snp_req_addr = aligned_addr_start; vortex_->llc_snp_req_valid = false; @@ -325,37 +281,13 @@ void Simulator::send_snoops(uint32_t mem_addr, uint32_t size) { vortex_->llc_snp_req_valid = true; } } -#else - // submit snoop requests for the needed blocks - vortex_->snp_req_addr = aligned_addr_start; - vortex_->snp_req_valid = false; - for (;;) { - this->step(); - if (vortex_->snp_req_valid) { - vortex_->snp_req_valid = false; - if (vortex_->snp_req_addr >= aligned_addr_end) - break; - vortex_->snp_req_addr += GLOBAL_BLOCK_SIZE_BYTES; - } - if (!vortex_->snp_req_full) { - vortex_->snp_req_valid = true; - } - } -#endif } void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) { printf("[sim] total cycles: %ld\n", this->total_cycles_); - - // send snoops for L1 flush + // send snoop requests to the caches this->send_snoops(mem_addr, size); this->wait(PIPELINE_FLUSH_LATENCY); - -// #if NUM_CORES != 1 - // send snoops for L2 flush - // this->send_snoops(mem_addr, size); - // this->wait(PIPELINE_FLUSH_LATENCY); -// #endif } bool Simulator::run() { diff --git a/hw/simulate/simulator.h b/hw/simulate/simulator.h index 422c41bb..7acf2ff0 100644 --- a/hw/simulate/simulator.h +++ b/hw/simulate/simulator.h @@ -19,7 +19,7 @@ #include #include -#define ENABLE_DRAM_STALLS +//#define ENABLE_DRAM_STALLS #define DRAM_LATENCY 200 #define DRAM_RQ_SIZE 16 #define DRAM_STALLS_MODULO 16 @@ -55,7 +55,7 @@ private: void send_snoops(uint32_t mem_addr, uint32_t size); void wait(uint32_t cycles); - int64_t total_cycles_; + uint64_t total_cycles_; bool dram_stalled_; bool I_dram_stalled_; std::vector dram_req_vec_; diff --git a/hw/simulate/testbench.cpp b/hw/simulate/testbench.cpp index f8e2f0b4..1861192d 100644 --- a/hw/simulate/testbench.cpp +++ b/hw/simulate/testbench.cpp @@ -12,7 +12,7 @@ int main(int argc, char **argv) Verilated::commandArgs(argc, argv); -#define ALL_TESTS +//#define ALL_TESTS #ifdef ALL_TESTS bool passed = true;