diff --git a/rtl/VX_back_end.v b/rtl/VX_back_end.v index fdbcfd5f..cbdd452e 100644 --- a/rtl/VX_back_end.v +++ b/rtl/VX_back_end.v @@ -27,6 +27,7 @@ assign VX_writeback_inter.rd = VX_writeback_temp.rd; assign VX_writeback_inter.write_data = VX_writeback_temp.write_data; assign VX_writeback_inter.wb_valid = VX_writeback_temp.wb_valid; assign VX_writeback_inter.wb_warp_num = VX_writeback_temp.wb_warp_num; +assign VX_writeback_inter.wb_pc = VX_writeback_temp.wb_pc; // assign VX_writeback_inter(VX_writeback_temp); diff --git a/rtl/VX_cache/VX_bank.v b/rtl/VX_cache/VX_bank.v index f12018da..d2b24462 100644 --- a/rtl/VX_cache/VX_bank.v +++ b/rtl/VX_cache/VX_bank.v @@ -294,11 +294,22 @@ module VX_bank ); wire stall_bank_pipe; + reg is_fill_in_pipe; + genvar p_stage; + always @(*) begin + assign is_fill_in_pipe = 0; + for (p_stage = 0; p_stage < STAGE_1_CYCLES; p_stage=p_stage+1) begin + if (is_fill_st1[p_stage]) assign is_fill_in_pipe = 1; + end + + if (is_fill_st2) assign is_fill_in_pipe = 1; + end + assign dfpq_pop = !dfpq_empty && !stall_bank_pipe && !dfpq_hazard_st0; assign mrvq_pop = !dfpq_pop && mrvq_valid_st0 && !stall_bank_pipe && !mrvq_hazard_st0; - assign reqq_pop = !mrvq_pop && !reqq_empty && reqq_req_st0 && !stall_bank_pipe && !is_fill_st1[0] && !reqq_hazard_st0; + assign reqq_pop = !mrvq_pop && !reqq_empty && reqq_req_st0 && !stall_bank_pipe && !is_fill_st1[0] && !(reqq_hazard_st0 || (mrvq_valid_st0 && mrvq_hazard_st0)) && !is_fill_in_pipe; assign snrq_pop = !reqq_pop && snrq_valid_st0 && !stall_bank_pipe && !snrq_hazard_st0; @@ -495,14 +506,15 @@ module VX_bank // Enqueue to miss reserv if it's a valid miss - assign miss_add = valid_st2 && miss_st2; + assign miss_add = valid_st2 && miss_st2 && !stall_bank_pipe && !mrvq_full && !(dirty_st2 && dwbq_full); + assign miss_add_pc = pc_st2; assign miss_add_addr = addr_st2; assign miss_add_data = writeword_st2; assign {miss_add_rd, miss_add_wb, miss_add_warp_num, miss_add_mem_read, miss_add_mem_write, miss_add_tid} 
= inst_meta_st2; // Enqueue to CWB Queue - wire cwbq_push = (valid_st2 && !miss_st2); + wire cwbq_push = (valid_st2 && !miss_st2) && !cwbq_full & !llvq_full; wire [31:0] cwbq_data = readword_st2; wire [`vx_clog2(NUMBER_REQUESTS)-1:0] cwbq_tid = miss_add_tid; wire [4:0] cwbq_rd = miss_add_rd; @@ -527,8 +539,8 @@ module VX_bank ); // Enqueue to DWB Queue - wire dwbq_push = (valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2; - wire[31:0] dwbq_req_addr = {readtag_st2, addr_st2[`LINE_SELECT_ADDR_END:0]}; + wire dwbq_push = ((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && !dwbq_full && !(!fill_saw_dirty_st2 && mrvq_full); + wire[31:0] dwbq_req_addr = {readtag_st2, addr_st2[`LINE_SELECT_ADDR_END:0]} & `BASE_ADDR_MASK; wire[`BANK_LINE_SIZE_RNG][31:0] dwbq_req_data = readdata_st2; wire dwbq_empty; wire dwbq_full; @@ -536,6 +548,7 @@ module VX_bank wire invalidate_fill; wire possible_fill = valid_st2 && miss_st2; + wire[31:0] fill_invalidator_addr = addr_st2 & `BASE_ADDR_MASK; VX_fill_invalidator #( .CACHE_SIZE_BYTES (CACHE_SIZE_BYTES), .BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES), @@ -560,16 +573,16 @@ module VX_bank .reset (reset), .possible_fill (possible_fill), .success_fill (is_fill_st2), - .fill_addr (addr_st2), + .fill_addr (fill_invalidator_addr), .invalidate_fill (invalidate_fill) ); // Enqueu in dram_fill_req - assign dram_fill_req = valid_st2 && miss_st2 && !invalidate_fill; + assign dram_fill_req = valid_st2 && miss_st2 && !invalidate_fill && !dram_fill_req_queue_full; assign dram_because_of_snp = is_snp_st2 && valid_st2 && miss_st2; assign dram_snp_full = snrq_full && snp_req; - assign dram_fill_req_addr = addr_st2; + assign dram_fill_req_addr = addr_st2 & `BASE_ADDR_MASK; assign dram_wb_req = !dwbq_empty; VX_generic_queue_ll #(.DATAW( 32 + (`BANK_LINE_SIZE_WORDS * 32)), .SIZE(DWBQ_SIZE)) dwb_queue( @@ -589,7 +602,7 @@ module VX_bank // Lower Cache Hit wire llvq_empty; wire llvq_full; - wire llvq_push = valid_st2 && !miss_st2; + wire 
llvq_push = valid_st2 && !miss_st2 && !llvq_full && !cwbq_full; wire[`BANK_LINE_SIZE_RNG][31:0] llvq_push_data = readdata_st2; wire[31:0] llvq_addr = addr_st2; wire[`vx_clog2(NUMBER_REQUESTS)-1:0] llvq_tid = miss_add_tid; @@ -608,7 +621,7 @@ module VX_bank ); - assign stall_bank_pipe = (cwbq_push && cwbq_full) || (dwbq_push && dwbq_full) || (miss_add && mrvq_full) || (dram_fill_req && dram_fill_req_queue_full); + assign stall_bank_pipe = (cwbq_push && cwbq_full) || (llvq_push && llvq_full) || (dwbq_push && dwbq_full) || (miss_add && mrvq_full) || (dram_fill_req && dram_fill_req_queue_full); endmodule diff --git a/rtl/VX_cache/VX_cache.v b/rtl/VX_cache/VX_cache.v index c8f5d352..855b0905 100644 --- a/rtl/VX_cache/VX_cache.v +++ b/rtl/VX_cache/VX_cache.v @@ -254,7 +254,7 @@ module VX_cache .FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE), .SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES) ) - VX_cache_core_req_bank_sel + VX_cache_core_wb_sel_merge ( .per_bank_wb_valid (per_bank_wb_valid), .per_bank_wb_tid (per_bank_wb_tid), diff --git a/rtl/VX_cache/VX_cache_config.v b/rtl/VX_cache/VX_cache_config.v index 5dba0c7e..68d80446 100644 --- a/rtl/VX_cache/VX_cache_config.v +++ b/rtl/VX_cache/VX_cache_config.v @@ -115,7 +115,7 @@ `define TAG_SELECT_SIZE_RNG `TAG_SELECT_SIZE_END-1:0 -`define BASE_ADDR_MASK (~((1<<`WORD_SELECT_ADDR_END)-1)) +`define BASE_ADDR_MASK (~((1<<(`WORD_SELECT_ADDR_END+1))-1)) `endif diff --git a/rtl/VX_cache/VX_cache_dfq_queue.v b/rtl/VX_cache/VX_cache_dfq_queue.v index b2d4743a..1b95dd02 100644 --- a/rtl/VX_cache/VX_cache_dfq_queue.v +++ b/rtl/VX_cache/VX_cache_dfq_queue.v @@ -91,7 +91,7 @@ module VX_cache_dfq_queue ); - assign qual_bank_dram_fill_req = use_empty ? out_per_bank_dram_fill_req : use_per_bank_dram_fill_req; + assign qual_bank_dram_fill_req = use_empty ? (out_per_bank_dram_fill_req & {NUMBER_BANKS{!o_empty}}) : (use_per_bank_dram_fill_req & {NUMBER_BANKS{!use_empty}}); assign qual_bank_dram_fill_req_addr = use_empty ? 
out_per_bank_dram_fill_req_addr : use_per_bank_dram_fill_req_addr; wire[`vx_clog2(NUMBER_BANKS)-1:0] qual_request_index; diff --git a/rtl/VX_cache/VX_cache_dram_req_arb.v b/rtl/VX_cache/VX_cache_dram_req_arb.v index 01e699ed..b2cfab9a 100644 --- a/rtl/VX_cache/VX_cache_dram_req_arb.v +++ b/rtl/VX_cache/VX_cache_dram_req_arb.v @@ -101,7 +101,7 @@ module VX_cache_dram_req_arb ); - assign per_bank_dram_wb_queue_pop = per_bank_dram_wb_req & (~(1 << dwb_bank)); + assign per_bank_dram_wb_queue_pop = per_bank_dram_wb_req & ((1 << dwb_bank)); assign dram_req = dwb_valid || dfqq_req; diff --git a/rtl/VX_cache/VX_cache_miss_resrv.v b/rtl/VX_cache/VX_cache_miss_resrv.v index d175aeeb..fcbd5ba7 100644 --- a/rtl/VX_cache/VX_cache_miss_resrv.v +++ b/rtl/VX_cache/VX_cache_miss_resrv.v @@ -95,7 +95,7 @@ module VX_cache_miss_resrv wire enqueue_possible = !miss_resrv_full; - wire[`vx_clog2(MRVQ_SIZE)-1:0] enqueue_index = tail_ptr; + wire[`vx_clog2(MRVQ_SIZE)-1:0] enqueue_index = tail_ptr; reg[MRVQ_SIZE-1:0] make_ready; genvar curr_e; diff --git a/rtl/VX_cache/VX_fill_invalidator.v b/rtl/VX_cache/VX_fill_invalidator.v index 33e6eead..e5c0ae10 100644 --- a/rtl/VX_cache/VX_fill_invalidator.v +++ b/rtl/VX_cache/VX_fill_invalidator.v @@ -98,14 +98,12 @@ module VX_fill_invalidator wire enqueue_found; VX_generic_priority_encoder #(.N(FILL_INVALIDAOR_SIZE)) VX_sel_bank( - .valids(fills_active), + .valids(~fills_active), .index (enqueue_index), .found (enqueue_found) ); - reg[FILL_INVALIDAOR_SIZE-1:0] new_valids; - always @(posedge clk) begin @@ -113,7 +111,7 @@ module VX_fill_invalidator fills_active <= 0; fills_address <= 0; end else begin - if (enqueue_found && !invalidate_fill) begin + if (possible_fill && !invalidate_fill) begin fills_active[enqueue_index] <= 1; fills_address[enqueue_index] <= fill_addr; end diff --git a/rtl/VX_cache/VX_mrv_queue.v b/rtl/VX_cache/VX_mrv_queue.v new file mode 100644 index 00000000..36fedd7a --- /dev/null +++ b/rtl/VX_cache/VX_mrv_queue.v @@ -0,0 +1,122 
@@
+// Synchronous FIFO holding SIZE entries of DATAW bits; at most one push and
+// one pop per clock. A push is accepted only while !full, a pop only while
+// !empty, and both may be accepted in the same cycle.
+//   SIZE == 0 : stub - always empty, never full, out_data tied to 0.
+//   SIZE == 1 : single holding register.
+//   SIZE >= 2 : circular buffer with registered out_data plus a bypass path.
+// Pointers wrap at SIZE, so SIZE does not have to be a power of two.
+module VX_mrv_queue #(
+    parameter DATAW = 4,
+    parameter SIZE  = 277
+) (
+    input  wire            clk,
+    input  wire            reset,     // synchronous, active high
+    input  wire            push,
+    input  wire[DATAW-1:0] in_data,
+    input  wire            pop,
+    output wire[DATAW-1:0] out_data,
+    output wire            empty,
+    output wire            full
+);
+
+    if (SIZE == 0) begin
+        // Degenerate queue: nothing is ever stored.
+        assign empty    = 1;
+        assign out_data = 0;
+        assign full     = 0;
+    end else begin
+        reg[DATAW-1:0] data[SIZE-1:0], curr_r, head_r;
+        reg[$clog2(SIZE+1)-1:0] size_r;                   // occupancy
+        reg[$clog2(SIZE)-1:0]   wr_ctr_r;                 // next slot to write
+        reg[$clog2(SIZE)-1:0]   rd_ptr_r, rd_next_ptr_r;  // head slot, slot after it
+        reg  empty_r, full_r, bypass_r;
+        wire reading, writing;
+
+        // A transfer only takes place when the queue can honour it.
+        assign reading = pop && !empty;
+        assign writing = push && !full;
+
+        if (SIZE == 1) begin
+            always @(posedge clk) begin
+                if (reset) begin
+                    size_r <= 0;
+                end else begin
+                    if (writing && !reading) begin
+                        size_r <= 1;
+                    end else if (reading && !writing) begin
+                        size_r <= 0;
+                    end
+                    if (writing)
+                        head_r <= in_data;
+                end
+            end
+
+            assign out_data = head_r;
+            assign empty    = (size_r == 0);
+            // A pop in this cycle frees the register, so a push is allowed.
+            assign full     = (size_r != 0) && !pop;
+        end else begin
+            // Write pointer. Wrap explicitly at SIZE: natural overflow of the
+            // counter only lands on 0 for power-of-two SIZE; otherwise it would
+            // run past data[SIZE-1] (e.g. the default SIZE of 277).
+            always @(posedge clk) begin
+                if (reset) begin
+                    wr_ctr_r <= 0;
+                end else if (writing) begin
+                    wr_ctr_r <= (wr_ctr_r == SIZE-1) ? 0 : wr_ctr_r + 1;
+                end
+            end
+
+            // Occupancy counter and registered empty/full flags.
+            always @(posedge clk) begin
+                if (reset) begin
+                    size_r  <= 0;
+                    empty_r <= 1;
+                    full_r  <= 0;
+                end else begin
+                    if (writing && !reading) begin
+                        size_r  <= size_r + 1;
+                        empty_r <= 0;
+                        if (size_r == SIZE-1)
+                            full_r <= 1;
+                    end else if (reading && !writing) begin
+                        size_r <= size_r - 1;
+                        if (size_r == 1)
+                            empty_r <= 1;
+                        full_r <= 0;
+                    end
+                end
+            end
+
+            // Storage write; the array itself is not reset.
+            always @(posedge clk) begin
+                if (writing)
+                    data[wr_ctr_r] <= in_data;
+            end
+
+            // Read side: step the head pointers (wrapping at SIZE) and register the head word.
+            always @(posedge clk) begin
+                if (reset) begin
+                    rd_ptr_r      <= 0;
+                    rd_next_ptr_r <= 1;
+                    bypass_r      <= 0;
+                end else begin
+                    if (reading) begin
+                        rd_ptr_r      <= rd_next_ptr_r;
+                        rd_next_ptr_r <= (rd_next_ptr_r == SIZE-1) ? 0 : rd_next_ptr_r + 1;
+                    end
+
+                    // A word written into an empty queue, or into a one-entry
+                    // queue that is being popped, is the next head: show curr_r.
+                    bypass_r <= writing && (empty_r || (1 == size_r) && reading);
+                    curr_r   <= in_data;
+                    head_r   <= data[reading ? rd_next_ptr_r : rd_ptr_r];
+                end
+            end
+
+            assign out_data = bypass_r ? curr_r : head_r;
+            assign empty    = empty_r;
+            assign full     = full_r;
+        end
+    end
+endmodule
\ No newline at end of file diff --git a/rtl/VX_cache/VX_tag_data_access.v b/rtl/VX_cache/VX_tag_data_access.v index 2ff175cd..71710636 100644 --- a/rtl/VX_cache/VX_tag_data_access.v +++ b/rtl/VX_cache/VX_tag_data_access.v @@ -71,12 +71,12 @@ module VX_tag_data_access ); - reg[`BANK_LINE_SIZE_RNG][31:0] readdata_st[STAGE_1_CYCLES-1:0]; + reg[`BANK_LINE_SIZE_RNG][31:0] readdata_st[STAGE_1_CYCLES-2:0]; - reg read_valid_st1c[STAGE_1_CYCLES-1:0]; - reg read_dirty_st1c[STAGE_1_CYCLES-1:0]; - reg[`TAG_SELECT_SIZE_RNG] read_tag_st1c [STAGE_1_CYCLES-1:0]; - reg[`BANK_LINE_SIZE_RNG][31:0] read_data_st1c [STAGE_1_CYCLES-1:0]; + reg read_valid_st1c[STAGE_1_CYCLES-2:0]; + reg read_dirty_st1c[STAGE_1_CYCLES-2:0]; + reg[`TAG_SELECT_SIZE_RNG] read_tag_st1c [STAGE_1_CYCLES-2:0]; + reg[`BANK_LINE_SIZE_RNG][31:0] read_data_st1c [STAGE_1_CYCLES-2:0]; wire qual_read_valid_st1; @@ -142,7 +142,7 @@ module VX_tag_data_access genvar curr_stage; generate - for (curr_stage = 1; curr_stage < STAGE_1_CYCLES; curr_stage = curr_stage + 1) begin + for (curr_stage = 1; curr_stage < STAGE_1_CYCLES-2; curr_stage = curr_stage + 1) begin VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`BANK_LINE_SIZE_WORDS*32) )) s0_1_cc ( .clk (clk), .reset(reset), @@ -155,13 +155,13 @@ module VX_tag_data_access endgenerate - assign use_read_valid_st1e = read_valid_st1c[STAGE_1_CYCLES-1]; - assign use_read_dirty_st1e = read_dirty_st1c[STAGE_1_CYCLES-1]; - assign use_read_tag_st1e = read_tag_st1c [STAGE_1_CYCLES-1]; + assign use_read_valid_st1e = read_valid_st1c[STAGE_1_CYCLES-2]; + assign use_read_dirty_st1e = read_dirty_st1c[STAGE_1_CYCLES-2]; + assign use_read_tag_st1e = read_tag_st1c [STAGE_1_CYCLES-2]; genvar curr_w; - for
(curr_w = 0; curr_w < `BANK_LINE_SIZE_WORDS; curr_w = curr_w+1) assign use_read_data_st1e[curr_w][31:0] = read_data_st1c[STAGE_1_CYCLES-1][curr_w][31:0]; - // assign use_read_data_st1e = read_data_st1c [STAGE_1_CYCLES-1]; + for (curr_w = 0; curr_w < `BANK_LINE_SIZE_WORDS; curr_w = curr_w+1) assign use_read_data_st1e[curr_w][31:0] = read_data_st1c[STAGE_1_CYCLES-2][curr_w][31:0]; + // assign use_read_data_st1e = read_data_st1c [STAGE_1_CYCLES-2]; /////////////////////// LOAD LOGIC /////////////////// @@ -179,12 +179,12 @@ module VX_tag_data_access wire b2 = (byte_select == 2); wire b3 = (byte_select == 3); - wire[31:0] w0 = read_data_st1c[STAGE_1_CYCLES-1][0][31:0]; - wire[31:0] w1 = read_data_st1c[STAGE_1_CYCLES-1][1][31:0]; - wire[31:0] w2 = read_data_st1c[STAGE_1_CYCLES-1][2][31:0]; - wire[31:0] w3 = read_data_st1c[STAGE_1_CYCLES-1][3][31:0]; + wire[31:0] w0 = read_data_st1c[STAGE_1_CYCLES-2][0][31:0]; + wire[31:0] w1 = read_data_st1c[STAGE_1_CYCLES-2][1][31:0]; + wire[31:0] w2 = read_data_st1c[STAGE_1_CYCLES-2][2][31:0]; + wire[31:0] w3 = read_data_st1c[STAGE_1_CYCLES-2][3][31:0]; - wire[31:0] data_unmod = read_data_st1c[STAGE_1_CYCLES-1][block_offset][31:0]; + wire[31:0] data_unmod = read_data_st1c[STAGE_1_CYCLES-2][block_offset][31:0]; wire[31:0] data_unQual = (b0 || lw) ? (data_unmod) : b1 ? (data_unmod >> 8) : @@ -231,14 +231,14 @@ module VX_tag_data_access wire[3:0] sh_mask = (b0 ? 
4'b0011 : 4'b1100); wire should_write = (sw || sb || sh) && valid_req_st1e && use_read_valid_st1e && !miss_st1e; - wire force_write = writefill_st1e && valid_req_st1e && miss_st1e; + wire force_write = writefill_st1e && valid_req_st1e && (!use_read_valid_st1e || (use_read_valid_st1e && !miss_st1e)); wire[`BANK_LINE_SIZE_RNG][3:0] we; wire[`BANK_LINE_SIZE_RNG][31:0] data_write; genvar g; generate for (g = 0; g < `BANK_LINE_SIZE_WORDS; g = g + 1) begin : write_enables - wire normal_write = (block_offset == g) && should_write; + wire normal_write = (block_offset == g) && should_write && !writefill_st1e; assign we[g] = (force_write) ? 4'b1111 : (normal_write && sw) ? 4'b1111 : diff --git a/rtl/VX_fetch.v b/rtl/VX_fetch.v index 00330676..923a7294 100644 --- a/rtl/VX_fetch.v +++ b/rtl/VX_fetch.v @@ -27,21 +27,11 @@ module VX_fetch ( // Only reason this is there is because there is a hidden assumption that decode is exactly after fetch - reg stall_might_be_branch; - always @(posedge clk) begin - if (reset) begin - stall_might_be_branch <= 0; - end else if ((stall_might_be_branch == 1'b1) && !icache_stage_delay && !schedule_delay) begin - stall_might_be_branch <= 0; - end else if (scheduled_warp == 1'b1) begin - stall_might_be_branch <= 1'b1; - end - end // Locals - assign pipe_stall = schedule_delay || icache_stage_delay || (stall_might_be_branch && (icache_stage_wid == warp_num)) ; + assign pipe_stall = schedule_delay || icache_stage_delay; VX_warp_scheduler warp_scheduler( .clk (clk), @@ -68,6 +58,10 @@ module VX_fetch ( .wstall (VX_wstall.wstall), .wstall_warp_num (VX_wstall.warp_num), + // Lock/release Stuff + .icache_stage_valids(icache_stage_valids), + .icache_stage_wid (icache_stage_wid), + // Join .is_join (VX_join.is_join), .join_warp_num (VX_join.join_warp_num), @@ -100,7 +94,7 @@ module VX_fetch ( ); assign fe_inst_meta_fi.warp_num = warp_num; - assign fe_inst_meta_fi.valid = thread_mask && {`NT{!stall_might_be_branch}}; + assign fe_inst_meta_fi.valid = 
thread_mask; assign fe_inst_meta_fi.instruction = 32'h0; assign fe_inst_meta_fi.inst_pc = warp_pc; diff --git a/rtl/VX_icache_stage.v b/rtl/VX_icache_stage.v index 3c6b3c3d..54233e1b 100644 --- a/rtl/VX_icache_stage.v +++ b/rtl/VX_icache_stage.v @@ -30,7 +30,7 @@ module VX_icache_stage ( assign fe_inst_meta_id.valid = fe_inst_meta_fi.valid & {`NT{!icache_stage_delay}}; assign icache_stage_wid = fe_inst_meta_fi.warp_num; - assign icache_stage_valids = fe_inst_meta_fi.valid; + assign icache_stage_valids = fe_inst_meta_fi.valid & {`NT{!icache_stage_delay}}; endmodule \ No newline at end of file diff --git a/rtl/VX_scheduler.v b/rtl/VX_scheduler.v index c6247ab8..ed796e65 100644 --- a/rtl/VX_scheduler.v +++ b/rtl/VX_scheduler.v @@ -9,11 +9,14 @@ module VX_scheduler ( VX_frE_to_bckE_req_inter VX_bckE_req, VX_wb_inter VX_writeback_inter, - output wire schedule_delay + output wire schedule_delay, + output wire is_empty ); + reg[31:0] count_valid; + assign is_empty = count_valid == 0; reg[31:0][`NT-1:0] rename_table[`NW-1:0]; @@ -67,6 +70,10 @@ module VX_scheduler ( end else begin if (valid_wb ) rename_table[VX_writeback_inter.wb_warp_num][VX_writeback_inter.rd] <= rename_table[VX_writeback_inter.wb_warp_num][VX_writeback_inter.rd] & (~VX_writeback_inter.wb_valid); if (!schedule_delay && wb_inc) rename_table[VX_bckE_req.warp_num ][VX_bckE_req.rd ] <= VX_bckE_req.valid; + + if (valid_wb && ((rename_table[VX_writeback_inter.wb_warp_num][VX_writeback_inter.rd] & (~VX_writeback_inter.wb_valid)) == 0)) count_valid = count_valid - 1; + if (!schedule_delay && wb_inc) count_valid = count_valid + 1; + end end diff --git a/rtl/VX_warp_scheduler.v b/rtl/VX_warp_scheduler.v index 8fcc5a32..42014786 100644 --- a/rtl/VX_warp_scheduler.v +++ b/rtl/VX_warp_scheduler.v @@ -54,7 +54,10 @@ module VX_warp_scheduler ( output wire[`NW_M1:0] warp_num, output wire[31:0] warp_pc, output wire out_ebreak, - output wire scheduled_warp + output wire scheduled_warp, + + input wire[`NW_M1:0] 
icache_stage_wid, + input wire[`NT-1:0] icache_stage_valids ); @@ -76,8 +79,10 @@ module VX_warp_scheduler ( reg[`NW-1:0] warp_active; reg[`NW-1:0] warp_stalled; - reg[`NW-1:0] visible_active; - wire[`NW-1:0] use_active; + reg [`NW-1:0] visible_active; + wire[`NW-1:0] use_active; + + reg [`NW-1:0] warp_lock; wire wstall_this_cycle; @@ -188,7 +193,7 @@ module VX_warp_scheduler ( // Refilling active warps if (update_visible_active) begin - visible_active <= warp_active & (~warp_stalled) & (~total_barrier_stall); + visible_active <= warp_active & (~warp_stalled) & (~total_barrier_stall) & ~warp_lock; end // Don't change state if stall @@ -208,6 +213,15 @@ module VX_warp_scheduler ( if (branch_dir) warp_pcs[branch_warp_num] <= branch_dest; warp_stalled[branch_warp_num] <= 0; end + + // Lock/Release + if (scheduled_warp && !stall) begin + warp_lock[warp_num] <= 1'b1; + end + if (|icache_stage_valids && !stall) begin + warp_lock[icache_stage_wid] <= 1'b0; + end + end end @@ -294,7 +308,7 @@ module VX_warp_scheduler ( assign new_pc = warp_pc + 4; - assign use_active = (count_visible_active < 1) ? (warp_active & (~warp_stalled) & (~total_barrier_stall)) : visible_active; + assign use_active = (count_visible_active < 1) ? 
(warp_active & (~warp_stalled) & (~total_barrier_stall) & (~warp_lock)) : visible_active; // Choosing a warp to schedule VX_priority_encoder choose_schedule( diff --git a/rtl/Vortex.v b/rtl/Vortex.v index cdfa50c2..eacfc257 100644 --- a/rtl/Vortex.v +++ b/rtl/Vortex.v @@ -40,6 +40,11 @@ module Vortex output wire out_ebreak ); + wire scheduler_empty; + wire out_ebreak_unqual; + + assign out_ebreak = out_ebreak_unqual && (scheduler_empty && 1); + reg[31:0] icache_banks = `ICACHE_BANKS; reg[31:0] icache_num_words_per_block = `ICACHE_NUM_WORDS_PER_BLOCK; @@ -63,6 +68,7 @@ module Vortex // Dcache Interface VX_gpu_dcache_res_inter #(.NUMBER_REQUESTS(`DNUMBER_REQUESTS)) VX_dcache_rsp(); VX_gpu_dcache_req_inter #(.NUMBER_REQUESTS(`DNUMBER_REQUESTS)) VX_dcache_req(); + VX_gpu_dcache_req_inter #(.NUMBER_REQUESTS(`DNUMBER_REQUESTS)) VX_dcache_req_qual(); VX_gpu_dcache_dram_req_inter #(.BANK_LINE_SIZE_WORDS(`DBANK_LINE_SIZE_WORDS)) VX_gpu_dcache_dram_req(); VX_gpu_dcache_dram_res_inter #(.BANK_LINE_SIZE_WORDS(`DBANK_LINE_SIZE_WORDS)) VX_gpu_dcache_dram_res(); @@ -88,10 +94,21 @@ module Vortex endgenerate wire temp_io_valid = (!memory_delay) && (|VX_dcache_req.core_req_valid) && (VX_dcache_req.core_req_mem_write != `NO_MEM_WRITE) && (VX_dcache_req.core_req_addr[0] == 32'h00010000); - wire[31:0] temp_io_data = VX_dcache_req.core_req_valid[0]; + wire[31:0] temp_io_data = VX_dcache_req.core_req_writedata[0]; assign io_valid = temp_io_valid; assign io_data = temp_io_data; + assign VX_dcache_req_qual.core_req_valid = VX_dcache_req.core_req_valid & {`NT{~io_valid}}; + assign VX_dcache_req_qual.core_req_addr = VX_dcache_req.core_req_addr; + assign VX_dcache_req_qual.core_req_writedata = VX_dcache_req.core_req_writedata; + assign VX_dcache_req_qual.core_req_mem_read = VX_dcache_req.core_req_mem_read; + assign VX_dcache_req_qual.core_req_mem_write = VX_dcache_req.core_req_mem_write; + assign VX_dcache_req_qual.core_req_rd = VX_dcache_req.core_req_rd; + assign 
VX_dcache_req_qual.core_req_wb = VX_dcache_req.core_req_wb; + assign VX_dcache_req_qual.core_req_warp_num = VX_dcache_req.core_req_warp_num; + assign VX_dcache_req_qual.core_req_pc = VX_dcache_req.core_req_pc; + assign VX_dcache_req_qual.core_no_wb_slot = VX_dcache_req.core_no_wb_slot; + VX_icache_response_inter icache_response_fe(); VX_icache_request_inter icache_request_fe(); @@ -145,7 +162,7 @@ VX_front_end vx_front_end( .icache_request_fe (icache_request_fe), .VX_jal_rsp (VX_jal_rsp), .VX_branch_rsp (VX_branch_rsp), - .fetch_ebreak (out_ebreak) + .fetch_ebreak (out_ebreak_unqual) ); VX_scheduler schedule( @@ -156,7 +173,8 @@ VX_scheduler schedule( .gpr_stage_delay (gpr_stage_delay), .VX_bckE_req (VX_bckE_req), .VX_writeback_inter(VX_writeback_inter), - .schedule_delay (schedule_delay) + .schedule_delay (schedule_delay), + .is_empty (scheduler_empty) ); VX_back_end vx_back_end( @@ -184,7 +202,7 @@ VX_dmem_controller VX_dmem_controller( .VX_dram_req_rsp_icache (VX_dram_req_rsp_icache), .VX_icache_req (icache_request_fe), .VX_icache_rsp (icache_response_fe), - .VX_dcache_req (VX_dcache_req), + .VX_dcache_req (VX_dcache_req_qual), .VX_dcache_rsp (VX_dcache_rsp) ); diff --git a/rtl/interfaces/VX_gpu_dcache_req_inter.v b/rtl/interfaces/VX_gpu_dcache_req_inter.v index 2c37f355..83b507fd 100644 --- a/rtl/interfaces/VX_gpu_dcache_req_inter.v +++ b/rtl/interfaces/VX_gpu_dcache_req_inter.v @@ -24,7 +24,7 @@ interface VX_gpu_dcache_req_inter wire [31:0] core_req_pc; // Can't WB - wire core_no_wb_slot; + wire core_no_wb_slot; endinterface