round robin warp scheduling

This commit is contained in:
Richard Yan
2024-04-16 23:03:00 -07:00
parent 7ae54bd280
commit 8de5470da4
4 changed files with 91 additions and 30 deletions

View File

@@ -1,5 +1,5 @@
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2024-01-04 01:43:02.432130
// Generated at 2024-04-08 12:40:13.594321
// Translated from ./rtl/VX_config.vh:
@@ -84,15 +84,15 @@
#endif
#ifndef NUM_CORES
#define NUM_CORES 1
#define NUM_CORES 2
#endif
#ifndef NUM_WARPS
#define NUM_WARPS 4
#define NUM_WARPS 8
#endif
#ifndef NUM_THREADS
#define NUM_THREADS 4
#define NUM_THREADS 8
#endif
#ifndef NUM_BARRIERS
@@ -141,6 +141,18 @@
#endif
#endif
#ifdef L2_ENABLE
#define L2_LINE_SIZE MEM_BLOCK_SIZE
#else
#define L2_LINE_SIZE L1_LINE_SIZE
#endif
#ifdef L3_ENABLE
#define L3_LINE_SIZE MEM_BLOCK_SIZE
#else
#define L3_LINE_SIZE L2_LINE_SIZE
#endif
#ifdef XLEN_64
#ifndef STARTUP_ADDR
@@ -168,7 +180,7 @@
#endif
#ifndef SMEM_LOG_SIZE
#define SMEM_LOG_SIZE 14
#define SMEM_LOG_SIZE 15
#endif
#ifndef IO_BASE_ADDR
@@ -196,13 +208,21 @@
#define STALL_TIMEOUT (100000 * (1 ** (L2_ENABLED + L3_ENABLED)))
#endif
#ifndef SV_DPI
#define DPI_DISABLE
#endif
#ifndef FPU_FPNEW
#ifndef FPU_DSP
#ifndef FPU_DPI
#ifdef SYNTHESIS
#define FPU_DSP
#else
#ifndef SYNTHESIS
#ifndef DPI_DISABLE
#define FPU_DPI
#else
#define FPU_DSP
#endif
#else
#define FPU_DSP
#endif
#endif
#endif
@@ -228,18 +248,18 @@
// Number of ALU units
#ifndef NUM_ALU_LANES
#define NUM_ALU_LANES UP(NUM_THREADS / 2)
#define NUM_ALU_LANES NUM_THREADS
#endif
#ifndef NUM_ALU_BLOCKS
#define NUM_ALU_BLOCKS UP(ISSUE_WIDTH / 1)
#define NUM_ALU_BLOCKS ISSUE_WIDTH
#endif
// Number of FPU units
#ifndef NUM_FPU_LANES
#define NUM_FPU_LANES UP(NUM_THREADS / 2)
#define NUM_FPU_LANES NUM_THREADS
#endif
#ifndef NUM_FPU_BLOCKS
#define NUM_FPU_BLOCKS UP(ISSUE_WIDTH / 1)
#define NUM_FPU_BLOCKS ISSUE_WIDTH
#endif
// Number of LSU units
@@ -254,16 +274,19 @@
// Size of Instruction Buffer
#ifndef IBUF_SIZE
#define IBUF_SIZE (2 * (NUM_WARPS / ISSUE_WIDTH))
#define IBUF_SIZE (8 * (NUM_WARPS / ISSUE_WIDTH))
#endif
// Size of LSU Request Queue
#ifndef LSUQ_SIZE
#define LSUQ_SIZE (2 * (NUM_THREADS / NUM_LSU_LANES))
#define LSUQ_SIZE (8 * (NUM_THREADS / NUM_LSU_LANES))
#endif
// LSU Duplicate Address Check
#ifdef LSU_DUP
#ifndef LSU_DUP_DISABLE
#define LSU_DUP_ENABLE
#endif
#ifdef LSU_DUP_ENABLE
#define LSU_DUP_ENABLED 1
#else
#define LSU_DUP_ENABLED 0
@@ -290,8 +313,8 @@
// Floating-Point Units ///////////////////////////////////////////////////////
// Size of FPU Request Queue
#ifndef FPU_REQ_QUEUE_SIZE
#define FPU_REQ_QUEUE_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES))
#ifndef FPUQ_SIZE
#define FPUQ_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES))
#endif
// FNCP Latency
@@ -382,7 +405,7 @@
// Number of Cache Units
#ifndef NUM_ICACHES
#define NUM_ICACHES UP(NUM_CORES / 4)
#define NUM_ICACHES UP(SOCKET_SIZE / 4)
#endif
// Cache Size
@@ -412,7 +435,7 @@
// Number of Associative Ways
#ifndef ICACHE_NUM_WAYS
#define ICACHE_NUM_WAYS 2
#define ICACHE_NUM_WAYS 1
#endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
@@ -431,7 +454,7 @@
// Number of Cache Units
#ifndef NUM_DCACHES
#define NUM_DCACHES UP(NUM_CORES / 4)
#define NUM_DCACHES UP(SOCKET_SIZE / 4)
#endif
// Cache Size
@@ -441,7 +464,7 @@
// Number of Banks
#ifndef DCACHE_NUM_BANKS
#define DCACHE_NUM_BANKS (NUM_LSU_LANES)
#define DCACHE_NUM_BANKS MIN(NUM_LSU_LANES, 4)
#endif
// Core Response Queue Size
@@ -466,7 +489,7 @@
// Number of Associative Ways
#ifndef DCACHE_NUM_WAYS
#define DCACHE_NUM_WAYS 2
#define DCACHE_NUM_WAYS 1
#endif
// SM Configurable Knobs //////////////////////////////////////////////////////
@@ -525,7 +548,7 @@
// Number of Associative Ways
#ifndef L2_NUM_WAYS
#define L2_NUM_WAYS 4
#define L2_NUM_WAYS 2
#endif
// L3cache Configurable Knobs /////////////////////////////////////////////////

View File

@@ -274,7 +274,7 @@
// Size of LSU Request Queue
`ifndef LSUQ_SIZE
`define LSUQ_SIZE (8 * (`NUM_THREADS / `NUM_LSU_LANES))
`define LSUQ_SIZE (2 * `NUM_WARPS * (`NUM_THREADS / `NUM_LSU_LANES))
`endif
// LSU Duplicate Address Check

View File

@@ -308,10 +308,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
VX_lzc #(
.N (`NUM_WARPS),
.REVERSE (1)
VX_lzc_rr #(
.N (`NUM_WARPS)
) wid_select (
.clk (clk),
.reset (reset),
.data_in (ready_warps),
.data_out (schedule_wid),
.valid_out (schedule_valid)

View File

@@ -21,7 +21,7 @@ module VX_lzc #(
) (
input wire [N-1:0] data_in,
output wire [LOGN-1:0] data_out,
output wire valid_out
output logic valid_out
);
if (N == 1) begin
@@ -33,11 +33,11 @@ module VX_lzc #(
end else begin
wire [N-1:0][LOGN-1:0] indices;
for (genvar i = 0; i < N; ++i) begin
assign indices[i] = REVERSE ? LOGN'(i) : LOGN'(N-1-i);
end
VX_find_first #(
.N (N),
.DATAW (LOGN),
@@ -51,5 +51,42 @@ module VX_lzc #(
end
endmodule
// Round-robin selector over an N-bit request vector.
//
// Parameters:
//   N         - number of request bits (default 2). Note that $clog2(1) is 0,
//               so N == 1 produces a [-1:0] index range; use N >= 2.
// Ports:
//   clk/reset - clock and synchronous reset for the priority pointer.
//   data_in   - request vector; bit k set means index k is requesting.
//   data_out  - index of the selected request; 0 when no bit is set.
//   valid_out - high when at least one bit of data_in is set.
//
// The search starts at current_idx and wraps modulo N. After every cycle in
// which a request is present, the pointer moves to the slot just past the
// granted index, so the index that was just selected gets the lowest priority
// on the next cycle.
module VX_lzc_rr #(
    parameter N = 2
) (
    input  wire                   clk,
    input  wire                   reset,
    input  wire [N-1:0]           data_in,
    output logic [$clog2(N)-1:0]  data_out,
    output logic                  valid_out
);
    // Position that currently holds the highest priority.
    logic [$clog2(N)-1:0] current_idx;

    // Combinational search. The loop walks the offsets from farthest to
    // nearest, so the assignment made for the smallest matching offset is the
    // last one executed and therefore wins. This selects the first set bit at
    // or after current_idx without needing a 'break' statement.
    always @(*) begin
        data_out = 0;
        for (int offset = N - 1; offset >= 0; offset -= 1) begin
            if (data_in[(current_idx + offset) % N] == 1'b1) begin
                data_out = (current_idx + offset) % N;
            end
        end
    end

    assign valid_out = |data_in;

    // Priority pointer update. Nonblocking assignments are used throughout so
    // that readers of current_idx in the same time step see the pre-edge value.
    always @(posedge clk) begin
        if (reset) begin
            current_idx <= 0;
        end else if (valid_out) begin
            // Advance past the granted index (not merely by one slot) so that
            // adjacent requesters are served in turn.
            current_idx <= (data_out + 1) % N;
        end
    end

endmodule
`TRACING_ON