From 8de5470da415ef89c57e4f7254f8e97ca17860e2 Mon Sep 17 00:00:00 2001
From: Richard Yan
Date: Tue, 16 Apr 2024 23:03:00 -0700
Subject: [PATCH] round robin warp scheduling

Replace the fixed-priority VX_lzc warp selector in VX_schedule with
VX_lzc_rr, a selector that scans the ready mask starting from a rotating
index. Also changes several default values in VX_config.h and
VX_config.vh.

---
 hw/VX_config.h             | 69 +++++++++++++++++++++++++-------------
 hw/rtl/VX_config.vh        |  2 +-
 hw/rtl/core/VX_schedule.sv |  7 ++--
 hw/rtl/libs/VX_lzc.sv      | 52 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 100 insertions(+), 30 deletions(-)

diff --git a/hw/VX_config.h b/hw/VX_config.h
index 811fa6be..6e04c04a 100644
--- a/hw/VX_config.h
+++ b/hw/VX_config.h
@@ -1,5 +1,5 @@
 // auto-generated by gen_config.py. DO NOT EDIT
-// Generated at 2024-01-04 01:43:02.432130
+// Generated at 2024-04-08 12:40:13.594321
 
 // Translated from ./rtl/VX_config.vh:
 
@@ -84,15 +84,15 @@
 #endif
 
 #ifndef NUM_CORES
-#define NUM_CORES 1
+#define NUM_CORES 2
 #endif
 
 #ifndef NUM_WARPS
-#define NUM_WARPS 4
+#define NUM_WARPS 8
 #endif
 
 #ifndef NUM_THREADS
-#define NUM_THREADS 4
+#define NUM_THREADS 8
 #endif
 
 #ifndef NUM_BARRIERS
@@ -141,6 +141,18 @@
 #endif
 #endif
 
+#ifdef L2_ENABLE
+#define L2_LINE_SIZE MEM_BLOCK_SIZE
+#else
+#define L2_LINE_SIZE L1_LINE_SIZE
+#endif
+
+#ifdef L3_ENABLE
+#define L3_LINE_SIZE MEM_BLOCK_SIZE
+#else
+#define L3_LINE_SIZE L2_LINE_SIZE
+#endif
+
 #ifdef XLEN_64
 
 #ifndef STARTUP_ADDR
@@ -168,7 +180,7 @@
 #endif
 
 #ifndef SMEM_LOG_SIZE
-#define SMEM_LOG_SIZE 14
+#define SMEM_LOG_SIZE 15
 #endif
 
 #ifndef IO_BASE_ADDR
@@ -196,13 +208,21 @@
 #define STALL_TIMEOUT (100000 * (1 ** (L2_ENABLED + L3_ENABLED)))
 #endif
 
+#ifndef SV_DPI
+#define DPI_DISABLE
+#endif
+
 #ifndef FPU_FPNEW
 #ifndef FPU_DSP
 #ifndef FPU_DPI
-#ifdef SYNTHESIS
-#define FPU_DSP
-#else
+#ifndef SYNTHESIS
+#ifndef DPI_DISABLE
 #define FPU_DPI
+#else
+#define FPU_DSP
+#endif
+#else
+#define FPU_DSP
 #endif
 #endif
 #endif
@@ -228,18 +248,18 @@
 
 // Number of ALU units
 #ifndef NUM_ALU_LANES
-#define NUM_ALU_LANES UP(NUM_THREADS / 2)
+#define NUM_ALU_LANES NUM_THREADS
 #endif
 #ifndef NUM_ALU_BLOCKS
-#define NUM_ALU_BLOCKS UP(ISSUE_WIDTH / 1)
+#define NUM_ALU_BLOCKS ISSUE_WIDTH
 #endif
 
 // Number of FPU units
 #ifndef NUM_FPU_LANES
-#define NUM_FPU_LANES UP(NUM_THREADS / 2)
+#define NUM_FPU_LANES NUM_THREADS
 #endif
 #ifndef NUM_FPU_BLOCKS
-#define NUM_FPU_BLOCKS UP(ISSUE_WIDTH / 1)
+#define NUM_FPU_BLOCKS ISSUE_WIDTH
 #endif
 
 // Number of LSU units
@@ -254,16 +274,19 @@
 
 // Size of Instruction Buffer
 #ifndef IBUF_SIZE
-#define IBUF_SIZE (2 * (NUM_WARPS / ISSUE_WIDTH))
+#define IBUF_SIZE (8 * (NUM_WARPS / ISSUE_WIDTH))
 #endif
 
 // Size of LSU Request Queue
 #ifndef LSUQ_SIZE
-#define LSUQ_SIZE (2 * (NUM_THREADS / NUM_LSU_LANES))
+#define LSUQ_SIZE (2 * NUM_WARPS * (NUM_THREADS / NUM_LSU_LANES))
 #endif
 
 // LSU Duplicate Address Check
-#ifdef LSU_DUP
+#ifndef LSU_DUP_DISABLE
+#define LSU_DUP_ENABLE
+#endif
+#ifdef LSU_DUP_ENABLE
 #define LSU_DUP_ENABLED 1
 #else
 #define LSU_DUP_ENABLED 0
@@ -290,8 +313,8 @@
 // Floating-Point Units ///////////////////////////////////////////////////////
 
 // Size of FPU Request Queue
-#ifndef FPU_REQ_QUEUE_SIZE
-#define FPU_REQ_QUEUE_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES))
+#ifndef FPUQ_SIZE
+#define FPUQ_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES))
 #endif
 
 // FNCP Latency
@@ -382,7 +405,7 @@
 
 // Number of Cache Units
 #ifndef NUM_ICACHES
-#define NUM_ICACHES UP(NUM_CORES / 4)
+#define NUM_ICACHES UP(SOCKET_SIZE / 4)
 #endif
 
 // Cache Size
@@ -412,7 +435,7 @@
 
 // Number of Associative Ways
 #ifndef ICACHE_NUM_WAYS
-#define ICACHE_NUM_WAYS 2
+#define ICACHE_NUM_WAYS 1
 #endif
 
 // Dcache Configurable Knobs //////////////////////////////////////////////////
@@ -431,7 +454,7 @@
 
 // Number of Cache Units
 #ifndef NUM_DCACHES
-#define NUM_DCACHES UP(NUM_CORES / 4)
+#define NUM_DCACHES UP(SOCKET_SIZE / 4)
 #endif
 
 // Cache Size
@@ -441,7 +464,7 @@
 
 // Number of Banks
 #ifndef DCACHE_NUM_BANKS
-#define DCACHE_NUM_BANKS (NUM_LSU_LANES)
+#define DCACHE_NUM_BANKS MIN(NUM_LSU_LANES, 4)
 #endif
 
 // Core Response Queue Size
@@ -466,7 +489,7 @@
 
 // Number of Associative Ways
 #ifndef DCACHE_NUM_WAYS
-#define DCACHE_NUM_WAYS 2
+#define DCACHE_NUM_WAYS 1
 #endif
 
 // SM Configurable Knobs //////////////////////////////////////////////////////
@@ -525,7 +548,7 @@
 
 // Number of Associative Ways
 #ifndef L2_NUM_WAYS
-#define L2_NUM_WAYS 4
+#define L2_NUM_WAYS 2
 #endif
 
 // L3cache Configurable Knobs /////////////////////////////////////////////////
diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 9c3fd529..1f2e6545 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -274,7 +274,7 @@
 
 // Size of LSU Request Queue
 `ifndef LSUQ_SIZE
-`define LSUQ_SIZE (8 * (`NUM_THREADS / `NUM_LSU_LANES))
+`define LSUQ_SIZE (2 * `NUM_WARPS * (`NUM_THREADS / `NUM_LSU_LANES))
 `endif
 
 // LSU Duplicate Address Check
diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv
index 8c165261..9f6672a5 100644
--- a/hw/rtl/core/VX_schedule.sv
+++ b/hw/rtl/core/VX_schedule.sv
@@ -308,10 +308,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
 
     wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
 
-    VX_lzc #(
-        .N (`NUM_WARPS),
-        .REVERSE (1)
+    VX_lzc_rr #(
+        .N (`NUM_WARPS)
     ) wid_select (
+        .clk (clk),
+        .reset (reset),
         .data_in (ready_warps),
         .data_out (schedule_wid),
         .valid_out (schedule_valid)
diff --git a/hw/rtl/libs/VX_lzc.sv b/hw/rtl/libs/VX_lzc.sv
index 2589bf5a..7acff819 100644
--- a/hw/rtl/libs/VX_lzc.sv
+++ b/hw/rtl/libs/VX_lzc.sv
@@ -21,7 +21,7 @@ module VX_lzc #(
 ) (
     input wire [N-1:0] data_in,
     output wire [LOGN-1:0] data_out,
-    output wire valid_out
+    output logic valid_out
 );
 
     if (N == 1) begin
@@ -33,11 +33,11 @@ module VX_lzc #(
     end else begin
 
         wire [N-1:0][LOGN-1:0] indices;
-        
+
         for (genvar i = 0; i < N; ++i) begin
             assign indices[i] = REVERSE ? LOGN'(i) : LOGN'(N-1-i);
         end
-        
+
         VX_find_first #(
             .N (N),
             .DATAW (LOGN),
@@ -51,5 +51,51 @@ module VX_lzc #(
 
     end
 
+endmodule
+
+// Round-robin selector with the same data_in/data_out/valid_out ports as VX_lzc.
+// data_out is the index of the first set bit of data_in found when scanning
+// upward, with wrap-around, from current_idx (0 if no bit is set).
+// valid_out is high whenever any bit of data_in is set.
+// On every valid cycle current_idx moves to the slot after the selected one,
+// so the entry that was just picked has the lowest priority next time.
+module VX_lzc_rr #(
+    parameter N    = 2,
+    // index width; at least 1 so the ports stay legal when N == 1
+    parameter LOGN = (N > 1) ? $clog2(N) : 1
+) (
+    input wire              clk,
+    input wire              reset,
+    input wire [N-1:0]      data_in,
+    output logic [LOGN-1:0] data_out,
+    output logic            valid_out
+);
+
+    // position where the next scan starts
+    logic [LOGN-1:0] current_idx;
+
+    always_comb begin
+        data_out = '0;
+        for (int i = 0; i < N; ++i) begin
+            // current_idx + i is evaluated at 32 bits, so it cannot wrap before the modulo
+            if (data_in[(current_idx + i) % N]) begin
+                data_out = LOGN'((current_idx + i) % N);
+                break;
+            end
+        end
+    end
+
+    assign valid_out = |data_in;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            current_idx <= '0;
+        end else if (valid_out) begin
+            // non-blocking: current_idx is read by the combinational scan above,
+            // a blocking write here races with it in simulation
+            current_idx <= LOGN'((data_out + 1) % N);
+        end
+    end
+
 endmodule
 `TRACING_ON