adding tracking for SFU stalls
This commit is contained in:
@@ -45,6 +45,15 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
assign mem_perf_tmp_if.icache = 'x;
|
||||
assign mem_perf_tmp_if.dcache = 'x;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.smem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
|
||||
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
|
||||
@@ -69,24 +78,68 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
.reset (gbar_reset),
|
||||
.gbar_bus_if (gbar_bus_if)
|
||||
);
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
cache_perf_t perf_l2cache;
|
||||
|
||||
assign mem_perf_tmp_if.icache = 'x;
|
||||
assign mem_perf_tmp_if.dcache = 'x;
|
||||
assign mem_perf_tmp_if.l2cache = perf_l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.smem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
||||
) l1_mem_bus_if[2]();
|
||||
.DATA_SIZE (L2_WORD_SIZE),
|
||||
.TAG_WIDTH (L2_TAG_WIDTH)
|
||||
) l2_mem_bus_if[L2_NUM_REQS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
||||
) per_socket_icache_mem_bus_if[`NUM_SOCKETS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||
) per_socket_dcache_mem_bus_if[`NUM_SOCKETS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH)
|
||||
) icache_mem_bus_if[1]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH)
|
||||
) dcache_mem_bus_if[1]();
|
||||
|
||||
`RESET_RELAY (l1_mem_arb_reset, reset);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (`NUM_SOCKETS),
|
||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG_REQ (2),
|
||||
.OUT_REG_RSP (2)
|
||||
) icache_mem_arb (
|
||||
.clk (clk),
|
||||
.reset (l1_mem_arb_reset),
|
||||
.bus_in_if (per_socket_icache_mem_bus_if),
|
||||
.bus_out_if (icache_mem_bus_if)
|
||||
);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (`NUM_SOCKETS),
|
||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG_REQ (2),
|
||||
.OUT_REG_RSP (2)
|
||||
) dcache_mem_arb (
|
||||
.clk (clk),
|
||||
.reset (l1_mem_arb_reset),
|
||||
.bus_in_if (per_socket_dcache_mem_bus_if),
|
||||
.bus_out_if (dcache_mem_bus_if)
|
||||
);
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[ICACHE_MEM_ARB_IDX], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[DCACHE_MEM_ARB_IDX], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
|
||||
|
||||
`RESET_RELAY (l2_reset, reset);
|
||||
|
||||
@@ -113,67 +166,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (l2_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (perf_l2cache),
|
||||
.cache_perf (mem_perf_tmp_if.l2cache),
|
||||
`endif
|
||||
.core_bus_if (l1_mem_bus_if),
|
||||
.core_bus_if (l2_mem_bus_if),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
);
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
||||
) per_socket_icache_mem_bus_if[`NUM_SOCKETS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||
) per_socket_dcache_mem_bus_if[`NUM_SOCKETS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH)
|
||||
) icache_mem_bus_if[1]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH)
|
||||
) dcache_mem_bus_if[1]();
|
||||
|
||||
`RESET_RELAY (l1_mem_arb_reset, reset);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (`NUM_SOCKETS),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG_REQ (2),
|
||||
.OUT_REG_RSP (2)
|
||||
) icache_mem_arb (
|
||||
.clk (clk),
|
||||
.reset (l1_mem_arb_reset),
|
||||
.bus_in_if (per_socket_icache_mem_bus_if),
|
||||
.bus_out_if (icache_mem_bus_if)
|
||||
);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (`NUM_SOCKETS),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG_REQ (2),
|
||||
.OUT_REG_RSP (2)
|
||||
) dcache_mem_arb (
|
||||
.clk (clk),
|
||||
.reset (l1_mem_arb_reset),
|
||||
.bus_in_if (per_socket_dcache_mem_bus_if),
|
||||
.bus_out_if (dcache_mem_bus_if)
|
||||
);
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
|
||||
@@ -201,6 +199,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
|
||||
) socket (
|
||||
`SCOPE_IO_BIND (scope_socket+i)
|
||||
|
||||
.clk (clk),
|
||||
.reset (socket_reset),
|
||||
|
||||
|
||||
@@ -57,10 +57,18 @@
|
||||
`define EX_ALU 0
|
||||
`define EX_LSU 1
|
||||
`define EX_SFU 2
|
||||
`define EX_FPU 3
|
||||
`define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
|
||||
|
||||
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
|
||||
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
||||
`define EX_WIDTH `UP(`EX_BITS)
|
||||
|
||||
`define SFU_CSRS 0
|
||||
`define SFU_WCTL 1
|
||||
|
||||
`define NUM_SFU_UNITS (2)
|
||||
`define SFU_BITS `CLOG2(`NUM_SFU_UNITS)
|
||||
`define SFU_WIDTH `UP(`SFU_BITS)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -369,35 +377,32 @@
|
||||
VX_dcr_bus_if dst(); \
|
||||
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
|
||||
|
||||
`define PERF_REDUCE(dst, src, field, width, count) \
|
||||
wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \
|
||||
wire [width-1:0] __reduce_add_o_``dst``field; \
|
||||
reg [width-1:0] __reduce_add_r_``dst``field; \
|
||||
for (genvar __i = 0; __i < count; ++__i) begin \
|
||||
assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \
|
||||
end \
|
||||
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \
|
||||
__reduce_add_i_``src``field, \
|
||||
__reduce_add_o_``dst``field \
|
||||
); \
|
||||
always @(posedge clk) begin \
|
||||
if (reset) begin \
|
||||
__reduce_add_r_``dst``field <= '0; \
|
||||
end else begin \
|
||||
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
|
||||
end \
|
||||
end \
|
||||
assign ``dst.``field = __reduce_add_r_``dst``field
|
||||
|
||||
`define PERF_CACHE_REDUCE(dst, src, count) \
|
||||
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count)
|
||||
`define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
|
||||
for (genvar __d = 0; __d < dst_count; ++__d) begin \
|
||||
localparam __count = ((src_count > dst_count) ? ((src_count + dst_count - 1) / dst_count) : 1); \
|
||||
wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
|
||||
wire [width-1:0] __reduce_add_o_``dst``field; \
|
||||
for (genvar __i = 0; __i < __count; ++__i) begin \
|
||||
assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
|
||||
end \
|
||||
VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
|
||||
__reduce_add_i_``src``field, \
|
||||
__reduce_add_o_``dst``field \
|
||||
); \
|
||||
if (reg_enable) begin \
|
||||
reg [width-1:0] __reduce_add_r_``dst``field; \
|
||||
always @(posedge clk) begin \
|
||||
if (reset) begin \
|
||||
__reduce_add_r_``dst``field <= '0; \
|
||||
end else begin \
|
||||
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
|
||||
end \
|
||||
end \
|
||||
assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
|
||||
end else begin \
|
||||
assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
|
||||
end \
|
||||
end
|
||||
|
||||
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
|
||||
if (block_size != 1) begin \
|
||||
|
||||
@@ -99,7 +99,7 @@ package VX_gpu_pkg;
|
||||
`ifdef ICACHE_ENABLE
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
||||
`else
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES);
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
`endif
|
||||
|
||||
////////////////////////// Dcache Parameters //////////////////////////////
|
||||
@@ -147,6 +147,9 @@ package VX_gpu_pkg;
|
||||
|
||||
/////////////////////////////// L2 Parameters /////////////////////////////
|
||||
|
||||
localparam ICACHE_MEM_ARB_IDX = 0;
|
||||
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
|
||||
|
||||
// Word size in bytes
|
||||
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
|
||||
@@ -66,19 +66,12 @@ module VX_socket import VX_gpu_pkg::*; #(
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
cache_perf_t perf_icache;
|
||||
cache_perf_t perf_dcache;
|
||||
|
||||
assign mem_perf_tmp_if.icache = perf_icache;
|
||||
assign mem_perf_tmp_if.dcache = perf_dcache;
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.smem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
@@ -110,7 +103,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
||||
.MEM_OUT_REG (2)
|
||||
) icache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (perf_icache),
|
||||
.cache_perf (mem_perf_tmp_if.icache),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (icache_reset),
|
||||
@@ -150,7 +143,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
||||
.MEM_OUT_REG (2)
|
||||
) dcache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (perf_dcache),
|
||||
.cache_perf (mem_perf_tmp_if.dcache),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (dcache_reset),
|
||||
|
||||
@@ -97,6 +97,11 @@
|
||||
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
|
||||
`define VX_CSR_MPM_LOAD_LT 12'hB0F
|
||||
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
|
||||
// SFU: scoreboard
|
||||
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
|
||||
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
|
||||
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
|
||||
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
|
||||
|
||||
// Machine Performance-monitoring memory counters
|
||||
// PERF: icache
|
||||
|
||||
@@ -46,15 +46,9 @@ module Vortex import VX_gpu_pkg::*; (
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
cache_perf_t perf_l3cache;
|
||||
mem_perf_t mem_perf;
|
||||
|
||||
assign mem_perf_if.smem = 'x;
|
||||
assign mem_perf_if.icache = 'x;
|
||||
assign mem_perf_if.dcache = 'x;
|
||||
assign mem_perf_if.l2cache = 'x;
|
||||
assign mem_perf_if.l3cache = perf_l3cache;
|
||||
assign mem_perf_if.mem = mem_perf;
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
@@ -93,7 +87,7 @@ module Vortex import VX_gpu_pkg::*; (
|
||||
.reset (l3_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (perf_l3cache),
|
||||
.cache_perf (mem_perf_if.l3cache),
|
||||
`endif
|
||||
|
||||
.core_bus_if (per_cluster_mem_bus_if),
|
||||
@@ -171,6 +165,7 @@ module Vortex import VX_gpu_pkg::*; (
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
mem_perf_t mem_perf;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
@@ -193,6 +188,7 @@ module Vortex import VX_gpu_pkg::*; (
|
||||
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
assign mem_perf_if.mem = mem_perf;
|
||||
|
||||
`endif
|
||||
|
||||
|
||||
5
hw/rtl/cache/VX_cache_cluster.sv
vendored
5
hw/rtl/cache/VX_cache_cluster.sv
vendored
@@ -83,8 +83,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
||||
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
cache_perf_t perf_cache_unit[NUM_CACHES];
|
||||
`PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES);
|
||||
cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES];
|
||||
`PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES)
|
||||
assign cache_perf = perf_cache_tmp[0];
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
|
||||
12
hw/rtl/cache/VX_cache_define.vh
vendored
12
hw/rtl/cache/VX_cache_define.vh
vendored
@@ -62,4 +62,16 @@
|
||||
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
|
||||
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define PERF_CACHE_ADD(dst, src, dcount, scount) \
|
||||
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1))
|
||||
|
||||
`endif // VX_CACHE_DEFINE_VH
|
||||
|
||||
@@ -77,20 +77,14 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
`ifdef SM_ENABLE
|
||||
cache_perf_t smem_perf;
|
||||
assign mem_perf_tmp_if.smem = smem_perf;
|
||||
`else
|
||||
assign mem_perf_tmp_if.smem = '0;
|
||||
`endif
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (dcr_data_reset, reset);
|
||||
@@ -250,7 +244,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (smem_perf),
|
||||
.cache_perf (mem_perf_tmp_if.smem),
|
||||
`endif
|
||||
.dcache_bus_in_if (dcache_bus_tmp_if),
|
||||
.dcache_bus_out_if (dcache_bus_if)
|
||||
|
||||
@@ -130,11 +130,11 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.smem = '0;
|
||||
assign mem_perf_if.icache = '0;
|
||||
assign mem_perf_if.dcache = '0;
|
||||
assign mem_perf_if.l2cache = '0;
|
||||
assign mem_perf_if.l3cache = '0;
|
||||
assign mem_perf_if.smem = '0;
|
||||
assign mem_perf_if.mem = '0;
|
||||
`endif
|
||||
|
||||
|
||||
@@ -33,7 +33,6 @@ import VX_fpu_pkg::*;
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
VX_sfu_perf_if.slave sfu_perf_if,
|
||||
`endif
|
||||
|
||||
VX_commit_csr_if.slave commit_csr_if,
|
||||
@@ -187,103 +186,107 @@ import VX_fpu_pkg::*;
|
||||
`VX_DCR_MPM_CLASS_CORE: begin
|
||||
case (read_addr)
|
||||
// PERF: pipeline
|
||||
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
|
||||
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
||||
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0];
|
||||
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
|
||||
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
||||
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0];
|
||||
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0];
|
||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0];
|
||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
||||
`else
|
||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
|
||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
|
||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
|
||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
|
||||
`endif
|
||||
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0];
|
||||
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];
|
||||
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0];
|
||||
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0];
|
||||
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0];
|
||||
`VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0];
|
||||
`VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]);
|
||||
// PERF: memory
|
||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
|
||||
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
||||
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
|
||||
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
||||
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`VX_DCR_MPM_CLASS_MEM: begin
|
||||
case (read_addr)
|
||||
// PERF: icache
|
||||
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
||||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
||||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: dcache
|
||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
||||
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
||||
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: smem
|
||||
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
||||
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
||||
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
||||
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
||||
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: l2cache
|
||||
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: l3cache
|
||||
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: memory
|
||||
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
@@ -303,8 +306,6 @@ import VX_fpu_pkg::*;
|
||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
|
||||
`UNUSED_VAR (perf_wctl_stalls);
|
||||
`UNUSED_VAR (mem_perf_if.icache);
|
||||
`UNUSED_VAR (mem_perf_if.smem);
|
||||
`endif
|
||||
|
||||
@@ -25,7 +25,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
VX_sfu_perf_if.slave sfu_perf_if,
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
@@ -81,7 +80,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sfu_perf_if (sfu_perf_if),
|
||||
`endif
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
|
||||
@@ -61,7 +61,8 @@ module VX_issue #(
|
||||
.reset (scoreboard_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_scb_stalls(perf_issue_if.scb_stalls),
|
||||
.perf_scb_uses (perf_issue_if.scb_uses),
|
||||
.perf_units_uses(perf_issue_if.units_uses),
|
||||
.perf_sfu_uses (perf_issue_if.sfu_uses),
|
||||
`endif
|
||||
.writeback_if (writeback_if),
|
||||
.ibuffer_if (ibuffer_if),
|
||||
|
||||
@@ -21,7 +21,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_uses [`NUM_EX_UNITS],
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
|
||||
`endif
|
||||
|
||||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||
@@ -32,10 +33,14 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle;
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle;
|
||||
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_uses_per_cycle;
|
||||
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
|
||||
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
|
||||
|
||||
reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
|
||||
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
|
||||
|
||||
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
|
||||
|
||||
@@ -43,10 +48,51 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.N (`ISSUE_WIDTH),
|
||||
.OP ("|")
|
||||
) reduce (
|
||||
.data_in (perf_issue_uses_per_cycle),
|
||||
.data_out (perf_uses_per_cycle)
|
||||
) perf_units_reduce (
|
||||
.data_in (perf_issue_units_per_cycle),
|
||||
.data_out (perf_units_per_cycle)
|
||||
);
|
||||
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_SFU_UNITS),
|
||||
.N (`ISSUE_WIDTH),
|
||||
.OP ("|")
|
||||
) perf_sfu_reduce (
|
||||
.data_in (perf_issue_sfu_per_cycle),
|
||||
.data_out (perf_sfu_per_cycle)
|
||||
);
|
||||
|
||||
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
|
||||
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
|
||||
`BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_scb_stalls <= '0;
|
||||
end else begin
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_units_uses[i] <= '0;
|
||||
end else begin
|
||||
perf_units_uses[i] <= perf_units_uses[i] + `PERF_CTR_BITS'(perf_units_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_sfu_uses[i] <= '0;
|
||||
end else begin
|
||||
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
@@ -60,21 +106,46 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units;
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||
|
||||
reg [`SFU_WIDTH-1:0] sfu_type;
|
||||
always @(*) begin
|
||||
perf_issue_uses_per_cycle[i] = '0;
|
||||
case (scoreboard_if[i].data.op_type)
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
|
||||
default: sfu_type = `SFU_WCTL;
|
||||
endcase
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
perf_issue_units_per_cycle[i] = '0;
|
||||
perf_issue_sfu_per_cycle[i] = '0;
|
||||
if (ibuffer_if[i].valid) begin
|
||||
if (inuse_rd) begin
|
||||
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs1) begin
|
||||
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs2) begin
|
||||
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs3) begin
|
||||
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -109,6 +180,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
|
||||
`ifdef PERF_ENABLE
|
||||
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
|
||||
if (scoreboard_if[i].data.ex_type == `EX_SFU) begin
|
||||
inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type;
|
||||
end
|
||||
`endif
|
||||
end
|
||||
valid_out_r <= 0;
|
||||
@@ -155,30 +229,4 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle_r;
|
||||
wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle_r;
|
||||
|
||||
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
|
||||
`BUFFER(perf_uses_per_cycle_r, perf_uses_per_cycle);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_scb_stalls <= '0;
|
||||
end else begin
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_scb_uses[i] <= '0;
|
||||
end else begin
|
||||
perf_scb_uses[i] <= perf_scb_uses[i] + `PERF_CTR_BITS'(perf_uses_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -48,7 +48,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam RSP_ARB_SIZE = 1 + 1;
|
||||
localparam RSP_ARB_IDX_WCTL = 0;
|
||||
localparam RSP_ARB_IDX_CSR = 1;
|
||||
localparam RSP_ARB_IDX_CSRS = 1;
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
@@ -71,9 +71,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
|
||||
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_sfu_perf_if sfu_perf_if();
|
||||
`endif
|
||||
|
||||
// Warp control block
|
||||
VX_execute_if #(
|
||||
@@ -129,7 +126,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sfu_perf_if (sfu_perf_if),
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
@@ -141,18 +137,18 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
.commit_if (csr_commit_if)
|
||||
);
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_CSR] = csr_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_CSR] = csr_commit_if.data;
|
||||
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSR];
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
|
||||
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
|
||||
|
||||
// can accept new request?
|
||||
|
||||
reg sfu_req_ready;
|
||||
always @(*) begin
|
||||
case (execute_if[0].data.op_type)
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
||||
default: sfu_req_ready = wctl_execute_if.ready;
|
||||
endcase
|
||||
end
|
||||
@@ -194,19 +190,4 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
.commit_out_if (commit_if)
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
|
||||
|
||||
wire wctl_execute_stall = wctl_execute_if.valid && ~wctl_execute_if.ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_wctl_stalls <= '0;
|
||||
end else begin
|
||||
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_stall);
|
||||
end
|
||||
end
|
||||
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -18,7 +18,8 @@ interface VX_pipeline_perf_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
|
||||
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
|
||||
wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS];
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||
wire [`PERF_CTR_BITS-1:0] loads;
|
||||
@@ -34,7 +35,8 @@ interface VX_pipeline_perf_if ();
|
||||
modport issue (
|
||||
output ibf_stalls,
|
||||
output scb_stalls,
|
||||
output scb_uses
|
||||
output units_uses,
|
||||
output sfu_uses
|
||||
);
|
||||
|
||||
modport slave (
|
||||
@@ -42,7 +44,8 @@ interface VX_pipeline_perf_if ();
|
||||
input sched_stalls,
|
||||
input ibf_stalls,
|
||||
input scb_stalls,
|
||||
input scb_uses,
|
||||
input units_uses,
|
||||
input sfu_uses,
|
||||
input ifetches,
|
||||
input loads,
|
||||
input stores,
|
||||
|
||||
@@ -208,6 +208,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
||||
uint64_t scrb_fpu = 0;
|
||||
uint64_t scrb_lsu = 0;
|
||||
uint64_t scrb_sfu = 0;
|
||||
uint64_t scrb_wctl = 0;
|
||||
uint64_t scrb_csrs = 0;
|
||||
uint64_t ifetches = 0;
|
||||
uint64_t loads = 0;
|
||||
uint64_t stores = 0;
|
||||
@@ -269,43 +271,68 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
||||
// scheduler idles
|
||||
{
|
||||
uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID);
|
||||
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler idles=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
|
||||
if (num_cores > 1) {
|
||||
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
|
||||
fprintf(stream, "PERF: core%d: scheduler idle=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
|
||||
}
|
||||
sched_idles += sched_idles_per_core;
|
||||
}
|
||||
// scheduler stalls
|
||||
{
|
||||
uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
|
||||
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
|
||||
if (num_cores > 1) {
|
||||
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
|
||||
fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
|
||||
}
|
||||
sched_stalls += sched_stalls_per_core;
|
||||
}
|
||||
// ibuffer_stalls
|
||||
{
|
||||
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
|
||||
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
|
||||
if (num_cores > 1) {
|
||||
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
|
||||
fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
|
||||
}
|
||||
ibuffer_stalls += ibuffer_stalls_per_core;
|
||||
}
|
||||
// scrb_stalls
|
||||
// issue_stalls
|
||||
{
|
||||
uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
|
||||
uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU);
|
||||
uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU);
|
||||
uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU);
|
||||
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
|
||||
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
|
||||
scrb_alu += scrb_alu_per_core;
|
||||
scrb_fpu += scrb_fpu_per_core;
|
||||
scrb_lsu += scrb_lsu_per_core;
|
||||
scrb_sfu += scrb_sfu_per_core;
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
|
||||
if (num_cores > 1) {
|
||||
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
|
||||
fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
|
||||
calcAvgPercent(scrb_alu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_fpu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_lsu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_sfu_per_core, scrb_total));
|
||||
}
|
||||
scrb_stalls += scrb_stalls_per_core;
|
||||
}
|
||||
// sfu_stalls
|
||||
{
|
||||
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
|
||||
uint64_t scrb_wctl_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_WCTL);
|
||||
uint64_t scrb_csrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_CSRS);
|
||||
if (num_cores > 1) {
|
||||
uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core + scrb_tex_per_core + scrb_raster_per_core + scrb_om_per_core;
|
||||
fprintf(stream, "PERF: core%d: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
|
||||
, core_id
|
||||
, scrb_sfu_per_core
|
||||
, calcAvgPercent(scrb_csrs_per_core, sfu_total)
|
||||
, calcAvgPercent(scrb_wctl_per_core, sfu_total)
|
||||
);
|
||||
}
|
||||
scrb_wctl += scrb_wctl_per_core;
|
||||
scrb_csrs += scrb_csrs_per_core;
|
||||
}
|
||||
// PERF: memory
|
||||
// ifetches
|
||||
{
|
||||
@@ -314,8 +341,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
||||
ifetches += ifetches_per_core;
|
||||
|
||||
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT);
|
||||
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
|
||||
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
}
|
||||
ifetch_lat += ifetch_lat_per_core;
|
||||
}
|
||||
// loads
|
||||
@@ -325,8 +354,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
||||
loads += loads_per_core;
|
||||
|
||||
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT);
|
||||
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
|
||||
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
}
|
||||
load_lat += load_lat_per_core;
|
||||
}
|
||||
// stores
|
||||
@@ -428,14 +459,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
||||
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
|
||||
int load_avg_lat = (int)(double(load_lat) / double(loads));
|
||||
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
|
||||
fprintf(stream, "PERF: scheduler idles=%ld (%d%%)\n", sched_idles, sched_idles_percent);
|
||||
uint64_t sfu_total = scrb_wctl + scrb_csrs;
|
||||
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
|
||||
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
|
||||
fprintf(stream, "PERF: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
|
||||
calcAvgPercent(scrb_alu, scrb_total),
|
||||
calcAvgPercent(scrb_fpu, scrb_total),
|
||||
calcAvgPercent(scrb_lsu, scrb_total),
|
||||
calcAvgPercent(scrb_sfu, scrb_total));
|
||||
fprintf(stream, "PERF: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
|
||||
, scrb_sfu
|
||||
, calcAvgPercent(scrb_csrs, sfu_total)
|
||||
, calcAvgPercent(scrb_wctl, sfu_total)
|
||||
);
|
||||
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
|
||||
fprintf(stream, "PERF: loads=%ld\n", loads);
|
||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||
|
||||
@@ -18,20 +18,20 @@ using namespace vortex;
|
||||
Cluster::Cluster(const SimContext& ctx,
|
||||
uint32_t cluster_id,
|
||||
ProcessorImpl* processor,
|
||||
const Arch &arch, const
|
||||
DCRS &dcrs)
|
||||
const Arch &arch,
|
||||
const DCRS &dcrs)
|
||||
: SimObject(ctx, "cluster")
|
||||
, mem_req_port(this)
|
||||
, mem_rsp_port(this)
|
||||
, cluster_id_(cluster_id)
|
||||
, processor_(processor)
|
||||
, sockets_(NUM_SOCKETS)
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, processor_(processor)
|
||||
, cores_per_socket_(arch.socket_size())
|
||||
{
|
||||
char sname[100];
|
||||
|
||||
auto sockets_per_cluster = sockets_.size();
|
||||
uint32_t sockets_per_cluster = sockets_.size();
|
||||
|
||||
// create sockets
|
||||
|
||||
@@ -43,7 +43,10 @@ Cluster::Cluster(const SimContext& ctx,
|
||||
|
||||
for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
|
||||
uint32_t socket_id = cluster_id * sockets_per_cluster + i;
|
||||
auto socket = Socket::Create(socket_id, this, arch, dcrs);
|
||||
auto socket = Socket::Create(socket_id,
|
||||
this,
|
||||
arch,
|
||||
dcrs);
|
||||
|
||||
socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
|
||||
icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
|
||||
@@ -154,7 +157,7 @@ void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
|
||||
}
|
||||
|
||||
Cluster::PerfStats Cluster::perf_stats() const {
|
||||
Cluster::PerfStats perf;
|
||||
perf.l2cache = l2cache_->perf_stats();
|
||||
return perf;
|
||||
PerfStats perf_stats;
|
||||
perf_stats.l2cache = l2cache_->perf_stats();
|
||||
return perf_stats;
|
||||
}
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "dcrs.h"
|
||||
#include "arch.h"
|
||||
#include "cache_cluster.h"
|
||||
#include "shared_mem.h"
|
||||
#include "core.h"
|
||||
#include "socket.h"
|
||||
#include "constants.h"
|
||||
@@ -29,11 +30,6 @@ class Cluster : public SimObject<Cluster> {
|
||||
public:
|
||||
struct PerfStats {
|
||||
CacheSim::PerfStats l2cache;
|
||||
|
||||
PerfStats& operator+=(const PerfStats& rhs) {
|
||||
this->l2cache += rhs.l2cache;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
SimPort<MemReq> mem_req_port;
|
||||
@@ -67,15 +63,15 @@ public:
|
||||
|
||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
|
||||
|
||||
Cluster::PerfStats perf_stats() const;
|
||||
PerfStats perf_stats() const;
|
||||
|
||||
private:
|
||||
uint32_t cluster_id_;
|
||||
std::vector<Socket::Ptr> sockets_;
|
||||
std::vector<CoreMask> barriers_;
|
||||
CacheSim::Ptr l2cache_;
|
||||
ProcessorImpl* processor_;
|
||||
uint32_t cores_per_socket_;
|
||||
uint32_t cluster_id_;
|
||||
ProcessorImpl* processor_;
|
||||
std::vector<Socket::Ptr> sockets_;
|
||||
std::vector<CoreMask> barriers_;
|
||||
CacheSim::Ptr l2cache_;
|
||||
uint32_t cores_per_socket_;
|
||||
};
|
||||
|
||||
} // namespace vortex
|
||||
@@ -28,13 +28,18 @@
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs)
|
||||
Core::Core(const SimContext& ctx,
|
||||
uint32_t core_id,
|
||||
Socket* socket,
|
||||
const Arch &arch,
|
||||
const DCRS &dcrs)
|
||||
: SimObject(ctx, "core")
|
||||
, icache_req_ports(1, this)
|
||||
, icache_rsp_ports(1, this)
|
||||
, dcache_req_ports(NUM_LSU_LANES, this)
|
||||
, dcache_rsp_ports(NUM_LSU_LANES, this)
|
||||
, core_id_(core_id)
|
||||
, socket_(socket)
|
||||
, arch_(arch)
|
||||
, dcrs_(dcrs)
|
||||
, decoder_(arch)
|
||||
@@ -51,7 +56,6 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
|
||||
, decode_latch_("decode")
|
||||
, pending_icache_(arch_.num_warps())
|
||||
, csrs_(arch.num_warps())
|
||||
, socket_(socket)
|
||||
, commit_arbs_(ISSUE_WIDTH)
|
||||
{
|
||||
char sname[100];
|
||||
@@ -69,6 +73,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
|
||||
}
|
||||
|
||||
// initialize shared memory
|
||||
snprintf(sname, 100, "core%d-shared_mem", core_id);
|
||||
shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
|
||||
(1 << SMEM_LOG_SIZE),
|
||||
sizeof(Word),
|
||||
@@ -77,17 +82,17 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
|
||||
false
|
||||
});
|
||||
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
|
||||
snprintf(sname, 100, "smem_demux%d_%d", core_id, i);
|
||||
auto smem_demux = SMemDemux::Create(sname);
|
||||
snprintf(sname, 100, "core%d-smem_demux%d", core_id, i);
|
||||
auto smem_demux = SMemDemux::Create(sname);
|
||||
|
||||
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
|
||||
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
|
||||
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
|
||||
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
|
||||
|
||||
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
|
||||
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
|
||||
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
|
||||
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
|
||||
|
||||
smem_demuxs_.at(i) = smem_demux;
|
||||
}
|
||||
smem_demuxs_.at(i) = smem_demux;
|
||||
}
|
||||
|
||||
// initialize dispatchers
|
||||
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
|
||||
@@ -103,7 +108,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
|
||||
|
||||
// bind commit arbiters
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
snprintf(sname, 100, "commit-arb%d", i);
|
||||
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
|
||||
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
|
||||
for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
|
||||
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
|
||||
@@ -184,7 +189,7 @@ void Core::schedule() {
|
||||
}
|
||||
}
|
||||
if (scheduled_warp == -1) {
|
||||
++perf_stats_.sched_idles;
|
||||
++perf_stats_.sched_idle;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -311,7 +316,21 @@ void Core::issue() {
|
||||
case ExeType::ALU: ++perf_stats_.scrb_alu; break;
|
||||
case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
|
||||
case ExeType::LSU: ++perf_stats_.scrb_lsu; break;
|
||||
case ExeType::SFU: ++perf_stats_.scrb_sfu; break;
|
||||
case ExeType::SFU: {
|
||||
++perf_stats_.scrb_sfu;
|
||||
switch (use.sfu_type) {
|
||||
case SfuType::TMC:
|
||||
case SfuType::WSPAWN:
|
||||
case SfuType::SPLIT:
|
||||
case SfuType::JOIN:
|
||||
case SfuType::BAR:
|
||||
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
|
||||
case SfuType::CSRRW:
|
||||
case SfuType::CSRRS:
|
||||
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
|
||||
default: assert(false);
|
||||
}
|
||||
} break;
|
||||
default: assert(false);
|
||||
}
|
||||
}
|
||||
@@ -356,7 +375,6 @@ void Core::commit() {
|
||||
auto& commit_arb = commit_arbs_.at(i);
|
||||
if (commit_arb->Outputs.at(0).empty())
|
||||
continue;
|
||||
|
||||
auto trace = commit_arb->Outputs.at(0).front();
|
||||
|
||||
// advance to commit stage
|
||||
@@ -558,8 +576,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
break;
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
switch (addr) {
|
||||
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idles & 0xffffffff;
|
||||
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idles >> 32;
|
||||
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idle & 0xffffffff;
|
||||
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32;
|
||||
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
|
||||
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
||||
@@ -574,6 +592,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
|
||||
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
|
||||
case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32;
|
||||
case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32;
|
||||
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
|
||||
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
|
||||
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
|
||||
@@ -588,6 +610,7 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
} break;
|
||||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
auto proc_perf = socket_->cluster()->processor()->perf_stats();
|
||||
auto cluster_perf = socket_->cluster()->perf_stats();
|
||||
auto socket_perf = socket_->perf_stats();
|
||||
auto smem_perf = shared_mem_->perf_stats();
|
||||
switch (addr) {
|
||||
@@ -611,18 +634,18 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
|
||||
@@ -652,6 +675,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32;
|
||||
}
|
||||
} break;
|
||||
default: {
|
||||
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
|
||||
std::abort();
|
||||
} break;
|
||||
}
|
||||
} else {
|
||||
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
|
||||
|
||||
@@ -49,7 +49,7 @@ public:
|
||||
struct PerfStats {
|
||||
uint64_t cycles;
|
||||
uint64_t instrs;
|
||||
uint64_t sched_idles;
|
||||
uint64_t sched_idle;
|
||||
uint64_t sched_stalls;
|
||||
uint64_t ibuf_stalls;
|
||||
uint64_t scrb_stalls;
|
||||
@@ -57,6 +57,8 @@ public:
|
||||
uint64_t scrb_fpu;
|
||||
uint64_t scrb_lsu;
|
||||
uint64_t scrb_sfu;
|
||||
uint64_t scrb_wctl;
|
||||
uint64_t scrb_csrs;
|
||||
uint64_t ifetches;
|
||||
uint64_t loads;
|
||||
uint64_t stores;
|
||||
@@ -66,7 +68,7 @@ public:
|
||||
PerfStats()
|
||||
: cycles(0)
|
||||
, instrs(0)
|
||||
, sched_idles(0)
|
||||
, sched_idle(0)
|
||||
, sched_stalls(0)
|
||||
, ibuf_stalls(0)
|
||||
, scrb_stalls(0)
|
||||
@@ -74,6 +76,8 @@ public:
|
||||
, scrb_fpu(0)
|
||||
, scrb_lsu(0)
|
||||
, scrb_sfu(0)
|
||||
, scrb_wctl(0)
|
||||
, scrb_csrs(0)
|
||||
, ifetches(0)
|
||||
, loads(0)
|
||||
, stores(0)
|
||||
@@ -88,7 +92,11 @@ public:
|
||||
std::vector<SimPort<MemReq>> dcache_req_ports;
|
||||
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
|
||||
|
||||
Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs);
|
||||
Core(const SimContext& ctx,
|
||||
uint32_t core_id,
|
||||
Socket* socket,
|
||||
const Arch &arch,
|
||||
const DCRS &dcrs);
|
||||
|
||||
~Core();
|
||||
|
||||
@@ -158,6 +166,7 @@ private:
|
||||
void cout_flush();
|
||||
|
||||
uint32_t core_id_;
|
||||
Socket* socket_;
|
||||
const Arch& arch_;
|
||||
const DCRS &dcrs_;
|
||||
|
||||
@@ -193,10 +202,9 @@ private:
|
||||
|
||||
PerfStats perf_stats_;
|
||||
|
||||
Socket* socket_;
|
||||
|
||||
std::vector<TraceSwitch::Ptr> commit_arbs_;
|
||||
|
||||
uint32_t commit_exe_;
|
||||
uint32_t ibuffer_idx_;
|
||||
|
||||
friend class Warp;
|
||||
|
||||
@@ -113,6 +113,7 @@ void ProcessorImpl::reset() {
|
||||
perf_mem_writes_ = 0;
|
||||
perf_mem_latency_ = 0;
|
||||
perf_mem_pending_reads_ = 0;
|
||||
|
||||
}
|
||||
|
||||
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
|
||||
@@ -125,9 +126,6 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
|
||||
perf.mem_writes = perf_mem_writes_;
|
||||
perf.mem_latency = perf_mem_latency_;
|
||||
perf.l3cache = l3cache_->perf_stats();
|
||||
for (auto cluster : clusters_) {
|
||||
perf.clusters += cluster->perf_stats();
|
||||
}
|
||||
return perf;
|
||||
}
|
||||
|
||||
|
||||
@@ -24,17 +24,10 @@ namespace vortex {
|
||||
class ProcessorImpl {
|
||||
public:
|
||||
struct PerfStats {
|
||||
CacheSim::PerfStats l3cache;
|
||||
uint64_t mem_reads;
|
||||
uint64_t mem_writes;
|
||||
uint64_t mem_latency;
|
||||
CacheSim::PerfStats l3cache;
|
||||
Cluster::PerfStats clusters;
|
||||
|
||||
PerfStats()
|
||||
: mem_reads(0)
|
||||
, mem_writes(0)
|
||||
, mem_latency(0)
|
||||
{}
|
||||
};
|
||||
|
||||
ProcessorImpl(const Arch& arch);
|
||||
@@ -46,7 +39,7 @@ public:
|
||||
|
||||
void write_dcr(uint32_t addr, uint32_t value);
|
||||
|
||||
ProcessorImpl::PerfStats perf_stats() const;
|
||||
PerfStats perf_stats() const;
|
||||
|
||||
private:
|
||||
|
||||
@@ -55,7 +48,7 @@ private:
|
||||
const Arch& arch_;
|
||||
std::vector<std::shared_ptr<Cluster>> clusters_;
|
||||
DCRS dcrs_;
|
||||
MemSim::Ptr memsim_;
|
||||
MemSim::Ptr memsim_;
|
||||
CacheSim::Ptr l3cache_;
|
||||
uint64_t perf_mem_reads_;
|
||||
uint64_t perf_mem_writes_;
|
||||
|
||||
@@ -25,6 +25,7 @@ public:
|
||||
RegType reg_type;
|
||||
uint32_t reg_id;
|
||||
ExeType exe_type;
|
||||
SfuType sfu_type;
|
||||
uint64_t uuid;
|
||||
};
|
||||
|
||||
@@ -62,7 +63,7 @@ public:
|
||||
if (used_iregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Integer, r, owner->exe_type, owner->uuid});
|
||||
out.push_back({RegType::Integer, r, owner->exe_type, owner->sfu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,7 +71,7 @@ public:
|
||||
if (used_fregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Float, r, owner->exe_type, owner->uuid});
|
||||
out.push_back({RegType::Float, r, owner->exe_type, owner->sfu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,7 +79,7 @@ public:
|
||||
if (used_vregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Vector, r, owner->exe_type, owner->uuid});
|
||||
out.push_back({RegType::Vector, r, owner->exe_type, owner->sfu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,16 +19,16 @@ using namespace vortex;
|
||||
Socket::Socket(const SimContext& ctx,
|
||||
uint32_t socket_id,
|
||||
Cluster* cluster,
|
||||
const Arch &arch, const
|
||||
DCRS &dcrs)
|
||||
const Arch &arch,
|
||||
const DCRS &dcrs)
|
||||
: SimObject(ctx, "socket")
|
||||
, icache_mem_req_port(this)
|
||||
, icache_mem_rsp_port(this)
|
||||
, dcache_mem_req_port(this)
|
||||
, dcache_mem_rsp_port(this)
|
||||
, socket_id_(socket_id)
|
||||
, cores_(arch.socket_size())
|
||||
, cluster_(cluster)
|
||||
, cores_(arch.socket_size())
|
||||
{
|
||||
auto cores_per_socket = cores_.size();
|
||||
|
||||
@@ -77,7 +77,10 @@ Socket::Socket(const SimContext& ctx,
|
||||
|
||||
for (uint32_t i = 0; i < cores_per_socket; ++i) {
|
||||
uint32_t core_id = socket_id * cores_per_socket + i;
|
||||
cores_.at(i) = Core::Create(core_id, this, arch, dcrs);
|
||||
cores_.at(i) = Core::Create(core_id,
|
||||
this,
|
||||
arch,
|
||||
dcrs);
|
||||
|
||||
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
|
||||
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
|
||||
@@ -139,8 +142,8 @@ void Socket::resume(uint32_t core_index) {
|
||||
}
|
||||
|
||||
Socket::PerfStats Socket::perf_stats() const {
|
||||
Socket::PerfStats perf;
|
||||
perf.icache = icaches_->perf_stats();
|
||||
perf.dcache = dcaches_->perf_stats();
|
||||
return perf;
|
||||
PerfStats perf_stats;
|
||||
perf_stats.icache = icaches_->perf_stats();
|
||||
perf_stats.dcache = dcaches_->perf_stats();
|
||||
return perf_stats;
|
||||
}
|
||||
@@ -30,12 +30,6 @@ public:
|
||||
struct PerfStats {
|
||||
CacheSim::PerfStats icache;
|
||||
CacheSim::PerfStats dcache;
|
||||
|
||||
PerfStats& operator+=(const PerfStats& rhs) {
|
||||
this->icache += rhs.icache;
|
||||
this->dcache += rhs.dcache;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
SimPort<MemReq> icache_mem_req_port;
|
||||
@@ -74,14 +68,14 @@ public:
|
||||
|
||||
void resume(uint32_t core_id);
|
||||
|
||||
Socket::PerfStats perf_stats() const;
|
||||
PerfStats perf_stats() const;
|
||||
|
||||
private:
|
||||
uint32_t socket_id_;
|
||||
Cluster* cluster_;
|
||||
std::vector<Core::Ptr> cores_;
|
||||
CacheCluster::Ptr icaches_;
|
||||
CacheCluster::Ptr dcaches_;
|
||||
Cluster* cluster_;
|
||||
};
|
||||
|
||||
} // namespace vortex
|
||||
@@ -15,10 +15,10 @@ all:
|
||||
$(MAKE) -C blackscholes
|
||||
$(MAKE) -C transpose
|
||||
$(MAKE) -C convolution
|
||||
# $(MAKE) -C cutcp
|
||||
# $(MAKE) -C sgemm2
|
||||
# $(MAKE) -C vectorhypot
|
||||
# $(MAKE) -C mri-q run-simx
|
||||
$(MAKE) -C cutcp
|
||||
$(MAKE) -C sgemm2
|
||||
$(MAKE) -C vectorhypot
|
||||
$(MAKE) -C mri-q run-simx
|
||||
|
||||
run-simx:
|
||||
$(MAKE) -C vecadd run-simx
|
||||
@@ -125,7 +125,7 @@ clean-all:
|
||||
$(MAKE) -C oclprintf clean-all
|
||||
$(MAKE) -C blackscholes clean-all
|
||||
$(MAKE) -C convolution clean-all
|
||||
# $(MAKE) -C cutcp clean-all
|
||||
# $(MAKE) -C sgemm2 clean-all
|
||||
# $(MAKE) -C vectorhypot clean-all
|
||||
# $(MAKE) -C mri-q clean-all
|
||||
$(MAKE) -C cutcp clean-all
|
||||
$(MAKE) -C sgemm2 clean-all
|
||||
$(MAKE) -C vectorhypot clean-all
|
||||
$(MAKE) -C mri-q clean-all
|
||||
|
||||
Reference in New Issue
Block a user