adding sockets support to simx and cache subsystem refactoring
minor update minor update minor updates
This commit is contained in:
@@ -85,8 +85,8 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
||||||
) per_socket_mem_bus_if[`NUM_SOCKETS]();
|
) l1_mem_bus_if[2]();
|
||||||
|
|
||||||
`RESET_RELAY (l2_reset, reset);
|
`RESET_RELAY (l2_reset, reset);
|
||||||
|
|
||||||
@@ -102,7 +102,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
.MSHR_SIZE (`L2_MSHR_SIZE),
|
.MSHR_SIZE (`L2_MSHR_SIZE),
|
||||||
.MRSQ_SIZE (`L2_MRSQ_SIZE),
|
.MRSQ_SIZE (`L2_MRSQ_SIZE),
|
||||||
.MREQ_SIZE (`L2_MREQ_SIZE),
|
.MREQ_SIZE (`L2_MREQ_SIZE),
|
||||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH),
|
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||||
.WRITE_ENABLE (1),
|
.WRITE_ENABLE (1),
|
||||||
.UUID_WIDTH (`UUID_WIDTH),
|
.UUID_WIDTH (`UUID_WIDTH),
|
||||||
.CORE_OUT_REG (2),
|
.CORE_OUT_REG (2),
|
||||||
@@ -115,10 +115,65 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf (perf_l2cache),
|
.cache_perf (perf_l2cache),
|
||||||
`endif
|
`endif
|
||||||
.core_bus_if (per_socket_mem_bus_if),
|
.core_bus_if (l1_mem_bus_if),
|
||||||
.mem_bus_if (mem_bus_if)
|
.mem_bus_if (mem_bus_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
||||||
|
) per_socket_icache_mem_bus_if[`NUM_SOCKETS]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||||
|
) per_socket_dcache_mem_bus_if[`NUM_SOCKETS]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH)
|
||||||
|
) icache_mem_bus_if[1]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH)
|
||||||
|
) dcache_mem_bus_if[1]();
|
||||||
|
|
||||||
|
`RESET_RELAY (l1_mem_arb_reset, reset);
|
||||||
|
|
||||||
|
VX_mem_arb #(
|
||||||
|
.NUM_INPUTS (`NUM_SOCKETS),
|
||||||
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH),
|
||||||
|
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||||
|
.ARBITER ("R"),
|
||||||
|
.OUT_REG_REQ (2),
|
||||||
|
.OUT_REG_RSP (2)
|
||||||
|
) icache_mem_arb (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (l1_mem_arb_reset),
|
||||||
|
.bus_in_if (per_socket_icache_mem_bus_if),
|
||||||
|
.bus_out_if (icache_mem_bus_if)
|
||||||
|
);
|
||||||
|
|
||||||
|
VX_mem_arb #(
|
||||||
|
.NUM_INPUTS (`NUM_SOCKETS),
|
||||||
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH),
|
||||||
|
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||||
|
.ARBITER ("R"),
|
||||||
|
.OUT_REG_REQ (2),
|
||||||
|
.OUT_REG_RSP (2)
|
||||||
|
) dcache_mem_arb (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (l1_mem_arb_reset),
|
||||||
|
.bus_in_if (per_socket_dcache_mem_bus_if),
|
||||||
|
.bus_out_if (dcache_mem_bus_if)
|
||||||
|
);
|
||||||
|
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH);
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
|
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
|
||||||
@@ -155,7 +210,8 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
.dcr_bus_if (socket_dcr_bus_if),
|
.dcr_bus_if (socket_dcr_bus_if),
|
||||||
|
|
||||||
.mem_bus_if (per_socket_mem_bus_if[i]),
|
.icache_mem_bus_if (per_socket_icache_mem_bus_if[i]),
|
||||||
|
.dcache_mem_bus_if (per_socket_dcache_mem_bus_if[i]),
|
||||||
|
|
||||||
`ifdef GBAR_ENABLE
|
`ifdef GBAR_ENABLE
|
||||||
.gbar_bus_if (per_socket_gbar_bus_if[i]),
|
.gbar_bus_if (per_socket_gbar_bus_if[i]),
|
||||||
|
|||||||
@@ -262,7 +262,10 @@
|
|||||||
`endif
|
`endif
|
||||||
|
|
||||||
// LSU Duplicate Address Check
|
// LSU Duplicate Address Check
|
||||||
`ifdef LSU_DUP
|
`ifndef LSU_DUP_DISABLE
|
||||||
|
`define LSU_DUP_ENABLE
|
||||||
|
`endif
|
||||||
|
`ifdef LSU_DUP_ENABLE
|
||||||
`define LSU_DUP_ENABLED 1
|
`define LSU_DUP_ENABLED 1
|
||||||
`else
|
`else
|
||||||
`define LSU_DUP_ENABLED 0
|
`define LSU_DUP_ENABLED 0
|
||||||
@@ -381,7 +384,7 @@
|
|||||||
|
|
||||||
// Number of Cache Units
|
// Number of Cache Units
|
||||||
`ifndef NUM_ICACHES
|
`ifndef NUM_ICACHES
|
||||||
`define NUM_ICACHES `UP(`NUM_CORES / 4)
|
`define NUM_ICACHES `UP(`SOCKET_SIZE / 4)
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Cache Size
|
// Cache Size
|
||||||
@@ -430,7 +433,7 @@
|
|||||||
|
|
||||||
// Number of Cache Units
|
// Number of Cache Units
|
||||||
`ifndef NUM_DCACHES
|
`ifndef NUM_DCACHES
|
||||||
`define NUM_DCACHES `UP(`NUM_CORES / 4)
|
`define NUM_DCACHES `UP(`SOCKET_SIZE / 4)
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Cache Size
|
// Cache Size
|
||||||
|
|||||||
@@ -410,8 +410,22 @@
|
|||||||
assign dst = src; \
|
assign dst = src; \
|
||||||
end
|
end
|
||||||
|
|
||||||
`define TO_DISPATCH_DATA(data, tid) \
|
`define TO_DISPATCH_DATA(data, tid) { \
|
||||||
{data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data}
|
data.uuid, \
|
||||||
|
data.wis, \
|
||||||
|
data.tmask, \
|
||||||
|
data.op_type, \
|
||||||
|
data.op_mod, \
|
||||||
|
data.wb, \
|
||||||
|
data.use_PC, \
|
||||||
|
data.use_imm, \
|
||||||
|
data.PC, \
|
||||||
|
data.imm, \
|
||||||
|
data.rd, \
|
||||||
|
tid, \
|
||||||
|
data.rs1_data, \
|
||||||
|
data.rs2_data, \
|
||||||
|
data.rs3_data}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|||||||
@@ -141,8 +141,9 @@ package VX_gpu_pkg;
|
|||||||
|
|
||||||
/////////////////////////////// L1 Parameters /////////////////////////////
|
/////////////////////////////// L1 Parameters /////////////////////////////
|
||||||
|
|
||||||
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
localparam ICACHE_MEM_ARB_TAG_WIDTH = (ICACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS));
|
||||||
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
|
localparam DCACHE_MEM_ARB_TAG_WIDTH = (DCACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS));
|
||||||
|
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_ARB_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
|
||||||
|
|
||||||
/////////////////////////////// L2 Parameters /////////////////////////////
|
/////////////////////////////// L2 Parameters /////////////////////////////
|
||||||
|
|
||||||
@@ -150,10 +151,10 @@ package VX_gpu_pkg;
|
|||||||
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
||||||
|
|
||||||
// Input request size
|
// Input request size
|
||||||
localparam L2_NUM_REQS = `NUM_SOCKETS;
|
localparam L2_NUM_REQS = 2;
|
||||||
|
|
||||||
// Core request tag bits
|
// Core request tag bits
|
||||||
localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH;
|
localparam L2_TAG_WIDTH = L1_MEM_TAG_WIDTH;
|
||||||
|
|
||||||
// Memory request data bits
|
// Memory request data bits
|
||||||
localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8);
|
localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8);
|
||||||
|
|||||||
@@ -30,7 +30,8 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
VX_dcr_bus_if.slave dcr_bus_if,
|
VX_dcr_bus_if.slave dcr_bus_if,
|
||||||
|
|
||||||
// Memory
|
// Memory
|
||||||
VX_mem_bus_if.master mem_bus_if,
|
VX_mem_bus_if.master icache_mem_bus_if,
|
||||||
|
VX_mem_bus_if.master dcache_mem_bus_if,
|
||||||
|
|
||||||
`ifdef GBAR_ENABLE
|
`ifdef GBAR_ENABLE
|
||||||
// Barrier
|
// Barrier
|
||||||
@@ -76,47 +77,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
|
||||||
) icache_mem_bus_if();
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
|
||||||
) dcache_mem_bus_if();
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
|
||||||
) cache_mem_bus_if[2]();
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
|
||||||
) mem_bus_tmp_if[1]();
|
|
||||||
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
|
||||||
|
|
||||||
`RESET_RELAY (mem_arb_reset, reset);
|
|
||||||
|
|
||||||
VX_mem_arb #(
|
|
||||||
.NUM_INPUTS (2),
|
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
|
||||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
|
||||||
.ARBITER ("R"),
|
|
||||||
.OUT_REG_REQ (2),
|
|
||||||
.OUT_REG_RSP (2)
|
|
||||||
) mem_arb (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (mem_arb_reset),
|
|
||||||
.bus_in_if (cache_mem_bus_if),
|
|
||||||
.bus_out_if (mem_bus_tmp_if)
|
|
||||||
);
|
|
||||||
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|||||||
@@ -78,33 +78,25 @@
|
|||||||
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
|
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
|
||||||
`define VX_CSR_MPM_SCRB_ST 12'hB06
|
`define VX_CSR_MPM_SCRB_ST 12'hB06
|
||||||
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
|
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
|
||||||
`define VX_CSR_MPM_ALU_ST 12'hB07
|
`define VX_CSR_MPM_SCRB_ALU 12'hB07
|
||||||
`define VX_CSR_MPM_ALU_ST_H 12'hB87
|
`define VX_CSR_MPM_SCRB_ALU_H 12'hB87
|
||||||
`define VX_CSR_MPM_LSU_ST 12'hB08
|
`define VX_CSR_MPM_SCRB_FPU 12'hB08
|
||||||
`define VX_CSR_MPM_LSU_ST_H 12'hB88
|
`define VX_CSR_MPM_SCRB_FPU_H 12'hB88
|
||||||
`define VX_CSR_MPM_FPU_ST 12'hB09
|
`define VX_CSR_MPM_SCRB_LSU 12'hB09
|
||||||
`define VX_CSR_MPM_FPU_ST_H 12'hB89
|
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
|
||||||
`define VX_CSR_MPM_SFU_ST 12'hB0A
|
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
|
||||||
`define VX_CSR_MPM_SFU_ST_H 12'hB8A
|
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
|
||||||
`define VX_CSR_MPM_SCRB_ALU 12'hB0B
|
|
||||||
`define VX_CSR_MPM_SCRB_ALU_H 12'hB8B
|
|
||||||
`define VX_CSR_MPM_SCRB_FPU 12'hB0C
|
|
||||||
`define VX_CSR_MPM_SCRB_FPU_H 12'hB8C
|
|
||||||
`define VX_CSR_MPM_SCRB_LSU 12'hB0D
|
|
||||||
`define VX_CSR_MPM_SCRB_LSU_H 12'hB8D
|
|
||||||
`define VX_CSR_MPM_SCRB_SFU 12'hB0E
|
|
||||||
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8E
|
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
`define VX_CSR_MPM_IFETCHES 12'hB0F
|
`define VX_CSR_MPM_IFETCHES 12'hB0B
|
||||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8F
|
`define VX_CSR_MPM_IFETCHES_H 12'hB8B
|
||||||
`define VX_CSR_MPM_LOADS 12'hB10
|
`define VX_CSR_MPM_LOADS 12'hB0C
|
||||||
`define VX_CSR_MPM_LOADS_H 12'hB90
|
`define VX_CSR_MPM_LOADS_H 12'hB8C
|
||||||
`define VX_CSR_MPM_STORES 12'hB11
|
`define VX_CSR_MPM_STORES 12'hB0D
|
||||||
`define VX_CSR_MPM_STORES_H 12'hB91
|
`define VX_CSR_MPM_STORES_H 12'hB8D
|
||||||
`define VX_CSR_MPM_IFETCH_LT 12'hB12
|
`define VX_CSR_MPM_IFETCH_LT 12'hB0E
|
||||||
`define VX_CSR_MPM_IFETCH_LT_H 12'hB92
|
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
|
||||||
`define VX_CSR_MPM_LOAD_LT 12'hB13
|
`define VX_CSR_MPM_LOAD_LT 12'hB0F
|
||||||
`define VX_CSR_MPM_LOAD_LT_H 12'hB93
|
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
|
||||||
|
|
||||||
// Machine Performance-monitoring memory counters
|
// Machine Performance-monitoring memory counters
|
||||||
// PERF: icache
|
// PERF: icache
|
||||||
|
|||||||
@@ -273,23 +273,23 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
wire [1:0] perf_icache_pending_read_cycle;
|
wire [1:0] perf_icache_pending_read_cycle;
|
||||||
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||||
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
||||||
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
|
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||||
|
|
||||||
wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready;
|
wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
|
||||||
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready;
|
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||||
|
|
||||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
|
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
|
||||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
|
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
|
||||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
|
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
|
||||||
|
|
||||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||||
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
|
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
|
||||||
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
|
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
|
||||||
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -195,19 +195,6 @@ import VX_fpu_pkg::*;
|
|||||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||||
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0];
|
|
||||||
`VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
|
||||||
`VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0];
|
|
||||||
`VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
|
||||||
`ifdef EXT_F_ENABLE
|
|
||||||
`VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0];
|
|
||||||
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
|
||||||
`else
|
|
||||||
`VX_CSR_MPM_FPU_ST : read_data_ro_r = '0;
|
|
||||||
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
|
|
||||||
`endif
|
|
||||||
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
|
|
||||||
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
|
||||||
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0];
|
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0];
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
@@ -220,7 +207,7 @@ import VX_fpu_pkg::*;
|
|||||||
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0];
|
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0];
|
||||||
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];
|
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||||
|
|||||||
@@ -84,7 +84,7 @@ module VX_issue #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (dispatch_reset),
|
.reset (dispatch_reset),
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.perf_stalls (perf_issue_if.dsp_stalls),
|
`UNUSED_PIN (perf_stalls),
|
||||||
`endif
|
`endif
|
||||||
.operands_if (operands_if),
|
.operands_if (operands_if),
|
||||||
.alu_dispatch_if(alu_dispatch_if),
|
.alu_dispatch_if(alu_dispatch_if),
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
// detect duplicate addresses
|
// detect duplicate addresses
|
||||||
|
|
||||||
wire lsu_is_dup;
|
wire lsu_is_dup;
|
||||||
`ifdef LSU_DUP
|
`ifdef LSU_DUP_ENABLE
|
||||||
if (NUM_LANES > 1) begin
|
if (NUM_LANES > 1) begin
|
||||||
wire [NUM_LANES-2:0] addr_matches;
|
wire [NUM_LANES-2:0] addr_matches;
|
||||||
for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
|
for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
|
||||||
@@ -304,7 +304,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
assign mem_req_tag = {
|
assign mem_req_tag = {
|
||||||
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
|
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
|
||||||
`ifdef LSU_DUP
|
`ifdef LSU_DUP_ENABLE
|
||||||
, lsu_is_dup
|
, lsu_is_dup
|
||||||
`endif
|
`endif
|
||||||
};
|
};
|
||||||
@@ -448,13 +448,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
wire [PID_WIDTH-1:0] rsp_pid;
|
wire [PID_WIDTH-1:0] rsp_pid;
|
||||||
wire rsp_is_dup;
|
wire rsp_is_dup;
|
||||||
|
|
||||||
`ifndef LSU_DUP
|
`ifndef LSU_DUP_ENABLE
|
||||||
assign rsp_is_dup = 0;
|
assign rsp_is_dup = 0;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
assign {
|
assign {
|
||||||
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
|
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
|
||||||
`ifdef LSU_DUP
|
`ifdef LSU_DUP_ENABLE
|
||||||
, rsp_is_dup
|
, rsp_is_dup
|
||||||
`endif
|
`endif
|
||||||
} = mem_rsp_tag;
|
} = mem_rsp_tag;
|
||||||
|
|||||||
@@ -14,18 +14,17 @@
|
|||||||
`include "VX_define.vh"
|
`include "VX_define.vh"
|
||||||
|
|
||||||
interface VX_pipeline_perf_if ();
|
interface VX_pipeline_perf_if ();
|
||||||
wire [`PERF_CTR_BITS-1:0] sched_idles;
|
wire [`PERF_CTR_BITS-1:0] sched_idles;
|
||||||
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
|
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
|
||||||
wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS];
|
|
||||||
|
|
||||||
wire [`PERF_CTR_BITS-1:0] ifetches;
|
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||||
wire [`PERF_CTR_BITS-1:0] loads;
|
wire [`PERF_CTR_BITS-1:0] loads;
|
||||||
wire [`PERF_CTR_BITS-1:0] stores;
|
wire [`PERF_CTR_BITS-1:0] stores;
|
||||||
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||||
|
|
||||||
modport schedule (
|
modport schedule (
|
||||||
output sched_idles,
|
output sched_idles,
|
||||||
@@ -35,8 +34,7 @@ interface VX_pipeline_perf_if ();
|
|||||||
modport issue (
|
modport issue (
|
||||||
output ibf_stalls,
|
output ibf_stalls,
|
||||||
output scb_stalls,
|
output scb_stalls,
|
||||||
output scb_uses,
|
output scb_uses
|
||||||
output dsp_stalls
|
|
||||||
);
|
);
|
||||||
|
|
||||||
modport slave (
|
modport slave (
|
||||||
@@ -45,7 +43,6 @@ interface VX_pipeline_perf_if ();
|
|||||||
input ibf_stalls,
|
input ibf_stalls,
|
||||||
input scb_stalls,
|
input scb_stalls,
|
||||||
input scb_uses,
|
input scb_uses,
|
||||||
input dsp_stalls,
|
|
||||||
input ifetches,
|
input ifetches,
|
||||||
input loads,
|
input loads,
|
||||||
input stores,
|
input stores,
|
||||||
|
|||||||
@@ -204,10 +204,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
uint64_t sched_stalls = 0;
|
uint64_t sched_stalls = 0;
|
||||||
uint64_t ibuffer_stalls = 0;
|
uint64_t ibuffer_stalls = 0;
|
||||||
uint64_t scrb_stalls = 0;
|
uint64_t scrb_stalls = 0;
|
||||||
uint64_t lsu_stalls = 0;
|
|
||||||
uint64_t fpu_stalls = 0;
|
|
||||||
uint64_t alu_stalls = 0;
|
|
||||||
uint64_t sfu_stalls = 0;
|
|
||||||
uint64_t scrb_alu = 0;
|
uint64_t scrb_alu = 0;
|
||||||
uint64_t scrb_fpu = 0;
|
uint64_t scrb_fpu = 0;
|
||||||
uint64_t scrb_lsu = 0;
|
uint64_t scrb_lsu = 0;
|
||||||
@@ -310,34 +306,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
calcAvgPercent(scrb_sfu_per_core, scrb_total));
|
calcAvgPercent(scrb_sfu_per_core, scrb_total));
|
||||||
scrb_stalls += scrb_stalls_per_core;
|
scrb_stalls += scrb_stalls_per_core;
|
||||||
}
|
}
|
||||||
// alu_stalls
|
|
||||||
{
|
|
||||||
uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST);
|
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
|
|
||||||
alu_stalls += alu_stalls_per_core;
|
|
||||||
}
|
|
||||||
// lsu_stalls
|
|
||||||
{
|
|
||||||
uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST);
|
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
|
|
||||||
lsu_stalls += lsu_stalls_per_core;
|
|
||||||
}
|
|
||||||
// fpu_stalls
|
|
||||||
{
|
|
||||||
uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST);
|
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
|
|
||||||
fpu_stalls += fpu_stalls_per_core;
|
|
||||||
}
|
|
||||||
// sfu_stalls
|
|
||||||
{
|
|
||||||
uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST);
|
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core);
|
|
||||||
sfu_stalls += sfu_stalls_per_core;
|
|
||||||
}
|
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
// ifetches
|
// ifetches
|
||||||
{
|
{
|
||||||
uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
|
uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCHES);
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
|
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
|
||||||
ifetches += ifetches_per_core;
|
ifetches += ifetches_per_core;
|
||||||
|
|
||||||
@@ -464,10 +436,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
calcAvgPercent(scrb_fpu, scrb_total),
|
calcAvgPercent(scrb_fpu, scrb_total),
|
||||||
calcAvgPercent(scrb_lsu, scrb_total),
|
calcAvgPercent(scrb_lsu, scrb_total),
|
||||||
calcAvgPercent(scrb_sfu, scrb_total));
|
calcAvgPercent(scrb_sfu, scrb_total));
|
||||||
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
|
|
||||||
fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls);
|
|
||||||
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
|
|
||||||
fprintf(stream, "PERF: sfu unit stalls=%ld\n", sfu_stalls);
|
|
||||||
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
|
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
|
||||||
fprintf(stream, "PERF: loads=%ld\n", loads);
|
fprintf(stream, "PERF: loads=%ld\n", loads);
|
||||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ private:
|
|||||||
class vx_device {
|
class vx_device {
|
||||||
public:
|
public:
|
||||||
vx_device()
|
vx_device()
|
||||||
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, NUM_CLUSTERS)
|
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
|
||||||
, ram_(RAM_PAGE_SIZE)
|
, ram_(RAM_PAGE_SIZE)
|
||||||
, processor_(arch_)
|
, processor_(arch_)
|
||||||
, global_mem_(
|
, global_mem_(
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
|
|||||||
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
|
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
|
||||||
|
|
||||||
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
|
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
|
||||||
SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
|
SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
|
||||||
|
|
||||||
# Debugigng
|
# Debugigng
|
||||||
ifdef DEBUG
|
ifdef DEBUG
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ private:
|
|||||||
uint16_t num_warps_;
|
uint16_t num_warps_;
|
||||||
uint16_t num_cores_;
|
uint16_t num_cores_;
|
||||||
uint16_t num_clusters_;
|
uint16_t num_clusters_;
|
||||||
|
uint16_t socket_size_;
|
||||||
uint16_t vsize_;
|
uint16_t vsize_;
|
||||||
uint16_t num_regs_;
|
uint16_t num_regs_;
|
||||||
uint16_t num_csrs_;
|
uint16_t num_csrs_;
|
||||||
@@ -35,11 +36,12 @@ private:
|
|||||||
uint16_t ipdom_size_;
|
uint16_t ipdom_size_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)
|
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
|
||||||
: num_threads_(num_threads)
|
: num_threads_(num_threads)
|
||||||
, num_warps_(num_warps)
|
, num_warps_(num_warps)
|
||||||
, num_cores_(num_cores)
|
, num_cores_(num_cores)
|
||||||
, num_clusters_(num_clusters)
|
, num_clusters_(NUM_CLUSTERS)
|
||||||
|
, socket_size_(SOCKET_SIZE)
|
||||||
, vsize_(16)
|
, vsize_(16)
|
||||||
, num_regs_(32)
|
, num_regs_(32)
|
||||||
, num_csrs_(4096)
|
, num_csrs_(4096)
|
||||||
@@ -82,6 +84,10 @@ public:
|
|||||||
uint16_t num_clusters() const {
|
uint16_t num_clusters() const {
|
||||||
return num_clusters_;
|
return num_clusters_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint16_t socket_size() const {
|
||||||
|
return socket_size_;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -24,14 +24,38 @@ Cluster::Cluster(const SimContext& ctx,
|
|||||||
, mem_req_port(this)
|
, mem_req_port(this)
|
||||||
, mem_rsp_port(this)
|
, mem_rsp_port(this)
|
||||||
, cluster_id_(cluster_id)
|
, cluster_id_(cluster_id)
|
||||||
, cores_(arch.num_cores())
|
, sockets_(NUM_SOCKETS)
|
||||||
, barriers_(arch.num_barriers(), 0)
|
, barriers_(arch.num_barriers(), 0)
|
||||||
, sharedmems_(arch.num_cores())
|
|
||||||
, processor_(processor)
|
, processor_(processor)
|
||||||
|
, cores_per_socket_(arch.socket_size())
|
||||||
{
|
{
|
||||||
auto num_cores = arch.num_cores();
|
|
||||||
|
|
||||||
char sname[100];
|
char sname[100];
|
||||||
|
|
||||||
|
auto sockets_per_cluster = sockets_.size();
|
||||||
|
|
||||||
|
// create sockets
|
||||||
|
|
||||||
|
snprintf(sname, 100, "cluster%d-icache-arb", cluster_id);
|
||||||
|
auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
|
||||||
|
|
||||||
|
snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id);
|
||||||
|
auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
|
||||||
|
uint32_t socket_id = cluster_id * sockets_per_cluster + i;
|
||||||
|
auto socket = Socket::Create(socket_id, this, arch, dcrs);
|
||||||
|
|
||||||
|
socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
|
||||||
|
icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
|
||||||
|
|
||||||
|
socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i));
|
||||||
|
dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port);
|
||||||
|
|
||||||
|
sockets_.at(i) = socket;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create l2cache
|
||||||
|
|
||||||
snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
|
snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
|
||||||
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
|
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
|
||||||
!L2_ENABLED,
|
!L2_ENABLED,
|
||||||
@@ -42,7 +66,7 @@ Cluster::Cluster(const SimContext& ctx,
|
|||||||
log2ceil(L2_NUM_BANKS), // B
|
log2ceil(L2_NUM_BANKS), // B
|
||||||
XLEN, // address bits
|
XLEN, // address bits
|
||||||
1, // number of ports
|
1, // number of ports
|
||||||
5, // request size
|
2, // request size
|
||||||
true, // write-through
|
true, // write-through
|
||||||
false, // write response
|
false, // write response
|
||||||
L2_MSHR_SIZE, // mshr
|
L2_MSHR_SIZE, // mshr
|
||||||
@@ -52,87 +76,11 @@ Cluster::Cluster(const SimContext& ctx,
|
|||||||
l2cache_->MemReqPort.bind(&this->mem_req_port);
|
l2cache_->MemReqPort.bind(&this->mem_req_port);
|
||||||
this->mem_rsp_port.bind(&l2cache_->MemRspPort);
|
this->mem_rsp_port.bind(&l2cache_->MemRspPort);
|
||||||
|
|
||||||
snprintf(sname, 100, "cluster%d-icaches", cluster_id);
|
icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0));
|
||||||
icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
|
l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0));
|
||||||
!ICACHE_ENABLED,
|
|
||||||
log2ceil(ICACHE_SIZE), // C
|
|
||||||
log2ceil(L1_LINE_SIZE), // L
|
|
||||||
log2ceil(sizeof(uint32_t)), // W
|
|
||||||
log2ceil(ICACHE_NUM_WAYS),// A
|
|
||||||
1, // B
|
|
||||||
XLEN, // address bits
|
|
||||||
1, // number of ports
|
|
||||||
1, // number of inputs
|
|
||||||
true, // write-through
|
|
||||||
false, // write response
|
|
||||||
(uint8_t)arch.num_warps(), // mshr
|
|
||||||
2, // pipeline latency
|
|
||||||
});
|
|
||||||
|
|
||||||
icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
|
dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1));
|
||||||
l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
|
l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0));
|
||||||
|
|
||||||
snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
|
|
||||||
dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
|
|
||||||
!DCACHE_ENABLED,
|
|
||||||
log2ceil(DCACHE_SIZE), // C
|
|
||||||
log2ceil(L1_LINE_SIZE), // L
|
|
||||||
log2ceil(sizeof(Word)), // W
|
|
||||||
log2ceil(DCACHE_NUM_WAYS),// A
|
|
||||||
log2ceil(DCACHE_NUM_BANKS), // B
|
|
||||||
XLEN, // address bits
|
|
||||||
1, // number of ports
|
|
||||||
DCACHE_NUM_BANKS, // number of inputs
|
|
||||||
true, // write-through
|
|
||||||
false, // write response
|
|
||||||
DCACHE_MSHR_SIZE, // mshr
|
|
||||||
4, // pipeline latency
|
|
||||||
});
|
|
||||||
|
|
||||||
dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
|
|
||||||
l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
// create shared memory blocks
|
|
||||||
for (uint32_t i = 0; i < num_cores; ++i) {
|
|
||||||
snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
|
|
||||||
sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
|
|
||||||
(1 << SMEM_LOG_SIZE),
|
|
||||||
sizeof(Word),
|
|
||||||
NUM_LSU_LANES,
|
|
||||||
NUM_LSU_LANES,
|
|
||||||
false
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// create cores
|
|
||||||
|
|
||||||
for (uint32_t i = 0; i < num_cores; ++i) {
|
|
||||||
uint32_t core_id = cluster_id * num_cores + i;
|
|
||||||
cores_.at(i) = Core::Create(core_id,
|
|
||||||
this,
|
|
||||||
arch,
|
|
||||||
dcrs,
|
|
||||||
sharedmems_.at(i));
|
|
||||||
|
|
||||||
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
|
|
||||||
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
|
|
||||||
|
|
||||||
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
|
|
||||||
snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
|
|
||||||
auto smem_demux = SMemDemux::Create(sname);
|
|
||||||
|
|
||||||
cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
|
|
||||||
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
|
|
||||||
|
|
||||||
smem_demux->ReqDC.bind(&dcaches_->CoreReqPorts.at(i).at(j));
|
|
||||||
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDC);
|
|
||||||
|
|
||||||
smem_demux->ReqSM.bind(&sharedmems_.at(i)->Inputs.at(j));
|
|
||||||
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSM);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Cluster::~Cluster() {
|
Cluster::~Cluster() {
|
||||||
@@ -150,14 +98,14 @@ void Cluster::tick() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Cluster::attach_ram(RAM* ram) {
|
void Cluster::attach_ram(RAM* ram) {
|
||||||
for (auto core : cores_) {
|
for (auto& socket : sockets_) {
|
||||||
core->attach_ram(ram);
|
socket->attach_ram(ram);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Cluster::running() const {
|
bool Cluster::running() const {
|
||||||
for (auto& core : cores_) {
|
for (auto& socket : sockets_) {
|
||||||
if (core->running())
|
if (socket->running())
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
@@ -166,9 +114,9 @@ bool Cluster::running() const {
|
|||||||
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
|
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
|
||||||
bool done = true;
|
bool done = true;
|
||||||
Word exitcode_ = 0;
|
Word exitcode_ = 0;
|
||||||
for (auto& core : cores_) {
|
for (auto& socket : sockets_) {
|
||||||
Word ec;
|
Word ec;
|
||||||
if (core->check_exit(&ec, riscv_test)) {
|
if (socket->check_exit(&ec, riscv_test)) {
|
||||||
exitcode_ |= ec;
|
exitcode_ |= ec;
|
||||||
} else {
|
} else {
|
||||||
done = false;
|
done = false;
|
||||||
@@ -181,36 +129,32 @@ bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
|
|||||||
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
|
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
|
||||||
auto& barrier = barriers_.at(bar_id);
|
auto& barrier = barriers_.at(bar_id);
|
||||||
|
|
||||||
uint32_t local_core_id = core_id % cores_.size();
|
auto sockets_per_cluster = sockets_.size();
|
||||||
|
auto cores_per_socket = cores_per_socket_;
|
||||||
|
|
||||||
|
uint32_t cores_per_cluster = sockets_per_cluster * cores_per_socket;
|
||||||
|
uint32_t local_core_id = core_id % cores_per_cluster;
|
||||||
barrier.set(local_core_id);
|
barrier.set(local_core_id);
|
||||||
|
|
||||||
DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
|
DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
|
||||||
|
|
||||||
if (barrier.count() == (size_t)count) {
|
if (barrier.count() == (size_t)count) {
|
||||||
// resume all suspended cores
|
// resume all suspended cores
|
||||||
for (uint32_t i = 0; i < cores_.size(); ++i) {
|
for (uint32_t s = 0; s < sockets_per_cluster; ++s) {
|
||||||
if (barrier.test(i)) {
|
for (uint32_t c = 0; c < cores_per_socket; ++c) {
|
||||||
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
|
uint32_t i = s * cores_per_socket + c;
|
||||||
cores_.at(i)->resume();
|
if (barrier.test(i)) {
|
||||||
|
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
|
||||||
|
sockets_.at(s)->resume(c);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
barrier.reset();
|
barrier.reset();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ProcessorImpl* Cluster::processor() const {
|
|
||||||
return processor_;
|
|
||||||
}
|
|
||||||
|
|
||||||
Cluster::PerfStats Cluster::perf_stats() const {
|
Cluster::PerfStats Cluster::perf_stats() const {
|
||||||
Cluster::PerfStats perf;
|
Cluster::PerfStats perf;
|
||||||
perf.icache = icaches_->perf_stats();
|
|
||||||
perf.dcache = dcaches_->perf_stats();
|
|
||||||
perf.l2cache = l2cache_->perf_stats();
|
perf.l2cache = l2cache_->perf_stats();
|
||||||
|
|
||||||
for (auto sharedmem : sharedmems_) {
|
|
||||||
perf.sharedmem += sharedmem->perf_stats();
|
|
||||||
}
|
|
||||||
|
|
||||||
return perf;
|
return perf;
|
||||||
}
|
}
|
||||||
@@ -17,8 +17,8 @@
|
|||||||
#include "dcrs.h"
|
#include "dcrs.h"
|
||||||
#include "arch.h"
|
#include "arch.h"
|
||||||
#include "cache_cluster.h"
|
#include "cache_cluster.h"
|
||||||
#include "shared_mem.h"
|
|
||||||
#include "core.h"
|
#include "core.h"
|
||||||
|
#include "socket.h"
|
||||||
#include "constants.h"
|
#include "constants.h"
|
||||||
|
|
||||||
namespace vortex {
|
namespace vortex {
|
||||||
@@ -27,17 +27,11 @@ class ProcessorImpl;
|
|||||||
|
|
||||||
class Cluster : public SimObject<Cluster> {
|
class Cluster : public SimObject<Cluster> {
|
||||||
public:
|
public:
|
||||||
struct PerfStats {
|
struct PerfStats {
|
||||||
CacheSim::PerfStats icache;
|
CacheSim::PerfStats l2cache;
|
||||||
CacheSim::PerfStats dcache;
|
|
||||||
SharedMem::PerfStats sharedmem;
|
|
||||||
CacheSim::PerfStats l2cache;
|
|
||||||
|
|
||||||
PerfStats& operator+=(const PerfStats& rhs) {
|
PerfStats& operator+=(const PerfStats& rhs) {
|
||||||
this->icache += rhs.icache;
|
this->l2cache += rhs.l2cache;
|
||||||
this->dcache += rhs.dcache;
|
|
||||||
this->sharedmem += rhs.sharedmem;
|
|
||||||
this->l2cache += rhs.l2cache;
|
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -53,6 +47,14 @@ public:
|
|||||||
|
|
||||||
~Cluster();
|
~Cluster();
|
||||||
|
|
||||||
|
uint32_t id() const {
|
||||||
|
return cluster_id_;
|
||||||
|
}
|
||||||
|
|
||||||
|
ProcessorImpl* processor() const {
|
||||||
|
return processor_;
|
||||||
|
}
|
||||||
|
|
||||||
void reset();
|
void reset();
|
||||||
|
|
||||||
void tick();
|
void tick();
|
||||||
@@ -65,22 +67,15 @@ public:
|
|||||||
|
|
||||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
|
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
|
||||||
|
|
||||||
ProcessorImpl* processor() const;
|
|
||||||
|
|
||||||
Cluster::PerfStats perf_stats() const;
|
Cluster::PerfStats perf_stats() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
uint32_t cluster_id_;
|
uint32_t cluster_id_;
|
||||||
std::vector<Core::Ptr> cores_;
|
std::vector<Socket::Ptr> sockets_;
|
||||||
std::vector<CoreMask> barriers_;
|
std::vector<CoreMask> barriers_;
|
||||||
CacheSim::Ptr l2cache_;
|
CacheSim::Ptr l2cache_;
|
||||||
CacheCluster::Ptr icaches_;
|
ProcessorImpl* processor_;
|
||||||
CacheCluster::Ptr dcaches_;
|
uint32_t cores_per_socket_;
|
||||||
std::vector<SharedMem::Ptr> sharedmems_;
|
|
||||||
CacheCluster::Ptr tcaches_;
|
|
||||||
CacheCluster::Ptr ocaches_;
|
|
||||||
CacheCluster::Ptr rcaches_;
|
|
||||||
ProcessorImpl* processor_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace vortex
|
} // namespace vortex
|
||||||
@@ -21,18 +21,14 @@
|
|||||||
#include "mem.h"
|
#include "mem.h"
|
||||||
#include "decode.h"
|
#include "decode.h"
|
||||||
#include "core.h"
|
#include "core.h"
|
||||||
|
#include "socket.h"
|
||||||
#include "debug.h"
|
#include "debug.h"
|
||||||
#include "constants.h"
|
#include "constants.h"
|
||||||
#include "processor_impl.h"
|
#include "processor_impl.h"
|
||||||
|
|
||||||
using namespace vortex;
|
using namespace vortex;
|
||||||
|
|
||||||
Core::Core(const SimContext& ctx,
|
Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs)
|
||||||
uint32_t core_id,
|
|
||||||
Cluster* cluster,
|
|
||||||
const Arch &arch,
|
|
||||||
const DCRS &dcrs,
|
|
||||||
SharedMem::Ptr sharedmem)
|
|
||||||
: SimObject(ctx, "core")
|
: SimObject(ctx, "core")
|
||||||
, icache_req_ports(1, this)
|
, icache_req_ports(1, this)
|
||||||
, icache_rsp_ports(1, this)
|
, icache_rsp_ports(1, this)
|
||||||
@@ -50,12 +46,12 @@ Core::Core(const SimContext& ctx,
|
|||||||
, operands_(ISSUE_WIDTH)
|
, operands_(ISSUE_WIDTH)
|
||||||
, dispatchers_((uint32_t)ExeType::ExeTypeCount)
|
, dispatchers_((uint32_t)ExeType::ExeTypeCount)
|
||||||
, exe_units_((uint32_t)ExeType::ExeTypeCount)
|
, exe_units_((uint32_t)ExeType::ExeTypeCount)
|
||||||
, sharedmem_(sharedmem)
|
, smem_demuxs_(NUM_LSU_LANES)
|
||||||
, fetch_latch_("fetch")
|
, fetch_latch_("fetch")
|
||||||
, decode_latch_("decode")
|
, decode_latch_("decode")
|
||||||
, pending_icache_(arch_.num_warps())
|
, pending_icache_(arch_.num_warps())
|
||||||
, csrs_(arch.num_warps())
|
, csrs_(arch.num_warps())
|
||||||
, cluster_(cluster)
|
, socket_(socket)
|
||||||
, commit_arbs_(ISSUE_WIDTH)
|
, commit_arbs_(ISSUE_WIDTH)
|
||||||
{
|
{
|
||||||
char sname[100];
|
char sname[100];
|
||||||
@@ -72,6 +68,27 @@ Core::Core(const SimContext& ctx,
|
|||||||
operands_.at(i) = SimPlatform::instance().create_object<Operand>();
|
operands_.at(i) = SimPlatform::instance().create_object<Operand>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// initialize shared memory
|
||||||
|
shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
|
||||||
|
(1 << SMEM_LOG_SIZE),
|
||||||
|
sizeof(Word),
|
||||||
|
NUM_LSU_LANES,
|
||||||
|
NUM_LSU_LANES,
|
||||||
|
false
|
||||||
|
});
|
||||||
|
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
|
||||||
|
snprintf(sname, 100, "smem_demux%d_%d", core_id, i);
|
||||||
|
auto smem_demux = SMemDemux::Create(sname);
|
||||||
|
|
||||||
|
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
|
||||||
|
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
|
||||||
|
|
||||||
|
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
|
||||||
|
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
|
||||||
|
|
||||||
|
smem_demuxs_.at(i) = smem_demux;
|
||||||
|
}
|
||||||
|
|
||||||
// initialize dispatchers
|
// initialize dispatchers
|
||||||
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
|
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
|
||||||
dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
|
dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
|
||||||
@@ -241,13 +258,6 @@ void Core::decode() {
|
|||||||
stalled_warps_.reset(trace->wid);
|
stalled_warps_.reset(trace->wid);
|
||||||
}
|
}
|
||||||
|
|
||||||
// update perf counters
|
|
||||||
uint32_t active_threads = trace->tmask.count();
|
|
||||||
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::LOAD)
|
|
||||||
perf_stats_.loads += active_threads;
|
|
||||||
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE)
|
|
||||||
perf_stats_.stores += active_threads;
|
|
||||||
|
|
||||||
DT(3, "pipeline-decode: " << *trace);
|
DT(3, "pipeline-decode: " << *trace);
|
||||||
|
|
||||||
// insert to ibuffer
|
// insert to ibuffer
|
||||||
@@ -394,7 +404,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
|
|||||||
if (is_global) {
|
if (is_global) {
|
||||||
// global barrier handling
|
// global barrier handling
|
||||||
if (barrier.count() == active_warps_.count()) {
|
if (barrier.count() == active_warps_.count()) {
|
||||||
cluster_->barrier(bar_idx, count, core_id_);
|
socket_->barrier(bar_idx, count, core_id_);
|
||||||
barrier.reset();
|
barrier.reset();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -431,7 +441,7 @@ AddrType Core::get_addr_type(uint64_t addr) {
|
|||||||
void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {
|
void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {
|
||||||
auto type = this->get_addr_type(addr);
|
auto type = this->get_addr_type(addr);
|
||||||
if (type == AddrType::Shared) {
|
if (type == AddrType::Shared) {
|
||||||
sharedmem_->read(data, addr, size);
|
shared_mem_->read(data, addr, size);
|
||||||
} else {
|
} else {
|
||||||
mmu_.read(data, addr, size, 0);
|
mmu_.read(data, addr, size, 0);
|
||||||
}
|
}
|
||||||
@@ -446,7 +456,7 @@ void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) {
|
|||||||
this->writeToStdOut(data, addr, size);
|
this->writeToStdOut(data, addr, size);
|
||||||
} else {
|
} else {
|
||||||
if (type == AddrType::Shared) {
|
if (type == AddrType::Shared) {
|
||||||
sharedmem_->write(data, addr, size);
|
shared_mem_->write(data, addr, size);
|
||||||
} else {
|
} else {
|
||||||
mmu_.write(data, addr, size, 0);
|
mmu_.write(data, addr, size, 0);
|
||||||
}
|
}
|
||||||
@@ -554,16 +564,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
|
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
|
||||||
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
|
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
|
||||||
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
|
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
|
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
|
||||||
case VX_CSR_MPM_ALU_ST: return perf_stats_.alu_stalls & 0xffffffff;
|
|
||||||
case VX_CSR_MPM_ALU_ST_H: return perf_stats_.alu_stalls >> 32;
|
|
||||||
case VX_CSR_MPM_LSU_ST: return perf_stats_.lsu_stalls & 0xffffffff;
|
|
||||||
case VX_CSR_MPM_LSU_ST_H: return perf_stats_.lsu_stalls >> 32;
|
|
||||||
case VX_CSR_MPM_FPU_ST: return perf_stats_.fpu_stalls & 0xffffffff;
|
|
||||||
case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32;
|
|
||||||
case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff;
|
|
||||||
case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32;
|
|
||||||
case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff;
|
case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff;
|
||||||
case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
|
case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
|
||||||
case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff;
|
case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff;
|
||||||
@@ -572,7 +574,6 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
|
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
|
||||||
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
|
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
|
||||||
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
|
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
|
||||||
|
|
||||||
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
|
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
|
||||||
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
|
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
|
||||||
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
|
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
|
||||||
@@ -586,27 +587,29 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case VX_DCR_MPM_CLASS_MEM: {
|
case VX_DCR_MPM_CLASS_MEM: {
|
||||||
auto proc_perf = cluster_->processor()->perf_stats();
|
auto proc_perf = socket_->cluster()->processor()->perf_stats();
|
||||||
|
auto socket_perf = socket_->perf_stats();
|
||||||
|
auto smem_perf = shared_mem_->perf_stats();
|
||||||
switch (addr) {
|
switch (addr) {
|
||||||
case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
|
case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff;
|
||||||
case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
|
case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32;
|
||||||
case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
|
case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff;
|
||||||
case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32;
|
case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32;
|
||||||
case VX_CSR_MPM_ICACHE_MSHR_ST: return proc_perf.clusters.icache.mshr_stalls & 0xffffffff;
|
case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return proc_perf.clusters.icache.mshr_stalls >> 32;
|
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
|
||||||
|
|
||||||
case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff;
|
case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff;
|
||||||
case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32;
|
case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32;
|
||||||
case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff;
|
case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff;
|
||||||
case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32;
|
case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32;
|
||||||
case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff;
|
case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff;
|
||||||
case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32;
|
case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32;
|
||||||
case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff;
|
case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff;
|
||||||
case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32;
|
case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32;
|
||||||
case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff;
|
case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_DCACHE_BANK_ST_H: return proc_perf.clusters.dcache.bank_stalls >> 32;
|
case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
|
||||||
case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff;
|
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return proc_perf.clusters.dcache.mshr_stalls >> 32;
|
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
|
||||||
|
|
||||||
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
|
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
|
||||||
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
|
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
|
||||||
@@ -641,12 +644,12 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
|
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
|
||||||
case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
|
case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
|
||||||
|
|
||||||
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
|
case VX_CSR_MPM_SMEM_READS: return smem_perf.reads & 0xffffffff;
|
||||||
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
|
case VX_CSR_MPM_SMEM_READS_H: return smem_perf.reads >> 32;
|
||||||
case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff;
|
case VX_CSR_MPM_SMEM_WRITES: return smem_perf.writes & 0xffffffff;
|
||||||
case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32;
|
case VX_CSR_MPM_SMEM_WRITES_H: return smem_perf.writes >> 32;
|
||||||
case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
|
case VX_CSR_MPM_SMEM_BANK_ST: return smem_perf.bank_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_SMEM_BANK_ST_H: return proc_perf.clusters.sharedmem.bank_stalls >> 32;
|
case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,7 +40,7 @@
|
|||||||
|
|
||||||
namespace vortex {
|
namespace vortex {
|
||||||
|
|
||||||
class Cluster;
|
class Socket;
|
||||||
|
|
||||||
using TraceSwitch = Mux<pipeline_trace_t*>;
|
using TraceSwitch = Mux<pipeline_trace_t*>;
|
||||||
|
|
||||||
@@ -53,10 +53,6 @@ public:
|
|||||||
uint64_t sched_stalls;
|
uint64_t sched_stalls;
|
||||||
uint64_t ibuf_stalls;
|
uint64_t ibuf_stalls;
|
||||||
uint64_t scrb_stalls;
|
uint64_t scrb_stalls;
|
||||||
uint64_t alu_stalls;
|
|
||||||
uint64_t lsu_stalls;
|
|
||||||
uint64_t fpu_stalls;
|
|
||||||
uint64_t sfu_stalls;
|
|
||||||
uint64_t scrb_alu;
|
uint64_t scrb_alu;
|
||||||
uint64_t scrb_fpu;
|
uint64_t scrb_fpu;
|
||||||
uint64_t scrb_lsu;
|
uint64_t scrb_lsu;
|
||||||
@@ -74,10 +70,6 @@ public:
|
|||||||
, sched_stalls(0)
|
, sched_stalls(0)
|
||||||
, ibuf_stalls(0)
|
, ibuf_stalls(0)
|
||||||
, scrb_stalls(0)
|
, scrb_stalls(0)
|
||||||
, alu_stalls(0)
|
|
||||||
, lsu_stalls(0)
|
|
||||||
, fpu_stalls(0)
|
|
||||||
, sfu_stalls(0)
|
|
||||||
, scrb_alu(0)
|
, scrb_alu(0)
|
||||||
, scrb_fpu(0)
|
, scrb_fpu(0)
|
||||||
, scrb_lsu(0)
|
, scrb_lsu(0)
|
||||||
@@ -96,12 +88,7 @@ public:
|
|||||||
std::vector<SimPort<MemReq>> dcache_req_ports;
|
std::vector<SimPort<MemReq>> dcache_req_ports;
|
||||||
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
|
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
|
||||||
|
|
||||||
Core(const SimContext& ctx,
|
Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs);
|
||||||
uint32_t core_id,
|
|
||||||
Cluster* cluster,
|
|
||||||
const Arch &arch,
|
|
||||||
const DCRS &dcrs,
|
|
||||||
SharedMem::Ptr sharedmem);
|
|
||||||
|
|
||||||
~Core();
|
~Core();
|
||||||
|
|
||||||
@@ -119,6 +106,10 @@ public:
|
|||||||
return core_id_;
|
return core_id_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Socket* socket() const {
|
||||||
|
return socket_;
|
||||||
|
}
|
||||||
|
|
||||||
const Arch& arch() const {
|
const Arch& arch() const {
|
||||||
return arch_;
|
return arch_;
|
||||||
}
|
}
|
||||||
@@ -181,7 +172,8 @@ private:
|
|||||||
std::vector<Operand::Ptr> operands_;
|
std::vector<Operand::Ptr> operands_;
|
||||||
std::vector<Dispatcher::Ptr> dispatchers_;
|
std::vector<Dispatcher::Ptr> dispatchers_;
|
||||||
std::vector<ExeUnit::Ptr> exe_units_;
|
std::vector<ExeUnit::Ptr> exe_units_;
|
||||||
SharedMem::Ptr sharedmem_;
|
SharedMem::Ptr shared_mem_;
|
||||||
|
std::vector<SMemDemux::Ptr> smem_demuxs_;
|
||||||
|
|
||||||
PipelineLatch fetch_latch_;
|
PipelineLatch fetch_latch_;
|
||||||
PipelineLatch decode_latch_;
|
PipelineLatch decode_latch_;
|
||||||
@@ -201,7 +193,7 @@ private:
|
|||||||
|
|
||||||
PerfStats perf_stats_;
|
PerfStats perf_stats_;
|
||||||
|
|
||||||
Cluster* cluster_;
|
Socket* socket_;
|
||||||
|
|
||||||
std::vector<TraceSwitch::Ptr> commit_arbs_;
|
std::vector<TraceSwitch::Ptr> commit_arbs_;
|
||||||
|
|
||||||
|
|||||||
@@ -51,8 +51,7 @@ void AluUnit::tick() {
|
|||||||
assert(core_->stalled_warps_.test(trace->wid));
|
assert(core_->stalled_warps_.test(trace->wid));
|
||||||
core_->stalled_warps_.reset(trace->wid);
|
core_->stalled_warps_.reset(trace->wid);
|
||||||
}
|
}
|
||||||
auto time = input.pop();
|
input.pop();
|
||||||
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -87,8 +86,7 @@ void FpuUnit::tick() {
|
|||||||
std::abort();
|
std::abort();
|
||||||
}
|
}
|
||||||
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
|
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
|
||||||
auto time = input.pop();
|
input.pop();
|
||||||
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -114,7 +112,7 @@ void LsuUnit::tick() {
|
|||||||
|
|
||||||
// handle dcache response
|
// handle dcache response
|
||||||
for (uint32_t t = 0; t < num_lanes_; ++t) {
|
for (uint32_t t = 0; t < num_lanes_; ++t) {
|
||||||
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
|
auto& dcache_rsp_port = core_->smem_demuxs_.at(t)->RspIn;
|
||||||
if (dcache_rsp_port.empty())
|
if (dcache_rsp_port.empty())
|
||||||
continue;
|
continue;
|
||||||
auto& mem_rsp = dcache_rsp_port.front();
|
auto& mem_rsp = dcache_rsp_port.front();
|
||||||
@@ -136,7 +134,7 @@ void LsuUnit::tick() {
|
|||||||
|
|
||||||
// handle shared memory response
|
// handle shared memory response
|
||||||
for (uint32_t t = 0; t < num_lanes_; ++t) {
|
for (uint32_t t = 0; t < num_lanes_; ++t) {
|
||||||
auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
|
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
|
||||||
if (smem_rsp_port.empty())
|
if (smem_rsp_port.empty())
|
||||||
continue;
|
continue;
|
||||||
auto& mem_rsp = smem_rsp_port.front();
|
auto& mem_rsp = smem_rsp_port.front();
|
||||||
@@ -184,8 +182,7 @@ void LsuUnit::tick() {
|
|||||||
fence_lock_ = true;
|
fence_lock_ = true;
|
||||||
DT(3, "fence-lock: " << *trace);
|
DT(3, "fence-lock: " << *trace);
|
||||||
// remove input
|
// remove input
|
||||||
auto time = input.pop();
|
input.pop();
|
||||||
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -213,7 +210,9 @@ void LsuUnit::tick() {
|
|||||||
auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
|
auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
|
||||||
matches += (addr0 == mem_addr);
|
matches += (addr0 == mem_addr);
|
||||||
}
|
}
|
||||||
|
#ifdef LSU_DUP_ENABLE
|
||||||
is_dup = (matches == trace->tmask.count());
|
is_dup = (matches == trace->tmask.count());
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t addr_count;
|
uint32_t addr_count;
|
||||||
@@ -229,7 +228,7 @@ void LsuUnit::tick() {
|
|||||||
if (!trace->tmask.test(t0 + t))
|
if (!trace->tmask.test(t0 + t))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
auto& dcache_req_port = core_->dcache_req_ports.at(t);
|
auto& dcache_req_port = core_->smem_demuxs_.at(t)->ReqIn;
|
||||||
auto mem_addr = trace_data->mem_addrs.at(t);
|
auto mem_addr = trace_data->mem_addrs.at(t);
|
||||||
auto type = core_->get_addr_type(mem_addr.addr);
|
auto type = core_->get_addr_type(mem_addr.addr);
|
||||||
|
|
||||||
@@ -241,12 +240,16 @@ void LsuUnit::tick() {
|
|||||||
mem_req.cid = trace->cid;
|
mem_req.cid = trace->cid;
|
||||||
mem_req.uuid = trace->uuid;
|
mem_req.uuid = trace->uuid;
|
||||||
|
|
||||||
dcache_req_port.send(mem_req, 2);
|
dcache_req_port.send(mem_req, 1);
|
||||||
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
|
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
|
||||||
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
|
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
|
||||||
|
|
||||||
++pending_loads_;
|
if (is_write) {
|
||||||
++core_->perf_stats_.loads;
|
++core_->perf_stats_.stores;
|
||||||
|
} else {
|
||||||
|
++core_->perf_stats_.loads;
|
||||||
|
++pending_loads_;
|
||||||
|
}
|
||||||
if (is_dup)
|
if (is_dup)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -254,13 +257,11 @@ void LsuUnit::tick() {
|
|||||||
// do not wait on writes
|
// do not wait on writes
|
||||||
if (is_write) {
|
if (is_write) {
|
||||||
pending_rd_reqs_.release(tag);
|
pending_rd_reqs_.release(tag);
|
||||||
output.send(trace, 1);
|
output.send(trace, 1);
|
||||||
++core_->perf_stats_.stores;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// remove input
|
// remove input
|
||||||
auto time = input.pop();
|
input.pop();
|
||||||
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
|
|
||||||
|
|
||||||
break; // single block
|
break; // single block
|
||||||
}
|
}
|
||||||
@@ -318,10 +319,7 @@ void SfuUnit::tick() {
|
|||||||
core_->stalled_warps_.reset(trace->wid);
|
core_->stalled_warps_.reset(trace->wid);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto time = input.pop();
|
input.pop();
|
||||||
auto stalls = (SimPlatform::instance().cycles() - time);
|
|
||||||
|
|
||||||
core_->perf_stats_.sfu_stalls += stalls;
|
|
||||||
|
|
||||||
break; // single block
|
break; // single block
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,14 +34,13 @@ static void show_usage() {
|
|||||||
uint32_t num_threads = NUM_THREADS;
|
uint32_t num_threads = NUM_THREADS;
|
||||||
uint32_t num_warps = NUM_WARPS;
|
uint32_t num_warps = NUM_WARPS;
|
||||||
uint32_t num_cores = NUM_CORES;
|
uint32_t num_cores = NUM_CORES;
|
||||||
uint32_t num_clusters = NUM_CLUSTERS;
|
|
||||||
bool showStats = false;;
|
bool showStats = false;;
|
||||||
bool riscv_test = false;
|
bool riscv_test = false;
|
||||||
const char* program = nullptr;
|
const char* program = nullptr;
|
||||||
|
|
||||||
static void parse_args(int argc, char **argv) {
|
static void parse_args(int argc, char **argv) {
|
||||||
int c;
|
int c;
|
||||||
while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
|
while ((c = getopt(argc, argv, "t:w:c:rsh?")) != -1) {
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 't':
|
case 't':
|
||||||
num_threads = atoi(optarg);
|
num_threads = atoi(optarg);
|
||||||
@@ -51,9 +50,6 @@ static void parse_args(int argc, char **argv) {
|
|||||||
break;
|
break;
|
||||||
case 'c':
|
case 'c':
|
||||||
num_cores = atoi(optarg);
|
num_cores = atoi(optarg);
|
||||||
break;
|
|
||||||
case 'g':
|
|
||||||
num_clusters = atoi(optarg);
|
|
||||||
break;
|
break;
|
||||||
case 'r':
|
case 'r':
|
||||||
riscv_test = true;
|
riscv_test = true;
|
||||||
@@ -88,7 +84,7 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
{
|
{
|
||||||
// create processor configuration
|
// create processor configuration
|
||||||
Arch arch(num_threads, num_warps, num_cores, num_clusters);
|
Arch arch(num_threads, num_warps, num_cores);
|
||||||
|
|
||||||
// create memory module
|
// create memory module
|
||||||
RAM ram(RAM_PAGE_SIZE);
|
RAM ram(RAM_PAGE_SIZE);
|
||||||
|
|||||||
146
sim/simx/socket.cpp
Normal file
146
sim/simx/socket.cpp
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
// Copyright © 2019-2023
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "socket.h"
|
||||||
|
#include "cluster.h"
|
||||||
|
|
||||||
|
using namespace vortex;
|
||||||
|
|
||||||
|
// Builds one socket: a group of cores together with the instruction-cache
// cluster and data-cache cluster they share.
//
//   ctx       - simulation context, forwarded to the SimObject base.
//   socket_id - identifier of this socket; it appears in the cache cluster
//               names and is used to derive the id given to each core.
//   cluster   - owning cluster, stored in cluster_.
//   arch      - architecture description; provides the socket size and the
//               number of warps (used as the icache MSHR size).
//   dcrs      - forwarded unchanged to every core created here.
Socket::Socket(const SimContext& ctx,
               uint32_t socket_id,
               Cluster* cluster,
               const Arch &arch,
               const DCRS &dcrs)
  : SimObject(ctx, "socket")
  , icache_mem_req_port(this)
  , icache_mem_rsp_port(this)
  , dcache_mem_req_port(this)
  , dcache_mem_rsp_port(this)
  , socket_id_(socket_id)
  , cores_(arch.socket_size())
  , cluster_(cluster)
{
  auto cores_per_socket = cores_.size();

  // Scratch buffer for the cache cluster names. The bound passed to snprintf
  // is taken from the array itself so the two cannot drift apart.
  char sname[100];

  // socket_id is unsigned, hence %u rather than %d.
  snprintf(sname, sizeof(sname), "socket%u-icaches", socket_id);
  icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{
    !ICACHE_ENABLED,
    log2ceil(ICACHE_SIZE),      // C
    log2ceil(L1_LINE_SIZE),     // L
    log2ceil(sizeof(uint32_t)), // W
    log2ceil(ICACHE_NUM_WAYS),  // A
    1,                          // B
    XLEN,                       // address bits
    1,                          // number of ports
    1,                          // number of inputs
    true,                       // write-through
    false,                      // write response
    (uint8_t)arch.num_warps(),  // mshr
    2,                          // pipeline latency
  });

  // Route the icache cluster's memory side through this socket's ports.
  icaches_->MemReqPort.bind(&icache_mem_req_port);
  icache_mem_rsp_port.bind(&icaches_->MemRspPort);

  snprintf(sname, sizeof(sname), "socket%u-dcaches", socket_id);
  dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
    !DCACHE_ENABLED,
    log2ceil(DCACHE_SIZE),      // C
    log2ceil(L1_LINE_SIZE),     // L
    log2ceil(sizeof(Word)),     // W
    log2ceil(DCACHE_NUM_WAYS),  // A
    log2ceil(DCACHE_NUM_BANKS), // B
    XLEN,                       // address bits
    1,                          // number of ports
    DCACHE_NUM_BANKS,           // number of inputs
    true,                       // write-through
    false,                      // write response
    DCACHE_MSHR_SIZE,           // mshr
    2,                          // pipeline latency
  });

  // Route the dcache cluster's memory side through this socket's ports.
  dcaches_->MemReqPort.bind(&dcache_mem_req_port);
  dcache_mem_rsp_port.bind(&dcaches_->MemRspPort);

  // create cores

  for (uint32_t i = 0; i < cores_per_socket; ++i) {
    // Core ids are laid out socket by socket; the product is computed in the
    // width of cores_.size(), so the narrowing to uint32_t is spelled out.
    uint32_t core_id = static_cast<uint32_t>(socket_id * cores_per_socket + i);
    cores_.at(i) = Core::Create(core_id, this, arch, dcrs);

    // Single instruction-fetch channel between core i and the icache cluster.
    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));

    // One data-cache channel per LSU lane.
    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
      cores_.at(i)->dcache_req_ports.at(j).bind(&dcaches_->CoreReqPorts.at(i).at(j));
      dcaches_->CoreRspPorts.at(i).at(j).bind(&cores_.at(i)->dcache_rsp_ports.at(j));
    }
  }
}
|
||||||
|
|
||||||
|
// Destructor. The body is intentionally empty.
Socket::~Socket()
{
  // nothing to do here
}
|
||||||
|
|
||||||
|
// Reset hook. The socket itself performs no work on reset.
void Socket::reset()
{
  // nothing to do here
}
|
||||||
|
|
||||||
|
// Tick hook (presumably invoked by the simulation framework each cycle;
// confirm against SimObject). The socket itself performs no work here.
void Socket::tick()
{
  // nothing to do here
}
|
||||||
|
|
||||||
|
// Hands the given RAM to every core of this socket.
//
//   ram - memory object forwarded unchanged to Core::attach_ram.
void Socket::attach_ram(RAM* ram) {
  // Iterate by reference: this avoids copying each Core::Ptr handle per
  // iteration and matches the loops in running() and check_exit().
  for (const auto& core : cores_) {
    core->attach_ram(ram);
  }
}
|
||||||
|
|
||||||
|
bool Socket::running() const {
|
||||||
|
for (auto& core : cores_) {
|
||||||
|
if (core->running())
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Polls every core for its exit status.
//
//   exitcode   - out: bitwise OR of the exit codes of the cores whose
//                check_exit() returned true (0 if none did). Always written.
//   riscv_test - forwarded unchanged to Core::check_exit.
//
// Returns true only when check_exit() returned true for every core.
bool Socket::check_exit(Word* exitcode, bool riscv_test) const {
  Word merged_code = 0;
  bool all_exited = true;

  for (auto& core : cores_) {
    Word core_code;
    if (!core->check_exit(&core_code, riscv_test)) {
      // Keep polling the remaining cores; their codes are still merged.
      all_exited = false;
      continue;
    }
    merged_code |= core_code;
  }

  *exitcode = merged_code;
  return all_exited;
}
|
||||||
|
|
||||||
|
// Forwards a barrier request to the owning cluster.
//
//   bar_id  - barrier identifier, passed through unchanged.
//   count   - passed through unchanged.
//   core_id - offset by socket_id_ * cores_.size() before forwarding, the same
//             layout the constructor uses when assigning core ids.
void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
  auto cluster_core_id = socket_id_ * cores_.size() + core_id;
  cluster_->barrier(bar_id, count, cluster_core_id);
}
|
||||||
|
|
||||||
|
// Resumes one core of this socket.
//
//   core_index - position of the core in this socket's core list; an
//                out-of-range value makes vector::at throw std::out_of_range.
void Socket::resume(uint32_t core_index) {
  auto& target_core = cores_.at(core_index);
  target_core->resume();
}
|
||||||
|
|
||||||
|
// Collects the statistics of both cache clusters of this socket.
// The icache cluster is queried first, then the dcache cluster
// (braced initializers are evaluated left to right).
Socket::PerfStats Socket::perf_stats() const {
  return Socket::PerfStats{
    icaches_->perf_stats(),
    dcaches_->perf_stats()
  };
}
|
||||||
87
sim/simx/socket.h
Normal file
87
sim/simx/socket.h
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
// Copyright © 2019-2023
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <simobject.h>
|
||||||
|
#include "dcrs.h"
|
||||||
|
#include "arch.h"
|
||||||
|
#include "cache_cluster.h"
|
||||||
|
#include "shared_mem.h"
|
||||||
|
#include "core.h"
|
||||||
|
#include "constants.h"
|
||||||
|
|
||||||
|
namespace vortex {
|
||||||
|
|
||||||
|
class Cluster;
|
||||||
|
|
||||||
|
// A socket groups several cores with the instruction-cache and data-cache
// clusters they share, and exposes the memory side of those clusters through
// its own ports.
class Socket : public SimObject<Socket> {
public:
  // Cache statistics reported by a socket.
  struct PerfStats {
    CacheSim::PerfStats icache;
    CacheSim::PerfStats dcache;

    // Accumulates rhs into this object, member by member.
    PerfStats& operator+=(const PerfStats& rhs) {
      icache += rhs.icache;
      dcache += rhs.dcache;
      return *this;
    }
  };

  // Memory-side ports of the instruction-cache cluster.
  SimPort<MemReq> icache_mem_req_port;
  SimPort<MemRsp> icache_mem_rsp_port;

  // Memory-side ports of the data-cache cluster.
  SimPort<MemReq> dcache_mem_req_port;
  SimPort<MemRsp> dcache_mem_rsp_port;

  // Creates the cache clusters and the cores of this socket and wires them up.
  Socket(const SimContext& ctx,
         uint32_t socket_id,
         Cluster* cluster,
         const Arch &arch,
         const DCRS &dcrs);

  ~Socket();

  // Identifier given at construction.
  uint32_t id() const { return socket_id_; }

  // Cluster given at construction (not owned by the socket).
  Cluster* cluster() const { return cluster_; }

  void reset();

  void tick();

  // Forwards the RAM to every core.
  void attach_ram(RAM* ram);

  // True when at least one core is running.
  bool running() const;

  // True when every core has exited; *exitcode receives the OR of their codes.
  bool check_exit(Word* exitcode, bool riscv_test) const;

  // Forwards a barrier request to the owning cluster.
  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);

  // Resumes the core at the given position in this socket's core list.
  void resume(uint32_t core_id);

  PerfStats perf_stats() const;

private:
  // NOTE: keep this declaration order; members are initialized in this order
  // and the constructor's initializer list is written to match it.
  uint32_t                socket_id_;
  std::vector<Core::Ptr>  cores_;
  CacheCluster::Ptr       icaches_;
  CacheCluster::Ptr       dcaches_;
  Cluster*                cluster_;
};
|
||||||
|
|
||||||
|
} // namespace vortex
|
||||||
@@ -70,6 +70,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
|
|||||||
case RegType::Integer: os << "x"; break;
|
case RegType::Integer: os << "x"; break;
|
||||||
case RegType::Float: os << "f"; break;
|
case RegType::Float: os << "f"; break;
|
||||||
case RegType::Vector: os << "v"; break;
|
case RegType::Vector: os << "v"; break;
|
||||||
|
default: assert(false);
|
||||||
}
|
}
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
@@ -112,6 +113,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
|
|||||||
case AluType::SYSCALL: os << "SYSCALL"; break;
|
case AluType::SYSCALL: os << "SYSCALL"; break;
|
||||||
case AluType::IMUL: os << "IMUL"; break;
|
case AluType::IMUL: os << "IMUL"; break;
|
||||||
case AluType::IDIV: os << "IDIV"; break;
|
case AluType::IDIV: os << "IDIV"; break;
|
||||||
|
default: assert(false);
|
||||||
}
|
}
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
@@ -129,6 +131,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
|
|||||||
case LsuType::LOAD: os << "LOAD"; break;
|
case LsuType::LOAD: os << "LOAD"; break;
|
||||||
case LsuType::STORE: os << "STORE"; break;
|
case LsuType::STORE: os << "STORE"; break;
|
||||||
case LsuType::FENCE: os << "FENCE"; break;
|
case LsuType::FENCE: os << "FENCE"; break;
|
||||||
|
default: assert(false);
|
||||||
}
|
}
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
@@ -146,6 +149,7 @@ inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
|
|||||||
case AddrType::Global: os << "Global"; break;
|
case AddrType::Global: os << "Global"; break;
|
||||||
case AddrType::Shared: os << "Shared"; break;
|
case AddrType::Shared: os << "Shared"; break;
|
||||||
case AddrType::IO: os << "IO"; break;
|
case AddrType::IO: os << "IO"; break;
|
||||||
|
default: assert(false);
|
||||||
}
|
}
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
@@ -174,6 +178,7 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
|
|||||||
case FpuType::FDIV: os << "FDIV"; break;
|
case FpuType::FDIV: os << "FDIV"; break;
|
||||||
case FpuType::FSQRT: os << "FSQRT"; break;
|
case FpuType::FSQRT: os << "FSQRT"; break;
|
||||||
case FpuType::FCVT: os << "FCVT"; break;
|
case FpuType::FCVT: os << "FCVT"; break;
|
||||||
|
default: assert(false);
|
||||||
}
|
}
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
@@ -205,6 +210,7 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
|
|||||||
case SfuType::CSRRS: os << "CSRRS"; break;
|
case SfuType::CSRRS: os << "CSRRS"; break;
|
||||||
case SfuType::CSRRC: os << "CSRRC"; break;
|
case SfuType::CSRRC: os << "CSRRC"; break;
|
||||||
case SfuType::CMOV: os << "CMOV"; break;
|
case SfuType::CMOV: os << "CMOV"; break;
|
||||||
|
default: assert(false);
|
||||||
}
|
}
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
@@ -220,6 +226,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
|
|||||||
switch (type) {
|
switch (type) {
|
||||||
case ArbiterType::Priority: os << "Priority"; break;
|
case ArbiterType::Priority: os << "Priority"; break;
|
||||||
case ArbiterType::RoundRobin: os << "RoundRobin"; break;
|
case ArbiterType::RoundRobin: os << "RoundRobin"; break;
|
||||||
|
default: assert(false);
|
||||||
}
|
}
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user