adding tracking for SFU stalls

This commit is contained in:
Blaise Tine
2023-12-28 12:12:11 -08:00
parent c7a81d1493
commit e217bc2c23
27 changed files with 1266 additions and 1166 deletions

View File

@@ -43,7 +43,16 @@ module VX_cluster import VX_gpu_pkg::*; #(
`ifdef SCOPE `ifdef SCOPE
localparam scope_socket = 0; localparam scope_socket = 0;
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS); `SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
`endif `endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
`ifdef GBAR_ENABLE `ifdef GBAR_ENABLE
@@ -69,24 +78,68 @@ module VX_cluster import VX_gpu_pkg::*; #(
.reset (gbar_reset), .reset (gbar_reset),
.gbar_bus_if (gbar_bus_if) .gbar_bus_if (gbar_bus_if)
); );
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
cache_perf_t perf_l2cache;
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l2cache = perf_l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif `endif
VX_mem_bus_if #( VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE), .DATA_SIZE (L2_WORD_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH) .TAG_WIDTH (L2_TAG_WIDTH)
) l1_mem_bus_if[2](); ) l2_mem_bus_if[L2_NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) per_socket_icache_mem_bus_if[`NUM_SOCKETS]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) per_socket_dcache_mem_bus_if[`NUM_SOCKETS]();
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH)
) icache_mem_bus_if[1]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH)
) dcache_mem_bus_if[1]();
`RESET_RELAY (l1_mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (`NUM_SOCKETS),
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) icache_mem_arb (
.clk (clk),
.reset (l1_mem_arb_reset),
.bus_in_if (per_socket_icache_mem_bus_if),
.bus_out_if (icache_mem_bus_if)
);
VX_mem_arb #(
.NUM_INPUTS (`NUM_SOCKETS),
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) dcache_mem_arb (
.clk (clk),
.reset (l1_mem_arb_reset),
.bus_in_if (per_socket_dcache_mem_bus_if),
.bus_out_if (dcache_mem_bus_if)
);
`ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[ICACHE_MEM_ARB_IDX], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[DCACHE_MEM_ARB_IDX], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
`RESET_RELAY (l2_reset, reset); `RESET_RELAY (l2_reset, reset);
@@ -113,67 +166,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (l2_reset), .reset (l2_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_l2cache), .cache_perf (mem_perf_tmp_if.l2cache),
`endif `endif
.core_bus_if (l1_mem_bus_if), .core_bus_if (l2_mem_bus_if),
.mem_bus_if (mem_bus_if) .mem_bus_if (mem_bus_if)
); );
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) per_socket_icache_mem_bus_if[`NUM_SOCKETS]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) per_socket_dcache_mem_bus_if[`NUM_SOCKETS]();
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH)
) icache_mem_bus_if[1]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH)
) dcache_mem_bus_if[1]();
`RESET_RELAY (l1_mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (`NUM_SOCKETS),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) icache_mem_arb (
.clk (clk),
.reset (l1_mem_arb_reset),
.bus_in_if (per_socket_icache_mem_bus_if),
.bus_out_if (icache_mem_bus_if)
);
VX_mem_arb #(
.NUM_INPUTS (`NUM_SOCKETS),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) dcache_mem_arb (
.clk (clk),
.reset (l1_mem_arb_reset),
.bus_in_if (per_socket_dcache_mem_bus_if),
.bus_out_if (dcache_mem_bus_if)
);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak; wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
@@ -201,6 +199,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i) .SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
) socket ( ) socket (
`SCOPE_IO_BIND (scope_socket+i) `SCOPE_IO_BIND (scope_socket+i)
.clk (clk), .clk (clk),
.reset (socket_reset), .reset (socket_reset),
@@ -212,7 +211,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.icache_mem_bus_if (per_socket_icache_mem_bus_if[i]), .icache_mem_bus_if (per_socket_icache_mem_bus_if[i]),
.dcache_mem_bus_if (per_socket_dcache_mem_bus_if[i]), .dcache_mem_bus_if (per_socket_dcache_mem_bus_if[i]),
`ifdef GBAR_ENABLE `ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[i]), .gbar_bus_if (per_socket_gbar_bus_if[i]),
`endif `endif

View File

@@ -1,432 +1,437 @@
// Copyright © 2019-2023 // Copyright © 2019-2023
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0 // http://www.apache.org/licenses/LICENSE-2.0
// //
// Unless required by applicable law or agreed to in writing, software // Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
`ifndef VX_DEFINE_VH `ifndef VX_DEFINE_VH
`define VX_DEFINE_VH `define VX_DEFINE_VH
`include "VX_platform.vh" `include "VX_platform.vh"
`include "VX_config.vh" `include "VX_config.vh"
`include "VX_types.vh" `include "VX_types.vh"
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define NW_BITS `CLOG2(`NUM_WARPS) `define NW_BITS `CLOG2(`NUM_WARPS)
`define NC_WIDTH `UP(`NC_BITS) `define NC_WIDTH `UP(`NC_BITS)
`define NT_BITS `CLOG2(`NUM_THREADS) `define NT_BITS `CLOG2(`NUM_THREADS)
`define NW_WIDTH `UP(`NW_BITS) `define NW_WIDTH `UP(`NW_BITS)
`define NC_BITS `CLOG2(`NUM_CORES) `define NC_BITS `CLOG2(`NUM_CORES)
`define NT_WIDTH `UP(`NT_BITS) `define NT_WIDTH `UP(`NT_BITS)
`define NB_BITS `CLOG2(`NUM_BARRIERS) `define NB_BITS `CLOG2(`NUM_BARRIERS)
`define NB_WIDTH `UP(`NB_BITS) `define NB_WIDTH `UP(`NB_BITS)
`define NUM_IREGS 32 `define NUM_IREGS 32
`define NRI_BITS `CLOG2(`NUM_IREGS) `define NRI_BITS `CLOG2(`NUM_IREGS)
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`define NUM_REGS (2 * `NUM_IREGS) `define NUM_REGS (2 * `NUM_IREGS)
`else `else
`define NUM_REGS `NUM_IREGS `define NUM_REGS `NUM_IREGS
`endif `endif
`define NR_BITS `CLOG2(`NUM_REGS) `define NR_BITS `CLOG2(`NUM_REGS)
`define PERF_CTR_BITS 44 `define PERF_CTR_BITS 44
`ifndef NDEBUG `ifndef NDEBUG
`define UUID_WIDTH 44 `define UUID_WIDTH 44
`else `else
`define UUID_WIDTH 1 `define UUID_WIDTH 1
`endif `endif
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define EX_ALU 0 `define EX_ALU 0
`define EX_LSU 1 `define EX_LSU 1
`define EX_SFU 2 `define EX_SFU 2
`define EX_FPU 3 `define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED) `define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
`define EX_BITS `CLOG2(`NUM_EX_UNITS) `define EX_BITS `CLOG2(`NUM_EX_UNITS)
`define EX_WIDTH `UP(`EX_BITS)
///////////////////////////////////////////////////////////////////////////////
`define SFU_CSRS 0
`define INST_LUI 7'b0110111 `define SFU_WCTL 1
`define INST_AUIPC 7'b0010111
`define INST_JAL 7'b1101111 `define NUM_SFU_UNITS (2)
`define INST_JALR 7'b1100111 `define SFU_BITS `CLOG2(`NUM_SFU_UNITS)
`define INST_B 7'b1100011 // branch instructions `define SFU_WIDTH `UP(`SFU_BITS)
`define INST_L 7'b0000011 // load instructions
`define INST_S 7'b0100011 // store instructions ///////////////////////////////////////////////////////////////////////////////
`define INST_I 7'b0010011 // immediate instructions
`define INST_R 7'b0110011 // register instructions `define INST_LUI 7'b0110111
`define INST_FENCE 7'b0001111 // Fence instructions `define INST_AUIPC 7'b0010111
`define INST_SYS 7'b1110011 // system instructions `define INST_JAL 7'b1101111
`define INST_JALR 7'b1100111
// RV64I instruction specific opcodes (for any W instruction) `define INST_B 7'b1100011 // branch instructions
`define INST_I_W 7'b0011011 // W type immediate instructions `define INST_L 7'b0000011 // load instructions
`define INST_R_W 7'b0111011 // W type register instructions `define INST_S 7'b0100011 // store instructions
`define INST_I 7'b0010011 // immediate instructions
`define INST_FL 7'b0000111 // float load instruction `define INST_R 7'b0110011 // register instructions
`define INST_FS 7'b0100111 // float store instruction `define INST_FENCE 7'b0001111 // Fence instructions
`define INST_FMADD 7'b1000011 `define INST_SYS 7'b1110011 // system instructions
`define INST_FMSUB 7'b1000111
`define INST_FNMSUB 7'b1001011 // RV64I instruction specific opcodes (for any W instruction)
`define INST_FNMADD 7'b1001111 `define INST_I_W 7'b0011011 // W type immediate instructions
`define INST_FCI 7'b1010011 // float common instructions `define INST_R_W 7'b0111011 // W type register instructions
// Custom extension opcodes `define INST_FL 7'b0000111 // float load instruction
`define INST_EXT1 7'b0001011 // 0x0B `define INST_FS 7'b0100111 // float store instruction
`define INST_EXT2 7'b0101011 // 0x2B `define INST_FMADD 7'b1000011
`define INST_EXT3 7'b1011011 // 0x5B `define INST_FMSUB 7'b1000111
`define INST_EXT4 7'b1111011 // 0x7B `define INST_FNMSUB 7'b1001011
`define INST_FNMADD 7'b1001111
/////////////////////////////////////////////////////////////////////////////// `define INST_FCI 7'b1010011 // float common instructions
`define INST_FRM_RNE 3'b000 // round to nearest even // Custom extension opcodes
`define INST_FRM_RTZ 3'b001 // round to zero `define INST_EXT1 7'b0001011 // 0x0B
`define INST_FRM_RDN 3'b010 // round to -inf `define INST_EXT2 7'b0101011 // 0x2B
`define INST_FRM_RUP 3'b011 // round to +inf `define INST_EXT3 7'b1011011 // 0x5B
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude `define INST_EXT4 7'b1111011 // 0x7B
`define INST_FRM_DYN 3'b111 // dynamic mode
`define INST_FRM_BITS 3 ///////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////// `define INST_FRM_RNE 3'b000 // round to nearest even
`define INST_FRM_RTZ 3'b001 // round to zero
`define INST_OP_BITS 4 `define INST_FRM_RDN 3'b010 // round to -inf
`define INST_MOD_BITS 3 `define INST_FRM_RUP 3'b011 // round to +inf
`define INST_FMT_BITS 2 `define INST_FRM_RMM 3'b100 // round to nearest max magnitude
`define INST_FRM_DYN 3'b111 // dynamic mode
/////////////////////////////////////////////////////////////////////////////// `define INST_FRM_BITS 3
`define INST_ALU_ADD 4'b0000 ///////////////////////////////////////////////////////////////////////////////
`define INST_ALU_LUI 4'b0010
`define INST_ALU_AUIPC 4'b0011 `define INST_OP_BITS 4
`define INST_ALU_SLTU 4'b0100 `define INST_MOD_BITS 3
`define INST_ALU_SLT 4'b0101 `define INST_FMT_BITS 2
`define INST_ALU_SUB 4'b0111
`define INST_ALU_SRL 4'b1000 ///////////////////////////////////////////////////////////////////////////////
`define INST_ALU_SRA 4'b1001
`define INST_ALU_AND 4'b1100 `define INST_ALU_ADD 4'b0000
`define INST_ALU_OR 4'b1101 `define INST_ALU_LUI 4'b0010
`define INST_ALU_XOR 4'b1110 `define INST_ALU_AUIPC 4'b0011
`define INST_ALU_SLL 4'b1111 `define INST_ALU_SLTU 4'b0100
`define INST_ALU_OTHER 4'b0111 `define INST_ALU_SLT 4'b0101
`define INST_ALU_BITS 4 `define INST_ALU_SUB 4'b0111
`define INST_ALU_CLASS(op) op[3:2] `define INST_ALU_SRL 4'b1000
`define INST_ALU_SIGNED(op) op[0] `define INST_ALU_SRA 4'b1001
`define INST_ALU_IS_SUB(op) op[1] `define INST_ALU_AND 4'b1100
`define INST_ALU_IS_BR(mod) mod[0] `define INST_ALU_OR 4'b1101
`define INST_ALU_IS_M(mod) mod[1] `define INST_ALU_XOR 4'b1110
`define INST_ALU_IS_W(mod) mod[2] `define INST_ALU_SLL 4'b1111
`define INST_ALU_OTHER 4'b0111
`define INST_BR_EQ 4'b0000 `define INST_ALU_BITS 4
`define INST_BR_NE 4'b0010 `define INST_ALU_CLASS(op) op[3:2]
`define INST_BR_LTU 4'b0100 `define INST_ALU_SIGNED(op) op[0]
`define INST_BR_GEU 4'b0110 `define INST_ALU_IS_SUB(op) op[1]
`define INST_BR_LT 4'b0101 `define INST_ALU_IS_BR(mod) mod[0]
`define INST_BR_GE 4'b0111 `define INST_ALU_IS_M(mod) mod[1]
`define INST_BR_JAL 4'b1000 `define INST_ALU_IS_W(mod) mod[2]
`define INST_BR_JALR 4'b1001
`define INST_BR_ECALL 4'b1010 `define INST_BR_EQ 4'b0000
`define INST_BR_EBREAK 4'b1011 `define INST_BR_NE 4'b0010
`define INST_BR_URET 4'b1100 `define INST_BR_LTU 4'b0100
`define INST_BR_SRET 4'b1101 `define INST_BR_GEU 4'b0110
`define INST_BR_MRET 4'b1110 `define INST_BR_LT 4'b0101
`define INST_BR_OTHER 4'b1111 `define INST_BR_GE 4'b0111
`define INST_BR_BITS 4 `define INST_BR_JAL 4'b1000
`define INST_BR_CLASS(op) {1'b0, ~op[3]} `define INST_BR_JALR 4'b1001
`define INST_BR_IS_NEG(op) op[1] `define INST_BR_ECALL 4'b1010
`define INST_BR_IS_LESS(op) op[2] `define INST_BR_EBREAK 4'b1011
`define INST_BR_IS_STATIC(op) op[3] `define INST_BR_URET 4'b1100
`define INST_BR_SRET 4'b1101
`define INST_M_MUL 3'b000 `define INST_BR_MRET 4'b1110
`define INST_M_MULHU 3'b001 `define INST_BR_OTHER 4'b1111
`define INST_M_MULH 3'b010 `define INST_BR_BITS 4
`define INST_M_MULHSU 3'b011 `define INST_BR_CLASS(op) {1'b0, ~op[3]}
`define INST_M_DIV 3'b100 `define INST_BR_IS_NEG(op) op[1]
`define INST_M_DIVU 3'b101 `define INST_BR_IS_LESS(op) op[2]
`define INST_M_REM 3'b110 `define INST_BR_IS_STATIC(op) op[3]
`define INST_M_REMU 3'b111
`define INST_M_BITS 3 `define INST_M_MUL 3'b000
`define INST_M_SIGNED(op) (~op[0]) `define INST_M_MULHU 3'b001
`define INST_M_IS_MULX(op) (~op[2]) `define INST_M_MULH 3'b010
`define INST_M_IS_MULH(op) (op[1:0] != 0) `define INST_M_MULHSU 3'b011
`define INST_M_SIGNED_A(op) (op[1:0] != 1) `define INST_M_DIV 3'b100
`define INST_M_IS_REM(op) op[1] `define INST_M_DIVU 3'b101
`define INST_M_REM 3'b110
`define INST_FMT_B 3'b000 `define INST_M_REMU 3'b111
`define INST_FMT_H 3'b001 `define INST_M_BITS 3
`define INST_FMT_W 3'b010 `define INST_M_SIGNED(op) (~op[0])
`define INST_FMT_D 3'b011 `define INST_M_IS_MULX(op) (~op[2])
`define INST_FMT_BU 3'b100 `define INST_M_IS_MULH(op) (op[1:0] != 0)
`define INST_FMT_HU 3'b101 `define INST_M_SIGNED_A(op) (op[1:0] != 1)
`define INST_FMT_WU 3'b110 `define INST_M_IS_REM(op) op[1]
`define INST_LSU_LB 4'b0000 `define INST_FMT_B 3'b000
`define INST_LSU_LH 4'b0001 `define INST_FMT_H 3'b001
`define INST_LSU_LW 4'b0010 `define INST_FMT_W 3'b010
`define INST_LSU_LD 4'b0011 // new for RV64I LD `define INST_FMT_D 3'b011
`define INST_LSU_LBU 4'b0100 `define INST_FMT_BU 3'b100
`define INST_LSU_LHU 4'b0101 `define INST_FMT_HU 3'b101
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU `define INST_FMT_WU 3'b110
`define INST_LSU_SB 4'b1000
`define INST_LSU_SH 4'b1001 `define INST_LSU_LB 4'b0000
`define INST_LSU_SW 4'b1010 `define INST_LSU_LH 4'b0001
`define INST_LSU_SD 4'b1011 // new for RV64I SD `define INST_LSU_LW 4'b0010
`define INST_LSU_FENCE 4'b1111 `define INST_LSU_LD 4'b0011 // new for RV64I LD
`define INST_LSU_BITS 4 `define INST_LSU_LBU 4'b0100
`define INST_LSU_FMT(op) op[2:0] `define INST_LSU_LHU 4'b0101
`define INST_LSU_WSIZE(op) op[1:0] `define INST_LSU_LWU 4'b0110 // new for RV64I LWU
`define INST_LSU_IS_FENCE(op) (op[3:2] == 3) `define INST_LSU_SB 4'b1000
`define INST_LSU_SH 4'b1001
`define INST_FENCE_BITS 1 `define INST_LSU_SW 4'b1010
`define INST_FENCE_D 1'h0 `define INST_LSU_SD 4'b1011 // new for RV64I SD
`define INST_FENCE_I 1'h1 `define INST_LSU_FENCE 4'b1111
`define INST_LSU_BITS 4
`define INST_FPU_ADD 4'b0000 `define INST_LSU_FMT(op) op[2:0]
`define INST_FPU_SUB 4'b0001 `define INST_LSU_WSIZE(op) op[1:0]
`define INST_FPU_MUL 4'b0010 `define INST_LSU_IS_FENCE(op) (op[3:2] == 3)
`define INST_FPU_DIV 4'b0011
`define INST_FPU_SQRT 4'b0100 `define INST_FENCE_BITS 1
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2 `define INST_FENCE_D 1'h0
`define INST_FPU_F2F 4'b0110 `define INST_FENCE_I 1'h1
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_F2I 4'b1000 `define INST_FPU_ADD 4'b0000
`define INST_FPU_F2U 4'b1001 `define INST_FPU_SUB 4'b0001
`define INST_FPU_I2F 4'b1010 `define INST_FPU_MUL 4'b0010
`define INST_FPU_U2F 4'b1011 `define INST_FPU_DIV 4'b0011
`define INST_FPU_MADD 4'b1100 `define INST_FPU_SQRT 4'b0100
`define INST_FPU_MSUB 4'b1101 `define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
`define INST_FPU_NMSUB 4'b1110 `define INST_FPU_F2F 4'b0110
`define INST_FPU_NMADD 4'b1111 `define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_BITS 4 `define INST_FPU_F2I 4'b1000
`define INST_FPU_IS_W(mod) (mod[4]) `define INST_FPU_F2U 4'b1001
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3) `define INST_FPU_I2F 4'b1010
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4) `define INST_FPU_U2F 4'b1011
`define INST_FPU_MADD 4'b1100
`define INST_SFU_TMC 4'h0 `define INST_FPU_MSUB 4'b1101
`define INST_SFU_WSPAWN 4'h1 `define INST_FPU_NMSUB 4'b1110
`define INST_SFU_SPLIT 4'h2 `define INST_FPU_NMADD 4'b1111
`define INST_SFU_JOIN 4'h3 `define INST_FPU_BITS 4
`define INST_SFU_BAR 4'h4 `define INST_FPU_IS_W(mod) (mod[4])
`define INST_SFU_PRED 4'h5 `define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
`define INST_SFU_CSRRW 4'h6 `define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
`define INST_SFU_CSRRS 4'h7
`define INST_SFU_CSRRC 4'h8 `define INST_SFU_TMC 4'h0
`define INST_SFU_CMOV 4'h9 `define INST_SFU_WSPAWN 4'h1
`define INST_SFU_BITS 4 `define INST_SFU_SPLIT 4'h2
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1) `define INST_SFU_JOIN 4'h3
`define INST_SFU_IS_WCTL(op) (op <= 5) `define INST_SFU_BAR 4'h4
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8) `define INST_SFU_PRED 4'h5
`define INST_SFU_CSRRW 4'h6
/////////////////////////////////////////////////////////////////////////////// `define INST_SFU_CSRRS 4'h7
`define INST_SFU_CSRRC 4'h8
// non-cacheable tag bits `define INST_SFU_CMOV 4'h9
`define NC_TAG_BITS 1 `define INST_SFU_BITS 4
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
// cache address type bits `define INST_SFU_IS_WCTL(op) (op <= 5)
`ifdef SM_ENABLE `define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
`else ///////////////////////////////////////////////////////////////////////////////
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
`endif // non-cacheable tag bits
`define NC_TAG_BITS 1
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
// cache address type bits
/////////////////////////////////////////////////////////////////////////////// `ifdef SM_ENABLE
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \ `else
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS) `define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
`endif
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width) `define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \ ///////////////////////////////////////////////////////////////////////////////
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \ (`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
/////////////////////////////////////////////////////////////////////////////// (`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \ `define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches))) (`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \ `define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1)) `MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \ ///////////////////////////////////////////////////////////////////////////////
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ (tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ (tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
/////////////////////////////////////////////////////////////////////////////// `CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`ifdef L2_ENABLE `define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`define L2_LINE_SIZE `MEM_BLOCK_SIZE `CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
`else
`define L2_LINE_SIZE `L1_LINE_SIZE `define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`endif `CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
`ifdef L3_ENABLE ///////////////////////////////////////////////////////////////////////////////
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`else `ifdef L2_ENABLE
`define L3_LINE_SIZE `L2_LINE_SIZE `define L2_LINE_SIZE `MEM_BLOCK_SIZE
`endif `else
`define L2_LINE_SIZE `L1_LINE_SIZE
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE `endif
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8) `ifdef L3_ENABLE
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH `define L3_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS `define L3_LINE_SIZE `L2_LINE_SIZE
`define VX_DCR_DATA_WIDTH 32 `endif
`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)} `define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
/////////////////////////////////////////////////////////////////////////////// `define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
`define BUFFER_EX(dst, src, ena, latency) \
VX_pipe_register #( \ `define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS
.DATAW ($bits(dst)), \ `define VX_DCR_DATA_WIDTH 32
.RESETW ($bits(dst)), \
.DEPTH (latency) \ `define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)}
) __``dst ( \
.clk (clk), \ ///////////////////////////////////////////////////////////////////////////////
.reset (reset), \
.enable (ena), \ `define BUFFER_EX(dst, src, ena, latency) \
.data_in (src), \ VX_pipe_register #( \
.data_out (dst) \ .DATAW ($bits(dst)), \
) .RESETW ($bits(dst)), \
.DEPTH (latency) \
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1) ) __``dst ( \
.clk (clk), \
`define POP_COUNT_EX(out, in, model) \ .reset (reset), \
VX_popcount #( \ .enable (ena), \
.N ($bits(in)), \ .data_in (src), \
.MODEL (model) \ .data_out (dst) \
) __``out ( \ )
.data_in (in), \
.data_out (out) \ `define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)
)
`define POP_COUNT_EX(out, in, model) \
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1) VX_popcount #( \
.N ($bits(in)), \
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \ .MODEL (model) \
assign dst.req_valid = src.req_valid; \ ) __``out ( \
assign dst.req_data = src.req_data; \ .data_in (in), \
assign src.req_ready = dst.req_ready; \ .data_out (out) \
assign src.rsp_valid = dst.rsp_valid; \ )
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready `define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \ `define ASSIGN_VX_MEM_BUS_IF(dst, src) \
assign dst.req_valid = src.req_valid; \ assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = src.req_data.rw; \ assign dst.req_data = src.req_data; \
assign dst.req_data.byteen = src.req_data.byteen; \ assign src.req_ready = dst.req_ready; \
assign dst.req_data.addr = src.req_data.addr; \ assign src.rsp_valid = dst.rsp_valid; \
assign dst.req_data.data = src.req_data.data; \ assign src.rsp_data = dst.rsp_data; \
if (TD != TS) \ assign dst.rsp_ready = src.rsp_ready
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
else \ `define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
assign dst.req_data.tag = src.req_data.tag; \ assign dst.req_valid = src.req_valid; \
assign src.req_ready = dst.req_ready; \ assign dst.req_data.rw = src.req_data.rw; \
assign src.rsp_valid = dst.rsp_valid; \ assign dst.req_data.byteen = src.req_data.byteen; \
assign src.rsp_data.data = dst.rsp_data.data; \ assign dst.req_data.addr = src.req_data.addr; \
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \ assign dst.req_data.data = src.req_data.data; \
assign dst.rsp_ready = src.rsp_ready if (TD != TS) \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
`define BUFFER_DCR_BUS_IF(dst, src, enable) \ else \
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \ assign dst.req_data.tag = src.req_data.tag; \
if (enable) begin \ assign src.req_ready = dst.req_ready; \
always @(posedge clk) begin \ assign src.rsp_valid = dst.rsp_valid; \
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \ assign src.rsp_data.data = dst.rsp_data.data; \
end \ assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
end else begin \ assign dst.rsp_ready = src.rsp_ready
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
end \ `define BUFFER_DCR_BUS_IF(dst, src, enable) \
VX_dcr_bus_if dst(); \ logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst if (enable) begin \
always @(posedge clk) begin \
`define PERF_REDUCE(dst, src, field, width, count) \ __``dst <= {src.write_valid, src.write_addr, src.write_data}; \
wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \ end \
wire [width-1:0] __reduce_add_o_``dst``field; \ end else begin \
reg [width-1:0] __reduce_add_r_``dst``field; \ assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
for (genvar __i = 0; __i < count; ++__i) begin \ end \
assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \ VX_dcr_bus_if dst(); \
end \ assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \
__reduce_add_i_``src``field, \ `define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
__reduce_add_o_``dst``field \ for (genvar __d = 0; __d < dst_count; ++__d) begin \
); \ localparam __count = ((src_count > dst_count) ? ((src_count + dst_count - 1) / dst_count) : 1); \
always @(posedge clk) begin \ wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
if (reset) begin \ wire [width-1:0] __reduce_add_o_``dst``field; \
__reduce_add_r_``dst``field <= '0; \ for (genvar __i = 0; __i < __count; ++__i) begin \
end else begin \ assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \ end \
end \ VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
end \ __reduce_add_i_``src``field, \
assign ``dst.``field = __reduce_add_r_``dst``field __reduce_add_o_``dst``field \
); \
`define PERF_CACHE_REDUCE(dst, src, count) \ if (reg_enable) begin \
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \ reg [width-1:0] __reduce_add_r_``dst``field; \
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \ always @(posedge clk) begin \
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \ if (reset) begin \
`PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \ __reduce_add_r_``dst``field <= '0; \
`PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \ end else begin \
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \ __reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
`PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \ end \
`PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count) end \
assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \ end else begin \
if (block_size != 1) begin \ assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
if (block_size != `NUM_WARPS) begin \ end \
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \ end
end else begin \
assign dst = `NW_WIDTH'(block_idx); \ `define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
end \ if (block_size != 1) begin \
end else begin \ if (block_size != `NUM_WARPS) begin \
assign dst = src; \ assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
end end else begin \
assign dst = `NW_WIDTH'(block_idx); \
`define TO_DISPATCH_DATA(data, tid) { \ end \
data.uuid, \ end else begin \
data.wis, \ assign dst = src; \
data.tmask, \ end
data.op_type, \
data.op_mod, \ `define TO_DISPATCH_DATA(data, tid) { \
data.wb, \ data.uuid, \
data.use_PC, \ data.wis, \
data.use_imm, \ data.tmask, \
data.PC, \ data.op_type, \
data.imm, \ data.op_mod, \
data.rd, \ data.wb, \
tid, \ data.use_PC, \
data.rs1_data, \ data.use_imm, \
data.rs2_data, \ data.PC, \
data.rs3_data} data.imm, \
data.rd, \
/////////////////////////////////////////////////////////////////////////////// tid, \
data.rs1_data, \
`endif // VX_DEFINE_VH data.rs2_data, \
data.rs3_data}
///////////////////////////////////////////////////////////////////////////////
`endif // VX_DEFINE_VH

View File

@@ -99,7 +99,7 @@ package VX_gpu_pkg;
`ifdef ICACHE_ENABLE `ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES); localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else `else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES); localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif `endif
////////////////////////// Dcache Parameters ////////////////////////////// ////////////////////////// Dcache Parameters //////////////////////////////
@@ -147,6 +147,9 @@ package VX_gpu_pkg;
/////////////////////////////// L2 Parameters ///////////////////////////// /////////////////////////////// L2 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_IDX = 0;
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
// Word size in bytes // Word size in bytes
localparam L2_WORD_SIZE = `L1_LINE_SIZE; localparam L2_WORD_SIZE = `L1_LINE_SIZE;

View File

@@ -66,18 +66,11 @@ module VX_socket import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if(); VX_mem_perf_if mem_perf_tmp_if();
cache_perf_t perf_icache;
cache_perf_t perf_dcache;
assign mem_perf_tmp_if.icache = perf_icache;
assign mem_perf_tmp_if.dcache = perf_dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x; assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem; assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif `endif
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
@@ -110,7 +103,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_REG (2) .MEM_OUT_REG (2)
) icache ( ) icache (
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_icache), .cache_perf (mem_perf_tmp_if.icache),
`endif `endif
.clk (clk), .clk (clk),
.reset (icache_reset), .reset (icache_reset),
@@ -121,7 +114,7 @@ module VX_socket import VX_gpu_pkg::*; #(
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #( VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE), .DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS](); ) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
@@ -150,7 +143,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_REG (2) .MEM_OUT_REG (2)
) dcache ( ) dcache (
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_dcache), .cache_perf (mem_perf_tmp_if.dcache),
`endif `endif
.clk (clk), .clk (clk),
.reset (dcache_reset), .reset (dcache_reset),

View File

@@ -97,6 +97,11 @@
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E `define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
`define VX_CSR_MPM_LOAD_LT 12'hB0F `define VX_CSR_MPM_LOAD_LT 12'hB0F
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F `define VX_CSR_MPM_LOAD_LT_H 12'hB8F
// SFU: scoreboard
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
// Machine Performance-monitoring memory counters // Machine Performance-monitoring memory counters
// PERF: icache // PERF: icache

View File

@@ -22,15 +22,15 @@ module Vortex import VX_gpu_pkg::*; (
// Memory request // Memory request
output wire mem_req_valid, output wire mem_req_valid,
output wire mem_req_rw, output wire mem_req_rw,
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen, output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr, output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data, output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data,
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag, output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready, input wire mem_req_ready,
// Memory response // Memory response
input wire mem_rsp_valid, input wire mem_rsp_valid,
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data, input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag, input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready, output wire mem_rsp_ready,
@@ -45,17 +45,11 @@ module Vortex import VX_gpu_pkg::*; (
); );
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if(); VX_mem_perf_if mem_perf_if();
cache_perf_t perf_l3cache;
mem_perf_t mem_perf;
assign mem_perf_if.smem = 'x;
assign mem_perf_if.icache = 'x; assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x; assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x; assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.l3cache = perf_l3cache; `endif
assign mem_perf_if.mem = mem_perf;
`endif
VX_mem_bus_if #( VX_mem_bus_if #(
.DATA_SIZE (`L2_LINE_SIZE), .DATA_SIZE (`L2_LINE_SIZE),
@@ -93,7 +87,7 @@ module Vortex import VX_gpu_pkg::*; (
.reset (l3_reset), .reset (l3_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_l3cache), .cache_perf (mem_perf_if.l3cache),
`endif `endif
.core_bus_if (per_cluster_mem_bus_if), .core_bus_if (per_cluster_mem_bus_if),
@@ -166,11 +160,12 @@ module Vortex import VX_gpu_pkg::*; (
); );
end end
`BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1)); `BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1));
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
@@ -193,6 +188,7 @@ module Vortex import VX_gpu_pkg::*; (
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads; mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end end
end end
assign mem_perf_if.mem = mem_perf;
`endif `endif

View File

@@ -83,8 +83,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter")) `STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
cache_perf_t perf_cache_unit[NUM_CACHES]; cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES];
`PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES); `PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES)
assign cache_perf = perf_cache_tmp[0];
`endif `endif
VX_mem_bus_if #( VX_mem_bus_if #(

View File

@@ -62,4 +62,16 @@
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))} `define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)} `define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
`define PERF_CACHE_ADD(dst, src, dcount, scount) \
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1))
`endif // VX_CACHE_DEFINE_VH `endif // VX_CACHE_DEFINE_VH

View File

@@ -1,344 +1,338 @@
// Copyright © 2019-2023 // Copyright © 2019-2023
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0 // http://www.apache.org/licenses/LICENSE-2.0
// //
// Unless required by applicable law or agreed to in writing, software // Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
`include "VX_define.vh" `include "VX_define.vh"
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`include "VX_fpu_define.vh" `include "VX_fpu_define.vh"
`endif `endif
module VX_core import VX_gpu_pkg::*; #( module VX_core import VX_gpu_pkg::*; #(
parameter CORE_ID = 0 parameter CORE_ID = 0
) ( ) (
`SCOPE_IO_DECL `SCOPE_IO_DECL
// Clock // Clock
input wire clk, input wire clk,
input wire reset, input wire reset,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, VX_mem_perf_if.slave mem_perf_if,
`endif `endif
VX_dcr_bus_if.slave dcr_bus_if, VX_dcr_bus_if.slave dcr_bus_if,
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS], VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
VX_mem_bus_if.master icache_bus_if, VX_mem_bus_if.master icache_bus_if,
`ifdef GBAR_ENABLE `ifdef GBAR_ENABLE
VX_gbar_bus_if.master gbar_bus_if, VX_gbar_bus_if.master gbar_bus_if,
`endif `endif
// simulation helper signals // simulation helper signals
output wire sim_ebreak, output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value, output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status // Status
output wire busy output wire busy
); );
VX_schedule_if schedule_if(); VX_schedule_if schedule_if();
VX_fetch_if fetch_if(); VX_fetch_if fetch_if();
VX_decode_if decode_if(); VX_decode_if decode_if();
VX_sched_csr_if sched_csr_if(); VX_sched_csr_if sched_csr_if();
VX_decode_sched_if decode_sched_if(); VX_decode_sched_if decode_sched_if();
VX_commit_sched_if commit_sched_if(); VX_commit_sched_if commit_sched_if();
VX_commit_csr_if commit_csr_if(); VX_commit_csr_if commit_csr_if();
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS](); VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
VX_warp_ctl_if warp_ctl_if(); VX_warp_ctl_if warp_ctl_if();
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH](); VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if alu_commit_if[`ISSUE_WIDTH](); VX_commit_if alu_commit_if[`ISSUE_WIDTH]();
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH](); VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if lsu_commit_if[`ISSUE_WIDTH](); VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH](); VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if fpu_commit_if[`ISSUE_WIDTH](); VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
`endif `endif
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH](); VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if sfu_commit_if[`ISSUE_WIDTH](); VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
VX_writeback_if writeback_if[`ISSUE_WIDTH](); VX_writeback_if writeback_if[`ISSUE_WIDTH]();
VX_mem_bus_if #( VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE), .DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH) .TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_NUM_REQS](); ) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_pipeline_perf_if pipeline_perf_if(); VX_mem_perf_if mem_perf_tmp_if();
VX_mem_perf_if mem_perf_tmp_if(); VX_pipeline_perf_if pipeline_perf_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache; assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache; assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
`ifdef SM_ENABLE assign mem_perf_tmp_if.mem = mem_perf_if.mem;
cache_perf_t smem_perf; `endif
assign mem_perf_tmp_if.smem = smem_perf;
`else `RESET_RELAY (dcr_data_reset, reset);
assign mem_perf_tmp_if.smem = '0; `RESET_RELAY (schedule_reset, reset);
`endif `RESET_RELAY (fetch_reset, reset);
assign mem_perf_tmp_if.mem = mem_perf_if.mem; `RESET_RELAY (decode_reset, reset);
`endif `RESET_RELAY (issue_reset, reset);
`RESET_RELAY (execute_reset, reset);
`RESET_RELAY (dcr_data_reset, reset); `RESET_RELAY (commit_reset, reset);
`RESET_RELAY (schedule_reset, reset);
`RESET_RELAY (fetch_reset, reset); base_dcrs_t base_dcrs;
`RESET_RELAY (decode_reset, reset);
`RESET_RELAY (issue_reset, reset); VX_dcr_data dcr_data (
`RESET_RELAY (execute_reset, reset); .clk (clk),
`RESET_RELAY (commit_reset, reset); .reset (dcr_data_reset),
.dcr_bus_if (dcr_bus_if),
base_dcrs_t base_dcrs; .base_dcrs (base_dcrs)
);
VX_dcr_data dcr_data (
.clk (clk), `SCOPE_IO_SWITCH (3)
.reset (dcr_data_reset),
.dcr_bus_if (dcr_bus_if), VX_schedule #(
.base_dcrs (base_dcrs) .CORE_ID (CORE_ID)
); ) schedule (
.clk (clk),
`SCOPE_IO_SWITCH (3) .reset (schedule_reset),
VX_schedule #( `ifdef PERF_ENABLE
.CORE_ID (CORE_ID) .perf_schedule_if (pipeline_perf_if.schedule),
) schedule ( `endif
.clk (clk),
.reset (schedule_reset), .base_dcrs (base_dcrs),
`ifdef PERF_ENABLE .warp_ctl_if (warp_ctl_if),
.perf_schedule_if (pipeline_perf_if.schedule), .branch_ctl_if (branch_ctl_if),
`endif .decode_sched_if(decode_sched_if),
.commit_sched_if(commit_sched_if),
.base_dcrs (base_dcrs),
.schedule_if (schedule_if),
.warp_ctl_if (warp_ctl_if), `ifdef GBAR_ENABLE
.branch_ctl_if (branch_ctl_if), .gbar_bus_if (gbar_bus_if),
.decode_sched_if(decode_sched_if), `endif
.commit_sched_if(commit_sched_if), .sched_csr_if (sched_csr_if),
.schedule_if (schedule_if), .busy (busy)
`ifdef GBAR_ENABLE );
.gbar_bus_if (gbar_bus_if),
`endif VX_fetch #(
.sched_csr_if (sched_csr_if), .CORE_ID (CORE_ID)
) fetch (
.busy (busy) `SCOPE_IO_BIND (0)
); .clk (clk),
.reset (fetch_reset),
VX_fetch #( .icache_bus_if (icache_bus_if),
.CORE_ID (CORE_ID) .schedule_if (schedule_if),
) fetch ( .fetch_if (fetch_if)
`SCOPE_IO_BIND (0) );
.clk (clk),
.reset (fetch_reset), VX_decode #(
.icache_bus_if (icache_bus_if), .CORE_ID (CORE_ID)
.schedule_if (schedule_if), ) decode (
.fetch_if (fetch_if) .clk (clk),
); .reset (decode_reset),
.fetch_if (fetch_if),
VX_decode #( .decode_if (decode_if),
.CORE_ID (CORE_ID) .decode_sched_if(decode_sched_if)
) decode ( );
.clk (clk),
.reset (decode_reset), VX_issue #(
.fetch_if (fetch_if), .CORE_ID (CORE_ID)
.decode_if (decode_if), ) issue (
.decode_sched_if(decode_sched_if) `SCOPE_IO_BIND (1)
);
.clk (clk),
VX_issue #( .reset (issue_reset),
.CORE_ID (CORE_ID)
) issue ( `ifdef PERF_ENABLE
`SCOPE_IO_BIND (1) .perf_issue_if (pipeline_perf_if.issue),
`endif
.clk (clk),
.reset (issue_reset), .decode_if (decode_if),
.writeback_if (writeback_if),
`ifdef PERF_ENABLE
.perf_issue_if (pipeline_perf_if.issue), .alu_dispatch_if(alu_dispatch_if),
`endif .lsu_dispatch_if(lsu_dispatch_if),
`ifdef EXT_F_ENABLE
.decode_if (decode_if), .fpu_dispatch_if(fpu_dispatch_if),
.writeback_if (writeback_if), `endif
.sfu_dispatch_if(sfu_dispatch_if)
.alu_dispatch_if(alu_dispatch_if), );
.lsu_dispatch_if(lsu_dispatch_if),
`ifdef EXT_F_ENABLE VX_execute #(
.fpu_dispatch_if(fpu_dispatch_if), .CORE_ID (CORE_ID)
`endif ) execute (
.sfu_dispatch_if(sfu_dispatch_if) `SCOPE_IO_BIND (2)
);
.clk (clk),
VX_execute #( .reset (execute_reset),
.CORE_ID (CORE_ID)
) execute ( .base_dcrs (base_dcrs),
`SCOPE_IO_BIND (2)
`ifdef PERF_ENABLE
.clk (clk), .mem_perf_if (mem_perf_tmp_if),
.reset (execute_reset), .pipeline_perf_if(pipeline_perf_if),
`endif
.base_dcrs (base_dcrs),
.dcache_bus_if (dcache_bus_tmp_if),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if), `ifdef EXT_F_ENABLE
.pipeline_perf_if(pipeline_perf_if), .fpu_dispatch_if(fpu_dispatch_if),
`endif .fpu_commit_if (fpu_commit_if),
`endif
.dcache_bus_if (dcache_bus_tmp_if),
.commit_csr_if (commit_csr_if),
`ifdef EXT_F_ENABLE .sched_csr_if (sched_csr_if),
.fpu_dispatch_if(fpu_dispatch_if),
.fpu_commit_if (fpu_commit_if), .alu_dispatch_if(alu_dispatch_if),
`endif .lsu_dispatch_if(lsu_dispatch_if),
.sfu_dispatch_if(sfu_dispatch_if),
.commit_csr_if (commit_csr_if),
.sched_csr_if (sched_csr_if), .warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if), .alu_commit_if (alu_commit_if),
.sfu_dispatch_if(sfu_dispatch_if), .lsu_commit_if (lsu_commit_if),
.sfu_commit_if (sfu_commit_if),
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if), .sim_ebreak (sim_ebreak)
);
.alu_commit_if (alu_commit_if),
.lsu_commit_if (lsu_commit_if), VX_commit #(
.sfu_commit_if (sfu_commit_if), .CORE_ID (CORE_ID)
) commit (
.sim_ebreak (sim_ebreak) .clk (clk),
); .reset (commit_reset),
VX_commit #( .alu_commit_if (alu_commit_if),
.CORE_ID (CORE_ID) .lsu_commit_if (lsu_commit_if),
) commit ( `ifdef EXT_F_ENABLE
.clk (clk), .fpu_commit_if (fpu_commit_if),
.reset (commit_reset), `endif
.sfu_commit_if (sfu_commit_if),
.alu_commit_if (alu_commit_if),
.lsu_commit_if (lsu_commit_if), .writeback_if (writeback_if),
`ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if), .commit_csr_if (commit_csr_if),
`endif .commit_sched_if(commit_sched_if),
.sfu_commit_if (sfu_commit_if),
.sim_wb_value (sim_wb_value)
.writeback_if (writeback_if), );
.commit_csr_if (commit_csr_if), `ifdef SM_ENABLE
.commit_sched_if(commit_sched_if),
VX_smem_unit #(
.sim_wb_value (sim_wb_value) .CORE_ID (CORE_ID)
); ) smem_unit (
.clk (clk),
`ifdef SM_ENABLE .reset (reset),
`ifdef PERF_ENABLE
VX_smem_unit #( .cache_perf (mem_perf_tmp_if.smem),
.CORE_ID (CORE_ID) `endif
) smem_unit ( .dcache_bus_in_if (dcache_bus_tmp_if),
.clk (clk), .dcache_bus_out_if (dcache_bus_if)
.reset (reset), );
`ifdef PERF_ENABLE
.cache_perf (smem_perf), `else
`endif
.dcache_bus_in_if (dcache_bus_tmp_if), for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
.dcache_bus_out_if (dcache_bus_if) `ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
); end
`else `endif
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin `ifdef PERF_ENABLE
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
end wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
`endif wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
`ifdef PERF_ENABLE wire [1:0] perf_icache_pending_read_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle; reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle; reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
wire [1:0] perf_icache_pending_read_cycle; reg [`PERF_CTR_BITS-1:0] perf_ifetches;
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle; reg [`PERF_CTR_BITS-1:0] perf_loads;
reg [`PERF_CTR_BITS-1:0] perf_stores;
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads; wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
reg [`PERF_CTR_BITS-1:0] perf_loads; wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
reg [`PERF_CTR_BITS-1:0] perf_stores; wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready; for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r; assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r; assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire; end
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin `BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw; `BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire);
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready; `POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
end `POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
`BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire); assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r); always @(posedge clk) begin
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire); if (reset) begin
perf_icache_pending_reads <= '0;
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire; perf_dcache_pending_reads <= '0;
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle; end else begin
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
always @(posedge clk) begin perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
if (reset) begin end
perf_icache_pending_reads <= '0; end
perf_dcache_pending_reads <= '0;
end else begin reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle)); reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
end always @(posedge clk) begin
end if (reset) begin
perf_ifetches <= '0;
reg [`PERF_CTR_BITS-1:0] perf_icache_lat; perf_loads <= '0;
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat; perf_stores <= '0;
perf_icache_lat <= '0;
always @(posedge clk) begin perf_dcache_lat <= '0;
if (reset) begin end else begin
perf_ifetches <= '0; perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
perf_loads <= '0; perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
perf_stores <= '0; perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
perf_icache_lat <= '0; perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
perf_dcache_lat <= '0; perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
end else begin end
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire); end
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle); assign pipeline_perf_if.ifetches = perf_ifetches;
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads; assign pipeline_perf_if.loads = perf_loads;
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads; assign pipeline_perf_if.stores = perf_stores;
end assign pipeline_perf_if.load_latency = perf_dcache_lat;
end assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
assign pipeline_perf_if.ifetches = perf_ifetches;
assign pipeline_perf_if.loads = perf_loads; `endif
assign pipeline_perf_if.stores = perf_stores;
assign pipeline_perf_if.load_latency = perf_dcache_lat; endmodule
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
`endif
endmodule

View File

@@ -129,12 +129,12 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_rsp_ready = icache_bus_if.rsp_ready; assign icache_rsp_ready = icache_bus_if.rsp_ready;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if(); VX_mem_perf_if mem_perf_if();
assign mem_perf_if.smem = '0;
assign mem_perf_if.icache = '0; assign mem_perf_if.icache = '0;
assign mem_perf_if.dcache = '0; assign mem_perf_if.dcache = '0;
assign mem_perf_if.l2cache = '0; assign mem_perf_if.l2cache = '0;
assign mem_perf_if.l3cache = '0; assign mem_perf_if.l3cache = '0;
assign mem_perf_if.smem = '0;
assign mem_perf_if.mem = '0; assign mem_perf_if.mem = '0;
`endif `endif

View File

@@ -33,7 +33,6 @@ import VX_fpu_pkg::*;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if, VX_pipeline_perf_if.slave pipeline_perf_if,
VX_sfu_perf_if.slave sfu_perf_if,
`endif `endif
VX_commit_csr_if.slave commit_csr_if, VX_commit_csr_if.slave commit_csr_if,
@@ -187,103 +186,107 @@ import VX_fpu_pkg::*;
`VX_DCR_MPM_CLASS_CORE: begin `VX_DCR_MPM_CLASS_CORE: begin
case (read_addr) case (read_addr)
// PERF: pipeline // PERF: pipeline
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0]; `VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0]; `VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0]; `VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0]; `VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0];
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0]; `VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0];
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0]; `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
`else `else
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0; `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0; `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`endif `endif
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0];
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0]; `VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0];
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0]; `VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0];
`VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0];
`VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]);
// PERF: memory // PERF: memory
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0]; `VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0]; `VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0]; `VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0]; `VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0]; `VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
default:; default:;
endcase endcase
end end
`VX_DCR_MPM_CLASS_MEM: begin `VX_DCR_MPM_CLASS_MEM: begin
case (read_addr) case (read_addr)
// PERF: icache // PERF: icache
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0]; `VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0]; `VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0]; `VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
`VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: dcache // PERF: dcache
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0]; `VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0]; `VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0]; `VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0]; `VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0]; `VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0]; `VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: smem // PERF: smem
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0]; `VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0]; `VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0]; `VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l2cache // PERF: l2cache
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0]; `VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0]; `VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0]; `VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0]; `VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0]; `VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0]; `VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l3cache // PERF: l3cache
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0]; `VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0]; `VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0]; `VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0]; `VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0]; `VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0]; `VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: memory // PERF: memory
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0]; `VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0]; `VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0]; `VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
default:; default:;
endcase endcase
end end
@@ -303,8 +306,6 @@ import VX_fpu_pkg::*;
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid)) `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
`UNUSED_VAR (perf_wctl_stalls);
`UNUSED_VAR (mem_perf_if.icache); `UNUSED_VAR (mem_perf_if.icache);
`UNUSED_VAR (mem_perf_if.smem); `UNUSED_VAR (mem_perf_if.smem);
`endif `endif

View File

@@ -25,7 +25,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if, VX_pipeline_perf_if.slave pipeline_perf_if,
VX_sfu_perf_if.slave sfu_perf_if,
`endif `endif
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
@@ -81,7 +80,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if), .pipeline_perf_if(pipeline_perf_if),
.sfu_perf_if (sfu_perf_if),
`endif `endif
.commit_csr_if (commit_csr_if), .commit_csr_if (commit_csr_if),

View File

@@ -61,7 +61,8 @@ module VX_issue #(
.reset (scoreboard_reset), .reset (scoreboard_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls), .perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_scb_uses (perf_issue_if.scb_uses), .perf_units_uses(perf_issue_if.units_uses),
.perf_sfu_uses (perf_issue_if.sfu_uses),
`endif `endif
.writeback_if (writeback_if), .writeback_if (writeback_if),
.ibuffer_if (ibuffer_if), .ibuffer_if (ibuffer_if),

View File

@@ -21,7 +21,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls, output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_scb_uses [`NUM_EX_UNITS], output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
`endif `endif
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
@@ -32,21 +33,66 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1; localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle; reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle; wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_uses_per_cycle;
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle); reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
VX_reduce #( VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS), .DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH), .N (`ISSUE_WIDTH),
.OP ("|") .OP ("|")
) reduce ( ) perf_units_reduce (
.data_in (perf_issue_uses_per_cycle), .data_in (perf_issue_units_per_cycle),
.data_out (perf_uses_per_cycle) .data_out (perf_units_per_cycle)
);
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) perf_sfu_reduce (
.data_in (perf_issue_sfu_per_cycle),
.data_out (perf_sfu_per_cycle)
); );
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
`BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle);
// Scoreboard stall counter: on every rising clock edge, adds
// perf_stalls_per_cycle_r (the `BUFFER-ed copy of perf_stalls_per_cycle,
// itself the `POP_COUNT of perf_issue_stalls_per_cycle) to perf_scb_stalls.
// The counter is cleared while reset is asserted.
always @(posedge clk) begin
if (reset) begin
perf_scb_stalls <= '0;
end else begin
// the per-cycle count is only CLOG2(ISSUE_WIDTH+1) bits wide; cast it to
// the full counter width before accumulating
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
end
end
// One counter per execution unit: perf_units_uses[i] accumulates bit i of
// perf_units_per_cycle_r and is cleared while reset is asserted.
// The per-cycle vector is the "|" VX_reduce of perf_issue_units_per_cycle
// across the issue slots, so each counter advances by at most one per clock
// even if several issue slots flag the same unit.
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_units_uses[i] <= '0;
end else begin
// single flag bit widened to the counter width
perf_units_uses[i] <= perf_units_uses[i] + `PERF_CTR_BITS'(perf_units_per_cycle_r[i]);
end
end
end
// One counter per SFU sub-unit: perf_sfu_uses[i] accumulates bit i of
// perf_sfu_per_cycle_r and is cleared while reset is asserted.
// The per-cycle vector is the "|" VX_reduce of perf_issue_sfu_per_cycle
// across the issue slots, so each counter advances by at most one per clock.
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_sfu_uses[i] <= '0;
end else begin
// single flag bit widened to the counter width
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
end
end
end
`endif `endif
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
@@ -60,21 +106,46 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]; wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units; reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
reg [`SFU_WIDTH-1:0] sfu_type;
always @(*) begin always @(*) begin
perf_issue_uses_per_cycle[i] = '0; case (scoreboard_if[i].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
default: sfu_type = `SFU_WCTL;
endcase
end
always @(*) begin
perf_issue_units_per_cycle[i] = '0;
perf_issue_sfu_per_cycle[i] = '0;
if (ibuffer_if[i].valid) begin if (ibuffer_if[i].valid) begin
if (inuse_rd) begin if (inuse_rd) begin
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1; perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
end
end end
if (inuse_rs1) begin if (inuse_rs1) begin
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1; perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
end
end end
if (inuse_rs2) begin if (inuse_rs2) begin
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1; perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
end
end end
if (inuse_rs3) begin if (inuse_rs3) begin
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1; perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
end
end end
end end
end end
@@ -97,8 +168,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
valid_out_r <= 0; valid_out_r <= 0;
inuse_regs <= '0; inuse_regs <= '0;
end else begin end else begin
if (writeback_fire) begin if (writeback_fire) begin
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0; inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
end end
@@ -109,6 +180,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1; inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type; inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
if (scoreboard_if[i].data.ex_type == `EX_SFU) begin
inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type;
end
`endif `endif
end end
valid_out_r <= 0; valid_out_r <= 0;
@@ -141,7 +215,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
timeout_ctr <= '0; timeout_ctr <= '0;
end end
end end
end end
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT), `RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)", ("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
@@ -153,32 +227,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid)); $time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
`endif `endif
end
`ifdef PERF_ENABLE
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle_r;
wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle_r;
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
`BUFFER(perf_uses_per_cycle_r, perf_uses_per_cycle);
always @(posedge clk) begin
if (reset) begin
perf_scb_stalls <= '0;
end else begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
end
end end
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_scb_uses[i] <= '0;
end else begin
perf_scb_uses[i] <= perf_scb_uses[i] + `PERF_CTR_BITS'(perf_uses_per_cycle_r[i]);
end
end
end
`endif
endmodule endmodule

View File

@@ -48,7 +48,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1; localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + 1; localparam RSP_ARB_SIZE = 1 + 1;
localparam RSP_ARB_IDX_WCTL = 0; localparam RSP_ARB_IDX_WCTL = 0;
localparam RSP_ARB_IDX_CSR = 1; localparam RSP_ARB_IDX_CSRS = 1;
VX_execute_if #( VX_execute_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
@@ -71,9 +71,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in; wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in; wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
`ifdef PERF_ENABLE
VX_sfu_perf_if sfu_perf_if();
`endif
// Warp control block // Warp control block
VX_execute_if #( VX_execute_if #(
@@ -129,7 +126,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if), .pipeline_perf_if(pipeline_perf_if),
.sfu_perf_if (sfu_perf_if),
`endif `endif
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
@@ -141,21 +137,21 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.commit_if (csr_commit_if) .commit_if (csr_commit_if)
); );
assign rsp_arb_valid_in[RSP_ARB_IDX_CSR] = csr_commit_if.valid; assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_CSR] = csr_commit_if.data; assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSR]; assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
// can accept new request? // can accept new request?
reg sfu_req_ready; reg sfu_req_ready;
always @(*) begin always @(*) begin
case (execute_if[0].data.op_type) case (execute_if[0].data.op_type)
`INST_SFU_CSRRW, `INST_SFU_CSRRW,
`INST_SFU_CSRRS, `INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready; `INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
default: sfu_req_ready = wctl_execute_if.ready; default: sfu_req_ready = wctl_execute_if.ready;
endcase endcase
end end
assign execute_if[0].ready = sfu_req_ready; assign execute_if[0].ready = sfu_req_ready;
// response arbitration // response arbitration
@@ -194,19 +190,4 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.commit_out_if (commit_if) .commit_out_if (commit_if)
); );
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
wire wctl_execute_stall = wctl_execute_if.valid && ~wctl_execute_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_wctl_stalls <= '0;
end else begin
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_stall);
end
end
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;
`endif
endmodule endmodule

View File

@@ -18,7 +18,8 @@ interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] sched_stalls; wire [`PERF_CTR_BITS-1:0] sched_stalls;
wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS];
wire [`PERF_CTR_BITS-1:0] ifetches; wire [`PERF_CTR_BITS-1:0] ifetches;
wire [`PERF_CTR_BITS-1:0] loads; wire [`PERF_CTR_BITS-1:0] loads;
@@ -34,7 +35,8 @@ interface VX_pipeline_perf_if ();
modport issue ( modport issue (
output ibf_stalls, output ibf_stalls,
output scb_stalls, output scb_stalls,
output scb_uses output units_uses,
output sfu_uses
); );
modport slave ( modport slave (
@@ -42,7 +44,8 @@ interface VX_pipeline_perf_if ();
input sched_stalls, input sched_stalls,
input ibf_stalls, input ibf_stalls,
input scb_stalls, input scb_stalls,
input scb_uses, input units_uses,
input sfu_uses,
input ifetches, input ifetches,
input loads, input loads,
input stores, input stores,

View File

@@ -208,6 +208,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t scrb_fpu = 0; uint64_t scrb_fpu = 0;
uint64_t scrb_lsu = 0; uint64_t scrb_lsu = 0;
uint64_t scrb_sfu = 0; uint64_t scrb_sfu = 0;
uint64_t scrb_wctl = 0;
uint64_t scrb_csrs = 0;
uint64_t ifetches = 0; uint64_t ifetches = 0;
uint64_t loads = 0; uint64_t loads = 0;
uint64_t stores = 0; uint64_t stores = 0;
@@ -268,44 +270,69 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
// PERF: pipeline // PERF: pipeline
// scheduler idles // scheduler idles
{ {
uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID); uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID);
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core); if (num_cores > 1) {
if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler idles=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core); int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scheduler idle=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
}
sched_idles += sched_idles_per_core; sched_idles += sched_idles_per_core;
} }
// scheduler stalls // scheduler stalls
{ {
uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST); uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core); if (num_cores > 1) {
if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core); int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
}
sched_stalls += sched_stalls_per_core; sched_stalls += sched_stalls_per_core;
} }
// ibuffer_stalls // ibuffer_stalls
{ {
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST); uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core); if (num_cores > 1) {
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core); int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
}
ibuffer_stalls += ibuffer_stalls_per_core; ibuffer_stalls += ibuffer_stalls_per_core;
} }
// scrb_stalls // issue_stalls
{ {
uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST); uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU); uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU);
uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU); uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU);
uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU); uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU);
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU); uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
scrb_alu += scrb_alu_per_core; scrb_alu += scrb_alu_per_core;
scrb_fpu += scrb_fpu_per_core; scrb_fpu += scrb_fpu_per_core;
scrb_lsu += scrb_lsu_per_core; scrb_lsu += scrb_lsu_per_core;
scrb_sfu += scrb_sfu_per_core; scrb_sfu += scrb_sfu_per_core;
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, if (num_cores > 1) {
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
calcAvgPercent(scrb_alu_per_core, scrb_total), calcAvgPercent(scrb_alu_per_core, scrb_total),
calcAvgPercent(scrb_fpu_per_core, scrb_total), calcAvgPercent(scrb_fpu_per_core, scrb_total),
calcAvgPercent(scrb_lsu_per_core, scrb_total), calcAvgPercent(scrb_lsu_per_core, scrb_total),
calcAvgPercent(scrb_sfu_per_core, scrb_total)); calcAvgPercent(scrb_sfu_per_core, scrb_total));
}
scrb_stalls += scrb_stalls_per_core; scrb_stalls += scrb_stalls_per_core;
} }
// sfu_stalls: break the SFU-attributed scoreboard stalls down by SFU sub-unit
{
  // Overall SFU count plus the two sub-unit counts read from the staged CSR dump.
  uint64_t scrb_sfu_per_core  = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
  uint64_t scrb_wctl_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_WCTL);
  uint64_t scrb_csrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_CSRS);
  if (num_cores > 1) {
    // The percentage denominator covers exactly the sub-units sampled above
    // (WCTL and CSRS), matching the aggregate sfu_total used for the summary
    // line. The previous expression also added tex/raster/om per-core terms
    // that are not read in this block.
    uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core;
    // NOTE(review): the "scrs" label looks like a typo for "csrs"; it is kept
    // byte-identical because the summary line prints the same label.
    fprintf(stream, "PERF: core%d: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
      , core_id
      , scrb_sfu_per_core
      , calcAvgPercent(scrb_csrs_per_core, sfu_total)
      , calcAvgPercent(scrb_wctl_per_core, sfu_total)
    );
  }
  // Fold this core's sub-unit counts into the running totals.
  scrb_wctl += scrb_wctl_per_core;
  scrb_csrs += scrb_csrs_per_core;
}
// PERF: memory // PERF: memory
// ifetches // ifetches
{ {
@@ -313,9 +340,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
ifetches += ifetches_per_core; ifetches += ifetches_per_core;
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT); uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT);
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core); if (num_cores > 1) {
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat); int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
}
ifetch_lat += ifetch_lat_per_core; ifetch_lat += ifetch_lat_per_core;
} }
// loads // loads
@@ -324,9 +353,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
loads += loads_per_core; loads += loads_per_core;
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT); uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT);
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core); if (num_cores > 1) {
if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat); int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
}
load_lat += load_lat_per_core; load_lat += load_lat_per_core;
} }
// stores // stores
@@ -428,14 +459,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches)); int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
int load_avg_lat = (int)(double(load_lat) / double(loads)); int load_avg_lat = (int)(double(load_lat) / double(loads));
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu; uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
fprintf(stream, "PERF: scheduler idles=%ld (%d%%)\n", sched_idles, sched_idles_percent); uint64_t sfu_total = scrb_wctl + scrb_csrs;
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent); fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent); fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls, fprintf(stream, "PERF: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
calcAvgPercent(scrb_alu, scrb_total), calcAvgPercent(scrb_alu, scrb_total),
calcAvgPercent(scrb_fpu, scrb_total), calcAvgPercent(scrb_fpu, scrb_total),
calcAvgPercent(scrb_lsu, scrb_total), calcAvgPercent(scrb_lsu, scrb_total),
calcAvgPercent(scrb_sfu, scrb_total)); calcAvgPercent(scrb_sfu, scrb_total));
// Break the SFU stall count down into its two tracked sources, CSR accesses
// (scrb_csrs) and wctl (scrb_wctl), each as a share of sfu_total.
// The first label previously read "scrs", a typo for "csrs".
fprintf(stream, "PERF: sfu stalls=%ld (csrs=%d%%, wctl=%d%%)\n"
, scrb_sfu
, calcAvgPercent(scrb_csrs, sfu_total)
, calcAvgPercent(scrb_wctl, sfu_total)
);
fprintf(stream, "PERF: ifetches=%ld\n", ifetches); fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
fprintf(stream, "PERF: loads=%ld\n", loads); fprintf(stream, "PERF: loads=%ld\n", loads);
fprintf(stream, "PERF: stores=%ld\n", stores); fprintf(stream, "PERF: stores=%ld\n", stores);

View File

@@ -18,20 +18,20 @@ using namespace vortex;
Cluster::Cluster(const SimContext& ctx, Cluster::Cluster(const SimContext& ctx,
uint32_t cluster_id, uint32_t cluster_id,
ProcessorImpl* processor, ProcessorImpl* processor,
const Arch &arch, const const Arch &arch,
DCRS &dcrs) const DCRS &dcrs)
: SimObject(ctx, "cluster") : SimObject(ctx, "cluster")
, mem_req_port(this) , mem_req_port(this)
, mem_rsp_port(this) , mem_rsp_port(this)
, cluster_id_(cluster_id) , cluster_id_(cluster_id)
, sockets_(NUM_SOCKETS)
, barriers_(arch.num_barriers(), 0)
, processor_(processor) , processor_(processor)
, sockets_(NUM_SOCKETS)
, barriers_(arch.num_barriers(), 0)
, cores_per_socket_(arch.socket_size()) , cores_per_socket_(arch.socket_size())
{ {
char sname[100]; char sname[100];
auto sockets_per_cluster = sockets_.size(); uint32_t sockets_per_cluster = sockets_.size();
// create sockets // create sockets
@@ -43,7 +43,10 @@ Cluster::Cluster(const SimContext& ctx,
for (uint32_t i = 0; i < sockets_per_cluster; ++i) { for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
uint32_t socket_id = cluster_id * sockets_per_cluster + i; uint32_t socket_id = cluster_id * sockets_per_cluster + i;
auto socket = Socket::Create(socket_id, this, arch, dcrs); auto socket = Socket::Create(socket_id,
this,
arch,
dcrs);
socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i)); socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port); icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
@@ -154,7 +157,7 @@ void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
} }
// Returns a PerfStats value whose l2cache field is filled from
// l2cache_->perf_stats(); no other counters are gathered here.
Cluster::PerfStats Cluster::perf_stats() const { Cluster::PerfStats Cluster::perf_stats() const {
Cluster::PerfStats perf; PerfStats perf_stats;
perf.l2cache = l2cache_->perf_stats(); perf_stats.l2cache = l2cache_->perf_stats();
return perf; return perf_stats;
} }

View File

@@ -17,6 +17,7 @@
#include "dcrs.h" #include "dcrs.h"
#include "arch.h" #include "arch.h"
#include "cache_cluster.h" #include "cache_cluster.h"
#include "shared_mem.h"
#include "core.h" #include "core.h"
#include "socket.h" #include "socket.h"
#include "constants.h" #include "constants.h"
@@ -27,13 +28,8 @@ class ProcessorImpl;
class Cluster : public SimObject<Cluster> { class Cluster : public SimObject<Cluster> {
public: public:
// Cluster-level performance counters: holds the L2 cache statistics only.
struct PerfStats { struct PerfStats {
CacheSim::PerfStats l2cache; CacheSim::PerfStats l2cache;
PerfStats& operator+=(const PerfStats& rhs) {
this->l2cache += rhs.l2cache;
return *this;
}
}; };
SimPort<MemReq> mem_req_port; SimPort<MemReq> mem_req_port;
@@ -67,15 +63,15 @@ public:
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
Cluster::PerfStats perf_stats() const; PerfStats perf_stats() const;
private: private:
uint32_t cluster_id_; uint32_t cluster_id_;
std::vector<Socket::Ptr> sockets_; ProcessorImpl* processor_;
std::vector<CoreMask> barriers_; std::vector<Socket::Ptr> sockets_;
CacheSim::Ptr l2cache_; std::vector<CoreMask> barriers_;
ProcessorImpl* processor_; CacheSim::Ptr l2cache_;
uint32_t cores_per_socket_; uint32_t cores_per_socket_;
}; };
} // namespace vortex } // namespace vortex

View File

@@ -28,13 +28,18 @@
using namespace vortex; using namespace vortex;
Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs) Core::Core(const SimContext& ctx,
uint32_t core_id,
Socket* socket,
const Arch &arch,
const DCRS &dcrs)
: SimObject(ctx, "core") : SimObject(ctx, "core")
, icache_req_ports(1, this) , icache_req_ports(1, this)
, icache_rsp_ports(1, this) , icache_rsp_ports(1, this)
, dcache_req_ports(NUM_LSU_LANES, this) , dcache_req_ports(NUM_LSU_LANES, this)
, dcache_rsp_ports(NUM_LSU_LANES, this) , dcache_rsp_ports(NUM_LSU_LANES, this)
, core_id_(core_id) , core_id_(core_id)
, socket_(socket)
, arch_(arch) , arch_(arch)
, dcrs_(dcrs) , dcrs_(dcrs)
, decoder_(arch) , decoder_(arch)
@@ -42,7 +47,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
, barriers_(arch.num_barriers(), 0) , barriers_(arch.num_barriers(), 0)
, fcsrs_(arch.num_warps(), 0) , fcsrs_(arch.num_warps(), 0)
, ibuffers_(arch.num_warps(), IBUF_SIZE) , ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_) , scoreboard_(arch_)
, operands_(ISSUE_WIDTH) , operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)ExeType::ExeTypeCount) , dispatchers_((uint32_t)ExeType::ExeTypeCount)
, exe_units_((uint32_t)ExeType::ExeTypeCount) , exe_units_((uint32_t)ExeType::ExeTypeCount)
@@ -50,8 +55,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
, fetch_latch_("fetch") , fetch_latch_("fetch")
, decode_latch_("decode") , decode_latch_("decode")
, pending_icache_(arch_.num_warps()) , pending_icache_(arch_.num_warps())
, csrs_(arch.num_warps()) , csrs_(arch.num_warps())
, socket_(socket)
, commit_arbs_(ISSUE_WIDTH) , commit_arbs_(ISSUE_WIDTH)
{ {
char sname[100]; char sname[100];
@@ -69,6 +73,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
} }
// initialize shared memory // initialize shared memory
snprintf(sname, 100, "core%d-shared_mem", core_id);
shared_mem_ = SharedMem::Create(sname, SharedMem::Config{ shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE), (1 << SMEM_LOG_SIZE),
sizeof(Word), sizeof(Word),
@@ -77,17 +82,17 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
false false
}); });
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) { for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
snprintf(sname, 100, "smem_demux%d_%d", core_id, i); snprintf(sname, 100, "core%d-smem_demux%d", core_id, i);
auto smem_demux = SMemDemux::Create(sname); auto smem_demux = SMemDemux::Create(sname);
smem_demux->ReqDC.bind(&dcache_req_ports.at(i)); smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC); dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i)); smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM); shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
smem_demuxs_.at(i) = smem_demux; smem_demuxs_.at(i) = smem_demux;
} }
// initialize dispatchers // initialize dispatchers
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES); dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
@@ -103,7 +108,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
// bind commit arbiters // bind commit arbiters
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
snprintf(sname, 100, "commit-arb%d", i); snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1); auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) { for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j)); exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
@@ -128,7 +133,7 @@ void Core::reset() {
for (auto& exe_unit : exe_units_) { for (auto& exe_unit : exe_units_) {
exe_unit->reset(); exe_unit->reset();
} }
for (auto& commit_arb : commit_arbs_) { for (auto& commit_arb : commit_arbs_) {
commit_arb->reset(); commit_arb->reset();
} }
@@ -184,7 +189,7 @@ void Core::schedule() {
} }
} }
if (scheduled_warp == -1) { if (scheduled_warp == -1) {
++perf_stats_.sched_idles; ++perf_stats_.sched_idle;
return; return;
} }
@@ -229,7 +234,7 @@ void Core::fetch() {
mem_req.uuid = trace->uuid; mem_req.uuid = trace->uuid;
icache_req_ports.at(0).send(mem_req, 2); icache_req_ports.at(0).send(mem_req, 2);
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop(); fetch_latch_.pop();
++perf_stats_.ifetches; ++perf_stats_.ifetches;
++pending_ifetches_; ++pending_ifetches_;
} }
@@ -311,7 +316,21 @@ void Core::issue() {
case ExeType::ALU: ++perf_stats_.scrb_alu; break; case ExeType::ALU: ++perf_stats_.scrb_alu; break;
case ExeType::FPU: ++perf_stats_.scrb_fpu; break; case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
case ExeType::LSU: ++perf_stats_.scrb_lsu; break; case ExeType::LSU: ++perf_stats_.scrb_lsu; break;
case ExeType::SFU: ++perf_stats_.scrb_sfu; break; case ExeType::SFU: {
++perf_stats_.scrb_sfu;
switch (use.sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::BAR:
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
default: assert(false);
}
} break;
default: assert(false); default: assert(false);
} }
} }
@@ -356,7 +375,6 @@ void Core::commit() {
auto& commit_arb = commit_arbs_.at(i); auto& commit_arb = commit_arbs_.at(i);
if (commit_arb->Outputs.at(0).empty()) if (commit_arb->Outputs.at(0).empty())
continue; continue;
auto trace = commit_arb->Outputs.at(0).front(); auto trace = commit_arb->Outputs.at(0).front();
// advance to commit stage // advance to commit stage
@@ -558,8 +576,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
break; break;
case VX_DCR_MPM_CLASS_CORE: { case VX_DCR_MPM_CLASS_CORE: {
switch (addr) { switch (addr) {
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idles & 0xffffffff; case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idle & 0xffffffff;
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idles >> 32; case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32;
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff; case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32; case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff; case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
@@ -574,6 +592,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32; case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff; case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32; case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff;
case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32;
case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff;
case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32;
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff; case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32; case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff; case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
@@ -588,6 +610,7 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
} break; } break;
case VX_DCR_MPM_CLASS_MEM: { case VX_DCR_MPM_CLASS_MEM: {
auto proc_perf = socket_->cluster()->processor()->perf_stats(); auto proc_perf = socket_->cluster()->processor()->perf_stats();
auto cluster_perf = socket_->cluster()->perf_stats();
auto socket_perf = socket_->perf_stats(); auto socket_perf = socket_->perf_stats();
auto smem_perf = shared_mem_->perf_stats(); auto smem_perf = shared_mem_->perf_stats();
switch (addr) { switch (addr) {
@@ -611,18 +634,18 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff; case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32; case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff; case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32; case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
case VX_CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff; case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
case VX_CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32; case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
case VX_CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff; case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32; case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
case VX_CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff; case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32; case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
case VX_CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff; case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32; case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
case VX_CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff; case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32; case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff; case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32; case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
@@ -638,7 +661,7 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32; case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff; case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32; case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff; case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32; case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff; case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
@@ -652,6 +675,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32; case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32;
} }
} break; } break;
default: {
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
std::abort();
} break;
} }
} else { } else {
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl; std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;

View File

@@ -49,7 +49,7 @@ public:
struct PerfStats { struct PerfStats {
uint64_t cycles; uint64_t cycles;
uint64_t instrs; uint64_t instrs;
uint64_t sched_idles; uint64_t sched_idle;
uint64_t sched_stalls; uint64_t sched_stalls;
uint64_t ibuf_stalls; uint64_t ibuf_stalls;
uint64_t scrb_stalls; uint64_t scrb_stalls;
@@ -57,6 +57,8 @@ public:
uint64_t scrb_fpu; uint64_t scrb_fpu;
uint64_t scrb_lsu; uint64_t scrb_lsu;
uint64_t scrb_sfu; uint64_t scrb_sfu;
uint64_t scrb_wctl;
uint64_t scrb_csrs;
uint64_t ifetches; uint64_t ifetches;
uint64_t loads; uint64_t loads;
uint64_t stores; uint64_t stores;
@@ -66,7 +68,7 @@ public:
PerfStats() PerfStats()
: cycles(0) : cycles(0)
, instrs(0) , instrs(0)
, sched_idles(0) , sched_idle(0)
, sched_stalls(0) , sched_stalls(0)
, ibuf_stalls(0) , ibuf_stalls(0)
, scrb_stalls(0) , scrb_stalls(0)
@@ -74,6 +76,8 @@ public:
, scrb_fpu(0) , scrb_fpu(0)
, scrb_lsu(0) , scrb_lsu(0)
, scrb_sfu(0) , scrb_sfu(0)
, scrb_wctl(0)
, scrb_csrs(0)
, ifetches(0) , ifetches(0)
, loads(0) , loads(0)
, stores(0) , stores(0)
@@ -88,7 +92,11 @@ public:
std::vector<SimPort<MemReq>> dcache_req_ports; std::vector<SimPort<MemReq>> dcache_req_ports;
std::vector<SimPort<MemRsp>> dcache_rsp_ports; std::vector<SimPort<MemRsp>> dcache_rsp_ports;
Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs); Core(const SimContext& ctx,
uint32_t core_id,
Socket* socket,
const Arch &arch,
const DCRS &dcrs);
~Core(); ~Core();
@@ -158,6 +166,7 @@ private:
void cout_flush(); void cout_flush();
uint32_t core_id_; uint32_t core_id_;
Socket* socket_;
const Arch& arch_; const Arch& arch_;
const DCRS &dcrs_; const DCRS &dcrs_;
@@ -193,10 +202,9 @@ private:
PerfStats perf_stats_; PerfStats perf_stats_;
Socket* socket_;
std::vector<TraceSwitch::Ptr> commit_arbs_; std::vector<TraceSwitch::Ptr> commit_arbs_;
uint32_t commit_exe_;
uint32_t ibuffer_idx_; uint32_t ibuffer_idx_;
friend class Warp; friend class Warp;

View File

@@ -113,6 +113,7 @@ void ProcessorImpl::reset() {
perf_mem_writes_ = 0; perf_mem_writes_ = 0;
perf_mem_latency_ = 0; perf_mem_latency_ = 0;
perf_mem_pending_reads_ = 0; perf_mem_pending_reads_ = 0;
} }
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) { void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
@@ -125,9 +126,6 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
perf.mem_writes = perf_mem_writes_; perf.mem_writes = perf_mem_writes_;
perf.mem_latency = perf_mem_latency_; perf.mem_latency = perf_mem_latency_;
perf.l3cache = l3cache_->perf_stats(); perf.l3cache = l3cache_->perf_stats();
for (auto cluster : clusters_) {
perf.clusters += cluster->perf_stats();
}
return perf; return perf;
} }

View File

@@ -24,17 +24,10 @@ namespace vortex {
class ProcessorImpl { class ProcessorImpl {
public: public:
// Processor-level performance counters: L3 cache statistics plus the
// memory read, write and latency counters.
struct PerfStats { struct PerfStats {
CacheSim::PerfStats l3cache;
uint64_t mem_reads; uint64_t mem_reads;
uint64_t mem_writes; uint64_t mem_writes;
uint64_t mem_latency; uint64_t mem_latency;
CacheSim::PerfStats l3cache;
Cluster::PerfStats clusters;
PerfStats()
: mem_reads(0)
, mem_writes(0)
, mem_latency(0)
{}
}; };
ProcessorImpl(const Arch& arch); ProcessorImpl(const Arch& arch);
@@ -46,7 +39,7 @@ public:
void write_dcr(uint32_t addr, uint32_t value); void write_dcr(uint32_t addr, uint32_t value);
ProcessorImpl::PerfStats perf_stats() const; PerfStats perf_stats() const;
private: private:
@@ -55,7 +48,7 @@ private:
const Arch& arch_; const Arch& arch_;
std::vector<std::shared_ptr<Cluster>> clusters_; std::vector<std::shared_ptr<Cluster>> clusters_;
DCRS dcrs_; DCRS dcrs_;
MemSim::Ptr memsim_; MemSim::Ptr memsim_;
CacheSim::Ptr l3cache_; CacheSim::Ptr l3cache_;
uint64_t perf_mem_reads_; uint64_t perf_mem_reads_;
uint64_t perf_mem_writes_; uint64_t perf_mem_writes_;

View File

@@ -25,6 +25,7 @@ public:
RegType reg_type; RegType reg_type;
uint32_t reg_id; uint32_t reg_id;
ExeType exe_type; ExeType exe_type;
SfuType sfu_type;
uint64_t uuid; uint64_t uuid;
}; };
@@ -62,7 +63,7 @@ public:
if (used_iregs.test(r)) { if (used_iregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer; uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
auto owner = owners_.at(tag); auto owner = owners_.at(tag);
out.push_back({RegType::Integer, r, owner->exe_type, owner->uuid}); out.push_back({RegType::Integer, r, owner->exe_type, owner->sfu_type, owner->uuid});
} }
} }
@@ -70,7 +71,7 @@ public:
if (used_fregs.test(r)) { if (used_fregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float; uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
auto owner = owners_.at(tag); auto owner = owners_.at(tag);
out.push_back({RegType::Float, r, owner->exe_type, owner->uuid}); out.push_back({RegType::Float, r, owner->exe_type, owner->sfu_type, owner->uuid});
} }
} }
@@ -78,7 +79,7 @@ public:
if (used_vregs.test(r)) { if (used_vregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector; uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector;
auto owner = owners_.at(tag); auto owner = owners_.at(tag);
out.push_back({RegType::Vector, r, owner->exe_type, owner->uuid}); out.push_back({RegType::Vector, r, owner->exe_type, owner->sfu_type, owner->uuid});
} }
} }

View File

@@ -19,16 +19,16 @@ using namespace vortex;
Socket::Socket(const SimContext& ctx, Socket::Socket(const SimContext& ctx,
uint32_t socket_id, uint32_t socket_id,
Cluster* cluster, Cluster* cluster,
const Arch &arch, const const Arch &arch,
DCRS &dcrs) const DCRS &dcrs)
: SimObject(ctx, "socket") : SimObject(ctx, "socket")
, icache_mem_req_port(this) , icache_mem_req_port(this)
, icache_mem_rsp_port(this) , icache_mem_rsp_port(this)
, dcache_mem_req_port(this) , dcache_mem_req_port(this)
, dcache_mem_rsp_port(this) , dcache_mem_rsp_port(this)
, socket_id_(socket_id) , socket_id_(socket_id)
, cores_(arch.socket_size())
, cluster_(cluster) , cluster_(cluster)
, cores_(arch.socket_size())
{ {
auto cores_per_socket = cores_.size(); auto cores_per_socket = cores_.size();
@@ -77,7 +77,10 @@ Socket::Socket(const SimContext& ctx,
for (uint32_t i = 0; i < cores_per_socket; ++i) { for (uint32_t i = 0; i < cores_per_socket; ++i) {
uint32_t core_id = socket_id * cores_per_socket + i; uint32_t core_id = socket_id * cores_per_socket + i;
cores_.at(i) = Core::Create(core_id, this, arch, dcrs); cores_.at(i) = Core::Create(core_id,
this,
arch,
dcrs);
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0)); cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0)); icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
@@ -139,8 +142,8 @@ void Socket::resume(uint32_t core_index) {
} }
// Returns a PerfStats value with icache taken from icaches_->perf_stats()
// and dcache taken from dcaches_->perf_stats().
Socket::PerfStats Socket::perf_stats() const { Socket::PerfStats Socket::perf_stats() const {
Socket::PerfStats perf; PerfStats perf_stats;
perf.icache = icaches_->perf_stats(); perf_stats.icache = icaches_->perf_stats();
perf.dcache = dcaches_->perf_stats(); perf_stats.dcache = dcaches_->perf_stats();
return perf; return perf_stats;
} }

View File

@@ -30,12 +30,6 @@ public:
// Socket-level performance counters: instruction-cache and data-cache
// statistics.
struct PerfStats { struct PerfStats {
CacheSim::PerfStats icache; CacheSim::PerfStats icache;
CacheSim::PerfStats dcache; CacheSim::PerfStats dcache;
PerfStats& operator+=(const PerfStats& rhs) {
this->icache += rhs.icache;
this->dcache += rhs.dcache;
return *this;
}
}; };
SimPort<MemReq> icache_mem_req_port; SimPort<MemReq> icache_mem_req_port;
@@ -74,14 +68,14 @@ public:
void resume(uint32_t core_id); void resume(uint32_t core_id);
Socket::PerfStats perf_stats() const; PerfStats perf_stats() const;
private: private:
uint32_t socket_id_; uint32_t socket_id_;
Cluster* cluster_;
std::vector<Core::Ptr> cores_; std::vector<Core::Ptr> cores_;
CacheCluster::Ptr icaches_; CacheCluster::Ptr icaches_;
CacheCluster::Ptr dcaches_; CacheCluster::Ptr dcaches_;
Cluster* cluster_;
}; };
} // namespace vortex } // namespace vortex

View File

@@ -15,10 +15,10 @@ all:
$(MAKE) -C blackscholes $(MAKE) -C blackscholes
$(MAKE) -C transpose $(MAKE) -C transpose
$(MAKE) -C convolution $(MAKE) -C convolution
# $(MAKE) -C cutcp $(MAKE) -C cutcp
# $(MAKE) -C sgemm2 $(MAKE) -C sgemm2
# $(MAKE) -C vectorhypot $(MAKE) -C vectorhypot
# $(MAKE) -C mri-q run-simx $(MAKE) -C mri-q run-simx
run-simx: run-simx:
$(MAKE) -C vecadd run-simx $(MAKE) -C vecadd run-simx
@@ -125,7 +125,7 @@ clean-all:
$(MAKE) -C oclprintf clean-all $(MAKE) -C oclprintf clean-all
$(MAKE) -C blackscholes clean-all $(MAKE) -C blackscholes clean-all
$(MAKE) -C convolution clean-all $(MAKE) -C convolution clean-all
# $(MAKE) -C cutcp clean-all $(MAKE) -C cutcp clean-all
# $(MAKE) -C sgemm2 clean-all $(MAKE) -C sgemm2 clean-all
# $(MAKE) -C vectorhypot clean-all $(MAKE) -C vectorhypot clean-all
# $(MAKE) -C mri-q clean-all $(MAKE) -C mri-q clean-all