profiling update
minor updates
This commit is contained in:
@@ -217,7 +217,7 @@ package VX_gpu_pkg;
|
|||||||
function logic [ISSUE_WIS_W-1:0] wid_to_wis(
|
function logic [ISSUE_WIS_W-1:0] wid_to_wis(
|
||||||
input logic [`NW_WIDTH-1:0] wid
|
input logic [`NW_WIDTH-1:0] wid
|
||||||
);
|
);
|
||||||
wid_to_wis = ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH));
|
wid_to_wis = ISSUE_WIS_W'({1'b0, wid} >> `CLOG2(`ISSUE_WIDTH));
|
||||||
endfunction
|
endfunction
|
||||||
|
|
||||||
function logic [ISSUE_ADDRW-1:0] wis_to_addr(
|
function logic [ISSUE_ADDRW-1:0] wis_to_addr(
|
||||||
|
|||||||
@@ -70,10 +70,10 @@
|
|||||||
`define VX_CSR_MINSTRET 12'hB02
|
`define VX_CSR_MINSTRET 12'hB02
|
||||||
`define VX_CSR_MINSTRET_H 12'hB82
|
`define VX_CSR_MINSTRET_H 12'hB82
|
||||||
// PERF: pipeline
|
// PERF: pipeline
|
||||||
`define VX_CSR_MPM_SCHED_ST 12'hB03
|
`define VX_CSR_MPM_SCHED_ID 12'hB03
|
||||||
`define VX_CSR_MPM_SCHED_ST_H 12'hB83
|
`define VX_CSR_MPM_SCHED_ID_H 12'hB83
|
||||||
`define VX_CSR_MPM_FETCH_ST 12'hB04
|
`define VX_CSR_MPM_SCHED_ST 12'hB04
|
||||||
`define VX_CSR_MPM_FETCH_ST_H 12'hB84
|
`define VX_CSR_MPM_SCHED_ST_H 12'hB84
|
||||||
`define VX_CSR_MPM_IBUF_ST 12'hB05
|
`define VX_CSR_MPM_IBUF_ST 12'hB05
|
||||||
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
|
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
|
||||||
`define VX_CSR_MPM_SCRB_ST 12'hB06
|
`define VX_CSR_MPM_SCRB_ST 12'hB06
|
||||||
@@ -101,10 +101,10 @@
|
|||||||
`define VX_CSR_MPM_LOADS_H 12'hB90
|
`define VX_CSR_MPM_LOADS_H 12'hB90
|
||||||
`define VX_CSR_MPM_STORES 12'hB11
|
`define VX_CSR_MPM_STORES 12'hB11
|
||||||
`define VX_CSR_MPM_STORES_H 12'hB91
|
`define VX_CSR_MPM_STORES_H 12'hB91
|
||||||
`define VX_CSR_MPM_IFETCH_LAT 12'hB12
|
`define VX_CSR_MPM_IFETCH_LT 12'hB12
|
||||||
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB92
|
`define VX_CSR_MPM_IFETCH_LT_H 12'hB92
|
||||||
`define VX_CSR_MPM_LOAD_LAT 12'hB13
|
`define VX_CSR_MPM_LOAD_LT 12'hB13
|
||||||
`define VX_CSR_MPM_LOAD_LAT_H 12'hB93
|
`define VX_CSR_MPM_LOAD_LT_H 12'hB93
|
||||||
|
|
||||||
// Machine Performance-monitoring memory counters
|
// Machine Performance-monitoring memory counters
|
||||||
// PERF: icache
|
// PERF: icache
|
||||||
@@ -158,8 +158,8 @@
|
|||||||
`define VX_CSR_MPM_MEM_READS_H 12'hB98
|
`define VX_CSR_MPM_MEM_READS_H 12'hB98
|
||||||
`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
|
`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
|
||||||
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
|
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
|
||||||
`define VX_CSR_MPM_MEM_LAT 12'hB1A // memory latency
|
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
|
||||||
`define VX_CSR_MPM_MEM_LAT_H 12'hB9A
|
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
|
||||||
// PERF: smem
|
// PERF: smem
|
||||||
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
|
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
|
||||||
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B
|
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B
|
||||||
|
|||||||
@@ -49,12 +49,12 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
cache_perf_t perf_l3cache;
|
cache_perf_t perf_l3cache;
|
||||||
mem_perf_t mem_perf;
|
mem_perf_t mem_perf;
|
||||||
|
|
||||||
assign mem_perf_if.icache = 'x;
|
assign mem_perf_if.smem = 'x;
|
||||||
assign mem_perf_if.dcache = 'x;
|
assign mem_perf_if.icache = 'x;
|
||||||
|
assign mem_perf_if.dcache = 'x;
|
||||||
assign mem_perf_if.l2cache = 'x;
|
assign mem_perf_if.l2cache = 'x;
|
||||||
assign mem_perf_if.l3cache = perf_l3cache;
|
assign mem_perf_if.l3cache = perf_l3cache;
|
||||||
assign mem_perf_if.smem = 'x;
|
assign mem_perf_if.mem = mem_perf;
|
||||||
assign mem_perf_if.mem = mem_perf;
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
|
|||||||
@@ -130,6 +130,12 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if mem_perf_if();
|
VX_mem_perf_if mem_perf_if();
|
||||||
|
assign mem_perf_if.smem = '0;
|
||||||
|
assign mem_perf_if.icache = '0;
|
||||||
|
assign mem_perf_if.dcache = '0;
|
||||||
|
assign mem_perf_if.l2cache = '0;
|
||||||
|
assign mem_perf_if.l3cache = '0;
|
||||||
|
assign mem_perf_if.mem = '0;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifdef SCOPE
|
`ifdef SCOPE
|
||||||
|
|||||||
@@ -186,11 +186,11 @@ import VX_fpu_pkg::*;
|
|||||||
case (base_dcrs.mpm_class)
|
case (base_dcrs.mpm_class)
|
||||||
`VX_DCR_MPM_CLASS_CORE: begin
|
`VX_DCR_MPM_CLASS_CORE: begin
|
||||||
case (read_addr)
|
case (read_addr)
|
||||||
// PERF: pipeline
|
// PERF: pipeline
|
||||||
|
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
|
||||||
|
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
||||||
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0];
|
|
||||||
`VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]);
|
|
||||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||||
@@ -228,10 +228,10 @@ import VX_fpu_pkg::*;
|
|||||||
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
||||||
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||||
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||||
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||||
default:;
|
default:;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
@@ -295,8 +295,8 @@ import VX_fpu_pkg::*;
|
|||||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||||
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||||
default:;
|
default:;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -70,8 +70,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
batch_idx <= '0;
|
batch_idx <= '0;
|
||||||
end else if (batch_done) begin
|
end else begin
|
||||||
batch_idx <= batch_idx + BATCH_COUNT_W'(1);
|
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end else begin
|
end else begin
|
||||||
|
|||||||
@@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
VX_stream_arb #(
|
VX_stream_arb #(
|
||||||
.NUM_INPUTS (2),
|
.NUM_INPUTS (2),
|
||||||
.DATAW (RSP_ARB_DATAW),
|
.DATAW (RSP_ARB_DATAW),
|
||||||
.OUT_REG (1)
|
.OUT_REG (2)
|
||||||
) rsp_arb (
|
) rsp_arb (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (commit_reset),
|
.reset (commit_reset),
|
||||||
|
|||||||
@@ -381,23 +381,24 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||||||
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
|
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls;
|
|
||||||
|
|
||||||
|
wire schedule_idle = ~schedule_valid;
|
||||||
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
|
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
perf_sched_stalls <= '0;
|
perf_sched_idles <= '0;
|
||||||
perf_fetch_stalls <= '0;
|
perf_sched_stalls <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(~schedule_valid);
|
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
|
||||||
perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_stall);
|
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
assign perf_schedule_if.sched_idles = perf_sched_idles;
|
||||||
assign perf_schedule_if.fetch_stalls = perf_fetch_stalls;
|
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -355,11 +355,14 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
|
wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||||
fflags_t i2f_regular_status_s3 = i2f_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
|
fflags_t i2f_regular_status_s3, f2i_regular_status_s3;
|
||||||
fflags_t f2i_regular_status_s3 = f2i_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
|
fflags_t i2f_status_s3, f2i_status_s3;
|
||||||
|
|
||||||
fflags_t i2f_status_s3 = i2f_regular_status_s3;
|
assign i2f_regular_status_s3 = {4'h0, i2f_round_has_sticky_s3[i]};
|
||||||
fflags_t f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
|
assign f2i_regular_status_s3 = {4'h0, f2i_round_has_sticky_s3[i]};
|
||||||
|
|
||||||
|
assign i2f_status_s3 = i2f_regular_status_s3;
|
||||||
|
assign f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
|
||||||
|
|
||||||
wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
|
wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
|
||||||
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
|
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
|
||||||
|
|||||||
@@ -14,8 +14,8 @@
|
|||||||
`include "VX_define.vh"
|
`include "VX_define.vh"
|
||||||
|
|
||||||
interface VX_pipeline_perf_if ();
|
interface VX_pipeline_perf_if ();
|
||||||
|
wire [`PERF_CTR_BITS-1:0] sched_idles;
|
||||||
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] fetch_stalls;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
|
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
|
||||||
@@ -28,8 +28,8 @@ interface VX_pipeline_perf_if ();
|
|||||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||||
|
|
||||||
modport schedule (
|
modport schedule (
|
||||||
output sched_stalls,
|
output sched_idles,
|
||||||
output fetch_stalls
|
output sched_stalls
|
||||||
);
|
);
|
||||||
|
|
||||||
modport issue (
|
modport issue (
|
||||||
@@ -40,8 +40,8 @@ interface VX_pipeline_perf_if ();
|
|||||||
);
|
);
|
||||||
|
|
||||||
modport slave (
|
modport slave (
|
||||||
|
input sched_idles,
|
||||||
input sched_stalls,
|
input sched_stalls,
|
||||||
input fetch_stalls,
|
|
||||||
input ibf_stalls,
|
input ibf_stalls,
|
||||||
input scb_stalls,
|
input scb_stalls,
|
||||||
input scb_uses,
|
input scb_uses,
|
||||||
|
|||||||
@@ -201,9 +201,7 @@ module VX_fifo_queue #(
|
|||||||
rd_ptr_r <= '0;
|
rd_ptr_r <= '0;
|
||||||
rd_ptr_n_r <= 1;
|
rd_ptr_n_r <= 1;
|
||||||
end else begin
|
end else begin
|
||||||
if (push) begin
|
wr_ptr_r <= wr_ptr_r + ADDRW'(push);
|
||||||
wr_ptr_r <= wr_ptr_r + ADDRW'(1);
|
|
||||||
end
|
|
||||||
if (pop) begin
|
if (pop) begin
|
||||||
rd_ptr_r <= rd_ptr_n_r;
|
rd_ptr_r <= rd_ptr_n_r;
|
||||||
if (DEPTH > 2) begin
|
if (DEPTH > 2) begin
|
||||||
|
|||||||
@@ -175,8 +175,9 @@ static uint64_t get_csr_64(const void* ptr, int addr) {
|
|||||||
extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
uint64_t instrs = 0;
|
uint64_t total_instrs = 0;
|
||||||
uint64_t cycles = 0;
|
uint64_t total_cycles = 0;
|
||||||
|
uint64_t max_cycles = 0;
|
||||||
|
|
||||||
#ifdef PERF_ENABLE
|
#ifdef PERF_ENABLE
|
||||||
|
|
||||||
@@ -199,8 +200,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
auto perf_class = gAutoPerfDump.get_perf_class();
|
auto perf_class = gAutoPerfDump.get_perf_class();
|
||||||
|
|
||||||
// PERF: pipeline stalls
|
// PERF: pipeline stalls
|
||||||
uint64_t scheduler_stalls = 0;
|
uint64_t sched_idles = 0;
|
||||||
uint64_t fetch_stalls = 0;
|
uint64_t sched_stalls = 0;
|
||||||
uint64_t ibuffer_stalls = 0;
|
uint64_t ibuffer_stalls = 0;
|
||||||
uint64_t scrb_stalls = 0;
|
uint64_t scrb_stalls = 0;
|
||||||
uint64_t lsu_stalls = 0;
|
uint64_t lsu_stalls = 0;
|
||||||
@@ -269,19 +270,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
switch (perf_class) {
|
switch (perf_class) {
|
||||||
case VX_DCR_MPM_CLASS_CORE: {
|
case VX_DCR_MPM_CLASS_CORE: {
|
||||||
// PERF: pipeline
|
// PERF: pipeline
|
||||||
// schedule stalls
|
// scheduler idles
|
||||||
{
|
{
|
||||||
uint64_t scheduler_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
|
uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID);
|
||||||
int scheduler_percent_per_core = calcAvgPercent(scheduler_stalls_per_core, cycles_per_core);
|
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: schedule stalls=%ld (%d%%)\n", core_id, scheduler_stalls_per_core, scheduler_percent_per_core);
|
if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler idles=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
|
||||||
scheduler_stalls += scheduler_stalls_per_core;
|
sched_idles += sched_idles_per_core;
|
||||||
}
|
}
|
||||||
// fetch stalls
|
// scheduler stalls
|
||||||
{
|
{
|
||||||
uint64_t fetch_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FETCH_ST);
|
uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
|
||||||
int fetch_percent_per_core = calcAvgPercent(fetch_stalls_per_core, cycles_per_core);
|
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch stalls=%ld (%d%%)\n", core_id, fetch_stalls_per_core, fetch_percent_per_core);
|
if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
|
||||||
fetch_stalls += fetch_stalls_per_core;
|
sched_stalls += sched_stalls_per_core;
|
||||||
}
|
}
|
||||||
// ibuffer_stalls
|
// ibuffer_stalls
|
||||||
{
|
{
|
||||||
@@ -340,7 +341,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
|
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
|
||||||
ifetches += ifetches_per_core;
|
ifetches += ifetches_per_core;
|
||||||
|
|
||||||
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
|
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT);
|
||||||
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
|
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
||||||
ifetch_lat += ifetch_lat_per_core;
|
ifetch_lat += ifetch_lat_per_core;
|
||||||
@@ -351,7 +352,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
||||||
loads += loads_per_core;
|
loads += loads_per_core;
|
||||||
|
|
||||||
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
|
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT);
|
||||||
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
|
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
||||||
load_lat += load_lat_per_core;
|
load_lat += load_lat_per_core;
|
||||||
@@ -431,7 +432,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
// PERF: memory
|
// PERF: memory
|
||||||
mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
|
mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
|
||||||
mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES);
|
mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES);
|
||||||
mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LAT);
|
mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LT);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
@@ -441,21 +442,22 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
|
|
||||||
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
|
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
|
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
|
||||||
instrs += instrs_per_core;
|
total_instrs += instrs_per_core;
|
||||||
cycles = std::max<uint64_t>(cycles_per_core, cycles);
|
total_cycles += cycles_per_core;
|
||||||
|
max_cycles = std::max<uint64_t>(cycles_per_core, max_cycles);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef PERF_ENABLE
|
#ifdef PERF_ENABLE
|
||||||
switch (perf_class) {
|
switch (perf_class) {
|
||||||
case VX_DCR_MPM_CLASS_CORE: {
|
case VX_DCR_MPM_CLASS_CORE: {
|
||||||
int scheduler_percent = calcAvgPercent(scheduler_stalls, cycles);
|
int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles);
|
||||||
int fetch_percent = calcAvgPercent(fetch_stalls, cycles);
|
int sched_stalls_percent = calcAvgPercent(sched_stalls, total_cycles);
|
||||||
int ibuffer_percent = calcAvgPercent(ibuffer_stalls, cycles);
|
int ibuffer_percent = calcAvgPercent(ibuffer_stalls, total_cycles);
|
||||||
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
|
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
|
||||||
int load_avg_lat = (int)(double(load_lat) / double(loads));
|
int load_avg_lat = (int)(double(load_lat) / double(loads));
|
||||||
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
|
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
|
||||||
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", scheduler_stalls, scheduler_percent);
|
fprintf(stream, "PERF: scheduler idles=%ld (%d%%)\n", sched_idles, sched_idles_percent);
|
||||||
fprintf(stream, "PERF: fetch stalls=%ld (%d%%)\n", fetch_stalls, fetch_percent);
|
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
|
||||||
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
||||||
fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
|
fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
|
||||||
calcAvgPercent(scrb_alu, scrb_total),
|
calcAvgPercent(scrb_alu, scrb_total),
|
||||||
@@ -514,8 +516,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
float IPC = (float)(double(instrs) / double(cycles));
|
float IPC = (float)(double(total_instrs) / double(max_cycles));
|
||||||
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
|
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC);
|
||||||
|
|
||||||
fflush(stream);
|
fflush(stream);
|
||||||
|
|
||||||
|
|||||||
@@ -167,7 +167,7 @@ void Core::schedule() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (scheduled_warp == -1) {
|
if (scheduled_warp == -1) {
|
||||||
++perf_stats_.sched_stalls;
|
++perf_stats_.sched_idles;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -548,10 +548,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
break;
|
break;
|
||||||
case VX_DCR_MPM_CLASS_CORE: {
|
case VX_DCR_MPM_CLASS_CORE: {
|
||||||
switch (addr) {
|
switch (addr) {
|
||||||
|
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idles & 0xffffffff;
|
||||||
|
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idles >> 32;
|
||||||
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
|
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
|
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
|
||||||
case VX_CSR_MPM_FETCH_ST: return perf_stats_.fetch_stalls & 0xffffffff;
|
|
||||||
case VX_CSR_MPM_FETCH_ST_H:return perf_stats_.fetch_stalls >> 32;
|
|
||||||
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
|
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
|
||||||
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
|
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
|
||||||
@@ -579,10 +579,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
|
case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
|
||||||
case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
|
case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
|
||||||
case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
|
case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
|
||||||
case VX_CSR_MPM_IFETCH_LAT: return perf_stats_.ifetch_latency & 0xffffffff;
|
case VX_CSR_MPM_IFETCH_LT: return perf_stats_.ifetch_latency & 0xffffffff;
|
||||||
case VX_CSR_MPM_IFETCH_LAT_H: return perf_stats_.ifetch_latency >> 32;
|
case VX_CSR_MPM_IFETCH_LT_H: return perf_stats_.ifetch_latency >> 32;
|
||||||
case VX_CSR_MPM_LOAD_LAT: return perf_stats_.load_latency & 0xffffffff;
|
case VX_CSR_MPM_LOAD_LT: return perf_stats_.load_latency & 0xffffffff;
|
||||||
case VX_CSR_MPM_LOAD_LAT_H: return perf_stats_.load_latency >> 32;
|
case VX_CSR_MPM_LOAD_LT_H: return perf_stats_.load_latency >> 32;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case VX_DCR_MPM_CLASS_MEM: {
|
case VX_DCR_MPM_CLASS_MEM: {
|
||||||
@@ -638,8 +638,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
||||||
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
||||||
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
|
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
|
||||||
case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff;
|
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
|
||||||
case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32;
|
case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
|
||||||
|
|
||||||
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
|
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
|
||||||
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
|
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
|
||||||
|
|||||||
@@ -49,8 +49,8 @@ public:
|
|||||||
struct PerfStats {
|
struct PerfStats {
|
||||||
uint64_t cycles;
|
uint64_t cycles;
|
||||||
uint64_t instrs;
|
uint64_t instrs;
|
||||||
|
uint64_t sched_idles;
|
||||||
uint64_t sched_stalls;
|
uint64_t sched_stalls;
|
||||||
uint64_t fetch_stalls;
|
|
||||||
uint64_t ibuf_stalls;
|
uint64_t ibuf_stalls;
|
||||||
uint64_t scrb_stalls;
|
uint64_t scrb_stalls;
|
||||||
uint64_t alu_stalls;
|
uint64_t alu_stalls;
|
||||||
@@ -70,8 +70,8 @@ public:
|
|||||||
PerfStats()
|
PerfStats()
|
||||||
: cycles(0)
|
: cycles(0)
|
||||||
, instrs(0)
|
, instrs(0)
|
||||||
|
, sched_idles(0)
|
||||||
, sched_stalls(0)
|
, sched_stalls(0)
|
||||||
, fetch_stalls(0)
|
|
||||||
, ibuf_stalls(0)
|
, ibuf_stalls(0)
|
||||||
, scrb_stalls(0)
|
, scrb_stalls(0)
|
||||||
, alu_stalls(0)
|
, alu_stalls(0)
|
||||||
|
|||||||
@@ -9,16 +9,16 @@ all:
|
|||||||
$(MAKE) -C dotproduct
|
$(MAKE) -C dotproduct
|
||||||
$(MAKE) -C kmeans
|
$(MAKE) -C kmeans
|
||||||
$(MAKE) -C spmv
|
$(MAKE) -C spmv
|
||||||
$(MAKE) -C transpose
|
|
||||||
$(MAKE) -C cutcp
|
|
||||||
$(MAKE) -C vectorhypot
|
|
||||||
$(MAKE) -C stencil
|
$(MAKE) -C stencil
|
||||||
$(MAKE) -C mri-q
|
|
||||||
$(MAKE) -C lbm
|
$(MAKE) -C lbm
|
||||||
$(MAKE) -C oclprintf
|
$(MAKE) -C oclprintf
|
||||||
$(MAKE) -C blackscholes
|
$(MAKE) -C blackscholes
|
||||||
$(MAKE) -C sgemm2
|
$(MAKE) -C transpose
|
||||||
$(MAKE) -C convolution
|
$(MAKE) -C convolution
|
||||||
|
# $(MAKE) -C cutcp
|
||||||
|
# $(MAKE) -C sgemm2
|
||||||
|
# $(MAKE) -C vectorhypot
|
||||||
|
# $(MAKE) -C mri-q run-simx
|
||||||
|
|
||||||
run-simx:
|
run-simx:
|
||||||
$(MAKE) -C vecadd run-simx
|
$(MAKE) -C vecadd run-simx
|
||||||
@@ -37,10 +37,10 @@ run-simx:
|
|||||||
$(MAKE) -C blackscholes run-simx
|
$(MAKE) -C blackscholes run-simx
|
||||||
$(MAKE) -C transpose run-simx
|
$(MAKE) -C transpose run-simx
|
||||||
$(MAKE) -C convolution run-simx
|
$(MAKE) -C convolution run-simx
|
||||||
$(MAKE) -C cutcp run-simx
|
# $(MAKE) -C cutcp run-simx
|
||||||
$(MAKE) -C vectorhypot run-simx
|
|
||||||
$(MAKE) -C mri-q run-simx
|
|
||||||
# $(MAKE) -C sgemm2 run-simx
|
# $(MAKE) -C sgemm2 run-simx
|
||||||
|
# $(MAKE) -C vectorhypot run-simx
|
||||||
|
# $(MAKE) -C mri-q run-simx
|
||||||
|
|
||||||
run-rtlsim:
|
run-rtlsim:
|
||||||
$(MAKE) -C vecadd run-rtlsim
|
$(MAKE) -C vecadd run-rtlsim
|
||||||
@@ -98,15 +98,15 @@ clean:
|
|||||||
$(MAKE) -C kmeans clean
|
$(MAKE) -C kmeans clean
|
||||||
$(MAKE) -C spmv clean
|
$(MAKE) -C spmv clean
|
||||||
$(MAKE) -C transpose clean
|
$(MAKE) -C transpose clean
|
||||||
$(MAKE) -C cutcp clean
|
|
||||||
$(MAKE) -C vectorhypot clean
|
|
||||||
$(MAKE) -C stencil clean
|
$(MAKE) -C stencil clean
|
||||||
$(MAKE) -C mri-q clean
|
|
||||||
$(MAKE) -C lbm clean
|
$(MAKE) -C lbm clean
|
||||||
$(MAKE) -C oclprintf clean
|
$(MAKE) -C oclprintf clean
|
||||||
$(MAKE) -C blackscholes clean
|
$(MAKE) -C blackscholes clean
|
||||||
$(MAKE) -C sgemm2 clean
|
|
||||||
$(MAKE) -C convolution clean
|
$(MAKE) -C convolution clean
|
||||||
|
# $(MAKE) -C cutcp clean
|
||||||
|
# $(MAKE) -C sgemm2 clean
|
||||||
|
# $(MAKE) -C vectorhypot clean
|
||||||
|
# $(MAKE) -C mri-q clean
|
||||||
|
|
||||||
clean-all:
|
clean-all:
|
||||||
$(MAKE) -C vecadd clean-all
|
$(MAKE) -C vecadd clean-all
|
||||||
@@ -114,19 +114,18 @@ clean-all:
|
|||||||
$(MAKE) -C psort clean-all
|
$(MAKE) -C psort clean-all
|
||||||
$(MAKE) -C saxpy clean-all
|
$(MAKE) -C saxpy clean-all
|
||||||
$(MAKE) -C sfilter clean-all
|
$(MAKE) -C sfilter clean-all
|
||||||
$(MAKE) -C sfilter clean-all
|
|
||||||
$(MAKE) -C nearn clean-all
|
$(MAKE) -C nearn clean-all
|
||||||
$(MAKE) -C guassian clean-all
|
$(MAKE) -C guassian clean-all
|
||||||
$(MAKE) -C dotproduct clean-all
|
$(MAKE) -C dotproduct clean-all
|
||||||
$(MAKE) -C kmeans clean-all
|
$(MAKE) -C kmeans clean-all
|
||||||
$(MAKE) -C spmv clean-all
|
$(MAKE) -C spmv clean-all
|
||||||
$(MAKE) -C transpose clean-all
|
$(MAKE) -C transpose clean-all
|
||||||
$(MAKE) -C cutcp clean-all
|
|
||||||
$(MAKE) -C vectorhypot clean-all
|
|
||||||
$(MAKE) -C stencil clean-all
|
$(MAKE) -C stencil clean-all
|
||||||
$(MAKE) -C mri-q clean-all
|
|
||||||
$(MAKE) -C lbm clean-all
|
$(MAKE) -C lbm clean-all
|
||||||
$(MAKE) -C oclprintf clean-all
|
$(MAKE) -C oclprintf clean-all
|
||||||
$(MAKE) -C blackscholes clean-all
|
$(MAKE) -C blackscholes clean-all
|
||||||
$(MAKE) -C sgemm2 clean-all
|
|
||||||
$(MAKE) -C convolution clean-all
|
$(MAKE) -C convolution clean-all
|
||||||
|
# $(MAKE) -C cutcp clean-all
|
||||||
|
# $(MAKE) -C sgemm2 clean-all
|
||||||
|
# $(MAKE) -C vectorhypot clean-all
|
||||||
|
# $(MAKE) -C mri-q clean-all
|
||||||
|
|||||||
Reference in New Issue
Block a user