profiling update
minor updates
This commit is contained in:
@@ -217,7 +217,7 @@ package VX_gpu_pkg;
|
||||
function logic [ISSUE_WIS_W-1:0] wid_to_wis(
|
||||
input logic [`NW_WIDTH-1:0] wid
|
||||
);
|
||||
wid_to_wis = ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH));
|
||||
wid_to_wis = ISSUE_WIS_W'({1'b0, wid} >> `CLOG2(`ISSUE_WIDTH));
|
||||
endfunction
|
||||
|
||||
function logic [ISSUE_ADDRW-1:0] wis_to_addr(
|
||||
|
||||
@@ -70,10 +70,10 @@
|
||||
`define VX_CSR_MINSTRET 12'hB02
|
||||
`define VX_CSR_MINSTRET_H 12'hB82
|
||||
// PERF: pipeline
|
||||
`define VX_CSR_MPM_SCHED_ST 12'hB03
|
||||
`define VX_CSR_MPM_SCHED_ST_H 12'hB83
|
||||
`define VX_CSR_MPM_FETCH_ST 12'hB04
|
||||
`define VX_CSR_MPM_FETCH_ST_H 12'hB84
|
||||
`define VX_CSR_MPM_SCHED_ID 12'hB03
|
||||
`define VX_CSR_MPM_SCHED_ID_H 12'hB83
|
||||
`define VX_CSR_MPM_SCHED_ST 12'hB04
|
||||
`define VX_CSR_MPM_SCHED_ST_H 12'hB84
|
||||
`define VX_CSR_MPM_IBUF_ST 12'hB05
|
||||
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
|
||||
`define VX_CSR_MPM_SCRB_ST 12'hB06
|
||||
@@ -101,10 +101,10 @@
|
||||
`define VX_CSR_MPM_LOADS_H 12'hB90
|
||||
`define VX_CSR_MPM_STORES 12'hB11
|
||||
`define VX_CSR_MPM_STORES_H 12'hB91
|
||||
`define VX_CSR_MPM_IFETCH_LAT 12'hB12
|
||||
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB92
|
||||
`define VX_CSR_MPM_LOAD_LAT 12'hB13
|
||||
`define VX_CSR_MPM_LOAD_LAT_H 12'hB93
|
||||
`define VX_CSR_MPM_IFETCH_LT 12'hB12
|
||||
`define VX_CSR_MPM_IFETCH_LT_H 12'hB92
|
||||
`define VX_CSR_MPM_LOAD_LT 12'hB13
|
||||
`define VX_CSR_MPM_LOAD_LT_H 12'hB93
|
||||
|
||||
// Machine Performance-monitoring memory counters
|
||||
// PERF: icache
|
||||
@@ -158,8 +158,8 @@
|
||||
`define VX_CSR_MPM_MEM_READS_H 12'hB98
|
||||
`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
|
||||
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
|
||||
`define VX_CSR_MPM_MEM_LAT 12'hB1A // memory latency
|
||||
`define VX_CSR_MPM_MEM_LAT_H 12'hB9A
|
||||
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
|
||||
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
|
||||
// PERF: smem
|
||||
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
|
||||
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B
|
||||
|
||||
@@ -49,12 +49,12 @@ module Vortex import VX_gpu_pkg::*; (
|
||||
cache_perf_t perf_l3cache;
|
||||
mem_perf_t mem_perf;
|
||||
|
||||
assign mem_perf_if.icache = 'x;
|
||||
assign mem_perf_if.dcache = 'x;
|
||||
assign mem_perf_if.smem = 'x;
|
||||
assign mem_perf_if.icache = 'x;
|
||||
assign mem_perf_if.dcache = 'x;
|
||||
assign mem_perf_if.l2cache = 'x;
|
||||
assign mem_perf_if.l3cache = perf_l3cache;
|
||||
assign mem_perf_if.smem = 'x;
|
||||
assign mem_perf_if.mem = mem_perf;
|
||||
assign mem_perf_if.mem = mem_perf;
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
|
||||
@@ -130,6 +130,12 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.smem = '0;
|
||||
assign mem_perf_if.icache = '0;
|
||||
assign mem_perf_if.dcache = '0;
|
||||
assign mem_perf_if.l2cache = '0;
|
||||
assign mem_perf_if.l3cache = '0;
|
||||
assign mem_perf_if.mem = '0;
|
||||
`endif
|
||||
|
||||
`ifdef SCOPE
|
||||
|
||||
@@ -186,11 +186,11 @@ import VX_fpu_pkg::*;
|
||||
case (base_dcrs.mpm_class)
|
||||
`VX_DCR_MPM_CLASS_CORE: begin
|
||||
case (read_addr)
|
||||
// PERF: pipeline
|
||||
// PERF: pipeline
|
||||
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
|
||||
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
||||
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0];
|
||||
`VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||
@@ -228,10 +228,10 @@ import VX_fpu_pkg::*;
|
||||
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
||||
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
@@ -295,8 +295,8 @@ import VX_fpu_pkg::*;
|
||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
||||
@@ -70,8 +70,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
batch_idx <= '0;
|
||||
end else if (batch_done) begin
|
||||
batch_idx <= batch_idx + BATCH_COUNT_W'(1);
|
||||
end else begin
|
||||
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
|
||||
end
|
||||
end
|
||||
end else begin
|
||||
|
||||
@@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (RSP_ARB_DATAW),
|
||||
.OUT_REG (1)
|
||||
.OUT_REG (2)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
|
||||
@@ -381,23 +381,24 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls;
|
||||
|
||||
wire schedule_idle = ~schedule_valid;
|
||||
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_sched_stalls <= '0;
|
||||
perf_fetch_stalls <= '0;
|
||||
perf_sched_idles <= '0;
|
||||
perf_sched_stalls <= '0;
|
||||
end else begin
|
||||
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(~schedule_valid);
|
||||
perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_stall);
|
||||
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
|
||||
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||
assign perf_schedule_if.fetch_stalls = perf_fetch_stalls;
|
||||
assign perf_schedule_if.sched_idles = perf_sched_idles;
|
||||
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -355,11 +355,14 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
fflags_t i2f_regular_status_s3 = i2f_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
|
||||
fflags_t f2i_regular_status_s3 = f2i_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
|
||||
fflags_t i2f_regular_status_s3, f2i_regular_status_s3;
|
||||
fflags_t i2f_status_s3, f2i_status_s3;
|
||||
|
||||
fflags_t i2f_status_s3 = i2f_regular_status_s3;
|
||||
fflags_t f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
|
||||
assign i2f_regular_status_s3 = {4'h0, i2f_round_has_sticky_s3[i]};
|
||||
assign f2i_regular_status_s3 = {4'h0, f2i_round_has_sticky_s3[i]};
|
||||
|
||||
assign i2f_status_s3 = i2f_regular_status_s3;
|
||||
assign f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
|
||||
|
||||
wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
|
||||
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
|
||||
|
||||
@@ -14,8 +14,8 @@
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_pipeline_perf_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] sched_idles;
|
||||
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] fetch_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
|
||||
@@ -28,8 +28,8 @@ interface VX_pipeline_perf_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||
|
||||
modport schedule (
|
||||
output sched_stalls,
|
||||
output fetch_stalls
|
||||
output sched_idles,
|
||||
output sched_stalls
|
||||
);
|
||||
|
||||
modport issue (
|
||||
@@ -40,8 +40,8 @@ interface VX_pipeline_perf_if ();
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input sched_idles,
|
||||
input sched_stalls,
|
||||
input fetch_stalls,
|
||||
input ibf_stalls,
|
||||
input scb_stalls,
|
||||
input scb_uses,
|
||||
|
||||
@@ -201,9 +201,7 @@ module VX_fifo_queue #(
|
||||
rd_ptr_r <= '0;
|
||||
rd_ptr_n_r <= 1;
|
||||
end else begin
|
||||
if (push) begin
|
||||
wr_ptr_r <= wr_ptr_r + ADDRW'(1);
|
||||
end
|
||||
wr_ptr_r <= wr_ptr_r + ADDRW'(push);
|
||||
if (pop) begin
|
||||
rd_ptr_r <= rd_ptr_n_r;
|
||||
if (DEPTH > 2) begin
|
||||
|
||||
Reference in New Issue
Block a user