diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index cdb48db4..4ece6c9c 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -217,7 +217,7 @@ package VX_gpu_pkg; function logic [ISSUE_WIS_W-1:0] wid_to_wis( input logic [`NW_WIDTH-1:0] wid ); - wid_to_wis = ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH)); + wid_to_wis = ISSUE_WIS_W'({1'b0, wid} >> `CLOG2(`ISSUE_WIDTH)); endfunction function logic [ISSUE_ADDRW-1:0] wis_to_addr( diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 80a4a7d7..4fb03783 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -70,10 +70,10 @@ `define VX_CSR_MINSTRET 12'hB02 `define VX_CSR_MINSTRET_H 12'hB82 // PERF: pipeline -`define VX_CSR_MPM_SCHED_ST 12'hB03 -`define VX_CSR_MPM_SCHED_ST_H 12'hB83 -`define VX_CSR_MPM_FETCH_ST 12'hB04 -`define VX_CSR_MPM_FETCH_ST_H 12'hB84 +`define VX_CSR_MPM_SCHED_ID 12'hB03 +`define VX_CSR_MPM_SCHED_ID_H 12'hB83 +`define VX_CSR_MPM_SCHED_ST 12'hB04 +`define VX_CSR_MPM_SCHED_ST_H 12'hB84 `define VX_CSR_MPM_IBUF_ST 12'hB05 `define VX_CSR_MPM_IBUF_ST_H 12'hB85 `define VX_CSR_MPM_SCRB_ST 12'hB06 @@ -101,10 +101,10 @@ `define VX_CSR_MPM_LOADS_H 12'hB90 `define VX_CSR_MPM_STORES 12'hB11 `define VX_CSR_MPM_STORES_H 12'hB91 -`define VX_CSR_MPM_IFETCH_LAT 12'hB12 -`define VX_CSR_MPM_IFETCH_LAT_H 12'hB92 -`define VX_CSR_MPM_LOAD_LAT 12'hB13 -`define VX_CSR_MPM_LOAD_LAT_H 12'hB93 +`define VX_CSR_MPM_IFETCH_LT 12'hB12 +`define VX_CSR_MPM_IFETCH_LT_H 12'hB92 +`define VX_CSR_MPM_LOAD_LT 12'hB13 +`define VX_CSR_MPM_LOAD_LT_H 12'hB93 // Machine Performance-monitoring memory counters // PERF: icache @@ -158,8 +158,8 @@ `define VX_CSR_MPM_MEM_READS_H 12'hB98 `define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes `define VX_CSR_MPM_MEM_WRITES_H 12'hB99 -`define VX_CSR_MPM_MEM_LAT 12'hB1A // memory latency -`define VX_CSR_MPM_MEM_LAT_H 12'hB9A +`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency +`define VX_CSR_MPM_MEM_LT_H 12'hB9A // PERF: smem `define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads `define VX_CSR_MPM_SMEM_READS_H 12'hB9B diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index b29f0802..e9d068f7 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -49,12 +49,12 @@ module Vortex import VX_gpu_pkg::*; ( cache_perf_t perf_l3cache; mem_perf_t mem_perf; - assign mem_perf_if.icache = 'x; - assign mem_perf_if.dcache = 'x; + assign mem_perf_if.smem = 'x; + assign mem_perf_if.icache = 'x; + assign mem_perf_if.dcache = 'x; assign mem_perf_if.l2cache = 'x; assign mem_perf_if.l3cache = perf_l3cache; - assign mem_perf_if.smem = 'x; - assign mem_perf_if.mem = mem_perf; + assign mem_perf_if.mem = mem_perf; `endif VX_mem_bus_if #( diff --git a/hw/rtl/core/VX_core_top.sv b/hw/rtl/core/VX_core_top.sv index 8d126f96..6ecd4772 100644 --- a/hw/rtl/core/VX_core_top.sv +++ b/hw/rtl/core/VX_core_top.sv @@ -130,6 +130,12 @@ module VX_core_top import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE VX_mem_perf_if mem_perf_if(); + assign mem_perf_if.smem = '0; + assign mem_perf_if.icache = '0; + assign mem_perf_if.dcache = '0; + assign mem_perf_if.l2cache = '0; + assign mem_perf_if.l3cache = '0; + assign mem_perf_if.mem = '0; `endif `ifdef SCOPE diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index 44e997ff..6d7c41f8 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -186,11 +186,11 @@ import VX_fpu_pkg::*; case (base_dcrs.mpm_class) `VX_DCR_MPM_CLASS_CORE: begin case (read_addr) - // PERF: pipeline + // PERF: pipeline + `VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0]; + `VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0]; - `VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0]; - `VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0]; `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0]; @@ -228,10 +228,10 @@ import VX_fpu_pkg::*; `VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0]; `VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0]; - `VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0]; - `VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0]; + `VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0]; + `VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]); default:; endcase end @@ -295,8 +295,8 @@ import VX_fpu_pkg::*; `VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0]; `VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0]; - `VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0]; + `VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]); default:; endcase end diff --git a/hw/rtl/core/VX_dispatch_unit.sv b/hw/rtl/core/VX_dispatch_unit.sv index 86564187..586acc0b 100644 --- a/hw/rtl/core/VX_dispatch_unit.sv +++ b/hw/rtl/core/VX_dispatch_unit.sv @@ -70,8 +70,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( always @(posedge clk) begin if (reset) begin batch_idx <= '0; - end else if (batch_done) begin - batch_idx <= batch_idx + BATCH_COUNT_W'(1); + end else begin + batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done); end end end else begin diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index b939b081..3383f70f 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( VX_stream_arb #( .NUM_INPUTS (2), .DATAW (RSP_ARB_DATAW), - .OUT_REG (1) + .OUT_REG (2) ) rsp_arb ( .clk (clk), .reset (commit_reset), diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index f11e4324..4f74af36 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -381,23 +381,24 @@ module VX_schedule import VX_gpu_pkg::*; #( `RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps)); `ifdef PERF_ENABLE + reg [`PERF_CTR_BITS-1:0] perf_sched_idles; reg [`PERF_CTR_BITS-1:0] perf_sched_stalls; - reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls; + wire schedule_idle = ~schedule_valid; wire schedule_stall = schedule_if.valid && ~schedule_if.ready; always @(posedge clk) begin if (reset) begin - perf_sched_stalls <= '0; - perf_fetch_stalls <= '0; + perf_sched_idles <= '0; + perf_sched_stalls <= '0; end else begin - perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(~schedule_valid); - perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_stall); + perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle); + perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall); end end - assign perf_schedule_if.sched_stalls = perf_sched_stalls; - assign perf_schedule_if.fetch_stalls = perf_fetch_stalls; + assign perf_schedule_if.sched_idles = perf_sched_idles; + assign perf_schedule_if.sched_stalls = perf_sched_stalls; `endif endmodule diff --git a/hw/rtl/fpu/VX_fpu_cvt.sv b/hw/rtl/fpu/VX_fpu_cvt.sv index e12e51ad..7ba6330e 100644 --- a/hw/rtl/fpu/VX_fpu_cvt.sv +++ b/hw/rtl/fpu/VX_fpu_cvt.sv @@ -355,11 +355,14 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3; for (genvar i = 0; i < NUM_LANES; ++i) begin - fflags_t i2f_regular_status_s3 = i2f_round_has_sticky_s3[i] ? 5'h1 : 5'h0; - fflags_t f2i_regular_status_s3 = f2i_round_has_sticky_s3[i] ? 5'h1 : 5'h0; + fflags_t i2f_regular_status_s3, f2i_regular_status_s3; + fflags_t i2f_status_s3, f2i_status_s3; - fflags_t i2f_status_s3 = i2f_regular_status_s3; - fflags_t f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3; + assign i2f_regular_status_s3 = {4'h0, i2f_round_has_sticky_s3[i]}; + assign f2i_regular_status_s3 = {4'h0, f2i_round_has_sticky_s3[i]}; + + assign i2f_status_s3 = i2f_regular_status_s3; + assign f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3; wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i]; wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i]; diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index 4f6ffb5d..66225336 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -14,8 +14,8 @@ `include "VX_define.vh" interface VX_pipeline_perf_if (); + wire [`PERF_CTR_BITS-1:0] sched_idles; wire [`PERF_CTR_BITS-1:0] sched_stalls; - wire [`PERF_CTR_BITS-1:0] fetch_stalls; wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls; wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS]; @@ -28,8 +28,8 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] load_latency; modport schedule ( - output sched_stalls, - output fetch_stalls + output sched_idles, + output sched_stalls ); modport issue ( @@ -40,8 +40,8 @@ interface VX_pipeline_perf_if (); ); modport slave ( + input sched_idles, input sched_stalls, - input fetch_stalls, input ibf_stalls, input scb_stalls, input scb_uses, diff --git a/hw/rtl/libs/VX_fifo_queue.sv b/hw/rtl/libs/VX_fifo_queue.sv index 78a2785c..1eda9fff 100644 --- a/hw/rtl/libs/VX_fifo_queue.sv +++ b/hw/rtl/libs/VX_fifo_queue.sv @@ -201,9 +201,7 @@ module VX_fifo_queue #( rd_ptr_r <= '0; rd_ptr_n_r <= 1; end else begin - if (push) begin - wr_ptr_r <= wr_ptr_r + ADDRW'(1); - end + wr_ptr_r <= wr_ptr_r + ADDRW'(push); if (pop) begin rd_ptr_r <= rd_ptr_n_r; if (DEPTH > 2) begin diff --git a/runtime/common/utils.cpp b/runtime/common/utils.cpp index 574f64a7..c0199a86 100644 --- a/runtime/common/utils.cpp +++ b/runtime/common/utils.cpp @@ -175,8 +175,9 @@ static uint64_t get_csr_64(const void* ptr, int addr) { extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { int ret = 0; - uint64_t instrs = 0; - uint64_t cycles = 0; + uint64_t total_instrs = 0; + uint64_t total_cycles = 0; + uint64_t max_cycles = 0; #ifdef PERF_ENABLE @@ -199,8 +200,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { auto perf_class = gAutoPerfDump.get_perf_class(); // PERF: pipeline stalls - uint64_t scheduler_stalls = 0; - uint64_t fetch_stalls = 0; + uint64_t sched_idles = 0; + uint64_t sched_stalls = 0; uint64_t ibuffer_stalls = 0; uint64_t scrb_stalls = 0; uint64_t lsu_stalls = 0; @@ -269,19 +270,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { switch (perf_class) { case VX_DCR_MPM_CLASS_CORE: { // PERF: pipeline - // schedule stalls + // scheduler idles { - uint64_t scheduler_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST); - int scheduler_percent_per_core = calcAvgPercent(scheduler_stalls_per_core, cycles_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: schedule stalls=%ld (%d%%)\n", core_id, scheduler_stalls_per_core, scheduler_percent_per_core); - scheduler_stalls += scheduler_stalls_per_core; + uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID); + int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler idles=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core); + sched_idles += sched_idles_per_core; } - // fetch stalls + // scheduler stalls { - uint64_t fetch_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FETCH_ST); - int fetch_percent_per_core = calcAvgPercent(fetch_stalls_per_core, cycles_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch stalls=%ld (%d%%)\n", core_id, fetch_stalls_per_core, fetch_percent_per_core); - fetch_stalls += fetch_stalls_per_core; + uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST); + int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core); + sched_stalls += sched_stalls_per_core; } // ibuffer_stalls { @@ -340,7 +341,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); ifetches += ifetches_per_core; - uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT); + uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT); int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core); if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat); ifetch_lat += ifetch_lat_per_core; @@ -351,7 +352,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); loads += loads_per_core; - uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT); + uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT); int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core); if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat); load_lat += load_lat_per_core; @@ -431,7 +432,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { // PERF: memory mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS); mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES); - mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LAT); + mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LT); } } break; default: @@ -441,21 +442,22 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { float IPC = (float)(double(instrs_per_core) / double(cycles_per_core)); if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC); - instrs += instrs_per_core; - cycles = std::max(cycles_per_core, cycles); + total_instrs += instrs_per_core; + total_cycles += cycles_per_core; + max_cycles = std::max(cycles_per_core, max_cycles); } #ifdef PERF_ENABLE switch (perf_class) { case VX_DCR_MPM_CLASS_CORE: { - int scheduler_percent = calcAvgPercent(scheduler_stalls, cycles); - int fetch_percent = calcAvgPercent(fetch_stalls, cycles); - int ibuffer_percent = calcAvgPercent(ibuffer_stalls, cycles); + int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles); + int sched_stalls_percent = calcAvgPercent(sched_stalls, total_cycles); + int ibuffer_percent = calcAvgPercent(ibuffer_stalls, total_cycles); int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches)); int load_avg_lat = (int)(double(load_lat) / double(loads)); uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu; - fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", scheduler_stalls, scheduler_percent); - fprintf(stream, "PERF: fetch stalls=%ld (%d%%)\n", fetch_stalls, fetch_percent); + fprintf(stream, "PERF: scheduler idles=%ld (%d%%)\n", sched_idles, sched_idles_percent); + fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent); fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent); fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls, calcAvgPercent(scrb_alu, scrb_total), @@ -514,8 +516,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } #endif - float IPC = (float)(double(instrs) / double(cycles)); - fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); + float IPC = (float)(double(total_instrs) / double(max_cycles)); + fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC); fflush(stream); diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index b2fe7ea2..49c2ec35 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -167,7 +167,7 @@ void Core::schedule() { } } if (scheduled_warp == -1) { - ++perf_stats_.sched_stalls; + ++perf_stats_.sched_idles; return; } @@ -548,10 +548,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { break; case VX_DCR_MPM_CLASS_CORE: { switch (addr) { + case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idles & 0xffffffff; + case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idles >> 32; case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff; case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32; - case VX_CSR_MPM_FETCH_ST: return perf_stats_.fetch_stalls & 0xffffffff; - case VX_CSR_MPM_FETCH_ST_H:return perf_stats_.fetch_stalls >> 32; case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff; case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32; case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff; @@ -579,10 +579,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32; case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff; case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32; - case VX_CSR_MPM_IFETCH_LAT: return perf_stats_.ifetch_latency & 0xffffffff; - case VX_CSR_MPM_IFETCH_LAT_H: return perf_stats_.ifetch_latency >> 32; - case VX_CSR_MPM_LOAD_LAT: return perf_stats_.load_latency & 0xffffffff; - case VX_CSR_MPM_LOAD_LAT_H: return perf_stats_.load_latency >> 32; + case VX_CSR_MPM_IFETCH_LT: return perf_stats_.ifetch_latency & 0xffffffff; + case VX_CSR_MPM_IFETCH_LT_H: return perf_stats_.ifetch_latency >> 32; + case VX_CSR_MPM_LOAD_LT: return perf_stats_.load_latency & 0xffffffff; + case VX_CSR_MPM_LOAD_LT_H: return perf_stats_.load_latency >> 32; } } break; case VX_DCR_MPM_CLASS_MEM: { @@ -638,8 +638,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32; case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff; case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32; - case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff; - case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32; + case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff; + case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32; case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff; case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32; diff --git a/sim/simx/core.h b/sim/simx/core.h index 60290bef..cef60e81 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -49,8 +49,8 @@ public: struct PerfStats { uint64_t cycles; uint64_t instrs; + uint64_t sched_idles; uint64_t sched_stalls; - uint64_t fetch_stalls; uint64_t ibuf_stalls; uint64_t scrb_stalls; uint64_t alu_stalls; @@ -70,8 +70,8 @@ public: PerfStats() : cycles(0) , instrs(0) + , sched_idles(0) , sched_stalls(0) - , fetch_stalls(0) , ibuf_stalls(0) , scrb_stalls(0) , alu_stalls(0) diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index c7ba1ed7..5d18f9cd 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -9,16 +9,16 @@ all: $(MAKE) -C dotproduct $(MAKE) -C kmeans $(MAKE) -C spmv - $(MAKE) -C transpose - $(MAKE) -C cutcp - $(MAKE) -C vectorhypot $(MAKE) -C stencil - $(MAKE) -C mri-q $(MAKE) -C lbm $(MAKE) -C oclprintf $(MAKE) -C blackscholes - $(MAKE) -C sgemm2 + $(MAKE) -C transpose $(MAKE) -C convolution +# $(MAKE) -C cutcp +# $(MAKE) -C sgemm2 +# $(MAKE) -C vectorhypot +# $(MAKE) -C mri-q run-simx run-simx: $(MAKE) -C vecadd run-simx @@ -37,10 +37,10 @@ run-simx: $(MAKE) -C blackscholes run-simx $(MAKE) -C transpose run-simx $(MAKE) -C convolution run-simx - $(MAKE) -C cutcp run-simx - $(MAKE) -C vectorhypot run-simx - $(MAKE) -C mri-q run-simx +# $(MAKE) -C cutcp run-simx # $(MAKE) -C sgemm2 run-simx +# $(MAKE) -C vectorhypot run-simx +# $(MAKE) -C mri-q run-simx run-rtlsim: $(MAKE) -C vecadd run-rtlsim @@ -98,15 +98,15 @@ clean: $(MAKE) -C kmeans clean $(MAKE) -C spmv clean $(MAKE) -C transpose clean - $(MAKE) -C cutcp clean - $(MAKE) -C vectorhypot clean $(MAKE) -C stencil clean - $(MAKE) -C mri-q clean $(MAKE) -C lbm clean $(MAKE) -C oclprintf clean $(MAKE) -C blackscholes clean - $(MAKE) -C sgemm2 clean $(MAKE) -C convolution clean +# $(MAKE) -C cutcp clean +# $(MAKE) -C sgemm2 clean +# $(MAKE) -C vectorhypot clean +# $(MAKE) -C mri-q clean clean-all: $(MAKE) -C vecadd clean-all @@ -114,19 +114,18 @@ clean-all: $(MAKE) -C psort clean-all $(MAKE) -C saxpy clean-all $(MAKE) -C sfilter clean-all - $(MAKE) -C sfilter clean-all $(MAKE) -C nearn clean-all $(MAKE) -C guassian clean-all $(MAKE) -C dotproduct clean-all $(MAKE) -C kmeans clean-all $(MAKE) -C spmv clean-all $(MAKE) -C transpose clean-all - $(MAKE) -C cutcp clean-all - $(MAKE) -C vectorhypot clean-all $(MAKE) -C stencil clean-all - $(MAKE) -C mri-q clean-all $(MAKE) -C lbm clean-all $(MAKE) -C oclprintf clean-all $(MAKE) -C blackscholes clean-all - $(MAKE) -C sgemm2 clean-all $(MAKE) -C convolution clean-all +# $(MAKE) -C cutcp clean-all +# $(MAKE) -C sgemm2 clean-all +# $(MAKE) -C vectorhypot clean-all +# $(MAKE) -C mri-q clean-all