diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp index f890d2f9..86e30266 100644 --- a/sim/simx/decode.cpp +++ b/sim/simx/decode.cpp @@ -195,6 +195,7 @@ static const char* op_string(const Instr &instr) { case 2: return "SPLIT"; case 3: return "JOIN"; case 4: return "BAR"; + case 5: return "PREFETCH"; default: std::abort(); } diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index efc199d2..5df72c6f 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -425,11 +425,11 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; - Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned + Word mem_addr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8; - Word data_read = core_->dcache_read(memAddr, 4); - trace->mem_addrs.at(t).push_back({memAddr, 4}); - DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read); + Word data_read = core_->dcache_read(mem_addr, 4); + trace->mem_addrs.at(t).push_back({mem_addr, 4}); + DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << data_read); switch (func3) { case 0: // LBI @@ -465,10 +465,10 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { case 6: { // load word and unit strided (not checking for unit stride) for (int i = 0; i < vl_; i++) { - Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); - DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr); - Word data_read = core_->dcache_read(memAddr, 4); - DP(4, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); + Word mem_addr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); + DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << mem_addr); + Word data_read = core_->dcache_read(mem_addr, 4); + DP(4, "Mem addr: " << std::hex << mem_addr << " Data read " << data_read); int *result_ptr = (int *)(vd.data() + i); *result_ptr = data_read; } @@ -490,21 +490,21 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; - Word memAddr = rsdata[t][0] + immsrc; - trace->mem_addrs.at(t).push_back({memAddr, (1u << func3)}); - DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + Word mem_addr = rsdata[t][0] + immsrc; + trace->mem_addrs.at(t).push_back({mem_addr, (1u << func3)}); + DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr); switch (func3) { case 0: // SB - core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1); + core_->dcache_write(mem_addr, rsdata[t][1] & 0x000000FF, 1); break; case 1: // SH - core_->dcache_write(memAddr, rsdata[t][1], 2); + core_->dcache_write(mem_addr, rsdata[t][1], 2); break; case 2: // SW - core_->dcache_write(memAddr, rsdata[t][1], 4); + core_->dcache_write(mem_addr, rsdata[t][1], 4); break; default: std::abort(); @@ -512,14 +512,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { } } else { for (int i = 0; i < vl_; i++) { - Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8); - DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr); + Word mem_addr = rsdata[i][0] + (i * vtype_.vsew / 8); + DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr); switch (instr.getVlsWidth()) { case 6: { // store word and unit strided (not checking for unit stride) uint32_t value = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i); - core_->dcache_write(memAddr, value, 4); - DP(4, "store: " << memAddr << " value:" << value); + core_->dcache_write(mem_addr, value, 4); + DP(4, "store: " << mem_addr << " value:" << value); } break; default: std::abort(); @@ -888,8 +888,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) { for (int t = 0; t < num_threads; ++t) { if (!tmask_.test(t)) continue; - int addr = rsdata[t][0]; - printf("*** PREFETCHED %d ***\n", addr); + auto mem_addr = rsdata[t][0]; + trace->mem_addrs.at(t).push_back({mem_addr, 4}); } } break; default: diff --git a/tests/regression/prefetch/Makefile b/tests/regression/prefetch/Makefile index 0627bd36..af58821c 100644 --- a/tests/regression/prefetch/Makefile +++ b/tests/regression/prefetch/Makefile @@ -2,7 +2,7 @@ RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain VORTEX_DRV_PATH ?= $(realpath ../../../driver) VORTEX_RT_PATH ?= $(realpath ../../../runtime) -OPTS ?= -n64 +OPTS ?= -n32 VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ diff --git a/tests/regression/prefetch/kernel.c b/tests/regression/prefetch/kernel.c index 9136592b..b852f582 100644 --- a/tests/regression/prefetch/kernel.c +++ b/tests/regression/prefetch/kernel.c @@ -1,24 +1,43 @@ #include #include #include +#include #include "common.h" -void kernel_body(int task_id, const kernel_arg_t* arg) { - uint32_t count = arg->task_size; - int32_t* src0_ptr = (int32_t*)arg->src0_ptr; - int32_t* src1_ptr = (int32_t*)arg->src1_ptr; - int32_t* dst_ptr = (int32_t*)arg->dst_ptr; - +#define BLOCK_SIZE 64 + +void kernel_body(int task_id, kernel_arg_t* arg) { + uint32_t count = arg->task_size; uint32_t offset = task_id * count; + uint32_t num_blocks = (count * 4 + BLOCK_SIZE-1) / BLOCK_SIZE; + + int32_t* src0_ptr = (int32_t*)arg->src0_ptr + offset; + int32_t* src1_ptr = (int32_t*)arg->src1_ptr + offset; + int32_t* dst_ptr = (int32_t*)arg->dst_ptr + offset; + + uint32_t src0_end = (uint32_t)(src0_ptr + count); + uint32_t src1_end = (uint32_t)(src1_ptr + count); for (uint32_t i = 0; i < count; ++i) { - dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i]; - vx_prefetch((uint32_t)(src0_ptr) + offset + i); - vx_prefetch((uint32_t)(src1_ptr) + offset + i); + dst_ptr[i] = src0_ptr[i] + src1_ptr[i]; + + uint32_t src0_mask = ((uint32_t)(src0_ptr + i)) % BLOCK_SIZE; + uint32_t src0_next = (uint32_t)(src0_ptr + i + BLOCK_SIZE/4); + if (src0_mask == 0 && src0_next < src0_end) { + //vx_printf("src0_next=%d\n", src0_next); + vx_prefetch(src0_next); + } + + uint32_t src1_mask = ((uint32_t)(src1_ptr + i)) % BLOCK_SIZE; + uint32_t src1_next = (uint32_t)(src1_ptr + i + BLOCK_SIZE/4); + if (src1_mask == 0 && src1_next < src1_end) { + //vx_printf("src1_next=%d\n", src1_next); + vx_prefetch(src1_next); + } } } void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_tasks, kernel_body, arg); + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); } \ No newline at end of file diff --git a/tests/regression/prefetch/main.cpp b/tests/regression/prefetch/main.cpp index 2961b517..8be0d2a4 100644 --- a/tests/regression/prefetch/main.cpp +++ b/tests/regression/prefetch/main.cpp @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - unsigned max_cores, max_warps, max_threads; + uint64_t max_cores, max_warps, max_threads; RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));