From 3071fb7a293548f19f4d24ca96232a482abcc29d Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 6 Jun 2021 13:35:55 -0700 Subject: [PATCH] adding support for non-cacheable memory addressing --- ci/regression.sh | 1 + driver/tests/Makefile | 4 + driver/tests/io_addr/Makefile | 67 ++++ driver/tests/io_addr/common.h | 12 + driver/tests/io_addr/kernel.bin | Bin 0 -> 6908 bytes driver/tests/io_addr/kernel.c | 19 ++ driver/tests/io_addr/kernel.dump | 547 +++++++++++++++++++++++++++++++ driver/tests/io_addr/kernel.elf | Bin 0 -> 8964 bytes driver/tests/io_addr/main.cpp | 246 ++++++++++++++ hw/rtl/VX_cluster.v | 8 +- hw/rtl/VX_config.vh | 16 +- hw/rtl/VX_core.v | 4 +- hw/rtl/VX_databus_arb.v | 70 ++-- hw/rtl/VX_define.vh | 62 ++-- hw/rtl/VX_lsu_unit.v | 81 +++-- hw/rtl/VX_mem_unit.v | 19 +- hw/rtl/Vortex.v | 9 +- hw/rtl/cache/VX_cache.v | 324 +++++++++++++----- hw/rtl/cache/VX_nc_bypass.v | 301 +++++++++++++++++ hw/scripts/scope.json | 8 +- hw/simulate/simulator.cpp | 2 +- hw/simulate/simulator.h | 2 +- simX/core.cpp | 8 +- 23 files changed, 1605 insertions(+), 205 deletions(-) create mode 100644 driver/tests/io_addr/Makefile create mode 100644 driver/tests/io_addr/common.h create mode 100755 driver/tests/io_addr/kernel.bin create mode 100644 driver/tests/io_addr/kernel.c create mode 100644 driver/tests/io_addr/kernel.dump create mode 100755 driver/tests/io_addr/kernel.elf create mode 100644 driver/tests/io_addr/main.cpp create mode 100644 hw/rtl/cache/VX_nc_bypass.v diff --git a/ci/regression.sh b/ci/regression.sh index 0e0fb248..fd9adabf 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -26,6 +26,7 @@ make -s ./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1" ./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1" ./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=demo --args="-n1" +./ci/travis_run.py 
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1" # build flags ./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1" diff --git a/driver/tests/Makefile b/driver/tests/Makefile index 40394bc9..dfe8825d 100644 --- a/driver/tests/Makefile +++ b/driver/tests/Makefile @@ -3,22 +3,26 @@ all: $(MAKE) -C demo $(MAKE) -C dogfood $(MAKE) -C stress + $(MAKE) -C io_addr run: $(MAKE) -C basic run-vlsim $(MAKE) -C demo run-vlsim $(MAKE) -C dogfood run-vlsim $(MAKE) -C stress run-vlsim + $(MAKE) -C io_addr run-vlsim clean: $(MAKE) -C basic clean $(MAKE) -C demo clean $(MAKE) -C dogfood clean $(MAKE) -C stress clean + $(MAKE) -C io_addr clean clean-all: $(MAKE) -C basic clean-all $(MAKE) -C demo clean-all $(MAKE) -C dogfood clean-all $(MAKE) -C stress clean-all + $(MAKE) -C io_addr clean-all diff --git a/driver/tests/io_addr/Makefile b/driver/tests/io_addr/Makefile new file mode 100644 index 00000000..3888cc3e --- /dev/null +++ b/driver/tests/io_addr/Makefile @@ -0,0 +1,67 @@ +RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +VORTEX_RT_PATH ?= $(wildcard ../../../runtime) + +OPTS ?= -n1 + +VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc +VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ +VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump +VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy + +VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections +VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw + +VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a + +VX_SRCS = kernel.c + +#CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors +CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors + +CXXFLAGS += -I../../include -I$(VORTEX_RT_PATH)/../hw + +PROJECT = io_addr + 
+SRCS = main.cpp + +all: $(PROJECT) kernel.bin kernel.dump + +kernel.dump: kernel.elf + $(VX_DP) -D kernel.elf > kernel.dump + +kernel.bin: kernel.elf + $(VX_CP) -O binary kernel.elf kernel.bin + +kernel.elf: $(VX_SRCS) + $(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf + +$(PROJECT): $(SRCS) + $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../stub -lvortex -o $@ + +run-fpga: $(PROJECT) + LD_LIBRARY_PATH=../../opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-asesim: $(PROJECT) + ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-vlsim: $(PROJECT) + LD_LIBRARY_PATH=../../opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-rtlsim: $(PROJECT) + LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-simx: $(PROJECT) + LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +.depend: $(SRCS) + $(CXX) $(CXXFLAGS) -MM $^ > .depend; + +clean: + rm -rf $(PROJECT) *.o .depend + +clean-all: clean + rm -rf *.elf *.bin *.dump + +ifneq ($(MAKECMDGOALS),clean) + -include .depend +endif \ No newline at end of file diff --git a/driver/tests/io_addr/common.h b/driver/tests/io_addr/common.h new file mode 100644 index 00000000..73247b2c --- /dev/null +++ b/driver/tests/io_addr/common.h @@ -0,0 +1,12 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 + +struct kernel_arg_t { + uint32_t num_points; + uint32_t src_ptr; + uint32_t dst_ptr; +}; + +#endif \ No newline at end of file diff --git a/driver/tests/io_addr/kernel.bin b/driver/tests/io_addr/kernel.bin new file mode 100755 index 0000000000000000000000000000000000000000..fb88e1ea01805016b3ea20979cee5defeb4eb4e5 GIT binary patch literal 6908 zcmeHLU1%It6h3$U?ru`**4ZYh2u(YiohB)zWvYF2tC3O=Q4qlwr3_Ue4M7%KpX|oj z&17ql+AB%>RB3*;Dj3RA7Qu&xq^2zq6jQ1%YFU!BAY?J=u8G^M-`%GEJ^5J53>?nR z?3r`F`+eWJ4>P0_iOiLTWLu=#9l1n{bUAG55$YzLn%js>UH_svj%@T_a*VNrKs{=n zX-39>yBCQ%5l<~rih3#4S*X;GtD~F>I;HlNYsW=K5A{+aDmallz-b?zn|j7Ut^HPt 
z$jd!X=H@oa`#Dl`9QPSgZB9-mM^@_`$r=-DNj%9ms>(dH4Ve2TZZ$JHZnFK$z?4YF zBQyET6~$o}6HD>kGj6}}*L(FIjICL_Hj%ZnkF1W5$l9@uS@At=$wtojYC{eawsP zXKFiRUi$=@j4A8*tH7@SzXJRU@T;3HVUKhYCJa@S%ba1$=0AK750U z*B5^4E&Y7YgtsJS{Jv}Y8o95W0mcv&xYW+~rZfc}syS(&JX<4?UapziRIRT48Ep-FBw_L+~tvvyb74$nIG|W=He-!w;?t81{?ApKI5BJFT|Id5>#0BpE z#-^6QtxoQHlZrT2A@>w-a8K(Yanz`pN6VD&pU2!5fDaKv#0#kBtoY{5H8*P1=Q(do zCd6Qzh7>~mt9bS`S-e2Q2I>*RIx=EdLq-g%$cSMD88OU|5yOOxkerFTcOMgj8OqPy zATg*jnc;`BoZ|=Ui1!J+7^@sFcYXB z0SE9`9VefTTCqyBxAexUH#Xh!pw5{G2e|)ly{VUK>z=qL@mV`@Hhkuf!yB0YO^Vun z0{!|F_(8FD?O|5O%goww2xr?nz}d&-44RGb{0E4J2#Ij1=EV+@2_L%8!kZt*D6gF* zH{Gatxvq$tI;NZ2a!sVCYQ?6#HTXb*KNa{BYcT~g$kYy;Q60e1i8HDb{>7OJ@2V1~ zXCK5&cY)tZ&Hd2Vtn(Tb2aHwE&bBy1Cs2Ek^3pe_xdJV4)}!7&5p| +#include +#include +#include "common.h" + +void kernel_body(int task_id, void* arg) { + struct kernel_arg_t* _arg = (struct kernel_arg_t*)(arg); + uint32_t* src_ptr = (uint32_t*)_arg->src_ptr; + uint32_t* dst_ptr = (uint32_t*)_arg->dst_ptr; + + int32_t* addr_ptr = (int32_t*)(src_ptr[task_id]); + + dst_ptr[task_id] = *addr_ptr; +} + +void main() { + struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_points, kernel_body, arg); +} \ No newline at end of file diff --git a/driver/tests/io_addr/kernel.dump b/driver/tests/io_addr/kernel.dump new file mode 100644 index 00000000..e413e35e --- /dev/null +++ b/driver/tests/io_addr/kernel.dump @@ -0,0 +1,547 @@ + +kernel.elf: file format elf32-littleriscv + + +Disassembly of section .init: + +80000000 <_start>: +80000000: 00000597 auipc a1,0x0 +80000004: 0ac58593 addi a1,a1,172 # 800000ac +80000008: fc102573 csrr a0,0xfc1 +8000000c: 00b5106b 0xb5106b +80000010: 09c000ef jal ra,800000ac +80000014: 00100513 li a0,1 +80000018: 0005006b 0x5006b +8000001c: 00002517 auipc a0,0x2 +80000020: ae050513 addi a0,a0,-1312 # 80001afc +80000024: 00002617 auipc a2,0x2 +80000028: b5860613 addi a2,a2,-1192 # 80001b7c <__BSS_END__> +8000002c: 40a60633 sub a2,a2,a0 +80000030: 00000593 
li a1,0 +80000034: 3fc000ef jal ra,80000430 +80000038: 00000517 auipc a0,0x0 +8000003c: 30050513 addi a0,a0,768 # 80000338 <__libc_fini_array> +80000040: 2b0000ef jal ra,800002f0 +80000044: 350000ef jal ra,80000394 <__libc_init_array> +80000048: 008000ef jal ra,80000050
+8000004c: 2b80006f j 80000304 + +Disassembly of section .text: + +80000050
: +80000050: 7ffff7b7 lui a5,0x7ffff +80000054: 0007a503 lw a0,0(a5) # 7ffff000 <__stack_size+0x7fffec00> +80000058: 800005b7 lui a1,0x80000 +8000005c: 7ffff637 lui a2,0x7ffff +80000060: 08058593 addi a1,a1,128 # 80000080 <__stack_top+0x81000080> +80000064: 1440006f j 800001a8 + +80000068 : +80000068: 00000793 li a5,0 +8000006c: 00078863 beqz a5,8000007c +80000070: 80000537 lui a0,0x80000 +80000074: 33850513 addi a0,a0,824 # 80000338 <__stack_top+0x81000338> +80000078: 2780006f j 800002f0 +8000007c: 00008067 ret + +80000080 : +80000080: 0045a783 lw a5,4(a1) +80000084: 00251513 slli a0,a0,0x2 +80000088: 00a787b3 add a5,a5,a0 +8000008c: 0007a703 lw a4,0(a5) +80000090: 0085a783 lw a5,8(a1) +80000094: 00072703 lw a4,0(a4) +80000098: 00a78533 add a0,a5,a0 +8000009c: 00e52023 sw a4,0(a0) +800000a0: 00008067 ret + +800000a4 <_exit>: +800000a4: 00000513 li a0,0 +800000a8: 0005006b 0x5006b + +800000ac : +800000ac: fc002573 csrr a0,0xfc0 +800000b0: 0005006b 0x5006b +800000b4: 00002197 auipc gp,0x2 +800000b8: e1c18193 addi gp,gp,-484 # 80001ed0 <__global_pointer> +800000bc: 7f000117 auipc sp,0x7f000 +800000c0: f4410113 addi sp,sp,-188 # ff000000 <__stack_top> +800000c4: 40000593 li a1,1024 +800000c8: cc102673 csrr a2,0xcc1 +800000cc: 02c585b3 mul a1,a1,a2 +800000d0: 40b10133 sub sp,sp,a1 +800000d4: cc3026f3 csrr a3,0xcc3 +800000d8: 00068663 beqz a3,800000e4 +800000dc: 00000513 li a0,0 +800000e0: 0005006b 0x5006b + +800000e4 : +800000e4: 00008067 ret + +800000e8 : +800000e8: fe010113 addi sp,sp,-32 +800000ec: 00112e23 sw ra,28(sp) +800000f0: 00812c23 sw s0,24(sp) +800000f4: 00912a23 sw s1,20(sp) +800000f8: 01212823 sw s2,16(sp) +800000fc: 01312623 sw s3,12(sp) +80000100: fc0027f3 csrr a5,0xfc0 +80000104: 0007806b 0x7806b +80000108: cc5027f3 csrr a5,0xcc5 +8000010c: cc3029f3 csrr s3,0xcc3 +80000110: cc002773 csrr a4,0xcc0 +80000114: fc002673 csrr a2,0xfc0 +80000118: 00279693 slli a3,a5,0x2 +8000011c: 800027b7 lui a5,0x80002 +80000120: afc78793 addi a5,a5,-1284 # 80001afc 
<__stack_top+0x81001afc> +80000124: 00d787b3 add a5,a5,a3 +80000128: 0007a483 lw s1,0(a5) +8000012c: 0104a403 lw s0,16(s1) +80000130: 00c4a683 lw a3,12(s1) +80000134: 0089a933 slt s2,s3,s0 +80000138: 00040793 mv a5,s0 +8000013c: 00d90933 add s2,s2,a3 +80000140: 03368433 mul s0,a3,s3 +80000144: 00f9d463 bge s3,a5,8000014c +80000148: 00098793 mv a5,s3 +8000014c: 00f40433 add s0,s0,a5 +80000150: 0084a683 lw a3,8(s1) +80000154: 02c40433 mul s0,s0,a2 +80000158: 02e907b3 mul a5,s2,a4 +8000015c: 00d40433 add s0,s0,a3 +80000160: 00f40433 add s0,s0,a5 +80000164: 00890933 add s2,s2,s0 +80000168: 01245e63 bge s0,s2,80000184 +8000016c: 0004a783 lw a5,0(s1) +80000170: 0044a583 lw a1,4(s1) +80000174: 00040513 mv a0,s0 +80000178: 00140413 addi s0,s0,1 +8000017c: 000780e7 jalr a5 +80000180: fe8916e3 bne s2,s0,8000016c +80000184: 0019b993 seqz s3,s3 +80000188: 0009806b 0x9806b +8000018c: 01c12083 lw ra,28(sp) +80000190: 01812403 lw s0,24(sp) +80000194: 01412483 lw s1,20(sp) +80000198: 01012903 lw s2,16(sp) +8000019c: 00c12983 lw s3,12(sp) +800001a0: 02010113 addi sp,sp,32 +800001a4: 00008067 ret + +800001a8 : +800001a8: fc010113 addi sp,sp,-64 +800001ac: 02112e23 sw ra,60(sp) +800001b0: 02812c23 sw s0,56(sp) +800001b4: 02912a23 sw s1,52(sp) +800001b8: 03212823 sw s2,48(sp) +800001bc: 03312623 sw s3,44(sp) +800001c0: fc2026f3 csrr a3,0xfc2 +800001c4: fc102873 csrr a6,0xfc1 +800001c8: fc002473 csrr s0,0xfc0 +800001cc: cc5027f3 csrr a5,0xcc5 +800001d0: 01f00713 li a4,31 +800001d4: 0cf74463 blt a4,a5,8000029c +800001d8: 030408b3 mul a7,s0,a6 +800001dc: 00100713 li a4,1 +800001e0: 00a8d463 bge a7,a0,800001e8 +800001e4: 03154733 div a4,a0,a7 +800001e8: 0ce6c863 blt a3,a4,800002b8 +800001ec: 0ae7d863 bge a5,a4,8000029c +800001f0: fff68693 addi a3,a3,-1 +800001f4: 02e54333 div t1,a0,a4 +800001f8: 00030893 mv a7,t1 +800001fc: 00f69663 bne a3,a5,80000208 +80000200: 02e56533 rem a0,a0,a4 +80000204: 006508b3 add a7,a0,t1 +80000208: 0288c4b3 div s1,a7,s0 +8000020c: 0288e933 rem s2,a7,s0 
+80000210: 0b04ca63 blt s1,a6,800002c4 +80000214: 00100693 li a3,1 +80000218: 0304c733 div a4,s1,a6 +8000021c: 00070663 beqz a4,80000228 +80000220: 00070693 mv a3,a4 +80000224: 0304e733 rem a4,s1,a6 +80000228: 800029b7 lui s3,0x80002 +8000022c: afc98993 addi s3,s3,-1284 # 80001afc <__stack_top+0x81001afc> +80000230: 00e12e23 sw a4,28(sp) +80000234: 00c10713 addi a4,sp,12 +80000238: 00b12623 sw a1,12(sp) +8000023c: 00c12823 sw a2,16(sp) +80000240: 00d12c23 sw a3,24(sp) +80000244: 02f30333 mul t1,t1,a5 +80000248: 00279793 slli a5,a5,0x2 +8000024c: 00f987b3 add a5,s3,a5 +80000250: 00e7a023 sw a4,0(a5) +80000254: 00612a23 sw t1,20(sp) +80000258: 06904c63 bgtz s1,800002d0 +8000025c: 04090063 beqz s2,8000029c +80000260: 02848433 mul s0,s1,s0 +80000264: 00812a23 sw s0,20(sp) +80000268: 0009006b 0x9006b +8000026c: cc5027f3 csrr a5,0xcc5 +80000270: cc202573 csrr a0,0xcc2 +80000274: 00279793 slli a5,a5,0x2 +80000278: 00f989b3 add s3,s3,a5 +8000027c: 0009a783 lw a5,0(s3) +80000280: 0087a683 lw a3,8(a5) +80000284: 0007a703 lw a4,0(a5) +80000288: 0047a583 lw a1,4(a5) +8000028c: 00d50533 add a0,a0,a3 +80000290: 000700e7 jalr a4 +80000294: 00100793 li a5,1 +80000298: 0007806b 0x7806b +8000029c: 03c12083 lw ra,60(sp) +800002a0: 03812403 lw s0,56(sp) +800002a4: 03412483 lw s1,52(sp) +800002a8: 03012903 lw s2,48(sp) +800002ac: 02c12983 lw s3,44(sp) +800002b0: 04010113 addi sp,sp,64 +800002b4: 00008067 ret +800002b8: 00068713 mv a4,a3 +800002bc: f2e7cae3 blt a5,a4,800001f0 +800002c0: fddff06f j 8000029c +800002c4: 00000713 li a4,0 +800002c8: 00100693 li a3,1 +800002cc: f5dff06f j 80000228 +800002d0: 00048713 mv a4,s1 +800002d4: 00985463 bge a6,s1,800002dc +800002d8: 00080713 mv a4,a6 +800002dc: 800007b7 lui a5,0x80000 +800002e0: 0e878793 addi a5,a5,232 # 800000e8 <__stack_top+0x810000e8> +800002e4: 00f7106b 0xf7106b +800002e8: e01ff0ef jal ra,800000e8 +800002ec: f71ff06f j 8000025c + +800002f0 : +800002f0: 00050593 mv a1,a0 +800002f4: 00000693 li a3,0 +800002f8: 00000613 li a2,0 
+800002fc: 00000513 li a0,0 +80000300: 20c0006f j 8000050c <__register_exitproc> + +80000304 : +80000304: ff010113 addi sp,sp,-16 +80000308: 00000593 li a1,0 +8000030c: 00812423 sw s0,8(sp) +80000310: 00112623 sw ra,12(sp) +80000314: 00050413 mv s0,a0 +80000318: 290000ef jal ra,800005a8 <__call_exitprocs> +8000031c: 800027b7 lui a5,0x80002 +80000320: af87a503 lw a0,-1288(a5) # 80001af8 <__stack_top+0x81001af8> +80000324: 03c52783 lw a5,60(a0) +80000328: 00078463 beqz a5,80000330 +8000032c: 000780e7 jalr a5 +80000330: 00040513 mv a0,s0 +80000334: d71ff0ef jal ra,800000a4 <_exit> + +80000338 <__libc_fini_array>: +80000338: ff010113 addi sp,sp,-16 +8000033c: 00812423 sw s0,8(sp) +80000340: 800017b7 lui a5,0x80001 +80000344: 80001437 lui s0,0x80001 +80000348: 6d040413 addi s0,s0,1744 # 800016d0 <__stack_top+0x810016d0> +8000034c: 6d078793 addi a5,a5,1744 # 800016d0 <__stack_top+0x810016d0> +80000350: 408787b3 sub a5,a5,s0 +80000354: 00912223 sw s1,4(sp) +80000358: 00112623 sw ra,12(sp) +8000035c: 4027d493 srai s1,a5,0x2 +80000360: 02048063 beqz s1,80000380 <__libc_fini_array+0x48> +80000364: ffc78793 addi a5,a5,-4 +80000368: 00878433 add s0,a5,s0 +8000036c: 00042783 lw a5,0(s0) +80000370: fff48493 addi s1,s1,-1 +80000374: ffc40413 addi s0,s0,-4 +80000378: 000780e7 jalr a5 +8000037c: fe0498e3 bnez s1,8000036c <__libc_fini_array+0x34> +80000380: 00c12083 lw ra,12(sp) +80000384: 00812403 lw s0,8(sp) +80000388: 00412483 lw s1,4(sp) +8000038c: 01010113 addi sp,sp,16 +80000390: 00008067 ret + +80000394 <__libc_init_array>: +80000394: ff010113 addi sp,sp,-16 +80000398: 00812423 sw s0,8(sp) +8000039c: 01212023 sw s2,0(sp) +800003a0: 80001437 lui s0,0x80001 +800003a4: 80001937 lui s2,0x80001 +800003a8: 6cc40793 addi a5,s0,1740 # 800016cc <__stack_top+0x810016cc> +800003ac: 6cc90913 addi s2,s2,1740 # 800016cc <__stack_top+0x810016cc> +800003b0: 40f90933 sub s2,s2,a5 +800003b4: 00112623 sw ra,12(sp) +800003b8: 00912223 sw s1,4(sp) +800003bc: 40295913 srai s2,s2,0x2 +800003c0: 
02090063 beqz s2,800003e0 <__libc_init_array+0x4c> +800003c4: 6cc40413 addi s0,s0,1740 +800003c8: 00000493 li s1,0 +800003cc: 00042783 lw a5,0(s0) +800003d0: 00148493 addi s1,s1,1 +800003d4: 00440413 addi s0,s0,4 +800003d8: 000780e7 jalr a5 +800003dc: fe9918e3 bne s2,s1,800003cc <__libc_init_array+0x38> +800003e0: 80001437 lui s0,0x80001 +800003e4: 80001937 lui s2,0x80001 +800003e8: 6cc40793 addi a5,s0,1740 # 800016cc <__stack_top+0x810016cc> +800003ec: 6d090913 addi s2,s2,1744 # 800016d0 <__stack_top+0x810016d0> +800003f0: 40f90933 sub s2,s2,a5 +800003f4: 40295913 srai s2,s2,0x2 +800003f8: 02090063 beqz s2,80000418 <__libc_init_array+0x84> +800003fc: 6cc40413 addi s0,s0,1740 +80000400: 00000493 li s1,0 +80000404: 00042783 lw a5,0(s0) +80000408: 00148493 addi s1,s1,1 +8000040c: 00440413 addi s0,s0,4 +80000410: 000780e7 jalr a5 +80000414: fe9918e3 bne s2,s1,80000404 <__libc_init_array+0x70> +80000418: 00c12083 lw ra,12(sp) +8000041c: 00812403 lw s0,8(sp) +80000420: 00412483 lw s1,4(sp) +80000424: 00012903 lw s2,0(sp) +80000428: 01010113 addi sp,sp,16 +8000042c: 00008067 ret + +80000430 : +80000430: 00f00313 li t1,15 +80000434: 00050713 mv a4,a0 +80000438: 02c37e63 bgeu t1,a2,80000474 +8000043c: 00f77793 andi a5,a4,15 +80000440: 0a079063 bnez a5,800004e0 +80000444: 08059263 bnez a1,800004c8 +80000448: ff067693 andi a3,a2,-16 +8000044c: 00f67613 andi a2,a2,15 +80000450: 00e686b3 add a3,a3,a4 +80000454: 00b72023 sw a1,0(a4) +80000458: 00b72223 sw a1,4(a4) +8000045c: 00b72423 sw a1,8(a4) +80000460: 00b72623 sw a1,12(a4) +80000464: 01070713 addi a4,a4,16 +80000468: fed766e3 bltu a4,a3,80000454 +8000046c: 00061463 bnez a2,80000474 +80000470: 00008067 ret +80000474: 40c306b3 sub a3,t1,a2 +80000478: 00269693 slli a3,a3,0x2 +8000047c: 00000297 auipc t0,0x0 +80000480: 005686b3 add a3,a3,t0 +80000484: 00c68067 jr 12(a3) +80000488: 00b70723 sb a1,14(a4) +8000048c: 00b706a3 sb a1,13(a4) +80000490: 00b70623 sb a1,12(a4) +80000494: 00b705a3 sb a1,11(a4) +80000498: 00b70523 sb 
a1,10(a4) +8000049c: 00b704a3 sb a1,9(a4) +800004a0: 00b70423 sb a1,8(a4) +800004a4: 00b703a3 sb a1,7(a4) +800004a8: 00b70323 sb a1,6(a4) +800004ac: 00b702a3 sb a1,5(a4) +800004b0: 00b70223 sb a1,4(a4) +800004b4: 00b701a3 sb a1,3(a4) +800004b8: 00b70123 sb a1,2(a4) +800004bc: 00b700a3 sb a1,1(a4) +800004c0: 00b70023 sb a1,0(a4) +800004c4: 00008067 ret +800004c8: 0ff5f593 andi a1,a1,255 +800004cc: 00859693 slli a3,a1,0x8 +800004d0: 00d5e5b3 or a1,a1,a3 +800004d4: 01059693 slli a3,a1,0x10 +800004d8: 00d5e5b3 or a1,a1,a3 +800004dc: f6dff06f j 80000448 +800004e0: 00279693 slli a3,a5,0x2 +800004e4: 00000297 auipc t0,0x0 +800004e8: 005686b3 add a3,a3,t0 +800004ec: 00008293 mv t0,ra +800004f0: fa0680e7 jalr -96(a3) +800004f4: 00028093 mv ra,t0 +800004f8: ff078793 addi a5,a5,-16 +800004fc: 40f70733 sub a4,a4,a5 +80000500: 00f60633 add a2,a2,a5 +80000504: f6c378e3 bgeu t1,a2,80000474 +80000508: f3dff06f j 80000444 + +8000050c <__register_exitproc>: +8000050c: 800027b7 lui a5,0x80002 +80000510: af87a703 lw a4,-1288(a5) # 80001af8 <__stack_top+0x81001af8> +80000514: 14872783 lw a5,328(a4) +80000518: 04078c63 beqz a5,80000570 <__register_exitproc+0x64> +8000051c: 0047a703 lw a4,4(a5) +80000520: 01f00813 li a6,31 +80000524: 06e84e63 blt a6,a4,800005a0 <__register_exitproc+0x94> +80000528: 00271813 slli a6,a4,0x2 +8000052c: 02050663 beqz a0,80000558 <__register_exitproc+0x4c> +80000530: 01078333 add t1,a5,a6 +80000534: 08c32423 sw a2,136(t1) +80000538: 1887a883 lw a7,392(a5) +8000053c: 00100613 li a2,1 +80000540: 00e61633 sll a2,a2,a4 +80000544: 00c8e8b3 or a7,a7,a2 +80000548: 1917a423 sw a7,392(a5) +8000054c: 10d32423 sw a3,264(t1) +80000550: 00200693 li a3,2 +80000554: 02d50463 beq a0,a3,8000057c <__register_exitproc+0x70> +80000558: 00170713 addi a4,a4,1 +8000055c: 00e7a223 sw a4,4(a5) +80000560: 010787b3 add a5,a5,a6 +80000564: 00b7a423 sw a1,8(a5) +80000568: 00000513 li a0,0 +8000056c: 00008067 ret +80000570: 14c70793 addi a5,a4,332 +80000574: 14f72423 sw a5,328(a4) 
+80000578: fa5ff06f j 8000051c <__register_exitproc+0x10> +8000057c: 18c7a683 lw a3,396(a5) +80000580: 00170713 addi a4,a4,1 +80000584: 00e7a223 sw a4,4(a5) +80000588: 00c6e633 or a2,a3,a2 +8000058c: 18c7a623 sw a2,396(a5) +80000590: 010787b3 add a5,a5,a6 +80000594: 00b7a423 sw a1,8(a5) +80000598: 00000513 li a0,0 +8000059c: 00008067 ret +800005a0: fff00513 li a0,-1 +800005a4: 00008067 ret + +800005a8 <__call_exitprocs>: +800005a8: fd010113 addi sp,sp,-48 +800005ac: 800027b7 lui a5,0x80002 +800005b0: 01412c23 sw s4,24(sp) +800005b4: af87aa03 lw s4,-1288(a5) # 80001af8 <__stack_top+0x81001af8> +800005b8: 03212023 sw s2,32(sp) +800005bc: 02112623 sw ra,44(sp) +800005c0: 148a2903 lw s2,328(s4) +800005c4: 02812423 sw s0,40(sp) +800005c8: 02912223 sw s1,36(sp) +800005cc: 01312e23 sw s3,28(sp) +800005d0: 01512a23 sw s5,20(sp) +800005d4: 01612823 sw s6,16(sp) +800005d8: 01712623 sw s7,12(sp) +800005dc: 01812423 sw s8,8(sp) +800005e0: 04090063 beqz s2,80000620 <__call_exitprocs+0x78> +800005e4: 00050b13 mv s6,a0 +800005e8: 00058b93 mv s7,a1 +800005ec: 00100a93 li s5,1 +800005f0: fff00993 li s3,-1 +800005f4: 00492483 lw s1,4(s2) +800005f8: fff48413 addi s0,s1,-1 +800005fc: 02044263 bltz s0,80000620 <__call_exitprocs+0x78> +80000600: 00249493 slli s1,s1,0x2 +80000604: 009904b3 add s1,s2,s1 +80000608: 040b8463 beqz s7,80000650 <__call_exitprocs+0xa8> +8000060c: 1044a783 lw a5,260(s1) +80000610: 05778063 beq a5,s7,80000650 <__call_exitprocs+0xa8> +80000614: fff40413 addi s0,s0,-1 +80000618: ffc48493 addi s1,s1,-4 +8000061c: ff3416e3 bne s0,s3,80000608 <__call_exitprocs+0x60> +80000620: 02c12083 lw ra,44(sp) +80000624: 02812403 lw s0,40(sp) +80000628: 02412483 lw s1,36(sp) +8000062c: 02012903 lw s2,32(sp) +80000630: 01c12983 lw s3,28(sp) +80000634: 01812a03 lw s4,24(sp) +80000638: 01412a83 lw s5,20(sp) +8000063c: 01012b03 lw s6,16(sp) +80000640: 00c12b83 lw s7,12(sp) +80000644: 00812c03 lw s8,8(sp) +80000648: 03010113 addi sp,sp,48 +8000064c: 00008067 ret +80000650: 00492783 lw 
a5,4(s2) +80000654: 0044a683 lw a3,4(s1) +80000658: fff78793 addi a5,a5,-1 +8000065c: 04878e63 beq a5,s0,800006b8 <__call_exitprocs+0x110> +80000660: 0004a223 sw zero,4(s1) +80000664: fa0688e3 beqz a3,80000614 <__call_exitprocs+0x6c> +80000668: 18892783 lw a5,392(s2) +8000066c: 008a9733 sll a4,s5,s0 +80000670: 00492c03 lw s8,4(s2) +80000674: 00f777b3 and a5,a4,a5 +80000678: 02079263 bnez a5,8000069c <__call_exitprocs+0xf4> +8000067c: 000680e7 jalr a3 +80000680: 00492703 lw a4,4(s2) +80000684: 148a2783 lw a5,328(s4) +80000688: 01871463 bne a4,s8,80000690 <__call_exitprocs+0xe8> +8000068c: f92784e3 beq a5,s2,80000614 <__call_exitprocs+0x6c> +80000690: f80788e3 beqz a5,80000620 <__call_exitprocs+0x78> +80000694: 00078913 mv s2,a5 +80000698: f5dff06f j 800005f4 <__call_exitprocs+0x4c> +8000069c: 18c92783 lw a5,396(s2) +800006a0: 0844a583 lw a1,132(s1) +800006a4: 00f77733 and a4,a4,a5 +800006a8: 00071c63 bnez a4,800006c0 <__call_exitprocs+0x118> +800006ac: 000b0513 mv a0,s6 +800006b0: 000680e7 jalr a3 +800006b4: fcdff06f j 80000680 <__call_exitprocs+0xd8> +800006b8: 00892223 sw s0,4(s2) +800006bc: fa9ff06f j 80000664 <__call_exitprocs+0xbc> +800006c0: 00058513 mv a0,a1 +800006c4: 000680e7 jalr a3 +800006c8: fb9ff06f j 80000680 <__call_exitprocs+0xd8> + +Disassembly of section .init_array: + +800016cc <__init_array_start>: +800016cc: 0068 addi a0,sp,12 +800016ce: 8000 0x8000 + +Disassembly of section .data: + +800016d0 : +800016d0: 0000 unimp +800016d2: 0000 unimp +800016d4: 19bc addi a5,sp,248 +800016d6: 8000 0x8000 +800016d8: 1a24 addi s1,sp,312 +800016da: 8000 0x8000 +800016dc: 1a8c addi a1,sp,368 +800016de: 8000 0x8000 + ... +80001778: 0001 nop +8000177a: 0000 unimp +8000177c: 0000 unimp +8000177e: 0000 unimp +80001780: 330e fld ft6,224(sp) +80001782: abcd j 80001d74 <__BSS_END__+0x1f8> +80001784: 1234 addi a3,sp,296 +80001786: e66d bnez a2,80001870 +80001788: deec sw a1,124(a3) +8000178a: 0005 c.nop 1 +8000178c: 0000000b 0xb + ... 
+ +Disassembly of section .sdata: + +80001af8 <_global_impure_ptr>: +80001af8: 16d0 addi a2,sp,868 +80001afa: 8000 0x8000 + +Disassembly of section .bss: + +80001afc : + ... + +Disassembly of section .comment: + +00000000 <.comment>: + 0: 3a434347 fmsub.d ft6,ft6,ft4,ft7,rmm + 4: 2820 fld fs0,80(s0) + 6: 29554e47 fmsub.s ft8,fa0,fs5,ft5,rmm + a: 3120 fld fs0,96(a0) + c: 2e30 fld fa2,88(a2) + e: 2e32 fld ft8,264(sp) + 10: 0030 addi a2,sp,8 + +Disassembly of section .riscv.attributes: + +00000000 <.riscv.attributes>: + 0: 2941 jal 490 <__stack_size+0x90> + 2: 0000 unimp + 4: 7200 flw fs0,32(a2) + 6: 7369 lui t1,0xffffa + 8: 01007663 bgeu zero,a6,14 <__stack_usage+0x14> + c: 001f 0000 1004 0x10040000001f + 12: 7205 lui tp,0xfffe1 + 14: 3376 fld ft6,376(sp) + 16: 6932 flw fs2,12(sp) + 18: 7032 flw ft0,44(sp) + 1a: 5f30 lw a2,120(a4) + 1c: 326d jal fffff9c6 <__stack_top+0xfff9c6> + 1e: 3070 fld fa2,224(s0) + 20: 665f 7032 0030 0x307032665f + 26: 0108 addi a0,sp,128 + 28: 0b0a slli s6,s6,0x2 diff --git a/driver/tests/io_addr/kernel.elf b/driver/tests/io_addr/kernel.elf new file mode 100755 index 0000000000000000000000000000000000000000..069b6996b5b5b7870c86537d1ea2188b15c572d8 GIT binary patch literal 8964 zcmeHNYitzP6+ZLWon0HzHDjn&Uv|D{w%F=5BvmJrA=q_FKtDXRL;J5jzHAJe%^F~!Ir1$7s zy7ye81$)?hqY4tbhPy=T(7$w-OiGfuZg{3}L^`X`u!qde(}g2epU0P^#!5>eGL9*< z8RM#_Pls)&-6YBmHIur04drt~Bn>IReI$hx())%;tGP{Vqhc+NC%uMdS)Rl@39IMQ z2UP+cQH3q{5L08iHZYp+yCUksn~hVo4Wq_U@8fy44?%dOg)2x~-$UBkpORL$TF`1Y z2vZ^WG!>c(?HHY^Zx~I7Xf$0*v(`X}hC^gpxvLlQxm%PyqmXn)7P99?NdHAYS&B@$ z|1YHXepxbZJRmFQCtbcr*>k;+BVu0pP}Joc_L%PPC*ylTAv+{f_G}a4l1Y^-X#(qx zt>}Mb{dOgFn10IEToR;UkB|*+5u|!S$ktyXRS?8t{IiH(MEoM+7ZJaN_@$}Zn%Q(+ zU^X43S$+mpnG}qsnI(CjEU8L>j2q`kZ&@xGZ;vlA-kT`1j!qZ!ZMQ|sCy_BQO?vFM znD@mfKXOdeKRhNH@5tqrH$eJTSu(r|S?KzNEDhI#i~egDwD1R{DwjyBeG?o)uKzRX z(v)bO3?YXPNPB$({6@)m<|VH|ep&DBMZ7)8pWo&jPd1U@A2A%PDOe8@H*IWLX9Gx1S!^8HU#)E38#`}=B6CS%tr#OR}8 zh4YtN+7XzcyOXQ_o}64YV|hHV<$8ANDoGb>;IqJWtp$7VTouTT2Q^OhoPX(T?m{+n 
zMKXF7J71hT_`1b&*nYSQbzX^@mgR(%Qxf@Hu2Fj$^#woHjZmXu$|7p0p!en|8k%FE3Q}QD`BjFhd&&atJin+FZYc;C{_Pu!}oGi| z$0Gc7T6n+a_^(9#O-^l9zxrt5e1C%Hf0lXk;{x|{u}PEQ)hyeWyZ@M zpLM`nP!CK~{^&UN?J(j)52Du&!(I`__TGAARC?|GocCUCv<7PFBwj4$kA53te?4(FZ+{=z}sO`k(}fJ}5$>4+@azg9M4roLTqpf595)qx|igWDR%(mZ*nm-pB4b z){h9hcpqA(i2bS*bY-Sd_fFH;q07^-owD;i#QDxu8YdK48fqcchyB!g$E#L{$nbhZ zJ*x;>@Uky^K3A>sK3OqN`8#iucBX2j7Q7y?yftLVH$_#cBJK19_5|$DfCJQ*9j9`& zQVY&hCMP2gv*)f>yLS3GH~?O3uX-j6#hUnpdeVMRB(M5OMYhya&~C)clXp%K~KiCkxr&c2ZG?##YqkD?!T3uaePQE@y|D z0)~0eqssRRR_{n*Y{jMm>Oe$&il|SlC5YIAg0vQA)LO)`4rkOl)Gy9d)UL#FW<%St zr<=gm%X@0H6B42SuXH=$t7nb7oAI0_g3*Pd1 z6IlP$A{++h^RpUK#GkJ((zgKLf&OAq-?>QNwFrNA5jGa#A1uQA7vUFynV&jPBNX{@ zO*sDJMZGg7)6W*+Hx^)tehJ_E>S6Y#rFIqUQM-9+4B1tA%~=h&O5L12OMz)PTa{a*!s;Yl|hFI@+A>n(uz z-v`EbTl23)|Kq@1&s-nu|DV8WHPfVx3`wA`v6n*%^suVu#`x z;8d(Vp3cNlk-hP*xIMKydaw(MG|NaPn(j#3Q`(|N2eA=!xbob{oSS%}dw(j%D*Gm~ zCz^>u&-RgMDiu8(iFNH+IAF(!A$GRFraKk;)Y!ZMciuuXXOtg(E^au-u+b6OpN_W2 z9;0;p`53iF4my_6RC^k#olRSIZHa7cZrR=%i2%2ECflN&k?v%?3+#}Ki;vfx!BC_# z-e$}T-yNBIXpA@kWRl%Bx3LW3!>FAD&Omo6Y4Gxo)8GzbZ0pXQk>=JWJDj$3+DR%U zVu^GtLy2g-i<~UNbYbu@hkbTcOnjs*x#uuZIDI&giMByzQjXl`-Z8-xMpfWCM>d6R zSK+iH5Cw&eWFiskf|-5#hohNHD&Dp~6HBB1K5KgthS&z!J$5ts|3dKoa~tcptwwk4 zcu(zs-3#-Mc-gih3PJKY&Oa^qtU@PO$0mHwT}NksY{z>Lo%q~zo3D8DRYR8{34qx4_YAJ|X>jFr&;pWEX yAe?`&>&C<~4Din@UKdp1O@b2}gx$_V`2;^yvAO;#fcY8R_J +#include +#include +#include +#include +#include +#include "common.h" + +#define NUM_ADDRS 16 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.bin"; +uint32_t count = 0; + +size_t usr_test_mem; + +std::vector src_data; +std::vector ref_data; + +vx_device_h device = nullptr; +vx_buffer_h staging_buf = nullptr; + +static void show_usage() { + std::cout << "Vortex Driver Test." 
<< std::endl;
+  std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
+}
+
+// Parse the command line.
+//   -n <words> : sets `count` (converted with atoi)
+//   -k <file>  : sets `kernel_file`
+//   -h / -?    : prints the usage text and exits with status 0
+// Any other value returned by getopt prints the usage text and exits with -1.
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      count = atoi(optarg);
+      break;
+    case 'k':
+      kernel_file = optarg;
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+// Release the staging buffer and close the device, each only if its handle
+// is set.
+void cleanup() {
+  if (staging_buf) {
+    vx_buf_release(staging_buf);
+  }
+  if (device) {
+    vx_dev_close(device);
+  }
+}
+
+// Fill `src_data` with one address per point.
+// Point i refers to word slot j = i % NUM_ADDRS. The slot index k is
+// re-picked every 4 points; when j == k the address is taken from the
+// `usr_test_mem` allocation, otherwise from the IO_BASE_ADDR window, so both
+// kinds of address are mixed in the input. Each generated address is logged.
+void gen_input_data(uint32_t num_points) {
+  src_data.resize(num_points);
+
+  uint32_t u = 0, k = 0;
+  for (uint32_t i = 0; i < num_points; ++i) {
+    if (0 == (i % 4)) {
+      k = (i + u) % NUM_ADDRS;
+      ++u;
+    }
+    uint32_t j = i % NUM_ADDRS;
+    uint32_t v = ((j == k) ? usr_test_mem : IO_BASE_ADDR) + j * sizeof(uint32_t);
+    src_data[i] = v;
+    std::cout << std::dec << i << "," << k << ": value=0x" << std::hex << v << std::endl;
+  }
+}
+
+// Fill `ref_data` with the expected result for every point: slot j of either
+// test region is initialised to j * j by main(), with j = i % NUM_ADDRS.
+void gen_ref_data(uint32_t num_points) {
+  ref_data.resize(num_points);
+
+  for (uint32_t i = 0; i < num_points; ++i) {
+    uint32_t j = i % NUM_ADDRS;
+    ref_data[i] = j * j;
+  }
+}
+
+// Start the device, wait for it to finish, copy `buf_size` bytes of the
+// destination buffer back into the staging buffer and compare the first
+// `num_points` words against `ref_data`.
+// Returns 0 when every word matches, 1 after reporting the mismatches.
+int run_test(const kernel_arg_t& kernel_arg,
+             uint32_t buf_size,
+             uint32_t num_points) {
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, -1));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
+
+  // verify result
+  std::cout << "verify result" << std::endl;
+  {
+    int errors = 0;
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      int ref = ref_data.at(i);
+      int cur = buf_ptr[i];
+      if (cur != ref) {
+        std::cout << "error at result #" << std::dec << i
+                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+// Test driver: opens the device, generates the address list and the expected
+// results, uploads the kernel, its argument block and the test data, runs the
+// kernel and verifies the output. Returns 0 on success.
+int main(int argc, char *argv[]) {
+  size_t value;
+  kernel_arg_t kernel_arg;
+
+  // parse command arguments
+  parse_args(argc, argv);
+
+  if (count == 0) {
+    count = 1;
+  }
+
+  std::srand(50);
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;
+  RT_CHECK(vx_dev_open(&device));
+
+  unsigned max_cores, max_warps, max_threads;
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
+
+  uint32_t num_tasks  = max_cores * max_warps * max_threads;
+  uint32_t num_points = count * num_tasks;
+
+  RT_CHECK(vx_alloc_dev_mem(device, NUM_ADDRS * sizeof(uint32_t), &usr_test_mem));
+
+  // generate input data
+  gen_input_data(num_points);
+
+  // generate reference data
+  gen_ref_data(num_points);
+
+  uint32_t src_buf_size = src_data.size() * sizeof(int32_t);
+  uint32_t dst_buf_size = src_data.size() * sizeof(int32_t);
+
+  std::cout << "number of points: " << num_points << std::endl;
+  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
+
+  // upload program
+  std::cout << "upload program" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+
+  RT_CHECK(vx_alloc_dev_mem(device, src_buf_size, &value));
+  kernel_arg.src_ptr = value;
+  RT_CHECK(vx_alloc_dev_mem(device, dst_buf_size, &value));
+  kernel_arg.dst_ptr = value;
+
+  kernel_arg.num_points = num_points;
+
+  std::cout << "dev_src=" << std::hex << kernel_arg.src_ptr << std::endl;
+  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl;
+
+  // allocate shared memory
+  // The staging buffer is reused for every transfer below, so it must hold
+  // the largest of them. That includes the NUM_ADDRS-word test table, which
+  // is bigger than the source/destination buffers when num_points < NUM_ADDRS.
+  // The explicit <uint32_t> keeps std::max well-formed when size_t and
+  // uint32_t are different types.
+  std::cout << "allocate shared memory" << std::endl;
+  uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
+                                std::max<uint32_t>(dst_buf_size,
+                                  std::max<uint32_t>(NUM_ADDRS * sizeof(uint32_t),
+                                                     sizeof(kernel_arg_t))));
+  RT_CHECK(vx_alloc_shared_mem(device, staging_buf_size, &staging_buf));
+
+  // upload kernel argument
+  std::cout << "upload kernel argument" << std::endl;
+  {
+    auto buf_ptr = (int*)vx_host_ptr(staging_buf);
+    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
+    RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
+  }
+
+  // upload test address data: slot i of both regions holds i * i
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < NUM_ADDRS; ++i) {
+      buf_ptr[i] = i * i;
+    }
+  }
+  // use the same base constant gen_input_data() used to build the addresses
+  RT_CHECK(vx_copy_to_dev(staging_buf, IO_BASE_ADDR, NUM_ADDRS * sizeof(uint32_t), 0));
+  RT_CHECK(vx_copy_to_dev(staging_buf, usr_test_mem, NUM_ADDRS * sizeof(uint32_t), 0));
+
+  // upload source buffer
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = src_data.at(i);
+    }
+  }
+  std::cout << "upload source buffer" << std::endl;
+  RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_ptr, src_buf_size, 0));
+
+  // clear destination buffer
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = 0xdeadbeef;
+    }
+  }
+  std::cout << "clear destination buffer" << std::endl;
+  RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, dst_buf_size, 0));
+
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;
+  cleanup();
+
+  std::cout << "PASSED!" << std::endl;
+
+  return 0;
+}
\ No newline at end of file
diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index d9521b8e..59e3243c 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -41,6 +41,7 @@ module VX_cluster #( output wire busy, output wire ebreak ); + `STATIC_ASSERT((`L2_ENABLE == 0 || `NUM_CORES > 1), ("invalid parameter")) wire [`NUM_CORES-1:0] per_core_mem_req_valid; wire [`NUM_CORES-1:0] per_core_mem_req_rw; @@ -166,7 +167,7 @@ module VX_cluster #( .CACHE_LINE_SIZE (`L2CACHE_LINE_SIZE), .NUM_BANKS (`L2NUM_BANKS), .WORD_SIZE (`L2WORD_SIZE), - .NUM_REQS (`NUM_CORES), + .NUM_REQS (`L2NUM_REQS), .CREQ_SIZE (`L2CREQ_SIZE), .MSHR_SIZE (`L2MSHR_SIZE), .MRSQ_SIZE (`L2MRSQ_SIZE), @@ -174,15 +175,14 @@ module VX_cluster #( .WRITE_ENABLE (1), .CORE_TAG_WIDTH (`XMEM_TAG_WIDTH), .CORE_TAG_ID_BITS (0), - .MEM_TAG_WIDTH (`L2MEM_TAG_WIDTH) + .MEM_TAG_WIDTH (`L2MEM_TAG_WIDTH), + .NC_ENABLE (1) ) l2cache ( `SCOPE_BIND_VX_cluster_l2cache .clk (clk), .reset (reset), - .flush (1'b0), - `ifdef PERF_ENABLE .perf_cache_if (perf_l2cache_if), `endif diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 6368eaa7..2b78d644 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -45,20 +45,20 @@ `define STARTUP_ADDR 32'h80000000 `endif -`ifndef IO_BUS_BASE_ADDR -`define IO_BUS_BASE_ADDR 32'hFF000000 +`ifndef IO_BASE_ADDR +`define IO_BASE_ADDR 32'hFF000000 `endif -`ifndef SHARED_MEM_BASE_ADDR -`define SHARED_MEM_BASE_ADDR `IO_BUS_BASE_ADDR +`ifndef IO_ADDR_SIZE +`define IO_ADDR_SIZE (32'hFFFFFFFF - 32'hFF000000 + 1) `endif -`ifndef SHARED_MEM_BASE_ADDR_ALIGN -`define SHARED_MEM_BASE_ADDR_ALIGN 64 +`ifndef IO_ADDR_COUT +`define IO_ADDR_COUT 32'hFFFFFFFC `endif -`ifndef IO_BUS_ADDR_COUT -`define IO_BUS_ADDR_COUT 32'hFFFFFFFC +`ifndef SMEM_BASE_ADDR +`define SMEM_BASE_ADDR `IO_BASE_ADDR `endif `ifndef FRAME_BUFFER_BASE_ADDR diff --git a/hw/rtl/VX_core.v b/hw/rtl/VX_core.v index 3a13ac66..36e74097 100644 --- a/hw/rtl/VX_core.v +++ b/hw/rtl/VX_core.v @@
-71,13 +71,13 @@ module VX_core #( //-- VX_dcache_core_req_if #( - .NUM_REQS(`DNUM_REQUESTS), + .NUM_REQS(`DNUM_REQS), .WORD_SIZE(`DWORD_SIZE), .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH) ) dcache_core_req_if(); VX_dcache_core_rsp_if #( - .NUM_REQS(`DNUM_REQUESTS), + .NUM_REQS(`DNUM_REQS), .WORD_SIZE(`DWORD_SIZE), .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH) ) dcache_core_rsp_if(); diff --git a/hw/rtl/VX_databus_arb.v b/hw/rtl/VX_databus_arb.v index 02704b7f..47b6c6c1 100644 --- a/hw/rtl/VX_databus_arb.v +++ b/hw/rtl/VX_databus_arb.v @@ -18,40 +18,36 @@ module VX_databus_arb ( // output response VX_dcache_core_rsp_if core_rsp_if ); - localparam SMEM_ASHIFT = `CLOG2(`SHARED_MEM_BASE_ADDR_ALIGN); - localparam REQ_ASHIFT = `CLOG2(`DWORD_SIZE); - localparam REQ_ADDRW = 32 - REQ_ASHIFT; - localparam REQ_DATAW = 1 + REQ_ADDRW + 1 + `DWORD_SIZE + (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; - localparam RSP_DATAW = `NUM_THREADS + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; + localparam REQ_ASHIFT = `CLOG2(`DWORD_SIZE); + localparam REQ_ADDRW = 32 - REQ_ASHIFT; + localparam REQ_DATAW = 1 + REQ_ADDRW + 1 + `DWORD_SIZE + (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; + localparam RSP_DATAW = `NUM_THREADS + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; // // handle requests // for (genvar i = 0; i < `NUM_THREADS; ++i) begin - - wire cache_req_valid_out, cache_req_ready_out; - wire is_smem_addr_in, is_smem_addr_out; - - // select shared memory bus - assign is_smem_addr_in = `SM_ENABLE - && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] >= (32-SMEM_ASHIFT)'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> SMEM_ASHIFT)) - && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] < (32-SMEM_ASHIFT)'(`SHARED_MEM_BASE_ADDR >> SMEM_ASHIFT)); - - VX_skid_buffer #( - .DATAW (REQ_DATAW) - ) out_buffer ( - .clk (clk), - .reset (reset), - .valid_in (core_req_if.valid[i]), - .data_in ({is_smem_addr_in, core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), - 
.ready_in (core_req_if.ready[i]), - .valid_out (cache_req_valid_out), - .data_out ({is_smem_addr_out, cache_req_if.addr[i], cache_req_if.rw[i], cache_req_if.byteen[i], cache_req_if.data[i], cache_req_if.tag[i]}), - .ready_out (cache_req_ready_out) - ); - if (`SM_ENABLE) begin + wire cache_req_valid_out; + wire cache_req_ready_out; + wire is_smem_addr_out; + + wire is_smem_addr_in = core_req_if.tag[i][1]; + + VX_skid_buffer #( + .DATAW (REQ_DATAW) + ) out_buffer ( + .clk (clk), + .reset (reset), + .valid_in (core_req_if.valid[i]), + .data_in ({is_smem_addr_in, core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), + .ready_in (core_req_if.ready[i]), + .valid_out (cache_req_valid_out), + .data_out ({is_smem_addr_out, cache_req_if.addr[i], cache_req_if.rw[i], cache_req_if.byteen[i], cache_req_if.data[i], cache_req_if.tag[i]}), + .ready_out (cache_req_ready_out) + ); + assign cache_req_if.valid[i] = cache_req_valid_out && ~is_smem_addr_out; assign smem_req_if.valid[i] = cache_req_valid_out && is_smem_addr_out; assign cache_req_ready_out = is_smem_addr_out ? 
smem_req_if.ready[i] : cache_req_if.ready[i]; @@ -61,10 +57,22 @@ module VX_databus_arb ( assign smem_req_if.byteen[i] = cache_req_if.byteen[i]; assign smem_req_if.data[i] = cache_req_if.data[i]; assign smem_req_if.tag[i] = cache_req_if.tag[i]; + end else begin - `UNUSED_VAR (is_smem_addr_out) - assign cache_req_if.valid[i] = cache_req_valid_out; - assign cache_req_ready_out = cache_req_if.ready[i]; + + VX_skid_buffer #( + .DATAW (REQ_DATAW) + ) out_buffer ( + .clk (clk), + .reset (reset), + .valid_in (core_req_if.valid[i]), + .data_in ({core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), + .ready_in (core_req_if.ready[i]), + .valid_out (cache_req_if.valid[i]), + .data_out ({cache_req_if.addr[i], cache_req_if.rw[i], cache_req_if.byteen[i], cache_req_if.data[i], cache_req_if.tag[i]}), + .ready_out (cache_req_if.ready[i]) + ); + end end @@ -90,7 +98,7 @@ module VX_databus_arb ( VX_stream_arbiter #( .NUM_REQS (2), .DATAW (RSP_DATAW), - .BUFFERED (0) + .BUFFERED (1) ) rsp_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index ece01117..3f80fcc1 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -237,6 +237,9 @@ `define DBG_CACHE_REQ_MDATAW 0 `endif +// Shared memory and non-cacheable flags +`define SM_NC_BITS 2 + ////////////////////////// Icache Configurable Knobs ////////////////////////// // Cache ID @@ -280,10 +283,11 @@ // Word size in bytes `define DWORD_SIZE 4 -// TAG sharing enable -`define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE) +// TAG sharing enable +`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) +`define DCORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `SM_NC_BITS) -// Core request tag bits +// Input request tag bits `define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS) // Memory request data bits @@ -295,11 +299,13 @@ // Memory byte enable bits `define DMEM_BYTEEN_WIDTH `DCACHE_LINE_SIZE -// Memory request tag bits -`define DMEM_TAG_WIDTH `DMEM_ADDR_WIDTH 
+// Input request size +`define DNUM_REQS `NUM_THREADS -// Core request size -`define DNUM_REQUESTS `NUM_THREADS +// Memory request tag bits +`define _DMEM_ADDR_RATIO_W $clog2(`DCACHE_LINE_SIZE / `DWORD_SIZE) +`define _DNC_MEM_TAG_WIDTH ($clog2(`DNUM_REQS) + `_DMEM_ADDR_RATIO_W + `DCORE_TAG_WIDTH) +`define DMEM_TAG_WIDTH `MAX((`DMEM_ADDR_WIDTH + `SM_NC_BITS), `_DNC_MEM_TAG_WIDTH) ////////////////////////// SM Configurable Knobs ////////////////////////////// @@ -312,11 +318,8 @@ // bank address offset `define SBANK_ADDR_OFFSET `CLOG2(`STACK_SIZE / `SWORD_SIZE) -// Core request size -`define SNUM_REQUESTS `NUM_THREADS - -// Core request size -`define SNUM_REQUESTS `NUM_THREADS +// Input request size +`define SNUM_REQS `NUM_THREADS ////////////////////////// L2cache Configurable Knobs ///////////////////////// @@ -324,12 +327,12 @@ `define L2CACHE_ID (32'(`L3_ENABLE) + CLUSTER_ID) // Block size in bytes -`define L2CACHE_LINE_SIZE `MEM_BLOCK_SIZE +`define L2CACHE_LINE_SIZE `MEM_BLOCK_SIZE // Word size in bytes `define L2WORD_SIZE `DCACHE_LINE_SIZE -// Core request tag bits +// Input request tag bits `define L2CORE_TAG_WIDTH (`DCORE_TAG_WIDTH + `CLOG2(`NUM_CORES)) // Memory request data bits @@ -341,8 +344,14 @@ // Memory byte enable bits `define L2MEM_BYTEEN_WIDTH `L2CACHE_LINE_SIZE +// Input request size +`define L2NUM_REQS `NUM_CORES + // Memory request tag bits -`define L2MEM_TAG_WIDTH (`L2_ENABLE ? `L2MEM_ADDR_WIDTH : (`XMEM_TAG_WIDTH+`CLOG2(`NUM_CORES))) +`define _L2MEM_ADDR_RATIO_W $clog2(`L2CACHE_LINE_SIZE / `L2WORD_SIZE) +`define _L2NC_MEM_TAG_WIDTH ($clog2(`L2NUM_REQS) + `_L2MEM_ADDR_RATIO_W + `XMEM_TAG_WIDTH) +`define _L2MEM_TAG_WIDTH `MAX((`L2MEM_ADDR_WIDTH + `SM_NC_BITS), `_L2NC_MEM_TAG_WIDTH) +`define L2MEM_TAG_WIDTH (`L2_ENABLE ? 
`_L2MEM_TAG_WIDTH : (`XMEM_TAG_WIDTH + `CLOG2(`L2NUM_REQS))) ////////////////////////// L3cache Configurable Knobs ///////////////////////// @@ -350,12 +359,12 @@ `define L3CACHE_ID 0 // Block size in bytes -`define L3CACHE_LINE_SIZE `MEM_BLOCK_SIZE +`define L3CACHE_LINE_SIZE `MEM_BLOCK_SIZE // Word size in bytes `define L3WORD_SIZE `L2CACHE_LINE_SIZE -// Core request tag bits +// Input request tag bits `define L3CORE_TAG_WIDTH (`L2CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS)) // Memory request data bits @@ -367,21 +376,28 @@ // Memory byte enable bits `define L3MEM_BYTEEN_WIDTH `L3CACHE_LINE_SIZE +// Input request size +`define L3NUM_REQS `NUM_CLUSTERS + // Memory request tag bits -`define L3MEM_TAG_WIDTH (`L3_ENABLE ? `L3MEM_ADDR_WIDTH : (`L2MEM_TAG_WIDTH+`CLOG2(`NUM_CLUSTERS))) +`define _L3MEM_ADDR_RATIO_W $clog2(`L3CACHE_LINE_SIZE / `L3WORD_SIZE) +`define _L3NC_MEM_TAG_WIDTH ($clog2(`L3NUM_REQS) + `_L3MEM_ADDR_RATIO_W + `L2MEM_TAG_WIDTH) +`define _L3MEM_TAG_WIDTH `MAX((`L3MEM_ADDR_WIDTH + `SM_NC_BITS), `_L3NC_MEM_TAG_WIDTH) +`define L3MEM_TAG_WIDTH (`L3_ENABLE ? 
`_L3MEM_TAG_WIDTH : (`L2MEM_TAG_WIDTH + `CLOG2(`L3NUM_REQS))) /////////////////////////////////////////////////////////////////////////////// -`define VX_MEM_BYTEEN_WIDTH `L3MEM_BYTEEN_WIDTH -`define VX_MEM_ADDR_WIDTH `L3MEM_ADDR_WIDTH -`define VX_MEM_LINE_WIDTH `L3MEM_LINE_WIDTH -`define VX_MEM_TAG_WIDTH `L3MEM_TAG_WIDTH +`define VX_MEM_BYTEEN_WIDTH `L3MEM_BYTEEN_WIDTH +`define VX_MEM_ADDR_WIDTH `L3MEM_ADDR_WIDTH +`define VX_MEM_LINE_WIDTH `L3MEM_LINE_WIDTH +`define VX_MEM_TAG_WIDTH `L3MEM_TAG_WIDTH `define VX_CORE_TAG_WIDTH `L3CORE_TAG_WIDTH `define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES) `define TO_FULL_ADDR(x) {x, (32-$bits(x))'(0)} -`define XMEM_TAG_WIDTH (`DMEM_TAG_WIDTH+`CLOG2(2)) +// Merged D-cache/I-cache memory tag +`define XMEM_TAG_WIDTH (`DMEM_TAG_WIDTH + `CLOG2(2)) `include "VX_types.vh" diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 97928cd6..f1f99162 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -19,8 +19,17 @@ module VX_lsu_unit #( VX_commit_if ld_commit_if, VX_commit_if st_commit_if ); + localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE); + localparam MEM_ADDRW = 32 - MEM_ASHIFT; - `UNUSED_PARAM (CORE_ID) + localparam REQ_ASHIFT = `CLOG2(`DWORD_SIZE); + localparam REQ_ADDRW = 32 - REQ_ASHIFT; + + localparam ADDR_TYPEW = 1 + `SM_ENABLE; + + `STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) + `STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) + `STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter")) wire req_valid; wire [`NUM_THREADS-1:0] req_tmask; @@ -33,29 +42,53 @@ module VX_lsu_unit #( wire [31:0] req_pc; wire req_is_dup; - wire [`NUM_THREADS-1:0][31:0] full_address; + wire [`NUM_THREADS-1:0][ADDR_TYPEW-1:0] lsu_addr_type, req_addr_type; + + wire [`NUM_THREADS-1:0][31:0] full_addr; for (genvar i = 0; i < `NUM_THREADS; i++) begin - assign full_address[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset; + assign 
full_addr[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset; + end + + wire [`NUM_THREADS-1:0][REQ_ADDRW-1:0] word_addr; + for (genvar i = 0; i < `NUM_THREADS; i++) begin + assign word_addr[i] = full_addr[i][REQ_ASHIFT +: REQ_ADDRW]; end wire [`NUM_THREADS-1:0] addr_matches; for (genvar i = 0; i < `NUM_THREADS; i++) begin - assign addr_matches[i] = (full_address[0][31:2] == full_address[i][31:2]) || ~lsu_req_if.tmask[i]; + assign addr_matches[i] = (word_addr[0] == word_addr[i]) || ~lsu_req_if.tmask[i]; end wire is_dup_load = lsu_req_if.wb && lsu_req_if.tmask[0] && (& addr_matches); + + wire [`NUM_THREADS-1:0] is_addr_sm, is_addr_nc; + + for (genvar i = 0; i < `NUM_THREADS; i++) begin + // is shared memory address + assign is_addr_sm[i] = (word_addr[i][(MEM_ASHIFT-REQ_ASHIFT) +: MEM_ADDRW] >= MEM_ADDRW'((`SMEM_BASE_ADDR - `SMEM_SIZE) >> MEM_ASHIFT)) + & (word_addr[i][(MEM_ASHIFT-REQ_ASHIFT) +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT)); + + // is non-cacheable address + assign is_addr_nc[i] = (word_addr[i][(MEM_ASHIFT-REQ_ASHIFT) +: MEM_ADDRW] >= MEM_ADDRW'(`IO_BASE_ADDR >> MEM_ASHIFT)); + + if (`SM_ENABLE) begin + assign lsu_addr_type[i] = {is_addr_sm[i], is_addr_nc[i]}; + end else begin + assign lsu_addr_type[i] = {1'b0, is_addr_nc[i]}; + end + end wire ready_in; wire stall_in = ~ready_in && req_valid; VX_pipe_register #( - .DATAW (1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + `LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) req_pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_in), - .data_in ({lsu_req_if.valid, is_dup_load, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_address, lsu_req_if.op_type, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.store_data}), - .data_out ({req_valid, req_is_dup, req_wid, req_tmask, req_pc, req_addr, req_type, req_rd, req_wb, req_data}) 
+ .data_in ({lsu_req_if.valid, is_dup_load, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.store_data}), + .data_out ({req_valid, req_is_dup, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) ); // Can accept new request? @@ -77,10 +110,10 @@ module VX_lsu_unit #( reg [`NUM_THREADS-1:0] req_sent_mask; wire req_ready_all; - wire [`DCORE_TAG_ID_BITS-1:0] mbuf_waddr, mbuf_raddr; + wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr; wire mbuf_full; - wire [`NUM_THREADS-1:0][1:0] req_offset, rsp_offset; + wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_offset, rsp_offset; for (genvar i = 0; i < `NUM_THREADS; i++) begin assign req_offset[i] = req_addr[i][1:0]; end @@ -95,10 +128,10 @@ module VX_lsu_unit #( wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); - assign mbuf_raddr = dcache_rsp_if.tag[`DCORE_TAG_ID_BITS-1:0]; + assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS]; VX_index_buffer #( - .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `LSU_BITS + (`NUM_THREADS * 2) + 1), + .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1), .SIZE (`LSUQ_SIZE) ) req_metadata ( .clk (clk), @@ -132,8 +165,8 @@ module VX_lsu_unit #( wire is_req_start = (0 == req_sent_mask); // need to hold the acquired tag index until the full request is submitted - reg [`DCORE_TAG_ID_BITS-1:0] req_tag_hold; - wire [`DCORE_TAG_ID_BITS-1:0] req_tag = is_req_start ? mbuf_waddr : req_tag_hold; + reg [`LSUQ_ADDR_BITS-1:0] req_tag_hold; + wire [`LSUQ_ADDR_BITS-1:0] req_tag = is_req_start ? 
mbuf_waddr : req_tag_hold; always @(posedge clk) begin if (mbuf_push) begin req_tag_hold <= mbuf_waddr; @@ -193,11 +226,13 @@ module VX_lsu_unit #( assign dcache_req_if.byteen = mem_req_byteen; assign dcache_req_if.data = mem_req_data; -`ifdef DBG_CACHE_REQ_INFO - assign dcache_req_if.tag = {`NUM_THREADS{req_pc, req_wid, req_tag}}; -`else - assign dcache_req_if.tag = {`NUM_THREADS{req_tag}}; -`endif + for (genvar i = 0; i < `NUM_THREADS; ++i) begin + `ifdef DBG_CACHE_REQ_INFO + assign dcache_req_if.tag[i] = {req_pc, req_wid, req_tag, req_addr_type[i]}; + `else + assign dcache_req_if.tag[i] = {req_tag, req_addr_type[i]}; + `endif + end assign ready_in = req_dep_ready && req_ready_all; @@ -293,18 +328,22 @@ module VX_lsu_unit #( if (dcache_req_if.rw[0]) begin $write("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire); `PRINT_ARRAY1D(req_addr, `NUM_THREADS); - $write(", tag=%0h, byteen=%0h, data=", dcache_req_if.tag[0], dcache_req_if.byteen); + $write(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); + `PRINT_ARRAY1D(req_addr_type, `NUM_THREADS); + $write(", data="); `PRINT_ARRAY1D(dcache_req_if.data, `NUM_THREADS); $write("\n"); end else begin $write("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire); `PRINT_ARRAY1D(req_addr, `NUM_THREADS); - $write(", tag=%0h, byteen=%0h, rd=%0d, is_dup=%b\n", dcache_req_if.tag[0], dcache_req_if.byteen, req_rd, req_is_dup); + $write(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); + `PRINT_ARRAY1D(req_addr_type, `NUM_THREADS); + $write(", rd=%0d, is_dup=%b\n", req_rd, req_is_dup); end end if (dcache_rsp_fire) begin $write("%t: D$%0d Rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=", - $time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_pc, dcache_rsp_if.tag, rsp_rd); + $time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_pc, mbuf_raddr, rsp_rd); `PRINT_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); 
$write(", is_dup=%b\n", rsp_is_dup); end diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 945a41d1..f95f8547 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -41,25 +41,25 @@ module VX_mem_unit # ( ) dcache_mem_rsp_if(), icache_mem_rsp_if(); VX_dcache_core_req_if #( - .NUM_REQS (`DNUM_REQUESTS), + .NUM_REQS (`DNUM_REQS), .WORD_SIZE (`DWORD_SIZE), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH) ) dcache_req_if(); VX_dcache_core_rsp_if #( - .NUM_REQS (`DNUM_REQUESTS), + .NUM_REQS (`DNUM_REQS), .WORD_SIZE (`DWORD_SIZE), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH) ) dcache_rsp_if(); VX_dcache_core_req_if #( - .NUM_REQS (`DNUM_REQUESTS), + .NUM_REQS (`DNUM_REQS), .WORD_SIZE (`DWORD_SIZE), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH) ) smem_req_if(); VX_dcache_core_rsp_if #( - .NUM_REQS (`DNUM_REQUESTS), + .NUM_REQS (`DNUM_REQS), .WORD_SIZE (`DWORD_SIZE), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH) ) smem_rsp_if(); @@ -108,8 +108,6 @@ module VX_mem_unit # ( .clk (clk), .reset (icache_reset), - .flush (1'b0), - // Core request .core_req_valid (icache_core_req_if.valid), .core_req_rw (1'b0), @@ -152,7 +150,7 @@ module VX_mem_unit # ( .NUM_BANKS (`DNUM_BANKS), .NUM_PORTS (`DNUM_PORTS), .WORD_SIZE (`DWORD_SIZE), - .NUM_REQS (`DNUM_REQUESTS), + .NUM_REQS (`DNUM_REQS), .CREQ_SIZE (`DCREQ_SIZE), .MSHR_SIZE (`DMSHR_SIZE), .MRSQ_SIZE (`DMRSQ_SIZE), @@ -160,15 +158,14 @@ module VX_mem_unit # ( .WRITE_ENABLE (1), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), - .MEM_TAG_WIDTH (`DMEM_TAG_WIDTH) + .MEM_TAG_WIDTH (`DMEM_TAG_WIDTH), + .NC_ENABLE (1) ) dcache ( `SCOPE_BIND_VX_mem_unit_dcache .clk (clk), .reset (dcache_reset), - .flush (1'b0), - // Core req .core_req_valid (dcache_req_if.valid), .core_req_rw (dcache_req_if.rw), @@ -219,7 +216,7 @@ module VX_mem_unit # ( .CACHE_SIZE (`SMEM_SIZE), .NUM_BANKS (`SNUM_BANKS), .WORD_SIZE (`SWORD_SIZE), - .NUM_REQS (`SNUM_REQUESTS), + .NUM_REQS (`SNUM_REQS), .CREQ_SIZE (`SCREQ_SIZE), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH), 
.CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS), diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 15a1def4..36173f8b 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -39,6 +39,7 @@ module Vortex ( output wire busy, output wire ebreak ); + `STATIC_ASSERT((`L3_ENABLE == 0 || `NUM_CLUSTERS > 1), ("invalid parameter")) wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_valid; wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_rw; @@ -168,7 +169,7 @@ module Vortex ( .CACHE_LINE_SIZE (`L3CACHE_LINE_SIZE), .NUM_BANKS (`L3NUM_BANKS), .WORD_SIZE (`L3WORD_SIZE), - .NUM_REQS (`NUM_CLUSTERS), + .NUM_REQS (`L3NUM_REQS), .CREQ_SIZE (`L3CREQ_SIZE), .MSHR_SIZE (`L3MSHR_SIZE), .MRSQ_SIZE (`L3MRSQ_SIZE), @@ -176,15 +177,14 @@ module Vortex ( .WRITE_ENABLE (1), .CORE_TAG_WIDTH (`L2MEM_TAG_WIDTH), .CORE_TAG_ID_BITS (0), - .MEM_TAG_WIDTH (`L3MEM_TAG_WIDTH) + .MEM_TAG_WIDTH (`L3MEM_TAG_WIDTH), + .NC_ENABLE (1) ) l3cache ( `SCOPE_BIND_Vortex_l3cache .clk (clk), .reset (reset), - .flush (1'b0), - `ifdef PERF_ENABLE .perf_cache_if (perf_l3cache_if), `endif @@ -267,7 +267,6 @@ module Vortex ( end `SCOPE_ASSIGN (reset, reset); - `SCOPE_ASSIGN (mem_req_fire, mem_req_valid && mem_req_ready); `SCOPE_ASSIGN (mem_req_addr, `TO_FULL_ADDR(mem_req_addr)); `SCOPE_ASSIGN (mem_req_rw, mem_req_rw); diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index f3ade5e9..b2ed57d6 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -36,30 +36,38 @@ module VX_cache #( parameter CORE_TAG_ID_BITS = CORE_TAG_WIDTH, // Memory request tag size - parameter MEM_TAG_WIDTH = (32 - $clog2(CACHE_LINE_SIZE)), + parameter MEM_TAG_WIDTH = (32 - $clog2(CACHE_LINE_SIZE)), // bank offset from beginning of index range - parameter BANK_ADDR_OFFSET = 0 + parameter BANK_ADDR_OFFSET = 0, + + // enable bypass for non-cacheable addresses + parameter NC_ENABLE = 0 ) ( - `SCOPE_IO_VX_cache + `SCOPE_IO_VX_cache + + // PERF +`ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_if, +`endif input wire clk, input wire reset, // 
Core request - input wire [NUM_REQS-1:0] core_req_valid, - input wire [NUM_REQS-1:0] core_req_rw, - input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, - input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, - input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data, - input wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag, - output wire [NUM_REQS-1:0] core_req_ready, + input wire [NUM_REQS-1:0] core_req_valid, + input wire [NUM_REQS-1:0] core_req_rw, + input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, + input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, + input wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data, + input wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag, + output wire [NUM_REQS-1:0] core_req_ready, // Core response - output wire [NUM_REQS-1:0] core_rsp_valid, - output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data, + output wire [NUM_REQS-1:0] core_rsp_valid, + output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data, output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, - input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready, + input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready, // Memory request output wire mem_req_valid, @@ -74,18 +82,206 @@ module VX_cache #( input wire mem_rsp_valid, input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data, input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag, - output wire mem_rsp_ready, - - // PERF -`ifdef PERF_ENABLE - VX_perf_cache_if perf_cache_if, -`endif - - // device flush - input wire flush + output wire mem_rsp_ready ); `STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid value")) + +`ifdef PERF_ENABLE + wire [NUM_BANKS-1:0] perf_read_miss_per_bank; + wire [NUM_BANKS-1:0] perf_write_miss_per_bank; + wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank; + wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank; +`endif + + /////////////////////////////////////////////////////////////////////////// + + // Core request + wire [NUM_REQS-1:0] core_req_valid_out; + wire 
[NUM_REQS-1:0] core_req_rw_out; + wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_out; + wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_out; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_out; + wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag_out; + wire [NUM_REQS-1:0] core_req_ready_out; + + // Core response + wire [NUM_REQS-1:0] core_rsp_valid_in; + wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_in; + wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_in; + wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready_in; + + // Memory request + wire mem_req_valid_in; + wire mem_req_rw_in; + wire [CACHE_LINE_SIZE-1:0] mem_req_byteen_in; + wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_in; + wire [`CACHE_LINE_WIDTH-1:0] mem_req_data_in; + wire [MEM_TAG_WIDTH-1:0] mem_req_tag_in; + wire mem_req_ready_in; + + // Memory response + wire mem_rsp_valid_out; + wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_out; + wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_out; + wire mem_rsp_ready_out; + + if (NC_ENABLE) begin + VX_nc_bypass #( + .NUM_REQS (NUM_REQS), + .NUM_RSP_TAGS (`CORE_REQ_TAG_COUNT), + .NC_TAG_BIT (0), + + .CORE_ADDR_WIDTH(`WORD_ADDR_WIDTH), + .CORE_DATA_SIZE (WORD_SIZE), + .CORE_TAG_WIDTH (CORE_TAG_WIDTH), + + .MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH), + .MEM_DATA_SIZE (CACHE_LINE_SIZE), + .MEM_TAG_WIDTH (MEM_TAG_WIDTH) + ) nc_bypass ( + .clk (clk), + .reset (reset), + + // Core request in + .core_req_valid_in (core_req_valid), + .core_req_rw_in (core_req_rw), + .core_req_byteen_in (core_req_byteen), + .core_req_addr_in (core_req_addr), + .core_req_data_in (core_req_data), + .core_req_tag_in (core_req_tag), + .core_req_ready_in (core_req_ready), + + // Core request out + .core_req_valid_out (core_req_valid_out), + .core_req_rw_out (core_req_rw_out), + .core_req_byteen_out(core_req_byteen_out), + .core_req_addr_out (core_req_addr_out), + .core_req_data_out (core_req_data_out), + .core_req_tag_out (core_req_tag_out), + .core_req_ready_out (core_req_ready_out), + + // 
Core response in + .core_rsp_valid_in (core_rsp_valid_in), + .core_rsp_data_in (core_rsp_data_in), + .core_rsp_tag_in (core_rsp_tag_in), + .core_rsp_ready_in (core_rsp_ready_in), + + // Core response out + .core_rsp_valid_out (core_rsp_valid), + .core_rsp_data_out (core_rsp_data), + .core_rsp_tag_out (core_rsp_tag), + .core_rsp_ready_out (core_rsp_ready), + + // Memory request in + .mem_req_valid_in (mem_req_valid_in), + .mem_req_rw_in (mem_req_rw_in), + .mem_req_byteen_in (mem_req_byteen_in), + .mem_req_addr_in (mem_req_addr_in), + .mem_req_data_in (mem_req_data_in), + .mem_req_tag_in (mem_req_tag_in), + .mem_req_ready_in (mem_req_ready_in), + + // Memory request out + .mem_req_valid_out (mem_req_valid), + .mem_req_rw_out (mem_req_rw), + .mem_req_byteen_out (mem_req_byteen), + .mem_req_addr_out (mem_req_addr), + .mem_req_data_out (mem_req_data), + .mem_req_tag_out (mem_req_tag), + .mem_req_ready_out (mem_req_ready), + + // Memory response in + .mem_rsp_valid_in (mem_rsp_valid), + .mem_rsp_data_in (mem_rsp_data), + .mem_rsp_tag_in (mem_rsp_tag), + .mem_rsp_ready_in (mem_rsp_ready), + + // Memory response out + .mem_rsp_valid_out (mem_rsp_valid_out), + .mem_rsp_data_out (mem_rsp_data_out), + .mem_rsp_tag_out (mem_rsp_tag_out), + .mem_rsp_ready_out (mem_rsp_ready_out) + ); + end else begin + assign core_req_valid_out = core_req_valid; + assign core_req_rw_out = core_req_rw; + assign core_req_addr_out = core_req_addr; + assign core_req_byteen_out = core_req_byteen; + assign core_req_data_out = core_req_data; + assign core_req_tag_out = core_req_tag; + assign core_req_ready = core_req_ready_out; + + assign core_rsp_valid = core_rsp_valid_in; + assign core_rsp_data = core_rsp_data_in; + assign core_rsp_tag = core_rsp_tag_in; + assign core_rsp_ready_in = core_rsp_ready; + + assign mem_req_valid = mem_req_valid_in; + assign mem_req_rw = mem_req_rw_in; + assign mem_req_addr = mem_req_addr_in; + assign mem_req_byteen = mem_req_byteen_in; + assign mem_req_data = 
mem_req_data_in; + assign mem_req_tag = mem_req_tag_in; + assign mem_req_ready_in = mem_req_ready; + + assign mem_rsp_valid_out = mem_rsp_valid; + assign mem_rsp_data_out = mem_rsp_data; + assign mem_rsp_tag_out = mem_rsp_tag; + assign mem_rsp_ready = mem_rsp_ready_out; + end + + /////////////////////////////////////////////////////////////////////////// + + wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_qual; + wire [`MEM_ADDR_WIDTH-1:0] mem_rsp_tag_out_a, mem_rsp_tag_qual; + + wire mrsq_full, mrsq_empty; + wire mrsq_push, mrsq_pop; + + assign mrsq_push = mem_rsp_valid_out && mem_rsp_ready_out; + assign mem_rsp_ready_out = !mrsq_full; + + // trim out shared memory and non-cacheable flags + assign mem_rsp_tag_out_a = mem_rsp_tag_out[2 +: `MEM_ADDR_WIDTH]; + + VX_fifo_queue #( + .DATAW (`MEM_ADDR_WIDTH + `CACHE_LINE_WIDTH), + .SIZE (MRSQ_SIZE), + .BUFFERED (1) + ) mem_rsp_queue ( + .clk (clk), + .reset (reset), + .push (mrsq_push), + .pop (mrsq_pop), + .data_in ({mem_rsp_tag_out_a, mem_rsp_data_out}), + .data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}), + .empty (mrsq_empty), + .full (mrsq_full), + `UNUSED_PIN (alm_full), + `UNUSED_PIN (alm_empty), + `UNUSED_PIN (size) + ); + + `UNUSED_VAR (mem_rsp_tag_out) + + /////////////////////////////////////////////////////////////////////////// + + wire [`LINE_SELECT_BITS-1:0] flush_addr; + wire flush_enable; + + VX_flush_ctrl #( + .CACHE_SIZE (CACHE_SIZE), + .CACHE_LINE_SIZE (CACHE_LINE_SIZE), + .NUM_BANKS (NUM_BANKS) + ) flush_ctrl ( + .clk (clk), + .reset (reset), + .addr_out (flush_addr), + .valid_out (flush_enable) + ); + + /////////////////////////////////////////////////////////////////////////// wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_req_valid; wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] per_bank_core_req_wsel; @@ -112,44 +308,6 @@ module VX_cache #( wire [NUM_BANKS-1:0] per_bank_mem_req_ready; wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready; - - wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_qual; - 
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_qual; - wire [`LINE_SELECT_BITS-1:0] flush_addr; - wire flush_enable; - -`ifdef PERF_ENABLE - wire [NUM_BANKS-1:0] perf_read_miss_per_bank; - wire [NUM_BANKS-1:0] perf_write_miss_per_bank; - wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank; - wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank; -`endif - - /////////////////////////////////////////////////////////////////////////// - - wire mrsq_full, mrsq_empty; - wire mrsq_push, mrsq_pop; - - assign mrsq_push = mem_rsp_valid && mem_rsp_ready; - assign mem_rsp_ready = !mrsq_full; - - VX_fifo_queue #( - .DATAW (MEM_TAG_WIDTH + `CACHE_LINE_WIDTH), - .SIZE (MRSQ_SIZE), - .BUFFERED (1) - ) mem_rsp_queue ( - .clk (clk), - .reset (reset), - .push (mrsq_push), - .pop (mrsq_pop), - .data_in ({mem_rsp_tag, mem_rsp_data}), - .data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}), - .empty (mrsq_empty), - .full (mrsq_full), - `UNUSED_PIN (alm_full), - `UNUSED_PIN (alm_empty), - `UNUSED_PIN (size) - ); if (NUM_BANKS == 1) begin `UNUSED_VAR (mem_rsp_tag_qual) @@ -158,21 +316,6 @@ module VX_cache #( assign mrsq_pop = !mrsq_empty && per_bank_mem_rsp_ready[`MEM_ADDR_BANK(mem_rsp_tag_qual)]; end - /////////////////////////////////////////////////////////////////////////// - - VX_flush_ctrl #( - .CACHE_SIZE (CACHE_SIZE), - .CACHE_LINE_SIZE (CACHE_LINE_SIZE), - .NUM_BANKS (NUM_BANKS) - ) flush_ctrl ( - .clk (clk), - .reset (reset || flush), - .addr_out (flush_addr), - .valid_out (flush_enable) - ); - - /////////////////////////////////////////////////////////////////////////// - VX_cache_core_req_bank_sel #( .CACHE_ID (CACHE_ID), .CACHE_LINE_SIZE (CACHE_LINE_SIZE), @@ -188,13 +331,13 @@ module VX_cache #( `ifdef PERF_ENABLE .bank_stalls(perf_cache_if.bank_stalls), `endif - .core_req_valid (core_req_valid), - .core_req_rw (core_req_rw), - .core_req_addr (core_req_addr), - .core_req_byteen(core_req_byteen), - .core_req_data (core_req_data), - .core_req_tag (core_req_tag), - .core_req_ready (core_req_ready), + 
.core_req_valid (core_req_valid_out), + .core_req_rw (core_req_rw_out), + .core_req_addr (core_req_addr_out), + .core_req_byteen(core_req_byteen_out), + .core_req_data (core_req_data_out), + .core_req_tag (core_req_tag_out), + .core_req_ready (core_req_ready_out), .per_bank_core_req_valid (per_bank_core_req_valid), .per_bank_core_req_rw (per_bank_core_req_rw), .per_bank_core_req_addr (per_bank_core_req_addr), @@ -365,10 +508,10 @@ module VX_cache #( .per_bank_core_rsp_tag (per_bank_core_rsp_tag), .per_bank_core_rsp_tid (per_bank_core_rsp_tid), .per_bank_core_rsp_ready (per_bank_core_rsp_ready), - .core_rsp_valid (core_rsp_valid), - .core_rsp_tag (core_rsp_tag), - .core_rsp_data (core_rsp_data), - .core_rsp_ready (core_rsp_ready) + .core_rsp_valid (core_rsp_valid_in), + .core_rsp_tag (core_rsp_tag_in), + .core_rsp_data (core_rsp_data_in), + .core_rsp_ready (core_rsp_ready_in) ); wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in; @@ -386,12 +529,13 @@ module VX_cache #( .valid_in (per_bank_mem_req_valid), .data_in (data_in), .ready_in (per_bank_mem_req_ready), - .valid_out (mem_req_valid), - .data_out ({mem_req_addr, mem_req_rw, mem_req_byteen, mem_req_data}), - .ready_out (mem_req_ready) + .valid_out (mem_req_valid_in), + .data_out ({mem_req_addr_in, mem_req_rw_in, mem_req_byteen_in, mem_req_data_in}), + .ready_out (mem_req_ready_in) ); - assign mem_req_tag = mem_req_addr; + // build memory tag adding shared memory and non-cacheable flags + assign mem_req_tag_in = MEM_TAG_WIDTH'({mem_req_addr_in, 1'b0, 1'b0}); `ifdef PERF_ENABLE // per cycle: core_reads, core_writes diff --git a/hw/rtl/cache/VX_nc_bypass.v b/hw/rtl/cache/VX_nc_bypass.v new file mode 100644 index 00000000..dd8b3da6 --- /dev/null +++ b/hw/rtl/cache/VX_nc_bypass.v @@ -0,0 +1,301 @@ +`include "VX_cache_define.vh" + +module VX_nc_bypass #( + parameter NUM_REQS = 1, + parameter NUM_RSP_TAGS = 0, + parameter NC_TAG_BIT = 0, + + parameter CORE_ADDR_WIDTH = 1, + 
parameter CORE_DATA_SIZE = 1, + parameter CORE_TAG_WIDTH = 1, + + parameter MEM_ADDR_WIDTH = 1, + parameter MEM_DATA_SIZE = 1, + parameter MEM_TAG_WIDTH = 1, + + parameter CORE_DATA_WIDTH = CORE_DATA_SIZE * 8, + parameter MEM_DATA_WIDTH = MEM_DATA_SIZE * 8 + ) ( + input wire clk, + input wire reset, + + // Core request in + input wire [NUM_REQS-1:0] core_req_valid_in, + input wire [NUM_REQS-1:0] core_req_rw_in, + input wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_in, + input wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_in, + input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_in, + input wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag_in, + output wire [NUM_REQS-1:0] core_req_ready_in, + + // Core request out + output wire [NUM_REQS-1:0] core_req_valid_out, + output wire [NUM_REQS-1:0] core_req_rw_out, + output wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_out, + output wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_out, + output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_out, + output wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag_out, + input wire [NUM_REQS-1:0] core_req_ready_out, + + // Core response in + input wire [NUM_REQS-1:0] core_rsp_valid_in, + input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_in, + input wire [NUM_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_in, + output wire [NUM_RSP_TAGS-1:0] core_rsp_ready_in, + + // Core response out + output wire [NUM_REQS-1:0] core_rsp_valid_out, + output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_out, + output wire [NUM_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_out, + input wire [NUM_RSP_TAGS-1:0] core_rsp_ready_out, + + // Memory request in + input wire mem_req_valid_in, + input wire mem_req_rw_in, + input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in, + input wire [MEM_DATA_SIZE-1:0] mem_req_byteen_in, + input wire [MEM_DATA_WIDTH-1:0] mem_req_data_in, + input wire [MEM_TAG_WIDTH-1:0] mem_req_tag_in, + output wire 
mem_req_ready_in, + + // Memory request out + output wire mem_req_valid_out, + output wire mem_req_rw_out, + output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out, + output wire [MEM_DATA_SIZE-1:0] mem_req_byteen_out, + output wire [MEM_DATA_WIDTH-1:0] mem_req_data_out, + output wire [MEM_TAG_WIDTH-1:0] mem_req_tag_out, + input wire mem_req_ready_out, + + // Memory response in + input wire mem_rsp_valid_in, + input wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_in, + input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_in, + output wire mem_rsp_ready_in, + + // Memory response out + output wire mem_rsp_valid_out, + output wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_out, + output wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_out, + input wire mem_rsp_ready_out +); + `STATIC_ASSERT((NUM_RSP_TAGS == 1 || NUM_RSP_TAGS == NUM_REQS), ("invalid paramter")) + + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + localparam CORE_REQ_TIDW = $clog2(NUM_REQS); + + localparam CORE_LDATAW = $clog2(CORE_DATA_WIDTH); + localparam MEM_LDATAW = $clog2(MEM_DATA_WIDTH); + localparam D = MEM_LDATAW - CORE_LDATAW; + localparam P = 2**D; + + // core request handling + + reg [NUM_REQS-1:0] core_req_valid_out_r; + reg [NUM_REQS-1:0] core_req_ready_in_r; + + wire [NUM_REQS-1:0] core_req_valid_in_nc; + wire [CORE_REQ_TIDW-1:0] core_req_nc_tid; + + for (genvar i = 0; i < NUM_REQS; ++i) begin + assign core_req_valid_in_nc[i] = core_req_valid_in[i] && core_req_tag_in[i][NC_TAG_BIT]; + end + + always @(*) begin + for (integer i = 0; i < NUM_REQS; ++i) begin + if (core_req_valid_in_nc[i]) begin + core_req_valid_out_r[i] = 0; + core_req_ready_in_r[i] = mem_req_ready_out && (core_req_nc_tid == CORE_REQ_TIDW'(i)); + end else begin + core_req_valid_out_r[i] = core_req_valid_in[i]; + core_req_ready_in_r[i] = core_req_ready_out[i]; + end + end + end + + assign core_req_valid_out = core_req_valid_out_r; + assign core_req_rw_out = core_req_rw_in; + assign core_req_addr_out = core_req_addr_in; + assign core_req_byteen_out = core_req_byteen_in; + 
assign core_req_data_out = core_req_data_in; + assign core_req_tag_out = core_req_tag_in; + assign core_req_ready_in = core_req_ready_in_r; + + // memory request handling + + reg mem_req_valid_out_r; + reg mem_req_rw_out_r; + reg [MEM_DATA_SIZE-1:0] mem_req_byteen_out_r; + reg [MEM_ADDR_WIDTH-1:0] mem_req_addr_out_r; + reg [MEM_DATA_WIDTH-1:0] mem_req_data_out_r; + reg [MEM_TAG_WIDTH-1:0] mem_req_tag_out_r; + reg mem_req_ready_in_r; + + wire core_req_nc_valid; + + VX_priority_encoder #( + .N (NUM_REQS) + ) core_req_sel ( + .data_in (core_req_valid_in_nc), + .index (core_req_nc_tid), + `UNUSED_PIN (onehot), + .valid_out (core_req_nc_valid) + ); + + always @(*) begin + if (core_req_nc_valid) begin + mem_req_valid_out_r = 1; + mem_req_rw_out_r = core_req_rw_in[core_req_nc_tid]; + mem_req_addr_out_r = core_req_addr_in[core_req_nc_tid][D +: MEM_ADDR_WIDTH]; + for (integer i = 0; i < P; ++i) begin + mem_req_data_out_r[i * CORE_DATA_WIDTH +: CORE_DATA_WIDTH] = core_req_data_in[core_req_nc_tid]; + end + mem_req_ready_in_r = 0; + end else begin + mem_req_valid_out_r = mem_req_valid_in; + mem_req_rw_out_r = mem_req_rw_in; + mem_req_addr_out_r = mem_req_addr_in; + mem_req_data_out_r = mem_req_data_in; + mem_req_ready_in_r = mem_req_ready_out; + end + end + + if (D != 0) begin + wire [D-1:0] req_addr_idx = core_req_addr_in[core_req_nc_tid][D-1:0]; + always @(*) begin + if (core_req_nc_valid) begin + mem_req_byteen_out_r = 0; + mem_req_byteen_out_r[req_addr_idx * CORE_DATA_SIZE +: CORE_DATA_SIZE] = core_req_byteen_in[core_req_nc_tid]; + mem_req_tag_out_r = MEM_TAG_WIDTH'({core_req_nc_tid, req_addr_idx, core_req_tag_in[core_req_nc_tid]}); + end else begin + mem_req_byteen_out_r = mem_req_byteen_in; + mem_req_tag_out_r = mem_req_tag_in; + end + end + end else begin + always @(*) begin + if (core_req_nc_valid) begin + mem_req_byteen_out_r = core_req_byteen_in[core_req_nc_tid]; + mem_req_tag_out_r = MEM_TAG_WIDTH'({core_req_nc_tid, core_req_tag_in[core_req_nc_tid]}); + end else 
begin + mem_req_byteen_out_r = mem_req_byteen_in; + mem_req_tag_out_r = mem_req_tag_in; + end + end + end + + assign mem_req_valid_out = mem_req_valid_out_r; + assign mem_req_rw_out = mem_req_rw_out_r; + assign mem_req_addr_out = mem_req_addr_out_r; + assign mem_req_byteen_out = mem_req_byteen_out_r; + assign mem_req_data_out = mem_req_data_out_r; + assign mem_req_tag_out = mem_req_tag_out_r; + assign mem_req_ready_in = mem_req_ready_in_r; + + // core response handling + + reg [NUM_REQS-1:0] core_rsp_valid_out_r; + reg [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_out_r; + reg [NUM_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_out_r; + reg [NUM_RSP_TAGS-1:0] core_rsp_ready_in_r; + + wire [CORE_REQ_TIDW-1:0] rsp_tid = mem_rsp_tag_in[(CORE_TAG_WIDTH + D) +: CORE_REQ_TIDW]; + + wire is_mem_rsp_nc = mem_rsp_valid_in && mem_rsp_tag_in[NC_TAG_BIT]; + + if (NUM_REQS > 1) begin + always @(*) begin + if (is_mem_rsp_nc) begin + core_rsp_valid_out_r = 0; + core_rsp_valid_out_r[rsp_tid] = 1; + for (integer i = 0; i < NUM_RSP_TAGS; ++i) begin + core_rsp_tag_out_r[i] = mem_rsp_tag_in[CORE_TAG_WIDTH-1:0]; + end + core_rsp_ready_in_r = 0; + end else begin + core_rsp_valid_out_r = core_rsp_valid_in; + core_rsp_tag_out_r = core_rsp_tag_in; + core_rsp_ready_in_r = core_rsp_ready_out; + end + end + end else begin + always @(*) begin + if (is_mem_rsp_nc) begin + core_rsp_valid_out_r = 1; + core_rsp_tag_out_r = mem_rsp_tag_in[CORE_TAG_WIDTH-1:0]; + core_rsp_ready_in_r = 0; + end else begin + core_rsp_valid_out_r = core_rsp_valid_in; + core_rsp_tag_out_r = core_rsp_tag_in; + core_rsp_ready_in_r = core_rsp_ready_out; + end + end + end + + if (D != 0) begin + wire [D-1:0] rsp_addr_idx = mem_rsp_tag_in[CORE_TAG_WIDTH +: D]; + always @(*) begin + if (is_mem_rsp_nc) begin + for (integer i = 0; i < NUM_REQS; ++i) begin + core_rsp_data_out_r[i] = mem_rsp_data_in[rsp_addr_idx * CORE_DATA_WIDTH +: CORE_DATA_WIDTH]; + end + end else begin + core_rsp_data_out_r = core_rsp_data_in; + end + end 
+ end else begin + always @(*) begin + if (is_mem_rsp_nc) begin + for (integer i = 0; i < NUM_REQS; ++i) begin + core_rsp_data_out_r[i] = mem_rsp_data_in; + end + end else begin + core_rsp_data_out_r = core_rsp_data_in; + end + end + end + + assign core_rsp_valid_out = core_rsp_valid_out_r; + assign core_rsp_data_out = core_rsp_data_out_r; + assign core_rsp_tag_out = core_rsp_tag_out_r; + assign core_rsp_ready_in = core_rsp_ready_in_r; + + // memory response handling + + reg mem_rsp_valid_out_r; + reg mem_rsp_ready_in_r; + + always @(*) begin + if (is_mem_rsp_nc) begin + mem_rsp_valid_out_r = 0; + end else begin + mem_rsp_valid_out_r = mem_rsp_valid_in; + end + end + + if (NUM_RSP_TAGS > 1) begin + always @(*) begin + if (is_mem_rsp_nc) begin + mem_rsp_ready_in_r = core_rsp_ready_out[rsp_tid]; + end else begin + mem_rsp_ready_in_r = mem_rsp_ready_out; + end + end + end else begin + always @(*) begin + if (is_mem_rsp_nc) begin + mem_rsp_ready_in_r = core_rsp_ready_out; + end else begin + mem_rsp_ready_in_r = mem_rsp_ready_out; + end + end + end + + assign mem_rsp_valid_out = mem_rsp_valid_out_r; + assign mem_rsp_data_out = mem_rsp_data_in; + assign mem_rsp_tag_out = mem_rsp_tag_in; + assign mem_rsp_ready_in = mem_rsp_ready_in_r; + +endmodule diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json index fea3dd02..b22c2efa 100644 --- a/hw/scripts/scope.json +++ b/hw/scripts/scope.json @@ -111,9 +111,9 @@ "!cci_pending_writes_full": 1, "?afu_mem_req_fire": 1, "afu_mem_req_addr": 26, - "afu_mem_req_tag": 28, + "afu_mem_req_tag": 30, "?afu_mem_rsp_fire": 1, - "afu_mem_rsp_tag": 28 + "afu_mem_rsp_tag": 30 }, "afu/vortex": { "!reset": 1, @@ -167,10 +167,10 @@ "dcache_req_rw": 1, "dcache_req_byteen":"`NUM_THREADS * 4", "dcache_req_data": "`NUM_THREADS * 32", - "dcache_req_tag":"`DCORE_TAG_ID_BITS", + "dcache_req_tag":"`LSUQ_ADDR_BITS", "?dcache_rsp_fire":"`NUM_THREADS", "dcache_rsp_data":"`NUM_THREADS * 32", - "dcache_rsp_tag":"`DCORE_TAG_ID_BITS" + 
"dcache_rsp_tag":"`LSUQ_ADDR_BITS" }, "afu/vortex/cluster/core/pipeline/issue": { "?issue_fire": 1, diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 25ed1992..e3964cb8 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -309,7 +309,7 @@ void Simulator::run() { } int Simulator::get_last_wb_value(int reg) const { - return (int)vortex_->Vortex->genblk1__BRA__0__KET____DOT__cluster->genblk1__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg]; + return (int)vortex_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg]; } void Simulator::load_bin(const char* program_file) { diff --git a/hw/simulate/simulator.h b/hw/simulate/simulator.h index 80e2fa42..261870d5 100644 --- a/hw/simulate/simulator.h +++ b/hw/simulate/simulator.h @@ -50,7 +50,7 @@ private: int cycles_left; std::array block; uint32_t addr; - uint32_t tag; + uint64_t tag; } mem_req_t; std::unordered_map print_bufs_; diff --git a/simX/core.cpp b/simX/core.cpp index 1f0df567..a1133196 100644 --- a/simX/core.cpp +++ b/simX/core.cpp @@ -321,8 +321,8 @@ Word Core::dcache_read(Addr addr, Size size) { ++loads_; Word data = 0; #ifdef SM_ENABLE - if ((addr >= (SHARED_MEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SHARED_MEM_BASE_ADDR)) { + if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) + && ((addr + 3) < SMEM_BASE_ADDR)) { shared_mem_.read(addr & (SMEM_SIZE-1), &data, size); return data; } @@ -334,8 +334,8 @@ Word Core::dcache_read(Addr addr, Size size) { void Core::dcache_write(Addr addr, Word data, Size size) { ++stores_; #ifdef SM_ENABLE - if ((addr >= (SHARED_MEM_BASE_ADDR - SMEM_SIZE)) - && ((addr + 3) < SHARED_MEM_BASE_ADDR)) { + if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE)) + && ((addr + 3) < SMEM_BASE_ADDR)) { shared_mem_.write(addr & (SMEM_SIZE-1), &data, size); return; }