project tests refactoring
This commit is contained in:
25
tests/Makefile
Normal file
25
tests/Makefile
Normal file
@@ -0,0 +1,25 @@
|
||||
# Top-level test Makefile: every test suite lives in its own sub-directory
# and is built by delegating to that directory's Makefile.
SUBDIRS := driver runtime regression opencl riscv

# Default goal: build every test suite.
all: $(SUBDIRS)

# Each suite name is both a directory and a goal; $@ is the directory to enter.
$(SUBDIRS):
	$(MAKE) -C $@

# Clean every suite in turn; stops at the first sub-make that fails.
clean:
	$(MAKE) -C driver clean
	$(MAKE) -C runtime clean
	$(MAKE) -C regression clean
	$(MAKE) -C opencl clean
	$(MAKE) -C riscv clean

# The suite names are real directories, so they must be phony or make would
# consider them up to date. 'clean' is phony so that a stray file named
# 'clean' cannot disable it.
.PHONY: all clean $(SUBDIRS)
|
||||
16
tests/driver/Makefile
Normal file
16
tests/driver/Makefile
Normal file
@@ -0,0 +1,16 @@
|
||||
# Driver test suites: each goal is forwarded to the 'basic' and 'demo'
# sub-directories, in that order.

# Build both driver tests.
all:
	$(MAKE) -C basic
	$(MAKE) -C demo

# Run both driver tests through their 'run-vlsim' goal.
run:
	$(MAKE) -C basic run-vlsim
	$(MAKE) -C demo run-vlsim

# Forward 'clean' to both driver tests.
clean:
	$(MAKE) -C basic clean
	$(MAKE) -C demo clean

# Forward 'clean-all' to both driver tests.
clean-all:
	$(MAKE) -C basic clean-all
	$(MAKE) -C demo clean-all

# None of these goals produce a file of the same name; declare them phony so a
# stray file called 'all', 'run', 'clean' or 'clean-all' cannot disable them.
.PHONY: all run clean clean-all
|
||||
|
||||
69
tests/driver/basic/Makefile
Normal file
69
tests/driver/basic/Makefile
Normal file
@@ -0,0 +1,69 @@
|
||||
# "basic" driver test: builds a RISC-V device kernel (kernel.elf, kernel.bin,
# kernel.dump) and a host executable, and provides run-* goals that launch the
# host executable against different driver back-ends.

# kernel.dump and .depend are written through shell redirection, which creates
# the file even when the command fails; remove such half-written targets so
# they are not mistaken for up-to-date ones.
.DELETE_ON_ERROR:

RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)

# Command-line options handed to the host executable by the run-* goals.
OPTS ?= -n256

# Cross toolchain used for the device kernel.
VX_CC  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy

VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw

VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a

VX_SRCS = kernel.c

# Host build flags (debug configuration). Optimized alternative:
#CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors

# NOTE(review): the ../../include, ../../stub, ../../opae, ../../rtlsim and
# ../../simx paths used in this file are relative to this directory - confirm
# they still point at the driver tree after the move under tests/.
CXXFLAGS += -I../../include

LDFLAGS +=

PROJECT = basic

SRCS = main.cpp

all: $(PROJECT) kernel.bin kernel.dump

# Full disassembly of the kernel image.
kernel.dump: kernel.elf
	$(VX_DP) -D $< > $@

# Raw binary image of the kernel.
kernel.bin: kernel.elf
	$(VX_CP) -O binary $< $@

kernel.elf: $(VX_SRCS)
	$(VX_CC) $(VX_CFLAGS) $^ $(VX_LDFLAGS) -o $@

# Host executable, linked against the vortex stub library.
$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../stub -lvortex -o $@

# The run-* goals differ only in the library directory put first on
# LD_LIBRARY_PATH (and in whether ASE_LOG=0 is set).
run-fpga: $(PROJECT)
	LD_LIBRARY_PATH=../../opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-asesim: $(PROJECT)
	ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-vlsim: $(PROJECT)
	ASE_LOG=0 LD_LIBRARY_PATH=../../opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-rtlsim: $(PROJECT)
	LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-simx: $(PROJECT)
	LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

# Compiler-generated header dependencies for the host sources.
.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

# Removes the host build products only.
clean:
	rm -rf $(PROJECT) *.o .depend

# Additionally removes the kernel build products.
clean-all: clean
	rm -rf *.elf *.bin *.dump

.PHONY: all run-fpga run-asesim run-vlsim run-rtlsim run-simx clean clean-all

# Do not (re)generate .depend when a cleaning goal was requested: it would be
# built only to be deleted again by the 'clean' recipe.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif
|
||||
12
tests/driver/basic/common.h
Normal file
12
tests/driver/basic/common.h
Normal file
@@ -0,0 +1,12 @@
|
||||
#ifndef _COMMON_H_
#define _COMMON_H_

// uint32_t is used below; include its definition here so this header does not
// depend on the includer having pulled in <stdint.h> first.
#include <stdint.h>

// Address at which the kernel reads its kernel_arg_t argument block.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Argument block read by the kernel. The kernel casts src_ptr and dst_ptr to
// int32_t pointers and copies 'count' elements per core from one to the other.
struct kernel_arg_t {
  uint32_t count;    // number of elements handled by each core
  uint32_t src_ptr;  // source buffer address, stored as a 32-bit integer
  uint32_t dst_ptr;  // destination buffer address, stored as a 32-bit integer
};

#endif
|
||||
BIN
tests/driver/basic/kernel.bin
Executable file
BIN
tests/driver/basic/kernel.bin
Executable file
Binary file not shown.
16
tests/driver/basic/kernel.c
Normal file
16
tests/driver/basic/kernel.c
Normal file
@@ -0,0 +1,16 @@
|
||||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include "common.h"
|
||||
|
||||
void main() {
|
||||
struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
||||
uint32_t count = arg->count;
|
||||
int32_t* src_ptr = (int32_t*)arg->src_ptr;
|
||||
int32_t* dst_ptr = (int32_t*)arg->dst_ptr;
|
||||
|
||||
uint32_t offset = vx_core_id() * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
dst_ptr[offset + i] = src_ptr[offset + i];
|
||||
}
|
||||
}
|
||||
545
tests/driver/basic/kernel.dump
Normal file
545
tests/driver/basic/kernel.dump
Normal file
@@ -0,0 +1,545 @@
|
||||
|
||||
kernel.elf: file format elf32-littleriscv
|
||||
|
||||
|
||||
Disassembly of section .init:
|
||||
|
||||
80000000 <_start>:
|
||||
80000000: 00000597 auipc a1,0x0
|
||||
80000004: 0c058593 addi a1,a1,192 # 800000c0 <vx_set_sp>
|
||||
80000008: fc102573 csrr a0,0xfc1
|
||||
8000000c: 00b5106b 0xb5106b
|
||||
80000010: 0b0000ef jal ra,800000c0 <vx_set_sp>
|
||||
80000014: 00100513 li a0,1
|
||||
80000018: 0005006b 0x5006b
|
||||
8000001c: 00002517 auipc a0,0x2
|
||||
80000020: b0050513 addi a0,a0,-1280 # 80001b1c <__BSS_END__>
|
||||
80000024: 00002617 auipc a2,0x2
|
||||
80000028: af860613 addi a2,a2,-1288 # 80001b1c <__BSS_END__>
|
||||
8000002c: 40a60633 sub a2,a2,a0
|
||||
80000030: 00000593 li a1,0
|
||||
80000034: 41c000ef jal ra,80000450 <memset>
|
||||
80000038: 00000517 auipc a0,0x0
|
||||
8000003c: 32050513 addi a0,a0,800 # 80000358 <__libc_fini_array>
|
||||
80000040: 2d0000ef jal ra,80000310 <atexit>
|
||||
80000044: 370000ef jal ra,800003b4 <__libc_init_array>
|
||||
80000048: 008000ef jal ra,80000050 <main>
|
||||
8000004c: 2d80006f j 80000324 <exit>
|
||||
|
||||
Disassembly of section .text:
|
||||
|
||||
80000050 <main>:
|
||||
80000050: 7ffff7b7 lui a5,0x7ffff
|
||||
80000054: 0007a703 lw a4,0(a5) # 7ffff000 <__stack_size+0x7fffec00>
|
||||
80000058: 0047a683 lw a3,4(a5)
|
||||
8000005c: 0087a583 lw a1,8(a5)
|
||||
80000060: cc5027f3 csrr a5,0xcc5
|
||||
80000064: 02e787b3 mul a5,a5,a4
|
||||
80000068: 02070863 beqz a4,80000098 <main+0x48>
|
||||
8000006c: 00f70733 add a4,a4,a5
|
||||
80000070: 00271713 slli a4,a4,0x2
|
||||
80000074: 00279793 slli a5,a5,0x2
|
||||
80000078: 00d787b3 add a5,a5,a3
|
||||
8000007c: 00d70733 add a4,a4,a3
|
||||
80000080: 40d585b3 sub a1,a1,a3
|
||||
80000084: 0007a603 lw a2,0(a5)
|
||||
80000088: 00f586b3 add a3,a1,a5
|
||||
8000008c: 00478793 addi a5,a5,4
|
||||
80000090: 00c6a023 sw a2,0(a3)
|
||||
80000094: fef718e3 bne a4,a5,80000084 <main+0x34>
|
||||
80000098: 00008067 ret
|
||||
|
||||
8000009c <register_fini>:
|
||||
8000009c: 00000793 li a5,0
|
||||
800000a0: 00078863 beqz a5,800000b0 <register_fini+0x14>
|
||||
800000a4: 80000537 lui a0,0x80000
|
||||
800000a8: 35850513 addi a0,a0,856 # 80000358 <__stack_top+0x81000358>
|
||||
800000ac: 2640006f j 80000310 <atexit>
|
||||
800000b0: 00008067 ret
|
||||
|
||||
800000b4 <_exit>:
|
||||
800000b4: 048000ef jal ra,800000fc <vx_perf_dump>
|
||||
800000b8: 00000513 li a0,0
|
||||
800000bc: 0005006b 0x5006b
|
||||
|
||||
800000c0 <vx_set_sp>:
|
||||
800000c0: fc002573 csrr a0,0xfc0
|
||||
800000c4: 0005006b 0x5006b
|
||||
800000c8: 00002197 auipc gp,0x2
|
||||
800000cc: e2818193 addi gp,gp,-472 # 80001ef0 <__global_pointer>
|
||||
800000d0: 7f000117 auipc sp,0x7f000
|
||||
800000d4: f3010113 addi sp,sp,-208 # ff000000 <__stack_top>
|
||||
800000d8: 40000593 li a1,1024
|
||||
800000dc: cc102673 csrr a2,0xcc1
|
||||
800000e0: 02c585b3 mul a1,a1,a2
|
||||
800000e4: 40b10133 sub sp,sp,a1
|
||||
800000e8: cc3026f3 csrr a3,0xcc3
|
||||
800000ec: 00068663 beqz a3,800000f8 <RETURN>
|
||||
800000f0: 00000513 li a0,0
|
||||
800000f4: 0005006b 0x5006b
|
||||
|
||||
800000f8 <RETURN>:
|
||||
800000f8: 00008067 ret
|
||||
|
||||
800000fc <vx_perf_dump>:
|
||||
800000fc: cc5027f3 csrr a5,0xcc5
|
||||
80000100: 00ff0737 lui a4,0xff0
|
||||
80000104: 00e787b3 add a5,a5,a4
|
||||
80000108: 00879793 slli a5,a5,0x8
|
||||
8000010c: b0002773 csrr a4,mcycle
|
||||
80000110: 00e7a023 sw a4,0(a5)
|
||||
80000114: b0102773 csrr a4,0xb01
|
||||
80000118: 00e7a223 sw a4,4(a5)
|
||||
8000011c: b0202773 csrr a4,minstret
|
||||
80000120: 00e7a423 sw a4,8(a5)
|
||||
80000124: b0302773 csrr a4,mhpmcounter3
|
||||
80000128: 00e7a623 sw a4,12(a5)
|
||||
8000012c: b0402773 csrr a4,mhpmcounter4
|
||||
80000130: 00e7a823 sw a4,16(a5)
|
||||
80000134: b0502773 csrr a4,mhpmcounter5
|
||||
80000138: 00e7aa23 sw a4,20(a5)
|
||||
8000013c: b0602773 csrr a4,mhpmcounter6
|
||||
80000140: 00e7ac23 sw a4,24(a5)
|
||||
80000144: b0702773 csrr a4,mhpmcounter7
|
||||
80000148: 00e7ae23 sw a4,28(a5)
|
||||
8000014c: b0802773 csrr a4,mhpmcounter8
|
||||
80000150: 02e7a023 sw a4,32(a5)
|
||||
80000154: b0902773 csrr a4,mhpmcounter9
|
||||
80000158: 02e7a223 sw a4,36(a5)
|
||||
8000015c: b0a02773 csrr a4,mhpmcounter10
|
||||
80000160: 02e7a423 sw a4,40(a5)
|
||||
80000164: b0b02773 csrr a4,mhpmcounter11
|
||||
80000168: 02e7a623 sw a4,44(a5)
|
||||
8000016c: b0c02773 csrr a4,mhpmcounter12
|
||||
80000170: 02e7a823 sw a4,48(a5)
|
||||
80000174: b0d02773 csrr a4,mhpmcounter13
|
||||
80000178: 02e7aa23 sw a4,52(a5)
|
||||
8000017c: b0e02773 csrr a4,mhpmcounter14
|
||||
80000180: 02e7ac23 sw a4,56(a5)
|
||||
80000184: b0f02773 csrr a4,mhpmcounter15
|
||||
80000188: 02e7ae23 sw a4,60(a5)
|
||||
8000018c: b1002773 csrr a4,mhpmcounter16
|
||||
80000190: 04e7a023 sw a4,64(a5)
|
||||
80000194: b1102773 csrr a4,mhpmcounter17
|
||||
80000198: 04e7a223 sw a4,68(a5)
|
||||
8000019c: b1202773 csrr a4,mhpmcounter18
|
||||
800001a0: 04e7a423 sw a4,72(a5)
|
||||
800001a4: b1302773 csrr a4,mhpmcounter19
|
||||
800001a8: 04e7a623 sw a4,76(a5)
|
||||
800001ac: b1402773 csrr a4,mhpmcounter20
|
||||
800001b0: 04e7a823 sw a4,80(a5)
|
||||
800001b4: b1502773 csrr a4,mhpmcounter21
|
||||
800001b8: 04e7aa23 sw a4,84(a5)
|
||||
800001bc: b1602773 csrr a4,mhpmcounter22
|
||||
800001c0: 04e7ac23 sw a4,88(a5)
|
||||
800001c4: b1702773 csrr a4,mhpmcounter23
|
||||
800001c8: 04e7ae23 sw a4,92(a5)
|
||||
800001cc: b1802773 csrr a4,mhpmcounter24
|
||||
800001d0: 06e7a023 sw a4,96(a5)
|
||||
800001d4: b1902773 csrr a4,mhpmcounter25
|
||||
800001d8: 06e7a223 sw a4,100(a5)
|
||||
800001dc: b1a02773 csrr a4,mhpmcounter26
|
||||
800001e0: 06e7a423 sw a4,104(a5)
|
||||
800001e4: b1b02773 csrr a4,mhpmcounter27
|
||||
800001e8: 06e7a623 sw a4,108(a5)
|
||||
800001ec: b1c02773 csrr a4,mhpmcounter28
|
||||
800001f0: 06e7a823 sw a4,112(a5)
|
||||
800001f4: b1d02773 csrr a4,mhpmcounter29
|
||||
800001f8: 06e7aa23 sw a4,116(a5)
|
||||
800001fc: b1e02773 csrr a4,mhpmcounter30
|
||||
80000200: 06e7ac23 sw a4,120(a5)
|
||||
80000204: b1f02773 csrr a4,mhpmcounter31
|
||||
80000208: 06e7ae23 sw a4,124(a5)
|
||||
8000020c: b8002773 csrr a4,mcycleh
|
||||
80000210: 08e7a023 sw a4,128(a5)
|
||||
80000214: b8102773 csrr a4,0xb81
|
||||
80000218: 08e7a223 sw a4,132(a5)
|
||||
8000021c: b8202773 csrr a4,minstreth
|
||||
80000220: 08e7a423 sw a4,136(a5)
|
||||
80000224: b8302773 csrr a4,mhpmcounter3h
|
||||
80000228: 08e7a623 sw a4,140(a5)
|
||||
8000022c: b8402773 csrr a4,mhpmcounter4h
|
||||
80000230: 08e7a823 sw a4,144(a5)
|
||||
80000234: b8502773 csrr a4,mhpmcounter5h
|
||||
80000238: 08e7aa23 sw a4,148(a5)
|
||||
8000023c: b8602773 csrr a4,mhpmcounter6h
|
||||
80000240: 08e7ac23 sw a4,152(a5)
|
||||
80000244: b8702773 csrr a4,mhpmcounter7h
|
||||
80000248: 08e7ae23 sw a4,156(a5)
|
||||
8000024c: b8802773 csrr a4,mhpmcounter8h
|
||||
80000250: 0ae7a023 sw a4,160(a5)
|
||||
80000254: b8902773 csrr a4,mhpmcounter9h
|
||||
80000258: 0ae7a223 sw a4,164(a5)
|
||||
8000025c: b8a02773 csrr a4,mhpmcounter10h
|
||||
80000260: 0ae7a423 sw a4,168(a5)
|
||||
80000264: b8b02773 csrr a4,mhpmcounter11h
|
||||
80000268: 0ae7a623 sw a4,172(a5)
|
||||
8000026c: b8c02773 csrr a4,mhpmcounter12h
|
||||
80000270: 0ae7a823 sw a4,176(a5)
|
||||
80000274: b8d02773 csrr a4,mhpmcounter13h
|
||||
80000278: 0ae7aa23 sw a4,180(a5)
|
||||
8000027c: b8e02773 csrr a4,mhpmcounter14h
|
||||
80000280: 0ae7ac23 sw a4,184(a5)
|
||||
80000284: b8f02773 csrr a4,mhpmcounter15h
|
||||
80000288: 0ae7ae23 sw a4,188(a5)
|
||||
8000028c: b9002773 csrr a4,mhpmcounter16h
|
||||
80000290: 0ce7a023 sw a4,192(a5)
|
||||
80000294: b9102773 csrr a4,mhpmcounter17h
|
||||
80000298: 0ce7a223 sw a4,196(a5)
|
||||
8000029c: b9202773 csrr a4,mhpmcounter18h
|
||||
800002a0: 0ce7a423 sw a4,200(a5)
|
||||
800002a4: b9302773 csrr a4,mhpmcounter19h
|
||||
800002a8: 0ce7a623 sw a4,204(a5)
|
||||
800002ac: b9402773 csrr a4,mhpmcounter20h
|
||||
800002b0: 0ce7a823 sw a4,208(a5)
|
||||
800002b4: b9502773 csrr a4,mhpmcounter21h
|
||||
800002b8: 0ce7aa23 sw a4,212(a5)
|
||||
800002bc: b9602773 csrr a4,mhpmcounter22h
|
||||
800002c0: 0ce7ac23 sw a4,216(a5)
|
||||
800002c4: b9702773 csrr a4,mhpmcounter23h
|
||||
800002c8: 0ce7ae23 sw a4,220(a5)
|
||||
800002cc: b9802773 csrr a4,mhpmcounter24h
|
||||
800002d0: 0ee7a023 sw a4,224(a5)
|
||||
800002d4: b9902773 csrr a4,mhpmcounter25h
|
||||
800002d8: 0ee7a223 sw a4,228(a5)
|
||||
800002dc: b9a02773 csrr a4,mhpmcounter26h
|
||||
800002e0: 0ee7a423 sw a4,232(a5)
|
||||
800002e4: b9b02773 csrr a4,mhpmcounter27h
|
||||
800002e8: 0ee7a623 sw a4,236(a5)
|
||||
800002ec: b9c02773 csrr a4,mhpmcounter28h
|
||||
800002f0: 0ee7a823 sw a4,240(a5)
|
||||
800002f4: b9d02773 csrr a4,mhpmcounter29h
|
||||
800002f8: 0ee7aa23 sw a4,244(a5)
|
||||
800002fc: b9e02773 csrr a4,mhpmcounter30h
|
||||
80000300: 0ee7ac23 sw a4,248(a5)
|
||||
80000304: b9f02773 csrr a4,mhpmcounter31h
|
||||
80000308: 0ee7ae23 sw a4,252(a5)
|
||||
8000030c: 00008067 ret
|
||||
|
||||
80000310 <atexit>:
|
||||
80000310: 00050593 mv a1,a0
|
||||
80000314: 00000693 li a3,0
|
||||
80000318: 00000613 li a2,0
|
||||
8000031c: 00000513 li a0,0
|
||||
80000320: 20c0006f j 8000052c <__register_exitproc>
|
||||
|
||||
80000324 <exit>:
|
||||
80000324: ff010113 addi sp,sp,-16
|
||||
80000328: 00000593 li a1,0
|
||||
8000032c: 00812423 sw s0,8(sp)
|
||||
80000330: 00112623 sw ra,12(sp)
|
||||
80000334: 00050413 mv s0,a0
|
||||
80000338: 290000ef jal ra,800005c8 <__call_exitprocs>
|
||||
8000033c: 800027b7 lui a5,0x80002
|
||||
80000340: b187a503 lw a0,-1256(a5) # 80001b18 <__stack_top+0x81001b18>
|
||||
80000344: 03c52783 lw a5,60(a0)
|
||||
80000348: 00078463 beqz a5,80000350 <exit+0x2c>
|
||||
8000034c: 000780e7 jalr a5
|
||||
80000350: 00040513 mv a0,s0
|
||||
80000354: d61ff0ef jal ra,800000b4 <_exit>
|
||||
|
||||
80000358 <__libc_fini_array>:
|
||||
80000358: ff010113 addi sp,sp,-16
|
||||
8000035c: 00812423 sw s0,8(sp)
|
||||
80000360: 800017b7 lui a5,0x80001
|
||||
80000364: 80001437 lui s0,0x80001
|
||||
80000368: 6f040413 addi s0,s0,1776 # 800016f0 <__stack_top+0x810016f0>
|
||||
8000036c: 6f078793 addi a5,a5,1776 # 800016f0 <__stack_top+0x810016f0>
|
||||
80000370: 408787b3 sub a5,a5,s0
|
||||
80000374: 00912223 sw s1,4(sp)
|
||||
80000378: 00112623 sw ra,12(sp)
|
||||
8000037c: 4027d493 srai s1,a5,0x2
|
||||
80000380: 02048063 beqz s1,800003a0 <__libc_fini_array+0x48>
|
||||
80000384: ffc78793 addi a5,a5,-4
|
||||
80000388: 00878433 add s0,a5,s0
|
||||
8000038c: 00042783 lw a5,0(s0)
|
||||
80000390: fff48493 addi s1,s1,-1
|
||||
80000394: ffc40413 addi s0,s0,-4
|
||||
80000398: 000780e7 jalr a5
|
||||
8000039c: fe0498e3 bnez s1,8000038c <__libc_fini_array+0x34>
|
||||
800003a0: 00c12083 lw ra,12(sp)
|
||||
800003a4: 00812403 lw s0,8(sp)
|
||||
800003a8: 00412483 lw s1,4(sp)
|
||||
800003ac: 01010113 addi sp,sp,16
|
||||
800003b0: 00008067 ret
|
||||
|
||||
800003b4 <__libc_init_array>:
|
||||
800003b4: ff010113 addi sp,sp,-16
|
||||
800003b8: 00812423 sw s0,8(sp)
|
||||
800003bc: 01212023 sw s2,0(sp)
|
||||
800003c0: 80001437 lui s0,0x80001
|
||||
800003c4: 80001937 lui s2,0x80001
|
||||
800003c8: 6ec40793 addi a5,s0,1772 # 800016ec <__stack_top+0x810016ec>
|
||||
800003cc: 6ec90913 addi s2,s2,1772 # 800016ec <__stack_top+0x810016ec>
|
||||
800003d0: 40f90933 sub s2,s2,a5
|
||||
800003d4: 00112623 sw ra,12(sp)
|
||||
800003d8: 00912223 sw s1,4(sp)
|
||||
800003dc: 40295913 srai s2,s2,0x2
|
||||
800003e0: 02090063 beqz s2,80000400 <__libc_init_array+0x4c>
|
||||
800003e4: 6ec40413 addi s0,s0,1772
|
||||
800003e8: 00000493 li s1,0
|
||||
800003ec: 00042783 lw a5,0(s0)
|
||||
800003f0: 00148493 addi s1,s1,1
|
||||
800003f4: 00440413 addi s0,s0,4
|
||||
800003f8: 000780e7 jalr a5
|
||||
800003fc: fe9918e3 bne s2,s1,800003ec <__libc_init_array+0x38>
|
||||
80000400: 80001437 lui s0,0x80001
|
||||
80000404: 80001937 lui s2,0x80001
|
||||
80000408: 6ec40793 addi a5,s0,1772 # 800016ec <__stack_top+0x810016ec>
|
||||
8000040c: 6f090913 addi s2,s2,1776 # 800016f0 <__stack_top+0x810016f0>
|
||||
80000410: 40f90933 sub s2,s2,a5
|
||||
80000414: 40295913 srai s2,s2,0x2
|
||||
80000418: 02090063 beqz s2,80000438 <__libc_init_array+0x84>
|
||||
8000041c: 6ec40413 addi s0,s0,1772
|
||||
80000420: 00000493 li s1,0
|
||||
80000424: 00042783 lw a5,0(s0)
|
||||
80000428: 00148493 addi s1,s1,1
|
||||
8000042c: 00440413 addi s0,s0,4
|
||||
80000430: 000780e7 jalr a5
|
||||
80000434: fe9918e3 bne s2,s1,80000424 <__libc_init_array+0x70>
|
||||
80000438: 00c12083 lw ra,12(sp)
|
||||
8000043c: 00812403 lw s0,8(sp)
|
||||
80000440: 00412483 lw s1,4(sp)
|
||||
80000444: 00012903 lw s2,0(sp)
|
||||
80000448: 01010113 addi sp,sp,16
|
||||
8000044c: 00008067 ret
|
||||
|
||||
80000450 <memset>:
|
||||
80000450: 00f00313 li t1,15
|
||||
80000454: 00050713 mv a4,a0
|
||||
80000458: 02c37e63 bgeu t1,a2,80000494 <memset+0x44>
|
||||
8000045c: 00f77793 andi a5,a4,15
|
||||
80000460: 0a079063 bnez a5,80000500 <memset+0xb0>
|
||||
80000464: 08059263 bnez a1,800004e8 <memset+0x98>
|
||||
80000468: ff067693 andi a3,a2,-16
|
||||
8000046c: 00f67613 andi a2,a2,15
|
||||
80000470: 00e686b3 add a3,a3,a4
|
||||
80000474: 00b72023 sw a1,0(a4) # ff0000 <__stack_size+0xfefc00>
|
||||
80000478: 00b72223 sw a1,4(a4)
|
||||
8000047c: 00b72423 sw a1,8(a4)
|
||||
80000480: 00b72623 sw a1,12(a4)
|
||||
80000484: 01070713 addi a4,a4,16
|
||||
80000488: fed766e3 bltu a4,a3,80000474 <memset+0x24>
|
||||
8000048c: 00061463 bnez a2,80000494 <memset+0x44>
|
||||
80000490: 00008067 ret
|
||||
80000494: 40c306b3 sub a3,t1,a2
|
||||
80000498: 00269693 slli a3,a3,0x2
|
||||
8000049c: 00000297 auipc t0,0x0
|
||||
800004a0: 005686b3 add a3,a3,t0
|
||||
800004a4: 00c68067 jr 12(a3)
|
||||
800004a8: 00b70723 sb a1,14(a4)
|
||||
800004ac: 00b706a3 sb a1,13(a4)
|
||||
800004b0: 00b70623 sb a1,12(a4)
|
||||
800004b4: 00b705a3 sb a1,11(a4)
|
||||
800004b8: 00b70523 sb a1,10(a4)
|
||||
800004bc: 00b704a3 sb a1,9(a4)
|
||||
800004c0: 00b70423 sb a1,8(a4)
|
||||
800004c4: 00b703a3 sb a1,7(a4)
|
||||
800004c8: 00b70323 sb a1,6(a4)
|
||||
800004cc: 00b702a3 sb a1,5(a4)
|
||||
800004d0: 00b70223 sb a1,4(a4)
|
||||
800004d4: 00b701a3 sb a1,3(a4)
|
||||
800004d8: 00b70123 sb a1,2(a4)
|
||||
800004dc: 00b700a3 sb a1,1(a4)
|
||||
800004e0: 00b70023 sb a1,0(a4)
|
||||
800004e4: 00008067 ret
|
||||
800004e8: 0ff5f593 andi a1,a1,255
|
||||
800004ec: 00859693 slli a3,a1,0x8
|
||||
800004f0: 00d5e5b3 or a1,a1,a3
|
||||
800004f4: 01059693 slli a3,a1,0x10
|
||||
800004f8: 00d5e5b3 or a1,a1,a3
|
||||
800004fc: f6dff06f j 80000468 <memset+0x18>
|
||||
80000500: 00279693 slli a3,a5,0x2
|
||||
80000504: 00000297 auipc t0,0x0
|
||||
80000508: 005686b3 add a3,a3,t0
|
||||
8000050c: 00008293 mv t0,ra
|
||||
80000510: fa0680e7 jalr -96(a3)
|
||||
80000514: 00028093 mv ra,t0
|
||||
80000518: ff078793 addi a5,a5,-16
|
||||
8000051c: 40f70733 sub a4,a4,a5
|
||||
80000520: 00f60633 add a2,a2,a5
|
||||
80000524: f6c378e3 bgeu t1,a2,80000494 <memset+0x44>
|
||||
80000528: f3dff06f j 80000464 <memset+0x14>
|
||||
|
||||
8000052c <__register_exitproc>:
|
||||
8000052c: 800027b7 lui a5,0x80002
|
||||
80000530: b187a703 lw a4,-1256(a5) # 80001b18 <__stack_top+0x81001b18>
|
||||
80000534: 14872783 lw a5,328(a4)
|
||||
80000538: 04078c63 beqz a5,80000590 <__register_exitproc+0x64>
|
||||
8000053c: 0047a703 lw a4,4(a5)
|
||||
80000540: 01f00813 li a6,31
|
||||
80000544: 06e84e63 blt a6,a4,800005c0 <__register_exitproc+0x94>
|
||||
80000548: 00271813 slli a6,a4,0x2
|
||||
8000054c: 02050663 beqz a0,80000578 <__register_exitproc+0x4c>
|
||||
80000550: 01078333 add t1,a5,a6
|
||||
80000554: 08c32423 sw a2,136(t1)
|
||||
80000558: 1887a883 lw a7,392(a5)
|
||||
8000055c: 00100613 li a2,1
|
||||
80000560: 00e61633 sll a2,a2,a4
|
||||
80000564: 00c8e8b3 or a7,a7,a2
|
||||
80000568: 1917a423 sw a7,392(a5)
|
||||
8000056c: 10d32423 sw a3,264(t1)
|
||||
80000570: 00200693 li a3,2
|
||||
80000574: 02d50463 beq a0,a3,8000059c <__register_exitproc+0x70>
|
||||
80000578: 00170713 addi a4,a4,1
|
||||
8000057c: 00e7a223 sw a4,4(a5)
|
||||
80000580: 010787b3 add a5,a5,a6
|
||||
80000584: 00b7a423 sw a1,8(a5)
|
||||
80000588: 00000513 li a0,0
|
||||
8000058c: 00008067 ret
|
||||
80000590: 14c70793 addi a5,a4,332
|
||||
80000594: 14f72423 sw a5,328(a4)
|
||||
80000598: fa5ff06f j 8000053c <__register_exitproc+0x10>
|
||||
8000059c: 18c7a683 lw a3,396(a5)
|
||||
800005a0: 00170713 addi a4,a4,1
|
||||
800005a4: 00e7a223 sw a4,4(a5)
|
||||
800005a8: 00c6e633 or a2,a3,a2
|
||||
800005ac: 18c7a623 sw a2,396(a5)
|
||||
800005b0: 010787b3 add a5,a5,a6
|
||||
800005b4: 00b7a423 sw a1,8(a5)
|
||||
800005b8: 00000513 li a0,0
|
||||
800005bc: 00008067 ret
|
||||
800005c0: fff00513 li a0,-1
|
||||
800005c4: 00008067 ret
|
||||
|
||||
800005c8 <__call_exitprocs>:
|
||||
800005c8: fd010113 addi sp,sp,-48
|
||||
800005cc: 800027b7 lui a5,0x80002
|
||||
800005d0: 01412c23 sw s4,24(sp)
|
||||
800005d4: b187aa03 lw s4,-1256(a5) # 80001b18 <__stack_top+0x81001b18>
|
||||
800005d8: 03212023 sw s2,32(sp)
|
||||
800005dc: 02112623 sw ra,44(sp)
|
||||
800005e0: 148a2903 lw s2,328(s4)
|
||||
800005e4: 02812423 sw s0,40(sp)
|
||||
800005e8: 02912223 sw s1,36(sp)
|
||||
800005ec: 01312e23 sw s3,28(sp)
|
||||
800005f0: 01512a23 sw s5,20(sp)
|
||||
800005f4: 01612823 sw s6,16(sp)
|
||||
800005f8: 01712623 sw s7,12(sp)
|
||||
800005fc: 01812423 sw s8,8(sp)
|
||||
80000600: 04090063 beqz s2,80000640 <__call_exitprocs+0x78>
|
||||
80000604: 00050b13 mv s6,a0
|
||||
80000608: 00058b93 mv s7,a1
|
||||
8000060c: 00100a93 li s5,1
|
||||
80000610: fff00993 li s3,-1
|
||||
80000614: 00492483 lw s1,4(s2)
|
||||
80000618: fff48413 addi s0,s1,-1
|
||||
8000061c: 02044263 bltz s0,80000640 <__call_exitprocs+0x78>
|
||||
80000620: 00249493 slli s1,s1,0x2
|
||||
80000624: 009904b3 add s1,s2,s1
|
||||
80000628: 040b8463 beqz s7,80000670 <__call_exitprocs+0xa8>
|
||||
8000062c: 1044a783 lw a5,260(s1)
|
||||
80000630: 05778063 beq a5,s7,80000670 <__call_exitprocs+0xa8>
|
||||
80000634: fff40413 addi s0,s0,-1
|
||||
80000638: ffc48493 addi s1,s1,-4
|
||||
8000063c: ff3416e3 bne s0,s3,80000628 <__call_exitprocs+0x60>
|
||||
80000640: 02c12083 lw ra,44(sp)
|
||||
80000644: 02812403 lw s0,40(sp)
|
||||
80000648: 02412483 lw s1,36(sp)
|
||||
8000064c: 02012903 lw s2,32(sp)
|
||||
80000650: 01c12983 lw s3,28(sp)
|
||||
80000654: 01812a03 lw s4,24(sp)
|
||||
80000658: 01412a83 lw s5,20(sp)
|
||||
8000065c: 01012b03 lw s6,16(sp)
|
||||
80000660: 00c12b83 lw s7,12(sp)
|
||||
80000664: 00812c03 lw s8,8(sp)
|
||||
80000668: 03010113 addi sp,sp,48
|
||||
8000066c: 00008067 ret
|
||||
80000670: 00492783 lw a5,4(s2)
|
||||
80000674: 0044a683 lw a3,4(s1)
|
||||
80000678: fff78793 addi a5,a5,-1
|
||||
8000067c: 04878e63 beq a5,s0,800006d8 <__call_exitprocs+0x110>
|
||||
80000680: 0004a223 sw zero,4(s1)
|
||||
80000684: fa0688e3 beqz a3,80000634 <__call_exitprocs+0x6c>
|
||||
80000688: 18892783 lw a5,392(s2)
|
||||
8000068c: 008a9733 sll a4,s5,s0
|
||||
80000690: 00492c03 lw s8,4(s2)
|
||||
80000694: 00f777b3 and a5,a4,a5
|
||||
80000698: 02079263 bnez a5,800006bc <__call_exitprocs+0xf4>
|
||||
8000069c: 000680e7 jalr a3
|
||||
800006a0: 00492703 lw a4,4(s2)
|
||||
800006a4: 148a2783 lw a5,328(s4)
|
||||
800006a8: 01871463 bne a4,s8,800006b0 <__call_exitprocs+0xe8>
|
||||
800006ac: f8f904e3 beq s2,a5,80000634 <__call_exitprocs+0x6c>
|
||||
800006b0: f80788e3 beqz a5,80000640 <__call_exitprocs+0x78>
|
||||
800006b4: 00078913 mv s2,a5
|
||||
800006b8: f5dff06f j 80000614 <__call_exitprocs+0x4c>
|
||||
800006bc: 18c92783 lw a5,396(s2)
|
||||
800006c0: 0844a583 lw a1,132(s1)
|
||||
800006c4: 00f77733 and a4,a4,a5
|
||||
800006c8: 00071c63 bnez a4,800006e0 <__call_exitprocs+0x118>
|
||||
800006cc: 000b0513 mv a0,s6
|
||||
800006d0: 000680e7 jalr a3
|
||||
800006d4: fcdff06f j 800006a0 <__call_exitprocs+0xd8>
|
||||
800006d8: 00892223 sw s0,4(s2)
|
||||
800006dc: fa9ff06f j 80000684 <__call_exitprocs+0xbc>
|
||||
800006e0: 00058513 mv a0,a1
|
||||
800006e4: 000680e7 jalr a3
|
||||
800006e8: fb9ff06f j 800006a0 <__call_exitprocs+0xd8>
|
||||
|
||||
Disassembly of section .init_array:
|
||||
|
||||
800016ec <__init_array_start>:
|
||||
800016ec: 009c addi a5,sp,64
|
||||
800016ee: 8000 0x8000
|
||||
|
||||
Disassembly of section .data:
|
||||
|
||||
800016f0 <impure_data>:
|
||||
800016f0: 0000 unimp
|
||||
800016f2: 0000 unimp
|
||||
800016f4: 19dc addi a5,sp,244
|
||||
800016f6: 8000 0x8000
|
||||
800016f8: 1a44 addi s1,sp,308
|
||||
800016fa: 8000 0x8000
|
||||
800016fc: 1aac addi a1,sp,376
|
||||
800016fe: 8000 0x8000
|
||||
...
|
||||
80001798: 0001 nop
|
||||
8000179a: 0000 unimp
|
||||
8000179c: 0000 unimp
|
||||
8000179e: 0000 unimp
|
||||
800017a0: 330e fld ft6,224(sp)
|
||||
800017a2: abcd j 80001d94 <__BSS_END__+0x278>
|
||||
800017a4: 1234 addi a3,sp,296
|
||||
800017a6: e66d bnez a2,80001890 <impure_data+0x1a0>
|
||||
800017a8: deec sw a1,124(a3)
|
||||
800017aa: 0005 c.nop 1
|
||||
800017ac: 0000000b 0xb
|
||||
...
|
||||
|
||||
Disassembly of section .sdata:
|
||||
|
||||
80001b18 <_global_impure_ptr>:
|
||||
80001b18: 16f0 addi a2,sp,876
|
||||
80001b1a: 8000 0x8000
|
||||
|
||||
Disassembly of section .comment:
|
||||
|
||||
00000000 <.comment>:
|
||||
0: 3a434347 fmsub.d ft6,ft6,ft4,ft7,rmm
|
||||
4: 2820 fld fs0,80(s0)
|
||||
6: 29554e47 fmsub.s ft8,fa0,fs5,ft5,rmm
|
||||
a: 3920 fld fs0,112(a0)
|
||||
c: 322e fld ft4,232(sp)
|
||||
e: 302e fld ft0,232(sp)
|
||||
...
|
||||
|
||||
Disassembly of section .riscv.attributes:
|
||||
|
||||
00000000 <.riscv.attributes>:
|
||||
0: 2541 jal 680 <__stack_size+0x280>
|
||||
2: 0000 unimp
|
||||
4: 7200 flw fs0,32(a2)
|
||||
6: 7369 lui t1,0xffffa
|
||||
8: 01007663 bgeu zero,a6,14 <__stack_usage+0x14>
|
||||
c: 0000001b 0x1b
|
||||
10: 1004 addi s1,sp,32
|
||||
12: 7205 lui tp,0xfffe1
|
||||
14: 3376 fld ft6,376(sp)
|
||||
16: 6932 flw fs2,12(sp)
|
||||
18: 7032 flw ft0,44(sp)
|
||||
1a: 5f30 lw a2,120(a4)
|
||||
1c: 326d jal fffff9c6 <__stack_top+0xfff9c6>
|
||||
1e: 3070 fld fa2,224(s0)
|
||||
20: 665f 7032 0030 0x307032665f
|
||||
BIN
tests/driver/basic/kernel.elf
Executable file
BIN
tests/driver/basic/kernel.elf
Executable file
Binary file not shown.
153
tests/driver/basic/kernel_scheduler.h
Normal file
153
tests/driver/basic/kernel_scheduler.h
Normal file
@@ -0,0 +1,153 @@
|
||||
#include <iostream>
|
||||
#include <assert.h>
|
||||
|
||||
#define NUM_CORES_MAX 32
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
// Launch context handed to a work-group function. Only num_groups and
// global_offset are read by the scheduling code in this header.
struct context_t {
  uint32_t  num_groups[3];
  uint32_t  global_offset[3];
  uint32_t  local_size[3];
  char*     printf_buffer;
  uint32_t* printf_buffer_position;
  uint32_t  printf_buffer_capacity;
  uint32_t  work_dim;
};

// Signature of a work-group function: (args, context, group_x, group_y, group_z).
using vx_pocl_workgroup_func = void (*)(const void*,
                                        const struct context_t*,
                                        uint32_t,
                                        uint32_t,
                                        uint32_t);

// Arguments shared by the spawn callbacks.
struct wspawn_args_t {
  struct context_t*      ctx;     // launch context
  vx_pocl_workgroup_func pfn;     // work-group function (set to NULL by kernel_run_once)
  const void*            args;    // opaque kernel arguments (set to NULL by kernel_run_once)
  int                    offset;  // index of the first work-group to process
  int                    N;       // base number of work-groups per thread
  int                    R;       // number of leading warps that take one extra work-group per thread
};
|
||||
|
||||
void kernel_spawn_callback(int core_id, int NW, int NT, int nW, wspawn_args_t* p_wspawn_args) {
|
||||
assert(nW <= NW);
|
||||
for (int wid = 0; wid < nW; ++wid) {
|
||||
for (int tid = 0; tid < NT; ++tid) {
|
||||
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
|
||||
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
|
||||
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
|
||||
|
||||
int X = p_wspawn_args->ctx->num_groups[0];
|
||||
int Y = p_wspawn_args->ctx->num_groups[1];
|
||||
int XY = X * Y;
|
||||
|
||||
for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) {
|
||||
int k = wg_id / XY;
|
||||
int wg_2d = wg_id - k * XY;
|
||||
int j = wg_2d / X;
|
||||
int i = wg_2d - j * X;
|
||||
|
||||
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
|
||||
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
|
||||
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
|
||||
|
||||
printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_spawn_remaining_callback(int core_id, int NW, int NT, int wid, int nT, wspawn_args_t* p_wspawn_args) {
|
||||
assert(wid < NW);
|
||||
assert(nT <= NT);
|
||||
for (int t = 0; t < nT; ++t) {
|
||||
int tid = core_id * NW * NT + wid * NT + t;
|
||||
|
||||
int wg_id = p_wspawn_args->offset + tid;
|
||||
|
||||
int X = p_wspawn_args->ctx->num_groups[0];
|
||||
int Y = p_wspawn_args->ctx->num_groups[1];
|
||||
int XY = X * Y;
|
||||
|
||||
int k = wg_id / XY;
|
||||
int wg_2d = wg_id - k * XY;
|
||||
int j = wg_2d / X;
|
||||
int i = wg_2d - j * X;
|
||||
|
||||
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
|
||||
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
|
||||
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
|
||||
|
||||
printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2);
|
||||
}
|
||||
}
|
||||
|
||||
// Computes and prints the work-group assignment of a single core.
//
//   ctx     - launch context; num_groups[0..2] give the grid size
//   NC      - number of cores available
//   NW, NT  - presumably warps per core and threads per warp (TODO confirm);
//             their product is the number of work-groups a core handles at once
//   core_id - core whose share is computed
//
// Cores with core_id >= NUM_CORES_MAX, or beyond the number of cores actually
// needed, return without doing anything. The last ACTIVE core also takes the
// work-groups left over by the integer division across cores.
void kernel_run_once(context_t* ctx, int NC, int NW, int NT, int core_id) {
  // total number of WGs
  int X = ctx->num_groups[0];
  int Y = ctx->num_groups[1];
  int Z = ctx->num_groups[2];
  int Q = X * Y * Z;

  // current core id
  if (core_id >= NUM_CORES_MAX)
    return;

  // calculate necessary active cores
  int WT = NW * NT;
  int nC = (Q > WT) ? (Q / WT) : 1;
  int nc = MIN(nC, NC);
  if (core_id >= nc)
    return; // terminate extra cores

  // number of workgroups per core
  int wgs_per_core = Q / nc;
  int wgs_per_core0 = wgs_per_core;
  // The remainder must go to the last active core (nc-1), not to core NC-1:
  // when nc < NC, core NC-1 has already returned above and the remaining
  // work-groups would never be scheduled.
  if (core_id == (nc-1)) {
    int QC_r = Q - (nc * wgs_per_core0);
    wgs_per_core0 += QC_r; // last core executes remaining WGs
  }

  // number of workgroups per warp
  int nW = wgs_per_core0 / NT;              // total warps per core
  int rT = wgs_per_core0 - (nW * NT);       // remaining threads
  int fW = (nW >= NW) ? (nW / NW) : 0;      // full warps iterations
  int rW = (fW != 0) ? (nW - fW * NW) : 0;  // remaining full warps
  if (0 == fW)
    fW = 1;

  //--
  wspawn_args_t wspawn_args = { ctx, nullptr, nullptr, core_id * wgs_per_core, fW, rW };

  //--
  if (nW >= 1) {
    int nw = MIN(nW, NW);
    kernel_spawn_callback(core_id, NW, NT, nw, &wspawn_args);
  }

  //--
  if (rT != 0) {
    // NOTE(review): this offset omits the core's base (core_id * wgs_per_core),
    // while kernel_spawn_remaining_callback adds core_id * NW * NT instead; the
    // two coincide only when wgs_per_core == NW * NT - confirm this is intended.
    wspawn_args.offset = wgs_per_core0 - rT;
    kernel_spawn_remaining_callback(core_id, NW, NT, 0, rT, &wspawn_args);
  }
}
|
||||
|
||||
void kernel_run(int X, int Y, int Z, int NC, int NW, int NT) {
|
||||
context_t ctx;
|
||||
|
||||
ctx.num_groups[0] = X;
|
||||
ctx.num_groups[1] = Y;
|
||||
ctx.num_groups[2] = Z;
|
||||
ctx.global_offset[0] = 0;
|
||||
ctx.global_offset[1] = 0;
|
||||
ctx.global_offset[2] = 0;
|
||||
|
||||
for (int cid = 0; cid < NC; ++cid) {
|
||||
kernel_run_once(&ctx, NC, NW, NT, cid);
|
||||
}
|
||||
|
||||
exit (0);
|
||||
}
|
||||
288
tests/driver/basic/main.cpp
Executable file
288
tests/driver/basic/main.cpp
Executable file
@@ -0,0 +1,288 @@
|
||||
#include <unistd.h>
#include <string.h>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <vortex.h>
#include "common.h"
#include "kernel_scheduler.h"
|
||||
|
||||
// Evaluates _expr once and stores the result in an int. Zero means success.
// Any other value prints the failing expression text and its result, calls
// cleanup() and terminates the process with exit(-1).
#define RT_CHECK(_expr)                                         \
   do {                                                         \
     int _ret = _expr;                                          \
     if (0 != _ret) {                                           \
       printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
       cleanup();                                               \
       exit(-1);                                                \
     }                                                          \
   } while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
const char* kernel_file = "kernel.bin";
|
||||
int test = -1;
|
||||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
vx_buffer_h staging_buf = nullptr;
|
||||
|
||||
// Prints the program banner followed by the accepted command-line options.
static void show_usage() {
  std::cout << "Vortex Driver Test." << std::endl
            << "Usage: [-t testno][-k: kernel][-n words][-h: help]" << std::endl;
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:t:k:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
count = atoi(optarg);
|
||||
break;
|
||||
case 't':
|
||||
test = atoi(optarg);
|
||||
break;
|
||||
case 'k':
|
||||
kernel_file = optarg;
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
if (staging_buf) {
|
||||
vx_buf_release(staging_buf);
|
||||
}
|
||||
if (device) {
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
// Derives a per-index test pattern from `value`: the value shifted left by
// the index, OR-ed with the value's own low bits below that shift.
//
// The shift amount is reduced modulo 64 and the mask is built in 64-bit
// arithmetic, so the result is well defined for every `i`. The previous
// form shifted an `int` literal and the 64-bit value by the raw index,
// which is undefined once the index reaches the operand width. Results for
// i in [0, 30] are unchanged.
uint64_t shuffle(int i, uint64_t value) {
  const unsigned shift = static_cast<unsigned>(i) & 63u;
  const uint64_t low_mask = (uint64_t(1) << shift) - 1;
  return (value << shift) | (value & low_mask);
}
|
||||
|
||||
int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
|
||||
int errors = 0;
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
int num_blocks_8 = (64 * num_blocks) / 8;
|
||||
|
||||
// update source buffer
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
((uint64_t*)vx_host_ptr(staging_buf))[i] = shuffle(i, value);
|
||||
}
|
||||
|
||||
/*for (int i = 0; i < num_blocks; ++i) {
|
||||
std::cout << "data[" << i << "]=0x";
|
||||
for (int j = 7; j >= 0; --j) {
|
||||
std::cout << std::hex << ((uint64_t*)vx_host_ptr(staging_buf))[i * 8 +j];
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}*/
|
||||
|
||||
// write source buffer to local memory
|
||||
std::cout << "write source buffer to local memory" << std::endl;
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear destination buffer
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
((uint64_t*)vx_host_ptr(staging_buf))[i] = 0;
|
||||
}
|
||||
|
||||
// read destination buffer from local memory
|
||||
std::cout << "read destination buffer from local memory" << std::endl;
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_from_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
auto curr = ((uint64_t*)vx_host_ptr(staging_buf))[i];
|
||||
auto ref = shuffle(i, value);
|
||||
if (curr != ref) {
|
||||
std::cout << "error at 0x" << std::hex << (dev_addr + 8 * i)
|
||||
<< ": actual 0x" << curr << ", expected 0x" << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double elapsed;
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
|
||||
printf("upload time: %lg ms\n", elapsed);
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
|
||||
printf("download time: %lg ms\n", elapsed);
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Total elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int run_kernel_test(const kernel_arg_t& kernel_arg,
|
||||
uint32_t buf_size,
|
||||
uint32_t num_points) {
|
||||
int errors = 0;
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// update source buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i;
|
||||
}
|
||||
}
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_ptr, buf_size, 0));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
}
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
|
||||
|
||||
// start device
|
||||
std::cout << "start execution" << std::endl;
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_start(device));
|
||||
RT_CHECK(vx_ready_wait(device, -1));
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// read destination buffer from local memory
|
||||
std::cout << "read destination buffer from local memory" << std::endl;
|
||||
auto t4 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
|
||||
auto t5 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int32_t curr = ((int32_t*)vx_host_ptr(staging_buf))[i];
|
||||
int32_t ref = i;
|
||||
if (curr != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << curr << ", expected 0x" << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double elapsed;
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
|
||||
printf("upload time: %lg ms\n", elapsed);
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
|
||||
printf("execute time: %lg ms\n", elapsed);
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t5 - t4).count();
|
||||
printf("download time: %lg ms\n", elapsed);
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Total elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
size_t value;
|
||||
kernel_arg_t kernel_arg;
|
||||
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
if (count == 0) {
|
||||
count = 1;
|
||||
}
|
||||
|
||||
//kernel_run(count, 1, 1, test, 4, 4);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
unsigned max_cores;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
|
||||
uint32_t num_points = 1 * count;
|
||||
uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64;
|
||||
uint32_t buf_size = num_blocks * 64;
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// allocate device memory
|
||||
RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
|
||||
kernel_arg.src_ptr = value;
|
||||
RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
|
||||
kernel_arg.dst_ptr = value;
|
||||
|
||||
kernel_arg.count = count;
|
||||
|
||||
std::cout << "dev_src=" << std::hex << kernel_arg.src_ptr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &staging_buf));
|
||||
|
||||
// run tests
|
||||
if (0 == test || -1 == test) {
|
||||
std::cout << "run memcopy test" << std::endl;
|
||||
RT_CHECK(run_memcopy_test(kernel_arg.src_ptr, 0x0badf00d40ff40ff, num_blocks));
|
||||
}
|
||||
|
||||
if (1 == test || -1 == test) {
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (void*)vx_host_ptr(staging_buf);
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
|
||||
}
|
||||
|
||||
std::cout << "run kernel test" << std::endl;
|
||||
RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
std::cout << "Test PASSED" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
67
tests/driver/demo/Makefile
Normal file
67
tests/driver/demo/Makefile
Normal file
@@ -0,0 +1,67 @@
|
||||
# Builds the "demo" driver test: the RISC-V kernel image
# (kernel.elf / kernel.bin / kernel.dump) and the host executable
# $(PROJECT), plus run-* targets that launch it against one backend each.

RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)

# Arguments handed to the host program by the run-* targets.
OPTS ?= -n64

VX_CC  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy

VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw

VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a

VX_SRCS = kernel.c

# Debug build of the host program; for an optimized build use
# -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors instead.
CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors

# NOTE(review): ../../include, ../../stub, ../../opae, ../../rtlsim and
# ../../simx are resolved relative to this directory; confirm they still
# point at the driver tree now that this file lives under tests/driver/demo.
CXXFLAGS += -I../../include

PROJECT = demo

SRCS = main.cpp

# Remove a target whose recipe failed (e.g. a truncated kernel.dump written
# through the shell redirect) so it is not mistaken for an up-to-date file.
.DELETE_ON_ERROR:

.PHONY: all run-fpga run-asesim run-vlsim run-rtlsim run-simx clean clean-all

all: $(PROJECT) kernel.bin kernel.dump

kernel.dump: kernel.elf
	$(VX_DP) -D $< > $@

kernel.bin: kernel.elf
	$(VX_CP) -O binary $< $@

# kernel.c includes common.h, so a change to the argument layout must
# rebuild the kernel image; only the C sources are passed to the compiler.
kernel.elf: $(VX_SRCS) common.h
	$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o $@

# .depend adds the headers as extra prerequisites of $(PROJECT); the recipe
# therefore names $(SRCS) explicitly instead of $^.
$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -L../../stub -lvortex -o $@

run-fpga: $(PROJECT)
	LD_LIBRARY_PATH=../../opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-asesim: $(PROJECT)
	ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-vlsim: $(PROJECT)
	LD_LIBRARY_PATH=../../opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-rtlsim: $(PROJECT)
	LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-simx: $(PROJECT)
	LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

# Header dependencies of the host program. -MT names $(PROJECT) as the
# target (the sources are compiled straight into it, no .o files exist) and
# -MP adds phony rules so a deleted header does not break the build.
.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM -MP -MT $(PROJECT) $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.elf *.bin *.dump

# Do not (re)generate .depend when the request is only to clean.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif
|
||||
14
tests/driver/demo/common.h
Normal file
14
tests/driver/demo/common.h
Normal file
@@ -0,0 +1,14 @@
|
||||
#ifndef _COMMON_H_
#define _COMMON_H_

/* uint32_t is used below; include it here so the header is self-contained. */
#include <stdint.h>

/* Device address from which the kernel's main() reads its kernel_arg_t. */
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

/*
 * Argument block consumed by the demo kernel. All fields are 32 bits wide;
 * the *_ptr fields hold addresses that the kernel casts to int32_t*.
 */
struct kernel_arg_t {
  uint32_t num_tasks;  /* task count passed to vx_spawn_tasks() */
  uint32_t task_size;  /* elements processed by each task */
  uint32_t src0_ptr;   /* address of the first input array */
  uint32_t src1_ptr;   /* address of the second input array */
  uint32_t dst_ptr;    /* address of the output array */
};

#endif
|
||||
BIN
tests/driver/demo/kernel.bin
Executable file
BIN
tests/driver/demo/kernel.bin
Executable file
Binary file not shown.
23
tests/driver/demo/kernel.c
Normal file
23
tests/driver/demo/kernel.c
Normal file
@@ -0,0 +1,23 @@
|
||||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, void* arg) {
|
||||
struct kernel_arg_t* _arg = (struct kernel_arg_t*)(arg);
|
||||
uint32_t count = _arg->task_size;
|
||||
int32_t* src0_ptr = (int32_t*)_arg->src0_ptr;
|
||||
int32_t* src1_ptr = (int32_t*)_arg->src1_ptr;
|
||||
int32_t* dst_ptr = (int32_t*)_arg->dst_ptr;
|
||||
|
||||
uint32_t offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
||||
vx_spawn_tasks(arg->num_tasks, kernel_body, arg);
|
||||
}
|
||||
695
tests/driver/demo/kernel.dump
Normal file
695
tests/driver/demo/kernel.dump
Normal file
@@ -0,0 +1,695 @@
|
||||
|
||||
kernel.elf: file format elf32-littleriscv
|
||||
|
||||
|
||||
Disassembly of section .init:
|
||||
|
||||
80000000 <_start>:
|
||||
80000000: 00000597 auipc a1,0x0
|
||||
80000004: 0e458593 addi a1,a1,228 # 800000e4 <vx_set_sp>
|
||||
80000008: fc102573 csrr a0,0xfc1
|
||||
8000000c: 00b5106b 0xb5106b
|
||||
80000010: 0d4000ef jal ra,800000e4 <vx_set_sp>
|
||||
80000014: 00100513 li a0,1
|
||||
80000018: 0005006b 0x5006b
|
||||
8000001c: 00002517 auipc a0,0x2
|
||||
80000020: d3050513 addi a0,a0,-720 # 80001d4c <g_wspawn_args>
|
||||
80000024: 00002617 auipc a2,0x2
|
||||
80000028: da860613 addi a2,a2,-600 # 80001dcc <__BSS_END__>
|
||||
8000002c: 40a60633 sub a2,a2,a0
|
||||
80000030: 00000593 li a1,0
|
||||
80000034: 648000ef jal ra,8000067c <memset>
|
||||
80000038: 00000517 auipc a0,0x0
|
||||
8000003c: 54c50513 addi a0,a0,1356 # 80000584 <__libc_fini_array>
|
||||
80000040: 4fc000ef jal ra,8000053c <atexit>
|
||||
80000044: 59c000ef jal ra,800005e0 <__libc_init_array>
|
||||
80000048: 008000ef jal ra,80000050 <main>
|
||||
8000004c: 5040006f j 80000550 <exit>
|
||||
|
||||
Disassembly of section .text:
|
||||
|
||||
80000050 <main>:
|
||||
80000050: 7ffff7b7 lui a5,0x7ffff
|
||||
80000054: 0007a503 lw a0,0(a5) # 7ffff000 <__stack_size+0x7fffec00>
|
||||
80000058: 800005b7 lui a1,0x80000
|
||||
8000005c: 7ffff637 lui a2,0x7ffff
|
||||
80000060: 08058593 addi a1,a1,128 # 80000080 <__stack_top+0x81000080>
|
||||
80000064: 17c0006f j 800001e0 <vx_spawn_tasks>
|
||||
|
||||
80000068 <register_fini>:
|
||||
80000068: 00000793 li a5,0
|
||||
8000006c: 00078863 beqz a5,8000007c <register_fini+0x14>
|
||||
80000070: 80000537 lui a0,0x80000
|
||||
80000074: 58450513 addi a0,a0,1412 # 80000584 <__stack_top+0x81000584>
|
||||
80000078: 4c40006f j 8000053c <atexit>
|
||||
8000007c: 00008067 ret
|
||||
|
||||
80000080 <kernel_body>:
|
||||
80000080: 0045a683 lw a3,4(a1)
|
||||
80000084: 0085a603 lw a2,8(a1)
|
||||
80000088: 00c5a703 lw a4,12(a1)
|
||||
8000008c: 02d50533 mul a0,a0,a3
|
||||
80000090: 0105a803 lw a6,16(a1)
|
||||
80000094: 04068063 beqz a3,800000d4 <kernel_body+0x54>
|
||||
80000098: 00a686b3 add a3,a3,a0
|
||||
8000009c: 00269693 slli a3,a3,0x2
|
||||
800000a0: 00251513 slli a0,a0,0x2
|
||||
800000a4: 00c507b3 add a5,a0,a2
|
||||
800000a8: 00c686b3 add a3,a3,a2
|
||||
800000ac: 40c80833 sub a6,a6,a2
|
||||
800000b0: 40c70533 sub a0,a4,a2
|
||||
800000b4: 00f50733 add a4,a0,a5
|
||||
800000b8: 0007a583 lw a1,0(a5)
|
||||
800000bc: 00072703 lw a4,0(a4)
|
||||
800000c0: 00f80633 add a2,a6,a5
|
||||
800000c4: 00478793 addi a5,a5,4
|
||||
800000c8: 00b70733 add a4,a4,a1
|
||||
800000cc: 00e62023 sw a4,0(a2) # 7ffff000 <__stack_size+0x7fffec00>
|
||||
800000d0: fef692e3 bne a3,a5,800000b4 <kernel_body+0x34>
|
||||
800000d4: 00008067 ret
|
||||
|
||||
800000d8 <_exit>:
|
||||
800000d8: 250000ef jal ra,80000328 <vx_perf_dump>
|
||||
800000dc: 00000513 li a0,0
|
||||
800000e0: 0005006b 0x5006b
|
||||
|
||||
800000e4 <vx_set_sp>:
|
||||
800000e4: fc002573 csrr a0,0xfc0
|
||||
800000e8: 0005006b 0x5006b
|
||||
800000ec: 00002197 auipc gp,0x2
|
||||
800000f0: 03418193 addi gp,gp,52 # 80002120 <__global_pointer>
|
||||
800000f4: 7f000117 auipc sp,0x7f000
|
||||
800000f8: f0c10113 addi sp,sp,-244 # ff000000 <__stack_top>
|
||||
800000fc: 40000593 li a1,1024
|
||||
80000100: cc102673 csrr a2,0xcc1
|
||||
80000104: 02c585b3 mul a1,a1,a2
|
||||
80000108: 40b10133 sub sp,sp,a1
|
||||
8000010c: cc3026f3 csrr a3,0xcc3
|
||||
80000110: 00068663 beqz a3,8000011c <RETURN>
|
||||
80000114: 00000513 li a0,0
|
||||
80000118: 0005006b 0x5006b
|
||||
|
||||
8000011c <RETURN>:
|
||||
8000011c: 00008067 ret
|
||||
|
||||
80000120 <spawn_tasks_callback>:
|
||||
80000120: fe010113 addi sp,sp,-32
|
||||
80000124: 00112e23 sw ra,28(sp)
|
||||
80000128: 00812c23 sw s0,24(sp)
|
||||
8000012c: 00912a23 sw s1,20(sp)
|
||||
80000130: 01212823 sw s2,16(sp)
|
||||
80000134: 01312623 sw s3,12(sp)
|
||||
80000138: fc0027f3 csrr a5,0xfc0
|
||||
8000013c: 0007806b 0x7806b
|
||||
80000140: cc5026f3 csrr a3,0xcc5
|
||||
80000144: cc3029f3 csrr s3,0xcc3
|
||||
80000148: cc002773 csrr a4,0xcc0
|
||||
8000014c: fc002673 csrr a2,0xfc0
|
||||
80000150: 800027b7 lui a5,0x80002
|
||||
80000154: 00269693 slli a3,a3,0x2
|
||||
80000158: d4c78793 addi a5,a5,-692 # 80001d4c <__stack_top+0x81001d4c>
|
||||
8000015c: 00d787b3 add a5,a5,a3
|
||||
80000160: 0007a483 lw s1,0(a5)
|
||||
80000164: 0104a403 lw s0,16(s1)
|
||||
80000168: 00c4a683 lw a3,12(s1)
|
||||
8000016c: 0089a933 slt s2,s3,s0
|
||||
80000170: 00040793 mv a5,s0
|
||||
80000174: 00d90933 add s2,s2,a3
|
||||
80000178: 03368433 mul s0,a3,s3
|
||||
8000017c: 00f9d463 bge s3,a5,80000184 <spawn_tasks_callback+0x64>
|
||||
80000180: 00098793 mv a5,s3
|
||||
80000184: 00f40433 add s0,s0,a5
|
||||
80000188: 0084a683 lw a3,8(s1)
|
||||
8000018c: 02c40433 mul s0,s0,a2
|
||||
80000190: 02e907b3 mul a5,s2,a4
|
||||
80000194: 00d40433 add s0,s0,a3
|
||||
80000198: 00f40433 add s0,s0,a5
|
||||
8000019c: 00890933 add s2,s2,s0
|
||||
800001a0: 01245e63 bge s0,s2,800001bc <spawn_tasks_callback+0x9c>
|
||||
800001a4: 0004a783 lw a5,0(s1)
|
||||
800001a8: 0044a583 lw a1,4(s1)
|
||||
800001ac: 00040513 mv a0,s0
|
||||
800001b0: 00140413 addi s0,s0,1
|
||||
800001b4: 000780e7 jalr a5
|
||||
800001b8: fe8916e3 bne s2,s0,800001a4 <spawn_tasks_callback+0x84>
|
||||
800001bc: 0019b993 seqz s3,s3
|
||||
800001c0: 0009806b 0x9806b
|
||||
800001c4: 01c12083 lw ra,28(sp)
|
||||
800001c8: 01812403 lw s0,24(sp)
|
||||
800001cc: 01412483 lw s1,20(sp)
|
||||
800001d0: 01012903 lw s2,16(sp)
|
||||
800001d4: 00c12983 lw s3,12(sp)
|
||||
800001d8: 02010113 addi sp,sp,32
|
||||
800001dc: 00008067 ret
|
||||
|
||||
800001e0 <vx_spawn_tasks>:
|
||||
800001e0: fc010113 addi sp,sp,-64
|
||||
800001e4: 02112e23 sw ra,60(sp)
|
||||
800001e8: 02812c23 sw s0,56(sp)
|
||||
800001ec: 02912a23 sw s1,52(sp)
|
||||
800001f0: 03212823 sw s2,48(sp)
|
||||
800001f4: 03312623 sw s3,44(sp)
|
||||
800001f8: fc2026f3 csrr a3,0xfc2
|
||||
800001fc: fc102873 csrr a6,0xfc1
|
||||
80000200: fc002473 csrr s0,0xfc0
|
||||
80000204: cc5027f3 csrr a5,0xcc5
|
||||
80000208: 01f00713 li a4,31
|
||||
8000020c: 0cf74463 blt a4,a5,800002d4 <vx_spawn_tasks+0xf4>
|
||||
80000210: 030408b3 mul a7,s0,a6
|
||||
80000214: 00100713 li a4,1
|
||||
80000218: 00a8d463 bge a7,a0,80000220 <vx_spawn_tasks+0x40>
|
||||
8000021c: 03154733 div a4,a0,a7
|
||||
80000220: 0ce6c863 blt a3,a4,800002f0 <vx_spawn_tasks+0x110>
|
||||
80000224: 0ae7d863 bge a5,a4,800002d4 <vx_spawn_tasks+0xf4>
|
||||
80000228: fff68693 addi a3,a3,-1
|
||||
8000022c: 02e54333 div t1,a0,a4
|
||||
80000230: 00030893 mv a7,t1
|
||||
80000234: 00f69663 bne a3,a5,80000240 <vx_spawn_tasks+0x60>
|
||||
80000238: 02e56533 rem a0,a0,a4
|
||||
8000023c: 006508b3 add a7,a0,t1
|
||||
80000240: 0288c4b3 div s1,a7,s0
|
||||
80000244: 0288e933 rem s2,a7,s0
|
||||
80000248: 0b04ca63 blt s1,a6,800002fc <vx_spawn_tasks+0x11c>
|
||||
8000024c: 00100693 li a3,1
|
||||
80000250: 0304c733 div a4,s1,a6
|
||||
80000254: 00070663 beqz a4,80000260 <vx_spawn_tasks+0x80>
|
||||
80000258: 00070693 mv a3,a4
|
||||
8000025c: 0304e733 rem a4,s1,a6
|
||||
80000260: 800029b7 lui s3,0x80002
|
||||
80000264: d4c98993 addi s3,s3,-692 # 80001d4c <__stack_top+0x81001d4c>
|
||||
80000268: 00e12e23 sw a4,28(sp)
|
||||
8000026c: 00c10713 addi a4,sp,12
|
||||
80000270: 00b12623 sw a1,12(sp)
|
||||
80000274: 00c12823 sw a2,16(sp)
|
||||
80000278: 00d12c23 sw a3,24(sp)
|
||||
8000027c: 02f30333 mul t1,t1,a5
|
||||
80000280: 00279793 slli a5,a5,0x2
|
||||
80000284: 00f987b3 add a5,s3,a5
|
||||
80000288: 00e7a023 sw a4,0(a5)
|
||||
8000028c: 00612a23 sw t1,20(sp)
|
||||
80000290: 06904c63 bgtz s1,80000308 <vx_spawn_tasks+0x128>
|
||||
80000294: 04090063 beqz s2,800002d4 <vx_spawn_tasks+0xf4>
|
||||
80000298: 02848433 mul s0,s1,s0
|
||||
8000029c: 00812a23 sw s0,20(sp)
|
||||
800002a0: 0009006b 0x9006b
|
||||
800002a4: cc5027f3 csrr a5,0xcc5
|
||||
800002a8: cc202573 csrr a0,0xcc2
|
||||
800002ac: 00279793 slli a5,a5,0x2
|
||||
800002b0: 00f989b3 add s3,s3,a5
|
||||
800002b4: 0009a783 lw a5,0(s3)
|
||||
800002b8: 0087a683 lw a3,8(a5)
|
||||
800002bc: 0007a703 lw a4,0(a5)
|
||||
800002c0: 0047a583 lw a1,4(a5)
|
||||
800002c4: 00d50533 add a0,a0,a3
|
||||
800002c8: 000700e7 jalr a4
|
||||
800002cc: 00100793 li a5,1
|
||||
800002d0: 0007806b 0x7806b
|
||||
800002d4: 03c12083 lw ra,60(sp)
|
||||
800002d8: 03812403 lw s0,56(sp)
|
||||
800002dc: 03412483 lw s1,52(sp)
|
||||
800002e0: 03012903 lw s2,48(sp)
|
||||
800002e4: 02c12983 lw s3,44(sp)
|
||||
800002e8: 04010113 addi sp,sp,64
|
||||
800002ec: 00008067 ret
|
||||
800002f0: 00068713 mv a4,a3
|
||||
800002f4: f2e7cae3 blt a5,a4,80000228 <vx_spawn_tasks+0x48>
|
||||
800002f8: fddff06f j 800002d4 <vx_spawn_tasks+0xf4>
|
||||
800002fc: 00000713 li a4,0
|
||||
80000300: 00100693 li a3,1
|
||||
80000304: f5dff06f j 80000260 <vx_spawn_tasks+0x80>
|
||||
80000308: 00048713 mv a4,s1
|
||||
8000030c: 00985463 bge a6,s1,80000314 <vx_spawn_tasks+0x134>
|
||||
80000310: 00080713 mv a4,a6
|
||||
80000314: 800007b7 lui a5,0x80000
|
||||
80000318: 12078793 addi a5,a5,288 # 80000120 <__stack_top+0x81000120>
|
||||
8000031c: 00f7106b 0xf7106b
|
||||
80000320: e01ff0ef jal ra,80000120 <spawn_tasks_callback>
|
||||
80000324: f71ff06f j 80000294 <vx_spawn_tasks+0xb4>
|
||||
|
||||
80000328 <vx_perf_dump>:
|
||||
80000328: cc5027f3 csrr a5,0xcc5
|
||||
8000032c: 00ff0737 lui a4,0xff0
|
||||
80000330: 00e787b3 add a5,a5,a4
|
||||
80000334: 00879793 slli a5,a5,0x8
|
||||
80000338: b0002773 csrr a4,mcycle
|
||||
8000033c: 00e7a023 sw a4,0(a5)
|
||||
80000340: b0102773 csrr a4,0xb01
|
||||
80000344: 00e7a223 sw a4,4(a5)
|
||||
80000348: b0202773 csrr a4,minstret
|
||||
8000034c: 00e7a423 sw a4,8(a5)
|
||||
80000350: b0302773 csrr a4,mhpmcounter3
|
||||
80000354: 00e7a623 sw a4,12(a5)
|
||||
80000358: b0402773 csrr a4,mhpmcounter4
|
||||
8000035c: 00e7a823 sw a4,16(a5)
|
||||
80000360: b0502773 csrr a4,mhpmcounter5
|
||||
80000364: 00e7aa23 sw a4,20(a5)
|
||||
80000368: b0602773 csrr a4,mhpmcounter6
|
||||
8000036c: 00e7ac23 sw a4,24(a5)
|
||||
80000370: b0702773 csrr a4,mhpmcounter7
|
||||
80000374: 00e7ae23 sw a4,28(a5)
|
||||
80000378: b0802773 csrr a4,mhpmcounter8
|
||||
8000037c: 02e7a023 sw a4,32(a5)
|
||||
80000380: b0902773 csrr a4,mhpmcounter9
|
||||
80000384: 02e7a223 sw a4,36(a5)
|
||||
80000388: b0a02773 csrr a4,mhpmcounter10
|
||||
8000038c: 02e7a423 sw a4,40(a5)
|
||||
80000390: b0b02773 csrr a4,mhpmcounter11
|
||||
80000394: 02e7a623 sw a4,44(a5)
|
||||
80000398: b0c02773 csrr a4,mhpmcounter12
|
||||
8000039c: 02e7a823 sw a4,48(a5)
|
||||
800003a0: b0d02773 csrr a4,mhpmcounter13
|
||||
800003a4: 02e7aa23 sw a4,52(a5)
|
||||
800003a8: b0e02773 csrr a4,mhpmcounter14
|
||||
800003ac: 02e7ac23 sw a4,56(a5)
|
||||
800003b0: b0f02773 csrr a4,mhpmcounter15
|
||||
800003b4: 02e7ae23 sw a4,60(a5)
|
||||
800003b8: b1002773 csrr a4,mhpmcounter16
|
||||
800003bc: 04e7a023 sw a4,64(a5)
|
||||
800003c0: b1102773 csrr a4,mhpmcounter17
|
||||
800003c4: 04e7a223 sw a4,68(a5)
|
||||
800003c8: b1202773 csrr a4,mhpmcounter18
|
||||
800003cc: 04e7a423 sw a4,72(a5)
|
||||
800003d0: b1302773 csrr a4,mhpmcounter19
|
||||
800003d4: 04e7a623 sw a4,76(a5)
|
||||
800003d8: b1402773 csrr a4,mhpmcounter20
|
||||
800003dc: 04e7a823 sw a4,80(a5)
|
||||
800003e0: b1502773 csrr a4,mhpmcounter21
|
||||
800003e4: 04e7aa23 sw a4,84(a5)
|
||||
800003e8: b1602773 csrr a4,mhpmcounter22
|
||||
800003ec: 04e7ac23 sw a4,88(a5)
|
||||
800003f0: b1702773 csrr a4,mhpmcounter23
|
||||
800003f4: 04e7ae23 sw a4,92(a5)
|
||||
800003f8: b1802773 csrr a4,mhpmcounter24
|
||||
800003fc: 06e7a023 sw a4,96(a5)
|
||||
80000400: b1902773 csrr a4,mhpmcounter25
|
||||
80000404: 06e7a223 sw a4,100(a5)
|
||||
80000408: b1a02773 csrr a4,mhpmcounter26
|
||||
8000040c: 06e7a423 sw a4,104(a5)
|
||||
80000410: b1b02773 csrr a4,mhpmcounter27
|
||||
80000414: 06e7a623 sw a4,108(a5)
|
||||
80000418: b1c02773 csrr a4,mhpmcounter28
|
||||
8000041c: 06e7a823 sw a4,112(a5)
|
||||
80000420: b1d02773 csrr a4,mhpmcounter29
|
||||
80000424: 06e7aa23 sw a4,116(a5)
|
||||
80000428: b1e02773 csrr a4,mhpmcounter30
|
||||
8000042c: 06e7ac23 sw a4,120(a5)
|
||||
80000430: b1f02773 csrr a4,mhpmcounter31
|
||||
80000434: 06e7ae23 sw a4,124(a5)
|
||||
80000438: b8002773 csrr a4,mcycleh
|
||||
8000043c: 08e7a023 sw a4,128(a5)
|
||||
80000440: b8102773 csrr a4,0xb81
|
||||
80000444: 08e7a223 sw a4,132(a5)
|
||||
80000448: b8202773 csrr a4,minstreth
|
||||
8000044c: 08e7a423 sw a4,136(a5)
|
||||
80000450: b8302773 csrr a4,mhpmcounter3h
|
||||
80000454: 08e7a623 sw a4,140(a5)
|
||||
80000458: b8402773 csrr a4,mhpmcounter4h
|
||||
8000045c: 08e7a823 sw a4,144(a5)
|
||||
80000460: b8502773 csrr a4,mhpmcounter5h
|
||||
80000464: 08e7aa23 sw a4,148(a5)
|
||||
80000468: b8602773 csrr a4,mhpmcounter6h
|
||||
8000046c: 08e7ac23 sw a4,152(a5)
|
||||
80000470: b8702773 csrr a4,mhpmcounter7h
|
||||
80000474: 08e7ae23 sw a4,156(a5)
|
||||
80000478: b8802773 csrr a4,mhpmcounter8h
|
||||
8000047c: 0ae7a023 sw a4,160(a5)
|
||||
80000480: b8902773 csrr a4,mhpmcounter9h
|
||||
80000484: 0ae7a223 sw a4,164(a5)
|
||||
80000488: b8a02773 csrr a4,mhpmcounter10h
|
||||
8000048c: 0ae7a423 sw a4,168(a5)
|
||||
80000490: b8b02773 csrr a4,mhpmcounter11h
|
||||
80000494: 0ae7a623 sw a4,172(a5)
|
||||
80000498: b8c02773 csrr a4,mhpmcounter12h
|
||||
8000049c: 0ae7a823 sw a4,176(a5)
|
||||
800004a0: b8d02773 csrr a4,mhpmcounter13h
|
||||
800004a4: 0ae7aa23 sw a4,180(a5)
|
||||
800004a8: b8e02773 csrr a4,mhpmcounter14h
|
||||
800004ac: 0ae7ac23 sw a4,184(a5)
|
||||
800004b0: b8f02773 csrr a4,mhpmcounter15h
|
||||
800004b4: 0ae7ae23 sw a4,188(a5)
|
||||
800004b8: b9002773 csrr a4,mhpmcounter16h
|
||||
800004bc: 0ce7a023 sw a4,192(a5)
|
||||
800004c0: b9102773 csrr a4,mhpmcounter17h
|
||||
800004c4: 0ce7a223 sw a4,196(a5)
|
||||
800004c8: b9202773 csrr a4,mhpmcounter18h
|
||||
800004cc: 0ce7a423 sw a4,200(a5)
|
||||
800004d0: b9302773 csrr a4,mhpmcounter19h
|
||||
800004d4: 0ce7a623 sw a4,204(a5)
|
||||
800004d8: b9402773 csrr a4,mhpmcounter20h
|
||||
800004dc: 0ce7a823 sw a4,208(a5)
|
||||
800004e0: b9502773 csrr a4,mhpmcounter21h
|
||||
800004e4: 0ce7aa23 sw a4,212(a5)
|
||||
800004e8: b9602773 csrr a4,mhpmcounter22h
|
||||
800004ec: 0ce7ac23 sw a4,216(a5)
|
||||
800004f0: b9702773 csrr a4,mhpmcounter23h
|
||||
800004f4: 0ce7ae23 sw a4,220(a5)
|
||||
800004f8: b9802773 csrr a4,mhpmcounter24h
|
||||
800004fc: 0ee7a023 sw a4,224(a5)
|
||||
80000500: b9902773 csrr a4,mhpmcounter25h
|
||||
80000504: 0ee7a223 sw a4,228(a5)
|
||||
80000508: b9a02773 csrr a4,mhpmcounter26h
|
||||
8000050c: 0ee7a423 sw a4,232(a5)
|
||||
80000510: b9b02773 csrr a4,mhpmcounter27h
|
||||
80000514: 0ee7a623 sw a4,236(a5)
|
||||
80000518: b9c02773 csrr a4,mhpmcounter28h
|
||||
8000051c: 0ee7a823 sw a4,240(a5)
|
||||
80000520: b9d02773 csrr a4,mhpmcounter29h
|
||||
80000524: 0ee7aa23 sw a4,244(a5)
|
||||
80000528: b9e02773 csrr a4,mhpmcounter30h
|
||||
8000052c: 0ee7ac23 sw a4,248(a5)
|
||||
80000530: b9f02773 csrr a4,mhpmcounter31h
|
||||
80000534: 0ee7ae23 sw a4,252(a5)
|
||||
80000538: 00008067 ret
|
||||
|
||||
8000053c <atexit>:
|
||||
8000053c: 00050593 mv a1,a0
|
||||
80000540: 00000693 li a3,0
|
||||
80000544: 00000613 li a2,0
|
||||
80000548: 00000513 li a0,0
|
||||
8000054c: 20c0006f j 80000758 <__register_exitproc>
|
||||
|
||||
80000550 <exit>:
|
||||
80000550: ff010113 addi sp,sp,-16
|
||||
80000554: 00000593 li a1,0
|
||||
80000558: 00812423 sw s0,8(sp)
|
||||
8000055c: 00112623 sw ra,12(sp)
|
||||
80000560: 00050413 mv s0,a0
|
||||
80000564: 290000ef jal ra,800007f4 <__call_exitprocs>
|
||||
80000568: 800027b7 lui a5,0x80002
|
||||
8000056c: d487a503 lw a0,-696(a5) # 80001d48 <__stack_top+0x81001d48>
|
||||
80000570: 03c52783 lw a5,60(a0)
|
||||
80000574: 00078463 beqz a5,8000057c <exit+0x2c>
|
||||
80000578: 000780e7 jalr a5
|
||||
8000057c: 00040513 mv a0,s0
|
||||
80000580: b59ff0ef jal ra,800000d8 <_exit>
|
||||
|
||||
80000584 <__libc_fini_array>:
|
||||
80000584: ff010113 addi sp,sp,-16
|
||||
80000588: 00812423 sw s0,8(sp)
|
||||
8000058c: 800027b7 lui a5,0x80002
|
||||
80000590: 80002437 lui s0,0x80002
|
||||
80000594: 91c40413 addi s0,s0,-1764 # 8000191c <__stack_top+0x8100191c>
|
||||
80000598: 91c78793 addi a5,a5,-1764 # 8000191c <__stack_top+0x8100191c>
|
||||
8000059c: 408787b3 sub a5,a5,s0
|
||||
800005a0: 00912223 sw s1,4(sp)
|
||||
800005a4: 00112623 sw ra,12(sp)
|
||||
800005a8: 4027d493 srai s1,a5,0x2
|
||||
800005ac: 02048063 beqz s1,800005cc <__libc_fini_array+0x48>
|
||||
800005b0: ffc78793 addi a5,a5,-4
|
||||
800005b4: 00878433 add s0,a5,s0
|
||||
800005b8: 00042783 lw a5,0(s0)
|
||||
800005bc: fff48493 addi s1,s1,-1
|
||||
800005c0: ffc40413 addi s0,s0,-4
|
||||
800005c4: 000780e7 jalr a5
|
||||
800005c8: fe0498e3 bnez s1,800005b8 <__libc_fini_array+0x34>
|
||||
800005cc: 00c12083 lw ra,12(sp)
|
||||
800005d0: 00812403 lw s0,8(sp)
|
||||
800005d4: 00412483 lw s1,4(sp)
|
||||
800005d8: 01010113 addi sp,sp,16
|
||||
800005dc: 00008067 ret
|
||||
|
||||
800005e0 <__libc_init_array>:
|
||||
800005e0: ff010113 addi sp,sp,-16
|
||||
800005e4: 00812423 sw s0,8(sp)
|
||||
800005e8: 01212023 sw s2,0(sp)
|
||||
800005ec: 80002437 lui s0,0x80002
|
||||
800005f0: 80002937 lui s2,0x80002
|
||||
800005f4: 91840793 addi a5,s0,-1768 # 80001918 <__stack_top+0x81001918>
|
||||
800005f8: 91890913 addi s2,s2,-1768 # 80001918 <__stack_top+0x81001918>
|
||||
800005fc: 40f90933 sub s2,s2,a5
|
||||
80000600: 00112623 sw ra,12(sp)
|
||||
80000604: 00912223 sw s1,4(sp)
|
||||
80000608: 40295913 srai s2,s2,0x2
|
||||
8000060c: 02090063 beqz s2,8000062c <__libc_init_array+0x4c>
|
||||
80000610: 91840413 addi s0,s0,-1768
|
||||
80000614: 00000493 li s1,0
|
||||
80000618: 00042783 lw a5,0(s0)
|
||||
8000061c: 00148493 addi s1,s1,1
|
||||
80000620: 00440413 addi s0,s0,4
|
||||
80000624: 000780e7 jalr a5
|
||||
80000628: fe9918e3 bne s2,s1,80000618 <__libc_init_array+0x38>
|
||||
8000062c: 80002437 lui s0,0x80002
|
||||
80000630: 80002937 lui s2,0x80002
|
||||
80000634: 91840793 addi a5,s0,-1768 # 80001918 <__stack_top+0x81001918>
|
||||
80000638: 91c90913 addi s2,s2,-1764 # 8000191c <__stack_top+0x8100191c>
|
||||
8000063c: 40f90933 sub s2,s2,a5
|
||||
80000640: 40295913 srai s2,s2,0x2
|
||||
80000644: 02090063 beqz s2,80000664 <__libc_init_array+0x84>
|
||||
80000648: 91840413 addi s0,s0,-1768
|
||||
8000064c: 00000493 li s1,0
|
||||
80000650: 00042783 lw a5,0(s0)
|
||||
80000654: 00148493 addi s1,s1,1
|
||||
80000658: 00440413 addi s0,s0,4
|
||||
8000065c: 000780e7 jalr a5
|
||||
80000660: fe9918e3 bne s2,s1,80000650 <__libc_init_array+0x70>
|
||||
80000664: 00c12083 lw ra,12(sp)
|
||||
80000668: 00812403 lw s0,8(sp)
|
||||
8000066c: 00412483 lw s1,4(sp)
|
||||
80000670: 00012903 lw s2,0(sp)
|
||||
80000674: 01010113 addi sp,sp,16
|
||||
80000678: 00008067 ret
|
||||
|
||||
8000067c <memset>:
|
||||
8000067c: 00f00313 li t1,15
|
||||
80000680: 00050713 mv a4,a0
|
||||
80000684: 02c37e63 bgeu t1,a2,800006c0 <memset+0x44>
|
||||
80000688: 00f77793 andi a5,a4,15
|
||||
8000068c: 0a079063 bnez a5,8000072c <memset+0xb0>
|
||||
80000690: 08059263 bnez a1,80000714 <memset+0x98>
|
||||
80000694: ff067693 andi a3,a2,-16
|
||||
80000698: 00f67613 andi a2,a2,15
|
||||
8000069c: 00e686b3 add a3,a3,a4
|
||||
800006a0: 00b72023 sw a1,0(a4) # ff0000 <__stack_size+0xfefc00>
|
||||
800006a4: 00b72223 sw a1,4(a4)
|
||||
800006a8: 00b72423 sw a1,8(a4)
|
||||
800006ac: 00b72623 sw a1,12(a4)
|
||||
800006b0: 01070713 addi a4,a4,16
|
||||
800006b4: fed766e3 bltu a4,a3,800006a0 <memset+0x24>
|
||||
800006b8: 00061463 bnez a2,800006c0 <memset+0x44>
|
||||
800006bc: 00008067 ret
|
||||
800006c0: 40c306b3 sub a3,t1,a2
|
||||
800006c4: 00269693 slli a3,a3,0x2
|
||||
800006c8: 00000297 auipc t0,0x0
|
||||
800006cc: 005686b3 add a3,a3,t0
|
||||
800006d0: 00c68067 jr 12(a3)
|
||||
800006d4: 00b70723 sb a1,14(a4)
|
||||
800006d8: 00b706a3 sb a1,13(a4)
|
||||
800006dc: 00b70623 sb a1,12(a4)
|
||||
800006e0: 00b705a3 sb a1,11(a4)
|
||||
800006e4: 00b70523 sb a1,10(a4)
|
||||
800006e8: 00b704a3 sb a1,9(a4)
|
||||
800006ec: 00b70423 sb a1,8(a4)
|
||||
800006f0: 00b703a3 sb a1,7(a4)
|
||||
800006f4: 00b70323 sb a1,6(a4)
|
||||
800006f8: 00b702a3 sb a1,5(a4)
|
||||
800006fc: 00b70223 sb a1,4(a4)
|
||||
80000700: 00b701a3 sb a1,3(a4)
|
||||
80000704: 00b70123 sb a1,2(a4)
|
||||
80000708: 00b700a3 sb a1,1(a4)
|
||||
8000070c: 00b70023 sb a1,0(a4)
|
||||
80000710: 00008067 ret
|
||||
80000714: 0ff5f593 andi a1,a1,255
|
||||
80000718: 00859693 slli a3,a1,0x8
|
||||
8000071c: 00d5e5b3 or a1,a1,a3
|
||||
80000720: 01059693 slli a3,a1,0x10
|
||||
80000724: 00d5e5b3 or a1,a1,a3
|
||||
80000728: f6dff06f j 80000694 <memset+0x18>
|
||||
8000072c: 00279693 slli a3,a5,0x2
|
||||
80000730: 00000297 auipc t0,0x0
|
||||
80000734: 005686b3 add a3,a3,t0
|
||||
80000738: 00008293 mv t0,ra
|
||||
8000073c: fa0680e7 jalr -96(a3)
|
||||
80000740: 00028093 mv ra,t0
|
||||
80000744: ff078793 addi a5,a5,-16
|
||||
80000748: 40f70733 sub a4,a4,a5
|
||||
8000074c: 00f60633 add a2,a2,a5
|
||||
80000750: f6c378e3 bgeu t1,a2,800006c0 <memset+0x44>
|
||||
80000754: f3dff06f j 80000690 <memset+0x14>
|
||||
|
||||
80000758 <__register_exitproc>:
|
||||
80000758: 800027b7 lui a5,0x80002
|
||||
8000075c: d487a703 lw a4,-696(a5) # 80001d48 <__stack_top+0x81001d48>
|
||||
80000760: 14872783 lw a5,328(a4)
|
||||
80000764: 04078c63 beqz a5,800007bc <__register_exitproc+0x64>
|
||||
80000768: 0047a703 lw a4,4(a5)
|
||||
8000076c: 01f00813 li a6,31
|
||||
80000770: 06e84e63 blt a6,a4,800007ec <__register_exitproc+0x94>
|
||||
80000774: 00271813 slli a6,a4,0x2
|
||||
80000778: 02050663 beqz a0,800007a4 <__register_exitproc+0x4c>
|
||||
8000077c: 01078333 add t1,a5,a6
|
||||
80000780: 08c32423 sw a2,136(t1)
|
||||
80000784: 1887a883 lw a7,392(a5)
|
||||
80000788: 00100613 li a2,1
|
||||
8000078c: 00e61633 sll a2,a2,a4
|
||||
80000790: 00c8e8b3 or a7,a7,a2
|
||||
80000794: 1917a423 sw a7,392(a5)
|
||||
80000798: 10d32423 sw a3,264(t1)
|
||||
8000079c: 00200693 li a3,2
|
||||
800007a0: 02d50463 beq a0,a3,800007c8 <__register_exitproc+0x70>
|
||||
800007a4: 00170713 addi a4,a4,1
|
||||
800007a8: 00e7a223 sw a4,4(a5)
|
||||
800007ac: 010787b3 add a5,a5,a6
|
||||
800007b0: 00b7a423 sw a1,8(a5)
|
||||
800007b4: 00000513 li a0,0
|
||||
800007b8: 00008067 ret
|
||||
800007bc: 14c70793 addi a5,a4,332
|
||||
800007c0: 14f72423 sw a5,328(a4)
|
||||
800007c4: fa5ff06f j 80000768 <__register_exitproc+0x10>
|
||||
800007c8: 18c7a683 lw a3,396(a5)
|
||||
800007cc: 00170713 addi a4,a4,1
|
||||
800007d0: 00e7a223 sw a4,4(a5)
|
||||
800007d4: 00c6e633 or a2,a3,a2
|
||||
800007d8: 18c7a623 sw a2,396(a5)
|
||||
800007dc: 010787b3 add a5,a5,a6
|
||||
800007e0: 00b7a423 sw a1,8(a5)
|
||||
800007e4: 00000513 li a0,0
|
||||
800007e8: 00008067 ret
|
||||
800007ec: fff00513 li a0,-1
|
||||
800007f0: 00008067 ret
|
||||
|
||||
800007f4 <__call_exitprocs>:
|
||||
800007f4: fd010113 addi sp,sp,-48
|
||||
800007f8: 800027b7 lui a5,0x80002
|
||||
800007fc: 01412c23 sw s4,24(sp)
|
||||
80000800: d487aa03 lw s4,-696(a5) # 80001d48 <__stack_top+0x81001d48>
|
||||
80000804: 03212023 sw s2,32(sp)
|
||||
80000808: 02112623 sw ra,44(sp)
|
||||
8000080c: 148a2903 lw s2,328(s4)
|
||||
80000810: 02812423 sw s0,40(sp)
|
||||
80000814: 02912223 sw s1,36(sp)
|
||||
80000818: 01312e23 sw s3,28(sp)
|
||||
8000081c: 01512a23 sw s5,20(sp)
|
||||
80000820: 01612823 sw s6,16(sp)
|
||||
80000824: 01712623 sw s7,12(sp)
|
||||
80000828: 01812423 sw s8,8(sp)
|
||||
8000082c: 04090063 beqz s2,8000086c <__call_exitprocs+0x78>
|
||||
80000830: 00050b13 mv s6,a0
|
||||
80000834: 00058b93 mv s7,a1
|
||||
80000838: 00100a93 li s5,1
|
||||
8000083c: fff00993 li s3,-1
|
||||
80000840: 00492483 lw s1,4(s2)
|
||||
80000844: fff48413 addi s0,s1,-1
|
||||
80000848: 02044263 bltz s0,8000086c <__call_exitprocs+0x78>
|
||||
8000084c: 00249493 slli s1,s1,0x2
|
||||
80000850: 009904b3 add s1,s2,s1
|
||||
80000854: 040b8463 beqz s7,8000089c <__call_exitprocs+0xa8>
|
||||
80000858: 1044a783 lw a5,260(s1)
|
||||
8000085c: 05778063 beq a5,s7,8000089c <__call_exitprocs+0xa8>
|
||||
80000860: fff40413 addi s0,s0,-1
|
||||
80000864: ffc48493 addi s1,s1,-4
|
||||
80000868: ff3416e3 bne s0,s3,80000854 <__call_exitprocs+0x60>
|
||||
8000086c: 02c12083 lw ra,44(sp)
|
||||
80000870: 02812403 lw s0,40(sp)
|
||||
80000874: 02412483 lw s1,36(sp)
|
||||
80000878: 02012903 lw s2,32(sp)
|
||||
8000087c: 01c12983 lw s3,28(sp)
|
||||
80000880: 01812a03 lw s4,24(sp)
|
||||
80000884: 01412a83 lw s5,20(sp)
|
||||
80000888: 01012b03 lw s6,16(sp)
|
||||
8000088c: 00c12b83 lw s7,12(sp)
|
||||
80000890: 00812c03 lw s8,8(sp)
|
||||
80000894: 03010113 addi sp,sp,48
|
||||
80000898: 00008067 ret
|
||||
8000089c: 00492783 lw a5,4(s2)
|
||||
800008a0: 0044a683 lw a3,4(s1)
|
||||
800008a4: fff78793 addi a5,a5,-1
|
||||
800008a8: 04878e63 beq a5,s0,80000904 <__call_exitprocs+0x110>
|
||||
800008ac: 0004a223 sw zero,4(s1)
|
||||
800008b0: fa0688e3 beqz a3,80000860 <__call_exitprocs+0x6c>
|
||||
800008b4: 18892783 lw a5,392(s2)
|
||||
800008b8: 008a9733 sll a4,s5,s0
|
||||
800008bc: 00492c03 lw s8,4(s2)
|
||||
800008c0: 00f777b3 and a5,a4,a5
|
||||
800008c4: 02079263 bnez a5,800008e8 <__call_exitprocs+0xf4>
|
||||
800008c8: 000680e7 jalr a3
|
||||
800008cc: 00492703 lw a4,4(s2)
|
||||
800008d0: 148a2783 lw a5,328(s4)
|
||||
800008d4: 01871463 bne a4,s8,800008dc <__call_exitprocs+0xe8>
|
||||
800008d8: f8f904e3 beq s2,a5,80000860 <__call_exitprocs+0x6c>
|
||||
800008dc: f80788e3 beqz a5,8000086c <__call_exitprocs+0x78>
|
||||
800008e0: 00078913 mv s2,a5
|
||||
800008e4: f5dff06f j 80000840 <__call_exitprocs+0x4c>
|
||||
800008e8: 18c92783 lw a5,396(s2)
|
||||
800008ec: 0844a583 lw a1,132(s1)
|
||||
800008f0: 00f77733 and a4,a4,a5
|
||||
800008f4: 00071c63 bnez a4,8000090c <__call_exitprocs+0x118>
|
||||
800008f8: 000b0513 mv a0,s6
|
||||
800008fc: 000680e7 jalr a3
|
||||
80000900: fcdff06f j 800008cc <__call_exitprocs+0xd8>
|
||||
80000904: 00892223 sw s0,4(s2)
|
||||
80000908: fa9ff06f j 800008b0 <__call_exitprocs+0xbc>
|
||||
8000090c: 00058513 mv a0,a1
|
||||
80000910: 000680e7 jalr a3
|
||||
80000914: fb9ff06f j 800008cc <__call_exitprocs+0xd8>
|
||||
|
||||
Disassembly of section .init_array:
|
||||
|
||||
80001918 <__init_array_start>:
|
||||
80001918: 0068 addi a0,sp,12
|
||||
8000191a: 8000 0x8000
|
||||
|
||||
Disassembly of section .data:
|
||||
|
||||
80001920 <impure_data>:
|
||||
80001920: 0000 unimp
|
||||
80001922: 0000 unimp
|
||||
80001924: 1c0c addi a1,sp,560
|
||||
80001926: 8000 0x8000
|
||||
80001928: 1c74 addi a3,sp,572
|
||||
8000192a: 8000 0x8000
|
||||
8000192c: 1cdc addi a5,sp,628
|
||||
8000192e: 8000 0x8000
|
||||
...
|
||||
800019c8: 0001 nop
|
||||
800019ca: 0000 unimp
|
||||
800019cc: 0000 unimp
|
||||
800019ce: 0000 unimp
|
||||
800019d0: 330e fld ft6,224(sp)
|
||||
800019d2: abcd j 80001fc4 <__BSS_END__+0x1f8>
|
||||
800019d4: 1234 addi a3,sp,296
|
||||
800019d6: e66d bnez a2,80001ac0 <impure_data+0x1a0>
|
||||
800019d8: deec sw a1,124(a3)
|
||||
800019da: 0005 c.nop 1
|
||||
800019dc: 0000000b 0xb
|
||||
...
|
||||
|
||||
Disassembly of section .sdata:
|
||||
|
||||
80001d48 <_global_impure_ptr>:
|
||||
80001d48: 1920 addi s0,sp,184
|
||||
80001d4a: 8000 0x8000
|
||||
|
||||
Disassembly of section .bss:
|
||||
|
||||
80001d4c <g_wspawn_args>:
|
||||
...
|
||||
|
||||
Disassembly of section .comment:
|
||||
|
||||
00000000 <.comment>:
|
||||
0: 3a434347 fmsub.d ft6,ft6,ft4,ft7,rmm
|
||||
4: 2820 fld fs0,80(s0)
|
||||
6: 29554e47 fmsub.s ft8,fa0,fs5,ft5,rmm
|
||||
a: 3920 fld fs0,112(a0)
|
||||
c: 322e fld ft4,232(sp)
|
||||
e: 302e fld ft0,232(sp)
|
||||
...
|
||||
|
||||
Disassembly of section .riscv.attributes:
|
||||
|
||||
00000000 <.riscv.attributes>:
|
||||
0: 2541 jal 680 <__stack_size+0x280>
|
||||
2: 0000 unimp
|
||||
4: 7200 flw fs0,32(a2)
|
||||
6: 7369 lui t1,0xffffa
|
||||
8: 01007663 bgeu zero,a6,14 <__stack_usage+0x14>
|
||||
c: 0000001b 0x1b
|
||||
10: 1004 addi s1,sp,32
|
||||
12: 7205 lui tp,0xfffe1
|
||||
14: 3376 fld ft6,376(sp)
|
||||
16: 6932 flw fs2,12(sp)
|
||||
18: 7032 flw ft0,44(sp)
|
||||
1a: 5f30 lw a2,120(a4)
|
||||
1c: 326d jal fffff9c6 <__stack_top+0xfff9c6>
|
||||
1e: 3070 fld fa2,224(s0)
|
||||
20: 665f 7032 0030 0x307032665f
|
||||
BIN
tests/driver/demo/kernel.elf
Executable file
BIN
tests/driver/demo/kernel.elf
Executable file
Binary file not shown.
202
tests/driver/demo/main.cpp
Normal file
202
tests/driver/demo/main.cpp
Normal file
@@ -0,0 +1,202 @@
|
||||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <vortex.h>
|
||||
#include "common.h"
|
||||
|
||||
// Runs a runtime call and treats any non-zero result as fatal: the failing
// expression and its return code are printed, cleanup() is invoked to release
// the global device resources, and the process exits with status -1.
// A zero result falls through with no side effects.
#define RT_CHECK(_expr)                                        \
  do {                                                         \
    int _rc = _expr;                                           \
    if (0 != _rc) {                                            \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_rc);  \
      cleanup();                                               \
      exit(-1);                                                \
    }                                                          \
  } while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
const char* kernel_file = "kernel.bin";
|
||||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
vx_buffer_h staging_buf = nullptr;
|
||||
|
||||
// Prints the program banner followed by the supported command-line switches
// to standard output, flushing after each line.
static void show_usage() {
  std::cout << "Vortex Driver Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
count = atoi(optarg);
|
||||
break;
|
||||
case 'k':
|
||||
kernel_file = optarg;
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
if (staging_buf) {
|
||||
vx_buf_release(staging_buf);
|
||||
}
|
||||
if (device) {
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int run_test(const kernel_arg_t& kernel_arg,
|
||||
uint32_t buf_size,
|
||||
uint32_t num_points) {
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, -1));
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = i + i;
|
||||
int cur = buf_ptr[i];
|
||||
if (cur != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
size_t value;
|
||||
kernel_arg_t kernel_arg;
|
||||
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
if (count == 0) {
|
||||
count = 1;
|
||||
}
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
unsigned max_cores, max_warps, max_threads;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
|
||||
|
||||
uint32_t num_tasks = max_cores * max_warps * max_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
|
||||
RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
|
||||
kernel_arg.src0_ptr = value;
|
||||
RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
|
||||
kernel_arg.src1_ptr = value;
|
||||
RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
|
||||
kernel_arg.dst_ptr = value;
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_ptr << std::endl;
|
||||
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_ptr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &staging_buf));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i-1;
|
||||
}
|
||||
}
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_ptr, buf_size, 0));
|
||||
|
||||
// upload source buffer1
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i+1;
|
||||
}
|
||||
}
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_ptr, buf_size, 0));
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
}
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
|
||||
|
||||
// run tests
|
||||
std::cout << "run tests" << std::endl;
|
||||
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
101
tests/opencl/BlackScholes/BlackScholes.cl
Normal file
101
tests/opencl/BlackScholes/BlackScholes.cl
Normal file
@@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// Math wrappers used by the pricing code. Change the condition below to 1 to
// route them to the native_* built-ins instead of the standard ones.
#if 0
    #define  EXP(a) native_exp(a)
    #define  LOG(a) native_log(a)
    #define SQRT(a) native_sqrt(a)
#else
    #define  EXP(a) exp(a)
    #define  LOG(a) log(a)
    #define SQRT(a) sqrt(a)
#endif
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Predefine functions to avoid bug in OpenCL compiler on Mac OSX 10.7 systems
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
float CND(float d);
|
||||
void BlackScholesBody(__global float *call, __global float *put, float S,
|
||||
float X, float T, float R, float V);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Rational approximation of cumulative normal distribution function
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
float CND(float d){
    // Coefficients of the rational approximation.
    const float       A1 = 0.31938153f;
    const float       A2 = -0.356563782f;
    const float       A3 = 1.781477937f;
    const float       A4 = -1.821255978f;
    const float       A5 = 1.330274429f;
    const float RSQRT2PI = 0.39894228040143267793994605993438f;

    const float K = 1.0f / (1.0f + 0.2316419f * fabs(d));

    // Degree-5 polynomial in K, evaluated in nested form.
    const float poly = K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))));

    const float tail = RSQRT2PI * EXP(- 0.5f * d * d) * poly;

    // The approximation is computed from |d|; for positive d return the
    // complement.
    return (d > 0) ? 1.0f - tail : tail;
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Black-Scholes formula for both call and put
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Prices one option and stores the call price in *call and the put price in
// *put.
void BlackScholesBody(
    __global float *call, //Call option price
    __global float *put,  //Put option price
    float S,              //Current stock price
    float X,              //Option strike price
    float T,              //Option years
    float R,              //Riskless rate of return
    float V               //Stock volatility
){
    const float sqrtT    = SQRT(T);
    const float volSqrtT = V * sqrtT;

    const float d1 = (LOG(S / X) + (R + 0.5f * V * V) * T) / volSqrtT;
    const float d2 = d1 - volSqrtT;

    const float cndD1 = CND(d1);
    const float cndD2 = CND(d2);

    //Calculate Call and Put simultaneously
    const float expRT      = EXP(- R * T);
    const float discounted = X * expRT;

    *call = (S * cndD1 - discounted * cndD2);
    *put  = (discounted * (1.0f - cndD2) - S * (1.0f - cndD1));
}
|
||||
|
||||
|
||||
|
||||
// Kernel entry point: each work-item starts at its global id and advances by
// the global size, pricing every option index below optN that it lands on.
__kernel void BlackScholes(
    __global float *d_Call, //Call option price
    __global float *d_Put,  //Put option price
    __global float *d_S,    //Current stock price
    __global float *d_X,    //Option strike price
    __global float *d_T,    //Option years
    float R,                //Riskless rate of return
    float V,                //Stock volatility
    unsigned int optN
){
    const unsigned int step = get_global_size(0);
    unsigned int opt = get_global_id(0);

    while(opt < optN){
        BlackScholesBody(&d_Call[opt], &d_Put[opt],
                         d_S[opt], d_X[opt], d_T[opt],
                         R, V);
        opt += step;
    }
}
|
||||
65
tests/opencl/BlackScholes/Makefile
Normal file
65
tests/opencl/BlackScholes/Makefile
Normal file
@@ -0,0 +1,65 @@
|
||||
# Name of the test; every generated artifact is derived from it.
PROJECT = BlackScholes

# Tool, library and runtime locations. Each can be overridden from the
# command line or the environment.
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH         ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH        ?= $(wildcard ../include)
POCL_LIB_PATH        ?= $(wildcard ../lib)
VORTEX_RT_PATH       ?= $(wildcard ../../../runtime)
VX_SIMX_PATH         ?= $(wildcard ../../../simX/obj_dir)

# RISC-V cross tools.
CC  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb

# Vortex runtime sources compiled into the device image.
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c \
          $(VORTEX_RT_PATH)/startup/vx_start.S \
          $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S \
          $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c \
          $(VORTEX_RT_PATH)/fileio/fileio.S \
          $(VORTEX_RT_PATH)/tests/tests.c \
          $(VORTEX_RT_PATH)/vx_api/vx_api.c

# Link with the Vortex linker script and without the default startup files.
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link.ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and exceptions
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

# The kernel archive is linked whole; the OpenCL library follows it.
VX_LIBS   = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
|
||||
# Command targets that do not name files.
.PHONY: all run qemu gdb-s gdb-c clean

# Remove a half-written target (e.g. a truncated .dump) when its recipe fails.
.DELETE_ON_ERROR:

# Default goal: disassembly listing and Intel-hex image of the device program.
all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static archive with poclcc. The recipe
# compiles the declared prerequisite rather than a hard-coded file name.
lib$(PROJECT).a: BlackScholes.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

# Device executable. The header is a prerequisite only; it is not passed to
# the compiler as an input file.
$(PROJECT).elf: main.cpp oclBlackScholes_common.h oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cpp oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp $(VX_LIBS) -o $@

# Executable for running under qemu-riscv32.
$(PROJECT).qemu: main.cpp lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cpp $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Run the hex image on the simX emulator; stdout goes to emulator.debug.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

# Run under qemu, logging translated input assembly to debug.log.
qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

# Same as qemu, with the -g 1234 option (gdb stub) added.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

# Start gdb on the qemu executable.
gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

# Remove everything this Makefile generates.
clean:
	rm -rf *.elf *.dump *.hex *.qemu lib$(PROJECT).a emulator.debug debug.log
|
||||
248
tests/opencl/BlackScholes/main.cpp
Normal file
248
tests/opencl/BlackScholes/main.cpp
Normal file
@@ -0,0 +1,248 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// standard utilities and systems includes
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
#include "oclBlackScholes_common.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Helper functions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Returns the time in seconds between the start and end profiling
// timestamps of `event`.
// Returns 0.0 when either timestamp cannot be queried, instead of computing
// a duration from uninitialised values.
double executionTime(cl_event &event){
    cl_ulong start = 0, end = 0;

    const cl_int endStatus   = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
    const cl_int startStatus = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);

    if (endStatus != CL_SUCCESS || startStatus != CL_SUCCESS)
        return 0.0;

    return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Random float helper
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Draws one value from rand() and maps it linearly onto [low, high]:
// a draw of 0 yields `low`, a draw of RAND_MAX yields `high`.
float randFloat(float low, float high){
    const float fraction = (float)rand() / (float)RAND_MAX;
    const float lowPart  = (1.0f - fraction) * low;
    const float highPart = fraction * high;
    return lowPart + highPart;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Main program
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
cl_platform_id cpPlatform; //OpenCL platform
|
||||
cl_device_id* cdDevices = NULL; //OpenCL devices list (array)
|
||||
cl_context cxGPUContext; //OpenCL context
|
||||
cl_command_queue cqCommandQueue; //OpenCL command que
|
||||
cl_mem //OpenCL memory buffer objects
|
||||
d_Call,
|
||||
d_Put,
|
||||
d_S,
|
||||
d_X,
|
||||
d_T;
|
||||
|
||||
cl_int ciErrNum;
|
||||
|
||||
float
|
||||
*h_CallCPU,
|
||||
*h_PutCPU,
|
||||
*h_CallGPU,
|
||||
*h_PutGPU,
|
||||
*h_S,
|
||||
*h_X,
|
||||
*h_T;
|
||||
|
||||
const unsigned int optionCount = 4000000;
|
||||
const float R = 0.02f;
|
||||
const float V = 0.30f;
|
||||
|
||||
shrQAStart(argc, argv);
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("clGetPlatformID...\n");
|
||||
|
||||
//Get all the devices
|
||||
cl_uint uiNumDevices = 0; // Number of devices available
|
||||
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
|
||||
shrLog("Get the Device info and select Device...\n");
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
|
||||
// Get command line device options and config accordingly
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
|
||||
{
|
||||
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||
}
|
||||
shrLog(" Using Device %u: ", uiTargetDevice);
|
||||
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
||||
|
||||
// set logfile name and start logs
|
||||
shrSetLogFileName ("oclBlackScholes.txt");
|
||||
shrLog("%s Starting...\n\n", argv[0]);
|
||||
|
||||
shrLog("Allocating and initializing host memory...\n");
|
||||
h_CallCPU = (float *)malloc(optionCount * sizeof(float));
|
||||
h_PutCPU = (float *)malloc(optionCount * sizeof(float));
|
||||
h_CallGPU = (float *)malloc(optionCount * sizeof(float));
|
||||
h_PutGPU = (float *)malloc(optionCount * sizeof(float));
|
||||
h_S = (float *)malloc(optionCount * sizeof(float));
|
||||
h_X = (float *)malloc(optionCount * sizeof(float));
|
||||
h_T = (float *)malloc(optionCount * sizeof(float));
|
||||
|
||||
srand(2009);
|
||||
for(unsigned int i = 0; i < optionCount; i++){
|
||||
h_CallCPU[i] = -1.0f;
|
||||
h_PutCPU[i] = -1.0f;
|
||||
h_S[i] = randFloat(5.0f, 30.0f);
|
||||
h_X[i] = randFloat(1.0f, 100.0f);
|
||||
h_T[i] = randFloat(0.25f, 10.0f);
|
||||
}
|
||||
|
||||
shrLog("Initializing OpenCL...\n");
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Get a GPU device
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Create the context
|
||||
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Create a command-queue
|
||||
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("Creating OpenCL memory objects...\n");
|
||||
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("Starting up BlackScholes...\n");
|
||||
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
|
||||
|
||||
shrLog("Running OpenCL BlackScholes...\n\n");
|
||||
//Just a single run or a warmup iteration
|
||||
BlackScholes(
|
||||
NULL,
|
||||
d_Call,
|
||||
d_Put,
|
||||
d_S,
|
||||
d_X,
|
||||
d_T,
|
||||
R,
|
||||
V,
|
||||
optionCount
|
||||
);
|
||||
|
||||
#ifdef GPU_PROFILING
|
||||
const int numIterations = 16;
|
||||
cl_event startMark, endMark;
|
||||
ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
|
||||
ciErrNum |= clFinish(cqCommandQueue);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
shrDeltaT(0);
|
||||
|
||||
for(int i = 0; i < numIterations; i++){
|
||||
BlackScholes(
|
||||
cqCommandQueue,
|
||||
d_Call,
|
||||
d_Put,
|
||||
d_S,
|
||||
d_X,
|
||||
d_T,
|
||||
R,
|
||||
V,
|
||||
optionCount
|
||||
);
|
||||
}
|
||||
|
||||
ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark);
|
||||
ciErrNum |= clFinish(cqCommandQueue);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Calculate performance metrics by wallclock time
|
||||
double gpuTime = shrDeltaT(0) / numIterations;
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclBlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %i, Workgroup = %u\n",
|
||||
(double)(2.0 * optionCount * 1.0e-9)/gpuTime, gpuTime, (2 * optionCount), 1, 0);
|
||||
|
||||
//Get profiling info
|
||||
cl_ulong startTime = 0, endTime = 0;
|
||||
ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
|
||||
ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
|
||||
#endif
|
||||
|
||||
shrLog("\nReading back OpenCL BlackScholes results...\n");
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("Comparing against Host/C++ computation...\n");
|
||||
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
|
||||
double deltaCall = 0, deltaPut = 0, sumCall = 0, sumPut = 0;
|
||||
double L1call, L1put;
|
||||
for(unsigned int i = 0; i < optionCount; i++)
|
||||
{
|
||||
sumCall += fabs(h_CallCPU[i]);
|
||||
sumPut += fabs(h_PutCPU[i]);
|
||||
deltaCall += fabs(h_CallCPU[i] - h_CallGPU[i]);
|
||||
deltaPut += fabs(h_PutCPU[i] - h_PutGPU[i]);
|
||||
}
|
||||
L1call = deltaCall / sumCall;
|
||||
L1put = deltaPut / sumPut;
|
||||
shrLog("Relative L1 (call, put) = (%.3e, %.3e)\n\n", L1call, L1put);
|
||||
|
||||
shrLog("Shutting down...\n");
|
||||
closeBlackScholes();
|
||||
ciErrNum = clReleaseMemObject(d_T);
|
||||
ciErrNum |= clReleaseMemObject(d_X);
|
||||
ciErrNum |= clReleaseMemObject(d_S);
|
||||
ciErrNum |= clReleaseMemObject(d_Put);
|
||||
ciErrNum |= clReleaseMemObject(d_Call);
|
||||
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
|
||||
ciErrNum |= clReleaseContext(cxGPUContext);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
free(h_T);
|
||||
free(h_X);
|
||||
free(h_S);
|
||||
free(h_PutGPU);
|
||||
free(h_CallGPU);
|
||||
free(h_PutCPU);
|
||||
free(h_CallCPU);
|
||||
|
||||
if(cdDevices)free(cdDevices);
|
||||
|
||||
shrQAFinishExit(argc, (const char **)argv, ((L1call < 1E-6) && (L1put < 1E-6)) ? QA_PASSED : QA_FAILED );
|
||||
}
|
||||
BIN
tests/opencl/BlackScholes/oclBlackScholes.pdf
Normal file
BIN
tests/opencl/BlackScholes/oclBlackScholes.pdf
Normal file
Binary file not shown.
50
tests/opencl/BlackScholes/oclBlackScholes_common.h
Normal file
50
tests/opencl/BlackScholes/oclBlackScholes_common.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <oclUtils.h>
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Process an array of optN options on CPU
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void BlackScholesCPU(
|
||||
float *h_Call, //Call option price
|
||||
float *h_Put, //Put option price
|
||||
float *h_S, //Current stock price
|
||||
float *h_X, //Option strike price
|
||||
float *h_T, //Option years
|
||||
float R, //Riskless rate of return
|
||||
float V, //Stock volatility
|
||||
unsigned int optionCount
|
||||
);
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// OpenCL Black-Scholes kernel launcher
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv);
|
||||
|
||||
extern "C" void closeBlackScholes(void);
|
||||
|
||||
extern "C" void BlackScholes(
|
||||
cl_command_queue cqCommandQueue,
|
||||
cl_mem d_Call, //Call option price
|
||||
cl_mem d_Put, //Put option price
|
||||
cl_mem d_S, //Current stock price
|
||||
cl_mem d_X, //Option strike price
|
||||
cl_mem d_T, //Option years
|
||||
cl_float R, //Riskless rate of return
|
||||
cl_float V, //Stock volatility
|
||||
cl_uint optionCount
|
||||
);
|
||||
92
tests/opencl/BlackScholes/oclBlackScholes_gold.cpp
Normal file
92
tests/opencl/BlackScholes/oclBlackScholes_gold.cpp
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <math.h>
|
||||
#include "oclBlackScholes_common.h"
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Rational approximation of cumulative normal distribution function
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Approximates the cumulative normal distribution at d.
// A fifth-order polynomial in k = 1 / (1 + 0.2316419 * |d|) is scaled by the
// Gaussian density; the result is mirrored (1 - value) when d is positive.
static double CND(double d){
    // Polynomial coefficients, lowest order first.
    const double coeff[5] = {
        0.31938153,
        -0.356563782,
        1.781477937,
        -1.821255978,
        1.330274429
    };
    const double invSqrt2Pi = 0.39894228040143267793994605993438;

    const double k = 1.0 / (1.0 + 0.2316419 * fabs(d));

    // Horner evaluation from the highest coefficient down, then one final
    // multiplication by k (the polynomial has no constant term).
    double poly = coeff[4];
    for (int idx = 3; idx >= 0; --idx)
        poly = coeff[idx] + k * poly;
    poly = k * poly;

    const double tail = invSqrt2Pi * exp(- 0.5 * d * d) * poly;

    return (d > 0) ? (1.0 - tail) : tail;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Black-Scholes formula for both call and put
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
static void BlackScholesBodyCPU(
|
||||
float& call, //Call option price
|
||||
float& put, //Put option price
|
||||
float Sf, //Current stock price
|
||||
float Xf, //Option strike price
|
||||
float Tf, //Option years
|
||||
float Rf, //Riskless rate of return
|
||||
float Vf //Stock volatility
|
||||
){
|
||||
double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf;
|
||||
|
||||
double sqrtT = sqrt(T);
|
||||
double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
|
||||
double d2 = d1 - V * sqrtT;
|
||||
double CNDD1 = CND(d1);
|
||||
double CNDD2 = CND(d2);
|
||||
|
||||
//Calculate Call and Put simultaneously
|
||||
double expRT = exp(- R * T);
|
||||
call = (float)(S * CNDD1 - X * expRT * CNDD2);
|
||||
put = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1));
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Process an array of optN options
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void BlackScholesCPU(
|
||||
float *h_Call, //Call option price
|
||||
float *h_Put, //Put option price
|
||||
float *h_S, //Current stock price
|
||||
float *h_X, //Option strike price
|
||||
float *h_T, //Option years
|
||||
float R, //Riskless rate of return
|
||||
float V, //Stock volatility
|
||||
unsigned int optionCount
|
||||
){
|
||||
for(unsigned int i = 0; i < optionCount; i++)
|
||||
BlackScholesBodyCPU(
|
||||
h_Call[i],
|
||||
h_Put[i],
|
||||
h_S[i],
|
||||
h_X[i],
|
||||
h_T[i],
|
||||
R,
|
||||
V
|
||||
);
|
||||
}
|
||||
125
tests/opencl/BlackScholes/oclBlackScholes_launcher.cpp
Normal file
125
tests/opencl/BlackScholes/oclBlackScholes_launcher.cpp
Normal file
@@ -0,0 +1,125 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <oclUtils.h>
|
||||
#include "oclBlackScholes_common.h"
|
||||
|
||||
static cl_program cpBlackScholes; //OpenCL program
|
||||
static cl_kernel ckBlackScholes; //OpenCL kernel
|
||||
static cl_command_queue cqDefaultCommandQueue;
|
||||
|
||||
// Creates and builds the BlackScholes program for the given context, creates
// the "BlackScholes" kernel, and remembers cqParamCommandQueue as the queue
// used when BlackScholes() is called with a NULL queue.
//
// cxGPUContext         context the program is created in
// cqParamCommandQueue  queue stored as the default command queue
// argv                 argv[0] is used to locate BlackScholes.cl
//
// Any failure is reported through shrCheckError; a build failure logs the
// build log of the first context device and calls exit(666).
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum = CL_SUCCESS;
    size_t kernelLength;

    shrLog("...loading BlackScholes.cl\n");
    char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
    shrCheckError(cPathAndName != NULL, shrTRUE);
    char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
    shrCheckError(cBlackScholes != NULL, shrTRUE);

    shrLog("...creating BlackScholes program\n");
    // The program is created from the device's built-in "BlackScholes" kernel;
    // the source loaded above is not handed to the OpenCL runtime.
    // Use the context passed in and its first device, and request the error
    // code so the check below tests a value the call actually produced.
    cl_device_id cdFirstDevice = oclGetFirstDev(cxGPUContext);
    cpBlackScholes = clCreateProgramWithBuiltInKernels(cxGPUContext, 1, &cdFirstDevice, "BlackScholes", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...building BlackScholes program\n");
    ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);

    // clBuildProgram returns an error code, so compare against CL_SUCCESS.
    if(ciErrNum != CL_SUCCESS){
        shrLog("*** Compilation failure ***\n");

        // Query the byte size of the device list, then the list itself.
        size_t deviceNum;
        cl_device_id *cdDevices;
        ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &deviceNum);
        shrCheckError(ciErrNum, CL_SUCCESS);

        cdDevices = (cl_device_id *)malloc(deviceNum * sizeof(cl_device_id));
        shrCheckError(cdDevices != NULL, shrTRUE);

        ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, deviceNum * sizeof(cl_device_id), cdDevices, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Same two-step pattern for the build log of the first device.
        size_t logSize;
        char *logTxt;

        ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        shrCheckError(ciErrNum, CL_SUCCESS);

        logTxt = (char *)malloc(logSize);
        shrCheckError(logTxt != NULL, shrTRUE);

        ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, logSize, logTxt, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        shrLog("%s\n", logTxt);
        shrLog("*** Exiting ***\n");
        free(logTxt);
        free(cdDevices);
        exit(666);
    }

    //Save ptx code to separate file
    oclLogPtx(cpBlackScholes, cdFirstDevice, "BlackScholes.ptx");

    shrLog("...creating BlackScholes kernels\n");
    ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cBlackScholes);
    free(cPathAndName);
}
|
||||
|
||||
extern "C" void closeBlackScholes(void){
|
||||
cl_int ciErrNum;
|
||||
ciErrNum = clReleaseKernel(ckBlackScholes);
|
||||
ciErrNum |= clReleaseProgram(cpBlackScholes);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// OpenCL Black-Scholes kernel launcher
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Binds the eight kernel arguments and enqueues one 1-D launch of the
// BlackScholes kernel. A NULL cqCommandQueue selects the queue stored by
// initBlackScholes().
extern "C" void BlackScholes(
    cl_command_queue cqCommandQueue,
    cl_mem d_Call, //Call option price
    cl_mem d_Put,  //Put option price
    cl_mem d_S,    //Current stock price
    cl_mem d_X,    //Option strike price
    cl_mem d_T,    //Option years
    cl_float R,    //Riskless rate of return
    cl_float V,    //Stock volatility
    cl_uint optionCount
){
    cl_command_queue queue = cqCommandQueue ? cqCommandQueue : cqDefaultCommandQueue;

    // Kernel arguments in positional order: size and address of each value.
    const struct {
        size_t      size;
        const void *value;
    } kernelArgs[] = {
        { sizeof(cl_mem),   &d_Call      },
        { sizeof(cl_mem),   &d_Put       },
        { sizeof(cl_mem),   &d_S         },
        { sizeof(cl_mem),   &d_X         },
        { sizeof(cl_mem),   &d_T         },
        { sizeof(cl_float), &R           },
        { sizeof(cl_float), &V           },
        { sizeof(cl_uint),  &optionCount }
    };
    const cl_uint argCount = (cl_uint)(sizeof(kernelArgs) / sizeof(kernelArgs[0]));

    // Every argument is set even if an earlier call failed; the codes are
    // OR-ed together and checked once afterwards.
    cl_int status = CL_SUCCESS;
    for (cl_uint argIdx = 0; argIdx < argCount; ++argIdx)
        status |= clSetKernelArg(ckBlackScholes, argIdx, kernelArgs[argIdx].size, kernelArgs[argIdx].value);
    shrCheckError(status, CL_SUCCESS);

    //Run the kernel
    const size_t globalWorkSize = 60 * 1024;
    const size_t localWorkSize = 128;
    status = clEnqueueNDRangeKernel(queue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    shrCheckError(status, CL_SUCCESS);
}
|
||||
198
tests/opencl/BlackScholes/oclUtils.h
Normal file
198
tests/opencl/BlackScholes/oclUtils.h
Normal file
@@ -0,0 +1,198 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef OCL_UTILS_H
|
||||
#define OCL_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// Common headers: Cross-API utilities and OpenCL header
|
||||
#include <shrUtils.h>
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// Includes
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||
// extensions from <CL/cl_ext.h>
|
||||
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
#endif
|
||||
|
||||
// reminders for build output window and log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including shrUtils.h")
|
||||
#pragma message ("Note: including opencl.h")
|
||||
#endif
|
||||
|
||||
// SDK Revision #
|
||||
#define OCL_SDKREVISION "7027912"
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" int oclGetDevCap(cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Loads a Program file and prepends the cPreamble to the code.
|
||||
//!
|
||||
//! @return the source string if succeeded, 0 otherwise
|
||||
//! @param cFilename program filename
|
||||
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||
//! @param szFinalLength returned length of the code string
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param const char* cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclErrorString(cl_int error);
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
// Companion function for the oclCheckErrorEX macro.
// Does nothing when iSample equals iReference. Otherwise logs the error with
// its file and line, then either calls pCleanup with the error code or, when
// no callback is given, closes the log and exits with that code.
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Matching values mean success: nothing to report.
    if (iSample == iReference)
    {
        return;
    }

    // A mismatch is an error by definition, so a zero sample value is
    // replaced by -9999 to keep the reported code non-zero.
    const cl_int errCode = (iSample != 0) ? iSample : -9999;

    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", errCode, oclErrorString(errCode), iLine, cFile);

    if (pCleanup == NULL)
    {
        // No cleanup callback: close the log and use the error code as the process exit code.
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(errCode);
    }

    pCleanup(errCode);
}
|
||||
|
||||
#endif
|
||||
238
tests/opencl/BlackScholes/shrQATest.h
Normal file
238
tests/opencl/BlackScholes/shrQATest.h
Normal file
@@ -0,0 +1,238 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_QATEST_H
|
||||
#define SHR_QATEST_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#pragma message ("Note: including time.h")
|
||||
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifndef STRCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRCASECMP _stricmp
|
||||
#else
|
||||
#define STRCASECMP strcasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef STRNCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRNCASECMP _strnicmp
|
||||
#else
|
||||
#define STRNCASECMP strncasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Standardized QA Start/Finish for CUDA SDK tests
|
||||
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||
|
||||
// Returns the index of the first character after the last '/' or '\\' in
// exec_name, or 0 when the string contains no path separator.
inline int findExeNameStart(const char *exec_name)
{
    // Walk backwards from the terminating NUL until a separator is found.
    for (int pos = (int)strlen(exec_name); pos >= 0; --pos)
    {
        const char ch = exec_name[pos];
        if (ch == '/' || ch == '\\')
        {
            return pos + 1;
        }
    }
    return 0;
}
|
||||
|
||||
inline int __shrQAStart(int argc, char **argv)
|
||||
{
|
||||
bool bQATest = false;
|
||||
// First clear the output buffer
|
||||
fflush(stdout);
|
||||
fflush(stdout);
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
char *string_argv = &argv[i][string_start];
|
||||
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't want to print the entire path, so we search for the first
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
return exename_start;
|
||||
}
|
||||
|
||||
// Test outcome codes. The numeric values index the status-name tables used by
// the QA finish helpers and decide the process exit status in shrQAFinishExit.
enum eQAstatus
{
    QA_FAILED = 0,  // test failed
    QA_PASSED = 1,  // test passed
    QA_WAIVED = 2   // test waived
};
|
||||
|
||||
// Prints a countdown and blocks for roughly `seconds` seconds, sleeping one
// second per step. Returns immediately after the messages when seconds <= 0.
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    time_t t;
    int count;
    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
        fprintf(stdout, "%d...", count);
        // Flush so each countdown step is visible while waiting.
        fflush(stdout);
        // Test _WIN32 (compiler-defined), matching the include guards at the
        // top of this header that decide whether windows.h or unistd.h is used.
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
|
||||
|
||||
|
||||
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bNoPrompt = true;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bNoPrompt = false;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
} else {
|
||||
if (!bNoPrompt) {
|
||||
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||
fflush(stdout);
|
||||
getchar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
bool bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
}
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish(argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#endif
|
||||
642
tests/opencl/BlackScholes/shrUtils.h
Normal file
642
tests/opencl/BlackScholes/shrUtils.h
Normal file
@@ -0,0 +1,642 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_UTILS_H
|
||||
#define SHR_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// reminders for output window and build log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#endif
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
// Other headers needed for both Windows and Linux
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Un-comment the following #define to enable profiling code in SDK apps
|
||||
//#define GPU_PROFILING
|
||||
|
||||
// Beginning of GPU Architecture definitions
|
||||
// Map a GPU SM version (major.minor) to the number of cores per SM.
//
// @param major  SM major version
// @param minor  SM minor version
// @return cores per SM for a known SM version; for an unknown version a
//         diagnostic is printed to stdout and -1 is returned.
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One row per known SM version. SM is encoded as 0xMm (hexadecimal
    // notation): M = SM major version, m = SM minor version.
    typedef struct {
        int SM;
        int Cores;
    } sSMtoCores;

    // Read-only lookup table, built once instead of on every call.
    // The { -1, -1 } row terminates the table.
    static const sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x10,   8 }, // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
        {   -1,  -1 }
    };

    // Encode the requested version the same way the table keys are encoded.
    const int smVersion = (major << 4) + minor;

    for (const sSMtoCores *entry = nGpuArchCoresPerSM; entry->SM != -1; ++entry) {
        if (entry->SM == smVersion) {
            return entry->Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||
// end of GPU Architecture definitions
|
||||
|
||||
|
||||
// Defines and enum for use with logging functions
|
||||
// *********************************************************************
|
||||
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||
// Bit flags for the iLogMode argument of shrLogEx(); combine them with '|'.
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // bit to signal "log to console"
    LOGFILE    = 1 << 1,                // bit to signal "log to file"
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // convenience union of first 2 bits to signal "log to both"
    APPENDMODE = 1 << 2,                // bit to set "file append" mode instead of "replace mode" on open
    MASTER     = 1 << 3,                // bit to signal master .csv log output
    ERRORMSG   = 1 << 4,                // bit to signal "pre-pend Error"
    CLOSELOG   = 1 << 5                 // bit to close log file, if open, after any requested file write
};
|
||||
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||
|
||||
// Standardized boolean used as the return type of the shr* helper functions
enum shrBOOL
{
    shrFALSE,   // == 0
    shrTRUE     // == 1
};
|
||||
|
||||
// Standardized MAX, MIN and CLAMP
// Every argument use is parenthesized so that expression arguments with
// low-precedence operators (e.g. "x & mask" or "c ? p : q") keep their meaning
// after expansion.
// NOTE: arguments may still be evaluated more than once, so avoid passing
// expressions with side effects (e.g. i++).
#define MAX(a, b)       (((a) > (b)) ? (a) : (b))
#define MIN(a, b)       (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c)  MIN(MAX(a, b), c)           // double sided clip of input a
#define TOPCLAMP(a, b)  (((a) < (b)) ? (a) : (b))   // single top side clip of input a
|
||||
|
||||
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro: forwards the input value, the reference value and
// a Cleanup() callback (if supplied) to __shrCheckErrorEX, together with the
// call site's __FILE__ and __LINE__.
// (Companion Inline Function lower on page)
#define shrCheckErrorEX(input, reference, cleanup) \
    __shrCheckErrorEX(input, reference, cleanup, __FILE__ , __LINE__)

// Short version without Cleanup() callback pointer (0 is passed in its place).
// Both Input and Reference are specified as args.
#define shrCheckError(input, reference) shrCheckErrorEX(input, reference, 0)

// Standardized Exit Macro for leaving main()... extended version.
// All three arguments are handed unchanged to __shrExitEX.
// (Companion Inline Function lower on page)
#define shrExitEX(x, y, z) __shrExitEX(x, y, z)

// Standardized Exit Macro for leaving main()... short version.
// Same as shrExitEX with EXIT_SUCCESS supplied as the third argument.
// (Companion Inline Function lower on page)
#define shrEXIT(x, y) __shrExitEX(x, y, EXIT_SUCCESS)
|
||||
|
||||
// Simple argument checker macro: returns shrFALSE from the enclosing function
// unless (a) compares equal to shrTRUE.
// The body is wrapped in do { } while (0) so the expansion is one complete
// statement; a bare "if" here would capture an "else" that follows the macro
// at the call site. Use it with a trailing semicolon: ARGCHECK(expr);
#define ARGCHECK(a) do { if ((a) != shrTRUE) return shrFALSE; } while (0)
|
||||
|
||||
// Define for user-customized error handling: expands to a printf-style format
// string followed by its __FILE__ and __LINE__ arguments
|
||||
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||
|
||||
// Function to deallocate memory allocated within shrUtils
|
||||
// *********************************************************************
|
||||
extern "C" void shrFree(void* ptr);
|
||||
|
||||
// *********************************************************************
|
||||
// Helper function to log standardized information to Console, to File or to both
|
||||
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
//!
|
||||
//! Automatically opens file and stores handle if needed and not done yet
|
||||
//! Closes file and nulls handle on request
|
||||
//!
|
||||
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||
//! @param 2 dValue:
|
||||
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||
//! Negative val is an error code and this give error preformatting.
|
||||
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
|
||||
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||
// *********************************************************************
|
||||
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||
|
||||
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
|
||||
// *********************************************************************
|
||||
extern "C" int shrLog(const char* cFormatString, ...);
|
||||
|
||||
// *********************************************************************
|
||||
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||
// Maintains state for 3 independent counters
|
||||
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||
//!
|
||||
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||
// *********************************************************************
|
||||
extern "C" double shrDeltaT(int iCounterID);
|
||||
|
||||
// Optional LogFileNameOverride function
|
||||
// *********************************************************************
|
||||
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||
|
||||
// Helper function to init data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||
|
||||
// Helper function to print data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param OutData handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//!
|
||||
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag-name is given
|
||||
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||
//! otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type unsigned int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||
const char* arg_name, unsigned int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list whose elements are strings
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with a tolerance for # of byte errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||
//! threshold for # pixel errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||
|
||||
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
if (iReference != iSample)
|
||||
{
|
||||
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Standardized Exit
|
||||
// *********************************************************************
|
||||
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||
{
|
||||
#ifdef WIN32
|
||||
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#else
|
||||
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#endif
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||
getchar();
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||
}
|
||||
fflush(stderr);
|
||||
exit(iExitCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
29
tests/opencl/DotProduct/DotProduct.cl
Normal file
29
tests/opencl/DotProduct/DotProduct.cl
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
__kernel void DotProduct (__global float* a, __global float* b, __global float* c, int iNumElements)
{
    // Each work-item produces exactly one output element; this is its index.
    const int iGID = get_global_id(0);

    // Work-items at or beyond iNumElements have no output element and do nothing.
    if (iGID < iNumElements)
    {
        // Every output consumes four consecutive floats from each input array.
        const int iInOffset = iGID << 2;
        __global const float* pA = a + iInOffset;
        __global const float* pB = b + iInOffset;

        // 4-component dot product, summed left to right.
        c[iGID] = pA[0] * pB[0]
                + pA[1] * pB[1]
                + pA[2] * pB[2]
                + pA[3] * pB[3];
    }
}
|
||||
65
tests/opencl/DotProduct/Makefile
Normal file
65
tests/opencl/DotProduct/Makefile
Normal file
@@ -0,0 +1,65 @@
|
||||
# Builds the DotProduct OpenCL test.
#
# poclcc compiles the kernel into a static library, which is then linked with
# the host program (main.cc) and the runtime sources into a RISC-V ELF. From
# that ELF an Intel-HEX image (for the simX simulator) and a disassembly dump
# are produced. A separate .qemu binary is linked against the qemu vx_api stub.

PROJECT = DotProduct

# Tool and library locations; override on the command line or via environment.
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)

CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb

# Runtime sources compiled directly into the target ELF.
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link.ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and exceptions
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

# --whole-archive forces every object of the kernel library into the link.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

# Remove a target whose recipe failed (e.g. a half-written .dump from the
# shell redirection) so it is not mistaken for an up-to-date file.
.DELETE_ON_ERROR:

.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# Compile the kernel source named in the prerequisite list ($<).
lib$(PROJECT).a: DotProduct.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $@

$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Run the HEX image on the simX simulator; stdout goes to emulator.debug.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

# gdb-s starts qemu with its gdb stub on port 1234; gdb-c starts gdb on the same binary.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.elf *.dump *.hex $(PROJECT).qemu
|
||||
270
tests/opencl/DotProduct/main.cc
Normal file
270
tests/opencl/DotProduct/main.cc
Normal file
@@ -0,0 +1,270 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// *********************************************************************
|
||||
// oclDotProduct Notes:
|
||||
//
|
||||
// A simple OpenCL API demo application that implements a
|
||||
// vector dot product computation between 2 float arrays.
|
||||
//
|
||||
// Runs computations with OpenCL on the GPU device and then checks results
|
||||
// against basic host CPU/C++ computation.
|
||||
//
|
||||
// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness.
|
||||
// But these are NOT required libs for OpenCL development in general.
|
||||
// *********************************************************************
|
||||
|
||||
// standard utilities and systems includes
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
|
||||
// Name of the file with the source code for the computation kernel
|
||||
// *********************************************************************
|
||||
const char* cSourceFile = "DotProduct.cl";
|
||||
|
||||
// Host buffers for demo
|
||||
// *********************************************************************
|
||||
void *srcA, *srcB, *dst; // Host buffers for OpenCL test
|
||||
void* Golden; // Host buffer for host golden processing cross check
|
||||
|
||||
// OpenCL Vars
|
||||
cl_platform_id cpPlatform; // OpenCL platform
|
||||
cl_device_id *cdDevices; // OpenCL device
|
||||
cl_context cxGPUContext; // OpenCL context
|
||||
cl_command_queue cqCommandQueue;// OpenCL command que
|
||||
cl_program program; // OpenCL program
|
||||
cl_kernel ckKernel; // OpenCL kernel
|
||||
cl_mem cmDevSrcA; // OpenCL device source buffer A
|
||||
cl_mem cmDevSrcB; // OpenCL device source buffer B
|
||||
cl_mem cmDevDst; // OpenCL device destination buffer
|
||||
size_t szGlobalWorkSize; // Total # of work items in the 1D range
|
||||
size_t szLocalWorkSize; // # of work items in the 1D work group
|
||||
size_t szParmDataBytes; // Byte size of context information
|
||||
size_t szKernelLength; // Byte size of kernel code
|
||||
cl_int ciErrNum; // Error code var
|
||||
char* cPathAndName = NULL; // var for full paths to data, src, etc.
|
||||
char* cSourceCL = NULL; // Buffer to hold source for compilation
|
||||
const char* cExecutableName = NULL;
|
||||
|
||||
// demo config vars
|
||||
int iNumElements= 1277944; // Length of float arrays to process (odd # for illustration)
|
||||
shrBOOL bNoPrompt = shrFALSE;
|
||||
|
||||
// Forward Declarations
|
||||
// *********************************************************************
|
||||
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements);
|
||||
void Cleanup (int iExitCode);
|
||||
void (*pCleanup)(int) = &Cleanup;
|
||||
|
||||
int *gp_argc = NULL;
|
||||
char ***gp_argv = NULL;
|
||||
|
||||
// Main function
|
||||
// *********************************************************************
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
gp_argc = &argc;
|
||||
gp_argv = &argv;
|
||||
|
||||
shrQAStart(argc, argv);
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("clGetPlatformID...\n");
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("clGetPlatformID...\n");
|
||||
|
||||
//Get all the devices
|
||||
cl_uint uiNumDevices = 0; // Number of devices available
|
||||
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
|
||||
shrLog("Get the Device info and select Device...\n");
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
|
||||
// Get command line device options and config accordingly
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
|
||||
{
|
||||
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||
}
|
||||
shrLog(" Using Device %u: ", uiTargetDevice);
|
||||
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
||||
|
||||
// get command line arg for quick test, if provided
|
||||
bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
|
||||
|
||||
// start logs
|
||||
cExecutableName = argv[0];
|
||||
shrSetLogFileName ("oclDotProduct.txt");
|
||||
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
|
||||
|
||||
// set and log Global and Local work size dimensions
|
||||
szLocalWorkSize = 256;
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
|
||||
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
|
||||
|
||||
// Allocate and initialize host arrays
|
||||
shrLog( "Allocate and Init Host Mem...\n");
|
||||
srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
|
||||
srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
|
||||
dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
|
||||
Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
|
||||
shrFillArray((float*)srcA, 4 * iNumElements);
|
||||
shrFillArray((float*)srcB, 4 * iNumElements);
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Get a GPU device
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Create the context
|
||||
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Create a command-queue
|
||||
shrLog("clCreateCommandQueue...\n");
|
||||
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
|
||||
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
|
||||
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Read the OpenCL kernel in from source file
|
||||
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
|
||||
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
|
||||
//oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
|
||||
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
|
||||
//oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
|
||||
|
||||
// Create the program
|
||||
shrLog("clCreateProgramWithSource...\n");
|
||||
//program = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
|
||||
cl_program program =
|
||||
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL);
|
||||
// Build the program with 'mad' Optimization option
|
||||
#ifdef MAC
|
||||
char* flags = "-cl-fast-relaxed-math -DMAC";
|
||||
#else
|
||||
char* flags = "-cl-fast-relaxed-math";
|
||||
#endif
|
||||
shrLog("clBuildProgram...\n");
|
||||
ciErrNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
// write out standard error, Build Log and PTX, then cleanup and exit
|
||||
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
|
||||
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
|
||||
Cleanup(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Create the kernel
|
||||
shrLog("clCreateKernel (DotProduct)...\n");
|
||||
ckKernel = clCreateKernel(program, "DotProduct", &ciErrNum);
|
||||
|
||||
// Set the Argument values
|
||||
shrLog("clSetKernelArg 0 - 3...\n\n");
|
||||
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
|
||||
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Core sequence... copy input data to GPU, compute, copy results back
|
||||
|
||||
// Asynchronous write of data to GPU device
|
||||
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Launch kernel
|
||||
shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Read back results and check accumulated errors
|
||||
shrLog("clEnqueueReadBuffer (Dst)...\n\n");
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Compute and compare results for golden-host and report errors and pass/fail
|
||||
shrLog("Comparing against Host/C++ computation...\n\n");
|
||||
DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
|
||||
shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
|
||||
|
||||
// Cleanup and leave
|
||||
Cleanup (EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
// "Golden" Host processing dot product function for comparison purposes
|
||||
// *********************************************************************
|
||||
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements)
{
    // Output element n is the dot product of the four consecutive floats
    // starting at index 4*n in each input array.
    for (int iOut = 0; iOut < iNumElements; ++iOut)
    {
        const int iFirst = iOut * 4;

        pfResult[iOut] = 0.0f;
        for (int iLane = 0; iLane < 4; ++iLane)
        {
            pfResult[iOut] += pfData1[iFirst + iLane] * pfData2[iFirst + iLane];
        }
    }
}
|
||||
|
||||
// Cleanup and exit code
|
||||
// *********************************************************************
|
||||
void Cleanup(int iExitCode)
|
||||
{
|
||||
// Cleanup allocated objects
|
||||
shrLog("Starting Cleanup...\n\n");
|
||||
if(cPathAndName)free(cPathAndName);
|
||||
if(cSourceCL)free(cSourceCL);
|
||||
if(ckKernel)clReleaseKernel(ckKernel);
|
||||
if(program)clReleaseProgram(program);
|
||||
if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
|
||||
if(cxGPUContext)clReleaseContext(cxGPUContext);
|
||||
if (cmDevSrcA)clReleaseMemObject(cmDevSrcA);
|
||||
if (cmDevSrcB)clReleaseMemObject(cmDevSrcB);
|
||||
if (cmDevDst)clReleaseMemObject(cmDevDst);
|
||||
|
||||
// Free host memory
|
||||
free(srcA);
|
||||
free(srcB);
|
||||
free (dst);
|
||||
free(Golden);
|
||||
|
||||
if (cdDevices) free(cdDevices);
|
||||
|
||||
shrQAFinishExit(*gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED);
|
||||
}
|
||||
198
tests/opencl/DotProduct/oclUtils.h
Normal file
198
tests/opencl/DotProduct/oclUtils.h
Normal file
@@ -0,0 +1,198 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef OCL_UTILS_H
|
||||
#define OCL_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// Common headers: Cross-API utilities and OpenCL header
|
||||
#include <shrUtils.h>
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// Includes
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||
// extensions from <CL/cl_ext.h>
|
||||
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
#endif
|
||||
|
||||
// reminders for build output window and log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including shrUtils.h")
|
||||
#pragma message ("Note: including opencl.h")
|
||||
#endif
|
||||
|
||||
// SDK Revision #
|
||||
#define OCL_SDKREVISION "7027912"
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" int oclGetDevCap(cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Loads a Program file and prepends the cPreamble to the code.
|
||||
//!
|
||||
//! @return the source string if succeeded, 0 otherwise
|
||||
//! @param cFilename program filename
|
||||
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||
//! @param szFinalLength returned length of the code string
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param const char* cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclErrorString(cl_int error);
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // The check passes when the tested value equals the reference value.
    if (iReference == iSample)
    {
        return;
    }

    // A mismatch is an error by definition, so a 0 sample value is replaced
    // with a non-zero code before it is logged and used as the exit status.
    if (iSample == 0)
    {
        iSample = -9999;
    }

    // Log the error code, its text and the location of the failed check.
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // With no cleanup callback, close the log and exit with the error code.
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }

    // Otherwise the callback receives the error code.
    pCleanup(iSample);
}
|
||||
|
||||
#endif
|
||||
238
tests/opencl/DotProduct/shrQATest.h
Normal file
238
tests/opencl/DotProduct/shrQATest.h
Normal file
@@ -0,0 +1,238 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_QATEST_H
|
||||
#define SHR_QATEST_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#pragma message ("Note: including time.h")
|
||||
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifndef STRCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRCASECMP _stricmp
|
||||
#else
|
||||
#define STRCASECMP strcasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef STRNCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRNCASECMP _strnicmp
|
||||
#else
|
||||
#define STRNCASECMP strncasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Standardized QA Start/Finish for CUDA SDK tests
|
||||
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||
|
||||
// Returns the index in exec_name at which the executable's base name begins,
// i.e. the position just past the last '\\' or '/' (0 when there is no separator).
inline int findExeNameStart(const char *exec_name)
{
    // Walk backwards from the terminating NUL until a path separator or index 0 is reached
    int pos = (int)strlen(exec_name);
    for (; pos > 0; --pos)
    {
        const char c = exec_name[pos];
        if (c == '\\' || c == '/')
        {
            break;
        }
    }

    // Index 0 can itself hold a separator (e.g. "/foo"), so re-check where the scan stopped
    const bool bOnSeparator = (exec_name[pos] == '\\') || (exec_name[pos] == '/');
    return bOnSeparator ? pos + 1 : pos;
}
|
||||
|
||||
inline int __shrQAStart(int argc, char **argv)
|
||||
{
|
||||
bool bQATest = false;
|
||||
// First clear the output buffer
|
||||
fflush(stdout);
|
||||
fflush(stdout);
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
char *string_argv = &argv[i][string_start];
|
||||
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't want to print the entire path, so we search for the first
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
return exename_start;
|
||||
}
|
||||
|
||||
// Result codes passed to the shrQAFinish* helpers; the values double as indices
// into their { "FAILED", "PASSED", "WAIVED" } status-name tables.
enum eQAstatus
{
    QA_FAILED = 0,
    QA_PASSED = 1,
    QA_WAIVED = 2
};
|
||||
|
||||
// Prints a countdown to stdout and blocks until `seconds` seconds have elapsed.
// With seconds <= 0 the loop body never runs and only the header and "done!" are printed.
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    time_t t;
    int count;
    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
        fprintf(stdout, "%d...", count);
        // The countdown has no newline, so flush it explicitly; otherwise a buffered
        // stdout may hold the text back until after the wait is over
        fflush(stdout);
        // Use _WIN32 like the include guards at the top of this header, so the branch
        // taken here always matches the headers that were actually included
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
|
||||
|
||||
|
||||
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bNoPrompt = true;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bNoPrompt = false;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
} else {
|
||||
if (!bNoPrompt) {
|
||||
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||
fflush(stdout);
|
||||
getchar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
bool bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
}
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish(argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#endif
|
||||
642
tests/opencl/DotProduct/shrUtils.h
Normal file
642
tests/opencl/DotProduct/shrUtils.h
Normal file
@@ -0,0 +1,642 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_UTILS_H
|
||||
#define SHR_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// reminders for output window and build log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#endif
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
// Other headers needed for both Windows and Linux
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Un-comment the following #define to enable profiling code in SDK apps
|
||||
//#define GPU_PROFILING
|
||||
|
||||
// Beginning of GPU Architecture definitions
|
||||
// Maps an SM (compute capability) version to the number of cores per SM.
// Returns -1 and prints a notice when the version is not in the table.
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One row per known GPU architecture, keyed by the SM version
    struct sSMtoCores {
        int SM;     // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;  // cores per SM for that version
    };

    static const sSMtoCores kCoresPerSM[] = {
        { 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }  // Kepler Generation (SM 3.0) GK10x class
    };
    const int kNumEntries = (int)(sizeof(kCoresPerSM) / sizeof(kCoresPerSM[0]));

    // Pack major/minor into the same 0xMm form used as the table key
    const int wanted = (major << 4) + minor;
    for (int i = 0; i < kNumEntries; ++i) {
        if (kCoresPerSM[i].SM == wanted) {
            return kCoresPerSM[i].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||
// end of GPU Architecture definitions
|
||||
|
||||
|
||||
// Defines and enum for use with logging functions
|
||||
// *********************************************************************
|
||||
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||
// Bit flags combined with | and passed as the iLogMode argument of shrLogEx
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // bit to signal "log to console"
    LOGFILE    = 1 << 1,                // bit to signal "log to file"
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // convenience union of first 2 bits to signal "log to both"
    APPENDMODE = 1 << 2,                // bit to set "file append" mode instead of "replace mode" on open
    MASTER     = 1 << 3,                // bit to signal master .csv log output
    ERRORMSG   = 1 << 4,                // bit to signal "pre-pend Error"
    CLOSELOG   = 1 << 5                 // bit to close log file, if open, after any requested file write
};
|
||||
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||
|
||||
// Standardized boolean
|
||||
// Boolean result type returned by the shr* file, image, command-line and compare helpers
enum shrBOOL
{
    shrFALSE = 0,   // failure / condition not met
    shrTRUE  = 1    // success / condition met
};
|
||||
|
||||
// Standardized MAX, MIN and CLAMP
|
||||
// Every macro argument is parenthesized so that argument expressions containing
// low-precedence operators (|, &, ?:, ...) are evaluated as a unit.
// NOTE: arguments are still evaluated more than once; avoid side effects (e.g. i++).
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX(a, b), c)            // double sided clip of input a
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))    // single top side clip of input a
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... extended version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... short version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||
|
||||
// Simple argument checker macro
|
||||
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
|
||||
|
||||
// Define for user-customized error handling
|
||||
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||
|
||||
// Function to deallocate memory allocated within shrUtils
|
||||
// *********************************************************************
|
||||
extern "C" void shrFree(void* ptr);
|
||||
|
||||
// *********************************************************************
|
||||
// Helper function to log standardized information to Console, to File or to both
|
||||
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
//!
|
||||
//! Automatically opens file and stores handle if needed and not done yet
|
||||
//! Closes file and nulls handle on request
|
||||
//!
|
||||
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||
//! @param 2 dValue:
|
||||
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||
//! Negative val is an error code and this gives error preformatting.
|
||||
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
|
||||
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||
// *********************************************************************
|
||||
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||
|
||||
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
|
||||
// *********************************************************************
|
||||
extern "C" int shrLog(const char* cFormatString, ...);
|
||||
|
||||
// *********************************************************************
|
||||
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||
// Maintains state for 3 independent counters
|
||||
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||
//!
|
||||
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||
// *********************************************************************
|
||||
extern "C" double shrDeltaT(int iCounterID);
|
||||
|
||||
// Optional LogFileNameOverride function
|
||||
// *********************************************************************
|
||||
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||
|
||||
// Helper function to init data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||
|
||||
// Helper function to print data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param OutData handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//!
|
||||
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag_name is given
|
||||
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||
//! otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type unsigned int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||
const char* arg_name, unsigned int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list whose elements are strings
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with a tolerance for # of byte errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||
//! threshold for # pixel errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||
|
||||
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
if (iReference != iSample)
|
||||
{
|
||||
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Standardized Exit
|
||||
// *********************************************************************
|
||||
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||
{
|
||||
#ifdef WIN32
|
||||
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#else
|
||||
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#endif
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||
getchar();
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||
}
|
||||
fflush(stderr);
|
||||
exit(iExitCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
31
tests/opencl/Makefile
Normal file
31
tests/opencl/Makefile
Normal file
@@ -0,0 +1,31 @@
|
||||
# Aggregates the OpenCL tests: every target simply forwards to the Makefile of
# each test sub-directory, in the order listed, stopping at the first failure.
#
# NOTE(review): "guassian" looks like a misspelling of "gaussian"; it is kept
# as-is because it must match the on-disk directory name -- confirm before renaming.

# These targets are commands, not files; without this declaration a file or
# directory named "all", "run", "clean" or "clean-all" would silently disable them.
.PHONY: all run clean clean-all

# Build every test.
all:
	$(MAKE) -C vecadd
	$(MAKE) -C sgemm
	$(MAKE) -C saxpy
	$(MAKE) -C sfilter
	$(MAKE) -C nearn
	$(MAKE) -C guassian

# Run every test through its run-vlsim target.
run:
	$(MAKE) -C vecadd run-vlsim
	$(MAKE) -C sgemm run-vlsim
	$(MAKE) -C saxpy run-vlsim
	$(MAKE) -C sfilter run-vlsim
	$(MAKE) -C nearn run-vlsim
	$(MAKE) -C guassian run-vlsim

# Forward "clean" to every test.
clean:
	$(MAKE) -C vecadd clean
	$(MAKE) -C sgemm clean
	$(MAKE) -C saxpy clean
	$(MAKE) -C sfilter clean
	$(MAKE) -C nearn clean
	$(MAKE) -C guassian clean

# Forward "clean-all" to every test.
clean-all:
	$(MAKE) -C vecadd clean-all
	$(MAKE) -C sgemm clean-all
	$(MAKE) -C saxpy clean-all
	$(MAKE) -C sfilter clean-all
	$(MAKE) -C nearn clean-all
	$(MAKE) -C guassian clean-all
|
||||
65
tests/opencl/VectorHypot/Makefile
Normal file
65
tests/opencl/VectorHypot/Makefile
Normal file
@@ -0,0 +1,65 @@
|
||||
# Builds the VectorHypot OpenCL test: the kernel is compiled with poclcc into a
# static library, linked with main.cc and the Vortex runtime sources into an
# ELF, and converted to an Intel-hex image plus a disassembly dump.

# Remove a target whose recipe failed, so a truncated file (e.g. a partial
# $(PROJECT).dump written through a shell redirect) is never seen as up to date.
.DELETE_ON_ERROR:

# Tool and library locations; each may be overridden from the environment or
# the command line.
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)

# Base name of every artifact produced here (and of the kernel source file).
PROJECT = VectorHypot

CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb

# Vortex runtime sources compiled directly into the ELF.
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link.ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and exceptions
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

# The kernel library is linked with --whole-archive so that all of its members
# are pulled in, not only those referenced by main.cc.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

# Command targets (no file of that name is produced).
.PHONY: all run qemu gdb-s gdb-c clean clean-all

all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel. The recipe uses $< so that it compiles the file
# named as the prerequisite (previously it referenced a different file,
# kernel.cl, than the one the rule depends on).
lib$(PROJECT).a: $(PROJECT).cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $@

$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Run the hex image on the simX emulator; stdout is captured in emulator.debug.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

# Run the QEMU build, logging translated assembly to debug.log.
qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

# Same as "qemu" but with the gdb stub listening on port 1234.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

# Start gdb on the QEMU build.
gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

# Remove the linked binaries and their derived files; the compiled kernel
# library is kept.
clean:
	rm -rf *.elf *.dump *.hex $(PROJECT).qemu

# Additionally remove the compiled kernel library.
clean-all: clean
	rm -rf lib$(PROJECT).a
|
||||
41
tests/opencl/VectorHypot/VectorHypot.cl
Normal file
41
tests/opencl/VectorHypot/VectorHypot.cl
Normal file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// OpenCL Kernel Function Naive Implementation for hypotenuse
|
||||
// Computes the element-wise hypotenuse of one float4 from fg4A and fg4B and
// stores it in fg4Hypot.
//   fg4A, fg4B      : global input arrays holding the two 'legs'
//   fg4Hypot        : global output array receiving the hypotenuses
//   uiOffset        : added to get_global_id(0) to form the array index
//   iInnerLoopCount : how many times the computation is repeated; it only
//                     lengthens the run time, the inputs never change
//   uiNumElements   : exclusive upper bound for the index; work items at or
//                     beyond it return without touching memory
__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
{
    // position of this work item's float4 within the global arrays
    const size_t szIndex = get_global_id(0) + uiOffset;

    // only in-range work items read or write anything
    if (szIndex < uiNumElements)
    {
        // each work item handles 4 elements: fetch both source vectors from GMEM
        const float4 f4LegA = fg4A[szIndex];
        const float4 f4LegB = fg4B[szIndex];

        // remains zero when iInnerLoopCount is not positive
        float4 f4Result = (float4)0.0f;

        // repeat the same computation iInnerLoopCount times to stretch compute time
        for (int iPass = iInnerLoopCount; iPass > 0; --iPass)
        {
            // one built-in hypot() call per component
            f4Result = (float4)(hypot(f4LegA.x, f4LegB.x),
                                hypot(f4LegA.y, f4LegB.y),
                                hypot(f4LegA.z, f4LegB.z),
                                hypot(f4LegA.w, f4LegB.w));
        }

        // store the 4 results back to GMEM
        fg4Hypot[szIndex] = f4Result;
    }
}
|
||||
686
tests/opencl/VectorHypot/main.cc
Normal file
686
tests/opencl/VectorHypot/main.cc
Normal file
@@ -0,0 +1,686 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// *********************************************************************
|
||||
// oclCopyComputeOverlap Notes:
|
||||
//
|
||||
// OpenCL API demo application for NVIDIA CUDA GPU's that implements a
|
||||
// element by element vector hypotenuse computation using 2 input float arrays
|
||||
// and 1 output float array.
|
||||
//
|
||||
// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
|
||||
// with respect to GPU computation (and with respect to host thread).
|
||||
//
|
||||
// Because the overlap achievable for this computation and data set on a given system depends upon the GPU being used and the
|
||||
// GPU/Host bandwidth, the sample adjusts the computation duration to test the most ideal case and test against a consistent standard.
|
||||
// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
|
||||
//
|
||||
// After setup, warmup and calibration to the system, the sample runs 4 scenarios:
|
||||
// A) Computations with 2 command queues on GPU
|
||||
// A multiple-cycle sequence is executed, timed and compared against the host
|
||||
// B) Computations with 1 command queue on GPU
|
||||
// A multiple-cycle sequence is executed, timed and compared against the host
|
||||
//
|
||||
// The 2-command queue approach ought to be substantially faster
|
||||
//
|
||||
// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently
|
||||
// increases compute time without increasing data size (via a loop inside the kernel)
|
||||
//
|
||||
// At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
|
||||
// (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
|
||||
//
|
||||
// If we name the time to copy a single input vector H2D (or output vector D2H) as "T", then the optimum comparison case is:
|
||||
//
|
||||
// Single Queue with all the data and all the work
|
||||
// Ttot (serial) = 4T + 4T + 2T = 10T
|
||||
//
|
||||
// Dual Queue, where each queue has 1/2 the data and 1/2 the work
|
||||
// Tq0 (overlap) = 2T + 2T + T ....
|
||||
// Tq1 (overlap) = .... 2T + 2T + T
|
||||
//
|
||||
// Ttot (elapsed, wall) = 2T + 2T + 2T + T = 7T
|
||||
//
|
||||
// Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 % (Tesla arch 1.2 or 1.3, single copy engine)
|
||||
//
|
||||
// For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
|
||||
// This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.
|
||||
// *********************************************************************
|
||||
|
||||
// common SDK header for standard utilities and system libs
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
|
||||
// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
|
||||
// values greater than 0.0f represent a speed-up relative to non-overlapped
|
||||
#define EXPECTED_OVERLAP 30.0f
|
||||
#define EXPECTED_OVERLAP_FERMI 45.0f
|
||||
#define PASS_FACTOR 0.60f
|
||||
#define RETRIES_ON_FAILURE 1
|
||||
|
||||
// Base sizes for parameters manipulated dynamically or on the command line
|
||||
#define BASE_WORK_ITEMS 64
|
||||
#define BASE_ARRAY_LENGTH 40000
|
||||
#define BASE_LOOP_COUNT 32
|
||||
|
||||
// Vars
|
||||
// *********************************************************************
|
||||
cl_platform_id cpPlatform; // OpenCL platform
|
||||
cl_context cxGPUContext; // OpenCL context
|
||||
cl_command_queue cqCommandQueue[2]; // OpenCL command queues
|
||||
cl_device_id* cdDevices; // OpenCL device list
|
||||
cl_program cpProgram; // OpenCL program
|
||||
cl_kernel ckKernel[2]; // OpenCL kernel, 1 per queue
|
||||
cl_mem cmPinnedSrcA; // OpenCL pinned host source buffer A
|
||||
cl_mem cmPinnedSrcB; // OpenCL pinned host source buffer B
|
||||
cl_mem cmPinnedResult; // OpenCL pinned host result buffer
|
||||
float* fSourceA = NULL; // Mapped pointer for pinned Host source A buffer
|
||||
float* fSourceB = NULL; // Mapped pointer for pinned Host source B buffer
|
||||
float* fResult = NULL; // Mapped pointer for pinned Host result buffer
|
||||
cl_mem cmDevSrcA; // OpenCL device source buffer A
|
||||
cl_mem cmDevSrcB; // OpenCL device source buffer B
|
||||
cl_mem cmDevResult; // OpenCL device result buffer
|
||||
size_t szBuffBytes; // Size of main buffers
|
||||
size_t szGlobalWorkSize; // 1D var for Total # of work items in the launched ND range
|
||||
size_t szLocalWorkSize = BASE_WORK_ITEMS; // initial # of work items in the work group
|
||||
cl_int ciErrNum; // Error code var
|
||||
char* cPathAndName = NULL; // Var for full paths to data, src, etc.
|
||||
char* cSourceCL = NULL; // Buffer to hold source for compilation
|
||||
const char* cExecutableName = NULL;
|
||||
|
||||
// demo config vars
|
||||
const char* cSourceFile = "VectorHypot.cl"; // OpenCL computation kernel source code
|
||||
float* Golden = NULL; // temp buffer to hold golden results for cross check
|
||||
bool bNoPrompt = false; // Command line switch to skip exit prompt
|
||||
bool bQATest = false; // Command line switch to test
|
||||
|
||||
// Forward Declarations
|
||||
// *********************************************************************
|
||||
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
|
||||
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
|
||||
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles);
|
||||
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
|
||||
void Cleanup (int iExitCode);
|
||||
void (*pCleanup)(int) = &Cleanup;
|
||||
|
||||
int *gp_argc = 0;
|
||||
const char *** gp_argv = NULL;
|
||||
|
||||
// Main function
|
||||
// *********************************************************************
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
//Locals
|
||||
size_t szKernelLength; // Byte size of kernel code
|
||||
double dBuildTime; // Compile time
|
||||
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||
cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
|
||||
cl_uint uiNumDevices; // Number of devices available
|
||||
int iDevCap = -1; // Capability of device
|
||||
int iInnerLoopCount = BASE_LOOP_COUNT; // Varies "compute intensity" per data within the kernel
|
||||
const int iTestCycles = 10; // How many times to run the external test loop
|
||||
const int iWarmupCycles = 8; // How many times to run the warmup sequence
|
||||
cl_uint uiWorkGroupMultiple = 4; // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
|
||||
cl_uint uiNumElements = BASE_ARRAY_LENGTH; // initial # of elements per array to process (note: procesing 4 per work item)
|
||||
cl_uint uiSizeMultiple = 4; // Command line var (using "sizemult=<n>") to optionally increase vector sizes
|
||||
bool bPassFlag = false; // Var to accumulate test pass/fail
|
||||
shrBOOL bMatch = shrFALSE; // Cross check result
|
||||
shrBOOL bTestOverlap = shrFALSE;
|
||||
double dAvgGPUTime[2] = {0.0, 0.0}; // Average time of iTestCycles calls for 2-Queue and 1-Queue test
|
||||
double dHostTime[2] = {0.0, 0.0}; // Host computation time (2nd test is redundant but a good stability indicator)
|
||||
float fMinPassCriteria[2] = {0.0f, 0.0f}; // Test pass cireria, adjusted dependant on GPU arch
|
||||
|
||||
gp_argc = &argc;
|
||||
gp_argv = &argv;
|
||||
|
||||
shrQAStart(argc, (char **)argv);
|
||||
|
||||
// start logs
|
||||
cExecutableName = argv[0];
|
||||
shrSetLogFileName ("oclCopyComputeOverlap.txt");
|
||||
shrLog("%s Starting...\n\n", argv[0]);
|
||||
|
||||
// get basic command line args
|
||||
bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
|
||||
bQATest = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
|
||||
shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
|
||||
|
||||
// Optional Command-line multiplier for vector size
|
||||
// Default val of 4 gives 10.24 million float elements per vector
|
||||
// Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
|
||||
shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
|
||||
uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);
|
||||
uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
|
||||
shrLog("Array sizes = %u float elements\n", uiNumElements);
|
||||
|
||||
// Optional Command-line multiplier for workgroup size (x 64 work items)
|
||||
// Default val of 4 gives szLocalWorkSize of 256.
|
||||
// Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
|
||||
shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
|
||||
uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10);
|
||||
szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
|
||||
shrLog("Workgroup Size = %u\n\n", szLocalWorkSize);
|
||||
|
||||
// Get the NVIDIA platform if available, otherwise use default
|
||||
shrLog("Get the Platform ID...\n\n");
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Get OpenCL platform name and version
|
||||
char cBuffer[256];
|
||||
ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("Platform Name = %s\n\n", cBuffer);
|
||||
|
||||
// Get all the devices
|
||||
shrLog("Get the Device info and select Device...\n");
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
|
||||
|
||||
// Ethans changes
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
|
||||
//ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Set target device and check capabilities
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||
shrLog(" Using Device %u, ", uiTargetDevice);
|
||||
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||
iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
|
||||
if (iDevCap > 0) {
|
||||
shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
|
||||
} else {
|
||||
shrLog("\n\n", iDevCap);
|
||||
}
|
||||
if (strstr(cBuffer, "NVIDIA") != NULL)
|
||||
{
|
||||
if (iDevCap < 12)
|
||||
{
|
||||
shrLog("Device doesn't have overlap capability. Skipping test...\n");
|
||||
Cleanup (EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
// Device and Platform eligible for overlap testing
|
||||
bTestOverlap = shrTRUE;
|
||||
|
||||
// If device has overlap capability, proceed
|
||||
fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP; // 1st cycle overlap is same for 1 or 2 copy engines
|
||||
if (iDevCap != 20)
|
||||
{
|
||||
// Single copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // avg of many cycles
|
||||
}
|
||||
else
|
||||
{
|
||||
char cDevName[1024];
|
||||
clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
|
||||
if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
|
||||
{
|
||||
// Tesla or Quadro (arch = 2.0) ... Dual copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI; // average of many cycles
|
||||
}
|
||||
else
|
||||
{
|
||||
// Geforce ... Single copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // average of many cycles
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create the context
|
||||
shrLog("clCreateContext...\n");
|
||||
cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Create 2 command-queues
|
||||
cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateCommandQueue [0]...\n");
|
||||
cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateCommandQueue [1]...\n");
|
||||
|
||||
// Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
|
||||
szBuffBytes = sizeof(cl_float) * uiNumElements;
|
||||
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements);
|
||||
|
||||
// Allocate pinned source and result host buffers:
|
||||
// Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
|
||||
cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements);
|
||||
|
||||
// Get mapped pointers to pinned input host buffers
|
||||
// Note: This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
|
||||
fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n");
|
||||
|
||||
// Alloc temp golden buffer for cross checks
|
||||
Golden = (float*)malloc(szBuffBytes);
|
||||
//oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
|
||||
|
||||
// Read the OpenCL kernel in from source file
|
||||
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
|
||||
//oclCheckError(cPathAndName != NULL, shrTRUE);
|
||||
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
|
||||
// oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
|
||||
|
||||
// Create the program object
|
||||
//cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateProgramWithSource...\n");
|
||||
cl_program program =
|
||||
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "VectorHypot", NULL);
|
||||
// Build the program for the target device
|
||||
clFinish(cqCommandQueue[0]);
|
||||
shrDeltaT(0);
|
||||
ciErrNum = clBuildProgram(program, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
|
||||
shrLog("clBuildProgram...");
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
// write out standard error, Build Log and PTX, then cleanup and exit
|
||||
shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
|
||||
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
|
||||
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
|
||||
Cleanup(EXIT_FAILURE);
|
||||
}
|
||||
dBuildTime = shrDeltaT(0);
|
||||
|
||||
// Ethan - Kernel Addition
|
||||
|
||||
if (program == NULL) {
|
||||
std::cerr << "Failed to write program binary" << std::endl;
|
||||
Cleanup(context, queue, program, kernel, memObjects);
|
||||
return 1;
|
||||
} else {
|
||||
std::cout << "Read program from binary." << std::endl;
|
||||
}
|
||||
|
||||
// Create the kernel
|
||||
ckKernel[0] = clCreateKernel(program, "VectorHypot", &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
ckKernel[1] = clCreateKernel(program, "VectorHypot", &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateKernel (ckKernel[2])...\n");
|
||||
|
||||
// Offsets for 2 queues
|
||||
cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
|
||||
|
||||
// Set the Argument values for the 1st kernel instance (queue 0)
|
||||
ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
|
||||
shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n");
|
||||
|
||||
// Set the Argument values for the 2d kernel instance (queue 1)
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n");
|
||||
|
||||
//*******************************************
|
||||
// Warmup the driver with dual queue sequence
|
||||
//*******************************************
|
||||
|
||||
// Warmup with dual queue sequence for iTestCycles
|
||||
shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
|
||||
DualQueueSequence(iWarmupCycles, uiNumElements, false);
|
||||
|
||||
// Use single queue config to adjust compute intensity
|
||||
shrLog("Adjust compute for GPU / system...\n");
|
||||
iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles);
|
||||
shrLog(" Kernel inner loop count = %d\n", iInnerLoopCount);
|
||||
|
||||
//*******************************************
|
||||
// Run and time with 2 command-queues
|
||||
//*******************************************
|
||||
for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
|
||||
|
||||
// Run the sequence iTestCycles times
|
||||
dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
|
||||
|
||||
// Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrLog(" Device vs Host Result Comparison\t: ");
|
||||
VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
|
||||
shrDeltaT(0);
|
||||
for (int i = 0; i < iTestCycles; i++)
|
||||
{
|
||||
VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
|
||||
}
|
||||
dHostTime[0] = shrDeltaT(0)/iTestCycles;
|
||||
|
||||
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
|
||||
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
|
||||
bPassFlag = (bMatch == shrTRUE);
|
||||
|
||||
//*******************************************
|
||||
// Run and time with 1 command queue
|
||||
//*******************************************
|
||||
// Run the sequence iTestCycles times
|
||||
dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
|
||||
|
||||
// Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrLog(" Device vs Host Result Comparison\t: ");
|
||||
shrDeltaT(0);
|
||||
for (int i = 0; i < iTestCycles; i++)
|
||||
{
|
||||
VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);
|
||||
}
|
||||
dHostTime[1] = shrDeltaT(0)/iTestCycles;
|
||||
|
||||
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
|
||||
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
|
||||
bPassFlag &= (bMatch == shrTRUE);
|
||||
|
||||
//*******************************************
|
||||
|
||||
// Compare Single and Dual queue timing
|
||||
shrLog("\nResult Summary:\n");
|
||||
|
||||
// Log GPU and CPU Time for 2-queue scenario
|
||||
shrLog(" Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
|
||||
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
|
||||
|
||||
// Log GPU and CPU Time for 1-queue scenario
|
||||
shrLog(" Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
|
||||
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
|
||||
|
||||
// Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
|
||||
double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
|
||||
|
||||
if( bTestOverlap ) {
|
||||
bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
|
||||
if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
|
||||
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
|
||||
|
||||
// Log info to master log in standard format
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||
dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize);
|
||||
|
||||
bPassFlag &= bAvgOverlapOK;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
|
||||
}
|
||||
|
||||
|
||||
//*******************************************
|
||||
// Report pass/fail, cleanup and exit
|
||||
Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Run 1 queue sequence for n cycles
|
||||
// *********************************************************************
|
||||
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
|
||||
{
|
||||
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
|
||||
|
||||
// *** Make sure queues are empty and then start timer
|
||||
double dAvgTime = 0.0;
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
// Run the sequence iCycles times
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Nonblocking Write of all of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Launch kernel computation, command-queue 0
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Non Blocking Read of output data from device to host, command-queue 0
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
}
|
||||
|
||||
// *** Assure sync to host and return average sequence time
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dAvgTime = shrDeltaT(0)/(double)iCycles;
|
||||
|
||||
// Log config if asked for
|
||||
if (bShowConfig)
|
||||
{
|
||||
shrLog("\n1-Queue sequence Configuration:\n");
|
||||
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 1\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
|
||||
}
|
||||
return dAvgTime;
|
||||
}
|
||||
|
||||
// Run 2 queue sequence for n cycles
|
||||
// *********************************************************************
|
||||
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
|
||||
{
|
||||
// Locals
|
||||
size_t szHalfBuffer = szBuffBytes / 2;
|
||||
size_t szHalfOffset = szHalfBuffer / sizeof(float);
|
||||
double dAvgTime = 0.0;
|
||||
|
||||
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Set Global work size for 2 command-queues, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));
|
||||
|
||||
// Make sure queues are empty and then start timer
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Mid Phase 0
|
||||
// Nonblocking Write of 1st half of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 1 ***********************************
|
||||
|
||||
// Launch kernel computation, command-queue 0
|
||||
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Nonblocking Write of 2nd half of input data from host to device in command-queue 1
|
||||
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the compute for queue 0 and write for queue 1 to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 2 ***********************************
|
||||
|
||||
// Launch kernel computation, command-queue 1
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Non Blocking Read of 1st half of output data from device to host, command-queue 0
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the compute for queue 1 and the read for queue 0 to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 0 (Rolls over) ***********************************
|
||||
|
||||
// Non Blocking Read of 2nd half of output data from device to host, command-queue 1
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
|
||||
// *** Sync to host and get average sequence time
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
dAvgTime = shrDeltaT(0)/(double)iCycles;
|
||||
|
||||
// Log config if asked for
|
||||
if (bShowConfig)
|
||||
{
|
||||
shrLog("\n2-Queue sequence Configuration:\n");
|
||||
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 2\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
|
||||
}
|
||||
|
||||
return dAvgTime;
|
||||
}
|
||||
|
||||
// Function to adjust compute task according to device capability
|
||||
// This allows a consistent overlap % across a wide variety of GPU's for test purposes
|
||||
// It also implitly illustrates the relationship between compute capability and overlap at fixed work size
|
||||
// *********************************************************************
|
||||
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
|
||||
{
|
||||
// Locals
|
||||
double dCopyTime, dComputeTime;
|
||||
int iComputedLoopCount;
|
||||
|
||||
// Change Source Data
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
|
||||
|
||||
// *** Make sure queues are empty and then start timer
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
// Run the copy iCycles times and measure copy time on this system
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Nonblocking Write of all of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
ciErrNum |= clFlush(cqCommandQueue[0]);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dCopyTime = shrDeltaT(0);
|
||||
|
||||
// Run the compute iCycles times and measure compute time on this system
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Launch kernel computation, command-queue 0
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
ciErrNum |= clFlush(cqCommandQueue[0]);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
}
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dComputeTime = shrDeltaT(0);
|
||||
|
||||
// Determine number of core loop cycles proportional to copy/compute time ratio
|
||||
dComputeTime = MAX(dComputeTime, 1.0e-6);
|
||||
iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
return (iComputedLoopCount);
|
||||
}
|
||||
|
||||
// Cleanup/Exit function
|
||||
// *********************************************************************
|
||||
void Cleanup (int iExitCode)
|
||||
{
|
||||
// Cleanup allocated objects
|
||||
shrLog("Starting Cleanup...\n\n");
|
||||
if(cPathAndName)free(cPathAndName);
|
||||
if(cSourceCL)free(cSourceCL);
|
||||
if(Golden)free(Golden);
|
||||
if(ckKernel[0])clReleaseKernel(ckKernel[0]);
|
||||
if(ckKernel[1])clReleaseKernel(ckKernel[1]);
|
||||
if(program)clReleaseProgram(program);
|
||||
if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
|
||||
if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
|
||||
if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
|
||||
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
|
||||
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
|
||||
if(cmDevResult)clReleaseMemObject(cmDevResult);
|
||||
if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
|
||||
if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
|
||||
if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
|
||||
if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
|
||||
if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
|
||||
if(cxGPUContext)clReleaseContext(cxGPUContext);
|
||||
if(cdDevices)free(cdDevices);
|
||||
|
||||
// Master status Pass/Fail (all tests)
|
||||
shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
|
||||
}
|
||||
|
||||
// "Golden" host reference of the vector hypotenuse function, used for comparison purposes
// *********************************************************************
// Writes sqrtf(a*a + b*b) for every element pair of the two inputs.
//
//   pfData1, pfData2  input arrays holding at least uiNumElements floats each
//   pfResult          output array receiving uiNumElements floats
//   uiNumElements     number of elements to process
//   iInnerLoopCount   accepted for signature compatibility; not used by the body
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
{
    (void)iInnerLoopCount;

    for (unsigned int uiIdx = 0; uiIdx != uiNumElements; ++uiIdx)
    {
        const float fLegX = pfData1[uiIdx];
        const float fLegY = pfData2[uiIdx];
        pfResult[uiIdx] = sqrtf(fLegX * fLegX + fLegY * fLegY);
    }
}
|
||||
198
tests/opencl/VectorHypot/oclUtils.h
Normal file
198
tests/opencl/VectorHypot/oclUtils.h
Normal file
@@ -0,0 +1,198 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef OCL_UTILS_H
|
||||
#define OCL_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// Common headers: Cross-API utilities and OpenCL header
|
||||
#include <shrUtils.h>
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// Includes
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||
// extensions from <CL/cl_ext.h>
|
||||
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
#endif
|
||||
|
||||
// reminders for build output window and log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including shrUtils.h")
|
||||
#pragma message ("Note: including opencl.h")
|
||||
#endif
|
||||
|
||||
// SDK Revision #
|
||||
#define OCL_SDKREVISION "7027912"
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" int oclGetDevCap(cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Loads a Program file and prepends the cPreamble to the code.
|
||||
//!
|
||||
//! @return the source string if succeeded, 0 otherwise
|
||||
//! @param cFilename program filename
|
||||
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||
//! @param szFinalLength returned length of the code string
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclErrorString(cl_int error);
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
// An error condition is defined by the sample/test value not equal to the reference
|
||||
if (iReference != iSample)
|
||||
{
|
||||
// If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
|
||||
iSample = (iSample == 0) ? -9999 : iSample;
|
||||
|
||||
// Log the error info
|
||||
shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
|
||||
|
||||
// Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code.
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(iSample);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(iSample);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
238
tests/opencl/VectorHypot/shrQATest.h
Normal file
238
tests/opencl/VectorHypot/shrQATest.h
Normal file
@@ -0,0 +1,238 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_QATEST_H
|
||||
#define SHR_QATEST_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#pragma message ("Note: including time.h")
|
||||
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifndef STRCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRCASECMP _stricmp
|
||||
#else
|
||||
#define STRCASECMP strcasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef STRNCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRNCASECMP _strnicmp
|
||||
#else
|
||||
#define STRNCASECMP strncasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Standardized QA Start/Finish for CUDA SDK tests
|
||||
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||
|
||||
// Returns the index in exec_name of the first character after the right-most path separator
// ('/' or '\\'), i.e. where the bare executable name starts. Returns 0 when the string contains
// no separator (including the empty string).
inline int findExeNameStart(const char *exec_name)
{
    // Scan backwards from the last character; the first separator met is the right-most one
    for (int pos = (int)strlen(exec_name) - 1; pos >= 0; --pos)
    {
        const char ch = exec_name[pos];
        if (ch == '/' || ch == '\\')
        {
            return pos + 1;
        }
    }

    // No separator anywhere: the whole string is the executable name
    return 0;
}
|
||||
|
||||
inline int __shrQAStart(int argc, char **argv)
|
||||
{
|
||||
bool bQATest = false;
|
||||
// First clear the output buffer
|
||||
fflush(stdout);
|
||||
fflush(stdout);
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
char *string_argv = &argv[i][string_start];
|
||||
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't want to print the entire path, so we search for the first
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
return exename_start;
|
||||
}
|
||||
|
||||
// Test outcome codes. The numeric values line up with the { "FAILED", "PASSED", "WAIVED" }
// name tables indexed by the status argument in __shrQAFinish() / __shrQAFinish2().
enum eQAstatus {
    QA_FAILED = 0,  // 0
    QA_PASSED,      // 1
    QA_WAIVED       // 2
};
|
||||
|
||||
// Prints a countdown to stdout and returns once `seconds` seconds of wall-clock time have
// elapsed (measured with time()), printing one "<n>..." tick roughly every second.
// A value of 0 (or less) prints the header and "done!" without waiting.
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    const time_t deadline = time(0) + seconds;
    for (int count = seconds; time(0) < deadline; count--) {
        fprintf(stdout, "%d...", count);
        // The tick has no trailing newline, so flush explicitly; otherwise a buffered stdout
        // would show the whole countdown at once when the function finishes
        fflush(stdout);
        // _WIN32 is the compiler-defined macro and the one the rest of this header tests;
        // plain WIN32 is only defined by some project settings / SDK headers
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout, "done!\n\n");
    fflush(stdout);
}
|
||||
|
||||
|
||||
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bNoPrompt = true;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bNoPrompt = false;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
} else {
|
||||
if (!bNoPrompt) {
|
||||
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||
fflush(stdout);
|
||||
getchar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
bool bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
}
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish(argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#endif
|
||||
642
tests/opencl/VectorHypot/shrUtils.h
Normal file
642
tests/opencl/VectorHypot/shrUtils.h
Normal file
@@ -0,0 +1,642 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_UTILS_H
|
||||
#define SHR_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// reminders for output window and build log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#endif
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
// Other headers needed for both Windows and Linux
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Un-comment the following #define to enable profiling code in SDK apps
|
||||
//#define GPU_PROFILING
|
||||
|
||||
// Beginning of GPU Architecture definitions
|
||||
// Maps a GPU SM (compute capability) version to the number of cores per SM.
//   major, minor : the SM version, e.g. (2, 1) for SM 2.1
//   returns      : cores per SM for a known version; otherwise prints a diagnostic to stdout
//                  and returns -1
inline int ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    typedef struct {
        int SM;    // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores; // number of cores per SM for that version
    } sSMtoCores;

    // Constant lookup table: built once instead of being re-initialised on every call
    static const sSMtoCores nGpuArchCoresPerSM[] =
    { { 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
      { 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
      { 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
      { 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
      { 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
      { 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
      { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
      {   -1,  -1 }  // end-of-table sentinel
    };

    // Pack the version the same way the table keys are encoded (0xMm)
    const int smVersion = (major << 4) + minor;

    for (int index = 0; nGpuArchCoresPerSM[index].SM != -1; index++) {
        if (nGpuArchCoresPerSM[index].SM == smVersion) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||
// end of GPU Architecture definitions
|
||||
|
||||
|
||||
// Defines and enum for use with logging functions
|
||||
// *********************************************************************
|
||||
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||
// Bit flags for the iLogMode argument of the logging functions; combine them with '|'
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // (1)  bit to signal "log to console"
    LOGFILE    = 1 << 1,                // (2)  bit to signal "log to file"
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // (3)  convenience union of first 2 bits to signal "log to both"
    APPENDMODE = 1 << 2,                // (4)  bit to set "file append" mode instead of "replace mode" on open
    MASTER     = 1 << 3,                // (8)  bit to signal master .csv log output
    ERRORMSG   = 1 << 4,                // (16) bit to signal "pre-pend Error"
    CLOSELOG   = 1 << 5                 // (32) bit to close log file, if open, after any requested file write
};
|
||||
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||
|
||||
// Standardized boolean
|
||||
// Boolean result type used by the shr* helper functions
enum shrBOOL
{
    shrFALSE = 0,   // 0
    shrTRUE         // 1
};
|
||||
|
||||
// Standardized MAX, MIN and CLAMP
|
||||
// Every macro argument is parenthesized so that operands containing lower-precedence
// operators expand correctly (e.g. MIN(x, y | 1) compares x against (y | 1)).
// NOTE: arguments are still evaluated more than once, so avoid side effects such as MAX(i++, j).
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX(a, b), c)            // double sided clip of input a to the range [b, c]
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))    // single top side clip of input a
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... extended version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... short version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||
|
||||
// Simple argument checker macro
|
||||
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
|
||||
|
||||
// Define for user-customized error handling
|
||||
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||
|
||||
// Function to deallocate memory allocated within shrUtils
|
||||
// *********************************************************************
|
||||
extern "C" void shrFree(void* ptr);
|
||||
|
||||
// *********************************************************************
|
||||
// Helper function to log standardized information to Console, to File or to both
|
||||
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
//!
|
||||
//! Automatically opens file and stores handle if needed and not done yet
|
||||
//! Closes file and nulls handle on request
|
||||
//!
|
||||
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||
//! @param 2 dValue:
|
||||
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||
//! Negative val is an error code and this give error preformatting.
|
||||
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||
//! @param 4... variable args: like printf or fprintf. Must match the format specifier types above.
|
||||
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||
// *********************************************************************
|
||||
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||
|
||||
// Short version of shrLogEx, defaulting to shrLogEx(LOGBOTH, 0, ...)
|
||||
// *********************************************************************
|
||||
extern "C" int shrLog(const char* cFormatString, ...);
|
||||
|
||||
// *********************************************************************
|
||||
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||
// Maintains state for 3 independent counters
|
||||
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||
//!
|
||||
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||
// *********************************************************************
|
||||
extern "C" double shrDeltaT(int iCounterID);
|
||||
|
||||
// Optional LogFileNameOverride function
|
||||
// *********************************************************************
|
||||
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||
|
||||
// Helper function to init data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||
|
||||
// Helper function to print data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param OutData handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//!
|
||||
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag-name is given
|
||||
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||
//! otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type unsigned int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||
const char* arg_name, unsigned int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list whose elements are strings
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with a tolerance for # of byte errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||
//! threshold for # pixel errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a src_file and \a ref_file are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! @note verboseErrors (output details of image mismatch to std::err) is not a parameter of this declaration
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a src_file and \a ref_file are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! @note verboseErrors (output details of image mismatch to std::err) is not a parameter of this declaration
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||
|
||||
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
if (iReference != iSample)
|
||||
{
|
||||
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Standardized Exit
|
||||
// *********************************************************************
|
||||
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||
{
|
||||
#ifdef WIN32
|
||||
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#else
|
||||
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#endif
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||
getchar();
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||
}
|
||||
fflush(stderr);
|
||||
exit(iExitCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
1847
tests/opencl/bfs/BFS_1.dump
Normal file
1847
tests/opencl/bfs/BFS_1.dump
Normal file
File diff suppressed because it is too large
Load Diff
1724
tests/opencl/bfs/BFS_2.dump
Normal file
1724
tests/opencl/bfs/BFS_2.dump
Normal file
File diff suppressed because it is too large
Load Diff
859
tests/opencl/bfs/CLHelper.h
Executable file
859
tests/opencl/bfs/CLHelper.h
Executable file
@@ -0,0 +1,859 @@
|
||||
//------------------------------------------
|
||||
//--cambine:helper function for OpenCL
|
||||
//--programmer: Jianbin Fang
|
||||
//--date: 27/12/2010
|
||||
//------------------------------------------
|
||||
#ifndef _CL_HELPER_
|
||||
#define _CL_HELPER_
|
||||
|
||||
#include <CL/cl.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
||||
using std::string;
|
||||
using std::ifstream;
|
||||
using std::cerr;
|
||||
using std::endl;
|
||||
using std::cout;
|
||||
//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
|
||||
#define WORK_DIM 2 // work-items dimensions
|
||||
|
||||
struct oclHandleStruct {
|
||||
cl_context context;
|
||||
cl_device_id *devices;
|
||||
cl_command_queue queue;
|
||||
cl_program program;
|
||||
cl_int cl_status;
|
||||
std::string error_str;
|
||||
std::vector<cl_kernel> kernel;
|
||||
};
|
||||
|
||||
struct oclHandleStruct oclHandles;
|
||||
|
||||
char kernel_file[100] = "Kernels.cl";
|
||||
int total_kernels = 2;
|
||||
string kernel_names[2] = {"BFS_1", "BFS_2"};
|
||||
int work_group_size = 512;
|
||||
int device_id_inused = 0; // deviced id used (default : 0)
|
||||
|
||||
// Reads the complete contents of `filename` into a newly malloc()'ed buffer.
//
// @param filename  path of the file to read
// @param data      receives the buffer; the caller owns it and must free() it
// @param size      receives the number of bytes stored in *data
// @return 0 on success; -1 on a null argument, if the file cannot be opened,
//         if its size cannot be determined, if memory cannot be allocated, or
//         if fewer bytes than expected could be read. *data and *size are
//         only written on success.
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // Open in binary mode: the caller loads a program binary, and text mode
  // may translate line endings (and shorten the read) on some platforms.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  // Determine the file size; both fseek() and ftell() can fail.
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);
  if (fsize < 0) {
    fprintf(stderr, "Failed to determine kernel file size.\n");
    fclose(fp);
    return -1;
  }
  rewind(fp);

  // Allocate at least one byte so an empty file still yields a valid,
  // free()-able pointer.
  const size_t expected = (size_t)fsize;
  uint8_t* buffer = (uint8_t*)malloc(expected > 0 ? expected : 1);
  if (NULL == buffer) {
    fprintf(stderr, "Failed to allocate memory for kernel.\n");
    fclose(fp);
    return -1;
  }

  const size_t received = fread(buffer, 1, expected, fp);
  fclose(fp);

  // A short read means the buffer does not hold the whole file.
  if (received != expected) {
    fprintf(stderr, "Failed to read kernel file.\n");
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = received;
  return 0;
}
|
||||
|
||||
/*
 * Converts the contents of a file into a string.
 *
 * The file is read in binary mode and returned byte-for-byte, including any
 * embedded NUL characters.
 *
 * Throws std::string if the file cannot be opened, if its size cannot be
 * determined, or if it cannot be read completely.
 */
std::string FileToString(const std::string fileName) {
  std::ifstream f(fileName.c_str(), std::ifstream::in | std::ifstream::binary);
  if (!f.is_open())
    throw std::string("FileToString()::Error: Unable to open file " + fileName);

  // Measure the file; tellg() reports -1 when the stream is in a failed state.
  f.seekg(0, std::ifstream::end);
  const std::streamoff fileSize = f.tellg();
  f.seekg(0, std::ifstream::beg);
  if (fileSize < 0 || !f)
    throw std::string("FileToString()::Error: Unable to determine size of file " + fileName);

  // Read straight into the string's storage: no manual new[]/delete[] and no
  // truncation at the first NUL byte.
  std::string contents(static_cast<size_t>(fileSize), '\0');
  if (fileSize > 0) {
    f.read(&contents[0], fileSize);
    if (f.gcount() != fileSize)
      throw std::string("FileToString()::Error: Unable to read file " + fileName);
  }

  return contents;
}
|
||||
//---------------------------------------
|
||||
// Read command line parameters
|
||||
//
|
||||
void _clCmdParams(int argc, char *argv[]) {
|
||||
for (int i = 0; i < argc; ++i) {
|
||||
switch (argv[i][1]) {
|
||||
case 'g': //--g stands for size of work group
|
||||
if (++i < argc) {
|
||||
sscanf(argv[i], "%u", &work_group_size);
|
||||
} else {
|
||||
std::cerr << "Could not read argument after option " << argv[i - 1]
|
||||
<< std::endl;
|
||||
throw;
|
||||
}
|
||||
break;
|
||||
case 'd': //--d stands for device id used in computaion
|
||||
if (++i < argc) {
|
||||
sscanf(argv[i], "%u", &device_id_inused);
|
||||
} else {
|
||||
std::cerr << "Could not read argument after option " << argv[i - 1]
|
||||
<< std::endl;
|
||||
throw;
|
||||
}
|
||||
break;
|
||||
default:;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------
|
||||
// Initlize CL objects
|
||||
//--description: there are 5 steps to initialize all the OpenCL objects needed
|
||||
//--revised on 04/01/2011: get the number of devices and
|
||||
// devices have no relationship with context
|
||||
void _clInit() {
|
||||
printf("_clInit()\n");
|
||||
|
||||
int DEVICE_ID_INUSED = device_id_inused;
|
||||
cl_int resultCL;
|
||||
|
||||
oclHandles.context = NULL;
|
||||
oclHandles.devices = NULL;
|
||||
oclHandles.queue = NULL;
|
||||
oclHandles.program = NULL;
|
||||
|
||||
cl_uint deviceListSize;
|
||||
|
||||
//-----------------------------------------------
|
||||
//--cambine-1: find the available platforms and select one
|
||||
|
||||
cl_uint numPlatforms = 1;
|
||||
cl_platform_id targetPlatform = NULL;
|
||||
|
||||
cl_platform_id *allPlatforms =
|
||||
(cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
|
||||
|
||||
resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL);
|
||||
if (resultCL != CL_SUCCESS)
|
||||
throw(string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
|
||||
|
||||
// Select the target platform. Default: first platform
|
||||
targetPlatform = allPlatforms[0];
|
||||
|
||||
/*for (int i = 0; i < numPlatforms; i++)
|
||||
{
|
||||
char pbuff[128];
|
||||
resultCL = clGetPlatformInfo( allPlatforms[i],
|
||||
CL_PLATFORM_VENDOR,
|
||||
sizeof(pbuff),
|
||||
pbuff,
|
||||
NULL);
|
||||
if (resultCL != CL_SUCCESS)
|
||||
throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)"));
|
||||
|
||||
//printf("vedor is %s\n",pbuff);
|
||||
|
||||
}
|
||||
free(allPlatforms);*/
|
||||
|
||||
//-----------------------------------------------
|
||||
//--cambine-2: create an OpenCL context
|
||||
/*cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)targetPlatform, 0 };
|
||||
oclHandles.context = clCreateContextFromType(cprops,
|
||||
CL_DEVICE_TYPE_GPU,
|
||||
NULL,
|
||||
NULL,
|
||||
&resultCL);
|
||||
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
|
||||
throw (string("InitCL()::Error: Creating Context
|
||||
(clCreateContextFromType)"));
|
||||
|
||||
//-----------------------------------------------
|
||||
//--cambine-3: detect OpenCL devices
|
||||
// First, get the size of device list
|
||||
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, 0,
|
||||
NULL, &deviceListSize);
|
||||
if(oclHandles.cl_status!=CL_SUCCESS){
|
||||
throw(string("exception in _clInit -> clGetDeviceIDs"));
|
||||
}
|
||||
if (deviceListSize == 0)
|
||||
throw(string("InitCL()::Error: No devices found."));
|
||||
|
||||
printf("OK1()\n");
|
||||
|
||||
//std::cout<<"device number:"<<deviceListSize<<std::endl;*/
|
||||
|
||||
// Now, allocate the device list
|
||||
deviceListSize = 1;
|
||||
oclHandles.devices =
|
||||
(cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
|
||||
if (oclHandles.devices == 0)
|
||||
throw(string("InitCL()::Error: Could not allocate memory."));
|
||||
|
||||
//* Next, get the device list data
|
||||
oclHandles.cl_status =
|
||||
clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_DEFAULT, deviceListSize,
|
||||
oclHandles.devices, NULL);
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(string("exception in _clInit -> clGetDeviceIDs-2"));
|
||||
}
|
||||
|
||||
oclHandles.context = clCreateContext(NULL, deviceListSize, oclHandles.devices,
|
||||
NULL, NULL, &resultCL);
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
|
||||
throw(string("InitCL()::Error: Creating Context (clCreateContext)"));
|
||||
|
||||
//-----------------------------------------------
|
||||
//--cambine-4: Create an OpenCL command queue
|
||||
oclHandles.queue = clCreateCommandQueue(
|
||||
oclHandles.context, oclHandles.devices[DEVICE_ID_INUSED], 0, &resultCL);
|
||||
//printf("resultCL=%d, queue=0x%x\n", resultCL, oclHandles.queue);
|
||||
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
|
||||
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
|
||||
//-----------------------------------------------
|
||||
//--cambine-5: Load CL file, build CL program object, create CL kernel object
|
||||
/*std::string source_str = FileToString(kernel_file);
|
||||
const char * source = source_str.c_str();
|
||||
size_t sourceSize[] = { source_str.length() };*/
|
||||
|
||||
//oclHandles.program = clCreateProgramWithBuiltInKernels(
|
||||
// oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED],
|
||||
// "BFS_1;BFS_2", &resultCL);
|
||||
/*oclHandles.program = clCreateProgramWithSource(oclHandles.context,
|
||||
1,
|
||||
&source,
|
||||
sourceSize,
|
||||
&resultCL);*/
|
||||
// read kernel binary from file
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
std::abort();
|
||||
|
||||
oclHandles.program = clCreateProgramWithBinary(
|
||||
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &resultCL);
|
||||
free(kernel_bin);
|
||||
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
|
||||
throw(string("InitCL()::Error: Loading Binary into cl_program. "
|
||||
"(clCreateProgramWithBinary)"));
|
||||
|
||||
// insert debug information
|
||||
// std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines
|
||||
// options += " -cl-nv-opt-level=3";
|
||||
resultCL = clBuildProgram(oclHandles.program, deviceListSize,
|
||||
oclHandles.devices, NULL, NULL, NULL);
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) {
|
||||
cerr << "InitCL()::Error: In clBuildProgram" << endl;
|
||||
|
||||
size_t length;
|
||||
resultCL = clGetProgramBuildInfo(oclHandles.program,
|
||||
oclHandles.devices[DEVICE_ID_INUSED],
|
||||
CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
|
||||
if (resultCL != CL_SUCCESS)
|
||||
throw(string("InitCL()::Error: Getting Program build "
|
||||
"info(clGetProgramBuildInfo)"));
|
||||
|
||||
char *buffer = (char *)malloc(length);
|
||||
resultCL = clGetProgramBuildInfo(
|
||||
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
|
||||
CL_PROGRAM_BUILD_LOG, length, buffer, NULL);
|
||||
if (resultCL != CL_SUCCESS)
|
||||
throw(string("InitCL()::Error: Getting Program build "
|
||||
"info(clGetProgramBuildInfo)"));
|
||||
|
||||
cerr << buffer << endl;
|
||||
free(buffer);
|
||||
|
||||
throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
|
||||
}
|
||||
|
||||
// get program information in intermediate representation
|
||||
#ifdef PTX_MSG
|
||||
size_t binary_sizes[deviceListSize];
|
||||
char *binaries[deviceListSize];
|
||||
// figure out number of devices and the sizes of the binary for each device.
|
||||
oclHandles.cl_status =
|
||||
clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES,
|
||||
sizeof(size_t) * deviceListSize, &binary_sizes, NULL);
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
|
||||
}
|
||||
|
||||
std::cout << "--cambine:" << binary_sizes << std::endl;
|
||||
// copy over all of the generated binaries.
|
||||
for (int i = 0; i < deviceListSize; i++)
|
||||
binaries[i] = (char *)malloc(sizeof(char) * (binary_sizes[i] + 1));
|
||||
oclHandles.cl_status =
|
||||
clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES,
|
||||
sizeof(char *) * deviceListSize, binaries, NULL);
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
|
||||
}
|
||||
for (int i = 0; i < deviceListSize; i++)
|
||||
binaries[i][binary_sizes[i]] = '\0';
|
||||
std::cout << "--cambine:writing ptd information..." << std::endl;
|
||||
FILE *ptx_file = fopen("cl.ptx", "w");
|
||||
if (ptx_file == NULL) {
|
||||
throw(string("exceptions in allocate ptx file."));
|
||||
}
|
||||
fprintf(ptx_file, "%s", binaries[DEVICE_ID_INUSED]);
|
||||
fclose(ptx_file);
|
||||
std::cout << "--cambine:writing ptd information done." << std::endl;
|
||||
for (int i = 0; i < deviceListSize; i++)
|
||||
free(binaries[i]);
|
||||
#endif
|
||||
|
||||
for (int nKernel = 0; nKernel < total_kernels; nKernel++) {
|
||||
/* get a kernel object handle for a kernel with the given name */
|
||||
cl_kernel kernel = clCreateKernel(
|
||||
oclHandles.program, (kernel_names[nKernel]).c_str(), &resultCL);
|
||||
|
||||
if ((resultCL != CL_SUCCESS) || (kernel == NULL)) {
|
||||
string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" +
|
||||
kernel_names[nKernel] + "\"";
|
||||
throw(errorMsg);
|
||||
}
|
||||
|
||||
oclHandles.kernel.push_back(kernel);
|
||||
}
|
||||
// get resource alocation information
|
||||
#ifdef RES_MSG
|
||||
char *build_log;
|
||||
size_t ret_val_size;
|
||||
oclHandles.cl_status = clGetProgramBuildInfo(
|
||||
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
|
||||
CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(string("exceptions in _InitCL -> getting resource information"));
|
||||
}
|
||||
|
||||
build_log = (char *)malloc(ret_val_size + 1);
|
||||
oclHandles.cl_status = clGetProgramBuildInfo(
|
||||
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
|
||||
CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(string(
|
||||
"exceptions in _InitCL -> getting resources allocation information-2"));
|
||||
}
|
||||
build_log[ret_val_size] = '\0';
|
||||
std::cout << "--cambine:" << build_log << std::endl;
|
||||
free(build_log);
|
||||
#endif
|
||||
}
|
||||
|
||||
//---------------------------------------
|
||||
// release CL objects
|
||||
void _clRelease() {
|
||||
char errorFlag = false;
|
||||
|
||||
for (int nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++) {
|
||||
if (oclHandles.kernel[nKernel] != NULL) {
|
||||
cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
cerr << "ReleaseCL()::Error: In clReleaseKernel" << endl;
|
||||
errorFlag = true;
|
||||
}
|
||||
oclHandles.kernel[nKernel] = NULL;
|
||||
printf("clReleaseKernel()\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (oclHandles.program != NULL) {
|
||||
cl_int resultCL = clReleaseProgram(oclHandles.program);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
cerr << "ReleaseCL()::Error: In clReleaseProgram" << endl;
|
||||
errorFlag = true;
|
||||
}
|
||||
oclHandles.program = NULL;
|
||||
printf("clReleaseProgram()\n");
|
||||
}
|
||||
|
||||
if (oclHandles.queue != NULL) {
|
||||
cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
cerr << "ReleaseCL()::Error: In clReleaseCommandQueue" << endl;
|
||||
errorFlag = true;
|
||||
}
|
||||
oclHandles.queue = NULL;
|
||||
printf("clReleaseCommandQueue()\n");
|
||||
}
|
||||
|
||||
if (oclHandles.context != NULL) {
|
||||
cl_int resultCL = clReleaseContext(oclHandles.context);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
cerr << "ReleaseCL()::Error: In clReleaseContext" << endl;
|
||||
errorFlag = true;
|
||||
}
|
||||
oclHandles.context = NULL;
|
||||
printf("clReleaseContext()\n");
|
||||
}
|
||||
|
||||
if (oclHandles.devices != NULL) {
|
||||
cl_int resultCL = clReleaseDevice(oclHandles.devices[0]);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
cerr << "ReleaseCL()::Error: In clReleaseDevice" << endl;
|
||||
errorFlag = true;
|
||||
}
|
||||
free(oclHandles.devices);
|
||||
printf("clReleaseDevice()\n");
|
||||
}
|
||||
|
||||
if (errorFlag)
|
||||
throw(string("ReleaseCL()::Error encountered."));
|
||||
}
|
||||
//--------------------------------------------------------
|
||||
//--cambine:create buffer and then copy data from host to device
|
||||
cl_mem _clCreateAndCpyMem(int size, void *h_mem_source) throw(string) {
|
||||
cl_mem d_mem;
|
||||
d_mem = clCreateBuffer(oclHandles.context,
|
||||
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size,
|
||||
h_mem_source, &oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateAndCpyMem()"));
|
||||
#endif
|
||||
return d_mem;
|
||||
}
|
||||
//-------------------------------------------------------
|
||||
//--cambine: create read only buffer for devices
|
||||
//--date: 17/01/2011
|
||||
cl_mem _clMallocRW(int size, void *h_mem_ptr) throw(string) {
|
||||
cl_mem d_mem;
|
||||
d_mem = clCreateBuffer(oclHandles.context,
|
||||
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size,
|
||||
h_mem_ptr, &oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clMallocRW"));
|
||||
#endif
|
||||
return d_mem;
|
||||
}
|
||||
//-------------------------------------------------------
|
||||
//--cambine: create read and write buffer for devices
|
||||
//--date: 17/01/2011
|
||||
cl_mem _clMalloc(int size, void *h_mem_ptr) throw(string) {
|
||||
cl_mem d_mem;
|
||||
d_mem = clCreateBuffer(oclHandles.context,
|
||||
CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, size,
|
||||
h_mem_ptr, &oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clMalloc"));
|
||||
#endif
|
||||
return d_mem;
|
||||
}
|
||||
|
||||
//-------------------------------------------------------
|
||||
//--cambine: transfer data from host to device
|
||||
//--date: 17/01/2011
|
||||
void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) throw(string) {
|
||||
oclHandles.cl_status = clEnqueueWriteBuffer(
|
||||
oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_ptr, 0, NULL, NULL);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clMemcpyH2D"));
|
||||
#endif
|
||||
}
|
||||
//--------------------------------------------------------
|
||||
//--cambine:create buffer and then copy data from host to device with pinned
|
||||
// memory
|
||||
cl_mem _clCreateAndCpyPinnedMem(int size, float *h_mem_source) throw(string) {
|
||||
cl_mem d_mem, d_mem_pinned;
|
||||
float *h_mem_pinned = NULL;
|
||||
d_mem_pinned = clCreateBuffer(oclHandles.context,
|
||||
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size,
|
||||
NULL, &oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
|
||||
#endif
|
||||
//------------
|
||||
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, size, NULL,
|
||||
&oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
|
||||
#endif
|
||||
//----------
|
||||
h_mem_pinned = (cl_float *)clEnqueueMapBuffer(
|
||||
oclHandles.queue, d_mem_pinned, CL_TRUE, CL_MAP_WRITE, 0, size, 0, NULL,
|
||||
NULL, &oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
|
||||
#endif
|
||||
int element_number = size / sizeof(float);
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < element_number; i++) {
|
||||
h_mem_pinned[i] = h_mem_source[i];
|
||||
}
|
||||
//----------
|
||||
oclHandles.cl_status = clEnqueueWriteBuffer(
|
||||
oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_pinned, 0, NULL, NULL);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
|
||||
#endif
|
||||
|
||||
return d_mem;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------
|
||||
//--cambine:create write only buffer on device
|
||||
cl_mem _clMallocWO(int size) throw(string) {
|
||||
cl_mem d_mem;
|
||||
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0,
|
||||
&oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateMem()"));
|
||||
#endif
|
||||
return d_mem;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------
|
||||
// transfer data from device to host
|
||||
void _clMemcpyD2H(cl_mem d_mem, int size, void *h_mem) throw(string) {
|
||||
oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE,
|
||||
0, size, h_mem, 0, 0, 0);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_COMMAND_QUEUE:
|
||||
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||
break;
|
||||
case CL_INVALID_CONTEXT:
|
||||
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||
break;
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||
break;
|
||||
case CL_INVALID_VALUE:
|
||||
oclHandles.error_str += "CL_INVALID_VALUE";
|
||||
break;
|
||||
case CL_INVALID_EVENT_WAIT_LIST:
|
||||
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||
break;
|
||||
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unknown reason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
}
|
||||
|
||||
//--------------------------------------------------------
|
||||
// set kernel arguments
|
||||
void _clSetArgs(int kernel_id, int arg_idx, void *d_mem,
|
||||
int size = 0) throw(string) {
|
||||
if (!size) {
|
||||
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx,
|
||||
sizeof(d_mem), &d_mem);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_KERNEL:
|
||||
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||
break;
|
||||
case CL_INVALID_ARG_INDEX:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
|
||||
break;
|
||||
case CL_INVALID_ARG_VALUE:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
|
||||
break;
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||
break;
|
||||
case CL_INVALID_SAMPLER:
|
||||
oclHandles.error_str += "CL_INVALID_SAMPLER";
|
||||
break;
|
||||
case CL_INVALID_ARG_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
|
||||
break;
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unknown reason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
} else {
|
||||
oclHandles.cl_status =
|
||||
clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_KERNEL:
|
||||
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||
break;
|
||||
case CL_INVALID_ARG_INDEX:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
|
||||
break;
|
||||
case CL_INVALID_ARG_VALUE:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
|
||||
break;
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||
break;
|
||||
case CL_INVALID_SAMPLER:
|
||||
oclHandles.error_str += "CL_INVALID_SAMPLER";
|
||||
break;
|
||||
case CL_INVALID_ARG_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
|
||||
break;
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unknown reason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
// Call clFinish on oclHandles.queue and record the status in
// oclHandles.cl_status. When ERRMSG is defined the status is decoded into
// oclHandles.error_str, which is thrown for anything other than CL_SUCCESS.
void _clFinish() throw(string) {
  oclHandles.cl_status = clFinish(oclHandles.queue);
#ifdef ERRMSG
  const char *reason;
  switch (oclHandles.cl_status) {
  case CL_INVALID_COMMAND_QUEUE:
    reason = "CL_INVALID_COMMAND_QUEUE";
    break;
  case CL_OUT_OF_RESOURCES:
    reason = "CL_OUT_OF_RESOURCES";
    break;
  case CL_OUT_OF_HOST_MEMORY:
    reason = "CL_OUT_OF_HOST_MEMORY";
    break;
  default:
    reason = "Unknown reasons";
    break;
  }
  oclHandles.error_str = "excpetion in _clFinish";
  oclHandles.error_str += reason;
  if (oclHandles.cl_status == CL_SUCCESS) {
    return;
  }
  throw(oclHandles.error_str);
#endif
}
|
||||
//--------------------------------------------------------
|
||||
//--cambine:enqueue kernel
|
||||
void _clInvokeKernel(int kernel_id, int work_items,
|
||||
int work_group_size) throw(string) {
|
||||
cl_uint work_dim = WORK_DIM;
|
||||
//cl_event e[1];
|
||||
if (work_items % work_group_size != 0) // process situations that work_items
|
||||
// cannot be divided by work_group_size
|
||||
work_items =
|
||||
work_items + (work_group_size - (work_items % work_group_size));
|
||||
size_t local_work_size[] = {work_group_size, 1};
|
||||
size_t global_work_size[] = {work_items, 1};
|
||||
oclHandles.cl_status = clEnqueueNDRangeKernel(
|
||||
oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
|
||||
global_work_size, local_work_size, 0, 0, NULL);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_PROGRAM_EXECUTABLE:
|
||||
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
|
||||
break;
|
||||
case CL_INVALID_COMMAND_QUEUE:
|
||||
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||
break;
|
||||
case CL_INVALID_KERNEL:
|
||||
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||
break;
|
||||
case CL_INVALID_CONTEXT:
|
||||
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||
break;
|
||||
case CL_INVALID_KERNEL_ARGS:
|
||||
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
|
||||
break;
|
||||
case CL_INVALID_WORK_DIMENSION:
|
||||
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
|
||||
break;
|
||||
case CL_INVALID_GLOBAL_WORK_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
|
||||
break;
|
||||
case CL_INVALID_WORK_GROUP_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
|
||||
break;
|
||||
case CL_INVALID_WORK_ITEM_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
|
||||
break;
|
||||
case CL_INVALID_GLOBAL_OFFSET:
|
||||
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
|
||||
break;
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||
break;
|
||||
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||
break;
|
||||
case CL_INVALID_EVENT_WAIT_LIST:
|
||||
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unkown reseason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
//_clFinish();
|
||||
// oclHandles.cl_status = clWaitForEvents(1, &e[0]);
|
||||
// #ifdef ERRMSG
|
||||
// if (oclHandles.cl_status!= CL_SUCCESS)
|
||||
// throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
|
||||
// #endif
|
||||
}
|
||||
// Launch kernel `kernel_id` over a two-dimensional range.
//   range_x, range_y - global work size per dimension (passed through as-is,
//                      no rounding to the group size is performed).
//   group_x, group_y - local work-group size per dimension.
// The enqueue status is stored in oclHandles.cl_status; with ERRMSG defined a
// status other than CL_SUCCESS is decoded and thrown as a string.
// NOTE(review): both size arrays hold two entries while WORK_DIM (defined
// outside this function) is passed as the dimension count - confirm that
// WORK_DIM never exceeds 2.
void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x,
                       int group_y) throw(string) {
  cl_uint work_dim = WORK_DIM;

  // Explicit casts: int -> size_t inside a braced initializer is a narrowing
  // conversion, which C++11 rejects.
  size_t local_work_size[] = {static_cast<size_t>(group_x),
                              static_cast<size_t>(group_y)};
  size_t global_work_size[] = {static_cast<size_t>(range_x),
                               static_cast<size_t>(range_y)};

  oclHandles.cl_status = clEnqueueNDRangeKernel(
      oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
      global_work_size, local_work_size, 0, 0, NULL);
#ifdef ERRMSG
  // The prefix names this function so that 1D and 2D launch failures can be
  // told apart.
  oclHandles.error_str = "excpetion in _clInvokeKernel2D() -> ";
  switch (oclHandles.cl_status) {
  case CL_INVALID_PROGRAM_EXECUTABLE:
    oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
    break;
  case CL_INVALID_COMMAND_QUEUE:
    oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
    break;
  case CL_INVALID_KERNEL:
    oclHandles.error_str += "CL_INVALID_KERNEL";
    break;
  case CL_INVALID_CONTEXT:
    oclHandles.error_str += "CL_INVALID_CONTEXT";
    break;
  case CL_INVALID_KERNEL_ARGS:
    oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
    break;
  case CL_INVALID_WORK_DIMENSION:
    oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
    break;
  case CL_INVALID_GLOBAL_WORK_SIZE:
    oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
    break;
  case CL_INVALID_WORK_GROUP_SIZE:
    oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
    break;
  case CL_INVALID_WORK_ITEM_SIZE:
    oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
    break;
  case CL_INVALID_GLOBAL_OFFSET:
    oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
    break;
  case CL_OUT_OF_RESOURCES:
    oclHandles.error_str += "CL_OUT_OF_RESOURCES";
    break;
  case CL_MEM_OBJECT_ALLOCATION_FAILURE:
    oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
    break;
  case CL_INVALID_EVENT_WAIT_LIST:
    oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
    break;
  case CL_OUT_OF_HOST_MEMORY:
    oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
    break;
  default:
    oclHandles.error_str += "Unkown reseason";
    break;
  }
  if (oclHandles.cl_status != CL_SUCCESS)
    throw(oclHandles.error_str);
#endif
  // The launch is not waited on here; callers that need completion must
  // synchronize themselves.
}
|
||||
|
||||
//--------------------------------------------------------
|
||||
// release OpenCL objects
|
||||
void _clFree(cl_mem ob) throw(string) {
|
||||
if (ob != NULL)
|
||||
oclHandles.cl_status = clReleaseMemObject(ob);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clFree() ->";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||
break;
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unkown reseason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
}
|
||||
#endif //_CL_HELPER_
|
||||
59
tests/opencl/bfs/Makefile
Normal file
59
tests/opencl/bfs/Makefile
Normal file
@@ -0,0 +1,59 @@
|
||||
# Build and run the BFS OpenCL test: a host executable ($(PROJECT)) plus the
# kernel binary (kernel.pocl) produced by poclcc. Every path below can be
# overridden from the command line or the environment.
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime

VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)

# Flag groups handed to poclcc; each is quoted so that it reaches poclcc as a
# single argument.
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -float-abi=hard -code-model=small"
K_CFLAGS   += "-v -O3 -Wstack-usage=1024 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS  += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"

CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
#CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors

CXXFLAGS += -I$(POCL_RT_PATH)/include

LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex

PROJECT = bfs

SRCS = main.cc

# None of these names is a file produced by the build.
.PHONY: all run-fpga run-asesim run-vlsim run-simx run-rtlsim clean clean-all

all: $(PROJECT) kernel.pocl

kernel.pocl: kernel.cl
	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

# The run-* targets differ only in which driver directory is put on the
# library search path.
run-fpga: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-asesim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-vlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-simx: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-rtlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.pocl *.dump

# Skip the dependency file whenever a cleaning goal is requested, so that
# "make clean-all" does not regenerate .depend just to delete it again.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif
|
||||
0
tests/opencl/bfs/README
Normal file
0
tests/opencl/bfs/README
Normal file
28677
tests/opencl/bfs/graph4096.txt
Executable file
28677
tests/opencl/bfs/graph4096.txt
Executable file
File diff suppressed because it is too large
Load Diff
53
tests/opencl/bfs/kernel.cl
Executable file
53
tests/opencl/bfs/kernel.cl
Executable file
@@ -0,0 +1,53 @@
|
||||
/* ============================================================
|
||||
//--cambine: kernel function of Breadth-First-Search
|
||||
//--author: created by Jianbin Fang
|
||||
//--date: 06/12/2010
|
||||
============================================================ */
|
||||
|
||||
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
|
||||
|
||||
// One graph vertex: `starting` is the index of its first entry in the edge
// array and `no_of_edges` is how many consecutive entries belong to it.
typedef struct{
	int starting;
	int no_of_edges;
} Node;

//--7 parameters
// Expansion step. A work-item whose vertex is flagged in g_graph_mask clears
// that flag, then for each neighbour not yet marked in g_graph_visited writes
// cost = own cost + 1 and flags it in g_updating_graph_mask.
__kernel void BFS_1( const __global Node* g_graph_nodes,
					const __global int* g_graph_edges,
					__global char* g_graph_mask,
					__global char* g_updating_graph_mask,
					__global char* g_graph_visited,
					__global int* g_cost,
					const int no_of_nodes){
	const int tid = get_global_id(0);
	// Work-items past the last vertex, or whose vertex is not flagged, are idle.
	if( tid>=no_of_nodes || !g_graph_mask[tid] )
		return;

	g_graph_mask[tid]=false;
	const int first_edge = g_graph_nodes[tid].starting;
	const int edge_end = g_graph_nodes[tid].no_of_edges + first_edge;
	for(int e=first_edge; e<edge_end; e++){
		const int id = g_graph_edges[e];
		if(g_graph_visited[id])
			continue;
		g_cost[id]=g_cost[tid]+1;
		g_updating_graph_mask[id]=true;
	}
}
|
||||
|
||||
//--5 parameters
// Commit step. A work-item whose vertex is flagged in g_updating_graph_mask
// moves it into g_graph_mask, marks it visited, sets *g_over, and clears the
// updating flag.
__kernel void BFS_2(__global char* g_graph_mask,
					__global char* g_updating_graph_mask,
					__global char* g_graph_visited,
					__global char* g_over,
					const int no_of_nodes
					) {
	const int tid = get_global_id(0);
	// Work-items past the last vertex, or without a pending update, are idle.
	if( tid>=no_of_nodes || !g_updating_graph_mask[tid] )
		return;

	g_graph_mask[tid]=true;
	g_graph_visited[tid]=true;
	*g_over=true;
	g_updating_graph_mask[tid]=false;
}
|
||||
|
||||
|
||||
BIN
tests/opencl/bfs/kernel.pocl
Normal file
BIN
tests/opencl/bfs/kernel.pocl
Normal file
Binary file not shown.
297
tests/opencl/bfs/main.cc
Executable file
297
tests/opencl/bfs/main.cc
Executable file
@@ -0,0 +1,297 @@
|
||||
//--by Jianbin Fang
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#ifdef PROFILING
|
||||
#include "timer.h"
|
||||
#endif
|
||||
|
||||
#include "CLHelper.h"
|
||||
#include "util.h"
|
||||
|
||||
#define MAX_THREADS_PER_BLOCK 256
|
||||
|
||||
// One graph vertex: `starting` is the index of its first entry in the edge
// array and `no_of_edges` is how many consecutive entries belong to it.
struct Node {
  int starting;
  int no_of_edges;
};

//----------------------------------------------------------
//--bfs on cpu
//--programmer:	jianbin
//--date:	26/01/2011
//----------------------------------------------------------
// Reference breadth-first search on the host, mirroring the two device
// kernels: an expansion pass over every flagged vertex followed by a commit
// pass, repeated until a commit pass changes nothing.
//   no_of_nodes           - number of vertices.
//   h_graph_nodes         - per-vertex edge ranges into h_graph_edges.
//   edge_list_size        - length of h_graph_edges (not needed by the loop).
//   h_graph_edges         - destination vertex of every edge.
//   h_graph_mask          - in/out: vertices to expand in the current pass.
//   h_updating_graph_mask - scratch: vertices discovered in the current pass.
//   h_graph_visited       - in/out: vertices already reached.
//   h_cost_ref            - in/out: cost per vertex; a newly reached vertex
//                           gets the cost of the vertex it was reached from + 1.
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
                 int *h_graph_edges, char *h_graph_mask,
                 char *h_updating_graph_mask, char *h_graph_visited,
                 int *h_cost_ref) {
  (void)edge_list_size;

  char frontier_changed;
  do {
    // Stays false unless the commit pass below promotes at least one vertex.
    frontier_changed = false;

    // Expansion pass. Flags are tested for truthiness, the same way the
    // device kernels test them.
    for (int tid = 0; tid < no_of_nodes; tid++) {
      if (!h_graph_mask[tid])
        continue;
      h_graph_mask[tid] = false;
      const int first_edge = h_graph_nodes[tid].starting;
      const int edge_end = h_graph_nodes[tid].no_of_edges + first_edge;
      for (int i = first_edge; i < edge_end; i++) {
        const int id = h_graph_edges[i]; // vertex connected to vertex tid
        if (!h_graph_visited[id]) {
          h_cost_ref[id] = h_cost_ref[tid] + 1;
          h_updating_graph_mask[id] = true;
        }
      }
    }

    // Commit pass: newly discovered vertices become the next frontier.
    for (int tid = 0; tid < no_of_nodes; tid++) {
      if (!h_updating_graph_mask[tid])
        continue;
      h_graph_mask[tid] = true;
      h_graph_visited[tid] = true;
      frontier_changed = true;
      h_updating_graph_mask[tid] = false;
    }
  } while (frontier_changed);
}
|
||||
//----------------------------------------------------------
|
||||
//--breadth first search on GPUs
|
||||
//----------------------------------------------------------
|
||||
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
|
||||
int *h_graph_edges, char *h_graph_mask,
|
||||
char *h_updating_graph_mask, char *h_graph_visited,
|
||||
int *h_cost) throw(std::string) {
|
||||
|
||||
// int number_elements = height*width;
|
||||
char h_over;
|
||||
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask,
|
||||
d_graph_visited, d_cost, d_over;
|
||||
|
||||
try {
|
||||
//--1 transfer data from host to device
|
||||
_clInit();
|
||||
|
||||
d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
|
||||
d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
|
||||
d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
|
||||
d_updating_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
|
||||
d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
|
||||
|
||||
d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
|
||||
d_over = _clMallocRW(sizeof(char), &h_over);
|
||||
|
||||
_clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
|
||||
_clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
|
||||
_clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
|
||||
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char), h_updating_graph_mask);
|
||||
_clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
|
||||
_clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
|
||||
|
||||
//--2 invoke kernel
|
||||
#ifdef PROFILING
|
||||
timer kernel_timer;
|
||||
double kernel_time = 0.0;
|
||||
kernel_timer.reset();
|
||||
kernel_timer.start();
|
||||
#endif
|
||||
|
||||
do {
|
||||
h_over = false;
|
||||
_clMemcpyH2D(d_over, sizeof(char), &h_over);
|
||||
//--kernel 0
|
||||
int kernel_id = 0;
|
||||
int kernel_idx = 0;
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_cost);
|
||||
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
|
||||
|
||||
// int work_items = no_of_nodes;
|
||||
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
|
||||
|
||||
//--kernel 1
|
||||
kernel_id = 1;
|
||||
kernel_idx = 0;
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_over);
|
||||
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
|
||||
|
||||
// work_items = no_of_nodes;
|
||||
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
|
||||
|
||||
_clMemcpyD2H(d_over, sizeof(char), &h_over);
|
||||
} while (h_over);
|
||||
|
||||
#ifdef PROFILING
|
||||
kernel_timer.stop();
|
||||
kernel_time = kernel_timer.getTimeInSeconds();
|
||||
#endif
|
||||
//--3 transfer data from device to host
|
||||
_clMemcpyD2H(d_cost, no_of_nodes * sizeof(int), h_cost);
|
||||
//--statistics
|
||||
#ifdef PROFILING
|
||||
std::cout << "kernel time(s):" << kernel_time << std::endl;
|
||||
#endif
|
||||
//--4 release cl resources.
|
||||
_clFree(d_graph_nodes);
|
||||
_clFree(d_graph_edges);
|
||||
_clFree(d_graph_mask);
|
||||
_clFree(d_updating_graph_mask);
|
||||
_clFree(d_graph_visited);
|
||||
_clFree(d_cost);
|
||||
_clFree(d_over);
|
||||
_clRelease();
|
||||
} catch (std::string msg) {
|
||||
_clFree(d_graph_nodes);
|
||||
_clFree(d_graph_edges);
|
||||
_clFree(d_graph_mask);
|
||||
_clFree(d_updating_graph_mask);
|
||||
_clFree(d_graph_visited);
|
||||
_clFree(d_cost);
|
||||
_clFree(d_over);
|
||||
_clRelease();
|
||||
std::string e_str = "in run_transpose_gpu -> ";
|
||||
e_str += msg;
|
||||
throw(e_str);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------
|
||||
//--cambine: main function
|
||||
//--author: created by Jianbin Fang
|
||||
//--date: 25/01/2011
|
||||
//----------------------------------------------------------
|
||||
int main(int argc, char *argv[]) {
|
||||
printf("enter demo main\n");
|
||||
|
||||
int no_of_nodes;
|
||||
int edge_list_size;
|
||||
FILE *fp;
|
||||
Node *h_graph_nodes;
|
||||
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
|
||||
|
||||
try {
|
||||
char *input_f = "graph4096.txt";
|
||||
printf("Reading File\n");
|
||||
// Read in Graph from a file
|
||||
fp = fopen(input_f, "r");
|
||||
if (!fp) {
|
||||
printf("Error Reading graph file\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("Reading File completed!\n");
|
||||
|
||||
int source = 0;
|
||||
|
||||
fscanf(fp, "%d", &no_of_nodes);
|
||||
|
||||
int num_of_blocks = 1;
|
||||
int num_of_threads_per_block = no_of_nodes;
|
||||
|
||||
// Make execution Parameters according to the number of nodes
|
||||
// Distribute threads across multiple Blocks if necessary
|
||||
if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
|
||||
num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
|
||||
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
|
||||
}
|
||||
work_group_size = num_of_threads_per_block;
|
||||
// allocate host memory
|
||||
h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
|
||||
h_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
|
||||
h_updating_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
|
||||
h_graph_visited = (char *)malloc(sizeof(char) * no_of_nodes);
|
||||
|
||||
int start, edgeno;
|
||||
// initalize the memory
|
||||
for (int i = 0; i < no_of_nodes; i++) {
|
||||
fscanf(fp, "%d %d", &start, &edgeno);
|
||||
h_graph_nodes[i].starting = start;
|
||||
h_graph_nodes[i].no_of_edges = edgeno;
|
||||
h_graph_mask[i] = false;
|
||||
h_updating_graph_mask[i] = false;
|
||||
h_graph_visited[i] = false;
|
||||
}
|
||||
// read the source node from the file
|
||||
fscanf(fp, "%d", &source);
|
||||
source = 0;
|
||||
// set the source node as true in the mask
|
||||
h_graph_mask[source] = true;
|
||||
h_graph_visited[source] = true;
|
||||
fscanf(fp, "%d", &edge_list_size);
|
||||
int id, cost;
|
||||
int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
|
||||
for (int i = 0; i < edge_list_size; i++) {
|
||||
fscanf(fp, "%d", &id);
|
||||
fscanf(fp, "%d", &cost);
|
||||
h_graph_edges[i] = id;
|
||||
}
|
||||
|
||||
if (fp)
|
||||
fclose(fp);
|
||||
// allocate mem for the result on host side
|
||||
int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
|
||||
int *h_cost_ref = (int *)malloc(sizeof(int) * no_of_nodes);
|
||||
for (int i = 0; i < no_of_nodes; i++) {
|
||||
h_cost[i] = -1;
|
||||
h_cost_ref[i] = -1;
|
||||
}
|
||||
h_cost[source] = 0;
|
||||
h_cost_ref[source] = 0;
|
||||
//---------------------------------------------------------
|
||||
//--gpu entry
|
||||
run_bfs_gpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
|
||||
h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
|
||||
//---------------------------------------------------------
|
||||
//--cpu entry
|
||||
// initalize the memory again
|
||||
for (int i = 0; i < no_of_nodes; i++) {
|
||||
h_graph_mask[i] = false;
|
||||
h_updating_graph_mask[i] = false;
|
||||
h_graph_visited[i] = false;
|
||||
}
|
||||
// set the source node as true in the mask
|
||||
source = 0;
|
||||
h_graph_mask[source] = true;
|
||||
h_graph_visited[source] = true;
|
||||
run_bfs_cpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
|
||||
h_graph_mask, h_updating_graph_mask, h_graph_visited,
|
||||
h_cost_ref);
|
||||
//---------------------------------------------------------
|
||||
//--result varification
|
||||
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
|
||||
// release host memory
|
||||
free(h_graph_nodes);
|
||||
free(h_graph_mask);
|
||||
free(h_updating_graph_mask);
|
||||
free(h_graph_visited);
|
||||
|
||||
} catch (std::string msg) {
|
||||
std::cout << "--cambine: exception in main ->" << msg << std::endl;
|
||||
// release host memory
|
||||
free(h_graph_nodes);
|
||||
free(h_graph_mask);
|
||||
free(h_updating_graph_mask);
|
||||
free(h_graph_visited);
|
||||
}
|
||||
printf("Passed!\n");
|
||||
return 0;
|
||||
}
|
||||
1
tests/opencl/bfs/run
Executable file
1
tests/opencl/bfs/run
Executable file
@@ -0,0 +1 @@
|
||||
./bfs ../../data/bfs/graph1MW_6.txt
|
||||
78
tests/opencl/bfs/timer.cc
Executable file
78
tests/opencl/bfs/timer.cc
Executable file
@@ -0,0 +1,78 @@
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
|
||||
#include "timer.h"
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
|
||||
|
||||
|
||||
double timer::get_CPU_speed_in_MHz()
|
||||
{
|
||||
#if defined __linux__
|
||||
ifstream infile("/proc/cpuinfo");
|
||||
char buffer[256], *colon;
|
||||
|
||||
while (infile.good()) {
|
||||
infile.getline(buffer, 256);
|
||||
|
||||
if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
|
||||
return atof(colon + 2);
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
|
||||
// Writes "<which> = <value><unit>" to `str`. `time` is scaled by
// 1000 / CPU_speed_in_MHz and then divided by 1000 per unit step while it is
// at least 999.5 and a larger unit remains in the table.
void timer::print_time(ostream &str, const char *which, double time) const
{
    static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 };
    int unit_index = 0;

    double scaled = 1000.0 * time / CPU_speed_in_MHz;

    // The table ends with a null entry, so the last real unit is never left.
    while (scaled >= 999.5 && units[unit_index + 1] != 0) {
	scaled /= 1000.0;
	++unit_index;
    }

    str << which << " = " << setprecision(3) << setw(4) << scaled << units[unit_index];
}
|
||||
|
||||
|
||||
// Writes one report line for this timer to `str` and returns `str`: the name
// (or "timer" when none was given), followed by either a notice that the CPU
// speed is unknown, a notice that the timer was never used, or the average
// time, total time and start/stop count.
ostream &timer::print(ostream &str)
{
    str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right;

    if (CPU_speed_in_MHz == 0) {
	str << "could not determine CPU speed\n";
	return str;
    }

    // count is unsigned, so "not greater than zero" means exactly zero
    if (count == 0) {
	str << "not used\n";
	return str;
    }

    const double total = static_cast<double>(total_time);
    const double average = total / static_cast<double>(count);

    print_time(str, "avg", average);
    print_time(str, ", total", total);
    str << ", count = " << setw(9) << count << '\n';

    return str;
}
|
||||
|
||||
|
||||
// Stream insertion for timers: forwards to timer::print and returns the
// stream that call returns.
ostream &operator << (ostream &str, class timer &timer)
{
    ostream &result = timer.print(str);
    return result;
}
|
||||
|
||||
double timer::getTimeInSeconds()
|
||||
{
|
||||
double total = static_cast<double>(total_time);
|
||||
double res = (total / 1000000.0) / CPU_speed_in_MHz;
|
||||
return res;
|
||||
}
|
||||
101
tests/opencl/bfs/timer.h
Executable file
101
tests/opencl/bfs/timer.h
Executable file
@@ -0,0 +1,101 @@
|
||||
#ifndef timer_h
|
||||
#define timer_h
|
||||
|
||||
#include <iostream>
|
||||
|
||||
// Accumulating stopwatch. start()/stop() pairs add to total_time, count
// records how many times stop() ran, and print() reports both.
class timer {
public:
  // `name` labels the report line; a null name is printed as "timer".
  timer(const char *name = 0);
  // As above, and additionally prints to `write_on_exit` from the destructor.
  timer(const char *name, std::ostream &write_on_exit);

  ~timer();

  void start();
  void stop();
  void reset();
  std::ostream &print(std::ostream &);

  double getTimeInSeconds();

private:
  void print_time(std::ostream &, const char *which, double time) const;

  // total_time shares storage with the two ints low/high; their declaration
  // order is swapped when __PPC__ is defined.
  union {
    long long total_time;
    struct {
#if defined __PPC__
      int high, low;
#else
      int low, high;
#endif
    };
  };

  unsigned long long count;
  const char *const name;
  std::ostream *const write_on_exit;

  static double CPU_speed_in_MHz;
  static double get_CPU_speed_in_MHz();
};
|
||||
|
||||
std::ostream &operator<<(std::ostream &, class timer &);
|
||||
|
||||
// Clears both the start/stop count and the accumulated time.
inline void timer::reset() {
  count = 0;
  total_time = 0;
}
|
||||
|
||||
// Creates a zeroed timer labelled `name` that does not print on destruction.
inline timer::timer(const char *name)
    : name(name),
      write_on_exit(0) {
  reset();
}
|
||||
|
||||
// Creates a zeroed timer labelled `name` whose destructor prints its report
// to `write_on_exit`.
inline timer::timer(const char *name, std::ostream &write_on_exit)
    : name(name),
      write_on_exit(&write_on_exit) {
  reset();
}
|
||||
|
||||
// Prints the report to the stream given at construction, if there was one.
inline timer::~timer() {
  if (write_on_exit == 0)
    return;
  print(*write_on_exit);
}
|
||||
|
||||
inline void timer::start() {
|
||||
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
|
||||
unsigned eax, edx;
|
||||
|
||||
asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
|
||||
|
||||
total_time -= ((unsigned long long)edx << 32) + eax;
|
||||
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
|
||||
(defined __i386 || defined __x86_64)
|
||||
asm volatile("rdtsc\n\t"
|
||||
"subl %%eax, %0\n\t"
|
||||
"sbbl %%edx, %1"
|
||||
: "+m"(low), "+m"(high)
|
||||
:
|
||||
: "eax", "edx");
|
||||
#else
|
||||
#error Compiler/Architecture not recognized
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void timer::stop() {
|
||||
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
|
||||
unsigned eax, edx;
|
||||
|
||||
asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
|
||||
|
||||
total_time += ((unsigned long long)edx << 32) + eax;
|
||||
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
|
||||
(defined __i386 || defined __x86_64)
|
||||
asm volatile("rdtsc\n\t"
|
||||
"addl %%eax, %0\n\t"
|
||||
"adcl %%edx, %1"
|
||||
: "+m"(low), "+m"(high)
|
||||
:
|
||||
: "eax", "edx");
|
||||
#endif
|
||||
|
||||
++count;
|
||||
}
|
||||
|
||||
#endif
|
||||
72
tests/opencl/bfs/util.h
Executable file
72
tests/opencl/bfs/util.h
Executable file
@@ -0,0 +1,72 @@
|
||||
#ifndef _C_UTIL_
|
||||
#define _C_UTIL_
|
||||
#include <math.h>
|
||||
#include <iostream>
|
||||
|
||||
//-------------------------------------------------------------------
|
||||
//--initialize array with maximum limit
|
||||
//-------------------------------------------------------------------
|
||||
// Fills A[0..n) with maxi multiplied by rand() / (RAND_MAX + 1.0f), i.e. a
// pseudo-random fraction of maxi. Nothing is written when n is not positive.
template<typename datatype>
void fill(datatype *A, const int n, const datatype maxi){
	const float scale = RAND_MAX + 1.0f;
	int j = 0;
	while (j < n)
	{
		const float fraction = rand() / scale;
		A[j] = ((datatype) maxi * fraction);
		++j;
	}
}
|
||||
|
||||
//--print matrix
// Writes the height x width matrix stored row by row in A to std::cout, one
// row per line, each element followed by a space.
template<typename datatype>
void print_matrix(datatype *A, int height, int width){
	for(int row=0; row<height; row++){
		datatype *row_begin = A + row*width;
		for(int col=0; col<width; col++)
			std::cout<<row_begin[col]<<" ";
		std::cout<<std::endl;
	}
}
|
||||
//-------------------------------------------------------------------
|
||||
//--verify results
|
||||
//-------------------------------------------------------------------
|
||||
#define MAX_RELATIVE_ERROR .002
// Compares gpuResults against cpuResults element by element and prints one
// pass/fail line to std::cout. An element fails when
// |cpu - gpu| / |cpu| exceeds MAX_RELATIVE_ERROR. The magnitude of the
// reference value is used as the denominator so that negative reference
// values are checked as well; dividing by the signed value would make the
// ratio negative and let any mismatch pass.
template<typename datatype>
void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){

	char passed = true;
	// Every iteration that writes `passed` stores the same value (false).
#pragma omp parallel for
	for (int i=0; i<size; i++){
		if (fabs(cpuResults[i] - gpuResults[i]) / fabs(cpuResults[i]) > MAX_RELATIVE_ERROR){
			passed = false;
		}
	}
	// std:: qualified so the header does not depend on a using-directive in
	// whichever file includes it.
	if (passed){
		std::cout << "--cambine:passed:-)" << std::endl;
	}
	else{
		std::cout << "--cambine: failed:-(" << std::endl;
	}
}
|
||||
// Compare two result arrays for exact element-wise equality and print a
// pass/fail line to std::cout.
//
//   cpu_results  reference values
//   gpu_results  values under test
//   size         number of elements in both arrays
template<typename datatype>
void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
    bool passed = true;

    for (int i = 0; i < size; i++){
        if (cpu_results[i] != gpu_results[i]){
            passed = false;
            break;   // one mismatch decides the verdict
        }
    }

    // std::endl is qualified so the header does not depend on the including
    // file having a using-directive for namespace std.
    if (passed){
        std::cout << "--cambine: passed: -)" << std::endl;
    }
    else{
        std::cout << "--cambine: failed :-(" << std::endl;
    }
}
|
||||
|
||||
#endif
|
||||
|
||||
1
tests/opencl/convolution/.gitignore
vendored
Normal file
1
tests/opencl/convolution/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
convolution
|
||||
59
tests/opencl/convolution/Makefile
Normal file
59
tests/opencl/convolution/Makefile
Normal file
@@ -0,0 +1,59 @@
|
||||
# Build and run the OpenCL convolution test: a host program plus a
# pocl-compiled kernel. Every path below can be overridden from the
# environment or the command line.
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime

VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)

# The quotes are part of the values: each set is handed to poclcc as a single
# shell word after -LLCFLAGS / -CFLAGS / -LDFLAGS.
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -float-abi=hard -code-model=small"
K_CFLAGS   += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS  += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"

CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
#CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors

CXXFLAGS += -I$(POCL_RT_PATH)/include

LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/simx -lOpenCL -lvortex

PROJECT = convolution

SRCS = main.cpp utils.cpp

# None of these names is a file; declaring them phony keeps a stray file
# called e.g. "clean" from disabling the target.
.PHONY: all run-fpga run-asesim run-vlsim run-simx run-rtlsim clean clean-all

all: $(PROJECT) kernel.pocl

kernel.pocl: kernel.cl
	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o $@ $<

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

# Each run target selects a Vortex driver back end through LD_LIBRARY_PATH.
run-fpga: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-asesim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-vlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-simx: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-rtlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.pocl *.dump

# Skip the dependency file for both cleaning goals; including it would make
# make regenerate .depend just before the clean recipe deletes it.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif
|
||||
25475
tests/opencl/convolution/convolution.dump
Normal file
25475
tests/opencl/convolution/convolution.dump
Normal file
File diff suppressed because it is too large
Load Diff
BIN
tests/opencl/convolution/input.bmp
Normal file
BIN
tests/opencl/convolution/input.bmp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 44 KiB |
54
tests/opencl/convolution/kernel.cl
Executable file
54
tests/opencl/convolution/kernel.cl
Executable file
@@ -0,0 +1,54 @@
|
||||
// 2D convolution of a single-channel image.
//
//   sourceImage   image to read from
//   outputImage   image receiving the filtered result
//   rows, cols    bounds used to decide whether this work-item writes
//   filter        filter taps, consumed sequentially in row-major order
//   filterWidth   width of the (square) filter
//   sampler       sampler used for every source read
//
// Work-item (x, y) accumulates the filter-weighted neighbourhood of pixel
// (x, y) and stores it in the x channel of the output pixel.
__kernel
void convolution(
    __read_only image2d_t sourceImage,
    __write_only image2d_t outputImage,
    int rows,
    int cols,
    __constant float* filter,
    int filterWidth,
    sampler_t sampler)
{
    // This work-item's pixel position.
    const int column = get_global_id(0);
    const int row = get_global_id(1);

    // Number of taps on either side of the centre tap.
    const int radius = filterWidth / 2;

    // Only the x channel carries data, so a scalar accumulator is enough;
    // the remaining output channels stay at zero.
    float acc = 0.0f;

    // Position of the next tap within the flattened filter.
    int tap = 0;

    for (int dy = -radius; dy <= radius; dy++) {
        for (int dx = -radius; dx <= radius; dx++) {
            const int2 pos = (int2)(column + dx, row + dy);
            // Image reads return a float4; the single channel is in .x
            acc += read_imagef(sourceImage, sampler, pos).x * filter[tap++];
        }
    }

    // Store the result only for work-items that map onto the image.
    if (row < rows && column < cols) {
        write_imagef(outputImage, (int2)(column, row),
                     (float4)(acc, 0.0f, 0.0f, 0.0f));
    }
}
|
||||
261
tests/opencl/convolution/main.cpp
Executable file
261
tests/opencl/convolution/main.cpp
Executable file
@@ -0,0 +1,261 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <CL/cl.h>
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
// This function takes a positive integer and rounds it up to
|
||||
// the nearest multiple of another provided integer
|
||||
// Round `value` up to the nearest multiple of `multiple`.
//
// A value that already is a multiple is returned unchanged. A `multiple` of
// zero has no meaningful answer, so `value` is returned as-is instead of
// dividing by zero.
unsigned int roundUp(unsigned int value, unsigned int multiple) {

    if(multiple == 0) {
        return value;
    }

    // How far past the previous multiple the value is
    unsigned int remainder = value % multiple;

    // Add the difference to reach the next multiple
    if(remainder != 0) {
        value += (multiple-remainder);
    }

    return value;
}
|
||||
|
||||
// This function reads in a text file and stores it as a char pointer
|
||||
// Read a whole file into a freshly allocated, NUL-terminated buffer.
//
//   kernelPath  path of the file to load
//
// Returns a malloc-family buffer holding the file contents followed by a
// terminating '\0'; the caller releases it with free(). Every failure prints
// a message and terminates the process with exit(-1).
char* readSource(const char* kernelPath) {

    FILE *fp;
    char *source;
    long int size;

    printf("Program file is: %s\n", kernelPath);

    fp = fopen(kernelPath, "rb");
    if(!fp) {
        printf("Could not open kernel file\n");
        exit(-1);
    }
    if(fseek(fp, 0, SEEK_END) != 0) {
        printf("Error seeking to end of file\n");
        exit(-1);
    }
    size = ftell(fp);
    if(size < 0) {
        printf("Error getting file position\n");
        exit(-1);
    }

    rewind(fp);

    // calloc zero-fills, so the terminator is already in place. The NULL
    // check comes before the buffer is touched.
    source = (char *)calloc((size_t)size + 1, 1);
    if(source == NULL) {
        printf("Error allocating space for the kernel source\n");
        exit(-1);
    }

    // A short read would hand a truncated program to the compiler.
    if(fread(source, 1, (size_t)size, fp) != (size_t)size) {
        printf("Error reading the kernel source\n");
        exit(-1);
    }

    fclose(fp);

    return source;
}
|
||||
|
||||
// Abort the program when an OpenCL call did not succeed.
//
//   status  result code of the call
//   cmd     name of the call, used in the diagnostic
void chk(cl_int status, const char* cmd) {

    if(status == CL_SUCCESS) {
        return;
    }

    printf("%s failed (%d)\n", cmd, status);
    exit(-1);
}
|
||||
|
||||
int main() {
|
||||
|
||||
int i, j, k, l;
|
||||
|
||||
// Rows and columns in the input image
|
||||
int imageHeight;
|
||||
int imageWidth;
|
||||
|
||||
const char* inputFile = "input.bmp";
|
||||
const char* outputFile = "output.bmp";
|
||||
|
||||
// Homegrown function to read a BMP from file
|
||||
float* inputImage = readImage(inputFile, &imageWidth,
|
||||
&imageHeight);
|
||||
|
||||
// Size of the input and output images on the host
|
||||
int dataSize = imageHeight*imageWidth*sizeof(float);
|
||||
|
||||
// Output image on the host
|
||||
float* outputImage = NULL;
|
||||
outputImage = (float*)malloc(dataSize);
|
||||
float* refImage = NULL;
|
||||
refImage = (float*)malloc(dataSize);
|
||||
|
||||
// 45 degree motion blur
|
||||
float filter[49] =
|
||||
{0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, -1, 0, 1, 0, 0,
|
||||
0, 0, -2, 0, 2, 0, 0,
|
||||
0, 0, -1, 0, 1, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
// The convolution filter is 7x7
|
||||
int filterWidth = 7;
|
||||
int filterSize = filterWidth*filterWidth; // Assume a square kernel
|
||||
|
||||
// Set up the OpenCL environment
|
||||
cl_int status;
|
||||
|
||||
// Discovery platform
|
||||
cl_platform_id platform;
|
||||
status = clGetPlatformIDs(1, &platform, NULL);
|
||||
chk(status, "clGetPlatformIDs");
|
||||
|
||||
// Discover device
|
||||
cl_device_id device;
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||
chk(status, "clGetDeviceIDs");
|
||||
|
||||
// Create context
|
||||
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)(platform), 0};
|
||||
cl_context context;
|
||||
context = clCreateContext(props, 1, &device, NULL, NULL, &status);
|
||||
chk(status, "clCreateContext");
|
||||
|
||||
// Create command queue
|
||||
cl_command_queue queue;
|
||||
queue = clCreateCommandQueue(context, device, 0, &status);
|
||||
chk(status, "clCreateCommandQueue");
|
||||
|
||||
// The image format describes how the data will be stored in memory
|
||||
cl_image_format format;
|
||||
format.image_channel_order = CL_R; // single channel
|
||||
format.image_channel_data_type = CL_FLOAT; // float data type
|
||||
|
||||
// Create space for the source image on the device
|
||||
cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
|
||||
imageHeight, 0, NULL, &status);
|
||||
chk(status, "clCreateImage2D");
|
||||
|
||||
// Create space for the output image on the device
|
||||
cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
|
||||
imageHeight, 0, NULL, &status);
|
||||
chk(status, "clCreateImage2D");
|
||||
|
||||
// Create space for the 7x7 filter on the device
|
||||
cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
|
||||
NULL, &status);
|
||||
chk(status, "clCreateBuffer");
|
||||
|
||||
// Copy the source image to the device
|
||||
size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
|
||||
size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension
|
||||
status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
|
||||
0, 0, inputImage, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueWriteImage");
|
||||
|
||||
// Copy the 7x7 filter to the device
|
||||
status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
|
||||
filterSize*sizeof(float), filter, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueWriteBuffer");
|
||||
|
||||
// Create the image sampler
|
||||
cl_sampler sampler = clCreateSampler(context, CL_FALSE,
|
||||
CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
|
||||
chk(status, "clCreateSampler");
|
||||
|
||||
const char* source = readSource("kernel.cl");
|
||||
|
||||
// Create a program object with source and build it
|
||||
cl_program program;
|
||||
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||
chk(status, "clCreateProgramWithSource");
|
||||
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
|
||||
chk(status, "clBuildProgram");
|
||||
|
||||
// Create the kernel object
|
||||
cl_kernel kernel;
|
||||
kernel = clCreateKernel(program, "convolution", &status);
|
||||
chk(status, "clCreateKernel");
|
||||
|
||||
// Set the kernel arguments
|
||||
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
|
||||
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
|
||||
status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
|
||||
status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
|
||||
status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
|
||||
status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
|
||||
status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
|
||||
chk(status, "clSetKernelArg");
|
||||
|
||||
// Set the work item dimensions
|
||||
size_t globalSize[2] = {imageWidth, imageHeight};
|
||||
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
|
||||
NULL, NULL);
|
||||
chk(status, "clEnqueueNDRange");
|
||||
|
||||
// Read the image back to the host
|
||||
status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
|
||||
region, 0, 0, outputImage, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueReadImage");
|
||||
|
||||
// Write the output image to file
|
||||
storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);
|
||||
|
||||
// Compute the reference image
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
refImage[i*imageWidth+j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Iterate over the rows of the source image
|
||||
int halfFilterWidth = filterWidth/2;
|
||||
float sum;
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
// Iterate over the columns of the source image
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
sum = 0; // Reset sum for new source pixel
|
||||
// Apply the filter to the neighborhood
|
||||
for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
|
||||
for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
|
||||
if(i+k >= 0 && i+k < imageHeight &&
|
||||
j+l >= 0 && j+l < imageWidth) {
|
||||
sum += inputImage[(i+k)*imageWidth + j+l] *
|
||||
filter[(k+halfFilterWidth)*filterWidth +
|
||||
l+halfFilterWidth];
|
||||
}
|
||||
}
|
||||
}
|
||||
refImage[i*imageWidth+j] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
int failed = 0;
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) {
|
||||
printf("Results are INCORRECT\n");
|
||||
printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
|
||||
outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
|
||||
failed = 1;
|
||||
}
|
||||
if(failed) break;
|
||||
}
|
||||
if(failed) break;
|
||||
}
|
||||
if(!failed) {
|
||||
printf("Results are correct\n");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
180
tests/opencl/convolution/utils.cpp
Normal file
180
tests/opencl/convolution/utils.cpp
Normal file
@@ -0,0 +1,180 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
/*
 * Write a single-channel float image as an 8-bit BMP file.
 *
 * The header (everything before the pixel data) is copied verbatim from
 * refFilename, and the width/height stored in that header decide how many
 * pixels are written.
 *
 *   imageOut     pixel values, top row first, `cols` floats per row
 *   filename     file to create
 *   rows, cols   dimensions of imageOut
 *   refFilename  existing BMP whose header is reused
 *
 * Pixel values are clamped to [0, 255] before the conversion to a byte.
 * Any failure prints a diagnostic and terminates with exit(-1).
 */
void storeImage(float *imageOut,
                const char *filename,
                int rows,
                int cols,
                const char* refFilename) {

    FILE *ifp, *ofp;
    unsigned char tmp;
    unsigned char *buffer;
    int offset = 0;
    int height = 0, width = 0;
    int i, j;
    int bytes;

    ifp = fopen(refFilename, "rb");
    if(ifp == NULL) {
        // Report the file that actually failed to open
        perror(refFilename);
        exit(-1);
    }

    // Byte 10 holds the offset of the pixel data, bytes 18 and 22 the
    // width and height.
    if(fseek(ifp, 10, SEEK_SET) != 0 || fread(&offset, 4, 1, ifp) != 1 ||
       fseek(ifp, 18, SEEK_SET) != 0 || fread(&width, 4, 1, ifp) != 1 ||
       fread(&height, 4, 1, ifp) != 1 || fseek(ifp, 0, SEEK_SET) != 0) {
        printf("error reading header of %s!\n", refFilename);
        exit(-1);
    }

    // A header describing more pixels than imageOut holds would make the
    // loops below read past the end of the buffer.
    if(offset <= 0 || width > cols || height > rows) {
        printf("header of %s does not match a %dx%d image!\n",
               refFilename, cols, rows);
        exit(-1);
    }

    buffer = (unsigned char *)malloc(offset);
    if(buffer == NULL) {
        perror("malloc");
        exit(-1);
    }

    if(fread(buffer, 1, offset, ifp) != (size_t)offset) {
        printf("error reading header of %s!\n", refFilename);
        exit(-1);
    }

    printf("Writing output image to %s\n", filename);
    ofp = fopen(filename, "wb");
    if(ofp == NULL) {
        perror("opening output file");
        exit(-1);
    }
    bytes = fwrite(buffer, 1, offset, ofp);
    if(bytes != offset) {
        printf("error writing header!\n");
        exit(-1);
    }

    // In bmp format every row is padded to a multiple of 4 bytes
    int mod = width % 4;
    if(mod != 0) {
        mod = 4 - mod;
    }

    // NOTE bmp files store rows bottom-up, so the image is written starting
    // with its last row.
    for(i = height-1; i >= 0; i--) {
        for(j = 0; j < width; j++) {
            // Converting an out-of-range float to unsigned char is
            // undefined, so clamp first (NaN is stored as 0).
            float value = imageOut[i*cols+j];
            if(!(value > 0.0f)) {
                tmp = 0;
            } else if(value > 255.0f) {
                tmp = 255;
            } else {
                tmp = (unsigned char)value;
            }
            fwrite(&tmp, sizeof(char), 1, ofp);
        }
        // Pad with zeros rather than whatever byte was written last
        tmp = 0;
        for(j = 0; j < mod; j++) {
            fwrite(&tmp, sizeof(char), 1, ofp);
        }
    }

    // fclose flushes buffered data, so a failed write surfaces here
    if(ferror(ofp) || fclose(ofp) != 0) {
        perror(filename);
        exit(-1);
    }
    fclose(ifp);

    free(buffer);
}
|
||||
|
||||
/*
|
||||
* Read bmp image and convert to byte array. Also output the width and height
|
||||
*/
|
||||
float* readImage(const char *filename, int* widthOut, int* heightOut) {

    // Loads an 8-bit-per-pixel BMP and returns it as a float array.
    //
    //   filename   BMP file to load
    //   widthOut   receives the image width in pixels
    //   heightOut  receives the image height in pixels
    //
    // Returns a malloc'ed array of width*height floats, top row first,
    // which the caller releases with free(). Any failure prints a
    // diagnostic and terminates with exit(-1).

    int height = 0, width = 0;
    int offset = 0;
    int i, j;

    printf("Reading input image from %s\n", filename);
    FILE *fp = fopen(filename, "rb");
    if(fp == NULL) {
        perror(filename);
        exit(-1);
    }

    // Byte 10 holds the offset of the pixel data, bytes 18 and 22 the
    // width and height.
    if(fseek(fp, 10, SEEK_SET) != 0 || fread(&offset, 4, 1, fp) != 1 ||
       fseek(fp, 18, SEEK_SET) != 0 || fread(&width, 4, 1, fp) != 1 ||
       fread(&height, 4, 1, fp) != 1) {
        printf("error reading header of %s!\n", filename);
        exit(-1);
    }

    printf("width = %d\n", width);
    printf("height = %d\n", height);

    // Reject dimensions that cannot size a buffer before they are used
    if(width <= 0 || height <= 0 || offset < 0) {
        printf("unsupported image dimensions in %s!\n", filename);
        exit(-1);
    }

    *widthOut = width;
    *heightOut = height;

    // For the bmp format, each row is padded to a multiple of 4 bytes
    int mod = width % 4;
    if(mod != 0) {
        mod = 4 - mod;
    }
    size_t rowBytes = (size_t)width + (size_t)mod;

    unsigned char* rowData = (unsigned char*)malloc(rowBytes);
    if(rowData == NULL) {
        perror("malloc");
        exit(-1);
    }

    float* floatImage = (float*)malloc(sizeof(float)*(size_t)width*(size_t)height);
    if(floatImage == NULL) {
        perror("malloc");
        exit(-1);
    }

    if(fseek(fp, offset, SEEK_SET) != 0) {
        printf("error seeking to pixel data of %s!\n", filename);
        exit(-1);
    }

    // NOTE bitmaps are stored in upside-down raster order: the first row in
    // the file is the bottom row of the picture. Each file row is therefore
    // stored straight into its mirrored position so the result is
    // right-side up; the padding bytes at the end of the row are dropped.
    for(i = 0; i < height; i++) {
        if(fread(rowData, 1, rowBytes, fp) != rowBytes) {
            printf("error reading pixel data of %s!\n", filename);
            exit(-1);
        }
        float* dstRow = floatImage + (size_t)(height - 1 - i)*(size_t)width;
        for(j = 0; j < width; j++) {
            dstRow[j] = (float)rowData[j];
        }
    }

    fclose(fp);
    free(rowData);

    return floatImage;
}
|
||||
11
tests/opencl/convolution/utils.h
Normal file
11
tests/opencl/convolution/utils.h
Normal file
@@ -0,0 +1,11 @@
|
||||
/* Identifiers containing a double underscore are reserved for the
 * implementation, so the include guard uses an ordinary name. */
#ifndef CONVOLUTION_UTILS_H
#define CONVOLUTION_UTILS_H

typedef unsigned char uchar;

/*
 * Load a BMP file as a float image.
 *
 * The width and height read from the file header are stored through
 * widthOut and heightOut. Returns a malloc'ed array of width*height floats,
 * flipped so that the top row comes first; release it with free().
 * Terminates the process if the file cannot be opened or memory runs out.
 */
float* readImage(const char *filename, int* widthOut, int* heightOut);

/*
 * Write a float image to `filename` as a BMP.
 *
 * The file header is copied from refFilename, and the dimensions stored in
 * that header decide how many pixels are written; `cols` is the row stride
 * of imageOut. Terminates the process on I/O or allocation failure.
 */
void storeImage(float *imageOut, const char *filename, int rows, int cols,
                const char* refFilename);

#endif
|
||||
67
tests/opencl/cutcp/Makefile
Normal file
67
tests/opencl/cutcp/Makefile
Normal file
@@ -0,0 +1,67 @@
|
||||
# Build the cutcp OpenCL test as a bare-metal RISC-V image (.elf/.hex/.dump)
# or as a QEMU user-mode binary (.qemu). Every path below can be overridden
# from the environment or the command line.
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)

CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb

VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link.ld

# Comments sit on their own lines: text after a value on the same line
# leaves trailing whitespace in the variable.
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and exceptions
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

# Recursively expanded on purpose: $(PROJECT) is assigned further down.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

PROJECT = cutcp

SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c

# None of these names is a file; declaring them phony keeps a stray file
# called e.g. "clean" or "run" from disabling the target.
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $@

$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
617
tests/opencl/cutcp/args.c
Normal file
617
tests/opencl/cutcp/args.c
Normal file
@@ -0,0 +1,617 @@
|
||||
|
||||
#include <parboil.h>
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Memory management routines */
|
||||
|
||||
/* Free an array of owned strings. */
|
||||
/* Release a NULL-terminated array of heap-allocated strings: every element
 * first, then the array itself. A NULL array is accepted and ignored. */
void
pb_FreeStringArray(char **string_array)
{
  int i;

  if (string_array != NULL) {
    i = 0;
    while (string_array[i] != NULL) {
      free(string_array[i]);
      i++;
    }
    free(string_array);
  }
}
|
||||
|
||||
struct pb_PlatformParam *
|
||||
pb_PlatformParam(char *name, char *version)
|
||||
{
|
||||
if (name == NULL) {
|
||||
fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
struct pb_PlatformParam *ret =
|
||||
(struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
|
||||
|
||||
ret->name = name;
|
||||
ret->version = version;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
pb_FreePlatformParam(struct pb_PlatformParam *p)
|
||||
{
|
||||
if (p == NULL) return;
|
||||
|
||||
free(p->name);
|
||||
free(p->version);
|
||||
free(p);
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_index(int index)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_INDEX;
|
||||
ret->index = index;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_cpu(void)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_CPU;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_gpu(void)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_GPU;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_accelerator(void)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_ACCELERATOR;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_name(char *name)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_NAME;
|
||||
ret->name = name;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
pb_FreeDeviceParam(struct pb_DeviceParam *p)
|
||||
{
|
||||
if (p == NULL) return;
|
||||
|
||||
switch(p->criterion) {
|
||||
case pb_Device_NAME:
|
||||
free(p->name);
|
||||
break;
|
||||
case pb_Device_INDEX:
|
||||
case pb_Device_CPU:
|
||||
case pb_Device_ACCELERATOR:
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
pb_FreeParameters(struct pb_Parameters *p)
|
||||
{
|
||||
free(p->outFile);
|
||||
pb_FreeStringArray(p->inpFiles);
|
||||
pb_FreePlatformParam(p->platform);
|
||||
pb_FreeDeviceParam(p->device);
|
||||
free(p);
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/* Parse a comma-delimited list of strings into an
|
||||
* array of strings. */
|
||||
/* Split a comma-delimited list into a NULL-terminated array of newly
 * allocated strings.
 *
 * Empty items are preserved: "a,,b" yields "a", "" and "b", and an empty
 * input yields one empty string. The input is not modified.
 *
 * Returns the array (release it with pb_FreeStringArray), or NULL when
 * memory is exhausted. */
static char **
read_string_array(char *in)
{
  char **ret;
  int i;
  int count;            /* Number of items in the input */
  char *substring;      /* Current substring within 'in' */

  /* Count the number of items in the string */
  count = 1;
  for (i = 0; in[i]; i++) if (in[i] == ',') count++;

  /* Allocate storage, including room for the sentinel */
  ret = (char **)malloc((count + 1) * sizeof(char *));
  if (ret == NULL) return NULL;

  /* Create copies of the strings from the list */
  substring = in;
  for (i = 0; i < count; i++) {
    /* Length up to the next ',' or the end of the input */
    size_t substring_length = strcspn(substring, ",");

    ret[i] = (char *)malloc(substring_length + 1);
    if (ret[i] == NULL) {
      /* Undo the partial result instead of returning a broken array */
      while (i > 0) free(ret[--i]);
      free(ret);
      return NULL;
    }
    memcpy(ret[i], substring, substring_length);
    ret[i][substring_length] = 0;

    /* Step over the comma; after the last item there is nothing left, so
     * the pointer is not moved past the terminator. */
    if (i + 1 < count) substring += substring_length + 1;
  }
  ret[count] = NULL;            /* Write the sentinel value */

  return ret;
}
|
||||
|
||||
/* Emit a parse-error message on the standard error stream, exactly as
 * given (no newline is appended). */
static void
report_parse_error(const char *str)
{
  fprintf(stderr, "%s", str);
}
|
||||
|
||||
/* Interpret a string as a 'pb_DeviceParam' value.
|
||||
* Return a pointer to a new value, or NULL on failure.
|
||||
*/
|
||||
/* Interpret a string as a 'pb_DeviceParam' value.
 *
 * The string is tried, in order, as
 *   1. a decimal device index (the whole string must be a number),
 *   2. one of the keywords "CPU", "GPU" or "ACCELERATOR",
 *   3. a device name (any other string; a copy is stored).
 *
 * Return a pointer to a new value, or NULL on failure (an index that is
 * negative or does not fit in an int).
 */
static struct pb_DeviceParam *
read_device_param(char *str)
{
  char *end;
  long device_int;

  /* strtol reports "no digits found" by leaving 'end' at the start of the
   * string, not through errno. Testing errno alone therefore accepted
   * every non-numeric string as index 0 and the keyword and name cases
   * below were never reached. */
  errno = 0;
  device_int = strtol(str, &end, 10);
  if (end != str && *end == '\0' && !errno) {
    /* Negative numbers are not valid */
    if (device_int < 0 || device_int > INT_MAX) return NULL;

    return pb_DeviceParam_index((int)device_int);
  }

  /* Match against predefined strings */
  if (strcmp(str, "CPU") == 0)
    return pb_DeviceParam_cpu();
  if (strcmp(str, "GPU") == 0)
    return pb_DeviceParam_gpu();
  if (strcmp(str, "ACCELERATOR") == 0)
    return pb_DeviceParam_accelerator();

  /* Assume any other string is a device name */
  return pb_DeviceParam_name(strdup(str));
}
|
||||
|
||||
/* Interpret a string as a 'pb_PlatformParam' value.
|
||||
* Return a pointer to a new value, or NULL on failure.
|
||||
*/
|
||||
/* Interpret a string as a 'pb_PlatformParam' value.
 *
 * The text after the last '-' is the version and the text before it is the
 * name; without a '-' the whole string is the name and the version is NULL.
 * Both parts are copied.
 *
 * Return a pointer to a new value, or NULL on failure (out of memory).
 */
static struct pb_PlatformParam *
read_platform_param(char *str)
{
  /* Last occurrence of '-' in 'str', or NULL if there is none */
  const char *separator = strrchr(str, '-');

  /* The platform name is either the entire string, or all characters before
   * the separator */
  size_t name_length = separator ? (size_t)(separator - str) : strlen(str);
  char *name_str = (char *)malloc(name_length + 1);
  if (name_str == NULL) return NULL;
  memcpy(name_str, str, name_length);
  name_str[name_length] = 0;

  /* The version is either NULL, or all characters after the separator */
  char *version_str = NULL;
  if (separator != NULL) {
    const char *version_input_str = separator + 1;
    size_t version_length = strlen(version_input_str);

    version_str = (char *)malloc(version_length + 1);
    if (version_str == NULL) {
      free(name_str);           /* do not leak the name on failure */
      return NULL;
    }
    memcpy(version_str, version_input_str, version_length);
    version_str[version_length] = 0;
  }

  /* Create output structure */
  return pb_PlatformParam(name_str, version_str);
}
|
||||
|
||||
/****************************************************************************/
|
||||
/* Argument parsing state */
|
||||
|
||||
/* State of the in-place command-line scanner.
 *
 * The scanner walks argv once.  Arguments it interprets are dropped;
 * arguments it does not interpret are copied down over the dropped slots,
 * so the array is compacted as it goes.  'argc' and 'argn' only count
 * arguments that have not been dropped.
 *
 * 'argv_get' is the read cursor (next argument to examine) and 'argv_put'
 * is the write cursor (where the next kept argument is stored).
 */
struct argparse {
  /* Number of arguments still present; shrinks as arguments are dropped */
  int argc;

  /* Index of the current argument among the kept ones */
  int argn;

  /* Read cursor: slot holding the argument being examined */
  char **argv_get;

  /* Write cursor: slot that receives the next kept argument.
   * Invariant: argv_put <= argv_get. */
  char **argv_put;
};
|
||||
|
||||
static void
|
||||
initialize_argparse(struct argparse *ap, int argc, char **argv)
|
||||
{
|
||||
ap->argc = argc;
|
||||
ap->argn = 0;
|
||||
ap->argv_get = ap->argv_put = argv;
|
||||
}
|
||||
|
||||
/* Finish argument parsing, without processing the remaining arguments.
|
||||
* Write new argument count into _argc. */
|
||||
static void
|
||||
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
|
||||
{
|
||||
/* Move the remaining arguments */
|
||||
for(; ap->argn < ap->argc; ap->argn++)
|
||||
*ap->argv_put++ = *ap->argv_get++;
|
||||
|
||||
/* Update the argument count */
|
||||
*_argc = ap->argc;
|
||||
|
||||
/* Insert a terminating NULL */
|
||||
argv[ap->argc] = NULL;
|
||||
}
|
||||
|
||||
/* Delete the current argument. The argument will not be visible
|
||||
* when argument parsing is done. */
|
||||
static void
|
||||
delete_argument(struct argparse *ap)
|
||||
{
|
||||
if (ap->argn >= ap->argc) {
|
||||
fprintf(stderr, "delete_argument\n");
|
||||
}
|
||||
ap->argc--;
|
||||
ap->argv_get++;
|
||||
}
|
||||
|
||||
/* Go to the next argument. Also, move the current argument to its
|
||||
* final location in argv. */
|
||||
static void
|
||||
next_argument(struct argparse *ap)
|
||||
{
|
||||
if (ap->argn >= ap->argc) {
|
||||
fprintf(stderr, "next_argument\n");
|
||||
}
|
||||
/* Move argument to its new location. */
|
||||
*ap->argv_put++ = *ap->argv_get++;
|
||||
ap->argn++;
|
||||
}
|
||||
|
||||
static int
|
||||
is_end_of_arguments(struct argparse *ap)
|
||||
{
|
||||
return ap->argn == ap->argc;
|
||||
}
|
||||
|
||||
/* Get the current argument */
|
||||
static char *
|
||||
get_argument(struct argparse *ap)
|
||||
{
|
||||
return *ap->argv_get;
|
||||
}
|
||||
|
||||
/* Get the current argument, and also delete it */
|
||||
static char *
|
||||
consume_argument(struct argparse *ap)
|
||||
{
|
||||
char *ret = get_argument(ap);
|
||||
delete_argument(ap);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/* Outcome reported by an option handler after it has looked at a
 * command-line argument. */
typedef enum {
  /* Handled successfully; keep scanning */
  ARGPARSE_OK = 0,

  /* Handling failed; the whole parse is abandoned */
  ARGPARSE_ERROR = 1,

  /* Handled successfully, and scanning must stop here */
  ARGPARSE_DONE = 2
} result;
|
||||
|
||||
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
|
||||
|
||||
|
||||
/* A command-line option */
|
||||
struct option {
|
||||
char short_name; /* If not 0, the one-character
|
||||
* name of this option */
|
||||
const char *long_name; /* If not NULL, the long name of this option */
|
||||
parse_action *action; /* What to do when this option occurs.
|
||||
* Sentinel value is NULL.
|
||||
*/
|
||||
};
|
||||
|
||||
/* Output file
|
||||
*
|
||||
* -o FILE
|
||||
*/
|
||||
static result
|
||||
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting file name after '-o'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Replace the output file name */
|
||||
free(params->outFile);
|
||||
params->outFile = strdup(consume_argument(ap));
|
||||
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
/* Input files
|
||||
*
|
||||
* -i FILE,FILE,...
|
||||
*/
|
||||
static result
|
||||
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting file name after '-i'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Replace the input file list */
|
||||
pb_FreeStringArray(params->inpFiles);
|
||||
params->inpFiles = read_string_array(consume_argument(ap));
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
/* End of options
|
||||
*
|
||||
* --
|
||||
*/
|
||||
|
||||
static result
|
||||
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
return ARGPARSE_DONE;
|
||||
}
|
||||
|
||||
/* OpenCL device
|
||||
*
|
||||
* --device X
|
||||
*/
|
||||
|
||||
static result
|
||||
parse_device(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
/* Read the next argument, which specifies a device */
|
||||
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting device specification after '--device'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
char *device_string = consume_argument(ap);
|
||||
struct pb_DeviceParam *device_param = read_device_param(device_string);
|
||||
|
||||
if (!device_param) {
|
||||
report_parse_error("Unrecognized device specification format on command line\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Save the result */
|
||||
pb_FreeDeviceParam(params->device);
|
||||
params->device = device_param;
|
||||
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
static result
|
||||
parse_platform(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
/* Read the next argument, which specifies a platform */
|
||||
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting device specification after '--platform'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
char *platform_string = consume_argument(ap);
|
||||
struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
|
||||
|
||||
if (!platform_param) {
|
||||
report_parse_error("Unrecognized platform specification format on command line\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Save the result */
|
||||
pb_FreePlatformParam(params->platform);
|
||||
params->platform = platform_param;
|
||||
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
|
||||
static struct option options[] = {
|
||||
{ 'o', NULL, &parse_output_file },
|
||||
{ 'i', NULL, &parse_input_files },
|
||||
{ '-', NULL, &parse_end_options },
|
||||
{ 0, "device", &parse_device },
|
||||
{ 0, "platform", &parse_platform },
|
||||
{ 0, NULL, NULL }
|
||||
};
|
||||
|
||||
static int
|
||||
is_last_option(struct option *op)
|
||||
{
|
||||
return op->action == NULL;
|
||||
}
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/* Parse command-line parameters.
|
||||
* Return zero on error, nonzero otherwise.
|
||||
* On error, the other outputs may be invalid.
|
||||
*
|
||||
* The information collected from parameters is used to update
|
||||
* 'ret'. 'ret' should be initialized.
|
||||
*
|
||||
* '_argc' and 'argv' are updated to contain only the unprocessed arguments.
|
||||
*/
|
||||
static int
|
||||
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
|
||||
{
|
||||
char *err_message;
|
||||
struct argparse ap;
|
||||
|
||||
/* Each argument */
|
||||
initialize_argparse(&ap, *_argc, argv);
|
||||
while(!is_end_of_arguments(&ap)) {
|
||||
result arg_result; /* Result of parsing this option */
|
||||
char *arg = get_argument(&ap);
|
||||
|
||||
/* Process this argument */
|
||||
if (arg[0] == '-') {
|
||||
/* Single-character flag */
|
||||
if ((arg[1] != 0) && (arg[2] == 0)) {
|
||||
delete_argument(&ap); /* This argument is consumed here */
|
||||
|
||||
/* Find a matching short option */
|
||||
struct option *op;
|
||||
for (op = options; !is_last_option(op); op++) {
|
||||
if (op->short_name == arg[1]) {
|
||||
arg_result = (*op->action)(&ap, ret);
|
||||
goto option_was_processed;
|
||||
}
|
||||
}
|
||||
|
||||
/* No option matches */
|
||||
report_parse_error("Unexpected command-line parameter\n");
|
||||
arg_result = ARGPARSE_ERROR;
|
||||
goto option_was_processed;
|
||||
}
|
||||
|
||||
/* Long flag */
|
||||
if (arg[1] == '-') {
|
||||
delete_argument(&ap); /* This argument is consumed here */
|
||||
|
||||
/* Find a matching long option */
|
||||
struct option *op;
|
||||
for (op = options; !is_last_option(op); op++) {
|
||||
if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
|
||||
arg_result = (*op->action)(&ap, ret);
|
||||
goto option_was_processed;
|
||||
}
|
||||
}
|
||||
|
||||
/* No option matches */
|
||||
report_parse_error("Unexpected command-line parameter\n");
|
||||
arg_result = ARGPARSE_ERROR;
|
||||
goto option_was_processed;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Other arguments are ignored */
|
||||
next_argument(&ap);
|
||||
arg_result = ARGPARSE_OK;
|
||||
goto option_was_processed;
|
||||
}
|
||||
|
||||
option_was_processed:
|
||||
/* Decide what to do next based on 'arg_result' */
|
||||
switch(arg_result) {
|
||||
case ARGPARSE_OK:
|
||||
/* Continue processing */
|
||||
break;
|
||||
|
||||
case ARGPARSE_ERROR:
|
||||
/* Error exit from the function */
|
||||
return 0;
|
||||
|
||||
case ARGPARSE_DONE:
|
||||
/* Normal exit from the argument parsing loop */
|
||||
goto end_of_options;
|
||||
}
|
||||
} /* end for each argument */
|
||||
|
||||
/* If all arguments were processed, then normal exit from the loop */
|
||||
|
||||
end_of_options:
|
||||
finalize_argparse(&ap, _argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Other exported functions */
|
||||
|
||||
struct pb_Parameters *
|
||||
pb_ReadParameters(int *_argc, char **argv)
|
||||
{
|
||||
struct pb_Parameters *ret =
|
||||
(struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
|
||||
|
||||
/* Initialize the parameters structure */
|
||||
ret->outFile = NULL;
|
||||
ret->inpFiles = (char **)malloc(sizeof(char *));
|
||||
ret->inpFiles[0] = NULL;
|
||||
ret->platform = NULL;
|
||||
ret->device = NULL;
|
||||
|
||||
/* Read parameters and update _argc, argv */
|
||||
if (!pb_ParseParameters(ret, _argc, argv)) {
|
||||
/* Parse error */
|
||||
pb_FreeParameters(ret);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int
|
||||
pb_Parameters_CountInputs(struct pb_Parameters *p)
|
||||
{
|
||||
int n;
|
||||
|
||||
for (n = 0; p->inpFiles[n]; n++);
|
||||
return n;
|
||||
}
|
||||
|
||||
37
tests/opencl/cutcp/atom.h
Normal file
37
tests/opencl/cutcp/atom.h
Normal file
@@ -0,0 +1,37 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef ATOM_H
|
||||
#define ATOM_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* A single atom: position (x, y, z) and the value q.  Elsewhere in this
 * benchmark atoms with q == 0 are skipped as non-contributing. */
typedef struct Atom_t {
  float x;
  float y;
  float z;
  float q;
} Atom;
|
||||
|
||||
typedef struct Atoms_t {
|
||||
Atom *atoms;
|
||||
int size;
|
||||
} Atoms;
|
||||
|
||||
/* A three-component float vector; used for the low and high corners
 * returned by get_atom_extent(). */
typedef struct Vec3_t {
  float x;
  float y;
  float z;
} Vec3;
|
||||
|
||||
Atoms *read_atom_file(const char *fname);
|
||||
void free_atom(Atoms *atom);
|
||||
void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* ATOM_H */
|
||||
195
tests/opencl/cutcp/cutcpu.c
Normal file
195
tests/opencl/cutcp/cutcpu.c
Normal file
@@ -0,0 +1,195 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <parboil.h>
|
||||
#include "atom.h"
|
||||
#include "cutoff.h"
|
||||
|
||||
#undef DEBUG_PASS_RATE
|
||||
#define CHECK_CYLINDER_CPU
|
||||
|
||||
#define CELLEN 4.f
|
||||
#define INV_CELLEN (1.f/CELLEN)
|
||||
|
||||
extern int cpu_compute_cutoff_potential_lattice(
|
||||
Lattice *lattice, /* the lattice */
|
||||
float cutoff, /* cutoff distance */
|
||||
Atoms *atoms /* array of atoms */
|
||||
)
|
||||
{
|
||||
int nx = lattice->dim.nx;
|
||||
int ny = lattice->dim.ny;
|
||||
int nz = lattice->dim.nz;
|
||||
float xlo = lattice->dim.lo.x;
|
||||
float ylo = lattice->dim.lo.y;
|
||||
float zlo = lattice->dim.lo.z;
|
||||
float gridspacing = lattice->dim.h;
|
||||
int natoms = atoms->size;
|
||||
Atom *atom = atoms->atoms;
|
||||
|
||||
const float a2 = cutoff * cutoff;
|
||||
const float inv_a2 = 1.f / a2;
|
||||
float s;
|
||||
const float inv_gridspacing = 1.f / gridspacing;
|
||||
const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
|
||||
/* lattice point radius about each atom */
|
||||
|
||||
int n;
|
||||
int i, j, k;
|
||||
int ia, ib, ic;
|
||||
int ja, jb, jc;
|
||||
int ka, kb, kc;
|
||||
int index;
|
||||
int koff, jkoff;
|
||||
|
||||
float x, y, z, q;
|
||||
float dx, dy, dz;
|
||||
float dz2, dydz2, r2;
|
||||
float e;
|
||||
float xstart, ystart;
|
||||
|
||||
float *pg;
|
||||
|
||||
int gindex;
|
||||
int ncell, nxcell, nycell, nzcell;
|
||||
int *first, *next;
|
||||
float inv_cellen = INV_CELLEN;
|
||||
Vec3 minext, maxext; /* Extent of atom bounding box */
|
||||
float xmin, ymin, zmin;
|
||||
float xmax, ymax, zmax;
|
||||
|
||||
#if DEBUG_PASS_RATE
|
||||
unsigned long long pass_count = 0;
|
||||
unsigned long long fail_count = 0;
|
||||
#endif
|
||||
|
||||
/* find min and max extent */
|
||||
get_atom_extent(&minext, &maxext, atoms);
|
||||
|
||||
/* number of cells in each dimension */
|
||||
nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
|
||||
nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
|
||||
nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
|
||||
ncell = nxcell * nycell * nzcell;
|
||||
|
||||
/* allocate for cursor link list implementation */
|
||||
first = (int *) malloc(ncell * sizeof(int));
|
||||
for (gindex = 0; gindex < ncell; gindex++) {
|
||||
first[gindex] = -1;
|
||||
}
|
||||
next = (int *) malloc(natoms * sizeof(int));
|
||||
for (n = 0; n < natoms; n++) {
|
||||
next[n] = -1;
|
||||
}
|
||||
|
||||
/* geometric hashing */
|
||||
for (n = 0; n < natoms; n++) {
|
||||
if (0==atom[n].q) continue; /* skip any non-contributing atoms */
|
||||
i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
|
||||
j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
|
||||
k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
|
||||
gindex = (k*nycell + j)*nxcell + i;
|
||||
next[n] = first[gindex];
|
||||
first[gindex] = n;
|
||||
}
|
||||
|
||||
/* traverse the grid cells */
|
||||
for (gindex = 0; gindex < ncell; gindex++) {
|
||||
for (n = first[gindex]; n != -1; n = next[n]) {
|
||||
x = atom[n].x - xlo;
|
||||
y = atom[n].y - ylo;
|
||||
z = atom[n].z - zlo;
|
||||
q = atom[n].q;
|
||||
|
||||
/* find closest grid point with position less than or equal to atom */
|
||||
ic = (int) (x * inv_gridspacing);
|
||||
jc = (int) (y * inv_gridspacing);
|
||||
kc = (int) (z * inv_gridspacing);
|
||||
|
||||
/* find extent of surrounding box of grid points */
|
||||
ia = ic - radius;
|
||||
ib = ic + radius + 1;
|
||||
ja = jc - radius;
|
||||
jb = jc + radius + 1;
|
||||
ka = kc - radius;
|
||||
kb = kc + radius + 1;
|
||||
|
||||
/* trim box edges so that they are within grid point lattice */
|
||||
if (ia < 0) ia = 0;
|
||||
if (ib >= nx) ib = nx-1;
|
||||
if (ja < 0) ja = 0;
|
||||
if (jb >= ny) jb = ny-1;
|
||||
if (ka < 0) ka = 0;
|
||||
if (kb >= nz) kb = nz-1;
|
||||
|
||||
/* loop over surrounding grid points */
|
||||
xstart = ia*gridspacing - x;
|
||||
ystart = ja*gridspacing - y;
|
||||
dz = ka*gridspacing - z;
|
||||
for (k = ka; k <= kb; k++, dz += gridspacing) {
|
||||
koff = k*ny;
|
||||
dz2 = dz*dz;
|
||||
dy = ystart;
|
||||
for (j = ja; j <= jb; j++, dy += gridspacing) {
|
||||
jkoff = (koff + j)*nx;
|
||||
dydz2 = dy*dy + dz2;
|
||||
#ifdef CHECK_CYLINDER_CPU
|
||||
if (dydz2 >= a2) continue;
|
||||
#endif
|
||||
|
||||
dx = xstart;
|
||||
index = jkoff + ia;
|
||||
pg = lattice->lattice + index;
|
||||
|
||||
#if defined(__INTEL_COMPILER)
|
||||
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
|
||||
r2 = dx*dx + dydz2;
|
||||
s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
|
||||
e = q * (1/sqrtf(r2)) * s;
|
||||
*pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
|
||||
}
|
||||
#else
|
||||
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
|
||||
r2 = dx*dx + dydz2;
|
||||
if (r2 >= a2)
|
||||
{
|
||||
#ifdef DEBUG_PASS_RATE
|
||||
fail_count++;
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
#ifdef DEBUG_PASS_RATE
|
||||
pass_count++;
|
||||
#endif
|
||||
s = (1.f - r2 * inv_a2);
|
||||
e = q * (1/sqrtf(r2)) * s * s;
|
||||
*pg += e;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} /* end loop over surrounding grid points */
|
||||
|
||||
} /* end loop over atoms in a gridcell */
|
||||
} /* end loop over gridcells */
|
||||
|
||||
/* free memory */
|
||||
free(next);
|
||||
free(first);
|
||||
|
||||
/* For debugging: print the number of times that the test passed/failed */
|
||||
#ifdef DEBUG_PASS_RATE
|
||||
printf ("Pass :%lld\n", pass_count);
|
||||
printf ("Fail :%lld\n", fail_count);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
499
tests/opencl/cutcp/cutoff.c
Normal file
499
tests/opencl/cutcp/cutoff.c
Normal file
@@ -0,0 +1,499 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <parboil.h>
|
||||
|
||||
#include "atom.h"
|
||||
#include "cutoff.h"
|
||||
#include "macros.h"
|
||||
#include "ocl.h"
|
||||
|
||||
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
|
||||
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
|
||||
typedef cl_int4 xyz;
|
||||
|
||||
//extern "C" int gpu_compute_cutoff_potential_lattice(
|
||||
int gpu_compute_cutoff_potential_lattice(
|
||||
struct pb_TimerSet *timers,
|
||||
Lattice *lattice, /* the lattice */
|
||||
float cutoff, /* cutoff distance */
|
||||
Atoms *atoms, /* array of atoms */
|
||||
int verbose, /* print info/debug messages */
|
||||
struct pb_Parameters *parameters
|
||||
)
|
||||
{
|
||||
int nx = lattice->dim.nx;
|
||||
int ny = lattice->dim.ny;
|
||||
int nz = lattice->dim.nz;
|
||||
float xlo = lattice->dim.lo.x;
|
||||
float ylo = lattice->dim.lo.y;
|
||||
float zlo = lattice->dim.lo.z;
|
||||
float h = lattice->dim.h;
|
||||
int natoms = atoms->size;
|
||||
Atom *atom = atoms->atoms;
|
||||
|
||||
xyz nbrlist[NBRLIST_MAXLEN];
|
||||
int nbrlistlen = 0;
|
||||
|
||||
int binHistoFull[BIN_DEPTH+1] = { 0 }; /* clear every array element */
|
||||
int binHistoCover[BIN_DEPTH+1] = { 0 }; /* clear every array element */
|
||||
int num_excluded = 0;
|
||||
|
||||
int xRegionDim, yRegionDim, zRegionDim;
|
||||
int xRegionIndex, yRegionIndex, zRegionIndex;
|
||||
int xOffset, yOffset, zOffset;
|
||||
int lnx, lny, lnz, lnall;
|
||||
float *regionZeroAddr, *thisRegion;
|
||||
cl_mem regionZeroCl;
|
||||
int index, indexRegion;
|
||||
|
||||
int c;
|
||||
xyz binDim;
|
||||
int nbins;
|
||||
cl_float4 *binBaseAddr, *binZeroAddr;
|
||||
cl_mem binBaseCl, binZeroCl;
|
||||
int *bincntBaseAddr, *bincntZeroAddr;
|
||||
Atoms *extra = NULL;
|
||||
|
||||
cl_mem NbrListLen;
|
||||
cl_mem NbrList;
|
||||
|
||||
int i, j, k, n;
|
||||
int sum, total;
|
||||
|
||||
float avgFillFull, avgFillCover;
|
||||
const float cutoff2 = cutoff * cutoff;
|
||||
const float inv_cutoff2 = 1.f / cutoff2;
|
||||
|
||||
size_t gridDim[3], blockDim[3];
|
||||
|
||||
// The "compute" timer should be active upon entry to this function
|
||||
|
||||
/* pad lattice to be factor of 8 in each dimension */
|
||||
xRegionDim = (int) ceilf(nx/8.f);
|
||||
yRegionDim = (int) ceilf(ny/8.f);
|
||||
zRegionDim = (int) ceilf(nz/8.f);
|
||||
|
||||
lnx = 8 * xRegionDim;
|
||||
lny = 8 * yRegionDim;
|
||||
lnz = 8 * zRegionDim;
|
||||
lnall = lnx * lny * lnz;
|
||||
|
||||
/* will receive energies from OpenCL */
|
||||
regionZeroAddr = (float *) malloc(lnall * sizeof(float));
|
||||
|
||||
/* create bins */
|
||||
c = (int) ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */
|
||||
binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
|
||||
binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
|
||||
binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
|
||||
nbins = binDim.x * binDim.y * binDim.z;
|
||||
binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
|
||||
binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
|
||||
|
||||
bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
|
||||
bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
|
||||
|
||||
/* create neighbor list */
|
||||
if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
|
||||
float s = sqrtf(3);
|
||||
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
|
||||
int cnt = 0;
|
||||
/* develop neighbor list around 1 cell */
|
||||
if (2*c + 1 > NBRLIST_DIM) {
|
||||
fprintf(stderr, "must have cutoff <= %f\n",
|
||||
(NBRLIST_DIM-1)/2 * BIN_LENGTH);
|
||||
return -1;
|
||||
}
|
||||
for (k = -c; k <= c; k++) {
|
||||
for (j = -c; j <= c; j++) {
|
||||
for (i = -c; i <= c; i++) {
|
||||
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
|
||||
nbrlist[cnt].x = i;
|
||||
nbrlist[cnt].y = j;
|
||||
nbrlist[cnt].z = k;
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
nbrlistlen = cnt;
|
||||
}
|
||||
else if (8*h <= 2*BIN_LENGTH) {
|
||||
float s = 2.f*sqrtf(3);
|
||||
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
|
||||
int cnt = 0;
|
||||
/* develop neighbor list around 3-cube of cells */
|
||||
if (2*c + 3 > NBRLIST_DIM) {
|
||||
fprintf(stderr, "must have cutoff <= %f\n",
|
||||
(NBRLIST_DIM-3)/2 * BIN_LENGTH);
|
||||
return -1;
|
||||
}
|
||||
for (k = -c; k <= c; k++) {
|
||||
for (j = -c; j <= c; j++) {
|
||||
for (i = -c; i <= c; i++) {
|
||||
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
|
||||
nbrlist[cnt].x = i;
|
||||
nbrlist[cnt].y = j;
|
||||
nbrlist[cnt].z = k;
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
nbrlistlen = cnt;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* perform geometric hashing of atoms into bins */
|
||||
{
|
||||
/* array of extra atoms, permit average of one extra per bin */
|
||||
Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
|
||||
int extra_len = 0;
|
||||
|
||||
for (n = 0; n < natoms; n++) {
|
||||
cl_float4 p;
|
||||
p.x = atom[n].x - xlo;
|
||||
p.y = atom[n].y - ylo;
|
||||
p.z = atom[n].z - zlo;
|
||||
p.w = atom[n].q;
|
||||
i = (int) floorf(p.x * BIN_INVLEN);
|
||||
j = (int) floorf(p.y * BIN_INVLEN);
|
||||
k = (int) floorf(p.z * BIN_INVLEN);
|
||||
if (i >= -c && i < binDim.x - c &&
|
||||
j >= -c && j < binDim.y - c &&
|
||||
k >= -c && k < binDim.z - c &&
|
||||
atom[n].q != 0) {
|
||||
int index = (k * binDim.y + j) * binDim.x + i;
|
||||
cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
|
||||
int bindex = bincntZeroAddr[index];
|
||||
if (bindex < BIN_DEPTH) {
|
||||
/* copy atom into bin and increase counter for this bin */
|
||||
bin[bindex] = p;
|
||||
bincntZeroAddr[index]++;
|
||||
}
|
||||
else {
|
||||
/* add index to array of extra atoms to be computed with CPU */
|
||||
if (extra_len >= nbins) {
|
||||
fprintf(stderr, "exceeded space for storing extra atoms\n");
|
||||
return -1;
|
||||
}
|
||||
extra_atoms[extra_len] = atom[n];
|
||||
extra_len++;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* excluded atoms are either outside bins or neutrally charged */
|
||||
num_excluded++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Save result */
|
||||
extra = (Atoms *)malloc(sizeof(Atoms));
|
||||
extra->atoms = extra_atoms;
|
||||
extra->size = extra_len;
|
||||
}
|
||||
|
||||
/* bin stats */
|
||||
sum = total = 0;
|
||||
for (n = 0; n < nbins; n++) {
|
||||
binHistoFull[ bincntBaseAddr[n] ]++;
|
||||
sum += bincntBaseAddr[n];
|
||||
total += BIN_DEPTH;
|
||||
}
|
||||
avgFillFull = sum / (float) total;
|
||||
sum = total = 0;
|
||||
for (k = 0; k < binDim.z - 2*c; k++) {
|
||||
for (j = 0; j < binDim.y - 2*c; j++) {
|
||||
for (i = 0; i < binDim.x - 2*c; i++) {
|
||||
int index = (k * binDim.y + j) * binDim.x + i;
|
||||
binHistoCover[ bincntZeroAddr[index] ]++;
|
||||
sum += bincntZeroAddr[index];
|
||||
total += BIN_DEPTH;
|
||||
}
|
||||
}
|
||||
}
|
||||
avgFillCover = sum / (float) total;
|
||||
|
||||
if (verbose) {
|
||||
/* report */
|
||||
printf("number of atoms = %d\n", natoms);
|
||||
printf("lattice spacing = %g\n", h);
|
||||
printf("cutoff distance = %g\n", cutoff);
|
||||
printf("\n");
|
||||
printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
|
||||
printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
|
||||
printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
|
||||
printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
|
||||
printf("number of bytes for lattice data = %u\n", (unsigned int) (lnall*sizeof(float)));
|
||||
printf("\n");
|
||||
printf("bin padding thickness = %d\n", c);
|
||||
printf("bin cover dimensions = %d %d %d\n",
|
||||
binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
|
||||
printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
|
||||
printf("number of bins = %d\n", nbins);
|
||||
printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
|
||||
printf("%% overhead space = %g\n",
|
||||
(natoms / (double) (nbins * BIN_DEPTH)) * 100);
|
||||
printf("number of bytes for bin data = %u\n",
|
||||
(unsigned int)(nbins * BIN_DEPTH * sizeof(cl_float4)));
|
||||
printf("\n");
|
||||
printf("bin histogram with padding:\n");
|
||||
sum = 0;
|
||||
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||
printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]);
|
||||
sum += binHistoFull[n];
|
||||
}
|
||||
printf(" total number of bins: %d\n", sum);
|
||||
printf(" %% average fill: %g\n", avgFillFull * 100);
|
||||
printf("\n");
|
||||
printf("bin histogram excluding padding:\n");
|
||||
sum = 0;
|
||||
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||
printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]);
|
||||
sum += binHistoCover[n];
|
||||
}
|
||||
printf(" total number of bins: %d\n", sum);
|
||||
printf(" %% average fill: %g\n", avgFillCover * 100);
|
||||
printf("\n");
|
||||
printf("number of extra atoms = %d\n", extra->size);
|
||||
printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
|
||||
printf("\n");
|
||||
|
||||
/* sanity check on bins */
|
||||
sum = 0;
|
||||
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||
sum += n * binHistoFull[n];
|
||||
}
|
||||
sum += extra->size + num_excluded;
|
||||
printf("sanity check on bin histogram with edges: "
|
||||
"sum + others = %d\n", sum);
|
||||
sum = 0;
|
||||
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||
sum += n * binHistoCover[n];
|
||||
}
|
||||
sum += extra->size + num_excluded;
|
||||
printf("sanity check on bin histogram excluding edges: "
|
||||
"sum + others = %d\n", sum);
|
||||
printf("\n");
|
||||
|
||||
/* neighbor list */
|
||||
printf("neighbor list length = %d\n", nbrlistlen);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Ok!\n");
|
||||
|
||||
pb_Context* pb_context;
|
||||
pb_context = pb_InitOpenCLContext(parameters);
|
||||
if (pb_context == NULL) {
|
||||
fprintf (stderr, "Error: No OpenCL platform/device can be found.");
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("Ok!\n");
|
||||
|
||||
cl_int clStatus;
|
||||
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
|
||||
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
|
||||
cl_context clContext = (cl_context) pb_context->clContext;
|
||||
|
||||
cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
|
||||
CHECK_ERROR("clCreateCommandQueue")
|
||||
|
||||
pb_SetOpenCL(&clContext, &clCommandQueue);
|
||||
|
||||
//const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
|
||||
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
|
||||
cl_program clProgram = clCreateProgramWithBuiltInKernels(
|
||||
clContext, 1, &clDevice, "opencl_cutoff_potential_lattice", &clStatus);
|
||||
CHECK_ERROR("clCreateProgramWithSource")
|
||||
|
||||
char clOptions[50];
|
||||
sprintf(clOptions,"-I src/opencl_base"); //-cl-nv-verbose
|
||||
|
||||
clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
|
||||
if (clStatus != CL_SUCCESS) {
|
||||
size_t string_size = 0;
|
||||
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
|
||||
0, NULL, &string_size);
|
||||
char* string = (char*)malloc(string_size*sizeof(char));
|
||||
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
|
||||
string_size, string, NULL);
|
||||
puts(string);
|
||||
}
|
||||
|
||||
CHECK_ERROR("clBuildProgram")
|
||||
|
||||
cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus);
|
||||
CHECK_ERROR("clCreateKernel")
|
||||
|
||||
/* setup OpenCL kernel parameters */
|
||||
blockDim[0] = 8;
|
||||
blockDim[1] = 8;
|
||||
blockDim[2] = 2;
|
||||
gridDim[0] = 4 * xRegionDim * blockDim[0];
|
||||
gridDim[1] = yRegionDim * blockDim[1];
|
||||
gridDim[2] = 1 * blockDim[2];
|
||||
|
||||
/* allocate and initialize memory on OpenCL device */
|
||||
pb_SwitchToTimer(timers, pb_TimerID_COPY);
|
||||
if (verbose) {
|
||||
printf("Allocating %.2fMB on OpenCL device for potentials\n",
|
||||
lnall * sizeof(float) / (double) (1024*1024));
|
||||
}
|
||||
|
||||
regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus);
|
||||
CHECK_ERROR("clCreateBuffer")
|
||||
|
||||
// clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float));
|
||||
|
||||
if (verbose) {
|
||||
printf("Allocating %.2fMB on OpenCL device for atom bins\n",
|
||||
nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
|
||||
}
|
||||
|
||||
binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
|
||||
CHECK_ERROR("clCreateBuffer")
|
||||
|
||||
clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
|
||||
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||
|
||||
//Sub buffers are not supported in OpenCL v1.0
|
||||
int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
|
||||
|
||||
NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
|
||||
CHECK_ERROR("clCreateBuffer")
|
||||
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
|
||||
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||
|
||||
NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
|
||||
CHECK_ERROR("clCreateBuffer")
|
||||
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
|
||||
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||
|
||||
if (verbose)
|
||||
printf("\n");
|
||||
|
||||
clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
|
||||
clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
|
||||
clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
|
||||
clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
|
||||
clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
|
||||
clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
|
||||
clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
|
||||
clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),®ionZeroCl);
|
||||
clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
|
||||
clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
|
||||
CHECK_ERROR("clSetKernelArg")
|
||||
|
||||
printf("Ok!!\n");
|
||||
|
||||
|
||||
/* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
|
||||
pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
|
||||
printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
|
||||
for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
|
||||
printf(" computing plane %d\r", zRegionIndex);
|
||||
fflush(stdout);
|
||||
|
||||
clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
|
||||
CHECK_ERROR("clSetKernelArg")
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
CHECK_ERROR("clEnqueueNDRangeKernel")
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
clStatus = clFinish(clCommandQueue);
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
CHECK_ERROR("clFinish")
|
||||
}
|
||||
|
||||
printf("Ok++!\n");
|
||||
|
||||
printf("Finished OpenCL kernel calls \n");
|
||||
|
||||
/* copy result regions from OpenCL device */
|
||||
pb_SwitchToTimer(timers, pb_TimerID_COPY);
|
||||
clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL);
|
||||
CHECK_ERROR("clEnqueueReadBuffer")
|
||||
|
||||
/* free OpenCL memory allocations */
|
||||
clStatus = clReleaseMemObject(regionZeroCl);
|
||||
clStatus = clReleaseMemObject(binBaseCl);
|
||||
clStatus = clReleaseMemObject(NbrListLen);
|
||||
clStatus = clReleaseMemObject(NbrList);
|
||||
CHECK_ERROR("clReleaseMemObject")
|
||||
|
||||
clStatus = clReleaseKernel(clKernel);
|
||||
clStatus = clReleaseProgram(clProgram);
|
||||
clStatus = clReleaseCommandQueue(clCommandQueue);
|
||||
clStatus = clReleaseContext(clContext);
|
||||
|
||||
//free((void*)clSource[0]);
|
||||
|
||||
/* transpose regions back into lattice */
|
||||
pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
|
||||
for (k = 0; k < nz; k++) {
|
||||
zRegionIndex = (k >> 3);
|
||||
zOffset = (k & 7);
|
||||
|
||||
for (j = 0; j < ny; j++) {
|
||||
yRegionIndex = (j >> 3);
|
||||
yOffset = (j & 7);
|
||||
|
||||
for (i = 0; i < nx; i++) {
|
||||
xRegionIndex = (i >> 3);
|
||||
xOffset = (i & 7);
|
||||
|
||||
thisRegion = regionZeroAddr
|
||||
+ ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
|
||||
+ xRegionIndex) * REGION_SIZE;
|
||||
|
||||
indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
|
||||
index = (k * ny + j) * nx + i;
|
||||
|
||||
lattice->lattice[index] = thisRegion[indexRegion];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* handle extra atoms */
|
||||
if (extra->size > 0) {
|
||||
printf("computing extra atoms on CPU\n");
|
||||
if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
|
||||
fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
|
||||
"for extra atoms\n");
|
||||
return -1;
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
/* cleanup memory allocations */
|
||||
free(regionZeroAddr);
|
||||
free(binBaseAddr);
|
||||
free(bincntBaseAddr);
|
||||
free_atom(extra);
|
||||
|
||||
return 0;
|
||||
}
|
||||
72
tests/opencl/cutcp/cutoff.h
Normal file
72
tests/opencl/cutcp/cutoff.h
Normal file
@@ -0,0 +1,72 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef CUTOFF_H
|
||||
#define CUTOFF_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define SHIFTED
|
||||
|
||||
/* A structure to record how points in 3D space map to array
|
||||
elements. Array element (z, y, x)
|
||||
where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
|
||||
maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
|
||||
*/
|
||||
typedef struct LatticeDim_t {
|
||||
/* Number of lattice points in x, y, z dimensions */
|
||||
int nx, ny, nz;
|
||||
|
||||
/* Lowest corner of lattice */
|
||||
Vec3 lo;
|
||||
|
||||
/* Lattice spacing */
|
||||
float h;
|
||||
} LatticeDim;
|
||||
|
||||
/* An electric potential field sampled on a regular grid. The
|
||||
lattice size and grid point positions are specified by 'dim'.
|
||||
*/
|
||||
typedef struct Lattice_t {
|
||||
LatticeDim dim;
|
||||
float *lattice;
|
||||
} Lattice;
|
||||
|
||||
LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
|
||||
|
||||
Lattice *create_lattice(LatticeDim dim);
|
||||
void destroy_lattice(Lattice *);
|
||||
|
||||
int gpu_compute_cutoff_potential_lattice(
|
||||
struct pb_TimerSet *timers,
|
||||
Lattice *lattice,
|
||||
float cutoff, /* cutoff distance */
|
||||
Atoms *atom, /* array of atoms */
|
||||
int verbose, /* print info/debug messages */
|
||||
struct pb_Parameters *parameters
|
||||
);
|
||||
|
||||
int cpu_compute_cutoff_potential_lattice(
|
||||
Lattice *lattice, /* the lattice */
|
||||
float cutoff, /* cutoff distance */
|
||||
Atoms *atoms /* array of atoms */
|
||||
);
|
||||
|
||||
int remove_exclusions(
|
||||
Lattice *lattice, /* the lattice */
|
||||
float exclcutoff, /* exclusion cutoff distance */
|
||||
Atoms *atom /* array of atoms */
|
||||
);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* CUTOFF_H */
|
||||
157
tests/opencl/cutcp/excl.c
Normal file
157
tests/opencl/cutcp/excl.c
Normal file
@@ -0,0 +1,157 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <parboil.h>
|
||||
|
||||
#include "atom.h"
|
||||
#include "cutoff.h"
|
||||
|
||||
#define CELLEN 4.f
|
||||
#define INV_CELLEN (1.f/CELLEN)
|
||||
|
||||
extern int remove_exclusions(
|
||||
Lattice *lattice, /* the lattice */
|
||||
float cutoff, /* exclusion cutoff distance */
|
||||
Atoms *atoms /* array of atoms */
|
||||
)
|
||||
{
|
||||
int nx = lattice->dim.nx;
|
||||
int ny = lattice->dim.ny;
|
||||
int nz = lattice->dim.nz;
|
||||
float xlo = lattice->dim.lo.x;
|
||||
float ylo = lattice->dim.lo.y;
|
||||
float zlo = lattice->dim.lo.z;
|
||||
float gridspacing = lattice->dim.h;
|
||||
Atom *atom = atoms->atoms;
|
||||
|
||||
const float a2 = cutoff * cutoff;
|
||||
const float inv_gridspacing = 1.f / gridspacing;
|
||||
const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
|
||||
/* lattice point radius about each atom */
|
||||
|
||||
int n;
|
||||
int i, j, k;
|
||||
int ia, ib, ic;
|
||||
int ja, jb, jc;
|
||||
int ka, kb, kc;
|
||||
int index;
|
||||
int koff, jkoff;
|
||||
|
||||
float x, y, z, q;
|
||||
float dx, dy, dz;
|
||||
float dz2, dydz2, r2;
|
||||
float e;
|
||||
float xstart, ystart;
|
||||
|
||||
float *pg;
|
||||
|
||||
int gindex;
|
||||
int ncell, nxcell, nycell, nzcell;
|
||||
int *first, *next;
|
||||
float inv_cellen = INV_CELLEN;
|
||||
Vec3 minext, maxext;
|
||||
|
||||
/* find min and max extent */
|
||||
get_atom_extent(&minext, &maxext, atoms);
|
||||
|
||||
/* number of cells in each dimension */
|
||||
nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
|
||||
nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
|
||||
nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
|
||||
ncell = nxcell * nycell * nzcell;
|
||||
|
||||
/* allocate for cursor link list implementation */
|
||||
first = (int *) malloc(ncell * sizeof(int));
|
||||
for (gindex = 0; gindex < ncell; gindex++) {
|
||||
first[gindex] = -1;
|
||||
}
|
||||
next = (int *) malloc(atoms->size * sizeof(int));
|
||||
for (n = 0; n < atoms->size; n++) {
|
||||
next[n] = -1;
|
||||
}
|
||||
|
||||
/* geometric hashing */
|
||||
for (n = 0; n < atoms->size; n++) {
|
||||
if (0==atom[n].q) continue; /* skip any non-contributing atoms */
|
||||
i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
|
||||
j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
|
||||
k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
|
||||
gindex = (k*nycell + j)*nxcell + i;
|
||||
next[n] = first[gindex];
|
||||
first[gindex] = n;
|
||||
}
|
||||
|
||||
/* traverse the grid cells */
|
||||
for (gindex = 0; gindex < ncell; gindex++) {
|
||||
for (n = first[gindex]; n != -1; n = next[n]) {
|
||||
x = atom[n].x - xlo;
|
||||
y = atom[n].y - ylo;
|
||||
z = atom[n].z - zlo;
|
||||
q = atom[n].q;
|
||||
|
||||
/* find closest grid point with position less than or equal to atom */
|
||||
ic = (int) (x * inv_gridspacing);
|
||||
jc = (int) (y * inv_gridspacing);
|
||||
kc = (int) (z * inv_gridspacing);
|
||||
|
||||
/* find extent of surrounding box of grid points */
|
||||
ia = ic - radius;
|
||||
ib = ic + radius + 1;
|
||||
ja = jc - radius;
|
||||
jb = jc + radius + 1;
|
||||
ka = kc - radius;
|
||||
kb = kc + radius + 1;
|
||||
|
||||
/* trim box edges so that they are within grid point lattice */
|
||||
if (ia < 0) ia = 0;
|
||||
if (ib >= nx) ib = nx-1;
|
||||
if (ja < 0) ja = 0;
|
||||
if (jb >= ny) jb = ny-1;
|
||||
if (ka < 0) ka = 0;
|
||||
if (kb >= nz) kb = nz-1;
|
||||
|
||||
/* loop over surrounding grid points */
|
||||
xstart = ia*gridspacing - x;
|
||||
ystart = ja*gridspacing - y;
|
||||
dz = ka*gridspacing - z;
|
||||
for (k = ka; k <= kb; k++, dz += gridspacing) {
|
||||
koff = k*ny;
|
||||
dz2 = dz*dz;
|
||||
|
||||
dy = ystart;
|
||||
for (j = ja; j <= jb; j++, dy += gridspacing) {
|
||||
jkoff = (koff + j)*nx;
|
||||
dydz2 = dy*dy + dz2;
|
||||
|
||||
dx = xstart;
|
||||
index = jkoff + ia;
|
||||
pg = lattice->lattice + index;
|
||||
|
||||
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
|
||||
r2 = dx*dx + dydz2;
|
||||
|
||||
/* If atom and lattice point are too close, set the lattice value
|
||||
* to zero */
|
||||
if (r2 < a2) *pg = 0;
|
||||
}
|
||||
}
|
||||
} /* end loop over surrounding grid points */
|
||||
|
||||
} /* end loop over atoms in a gridcell */
|
||||
} /* end loop over gridcells */
|
||||
|
||||
/* free memory */
|
||||
free(next);
|
||||
free(first);
|
||||
|
||||
return 0;
|
||||
}
|
||||
55
tests/opencl/cutcp/gpu_info.c
Normal file
55
tests/opencl/cutcp/gpu_info.c
Normal file
@@ -0,0 +1,55 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
//#include <endian.h>
|
||||
#include <stdlib.h>
|
||||
#include <malloc.h>
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "gpu_info.h"
|
||||
|
||||
/*
 * Choose a 1-D launch configuration for 'task' work items padded to 'pad'
 * threads each.
 *
 * thread[0] receives the work-group size and grid[0] the total thread count.
 * A per-SM thread budget is picked from (major, minor) -- presumably the
 * device compute capability, TODO confirm against the caller.  When
 * task*pad fits in sm * budget the padded size is used as-is; otherwise the
 * group size becomes budget / 8 and the total is rounded up to a multiple
 * of it.
 */
void compute_active_thread(size_t *thread,
                           size_t *grid,
                           int task,
                           int pad,
                           int major,
                           int minor,
                           int sm)
{
  const int max_block = 8;
  int max_thread;
  int group_size;
  int total_threads;

  if (major == 1)
    max_thread = (minor >= 2) ? 1024 : 768;
  else
    max_thread = 1536;  /* major 2, and newer GPUs keep using the 2.0 value */

  if (task * pad <= sm * max_thread) {
    group_size = pad;
    total_threads = task * pad;
  } else {
    group_size = max_thread / max_block;
    total_threads = ((task * pad + group_size - 1) / group_size) * group_size;
  }

  thread[0] = group_size;
  grid[0] = total_threads;
}
|
||||
20
tests/opencl/cutcp/gpu_info.h
Normal file
20
tests/opencl/cutcp/gpu_info.h
Normal file
@@ -0,0 +1,20 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef __GPUINFOH__
#define __GPUINFOH__

/* size_t is used in the prototype below; include its definition here so the
 * header does not depend on what the including file happened to pull in. */
#include <stddef.h>

/*
 * Choose a 1-D launch configuration: thread[0] receives the work-group size
 * and grid[0] the total thread count for 'task' items padded to 'pad'
 * threads each, given the device version (major, minor) and the value 'sm'.
 */
void compute_active_thread(size_t *thread,
                           size_t *grid,
                           int task,
                           int pad,
                           int major,
                           int minor,
                           int sm);

#endif
|
||||
104
tests/opencl/cutcp/kernel.cl
Normal file
104
tests/opencl/cutcp/kernel.cl
Normal file
@@ -0,0 +1,104 @@
|
||||
/*
|
||||
* potential lattice is decomposed into size 8^3 lattice point "regions"
|
||||
*
|
||||
* THIS IMPLEMENTATION: one thread per lattice point
|
||||
* thread block size 128 gives 4 thread blocks per region
|
||||
* kernel is invoked for each x-y plane of regions,
|
||||
* where gridDim.x is 4*(x region dimension) so that blockIdx.x
|
||||
* can absorb the z sub-region index in its 2 lowest order bits
|
||||
*
|
||||
* Regions are stored contiguously in memory in row-major order
|
||||
*
|
||||
* The bins have to not only cover the region, but they need to surround
|
||||
* the outer edges so that region sides and corners can still use
|
||||
* neighbor list stencil. The binZeroAddr is actually a shifted pointer into
|
||||
* the bin array (binZeroAddr = binBaseAddr + (c*binDim_y + c)*binDim_x + c)
|
||||
* where c = ceil(cutoff / binsize). This allows for negative offsets to
|
||||
* be added to myBinIndex.
|
||||
*
|
||||
* The (0,0,0) spatial origin corresponds to lower left corner of both
|
||||
* regionZeroAddr and binZeroAddr. The atom coordinates are translated
|
||||
* during binning to enforce this assumption.
|
||||
*/
|
||||
|
||||
#include "macros.h"
|
||||
|
||||
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
|
||||
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
|
||||
typedef int4 xyz;
|
||||
|
||||
__kernel void opencl_cutoff_potential_lattice(
|
||||
int binDim_x,
|
||||
int binDim_y,
|
||||
__global float4 *binBaseAddr,
|
||||
int offset,
|
||||
float h, /* lattice spacing */
|
||||
float cutoff2, /* square of cutoff distance */
|
||||
float inv_cutoff2,
|
||||
__global float *regionZeroAddr, /* address of lattice regions starting at origin */
|
||||
int zRegionIndex,
|
||||
__constant int *NbrListLen,
|
||||
__constant xyz *NbrList
|
||||
)
|
||||
{
|
||||
__global float4* binZeroAddr = binBaseAddr + offset;
|
||||
|
||||
__global float *myRegionAddr;
|
||||
int Bx, By, Bz;
|
||||
|
||||
/* thread id */
|
||||
const int tid = (get_local_id(2)*get_local_size(1) +
|
||||
get_local_id(1))*get_local_size(0) + get_local_id(0);
|
||||
|
||||
/* this is the start of the sub-region indexed by tid */
|
||||
myRegionAddr = regionZeroAddr + ((zRegionIndex*get_num_groups(1)
|
||||
+ get_group_id(1))*(get_num_groups(0)>>2) + (get_group_id(0)>>2))*REGION_SIZE
|
||||
+ (get_group_id(0)&3)*SUB_REGION_SIZE;
|
||||
|
||||
/* spatial coordinate of this lattice point */
|
||||
float x = (8 * (get_group_id(0) >> 2) + get_local_id(0)) * h;
|
||||
float y = (8 * get_group_id(1) + get_local_id(1)) * h;
|
||||
float z = (8 * zRegionIndex + 2*(get_group_id(0)&3) + get_local_id(2)) * h;
|
||||
|
||||
float dx;
|
||||
float dy;
|
||||
float dz;
|
||||
float r2;
|
||||
float s;
|
||||
|
||||
int totalbins = 0;
|
||||
|
||||
/* bin number determined by center of region */
|
||||
Bx = (int) floor((8 * (get_group_id(0) >> 2) + 4) * h * BIN_INVLEN);
|
||||
By = (int) floor((8 * get_group_id(1) + 4) * h * BIN_INVLEN);
|
||||
Bz = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);
|
||||
|
||||
float energy = 0.f;
|
||||
int bincnt;
|
||||
for (bincnt = 0; bincnt < *NbrListLen; bincnt++) {
|
||||
int i = Bx + NbrList[bincnt].x;
|
||||
int j = By + NbrList[bincnt].y;
|
||||
int k = Bz + NbrList[bincnt].z;
|
||||
|
||||
__global float4* p_global = binZeroAddr +
|
||||
(((k*binDim_y + j)*binDim_x + i) * BIN_DEPTH);
|
||||
|
||||
int m;
|
||||
for (m = 0; m < BIN_DEPTH; m++) {
|
||||
float aq = p_global[m].w;
|
||||
if (0.f != aq) {
|
||||
dx = p_global[m].x - x;
|
||||
dy = p_global[m].y - y;
|
||||
dz = p_global[m].z - z;
|
||||
r2 = dx*dx + dy*dy + dz*dz;
|
||||
if (r2 < cutoff2) {
|
||||
s = (1.f - r2 * inv_cutoff2);
|
||||
energy += aq * rsqrt(r2) * s * s;
|
||||
}
|
||||
}
|
||||
} /* end loop over atoms in bin */
|
||||
} /* end loop over neighbor list */
|
||||
|
||||
/* store into global memory */
|
||||
myRegionAddr[tid+0] = energy;
|
||||
}
|
||||
BIN
tests/opencl/cutcp/libcutcp.a
Normal file
BIN
tests/opencl/cutcp/libcutcp.a
Normal file
Binary file not shown.
69
tests/opencl/cutcp/macros.h
Normal file
69
tests/opencl/cutcp/macros.h
Normal file
@@ -0,0 +1,69 @@
|
||||
#ifndef __MACROSH__
#define __MACROSH__

/*
 * Debug helpers.  With __DEVICE_EMULATION__ defined, EMU(code) runs 'code'
 * only for the block/thread selected by BX, BY, TX, TY, TZ and the INT /
 * FLOAT / INT3 / FLOAT4 macros print a named value.  Without it they all
 * expand to nothing.
 */
#ifdef __DEVICE_EMULATION__
#define DEBUG
/* which grid block and which thread to examine */
#define BX  0
#define BY  0
#define TX  0
#define TY  0
#define TZ  0
#define EMU(code) do { \
  if (blockIdx.x==BX && blockIdx.y==BY && \
      threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
    code; \
  } \
} while (0)
#define INT(n)    printf("%s = %d\n", #n, n)
#define FLOAT(f)  printf("%s = %g\n", #f, (double)(f))
#define INT3(n)   printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
    (double)(f).y, (double)(f).z, (double)(f).w)
#else
#define EMU(code)
#define INT(n)
#define FLOAT(f)
#define INT3(n)
#define FLOAT4(f)
#endif

/*
 * Report an OpenCL error: if the variable 'clStatus' in the calling scope is
 * not CL_SUCCESS, print the message and line number and exit(1).
 * The expansion is a bare if-block, so it is used without a trailing
 * semicolon.
 */
#define CHECK_ERROR(errorMessage)           \
  if(clStatus != CL_SUCCESS)                \
  {                                         \
     printf("Error: %s!\n",errorMessage);   \
     printf("Line: %d\n",__LINE__);         \
     exit(1);                               \
  }

/*
 * Neighbor list: a table of bin offsets kept in constant memory; the kernel
 * turns each offset into a flat bin index.
 *
 * Enough room is reserved for an 11^3 stencil of grid cells, which fits
 * within 16K of memory.
 */
#define NBRLIST_DIM  11
#define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)

/*
 * Atom bins cached into shared memory for processing.
 *
 * This reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
 * should permit scheduling of up to 3 thread blocks per SM.
 */
#define BIN_DEPTH         8  /* max number of atoms per bin */
#define BIN_SIZE         32  /* size of bin in floats */
#define BIN_CACHE_MAXLEN 32  /* max number of atom bins to cache */

#define BIN_LENGTH      4.f  /* spatial length in Angstroms */
#define BIN_INVLEN  (1.f / BIN_LENGTH)
/* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
 * so that bin fill should be 80% (for non-empty regions of space) */

#define REGION_SIZE     512  /* number of floats in lattice region */
#define SUB_REGION_SIZE 128  /* number of floats in lattice sub-region */

#endif
|
||||
194
tests/opencl/cutcp/main.cc
Normal file
194
tests/opencl/cutcp/main.cc
Normal file
@@ -0,0 +1,194 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <parboil.h>
|
||||
|
||||
#include "atom.h"
|
||||
#include "cutoff.h"
|
||||
#include "output.h"
|
||||
|
||||
#define ERRTOL 1e-4f
|
||||
|
||||
#define NOKERNELS 0
|
||||
#define CUTOFF1 1
|
||||
#define CUTOFF6 32
|
||||
#define CUTOFF6OVERLAP 64
|
||||
#define CUTOFFCPU 16384
|
||||
|
||||
|
||||
/*
 * Append one "<size> <time>" record (time with three decimals) to the file
 * 'filename'.  Returns 0 on success, or -1 after printing a message when the
 * file cannot be opened for appending.
 */
int appenddata(const char *filename, int size, double time) {
  FILE *out = fopen(filename, "a");

  if (!out) {
    printf("error appending to file %s..\n", filename);
    return -1;
  }

  fprintf(out, "%d  %.3f\n", size, time);
  fclose(out);
  return 0;
}
|
||||
|
||||
LatticeDim
|
||||
lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
|
||||
{
|
||||
LatticeDim ret;
|
||||
|
||||
ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
|
||||
ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
|
||||
ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
|
||||
ret.lo = lo;
|
||||
ret.h = h;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
Lattice *
|
||||
create_lattice(LatticeDim dim)
|
||||
{
|
||||
int size;
|
||||
Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
|
||||
|
||||
if (lat == NULL) {
|
||||
fprintf(stderr, "Out of memory\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
lat->dim = dim;
|
||||
|
||||
/* Round up the allocated size to a multiple of 8 */
|
||||
size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7;
|
||||
lat->lattice = (float *)calloc(size, sizeof(float));
|
||||
|
||||
if (lat->lattice == NULL) {
|
||||
fprintf(stderr, "Out of memory\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
return lat;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
destroy_lattice(Lattice *lat)
|
||||
{
|
||||
if (lat) {
|
||||
free(lat->lattice);
|
||||
free(lat);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
Atoms *atom;
|
||||
|
||||
LatticeDim lattice_dim;
|
||||
Lattice *gpu_lattice;
|
||||
Vec3 min_ext, max_ext; /* Bounding box of atoms */
|
||||
Vec3 lo, hi; /* Bounding box with padding */
|
||||
|
||||
float h = 0.5f; /* Lattice spacing */
|
||||
float cutoff = 12.f; /* Cutoff radius */
|
||||
float exclcutoff = 1.f; /* Radius for exclusion */
|
||||
float padding = 0.5f; /* Bounding box padding distance */
|
||||
|
||||
int n;
|
||||
|
||||
struct pb_Parameters *parameters;
|
||||
struct pb_TimerSet timers;
|
||||
|
||||
/* Read input parameters */
|
||||
parameters = pb_ReadParameters(&argc, argv);
|
||||
if (parameters == NULL) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
parameters->inpFiles = (char **)malloc(sizeof(char *) * 2);
|
||||
parameters->inpFiles[0] = (char *)malloc(100);
|
||||
parameters->inpFiles[1] = NULL;
|
||||
strncpy(parameters->inpFiles[0], "watbox.sl40.pqr", 100);
|
||||
|
||||
/* Expect one input file */
|
||||
if (pb_Parameters_CountInputs(parameters) != 1) {
|
||||
fprintf(stderr, "Expecting one input file\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
pb_InitializeTimerSet(&timers);
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
{
|
||||
const char *pqrfilename = parameters->inpFiles[0];
|
||||
|
||||
if (!(atom = read_atom_file(pqrfilename))) {
|
||||
fprintf(stderr, "read_atom_file() failed\n");
|
||||
exit(1);
|
||||
}
|
||||
printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
|
||||
}
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
/* find extent of domain */
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
get_atom_extent(&min_ext, &max_ext, atom);
|
||||
printf("extent of domain is:\n");
|
||||
printf(" minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
|
||||
printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
|
||||
|
||||
printf("padding domain by %g Angstroms\n", padding);
|
||||
lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
|
||||
hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
|
||||
printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
|
||||
|
||||
lattice_dim = lattice_from_bounding_box(lo, hi, h);
|
||||
gpu_lattice = create_lattice(lattice_dim);
|
||||
printf("\n");
|
||||
|
||||
/*
|
||||
* Run OpenCL kernel
|
||||
* (Begin and end with COMPUTE timer active)
|
||||
*/
|
||||
if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0, parameters)) {
|
||||
fprintf(stderr, "Computation failed\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Zero the lattice points that are too close to an atom. This is
|
||||
* necessary for numerical stability.
|
||||
*/
|
||||
if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
|
||||
fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||
|
||||
/* Print output */
|
||||
if (parameters->outFile) {
|
||||
//write_lattice_summary(parameters->outFile, gpu_lattice);
|
||||
}
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
|
||||
/* Cleanup */
|
||||
destroy_lattice(gpu_lattice);
|
||||
free_atom(atom);
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
|
||||
pb_PrintTimerSet(&timers);
|
||||
pb_FreeParameters(parameters);
|
||||
|
||||
return 0;
|
||||
}
|
||||
49
tests/opencl/cutcp/ocl.c
Normal file
49
tests/opencl/cutcp/ocl.c
Normal file
@@ -0,0 +1,49 @@
|
||||
#include <CL/cl.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "ocl.h"
|
||||
|
||||
/*
 * Read the whole file 'fileName' into a freshly malloc'ed, NUL-terminated
 * buffer and return it; the caller owns the buffer.
 *
 * On failure a short message is printed and the process exits with status 1:
 *   "Error 1!"  the file cannot be opened
 *   "Error 2!"  the buffer cannot be allocated
 *   "Error 3!"  the size cannot be determined or the read comes up short
 */
char* readFile(const char* fileName)
{
  /* Binary mode: the byte count reported by ftell() must match what fread()
   * delivers, which text mode does not guarantee where newlines are
   * translated. */
  FILE* fp = fopen(fileName,"rb");
  if(fp == NULL)
  {
    printf("Error 1!\n");
    exit(1);
  }

  /* ftell() reports -1 on failure; never let that reach malloc()/fread(). */
  long size = -1;
  if(fseek(fp,0,SEEK_END) == 0)
  {
    size = ftell(fp);
  }
  if(size < 0)
  {
    printf("Error 3!\n");
    fclose(fp);
    exit(1);
  }
  rewind(fp);

  char* buffer = (char*)malloc(sizeof(char)*((size_t)size+1));
  if(buffer == NULL)
  {
    printf("Error 2!\n");
    fclose(fp);
    exit(1);
  }

  size_t res = fread(buffer,1,(size_t)size,fp);
  if(res != (size_t)size)
  {
    printf("Error 3!\n");
    free(buffer);
    fclose(fp);
    exit(1);
  }

  buffer[size] = 0;
  fclose(fp);
  return buffer;
}
|
||||
|
||||
/*
 * Fill the first 'size' bytes of the device buffer 'buf' with the byte value
 * 'val'.  A host staging buffer of 'size' bytes is filled with memset() and
 * copied with a blocking clEnqueueWriteBuffer().  Exits with status 1 if the
 * staging buffer cannot be allocated or the write fails.
 */
void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
{
  cl_int clStatus;
  char* temp = (char*)malloc(size);

  /* memset() on a failed allocation would write through a NULL pointer */
  if(temp == NULL)
  {
    printf("Error: %s!\n","malloc");
    printf("Line: %d\n",__LINE__);
    exit(1);
  }

  memset(temp,val,size);
  clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
  CHECK_ERROR("clEnqueueWriteBuffer")
  free(temp);
}
|
||||
17
tests/opencl/cutcp/ocl.h
Normal file
17
tests/opencl/cutcp/ocl.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#ifndef __OCLH__
|
||||
#define __OCLH__
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
void clMemSet(cl_command_queue, cl_mem, int, size_t);
|
||||
char* readFile(const char*);
|
||||
|
||||
#define CHECK_ERROR(errorMessage) \
|
||||
if(clStatus != CL_SUCCESS) \
|
||||
{ \
|
||||
printf("Error: %s!\n",errorMessage); \
|
||||
printf("Line: %d\n",__LINE__); \
|
||||
exit(1); \
|
||||
}
|
||||
|
||||
#endif
|
||||
67
tests/opencl/cutcp/output.c
Normal file
67
tests/opencl/cutcp/output.c
Normal file
@@ -0,0 +1,67 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <parboil.h>
|
||||
|
||||
#include "atom.h"
|
||||
#include "cutoff.h"
|
||||
|
||||
void
|
||||
write_lattice_summary(const char *filename, Lattice *lattice)
|
||||
{
|
||||
float *lattice_data = lattice->lattice;
|
||||
int nx = lattice->dim.nx;
|
||||
int ny = lattice->dim.ny;
|
||||
int nz = lattice->dim.nz;
|
||||
|
||||
/* Open output file */
|
||||
FILE *outfile = fopen(filename, "w");
|
||||
|
||||
if (outfile == NULL) {
|
||||
fprintf(stderr, "Cannot open output file\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* Write the sum of the the absolute values of all lattice potentials */
|
||||
{
|
||||
double abspotential = 0.0;
|
||||
float tmp;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nx * ny * nz; i++)
|
||||
abspotential += fabs((double) lattice_data[i]);
|
||||
|
||||
tmp = (float) abspotential;
|
||||
|
||||
fwrite(&tmp, 1, sizeof(float), outfile);
|
||||
}
|
||||
|
||||
/* Write the size of a lattice plane */
|
||||
{
|
||||
uint32_t tmp;
|
||||
|
||||
tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
|
||||
fwrite(&tmp, 1, sizeof(uint32_t), outfile);
|
||||
}
|
||||
|
||||
/* Write the plane of lattice data at z=0 and z = nz-1 */
|
||||
{
|
||||
int plane_size = nx * ny;
|
||||
|
||||
fwrite(lattice_data, plane_size, sizeof(float), outfile);
|
||||
fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
|
||||
outfile);
|
||||
}
|
||||
|
||||
/* Cleanup */
|
||||
fclose(outfile);
|
||||
}
|
||||
25
tests/opencl/cutcp/output.h
Normal file
25
tests/opencl/cutcp/output.h
Normal file
@@ -0,0 +1,25 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef OUTPUT_H
#define OUTPUT_H

#include "cutoff.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Write a summary of 'lattice' to the file named 'filename'. */
void write_lattice_summary(const char *filename, Lattice *lattice);

#ifdef __cplusplus
}
#endif

#endif
|
||||
348
tests/opencl/cutcp/parboil.h
Normal file
348
tests/opencl/cutcp/parboil.h
Normal file
@@ -0,0 +1,348 @@
|
||||
/*
|
||||
* (c) 2010 The Board of Trustees of the University of Illinois.
|
||||
*/
|
||||
#ifndef PARBOIL_HEADER
|
||||
#define PARBOIL_HEADER
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
/* A platform as specified by the user on the command line. */
struct pb_PlatformParam {
  char *name;     /* platform name; this string is owned */
  char *version;  /* platform version, may be NULL; this string is owned */
};

/*
 * Create a PlatformParam from the given strings.
 * 'name' must not be NULL; 'version' may be NULL.
 * Non-NULL strings should have been allocated by malloc(); they become
 * owned by the returned object.
 */
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version);

void
pb_FreePlatformParam(struct pb_PlatformParam *);
|
||||
|
||||
/* A criterion for how to select a device */
|
||||
enum pb_DeviceSelectionCriterion {
|
||||
pb_Device_INDEX, /* Enumerate the devices and select one
|
||||
* by its number */
|
||||
pb_Device_CPU, /* Select a CPU device */
|
||||
pb_Device_GPU, /* Select a GPU device */
|
||||
pb_Device_ACCELERATOR, /* Select an accelerator device */
|
||||
pb_Device_NAME /* Select a device by name */
|
||||
};
|
||||
|
||||
/* A device as specified by the user on the command line */
|
||||
struct pb_DeviceParam {
|
||||
enum pb_DeviceSelectionCriterion criterion;
|
||||
union {
|
||||
int index; /* If criterion == pb_Device_INDEX,
|
||||
* the index of the device */
|
||||
char *name; /* If criterion == pb_Device_NAME,
|
||||
* the name of the device.
|
||||
* This string is owned. */
|
||||
};
|
||||
};
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_index(int index);
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_cpu(void);
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_gpu(void);
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_accelerator(void);
|
||||
|
||||
/* Create a by-name device selection criterion.
|
||||
* The string should have been allocated by malloc(), and it will be
|
||||
* owned by the returned object.
|
||||
*/
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_name(char *name);
|
||||
|
||||
void
|
||||
pb_FreeDeviceParam(struct pb_DeviceParam *);
|
||||
|
||||
/* Command line parameters for benchmarks */
|
||||
struct pb_Parameters {
|
||||
char *outFile; /* If not NULL, the raw output of the
|
||||
* computation should be saved to this
|
||||
* file. The string is owned. */
|
||||
char **inpFiles; /* A NULL-terminated array of strings
|
||||
* holding the input file(s) for the
|
||||
* computation. The array and strings
|
||||
* are owned. */
|
||||
struct pb_PlatformParam *platform; /* If not NULL, the platform
|
||||
* specified on the command line. */
|
||||
struct pb_DeviceParam *device; /* If not NULL, the device
|
||||
* specified on the command line. */
|
||||
};
|
||||
|
||||
/* Read command-line parameters.
|
||||
*
|
||||
* The argc and argv parameters to main are read, and any parameters
|
||||
* interpreted by this function are removed from the argument list.
|
||||
*
|
||||
* A new instance of struct pb_Parameters is returned.
|
||||
* If there is an error, then an error message is printed on stderr
|
||||
* and NULL is returned.
|
||||
*/
|
||||
struct pb_Parameters *
|
||||
pb_ReadParameters(int *_argc, char **argv);
|
||||
|
||||
/* Free an instance of struct pb_Parameters.
|
||||
*/
|
||||
void
|
||||
pb_FreeParameters(struct pb_Parameters *p);
|
||||
|
||||
void
|
||||
pb_FreeStringArray(char **);
|
||||
|
||||
/* Count the number of input files in a pb_Parameters instance.
|
||||
*/
|
||||
int
|
||||
pb_Parameters_CountInputs(struct pb_Parameters *p);
|
||||
|
||||
/* A time or duration. */
|
||||
//#if _POSIX_VERSION >= 200112L
|
||||
typedef unsigned long long pb_Timestamp; /* time in microseconds */
|
||||
//#else
|
||||
//# error "Timestamps not implemented"
|
||||
//#endif
|
||||
|
||||
enum pb_TimerState {
|
||||
pb_Timer_STOPPED,
|
||||
pb_Timer_RUNNING,
|
||||
};
|
||||
|
||||
struct pb_Timer {
|
||||
enum pb_TimerState state;
|
||||
pb_Timestamp elapsed; /* Amount of time elapsed so far */
|
||||
pb_Timestamp init; /* Beginning of the current time interval,
|
||||
* if state is RUNNING. End of the last
|
||||
* recorded time interval otherwise. */
|
||||
};
|
||||
|
||||
/* Reset a timer.
|
||||
* Use this to initialize a timer or to clear
|
||||
* its elapsed time. The reset timer is stopped.
|
||||
*/
|
||||
void
|
||||
pb_ResetTimer(struct pb_Timer *timer);
|
||||
|
||||
/* Start a timer. The timer is set to RUNNING mode and
|
||||
* time elapsed while the timer is running is added to
|
||||
* the timer.
|
||||
* The timer should not already be running.
|
||||
*/
|
||||
void
|
||||
pb_StartTimer(struct pb_Timer *timer);
|
||||
|
||||
/* Stop a timer.
|
||||
* This stops adding elapsed time to the timer.
|
||||
* The timer should not already be stopped.
|
||||
*/
|
||||
void
|
||||
pb_StopTimer(struct pb_Timer *timer);
|
||||
|
||||
/* Get the elapsed time in seconds. */
|
||||
double
|
||||
pb_GetElapsedTime(struct pb_Timer *timer);
|
||||
|
||||
/* Execution time is assigned to one of these categories. */
|
||||
enum pb_TimerID {
|
||||
pb_TimerID_NONE = 0,
|
||||
pb_TimerID_IO, /* Time spent in input/output */
|
||||
pb_TimerID_KERNEL, /* Time spent computing on the device,
|
||||
* recorded asynchronously */
|
||||
pb_TimerID_COPY, /* Time spent synchronously moving data
|
||||
* to/from device and allocating/freeing
|
||||
* memory on the device */
|
||||
pb_TimerID_DRIVER, /* Time spent in the host interacting with the
|
||||
* driver, primarily for recording the time
|
||||
* spent queueing asynchronous operations */
|
||||
pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
|
||||
pb_TimerID_COMPUTE, /* Time for all program execution other
|
||||
* than parsing command line arguments,
|
||||
* I/O, kernel, and copy */
|
||||
pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and
|
||||
* host activity: automatically filled in,
|
||||
* not intended for direct usage */
|
||||
pb_TimerID_LAST /* Number of timer IDs */
|
||||
};
|
||||
|
||||
/* Dynamic list of asynchronously tracked times between events */
|
||||
struct pb_async_time_marker_list {
|
||||
char *label; // actually just a pointer to a string
|
||||
enum pb_TimerID timerID; /* The ID to which the interval beginning
|
||||
* with this marker should be attributed */
|
||||
void * marker;
|
||||
//cudaEvent_t marker; /* The driver event for this marker */
|
||||
struct pb_async_time_marker_list *next;
|
||||
};
|
||||
|
||||
struct pb_SubTimer {
|
||||
char *label;
|
||||
struct pb_Timer timer;
|
||||
struct pb_SubTimer *next;
|
||||
};
|
||||
|
||||
struct pb_SubTimerList {
|
||||
struct pb_SubTimer *current;
|
||||
struct pb_SubTimer *subtimer_list;
|
||||
};
|
||||
|
||||
/* A set of timers for recording execution times. */
|
||||
struct pb_TimerSet {
|
||||
enum pb_TimerID current;
|
||||
struct pb_async_time_marker_list* async_markers;
|
||||
pb_Timestamp async_begin;
|
||||
pb_Timestamp wall_begin;
|
||||
struct pb_Timer timers[pb_TimerID_LAST];
|
||||
struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
|
||||
};
|
||||
|
||||
/* Reset all timers in the set. */
|
||||
void
|
||||
pb_InitializeTimerSet(struct pb_TimerSet *timers);
|
||||
|
||||
void
|
||||
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
|
||||
|
||||
/* Select which timer the next interval of time should be accounted
|
||||
* to. The selected timer is started and other timers are stopped.
|
||||
* Using pb_TimerID_NONE stops all timers. */
|
||||
void
|
||||
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
|
||||
|
||||
void
|
||||
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
|
||||
|
||||
/* Print timer values to standard output. */
|
||||
void
|
||||
pb_PrintTimerSet(struct pb_TimerSet *timers);
|
||||
|
||||
/* Release timer resources */
|
||||
void
|
||||
pb_DestroyTimerSet(struct pb_TimerSet * timers);
|
||||
|
||||
void
|
||||
pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
|
||||
|
||||
|
||||
typedef struct pb_Device_tag {
|
||||
char* name;
|
||||
void* clDevice;
|
||||
int id;
|
||||
unsigned int in_use;
|
||||
unsigned int available;
|
||||
} pb_Device;
|
||||
|
||||
struct pb_Context_tag;
|
||||
typedef struct pb_Context_tag pb_Context;
|
||||
|
||||
typedef struct pb_Platform_tag {
|
||||
char* name;
|
||||
char* version;
|
||||
void* clPlatform;
|
||||
unsigned int in_use;
|
||||
pb_Context** contexts;
|
||||
pb_Device** devices;
|
||||
} pb_Platform;
|
||||
|
||||
struct pb_Context_tag {
|
||||
void* clPlatformId;
|
||||
void* clContext;
|
||||
void* clDeviceId;
|
||||
pb_Platform* pb_platform;
|
||||
pb_Device* pb_device;
|
||||
};
|
||||
|
||||
// verbosely print out list of platforms and their devices to the console.
|
||||
pb_Platform**
|
||||
pb_GetPlatforms();
|
||||
|
||||
// Choose a platform according to the given platform specification
|
||||
pb_Platform*
|
||||
pb_GetPlatform(struct pb_PlatformParam *platform);
|
||||
|
||||
// choose a platform: by name, name & version
|
||||
pb_Platform*
|
||||
pb_GetPlatformByName(const char* name);
|
||||
|
||||
pb_Platform*
|
||||
pb_GetPlatformByNameAndVersion(const char* name, const char* version);
|
||||
|
||||
// Choose a device according to the given device specification
|
||||
pb_Device*
|
||||
pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
|
||||
|
||||
pb_Device**
|
||||
pb_GetDevices(pb_Platform* pb_platform);
|
||||
|
||||
// choose a device by name.
|
||||
pb_Device*
|
||||
pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
|
||||
|
||||
pb_Platform*
|
||||
pb_GetPlatformByEnvVars();
|
||||
|
||||
pb_Context*
|
||||
pb_InitOpenCLContext(struct pb_Parameters* parameters);
|
||||
|
||||
void
|
||||
pb_ReleasePlatforms();
|
||||
|
||||
void
|
||||
pb_ReleaseContext(pb_Context* c);
|
||||
|
||||
void
|
||||
pb_PrintPlatformInfo(pb_Context* c);
|
||||
|
||||
void
|
||||
perf_init();
|
||||
|
||||
//#define MEASURE_KERNEL_TIME
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#ifdef MEASURE_KERNEL_TIME
|
||||
#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
|
||||
cl_int
|
||||
pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
|
||||
cl_kernel /* kernel */,
|
||||
cl_uint /* work_dim */,
|
||||
const size_t * /* global_work_offset */,
|
||||
const size_t * /* global_work_size */,
|
||||
const size_t * /* local_work_size */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */);
|
||||
#endif
|
||||
|
||||
enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
|
||||
void pb_sig_float(char*, float*, int);
|
||||
void pb_sig_double(char*, double*, int);
|
||||
void pb_sig_short(char*, short*, int);
|
||||
void pb_sig_int(char*, int*, int);
|
||||
void pb_sig_uchar(char*, unsigned char*, unsigned int);
|
||||
void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //PARBOIL_HEADER
|
||||
|
||||
1394
tests/opencl/cutcp/parboil_opencl.c
Normal file
1394
tests/opencl/cutcp/parboil_opencl.c
Normal file
File diff suppressed because it is too large
Load Diff
139
tests/opencl/cutcp/readatom.c
Normal file
139
tests/opencl/cutcp/readatom.c
Normal file
@@ -0,0 +1,139 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include "atom.h"
|
||||
|
||||
|
||||
#define LINELEN 96
|
||||
#define INITLEN 20
|
||||
|
||||
|
||||
Atoms *read_atom_file(const char *fname)
|
||||
{
|
||||
FILE *file;
|
||||
char line[LINELEN];
|
||||
|
||||
Atom *atom; /* Atom array */
|
||||
int len = INITLEN; /* Size of atom array */
|
||||
int cnt = 0; /* Number of atoms read */
|
||||
|
||||
/* allocate initial atom array */
|
||||
atom = (Atom *) malloc(len * sizeof(Atom));
|
||||
if (NULL==atom) {
|
||||
fprintf(stderr, "can't allocate memory\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int i;
|
||||
for (i = 0; i < len; ++i) {
|
||||
atom[i].x = i+0;
|
||||
atom[i].y = i+1;
|
||||
atom[i].z = i+2;
|
||||
atom[i].q = 1;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* open atom "pqr" file */
|
||||
file = fopen(fname, "r");
|
||||
if (NULL==file) {
|
||||
fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* loop to read pqr file line by line */
|
||||
while (fgets(line, LINELEN, file) != NULL) {
|
||||
|
||||
if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
|
||||
continue; /* skip anything that isn't an atom record */
|
||||
}
|
||||
|
||||
if (cnt==len) { /* extend atom array */
|
||||
void *tmp = realloc(atom, 2*len*sizeof(Atom));
|
||||
if (NULL==tmp) {
|
||||
fprintf(stderr, "can't allocate more memory\n");
|
||||
return NULL;
|
||||
}
|
||||
atom = (Atom *) tmp;
|
||||
len *= 2;
|
||||
}
|
||||
|
||||
/* read position coordinates and charge from atom record */
|
||||
if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
|
||||
&(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
|
||||
fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
cnt++; /* count atoms as we store them */
|
||||
}
|
||||
|
||||
/* verify EOF and close file */
|
||||
if ( !feof(file) ) {
|
||||
fprintf(stderr, "did not find EOF\n");
|
||||
return NULL;
|
||||
}
|
||||
if (fclose(file)) {
|
||||
fprintf(stderr, "can't close file\n");
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Build the output data structure */
|
||||
{
|
||||
Atoms *out = (Atoms *)malloc(sizeof(Atoms));
|
||||
|
||||
if (NULL == out) {
|
||||
fprintf(stderr, "can't allocate memory\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
out->size = cnt;
|
||||
out->atoms = atom;
|
||||
|
||||
return out;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
 * Release an Atoms object: first the atom array it holds, then the object
 * itself.  A NULL argument is accepted and ignored.
 */
void free_atom(Atoms *atom)
{
  if (NULL == atom) {
    return;
  }

  free(atom->atoms);
  free(atom);
}
|
||||
|
||||
void
|
||||
get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
|
||||
{
|
||||
Atom *atoms = atom->atoms;
|
||||
int natoms = atom->size;
|
||||
Vec3 lo;
|
||||
Vec3 hi;
|
||||
int n;
|
||||
|
||||
hi.x = lo.x = atoms[0].x;
|
||||
hi.y = lo.y = atoms[0].y;
|
||||
hi.z = lo.z = atoms[0].z;
|
||||
|
||||
for (n = 1; n < natoms; n++) {
|
||||
lo.x = fminf(lo.x, atoms[n].x);
|
||||
hi.x = fmaxf(hi.x, atoms[n].x);
|
||||
lo.y = fminf(lo.y, atoms[n].y);
|
||||
hi.y = fmaxf(hi.y, atoms[n].y);
|
||||
lo.z = fminf(lo.z, atoms[n].z);
|
||||
hi.z = fmaxf(hi.z, atoms[n].z);
|
||||
}
|
||||
|
||||
*out_lo = lo;
|
||||
*out_hi = hi;
|
||||
}
|
||||
5945
tests/opencl/cutcp/watbox.sl40.pqr
Executable file
5945
tests/opencl/cutcp/watbox.sl40.pqr
Executable file
File diff suppressed because it is too large
Load Diff
1674
tests/opencl/guassian/Fan1.dump
Normal file
1674
tests/opencl/guassian/Fan1.dump
Normal file
File diff suppressed because it is too large
Load Diff
1891
tests/opencl/guassian/Fan2.dump
Normal file
1891
tests/opencl/guassian/Fan2.dump
Normal file
File diff suppressed because it is too large
Load Diff
62
tests/opencl/guassian/Makefile
Normal file
62
tests/opencl/guassian/Makefile
Normal file
@@ -0,0 +1,62 @@
|
||||
# Build and run rules for the "guassian" OpenCL test.
# Every path below may be overridden from the environment or the command line.
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime

VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)

# Kernel-side flags. The double quotes are part of the value on purpose: they
# make the shell pass each flag list to poclcc as a single argument.
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -float-abi=hard -code-model=small"
K_CFLAGS   += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS  += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"

# Host-side flags.
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
#CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors

CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
CXXFLAGS += -Wno-unused-variable -Wno-narrowing -Wno-unused-result -Wno-unused-but-set-variable

CXXFLAGS += -I$(POCL_RT_PATH)/include

LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex

PROJECT = guassian

SRCS = main.cc clutils.cpp utils.cpp

all: $(PROJECT) kernel.pocl

# These names are commands, not files; without this a stray file called
# "clean" or "all" would silently disable the rule.
.PHONY: all clean clean-all run-fpga run-asesim run-vlsim run-simx run-rtlsim

# Remove a target whose recipe failed so a truncated file is never mistaken
# for an up-to-date one.
.DELETE_ON_ERROR:

# Compile the OpenCL kernel with poclcc.
kernel.pocl: kernel.cl
	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl

# Host program: all sources are compiled and linked in one step.
$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

# The run-* targets differ only in which driver directory is put on
# LD_LIBRARY_PATH.
run-fpga: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-asesim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-vlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-simx: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-rtlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

# NOTE(review): -MM writes rules for object files, but $(PROJECT) is built
# straight from $(SRCS); the included rules therefore do not appear to trigger
# a rebuild when a header changes. Confirm whether object targets were intended.
.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.pocl *.dump

# Skip dependency info when the only goals are cleaning goals; otherwise make
# would first regenerate .depend just to delete it again.
ifeq ($(MAKECMDGOALS),)
-include .depend
else ifneq ($(filter-out clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif
|
||||
241
tests/opencl/guassian/OriginalParallel.c
Executable file
241
tests/opencl/guassian/OriginalParallel.c
Executable file
@@ -0,0 +1,241 @@
|
||||
/*-----------------------------------------------------------
|
||||
** ge_p.c -- The program is to solve a linear system Ax = b
|
||||
** by using Gaussian Elimination. The algorithm on page 101
|
||||
** ("Foundations of Parallel Programming") is used.
|
||||
** The sequential version is ge_s.c. This parallel
|
||||
** implementation converts three independent for() loops
|
||||
** into three Fans. Use the data file ge_3.dat to verify
|
||||
** the correctness of the output.
|
||||
**
|
||||
** Written by Andreas Kura, 02/15/95
|
||||
** Modified by Chong-wei Xu, /04/20/95
|
||||
**-----------------------------------------------------------
|
||||
*/
|
||||
#include <us.h>
#include <stdio.h>
#include <stdlib.h>
|
||||
|
||||
int Size, t;
|
||||
float **a, *b;
|
||||
BEGIN_SHARED_DECL
|
||||
float **m;
|
||||
END_SHARED_DECL;
|
||||
FILE *fp;
|
||||
|
||||
void InitProblemOnce();
|
||||
void InitPerRun();
|
||||
void ForwardSub();
|
||||
void Fan1();
|
||||
void Fan2();
|
||||
void Fan3();
|
||||
void InitMat();
|
||||
void InitAry();
|
||||
void PrintMat();
|
||||
void PrintAry();
|
||||
|
||||
main ()
|
||||
{
|
||||
InitializeUs();
|
||||
MakeSharedVariables; /* to make SHARED m */
|
||||
|
||||
InitProblemOnce();
|
||||
InitPerRun();
|
||||
ForwardSub();
|
||||
|
||||
printf("The result of matrix m is: \n");
|
||||
PrintMat(SHARED m, Size, Size);
|
||||
printf("The result of matrix a is: \n");
|
||||
PrintMat(a, Size, Size);
|
||||
printf("The result of array b is: \n");
|
||||
PrintAry(b, Size);
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** InitProblemOnce -- Initialize all of matrices and
|
||||
** vectors by opening a data file specified by the user.
|
||||
**
|
||||
** We used dynamic array **a, *b, and **m to allocate
|
||||
** the memory storages.
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void InitProblemOnce()
|
||||
{
|
||||
char filename[30];
|
||||
|
||||
printf("Enter the data file name: ");
|
||||
scanf("%s", filename);
|
||||
printf("The file name is: %s\n", filename);
|
||||
|
||||
fp = fopen(filename, "r");
|
||||
|
||||
fscanf(fp, "%d", &Size);
|
||||
a = (float **) UsAllocScatterMatrix(Size, Size, sizeof(float));
|
||||
/*
|
||||
a = (float **) malloc(Size * sizeof(float *));
|
||||
for (i=0; i<Size; i++) {
|
||||
a[i] = (float *) malloc(Size * sizeof(float));
|
||||
}
|
||||
*/
|
||||
InitMat(a, Size, Size);
|
||||
printf("The input matrix a is:\n");
|
||||
PrintMat(a, Size, Size);
|
||||
|
||||
b = (float *) UsAlloc(Size * sizeof(float));
|
||||
/*
|
||||
b = (float *) malloc(Size * sizeof(float));
|
||||
*/
|
||||
InitAry(b, Size);
|
||||
printf("The input array b is:\n");
|
||||
PrintAry(b, Size);
|
||||
|
||||
SHARED m = (float **) UsAllocScatterMatrix(Size, Size, sizeof(float));
|
||||
/*
|
||||
m = (float **) malloc(Size * sizeof(float *));
|
||||
for (i=0; i<Size; i++) {
|
||||
m[i] = (float *) malloc(Size * sizeof(float));
|
||||
}
|
||||
*/
|
||||
|
||||
Share(&Size);
|
||||
Share(&a);
|
||||
Share(&b);
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** InitPerRun() -- Initialize the contents of the
|
||||
** multiplier matrix **m
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void InitPerRun()
|
||||
{
|
||||
int i, j;
|
||||
|
||||
for (i=0; i<Size; i++)
|
||||
for (j=0; j<Size; j++)
|
||||
SHARED m[i][j] = 0.0;
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** ForwardSub() -- Forward substitution of Gaussian
|
||||
** elimination.
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void ForwardSub()
|
||||
{
|
||||
for (t=0; t<(Size-1); t++) {
|
||||
Share(&t);
|
||||
GenOnI(Fan1, Size-1-t); /* t=0 to (Size-2), the range is
|
||||
** Size-2-t+1 = Size-1-t
|
||||
*/
|
||||
GenOnA(Fan2, Size-1-t, Size-t);
|
||||
GenOnI(Fan3, Size-1-t);
|
||||
}
|
||||
}
|
||||
|
||||
/*-------------------------------------------------------
|
||||
** Fan1() -- Calculate multiplier matrix
|
||||
** Pay attention to the index. Index i gives the range
|
||||
** which starts from 0 to range-1. The real values of
|
||||
** the index should be adjusted according to the value
|
||||
** of t which is defined on the ForwardSub().
|
||||
**-------------------------------------------------------
|
||||
*/
|
||||
void Fan1(dummy, i)
|
||||
int dummy, i;
|
||||
{
|
||||
/* Use these printf() to display the nodes and index */
|
||||
printf("from node #%d\n", PhysProcToUsProc(Proc_Node));
|
||||
SHARED m[i+t+1][t] = a[i+t+1][t] / a[t][t];
|
||||
printf("i=%d, a[%d][%d]=%.2f, a[%d][%d]=%.2f, m[%d][%d]=%.2f\n",
|
||||
(i+t+1),t,t,a[t][t],(i+t+1),t,a[i+t+1][t],(i+t+1),t,
|
||||
SHARED m[i+t+1][t]);
|
||||
}
|
||||
|
||||
/*-------------------------------------------------------
|
||||
** Fan2() -- Modify the matrix A into LUD
|
||||
**-------------------------------------------------------
|
||||
*/
|
||||
void Fan2(dummy, i, j)
|
||||
int dummy, i, j;
|
||||
{
|
||||
a[i+1+t][j+t] -= SHARED m[i+1+t][t] * a[t][j+t];
|
||||
Share (&a);
|
||||
}
|
||||
|
||||
/*-------------------------------------------------------
|
||||
** Fan3() -- Modify the array b
|
||||
**-------------------------------------------------------
|
||||
*/
|
||||
void Fan3(dummy, i)
|
||||
int dummy, i;
|
||||
{
|
||||
b[i+1+t] -= SHARED m[i+1+t][t] * b[t];
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** InitMat() -- Initialize the matrix by reading data
|
||||
** from the data file
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void InitMat(ary, nrow, ncol)
|
||||
float **ary;
|
||||
int nrow, ncol;
|
||||
{
|
||||
int i, j;
|
||||
|
||||
for (i=0; i<nrow; i++) {
|
||||
for (j=0; j<ncol; j++) {
|
||||
fscanf(fp, "%f", &ary[i][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** PrintMat() -- Print the contents of the matrix
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
/*
 * Print the nrow x ncol matrix 'ary' to stdout, one row per line with each
 * element formatted as "%8.2f ", followed by a blank line.
 */
void PrintMat(ary, nrow, ncol)
float **ary;
int nrow, ncol;
{
    int r;
    int c;

    r = 0;
    while (r < nrow) {
        c = 0;
        while (c < ncol) {
            printf("%8.2f ", ary[r][c]);
            c++;
        }
        printf("\n");
        r++;
    }
    printf("\n");
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** InitAry() -- Initialize the array (vector) by reading
|
||||
** data from the data file
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void InitAry(ary, ary_size)
|
||||
float *ary;
|
||||
int ary_size;
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i=0; i<ary_size; i++) {
|
||||
fscanf(fp, "%f", &ary[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** PrintAry() -- Print the contents of the array (vector)
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
/*
 * Print the first ary_size elements of 'ary' to stdout on one line, each
 * formatted as "%.2f ", followed by a newline.
 */
void PrintAry(ary, ary_size)
float *ary;
int ary_size;
{
    int k;

    k = 0;
    while (k < ary_size) {
        printf("%.2f ", ary[k]);
        k++;
    }
    printf("\n");
}
|
||||
60
tests/opencl/guassian/README.txt
Executable file
60
tests/opencl/guassian/README.txt
Executable file
@@ -0,0 +1,60 @@
|
||||
The Gaussian Elimination application solves systems of equations using the
|
||||
Gaussian elimination method.
|
||||
|
||||
The application analyzes an n x n matrix and an associated 1 x n vector to solve a
|
||||
set of n equations in n unknowns. The matrix and vector describe equations
|
||||
of the form:
|
||||
|
||||
a0x + b0y + c0z + d0w = e0
|
||||
a1x + b1y + c1z + d1w = e1
|
||||
a2x + b2y + c2z + d2w = e2
|
||||
a3x + b3y + c3z + d3w = e3
|
||||
|
||||
where in this case n=4. The matrix for the above equations would be as follows:
|
||||
|
||||
[a0 b0 c0 d0]
|
||||
[a1 b1 c1 d1]
|
||||
[a2 b2 c2 d2]
|
||||
[a3 b3 c3 d3]
|
||||
|
||||
and the vector would be:
|
||||
|
||||
[e0]
|
||||
[e1]
|
||||
[e2]
|
||||
[e3]
|
||||
|
||||
The application creates a solution vector:
|
||||
|
||||
[x]
|
||||
[y]
|
||||
[z]
|
||||
[w]
|
||||
|
||||
|
||||
The Makefile may need to be adjusted for different machines, but it was written for Mac OS X and
|
||||
Linux with either NVIDIA or AMD OpenCL SDKs.
|
||||
|
||||
Additional input files can be created with the matrixGenerator.py file in the data folder.
|
||||
|
||||
Gaussian Elimination Usage
|
||||
|
||||
gaussianElimination [filename] [-hqt] [-p [int] -d [int]]
|
||||
|
||||
example:
|
||||
$ ./gaussianElimination matrix4.txt
|
||||
|
||||
filename the filename that holds the matrix data
|
||||
|
||||
-h, --help Display the help file
|
||||
-q Quiet mode. Suppress all text output.
|
||||
-t Print timing information.
|
||||
|
||||
-p [int] Choose the platform (must choose both platform and device)
|
||||
-d [int] Choose the device (must choose both platform and device)
|
||||
|
||||
|
||||
Notes: 1. The filename is required as the first parameter.
|
||||
2. If you declare either the device or the platform,
|
||||
you must declare both.
|
||||
|
||||
1457
tests/opencl/guassian/clutils.cpp
Executable file
1457
tests/opencl/guassian/clutils.cpp
Executable file
File diff suppressed because it is too large
Load Diff
281
tests/opencl/guassian/clutils.h
Executable file
281
tests/opencl/guassian/clutils.h
Executable file
@@ -0,0 +1,281 @@
|
||||
/****************************************************************************\
|
||||
* Copyright (c) 2011, Advanced Micro Devices, Inc. *
|
||||
* All rights reserved. *
|
||||
* *
|
||||
* Redistribution and use in source and binary forms, with or without *
|
||||
* modification, are permitted provided that the following conditions *
|
||||
* are met: *
|
||||
* *
|
||||
* Redistributions of source code must retain the above copyright notice, *
|
||||
* this list of conditions and the following disclaimer. *
|
||||
* *
|
||||
* Redistributions in binary form must reproduce the above copyright notice, *
|
||||
* this list of conditions and the following disclaimer in the documentation *
|
||||
* and/or other materials provided with the distribution. *
|
||||
* *
|
||||
* Neither the name of the copyright holder nor the names of its contributors *
|
||||
* may be used to endorse or promote products derived from this software *
|
||||
* without specific prior written permission. *
|
||||
* *
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
|
||||
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR *
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
|
||||
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
|
||||
* *
|
||||
* If you use the software (in whole or in part), you shall adhere to all *
|
||||
* applicable U.S., European, and other export laws, including but not *
|
||||
* limited to the U.S. Export Administration Regulations (EAR), (15 C.F.R. *
|
||||
* Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 *
|
||||
* of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you *
|
||||
* hereby certify that, except pursuant to a license granted by the United *
|
||||
* States Department of Commerce Bureau of Industry and Security or as *
|
||||
* otherwise permitted pursuant to a License Exception under the U.S. Export *
|
||||
* Administration Regulations ("EAR"), you will not (1) export, re-export or *
|
||||
* release to a national of a country in Country Groups D:1, E:1 or E:2 any *
|
||||
* restricted technology, software, or source code you receive hereunder, *
|
||||
* or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such *
|
||||
* technology or software, if such foreign produced direct product is subject *
|
||||
* to national security controls as identified on the Commerce Control List *
|
||||
*(currently found in Supplement 1 to Part 774 of EAR). For the most current *
|
||||
* Country Group listings, or for additional information about the EAR or *
|
||||
* your obligations under those regulations, please refer to the U.S. Bureau *
|
||||
* of Industry and Securitys website at http://www.bis.doc.gov/. *
|
||||
\****************************************************************************/
|
||||
|
||||
#ifndef __CL_UTILS_H__
|
||||
#define __CL_UTILS_H__
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
// The cl_time type is OS specific
|
||||
#ifdef _WIN32
|
||||
#include <tchar.h>
|
||||
#include <Windows.h>
|
||||
typedef __int64 cl_time;
|
||||
#else
|
||||
#include <sys/time.h>
|
||||
typedef double cl_time;
|
||||
#endif
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Initialization and Cleanup
|
||||
//-------------------------------------------------------
|
||||
|
||||
// Detects platforms and devices, creates context and command queue
|
||||
cl_context cl_init(char devicePreference='\0');
|
||||
|
||||
// Creates a context given a platform and a device
|
||||
cl_context cl_init_context(int platform,int dev,int quiet=0);
|
||||
|
||||
// Releases resources used by clutils
|
||||
void cl_cleanup();
|
||||
|
||||
// Releases a kernel object
|
||||
void cl_freeKernel(cl_kernel kernel);
|
||||
|
||||
// Releases a memory object
|
||||
void cl_freeMem(cl_mem mem);
|
||||
|
||||
// Releases a program object
|
||||
void cl_freeProgram(cl_program program);
|
||||
|
||||
// Returns the global command queue
|
||||
cl_command_queue cl_getCommandQueue();
|
||||
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Synchronization functions
|
||||
//-------------------------------------------------------
|
||||
|
||||
// Performs a clFinish on the command queue
|
||||
void cl_sync();
|
||||
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Memory allocation
|
||||
//-------------------------------------------------------
|
||||
|
||||
// Allocates a regular buffer on the device
|
||||
cl_mem cl_allocBuffer(size_t mem_size,
|
||||
cl_mem_flags flags = CL_MEM_READ_WRITE);
|
||||
|
||||
// XXX I don't think this does exactly what we want it to do
|
||||
// Allocates a read-only buffer and transfers the data
|
||||
cl_mem cl_allocBufferConst(size_t mem_size, void* host_ptr);
|
||||
|
||||
// Allocates pinned memory on the host
|
||||
cl_mem cl_allocBufferPinned(size_t mem_size);
|
||||
|
||||
// Allocates an image on the device
|
||||
cl_mem cl_allocImage(size_t height, size_t width, char type,
|
||||
cl_mem_flags flags = CL_MEM_READ_WRITE);
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Data transfers
|
||||
//-------------------------------------------------------
|
||||
|
||||
// Copies a buffer from the device to pinned memory on the host and
|
||||
// maps it so it can be read
|
||||
void* cl_copyAndMapBuffer(cl_mem dst, cl_mem src, size_t size);
|
||||
|
||||
// Copies from one buffer to another
|
||||
void cl_copyBufferToBuffer(cl_mem dst, cl_mem src, size_t size);
|
||||
|
||||
// Copies data to a buffer on the device
|
||||
void cl_copyBufferToDevice(cl_mem dst, void *src, size_t mem_size,
|
||||
cl_bool blocking = CL_TRUE);
|
||||
|
||||
// Copies data to an image on the device
|
||||
void cl_copyImageToDevice(cl_mem dst, void* src, size_t height, size_t width);
|
||||
|
||||
// Copies an image from the device to the host
|
||||
void cl_copyImageToHost(void* dst, cl_mem src, size_t height, size_t width);
|
||||
|
||||
// Copies data from a device buffer to the host
|
||||
void cl_copyBufferToHost(void *dst, cl_mem src, size_t mem_size,
|
||||
cl_bool blocking = CL_TRUE);
|
||||
|
||||
// Copies data from a buffer on the device to an image on the device
|
||||
void cl_copyBufferToImage(cl_mem src, cl_mem dst, int height, int width);
|
||||
|
||||
// Maps a buffer
|
||||
void* cl_mapBuffer(cl_mem mem, size_t mem_size, cl_mem_flags flags);
|
||||
|
||||
// Unmaps a buffer
|
||||
void cl_unmapBuffer(cl_mem mem, void *ptr);
|
||||
|
||||
// Writes data to a zero-copy buffer on the device
|
||||
void cl_writeToZCBuffer(cl_mem mem, void* data, size_t size);
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Program and kernels
|
||||
//-------------------------------------------------------
|
||||
|
||||
// Compiles a program
|
||||
cl_program cl_compileProgram(char* kernelPath, char* compileoptions,
|
||||
bool verboseoptions = 0);
|
||||
|
||||
// Creates a kernel
|
||||
cl_kernel cl_createKernel(cl_program program, const char* kernelName);
|
||||
|
||||
|
||||
// Sets a kernel argument
|
||||
void cl_setKernelArg(cl_kernel kernel, unsigned int index, size_t size,
|
||||
void* data);
|
||||
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Profiling/events
|
||||
//-------------------------------------------------------
|
||||
|
||||
// Computes the execution time (start to end) for an event
|
||||
double cl_computeExecTime(cl_event);
|
||||
|
||||
// Compute the elapsed time between two CPU timer values
|
||||
double cl_computeTime(cl_time start, cl_time end);
|
||||
|
||||
// Creates an event from CPU timers
|
||||
void cl_createUserEvent(cl_time start, cl_time end, char* desc);
|
||||
|
||||
// Disable logging of events
|
||||
void cl_disableEvents();
|
||||
|
||||
// Enable logging of events
|
||||
void cl_enableEvents();
|
||||
|
||||
// Query the current system time
|
||||
void cl_getTime(cl_time* time);
|
||||
|
||||
// Calls a function which prints events to the terminal
|
||||
void cl_printEvents();
|
||||
|
||||
// Calls a function which writes the events to a file
|
||||
void cl_writeEventsToFile(char* path);
|
||||
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Error handling
|
||||
//-------------------------------------------------------
|
||||
|
||||
// Compare a status value to CL_SUCCESS and optionally exit on error
|
||||
int cl_errChk(const cl_int status, const char *msg, bool exitOnErr);
|
||||
|
||||
// Queries the supported image formats for the device and prints
|
||||
// them to the screen
|
||||
void printSupportedImageFormats();
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Platform and device information
|
||||
//-------------------------------------------------------
|
||||
|
||||
bool cl_deviceIsAMD(cl_device_id dev=NULL);
|
||||
bool cl_deviceIsNVIDIA(cl_device_id dev=NULL);
|
||||
bool cl_platformIsNVIDIA(cl_platform_id plat=NULL);
|
||||
char* cl_getDeviceDriverVersion(cl_device_id dev=NULL);
|
||||
char* cl_getDeviceName(cl_device_id dev=NULL);
|
||||
char* cl_getDeviceVendor(cl_device_id dev=NULL);
|
||||
char* cl_getDeviceVersion(cl_device_id dev=NULL);
|
||||
char* cl_getPlatformName(cl_platform_id platform);
|
||||
char* cl_getPlatformVendor(cl_platform_id platform);
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Utility functions
|
||||
//-------------------------------------------------------
|
||||
|
||||
char* catStringWithInt(const char* str, int integer);
|
||||
|
||||
char* itoa_portable(int value, char* result, int base);
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Data types
|
||||
//-------------------------------------------------------
|
||||
// Two-component integer vector (x, y).
typedef struct {
    int x, y;
} int2;
|
||||
|
||||
// Two-component single-precision vector (x, y).
typedef struct {
    float x, y;
} float2;
|
||||
|
||||
// Four-component single-precision vector (x, y, z, w).
typedef struct {
    float x, y, z, w;
} float4;
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Defines
|
||||
//-------------------------------------------------------
|
||||
|
||||
#define MAX_ERR_VAL 64
|
||||
|
||||
#define NUM_PROGRAMS 7
|
||||
|
||||
#define NUM_KERNELS 13
|
||||
#define KERNEL_INIT_DET 0
|
||||
#define KERNEL_BUILD_DET 1
|
||||
#define KERNEL_SURF_DESC 2
|
||||
#define KERNEL_NORM_DESC 3
|
||||
#define KERNEL_NON_MAX_SUP 4
|
||||
#define KERNEL_GET_ORIENT1 5
|
||||
#define KERNEL_GET_ORIENT2 6
|
||||
#define KERNEL_NN 7
|
||||
#define KERNEL_SCAN 8
|
||||
#define KERNEL_SCAN4 9
|
||||
#define KERNEL_TRANSPOSE 10
|
||||
#define KERNEL_SCANIMAGE 11
|
||||
#define KERNEL_TRANSPOSEIMAGE 12
|
||||
|
||||
#endif
|
||||
40
tests/opencl/guassian/gaussianElim.h
Executable file
40
tests/opencl/guassian/gaussianElim.h
Executable file
@@ -0,0 +1,40 @@
|
||||
#ifndef _GAUSSIANELIM
|
||||
#define _GAUSSIANELIM
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <float.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <algorithm>
|
||||
|
||||
#include "clutils.h"
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
float *OpenClGaussianElimination(
|
||||
cl_context context,
|
||||
int timing);
|
||||
|
||||
void printUsage();
|
||||
int parseCommandline(int argc, char *argv[], char* filename,
|
||||
int *q, int *t, int *p, int *d);
|
||||
|
||||
void InitPerRun(int size,float *m);
|
||||
void ForwardSub(cl_context context, float *a, float *b, float *m, int size,int timing);
|
||||
void BackSub(float *a, float *b, float *finalVec, int size);
|
||||
void Fan1(float *m, float *a, int Size, int t);
|
||||
void Fan2(float *m, float *a, float *b,int Size, int j1, int t);
|
||||
//void Fan3(float *m, float *b, int Size, int t);
|
||||
void InitMat(FILE *fp, int size, float *ary, int nrow, int ncol);
|
||||
void InitAry(FILE *fp, float *ary, int ary_size);
|
||||
void PrintMat(float *ary, int size, int nrow, int ncolumn);
|
||||
void PrintAry(float *ary, int ary_size);
|
||||
float eventTime(cl_event event,cl_command_queue command_queue);
|
||||
#endif
|
||||
74
tests/opencl/guassian/gettimeofday.cpp
Executable file
74
tests/opencl/guassian/gettimeofday.cpp
Executable file
@@ -0,0 +1,74 @@
|
||||
#include "stdio.h"
|
||||
#include <time.h>
|
||||
#include <windows.h>
|
||||
#include <iostream>
|
||||
//using namespace System;
|
||||
using namespace std;
|
||||
|
||||
#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
|
||||
#else
|
||||
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||
#endif
|
||||
|
||||
// Time zone description filled in by gettimeofday() below.
struct timezone
{
    int tz_minuteswest; // minutes west of Greenwich
    int tz_dsttime;     // type of DST correction
};
|
||||
|
||||
|
||||
// Definition of a gettimeofday function
|
||||
int gettimeofday(struct timeval *tv, struct timezone *tz)
|
||||
{
|
||||
// Define a structure to receive the current Windows filetime
|
||||
FILETIME ft;
|
||||
|
||||
// Initialize the present time to 0 and the timezone to UTC
|
||||
unsigned __int64 tmpres = 0;
|
||||
static int tzflag = 0;
|
||||
|
||||
if (NULL != tv)
|
||||
{
|
||||
GetSystemTimeAsFileTime(&ft);
|
||||
|
||||
// The GetSystemTimeAsFileTime returns the number of 100 nanosecond
|
||||
// intervals since Jan 1, 1601 in a structure. Copy the high bits to
|
||||
// the 64 bit tmpres, shift it left by 32 then or in the low 32 bits.
|
||||
tmpres |= ft.dwHighDateTime;
|
||||
tmpres <<= 32;
|
||||
tmpres |= ft.dwLowDateTime;
|
||||
|
||||
// Convert to microseconds by dividing by 10
|
||||
tmpres /= 10;
|
||||
|
||||
// The Unix epoch starts on Jan 1 1970. Need to subtract the difference
|
||||
// in seconds from Jan 1 1601.
|
||||
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||
|
||||
// Finally change microseconds to seconds and place in the seconds value.
|
||||
// The modulus picks up the microseconds.
|
||||
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||
}
|
||||
|
||||
if (NULL != tz)
|
||||
{
|
||||
if (!tzflag)
|
||||
{
|
||||
_tzset();
|
||||
tzflag++;
|
||||
}
|
||||
|
||||
// Adjust for the timezone west of Greenwich
|
||||
long seconds_diff;
|
||||
_get_timezone(&seconds_diff);
|
||||
tz->tz_minuteswest = seconds_diff / 60;
|
||||
int hours_offset;
|
||||
_get_daylight(&hours_offset);
|
||||
tz->tz_dsttime = hours_offset;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
17
tests/opencl/guassian/gettimeofday.h
Executable file
17
tests/opencl/guassian/gettimeofday.h
Executable file
@@ -0,0 +1,17 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <WinSock.h>
|
||||
/**
|
||||
Based on code seen at.
|
||||
|
||||
http://www.winehq.org/pipermail/wine-devel/2003-June/018082.html
|
||||
|
||||
http://msdn.microsoft.com/en-us/library/ms740560
|
||||
|
||||
*/
|
||||
int gettimeofday(struct timeval *tv, struct timezone *tz);
|
||||
#else
|
||||
#include <sys/time.h>
|
||||
#endif
|
||||
|
||||
|
||||
49
tests/opencl/guassian/kernel.cl
Executable file
49
tests/opencl/guassian/kernel.cl
Executable file
@@ -0,0 +1,49 @@
|
||||
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
|
||||
|
||||
/* Latitude/longitude pair. Not referenced by the Fan1/Fan2 kernels below. */
typedef struct latLong
{
    float lat, lng;
} LatLong;
|
||||
|
||||
/*
 * Fan1 -- compute the multipliers for elimination step t.
 *
 * Work-item g (dimension 0) handles row (g + t + 1) and stores
 *   m[row][t] = a[row][t] / a[t][t]
 * Matrices are row-major with `size` columns. b_dev is not used here.
 */
__kernel void Fan1(__global float *m_dev,
                   __global float *a_dev,
                   __global float *b_dev,
                   const int size,
                   const int t) {
    const int gid = get_global_id(0);

    /* Only the rows below the pivot row take part. */
    if (gid < size - 1 - t) {
        const int row = gid + t + 1;
        m_dev[size * row + t] = a_dev[size * row + t] / a_dev[size * t + t];
    }
}
|
||||
|
||||
|
||||
/*
 * Fan2 -- apply elimination step t to the matrix and right-hand side.
 *
 * Launched over a 2-D range. Dimension 0 selects the row below the pivot
 * (row = id0 + t + 1) and dimension 1 selects the column from the pivot
 * column onwards (col = id1 + t):
 *   a[row][col] -= m[row][t] * a[t][col]
 * The work-item with id1 == 0 additionally updates the right-hand side:
 *   b[row]      -= m[row][t] * b[t]
 * Matrices are row-major with `size` columns.
 */
__kernel void Fan2(__global float *m_dev,
                   __global float *a_dev,
                   __global float *b_dev,
                   const int size,
                   const int t) {
    const int globalIdx = get_global_id(0);
    const int globalIdy = get_global_id(1);

    if (globalIdx < size - 1 - t && globalIdy < size - t) {
        const int row = globalIdx + 1 + t;
        const int col = globalIdy + t;

        a_dev[size * row + col] -= m_dev[size * row + t] * a_dev[size * t + col];

        /* One work-item per row (col == t) updates the right-hand side. */
        if (globalIdy == 0) {
            b_dev[row] -= m_dev[size * row + col] * b_dev[t];
        }
    }
}
|
||||
BIN
tests/opencl/guassian/kernel.pocl
Normal file
BIN
tests/opencl/guassian/kernel.pocl
Normal file
Binary file not shown.
422
tests/opencl/guassian/main.cc
Executable file
422
tests/opencl/guassian/main.cc
Executable file
@@ -0,0 +1,422 @@
|
||||
#ifndef __GAUSSIAN_ELIMINATION__
|
||||
#define __GAUSSIAN_ELIMINATION__
|
||||
|
||||
#include "gaussianElim.h"
|
||||
|
||||
cl_context context = NULL;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
printf("enter demo main\n");
|
||||
float *a = NULL, *b = NULL, *finalVec = NULL;
|
||||
float *m = NULL;
|
||||
int size;
|
||||
|
||||
FILE *fp;
|
||||
|
||||
// args
|
||||
char filename[100];
|
||||
int quiet = 0, timing = 0, platform = -1, device = -1;
|
||||
|
||||
// parse command line
|
||||
if (parseCommandline(argc, argv, filename, &quiet, &timing, &platform,
|
||||
&device)) {
|
||||
printUsage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
context = cl_init_context(platform, device, quiet);
|
||||
|
||||
fp = fopen(filename, "r");
|
||||
fscanf(fp, "%d", &size);
|
||||
|
||||
a = (float *)malloc(size * size * sizeof(float));
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
InitMat(fp, size, a, size, size);
|
||||
// printf("The input matrix a is:\n");
|
||||
// PrintMat(a, size, size, size);
|
||||
b = (float *)malloc(size * sizeof(float));
|
||||
|
||||
InitAry(fp, b, size);
|
||||
// printf("The input array b is:\n");
|
||||
// PrintAry(b, size);
|
||||
|
||||
// create the solution matrix
|
||||
m = (float *)malloc(size * size * sizeof(float));
|
||||
|
||||
// create a new vector to hold the final answer
|
||||
finalVec = (float *)malloc(size * sizeof(float));
|
||||
|
||||
InitPerRun(size, m);
|
||||
|
||||
// begin timing
|
||||
|
||||
// run kernels
|
||||
ForwardSub(context, a, b, m, size, timing);
|
||||
|
||||
// end timing
|
||||
if (!quiet) {
|
||||
printf("The result of matrix m is: \n");
|
||||
|
||||
PrintMat(m, size, size, size);
|
||||
printf("The result of matrix a is: \n");
|
||||
PrintMat(a, size, size, size);
|
||||
printf("The result of array b is: \n");
|
||||
PrintAry(b, size);
|
||||
|
||||
BackSub(a, b, finalVec, size);
|
||||
printf("The final solution is: \n");
|
||||
PrintAry(finalVec, size);
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
free(m);
|
||||
free(a);
|
||||
free(b);
|
||||
free(finalVec);
|
||||
// OpenClGaussianElimination(context,timing);
|
||||
|
||||
cl_cleanup();
|
||||
|
||||
printf("Passed!\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** ForwardSub() -- Forward substitution of Gaussian
|
||||
** elimination.
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
|
||||
int timing) {
|
||||
// 1. set up kernels
|
||||
cl_kernel fan1_kernel, fan2_kernel;
|
||||
cl_int status = 0;
|
||||
cl_program gaussianElim_program;
|
||||
cl_event writeEvent, kernelEvent, readEvent;
|
||||
float writeTime = 0, readTime = 0, kernelTime = 0;
|
||||
float writeMB = 0, readMB = 0;
|
||||
|
||||
gaussianElim_program = cl_compileProgram((char *)"gaussianElim_kernels.cl", NULL);
|
||||
|
||||
fan1_kernel = clCreateKernel(gaussianElim_program, "Fan1", &status);
|
||||
status = cl_errChk(status, (char *)"Error Creating Fan1 kernel", true);
|
||||
if (status)
|
||||
exit(1);
|
||||
|
||||
fan2_kernel = clCreateKernel(gaussianElim_program, "Fan2", &status);
|
||||
status = cl_errChk(status, (char *)"Error Creating Fan2 kernel", true);
|
||||
if (status)
|
||||
exit(1);
|
||||
|
||||
// 2. set up memory on device and send ipts data to device
|
||||
|
||||
cl_mem a_dev, b_dev, m_dev;
|
||||
|
||||
cl_int error = 0;
|
||||
|
||||
a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
sizeof(float) * size * size, NULL, &error);
|
||||
|
||||
b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * size, NULL,
|
||||
&error);
|
||||
|
||||
m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
sizeof(float) * size * size, NULL, &error);
|
||||
|
||||
cl_command_queue command_queue = cl_getCommandQueue();
|
||||
|
||||
error = clEnqueueWriteBuffer(command_queue, a_dev,
|
||||
1, // change to 0 for nonblocking write
|
||||
0, // offset
|
||||
sizeof(float) * size * size, a, 0, NULL,
|
||||
&writeEvent);
|
||||
|
||||
if (timing)
|
||||
writeTime += eventTime(writeEvent, command_queue);
|
||||
clReleaseEvent(writeEvent);
|
||||
|
||||
error = clEnqueueWriteBuffer(command_queue, b_dev,
|
||||
1, // change to 0 for nonblocking write
|
||||
0, // offset
|
||||
sizeof(float) * size, b, 0, NULL, &writeEvent);
|
||||
if (timing)
|
||||
writeTime += eventTime(writeEvent, command_queue);
|
||||
clReleaseEvent(writeEvent);
|
||||
|
||||
error = clEnqueueWriteBuffer(command_queue,
|
||||
m_dev,
|
||||
1, // change to 0 for nonblocking write
|
||||
0, // offset
|
||||
sizeof(float) * size * size, m, 0, NULL,
|
||||
&writeEvent);
|
||||
if (timing)
|
||||
writeTime += eventTime(writeEvent, command_queue);
|
||||
clReleaseEvent(writeEvent);
|
||||
writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);
|
||||
|
||||
// 3. Determine block sizes
|
||||
size_t globalWorksizeFan1[1];
|
||||
size_t globalWorksizeFan2[2];
|
||||
|
||||
globalWorksizeFan1[0] = size;
|
||||
globalWorksizeFan2[0] = size;
|
||||
globalWorksizeFan2[1] = size;
|
||||
|
||||
int t;
|
||||
// 4. Setup and Run kernels
|
||||
for (t = 0; t < (size - 1); t++) {
|
||||
// kernel args
|
||||
cl_int argchk;
|
||||
argchk = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
|
||||
argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
|
||||
argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
|
||||
argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size);
|
||||
argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t);
|
||||
|
||||
cl_errChk(argchk, "ERROR in Setting Fan1 kernel args", true);
|
||||
|
||||
// launch kernel
|
||||
error =
|
||||
clEnqueueNDRangeKernel(command_queue, fan1_kernel, 1, 0,
|
||||
globalWorksizeFan1, NULL, 0, NULL, &kernelEvent);
|
||||
|
||||
cl_errChk(error, "ERROR in Executing Fan1 Kernel", true);
|
||||
if (timing) {
|
||||
// printf("here1a\n");
|
||||
kernelTime += eventTime(kernelEvent, command_queue);
|
||||
// printf("here1b\n");
|
||||
}
|
||||
clReleaseEvent(kernelEvent);
|
||||
// Fan1<<<dimGrid,dimBlock>>>(m_cuda,a_cuda,Size,t);
|
||||
// cudaThreadSynchronize();
|
||||
|
||||
// kernel args
|
||||
argchk = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
|
||||
argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
|
||||
argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
|
||||
argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size);
|
||||
argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t);
|
||||
|
||||
cl_errChk(argchk, "ERROR in Setting Fan2 kernel args", true);
|
||||
|
||||
// launch kernel
|
||||
error =
|
||||
clEnqueueNDRangeKernel(command_queue, fan2_kernel, 2, 0,
|
||||
globalWorksizeFan2, NULL, 0, NULL, &kernelEvent);
|
||||
|
||||
cl_errChk(error, "ERROR in Executing Fan1 Kernel", true);
|
||||
if (timing) {
|
||||
// printf("here2a\n");
|
||||
kernelTime += eventTime(kernelEvent, command_queue);
|
||||
// printf("here2b\n");
|
||||
}
|
||||
clReleaseEvent(kernelEvent);
|
||||
// Fan2<<<dimGridXY,dimBlockXY>>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t);
|
||||
// cudaThreadSynchronize();
|
||||
}
|
||||
// 5. transfer data off of device
|
||||
error =
|
||||
clEnqueueReadBuffer(command_queue, a_dev,
|
||||
1, // change to 0 for nonblocking write
|
||||
0, // offset
|
||||
sizeof(float) * size * size, a, 0, NULL, &readEvent);
|
||||
|
||||
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
|
||||
if (timing)
|
||||
readTime += eventTime(readEvent, command_queue);
|
||||
clReleaseEvent(readEvent);
|
||||
|
||||
error = clEnqueueReadBuffer(command_queue, b_dev,
|
||||
1, // change to 0 for nonblocking write
|
||||
0, // offset
|
||||
sizeof(float) * size, b, 0, NULL, &readEvent);
|
||||
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
|
||||
if (timing)
|
||||
readTime += eventTime(readEvent, command_queue);
|
||||
clReleaseEvent(readEvent);
|
||||
|
||||
error =
|
||||
clEnqueueReadBuffer(command_queue, m_dev,
|
||||
1, // change to 0 for nonblocking write
|
||||
0, // offset
|
||||
sizeof(float) * size * size, m, 0, NULL, &readEvent);
|
||||
|
||||
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
|
||||
if (timing)
|
||||
readTime += eventTime(readEvent, command_queue);
|
||||
clReleaseEvent(readEvent);
|
||||
readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);
|
||||
|
||||
if (timing) {
|
||||
printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s) "
|
||||
"[size]\t\tTotal(s)\n");
|
||||
printf("%dx%d \t", size, size);
|
||||
|
||||
printf("%f [%.2fMB]\t", writeTime, writeMB);
|
||||
|
||||
printf("%f\t", kernelTime);
|
||||
|
||||
printf("%f [%.2fMB]\t", readTime, readMB);
|
||||
|
||||
printf("%f\n\n", writeTime + kernelTime + readTime);
|
||||
}
|
||||
|
||||
cl_freeMem(a_dev);
|
||||
cl_freeMem(b_dev);
|
||||
cl_freeMem(m_dev);
|
||||
cl_freeKernel(fan1_kernel);
|
||||
cl_freeKernel(fan2_kernel);
|
||||
cl_freeProgram(gaussianElim_program);
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
 ** eventTime() -- Return the time between the start and
 ** end profiling counters of an event, divided by 1e9
 ** (the caller prints the result as seconds). Waits for
 ** the command queue to finish before querying.
 **------------------------------------------------------
 */
float eventTime(cl_event event, cl_command_queue command_queue) {
  cl_ulong startStamp, endStamp;
  cl_int rc = 0;

  // Make sure the command has completed before reading its counters.
  clFinish(command_queue);

  rc = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                               sizeof(cl_ulong), &startStamp, NULL);
  cl_errChk(rc, "ERROR in Event Profiling.", true);

  rc = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                               sizeof(cl_ulong), &endStamp, NULL);
  cl_errChk(rc, "ERROR in Event Profiling.", true);

  const cl_ulong elapsed = endStamp - startStamp;
  return (float)(elapsed / 1e9);
}
|
||||
|
||||
/*------------------------------------------------------
 ** parseCommandline() -- Parse the command line flags.
 **
 ** filename  receives the input file name; it is always
 **           set to "matrix4.txt" (non-flag arguments
 **           are ignored). Must hold at least 100 bytes.
 ** q         set to 1 when -q (quiet) is given
 ** t         set to 1 when -t (timing) is given
 ** p         receives the integer following -p
 ** d         receives the integer following -d
 **
 ** Returns 0 on success. Returns 1 when the usage text
 ** should be shown: -h was given, -p or -d is missing
 ** its value, or only one of platform/device was chosen.
 **------------------------------------------------------
 */
int parseCommandline(int argc, char *argv[], char *filename, int *q, int *t,
                     int *p, int *d) {
  strncpy(filename, "matrix4.txt", 100);

  for (int i = 1; i < argc; i++) {
    if (argv[i][0] != '-')
      continue; // not a flag

    switch (argv[i][1]) {
    case 'h': // help
      return 1;
    case 'q': // quiet
      *q = 1;
      break;
    case 't': // timing
      *t = 1;
      break;
    case 'p': // platform
      // A trailing "-p" has no value; reading argv[argc] would hand a
      // NULL pointer to atoi().
      if (++i >= argc)
        return 1;
      *p = atoi(argv[i]);
      break;
    case 'd': // device
      if (++i >= argc)
        return 1;
      *d = atoi(argv[i]);
      break;
    }
  }

  // both p and d must be specified if either are specified
  if ((*d >= 0) != (*p >= 0))
    return 1;
  return 0;
}
|
||||
|
||||
/*------------------------------------------------------
 ** printUsage() -- Write the command line help text to
 ** standard output.
 **------------------------------------------------------
 */
void printUsage() {
  // The help text, one entry per output chunk, emitted in order.
  static const char *const usageText[] = {
      "Gaussian Elimination Usage\n",
      "\n",
      "gaussianElimination [filename] [-hqt] [-p [int] -d [int]]\n",
      "\n",
      "example:\n",
      "$ ./gaussianElimination matrix4.txt\n",
      "\n",
      "filename the filename that holds the matrix data\n",
      "\n",
      "-h Display the help file\n",
      "-q Quiet mode. Suppress all text output.\n",
      "-t Print timing information.\n",
      "\n",
      "-p [int] Choose the platform (must choose both platform and "
      "device)\n",
      "-d [int] Choose the device (must choose both platform and "
      "device)\n",
      "\n",
      "\n",
      "Notes: 1. The filename is required as the first parameter.\n",
      " 2. If you declare either the device or the platform,\n",
      " you must declare both.\n\n",
  };

  const size_t chunkCount = sizeof(usageText) / sizeof(usageText[0]);
  for (size_t k = 0; k < chunkCount; ++k)
    fputs(usageText[k], stdout);
}
|
||||
|
||||
/*------------------------------------------------------
 ** InitPerRun() -- Zero the size x size multiplier
 ** matrix m before a run.
 **------------------------------------------------------
 */
void InitPerRun(int size, float *m) {
  int remaining = size * size;
  float *cursor = m;
  while (remaining-- > 0)
    *cursor++ = 0.0;
}
|
||||
/*------------------------------------------------------
 ** BackSub() -- Back substitution. Solves the upper
 ** triangular system a * finalVec = b, working from the
 ** last row up to the first. a is size x size, row-major.
 **------------------------------------------------------
 */
void BackSub(float *a, float *b, float *finalVec, int size) {
  for (int row = size - 1; row >= 0; --row) {
    float *coeffs = a + size * row;

    // Start from the right-hand side, then remove the contribution of
    // every unknown already solved (columns to the right of the diagonal).
    finalVec[row] = b[row];
    for (int col = size - 1; col > row; --col) {
      finalVec[row] -= coeffs[col] * finalVec[col];
    }

    finalVec[row] = finalVec[row] / coeffs[row];
  }
}
|
||||
/*------------------------------------------------------
 ** InitMat() -- Read nrow x ncol floats from fp into the
 ** matrix ary, whose rows are size elements apart.
 **------------------------------------------------------
 */
void InitMat(FILE *fp, int size, float *ary, int nrow, int ncol) {
  int row = 0;
  while (row < nrow) {
    int col = 0;
    while (col < ncol) {
      fscanf(fp, "%f", &ary[size * row + col]);
      ++col;
    }
    ++row;
  }
}
|
||||
/*------------------------------------------------------
 ** InitAry() -- Fill the array (vector) with ary_size
 ** floats read from the data file
 **------------------------------------------------------
 */
void InitAry(FILE *fp, float *ary, int ary_size) {
  float *slot = ary;
  for (int left = ary_size; left > 0; --left) {
    fscanf(fp, "%f", slot);
    ++slot;
  }
}
|
||||
/*------------------------------------------------------
 ** PrintMat() -- Print nrow x ncol elements of the
 ** matrix, one row per line, followed by a blank line.
 ** Rows of ary are size elements apart.
 **------------------------------------------------------
 */
void PrintMat(float *ary, int size, int nrow, int ncol) {
  for (int row = 0; row < nrow; ++row) {
    for (int col = 0; col < ncol; ++col)
      printf("%8.2f ", ary[size * row + col]);
    printf("\n");
  }
  printf("\n");
}
|
||||
|
||||
/*------------------------------------------------------
 ** PrintAry() -- Print the array (vector) on one line,
 ** followed by a blank line
 **------------------------------------------------------
 */
void PrintAry(float *ary, int ary_size) {
  const float *item = ary;
  for (int left = ary_size; left > 0; --left) {
    printf("%.2f ", *item);
    ++item;
  }
  printf("\n\n");
}
|
||||
#endif
|
||||
11
tests/opencl/guassian/matrix4.txt
Executable file
11
tests/opencl/guassian/matrix4.txt
Executable file
@@ -0,0 +1,11 @@
|
||||
4
|
||||
|
||||
-0.6 -0.5 0.7 0.3
|
||||
-0.3 -0.9 0.3 0.7
|
||||
-0.4 -0.5 -0.3 -0.8
|
||||
0.0 -0.1 0.2 0.9
|
||||
|
||||
-0.85 -0.68 0.24 -0.53
|
||||
|
||||
0.7 0.0 -0.4 -0.5
|
||||
|
||||
1
tests/opencl/guassian/run
Executable file
1
tests/opencl/guassian/run
Executable file
@@ -0,0 +1 @@
|
||||
./gaussian ../../data/gaussian/matrix4.txt
|
||||
204
tests/opencl/guassian/utils.cpp
Executable file
204
tests/opencl/guassian/utils.cpp
Executable file
@@ -0,0 +1,204 @@
|
||||
/****************************************************************************\
|
||||
* Copyright (c) 2011, Advanced Micro Devices, Inc. *
|
||||
* All rights reserved. *
|
||||
* *
|
||||
* Redistribution and use in source and binary forms, with or without *
|
||||
* modification, are permitted provided that the following conditions *
|
||||
* are met: *
|
||||
* *
|
||||
* Redistributions of source code must retain the above copyright notice, *
|
||||
* this list of conditions and the following disclaimer. *
|
||||
* *
|
||||
* Redistributions in binary form must reproduce the above copyright notice, *
|
||||
* this list of conditions and the following disclaimer in the documentation *
|
||||
* and/or other materials provided with the distribution. *
|
||||
* *
|
||||
* Neither the name of the copyright holder nor the names of its contributors *
|
||||
* may be used to endorse or promote products derived from this software *
|
||||
* without specific prior written permission. *
|
||||
* *
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
|
||||
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR *
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
|
||||
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
|
||||
* *
|
||||
* If you use the software (in whole or in part), you shall adhere to all *
|
||||
* applicable U.S., European, and other export laws, including but not *
|
||||
* limited to the U.S. Export Administration Regulations (EAR), (15 C.F.R. *
|
||||
* Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 *
|
||||
* of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you *
|
||||
* hereby certify that, except pursuant to a license granted by the United *
|
||||
* States Department of Commerce Bureau of Industry and Security or as *
|
||||
* otherwise permitted pursuant to a License Exception under the U.S. Export *
|
||||
* Administration Regulations ("EAR"), you will not (1) export, re-export or *
|
||||
* release to a national of a country in Country Groups D:1, E:1 or E:2 any *
|
||||
* restricted technology, software, or source code you receive hereunder, *
|
||||
* or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such *
|
||||
* technology or software, if such foreign produced direct product is subject *
|
||||
* to national security controls as identified on the Commerce Control List *
|
||||
*(currently found in Supplement 1 to Part 774 of EAR). For the most current *
|
||||
* Country Group listings, or for additional information about the EAR or *
|
||||
* your obligations under those regulations, please refer to the U.S. Bureau *
|
||||
* of Industry and Securitys website at http://www.bis.doc.gov/. *
|
||||
\****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
static bool usingImages = true;
|
||||
|
||||
//! malloc wrapper: returns the new block, or reports the failure with
//! perror() and exits with status -1 when malloc returns NULL.
void* alloc(size_t size) {

    void* block = malloc(size);
    if(block != NULL) {
        return block;
    }

    perror("malloc");
    exit(-1);
}
|
||||
|
||||
// Verify that `filename` names an existing regular file before it is opened.
//
// The path is examined with stat(), so a symbolic link is judged by the
// file it points to. Returns normally when the path is a regular file.
// Otherwise a diagnostic is printed to stdout and the process terminates
// via exit(-1).
void checkFile(char* filename)
{
    struct stat fileStatus;

    if (stat(filename, &fileStatus) != 0) {
        printf("Error opening file: %s\n", filename);
        exit(-1);
    }

    // S_ISREG compares the complete file-type field of st_mode. A plain
    // bitwise AND against S_IFREG is not equivalent: with the usual Unix
    // mode encoding other file types (e.g. sockets) share that bit and
    // would be accepted as regular files.
    if (!S_ISREG(fileStatus.st_mode)) {
        printf("File %s is not a regular file\n", filename);
        exit(-1);
    }
}
|
||||
|
||||
|
||||
// Verify that `dirpath` names an existing directory.
//
// The path is examined with stat(), so a symbolic link is judged by what
// it points to. Returns normally when the path is a directory. Otherwise a
// diagnostic is printed to stdout and the process terminates via exit(-1).
void checkDir(char* dirpath)
{
    struct stat fileStatus;

    if (stat(dirpath, &fileStatus) != 0) {
        printf("Directory does not exist: %s\n", dirpath);
        exit(-1);
    }

    // S_ISDIR compares the complete file-type field of st_mode. A plain
    // bitwise AND against S_IFDIR is not equivalent: with the usual Unix
    // mode encoding other file types (e.g. block devices, sockets) share
    // that bit and would be accepted as directories.
    if (!S_ISDIR(fileStatus.st_mode)) {
        printf("Directory was not provided: %s\n", dirpath);
        exit(-1);
    }
}
|
||||
|
||||
// Parse the command line arguments
|
||||
void parseArguments(int argc, char** argv, char** input, char** events,
|
||||
char** ipts, char* devicePref, bool* verifyResults)
|
||||
{
|
||||
|
||||
for(int i = 2; i < argc; i++) {
|
||||
if(strcmp(argv[i], "-d") == 0) { // Event dump found
|
||||
if(i == argc-1) {
|
||||
printf("Usage: -e Needs directory path\n");
|
||||
exit(-1);
|
||||
}
|
||||
devicePref[0] = argv[i+1][0];
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
if(strcmp(argv[i], "-e") == 0) { // Event dump found
|
||||
if(i == argc-1) {
|
||||
printf("Usage: -e Needs directory path\n");
|
||||
exit(-1);
|
||||
}
|
||||
*events = argv[i+1];
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
if(strcmp(argv[i], "-i") == 0) { // Input found
|
||||
if(i == argc-1) {
|
||||
printf("Usage: -i Needs directory path\n");
|
||||
exit(-1);
|
||||
}
|
||||
*input = argv[i+1];
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
if(strcmp(argv[i], "-l") == 0) { // Ipts dump found
|
||||
if(i == argc-1) {
|
||||
printf("Usage: -l Needs directory path\n");
|
||||
exit(-1);
|
||||
}
|
||||
*ipts = argv[i+1];
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
if(strcmp(argv[i], "-n") == 0) { // Don't use OpenCL images
|
||||
setUsingImages(false);
|
||||
continue;
|
||||
}
|
||||
if(strcmp(argv[i], "-v") == 0) { // Verify results
|
||||
*verifyResults = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Round `value` up to the next multiple of `multiple` (used for padding
// columns).
//
// Returns `value` unchanged when it is already a multiple of `multiple`,
// otherwise the smallest multiple of `multiple` greater than `value`.
// A `multiple` of 0 has no multiples to round to, so `value` is returned
// as-is instead of dividing by zero. No overflow check is made: a result
// that does not fit in unsigned int wraps around.
unsigned int roundUp(unsigned int value, unsigned int multiple) {
    // Guard the modulo below: value % 0 is undefined behaviour.
    if (multiple == 0) {
        return value;
    }

    const unsigned int remainder = value % multiple;

    // Pad by whatever is missing to reach the next multiple.
    if (remainder != 0) {
        value += (multiple - remainder);
    }

    return value;
}
|
||||
|
||||
|
||||
// Concatenate two strings and return a pointer to the new string
|
||||
char* smartStrcat(char* str1, char* str2)
|
||||
{
|
||||
char* newStr = NULL;
|
||||
|
||||
newStr = (char*)alloc((strlen(str1)+strlen(str2)+1)*sizeof(char));
|
||||
|
||||
strcpy(newStr, str1);
|
||||
strcat(newStr, str2);
|
||||
|
||||
return newStr;
|
||||
}
|
||||
|
||||
|
||||
// Set the value of using images to true if they are being
|
||||
// used, or false if they are not
|
||||
void setUsingImages(bool val)
|
||||
{
|
||||
usingImages = val;
|
||||
}
|
||||
|
||||
|
||||
// Return whether or not images are being used
|
||||
bool isUsingImages()
|
||||
{
|
||||
return usingImages;
|
||||
}
|
||||
84
tests/opencl/guassian/utils.h
Executable file
84
tests/opencl/guassian/utils.h
Executable file
@@ -0,0 +1,84 @@
|
||||
/****************************************************************************\
|
||||
* Copyright (c) 2011, Advanced Micro Devices, Inc. *
|
||||
* All rights reserved. *
|
||||
* *
|
||||
* Redistribution and use in source and binary forms, with or without *
|
||||
* modification, are permitted provided that the following conditions *
|
||||
* are met: *
|
||||
* *
|
||||
* Redistributions of source code must retain the above copyright notice, *
|
||||
* this list of conditions and the following disclaimer. *
|
||||
* *
|
||||
* Redistributions in binary form must reproduce the above copyright notice, *
|
||||
* this list of conditions and the following disclaimer in the documentation *
|
||||
* and/or other materials provided with the distribution. *
|
||||
* *
|
||||
* Neither the name of the copyright holder nor the names of its contributors *
|
||||
* may be used to endorse or promote products derived from this software *
|
||||
* without specific prior written permission. *
|
||||
* *
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
|
||||
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR *
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
|
||||
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
|
||||
* *
|
||||
* If you use the software (in whole or in part), you shall adhere to all *
|
||||
* applicable U.S., European, and other export laws, including but not *
|
||||
* limited to the U.S. Export Administration Regulations (EAR), (15 C.F.R. *
|
||||
* Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 *
|
||||
* of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you *
|
||||
* hereby certify that, except pursuant to a license granted by the United *
|
||||
* States Department of Commerce Bureau of Industry and Security or as *
|
||||
* otherwise permitted pursuant to a License Exception under the U.S. Export *
|
||||
* Administration Regulations ("EAR"), you will not (1) export, re-export or *
|
||||
* release to a national of a country in Country Groups D:1, E:1 or E:2 any *
|
||||
* restricted technology, software, or source code you receive hereunder, *
|
||||
* or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such *
|
||||
* technology or software, if such foreign produced direct product is subject *
|
||||
* to national security controls as identified on the Commerce Control List *
|
||||
*(currently found in Supplement 1 to Part 774 of EAR). For the most current *
|
||||
* Country Group listings, or for additional information about the EAR or *
|
||||
* your obligations under those regulations, please refer to the U.S. Bureau *
|
||||
* of Industry and Security's website at http://www.bis.doc.gov/. *
|
||||
\****************************************************************************/
|
||||
|
||||
#ifndef _UTILS_
#define _UTILS_

// size_t is used by alloc() below; including it here keeps the header
// usable regardless of what the including file pulled in beforehand.
#include <stddef.h>

// Wrapper for malloc
void* alloc(size_t size);

// Checks for existence of directory
void checkDir(char* dirpath);

// Check for existence of file
void checkFile(char* filename);

// Parse the input command line options to the program
void parseArguments(int argc, char** argv, char** input, char** events,
                    char** ipts, char* devicePref, bool* verifyResults);

// Print the program usage information
// NOTE(review): no definition of printUsage is visible next to the other
// utilities declared here -- confirm it is defined elsewhere.
void printUsage();

// Rounds up value to a multiple of multiple
unsigned int roundUp(unsigned int value, unsigned int multiple);

// Concatenate two strings, creating a new one
char* smartStrcat(char* str1, char* str2);

// Set the value of usingImages
void setUsingImages(bool val);

// Return whether or not images are being used
bool isUsingImages();

#endif
|
||||
2
tests/opencl/kmeans/.gitignore
vendored
Normal file
2
tests/opencl/kmeans/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
kmeans
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user