Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactoring

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

1
tests/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
**/*.log

View File

@@ -1,13 +1,15 @@
all: runtime regression opencl riscv unittest
all: kernel regression opencl riscv unittest
runtime:
$(MAKE) -C runtime
kernel:
$(MAKE) -C kernel
regression:
$(MAKE) -C regression
opencl:
ifneq ($(XLEN),64)
$(MAKE) -C opencl
endif
riscv:
$(MAKE) -C riscv
@@ -16,10 +18,17 @@ unittest:
$(MAKE) -C unittest
clean:
$(MAKE) clean -C runtime
$(MAKE) clean -C regression
$(MAKE) clean -C opencl
$(MAKE) clean -C riscv
$(MAKE) clean -C unittest
$(MAKE) -C kernel clean
$(MAKE) -C regression clean
$(MAKE) -C opencl clean
$(MAKE) -C riscv clean
$(MAKE) -C unittest clean
.PHONY: all runtime regression opencl riscv unittest
clean-all:
$(MAKE) -C kernel clean
$(MAKE) -C regression clean-all
$(MAKE) -C opencl clean-all
$(MAKE) -C riscv clean
$(MAKE) -C unittest clean
.PHONY: all kernel regression opencl riscv unittest

View File

@@ -1,20 +1,19 @@
all:
$(MAKE) -C conform
$(MAKE) -C hello
$(MAKE) -C fibonacci
$(MAKE) -C simple
$(MAKE) -C fibonacci
run-simx:
$(MAKE) -C conform run-simx
$(MAKE) -C hello run-simx
$(MAKE) -C fibonacci run-simx
$(MAKE) -C simple run-simx
$(MAKE) -C fibonacci run-simx
run-rtlsim:
$(MAKE) -C conform run-rtlsim
$(MAKE) -C hello run-rtlsim
$(MAKE) -C fibonacci run-rtlsim
$(MAKE) -C simple run-rtlsim
clean:
$(MAKE) -C conform clean
$(MAKE) -C hello clean
$(MAKE) -C fibonacci clean
$(MAKE) -C simple clean

View File

@@ -0,0 +1,52 @@
# Builds the bare-metal "conform" kernel test (ELF, raw binary and
# disassembly) and runs it on the rtlsim / simx simulators.
#
# User-tunable knobs (command line or environment):
#   XLEN                  target register width: 32 (default) or 64
#   RISCV_TOOLCHAIN_PATH  root of the RISC-V GNU toolchain
#   RISCV_PREFIX          toolchain triplet prefix
#   VORTEX_KN_PATH        kernel directory providing include/, linker/ and libvortexrt.a

# Delete a target whose recipe failed. Without this, a failed objdump leaves a
# truncated .dump behind (the shell redirect creates the file first) and the
# next run would consider it up to date.
.DELETE_ON_ERROR:

XLEN ?= 32

# Pick the toolchain location and ISA/ABI flags that match XLEN.
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif

RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf

VORTEX_KN_PATH ?= $(realpath ../../../kernel)

# Cross tools: compiler driver, archiver, disassembler (DP) and objcopy (CP).
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

SIM_DIR = ../../../sim

CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw

# Static link against the Vortex runtime archive using the XLEN-specific linker script.
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a

PROJECT = conform

SRCS = main.cpp tests.cpp

# These names are commands, not files; a stray file called "clean" or "all"
# must not stop them from running.
.PHONY: all run-rtlsim run-simx clean

all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump

# Full disassembly of the linked image.
$(PROJECT).dump: $(PROJECT).elf
	$(DP) -D $< > $@

# Raw binary image, the form the simulators are given.
$(PROJECT).bin: $(PROJECT).elf
	$(CP) -O binary $< $@

# Compile and link all sources in a single compiler invocation.
$(PROJECT).elf: $(SRCS)
	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $@

run-rtlsim: $(PROJECT).bin
	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin

run-simx: $(PROJECT).bin
	$(SIM_DIR)/simx/simx $(PROJECT).bin

# Header dependency listing, built only on request (`make .depend`);
# this Makefile does not include it.
.depend: $(SRCS)
	$(CC) $(CFLAGS) -MM $^ > $@;

clean:
	rm -rf *.elf *.bin *.dump .depend

View File

@@ -6,8 +6,6 @@ int main() {
errors += test_global_memory();
errors += test_stack_memory();
errors += test_shared_memory();
errors += test_tmc();
@@ -26,10 +24,12 @@ int main() {
errors += test_barrier();
errors += test_tls();
if (0 == errors) {
vx_printf("Passed!\n");
PRINTF("Passed!\n");
} else {
vx_printf("Failed!\n");
PRINTF("Failed!\n");
}
return errors;

View File

@@ -5,26 +5,26 @@
#include <vx_print.h>
#include <vx_spawn.h>
int __attribute__ ((noinline)) check_error(const int* buffer, int offset, int size) {
int __attribute__((noinline)) check_error(const int* buffer, int offset, int size) {
int errors = 0;
for (int i = offset; i < size; i++) {
int value = buffer[i];
int ref_value = 65 + i;
if (value == ref_value) {
//vx_printf("[%d] %c\n", i, value);
//PRINTF("[%d] %c\n", i, value);
} else {
vx_printf("*** error: [%d] %x, expected %x\n", i, value, ref_value);
PRINTF("*** error: [%d] 0x%x, expected 0x%x\n", i, value, ref_value);
++errors;
}
}
return errors;
}
int __attribute__ ((noinline)) make_select_tmask(int tid) {
int __attribute__((noinline)) make_select_tmask(int tid) {
return (1 << tid);
}
int __attribute__ ((noinline)) make_full_tmask(int num_threads) {
int __attribute__((noinline)) make_full_tmask(int num_threads) {
return (1 << num_threads) - 1;
}
@@ -34,7 +34,7 @@ int __attribute__ ((noinline)) make_full_tmask(int num_threads) {
int global_buffer[GLOBAL_MEM_SZ];
int test_global_memory() {
vx_printf("Global Memory Test\n");
PRINTF("Global Memory Test\n");
for (int i = 0; i < GLOBAL_MEM_SZ; i++) {
global_buffer[i] = 65 + i;
@@ -45,51 +45,52 @@ int test_global_memory() {
///////////////////////////////////////////////////////////////////////////////
int test_stack_memory() {
vx_printf("Stack Memory Test\n");
int* smem_addr = (int*)SMEM_BASE_ADDR;
static const int STACK_MEM_SZ = 8;
int stack_buffer[STACK_MEM_SZ];
int smem_buffer[8];
for (int i = 0; i < STACK_MEM_SZ; i++) {
stack_buffer[i] = 65 + i;
}
return check_error(stack_buffer, 0, STACK_MEM_SZ);
void __attribute__((noinline)) do_smem_wr() {
unsigned tid = vx_thread_id();
smem_addr[tid] = 65 + tid;
}
///////////////////////////////////////////////////////////////////////////////
void __attribute__((noinline)) do_smem_rd() {
unsigned tid = vx_thread_id();
smem_buffer[tid] = smem_addr[tid];
}
int test_shared_memory() {
static const int SHARED_MEM_SZ = 8;
int* shared_buffer = (int*)(SMEM_BASE_ADDR-(SMEM_SIZE-SHARED_MEM_SZ-4));
vx_printf("Shared Memory Test\n");
for (int i = 0; i < SHARED_MEM_SZ; i++) {
shared_buffer[i] = 65 + i;
}
PRINTF("Shared Memory Test\n");
return check_error(shared_buffer, 0, SHARED_MEM_SZ);
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_smem_wr();
do_smem_rd();
vx_tmc_one();
return check_error(smem_buffer, 0, num_threads);
}
///////////////////////////////////////////////////////////////////////////////
int tmc_buffer[8];
void __attribute__ ((noinline)) do_tmc() {
void __attribute__((noinline)) do_tmc() {
unsigned tid = vx_thread_id();
tmc_buffer[tid] = 65 + tid;
}
int test_tmc() {
vx_printf("TMC Test\n");
PRINTF("TMC Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_tmc();
vx_tmc(1);
vx_tmc_one();
return check_error(tmc_buffer, 0, num_threads);
}
@@ -98,28 +99,26 @@ int test_tmc() {
int pred_buffer[8];
void __attribute__ ((noinline)) do_pred() {
void __attribute__((noinline)) do_pred() {
unsigned tid = vx_thread_id();
pred_buffer[tid] = 65 + tid;
vx_pred((tid == 0), 1);
pred_buffer[tid] = 65;
}
int test_pred() {
vx_printf("PRED Test\n");
PRINTF("PRED Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
for (int i = 0; i < num_threads; i++) {
pred_buffer[i] = 0;
for (int i = 1; i < num_threads; i++) {
pred_buffer[i] = 65 + i;
}
vx_pred(~1);
vx_tmc(tmask);
do_pred();
vx_tmc(1);
vx_tmc_one();
int status_n0 = (0 == tmc_buffer[0]);
int status_n1 = check_error(tmc_buffer, 1, num_threads);
return status_n0 && status_n1;
return check_error(pred_buffer, 0, num_threads);
}
///////////////////////////////////////////////////////////////////////////////
@@ -133,7 +132,7 @@ void wspawn_kernel() {
}
int test_wsapwn() {
vx_printf("Wspawn Test\n");
PRINTF("Wspawn Test\n");
int num_warps = std::min(vx_num_warps(), 8);
vx_wspawn(num_warps, wspawn_kernel);
wspawn_kernel();
@@ -145,39 +144,52 @@ int test_wsapwn() {
int dvg_buffer[4];
void __attribute__ ((noinline)) do_divergence() {
unsigned tid = vx_thread_id();
__if (tid < 2) {
__if (tid < 1) {
dvg_buffer[tid] = 65;
void __attribute__((noinline)) __attribute__((optimize("O1"))) do_divergence() {
int tid = vx_thread_id();
int cond1 = tid < 2;
int sp1 = vx_split(cond1);
if (cond1) {
{
int cond2 = tid < 1;
int sp2 = vx_split(cond2);
if (cond2) {
dvg_buffer[tid] = 65; // A
} else {
dvg_buffer[tid] = 66; // B
}
vx_join(sp2);
}
__else {
dvg_buffer[tid] = 66;
{
int cond3 = tid < 0;
int sp3 = vx_split(cond3);
if (cond3) {
dvg_buffer[tid] = 67; // C
}
vx_join(sp3);
}
} else {
{
int cond2 = tid < 3;
int sp2 = vx_split(cond2);
if (cond2) {
dvg_buffer[tid] = 67; // C
} else {
dvg_buffer[tid] = 68; // D
}
vx_join(sp2);
}
__endif
}
__else {
__if (tid < 3) {
dvg_buffer[tid] = 67;
}
__else {
dvg_buffer[tid] = 68;
}
__endif
}
__endif
vx_join(sp1);
}
int test_divergence() {
vx_printf("Control Divergence Test\n");
PRINTF("Control Divergence Test\n");
int num_threads = std::min(vx_num_threads(), 4);
int tmask = make_full_tmask(num_threads);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_divergence();
vx_tmc(1);
vx_tmc_one();
return check_error(dvg_buffer, 0, num_threads);
}
@@ -193,12 +205,12 @@ typedef struct {
int st_buffer_src[ST_BUF_SZ];
int st_buffer_dst[ST_BUF_SZ];
void st_kernel(int task_id, const st_args_t * arg) {
void st_kernel(int task_id, const st_args_t * __UNIFORM__ arg) {
arg->dst[task_id] = arg->src[task_id];
}
int test_spawn_tasks() {
vx_printf("SpawnTasks Test\n");
PRINTF("SpawnTasks Test\n");
st_args_t arg;
arg.src = st_buffer_src;
@@ -215,31 +227,30 @@ int test_spawn_tasks() {
///////////////////////////////////////////////////////////////////////////////
#define SR_BUF_SZ 8
typedef struct {
int * buf;
} sr_args_t;
int sr_buffer[SR_BUF_SZ];
int sr_buffer[8];
void sr_kernel(const sr_args_t * arg) {
int tid = vx_thread_id();
arg->buf[tid] = 65 + tid;
}
void __attribute__ ((noinline)) do_serial() {
void __attribute__((noinline)) do_serial() {
sr_args_t arg;
arg.buf = sr_buffer;
vx_serial((vx_serial_cb)sr_kernel, &arg);
}
int test_serial() {
vx_printf("Serial Test\n");
PRINTF("Serial Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_serial();
vx_tmc(1);
vx_tmc_one();
return check_error(sr_buffer, 0, num_threads);
}
@@ -248,7 +259,7 @@ int test_serial() {
int tmask_buffer[8];
int __attribute__ ((noinline)) do_tmask() {
int __attribute__((noinline)) do_tmask() {
int tid = vx_thread_id();
int tmask = make_select_tmask(tid);
int cur_tmask = vx_thread_mask();
@@ -257,7 +268,7 @@ int __attribute__ ((noinline)) do_tmask() {
}
int test_tmask() {
vx_printf("Thread Mask Test\n");
PRINTF("Thread Mask Test\n");
// activate all thread to populate shared variables
vx_tmc(-1);
@@ -271,7 +282,7 @@ l_start:
tid = do_tmask();
if (tid < num_threads)
goto l_start;
vx_tmc(1);
vx_tmc_one();
return check_error(tmask_buffer, 0, num_threads);
}
@@ -293,11 +304,36 @@ void barrier_kernel() {
}
int test_barrier() {
vx_printf("Barrier Test\n");
PRINTF("Barrier Test\n");
int num_warps = std::min(vx_num_warps(), 8);
barrier_ctr = num_warps;
barrier_stall = 0;
vx_wspawn(num_warps, barrier_kernel);
barrier_kernel();
return check_error(barrier_buffer, 0, num_warps);
}
///////////////////////////////////////////////////////////////////////////////
int tls_buffer[8];
__thread int tls_var;
__attribute__((noinline)) void print_tls_var() {
unsigned wid = vx_warp_id();
tls_buffer[wid] = 65 + tls_var;
}
void tls_kernel() {
unsigned wid = vx_warp_id();
tls_var = wid;
print_tls_var();
vx_tmc(0 == wid);
}
int test_tls() {
PRINTF("TLS Test\n");
int num_warps = std::min(vx_num_warps(), 8);
vx_wspawn(num_warps, tls_kernel);
tls_kernel();
return check_error(tls_buffer, 0, num_warps);
}

View File

@@ -1,9 +1,9 @@
#ifndef TESTS
#define TESTS
int test_global_memory();
#define PRINTF vx_printf
int test_stack_memory();
int test_global_memory();
int test_shared_memory();
@@ -23,4 +23,6 @@ int test_tmask();
int test_barrier();
int test_tls();
#endif

View File

@@ -0,0 +1,52 @@
# Builds the bare-metal "fibonacci" kernel test (ELF, raw binary and
# disassembly) and runs it on the rtlsim / simx simulators.
#
# User-tunable knobs (command line or environment):
#   XLEN                  target register width: 32 (default) or 64
#   RISCV_TOOLCHAIN_PATH  root of the RISC-V GNU toolchain
#   RISCV_PREFIX          toolchain triplet prefix
#   VORTEX_KN_PATH        kernel directory providing include/, linker/ and libvortexrt.a

# Delete a target whose recipe failed. Without this, a failed objdump leaves a
# truncated .dump behind (the shell redirect creates the file first) and the
# next run would consider it up to date.
.DELETE_ON_ERROR:

XLEN ?= 32

# Pick the toolchain location and ISA/ABI flags that match XLEN.
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif

RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf

VORTEX_KN_PATH ?= $(realpath ../../../kernel)

# Cross tools: compiler driver, archiver, disassembler (DP) and objcopy (CP).
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

SIM_DIR = ../../../sim

CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw

# Static link against the Vortex runtime archive using the XLEN-specific linker script.
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a

PROJECT = fibonacci

SRCS = main.cpp

# These names are commands, not files; a stray file called "clean" or "all"
# must not stop them from running.
.PHONY: all run-rtlsim run-simx clean

all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump

# Full disassembly of the linked image.
$(PROJECT).dump: $(PROJECT).elf
	$(DP) -D $< > $@

# Raw binary image, the form the simulators are given.
$(PROJECT).bin: $(PROJECT).elf
	$(CP) -O binary $< $@

# Compile and link all sources in a single compiler invocation.
$(PROJECT).elf: $(SRCS)
	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $@

run-rtlsim: $(PROJECT).bin
	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin

run-simx: $(PROJECT).bin
	$(SIM_DIR)/simx/simx $(PROJECT).bin

# Header dependency listing, built only on request (`make .depend`);
# this Makefile does not include it.
.depend: $(SRCS)
	$(CC) $(CFLAGS) -MM $^ > $@;

clean:
	rm -rf *.elf *.bin *.dump .depend

View File

@@ -0,0 +1,52 @@
# Builds the bare-metal "hello" kernel test (ELF, raw binary and
# disassembly) and runs it on the rtlsim / simx simulators.
#
# User-tunable knobs (command line or environment):
#   XLEN                  target register width: 32 (default) or 64
#   RISCV_TOOLCHAIN_PATH  root of the RISC-V GNU toolchain
#   RISCV_PREFIX          toolchain triplet prefix
#   VORTEX_KN_PATH        kernel directory providing include/, linker/ and libvortexrt.a

# Delete a target whose recipe failed. Without this, a failed objdump leaves a
# truncated .dump behind (the shell redirect creates the file first) and the
# next run would consider it up to date.
.DELETE_ON_ERROR:

XLEN ?= 32

# Pick the toolchain location and ISA/ABI flags that match XLEN.
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif

RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf

VORTEX_KN_PATH ?= $(realpath ../../../kernel)

# Cross tools: compiler driver, archiver, disassembler (DP) and objcopy (CP).
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

SIM_DIR = ../../../sim

# The compiler-driver verbose flag (-v) is left out so the flags match the
# sibling kernel tests; pass CFLAGS=-v on the command line when it is needed.
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw

# Static link against the Vortex runtime archive using the XLEN-specific linker script.
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a

PROJECT = hello

SRCS = main.cpp

# These names are commands, not files; a stray file called "clean" or "all"
# must not stop them from running.
.PHONY: all run-rtlsim run-simx clean

all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump

# Full disassembly of the linked image.
$(PROJECT).dump: $(PROJECT).elf
	$(DP) -D $< > $@

# Raw binary image, the form the simulators are given.
$(PROJECT).bin: $(PROJECT).elf
	$(CP) -O binary $< $@

# Compile and link all sources in a single compiler invocation.
$(PROJECT).elf: $(SRCS)
	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $@

run-rtlsim: $(PROJECT).bin
	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin

run-simx: $(PROJECT).bin
	$(SIM_DIR)/simx/simx $(PROJECT).bin

# Header dependency listing, built only on request (`make .depend`);
# this Makefile does not include it.
.depend: $(SRCS)
	$(CC) $(CFLAGS) -MM $^ > $@;

clean:
	rm -rf *.elf *.bin *.dump .depend

View File

@@ -1,67 +0,0 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT=BlackScholes
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: BlackScholes.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc oclBlackScholes_common.h oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex

View File

@@ -1,67 +0,0 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT=DotProduct
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: DotProduct.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex

View File

@@ -1,59 +1,126 @@
all:
$(MAKE) -C vecadd
$(MAKE) -C sgemm
$(MAKE) -C psort
$(MAKE) -C saxpy
$(MAKE) -C sfilter
$(MAKE) -C nearn
$(MAKE) -C guassian
$(MAKE) -C dotproduct
$(MAKE) -C kmeans
$(MAKE) -C spmv
$(MAKE) -C transpose
$(MAKE) -C cutcp
$(MAKE) -C vectorhypot
$(MAKE) -C stencil
$(MAKE) -C mri-q
$(MAKE) -C lbm
$(MAKE) -C oclprintf
$(MAKE) -C psort
$(MAKE) -C blackscholes
$(MAKE) -C matmul
run-simx:
$(MAKE) -C vecadd run-simx
$(MAKE) -C vecadd run-simx
$(MAKE) -C sgemm run-simx
$(MAKE) -C psort run-simx
$(MAKE) -C saxpy run-simx
$(MAKE) -C sfilter run-simx
$(MAKE) -C nearn run-simx
$(MAKE) -C guassian run-simx
$(MAKE) -C dotproduct run-simx
$(MAKE) -C kmeans run-simx
$(MAKE) -C spmv run-simx
$(MAKE) -C cutcp run-simx
$(MAKE) -C stencil run-simx
$(MAKE) -C lbm run-simx
$(MAKE) -C oclprintf run-simx
$(MAKE) -C psort run-simx
$(MAKE) -C blackscholes run-simx
$(MAKE) -C matmul run-simx
$(MAKE) -C transpose run-simx
# $(MAKE) -C vectorhypot run-simx
# $(MAKE) -C mri-q run-simx
run-rtlsim:
$(MAKE) -C vecadd run-rtlsim
$(MAKE) -C vecadd run-rtlsim
$(MAKE) -C sgemm run-rtlsim
$(MAKE) -C psort run-rtlsim
$(MAKE) -C saxpy run-rtlsim
$(MAKE) -C sfilter run-rtlsim
$(MAKE) -C nearn run-rtlsim
$(MAKE) -C guassian run-rtlsim
$(MAKE) -C dotproduct run-rtlsim
$(MAKE) -C kmeans run-rtlsim
$(MAKE) -C spmv run-rtlsim
$(MAKE) -C transpose run-rtlsim
$(MAKE) -C cutcp run-rtlsim
$(MAKE) -C stencil run-rtlsim
$(MAKE) -C lbm run-rtlsim
$(MAKE) -C oclprintf run-rtlsim
$(MAKE) -C psort run-rtlsim
$(MAKE) -C blackscholes run-rtlsim
$(MAKE) -C matmul run-rtlsim
# $(MAKE) -C vectorhypot run-rtlsim
# $(MAKE) -C mri-q run-rtlsim
run-vlsim:
$(MAKE) -C vecadd run-vlsim
$(MAKE) -C sgemm run-vlsim
$(MAKE) -C saxpy run-vlsim
$(MAKE) -C sfilter run-vlsim
$(MAKE) -C nearn run-vlsim
$(MAKE) -C guassian run-vlsim
$(MAKE) -C oclprintf run-vlsim
$(MAKE) -C psort run-vlsim
run-opae:
$(MAKE) -C vecadd run-opae
$(MAKE) -C sgemm run-opae
$(MAKE) -C psort run-opae
$(MAKE) -C saxpy run-opae
$(MAKE) -C sfilter run-opae
$(MAKE) -C nearn run-opae
$(MAKE) -C guassian run-opae
$(MAKE) -C dotproduct run-opae
$(MAKE) -C kmeans run-opae
$(MAKE) -C spmv run-opae
$(MAKE) -C transpose run-opae
$(MAKE) -C cutcp run-opae
$(MAKE) -C stencil run-opae
$(MAKE) -C lbm run-opae
$(MAKE) -C oclprintf run-opae
$(MAKE) -C blackscholes run-opae
$(MAKE) -C matmul run-opae
# $(MAKE) -C vectorhypot run-opae
# $(MAKE) -C mri-q run-opae
clean:
$(MAKE) -C vecadd clean
$(MAKE) -C sgemm clean
$(MAKE) -C psort clean
$(MAKE) -C saxpy clean
$(MAKE) -C sfilter clean
$(MAKE) -C nearn clean
$(MAKE) -C guassian clean
$(MAKE) -C dotproduct clean
$(MAKE) -C kmeans clean
$(MAKE) -C spmv clean
$(MAKE) -C transpose clean
$(MAKE) -C cutcp clean
$(MAKE) -C vectorhypot clean
$(MAKE) -C stencil clean
$(MAKE) -C mri-q clean
$(MAKE) -C lbm clean
$(MAKE) -C oclprintf clean
$(MAKE) -C psort clean
$(MAKE) -C blackscholes clean
$(MAKE) -C matmul clean
clean-all:
$(MAKE) -C vecadd clean-all
$(MAKE) -C sgemm clean-all
$(MAKE) -C psort clean-all
$(MAKE) -C saxpy clean-all
$(MAKE) -C sfilter clean-all
$(MAKE) -C sfilter clean-all
$(MAKE) -C nearn clean-all
$(MAKE) -C guassian clean-all
$(MAKE) -C dotproduct clean-all
$(MAKE) -C kmeans clean-all
$(MAKE) -C spmv clean-all
$(MAKE) -C transpose clean-all
$(MAKE) -C cutcp clean-all
$(MAKE) -C vectorhypot clean-all
$(MAKE) -C stencil clean-all
$(MAKE) -C mri-q clean-all
$(MAKE) -C lbm clean-all
$(MAKE) -C oclprintf clean-all
$(MAKE) -C psort clean-all
$(MAKE) -C blackscholes clean-all
$(MAKE) -C matmul clean-all

View File

@@ -1,67 +0,0 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT=VectorHypot
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: VectorHypot.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,67 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 -Wstack-usage=1024 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = bfs
SRCS = main.cc
all: $(PROJECT) kernel.pocl
OPTS ?=
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -285,7 +285,7 @@ int main(int argc, char *argv[]) {
free(h_graph_visited);
} catch (std::string msg) {
std::cout << "--cambine: exception in main ->" << msg << std::endl;
// msg is a std::string; %s needs a NUL-terminated char*, so pass c_str().
printf("--cambine: exception in main ->%s\n", msg.c_str());
// release host memory
free(h_graph_nodes);
free(h_graph_mask);

View File

@@ -0,0 +1,9 @@
# Per-project settings for the blackscholes OpenCL test. This file defines no
# rules of its own; everything else is pulled in from ../common.mk below.
PROJECT = blackscholes
# Source files that make up the project.
SRCS = main.cpp oclUtils.cpp shrUtils.cpp cmd_arg_reader.cpp oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp
# Let the sources find headers that live in this directory.
CXXFLAGS += -I.
# Empty unless the caller supplies a value.
# NOTE(review): presumably consumed by ../common.mk as program arguments - confirm.
OPTS ?=
include ../common.mk

View File

@@ -0,0 +1,152 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
// includes, file
#include "cmd_arg_reader.h"
// includes, system
#include <vector>
// internal unnamed namespace
namespace
{
// types, internal (class, enum, struct, union, typedef)
// variables, internal
} // namespace {
// variables, exported
// Definitions of the class's static data members. They have static storage
// duration and no initializer, so they start out zero/null; init() relies on
// that when it tests `self` against NULL.
// self  : the single reader instance created by init()
/*static*/ CmdArgReader* CmdArgReader::self;
// rargv : the raw argument vector handed to init()
/*static*/ char** CmdArgReader::rargv;
// rargc : the raw argument count handed to init()
/*static*/ int CmdArgReader::rargc;
// functions, exported
////////////////////////////////////////////////////////////////////////////////
//! Public construction interface
//! @return a handle to the class instance
//! @param argc number of command line arguments (as given to main())
//! @param argv command line argument string (as given to main())
////////////////////////////////////////////////////////////////////////////////
/*static*/ void
CmdArgReader::init( const int argc, const char** argv)
{
if ( NULL != self)
{
return;
}
// command line arguments
if (( 0 == argc) || ( 0 == argv))
{
LOGIC_EXCEPTION( "No command line arguments given.");
}
self = new CmdArgReader();
self->createArgsMaps( argc, argv);
rargc = argc;
rargv = const_cast<char**>( argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Default constructor: value-initializes both argument maps and the two
//! iterators kept as members.
////////////////////////////////////////////////////////////////////////////////
CmdArgReader::CmdArgReader()
    : args(), unprocessed(), iter(), iter_unprocessed()
{
    // nothing else to set up
}
////////////////////////////////////////////////////////////////////////////////
//! Destructor
////////////////////////////////////////////////////////////////////////////////
CmdArgReader::~CmdArgReader()
{
    // Every converted value was allocated with 'new T' by getArgHelper() and is
    // stored as void* together with the type_info of T. It therefore has to be
    // cast back to its real type before it can be deleted.
    //
    // The loop must visit ALL entries: leaving it after the first delete (as an
    // early 'break' would do) leaks every other converted argument.
    for( iter = args.begin(); iter != args.end(); ++iter)
    {
        const std::type_info& stored_type = *(iter->second.first);
        void* const value = iter->second.second;

        if( stored_type == typeid( int))
        {
            delete static_cast<int*>( value);
        }
        else if( stored_type == typeid( bool))
        {
            delete static_cast<bool*>( value);
        }
        else if( stored_type == typeid( float))
        {
            delete static_cast<float*>( value);
        }
        else if( stored_type == typeid( double))
        {
            delete static_cast<double*>( value);
        }
        else if( stored_type == typeid( std::string))
        {
            delete static_cast<std::string*>( value);
        }
        else if( stored_type == typeid( std::vector< std::string>) )
        {
            delete static_cast< std::vector< std::string>* >( value);
        }
        else if( stored_type == typeid( std::vector<int>) )
        {
            delete static_cast< std::vector<int>* >( value);
        }
        else if( stored_type == typeid( std::vector<float>) )
        {
            delete static_cast< std::vector<float>* >( value);
        }
        else if( stored_type == typeid( std::vector<double>) )
        {
            delete static_cast< std::vector<double>* >( value);
        }
        // Values of any other type cannot be released here because their
        // static type is unknown at this point.

        // do not keep a dangling pointer in the map
        iter->second.second = NULL;
    }
}
////////////////////////////////////////////////////////////////////////////////
//! Read args as token value pair into map for better processing (Even the
//! values remain strings until the parameter values is requested by the
//! program.)
//! @param argc the argument count (as given to 'main')
//! @param argv the char* array containing the command line arguments
////////////////////////////////////////////////////////////////////////////////
void
CmdArgReader::createArgsMaps( const int argc, const char** argv) {
    // Index 0 is skipped (for the arguments given to main() it holds the
    // program name); every following entry must be "-token", "--token",
    // "-token=value" or "--token=value".
    for( int i = 1; i < argc; ++i)
    {
        const std::string arg( argv[i]);

        // check if valid command line argument: all arguments begin with - or --
        if ( arg.empty() || arg[0] != '-')
        {
            RUNTIME_EXCEPTION("Invalid command line argument.");
        }

        // number of leading dashes that are not part of the token
        const std::string::size_type numDashes =
            ( arg.length() > 1 && arg[1] == '-') ? 2 : 1;

        // check if only flag or if a value is given
        const std::string::size_type pos = arg.find( '=');
        if ( pos == std::string::npos)
        {
            // token without value: stored with the marker string "FLAG"
            unprocessed[ arg.substr( numDashes)] = "FLAG";
        }
        else
        {
            // token is the text between the dashes and '=', the value is
            // everything after '=' (kept as string until it is requested)
            unprocessed[ arg.substr( numDashes, pos - numDashes)] =
                arg.substr( pos + 1);
        }
    }
}

View File

@@ -0,0 +1,488 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _CMDARGREADER_H_
#define _CMDARGREADER_H_
// includes, system
#include <map>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <typeinfo>
// includes, project
#include "exception.h"
//! Preprocessed command line arguments
//! @note Lazy evaluation: The arguments are converted from strings to
//! the correct data type upon request. Converted values are stored
//! in an additional map so that no additional conversion is
//! necessary. Arrays of command line arguments are stored in
//! std::vectors
//! @note Usage:
//! const std::string* file =
//! CmdArgReader::getArg< std::string>( "model")
//! const std::vector< std::string>* files =
//! CmdArgReader::getArg< std::vector< std::string> >( "model")
//! @note All command line arguments begin with '--' followed by the token;
//! token and value are separated by '='; example --samples=50
//! @note Arrays have the form --model=[one.obj,two.obj,three.obj]
//! (without whitespaces)
//! Command line argument parser
//! Holds the parsed command line in a single shared instance ('self') that is
//! created by init(); all public accessors are static and forward to it.
class CmdArgReader
{
template<class> friend class TestCmdArgReader;
protected:
//! self handle to the only instance of this class (NULL until init() ran)
static CmdArgReader* self;
public:
//! Public construction interface
//! Creates the instance on the first call; later calls return without
//! changing anything. Nothing is returned, the instance is kept in 'self'.
//! @param argc number of command line arguments (as given to main())
//! @param argv command line argument string (as given to main())
static void init( const int argc, const char** argv);
public:
//! Get the value of the command line argument with given name
//! @return A const handle to the requested argument.
//! If the argument does not exist or if it
//! is not of type T, NULL is returned
//! @param name the name of the requested argument
//! @note T the type of the argument requested
//! @note Raises a runtime exception (RUNTIME_EXCEPTION) if init() has not
//! been called before
template<class T>
static inline const T* getArg( const std::string& name);
//! Check if a command line argument with the given name exists
//! @return true if a command line argument with name \a name exists,
//! otherwise false
//! @param name name of the command line argument in question
//! @note Raises a runtime exception (RUNTIME_EXCEPTION) if init() has not
//! been called before
static inline bool existArg( const std::string& name);
//! Get the original / raw argc program argument (as a modifiable reference)
static inline int& getRArgc();
//! Get the original / raw argv program argument (as a modifiable reference)
static inline char**& getRArgv();
public:
//! Destructor
~CmdArgReader();
protected:
//! Constructor, default (not public: instances are created by init())
CmdArgReader();
private:
// private helper functions
//! Get the value of the command line argument with given name
//! @note Private helper function for 'getArg' to work on the members
//! @return A const handle to the requested argument. If the argument
//! does not exist or if it is not of type T, a NULL pointer
//! is returned.
//! @param name the name of the requested argument
//! @note T the type of the argument requested
template<class T>
inline const T* getArgHelper( const std::string& name);
//! Check if a command line argument with name \a name exists
//! @return true if a command line argument of name \a name exists,
//! otherwise false
//! @param name the name of the requested argument
inline bool existArgHelper( const std::string& name) const;
//! Read args as token value pairs into a map for better processing
//! (The values remain strings until the parameter value is
//! requested by the program.)
//! @param argc the argument count (as given to 'main')
//! @param argv the char* array containing the command line arguments
void createArgsMaps( const int argc, const char** argv);
//! Helper for "casting" the strings from the map with the unprocessed
//! values to the correct data type.
//! @return true if conversion succeeded, otherwise false
//! @param element the value as string
//! @param val the value as type T
template<class T>
static inline bool convertToT( const std::string& element, T& val);
public:
// typedefs internal
//! container for a processed command line argument:
//! the type_info of the converted type plus an untyped pointer to the value.
//! typeid is used to easily be able to decide if a re-requested token-value
//! pair matches the type of the first conversion
typedef std::pair< const std::type_info*, void*> ValType;
//! map of already converted values
typedef std::map< std::string, ValType > ArgsMap;
//! iterator for the map of already converted values
typedef ArgsMap::iterator ArgsMapIter;
//! const iterator for the map of already converted values
typedef ArgsMap::const_iterator ConstArgsMapIter;
//! map of unprocessed (means unconverted) token-value pairs
typedef std::map< std::string, std::string> UnpMap;
//! iterator for the map of unprocessed (means unconverted) token-value pairs
typedef std::map< std::string, std::string>::iterator UnpMapIter;
private:
#ifdef _WIN32
# pragma warning( disable: 4251)
#endif
//! rargc original value of argc
static int rargc;
//! rargv contains command line arguments in raw format
static char** rargv;
//! args Map containing the already converted token-value pairs
ArgsMap args;
//! unprocessed Map containing the unprocessed / unconverted token-value pairs
UnpMap unprocessed;
//! iter Iterator for the map with the already converted token-value
//! pairs (kept as a member so it is reused between calls)
ArgsMapIter iter;
//! iter_unprocessed Iterator for the map with the unconverted token-value
//! pairs (kept as a member so it is reused between calls)
UnpMapIter iter_unprocessed;
#ifdef _WIN32
# pragma warning( default: 4251)
#endif
private:
//! Constructor, copy (declared private and not implemented: no copying)
CmdArgReader( const CmdArgReader&);
//! Assignment operator (declared private and not implemented: no copying)
CmdArgReader& operator=( const CmdArgReader&);
};
// variables, exported (extern)
// functions, inlined (inline)
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line argument arrays
//! @note This function is used for each type for which no template specialization
//! exist (which will cause errors if the type does not fulfill the std::vector
//! interface).
////////////////////////////////////////////////////////////////////////////////
template<class T>
/*static*/ inline bool
CmdArgReader::convertToT( const std::string& element, T& val)
{
    // An array argument has the form "[a,b,c]". Anything that is not enclosed
    // in brackets cannot be an array; this also rejects the empty string,
    // which would otherwise make the std::string constructor below throw
    // std::out_of_range (start position 1 behind the end of the string).
    if (    element.length() < 2
         || element[0] != '['
         || element[element.length() - 1] != ']')
    {
        return false;
    }

    // preallocate storage: one slot per comma separated entry
    val.resize( std::count( element.begin(), element.end(), ',') + 1);

    unsigned int i = 0;
    std::string::size_type pos_start = 1;   // leave array prefix '['
    std::string::size_type pos_end = 0;

    // do for all elements of the comma separated list that are followed by ','
    while( std::string::npos != ( pos_end = element.find( ',', pos_end + 1)) )
    {
        // convert each element by the appropriate function
        if ( ! convertToT< typename T::value_type >(
                 std::string( element, pos_start, pos_end - pos_start), val[i]))
        {
            return false;
        }
        pos_start = pos_end + 1;
        ++i;
    }

    // process last element (leave array postfix ']'); the overall result is
    // the result of this final conversion
    return convertToT< typename T::value_type >(
               std::string( element, pos_start, element.length() - pos_start - 1),
               val[i]);
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type int
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<int>( const std::string& element, int& val)
{
    std::istringstream ios( element);
    ios >> val;
    // The conversion succeeded only if the extraction itself worked AND it
    // consumed the complete string. Testing eof() alone is not sufficient:
    // extracting from an empty string fails but still sets eofbit.
    return ! ios.fail() && ios.eof();
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type float
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<float>( const std::string& element, float& val)
{
    std::istringstream ios( element);
    ios >> val;
    // The conversion succeeded only if the extraction itself worked AND it
    // consumed the complete string. Testing eof() alone is not sufficient:
    // extracting from an empty string fails but still sets eofbit.
    return ! ios.fail() && ios.eof();
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type double
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<double>( const std::string& element, double& val)
{
    std::istringstream ios( element);
    ios >> val;
    // The conversion succeeded only if the extraction itself worked AND it
    // consumed the complete string. Testing eof() alone is not sufficient:
    // extracting from an empty string fails but still sets eofbit.
    return ! ios.fail() && ios.eof();
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type string
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<std::string>( const std::string& element,
                                       std::string& val)
{
    // A string argument needs no conversion: hand back a copy of the text.
    // This can never fail.
    val.assign( element);
    return true;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type bool
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<bool>( const std::string& element, bool& val)
{
    // textual form: { true | false }
    if ( element == "true")
    {
        val = true;
        return true;
    }
    if ( element == "false")
    {
        val = false;
        return true;
    }

    // numeric form: only the integers { 0 | 1 } are accepted
    int number;
    if ( ! convertToT<int>( element, number))
    {
        return false;
    }
    if ( number != 0 && number != 1)
    {
        return false;
    }

    val = ( number == 1);
    return true;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the value of the command line argument with given name
//! @return A const handle to the requested argument. If the argument does
//! not exist or if it is not from type T NULL is returned
//! @param T the type of the argument requested
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
template<class T>
/*static*/ const T*
CmdArgReader::getArg( const std::string& name)
{
    // normal case: forward the request to the instance created by init()
    if ( NULL != self)
    {
        return self->getArgHelper<T>( name);
    }

    // init() has not been called yet; the macro throws
    RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
    return NULL;
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a command line argument with the given name exists
//! @return true if a command line argument with name \a name exists,
//! otherwise false
//! @param name name of the command line argument in question
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline bool
CmdArgReader::existArg( const std::string& name)
{
    if( ! self)
    {
        // init() has not been called yet; the macro throws. The message names
        // this function (it used to report 'getArg()', which pointed to the
        // wrong call site).
        RUNTIME_EXCEPTION("CmdArgReader::existArg(): CmdArgReader not initialized.");
        return false;
    }
    // forward the request to the instance created by init()
    return self->existArgHelper( name);
}
////////////////////////////////////////////////////////////////////////////////
//! @brief Get the value of the command line argument with given name
//! @return A const handle to the requested argument. If the argument does
//! not exist or if it is not from type T NULL is returned
//! @param T the type of the argument requested
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
template<class T>
const T*
CmdArgReader::getArgHelper( const std::string& name)
{
// check if argument already processed and stored in correct type
if ( args.end() != (iter = args.find( name)))
{
if ( (*(iter->second.first)) == typeid( T) )
{
return (T*) iter->second.second;
}
}
else
{
T* tmp = new T;
// check the array with unprocessed values
if ( unprocessed.end() != (iter_unprocessed = unprocessed.find( name)))
{
// try to "cast" the string to the type requested
if ( convertToT< T >( iter_unprocessed->second, *tmp))
{
// add the token element pair to map of already converted values
args[name] = std::make_pair( &(typeid( T)), (void*) tmp);
return tmp;
}
}
// not used while not inserted into the map -> cleanup
delete tmp;
}
// failed, argument not available
return NULL;
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a command line argument with name \a name exists
//! @return true if a command line argument of name \a name exists,
//! otherwise false
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
inline bool
CmdArgReader::existArgHelper( const std::string& name) const
{
    // The argument exists if it is known in either map: as an already
    // converted value or as a still unconverted string.
    const bool already_converted = ( args.count( name) != 0);
    return already_converted || ( unprocessed.count( name) != 0);
}
////////////////////////////////////////////////////////////////////////////////
//! Get the original / raw argc program argument
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline int&
CmdArgReader::getRArgc()
{
    // the raw argument count is only meaningful after init(); the macro throws
    if ( NULL == self)
    {
        RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
    }

    // handed out by reference, exactly as stored by init()
    return rargc;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the original / raw argv program argument
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline char**&
CmdArgReader::getRArgv()
{
    if( ! self)
    {
        // init() has not been called yet; the macro throws. The message names
        // this function (it used to report 'getRArgc()', which pointed to the
        // wrong call site).
        RUNTIME_EXCEPTION("CmdArgReader::getRArgv(): CmdArgReader not initialized.");
    }
    // handed out by reference, exactly as stored by init()
    return rargv;
}
// functions, exported (extern)
#endif // #ifndef _CMDARGREADER_H_

View File

@@ -0,0 +1,151 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _EXCEPTION_H_
#define _EXCEPTION_H_
// includes, system
#include <exception>
#include <stdexcept>
#include <iostream>
#include <stdlib.h>
//! Exception wrapper.
//! @param Std_Exception Exception out of namespace std for easy typing.
//! Thin wrapper that derives from the given standard exception type.
//! Instances cannot be created directly (all constructors are private);
//! they are created and thrown by the static throw_it() functions.
template<class Std_Exception>
class Exception : public Std_Exception
{
public:
//! @brief Static construction interface
//! @note Never returns normally: always throws an Exception<Std_Exception>
//! @param file file in which the Exception occurs
//! @param line line in which the Exception occurs
//! @param detailed details on the code fragment causing the Exception
static void throw_it( const char* file,
const int line,
const char* detailed = "-" );
//! Static construction interface
//! @note Never returns normally: always throws an Exception<Std_Exception>
//! @param file file in which the Exception occurs
//! @param line line in which the Exception occurs
//! @param detailed details on the code fragment causing the Exception
static void throw_it( const char* file,
const int line,
const std::string& detailed);
//! Destructor
virtual ~Exception() throw();
private:
//! Constructor, default (private)
Exception();
//! Constructor, standard
//! @param str string returned by what()
Exception( const std::string& str);
};
////////////////////////////////////////////////////////////////////////////////
//! Exception handler function for arbitrary exceptions
//! @param ex exception to handle
////////////////////////////////////////////////////////////////////////////////
template<class Exception_Typ>
inline void
handleException( const Exception_Typ& ex)
{
    // report the exception text on stderr (newline, then flush) ...
    std::cerr << ex.what() << '\n' << std::flush;
    // ... and terminate the program with a failure status
    exit( EXIT_FAILURE);
}
//! Convenience macros
//! Each macro forwards to Exception<...>::throw_it() and passes the file name
//! and line number of the place where the macro is used, followed by \a msg.
//! As throw_it() always throws, code after a macro invocation is not reached.
//! Exception caused by dynamic program behavior, e.g. file does not exist
#define RUNTIME_EXCEPTION( msg) \
Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
//! Logic exception in program, e.g. an assert failed
#define LOGIC_EXCEPTION( msg) \
Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
//! Out of range exception
#define RANGE_EXCEPTION( msg) \
Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
////////////////////////////////////////////////////////////////////////////////
//! Implementation
// includes, system
#include <sstream>
////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! @param Exception causing code fragment (file and line) and detailed infos.
////////////////////////////////////////////////////////////////////////////////
/*static*/ template<class Std_Exception>
void
Exception<Std_Exception>::
throw_it( const char* file, const int line, const char* detailed)
{
    std::stringstream s;
    // Quite heavy-weight but exceptions are not for
    // performance / release versions.
    // Streaming a null 'const char*' is undefined behavior, so null pointers
    // are replaced by placeholders ("-" is also the declared default text).
    s << "Exception in file '" << ( file ? file : "?")
      << "' in line " << line << "\n"
      << "Detailed description: " << ( detailed ? detailed : "-") << "\n";
    // the assembled text becomes the string returned by what()
    throw Exception( s.str());
}
////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! @param Exception causing code fragment (file and line) and detailed infos.
////////////////////////////////////////////////////////////////////////////////
/*static*/ template<class Std_Exception>
void
Exception<Std_Exception>::
throw_it( const char* file, const int line, const std::string& detailed)
{
    // Delegate to the C-string overload, which builds the message and throws.
    const char* const text = detailed.c_str();
    throw_it( file, line, text);
}
////////////////////////////////////////////////////////////////////////////////
//! Constructor, default (private).
////////////////////////////////////////////////////////////////////////////////
// Delegates to the string constructor below with a fixed placeholder text.
template<class Std_Exception>
Exception<Std_Exception>::Exception() :
Exception("Unknown Exception.\n")
{ }
////////////////////////////////////////////////////////////////////////////////
//! Constructor, standard (private).
//! String returned by what().
////////////////////////////////////////////////////////////////////////////////
// Passes the text on to the wrapped standard exception type.
template<class Std_Exception>
Exception<Std_Exception>::Exception( const std::string& s) :
Std_Exception( s)
{ }
////////////////////////////////////////////////////////////////////////////////
//! Destructor
////////////////////////////////////////////////////////////////////////////////
// Nothing to release; declared with an empty exception specification.
template<class Std_Exception>
Exception<Std_Exception>::~Exception() throw() { }
// functions, exported
#endif // #ifndef _EXCEPTION_H_

View File

@@ -61,7 +61,7 @@ int main(int argc, char **argv)
*h_X,
*h_T;
const unsigned int optionCount = 4000000;
const unsigned int optionCount = 64;
const float R = 0.02f;
const float V = 0.30f;
@@ -69,7 +69,7 @@ int main(int argc, char **argv)
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
//Get all the devices
@@ -78,10 +78,10 @@ int main(int argc, char **argv)
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
shrLog("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
// Get command line device options and config accordingly
shrLog(" # of Devices Available = %u\n", uiNumDevices);
@@ -92,7 +92,7 @@ int main(int argc, char **argv)
shrLog(" Using Device %u: ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
// set logfile name and start logs
@@ -120,31 +120,31 @@ int main(int argc, char **argv)
shrLog("Initializing OpenCL...\n");
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
// Get a GPU device
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
// Create the context
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Creating OpenCL memory objects...\n");
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Starting up BlackScholes...\n");
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
@@ -204,9 +204,9 @@ int main(int argc, char **argv)
shrLog("\nReading back OpenCL BlackScholes results...\n");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Comparing against Host/C++ computation...\n");
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
@@ -232,7 +232,7 @@ int main(int argc, char **argv)
ciErrNum |= clReleaseMemObject(d_Call);
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
ciErrNum |= clReleaseContext(cxGPUContext);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
free(h_T);
free(h_X);

View File

@@ -9,8 +9,6 @@
*
*/
#include <oclUtils.h>
#include "oclBlackScholes_common.h"
@@ -18,19 +16,47 @@ static cl_program cpBlackScholes; //OpenCL program
static cl_kernel ckBlackScholes; //OpenCL kernel
static cl_command_queue cqDefaultCommandQueue;
// Read a complete file into a newly malloc'ed buffer.
// @return CL_SUCCESS on success, otherwise CL_INVALID_VALUE
// @param filename path of the file to read
// @param data     receives the buffer (to be released with free() by the caller)
// @param size     receives the number of bytes stored in the buffer
// On failure neither *data nor *size is modified.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (NULL == filename || NULL == data || NULL == size)
    return CL_INVALID_VALUE;

  // The content is handled as raw bytes, so the file is opened in binary mode;
  // in text mode some platforms translate line endings, which would corrupt
  // the data and make the size obtained from ftell() unreliable.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return CL_INVALID_VALUE;
  }

  // Determine the file size; both calls can fail (ftell() then returns -1).
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);
  if (fsize < 0) {
    fclose(fp);
    return CL_INVALID_VALUE;
  }
  rewind(fp);

  // Request at least one byte: malloc(0) is allowed to return NULL.
  uint8_t* buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
  if (NULL == buffer) {
    fclose(fp);
    return CL_INVALID_VALUE;
  }

  const size_t nread = fread(buffer, 1, (size_t)fsize, fp);
  fclose(fp);

  // A short read means the buffer is only partially filled.
  if (nread != (size_t)fsize) {
    free(buffer);
    return CL_INVALID_VALUE;
  }

  *data = buffer;
  *size = nread;
  return CL_SUCCESS;
}
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
cl_int ciErrNum;
size_t kernelLength;
shrLog("...loading BlackScholes.cl\n");
/*shrLog("...loading BlackScholes.cl\n");
char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
shrCheckError(cPathAndName != NULL, shrTRUE);
char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
shrCheckError(cBlackScholes != NULL, shrTRUE);
shrCheckError(cBlackScholes != NULL, shrTRUE);*/
shrLog("...creating BlackScholes program\n");
//cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
cpBlackScholes = clCreateProgramWithBuiltInKernels(context, 1, &device_id, "BlackScholes", NULL);
//cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
shrCheckError(ciErrNum, CL_SUCCESS);
cl_device_id device_id = oclGetFirstDev(cxGPUContext);
cpBlackScholes = clCreateProgramWithBinary(
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
shrCheckError(ciErrNum, CL_SUCCESS);
shrLog("...building BlackScholes program\n");
@@ -66,7 +92,7 @@ extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqPar
shrLog("*** Exiting ***\n");
free(logTxt);
free(cdDevices);
exit(666);
exit(1);
}
//Save ptx code to separate file
@@ -77,8 +103,8 @@ extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqPar
shrCheckError(ciErrNum, CL_SUCCESS);
cqDefaultCommandQueue = cqParamCommandQueue;
free(cBlackScholes);
free(cPathAndName);
//free(cBlackScholes);
//free(cPathAndName);
}
extern "C" void closeBlackScholes(void){
@@ -118,8 +144,8 @@ extern "C" void BlackScholes(
shrCheckError(ciErrNum, CL_SUCCESS);
//Run the kernel
size_t globalWorkSize = 60 * 1024;
size_t localWorkSize = 128;
size_t globalWorkSize = 16;//60 * 1024;
size_t localWorkSize = 16;//128;
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
}

View File

@@ -0,0 +1,806 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
#include <fstream>
#include <vector>
#include <iostream>
#include <algorithm>
#include <stdarg.h>
#include "oclUtils.h"
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platform ID
//////////////////////////////////////////////////////////////////////////////
cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID)
{
    char chBuffer[1024];
    cl_uint num_platforms;
    cl_platform_id* clPlatformIDs;
    cl_int ciErrNum;
    *clSelectedPlatformID = NULL;

    // Get OpenCL platform count
    ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
        return -1000;
    }
    if (num_platforms == 0)
    {
        shrLog("No OpenCL platform found!\n\n");
        return -2000;
    }

    // if there's a platform or more, make space for ID's
    if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL)
    {
        shrLog("Failed to allocate memory for cl_platform ID's!\n\n");
        return -3000;
    }

    // Fetch the IDs. The result has to be checked: if this call fails the
    // array stays uninitialized and must not be read by the code below.
    ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
        free(clPlatformIDs);
        return -1000;
    }

    // get platform info for each platform and trap the NVIDIA platform if found
    for(cl_uint i = 0; i < num_platforms; ++i)
    {
        ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, sizeof(chBuffer), chBuffer, NULL);
        if(ciErrNum == CL_SUCCESS && strstr(chBuffer, "NVIDIA") != NULL)
        {
            *clSelectedPlatformID = clPlatformIDs[i];
            break;
        }
    }

    // default to zeroeth platform if NVIDIA not found
    if(*clSelectedPlatformID == NULL)
    {
        shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n");
        *clSelectedPlatformID = clPlatformIDs[0];
    }

    free(clPlatformIDs);
    return CL_SUCCESS;
}
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
void oclPrintDevName(int iLogMode, cl_device_id device)
{
    // Start with an empty string: the result of the query is not checked, and
    // without the initializer a failed query would log uninitialized memory.
    char device_string[1024] = "";
    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL);
    // log the name without a trailing newline
    shrLogEx(iLogMode, 0, "%s", device_string);
}
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
void oclPrintDevInfo(int iLogMode, cl_device_id device)
{
char device_string[1024];
bool nv_device_attibute_query = false;
// CL_DEVICE_NAME
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_NAME: \t\t\t%s\n", device_string);
// CL_DEVICE_VENDOR
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_VENDOR: \t\t\t%s\n", device_string);
// CL_DRIVER_VERSION
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DRIVER_VERSION: \t\t\t%s\n", device_string);
// CL_DEVICE_VERSION
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_VERSION: \t\t\t%s\n", device_string);
// CL_DEVICE_OPENCL_C_VERSION (if CL_DEVICE_VERSION version > 1.0)
if(strncmp("OpenCL 1.0", device_string, 10) != 0)
{
// This code is unused for devices reporting OpenCL 1.0, but a def is needed anyway to allow compilation using v 1.0 headers
// This constant isn't #defined in 1.0
#ifndef CL_DEVICE_OPENCL_C_VERSION
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
#endif
clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_OPENCL_C_VERSION: \t\t%s\n", device_string);
}
// CL_DEVICE_TYPE
cl_device_type type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
if( type & CL_DEVICE_TYPE_CPU )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
if( type & CL_DEVICE_TYPE_GPU )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
if( type & CL_DEVICE_TYPE_ACCELERATOR )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
if( type & CL_DEVICE_TYPE_DEFAULT )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", compute_units);
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
size_t workitem_dims;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
// CL_DEVICE_MAX_WORK_ITEM_SIZES
size_t workitem_size[3];
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
// CL_DEVICE_MAX_WORK_GROUP_SIZE
size_t workgroup_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);
// CL_DEVICE_ADDRESS_BITS
cl_uint addr_bits;
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_ADDRESS_BITS:\t\t%u\n", addr_bits);
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
cl_ulong max_mem_alloc_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(max_mem_alloc_size / (1024 * 1024)));
// CL_DEVICE_GLOBAL_MEM_SIZE
cl_ulong mem_size;
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(mem_size / (1024 * 1024)));
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
cl_bool error_correction_support;
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(error_correction_support), &error_correction_support, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", error_correction_support == CL_TRUE ? "yes" : "no");
// CL_DEVICE_LOCAL_MEM_TYPE
cl_device_local_mem_type local_mem_type;
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(local_mem_type), &local_mem_type, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", local_mem_type == 1 ? "local" : "global");
// CL_DEVICE_LOCAL_MEM_SIZE
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(mem_size / 1024));
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(mem_size / 1024));
// CL_DEVICE_QUEUE_PROPERTIES
cl_command_queue_properties queue_properties;
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(queue_properties), &queue_properties, NULL);
if( queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
if( queue_properties & CL_QUEUE_PROFILING_ENABLE )
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
// CL_DEVICE_IMAGE_SUPPORT
cl_bool image_support;
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", image_support);
// CL_DEVICE_MAX_READ_IMAGE_ARGS
cl_uint max_read_image_args;
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(max_read_image_args), &max_read_image_args, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", max_read_image_args);
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
cl_uint max_write_image_args;
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(max_write_image_args), &max_write_image_args, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", max_write_image_args);
// CL_DEVICE_SINGLE_FP_CONFIG
cl_device_fp_config fp_config;
clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_SINGLE_FP_CONFIG:\t\t%s%s%s%s%s%s\n",
fp_config & CL_FP_DENORM ? "denorms " : "",
fp_config & CL_FP_INF_NAN ? "INF-quietNaNs " : "",
fp_config & CL_FP_ROUND_TO_NEAREST ? "round-to-nearest " : "",
fp_config & CL_FP_ROUND_TO_ZERO ? "round-to-zero " : "",
fp_config & CL_FP_ROUND_TO_INF ? "round-to-inf " : "",
fp_config & CL_FP_FMA ? "fma " : "");
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
size_t szMaxDims[5];
shrLogEx(iLogMode, 0, "\n CL_DEVICE_IMAGE <dim>");
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &szMaxDims[0], NULL);
shrLogEx(iLogMode, 0, "\t\t\t2D_MAX_WIDTH\t %u\n", szMaxDims[0]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[1], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", szMaxDims[1]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &szMaxDims[2], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_WIDTH\t %u\n", szMaxDims[2]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[3], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", szMaxDims[3]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &szMaxDims[4], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_DEPTH\t %u\n", szMaxDims[4]);
// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(device_string), &device_string, NULL);
if (device_string != 0)
{
shrLogEx(iLogMode, 0, "\n CL_DEVICE_EXTENSIONS:");
std::string stdDevString;
stdDevString = std::string(device_string);
size_t szOldPos = 0;
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
while (szSpacePos != stdDevString.npos)
{
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
nv_device_attibute_query = true;
if (szOldPos > 0)
{
shrLogEx(iLogMode, 0, "\t\t");
}
shrLogEx(iLogMode, 0, "\t\t\t%s\n", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str());
do {
szOldPos = szSpacePos + 1;
szSpacePos = stdDevString.find(' ', szOldPos);
} while (szSpacePos == szOldPos);
}
shrLogEx(iLogMode, 0, "\n");
}
else
{
shrLogEx(iLogMode, 0, " CL_DEVICE_EXTENSIONS: None\n");
}
if(nv_device_attibute_query)
{
cl_uint compute_capability_major, compute_capability_minor;
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &compute_capability_major, NULL);
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &compute_capability_minor, NULL);
shrLogEx(iLogMode, 0, "\n CL_DEVICE_COMPUTE_CAPABILITY_NV:\t%u.%u\n", compute_capability_major, compute_capability_minor);
shrLogEx(iLogMode, 0, " NUMBER OF MULTIPROCESSORS:\t\t%u\n", compute_units); // this is the same value reported by CL_DEVICE_MAX_COMPUTE_UNITS
shrLogEx(iLogMode, 0, " NUMBER OF CUDA CORES:\t\t\t%u\n", ConvertSMVer2Cores(compute_capability_major, compute_capability_minor) * compute_units);
cl_uint regs_per_block;
clGetDeviceInfo(device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(cl_uint), &regs_per_block, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_REGISTERS_PER_BLOCK_NV:\t%u\n", regs_per_block);
cl_uint warp_size;
clGetDeviceInfo(device, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_WARP_SIZE_NV:\t\t%u\n", warp_size);
cl_bool gpu_overlap;
clGetDeviceInfo(device, CL_DEVICE_GPU_OVERLAP_NV, sizeof(cl_bool), &gpu_overlap, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_GPU_OVERLAP_NV:\t\t%s\n", gpu_overlap == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
cl_bool exec_timeout;
clGetDeviceInfo(device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(cl_bool), &exec_timeout, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:\t%s\n", exec_timeout == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
cl_bool integrated_memory;
clGetDeviceInfo(device, CL_DEVICE_INTEGRATED_MEMORY_NV, sizeof(cl_bool), &integrated_memory, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_INTEGRATED_MEMORY_NV:\t%s\n", integrated_memory == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
}
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
shrLogEx(iLogMode, 0, " CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
cl_uint vec_width [6];
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &vec_width[0], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &vec_width[1], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &vec_width[2], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &vec_width[3], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &vec_width[4], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &vec_width[5], NULL);
shrLogEx(iLogMode, 0, "CHAR %u, SHORT %u, INT %u, LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
vec_width[0], vec_width[1], vec_width[2], vec_width[3], vec_width[4], vec_width[5]);
}
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! The compute capability is only queried when the device's extension list
//! contains cl_nv_device_attribute_query.
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device         OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
int oclGetDevCap(cl_device_id device)
{
    int iDevArch = -1;

    // Ask the device how large its extension string is instead of assuming it
    // fits a fixed buffer (an undersized buffer makes the query fail).
    size_t szExtBytes = 0;
    if (clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &szExtBytes) != CL_SUCCESS || szExtBytes == 0)
    {
        return iDevArch;
    }
    char* cDevString = (char*)malloc(szExtBytes);
    if (cDevString == NULL)
    {
        return iDevArch;
    }

    // Search the space-delimited extension list for cl_nv_device_attribute_query
    bool bDevAttributeQuery = false;
    if (clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, szExtBytes, cDevString, NULL) == CL_SUCCESS)
    {
        cDevString[szExtBytes - 1] = '\0';   // guarantee termination
        const std::string stdDevString(cDevString);
        size_t szTokenStart = 0;
        while (!bDevAttributeQuery && szTokenStart < stdDevString.size())
        {
            size_t szTokenEnd = stdDevString.find(' ', szTokenStart);
            if (szTokenEnd == std::string::npos)
            {
                // last token: the list is not required to end with a space
                szTokenEnd = stdDevString.size();
            }
            if (stdDevString.compare(szTokenStart, szTokenEnd - szTokenStart, "cl_nv_device_attribute_query") == 0)
            {
                bDevAttributeQuery = true;
            }
            szTokenStart = szTokenEnd + 1;
        }
    }
    free(cDevString);

    // if search succeeded, get device caps
    if(bDevAttributeQuery)
    {
        cl_uint uiComputeCapMajor = 0, uiComputeCapMinor = 0;
        if (clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &uiComputeCapMajor, NULL) == CL_SUCCESS &&
            clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &uiComputeCapMinor, NULL) == CL_SUCCESS)
        {
            iDevArch = (int)((10 * uiComputeCapMajor) + uiComputeCapMinor);
        }
    }

    return iDevArch;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id, or NULL when the context reports no devices or a query /
//!         allocation fails
//! @param cxGPUContext         OpenCL context
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetFirstDev(cl_context cxGPUContext)
{
    // get the size of the device list associated with the context
    size_t szParmDataBytes = 0;
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes) != CL_SUCCESS ||
        szParmDataBytes < sizeof(cl_device_id))
    {
        return NULL;
    }

    cl_device_id* cdDevices = (cl_device_id*) malloc(szParmDataBytes);
    if (cdDevices == NULL)
    {
        return NULL;
    }

    // fetch the list and keep only its first entry
    cl_device_id first = NULL;
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) == CL_SUCCESS)
    {
        first = cdDevices[0];
    }
    free(cdDevices);

    return first;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! "FLOPS" is estimated as CL_DEVICE_MAX_COMPUTE_UNITS multiplied by
//! CL_DEVICE_MAX_CLOCK_FREQUENCY. On a tie the device that appears first in
//! the context's device list wins.
//!
//! @return the id, or NULL when the context reports no devices or a query /
//!         allocation fails
//! @param cxGPUContext         OpenCL context
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext)
{
    // get the list of devices associated with context
    size_t szParmDataBytes = 0;
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes) != CL_SUCCESS ||
        szParmDataBytes < sizeof(cl_device_id))
    {
        return NULL;
    }
    cl_device_id* cdDevices = (cl_device_id*) malloc(szParmDataBytes);
    if (cdDevices == NULL)
    {
        return NULL;
    }
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) != CL_SUCCESS)
    {
        free(cdDevices);
        return NULL;
    }
    const size_t device_count = szParmDataBytes / sizeof(cl_device_id);

    cl_device_id max_flops_device = NULL;
    int max_flops = 0;
    for (size_t current_device = 0; current_device < device_count; ++current_device)
    {
        // CL_DEVICE_MAX_COMPUTE_UNITS
        cl_uint compute_units = 0;
        clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);

        // CL_DEVICE_MAX_CLOCK_FREQUENCY
        cl_uint clock_frequency = 0;
        clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);

        // The first device always seeds the maximum; later ones must be strictly better.
        const int flops = compute_units * clock_frequency;
        if (current_device == 0 || flops > max_flops)
        {
            max_flops = flops;
            max_flops_device = cdDevices[current_device];
        }
    }

    free(cdDevices);

    return max_flops_device;
}
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! The returned buffer is allocated with malloc, is NUL-terminated, and must
//! be released by the caller with free(). Note that an empty file is reported
//! as a failure, because the whole file is read as a single fread item.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename        program filename
//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
//!                         (NULL is treated as an empty preamble)
//! @param szFinalLength    returned length of the code string (may be NULL if not needed)
//////////////////////////////////////////////////////////////////////////////
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
    if (cFilename == NULL)
    {
        return NULL;
    }
    if (cPreamble == NULL)
    {
        cPreamble = "";
    }

    // open the OpenCL source code file
    FILE* pFileStream = NULL;
    #ifdef _WIN32   // Windows version
        if(fopen_s(&pFileStream, cFilename, "rb") != 0)
        {
            return NULL;
        }
    #else           // Linux version
        pFileStream = fopen(cFilename, "rb");
        if(pFileStream == NULL)
        {
            return NULL;
        }
    #endif

    const size_t szPreambleLength = strlen(cPreamble);

    // get the length of the source code; ftell reports failure as a negative
    // value, which must not be converted into a huge size_t
    long lSourceLength = -1;
    if (fseek(pFileStream, 0, SEEK_END) == 0)
    {
        lSourceLength = ftell(pFileStream);
    }
    if (lSourceLength < 0 || fseek(pFileStream, 0, SEEK_SET) != 0)
    {
        fclose(pFileStream);
        return NULL;
    }
    const size_t szSourceLength = (size_t)lSourceLength;

    // allocate a buffer for the source code string and read it in
    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
    if (cSourceString == NULL)
    {
        fclose(pFileStream);
        return NULL;
    }
    memcpy(cSourceString, cPreamble, szPreambleLength);
    if (fread(cSourceString + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
    {
        fclose(pFileStream);
        free(cSourceString);
        return NULL;
    }

    // close the file and return the total length of the combined (preamble + source) string
    fclose(pFileStream);
    if(szFinalLength != NULL)
    {
        *szFinalLength = szSourceLength + szPreambleLength;
    }
    cSourceString[szSourceLength + szPreambleLength] = '\0';

    return cSourceString;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range (also returned when the device
//!         list cannot be queried or allocated)
//! @param cxGPUContext         OpenCL context
//! @param nr                   index of the device of interest
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int nr)
{
    const cl_device_id cdInvalid = (cl_device_id)-1;

    // get the size of the device list associated with the context; a failed
    // query leaves the size at 0, which makes every index out of range
    size_t szParmDataBytes = 0;
    clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
    if( szParmDataBytes / sizeof(cl_device_id) <= nr ) {
        return cdInvalid;
    }

    cl_device_id* cdDevices = (cl_device_id*) malloc(szParmDataBytes);
    if (cdDevices == NULL) {
        return cdInvalid;
    }

    cl_device_id device = cdInvalid;
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) == CL_SUCCESS) {
        device = cdDevices[nr];
    }
    free(cdDevices);

    return device;
}
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! On success *binary points to a malloc'ed buffer that the caller must
//! free(), and *length holds its size in bytes. When the device is not
//! associated with the program, or a query / allocation fails, *binary is
//! set to NULL and *length to 0.
//!
//! @param cpProgram    OpenCL program
//! @param cdDevice     device of interest
//! @param binary       returned code
//! @param length       length of returned code
//////////////////////////////////////////////////////////////////////////////
void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length)
{
    *binary = NULL;
    *length = 0;

    // Grab the number of devices associated with the program
    cl_uint num_devices = 0;
    clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
    if (num_devices == 0)
    {
        return;
    }

    // calloc so that entries a failed query never fills read as 0 / NULL
    cl_device_id* devices = (cl_device_id*) calloc(num_devices, sizeof(cl_device_id));
    size_t* binary_sizes = (size_t*) calloc(num_devices, sizeof(size_t));
    char** ptx_code = (char**) calloc(num_devices, sizeof(char*));

    if (devices != NULL && binary_sizes != NULL && ptx_code != NULL)
    {
        // Grab the device ids and the sizes of the binaries
        clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, NULL);
        clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);

        // Now get the binaries. Entries left NULL are skipped by the OpenCL
        // implementation, so devices without a binary need no buffer.
        for( unsigned int i=0; i<num_devices; ++i) {
            if (binary_sizes[i] > 0) {
                ptx_code[i] = (char*)malloc(binary_sizes[i]);
            }
        }
        // param_value_size is the size of the pointer array, not 0
        cl_int ciErr = clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, num_devices * sizeof(char*), ptx_code, NULL);

        // Find the index of the device of interest
        unsigned int idx = 0;
        while( idx<num_devices && devices[idx] != cdDevice ) ++idx;

        // If it is associated prepare the result and hand ownership to the caller
        if( ciErr == CL_SUCCESS && idx < num_devices && ptx_code[idx] != NULL )
        {
            *binary = ptx_code[idx];
            *length = binary_sizes[idx];
            ptx_code[idx] = NULL;   // keep the cleanup loop from freeing the result
        }

        for( unsigned int i=0; i<num_devices; ++i) {
            free(ptx_code[i]);
        }
    }

    // Cleanup
    free( devices );
    free( binary_sizes );
    free( ptx_code );
}
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! Nothing is logged when the device is not associated with the program.
//!
//! @param cpProgram       OpenCL program
//! @param cdDevice        device of interest
//! @param cPtxFileName    optional PTX file name; when NULL the binary is
//!                        written to the log instead of a separate file
//////////////////////////////////////////////////////////////////////////////
void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName)
{
    // Grab the number of devices associated with the program
    cl_uint num_devices = 0;
    clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
    if (num_devices == 0)
    {
        return;
    }

    // calloc so that entries a failed query never fills read as 0 / NULL
    cl_device_id* devices = (cl_device_id*) calloc(num_devices, sizeof(cl_device_id));
    size_t* binary_sizes = (size_t*) calloc(num_devices, sizeof(size_t));
    char** ptx_code = (char**) calloc(num_devices, sizeof(char*));

    if (devices != NULL && binary_sizes != NULL && ptx_code != NULL)
    {
        // Grab the device ids and the sizes of the binaries
        clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, NULL);
        clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);

        // Now get the binaries. One extra zeroed byte per buffer keeps the
        // data NUL-terminated so it can be logged with "%s".
        for( unsigned int i=0; i<num_devices; ++i)
        {
            ptx_code[i] = (char*)calloc(binary_sizes[i] + 1, 1);
        }
        // param_value_size is the size of the pointer array, not 0
        cl_int ciErr = clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, num_devices * sizeof(char*), ptx_code, NULL);

        // Find the index of the device of interest
        unsigned int idx = 0;
        while((idx < num_devices) && (devices[idx] != cdDevice))
        {
            ++idx;
        }

        // If the index is associated, log the result
        if(ciErr == CL_SUCCESS && idx < num_devices && ptx_code[idx] != NULL)
        {
            // if a separate filename is supplied, dump ptx there
            if (NULL != cPtxFileName)
            {
                shrLog("\nWriting ptx to separate file: %s ...\n\n", cPtxFileName);
                FILE* pFileStream = NULL;
                #ifdef _WIN32
                    fopen_s(&pFileStream, cPtxFileName, "wb");
                #else
                    pFileStream = fopen(cPtxFileName, "wb");
                #endif

                if (pFileStream != NULL)
                {
                    fwrite(ptx_code[idx], binary_sizes[idx], 1, pFileStream);
                    fclose(pFileStream);
                }
                else
                {
                    shrLog("Unable to open %s for writing\n", cPtxFileName);
                }
            }
            else // log to logfile and console if no ptx file specified
            {
               shrLog("\n%s\nProgram Binary:\n%s\n%s\n", HDASHLINE, ptx_code[idx], HDASHLINE);
            }
        }

        for(unsigned int i = 0; i < num_devices; ++i)
        {
            free(ptx_code[i]);
        }
    }

    // Cleanup
    free(devices);
    free(binary_sizes);
    free( ptx_code );
}
//////////////////////////////////////////////////////////////////////////////
//! Get and log the build log from the OpenCL compiler for the requested program & device
//!
//! The log buffer is sized from the value the runtime reports, so long build
//! logs are not lost. If the log cannot be retrieved an empty log is written.
//!
//! @param cpProgram    OpenCL program
//! @param cdDevice     device of interest
//////////////////////////////////////////////////////////////////////////////
void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice)
{
    // ask for the size of the build log first (stays 0 if the query fails)
    size_t szLogBytes = 0;
    clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &szLogBytes);

    // one extra byte so the text is always NUL-terminated
    char* cBuildLog = (char*)malloc(szLogBytes + 1);
    if (cBuildLog == NULL)
    {
        return;
    }
    cBuildLog[0] = '\0';
    if (szLogBytes > 0 &&
        clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, szLogBytes, cBuildLog, NULL) != CL_SUCCESS)
    {
        cBuildLog[0] = '\0';
    }
    cBuildLog[szLogBytes] = '\0';

    // write out the build log
    shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, cBuildLog, HDASHLINE);

    free(cBuildLog);
}
// Helper function for releasing an array of cl_mem objects
// Walks the first iNumObjs entries in order and releases every non-null one;
// the array itself is neither freed nor modified.
// *********************************************************************
void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs)
{
    int iNext = 0;
    while (iNext < iNumObjs)
    {
        cl_mem cmObj = cmMemObjs[iNext++];
        if (!cmObj)
        {
            continue;
        }
        clReleaseMemObject(cmObj);
    }
}
// Helper function to get OpenCL error string from constant
// The table is indexed by the negated error code (CL_SUCCESS is 0, every
// error is negative). Codes -20..-29 are not assigned by the OpenCL headers
// and map to an empty string; anything outside the table, including positive
// values, yields "Unspecified Error".
// *********************************************************************
const char* oclErrorString(cl_int error)
{
    static const char* errorString[] = {
        "CL_SUCCESS",                                   //   0
        "CL_DEVICE_NOT_FOUND",                          //  -1
        "CL_DEVICE_NOT_AVAILABLE",
        "CL_COMPILER_NOT_AVAILABLE",
        "CL_MEM_OBJECT_ALLOCATION_FAILURE",
        "CL_OUT_OF_RESOURCES",
        "CL_OUT_OF_HOST_MEMORY",
        "CL_PROFILING_INFO_NOT_AVAILABLE",
        "CL_MEM_COPY_OVERLAP",
        "CL_IMAGE_FORMAT_MISMATCH",
        "CL_IMAGE_FORMAT_NOT_SUPPORTED",
        "CL_BUILD_PROGRAM_FAILURE",
        "CL_MAP_FAILURE",                               // -12
        "CL_MISALIGNED_SUB_BUFFER_OFFSET",              // -13 (OpenCL 1.1)
        "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST", // -14 (OpenCL 1.1)
        "CL_COMPILE_PROGRAM_FAILURE",                   // -15 (OpenCL 1.2)
        "CL_LINKER_NOT_AVAILABLE",
        "CL_LINK_PROGRAM_FAILURE",
        "CL_DEVICE_PARTITION_FAILED",
        "CL_KERNEL_ARG_INFO_NOT_AVAILABLE",             // -19
        "",                                             // -20 .. -29: unassigned
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "CL_INVALID_VALUE",                             // -30
        "CL_INVALID_DEVICE_TYPE",
        "CL_INVALID_PLATFORM",
        "CL_INVALID_DEVICE",
        "CL_INVALID_CONTEXT",
        "CL_INVALID_QUEUE_PROPERTIES",
        "CL_INVALID_COMMAND_QUEUE",
        "CL_INVALID_HOST_PTR",
        "CL_INVALID_MEM_OBJECT",
        "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
        "CL_INVALID_IMAGE_SIZE",
        "CL_INVALID_SAMPLER",
        "CL_INVALID_BINARY",
        "CL_INVALID_BUILD_OPTIONS",
        "CL_INVALID_PROGRAM",
        "CL_INVALID_PROGRAM_EXECUTABLE",
        "CL_INVALID_KERNEL_NAME",
        "CL_INVALID_KERNEL_DEFINITION",
        "CL_INVALID_KERNEL",
        "CL_INVALID_ARG_INDEX",
        "CL_INVALID_ARG_VALUE",
        "CL_INVALID_ARG_SIZE",
        "CL_INVALID_KERNEL_ARGS",
        "CL_INVALID_WORK_DIMENSION",
        "CL_INVALID_WORK_GROUP_SIZE",
        "CL_INVALID_WORK_ITEM_SIZE",
        "CL_INVALID_GLOBAL_OFFSET",
        "CL_INVALID_EVENT_WAIT_LIST",
        "CL_INVALID_EVENT",
        "CL_INVALID_OPERATION",
        "CL_INVALID_GL_OBJECT",
        "CL_INVALID_BUFFER_SIZE",
        "CL_INVALID_MIP_LEVEL",
        "CL_INVALID_GLOBAL_WORK_SIZE",                  // -63
        "CL_INVALID_PROPERTY",                          // -64 (OpenCL 1.1)
        "CL_INVALID_IMAGE_DESCRIPTOR",                  // -65 (OpenCL 1.2)
        "CL_INVALID_COMPILER_OPTIONS",
        "CL_INVALID_LINKER_OPTIONS",
        "CL_INVALID_DEVICE_PARTITION_COUNT",            // -68
        "CL_INVALID_PIPE_SIZE",                         // -69 (OpenCL 2.0)
        "CL_INVALID_DEVICE_QUEUE",                      // -70 (OpenCL 2.0)
        "CL_INVALID_SPEC_ID",                           // -71 (OpenCL 2.2)
        "CL_MAX_SIZE_RESTRICTION_EXCEEDED",             // -72 (OpenCL 2.2)
    };

    const int errorCount = sizeof(errorString) / sizeof(errorString[0]);

    // Range-check before negating so the most negative cl_int cannot overflow.
    if (error > 0 || error <= -errorCount)
    {
        return "Unspecified Error";
    }
    return errorString[-error];
}
// Helper function to get OpenCL image format string (channel order and type) from constant
// Accepts either a cl_channel_order or a cl_channel_type value and returns
// the matching constant's name, or "Unknown" for any other value.
// *********************************************************************
const char* oclImageFormatString(cl_uint uiImageFormat)
{
    switch (uiImageFormat)
    {
        // cl_channel_order
        case CL_R:                  return "CL_R";
        case CL_A:                  return "CL_A";
        case CL_RG:                 return "CL_RG";
        case CL_RA:                 return "CL_RA";
        case CL_RGB:                return "CL_RGB";
        case CL_RGBA:               return "CL_RGBA";
        case CL_BGRA:               return "CL_BGRA";
        case CL_ARGB:               return "CL_ARGB";
        case CL_INTENSITY:          return "CL_INTENSITY";
        case CL_LUMINANCE:          return "CL_LUMINANCE";

        // cl_channel_type
        case CL_SNORM_INT8:         return "CL_SNORM_INT8";
        case CL_SNORM_INT16:        return "CL_SNORM_INT16";
        case CL_UNORM_INT8:         return "CL_UNORM_INT8";
        case CL_UNORM_INT16:        return "CL_UNORM_INT16";
        case CL_UNORM_SHORT_565:    return "CL_UNORM_SHORT_565";
        case CL_UNORM_SHORT_555:    return "CL_UNORM_SHORT_555";
        case CL_UNORM_INT_101010:   return "CL_UNORM_INT_101010";
        case CL_SIGNED_INT8:        return "CL_SIGNED_INT8";
        case CL_SIGNED_INT16:       return "CL_SIGNED_INT16";
        case CL_SIGNED_INT32:       return "CL_SIGNED_INT32";
        case CL_UNSIGNED_INT8:      return "CL_UNSIGNED_INT8";
        case CL_UNSIGNED_INT16:     return "CL_UNSIGNED_INT16";
        case CL_UNSIGNED_INT32:     return "CL_UNSIGNED_INT32";
        case CL_HALF_FLOAT:         return "CL_HALF_FLOAT";
        case CL_FLOAT:              return "CL_FLOAT";

        // unknown constant
        default:                    return "Unknown";
    }
}

File diff suppressed because it is too large Load Diff

117
tests/opencl/common.mk Normal file
View File

@@ -0,0 +1,117 @@
# Shared build and run rules for the OpenCL tests.
# The including Makefile is expected to set PROJECT (host executable name) and
# SRCS (host sources) before including this file; OPTS may carry run arguments.

# Remove a target whose recipe failed, so a truncated kernel.pocl or .depend
# is never mistaken for an up-to-date file.
.DELETE_ON_ERROR:

XLEN ?= 32

TARGET ?= opaesim

XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0

ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
# The 64-bit double-float RISC-V ABI is named lp64d; "ilp64d" is not a valid ABI.
K_CFLAGS += -march=rv64imafd -mabi=lp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
K_CFLAGS += -march=rv32imaf -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif

RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)

POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime

VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)

FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae

LLVM_VORTEX ?= /opt/llvm-vortex
LLVM_POCL ?= /opt/llvm-vortex

# Flags handed to poclcc (through POCL_VORTEX_CFLAGS / POCL_VORTEX_LDFLAGS) for the device kernel.
K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
# NOTE(review): LLVM_VOTEX looks like a misspelling of LLVM_VORTEX; confirm which
# name the kernel sources test before renaming it.
K_CFLAGS += -I$(VORTEX_KN_PATH)/include -DNDEBUG -DLLVM_VOTEX
K_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a -lm

# Host-side compile and link flags.
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing

CXXFLAGS += -pthread
CXXFLAGS += -I$(POCL_RT_PATH)/include

LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_RT_PATH)/stub -lvortex

# HOSTGPU links against the system OpenCL library instead of the POCL runtime.
ifdef HOSTGPU
CXXFLAGS += -DHOSTGPU
LDFLAGS += -lOpenCL
else
LDFLAGS += $(POCL_RT_PATH)/lib/libOpenCL.so
endif

# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif

# OPAE driver library used by run-opae, chosen from TARGET.
ifeq ($(TARGET), fpga)
OPAE_DRV_PATHS ?= libopae-c.so
else ifeq ($(TARGET), asesim)
OPAE_DRV_PATHS ?= libopae-c-ase.so
else ifeq ($(TARGET), opaesim)
OPAE_DRV_PATHS ?= libopae-c-sim.so
endif

# One object per source, named <source file>.o in the current directory.
OBJS := $(addsuffix .o, $(notdir $(SRCS)))

.PHONY: all run-simx run-rtlsim run-opae run-xrt clean clean-all

all: $(PROJECT) kernel.pocl

kernel.pocl: kernel.cl
	LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o $@ $<

%.cc.o: %.cc
	$(CXX) $(CXXFLAGS) -c $< -o $@

%.cpp.o: %.cpp
	$(CXX) $(CXXFLAGS) -c $< -o $@

%.c.o: %.c
	$(CC) $(CXXFLAGS) -c $< -o $@

$(PROJECT): $(OBJS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

run-simx: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-rtlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-opae: $(PROJECT) kernel.pocl
	SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

# TARGET=hw runs without XCL_EMULATION_MODE; any other TARGET value is passed
# to XRT as the emulation mode.
run-xrt: $(PROJECT) kernel.pocl
ifeq ($(TARGET), hw)
	SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
else
	XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
endif

.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.dump *.pocl

# Skip the dependency file for the cleaning goals; otherwise make would
# regenerate .depend only for the recipe to delete it again.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif

View File

@@ -1 +0,0 @@
convolution

View File

@@ -1,67 +0,0 @@
# Build and run rules for the convolution OpenCL test (32-bit RISC-V target).
XLEN ?= 32

LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime

VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)

# Each K_* value carries its own double quotes so that it reaches poclcc as a
# single command-line argument.
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"

CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors

CXXFLAGS += -I$(POCL_RT_PATH)/include

LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/simx -lOpenCL -lvortex

# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif

PROJECT = convolution

SRCS = main.cpp utils.cpp

.PHONY: all run-fpga run-asesim run-vlsim run-simx run-rtlsim clean clean-all

all: $(PROJECT) kernel.pocl

kernel.pocl: kernel.cl
	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o $@ $<

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

# Each run target selects the driver flavour by putting its directory on LD_LIBRARY_PATH.
run-fpga: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-asesim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-vlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-simx: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-rtlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.pocl *.dump

# Skip the dependency file for the cleaning goals; otherwise make would
# regenerate .depend only for the recipe to delete it again.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 44 KiB

View File

@@ -1,54 +0,0 @@
__kernel
void convolution(
    __read_only  image2d_t sourceImage,
    __write_only image2d_t outputImage,
    int rows,
    int cols,
    __constant float* filter,
    int filterWidth,
    sampler_t sampler)
{
    // Position handled by this work-item: dimension 0 is the column,
    // dimension 1 is the row.
    const int column = get_global_id(0);
    const int row    = get_global_id(1);

    // The filter window reaches halfWidth taps to each side of the pixel.
    const int halfWidth = filterWidth / 2;

    // Image reads return a float4; only the 'x' component is used here.
    float4 sum = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    // Running index into the row-major filter coefficients.
    int tap = 0;

    // Coordinates used for every image access.
    int2 coords;

    // Walk the window row by row, column by column.
    for (int dy = -halfWidth; dy <= halfWidth; ++dy) {
        coords.y = row + dy;
        for (int dx = -halfWidth; dx <= halfWidth; ++dx) {
            coords.x = column + dx;
            const float4 pixel = read_imagef(sourceImage, sampler, coords);
            sum.x += pixel.x * filter[tap];
            ++tap;
        }
    }

    // Work-items outside the image produce no output.
    if (row >= rows || column >= cols) {
        return;
    }

    coords.x = column;
    coords.y = row;
    write_imagef(outputImage, coords, sum);
}

View File

@@ -1,261 +0,0 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include "utils.h"
// Rounds `value` up to the next multiple of `multiple`; a value that is
// already a multiple is returned unchanged. `multiple` must be non-zero.
unsigned int roundUp(unsigned int value, unsigned int multiple) {
  // Distance of `value` past the previous multiple
  const unsigned int excess = value % multiple;
  return (excess == 0) ? value : value + (multiple - excess);
}
// Reads the whole file at `kernelPath` into a freshly malloc'ed,
// NUL-terminated buffer and returns it; the caller owns the buffer and must
// free() it. Any failure (open, seek, allocation, short read) prints a
// message and terminates the process with exit(-1).
char* readSource(const char* kernelPath) {

  FILE *fp;
  char *source;
  long int size;
  size_t bytesRead;

  printf("Program file is: %s\n", kernelPath);

  fp = fopen(kernelPath, "rb");
  if(!fp) {
    printf("Could not open kernel file\n");
    exit(-1);
  }

  // Determine the file size by seeking to its end
  if(fseek(fp, 0, SEEK_END) != 0) {
    printf("Error seeking to end of file\n");
    fclose(fp);
    exit(-1);
  }
  size = ftell(fp);
  if(size < 0) {
    printf("Error getting file position\n");
    fclose(fp);
    exit(-1);
  }
  rewind(fp);

  // One extra byte for the terminating NUL. The allocation is checked
  // BEFORE the buffer is touched.
  source = (char *)malloc((size_t)size + 1);
  if(source == NULL) {
    printf("Error allocating space for the kernel source\n");
    fclose(fp);
    exit(-1);
  }

  bytesRead = fread(source, 1, (size_t)size, fp);
  fclose(fp);
  if(bytesRead != (size_t)size) {
    printf("Error reading kernel file\n");
    free(source);
    exit(-1);
  }
  source[size] = '\0';

  return source;
}
// Checks the result of an OpenCL call: returns silently on CL_SUCCESS,
// otherwise prints the name of the failed command together with the status
// code and terminates the process with exit(-1).
void chk(cl_int status, const char* cmd) {
  if(status == CL_SUCCESS) {
    return;
  }
  printf("%s failed (%d)\n", cmd, status);
  exit(-1);
}
// Runs a 7x7 image convolution on an OpenCL device, recomputes it on the
// host, and compares both results.
// Returns 0 when the device output matches the host reference, 1 otherwise.
int main() {

  int i, j, k, l;

  // Rows and columns in the input image
  int imageHeight;
  int imageWidth;

  const char* inputFile = "input.bmp";
  const char* outputFile = "output.bmp";

  // Homegrown function to read a BMP from file
  float* inputImage = readImage(inputFile, &imageWidth, &imageHeight);

  // Size in bytes of the input and output images on the host
  size_t dataSize = (size_t)imageHeight * (size_t)imageWidth * sizeof(float);

  // Output image and host reference image
  float* outputImage = (float*)malloc(dataSize);
  float* refImage = (float*)malloc(dataSize);
  if(outputImage == NULL || refImage == NULL) {
    printf("Error allocating space for the host images\n");
    exit(-1);
  }

  // Horizontal-gradient (Sobel-style) 3x3 stencil embedded in the centre of
  // a 7x7 filter; every other coefficient is zero.
  float filter[49] =
    {0, 0,  0, 0, 0, 0, 0,
     0, 0,  0, 0, 0, 0, 0,
     0, 0, -1, 0, 1, 0, 0,
     0, 0, -2, 0, 2, 0, 0,
     0, 0, -1, 0, 1, 0, 0,
     0, 0,  0, 0, 0, 0, 0,
     0, 0,  0, 0, 0, 0, 0};

  // The convolution filter is 7x7
  int filterWidth = 7;
  int filterSize = filterWidth*filterWidth;  // Assume a square kernel

  // Set up the OpenCL environment
  cl_int status;

  // Discover platform
  cl_platform_id platform;
  status = clGetPlatformIDs(1, &platform, NULL);
  chk(status, "clGetPlatformIDs");

  // Discover device (the return code must be captured, otherwise chk()
  // re-checks the previous call's status)
  cl_device_id device;
  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
  chk(status, "clGetDeviceIDs");

  // Create context
  cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
      (cl_context_properties)(platform), 0};
  cl_context context;
  context = clCreateContext(props, 1, &device, NULL, NULL, &status);
  chk(status, "clCreateContext");

  // Create command queue
  cl_command_queue queue;
  queue = clCreateCommandQueue(context, device, 0, &status);
  chk(status, "clCreateCommandQueue");

  // The image format describes how the data will be stored in memory
  cl_image_format format;
  format.image_channel_order     = CL_R;     // single channel
  format.image_channel_data_type = CL_FLOAT; // float data type

  // Create space for the source image on the device
  cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
      imageHeight, 0, NULL, &status);
  chk(status, "clCreateImage2D");

  // Create space for the output image on the device
  cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
      imageHeight, 0, NULL, &status);
  chk(status, "clCreateImage2D");

  // Create space for the 7x7 filter on the device
  cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
      NULL, &status);
  chk(status, "clCreateBuffer");

  // Copy the source image to the device
  size_t origin[3] = {0, 0, 0};  // Offset within the image to copy from
  // Elements per dimension; explicit casts avoid int -> size_t narrowing
  // inside a braced initializer.
  size_t region[3] = {(size_t)imageWidth, (size_t)imageHeight, 1};
  status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
      0, 0, inputImage, 0, NULL, NULL);
  chk(status, "clEnqueueWriteImage");

  // Copy the 7x7 filter to the device
  status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
      filterSize*sizeof(float), filter, 0, NULL, NULL);
  chk(status, "clEnqueueWriteBuffer");

  // Create the image sampler
  cl_sampler sampler = clCreateSampler(context, CL_FALSE,
      CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
  chk(status, "clCreateSampler");

  // A writable array is passed so the call does not rely on converting a
  // string literal to a non-const char pointer.
  char kernelPath[] = "kernel.cl";
  const char* source = readSource(kernelPath);

  // Create a program object with source and build it. The error code is
  // requested explicitly so the following chk() tests this call.
  cl_program program;
  program = clCreateProgramWithSource(context, 1, &source, NULL, &status);
  chk(status, "clCreateProgramWithSource");
  status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
  chk(status, "clBuildProgram");

  // Create the kernel object
  cl_kernel kernel;
  kernel = clCreateKernel(program, "convolution", &status);
  chk(status, "clCreateKernel");

  // Set the kernel arguments
  status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
  status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
  status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
  status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
  status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
  status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
  status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
  chk(status, "clSetKernelArg");

  // Set the work item dimensions: one work-item per pixel
  size_t globalSize[2] = {(size_t)imageWidth, (size_t)imageHeight};
  status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
      NULL, NULL);
  chk(status, "clEnqueueNDRange");

  // Read the image back to the host (blocking)
  status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
      region, 0, 0, outputImage, 0, NULL, NULL);
  chk(status, "clEnqueueReadImage");

  // Write the output image to file
  storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);

  // Compute the reference image on the host. Every pixel is assigned below,
  // so no separate zero-fill pass is needed.
  // NOTE(review): taps outside the image are skipped here (zero padding)
  // while the device sampler uses CL_ADDRESS_CLAMP_TO_EDGE - border pixels
  // may legitimately differ; confirm which behaviour is intended.
  int halfFilterWidth = filterWidth/2;
  float sum;
  for(i = 0; i < imageHeight; i++) {
    for(j = 0; j < imageWidth; j++) {
      sum = 0; // Reset sum for new source pixel
      // Apply the filter to the neighborhood
      for(k = -halfFilterWidth; k <= halfFilterWidth; k++) {
        for(l = -halfFilterWidth; l <= halfFilterWidth; l++) {
          if(i+k >= 0 && i+k < imageHeight &&
             j+l >= 0 && j+l < imageWidth) {
            sum += inputImage[(i+k)*imageWidth + j+l] *
                   filter[(k+halfFilterWidth)*filterWidth +
                          l+halfFilterWidth];
          }
        }
      }
      refImage[i*imageWidth+j] = sum;
    }
  }

  // Compare device output with the reference; stop at the first mismatch.
  // fabsf() is required: integer abs() would truncate the float difference
  // and hide every error smaller than 1.0.
  int failed = 0;
  for(i = 0; i < imageHeight && !failed; i++) {
    for(j = 0; j < imageWidth; j++) {
      if(fabsf(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01f) {
        printf("Results are INCORRECT\n");
        printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
               outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
        failed = 1;
        break;
      }
    }
  }
  if(!failed) {
    printf("Results are correct\n");
  }

  // Release device objects and host memory
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseSampler(sampler);
  clReleaseMemObject(d_filter);
  clReleaseMemObject(d_outputImage);
  clReleaseMemObject(d_inputImage);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  free((void*)source);
  free(refImage);
  free(outputImage);
  free(inputImage);

  // Report a verification failure through the exit code as well
  return failed ? 1 : 0;
}

View File

@@ -1,180 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include "utils.h"
// Writes `imageOut` as a BMP file named `filename`.
//
// The BMP header (everything before the pixel-data offset) is copied
// verbatim from `refFilename`, whose width and height also decide how many
// pixels are written. Each float pixel is truncated to one unsigned byte.
//
//   imageOut    - row-major pixel data, `cols` floats per row, top row first
//   filename    - output file
//   rows, cols  - dimensions of imageOut; the reference image must fit
//   refFilename - existing BMP supplying the header
//
// Any I/O or consistency error prints a message and calls exit(-1).
void storeImage(float *imageOut,
                const char *filename,
                int rows,
                int cols,
                const char* refFilename) {

  FILE *ifp, *ofp;
  unsigned char tmp = 0;   // initialised: also used as padding filler
  int offset = 0;
  unsigned char *buffer;
  int i, j;
  size_t bytes;

  int height = 0, width = 0;

  ifp = fopen(refFilename, "rb");
  if(ifp == NULL) {
    // Report the file that actually failed to open
    perror(refFilename);
    exit(-1);
  }

  // Header fields: pixel-data offset at byte 10, width and height at 18/22
  if(fseek(ifp, 10, SEEK_SET) != 0 || fread(&offset, 4, 1, ifp) != 1 ||
     fseek(ifp, 18, SEEK_SET) != 0 || fread(&width, 4, 1, ifp) != 1 ||
     fread(&height, 4, 1, ifp) != 1 || offset <= 0) {
    printf("error reading header of reference image!\n");
    fclose(ifp);
    exit(-1);
  }

  // The loops below index imageOut with the reference dimensions; refuse to
  // read past the end of the caller's buffer.
  if(width > cols || height > rows) {
    printf("reference image is larger than the output image!\n");
    fclose(ifp);
    exit(-1);
  }

  buffer = (unsigned char *)malloc(offset);
  if(buffer == NULL) {
    perror("malloc");
    fclose(ifp);
    exit(-1);
  }

  if(fseek(ifp, 0, SEEK_SET) != 0 ||
     fread(buffer, 1, offset, ifp) != (size_t)offset) {
    printf("error reading header of reference image!\n");
    free(buffer);
    fclose(ifp);
    exit(-1);
  }
  fclose(ifp);

  printf("Writing output image to %s\n", filename);
  ofp = fopen(filename, "wb");
  if(ofp == NULL) {
    perror("opening output file");
    free(buffer);
    exit(-1);
  }

  bytes = fwrite(buffer, 1, offset, ofp);
  free(buffer);
  if(bytes != (size_t)offset) {
    printf("error writing header!\n");
    fclose(ofp);
    exit(-1);
  }

  // In bmp format, rows must be a multiple of 4 bytes; `mod` is the number
  // of padding bytes needed per row.
  int mod = width % 4;
  if(mod != 0) {
    mod = 4 - mod;
  }

  // NOTE bmp formats store data in reverse raster order, so the image is
  // written bottom row first.
  for(i = height-1; i >= 0; i--) {
    for(j = 0; j < width; j++) {
      tmp = (unsigned char)imageOut[i*cols+j];
      fwrite(&tmp, sizeof(char), 1, ofp);
    }
    // Padding content is irrelevant; the last pixel value is reused.
    for(j = 0; j < mod; j++) {
      fwrite(&tmp, sizeof(char), 1, ofp);
    }
  }

  fclose(ofp);
}
/*
 * Reads a BMP image with one byte per pixel and returns it as a malloc'ed,
 * row-major float array with the top image row first. The image width and
 * height are stored through widthOut / heightOut. The caller owns the
 * returned buffer and must free() it.
 *
 * Any I/O error, truncated file, or non-positive dimension prints a message
 * and calls exit(-1).
 */
float* readImage(const char *filename, int* widthOut, int* heightOut) {

  int height = 0, width = 0;
  unsigned char tmp;
  int offset = 0;
  int i, j;

  printf("Reading input image from %s\n", filename);
  FILE *fp = fopen(filename, "rb");
  if(fp == NULL) {
    perror(filename);
    exit(-1);
  }

  // Header fields: pixel-data offset at byte 10, width and height at 18/22
  if(fseek(fp, 10, SEEK_SET) != 0 || fread(&offset, 4, 1, fp) != 1 ||
     fseek(fp, 18, SEEK_SET) != 0 || fread(&width, 4, 1, fp) != 1 ||
     fread(&height, 4, 1, fp) != 1) {
    printf("error reading image header!\n");
    fclose(fp);
    exit(-1);
  }

  printf("width = %d\n", width);
  printf("height = %d\n", height);

  // Reject dimensions the allocation and the loops below cannot handle
  if(width <= 0 || height <= 0 || offset < 0) {
    printf("unsupported image dimensions!\n");
    fclose(fp);
    exit(-1);
  }

  *widthOut = width;
  *heightOut = height;

  float* floatImage = (float*)malloc(sizeof(float)*(size_t)width*(size_t)height);
  if(floatImage == NULL) {
    perror("malloc");
    fclose(fp);
    exit(-1);
  }

  if(fseek(fp, offset, SEEK_SET) != 0) {
    printf("error seeking to pixel data!\n");
    free(floatImage);
    fclose(fp);
    exit(-1);
  }

  fflush(NULL);

  // For the bmp format each row is padded to a multiple of 4 bytes
  int mod = width % 4;
  if(mod != 0) {
    mod = 4 - mod;
  }

  // NOTE bitmaps are stored in upside-down raster order: the first row in
  // the file is the bottom row of the image. Each file row is therefore
  // stored directly at its flipped position, converted to float on the fly.
  for(i = 0; i < height; i++) {
    float* dstRow = floatImage + (size_t)(height - 1 - i) * (size_t)width;
    for(j = 0; j < width; j++) {
      if(fread(&tmp, sizeof(char), 1, fp) != 1) {
        printf("error reading pixel data!\n");
        free(floatImage);
        fclose(fp);
        exit(-1);
      }
      dstRow[j] = (float)tmp;
    }
    // Skip the row padding; its content is never used
    for(j = 0; j < mod; j++) {
      if(fread(&tmp, sizeof(char), 1, fp) != 1) {
        break;
      }
    }
  }

  fclose(fp);

  return floatImage;
}

View File

@@ -1,11 +0,0 @@
// Helpers for reading and writing the BMP images used by this sample.
#ifndef __UTILS__
#define __UTILS__
// Byte type used for raw pixel values
typedef unsigned char uchar;
// Reads the BMP file `filename` and returns its pixels as a malloc'ed float
// array (one float per pixel, top row first); the image width and height
// are stored through widthOut / heightOut.
float* readImage(const char *filename, int* widthOut, int* heightOut);
// Writes `imageOut` to the BMP file `filename`. The header and the image
// dimensions are taken from the existing BMP `refFilename`; `cols` is the
// row stride of imageOut.
void storeImage(float *imageOut, const char *filename, int rows, int cols,
const char* refFilename);
#endif

View File

@@ -1,69 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = cutcp
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c
all: $(PROJECT).dump $(PROJECT).hex
CXXFLAGS += -I.
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
OPTS ?=
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
include ../common.mk

View File

@@ -19,6 +19,27 @@
#include "macros.h"
#include "ocl.h"
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (NULL == filename || NULL == data || 0 == size)
return CL_INVALID_VALUE;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return CL_INVALID_VALUE;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return CL_SUCCESS;
}
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
typedef cl_int4 xyz;
@@ -294,8 +315,6 @@ int gpu_compute_cutoff_potential_lattice(
printf("\n");
}
printf("Ok!\n");
pb_Context* pb_context;
pb_context = pb_InitOpenCLContext(parameters);
if (pb_context == NULL) {
@@ -303,8 +322,6 @@ int gpu_compute_cutoff_potential_lattice(
return -1;
}
printf("Ok!\n");
cl_int clStatus;
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
@@ -317,8 +334,13 @@ int gpu_compute_cutoff_potential_lattice(
//const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
cl_program clProgram = clCreateProgramWithBuiltInKernels(
clContext, 1, &clDevice, "opencl_cutoff_potential_lattice", &clStatus);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
cl_program clProgram = clCreateProgramWithBinary(
clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
CHECK_ERROR("clCreateProgramWithSource")
char clOptions[50];
@@ -399,9 +421,6 @@ int gpu_compute_cutoff_potential_lattice(
clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
CHECK_ERROR("clSetKernelArg")
printf("Ok!!\n");
/* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
@@ -412,26 +431,16 @@ int gpu_compute_cutoff_potential_lattice(
clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
CHECK_ERROR("clSetKernelArg")
printf("Ok**!2\n");
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
printf("Ok**!2\n");
CHECK_ERROR("clEnqueueNDRangeKernel")
printf("Ok**!2\n");
clStatus = clFinish(clCommandQueue);
printf("Ok**!2\n");
CHECK_ERROR("clFinish")
}
printf("Ok++!\n");
printf("Finished OpenCL kernel calls \n");
printf("Finished OpenCL kernel calls\n");
/* copy result regions from OpenCL device */
pb_SwitchToTimer(timers, pb_TimerID_COPY);

View File

@@ -9,6 +9,10 @@
#ifndef __GPUINFOH__
#define __GPUINFOH__
#ifdef __cplusplus
extern "C" {
#endif
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
@@ -17,4 +21,8 @@ void compute_active_thread(size_t *thread,
int minor,
int sm);
#ifdef __cplusplus
}
#endif
#endif

Binary file not shown.

View File

@@ -124,8 +124,6 @@ int main(int argc, char *argv[]) {
pb_InitializeTimerSet(&timers);
pb_SwitchToTimer(&timers, pb_TimerID_IO);
printf("OK\n");
{
const char *pqrfilename = parameters->inpFiles[0];
@@ -136,8 +134,6 @@ int main(int argc, char *argv[]) {
printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
}
printf("OK\n");
/* find extent of domain */
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
get_atom_extent(&min_ext, &max_ext, atom);

View File

@@ -3,6 +3,10 @@
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
void clMemSet(cl_command_queue, cl_mem, int, size_t);
char* readFile(const char*);
@@ -14,4 +18,8 @@ char* readFile(const char*);
exit(1); \
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,7 @@
# Name of this test project
PROJECT = dotproduct
# Host sources that make up the project
SRCS = main.cc oclUtils.cpp shrUtils.cpp cmd_arg_reader.cpp
# Default options; may be overridden from the command line or environment.
# NOTE(review): presumably passed to the program by the shared rules - confirm in common.mk
OPTS ?= -n64
# Rules shared by the sibling test directories
include ../common.mk

View File

@@ -0,0 +1,152 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
// includes, file
#include "cmd_arg_reader.h"
// includes, system
#include <vector>
// internal unnamed namespace
namespace
{
// types, internal (class, enum, struct, union, typedef)
// variables, internal
} // namespace {
// variables, exported
// Definitions of the static data members declared in class CmdArgReader.
// They have static storage duration and no initializer, so they start out
// zero-initialized (self == NULL until init() creates the instance).
/*static*/ CmdArgReader* CmdArgReader::self;
/*static*/ char** CmdArgReader::rargv;
/*static*/ int CmdArgReader::rargc;
// functions, exported
////////////////////////////////////////////////////////////////////////////////
//! Create the single CmdArgReader instance from the program arguments.
//! Calls made after the instance exists return without doing anything.
//! @param argc  number of command line arguments (as given to main())
//! @param argv  command line argument strings (as given to main())
////////////////////////////////////////////////////////////////////////////////
/*static*/ void
CmdArgReader::init( const int argc, const char** argv)
{
    // already initialized: nothing to do
    if ( self != NULL)
    {
        return;
    }

    // both the count and the argument vector are required
    const bool haveArgs = ( argc != 0) && ( argv != 0);
    if ( ! haveArgs)
    {
        LOGIC_EXCEPTION( "No command line arguments given.");
    }

    // build the instance and let it tokenize the arguments
    self = new CmdArgReader();
    self->createArgsMaps( argc, argv);

    // keep the raw arguments available as well
    rargc = argc;
    rargv = const_cast<char**>( argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Default constructor: both maps and both iterators start out
//! default-constructed.
////////////////////////////////////////////////////////////////////////////////
CmdArgReader::CmdArgReader()
    : args(), unprocessed(), iter(), iter_unprocessed()
{
}
////////////////////////////////////////////////////////////////////////////////
//! Destructor: releases every converted value stored in the 'args' map.
//! The values are held as void*, so each one is cast back to the type
//! recorded beside it before being deleted.
////////////////////////////////////////////////////////////////////////////////
CmdArgReader::~CmdArgReader()
{
    // Visit EVERY entry. Leaving the loop after the first delete would leak
    // all remaining values.
    for( iter = args.begin(); iter != args.end(); ++iter)
    {
        const std::type_info& type = *(iter->second.first);
        void* const value = iter->second.second;

        if( type == typeid( int))
        {
            delete static_cast<int*>( value);
        }
        else if( type == typeid( bool))
        {
            delete static_cast<bool*>( value);
        }
        // NOTE(review): float and double have convertToT specializations, so
        // values of these types can presumably be stored as well - confirm
        // against getArgHelper.
        else if( type == typeid( float))
        {
            delete static_cast<float*>( value);
        }
        else if( type == typeid( double))
        {
            delete static_cast<double*>( value);
        }
        else if( type == typeid( std::string))
        {
            delete static_cast<std::string*>( value);
        }
        else if( type == typeid( std::vector< std::string>) )
        {
            delete static_cast< std::vector< std::string>* >( value);
        }
        else if( type == typeid( std::vector<int>) )
        {
            delete static_cast< std::vector<int>* >( value);
        }

        // the stored pointer is no longer valid
        iter->second.second = NULL;
    }
}
////////////////////////////////////////////////////////////////////////////////
//! Read args as token value pair into the 'unprocessed' map for better
//! processing (the values remain strings until a parameter value is
//! requested by the program).
//! Every argument must start with '-' or '--'. "--token=value" stores value
//! under token; an argument without '=' is stored with the value "FLAG".
//! @param argc  the argument count (as given to 'main')
//! @param argv  the char* array containing the command line arguments
////////////////////////////////////////////////////////////////////////////////
void
CmdArgReader::createArgsMaps( const int argc, const char** argv) {

    // argv[0] is the program name and is skipped
    for( int i=1; i<argc; ++i)
    {
        const std::string arg( argv[i]);

        // check if valid command line argument: all arguments begin with - or --
        if ( arg.empty() || arg[0] != '-')
        {
            RUNTIME_EXCEPTION("Invalid command line argument.");
        }

        // number of leading dashes to strip from the token
        const std::string::size_type numDashes =
            ( arg.length() > 1 && arg[1] == '-') ? 2 : 1;

        // check if only flag or if a value is given
        const std::string::size_type pos = arg.find( '=');
        if ( pos == std::string::npos)
        {
            unprocessed[ arg.substr( numDashes)] = "FLAG";
        }
        else
        {
            unprocessed[ arg.substr( numDashes, pos - numDashes)] =
                arg.substr( pos + 1);
        }
    }
}

View File

@@ -0,0 +1,488 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _CMDARGREADER_H_
#define _CMDARGREADER_H_
// includes, system
#include <map>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <typeinfo>
// includes, project
#include "exception.h"
//! Preprocessed command line arguments
//! @note Lazy evaluation: The arguments are converted from strings to
//! the correct data type upon request. Converted values are stored
//! in an additional map so that no additional conversion is
//! necessary. Arrays of command line arguments are stored in
//! std::vectors
//! @note Usage:
//! const std::string* file =
//! CmdArgReader::getArg< std::string>( "model")
//! const std::vector< std::string>* files =
//! CmdArgReader::getArg< std::vector< std::string> >( "model")
//! @note All command line arguments begin with '--' followed by the token;
//! token and value are separated by '='; example --samples=50
//! @note Arrays have the form --model=[one.obj,two.obj,three.obj]
//! (without whitespaces)
//! Command line argument parser
class CmdArgReader
{
template<class> friend class TestCmdArgReader;
protected:
//! handle to the only instance of this class
static CmdArgReader* self;
public:
//! Public construction interface
//! @param argc number of command line arguments (as given to main())
//! @param argv command line argument string (as given to main())
static void init( const int argc, const char** argv);
public:
//! Get the value of the command line argument with given name
//! @return A const handle to the requested argument.
//! If the argument does not exist or if it
//! is not of type T, NULL is returned
//! @param name the name of the requested argument
//! @note T the type of the argument requested
template<class T>
static inline const T* getArg( const std::string& name);
//! Check if a command line argument with the given name exists
//! @return true if a command line argument with name \a name exists,
//! otherwise false
//! @param name name of the command line argument in question
static inline bool existArg( const std::string& name);
//! Get the original / raw argc program argument
static inline int& getRArgc();
//! Get the original / raw argv program argument
static inline char**& getRArgv();
public:
//! Destructor
~CmdArgReader();
protected:
//! Constructor, default
CmdArgReader();
private:
// private helper functions
//! Get the value of the command line argument with given name
//! @note Private helper function for 'getArg' to work on the members
//! @return A const handle to the requested argument. If the argument
//! does not exist or if it is not of type T, a NULL pointer
//! is returned.
//! @param name the name of the requested argument
//! @note T the type of the argument requested
template<class T>
inline const T* getArgHelper( const std::string& name);
//! Check if a command line argument with name \a name exists
//! @return true if a command line argument of name \a name exists,
//! otherwise false
//! @param name the name of the requested argument
inline bool existArgHelper( const std::string& name) const;
//! Read args as token value pair into map for better processing
//! (Even the values remain strings until the parameter value is
//! requested by the program.)
//! @param argc the argument count (as given to 'main')
//! @param argv the char* array containing the command line arguments
void createArgsMaps( const int argc, const char** argv);
//! Helper for "casting" the strings from the map with the unprocessed
//! values to the correct
//! data type.
//! @return true if conversion succeeded, otherwise false
//! @param element the value as string
//! @param val the value as type T
template<class T>
static inline bool convertToT( const std::string& element, T& val);
public:
// typedefs internal
//! container for a processed command line argument
//! typeid is used to easily be able to decide if a re-requested token-value
//! pair matches the type of the first conversion
typedef std::pair< const std::type_info*, void*> ValType;
//! map of already converted values
typedef std::map< std::string, ValType > ArgsMap;
//! iterator for the map of already converted values
typedef ArgsMap::iterator ArgsMapIter;
typedef ArgsMap::const_iterator ConstArgsMapIter;
//! map of unprocessed (means unconverted) token-value pairs
typedef std::map< std::string, std::string> UnpMap;
//! iterator for the map of unprocessed (means unconverted) token-value pairs
typedef std::map< std::string, std::string>::iterator UnpMapIter;
private:
#ifdef _WIN32
# pragma warning( disable: 4251)
#endif
//! rargc original value of argc
static int rargc;
//! rargv contains command line arguments in raw format
static char** rargv;
//! args Map containing the already converted token-value pairs
ArgsMap args;
//! unprocessed Map containing the unprocessed / unconverted token-value pairs
UnpMap unprocessed;
//! iter Iterator for the map with the already converted token-value
//! pairs (to avoid frequent reallocation)
ArgsMapIter iter;
//! iter_unprocessed Iterator for the map with the unconverted token-value
//! pairs (to avoid frequent reallocation)
UnpMapIter iter_unprocessed;
#ifdef _WIN32
# pragma warning( default: 4251)
#endif
private:
//! Constructor, copy (not implemented)
CmdArgReader( const CmdArgReader&);
//! Assignment operator (not implemented)
CmdArgReader& operator=( const CmdArgReader&);
};
// variables, exported (extern)
// functions, inlined (inline)
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line argument arrays
//! @note This function is used for each type for which no template
//! specialization exists (which will cause errors if the type does not
//! fulfill the std::vector interface).
//! The first and the last character of \a element are treated as the array
//! brackets and skipped; the text in between is split at ',' and every piece
//! is converted with the specialization for T::value_type.
//! @return true if every element could be converted, otherwise false
//! @param element  the array as string, e.g. "[1,2,3]"
//! @param val      receives the converted elements
////////////////////////////////////////////////////////////////////////////////
template<class T>
/*static*/ inline bool
CmdArgReader::convertToT( const std::string& element, T& val)
{
    // an array needs at least its two enclosing brackets; shorter input
    // cannot be split (and would index past the end of the string)
    if ( element.length() < 2)
    {
        return false;
    }

    // preallocate storage
    val.resize( std::count( element.begin(), element.end(), ',') + 1);

    unsigned int i = 0;
    std::string::size_type pos_start = 1;  // leave array prefix '['
    std::string::size_type pos_end = 0;

    // do for all elements of the comma separated list
    while( std::string::npos != ( pos_end = element.find(',', pos_end+1)) )
    {
        // convert each element by the appropriate function
        if ( ! convertToT< typename T::value_type >(
                std::string( element, pos_start, pos_end - pos_start), val[i]))
        {
            return false;
        }

        pos_start = pos_end + 1;
        ++i;
    }

    // process last element (leave array postfix ']')
    return convertToT< typename T::value_type >(
                std::string( element,
                             pos_start,
                             element.length() - pos_start - 1),
                val[i]);
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type int
//! @return true only if the extraction succeeded AND consumed the whole
//!         string
//! @param element  the value as string
//! @param val      receives the converted value
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<int>( const std::string& element, int& val)
{
    std::istringstream ios( element);
    ios >> val;

    // eof() alone is not enough: a failed extraction on an empty string (or
    // an out-of-range number) also reaches the end of the input.
    return ! ios.fail() && ios.eof();
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type float
//! @return true only if the extraction succeeded AND consumed the whole
//!         string
//! @param element  the value as string
//! @param val      receives the converted value
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<float>( const std::string& element, float& val)
{
    std::istringstream ios( element);
    ios >> val;

    // eof() alone is not enough: a failed extraction on an empty string (or
    // an out-of-range number) also reaches the end of the input.
    return ! ios.fail() && ios.eof();
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type double
//! @return true only if the extraction succeeded AND consumed the whole
//!         string
//! @param element  the value as string
//! @param val      receives the converted value
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<double>( const std::string& element, double& val)
{
    std::istringstream ios( element);
    ios >> val;

    // eof() alone is not enough: a failed extraction on an empty string (or
    // an out-of-range number) also reaches the end of the input.
    return ! ios.fail() && ios.eof();
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type string: the value
//! is copied unchanged, so the conversion always succeeds.
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<std::string>( const std::string& element,
                                       std::string& val)
{
    val.assign( element);
    return true;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type bool.
//! Accepted spellings are the strings "true" / "false" and the integers
//! 1 / 0; anything else is rejected and leaves \a val untouched.
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<bool>( const std::string& element, bool& val)
{
    // value given as string-type { true | false }
    if ( element == "true")
    {
        val = true;
        return true;
    }
    if ( element == "false")
    {
        val = false;
        return true;
    }

    // value given as integer { 0 | 1 }
    int asInt;
    if ( ! convertToT<int>( element, asInt))
    {
        return false;
    }
    if ( asInt != 0 && asInt != 1)
    {
        return false;
    }

    val = ( asInt == 1);
    return true;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the value of the command line argument with the given name
//! @return A const handle to the requested argument, or NULL if the argument
//!         does not exist or is not of type T
//! @param T  the type of the argument requested
//! @param name  the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
template<class T>
/*static*/ const T*
CmdArgReader::getArg( const std::string& name)
{
    // normal path: forward to the singleton instance
    if( self)
    {
        return self->getArgHelper<T>( name);
    }
    // reader was never initialized; report it (the macro is expected to throw,
    // the return below only keeps every control path well-formed)
    RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
    return NULL;
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a command line argument with the given name exists
//! @return true if a command line argument with name \a name exists,
//!         otherwise false
//! @param name  name of the command line argument in question
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline bool
CmdArgReader::existArg( const std::string& name)
{
    if( ! self)
    {
        // name this function (not getArg) so the report points at the real caller
        RUNTIME_EXCEPTION("CmdArgReader::existArg(): CmdArgReader not initialized.");
        // only reached if the macro does not throw
        return false;
    }
    return self->existArgHelper( name);
}
////////////////////////////////////////////////////////////////////////////////
//! @brief Get the value of the command line argument with the given name
//! The first successful request converts the raw string to type T and caches
//! the result; later requests for the same name return the cached value.
//! @return A const handle to the requested argument, or NULL if the argument
//!         does not exist, cannot be converted to T, or was already cached
//!         with a different type
//! @param T  the type of the argument requested
//! @param name  the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
template<class T>
const T*
CmdArgReader::getArgHelper( const std::string& name)
{
    // check if argument already processed and stored in correct type
    iter = args.find( name);
    if ( args.end() != iter)
    {
        if ( (*(iter->second.first)) == typeid( T) )
        {
            return static_cast<const T*>( iter->second.second);
        }
        // cached with another type: no second conversion is attempted
        return NULL;
    }
    // check the array with unprocessed values
    iter_unprocessed = unprocessed.find( name);
    if ( unprocessed.end() == iter_unprocessed)
    {
        // failed, argument not available (nothing was allocated for it)
        return NULL;
    }
    // allocate only now that there really is a string to convert
    T* tmp = new T;
    // try to "cast" the string to the type requested
    if ( ! convertToT< T >( iter_unprocessed->second, *tmp))
    {
        // not inserted into the map -> cleanup
        delete tmp;
        return NULL;
    }
    // add the token element pair to map of already converted values
    args[name] = std::make_pair( &(typeid( T)), static_cast<void*>( tmp));
    return tmp;
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a command line argument with name \a name exists
//! @return true if a command line argument of name \a name exists,
//!         otherwise false
//! @param name  the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
inline bool
CmdArgReader::existArgHelper( const std::string& name) const
{
    // an argument exists if it is either among the already converted values ...
    const bool already_converted = ( args.end() != args.find( name));
    if( already_converted)
    {
        return true;
    }
    // ... or still waiting among the unprocessed raw strings
    return ( unprocessed.end() != unprocessed.find( name));
}
////////////////////////////////////////////////////////////////////////////////
//! Get the original / raw argc program argument
//! @return reference to the stored raw argument count
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline int&
CmdArgReader::getRArgc()
{
    // the raw arguments are only meaningful once the reader has been set up
    const bool initialized = ! ( ! self);
    if( ! initialized)
        RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
    return rargc;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the original / raw argv program argument
//! @return reference to the stored raw argument vector
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline char**&
CmdArgReader::getRArgv()
{
    if( ! self)
    {
        // name this function (not getRArgc) so the report points at the real caller
        RUNTIME_EXCEPTION("CmdArgReader::getRArgv(): CmdArgReader not initialized.");
    }
    return rargv;
}
// functions, exported (extern)
#endif // #ifndef _CMDARGREADER_H_

View File

@@ -0,0 +1,151 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _EXCEPTION_H_
#define _EXCEPTION_H_
// includes, system
#include <exception>
#include <stdexcept>
#include <iostream>
#include <stdlib.h>
//! Exception wrapper.
//! Derives from a standard exception type and prefixes its message with the
//! source location (file and line) at which it was raised.
//! @param Std_Exception Exception out of namespace std for easy typing.
template<class Std_Exception>
class Exception : public Std_Exception
{
public:
    //! @brief Static construction interface
    //! @return Never returns normally; always throws Exception<Std_Exception>
    //! @param file file in which the Exception occurs
    //! @param line line in which the Exception occurs
    //! @param detailed details on the code fragment causing the Exception
    static void throw_it( const char* file,
                          const int line,
                          const char* detailed = "-" );
    //! Static construction interface
    //! @return Never returns normally; always throws Exception<Std_Exception>
    //! @param file file in which the Exception occurs
    //! @param line line in which the Exception occurs
    //! @param detailed details on the code fragment causing the Exception
    static void throw_it( const char* file,
                          const int line,
                          const std::string& detailed);
    //! Destructor
    virtual ~Exception() throw();
private:
    //! Constructor, default (private)
    Exception();
    //! Constructor, standard
    //! @param str string returned by what()
    Exception( const std::string& str);
};
////////////////////////////////////////////////////////////////////////////////
//! Exception handler function for arbitrary exceptions
//! Prints what() to std::cerr and terminates the program with EXIT_FAILURE.
//! @param ex exception to handle
////////////////////////////////////////////////////////////////////////////////
template<class Exception_Typ>
inline void
handleException( const Exception_Typ& ex)
{
    std::cerr << ex.what() << std::endl;
    exit( EXIT_FAILURE);
}
//! Convenience macros
//! Each one throws and records the location of the macro invocation.
//! Exception caused by dynamic program behavior, e.g. file does not exist
#define RUNTIME_EXCEPTION( msg) \
Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
//! Logic exception in program, e.g. an assert failed
#define LOGIC_EXCEPTION( msg) \
Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
//! Out of range exception
#define RANGE_EXCEPTION( msg) \
Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
////////////////////////////////////////////////////////////////////////////////
//! Implementation
// includes, system
#include <sstream>
////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! Builds the message from the code location (file and line) and the detailed
//! description, then throws.
//! @param file file in which the Exception occurs (NULL is reported as "<unknown>")
//! @param line line in which the Exception occurs
//! @param detailed detailed infos (NULL is reported as "-")
////////////////////////////////////////////////////////////////////////////////
/*static*/ template<class Std_Exception>
void
Exception<Std_Exception>::
throw_it( const char* file, const int line, const char* detailed)
{
    std::stringstream s;
    // Streaming a null char pointer is undefined behavior, so substitute
    // placeholders instead of losing (part of) the message
    const char* const safe_file = file ? file : "<unknown>";
    const char* const safe_detailed = detailed ? detailed : "-";
    // Quite heavy-weight but exceptions are not for
    // performance / release versions
    s << "Exception in file '" << safe_file << "' in line " << line << "\n"
      << "Detailed description: " << safe_detailed << "\n";
    throw Exception( s.str());
}
////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! Convenience overload that forwards to the const char* variant.
//! @param file file in which the Exception occurs
//! @param line line in which the Exception occurs
//! @param msg detailed infos
////////////////////////////////////////////////////////////////////////////////
/*static*/ template<class Std_Exception>
void
Exception<Std_Exception>::
throw_it( const char* file, const int line, const std::string& msg)
{
    throw_it( file, line, msg.c_str());
}
////////////////////////////////////////////////////////////////////////////////
//! Constructor, default (private).
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::Exception() :
    // initialize the base directly; naming Exception here would be a
    // delegating constructor, which is only valid from C++11 on
    Std_Exception( "Unknown Exception.\n")
{ }
////////////////////////////////////////////////////////////////////////////////
//! Constructor, standard (private).
//! String returned by what().
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::Exception( const std::string& s) :
    Std_Exception( s)
{ }
////////////////////////////////////////////////////////////////////////////////
//! Destructor
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::~Exception() throw() { }
// functions, exported
#endif // #ifndef _EXCEPTION_H_

View File

@@ -0,0 +1,22 @@
__kernel void DotProduct (__global float* a, __global float* b, __global float* c, int iNumElements)
{
    // Each work-item produces one output element: the dot product of the
    // four consecutive floats of a and b that start at index 4 * gid.
    const int gid = get_global_id(0);
    // work-items at or past iNumElements have nothing to compute
    if (gid < iNumElements)
    {
        // first of the four input floats belonging to this work-item
        const int base = gid << 2;
        c[gid] = a[base] * b[base]
               + a[base + 1] * b[base + 1]
               + a[base + 2] * b[base + 2]
               + a[base + 3] * b[base + 3];
    }
}

View File

@@ -1,3 +1,4 @@
//////////////////////////////////////////////////////////////////////////
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
@@ -10,25 +11,26 @@
*/
// *********************************************************************
// oclDotProduct Notes:
// oclDotProduct Notes:
//
// A simple OpenCL API demo application that implements a
// vector dot product computation between 2 float arrays.
// vector dot product computation between 2 float arrays.
//
// Runs computations with OpenCL on the GPU device and then checks results
// Runs computations with OpenCL on the GPU device and then checks results
// against basic host CPU/C++ computation.
//
// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness.
// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness.
// But these are NOT required libs for OpenCL developement in general.
// *********************************************************************
// standard utilities and systems includes
#include <oclUtils.h>
#include <shrQATest.h>
#include "oclUtils.h"
#include "shrQATest.h"
// Name of the file with the source code for the computation kernel
// *********************************************************************
const char* cSourceFile = "DotProduct.cl";
const char* cSourceFile = "kernel.pocl";
// Host buffers for demo
// *********************************************************************
@@ -43,20 +45,20 @@ cl_command_queue cqCommandQueue;// OpenCL command que
cl_program program; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
cl_mem cmDevSrcA; // OpenCL device source buffer A
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevDst; // OpenCL device destination buffer
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevDst; // OpenCL device destination buffer
size_t szGlobalWorkSize; // Total # of work items in the 1D range
size_t szLocalWorkSize; // # of work items in the 1D work group
size_t szLocalWorkSize; // # of work items in the 1D work group
size_t szParmDataBytes; // Byte size of context information
size_t szKernelLength; // Byte size of kernel code
cl_int ciErrNum; // Error code var
char* cPathAndName = NULL; // var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
char* cSourceCL = NULL; // Buffer to hold source for compilation
const char* cExecutableName = NULL;
// demo config vars
int iNumElements= 1277944; // Length of float arrays to process (odd # for illustration)
shrBOOL bNoPrompt = shrFALSE;
int iNumElements= 1024; // Length of float arrays to process (odd # for illustration)
shrBOOL bNoPrompt = shrFALSE;
// Forward Declarations
// *********************************************************************
@@ -67,7 +69,7 @@ void (*pCleanup)(int) = &Cleanup;
int *gp_argc = NULL;
char ***gp_argv = NULL;
// Main function
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
@@ -76,38 +78,29 @@ int main(int argc, char **argv)
shrQAStart(argc, argv);
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
cl_uint uiNumComputeUnits;
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
ciErrNum = clGetPlatformIDs(1, &cpPlatform, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cl_uint uiNumDevices = 1;
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
cl_uint uiTargetDevice = 0;
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
//Get all the devices
cl_uint uiNumDevices = 0; // Number of devices available
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
shrLog("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
// Get command line device options and config accordingly
shrLog(" # of Devices Available = %u\n", uiNumDevices);
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
shrLog(" # of Devices Available = %u\n", uiNumDevices);
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
{
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
}
shrLog(" Using Device %u: ", uiTargetDevice);
shrLog(" Using Device %u: ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
// get command line arg for quick test, if provided
bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
@@ -115,16 +108,16 @@ int main(int argc, char **argv)
// start logs
cExecutableName = argv[0];
shrSetLogFileName ("oclDotProduct.txt");
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
// set and log Global and Local work size dimensions
szLocalWorkSize = 256;
szLocalWorkSize = 16;
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
// Allocate and initialize host arrays
shrLog( "Allocate and Init Host Mem...\n");
shrLog( "Allocate and Init Host Mem...\n");
srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
@@ -134,49 +127,50 @@ int main(int argc, char **argv)
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Get a GPU device
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create the context
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create a command-queue
shrLog("clCreateCommandQueue...\n");
shrLog("clCreateCommandQueue...\n");
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Read the OpenCL kernel in from source file
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
//oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
//oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
// Create the program
shrLog("clCreateProgramWithSource...\n");
//program = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
shrLog("clCreateProgramWithSource...\n");
cl_int binary_status;
cl_program program =
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL);
clCreateProgramWithBinary(cxGPUContext, 1, cdDevices, &szKernelLength, (const uint8_t**)&cSourceCL, &binary_status, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Build the program with 'mad' Optimization option
#ifdef MAC
char* flags = "-cl-fast-relaxed-math -DMAC";
#else
char* flags = "-cl-fast-relaxed-math";
#endif
shrLog("clBuildProgram...\n");
shrLog("clBuildProgram...\n");
ciErrNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
@@ -184,47 +178,50 @@ int main(int argc, char **argv)
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
Cleanup(EXIT_FAILURE);
Cleanup(EXIT_FAILURE);
}
// Create the kernel
shrLog("clCreateKernel (DotProduct)...\n");
shrLog("clCreateKernel (nDotProduct)...\n");
ckKernel = clCreateKernel(program, "DotProduct", &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Set the Argument values
shrLog("clSetKernelArg 0 - 3...\n\n");
shrLog("clSetKernelArg 0 - 3...\n\n");
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// --------------------------------------------------------
// Core sequence... copy input data to GPU, compute, copy results back
// Asynchronous write of data to GPU device
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Launch kernel
shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Read back results and check accumulated errors
shrLog("clEnqueueReadBuffer (Dst)...\n\n");
shrLog("clEnqueueReadBuffer (Dst)...\n\n");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Compute and compare results for golden-host and report errors and pass/fail
shrLog("Comparing against Host/C++ computation...\n\n");
shrLog("Comparing against Host/C++ computation...\n\n");
DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
// Cleanup and leave
Cleanup (EXIT_SUCCESS);
return (bMatch == shrTRUE) ? 0 : 1;
}
// "Golden" Host processing dot product function for comparison purposes
@@ -232,13 +229,13 @@ int main(int argc, char **argv)
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements)
{
int i, j, k;
for (i = 0, j = 0; i < iNumElements; i++)
for (i = 0, j = 0; i < iNumElements; i++)
{
pfResult[i] = 0.0f;
for (k = 0; k < 4; k++, j++)
for (k = 0; k < 4; k++, j++)
{
pfResult[i] += pfData1[j] * pfData2[j];
}
pfResult[i] += pfData1[j] * pfData2[j];
}
}
}
@@ -250,7 +247,7 @@ void Cleanup(int iExitCode)
shrLog("Starting Cleanup...\n\n");
if(cPathAndName)free(cPathAndName);
if(cSourceCL)free(cSourceCL);
if(ckKernel)clReleaseKernel(ckKernel);
if(ckKernel)clReleaseKernel(ckKernel);
if(program)clReleaseProgram(program);
if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
if(cxGPUContext)clReleaseContext(cxGPUContext);
@@ -259,7 +256,7 @@ void Cleanup(int iExitCode)
if (cmDevDst)clReleaseMemObject(cmDevDst);
// Free host memory
free(srcA);
free(srcA);
free(srcB);
free (dst);
free(Golden);
@@ -267,4 +264,4 @@ void Cleanup(int iExitCode)
if (cdDevices) free(cdDevices);
shrQAFinishExit(*gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED);
}
}

View File

@@ -0,0 +1,806 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
#include <fstream>
#include <vector>
#include <iostream>
#include <algorithm>
#include <stdarg.h>
#include "oclUtils.h"
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default
//!
//! @return CL_SUCCESS on success; -1000 if clGetPlatformIDs fails, -2000 if
//!         no platform exists, -3000 if the ID array cannot be allocated
//! @param clSelectedPlatformID  receives the selected OpenCL platform ID
//!                              (set to NULL on every error path)
//////////////////////////////////////////////////////////////////////////////
cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID)
{
    char chBuffer[1024];
    cl_uint num_platforms;
    cl_platform_id* clPlatformIDs;
    cl_int ciErrNum;
    *clSelectedPlatformID = NULL;

    // Get OpenCL platform count
    ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
        return -1000;
    }
    if (num_platforms == 0)
    {
        shrLog("No OpenCL platform found!\n\n");
        return -2000;
    }

    // there is at least one platform, make space for the ID's
    clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id));
    if (clPlatformIDs == NULL)
    {
        shrLog("Failed to allocate memory for cl_platform ID's!\n\n");
        return -3000;
    }

    // fetch the ID's; on failure the array content is unspecified and must
    // not be inspected
    ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
        free(clPlatformIDs);
        return -1000;
    }

    // get platform info for each platform and trap the NVIDIA platform if found
    for (cl_uint i = 0; i < num_platforms; ++i)
    {
        ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, sizeof(chBuffer), chBuffer, NULL);
        // a platform whose name cannot be queried is simply not a candidate
        if (ciErrNum == CL_SUCCESS && strstr(chBuffer, "NVIDIA") != NULL)
        {
            *clSelectedPlatformID = clPlatformIDs[i];
            break;
        }
    }

    // default to zeroth platform if NVIDIA not found
    if (*clSelectedPlatformID == NULL)
    {
        shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n");
        *clSelectedPlatformID = clPlatformIDs[0];
    }

    free(clPlatformIDs);
    return CL_SUCCESS;
}
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//! If the name cannot be queried an empty string is logged.
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
void oclPrintDevName(int iLogMode, cl_device_id device)
{
    // zero-filled so that a failed query logs "" instead of indeterminate memory
    char device_string[1024] = "";
    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL);
    shrLogEx(iLogMode, 0, "%s", device_string);
}
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
void oclPrintDevInfo(int iLogMode, cl_device_id device)
{
char device_string[1024];
bool nv_device_attibute_query = false;
// CL_DEVICE_NAME
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_NAME: \t\t\t%s\n", device_string);
// CL_DEVICE_VENDOR
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_VENDOR: \t\t\t%s\n", device_string);
// CL_DRIVER_VERSION
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DRIVER_VERSION: \t\t\t%s\n", device_string);
// CL_DEVICE_VERSION
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_VERSION: \t\t\t%s\n", device_string);
// CL_DEVICE_OPENCL_C_VERSION (if CL_DEVICE_VERSION version > 1.0)
if(strncmp("OpenCL 1.0", device_string, 10) != 0)
{
// This code is unused for devices reporting OpenCL 1.0, but a def is needed anyway to allow compilation using v 1.0 headers
// This constant isn't #defined in 1.0
#ifndef CL_DEVICE_OPENCL_C_VERSION
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
#endif
clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_OPENCL_C_VERSION: \t\t%s\n", device_string);
}
// CL_DEVICE_TYPE
cl_device_type type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
if( type & CL_DEVICE_TYPE_CPU )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
if( type & CL_DEVICE_TYPE_GPU )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
if( type & CL_DEVICE_TYPE_ACCELERATOR )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
if( type & CL_DEVICE_TYPE_DEFAULT )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", compute_units);
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
size_t workitem_dims;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
// CL_DEVICE_MAX_WORK_ITEM_SIZES
size_t workitem_size[3];
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
// CL_DEVICE_MAX_WORK_GROUP_SIZE
size_t workgroup_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);
// CL_DEVICE_ADDRESS_BITS
cl_uint addr_bits;
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_ADDRESS_BITS:\t\t%u\n", addr_bits);
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
cl_ulong max_mem_alloc_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(max_mem_alloc_size / (1024 * 1024)));
// CL_DEVICE_GLOBAL_MEM_SIZE
cl_ulong mem_size;
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(mem_size / (1024 * 1024)));
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
cl_bool error_correction_support;
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(error_correction_support), &error_correction_support, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", error_correction_support == CL_TRUE ? "yes" : "no");
// CL_DEVICE_LOCAL_MEM_TYPE
cl_device_local_mem_type local_mem_type;
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(local_mem_type), &local_mem_type, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", local_mem_type == 1 ? "local" : "global");
// CL_DEVICE_LOCAL_MEM_SIZE
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(mem_size / 1024));
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(mem_size / 1024));
// CL_DEVICE_QUEUE_PROPERTIES
cl_command_queue_properties queue_properties;
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(queue_properties), &queue_properties, NULL);
if( queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
if( queue_properties & CL_QUEUE_PROFILING_ENABLE )
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
// CL_DEVICE_IMAGE_SUPPORT
cl_bool image_support;
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", image_support);
// CL_DEVICE_MAX_READ_IMAGE_ARGS
cl_uint max_read_image_args;
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(max_read_image_args), &max_read_image_args, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", max_read_image_args);
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
cl_uint max_write_image_args;
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(max_write_image_args), &max_write_image_args, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", max_write_image_args);
// CL_DEVICE_SINGLE_FP_CONFIG
cl_device_fp_config fp_config;
clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_SINGLE_FP_CONFIG:\t\t%s%s%s%s%s%s\n",
fp_config & CL_FP_DENORM ? "denorms " : "",
fp_config & CL_FP_INF_NAN ? "INF-quietNaNs " : "",
fp_config & CL_FP_ROUND_TO_NEAREST ? "round-to-nearest " : "",
fp_config & CL_FP_ROUND_TO_ZERO ? "round-to-zero " : "",
fp_config & CL_FP_ROUND_TO_INF ? "round-to-inf " : "",
fp_config & CL_FP_FMA ? "fma " : "");
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
size_t szMaxDims[5];
shrLogEx(iLogMode, 0, "\n CL_DEVICE_IMAGE <dim>");
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &szMaxDims[0], NULL);
shrLogEx(iLogMode, 0, "\t\t\t2D_MAX_WIDTH\t %u\n", szMaxDims[0]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[1], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", szMaxDims[1]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &szMaxDims[2], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_WIDTH\t %u\n", szMaxDims[2]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[3], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", szMaxDims[3]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &szMaxDims[4], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_DEPTH\t %u\n", szMaxDims[4]);
// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(device_string), &device_string, NULL);
if (device_string != 0)
{
shrLogEx(iLogMode, 0, "\n CL_DEVICE_EXTENSIONS:");
std::string stdDevString;
stdDevString = std::string(device_string);
size_t szOldPos = 0;
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
while (szSpacePos != stdDevString.npos)
{
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
nv_device_attibute_query = true;
if (szOldPos > 0)
{
shrLogEx(iLogMode, 0, "\t\t");
}
shrLogEx(iLogMode, 0, "\t\t\t%s\n", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str());
do {
szOldPos = szSpacePos + 1;
szSpacePos = stdDevString.find(' ', szOldPos);
} while (szSpacePos == szOldPos);
}
shrLogEx(iLogMode, 0, "\n");
}
else
{
shrLogEx(iLogMode, 0, " CL_DEVICE_EXTENSIONS: None\n");
}
if(nv_device_attibute_query)
{
cl_uint compute_capability_major, compute_capability_minor;
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &compute_capability_major, NULL);
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &compute_capability_minor, NULL);
shrLogEx(iLogMode, 0, "\n CL_DEVICE_COMPUTE_CAPABILITY_NV:\t%u.%u\n", compute_capability_major, compute_capability_minor);
shrLogEx(iLogMode, 0, " NUMBER OF MULTIPROCESSORS:\t\t%u\n", compute_units); // this is the same value reported by CL_DEVICE_MAX_COMPUTE_UNITS
shrLogEx(iLogMode, 0, " NUMBER OF CUDA CORES:\t\t\t%u\n", ConvertSMVer2Cores(compute_capability_major, compute_capability_minor) * compute_units);
cl_uint regs_per_block;
clGetDeviceInfo(device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(cl_uint), &regs_per_block, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_REGISTERS_PER_BLOCK_NV:\t%u\n", regs_per_block);
cl_uint warp_size;
clGetDeviceInfo(device, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_WARP_SIZE_NV:\t\t%u\n", warp_size);
cl_bool gpu_overlap;
clGetDeviceInfo(device, CL_DEVICE_GPU_OVERLAP_NV, sizeof(cl_bool), &gpu_overlap, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_GPU_OVERLAP_NV:\t\t%s\n", gpu_overlap == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
cl_bool exec_timeout;
clGetDeviceInfo(device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(cl_bool), &exec_timeout, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:\t%s\n", exec_timeout == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
cl_bool integrated_memory;
clGetDeviceInfo(device, CL_DEVICE_INTEGRATED_MEMORY_NV, sizeof(cl_bool), &integrated_memory, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_INTEGRATED_MEMORY_NV:\t%s\n", integrated_memory == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
}
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
shrLogEx(iLogMode, 0, " CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
cl_uint vec_width [6];
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &vec_width[0], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &vec_width[1], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &vec_width[2], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &vec_width[3], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &vec_width[4], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &vec_width[5], NULL);
shrLogEx(iLogMode, 0, "CHAR %u, SHORT %u, INT %u, LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
vec_width[0], vec_width[1], vec_width[2], vec_width[3], vec_width[4], vec_width[5]);
}
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! The compute capability is only queried when the device's extension string
//! contains the token "cl_nv_device_attribute_query".
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//!         (extension not advertised, or any of the device queries failed)
//! @param device         OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
int oclGetDevCap(cl_device_id device)
{
    static const char* const cAttrQueryExt = "cl_nv_device_attribute_query";
    int iDevArch = -1;

    // Ask for the required size first: a fixed-size buffer makes the query fail
    // (and leaves the buffer unwritten) when the extension list is longer than it
    size_t szExtBytes = 0;
    if (clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &szExtBytes) != CL_SUCCESS || szExtBytes == 0)
    {
        return iDevArch;
    }
    std::string stdDevString(szExtBytes, '\0');
    if (clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, szExtBytes, &stdDevString[0], NULL) != CL_SUCCESS)
    {
        return iDevArch;
    }
    // keep only the characters in front of the terminating NUL
    stdDevString.resize(strlen(stdDevString.c_str()));

    // Walk the space delimited tokens; the last token has no trailing space and
    // must be examined as well
    bool bDevAttributeQuery = false;
    size_t szTokBegin = stdDevString.find_first_not_of(' ');
    while (szTokBegin != std::string::npos)
    {
        const size_t szTokEnd = stdDevString.find(' ', szTokBegin);
        // npos as length means "up to the end of the string"
        const size_t szTokLen = (szTokEnd == std::string::npos) ? std::string::npos : (szTokEnd - szTokBegin);
        if (stdDevString.compare(szTokBegin, szTokLen, cAttrQueryExt) == 0)
        {
            bDevAttributeQuery = true;
            break;
        }
        szTokBegin = (szTokEnd == std::string::npos) ? std::string::npos
                                                     : stdDevString.find_first_not_of(' ', szTokEnd);
    }

    // if search succeeded, get device caps
    if (bDevAttributeQuery)
    {
        cl_uint uiComputeCapMajor = 0;
        cl_uint uiComputeCapMinor = 0;
        if (clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &uiComputeCapMajor, NULL) == CL_SUCCESS &&
            clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &uiComputeCapMinor, NULL) == CL_SUCCESS)
        {
            iDevArch = (int)((10 * uiComputeCapMajor) + uiComputeCapMinor);
        }
    }
    return iDevArch;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id, or NULL when the context reports no device or a query/allocation fails
//! @param cxGPUContext         OpenCL context
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetFirstDev(cl_context cxGPUContext)
{
    // get the size of the device list associated with context
    size_t szParmDataBytes = 0;
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes) != CL_SUCCESS ||
        szParmDataBytes < sizeof(cl_device_id))
    {
        return NULL;
    }

    // the query needs a buffer for the complete list, even though only entry 0 is used
    cl_device_id* cdDevices = (cl_device_id*) malloc(szParmDataBytes);
    if (cdDevices == NULL)
    {
        return NULL;
    }

    cl_device_id first = NULL;
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) == CL_SUCCESS)
    {
        first = cdDevices[0];
    }
    free(cdDevices);
    return first;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! The figure compared is CL_DEVICE_MAX_COMPUTE_UNITS * CL_DEVICE_MAX_CLOCK_FREQUENCY.
//! On a tie the device that appears first in the context's list wins.
//!
//! @return the id, or NULL when the context reports no device or a query/allocation fails
//! @param cxGPUContext         OpenCL context
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext)
{
    // get the list of devices associated with context
    size_t szParmDataBytes = 0;
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes) != CL_SUCCESS ||
        szParmDataBytes < sizeof(cl_device_id))
    {
        return NULL;
    }
    cl_device_id* cdDevices = (cl_device_id*) malloc(szParmDataBytes);
    if (cdDevices == NULL)
    {
        return NULL;
    }
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) != CL_SUCCESS)
    {
        free(cdDevices);
        return NULL;
    }
    const size_t device_count = szParmDataBytes / sizeof(cl_device_id);

    // device 0 is the answer unless a later device is strictly better
    cl_device_id max_flops_device = cdDevices[0];
    cl_ulong max_flops = 0;
    for (size_t current_device = 0; current_device < device_count; ++current_device)
    {
        // a failed query leaves the value at 0, so that device can never win
        cl_uint compute_units = 0;
        clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
        cl_uint clock_frequency = 0;
        clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);

        // widen before multiplying so the product cannot wrap
        const cl_ulong flops = (cl_ulong)compute_units * (cl_ulong)clock_frequency;
        if (flops > max_flops)
        {
            max_flops = flops;
            max_flops_device = cdDevices[current_device];
        }
    }

    free(cdDevices);
    return max_flops_device;
}
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! The returned buffer is allocated with malloc() and NUL terminated; the
//! caller releases it with free().
//!
//! @return the source string if succeeded, 0 otherwise (file cannot be opened,
//!         sized or read, allocation failure, or the file is empty)
//! @param cFilename        program filename
//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header;
//!                         NULL is treated as an empty preamble
//! @param szFinalLength    returned length of the code string (preamble + file, without the NUL); may be NULL
//////////////////////////////////////////////////////////////////////////////
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
    if (cFilename == NULL)
    {
        return NULL;
    }

    // a missing preamble behaves like ""
    const char* cPrefix = (cPreamble != NULL) ? cPreamble : "";
    const size_t szPreambleLength = strlen(cPrefix);

    // open the OpenCL source code file
    FILE* pFileStream = NULL;
#ifdef _WIN32   // Windows version
    if (fopen_s(&pFileStream, cFilename, "rb") != 0)
    {
        return NULL;
    }
#else           // Linux version
    pFileStream = fopen(cFilename, "rb");
    if (pFileStream == NULL)
    {
        return NULL;
    }
#endif

    // get the length of the source code; ftell() reports failure as -1
    long lSourceLength = -1;
    if (fseek(pFileStream, 0, SEEK_END) == 0)
    {
        lSourceLength = ftell(pFileStream);
    }
    if (lSourceLength < 0 || fseek(pFileStream, 0, SEEK_SET) != 0)
    {
        fclose(pFileStream);
        return NULL;
    }
    const size_t szSourceLength = (size_t)lSourceLength;

    // allocate a buffer for the source code string and read it in
    char* cSourceString = (char*)malloc(szSourceLength + szPreambleLength + 1);
    if (cSourceString == NULL)
    {
        fclose(pFileStream);
        return NULL;
    }
    memcpy(cSourceString, cPrefix, szPreambleLength);
    // one item of szSourceLength bytes: fread() returns 0 for a zero-sized item,
    // so an empty file is reported as a failure
    if (fread(cSourceString + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
    {
        fclose(pFileStream);
        free(cSourceString);
        return NULL;
    }

    // close the file and return the total length of the combined (preamble + source) string
    fclose(pFileStream);
    if (szFinalLength != NULL)
    {
        *szFinalLength = szSourceLength + szPreambleLength;
    }
    cSourceString[szSourceLength + szPreambleLength] = '\0';
    return cSourceString;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range (also -1 when a query or allocation fails)
//! @param cxGPUContext         OpenCL context
//! @param nr                   index of the device of interest
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int nr)
{
    const cl_device_id cdInvalid = (cl_device_id)-1;

    // get the size of the device list associated with context
    size_t szParmDataBytes = 0;
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes) != CL_SUCCESS)
    {
        return cdInvalid;
    }
    if( szParmDataBytes / sizeof(cl_device_id) <= nr ) {
        return cdInvalid;
    }

    cl_device_id* cdDevices = (cl_device_id*) malloc(szParmDataBytes);
    if (cdDevices == NULL)
    {
        return cdInvalid;
    }

    cl_device_id device = cdInvalid;
    if (clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL) == CL_SUCCESS)
    {
        device = cdDevices[nr];
    }
    free(cdDevices);
    return device;
}
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! On success *binary points to a malloc()'ed buffer that the caller must
//! free(). When the device is not associated with the program, or any query
//! fails, *binary is set to NULL and *length to 0.
//!
//! @param cpProgram    OpenCL program
//! @param cdDevice     device of interest
//! @param binary       returned code
//! @param length       length of returned code
//////////////////////////////////////////////////////////////////////////////
void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length)
{
    // start from a well-defined "nothing found" result
    if (binary != NULL) *binary = NULL;
    if (length != NULL) *length = 0;

    // Grab the number of devices associated with the program
    cl_uint num_devices = 0;
    if (clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL) != CL_SUCCESS ||
        num_devices == 0)
    {
        return;
    }

    cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
    size_t* binary_sizes = (size_t*) malloc(num_devices * sizeof(size_t));
    // zero-filled so the cleanup loop can free every entry unconditionally
    char** ptx_code = (char**) calloc(num_devices, sizeof(char*));

    // Grab the device ids and the sizes of the binaries
    bool bOk = (devices != NULL) && (binary_sizes != NULL) && (ptx_code != NULL);
    bOk = bOk && (clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, NULL) == CL_SUCCESS);
    bOk = bOk && (clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL) == CL_SUCCESS);

    // Now get the binaries
    if (bOk)
    {
        for( unsigned int i=0; i<num_devices; ++i) {
            ptx_code[i] = (char*)malloc(binary_sizes[i]);
        }
        // param_value_size must cover the whole pointer array; a size of 0 with a
        // non-NULL param_value is an invalid value per the OpenCL specification
        bOk = (clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, num_devices * sizeof(char*), ptx_code, NULL) == CL_SUCCESS);
    }

    // Find the index of the device of interest; num_devices means "not handed out"
    unsigned int idx = num_devices;
    if (bOk && binary != NULL)
    {
        idx = 0;
        while( idx<num_devices && devices[idx] != cdDevice ) ++idx;
        // If it is associated prepare the result
        if( idx < num_devices )
        {
            *binary = ptx_code[idx];
            if (length != NULL) *length = (ptx_code[idx] != NULL) ? binary_sizes[idx] : 0;
        }
    }

    // Cleanup: everything except the buffer that was handed to the caller
    free( devices );
    free( binary_sizes );
    if (ptx_code != NULL)
    {
        for( unsigned int i=0; i<num_devices; ++i) {
            if( i != idx ) free(ptx_code[i]);
        }
        free( ptx_code );
    }
}
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! Nothing is logged when the device is not associated with the program or
//! when one of the program queries fails.
//!
//! @param cpProgram                   OpenCL program
//! @param cdDevice                    device of interest
//! @param const char*  cPtxFileName   optional PTX file name; when NULL the binary is written through shrLog instead
//////////////////////////////////////////////////////////////////////////////
void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName)
{
    // Grab the number of devices associated with the program
    cl_uint num_devices = 0;
    if (clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL) != CL_SUCCESS ||
        num_devices == 0)
    {
        return;
    }

    cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
    size_t* binary_sizes = (size_t*) malloc(num_devices * sizeof(size_t));
    // zero-filled so the cleanup loop can free every entry unconditionally
    char** ptx_code = (char**) calloc(num_devices, sizeof(char*));

    // Grab the device ids and the sizes of the binaries
    bool bOk = (devices != NULL) && (binary_sizes != NULL) && (ptx_code != NULL);
    bOk = bOk && (clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, NULL) == CL_SUCCESS);
    bOk = bOk && (clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL) == CL_SUCCESS);

    // Now get the binaries
    if (bOk)
    {
        for (unsigned int i = 0; i < num_devices; ++i)
        {
            // one extra byte keeps the buffer NUL terminated for the "%s" log below
            ptx_code[i] = (char*)malloc(binary_sizes[i] + 1);
            if (ptx_code[i] != NULL)
            {
                ptx_code[i][binary_sizes[i]] = '\0';
            }
        }
        // param_value_size must cover the whole pointer array; a size of 0 with a
        // non-NULL param_value is an invalid value per the OpenCL specification
        bOk = (clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, num_devices * sizeof(char*), ptx_code, NULL) == CL_SUCCESS);
    }

    if (bOk)
    {
        // Find the index of the device of interest
        unsigned int idx = 0;
        while ((idx < num_devices) && (devices[idx] != cdDevice))
        {
            ++idx;
        }

        // If the index is associated, log the result
        if ((idx < num_devices) && (ptx_code[idx] != NULL))
        {
            // if a separate filename is supplied, dump ptx there
            if (NULL != cPtxFileName)
            {
                shrLog("\nWriting ptx to separate file: %s ...\n\n", cPtxFileName);
                FILE* pFileStream = NULL;
#ifdef _WIN32
                fopen_s(&pFileStream, cPtxFileName, "wb");
#else
                pFileStream = fopen(cPtxFileName, "wb");
#endif
                if (pFileStream != NULL)
                {
                    fwrite(ptx_code[idx], binary_sizes[idx], 1, pFileStream);
                    fclose(pFileStream);
                }
                else
                {
                    // an unopenable file must not reach fwrite()/fclose()
                    shrLog("\nUnable to open ptx file for writing: %s\n\n", cPtxFileName);
                }
            }
            else // log to logfile and console if no ptx file specified
            {
                shrLog("\n%s\nProgram Binary:\n%s\n%s\n", HDASHLINE, ptx_code[idx], HDASHLINE);
            }
        }
    }

    // Cleanup
    free(devices);
    free(binary_sizes);
    if (ptx_code != NULL)
    {
        for (unsigned int i = 0; i < num_devices; ++i)
        {
            free(ptx_code[i]);
        }
        free(ptx_code);
    }
}
//////////////////////////////////////////////////////////////////////////////
//! Get and log the build log (CL_PROGRAM_BUILD_LOG) of the requested program & device
//!
//! An empty log is written when the build log cannot be retrieved.
//!
//! @param cpProgram    OpenCL program
//! @param cdDevice     device of interest
//////////////////////////////////////////////////////////////////////////////
void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice)
{
    // ask for the size first: a fixed-size buffer makes the query fail (leaving
    // the buffer unwritten) as soon as the log is longer than the buffer
    size_t szLogBytes = 0;
    if (clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &szLogBytes) != CL_SUCCESS)
    {
        szLogBytes = 0;
    }

    // one extra byte so the text is NUL terminated no matter what the query wrote
    char* cBuildLog = (char*)malloc(szLogBytes + 1);
    if (cBuildLog != NULL)
    {
        cBuildLog[0] = '\0';
        if (szLogBytes > 0 &&
            clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, szLogBytes, cBuildLog, NULL) != CL_SUCCESS)
        {
            cBuildLog[0] = '\0';
        }
        cBuildLog[szLogBytes] = '\0';
    }

    // write out the build log
    shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, (cBuildLog != NULL) ? cBuildLog : "", HDASHLINE);
    free(cBuildLog);
}
// Helper function for releasing an array of cl_mem objects
// Calls clReleaseMemObject on each of the first iNumObjs entries that is non-zero
// *********************************************************************
void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs)
{
    int iObj = 0;
    while (iObj < iNumObjs)
    {
        cl_mem cmCurrent = cmMemObjs[iObj];
        ++iObj;
        if (!cmCurrent)
        {
            continue;   // empty slot, nothing to release
        }
        clReleaseMemObject(cmCurrent);
    }
}
// Helper function to get OpenCL error string from constant
// The table is indexed by the negated error code (CL_SUCCESS is 0, all errors are negative).
// Returns "Unspecified Error" for positive values, for codes beyond the table and
// for codes inside the table that have no name assigned.
// *********************************************************************
const char* oclErrorString(cl_int error)
{
    static const char* const errorString[] = {
        "CL_SUCCESS",                                   //   0
        "CL_DEVICE_NOT_FOUND",                          //  -1
        "CL_DEVICE_NOT_AVAILABLE",                      //  -2
        "CL_COMPILER_NOT_AVAILABLE",                    //  -3
        "CL_MEM_OBJECT_ALLOCATION_FAILURE",             //  -4
        "CL_OUT_OF_RESOURCES",                          //  -5
        "CL_OUT_OF_HOST_MEMORY",                        //  -6
        "CL_PROFILING_INFO_NOT_AVAILABLE",              //  -7
        "CL_MEM_COPY_OVERLAP",                          //  -8
        "CL_IMAGE_FORMAT_MISMATCH",                     //  -9
        "CL_IMAGE_FORMAT_NOT_SUPPORTED",                // -10
        "CL_BUILD_PROGRAM_FAILURE",                     // -11
        "CL_MAP_FAILURE",                               // -12
        "CL_MISALIGNED_SUB_BUFFER_OFFSET",              // -13
        "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST", // -14
        "CL_COMPILE_PROGRAM_FAILURE",                   // -15
        "CL_LINKER_NOT_AVAILABLE",                      // -16
        "CL_LINK_PROGRAM_FAILURE",                      // -17
        "CL_DEVICE_PARTITION_FAILED",                   // -18
        "CL_KERNEL_ARG_INFO_NOT_AVAILABLE",             // -19
        "", "", "", "", "", "", "", "", "", "",         // -20 .. -29 (no name assigned)
        "CL_INVALID_VALUE",                             // -30
        "CL_INVALID_DEVICE_TYPE",                       // -31
        "CL_INVALID_PLATFORM",                          // -32
        "CL_INVALID_DEVICE",                            // -33
        "CL_INVALID_CONTEXT",                           // -34
        "CL_INVALID_QUEUE_PROPERTIES",                  // -35
        "CL_INVALID_COMMAND_QUEUE",                     // -36
        "CL_INVALID_HOST_PTR",                          // -37
        "CL_INVALID_MEM_OBJECT",                        // -38
        "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",           // -39
        "CL_INVALID_IMAGE_SIZE",                        // -40
        "CL_INVALID_SAMPLER",                           // -41
        "CL_INVALID_BINARY",                            // -42
        "CL_INVALID_BUILD_OPTIONS",                     // -43
        "CL_INVALID_PROGRAM",                           // -44
        "CL_INVALID_PROGRAM_EXECUTABLE",                // -45
        "CL_INVALID_KERNEL_NAME",                       // -46
        "CL_INVALID_KERNEL_DEFINITION",                 // -47
        "CL_INVALID_KERNEL",                            // -48
        "CL_INVALID_ARG_INDEX",                         // -49
        "CL_INVALID_ARG_VALUE",                         // -50
        "CL_INVALID_ARG_SIZE",                          // -51
        "CL_INVALID_KERNEL_ARGS",                       // -52
        "CL_INVALID_WORK_DIMENSION",                    // -53
        "CL_INVALID_WORK_GROUP_SIZE",                   // -54
        "CL_INVALID_WORK_ITEM_SIZE",                    // -55
        "CL_INVALID_GLOBAL_OFFSET",                     // -56
        "CL_INVALID_EVENT_WAIT_LIST",                   // -57
        "CL_INVALID_EVENT",                             // -58
        "CL_INVALID_OPERATION",                         // -59
        "CL_INVALID_GL_OBJECT",                         // -60
        "CL_INVALID_BUFFER_SIZE",                       // -61
        "CL_INVALID_MIP_LEVEL",                         // -62
        "CL_INVALID_GLOBAL_WORK_SIZE",                  // -63
        "CL_INVALID_PROPERTY",                          // -64
        "CL_INVALID_IMAGE_DESCRIPTOR",                  // -65
        "CL_INVALID_COMPILER_OPTIONS",                  // -66
        "CL_INVALID_LINKER_OPTIONS",                    // -67
        "CL_INVALID_DEVICE_PARTITION_COUNT",            // -68
        "CL_INVALID_PIPE_SIZE",                         // -69
        "CL_INVALID_DEVICE_QUEUE",                      // -70
        "CL_INVALID_SPEC_ID",                           // -71
        "CL_MAX_SIZE_RESTRICTION_EXCEEDED",             // -72
    };

    const int errorCount = sizeof(errorString) / sizeof(errorString[0]);

    // range-check before negating so the most negative cl_int cannot overflow
    if (error > 0 || error <= -errorCount)
    {
        return "Unspecified Error";
    }

    const char* name = errorString[-error];
    return (name[0] != '\0') ? name : "Unspecified Error";
}
// Helper function to get OpenCL image format string (channel order and type) from constant
// The value is compared against the known channel-order constants first, then the
// channel-type constants; the name of the first match is returned, "Unknown" otherwise.
// *********************************************************************
const char* oclImageFormatString(cl_uint uiImageFormat)
{
    // local helper: pairs a constant with its printable name
    #define OCL_FORMAT_ENTRY(constant) { constant, #constant }
    static const struct
    {
        cl_uint     uiValue;
        const char* cName;
    } formatNames[] = {
        // cl_channel_order
        OCL_FORMAT_ENTRY(CL_R),
        OCL_FORMAT_ENTRY(CL_A),
        OCL_FORMAT_ENTRY(CL_RG),
        OCL_FORMAT_ENTRY(CL_RA),
        OCL_FORMAT_ENTRY(CL_RGB),
        OCL_FORMAT_ENTRY(CL_RGBA),
        OCL_FORMAT_ENTRY(CL_BGRA),
        OCL_FORMAT_ENTRY(CL_ARGB),
        OCL_FORMAT_ENTRY(CL_INTENSITY),
        OCL_FORMAT_ENTRY(CL_LUMINANCE),
        // cl_channel_type
        OCL_FORMAT_ENTRY(CL_SNORM_INT8),
        OCL_FORMAT_ENTRY(CL_SNORM_INT16),
        OCL_FORMAT_ENTRY(CL_UNORM_INT8),
        OCL_FORMAT_ENTRY(CL_UNORM_INT16),
        OCL_FORMAT_ENTRY(CL_UNORM_SHORT_565),
        OCL_FORMAT_ENTRY(CL_UNORM_SHORT_555),
        OCL_FORMAT_ENTRY(CL_UNORM_INT_101010),
        OCL_FORMAT_ENTRY(CL_SIGNED_INT8),
        OCL_FORMAT_ENTRY(CL_SIGNED_INT16),
        OCL_FORMAT_ENTRY(CL_SIGNED_INT32),
        OCL_FORMAT_ENTRY(CL_UNSIGNED_INT8),
        OCL_FORMAT_ENTRY(CL_UNSIGNED_INT16),
        OCL_FORMAT_ENTRY(CL_UNSIGNED_INT32),
        OCL_FORMAT_ENTRY(CL_HALF_FLOAT),
        OCL_FORMAT_ENTRY(CL_FLOAT),
    };
    #undef OCL_FORMAT_ENTRY

    const unsigned int uiEntries = sizeof(formatNames) / sizeof(formatNames[0]);
    for (unsigned int uiEntry = 0; uiEntry < uiEntries; ++uiEntry)
    {
        if (formatNames[uiEntry].uiValue == uiImageFormat)
        {
            return formatNames[uiEntry].cName;
        }
    }

    // unknown constant
    return "Unknown";
}

View File

@@ -17,7 +17,7 @@
// *********************************************************************
// Common headers: Cross-API utililties and OpenCL header
#include <shrUtils.h>
#include "shrUtils.h"
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)

File diff suppressed because it is too large Load Diff

View File

@@ -13,7 +13,7 @@
#define SHR_UTILS_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// reminders for output window and build log
@@ -639,4 +639,4 @@ inline void __shrExitEX(int argc, const char** argv, int iExitCode)
exit(iExitCode);
}
#endif
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,70 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
CXXFLAGS += -Wno-unused-variable -Wno-narrowing -Wno-unused-result -Wno-unused-but-set-variable
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = guassian
SRCS = main.cc clutils.cpp utils.cpp
all: $(PROJECT) kernel.pocl
OPTS ?=
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

Binary file not shown.

View File

@@ -31,8 +31,6 @@ int main(int argc, char *argv[]) {
a = (float *)malloc(size * size * sizeof(float));
printf("OK\n");
InitMat(fp, size, a, size, size);
// printf("The input matrix a is:\n");
// PrintMat(a, size, size, size);

View File

@@ -1,67 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = kmeans
SRCS = main.cc read_input.c rmse.c kmeans_clustering.c cluster.c getopt.c
all: $(PROJECT) kernel.pocl
OPTS ?=
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

Binary file not shown.

View File

@@ -170,6 +170,7 @@ float** kmeans_clustering(float **feature, /* in: [npoints][nfeatures] */
free(new_centers[0]);
free(new_centers);
free(new_centers_len);
free(initial);
return clusters;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -104,8 +104,8 @@ static int initialize(int use_gpu) {
context = clCreateContext(NULL, 1, device_list, NULL, NULL, &result);
// create command queue for the first device
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, NULL);
if (!cmd_queue) {
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, &result);
if (!cmd_queue || result != CL_SUCCESS) {
printf("ERROR: clCreateCommandQueue() failed\n");
return -1;
}
@@ -120,7 +120,7 @@ static int shutdown() {
if (context)
clReleaseContext(context);
if (device_list)
delete device_list;
delete [] device_list;
// reset all variables
cmd_queue = 0;
@@ -188,7 +188,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
fread(source + strlen(source), sourcesize, 1, fp);
fclose(fp);*/
// OpenCL initialization
// OpenCL initialization
int use_gpu = 1;
if (initialize(use_gpu))
return -1;
@@ -197,12 +197,25 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
cl_int err = 0;
//const char *slist[2] = {source, 0};
//cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
cl_program prog = clCreateProgramWithBuiltInKernels(context, 1, device_list, "kmeans_kernel_c;kmeans_swap", &err);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateProgramWithSource() => %d\n", err);
printf("ERROR: read_kernel_file() => %d\n", err);
return -1;
}
err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
cl_program prog = clCreateProgramWithBinary(
context, 1, device_list, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateProgramWithBinary() => %d\n", err);
return -1;
}
free(kernel_bin);
err = clBuildProgram(prog, 1, &device_list[0], NULL, NULL, NULL);
{ // show warnings/errors
// static char log[65536]; memset(log, 0, sizeof(log));
// cl_device_id device_id = 0;
@@ -226,6 +239,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
printf("ERROR: clCreateKernel() 0 => %d\n", err);
return -1;
}
kernel2 = clCreateKernel(prog, kernel_swap, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateKernel() 0 => %d\n", err);
@@ -241,6 +255,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
n_points * n_features, err);
return -1;
}
d_feature_swap =
clCreateBuffer(context, CL_MEM_READ_WRITE,
n_points * n_features * sizeof(float), NULL, &err);
@@ -249,6 +264,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
n_points * n_features, err);
return -1;
}
d_cluster =
clCreateBuffer(context, CL_MEM_READ_WRITE,
n_clusters * n_features * sizeof(float), NULL, &err);
@@ -257,6 +273,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
n_clusters * n_features, err);
return -1;
}
d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_points * sizeof(int), NULL, &err);
if (err != CL_SUCCESS) {
@@ -296,6 +313,8 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
}
membership_OCL = (int *)malloc(n_points * sizeof(int));
return 0;
}
void deallocateMemory() {

View File

@@ -331,7 +331,9 @@ int setup(int argc, char **argv) {
}
}
/* free up memory */
/* free up memory */
free(cluster_centres[0]);
free(cluster_centres);
free(features[0]);
free(features);
return (0);

View File

@@ -1,69 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = lbm
SRCS = main.cc args.c parboil_opencl.c gpu_info.c lbm.c ocl.c
all: $(PROJECT).dump $(PROJECT).hex
CXXFLAGS += -I.
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
OPTS ?=
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
include ../common.mk

View File

@@ -9,6 +9,10 @@
#ifndef __GPUINFOH__
#define __GPUINFOH__
#ifdef __cplusplus
extern "C" {
#endif
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
@@ -17,4 +21,8 @@ void compute_active_thread(size_t *thread,
int minor,
int sm);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -6,16 +6,16 @@
*cr
***************************************************************************/
/*############################################################################*/
#ifndef _LBM_H_
#define _LBM_H_
/*############################################################################*/
#include "ocl.h"
#include "lbm_macros.h"
#ifdef __cplusplus
extern "C" {
#endif
void LBM_allocateGrid( float** ptr );
void LBM_freeGrid( float** ptr );
void LBM_initializeGrid( LBM_Grid grid );
@@ -34,6 +34,8 @@ void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid
void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
/*############################################################################*/
#ifdef __cplusplus
}
#endif
#endif /* _LBM_H_ */

Binary file not shown.

View File

@@ -21,6 +21,26 @@
#include "main.h"
#include "ocl.h"
// Reads the whole file `filename` into a freshly malloc'ed buffer.
//
//   filename  path of the kernel binary to load
//   data      out: receives the buffer; the caller owns it and must free() it
//   size      out: receives the number of bytes stored in *data
//
// Returns CL_SUCCESS on success and CL_INVALID_VALUE on any failure.
// On failure *data and *size are left untouched.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return CL_INVALID_VALUE;

  // The kernel is a binary image: open it in binary mode so that no
  // platform performs newline translation on its bytes.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return CL_INVALID_VALUE;
  }

  // Determine the file length; both calls can fail (e.g. non-seekable file).
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);
  if (fsize < 0) {
    fprintf(stderr, "Failed to query kernel file size.");
    fclose(fp);
    return CL_INVALID_VALUE;
  }
  rewind(fp);

  // Allocate at least one byte so an empty file still yields a valid,
  // free()-able buffer (malloc(0) may legally return NULL).
  uint8_t* buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
  if (NULL == buffer) {
    fprintf(stderr, "Failed to allocate kernel buffer.");
    fclose(fp);
    return CL_INVALID_VALUE;
  }

  size_t nread = fread(buffer, 1, (size_t)fsize, fp);
  fclose(fp);

  // A short read would hand a truncated binary to the OpenCL runtime.
  if (nread != (size_t)fsize) {
    fprintf(stderr, "Failed to read kernel file.");
    free(buffer);
    return CL_INVALID_VALUE;
  }

  *data = buffer;
  *size = nread;
  return CL_SUCCESS;
}
/*############################################################################*/
@@ -170,8 +190,6 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
LBM_freeGrid((float **)&TEMP_srcGrid);
LBM_freeGrid((float **)&TEMP_dstGrid);
printf("OK\n");
}
/*############################################################################*/
@@ -188,7 +206,9 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
LBM_showGridStatistics(TEMP_srcGrid);
LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
if (param->resultFilename) {
LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
}
LBM_freeGrid((float **)&TEMP_srcGrid);
OpenCL_LBM_freeGrid(OpenCL_srcGrid);
@@ -220,8 +240,14 @@ void OpenCL_initialize(struct pb_Parameters *p, OpenCL_Param *prm) {
//const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
//prm->clProgram = clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus);
prm->clProgram = clCreateProgramWithBuiltInKernels(
prm->clContext, 1, &prm->clDevice, "performStreamCollide_kernel", &clStatus);
// read kernel binary from file
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
prm->clProgram = clCreateProgramWithBinary(
prm->clContext, 1, &prm->clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
CHECK_ERROR("clCreateProgramWithSource")
//char clOptions[100];

View File

@@ -1,6 +1,10 @@
#ifndef __OCLH__
#define __OCLH__
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
cl_platform_id clPlatform;
cl_context_properties clCps[3];
@@ -22,4 +26,8 @@ typedef struct {
char* readFile(char*);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1216,9 +1216,24 @@ pb_InitOpenCLContext(struct pb_Parameters* parameters) {
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
clGetPlatformIDs(1, &platform_id, NULL);
clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err);
_err = clGetPlatformIDs(1, &platform_id, NULL);
if (_err != CL_SUCCESS) {
fprintf(stderr, "Error querying platform!\n");
exit(-1);
}
_err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
if (_err != CL_SUCCESS) {
fprintf(stderr, "Error querying device IDs!\n");
exit(-1);
}
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err);
if (_err != CL_SUCCESS) {
fprintf(stderr, "Error Creating device context!\n");
exit(-1);
}
pb_Context* c = (pb_Context*)malloc(sizeof(pb_Context));
c->clContext = context;

View File

@@ -0,0 +1,7 @@
PROJECT = matmul
SRCS = main.cc
OPTS ?= -n16
include ../common.mk

View File

@@ -0,0 +1,73 @@
// Tiled matrix multiply: C = A * B for N x N matrices indexed as
// M[row * N + col]. Each work-item produces one element of C; the
// work-group stages one tile of A and one tile of B in local memory per step.
// localA / localB must each hold local_size(0) * local_size(0) floats.
__kernel void matmul(__global float *A,
                     __global float *B,
                     __global float *C,
                     const unsigned int N,
                     __local float *localA,
                     __local float *localB)
{
    const int row = get_global_id(1);
    const int col = get_global_id(0);
    const int ly = get_local_id(1);
    const int lx = get_local_id(0);
    const int tile = get_local_size(0); // the work-group is assumed to be square

    // This work-item's slot inside both staged tiles.
    const int slot = ly * tile + lx;

    float acc = 0.0f;

    // Walk the shared dimension one tile at a time.
    for (int base = 0; base < N; base += tile) {
        // Stage A[row][base + lx] and B[base + ly][col].
        localA[slot] = A[row * N + base + lx];
        localB[slot] = B[(base + ly) * N + col];

        // Every work-item must have stored its elements before any reads.
        barrier(CLK_LOCAL_MEM_FENCE);

        // Partial dot product of this row of the A tile with this column of the B tile.
        for (int j = 0; j < tile; j++) {
            acc += localA[ly * tile + j] * localB[j * tile + lx];
        }

        // Do not overwrite the tiles while other work-items still read them.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    C[row * N + col] = acc;
}
/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
{
int globalRow = get_global_id(1);
int globalCol = get_global_id(0);
int localRow = get_local_id(1);
int localCol = get_local_id(0);
// Static local memory declaration
__local float localA[16][16];
__local float localB[16][16];
float sum = 0.0f;
// Iterate over blocks
for (int k = 0; k < N; k += 16) {
// Load a block of matrix A into local memory
localA[localRow][localCol] = A[globalRow * N + k + localCol];
// Load a block of matrix B into local memory
localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
// Ensure the entire block is loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Compute multiplication for this block
for (int j = 0; j < 16; j++) {
sum += localA[localRow][j] * localB[j][localCol];
}
// Wait until all threads have computed before loading the next block
barrier(CLK_LOCAL_MEM_FENCE);
}
C[globalRow * N + globalCol] = sum;
}*/

246
tests/opencl/matmul/main.cc Normal file
View File

@@ -0,0 +1,246 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <CL/opencl.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <chrono>
#include <vector>
#define LOCAL_SIZE 16
#define KERNEL_NAME "matmul"
// Evaluates an OpenCL call that returns its status as a cl_int.
// If the status is not CL_SUCCESS, prints the stringified expression and
// the status code, calls cleanup(), and terminates the process with exit(-1).
// The do { ... } while (0) wrapper lets the macro be used as a single statement.
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
if (_err == CL_SUCCESS) \
break; \
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
cleanup(); \
exit(-1); \
} while (0)
// Variant of CL_CHECK for OpenCL calls that return an object and report
// their status through an out-parameter. The macro declares a local `_err`
// (pre-set to CL_INVALID_VALUE, so a call that never writes it is treated as
// a failure); the wrapped expression is expected to pass `&_err` as its
// error argument. On failure it prints the stringified expression and the
// status, calls cleanup(), and exits with -1; otherwise the whole macro
// evaluates to the call's return value. Implemented as a `({ ... })`
// statement expression, which is a compiler extension.
#define CL_CHECK2(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
decltype(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
cleanup(); \
exit(-1); \
} \
_ret; \
})
// Reads the whole file `filename` into a freshly malloc'ed buffer.
//
//   filename  path of the kernel source/binary to load
//   data      out: receives the buffer; the caller owns it and must free() it
//   size      out: receives the number of bytes stored in *data
//
// Returns 0 on success and -1 on any failure.
// On failure *data and *size are left untouched.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // The kernel may be a binary image: open it in binary mode so that no
  // platform performs newline translation on its bytes.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  // Determine the file length; both calls can fail (e.g. non-seekable file).
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);
  if (fsize < 0) {
    fprintf(stderr, "Failed to query kernel file size.");
    fclose(fp);
    return -1;
  }
  rewind(fp);

  // Allocate at least one byte so an empty file still yields a valid,
  // free()-able buffer (malloc(0) may legally return NULL).
  uint8_t* buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
  if (NULL == buffer) {
    fprintf(stderr, "Failed to allocate kernel buffer.");
    fclose(fp);
    return -1;
  }

  size_t nread = fread(buffer, 1, (size_t)fsize, fp);
  fclose(fp);

  // A short read would hand a truncated kernel to the OpenCL runtime.
  if (nread != (size_t)fsize) {
    fprintf(stderr, "Failed to read kernel file.");
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = nread;
  return 0;
}
// Approximate float comparison: returns true when `a` and `b` compare equal,
// or when their raw 32-bit representations, read as signed integers, differ
// by at most `ulp`.
//
// The bit-pattern difference is computed in 64 bits. A plain `int`
// subtraction overflows when the operands have opposite signs, which let
// values such as 1.0f and -1.0f be reported as equal.
static bool compare_equal(float a, float b, int ulp = 21) {
  // Exact equality first; this also covers +0.0f versus -0.0f, whose bit
  // patterns are far apart.
  if (a == b)
    return true;
  union fi_t { int i; float f; };
  fi_t fa, fb;
  fa.f = a;
  fb.f = b;
  long long delta = (long long)fa.i - (long long)fb.i;
  if (delta < 0)
    delta = -delta;
  return delta <= (long long)ulp;
}
// Host reference implementation: C = A * B for N x N matrices stored as
// M[row * N + col]. Products are accumulated in increasing k order in a
// single float accumulator that starts at 0.0f.
static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
  for (int row = 0; row < N; ++row) {
    const float *a_row = A + row * N;
    float *c_row = C + row * N;
    for (int col = 0; col < N; ++col) {
      float acc = 0.0f;
      int k = 0;
      while (k < N) {
        acc += a_row[k] * B[k * N + col];
        ++k;
      }
      c_row[col] = acc;
    }
  }
}
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_mem a_memobj = NULL;
cl_mem b_memobj = NULL;
cl_mem c_memobj = NULL;
uint8_t *kernel_bin = NULL;
// Releases every OpenCL object held in the file-level handles that is
// non-NULL, then frees the host copy of the kernel file. The handles are
// not reset afterwards.
static void cleanup() {
  if (commandQueue != NULL) {
    clReleaseCommandQueue(commandQueue);
  }
  if (kernel != NULL) {
    clReleaseKernel(kernel);
  }
  if (program != NULL) {
    clReleaseProgram(program);
  }
  if (a_memobj != NULL) {
    clReleaseMemObject(a_memobj);
  }
  if (b_memobj != NULL) {
    clReleaseMemObject(b_memobj);
  }
  if (c_memobj != NULL) {
    clReleaseMemObject(c_memobj);
  }
  if (context != NULL) {
    clReleaseContext(context);
  }
  if (device_id != NULL) {
    clReleaseDevice(device_id);
  }
  // free(NULL) is a no-op, so no guard is needed here.
  free(kernel_bin);
}
// Matrix dimension (the matrices are size x size); overridden with -n.
int size = 64;

// Prints the command-line synopsis.
static void show_usage() {
  printf("Usage: [-n size] [-h: help]\n");
}

// Parses the command line with getopt().
//   -n <size>  sets the global `size` (converted with atoi)
//   -h, -?     print the usage text and exit with status 0
//   any other option letter returned by getopt (including 'f', which the
//   option string accepts) prints the usage text and exits with status -1
static void parse_args(int argc, char **argv) {
  for (;;) {
    const int opt = getopt(argc, argv, "fn:h?");
    if (opt == -1) {
      return;
    }
    if (opt == 'n') {
      size = atoi(optarg);
      continue;
    }
    // Everything that is not -n ends the program after showing the usage.
    show_usage();
    exit((opt == 'h' || opt == '?') ? 0 : -1);
  }
}
// Test driver: multiplies two size x size float matrices on the OpenCL
// device with the "matmul" kernel and checks the result against
// matrix_multiply_cpu().
//
// Returns 0 when every element matches, 1 when at least one element
// differs, and -1 on invalid arguments or setup failure. The mismatch count
// itself is only printed: a process exit status keeps just the low 8 bits,
// so returning the raw count would report success for 256 errors (every
// element wrong in a 16 x 16 run).
int main (int argc, char **argv) {
  // parse command arguments
  parse_args(argc, argv);

  printf("Matrix size=%d\n", size);
  // Zero and negative values pass a plain divisibility test, so reject
  // them explicitly.
  if (size <= 0 || (size % LOCAL_SIZE) != 0) {
    printf("Error: matrix size must be a positive multiple of %d\n", LOCAL_SIZE);
    return -1;
  }

  cl_platform_id platform_id;
  size_t kernel_size;

  // Getting platform and device information
  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));

  printf("Create context\n");
  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));

  // Check the query so an uninitialized buffer is never printed.
  char device_string[1024];
  CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL));
  printf("Using device: %s\n", device_string);

  printf("Allocate device buffers\n");
  // Element count computed in size_t to avoid int overflow in size * size.
  const size_t num_elems = (size_t)size * (size_t)size;
  size_t nbytes = num_elems * sizeof(float);
  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));

  printf("Create program from kernel source\n");
#ifdef HOSTGPU
  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) {
    cleanup(); // release the context and buffers created above
    return -1;
  }
  program = CL_CHECK2(clCreateProgramWithSource(
    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) {
    cleanup(); // release the context and buffers created above
    return -1;
  }
  program = CL_CHECK2(clCreateProgramWithBinary(
    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
  if (program == NULL) {
    cleanup();
    return -1;
  }

  // Build program
  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

  // Create kernel
  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
  size_t global_size[2] = {(size_t)size, (size_t)size};

  // Set kernel arguments; arguments 4 and 5 are the two local-memory tiles,
  // sized by the work-group dimensions.
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
  CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
  CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));

  // Allocate memories for input arrays and output arrays.
  std::vector<float> h_a(num_elems);
  std::vector<float> h_b(num_elems);
  std::vector<float> h_c(num_elems);

  // Initialize values for array members.
  for (size_t i = 0; i < num_elems; ++i) {
#ifdef USE_FLOAT
    h_a[i] = (float)rand() / (float)RAND_MAX;
    h_b[i] = (float)rand() / (float)RAND_MAX;
#else
    h_a[i] = rand();
    h_b[i] = rand();
#endif
    h_c[i] = 0xdeadbeef; // sentinel value, overwritten by the device result
  }

  // Creating command queue
  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));

  printf("Upload source buffers\n");
  CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a.data(), 0, NULL, NULL));
  CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b.data(), 0, NULL, NULL));

  printf("Execute the kernel\n");
  auto time_start = std::chrono::high_resolution_clock::now();
  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL));
  CL_CHECK(clFinish(commandQueue));
  auto time_end = std::chrono::high_resolution_clock::now();
  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
  printf("Elapsed time: %lg ms\n", elapsed);

  printf("Download destination buffer\n");
  CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));

  printf("Verify result\n");
  std::vector<float> ref_vec(num_elems);
  matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
  int errors = 0;
  for (size_t i = 0; i < num_elems; i++) {
    if (!compare_equal(h_c[i], ref_vec[i])) {
      // Only the first 100 mismatches are printed; all are counted.
      if (errors < 100)
        printf("*** error: [%zu] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
      ++errors;
    }
  }
  if (errors != 0) {
    printf("FAILED! - %d errors\n", errors);
  } else {
    printf("PASSED!\n");
  }

  // Clean up
  cleanup();

  // Collapse the count to 0/1 so it survives exit-status truncation.
  return (errors != 0) ? 1 : 0;
}

View File

@@ -1,69 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = mri-q
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c file.cc computeQ.c
all: $(PROJECT).dump $(PROJECT).hex
CXXFLAGS += -I.
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
OPTS ?=
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
include ../common.mk

View File

@@ -1,6 +1,10 @@
#ifndef __COMPUTEQ__
#define __COMPUTEQ__
#ifdef __cplusplus
extern "C" {
#endif
void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm);
void computeQ_GPU (int numK,int numX,
cl_mem x_d, cl_mem y_d, cl_mem z_d,
@@ -11,4 +15,8 @@ void computeQ_GPU (int numK,int numX,
void createDataStructsCPU(int numK, int numX, float** phiMag,
float** Qr, float** Qi);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -9,6 +9,10 @@
#ifndef __GPUINFOH__
#define __GPUINFOH__
#ifdef __cplusplus
extern "C" {
#endif
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
@@ -17,4 +21,8 @@ void compute_active_thread(size_t *thread,
int minor,
int sm);
#ifdef __cplusplus
}
#endif
#endif

Binary file not shown.

Binary file not shown.

View File

@@ -34,6 +34,27 @@
#include "macros.h"
#include "computeQ.h"
// Reads the whole file `filename` into a freshly malloc'ed buffer.
//
//   filename  path of the kernel binary to load
//   data      out: receives the buffer; the caller owns it and must free() it
//   size      out: receives the number of bytes stored in *data
//
// Returns CL_SUCCESS on success and CL_INVALID_VALUE on any failure.
// On failure *data and *size are left untouched.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return CL_INVALID_VALUE;

  // The kernel is a binary image: open it in binary mode so that no
  // platform performs newline translation on its bytes.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return CL_INVALID_VALUE;
  }

  // Determine the file length; both calls can fail (e.g. non-seekable file).
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);
  if (fsize < 0) {
    fprintf(stderr, "Failed to query kernel file size.");
    fclose(fp);
    return CL_INVALID_VALUE;
  }
  rewind(fp);

  // Allocate at least one byte so an empty file still yields a valid,
  // free()-able buffer (malloc(0) may legally return NULL).
  uint8_t* buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
  if (NULL == buffer) {
    fprintf(stderr, "Failed to allocate kernel buffer.");
    fclose(fp);
    return CL_INVALID_VALUE;
  }

  size_t nread = fread(buffer, 1, (size_t)fsize, fp);
  fclose(fp);

  // A short read would hand a truncated binary to the OpenCL runtime.
  if (nread != (size_t)fsize) {
    fprintf(stderr, "Failed to read kernel file.");
    free(buffer);
    return CL_INVALID_VALUE;
  }

  *data = buffer;
  *size = nread;
  return CL_SUCCESS;
}
static void
setupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr,clPrmtr* clPrm)
{
@@ -93,8 +114,6 @@ main (int argc, char *argv[]) {
&x, &y, &z,
&phiR, &phiI);
printf("OK\n");
/* Reduce the number of k-space samples if a number is given
* on the command line */
if (argc < 2)
@@ -137,13 +156,20 @@ main (int argc, char *argv[]) {
pb_SetOpenCL(&(clPrm.clContext), &(clPrm.clCommandQueue));
printf("OK\n");
//const char* clSource[] = {readFile("src/opencl_base/kernels.cl")};
//cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
cl_program clProgram = clCreateProgramWithBuiltInKernels(
clPrm.clContext, 1, &clDevice, "ComputePhiMag_GPU;ComputeQ_GPU", &clStatus);
#ifdef HOSTGPU
const char* clSource[] = {readFile("kernel.cl")};
CHECK_ERROR("clCreateProgramWithSource")
cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
#else
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
CHECK_ERROR("read_kernel_file")
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("clCreateProgramWithSource")
cl_program clProgram = clCreateProgramWithBinary(
clPrm.clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
#endif
char options[50];
sprintf(options,"-I src/opencl_nvidia");

View File

@@ -3,6 +3,10 @@
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
cl_context clContext;
cl_command_queue clCommandQueue;
@@ -20,4 +24,8 @@ char* readFile(const char*);
exit(1); \
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,72 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
OPTS ?= filelist.txt
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
CXXFLAGS += -Wno-unused-variable -Wno-narrowing -Wno-unused-result -Wno-unused-but-set-variable
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = nearn
SRCS = main.cc clutils.cpp utils.cpp
all: $(PROJECT) kernel.pocl
OPTS ?= filelist.txt
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,2 @@
cane4_0.db
cane4_1.db
cane4_2.db
cane4_3.db
cane4_1.db

Binary file not shown.

View File

@@ -172,6 +172,7 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
printf("%f\n\n", (float)(totalTime / 1e9));
}
// 6. return finalized data and release buffers
clReleaseEvent(writeEvent);
clReleaseEvent(kernelEvent);

View File

@@ -1,71 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
OPTS ?= -n1
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = oclprintf
SRCS = main.cc
all: $(PROJECT) kernel.pocl
OPTS ?= -n1
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -1,71 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
OPTS ?= -f -n16
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-strict-aliasing -Wno-narrowing
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = psort
SRCS = main.cc
all: $(PROJECT) kernel.pocl
OPTS ?= -f -n16
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

Binary file not shown.

Some files were not shown because too many files have changed in this diff Show More