Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

1
tests/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
**/*.log

View File

@@ -1,13 +1,15 @@
all: runtime regression opencl riscv unittest
all: kernel regression opencl riscv unittest
runtime:
$(MAKE) -C runtime
kernel:
$(MAKE) -C kernel
regression:
$(MAKE) -C regression
opencl:
ifneq ($(XLEN),64)
$(MAKE) -C opencl
endif
riscv:
$(MAKE) -C riscv
@@ -16,10 +18,17 @@ unittest:
$(MAKE) -C unittest
clean:
$(MAKE) clean -C runtime
$(MAKE) clean -C regression
$(MAKE) clean -C opencl
$(MAKE) clean -C riscv
$(MAKE) clean -C unittest
$(MAKE) -C kernel clean
$(MAKE) -C regression clean
$(MAKE) -C opencl clean
$(MAKE) -C riscv clean
$(MAKE) -C unittest clean
.PHONY: all runtime regression opencl riscv unittest
clean-all:
$(MAKE) -C kernel clean
$(MAKE) -C regression clean-all
$(MAKE) -C opencl clean-all
$(MAKE) -C riscv clean
$(MAKE) -C unittest clean
.PHONY: all kernel regression opencl riscv unittest

View File

@@ -1,20 +1,19 @@
all:
$(MAKE) -C conform
$(MAKE) -C hello
$(MAKE) -C fibonacci
$(MAKE) -C simple
$(MAKE) -C fibonacci
run-simx:
$(MAKE) -C conform run-simx
$(MAKE) -C hello run-simx
$(MAKE) -C fibonacci run-simx
$(MAKE) -C simple run-simx
$(MAKE) -C fibonacci run-simx
run-rtlsim:
$(MAKE) -C conform run-rtlsim
$(MAKE) -C hello run-rtlsim
$(MAKE) -C fibonacci run-rtlsim
$(MAKE) -C simple run-rtlsim
clean:
$(MAKE) -C conform clean
$(MAKE) -C hello clean
$(MAKE) -C fibonacci clean
$(MAKE) -C simple clean

View File

@@ -0,0 +1,52 @@
XLEN ?= 32
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
SIM_DIR = ../../../sim
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
PROJECT = conform
SRCS = main.cpp tests.cpp
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
$(PROJECT).dump: $(PROJECT).elf
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
$(PROJECT).bin: $(PROJECT).elf
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
$(PROJECT).elf: $(SRCS)
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
run-rtlsim: $(PROJECT).bin
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
run-simx: $(PROJECT).bin
$(SIM_DIR)/simx/simx $(PROJECT).bin
.depend: $(SRCS)
$(CC) $(CFLAGS) -MM $^ > .depend;
clean:
rm -rf *.elf *.bin *.dump .depend

View File

@@ -6,8 +6,6 @@ int main() {
errors += test_global_memory();
errors += test_stack_memory();
errors += test_shared_memory();
errors += test_tmc();
@@ -26,10 +24,12 @@ int main() {
errors += test_barrier();
errors += test_tls();
if (0 == errors) {
vx_printf("Passed!\n");
PRINTF("Passed!\n");
} else {
vx_printf("Failed!\n");
PRINTF("Failed!\n");
}
return errors;

View File

@@ -5,26 +5,26 @@
#include <vx_print.h>
#include <vx_spawn.h>
int __attribute__ ((noinline)) check_error(const int* buffer, int offset, int size) {
int __attribute__((noinline)) check_error(const int* buffer, int offset, int size) {
int errors = 0;
for (int i = offset; i < size; i++) {
int value = buffer[i];
int ref_value = 65 + i;
if (value == ref_value) {
//vx_printf("[%d] %c\n", i, value);
//PRINTF("[%d] %c\n", i, value);
} else {
vx_printf("*** error: [%d] %x, expected %x\n", i, value, ref_value);
PRINTF("*** error: [%d] 0x%x, expected 0x%x\n", i, value, ref_value);
++errors;
}
}
return errors;
}
int __attribute__ ((noinline)) make_select_tmask(int tid) {
int __attribute__((noinline)) make_select_tmask(int tid) {
return (1 << tid);
}
int __attribute__ ((noinline)) make_full_tmask(int num_threads) {
int __attribute__((noinline)) make_full_tmask(int num_threads) {
return (1 << num_threads) - 1;
}
@@ -34,7 +34,7 @@ int __attribute__ ((noinline)) make_full_tmask(int num_threads) {
int global_buffer[GLOBAL_MEM_SZ];
int test_global_memory() {
vx_printf("Global Memory Test\n");
PRINTF("Global Memory Test\n");
for (int i = 0; i < GLOBAL_MEM_SZ; i++) {
global_buffer[i] = 65 + i;
@@ -45,51 +45,52 @@ int test_global_memory() {
///////////////////////////////////////////////////////////////////////////////
int test_stack_memory() {
vx_printf("Stack Memory Test\n");
int* smem_addr = (int*)SMEM_BASE_ADDR;
static const int STACK_MEM_SZ = 8;
int stack_buffer[STACK_MEM_SZ];
int smem_buffer[8];
for (int i = 0; i < STACK_MEM_SZ; i++) {
stack_buffer[i] = 65 + i;
}
return check_error(stack_buffer, 0, STACK_MEM_SZ);
void __attribute__((noinline)) do_smem_wr() {
unsigned tid = vx_thread_id();
smem_addr[tid] = 65 + tid;
}
///////////////////////////////////////////////////////////////////////////////
void __attribute__((noinline)) do_smem_rd() {
unsigned tid = vx_thread_id();
smem_buffer[tid] = smem_addr[tid];
}
int test_shared_memory() {
static const int SHARED_MEM_SZ = 8;
int* shared_buffer = (int*)(SMEM_BASE_ADDR-(SMEM_SIZE-SHARED_MEM_SZ-4));
vx_printf("Shared Memory Test\n");
for (int i = 0; i < SHARED_MEM_SZ; i++) {
shared_buffer[i] = 65 + i;
}
PRINTF("Shared Memory Test\n");
return check_error(shared_buffer, 0, SHARED_MEM_SZ);
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_smem_wr();
do_smem_rd();
vx_tmc_one();
return check_error(smem_buffer, 0, num_threads);
}
///////////////////////////////////////////////////////////////////////////////
int tmc_buffer[8];
void __attribute__ ((noinline)) do_tmc() {
void __attribute__((noinline)) do_tmc() {
unsigned tid = vx_thread_id();
tmc_buffer[tid] = 65 + tid;
}
int test_tmc() {
vx_printf("TMC Test\n");
PRINTF("TMC Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_tmc();
vx_tmc(1);
vx_tmc_one();
return check_error(tmc_buffer, 0, num_threads);
}
@@ -98,28 +99,26 @@ int test_tmc() {
int pred_buffer[8];
void __attribute__ ((noinline)) do_pred() {
void __attribute__((noinline)) do_pred() {
unsigned tid = vx_thread_id();
pred_buffer[tid] = 65 + tid;
vx_pred((tid == 0), 1);
pred_buffer[tid] = 65;
}
int test_pred() {
vx_printf("PRED Test\n");
PRINTF("PRED Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
for (int i = 0; i < num_threads; i++) {
pred_buffer[i] = 0;
for (int i = 1; i < num_threads; i++) {
pred_buffer[i] = 65 + i;
}
vx_pred(~1);
vx_tmc(tmask);
do_pred();
vx_tmc(1);
vx_tmc_one();
int status_n0 = (0 == tmc_buffer[0]);
int status_n1 = check_error(tmc_buffer, 1, num_threads);
return status_n0 && status_n1;
return check_error(pred_buffer, 0, num_threads);
}
///////////////////////////////////////////////////////////////////////////////
@@ -133,7 +132,7 @@ void wspawn_kernel() {
}
int test_wsapwn() {
vx_printf("Wspawn Test\n");
PRINTF("Wspawn Test\n");
int num_warps = std::min(vx_num_warps(), 8);
vx_wspawn(num_warps, wspawn_kernel);
wspawn_kernel();
@@ -145,39 +144,52 @@ int test_wsapwn() {
int dvg_buffer[4];
void __attribute__ ((noinline)) do_divergence() {
unsigned tid = vx_thread_id();
__if (tid < 2) {
__if (tid < 1) {
dvg_buffer[tid] = 65;
void __attribute__((noinline)) __attribute__((optimize("O1"))) do_divergence() {
int tid = vx_thread_id();
int cond1 = tid < 2;
int sp1 = vx_split(cond1);
if (cond1) {
{
int cond2 = tid < 1;
int sp2 = vx_split(cond2);
if (cond2) {
dvg_buffer[tid] = 65; // A
} else {
dvg_buffer[tid] = 66; // B
}
vx_join(sp2);
}
__else {
dvg_buffer[tid] = 66;
{
int cond3 = tid < 0;
int sp3 = vx_split(cond3);
if (cond3) {
dvg_buffer[tid] = 67; // C
}
vx_join(sp3);
}
} else {
{
int cond2 = tid < 3;
int sp2 = vx_split(cond2);
if (cond2) {
dvg_buffer[tid] = 67; // C
} else {
dvg_buffer[tid] = 68; // D
}
vx_join(sp2);
}
__endif
}
__else {
__if (tid < 3) {
dvg_buffer[tid] = 67;
}
__else {
dvg_buffer[tid] = 68;
}
__endif
}
__endif
vx_join(sp1);
}
int test_divergence() {
vx_printf("Control Divergence Test\n");
PRINTF("Control Divergence Test\n");
int num_threads = std::min(vx_num_threads(), 4);
int tmask = make_full_tmask(num_threads);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_divergence();
vx_tmc(1);
vx_tmc_one();
return check_error(dvg_buffer, 0, num_threads);
}
@@ -193,12 +205,12 @@ typedef struct {
int st_buffer_src[ST_BUF_SZ];
int st_buffer_dst[ST_BUF_SZ];
void st_kernel(int task_id, const st_args_t * arg) {
void st_kernel(int task_id, const st_args_t * __UNIFORM__ arg) {
arg->dst[task_id] = arg->src[task_id];
}
int test_spawn_tasks() {
vx_printf("SpawnTasks Test\n");
PRINTF("SpawnTasks Test\n");
st_args_t arg;
arg.src = st_buffer_src;
@@ -215,31 +227,30 @@ int test_spawn_tasks() {
///////////////////////////////////////////////////////////////////////////////
#define SR_BUF_SZ 8
typedef struct {
int * buf;
} sr_args_t;
int sr_buffer[SR_BUF_SZ];
int sr_buffer[8];
void sr_kernel(const sr_args_t * arg) {
int tid = vx_thread_id();
arg->buf[tid] = 65 + tid;
}
void __attribute__ ((noinline)) do_serial() {
void __attribute__((noinline)) do_serial() {
sr_args_t arg;
arg.buf = sr_buffer;
vx_serial((vx_serial_cb)sr_kernel, &arg);
}
int test_serial() {
vx_printf("Serial Test\n");
PRINTF("Serial Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_serial();
vx_tmc(1);
vx_tmc_one();
return check_error(sr_buffer, 0, num_threads);
}
@@ -248,7 +259,7 @@ int test_serial() {
int tmask_buffer[8];
int __attribute__ ((noinline)) do_tmask() {
int __attribute__((noinline)) do_tmask() {
int tid = vx_thread_id();
int tmask = make_select_tmask(tid);
int cur_tmask = vx_thread_mask();
@@ -257,7 +268,7 @@ int __attribute__ ((noinline)) do_tmask() {
}
int test_tmask() {
vx_printf("Thread Mask Test\n");
PRINTF("Thread Mask Test\n");
// activate all thread to populate shared variables
vx_tmc(-1);
@@ -271,7 +282,7 @@ l_start:
tid = do_tmask();
if (tid < num_threads)
goto l_start;
vx_tmc(1);
vx_tmc_one();
return check_error(tmask_buffer, 0, num_threads);
}
@@ -293,11 +304,36 @@ void barrier_kernel() {
}
int test_barrier() {
vx_printf("Barrier Test\n");
PRINTF("Barrier Test\n");
int num_warps = std::min(vx_num_warps(), 8);
barrier_ctr = num_warps;
barrier_stall = 0;
vx_wspawn(num_warps, barrier_kernel);
barrier_kernel();
return check_error(barrier_buffer, 0, num_warps);
}
///////////////////////////////////////////////////////////////////////////////
int tls_buffer[8];
__thread int tls_var;
__attribute__((noinline)) void print_tls_var() {
unsigned wid = vx_warp_id();
tls_buffer[wid] = 65 + tls_var;
}
void tls_kernel() {
unsigned wid = vx_warp_id();
tls_var = wid;
print_tls_var();
vx_tmc(0 == wid);
}
int test_tls() {
PRINTF("TLS Test\n");
int num_warps = std::min(vx_num_warps(), 8);
vx_wspawn(num_warps, tls_kernel);
tls_kernel();
return check_error(tls_buffer, 0, num_warps);
}

View File

@@ -1,9 +1,9 @@
#ifndef TESTS
#define TESTS
int test_global_memory();
#define PRINTF vx_printf
int test_stack_memory();
int test_global_memory();
int test_shared_memory();
@@ -23,4 +23,6 @@ int test_tmask();
int test_barrier();
int test_tls();
#endif

View File

@@ -0,0 +1,52 @@
XLEN ?= 32
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
SIM_DIR = ../../../sim
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
PROJECT = fibonacci
SRCS = main.cpp
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
$(PROJECT).dump: $(PROJECT).elf
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
$(PROJECT).bin: $(PROJECT).elf
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
$(PROJECT).elf: $(SRCS)
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
run-rtlsim: $(PROJECT).bin
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
run-simx: $(PROJECT).bin
$(SIM_DIR)/simx/simx $(PROJECT).bin
.depend: $(SRCS)
$(CC) $(CFLAGS) -MM $^ > .depend;
clean:
rm -rf *.elf *.bin *.dump .depend

View File

@@ -0,0 +1,52 @@
XLEN ?= 32
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
SIM_DIR = ../../../sim
CFLAGS += -O3 -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
PROJECT = hello
SRCS = main.cpp
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
$(PROJECT).dump: $(PROJECT).elf
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
$(PROJECT).bin: $(PROJECT).elf
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
$(PROJECT).elf: $(SRCS)
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
run-rtlsim: $(PROJECT).bin
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
run-simx: $(PROJECT).bin
$(SIM_DIR)/simx/simx $(PROJECT).bin
.depend: $(SRCS)
$(CC) $(CFLAGS) -MM $^ > .depend;
clean:
rm -rf *.elf *.bin *.dump .depend

View File

@@ -1,67 +0,0 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT=BlackScholes
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: BlackScholes.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc oclBlackScholes_common.h oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex

View File

@@ -1,67 +0,0 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT=DotProduct
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: DotProduct.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex

View File

@@ -1,59 +1,126 @@
all:
$(MAKE) -C vecadd
$(MAKE) -C sgemm
$(MAKE) -C psort
$(MAKE) -C saxpy
$(MAKE) -C sfilter
$(MAKE) -C nearn
$(MAKE) -C guassian
$(MAKE) -C dotproduct
$(MAKE) -C kmeans
$(MAKE) -C spmv
$(MAKE) -C transpose
$(MAKE) -C cutcp
$(MAKE) -C vectorhypot
$(MAKE) -C stencil
$(MAKE) -C mri-q
$(MAKE) -C lbm
$(MAKE) -C oclprintf
$(MAKE) -C psort
$(MAKE) -C blackscholes
$(MAKE) -C matmul
run-simx:
$(MAKE) -C vecadd run-simx
$(MAKE) -C vecadd run-simx
$(MAKE) -C sgemm run-simx
$(MAKE) -C psort run-simx
$(MAKE) -C saxpy run-simx
$(MAKE) -C sfilter run-simx
$(MAKE) -C nearn run-simx
$(MAKE) -C guassian run-simx
$(MAKE) -C dotproduct run-simx
$(MAKE) -C kmeans run-simx
$(MAKE) -C spmv run-simx
$(MAKE) -C cutcp run-simx
$(MAKE) -C stencil run-simx
$(MAKE) -C lbm run-simx
$(MAKE) -C oclprintf run-simx
$(MAKE) -C psort run-simx
$(MAKE) -C blackscholes run-simx
$(MAKE) -C matmul run-simx
$(MAKE) -C transpose run-simx
# $(MAKE) -C vectorhypot run-simx
# $(MAKE) -C mri-q run-simx
run-rtlsim:
$(MAKE) -C vecadd run-rtlsim
$(MAKE) -C vecadd run-rtlsim
$(MAKE) -C sgemm run-rtlsim
$(MAKE) -C psort run-rtlsim
$(MAKE) -C saxpy run-rtlsim
$(MAKE) -C sfilter run-rtlsim
$(MAKE) -C nearn run-rtlsim
$(MAKE) -C guassian run-rtlsim
$(MAKE) -C dotproduct run-rtlsim
$(MAKE) -C kmeans run-rtlsim
$(MAKE) -C spmv run-rtlsim
$(MAKE) -C transpose run-rtlsim
$(MAKE) -C cutcp run-rtlsim
$(MAKE) -C stencil run-rtlsim
$(MAKE) -C lbm run-rtlsim
$(MAKE) -C oclprintf run-rtlsim
$(MAKE) -C psort run-rtlsim
$(MAKE) -C blackscholes run-rtlsim
$(MAKE) -C matmul run-rtlsim
# $(MAKE) -C vectorhypot run-rtlsim
# $(MAKE) -C mri-q run-rtlsim
run-vlsim:
$(MAKE) -C vecadd run-vlsim
$(MAKE) -C sgemm run-vlsim
$(MAKE) -C saxpy run-vlsim
$(MAKE) -C sfilter run-vlsim
$(MAKE) -C nearn run-vlsim
$(MAKE) -C guassian run-vlsim
$(MAKE) -C oclprintf run-vlsim
$(MAKE) -C psort run-vlsim
run-opae:
$(MAKE) -C vecadd run-opae
$(MAKE) -C sgemm run-opae
$(MAKE) -C psort run-opae
$(MAKE) -C saxpy run-opae
$(MAKE) -C sfilter run-opae
$(MAKE) -C nearn run-opae
$(MAKE) -C guassian run-opae
$(MAKE) -C dotproduct run-opae
$(MAKE) -C kmeans run-opae
$(MAKE) -C spmv run-opae
$(MAKE) -C transpose run-opae
$(MAKE) -C cutcp run-opae
$(MAKE) -C stencil run-opae
$(MAKE) -C lbm run-opae
$(MAKE) -C oclprintf run-opae
$(MAKE) -C blackscholes run-opae
$(MAKE) -C matmul run-opae
# $(MAKE) -C vectorhypot run-opae
# $(MAKE) -C mri-q run-opae
clean:
$(MAKE) -C vecadd clean
$(MAKE) -C sgemm clean
$(MAKE) -C psort clean
$(MAKE) -C saxpy clean
$(MAKE) -C sfilter clean
$(MAKE) -C nearn clean
$(MAKE) -C guassian clean
$(MAKE) -C dotproduct clean
$(MAKE) -C kmeans clean
$(MAKE) -C spmv clean
$(MAKE) -C transpose clean
$(MAKE) -C cutcp clean
$(MAKE) -C vectorhypot clean
$(MAKE) -C stencil clean
$(MAKE) -C mri-q clean
$(MAKE) -C lbm clean
$(MAKE) -C oclprintf clean
$(MAKE) -C psort clean
$(MAKE) -C blackscholes clean
$(MAKE) -C matmul clean
clean-all:
$(MAKE) -C vecadd clean-all
$(MAKE) -C sgemm clean-all
$(MAKE) -C psort clean-all
$(MAKE) -C saxpy clean-all
$(MAKE) -C sfilter clean-all
$(MAKE) -C sfilter clean-all
$(MAKE) -C nearn clean-all
$(MAKE) -C guassian clean-all
$(MAKE) -C dotproduct clean-all
$(MAKE) -C kmeans clean-all
$(MAKE) -C spmv clean-all
$(MAKE) -C transpose clean-all
$(MAKE) -C cutcp clean-all
$(MAKE) -C vectorhypot clean-all
$(MAKE) -C stencil clean-all
$(MAKE) -C mri-q clean-all
$(MAKE) -C lbm clean-all
$(MAKE) -C oclprintf clean-all
$(MAKE) -C psort clean-all
$(MAKE) -C blackscholes clean-all
$(MAKE) -C matmul clean-all

View File

@@ -1,67 +0,0 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT=VectorHypot
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: VectorHypot.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,67 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 -Wstack-usage=1024 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = bfs
SRCS = main.cc
all: $(PROJECT) kernel.pocl
OPTS ?=
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -285,7 +285,7 @@ int main(int argc, char *argv[]) {
free(h_graph_visited);
} catch (std::string msg) {
std::cout << "--cambine: exception in main ->" << msg << std::endl;
printf("--cambine: exception in main ->%s\n", msg);
// release host memory
free(h_graph_nodes);
free(h_graph_mask);

View File

@@ -0,0 +1,9 @@
PROJECT = blackscholes
SRCS = main.cpp oclUtils.cpp shrUtils.cpp cmd_arg_reader.cpp oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp
CXXFLAGS += -I.
OPTS ?=
include ../common.mk

View File

@@ -0,0 +1,152 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
// includes, file
#include "cmd_arg_reader.h"
// includes, system
#include <vector>
// internal unnamed namespace
namespace
{
// types, internal (class, enum, struct, union, typedef)
// variables, internal
} // namespace {
// variables, exported
/*static*/ CmdArgReader* CmdArgReader::self;
/*static*/ char** CmdArgReader::rargv;
/*static*/ int CmdArgReader::rargc;
// functions, exported
////////////////////////////////////////////////////////////////////////////////
//! Public construction interface
//! @return a handle to the class instance
//! @param argc number of command line arguments (as given to main())
//! @param argv command line argument string (as given to main())
////////////////////////////////////////////////////////////////////////////////
/*static*/ void
CmdArgReader::init( const int argc, const char** argv)
{
if ( NULL != self)
{
return;
}
// command line arguments
if (( 0 == argc) || ( 0 == argv))
{
LOGIC_EXCEPTION( "No command line arguments given.");
}
self = new CmdArgReader();
self->createArgsMaps( argc, argv);
rargc = argc;
rargv = const_cast<char**>( argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Constructor, default
////////////////////////////////////////////////////////////////////////////////
CmdArgReader::CmdArgReader() :
args(),
unprocessed(),
iter(),
iter_unprocessed()
{ }
////////////////////////////////////////////////////////////////////////////////
//! Destructor
////////////////////////////////////////////////////////////////////////////////
CmdArgReader::~CmdArgReader()
{
for( iter = args.begin(); iter != args.end(); ++iter)
{
if( *(iter->second.first) == typeid( int))
{
delete static_cast<int*>( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( bool))
{
delete static_cast<bool*>( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( std::string))
{
delete static_cast<std::string*>( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( std::vector< std::string>) )
{
delete static_cast< std::vector< std::string>* >( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( std::vector<int>) )
{
delete static_cast< std::vector<int>* >( iter->second.second);
break;
}
}
}
////////////////////////////////////////////////////////////////////////////////
//! Read args as token value pair into map for better processing (Even the
//! values remain strings until the parameter values is requested by the
//! program.)
//! @param argc the argument count (as given to 'main')
//! @param argv the char* array containing the command line arguments
////////////////////////////////////////////////////////////////////////////////
void
CmdArgReader::createArgsMaps( const int argc, const char** argv) {
std::string token;
std::string val_str;
std::map< std::string, std::string> args;
std::string::size_type pos;
std::string arg;
for( int i=1; i<argc; ++i)
{
arg = argv[i];
// check if valid command line argument: all arguments begin with - or --
if (arg[0] != '-')
{
RUNTIME_EXCEPTION("Invalid command line argument.");
}
int numDashes = (arg[1] == '-' ? 2 : 1);
// check if only flag or if a value is given
if ( (pos = arg.find( '=')) == std::string::npos)
{
unprocessed[ std::string( arg, numDashes, arg.length()-numDashes)] = "FLAG";
}
else
{
unprocessed[ std::string( arg, numDashes, pos-numDashes)] =
std::string( arg, pos+1, arg.length()-1);
}
}
}

View File

@@ -0,0 +1,488 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _CMDARGREADER_H_
#define _CMDARGREADER_H_
// includes, system
#include <map>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <typeinfo>
// includes, project
#include "exception.h"
//! Preprocessed command line arguments
//! @note Lazy evaluation: The arguments are converted from strings to
//! the correct data type upon request. Converted values are stored
//! in an additonal map so that no additional conversion is
//! necessary. Arrays of command line arguments are stored in
//! std::vectors
//! @note Usage:
//! const std::string* file =
//! CmdArgReader::getArg< std::string>( "model")
//! const std::vector< std::string>* files =
//! CmdArgReader::getArg< std::vector< std::string> >( "model")
//! @note All command line arguments begin with '--' followed by the token;
//! token and value are seperated by '='; example --samples=50
//! @note Arrays have the form --model=[one.obj,two.obj,three.obj]
//! (without whitespaces)
//! Command line argument parser
class CmdArgReader
{
template<class> friend class TestCmdArgReader;
protected:
//! @param self handle to the only instance of this class
static CmdArgReader* self;
public:
//! Public construction interface
//! @return a handle to the class instance
//! @param argc number of command line arguments (as given to main())
//! @param argv command line argument string (as given to main())
static void init( const int argc, const char** argv);
public:
//! Get the value of the command line argument with given name
//! @return A const handle to the requested argument.
//! If the argument does not exist or if it
//! is not from type T NULL is returned
//! @param name the name of the requested argument
//! @note T the type of the argument requested
template<class T>
static inline const T* getArg( const std::string& name);
//! Check if a command line argument with the given name exists
//! @return true if a command line argument with name \a name exists,
//! otherwise false
//! @param name name of the command line argument in question
static inline bool existArg( const std::string& name);
//! Get the original / raw argc program argument
static inline int& getRArgc();
//! Get the original / raw argv program argument
static inline char**& getRArgv();
public:
//! Destructor
~CmdArgReader();
protected:
//! Constructor, default
CmdArgReader();
private:
// private helper functions
//! Get the value of the command line argument with given name
//! @note Private helper function for 'getArg' to work on the members
//! @return A const handle to the requested argument. If the argument
//! does not exist or if it is not from type T a NULL pointer
//! is returned.
//! @param name the name of the requested argument
//! @note T the type of the argument requested
template<class T>
inline const T* getArgHelper( const std::string& name);
//! Check if a command line argument with name \a name exists
//! @return true if a command line argument of name \a name exists,
//! otherwise false
//! @param name the name of the requested argument
inline bool existArgHelper( const std::string& name) const;
//! Read args as token value pair into map for better processing
//! (Even the values remain strings until the parameter values is
//! requested by the program.)
//! @param argc the argument count (as given to 'main')
//! @param argv the char* array containing the command line arguments
void createArgsMaps( const int argc, const char** argv);
//! Helper for "casting" the strings from the map with the unprocessed
//! values to the correct
//! data type.
//! @return true if conversion succeeded, otherwise false
//! @param element the value as string
//! @param val the value as type T
template<class T>
static inline bool convertToT( const std::string& element, T& val);
public:
// typedefs internal
//! container for a processed command line argument
//! typeid is used to easily be able to decide if a re-requested token-value
//! pair match the type of the first conversion
typedef std::pair< const std::type_info*, void*> ValType;
//! map of already converted values
typedef std::map< std::string, ValType > ArgsMap;
//! iterator for the map of already converted values
typedef ArgsMap::iterator ArgsMapIter;
typedef ArgsMap::const_iterator ConstArgsMapIter;
//! map of unprocessed (means unconverted) token-value pairs
typedef std::map< std::string, std::string> UnpMap;
//! iterator for the map of unprocessed (means unconverted) token-value pairs
typedef std::map< std::string, std::string>::iterator UnpMapIter;
private:
#ifdef _WIN32
# pragma warning( disable: 4251)
#endif
//! rargc original value of argc
static int rargc;
//! rargv contains command line arguments in raw format
static char** rargv;
//! args Map containing the already converted token-value pairs
ArgsMap args;
//! args Map containing the unprocessed / unconverted token-value pairs
UnpMap unprocessed;
//! iter Iterator for the map with the already converted token-value
//! pairs (to avoid frequent reallocation)
ArgsMapIter iter;
//! iter Iterator for the map with the unconverted token-value
//! pairs (to avoid frequent reallocation)
UnpMapIter iter_unprocessed;
#ifdef _WIN32
# pragma warning( default: 4251)
#endif
private:
//! Constructor, copy (not implemented)
CmdArgReader( const CmdArgReader&);
//! Assignment operator (not implemented)
CmdArgReader& operator=( const CmdArgReader&);
};
// variables, exported (extern)
// functions, inlined (inline)
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line argument arrays
//! @note This function is used each type for which no template specialization
//! exist (which will cause errors if the type does not fulfill the std::vector
//! interface).
////////////////////////////////////////////////////////////////////////////////
template<class T>
/*static*/ inline bool
CmdArgReader::convertToT( const std::string& element, T& val)
{
// preallocate storage
val.resize( std::count( element.begin(), element.end(), ',') + 1);
unsigned int i = 0;
std::string::size_type pos_start = 1; // leave array prefix '['
std::string::size_type pos_end = 0;
// do for all elements of the comma seperated list
while( std::string::npos != ( pos_end = element.find(',', pos_end+1)) )
{
// convert each element by the appropriate function
if ( ! convertToT< typename T::value_type >(
std::string( element, pos_start, pos_end - pos_start), val[i]))
{
return false;
}
pos_start = pos_end + 1;
++i;
}
std::string tmp1( element, pos_start, element.length() - pos_start - 1);
// process last element (leave array postfix ']')
if ( ! convertToT< typename T::value_type >( std::string( element,
pos_start,
element.length() - pos_start - 1),
val[i]))
{
return false;
}
// possible to process all elements?
return true;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type int
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<int>( const std::string& element, int& val)
{
std::istringstream ios( element);
ios >> val;
bool ret_val = false;
if ( ios.eof())
{
ret_val = true;
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type float
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<float>( const std::string& element, float& val)
{
std::istringstream ios( element);
ios >> val;
bool ret_val = false;
if ( ios.eof())
{
ret_val = true;
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type double
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<double>( const std::string& element, double& val)
{
std::istringstream ios( element);
ios >> val;
bool ret_val = false;
if ( ios.eof())
{
ret_val = true;
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type string
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<std::string>( const std::string& element,
std::string& val)
{
val = element;
return true;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type bool
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<bool>( const std::string& element, bool& val)
{
// check if value is given as string-type { true | false }
if ( "true" == element)
{
val = true;
return true;
}
else if ( "false" == element)
{
val = false;
return true;
}
// check if argument is given as integer { 0 | 1 }
else
{
int tmp;
if ( convertToT<int>( element, tmp))
{
if ( 1 == tmp)
{
val = true;
return true;
}
else if ( 0 == tmp)
{
val = false;
return true;
}
}
}
return false;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the value of the command line argument with given name
//! @return A const handle to the requested argument. If the argument does
//! not exist or if it is not from type T NULL is returned
//! @param T the type of the argument requested
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
template<class T>
/*static*/ const T*
CmdArgReader::getArg( const std::string& name)
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
return NULL;
}
return self->getArgHelper<T>( name);
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a command line argument with the given name exists
//! @return true if a command line argument with name \a name exists,
//! otherwise false
//! @param name name of the command line argument in question
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline bool
CmdArgReader::existArg( const std::string& name)
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
return false;
}
return self->existArgHelper( name);
}
////////////////////////////////////////////////////////////////////////////////
//! @brief Get the value of the command line argument with given name
//! @return A const handle to the requested argument. If the argument does
//! not exist or if it is not from type T NULL is returned
//! @param T the type of the argument requested
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
template<class T>
const T*
CmdArgReader::getArgHelper( const std::string& name)
{
// check if argument already processed and stored in correct type
if ( args.end() != (iter = args.find( name)))
{
if ( (*(iter->second.first)) == typeid( T) )
{
return (T*) iter->second.second;
}
}
else
{
T* tmp = new T;
// check the array with unprocessed values
if ( unprocessed.end() != (iter_unprocessed = unprocessed.find( name)))
{
// try to "cast" the string to the type requested
if ( convertToT< T >( iter_unprocessed->second, *tmp))
{
// add the token element pair to map of already converted values
args[name] = std::make_pair( &(typeid( T)), (void*) tmp);
return tmp;
}
}
// not used while not inserted into the map -> cleanup
delete tmp;
}
// failed, argument not available
return NULL;
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a command line argument with name \a name exists
//! @return true if a command line argument of name \a name exists,
//! otherwise false
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
inline bool
CmdArgReader::existArgHelper( const std::string& name) const
{
bool ret_val = false;
// check if argument already processed and stored in correct type
if( args.end() != args.find( name))
{
ret_val = true;
}
else
{
// check the array with unprocessed values
if ( unprocessed.end() != unprocessed.find( name))
{
ret_val = true;
}
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the original / raw argc program argument
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline int&
CmdArgReader::getRArgc()
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
}
return rargc;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the original / raw argv program argument
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline char**&
CmdArgReader::getRArgv()
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
}
return rargv;
}
// functions, exported (extern)
#endif // #ifndef _CMDARGREADER_H_

View File

@@ -0,0 +1,151 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _EXCEPTION_H_
#define _EXCEPTION_H_
// includes, system
#include <exception>
#include <stdexcept>
#include <iostream>
#include <stdlib.h>
//! Exception wrapper.
//! @param Std_Exception Exception out of namespace std for easy typing.
template<class Std_Exception>
class Exception : public Std_Exception
{
public:
//! @brief Static construction interface
//! @return Alwayss throws ( Located_Exception<Exception>)
//! @param file file in which the Exception occurs
//! @param line line in which the Exception occurs
//! @param detailed details on the code fragment causing the Exception
static void throw_it( const char* file,
const int line,
const char* detailed = "-" );
//! Static construction interface
//! @return Alwayss throws ( Located_Exception<Exception>)
//! @param file file in which the Exception occurs
//! @param line line in which the Exception occurs
//! @param detailed details on the code fragment causing the Exception
static void throw_it( const char* file,
const int line,
const std::string& detailed);
//! Destructor
virtual ~Exception() throw();
private:
//! Constructor, default (private)
Exception();
//! Constructor, standard
//! @param str string returned by what()
Exception( const std::string& str);
};
////////////////////////////////////////////////////////////////////////////////
//! Exception handler function for arbitrary exceptions
//! @param ex exception to handle
////////////////////////////////////////////////////////////////////////////////
template<class Exception_Typ>
inline void
handleException( const Exception_Typ& ex)
{
std::cerr << ex.what() << std::endl;
exit( EXIT_FAILURE);
}
//! Convenience macros
//! Exception caused by dynamic program behavior, e.g. file does not exist
#define RUNTIME_EXCEPTION( msg) \
Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
//! Logic exception in program, e.g. an assert failed
#define LOGIC_EXCEPTION( msg) \
Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
//! Out of range exception
#define RANGE_EXCEPTION( msg) \
Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
////////////////////////////////////////////////////////////////////////////////
//! Implementation
// includes, system
#include <sstream>
////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! @param Exception causing code fragment (file and line) and detailed infos.
////////////////////////////////////////////////////////////////////////////////
/*static*/ template<class Std_Exception>
void
Exception<Std_Exception>::
throw_it( const char* file, const int line, const char* detailed)
{
std::stringstream s;
// Quiet heavy-weight but exceptions are not for
// performance / release versions
s << "Exception in file '" << file << "' in line " << line << "\n"
<< "Detailed description: " << detailed << "\n";
throw Exception( s.str());
}
////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! @param Exception causing code fragment (file and line) and detailed infos.
////////////////////////////////////////////////////////////////////////////////
/*static*/ template<class Std_Exception>
void
Exception<Std_Exception>::
throw_it( const char* file, const int line, const std::string& msg)
{
throw_it( file, line, msg.c_str());
}
////////////////////////////////////////////////////////////////////////////////
//! Constructor, default (private).
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::Exception() :
Exception("Unknown Exception.\n")
{ }
////////////////////////////////////////////////////////////////////////////////
//! Constructor, standard (private).
//! String returned by what().
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::Exception( const std::string& s) :
Std_Exception( s)
{ }
////////////////////////////////////////////////////////////////////////////////
//! Destructor
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::~Exception() throw() { }
// functions, exported
#endif // #ifndef _EXCEPTION_H_

View File

@@ -61,7 +61,7 @@ int main(int argc, char **argv)
*h_X,
*h_T;
const unsigned int optionCount = 4000000;
const unsigned int optionCount = 64;
const float R = 0.02f;
const float V = 0.30f;
@@ -69,7 +69,7 @@ int main(int argc, char **argv)
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
//Get all the devices
@@ -78,10 +78,10 @@ int main(int argc, char **argv)
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
shrLog("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
// Get command line device options and config accordingly
shrLog(" # of Devices Available = %u\n", uiNumDevices);
@@ -92,7 +92,7 @@ int main(int argc, char **argv)
shrLog(" Using Device %u: ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
// set logfile name and start logs
@@ -120,31 +120,31 @@ int main(int argc, char **argv)
shrLog("Initializing OpenCL...\n");
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
// Get a GPU device
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
// Create the context
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Creating OpenCL memory objects...\n");
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Starting up BlackScholes...\n");
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
@@ -204,9 +204,9 @@ int main(int argc, char **argv)
shrLog("\nReading back OpenCL BlackScholes results...\n");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Comparing against Host/C++ computation...\n");
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
@@ -232,7 +232,7 @@ int main(int argc, char **argv)
ciErrNum |= clReleaseMemObject(d_Call);
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
ciErrNum |= clReleaseContext(cxGPUContext);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
free(h_T);
free(h_X);

View File

@@ -9,8 +9,6 @@
*
*/
#include <oclUtils.h>
#include "oclBlackScholes_common.h"
@@ -18,19 +16,47 @@ static cl_program cpBlackScholes; //OpenCL program
static cl_kernel ckBlackScholes; //OpenCL kernel
static cl_command_queue cqDefaultCommandQueue;
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (NULL == filename || NULL == data || 0 == size)
return CL_INVALID_VALUE;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return CL_INVALID_VALUE;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return CL_SUCCESS;
}
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
cl_int ciErrNum;
size_t kernelLength;
shrLog("...loading BlackScholes.cl\n");
/*shrLog("...loading BlackScholes.cl\n");
char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
shrCheckError(cPathAndName != NULL, shrTRUE);
char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
shrCheckError(cBlackScholes != NULL, shrTRUE);
shrCheckError(cBlackScholes != NULL, shrTRUE);*/
shrLog("...creating BlackScholes program\n");
//cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
cpBlackScholes = clCreateProgramWithBuiltInKernels(context, 1, &device_id, "BlackScholes", NULL);
//cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
shrCheckError(ciErrNum, CL_SUCCESS);
cl_device_id device_id = oclGetFirstDev(cxGPUContext);
cpBlackScholes = clCreateProgramWithBinary(
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
shrCheckError(ciErrNum, CL_SUCCESS);
shrLog("...building BlackScholes program\n");
@@ -66,7 +92,7 @@ extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqPar
shrLog("*** Exiting ***\n");
free(logTxt);
free(cdDevices);
exit(666);
exit(1);
}
//Save ptx code to separate file
@@ -77,8 +103,8 @@ extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqPar
shrCheckError(ciErrNum, CL_SUCCESS);
cqDefaultCommandQueue = cqParamCommandQueue;
free(cBlackScholes);
free(cPathAndName);
//free(cBlackScholes);
//free(cPathAndName);
}
extern "C" void closeBlackScholes(void){
@@ -118,8 +144,8 @@ extern "C" void BlackScholes(
shrCheckError(ciErrNum, CL_SUCCESS);
//Run the kernel
size_t globalWorkSize = 60 * 1024;
size_t localWorkSize = 128;
size_t globalWorkSize = 16;//60 * 1024;
size_t localWorkSize = 16;//128;
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
}

View File

@@ -0,0 +1,806 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
#include <fstream>
#include <vector>
#include <iostream>
#include <algorithm>
#include <stdarg.h>
#include "oclUtils.h"
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platoform ID
//////////////////////////////////////////////////////////////////////////////
cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID)
{
char chBuffer[1024];
cl_uint num_platforms;
cl_platform_id* clPlatformIDs;
cl_int ciErrNum;
*clSelectedPlatformID = NULL;
// Get OpenCL platform count
ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms);
if (ciErrNum != CL_SUCCESS)
{
shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
return -1000;
}
else
{
if(num_platforms == 0)
{
shrLog("No OpenCL platform found!\n\n");
return -2000;
}
else
{
// if there's a platform or more, make space for ID's
if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL)
{
shrLog("Failed to allocate memory for cl_platform ID's!\n\n");
return -3000;
}
// get platform info for each platform and trap the NVIDIA platform if found
ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
for(cl_uint i = 0; i < num_platforms; ++i)
{
ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL);
if(ciErrNum == CL_SUCCESS)
{
if(strstr(chBuffer, "NVIDIA") != NULL)
{
*clSelectedPlatformID = clPlatformIDs[i];
break;
}
}
}
// default to zeroeth platform if NVIDIA not found
if(*clSelectedPlatformID == NULL)
{
shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n");
*clSelectedPlatformID = clPlatformIDs[0];
}
free(clPlatformIDs);
}
}
return CL_SUCCESS;
}
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
void oclPrintDevName(int iLogMode, cl_device_id device)
{
char device_string[1024];
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, "%s", device_string);
}
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
void oclPrintDevInfo(int iLogMode, cl_device_id device)
{
char device_string[1024];
bool nv_device_attibute_query = false;
// CL_DEVICE_NAME
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_NAME: \t\t\t%s\n", device_string);
// CL_DEVICE_VENDOR
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_VENDOR: \t\t\t%s\n", device_string);
// CL_DRIVER_VERSION
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DRIVER_VERSION: \t\t\t%s\n", device_string);
// CL_DEVICE_VERSION
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_VERSION: \t\t\t%s\n", device_string);
// CL_DEVICE_OPENCL_C_VERSION (if CL_DEVICE_VERSION version > 1.0)
if(strncmp("OpenCL 1.0", device_string, 10) != 0)
{
// This code is unused for devices reporting OpenCL 1.0, but a def is needed anyway to allow compilation using v 1.0 headers
// This constant isn't #defined in 1.0
#ifndef CL_DEVICE_OPENCL_C_VERSION
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
#endif
clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_OPENCL_C_VERSION: \t\t%s\n", device_string);
}
// CL_DEVICE_TYPE
cl_device_type type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
if( type & CL_DEVICE_TYPE_CPU )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
if( type & CL_DEVICE_TYPE_GPU )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
if( type & CL_DEVICE_TYPE_ACCELERATOR )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
if( type & CL_DEVICE_TYPE_DEFAULT )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", compute_units);
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
size_t workitem_dims;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
// CL_DEVICE_MAX_WORK_ITEM_SIZES
size_t workitem_size[3];
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
// CL_DEVICE_MAX_WORK_GROUP_SIZE
size_t workgroup_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);
// CL_DEVICE_ADDRESS_BITS
cl_uint addr_bits;
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_ADDRESS_BITS:\t\t%u\n", addr_bits);
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
cl_ulong max_mem_alloc_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(max_mem_alloc_size / (1024 * 1024)));
// CL_DEVICE_GLOBAL_MEM_SIZE
cl_ulong mem_size;
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(mem_size / (1024 * 1024)));
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
cl_bool error_correction_support;
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(error_correction_support), &error_correction_support, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", error_correction_support == CL_TRUE ? "yes" : "no");
// CL_DEVICE_LOCAL_MEM_TYPE
cl_device_local_mem_type local_mem_type;
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(local_mem_type), &local_mem_type, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", local_mem_type == 1 ? "local" : "global");
// CL_DEVICE_LOCAL_MEM_SIZE
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(mem_size / 1024));
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(mem_size / 1024));
// CL_DEVICE_QUEUE_PROPERTIES
cl_command_queue_properties queue_properties;
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(queue_properties), &queue_properties, NULL);
if( queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
if( queue_properties & CL_QUEUE_PROFILING_ENABLE )
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
// CL_DEVICE_IMAGE_SUPPORT
cl_bool image_support;
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", image_support);
// CL_DEVICE_MAX_READ_IMAGE_ARGS
cl_uint max_read_image_args;
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(max_read_image_args), &max_read_image_args, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", max_read_image_args);
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
cl_uint max_write_image_args;
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(max_write_image_args), &max_write_image_args, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", max_write_image_args);
// CL_DEVICE_SINGLE_FP_CONFIG
cl_device_fp_config fp_config;
clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_SINGLE_FP_CONFIG:\t\t%s%s%s%s%s%s\n",
fp_config & CL_FP_DENORM ? "denorms " : "",
fp_config & CL_FP_INF_NAN ? "INF-quietNaNs " : "",
fp_config & CL_FP_ROUND_TO_NEAREST ? "round-to-nearest " : "",
fp_config & CL_FP_ROUND_TO_ZERO ? "round-to-zero " : "",
fp_config & CL_FP_ROUND_TO_INF ? "round-to-inf " : "",
fp_config & CL_FP_FMA ? "fma " : "");
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
size_t szMaxDims[5];
shrLogEx(iLogMode, 0, "\n CL_DEVICE_IMAGE <dim>");
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &szMaxDims[0], NULL);
shrLogEx(iLogMode, 0, "\t\t\t2D_MAX_WIDTH\t %u\n", szMaxDims[0]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[1], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", szMaxDims[1]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &szMaxDims[2], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_WIDTH\t %u\n", szMaxDims[2]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[3], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", szMaxDims[3]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &szMaxDims[4], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_DEPTH\t %u\n", szMaxDims[4]);
// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(device_string), &device_string, NULL);
if (device_string != 0)
{
shrLogEx(iLogMode, 0, "\n CL_DEVICE_EXTENSIONS:");
std::string stdDevString;
stdDevString = std::string(device_string);
size_t szOldPos = 0;
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
while (szSpacePos != stdDevString.npos)
{
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
nv_device_attibute_query = true;
if (szOldPos > 0)
{
shrLogEx(iLogMode, 0, "\t\t");
}
shrLogEx(iLogMode, 0, "\t\t\t%s\n", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str());
do {
szOldPos = szSpacePos + 1;
szSpacePos = stdDevString.find(' ', szOldPos);
} while (szSpacePos == szOldPos);
}
shrLogEx(iLogMode, 0, "\n");
}
else
{
shrLogEx(iLogMode, 0, " CL_DEVICE_EXTENSIONS: None\n");
}
if(nv_device_attibute_query)
{
cl_uint compute_capability_major, compute_capability_minor;
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &compute_capability_major, NULL);
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &compute_capability_minor, NULL);
shrLogEx(iLogMode, 0, "\n CL_DEVICE_COMPUTE_CAPABILITY_NV:\t%u.%u\n", compute_capability_major, compute_capability_minor);
shrLogEx(iLogMode, 0, " NUMBER OF MULTIPROCESSORS:\t\t%u\n", compute_units); // this is the same value reported by CL_DEVICE_MAX_COMPUTE_UNITS
shrLogEx(iLogMode, 0, " NUMBER OF CUDA CORES:\t\t\t%u\n", ConvertSMVer2Cores(compute_capability_major, compute_capability_minor) * compute_units);
cl_uint regs_per_block;
clGetDeviceInfo(device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(cl_uint), &regs_per_block, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_REGISTERS_PER_BLOCK_NV:\t%u\n", regs_per_block);
cl_uint warp_size;
clGetDeviceInfo(device, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_WARP_SIZE_NV:\t\t%u\n", warp_size);
cl_bool gpu_overlap;
clGetDeviceInfo(device, CL_DEVICE_GPU_OVERLAP_NV, sizeof(cl_bool), &gpu_overlap, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_GPU_OVERLAP_NV:\t\t%s\n", gpu_overlap == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
cl_bool exec_timeout;
clGetDeviceInfo(device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(cl_bool), &exec_timeout, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:\t%s\n", exec_timeout == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
cl_bool integrated_memory;
clGetDeviceInfo(device, CL_DEVICE_INTEGRATED_MEMORY_NV, sizeof(cl_bool), &integrated_memory, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_INTEGRATED_MEMORY_NV:\t%s\n", integrated_memory == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
}
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
shrLogEx(iLogMode, 0, " CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
cl_uint vec_width [6];
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &vec_width[0], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &vec_width[1], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &vec_width[2], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &vec_width[3], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &vec_width[4], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &vec_width[5], NULL);
shrLogEx(iLogMode, 0, "CHAR %u, SHORT %u, INT %u, LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
vec_width[0], vec_width[1], vec_width[2], vec_width[3], vec_width[4], vec_width[5]);
}
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
int oclGetDevCap(cl_device_id device)
{
char cDevString[1024];
bool bDevAttributeQuery = false;
int iDevArch = -1;
// Get device extensions, and if any then search for cl_nv_device_attribute_query
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(cDevString), &cDevString, NULL);
if (cDevString != 0)
{
std::string stdDevString;
stdDevString = std::string(cDevString);
size_t szOldPos = 0;
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
while (szSpacePos != stdDevString.npos)
{
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
{
bDevAttributeQuery = true;
}
do {
szOldPos = szSpacePos + 1;
szSpacePos = stdDevString.find(' ', szOldPos);
} while (szSpacePos == szOldPos);
}
}
// if search succeeded, get device caps
if(bDevAttributeQuery)
{
cl_int iComputeCapMajor, iComputeCapMinor;
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), (void*)&iComputeCapMajor, NULL);
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), (void*)&iComputeCapMinor, NULL);
iDevArch = (10 * iComputeCapMajor) + iComputeCapMinor;
}
return iDevArch;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetFirstDev(cl_context cxGPUContext)
{
size_t szParmDataBytes;
cl_device_id* cdDevices;
// get the list of GPU devices associated with context
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
cl_device_id first = cdDevices[0];
free(cdDevices);
return first;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext)
{
size_t szParmDataBytes;
cl_device_id* cdDevices;
// get the list of GPU devices associated with context
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
size_t device_count = szParmDataBytes / sizeof(cl_device_id);
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
cl_device_id max_flops_device = cdDevices[0];
int max_flops = 0;
size_t current_device = 0;
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
max_flops = compute_units * clock_frequency;
++current_device;
while( current_device < device_count )
{
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
int flops = compute_units * clock_frequency;
if( flops > max_flops )
{
max_flops = flops;
max_flops_device = cdDevices[current_device];
}
++current_device;
}
free(cdDevices);
return max_flops_device;
}
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
// locals
FILE* pFileStream = NULL;
size_t szSourceLength;
// open the OpenCL source code file
#ifdef _WIN32 // Windows version
if(fopen_s(&pFileStream, cFilename, "rb") != 0)
{
return NULL;
}
#else // Linux version
pFileStream = fopen(cFilename, "rb");
if(pFileStream == 0)
{
return NULL;
}
#endif
size_t szPreambleLength = strlen(cPreamble);
// get the length of the source code
fseek(pFileStream, 0, SEEK_END);
szSourceLength = ftell(pFileStream);
fseek(pFileStream, 0, SEEK_SET);
// allocate a buffer for the source code string and read it in
char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
memcpy(cSourceString, cPreamble, szPreambleLength);
if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
{
fclose(pFileStream);
free(cSourceString);
return 0;
}
// close the file and return the total length of the combined (preamble + source) string
fclose(pFileStream);
if(szFinalLength != 0)
{
*szFinalLength = szSourceLength + szPreambleLength;
}
cSourceString[szSourceLength + szPreambleLength] = '\0';
return cSourceString;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int nr)
{
size_t szParmDataBytes;
cl_device_id* cdDevices;
// get the list of GPU devices associated with context
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
if( szParmDataBytes / sizeof(cl_device_id) <= nr ) {
return (cl_device_id)-1;
}
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
cl_device_id device = cdDevices[nr];
free(cdDevices);
return device;
}
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length)
{
// Grab the number of devices associated witht the program
cl_uint num_devices;
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
// Grab the device ids
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
// Grab the sizes of the binaries
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
// Now get the binaries
char** ptx_code = (char**) malloc(num_devices * sizeof(char*));
for( unsigned int i=0; i<num_devices; ++i) {
ptx_code[i]= (char*)malloc(binary_sizes[i]);
}
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
// Find the index of the device of interest
unsigned int idx = 0;
while( idx<num_devices && devices[idx] != cdDevice ) ++idx;
// If it is associated prepare the result
if( idx < num_devices )
{
*binary = ptx_code[idx];
*length = binary_sizes[idx];
}
// Cleanup
free( devices );
free( binary_sizes );
for( unsigned int i=0; i<num_devices; ++i) {
if( i != idx ) free(ptx_code[i]);
}
free( ptx_code );
}
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param const char* cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName)
{
// Grab the number of devices associated with the program
cl_uint num_devices;
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
// Grab the device ids
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
// Grab the sizes of the binaries
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
// Now get the binaries
char** ptx_code = (char**)malloc(num_devices * sizeof(char*));
for( unsigned int i=0; i<num_devices; ++i)
{
ptx_code[i] = (char*)malloc(binary_sizes[i]);
}
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
// Find the index of the device of interest
unsigned int idx = 0;
while((idx < num_devices) && (devices[idx] != cdDevice))
{
++idx;
}
// If the index is associated, log the result
if(idx < num_devices)
{
// if a separate filename is supplied, dump ptx there
if (NULL != cPtxFileName)
{
shrLog("\nWriting ptx to separate file: %s ...\n\n", cPtxFileName);
FILE* pFileStream = NULL;
#ifdef _WIN32
fopen_s(&pFileStream, cPtxFileName, "wb");
#else
pFileStream = fopen(cPtxFileName, "wb");
#endif
fwrite(ptx_code[idx], binary_sizes[idx], 1, pFileStream);
fclose(pFileStream);
}
else // log to logfile and console if no ptx file specified
{
shrLog("\n%s\nProgram Binary:\n%s\n%s\n", HDASHLINE, ptx_code[idx], HDASHLINE);
}
}
// Cleanup
free(devices);
free(binary_sizes);
for(unsigned int i = 0; i < num_devices; ++i)
{
free(ptx_code[i]);
}
free( ptx_code );
}
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice)
{
// write out the build log and ptx, then exit
char cBuildLog[10240];
clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG,
sizeof(cBuildLog), cBuildLog, NULL );
shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, cBuildLog, HDASHLINE);
}
// Helper function for De-allocating cl objects
// *********************************************************************
void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs)
{
int i;
for (i = 0; i < iNumObjs; i++)
{
if (cmMemObjs[i])clReleaseMemObject(cmMemObjs[i]);
}
}
// Helper function to get OpenCL error string from constant
// *********************************************************************
const char* oclErrorString(cl_int error)
{
static const char* errorString[] = {
"CL_SUCCESS",
"CL_DEVICE_NOT_FOUND",
"CL_DEVICE_NOT_AVAILABLE",
"CL_COMPILER_NOT_AVAILABLE",
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
"CL_OUT_OF_RESOURCES",
"CL_OUT_OF_HOST_MEMORY",
"CL_PROFILING_INFO_NOT_AVAILABLE",
"CL_MEM_COPY_OVERLAP",
"CL_IMAGE_FORMAT_MISMATCH",
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
"CL_BUILD_PROGRAM_FAILURE",
"CL_MAP_FAILURE",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"CL_INVALID_VALUE",
"CL_INVALID_DEVICE_TYPE",
"CL_INVALID_PLATFORM",
"CL_INVALID_DEVICE",
"CL_INVALID_CONTEXT",
"CL_INVALID_QUEUE_PROPERTIES",
"CL_INVALID_COMMAND_QUEUE",
"CL_INVALID_HOST_PTR",
"CL_INVALID_MEM_OBJECT",
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
"CL_INVALID_IMAGE_SIZE",
"CL_INVALID_SAMPLER",
"CL_INVALID_BINARY",
"CL_INVALID_BUILD_OPTIONS",
"CL_INVALID_PROGRAM",
"CL_INVALID_PROGRAM_EXECUTABLE",
"CL_INVALID_KERNEL_NAME",
"CL_INVALID_KERNEL_DEFINITION",
"CL_INVALID_KERNEL",
"CL_INVALID_ARG_INDEX",
"CL_INVALID_ARG_VALUE",
"CL_INVALID_ARG_SIZE",
"CL_INVALID_KERNEL_ARGS",
"CL_INVALID_WORK_DIMENSION",
"CL_INVALID_WORK_GROUP_SIZE",
"CL_INVALID_WORK_ITEM_SIZE",
"CL_INVALID_GLOBAL_OFFSET",
"CL_INVALID_EVENT_WAIT_LIST",
"CL_INVALID_EVENT",
"CL_INVALID_OPERATION",
"CL_INVALID_GL_OBJECT",
"CL_INVALID_BUFFER_SIZE",
"CL_INVALID_MIP_LEVEL",
"CL_INVALID_GLOBAL_WORK_SIZE",
};
const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
const int index = -error;
return (index >= 0 && index < errorCount) ? errorString[index] : "Unspecified Error";
}
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
const char* oclImageFormatString(cl_uint uiImageFormat)
{
// cl_channel_order
if (uiImageFormat == CL_R)return "CL_R";
if (uiImageFormat == CL_A)return "CL_A";
if (uiImageFormat == CL_RG)return "CL_RG";
if (uiImageFormat == CL_RA)return "CL_RA";
if (uiImageFormat == CL_RGB)return "CL_RGB";
if (uiImageFormat == CL_RGBA)return "CL_RGBA";
if (uiImageFormat == CL_BGRA)return "CL_BGRA";
if (uiImageFormat == CL_ARGB)return "CL_ARGB";
if (uiImageFormat == CL_INTENSITY)return "CL_INTENSITY";
if (uiImageFormat == CL_LUMINANCE)return "CL_LUMINANCE";
// cl_channel_type
if (uiImageFormat == CL_SNORM_INT8)return "CL_SNORM_INT8";
if (uiImageFormat == CL_SNORM_INT16)return "CL_SNORM_INT16";
if (uiImageFormat == CL_UNORM_INT8)return "CL_UNORM_INT8";
if (uiImageFormat == CL_UNORM_INT16)return "CL_UNORM_INT16";
if (uiImageFormat == CL_UNORM_SHORT_565)return "CL_UNORM_SHORT_565";
if (uiImageFormat == CL_UNORM_SHORT_555)return "CL_UNORM_SHORT_555";
if (uiImageFormat == CL_UNORM_INT_101010)return "CL_UNORM_INT_101010";
if (uiImageFormat == CL_SIGNED_INT8)return "CL_SIGNED_INT8";
if (uiImageFormat == CL_SIGNED_INT16)return "CL_SIGNED_INT16";
if (uiImageFormat == CL_SIGNED_INT32)return "CL_SIGNED_INT32";
if (uiImageFormat == CL_UNSIGNED_INT8)return "CL_UNSIGNED_INT8";
if (uiImageFormat == CL_UNSIGNED_INT16)return "CL_UNSIGNED_INT16";
if (uiImageFormat == CL_UNSIGNED_INT32)return "CL_UNSIGNED_INT32";
if (uiImageFormat == CL_HALF_FLOAT)return "CL_HALF_FLOAT";
if (uiImageFormat == CL_FLOAT)return "CL_FLOAT";
// unknown constant
return "Unknown";
}

File diff suppressed because it is too large Load Diff

116
tests/opencl/common.mk Normal file
View File

@@ -0,0 +1,116 @@
XLEN ?= 32
TARGET ?= opaesim
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
K_CFLAGS += -march=rv64imafd -mabi=ilp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
K_CFLAGS += -march=rv32imaf -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
LLVM_VORTEX ?= /opt/llvm-vortex
LLVM_POCL ?= /opt/llvm-vortex
K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
K_CFLAGS += -I$(VORTEX_KN_PATH)/include -DNDEBUG -DLLVM_VOTEX
K_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a -lm
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_RT_PATH)/stub -lvortex
ifdef HOSTGPU
CXXFLAGS += -DHOSTGPU
LDFLAGS += -lOpenCL
else
LDFLAGS += $(POCL_RT_PATH)/lib/libOpenCL.so
endif
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
ifeq ($(TARGET), fpga)
OPAE_DRV_PATHS ?= libopae-c.so
else
ifeq ($(TARGET), asesim)
OPAE_DRV_PATHS ?= libopae-c-ase.so
else
ifeq ($(TARGET), opaesim)
OPAE_DRV_PATHS ?= libopae-c-sim.so
endif
endif
endif
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
all: $(PROJECT) kernel.pocl
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
%.cc.o: %.cc
$(CXX) $(CXXFLAGS) -c $< -o $@
%.cpp.o: %.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@
%.c.o: %.c
$(CC) $(CXXFLAGS) -c $< -o $@
$(PROJECT): $(OBJS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-opae: $(PROJECT) kernel.pocl
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-xrt: $(PROJECT) kernel.pocl
ifeq ($(TARGET), hw)
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
else
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
endif
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.dump *.pocl
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -1 +0,0 @@
convolution

View File

@@ -1,67 +0,0 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/simx -lOpenCL -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = convolution
SRCS = main.cpp utils.cpp
all: $(PROJECT) kernel.pocl
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 44 KiB

View File

@@ -1,54 +0,0 @@
__kernel
void convolution(
__read_only image2d_t sourceImage,
__write_only image2d_t outputImage,
int rows,
int cols,
__constant float* filter,
int filterWidth,
sampler_t sampler)
{
// Store each work-items unique row and column
int column = get_global_id(0);
int row = get_global_id(1);
// Half the width of the filter is needed for indexing
// memory later
int halfWidth = (int)(filterWidth/2);
// All accesses to images return data as four-element vector
// (i.e., float4), although only the 'x' component will contain
// meaningful data in this code
float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
// Iterator for the filter
int filterIdx = 0;
// Each work-item iterates around its local area based on the
// size of the filter
int2 coords; // Coordinates for accessing the image
// Iterate the filter rows
for(int i = -halfWidth; i <= halfWidth; i++) {
coords.y = row + i;
// Iterate over the filter columns
for(int j = -halfWidth; j <= halfWidth; j++) {
coords.x = column + j;
float4 pixel;
// Read a pixel from the image. A single channel image
// stores the pixel in the 'x' coordinate of the returned
// vector.
pixel = read_imagef(sourceImage, sampler, coords);
sum.x += pixel.x * filter[filterIdx++];
}
}
// Copy the data to the output image if the
// work-item is in bounds
if(row < rows && column < cols) {
coords.x = column;
coords.y = row;
write_imagef(outputImage, coords, sum);
}
}

View File

@@ -1,261 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include "utils.h"
// This function takes a positive integer and rounds it up to
// the nearest multiple of another provided integer
unsigned int roundUp(unsigned int value, unsigned int multiple) {
// Determine how far past the nearest multiple the value is
unsigned int remainder = value % multiple;
// Add the difference to make the value a multiple
if(remainder != 0) {
value += (multiple-remainder);
}
return value;
}
// This function reads in a text file and stores it as a char pointer
char* readSource(char* kernelPath) {
cl_int status;
FILE *fp;
char *source;
long int size;
printf("Program file is: %s\n", kernelPath);
fp = fopen(kernelPath, "rb");
if(!fp) {
printf("Could not open kernel file\n");
exit(-1);
}
status = fseek(fp, 0, SEEK_END);
if(status != 0) {
printf("Error seeking to end of file\n");
exit(-1);
}
size = ftell(fp);
if(size < 0) {
printf("Error getting file position\n");
exit(-1);
}
rewind(fp);
source = (char *)malloc(size + 1);
int i;
for (i = 0; i < size+1; i++) {
source[i]='\0';
}
if(source == NULL) {
printf("Error allocating space for the kernel source\n");
exit(-1);
}
fread(source, 1, size, fp);
source[size] = '\0';
return source;
}
void chk(cl_int status, const char* cmd) {
if(status != CL_SUCCESS) {
printf("%s failed (%d)\n", cmd, status);
exit(-1);
}
}
int main() {
int i, j, k, l;
// Rows and columns in the input image
int imageHeight;
int imageWidth;
const char* inputFile = "input.bmp";
const char* outputFile = "output.bmp";
// Homegrown function to read a BMP from file
float* inputImage = readImage(inputFile, &imageWidth,
&imageHeight);
// Size of the input and output images on the host
int dataSize = imageHeight*imageWidth*sizeof(float);
// Output image on the host
float* outputImage = NULL;
outputImage = (float*)malloc(dataSize);
float* refImage = NULL;
refImage = (float*)malloc(dataSize);
// 45 degree motion blur
float filter[49] =
{0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, -1, 0, 1, 0, 0,
0, 0, -2, 0, 2, 0, 0,
0, 0, -1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0};
// The convolution filter is 7x7
int filterWidth = 7;
int filterSize = filterWidth*filterWidth; // Assume a square kernel
// Set up the OpenCL environment
cl_int status;
// Discovery platform
cl_platform_id platform;
status = clGetPlatformIDs(1, &platform, NULL);
chk(status, "clGetPlatformIDs");
// Discover device
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
chk(status, "clGetDeviceIDs");
// Create context
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
(cl_context_properties)(platform), 0};
cl_context context;
context = clCreateContext(props, 1, &device, NULL, NULL, &status);
chk(status, "clCreateContext");
// Create command queue
cl_command_queue queue;
queue = clCreateCommandQueue(context, device, 0, &status);
chk(status, "clCreateCommandQueue");
// The image format describes how the data will be stored in memory
cl_image_format format;
format.image_channel_order = CL_R; // single channel
format.image_channel_data_type = CL_FLOAT; // float data type
// Create space for the source image on the device
cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
imageHeight, 0, NULL, &status);
chk(status, "clCreateImage2D");
// Create space for the output image on the device
cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
imageHeight, 0, NULL, &status);
chk(status, "clCreateImage2D");
// Create space for the 7x7 filter on the device
cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
NULL, &status);
chk(status, "clCreateBuffer");
// Copy the source image to the device
size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension
status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
0, 0, inputImage, 0, NULL, NULL);
chk(status, "clEnqueueWriteImage");
// Copy the 7x7 filter to the device
status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
filterSize*sizeof(float), filter, 0, NULL, NULL);
chk(status, "clEnqueueWriteBuffer");
// Create the image sampler
cl_sampler sampler = clCreateSampler(context, CL_FALSE,
CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
chk(status, "clCreateSampler");
const char* source = readSource("kernel.cl");
// Create a program object with source and build it
cl_program program;
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
chk(status, "clCreateProgramWithSource");
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
chk(status, "clBuildProgram");
// Create the kernel object
cl_kernel kernel;
kernel = clCreateKernel(program, "convolution", &status);
chk(status, "clCreateKernel");
// Set the kernel arguments
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
chk(status, "clSetKernelArg");
// Set the work item dimensions
size_t globalSize[2] = {imageWidth, imageHeight};
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
NULL, NULL);
chk(status, "clEnqueueNDRange");
// Read the image back to the host
status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
region, 0, 0, outputImage, 0, NULL, NULL);
chk(status, "clEnqueueReadImage");
// Write the output image to file
storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);
// Compute the reference image
for(i = 0; i < imageHeight; i++) {
for(j = 0; j < imageWidth; j++) {
refImage[i*imageWidth+j] = 0;
}
}
// Iterate over the rows of the source image
int halfFilterWidth = filterWidth/2;
float sum;
for(i = 0; i < imageHeight; i++) {
// Iterate over the columns of the source image
for(j = 0; j < imageWidth; j++) {
sum = 0; // Reset sum for new source pixel
// Apply the filter to the neighborhood
for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
if(i+k >= 0 && i+k < imageHeight &&
j+l >= 0 && j+l < imageWidth) {
sum += inputImage[(i+k)*imageWidth + j+l] *
filter[(k+halfFilterWidth)*filterWidth +
l+halfFilterWidth];
}
}
}
refImage[i*imageWidth+j] = sum;
}
}
int failed = 0;
for(i = 0; i < imageHeight; i++) {
for(j = 0; j < imageWidth; j++) {
if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) {
printf("Results are INCORRECT\n");
printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
failed = 1;
}
if(failed) break;
}
if(failed) break;
}
if(!failed) {
printf("Results are correct\n");
}
return 0;
}

View File

@@ -1,180 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include "utils.h"
void storeImage(float *imageOut,
const char *filename,
int rows,
int cols,
const char* refFilename) {
FILE *ifp, *ofp;
unsigned char tmp;
int offset;
unsigned char *buffer;
int i, j;
int bytes;
int height, width;
ifp = fopen(refFilename, "rb");
if(ifp == NULL) {
perror(filename);
exit(-1);
}
fseek(ifp, 10, SEEK_SET);
fread(&offset, 4, 1, ifp);
fseek(ifp, 18, SEEK_SET);
fread(&width, 4, 1, ifp);
fread(&height, 4, 1, ifp);
fseek(ifp, 0, SEEK_SET);
buffer = (unsigned char *)malloc(offset);
if(buffer == NULL) {
perror("malloc");
exit(-1);
}
fread(buffer, 1, offset, ifp);
printf("Writing output image to %s\n", filename);
ofp = fopen(filename, "wb");
if(ofp == NULL) {
perror("opening output file");
exit(-1);
}
bytes = fwrite(buffer, 1, offset, ofp);
if(bytes != offset) {
printf("error writing header!\n");
exit(-1);
}
// NOTE bmp formats store data in reverse raster order (see comment in
// readImage function), so we need to flip it upside down here.
int mod = width % 4;
if(mod != 0) {
mod = 4 - mod;
}
// printf("mod = %d\n", mod);
for(i = height-1; i >= 0; i--) {
for(j = 0; j < width; j++) {
tmp = (unsigned char)imageOut[i*cols+j];
fwrite(&tmp, sizeof(char), 1, ofp);
}
// In bmp format, rows must be a multiple of 4-bytes.
// So if we're not at a multiple of 4, add junk padding.
for(j = 0; j < mod; j++) {
fwrite(&tmp, sizeof(char), 1, ofp);
}
}
fclose(ofp);
fclose(ifp);
free(buffer);
}
/*
* Read bmp image and convert to byte array. Also output the width and height
*/
float* readImage(const char *filename, int* widthOut, int* heightOut) {
uchar* imageData;
int height, width;
uchar tmp;
int offset;
int i, j;
printf("Reading input image from %s\n", filename);
FILE *fp = fopen(filename, "rb");
if(fp == NULL) {
perror(filename);
exit(-1);
}
fseek(fp, 10, SEEK_SET);
fread(&offset, 4, 1, fp);
fseek(fp, 18, SEEK_SET);
fread(&width, 4, 1, fp);
fread(&height, 4, 1, fp);
printf("width = %d\n", width);
printf("height = %d\n", height);
*widthOut = width;
*heightOut = height;
imageData = (uchar*)malloc(width*height);
if(imageData == NULL) {
perror("malloc");
exit(-1);
}
fseek(fp, offset, SEEK_SET);
fflush(NULL);
int mod = width % 4;
if(mod != 0) {
mod = 4 - mod;
}
// NOTE bitmaps are stored in upside-down raster order. So we begin
// reading from the bottom left pixel, then going from left-to-right,
// read from the bottom to the top of the image. For image analysis,
// we want the image to be right-side up, so we'll modify it here.
// First we read the image in upside-down
// Read in the actual image
for(i = 0; i < height; i++) {
// add actual data to the image
for(j = 0; j < width; j++) {
fread(&tmp, sizeof(char), 1, fp);
imageData[i*width + j] = tmp;
}
// For the bmp format, each row has to be a multiple of 4,
// so I need to read in the junk data and throw it away
for(j = 0; j < mod; j++) {
fread(&tmp, sizeof(char), 1, fp);
}
}
// Then we flip it over
int flipRow;
for(i = 0; i < height/2; i++) {
flipRow = height - (i+1);
for(j = 0; j < width; j++) {
tmp = imageData[i*width+j];
imageData[i*width+j] = imageData[flipRow*width+j];
imageData[flipRow*width+j] = tmp;
}
}
fclose(fp);
// Input image on the host
float* floatImage = NULL;
floatImage = (float*)malloc(sizeof(float)*width*height);
if(floatImage == NULL) {
perror("malloc");
exit(-1);
}
// Convert the BMP image to float (not required)
for(i = 0; i < height; i++) {
for(j = 0; j < width; j++) {
floatImage[i*width+j] = (float)imageData[i*width+j];
}
}
free(imageData);
return floatImage;
}

View File

@@ -1,11 +0,0 @@
#ifndef __UTILS__
#define __UTILS__
typedef unsigned char uchar;
float* readImage(const char *filename, int* widthOut, int* heightOut);
void storeImage(float *imageOut, const char *filename, int rows, int cols,
const char* refFilename);
#endif

View File

@@ -1,69 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = cutcp
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c
all: $(PROJECT).dump $(PROJECT).hex
CXXFLAGS += -I.
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
OPTS ?=
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
include ../common.mk

View File

@@ -19,6 +19,27 @@
#include "macros.h"
#include "ocl.h"
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (NULL == filename || NULL == data || 0 == size)
return CL_INVALID_VALUE;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return CL_INVALID_VALUE;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return CL_SUCCESS;
}
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
typedef cl_int4 xyz;
@@ -294,8 +315,6 @@ int gpu_compute_cutoff_potential_lattice(
printf("\n");
}
printf("Ok!\n");
pb_Context* pb_context;
pb_context = pb_InitOpenCLContext(parameters);
if (pb_context == NULL) {
@@ -303,8 +322,6 @@ int gpu_compute_cutoff_potential_lattice(
return -1;
}
printf("Ok!\n");
cl_int clStatus;
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
@@ -317,8 +334,13 @@ int gpu_compute_cutoff_potential_lattice(
//const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
cl_program clProgram = clCreateProgramWithBuiltInKernels(
clContext, 1, &clDevice, "opencl_cutoff_potential_lattice", &clStatus);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
cl_program clProgram = clCreateProgramWithBinary(
clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
CHECK_ERROR("clCreateProgramWithSource")
char clOptions[50];
@@ -399,9 +421,6 @@ int gpu_compute_cutoff_potential_lattice(
clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
CHECK_ERROR("clSetKernelArg")
printf("Ok!!\n");
/* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
@@ -412,26 +431,16 @@ int gpu_compute_cutoff_potential_lattice(
clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
CHECK_ERROR("clSetKernelArg")
printf("Ok**!2\n");
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
printf("Ok**!2\n");
CHECK_ERROR("clEnqueueNDRangeKernel")
printf("Ok**!2\n");
clStatus = clFinish(clCommandQueue);
printf("Ok**!2\n");
CHECK_ERROR("clFinish")
}
printf("Ok++!\n");
printf("Finished OpenCL kernel calls \n");
printf("Finished OpenCL kernel calls\n");
/* copy result regions from OpenCL device */
pb_SwitchToTimer(timers, pb_TimerID_COPY);

View File

@@ -9,6 +9,10 @@
#ifndef __GPUINFOH__
#define __GPUINFOH__
#ifdef __cplusplus
extern "C" {
#endif
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
@@ -17,4 +21,8 @@ void compute_active_thread(size_t *thread,
int minor,
int sm);
#ifdef __cplusplus
}
#endif
#endif

Binary file not shown.

View File

@@ -124,8 +124,6 @@ int main(int argc, char *argv[]) {
pb_InitializeTimerSet(&timers);
pb_SwitchToTimer(&timers, pb_TimerID_IO);
printf("OK\n");
{
const char *pqrfilename = parameters->inpFiles[0];
@@ -136,8 +134,6 @@ int main(int argc, char *argv[]) {
printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
}
printf("OK\n");
/* find extent of domain */
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
get_atom_extent(&min_ext, &max_ext, atom);

View File

@@ -3,6 +3,10 @@
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
void clMemSet(cl_command_queue, cl_mem, int, size_t);
char* readFile(const char*);
@@ -14,4 +18,8 @@ char* readFile(const char*);
exit(1); \
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,7 @@
PROJECT = dotproduct
SRCS = main.cc oclUtils.cpp shrUtils.cpp cmd_arg_reader.cpp
OPTS ?= -n64
include ../common.mk

View File

@@ -0,0 +1,152 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
// includes, file
#include "cmd_arg_reader.h"
// includes, system
#include <vector>
// internal unnamed namespace
namespace
{
// types, internal (class, enum, struct, union, typedef)
// variables, internal
} // namespace {
// variables, exported
/*static*/ CmdArgReader* CmdArgReader::self;
/*static*/ char** CmdArgReader::rargv;
/*static*/ int CmdArgReader::rargc;
// functions, exported
////////////////////////////////////////////////////////////////////////////////
//! Public construction interface
//! @return a handle to the class instance
//! @param argc number of command line arguments (as given to main())
//! @param argv command line argument string (as given to main())
////////////////////////////////////////////////////////////////////////////////
/*static*/ void
CmdArgReader::init( const int argc, const char** argv)
{
if ( NULL != self)
{
return;
}
// command line arguments
if (( 0 == argc) || ( 0 == argv))
{
LOGIC_EXCEPTION( "No command line arguments given.");
}
self = new CmdArgReader();
self->createArgsMaps( argc, argv);
rargc = argc;
rargv = const_cast<char**>( argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Constructor, default
////////////////////////////////////////////////////////////////////////////////
CmdArgReader::CmdArgReader() :
args(),
unprocessed(),
iter(),
iter_unprocessed()
{ }
////////////////////////////////////////////////////////////////////////////////
//! Destructor
////////////////////////////////////////////////////////////////////////////////
CmdArgReader::~CmdArgReader()
{
for( iter = args.begin(); iter != args.end(); ++iter)
{
if( *(iter->second.first) == typeid( int))
{
delete static_cast<int*>( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( bool))
{
delete static_cast<bool*>( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( std::string))
{
delete static_cast<std::string*>( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( std::vector< std::string>) )
{
delete static_cast< std::vector< std::string>* >( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( std::vector<int>) )
{
delete static_cast< std::vector<int>* >( iter->second.second);
break;
}
}
}
////////////////////////////////////////////////////////////////////////////////
//! Read args as token value pair into map for better processing (Even the
//! values remain strings until the parameter values is requested by the
//! program.)
//! @param argc the argument count (as given to 'main')
//! @param argv the char* array containing the command line arguments
////////////////////////////////////////////////////////////////////////////////
void
CmdArgReader::createArgsMaps( const int argc, const char** argv) {
std::string token;
std::string val_str;
std::map< std::string, std::string> args;
std::string::size_type pos;
std::string arg;
for( int i=1; i<argc; ++i)
{
arg = argv[i];
// check if valid command line argument: all arguments begin with - or --
if (arg[0] != '-')
{
RUNTIME_EXCEPTION("Invalid command line argument.");
}
int numDashes = (arg[1] == '-' ? 2 : 1);
// check if only flag or if a value is given
if ( (pos = arg.find( '=')) == std::string::npos)
{
unprocessed[ std::string( arg, numDashes, arg.length()-numDashes)] = "FLAG";
}
else
{
unprocessed[ std::string( arg, numDashes, pos-numDashes)] =
std::string( arg, pos+1, arg.length()-1);
}
}
}

View File

@@ -0,0 +1,488 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _CMDARGREADER_H_
#define _CMDARGREADER_H_
// includes, system
#include <map>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <typeinfo>
// includes, project
#include "exception.h"
//! Preprocessed command line arguments
//! @note Lazy evaluation: The arguments are converted from strings to
//! the correct data type upon request. Converted values are stored
//! in an additonal map so that no additional conversion is
//! necessary. Arrays of command line arguments are stored in
//! std::vectors
//! @note Usage:
//! const std::string* file =
//! CmdArgReader::getArg< std::string>( "model")
//! const std::vector< std::string>* files =
//! CmdArgReader::getArg< std::vector< std::string> >( "model")
//! @note All command line arguments begin with '--' followed by the token;
//! token and value are seperated by '='; example --samples=50
//! @note Arrays have the form --model=[one.obj,two.obj,three.obj]
//! (without whitespaces)
//! Command line argument parser
class CmdArgReader
{
template<class> friend class TestCmdArgReader;
protected:
//! @param self handle to the only instance of this class
static CmdArgReader* self;
public:
//! Public construction interface
//! @return a handle to the class instance
//! @param argc number of command line arguments (as given to main())
//! @param argv command line argument string (as given to main())
static void init( const int argc, const char** argv);
public:
//! Get the value of the command line argument with given name
//! @return A const handle to the requested argument.
//! If the argument does not exist or if it
//! is not from type T NULL is returned
//! @param name the name of the requested argument
//! @note T the type of the argument requested
template<class T>
static inline const T* getArg( const std::string& name);
//! Check if a command line argument with the given name exists
//! @return true if a command line argument with name \a name exists,
//! otherwise false
//! @param name name of the command line argument in question
static inline bool existArg( const std::string& name);
//! Get the original / raw argc program argument
static inline int& getRArgc();
//! Get the original / raw argv program argument
static inline char**& getRArgv();
public:
//! Destructor
~CmdArgReader();
protected:
//! Constructor, default
CmdArgReader();
private:
// private helper functions
//! Get the value of the command line argument with given name
//! @note Private helper function for 'getArg' to work on the members
//! @return A const handle to the requested argument. If the argument
//! does not exist or if it is not from type T a NULL pointer
//! is returned.
//! @param name the name of the requested argument
//! @note T the type of the argument requested
template<class T>
inline const T* getArgHelper( const std::string& name);
//! Check if a command line argument with name \a name exists
//! @return true if a command line argument of name \a name exists,
//! otherwise false
//! @param name the name of the requested argument
inline bool existArgHelper( const std::string& name) const;
//! Read args as token value pair into map for better processing
//! (Even the values remain strings until the parameter values is
//! requested by the program.)
//! @param argc the argument count (as given to 'main')
//! @param argv the char* array containing the command line arguments
void createArgsMaps( const int argc, const char** argv);
//! Helper for "casting" the strings from the map with the unprocessed
//! values to the correct
//! data type.
//! @return true if conversion succeeded, otherwise false
//! @param element the value as string
//! @param val the value as type T
template<class T>
static inline bool convertToT( const std::string& element, T& val);
public:
// typedefs internal
//! container for a processed command line argument
//! typeid is used to easily be able to decide if a re-requested token-value
//! pair match the type of the first conversion
typedef std::pair< const std::type_info*, void*> ValType;
//! map of already converted values
typedef std::map< std::string, ValType > ArgsMap;
//! iterator for the map of already converted values
typedef ArgsMap::iterator ArgsMapIter;
typedef ArgsMap::const_iterator ConstArgsMapIter;
//! map of unprocessed (means unconverted) token-value pairs
typedef std::map< std::string, std::string> UnpMap;
//! iterator for the map of unprocessed (means unconverted) token-value pairs
typedef std::map< std::string, std::string>::iterator UnpMapIter;
private:
#ifdef _WIN32
# pragma warning( disable: 4251)
#endif
//! rargc original value of argc
static int rargc;
//! rargv contains command line arguments in raw format
static char** rargv;
//! args Map containing the already converted token-value pairs
ArgsMap args;
//! args Map containing the unprocessed / unconverted token-value pairs
UnpMap unprocessed;
//! iter Iterator for the map with the already converted token-value
//! pairs (to avoid frequent reallocation)
ArgsMapIter iter;
//! iter Iterator for the map with the unconverted token-value
//! pairs (to avoid frequent reallocation)
UnpMapIter iter_unprocessed;
#ifdef _WIN32
# pragma warning( default: 4251)
#endif
private:
//! Constructor, copy (not implemented)
CmdArgReader( const CmdArgReader&);
//! Assignment operator (not implemented)
CmdArgReader& operator=( const CmdArgReader&);
};
// variables, exported (extern)
// functions, inlined (inline)
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line argument arrays
//! @note This function is used each type for which no template specialization
//! exist (which will cause errors if the type does not fulfill the std::vector
//! interface).
////////////////////////////////////////////////////////////////////////////////
template<class T>
/*static*/ inline bool
CmdArgReader::convertToT( const std::string& element, T& val)
{
// preallocate storage
val.resize( std::count( element.begin(), element.end(), ',') + 1);
unsigned int i = 0;
std::string::size_type pos_start = 1; // leave array prefix '['
std::string::size_type pos_end = 0;
// do for all elements of the comma seperated list
while( std::string::npos != ( pos_end = element.find(',', pos_end+1)) )
{
// convert each element by the appropriate function
if ( ! convertToT< typename T::value_type >(
std::string( element, pos_start, pos_end - pos_start), val[i]))
{
return false;
}
pos_start = pos_end + 1;
++i;
}
std::string tmp1( element, pos_start, element.length() - pos_start - 1);
// process last element (leave array postfix ']')
if ( ! convertToT< typename T::value_type >( std::string( element,
pos_start,
element.length() - pos_start - 1),
val[i]))
{
return false;
}
// possible to process all elements?
return true;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type int
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<int>( const std::string& element, int& val)
{
std::istringstream ios( element);
ios >> val;
bool ret_val = false;
if ( ios.eof())
{
ret_val = true;
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type float
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<float>( const std::string& element, float& val)
{
std::istringstream ios( element);
ios >> val;
bool ret_val = false;
if ( ios.eof())
{
ret_val = true;
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type double
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<double>( const std::string& element, double& val)
{
std::istringstream ios( element);
ios >> val;
bool ret_val = false;
if ( ios.eof())
{
ret_val = true;
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type string
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<std::string>( const std::string& element,
std::string& val)
{
val = element;
return true;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type bool
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<bool>( const std::string& element, bool& val)
{
// check if value is given as string-type { true | false }
if ( "true" == element)
{
val = true;
return true;
}
else if ( "false" == element)
{
val = false;
return true;
}
// check if argument is given as integer { 0 | 1 }
else
{
int tmp;
if ( convertToT<int>( element, tmp))
{
if ( 1 == tmp)
{
val = true;
return true;
}
else if ( 0 == tmp)
{
val = false;
return true;
}
}
}
return false;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the value of the command line argument with given name
//! @return A const handle to the requested argument. If the argument does
//! not exist or if it is not from type T NULL is returned
//! @param T the type of the argument requested
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
template<class T>
/*static*/ const T*
CmdArgReader::getArg( const std::string& name)
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
return NULL;
}
return self->getArgHelper<T>( name);
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a command line argument with the given name exists
//! @return true if a command line argument with name \a name exists,
//! otherwise false
//! @param name name of the command line argument in question
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline bool
CmdArgReader::existArg( const std::string& name)
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
return false;
}
return self->existArgHelper( name);
}
////////////////////////////////////////////////////////////////////////////////
//! @brief Get the value of the command line argument with given name
//! @return A const handle to the requested argument. If the argument does
//! not exist or if it is not from type T NULL is returned
//! @param T the type of the argument requested
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
template<class T>
const T*
CmdArgReader::getArgHelper( const std::string& name)
{
// check if argument already processed and stored in correct type
if ( args.end() != (iter = args.find( name)))
{
if ( (*(iter->second.first)) == typeid( T) )
{
return (T*) iter->second.second;
}
}
else
{
T* tmp = new T;
// check the array with unprocessed values
if ( unprocessed.end() != (iter_unprocessed = unprocessed.find( name)))
{
// try to "cast" the string to the type requested
if ( convertToT< T >( iter_unprocessed->second, *tmp))
{
// add the token element pair to map of already converted values
args[name] = std::make_pair( &(typeid( T)), (void*) tmp);
return tmp;
}
}
// not used while not inserted into the map -> cleanup
delete tmp;
}
// failed, argument not available
return NULL;
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a command line argument with name \a name exists
//! @return true if a command line argument of name \a name exists,
//! otherwise false
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
inline bool
CmdArgReader::existArgHelper( const std::string& name) const
{
bool ret_val = false;
// check if argument already processed and stored in correct type
if( args.end() != args.find( name))
{
ret_val = true;
}
else
{
// check the array with unprocessed values
if ( unprocessed.end() != unprocessed.find( name))
{
ret_val = true;
}
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the original / raw argc program argument
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline int&
CmdArgReader::getRArgc()
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
}
return rargc;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the original / raw argv program argument
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline char**&
CmdArgReader::getRArgv()
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
}
return rargv;
}
// functions, exported (extern)
#endif // #ifndef _CMDARGREADER_H_

View File

@@ -0,0 +1,151 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _EXCEPTION_H_
#define _EXCEPTION_H_
// includes, system
#include <exception>
#include <stdexcept>
#include <iostream>
#include <stdlib.h>
//! Exception wrapper.
//! @param Std_Exception Exception out of namespace std for easy typing.
template<class Std_Exception>
class Exception : public Std_Exception
{
public:
//! @brief Static construction interface
//! @return Alwayss throws ( Located_Exception<Exception>)
//! @param file file in which the Exception occurs
//! @param line line in which the Exception occurs
//! @param detailed details on the code fragment causing the Exception
static void throw_it( const char* file,
const int line,
const char* detailed = "-" );
//! Static construction interface
//! @return Alwayss throws ( Located_Exception<Exception>)
//! @param file file in which the Exception occurs
//! @param line line in which the Exception occurs
//! @param detailed details on the code fragment causing the Exception
static void throw_it( const char* file,
const int line,
const std::string& detailed);
//! Destructor
virtual ~Exception() throw();
private:
//! Constructor, default (private)
Exception();
//! Constructor, standard
//! @param str string returned by what()
Exception( const std::string& str);
};
////////////////////////////////////////////////////////////////////////////////
//! Exception handler function for arbitrary exceptions
//! @param ex exception to handle
////////////////////////////////////////////////////////////////////////////////
template<class Exception_Typ>
inline void
handleException( const Exception_Typ& ex)
{
std::cerr << ex.what() << std::endl;
exit( EXIT_FAILURE);
}
//! Convenience macros
//! Exception caused by dynamic program behavior, e.g. file does not exist
#define RUNTIME_EXCEPTION( msg) \
Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
//! Logic exception in program, e.g. an assert failed
#define LOGIC_EXCEPTION( msg) \
Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
//! Out of range exception
#define RANGE_EXCEPTION( msg) \
Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
////////////////////////////////////////////////////////////////////////////////
//! Implementation
// includes, system
#include <sstream>
////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! @param Exception causing code fragment (file and line) and detailed infos.
////////////////////////////////////////////////////////////////////////////////
/*static*/ template<class Std_Exception>
void
Exception<Std_Exception>::
throw_it( const char* file, const int line, const char* detailed)
{
std::stringstream s;
// Quiet heavy-weight but exceptions are not for
// performance / release versions
s << "Exception in file '" << file << "' in line " << line << "\n"
<< "Detailed description: " << detailed << "\n";
throw Exception( s.str());
}
////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! @param Exception causing code fragment (file and line) and detailed infos.
////////////////////////////////////////////////////////////////////////////////
/*static*/ template<class Std_Exception>
void
Exception<Std_Exception>::
throw_it( const char* file, const int line, const std::string& msg)
{
throw_it( file, line, msg.c_str());
}
////////////////////////////////////////////////////////////////////////////////
//! Constructor, default (private).
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::Exception() :
Exception("Unknown Exception.\n")
{ }
////////////////////////////////////////////////////////////////////////////////
//! Constructor, standard (private).
//! String returned by what().
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::Exception( const std::string& s) :
Std_Exception( s)
{ }
////////////////////////////////////////////////////////////////////////////////
//! Destructor
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::~Exception() throw() { }
// functions, exported
#endif // #ifndef _EXCEPTION_H_

View File

@@ -0,0 +1,22 @@
__kernel void DotProduct (__global float* a, __global float* b, __global float* c, int iNumElements)
{
// find position in global arrays
int iGID = get_global_id(0);
// bound check (equivalent to the limit on a 'for' loop for standard/serial C code
//printf("%d, %d\n", iGID, iNumElements);
if (iGID >= iNumElements)
{
return;
}
// process
int iInOffset = iGID << 2;
c[iGID] = a[iInOffset] * b[iInOffset]
+ a[iInOffset + 1] * b[iInOffset + 1]
+ a[iInOffset + 2] * b[iInOffset + 2]
+ a[iInOffset + 3] * b[iInOffset + 3];
//float cc = c[iGID];
//printf("c[%d]=%f\n", iGID, cc);
}

View File

@@ -1,3 +1,4 @@
//////////////////////////////////////////////////////////////////////////
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
@@ -10,25 +11,26 @@
*/
// *********************************************************************
// oclDotProduct Notes:
// oclDotProduct Notes:
//
// A simple OpenCL API demo application that implements a
// vector dot product computation between 2 float arrays.
// vector dot product computation between 2 float arrays.
//
// Runs computations with OpenCL on the GPU device and then checks results
// Runs computations with OpenCL on the GPU device and then checks results
// against basic host CPU/C++ computation.
//
// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness.
// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness.
// But these are NOT required libs for OpenCL developement in general.
// *********************************************************************
// standard utilities and systems includes
#include <oclUtils.h>
#include <shrQATest.h>
#include "oclUtils.h"
#include "shrQATest.h"
// Name of the file with the source code for the computation kernel
// *********************************************************************
const char* cSourceFile = "DotProduct.cl";
const char* cSourceFile = "kernel.pocl";
// Host buffers for demo
// *********************************************************************
@@ -43,20 +45,20 @@ cl_command_queue cqCommandQueue;// OpenCL command que
cl_program program; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
cl_mem cmDevSrcA; // OpenCL device source buffer A
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevDst; // OpenCL device destination buffer
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevDst; // OpenCL device destination buffer
size_t szGlobalWorkSize; // Total # of work items in the 1D range
size_t szLocalWorkSize; // # of work items in the 1D work group
size_t szLocalWorkSize; // # of work items in the 1D work group
size_t szParmDataBytes; // Byte size of context information
size_t szKernelLength; // Byte size of kernel code
cl_int ciErrNum; // Error code var
char* cPathAndName = NULL; // var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
char* cSourceCL = NULL; // Buffer to hold source for compilation
const char* cExecutableName = NULL;
// demo config vars
int iNumElements= 1277944; // Length of float arrays to process (odd # for illustration)
shrBOOL bNoPrompt = shrFALSE;
int iNumElements= 1024; // Length of float arrays to process (odd # for illustration)
shrBOOL bNoPrompt = shrFALSE;
// Forward Declarations
// *********************************************************************
@@ -67,7 +69,7 @@ void (*pCleanup)(int) = &Cleanup;
int *gp_argc = NULL;
char ***gp_argv = NULL;
// Main function
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
@@ -76,38 +78,29 @@ int main(int argc, char **argv)
shrQAStart(argc, argv);
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
cl_uint uiNumComputeUnits;
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
ciErrNum = clGetPlatformIDs(1, &cpPlatform, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cl_uint uiNumDevices = 1;
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
cl_uint uiTargetDevice = 0;
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
//Get all the devices
cl_uint uiNumDevices = 0; // Number of devices available
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
shrLog("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
// Get command line device options and config accordingly
shrLog(" # of Devices Available = %u\n", uiNumDevices);
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
shrLog(" # of Devices Available = %u\n", uiNumDevices);
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
{
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
}
shrLog(" Using Device %u: ", uiTargetDevice);
shrLog(" Using Device %u: ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
// get command line arg for quick test, if provided
bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
@@ -115,16 +108,16 @@ int main(int argc, char **argv)
// start logs
cExecutableName = argv[0];
shrSetLogFileName ("oclDotProduct.txt");
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
// set and log Global and Local work size dimensions
szLocalWorkSize = 256;
szLocalWorkSize = 16;
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
// Allocate and initialize host arrays
shrLog( "Allocate and Init Host Mem...\n");
shrLog( "Allocate and Init Host Mem...\n");
srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
@@ -134,49 +127,50 @@ int main(int argc, char **argv)
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Get a GPU device
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create the context
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create a command-queue
shrLog("clCreateCommandQueue...\n");
shrLog("clCreateCommandQueue...\n");
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Read the OpenCL kernel in from source file
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
//oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
//oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
// Create the program
shrLog("clCreateProgramWithSource...\n");
//program = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
shrLog("clCreateProgramWithSource...\n");
cl_int binary_status;
cl_program program =
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL);
clCreateProgramWithBinary(cxGPUContext, 1, cdDevices, &szKernelLength, (const uint8_t**)&cSourceCL, &binary_status, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Build the program with 'mad' Optimization option
#ifdef MAC
char* flags = "-cl-fast-relaxed-math -DMAC";
#else
char* flags = "-cl-fast-relaxed-math";
#endif
shrLog("clBuildProgram...\n");
shrLog("clBuildProgram...\n");
ciErrNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
@@ -184,47 +178,50 @@ int main(int argc, char **argv)
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
Cleanup(EXIT_FAILURE);
Cleanup(EXIT_FAILURE);
}
// Create the kernel
shrLog("clCreateKernel (DotProduct)...\n");
shrLog("clCreateKernel (nDotProduct)...\n");
ckKernel = clCreateKernel(program, "DotProduct", &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Set the Argument values
shrLog("clSetKernelArg 0 - 3...\n\n");
shrLog("clSetKernelArg 0 - 3...\n\n");
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// --------------------------------------------------------
// Core sequence... copy input data to GPU, compute, copy results back
// Asynchronous write of data to GPU device
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Launch kernel
shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Read back results and check accumulated errors
shrLog("clEnqueueReadBuffer (Dst)...\n\n");
shrLog("clEnqueueReadBuffer (Dst)...\n\n");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Compute and compare results for golden-host and report errors and pass/fail
shrLog("Comparing against Host/C++ computation...\n\n");
shrLog("Comparing against Host/C++ computation...\n\n");
DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
// Cleanup and leave
Cleanup (EXIT_SUCCESS);
return (bMatch == shrTRUE) ? 0 : 1;
}
// "Golden" Host processing dot product function for comparison purposes
@@ -232,13 +229,13 @@ int main(int argc, char **argv)
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements)
{
int i, j, k;
for (i = 0, j = 0; i < iNumElements; i++)
for (i = 0, j = 0; i < iNumElements; i++)
{
pfResult[i] = 0.0f;
for (k = 0; k < 4; k++, j++)
for (k = 0; k < 4; k++, j++)
{
pfResult[i] += pfData1[j] * pfData2[j];
}
pfResult[i] += pfData1[j] * pfData2[j];
}
}
}
@@ -250,7 +247,7 @@ void Cleanup(int iExitCode)
shrLog("Starting Cleanup...\n\n");
if(cPathAndName)free(cPathAndName);
if(cSourceCL)free(cSourceCL);
if(ckKernel)clReleaseKernel(ckKernel);
if(ckKernel)clReleaseKernel(ckKernel);
if(program)clReleaseProgram(program);
if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
if(cxGPUContext)clReleaseContext(cxGPUContext);
@@ -259,7 +256,7 @@ void Cleanup(int iExitCode)
if (cmDevDst)clReleaseMemObject(cmDevDst);
// Free host memory
free(srcA);
free(srcA);
free(srcB);
free (dst);
free(Golden);
@@ -267,4 +264,4 @@ void Cleanup(int iExitCode)
if (cdDevices) free(cdDevices);
shrQAFinishExit(*gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED);
}
}

View File

@@ -0,0 +1,806 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
#include <fstream>
#include <vector>
#include <iostream>
#include <algorithm>
#include <stdarg.h>
#include "oclUtils.h"
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platoform ID
//////////////////////////////////////////////////////////////////////////////
cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID)
{
char chBuffer[1024];
cl_uint num_platforms;
cl_platform_id* clPlatformIDs;
cl_int ciErrNum;
*clSelectedPlatformID = NULL;
// Get OpenCL platform count
ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms);
if (ciErrNum != CL_SUCCESS)
{
shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
return -1000;
}
else
{
if(num_platforms == 0)
{
shrLog("No OpenCL platform found!\n\n");
return -2000;
}
else
{
// if there's a platform or more, make space for ID's
if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL)
{
shrLog("Failed to allocate memory for cl_platform ID's!\n\n");
return -3000;
}
// get platform info for each platform and trap the NVIDIA platform if found
ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
for(cl_uint i = 0; i < num_platforms; ++i)
{
ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL);
if(ciErrNum == CL_SUCCESS)
{
if(strstr(chBuffer, "NVIDIA") != NULL)
{
*clSelectedPlatformID = clPlatformIDs[i];
break;
}
}
}
// default to zeroeth platform if NVIDIA not found
if(*clSelectedPlatformID == NULL)
{
shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n");
*clSelectedPlatformID = clPlatformIDs[0];
}
free(clPlatformIDs);
}
}
return CL_SUCCESS;
}
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
void oclPrintDevName(int iLogMode, cl_device_id device)
{
char device_string[1024];
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, "%s", device_string);
}
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
void oclPrintDevInfo(int iLogMode, cl_device_id device)
{
char device_string[1024];
bool nv_device_attibute_query = false;
// CL_DEVICE_NAME
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_NAME: \t\t\t%s\n", device_string);
// CL_DEVICE_VENDOR
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_VENDOR: \t\t\t%s\n", device_string);
// CL_DRIVER_VERSION
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DRIVER_VERSION: \t\t\t%s\n", device_string);
// CL_DEVICE_VERSION
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_VERSION: \t\t\t%s\n", device_string);
// CL_DEVICE_OPENCL_C_VERSION (if CL_DEVICE_VERSION version > 1.0)
if(strncmp("OpenCL 1.0", device_string, 10) != 0)
{
// This code is unused for devices reporting OpenCL 1.0, but a def is needed anyway to allow compilation using v 1.0 headers
// This constant isn't #defined in 1.0
#ifndef CL_DEVICE_OPENCL_C_VERSION
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
#endif
clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_OPENCL_C_VERSION: \t\t%s\n", device_string);
}
// CL_DEVICE_TYPE
cl_device_type type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
if( type & CL_DEVICE_TYPE_CPU )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
if( type & CL_DEVICE_TYPE_GPU )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
if( type & CL_DEVICE_TYPE_ACCELERATOR )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
if( type & CL_DEVICE_TYPE_DEFAULT )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", compute_units);
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
size_t workitem_dims;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
// CL_DEVICE_MAX_WORK_ITEM_SIZES
size_t workitem_size[3];
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
// CL_DEVICE_MAX_WORK_GROUP_SIZE
size_t workgroup_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);
// CL_DEVICE_ADDRESS_BITS
cl_uint addr_bits;
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_ADDRESS_BITS:\t\t%u\n", addr_bits);
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
cl_ulong max_mem_alloc_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(max_mem_alloc_size / (1024 * 1024)));
// CL_DEVICE_GLOBAL_MEM_SIZE
cl_ulong mem_size;
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(mem_size / (1024 * 1024)));
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
cl_bool error_correction_support;
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(error_correction_support), &error_correction_support, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", error_correction_support == CL_TRUE ? "yes" : "no");
// CL_DEVICE_LOCAL_MEM_TYPE
cl_device_local_mem_type local_mem_type;
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(local_mem_type), &local_mem_type, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", local_mem_type == 1 ? "local" : "global");
// CL_DEVICE_LOCAL_MEM_SIZE
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(mem_size / 1024));
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(mem_size / 1024));
// CL_DEVICE_QUEUE_PROPERTIES
cl_command_queue_properties queue_properties;
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(queue_properties), &queue_properties, NULL);
if( queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
if( queue_properties & CL_QUEUE_PROFILING_ENABLE )
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
// CL_DEVICE_IMAGE_SUPPORT
cl_bool image_support;
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", image_support);
// CL_DEVICE_MAX_READ_IMAGE_ARGS
cl_uint max_read_image_args;
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(max_read_image_args), &max_read_image_args, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", max_read_image_args);
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
cl_uint max_write_image_args;
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(max_write_image_args), &max_write_image_args, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", max_write_image_args);
// CL_DEVICE_SINGLE_FP_CONFIG
cl_device_fp_config fp_config;
clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_SINGLE_FP_CONFIG:\t\t%s%s%s%s%s%s\n",
fp_config & CL_FP_DENORM ? "denorms " : "",
fp_config & CL_FP_INF_NAN ? "INF-quietNaNs " : "",
fp_config & CL_FP_ROUND_TO_NEAREST ? "round-to-nearest " : "",
fp_config & CL_FP_ROUND_TO_ZERO ? "round-to-zero " : "",
fp_config & CL_FP_ROUND_TO_INF ? "round-to-inf " : "",
fp_config & CL_FP_FMA ? "fma " : "");
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
size_t szMaxDims[5];
shrLogEx(iLogMode, 0, "\n CL_DEVICE_IMAGE <dim>");
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &szMaxDims[0], NULL);
shrLogEx(iLogMode, 0, "\t\t\t2D_MAX_WIDTH\t %u\n", szMaxDims[0]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[1], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", szMaxDims[1]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &szMaxDims[2], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_WIDTH\t %u\n", szMaxDims[2]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[3], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", szMaxDims[3]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &szMaxDims[4], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_DEPTH\t %u\n", szMaxDims[4]);
// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(device_string), &device_string, NULL);
if (device_string != 0)
{
shrLogEx(iLogMode, 0, "\n CL_DEVICE_EXTENSIONS:");
std::string stdDevString;
stdDevString = std::string(device_string);
size_t szOldPos = 0;
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
while (szSpacePos != stdDevString.npos)
{
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
nv_device_attibute_query = true;
if (szOldPos > 0)
{
shrLogEx(iLogMode, 0, "\t\t");
}
shrLogEx(iLogMode, 0, "\t\t\t%s\n", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str());
do {
szOldPos = szSpacePos + 1;
szSpacePos = stdDevString.find(' ', szOldPos);
} while (szSpacePos == szOldPos);
}
shrLogEx(iLogMode, 0, "\n");
}
else
{
shrLogEx(iLogMode, 0, " CL_DEVICE_EXTENSIONS: None\n");
}
if(nv_device_attibute_query)
{
cl_uint compute_capability_major, compute_capability_minor;
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &compute_capability_major, NULL);
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &compute_capability_minor, NULL);
shrLogEx(iLogMode, 0, "\n CL_DEVICE_COMPUTE_CAPABILITY_NV:\t%u.%u\n", compute_capability_major, compute_capability_minor);
shrLogEx(iLogMode, 0, " NUMBER OF MULTIPROCESSORS:\t\t%u\n", compute_units); // this is the same value reported by CL_DEVICE_MAX_COMPUTE_UNITS
shrLogEx(iLogMode, 0, " NUMBER OF CUDA CORES:\t\t\t%u\n", ConvertSMVer2Cores(compute_capability_major, compute_capability_minor) * compute_units);
cl_uint regs_per_block;
clGetDeviceInfo(device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(cl_uint), &regs_per_block, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_REGISTERS_PER_BLOCK_NV:\t%u\n", regs_per_block);
cl_uint warp_size;
clGetDeviceInfo(device, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_WARP_SIZE_NV:\t\t%u\n", warp_size);
cl_bool gpu_overlap;
clGetDeviceInfo(device, CL_DEVICE_GPU_OVERLAP_NV, sizeof(cl_bool), &gpu_overlap, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_GPU_OVERLAP_NV:\t\t%s\n", gpu_overlap == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
cl_bool exec_timeout;
clGetDeviceInfo(device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(cl_bool), &exec_timeout, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:\t%s\n", exec_timeout == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
cl_bool integrated_memory;
clGetDeviceInfo(device, CL_DEVICE_INTEGRATED_MEMORY_NV, sizeof(cl_bool), &integrated_memory, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_INTEGRATED_MEMORY_NV:\t%s\n", integrated_memory == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
}
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
shrLogEx(iLogMode, 0, " CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
cl_uint vec_width [6];
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &vec_width[0], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &vec_width[1], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &vec_width[2], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &vec_width[3], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &vec_width[4], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &vec_width[5], NULL);
shrLogEx(iLogMode, 0, "CHAR %u, SHORT %u, INT %u, LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
vec_width[0], vec_width[1], vec_width[2], vec_width[3], vec_width[4], vec_width[5]);
}
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
int oclGetDevCap(cl_device_id device)
{
char cDevString[1024];
bool bDevAttributeQuery = false;
int iDevArch = -1;
// Get device extensions, and if any then search for cl_nv_device_attribute_query
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(cDevString), &cDevString, NULL);
if (cDevString != 0)
{
std::string stdDevString;
stdDevString = std::string(cDevString);
size_t szOldPos = 0;
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
while (szSpacePos != stdDevString.npos)
{
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
{
bDevAttributeQuery = true;
}
do {
szOldPos = szSpacePos + 1;
szSpacePos = stdDevString.find(' ', szOldPos);
} while (szSpacePos == szOldPos);
}
}
// if search succeeded, get device caps
if(bDevAttributeQuery)
{
cl_int iComputeCapMajor, iComputeCapMinor;
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), (void*)&iComputeCapMajor, NULL);
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), (void*)&iComputeCapMinor, NULL);
iDevArch = (10 * iComputeCapMajor) + iComputeCapMinor;
}
return iDevArch;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetFirstDev(cl_context cxGPUContext)
{
size_t szParmDataBytes;
cl_device_id* cdDevices;
// get the list of GPU devices associated with context
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
cl_device_id first = cdDevices[0];
free(cdDevices);
return first;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext)
{
size_t szParmDataBytes;
cl_device_id* cdDevices;
// get the list of GPU devices associated with context
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
size_t device_count = szParmDataBytes / sizeof(cl_device_id);
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
cl_device_id max_flops_device = cdDevices[0];
int max_flops = 0;
size_t current_device = 0;
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
max_flops = compute_units * clock_frequency;
++current_device;
while( current_device < device_count )
{
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
int flops = compute_units * clock_frequency;
if( flops > max_flops )
{
max_flops = flops;
max_flops_device = cdDevices[current_device];
}
++current_device;
}
free(cdDevices);
return max_flops_device;
}
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
// locals
FILE* pFileStream = NULL;
size_t szSourceLength;
// open the OpenCL source code file
#ifdef _WIN32 // Windows version
if(fopen_s(&pFileStream, cFilename, "rb") != 0)
{
return NULL;
}
#else // Linux version
pFileStream = fopen(cFilename, "rb");
if(pFileStream == 0)
{
return NULL;
}
#endif
size_t szPreambleLength = strlen(cPreamble);
// get the length of the source code
fseek(pFileStream, 0, SEEK_END);
szSourceLength = ftell(pFileStream);
fseek(pFileStream, 0, SEEK_SET);
// allocate a buffer for the source code string and read it in
char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
memcpy(cSourceString, cPreamble, szPreambleLength);
if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
{
fclose(pFileStream);
free(cSourceString);
return 0;
}
// close the file and return the total length of the combined (preamble + source) string
fclose(pFileStream);
if(szFinalLength != 0)
{
*szFinalLength = szSourceLength + szPreambleLength;
}
cSourceString[szSourceLength + szPreambleLength] = '\0';
return cSourceString;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int nr)
{
size_t szParmDataBytes;
cl_device_id* cdDevices;
// get the list of GPU devices associated with context
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
if( szParmDataBytes / sizeof(cl_device_id) <= nr ) {
return (cl_device_id)-1;
}
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
cl_device_id device = cdDevices[nr];
free(cdDevices);
return device;
}
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length)
{
// Grab the number of devices associated witht the program
cl_uint num_devices;
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
// Grab the device ids
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
// Grab the sizes of the binaries
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
// Now get the binaries
char** ptx_code = (char**) malloc(num_devices * sizeof(char*));
for( unsigned int i=0; i<num_devices; ++i) {
ptx_code[i]= (char*)malloc(binary_sizes[i]);
}
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
// Find the index of the device of interest
unsigned int idx = 0;
while( idx<num_devices && devices[idx] != cdDevice ) ++idx;
// If it is associated prepare the result
if( idx < num_devices )
{
*binary = ptx_code[idx];
*length = binary_sizes[idx];
}
// Cleanup
free( devices );
free( binary_sizes );
for( unsigned int i=0; i<num_devices; ++i) {
if( i != idx ) free(ptx_code[i]);
}
free( ptx_code );
}
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param const char* cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName)
{
// Grab the number of devices associated with the program
cl_uint num_devices;
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
// Grab the device ids
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
// Grab the sizes of the binaries
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
// Now get the binaries
char** ptx_code = (char**)malloc(num_devices * sizeof(char*));
for( unsigned int i=0; i<num_devices; ++i)
{
ptx_code[i] = (char*)malloc(binary_sizes[i]);
}
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
// Find the index of the device of interest
unsigned int idx = 0;
while((idx < num_devices) && (devices[idx] != cdDevice))
{
++idx;
}
// If the index is associated, log the result
if(idx < num_devices)
{
// if a separate filename is supplied, dump ptx there
if (NULL != cPtxFileName)
{
shrLog("\nWriting ptx to separate file: %s ...\n\n", cPtxFileName);
FILE* pFileStream = NULL;
#ifdef _WIN32
fopen_s(&pFileStream, cPtxFileName, "wb");
#else
pFileStream = fopen(cPtxFileName, "wb");
#endif
fwrite(ptx_code[idx], binary_sizes[idx], 1, pFileStream);
fclose(pFileStream);
}
else // log to logfile and console if no ptx file specified
{
shrLog("\n%s\nProgram Binary:\n%s\n%s\n", HDASHLINE, ptx_code[idx], HDASHLINE);
}
}
// Cleanup
free(devices);
free(binary_sizes);
for(unsigned int i = 0; i < num_devices; ++i)
{
free(ptx_code[i]);
}
free( ptx_code );
}
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice)
{
// write out the build log and ptx, then exit
char cBuildLog[10240];
clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG,
sizeof(cBuildLog), cBuildLog, NULL );
shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, cBuildLog, HDASHLINE);
}
// Helper function for De-allocating cl objects
// *********************************************************************
void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs)
{
int i;
for (i = 0; i < iNumObjs; i++)
{
if (cmMemObjs[i])clReleaseMemObject(cmMemObjs[i]);
}
}
// Helper function to get OpenCL error string from constant
// *********************************************************************
const char* oclErrorString(cl_int error)
{
static const char* errorString[] = {
"CL_SUCCESS",
"CL_DEVICE_NOT_FOUND",
"CL_DEVICE_NOT_AVAILABLE",
"CL_COMPILER_NOT_AVAILABLE",
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
"CL_OUT_OF_RESOURCES",
"CL_OUT_OF_HOST_MEMORY",
"CL_PROFILING_INFO_NOT_AVAILABLE",
"CL_MEM_COPY_OVERLAP",
"CL_IMAGE_FORMAT_MISMATCH",
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
"CL_BUILD_PROGRAM_FAILURE",
"CL_MAP_FAILURE",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"CL_INVALID_VALUE",
"CL_INVALID_DEVICE_TYPE",
"CL_INVALID_PLATFORM",
"CL_INVALID_DEVICE",
"CL_INVALID_CONTEXT",
"CL_INVALID_QUEUE_PROPERTIES",
"CL_INVALID_COMMAND_QUEUE",
"CL_INVALID_HOST_PTR",
"CL_INVALID_MEM_OBJECT",
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
"CL_INVALID_IMAGE_SIZE",
"CL_INVALID_SAMPLER",
"CL_INVALID_BINARY",
"CL_INVALID_BUILD_OPTIONS",
"CL_INVALID_PROGRAM",
"CL_INVALID_PROGRAM_EXECUTABLE",
"CL_INVALID_KERNEL_NAME",
"CL_INVALID_KERNEL_DEFINITION",
"CL_INVALID_KERNEL",
"CL_INVALID_ARG_INDEX",
"CL_INVALID_ARG_VALUE",
"CL_INVALID_ARG_SIZE",
"CL_INVALID_KERNEL_ARGS",
"CL_INVALID_WORK_DIMENSION",
"CL_INVALID_WORK_GROUP_SIZE",
"CL_INVALID_WORK_ITEM_SIZE",
"CL_INVALID_GLOBAL_OFFSET",
"CL_INVALID_EVENT_WAIT_LIST",
"CL_INVALID_EVENT",
"CL_INVALID_OPERATION",
"CL_INVALID_GL_OBJECT",
"CL_INVALID_BUFFER_SIZE",
"CL_INVALID_MIP_LEVEL",
"CL_INVALID_GLOBAL_WORK_SIZE",
};
const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
const int index = -error;
return (index >= 0 && index < errorCount) ? errorString[index] : "Unspecified Error";
}
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
const char* oclImageFormatString(cl_uint uiImageFormat)
{
// cl_channel_order
if (uiImageFormat == CL_R)return "CL_R";
if (uiImageFormat == CL_A)return "CL_A";
if (uiImageFormat == CL_RG)return "CL_RG";
if (uiImageFormat == CL_RA)return "CL_RA";
if (uiImageFormat == CL_RGB)return "CL_RGB";
if (uiImageFormat == CL_RGBA)return "CL_RGBA";
if (uiImageFormat == CL_BGRA)return "CL_BGRA";
if (uiImageFormat == CL_ARGB)return "CL_ARGB";
if (uiImageFormat == CL_INTENSITY)return "CL_INTENSITY";
if (uiImageFormat == CL_LUMINANCE)return "CL_LUMINANCE";
// cl_channel_type
if (uiImageFormat == CL_SNORM_INT8)return "CL_SNORM_INT8";
if (uiImageFormat == CL_SNORM_INT16)return "CL_SNORM_INT16";
if (uiImageFormat == CL_UNORM_INT8)return "CL_UNORM_INT8";
if (uiImageFormat == CL_UNORM_INT16)return "CL_UNORM_INT16";
if (uiImageFormat == CL_UNORM_SHORT_565)return "CL_UNORM_SHORT_565";
if (uiImageFormat == CL_UNORM_SHORT_555)return "CL_UNORM_SHORT_555";
if (uiImageFormat == CL_UNORM_INT_101010)return "CL_UNORM_INT_101010";
if (uiImageFormat == CL_SIGNED_INT8)return "CL_SIGNED_INT8";
if (uiImageFormat == CL_SIGNED_INT16)return "CL_SIGNED_INT16";
if (uiImageFormat == CL_SIGNED_INT32)return "CL_SIGNED_INT32";
if (uiImageFormat == CL_UNSIGNED_INT8)return "CL_UNSIGNED_INT8";
if (uiImageFormat == CL_UNSIGNED_INT16)return "CL_UNSIGNED_INT16";
if (uiImageFormat == CL_UNSIGNED_INT32)return "CL_UNSIGNED_INT32";
if (uiImageFormat == CL_HALF_FLOAT)return "CL_HALF_FLOAT";
if (uiImageFormat == CL_FLOAT)return "CL_FLOAT";
// unknown constant
return "Unknown";
}

View File

@@ -17,7 +17,7 @@
// *********************************************************************
// Common headers: Cross-API utililties and OpenCL header
#include <shrUtils.h>
#include "shrUtils.h"
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)

File diff suppressed because it is too large Load Diff

View File

@@ -13,7 +13,7 @@
#define SHR_UTILS_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// reminders for output window and build log
@@ -639,4 +639,4 @@ inline void __shrExitEX(int argc, const char** argv, int iExitCode)
exit(iExitCode);
}
#endif
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,70 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
CXXFLAGS += -Wno-unused-variable -Wno-narrowing -Wno-unused-result -Wno-unused-but-set-variable
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = guassian
SRCS = main.cc clutils.cpp utils.cpp
all: $(PROJECT) kernel.pocl
OPTS ?=
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

Binary file not shown.

View File

@@ -31,8 +31,6 @@ int main(int argc, char *argv[]) {
a = (float *)malloc(size * size * sizeof(float));
printf("OK\n");
InitMat(fp, size, a, size, size);
// printf("The input matrix a is:\n");
// PrintMat(a, size, size, size);

View File

@@ -1,67 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = kmeans
SRCS = main.cc read_input.c rmse.c kmeans_clustering.c cluster.c getopt.c
all: $(PROJECT) kernel.pocl
OPTS ?=
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

Binary file not shown.

View File

@@ -170,6 +170,7 @@ float** kmeans_clustering(float **feature, /* in: [npoints][nfeatures] */
free(new_centers[0]);
free(new_centers);
free(new_centers_len);
free(initial);
return clusters;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -104,8 +104,8 @@ static int initialize(int use_gpu) {
context = clCreateContext(NULL, 1, device_list, NULL, NULL, &result);
// create command queue for the first device
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, NULL);
if (!cmd_queue) {
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, &result);
if (!cmd_queue || result != CL_SUCCESS) {
printf("ERROR: clCreateCommandQueue() failed\n");
return -1;
}
@@ -120,7 +120,7 @@ static int shutdown() {
if (context)
clReleaseContext(context);
if (device_list)
delete device_list;
delete [] device_list;
// reset all variables
cmd_queue = 0;
@@ -188,7 +188,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
fread(source + strlen(source), sourcesize, 1, fp);
fclose(fp);*/
// OpenCL initialization
// OpenCL initialization
int use_gpu = 1;
if (initialize(use_gpu))
return -1;
@@ -197,12 +197,25 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
cl_int err = 0;
//const char *slist[2] = {source, 0};
//cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
cl_program prog = clCreateProgramWithBuiltInKernels(context, 1, device_list, "kmeans_kernel_c;kmeans_swap", &err);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateProgramWithSource() => %d\n", err);
printf("ERROR: read_kernel_file() => %d\n", err);
return -1;
}
err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
cl_program prog = clCreateProgramWithBinary(
context, 1, device_list, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateProgramWithBinary() => %d\n", err);
return -1;
}
free(kernel_bin);
err = clBuildProgram(prog, 1, &device_list[0], NULL, NULL, NULL);
{ // show warnings/errors
// static char log[65536]; memset(log, 0, sizeof(log));
// cl_device_id device_id = 0;
@@ -226,6 +239,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
printf("ERROR: clCreateKernel() 0 => %d\n", err);
return -1;
}
kernel2 = clCreateKernel(prog, kernel_swap, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateKernel() 0 => %d\n", err);
@@ -241,6 +255,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
n_points * n_features, err);
return -1;
}
d_feature_swap =
clCreateBuffer(context, CL_MEM_READ_WRITE,
n_points * n_features * sizeof(float), NULL, &err);
@@ -249,6 +264,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
n_points * n_features, err);
return -1;
}
d_cluster =
clCreateBuffer(context, CL_MEM_READ_WRITE,
n_clusters * n_features * sizeof(float), NULL, &err);
@@ -257,6 +273,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
n_clusters * n_features, err);
return -1;
}
d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_points * sizeof(int), NULL, &err);
if (err != CL_SUCCESS) {
@@ -296,6 +313,8 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
}
membership_OCL = (int *)malloc(n_points * sizeof(int));
return 0;
}
void deallocateMemory() {

View File

@@ -331,7 +331,9 @@ int setup(int argc, char **argv) {
}
}
/* free up memory */
/* free up memory */
free(cluster_centres[0]);
free(cluster_centres);
free(features[0]);
free(features);
return (0);

View File

@@ -1,69 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = lbm
SRCS = main.cc args.c parboil_opencl.c gpu_info.c lbm.c ocl.c
all: $(PROJECT).dump $(PROJECT).hex
CXXFLAGS += -I.
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
OPTS ?=
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
include ../common.mk

View File

@@ -9,6 +9,10 @@
#ifndef __GPUINFOH__
#define __GPUINFOH__
#ifdef __cplusplus
extern "C" {
#endif
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
@@ -17,4 +21,8 @@ void compute_active_thread(size_t *thread,
int minor,
int sm);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -6,16 +6,16 @@
*cr
***************************************************************************/
/*############################################################################*/
#ifndef _LBM_H_
#define _LBM_H_
/*############################################################################*/
#include "ocl.h"
#include "lbm_macros.h"
#ifdef __cplusplus
extern "C" {
#endif
void LBM_allocateGrid( float** ptr );
void LBM_freeGrid( float** ptr );
void LBM_initializeGrid( LBM_Grid grid );
@@ -34,6 +34,8 @@ void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid
void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
/*############################################################################*/
#ifdef __cplusplus
}
#endif
#endif /* _LBM_H_ */

Binary file not shown.

View File

@@ -21,6 +21,26 @@
#include "main.h"
#include "ocl.h"
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return CL_INVALID_VALUE;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return CL_INVALID_VALUE;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return CL_SUCCESS;
}
/*############################################################################*/
@@ -170,8 +190,6 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
LBM_freeGrid((float **)&TEMP_srcGrid);
LBM_freeGrid((float **)&TEMP_dstGrid);
printf("OK\n");
}
/*############################################################################*/
@@ -188,7 +206,9 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
LBM_showGridStatistics(TEMP_srcGrid);
LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
if (param->resultFilename) {
LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
}
LBM_freeGrid((float **)&TEMP_srcGrid);
OpenCL_LBM_freeGrid(OpenCL_srcGrid);
@@ -220,8 +240,14 @@ void OpenCL_initialize(struct pb_Parameters *p, OpenCL_Param *prm) {
//const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
//prm->clProgram = clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus);
prm->clProgram = clCreateProgramWithBuiltInKernels(
prm->clContext, 1, &prm->clDevice, "performStreamCollide_kernel", &clStatus);
// read kernel binary from file
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
prm->clProgram = clCreateProgramWithBinary(
prm->clContext, 1, &prm->clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
CHECK_ERROR("clCreateProgramWithSource")
//char clOptions[100];

View File

@@ -1,6 +1,10 @@
#ifndef __OCLH__
#define __OCLH__
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
cl_platform_id clPlatform;
cl_context_properties clCps[3];
@@ -22,4 +26,8 @@ typedef struct {
char* readFile(char*);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1216,9 +1216,24 @@ pb_InitOpenCLContext(struct pb_Parameters* parameters) {
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
clGetPlatformIDs(1, &platform_id, NULL);
clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err);
_err = clGetPlatformIDs(1, &platform_id, NULL);
if (_err != CL_SUCCESS) {
fprintf(stderr, "Error querying platform!\n");
exit(-1);
}
_err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
if (_err != CL_SUCCESS) {
fprintf(stderr, "Error querying device IDs!\n");
exit(-1);
}
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err);
if (_err != CL_SUCCESS) {
fprintf(stderr, "Error Creating device context!\n");
exit(-1);
}
pb_Context* c = (pb_Context*)malloc(sizeof(pb_Context));
c->clContext = context;

View File

@@ -0,0 +1,7 @@
PROJECT = matmul
SRCS = main.cc
OPTS ?= -n16
include ../common.mk

View File

@@ -0,0 +1,73 @@
__kernel void matmul(__global float *A,
__global float *B,
__global float *C,
const unsigned int N,
__local float *localA,
__local float *localB)
{
int row = get_global_id(1);
int col = get_global_id(0);
int localRow = get_local_id(1);
int localCol = get_local_id(0);
int localSize = get_local_size(0); // assuming square local size
float sum = 0.0f;
// Loop over all blocks of both matrices
for (int k = 0; k < N; k += localSize) {
// Load block of matrix A to local memory
localA[localRow * localSize + localCol] = A[row * N + k + localCol];
// Load block of matrix B to local memory, adjusting for column-major access
localB[localRow * localSize + localCol] = B[(k + localRow) * N + col];
// Synchronize to make sure the tiles are loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Multiply the two matrix blocks and accumulate result
for (int j = 0; j < localSize; j++) {
sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
}
// Synchronize before loading the next block
barrier(CLK_LOCAL_MEM_FENCE);
}
C[row * N + col] = sum;
}
/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
{
int globalRow = get_global_id(1);
int globalCol = get_global_id(0);
int localRow = get_local_id(1);
int localCol = get_local_id(0);
// Static local memory declaration
__local float localA[16][16];
__local float localB[16][16];
float sum = 0.0f;
// Iterate over blocks
for (int k = 0; k < N; k += 16) {
// Load a block of matrix A into local memory
localA[localRow][localCol] = A[globalRow * N + k + localCol];
// Load a block of matrix B into local memory
localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
// Ensure the entire block is loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Compute multiplication for this block
for (int j = 0; j < 16; j++) {
sum += localA[localRow][j] * localB[j][localCol];
}
// Wait until all threads have computed before loading the next block
barrier(CLK_LOCAL_MEM_FENCE);
}
C[globalRow * N + globalCol] = sum;
}*/

246
tests/opencl/matmul/main.cc Normal file
View File

@@ -0,0 +1,246 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <CL/opencl.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <chrono>
#include <vector>
#define LOCAL_SIZE 16
#define KERNEL_NAME "matmul"
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
if (_err == CL_SUCCESS) \
break; \
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
cleanup(); \
exit(-1); \
} while (0)
#define CL_CHECK2(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
decltype(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
cleanup(); \
exit(-1); \
} \
_ret; \
})
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return -1;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return 0;
}
static bool compare_equal(float a, float b, int ulp = 21) {
union fi_t { int i; float f; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
return std::abs(fa.i - fb.i) <= ulp;
}
static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
float sum = 0.0f;
for (int k = 0; k < N; k++) {
sum += A[i * N + k] * B[k * N + j];
}
C[i * N + j] = sum;
}
}
}
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_mem a_memobj = NULL;
cl_mem b_memobj = NULL;
cl_mem c_memobj = NULL;
uint8_t *kernel_bin = NULL;
static void cleanup() {
if (commandQueue) clReleaseCommandQueue(commandQueue);
if (kernel) clReleaseKernel(kernel);
if (program) clReleaseProgram(program);
if (a_memobj) clReleaseMemObject(a_memobj);
if (b_memobj) clReleaseMemObject(b_memobj);
if (c_memobj) clReleaseMemObject(c_memobj);
if (context) clReleaseContext(context);
if (device_id) clReleaseDevice(device_id);
if (kernel_bin) free(kernel_bin);
}
int size = 64;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "fn:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
int main (int argc, char **argv) {
// parse command arguments
parse_args(argc, argv);
printf("Matrix size=%d\n", size);
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
return -1;
}
cl_platform_id platform_id;
size_t kernel_size;
// Getting platform and device information
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
printf("Create context\n");
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
char device_string[1024];
clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
printf("Using device: %s\n", device_string);
printf("Allocate device buffers\n");
size_t nbytes = size * size * sizeof(float);
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
if (program == NULL) {
cleanup();
return -1;
}
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
// Create kernel
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
size_t global_size[2] = {size, size};
// Set kernel arguments
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
// Allocate memories for input arrays and output arrays.
std::vector<float> h_a(size * size);
std::vector<float> h_b(size * size);
std::vector<float> h_c(size * size);
// Initialize values for array members.
for (int i = 0; i < (size * size); ++i) {
#ifdef USE_FLOAT
h_a[i] = (float)rand() / (float)RAND_MAX;
h_b[i] = (float)rand() / (float)RAND_MAX;
#else
h_a[i] = rand();
h_b[i] = rand();
#endif
h_c[i] = 0xdeadbeef;
}
// Creating command queue
commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
printf("Upload source buffers\n");
CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a.data(), 0, NULL, NULL));
CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b.data(), 0, NULL, NULL));
printf("Execute the kernel\n");
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL));
CL_CHECK(clFinish(commandQueue));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
printf("Download destination buffer\n");
CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));
printf("Verify result\n");
std::vector<float> ref_vec(size * size);
matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
int errors = 0;
for (int i = 0; i < (size * size); i++) {
if (!compare_equal(h_c[i], ref_vec[i])) {
if (errors < 100)
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
++errors;
}
}
if (errors != 0) {
printf("FAILED! - %d errors\n", errors);
} else {
printf("PASSED!\n");
}
// Clean up
cleanup();
return errors;
}

View File

@@ -1,69 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = mri-q
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c file.cc computeQ.c
all: $(PROJECT).dump $(PROJECT).hex
CXXFLAGS += -I.
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
OPTS ?=
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
include ../common.mk

View File

@@ -1,6 +1,10 @@
#ifndef __COMPUTEQ__
#define __COMPUTEQ__
#ifdef __cplusplus
extern "C" {
#endif
void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm);
void computeQ_GPU (int numK,int numX,
cl_mem x_d, cl_mem y_d, cl_mem z_d,
@@ -11,4 +15,8 @@ void computeQ_GPU (int numK,int numX,
void createDataStructsCPU(int numK, int numX, float** phiMag,
float** Qr, float** Qi);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -9,6 +9,10 @@
#ifndef __GPUINFOH__
#define __GPUINFOH__
#ifdef __cplusplus
extern "C" {
#endif
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
@@ -17,4 +21,8 @@ void compute_active_thread(size_t *thread,
int minor,
int sm);
#ifdef __cplusplus
}
#endif
#endif

Binary file not shown.

Binary file not shown.

View File

@@ -34,6 +34,27 @@
#include "macros.h"
#include "computeQ.h"
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return CL_INVALID_VALUE;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return CL_INVALID_VALUE;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return CL_SUCCESS;
}
static void
setupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr,clPrmtr* clPrm)
{
@@ -93,8 +114,6 @@ main (int argc, char *argv[]) {
&x, &y, &z,
&phiR, &phiI);
printf("OK\n");
/* Reduce the number of k-space samples if a number is given
* on the command line */
if (argc < 2)
@@ -137,13 +156,20 @@ main (int argc, char *argv[]) {
pb_SetOpenCL(&(clPrm.clContext), &(clPrm.clCommandQueue));
printf("OK\n");
//const char* clSource[] = {readFile("src/opencl_base/kernels.cl")};
//cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
cl_program clProgram = clCreateProgramWithBuiltInKernels(
clPrm.clContext, 1, &clDevice, "ComputePhiMag_GPU;ComputeQ_GPU", &clStatus);
#ifdef HOSTGPU
const char* clSource[] = {readFile("kernel.cl")};
CHECK_ERROR("clCreateProgramWithSource")
cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
#else
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
CHECK_ERROR("read_kernel_file")
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("clCreateProgramWithSource")
cl_program clProgram = clCreateProgramWithBinary(
clPrm.clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
#endif
char options[50];
sprintf(options,"-I src/opencl_nvidia");

View File

@@ -3,6 +3,10 @@
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
cl_context clContext;
cl_command_queue clCommandQueue;
@@ -20,4 +24,8 @@ char* readFile(const char*);
exit(1); \
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,72 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
OPTS ?= filelist.txt
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
CXXFLAGS += -Wno-unused-variable -Wno-narrowing -Wno-unused-result -Wno-unused-but-set-variable
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = nearn
SRCS = main.cc clutils.cpp utils.cpp
all: $(PROJECT) kernel.pocl
OPTS ?= filelist.txt
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,2 @@
cane4_0.db
cane4_1.db
cane4_2.db
cane4_3.db
cane4_1.db

Binary file not shown.

View File

@@ -172,6 +172,7 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
printf("%f\n\n", (float)(totalTime / 1e9));
}
// 6. return finalized data and release buffers
clReleaseEvent(writeEvent);
clReleaseEvent(kernelEvent);

View File

@@ -1,71 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
OPTS ?= -n1
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = oclprintf
SRCS = main.cc
all: $(PROJECT) kernel.pocl
OPTS ?= -n1
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -1,71 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
OPTS ?= -f -n16
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-strict-aliasing -Wno-narrowing
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = psort
SRCS = main.cc
all: $(PROJECT) kernel.pocl
OPTS ?= -f -n16
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

Binary file not shown.

Some files were not shown because too many files have changed in this diff Show More