From 027cbddf5f962ae47517dbf4ffb0db4b11adeb1c Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 25 Nov 2019 12:55:11 -0500 Subject: [PATCH] sad --- benchmarks/opencl/sad/DESCRIPTION | 1 + benchmarks/opencl/sad/Makefile | 68 ++ benchmarks/opencl/sad/OpenCL_common.cpp | 298 +++++ benchmarks/opencl/sad/OpenCL_common.h | 22 + benchmarks/opencl/sad/args.c | 617 ++++++++++ benchmarks/opencl/sad/file.c | 55 + benchmarks/opencl/sad/file.h | 22 + benchmarks/opencl/sad/frame.bin | Bin 0 -> 50692 bytes benchmarks/opencl/sad/gpu_info.c | 55 + benchmarks/opencl/sad/gpu_info.h | 20 + benchmarks/opencl/sad/image.c | 56 + benchmarks/opencl/sad/image.h | 25 + benchmarks/opencl/sad/kernel.cl | 326 ++++++ benchmarks/opencl/sad/libsad.a | Bin 0 -> 17384 bytes benchmarks/opencl/sad/main.cc | 545 +++++++++ benchmarks/opencl/sad/ocl.c | 50 + benchmarks/opencl/sad/ocl.h | 21 + benchmarks/opencl/sad/parboil.c | 427 +++++++ benchmarks/opencl/sad/parboil.h | 348 ++++++ benchmarks/opencl/sad/parboil_opencl.c | 1394 +++++++++++++++++++++++ benchmarks/opencl/sad/reference.bin | Bin 0 -> 50692 bytes benchmarks/opencl/sad/sad.h | 83 ++ benchmarks/opencl/sad/sad_kernel.h | 57 + 23 files changed, 4490 insertions(+) create mode 100755 benchmarks/opencl/sad/DESCRIPTION create mode 100644 benchmarks/opencl/sad/Makefile create mode 100644 benchmarks/opencl/sad/OpenCL_common.cpp create mode 100644 benchmarks/opencl/sad/OpenCL_common.h create mode 100644 benchmarks/opencl/sad/args.c create mode 100644 benchmarks/opencl/sad/file.c create mode 100644 benchmarks/opencl/sad/file.h create mode 100755 benchmarks/opencl/sad/frame.bin create mode 100644 benchmarks/opencl/sad/gpu_info.c create mode 100644 benchmarks/opencl/sad/gpu_info.h create mode 100644 benchmarks/opencl/sad/image.c create mode 100644 benchmarks/opencl/sad/image.h create mode 100644 benchmarks/opencl/sad/kernel.cl create mode 100644 benchmarks/opencl/sad/libsad.a create mode 100644 benchmarks/opencl/sad/main.cc create mode 100644 benchmarks/opencl/sad/ocl.c create mode 100644 benchmarks/opencl/sad/ocl.h create mode 100644 benchmarks/opencl/sad/parboil.c create mode 100644 benchmarks/opencl/sad/parboil.h create mode 100644 benchmarks/opencl/sad/parboil_opencl.c create mode 100755 benchmarks/opencl/sad/reference.bin create mode 100644 benchmarks/opencl/sad/sad.h create mode 100644 benchmarks/opencl/sad/sad_kernel.h diff --git a/benchmarks/opencl/sad/DESCRIPTION b/benchmarks/opencl/sad/DESCRIPTION new file mode 100755 index 00000000..87ef107a --- /dev/null +++ b/benchmarks/opencl/sad/DESCRIPTION @@ -0,0 +1 @@ +Inputs: reference.bin frame.bin diff --git a/benchmarks/opencl/sad/Makefile b/benchmarks/opencl/sad/Makefile new file mode 100644 index 00000000..8b843513 --- /dev/null +++ b/benchmarks/opencl/sad/Makefile @@ -0,0 +1,68 @@ +RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops) +POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc) +POCL_INC_PATH = $(wildcard ../include) +POCL_LIB_PATH = $(wildcard ../lib) +VX_RT_PATH = $(wildcard ../../../runtime) +VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir) + +CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc +CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++ +DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump +HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy +GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb + +VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c +VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s +VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s +VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c +VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s +VX_SRCS += $(VX_RT_PATH)/tests/tests.c +VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c +VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) + +VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld + +CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32 +CXXFLAGS += -ffreestanding # program may not begin at main() +CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections +CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions +CXXFLAGS += -I$(POCL_INC_PATH) -I. + +VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a +QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a + +PROJECT = sad + +SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c file.c image.c OpenCL_common.cpp + +all: $(PROJECT).dump $(PROJECT).hex + +lib$(PROJECT).a: kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl + +$(PROJECT).elf: $(SRCS) lib$(PROJECT).a + $(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf + +$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a + $(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu + +$(PROJECT).hex: $(PROJECT).elf + $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex + +$(PROJECT).dump: $(PROJECT).elf + $(DMP) -D $(PROJECT).elf > $(PROJECT).dump + +run: $(PROJECT).hex + POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug + +qemu: $(PROJECT).qemu + POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu + +gdb-s: $(PROJECT).qemu + POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu + +gdb-c: $(PROJECT).qemu + $(GDB) $(PROJECT).qemu + +clean: + rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug \ No newline at end of file diff --git a/benchmarks/opencl/sad/OpenCL_common.cpp b/benchmarks/opencl/sad/OpenCL_common.cpp new file mode 100644 index 00000000..2e147e4c --- /dev/null +++ b/benchmarks/opencl/sad/OpenCL_common.cpp @@ -0,0 +1,298 @@ + + +#include "OpenCL_common.h" +#include +#include + +// -1 for NO suitable device found, 0 if an appropriate device was found +int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) { + + // Supported Device Requests (anything that returns cl_bool) + // CL_DEVICE_IMAGE_SUPPORT + // CL_DEVICE_HOST_UNIFIED_MEMORY + // CL_DEVICE_ERROR_CORRECTION_SUPPORT + // CL_DEVICE_AVAILABLE + // CL_DEVICE_COMPILER_AVAILABLE + + cl_uint numEntries = 16; + cl_platform_id clPlatforms[numEntries]; + cl_uint numPlatforms; + + cl_device_id clDevices[numEntries]; + cl_uint numDevices; + + OCL_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) ); + //fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms); + bool needDevice = true; + + for (int ip = 0; ip < numPlatforms && needDevice; ++ip) { + + cl_platform_id clPlatform = clPlatforms[ip]; + + OCL_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) ); + //fprintf(stderr, " Number of Devices found for Platform %d: %d\n", ip, numDevices); + + for (int id = 0; (id < numDevices) && needDevice ; ++id) { + cl_device_id clDevice = clDevices[id]; + cl_device_type clDeviceType; + + bool canSatisfy = true; + + if (reqDeviceType != NULL) { + OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL)); + if (*reqDeviceType != CL_DEVICE_TYPE_ALL) { + if (*reqDeviceType != clDeviceType) { + canSatisfy = false; + } + } + } + + va_list paramList; + va_start(paramList, numRequests); + for (int i = 0; (i < numRequests) && canSatisfy ; ++i) { + + cl_device_info devReq = va_arg( paramList, cl_device_info ); + cl_bool clInfoBool; + size_t infoRetSize = sizeof(cl_bool); + + OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL)); + if (clInfoBool != true) { + canSatisfy = false; + } + } + + va_end(paramList); + if (canSatisfy) { + *device = clDevice; + *platform = clPlatform; + needDevice = false; + if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) { + *reqDeviceType = clDeviceType; + } + } + } // End checking all devices for a platform + } // End checking all platforms + + int retVal = -1; + if (needDevice) { + retVal = -1; + } else { + retVal = 0; + } + + return retVal; + +} + +const char* oclErrorString(cl_int error) +{ +// From NVIDIA SDK + static const char* errorString[] = { + "CL_SUCCESS", + "CL_DEVICE_NOT_FOUND", + "CL_DEVICE_NOT_AVAILABLE", + "CL_COMPILER_NOT_AVAILABLE", + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + "CL_OUT_OF_RESOURCES", + "CL_OUT_OF_HOST_MEMORY", + "CL_PROFILING_INFO_NOT_AVAILABLE", + "CL_MEM_COPY_OVERLAP", + "CL_IMAGE_FORMAT_MISMATCH", + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + "CL_BUILD_PROGRAM_FAILURE", + "CL_MAP_FAILURE", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "CL_INVALID_VALUE", + "CL_INVALID_DEVICE_TYPE", + "CL_INVALID_PLATFORM", + "CL_INVALID_DEVICE", + "CL_INVALID_CONTEXT", + "CL_INVALID_QUEUE_PROPERTIES", + "CL_INVALID_COMMAND_QUEUE", + "CL_INVALID_HOST_PTR", + "CL_INVALID_MEM_OBJECT", + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + "CL_INVALID_IMAGE_SIZE", + "CL_INVALID_SAMPLER", + "CL_INVALID_BINARY", + "CL_INVALID_BUILD_OPTIONS", + "CL_INVALID_PROGRAM", + "CL_INVALID_PROGRAM_EXECUTABLE", + "CL_INVALID_KERNEL_NAME", + "CL_INVALID_KERNEL_DEFINITION", + "CL_INVALID_KERNEL", + "CL_INVALID_ARG_INDEX", + "CL_INVALID_ARG_VALUE", + "CL_INVALID_ARG_SIZE", + "CL_INVALID_KERNEL_ARGS", + "CL_INVALID_WORK_DIMENSION", + "CL_INVALID_WORK_GROUP_SIZE", + "CL_INVALID_WORK_ITEM_SIZE", + "CL_INVALID_GLOBAL_OFFSET", + "CL_INVALID_EVENT_WAIT_LIST", + "CL_INVALID_EVENT", + "CL_INVALID_OPERATION", + "CL_INVALID_GL_OBJECT", + "CL_INVALID_BUFFER_SIZE", + "CL_INVALID_MIP_LEVEL", + "CL_INVALID_GLOBAL_WORK_SIZE", + }; + + const int errorCount = sizeof(errorString) / sizeof(errorString[0]); + + const int index = -error; + + return (index >= 0 && index < errorCount) ? errorString[index] : ""; +} + + +const char* oclDebugErrString(cl_int error, cl_device_id device) +{ +// From NVIDIA SDK + static const char* errorString[] = { + "CL_SUCCESS", + "CL_DEVICE_NOT_FOUND", + "CL_DEVICE_NOT_AVAILABLE", + "CL_COMPILER_NOT_AVAILABLE", + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + "CL_OUT_OF_RESOURCES", + "CL_OUT_OF_HOST_MEMORY", + "CL_PROFILING_INFO_NOT_AVAILABLE", + "CL_MEM_COPY_OVERLAP", + "CL_IMAGE_FORMAT_MISMATCH", + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + "CL_BUILD_PROGRAM_FAILURE", + "CL_MAP_FAILURE", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "CL_INVALID_VALUE", + "CL_INVALID_DEVICE_TYPE", + "CL_INVALID_PLATFORM", + "CL_INVALID_DEVICE", + "CL_INVALID_CONTEXT", + "CL_INVALID_QUEUE_PROPERTIES", + "CL_INVALID_COMMAND_QUEUE", + "CL_INVALID_HOST_PTR", + "CL_INVALID_MEM_OBJECT", + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + "CL_INVALID_IMAGE_SIZE", + "CL_INVALID_SAMPLER", + "CL_INVALID_BINARY", + "CL_INVALID_BUILD_OPTIONS", + "CL_INVALID_PROGRAM", + "CL_INVALID_PROGRAM_EXECUTABLE", + "CL_INVALID_KERNEL_NAME", + "CL_INVALID_KERNEL_DEFINITION", + "CL_INVALID_KERNEL", + "CL_INVALID_ARG_INDEX", + "CL_INVALID_ARG_VALUE", + "CL_INVALID_ARG_SIZE", + "CL_INVALID_KERNEL_ARGS", + "CL_INVALID_WORK_DIMENSION", + "CL_INVALID_WORK_GROUP_SIZE", + "CL_INVALID_WORK_ITEM_SIZE", + "CL_INVALID_GLOBAL_OFFSET", + "CL_INVALID_EVENT_WAIT_LIST", + "CL_INVALID_EVENT", + "CL_INVALID_OPERATION", + "CL_INVALID_GL_OBJECT", + "CL_INVALID_BUFFER_SIZE", + "CL_INVALID_MIP_LEVEL", + "CL_INVALID_GLOBAL_WORK_SIZE", + }; + + const int errorCount = sizeof(errorString) / sizeof(errorString[0]); + + const int index = -error; + + if (index == 4) { + cl_uint maxMemAlloc = 0; + + OCL_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) ); + + + fprintf(stderr, " Device Maximum block allocation size: %lu\n", maxMemAlloc); + } + + return (index >= 0 && index < errorCount) ? errorString[index] : ""; +} + +char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength) +{ + // locals + FILE* pFileStream = NULL; + size_t szSourceLength; + + // open the OpenCL source code file + #ifdef _WIN32 // Windows version + if(fopen_s(&pFileStream, cFilename, "rb") != 0) + { + return NULL; + } + #else // Linux version + pFileStream = fopen(cFilename, "rb"); + if(pFileStream == 0) + { + return NULL; + } + #endif + + size_t szPreambleLength = strlen(cPreamble); + szPreambleLength = 0; + + // get the length of the source code + fseek(pFileStream, 0, SEEK_END); + szSourceLength = ftell(pFileStream); + fseek(pFileStream, 0, SEEK_SET); + + // allocate a buffer for the source code string and read it in + char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); + memcpy(cSourceString, cPreamble, szPreambleLength); + if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1) + { + fclose(pFileStream); + free(cSourceString); + return 0; + } + + // close the file and return the total length of the combined (preamble + source) string + fclose(pFileStream); + if(szFinalLength != 0) + { + *szFinalLength = szSourceLength + szPreambleLength; + } + cSourceString[szSourceLength + szPreambleLength] = '\0'; + + return cSourceString; +} diff --git a/benchmarks/opencl/sad/OpenCL_common.h b/benchmarks/opencl/sad/OpenCL_common.h new file mode 100644 index 00000000..c5180053 --- /dev/null +++ b/benchmarks/opencl/sad/OpenCL_common.h @@ -0,0 +1,22 @@ + +#ifndef __OPENCL_COMMON_H_ +#define __OPENCL_COMMON_H_ + +#include +#include +#include + +int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...); +const char* oclErrorString(cl_int error); +const char* oclDebugErrString(cl_int error, cl_device_id device); + +#define OCL_ERRCK_VAR(var) \ + { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); } + +#define OCL_ERRCK_RETVAL(s) \ + { cl_int clerr = (s);\ + if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); } + +char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength); + +#endif diff --git a/benchmarks/opencl/sad/args.c b/benchmarks/opencl/sad/args.c new file mode 100644 index 00000000..9d751e29 --- /dev/null +++ b/benchmarks/opencl/sad/args.c @@ -0,0 +1,617 @@ + +#include +#include +#include +#include +#include +#include + +/*****************************************************************************/ +/* Memory management routines */ + +/* Free an array of owned strings. */ +void +pb_FreeStringArray(char **string_array) +{ + char **p; + + if (!string_array) return; + for (p = string_array; *p; p++) free(*p); + free(string_array); +} + +struct pb_PlatformParam * +pb_PlatformParam(char *name, char *version) +{ + if (name == NULL) { + fprintf(stderr, "pb_PlatformParam: Invalid argument\n"); + exit(-1); + } + + struct pb_PlatformParam *ret = + (struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam)); + + ret->name = name; + ret->version = version; + return ret; +} + +void +pb_FreePlatformParam(struct pb_PlatformParam *p) +{ + if (p == NULL) return; + + free(p->name); + free(p->version); + free(p); +} + +struct pb_DeviceParam * +pb_DeviceParam_index(int index) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_INDEX; + ret->index = index; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_cpu(void) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_CPU; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_gpu(void) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_GPU; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_accelerator(void) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_ACCELERATOR; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_name(char *name) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_NAME; + ret->name = name; + return ret; +} + +void +pb_FreeDeviceParam(struct pb_DeviceParam *p) +{ + if (p == NULL) return; + + switch(p->criterion) { + case pb_Device_NAME: + free(p->name); + break; + case pb_Device_INDEX: + case pb_Device_CPU: + case pb_Device_ACCELERATOR: + break; + default: + fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n"); + exit(-1); + } +} + +void +pb_FreeParameters(struct pb_Parameters *p) +{ + free(p->outFile); + pb_FreeStringArray(p->inpFiles); + pb_FreePlatformParam(p->platform); + pb_FreeDeviceParam(p->device); + free(p); +} + +/*****************************************************************************/ + +/* Parse a comma-delimited list of strings into an + * array of strings. */ +static char ** +read_string_array(char *in) +{ + char **ret; + int i; + int count; /* Number of items in the input */ + char *substring; /* Current substring within 'in' */ + + /* Count the number of items in the string */ + count = 1; + for (i = 0; in[i]; i++) if (in[i] == ',') count++; + + /* Allocate storage */ + ret = (char **)malloc((count + 1) * sizeof(char *)); + + /* Create copies of the strings from the list */ + substring = in; + for (i = 0; i < count; i++) { + char *substring_end; + int substring_length; + + /* Find length of substring */ + for (substring_end = substring; + (*substring_end != ',') && (*substring_end != 0); + substring_end++); + + substring_length = substring_end - substring; + + /* Allocate memory and copy the substring */ + ret[i] = (char *)malloc(substring_length + 1); + memcpy(ret[i], substring, substring_length); + ret[i][substring_length] = 0; + + /* go to next substring */ + substring = substring_end + 1; + } + ret[i] = NULL; /* Write the sentinel value */ + + return ret; +} + +static void +report_parse_error(const char *str) +{ + fputs(str, stderr); +} + +/* Interpret a string as a 'pb_DeviceParam' value. + * Return a pointer to a new value, or NULL on failure. + */ +static struct pb_DeviceParam * +read_device_param(char *str) +{ + /* Try different ways of interpreting 'device_string' until one works */ + + /* If argument is an integer, then interpret it as a device index */ + errno = 0; + char *end; + long device_int = strtol(str, &end, 10); + if (!errno) { + /* Negative numbers are not valid */ + if (device_int < 0 || device_int > INT_MAX) return NULL; + + return pb_DeviceParam_index(device_int); + } + + /* Match against predefined strings */ + if (strcmp(str, "CPU") == 0) + return pb_DeviceParam_cpu(); + if (strcmp(str, "GPU") == 0) + return pb_DeviceParam_gpu(); + if (strcmp(str, "ACCELERATOR") == 0) + return pb_DeviceParam_accelerator(); + + /* Assume any other string is a device name */ + return pb_DeviceParam_name(strdup(str)); +} + +/* Interpret a string as a 'pb_PlatformParam' value. + * Return a pointer to a new value, or NULL on failure. + */ +static struct pb_PlatformParam * +read_platform_param(char *str) +{ + int separator_index; /* Index of the '-' character separating + * name and version number. It's -1 if + * there's no '-' character. */ + + /* Find the last occurrence of '-' in 'str' */ + { + char *cur; + separator_index = -1; + for (cur = str; *cur; cur++) { + if (*cur == '-') separator_index = cur - str; + } + } + + /* The platform name is either the entire string, or all characters before + * the separator */ + int name_length = separator_index == -1 ? strlen(str) : separator_index; + char *name_str = (char *)malloc(name_length + 1); + memcpy(name_str, str, name_length); + name_str[name_length] = 0; + + /* The version is either NULL, or all characters after the separator */ + char *version_str; + if (separator_index == -1) { + version_str = NULL; + } + else { + const char *version_input_str = str + separator_index + 1; + int version_length = strlen(version_input_str); + + version_str = (char *)malloc(version_length + 1); + memcpy(version_str, version_input_str, version_length); + version_str[version_length] = 0; + } + + /* Create output structure */ + return pb_PlatformParam(name_str, version_str); +} + +/****************************************************************************/ +/* Argument parsing state */ + +/* Argument parsing state. + * + * Arguments that are interpreted by the argument parser are removed from + * the list. Variables 'argc' and 'argn' do not count arguments that have + * been removed. + * + * During argument parsing, the array of arguments is compacted, overwriting + * the erased arguments. Variable 'argv_put' points to the array element + * where the next argument will be written. Variable 'argv_get' points to + * the array element where the next argument will be read from. + */ +struct argparse { + int argc; /* Number of arguments. Mutable. */ + int argn; /* Current argument index. */ + char **argv_get; /* Argument value being read. */ + char **argv_put; /* Argument value being written. + * argv_put <= argv_get. */ +}; + +static void +initialize_argparse(struct argparse *ap, int argc, char **argv) +{ + ap->argc = argc; + ap->argn = 0; + ap->argv_get = ap->argv_put = argv; +} + +/* Finish argument parsing, without processing the remaining arguments. + * Write new argument count into _argc. */ +static void +finalize_argparse(struct argparse *ap, int *_argc, char **argv) +{ + /* Move the remaining arguments */ + for(; ap->argn < ap->argc; ap->argn++) + *ap->argv_put++ = *ap->argv_get++; + + /* Update the argument count */ + *_argc = ap->argc; + + /* Insert a terminating NULL */ + argv[ap->argc] = NULL; +} + +/* Delete the current argument. The argument will not be visible + * when argument parsing is done. */ +static void +delete_argument(struct argparse *ap) +{ + if (ap->argn >= ap->argc) { + fprintf(stderr, "delete_argument\n"); + } + ap->argc--; + ap->argv_get++; +} + +/* Go to the next argument. Also, move the current argument to its + * final location in argv. */ +static void +next_argument(struct argparse *ap) +{ + if (ap->argn >= ap->argc) { + fprintf(stderr, "next_argument\n"); + } + /* Move argument to its new location. */ + *ap->argv_put++ = *ap->argv_get++; + ap->argn++; +} + +static int +is_end_of_arguments(struct argparse *ap) +{ + return ap->argn == ap->argc; +} + +/* Get the current argument */ +static char * +get_argument(struct argparse *ap) +{ + return *ap->argv_get; +} + +/* Get the current argument, and also delete it */ +static char * +consume_argument(struct argparse *ap) +{ + char *ret = get_argument(ap); + delete_argument(ap); + return ret; +} + +/****************************************************************************/ + +/* The result of parsing a command-line argument */ +typedef enum { + ARGPARSE_OK, /* Success */ + ARGPARSE_ERROR, /* Error */ + ARGPARSE_DONE /* Success, and do not continue parsing */ +} result; + +typedef result parse_action(struct argparse *ap, struct pb_Parameters *params); + + +/* A command-line option */ +struct option { + char short_name; /* If not 0, the one-character + * name of this option */ + const char *long_name; /* If not NULL, the long name of this option */ + parse_action *action; /* What to do when this option occurs. + * Sentinel value is NULL. + */ +}; + +/* Output file + * + * -o FILE + */ +static result +parse_output_file(struct argparse *ap, struct pb_Parameters *params) +{ + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting file name after '-o'\n"); + return ARGPARSE_ERROR; + } + + /* Replace the output file name */ + free(params->outFile); + params->outFile = strdup(consume_argument(ap)); + + return ARGPARSE_OK; +} + +/* Input files + * + * -i FILE,FILE,... + */ +static result +parse_input_files(struct argparse *ap, struct pb_Parameters *params) +{ + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting file name after '-i'\n"); + return ARGPARSE_ERROR; + } + + /* Replace the input file list */ + pb_FreeStringArray(params->inpFiles); + params->inpFiles = read_string_array(consume_argument(ap)); + return ARGPARSE_OK; +} + +/* End of options + * + * -- + */ + +static result +parse_end_options(struct argparse *ap, struct pb_Parameters *params) +{ + return ARGPARSE_DONE; +} + +/* OpenCL device + * + * --device X + */ + +static result +parse_device(struct argparse *ap, struct pb_Parameters *params) +{ + /* Read the next argument, which specifies a device */ + + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting device specification after '--device'\n"); + return ARGPARSE_ERROR; + } + + char *device_string = consume_argument(ap); + struct pb_DeviceParam *device_param = read_device_param(device_string); + + if (!device_param) { + report_parse_error("Unrecognized device specification format on command line\n"); + return ARGPARSE_ERROR; + } + + /* Save the result */ + pb_FreeDeviceParam(params->device); + params->device = device_param; + + return ARGPARSE_OK; +} + +static result +parse_platform(struct argparse *ap, struct pb_Parameters *params) +{ + /* Read the next argument, which specifies a platform */ + + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting device specification after '--platform'\n"); + return ARGPARSE_ERROR; + } + + char *platform_string = consume_argument(ap); + struct pb_PlatformParam *platform_param = read_platform_param(platform_string); + + if (!platform_param) { + report_parse_error("Unrecognized platform specification format on command line\n"); + return ARGPARSE_ERROR; + } + + /* Save the result */ + pb_FreePlatformParam(params->platform); + params->platform = platform_param; + + return ARGPARSE_OK; +} + + +static struct option options[] = { + { 'o', NULL, &parse_output_file }, + { 'i', NULL, &parse_input_files }, + { '-', NULL, &parse_end_options }, + { 0, "device", &parse_device }, + { 0, "platform", &parse_platform }, + { 0, NULL, NULL } +}; + +static int +is_last_option(struct option *op) +{ + return op->action == NULL; +} + +/****************************************************************************/ + +/* Parse command-line parameters. + * Return zero on error, nonzero otherwise. + * On error, the other outputs may be invalid. + * + * The information collected from parameters is used to update + * 'ret'. 'ret' should be initialized. + * + * '_argc' and 'argv' are updated to contain only the unprocessed arguments. + */ +static int +pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv) +{ + char *err_message; + struct argparse ap; + + /* Each argument */ + initialize_argparse(&ap, *_argc, argv); + while(!is_end_of_arguments(&ap)) { + result arg_result; /* Result of parsing this option */ + char *arg = get_argument(&ap); + + /* Process this argument */ + if (arg[0] == '-') { + /* Single-character flag */ + if ((arg[1] != 0) && (arg[2] == 0)) { + delete_argument(&ap); /* This argument is consumed here */ + + /* Find a matching short option */ + struct option *op; + for (op = options; !is_last_option(op); op++) { + if (op->short_name == arg[1]) { + arg_result = (*op->action)(&ap, ret); + goto option_was_processed; + } + } + + /* No option matches */ + report_parse_error("Unexpected command-line parameter\n"); + arg_result = ARGPARSE_ERROR; + goto option_was_processed; + } + + /* Long flag */ + if (arg[1] == '-') { + delete_argument(&ap); /* This argument is consumed here */ + + /* Find a matching long option */ + struct option *op; + for (op = options; !is_last_option(op); op++) { + if (op->long_name && strcmp(&arg[2], op->long_name) == 0) { + arg_result = (*op->action)(&ap, ret); + goto option_was_processed; + } + } + + /* No option matches */ + report_parse_error("Unexpected command-line parameter\n"); + arg_result = ARGPARSE_ERROR; + goto option_was_processed; + } + } + else { + /* Other arguments are ignored */ + next_argument(&ap); + arg_result = ARGPARSE_OK; + goto option_was_processed; + } + + option_was_processed: + /* Decide what to do next based on 'arg_result' */ + switch(arg_result) { + case ARGPARSE_OK: + /* Continue processing */ + break; + + case ARGPARSE_ERROR: + /* Error exit from the function */ + return 0; + + case ARGPARSE_DONE: + /* Normal exit from the argument parsing loop */ + goto end_of_options; + } + } /* end for each argument */ + + /* If all arguments were processed, then normal exit from the loop */ + + end_of_options: + finalize_argparse(&ap, _argc, argv); + return 1; +} + +/*****************************************************************************/ +/* Other exported functions */ + +struct pb_Parameters * +pb_ReadParameters(int *_argc, char **argv) +{ + struct pb_Parameters *ret = + (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters)); + + /* Initialize the parameters structure */ + ret->outFile = NULL; + ret->inpFiles = (char **)malloc(sizeof(char *)); + ret->inpFiles[0] = NULL; + ret->platform = NULL; + ret->device = NULL; + + /* Read parameters and update _argc, argv */ + if (!pb_ParseParameters(ret, _argc, argv)) { + /* Parse error */ + pb_FreeParameters(ret); + return NULL; + } + + return ret; +} + +int +pb_Parameters_CountInputs(struct pb_Parameters *p) +{ + int n; + + for (n = 0; p->inpFiles[n]; n++); + return n; +} + diff --git a/benchmarks/opencl/sad/file.c b/benchmarks/opencl/sad/file.c new file mode 100644 index 00000000..5187c7f7 --- /dev/null +++ b/benchmarks/opencl/sad/file.c @@ -0,0 +1,55 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#include +#include "file.h" + +unsigned short +read16u(FILE *f) +{ + int n; + + n = fgetc(f); + n += fgetc(f) << 8; + + return n; +} + +short +read16i(FILE *f) +{ + int n; + + n = fgetc(f); + n += fgetc(f) << 8; + + return n; +} + +void +write32u(FILE *f, unsigned int i) +{ + putc(i, f); + putc(i >> 8, f); + putc(i >> 16, f); + putc(i >> 24, f); +} + +void +write16u(FILE *f, unsigned short h) +{ + putc(h, f); + putc(h >> 8, f); +} + +void +write16i(FILE *f, short h) +{ + putc(h, f); + putc(h >> 8, f); +} diff --git a/benchmarks/opencl/sad/file.h b/benchmarks/opencl/sad/file.h new file mode 100644 index 00000000..5d783e91 --- /dev/null +++ b/benchmarks/opencl/sad/file.h @@ -0,0 +1,22 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned short read16u(FILE *f); +short read16i(FILE *f); + +void write32u(FILE *f, unsigned int i); +void write16u(FILE *f, unsigned short h); +void write16i(FILE *f, short h); + +#ifdef __cplusplus +} +#endif diff --git a/benchmarks/opencl/sad/frame.bin b/benchmarks/opencl/sad/frame.bin new file mode 100755 index 0000000000000000000000000000000000000000..f8142c46a0cf2ce6695a639d213ad50cefd8c605 GIT binary patch literal 50692 zcmZwQ_t#$4eJ%Ru?>NVC?j2{Gli0aQ96ND~9b<3@8=GEinuI8#NJvN^At68wga9F` z=n}{X(Gf)w)pQVuE{bbn?2~%iFZYalZ|c%dn7o;NzdzZP=kE>MFD5_F>t9TMncq$Qy`OKto4x*g z^4#Q>?B()oS50o5Tsb*wa{A=#$+pBr~(&YKP z|Lo+v{ATrJd49Wm@^s#RBgcO`N9Y;7nQ!mR_Da70Qa*cO^3dd_$vKltCp)s|H}kDs z`R+UU#J66~{$8Bik!)%$fA}ujZJq=JgGexs&Viz1On0XM)1hdH!V1{p867 zd9`iw7dfk^vftn4h;L7}=Go4?`+wzH{pR5u>zO?NmptE=&;L2s`Rif(Wv=$ulfTRM ze!lV39R0@0J$d$a&TUuzVEj(5cUSiF#^i&%emjr0_Xngrdn=#5H(>To&dWZo%y!k} znq2MT$r+O~Cl}`0cH~GlzqEajUwHoBfWzxKUmKJ?J~Nu&p6SK#MgYVOm`zCj$o$d^}D<@Cn^+l5h!lQfh{)+75-n`zP zJ>NFj5~lt-SNf~`{W`q;bQ`&ypgHn}TIcp}&PdiHeBWPOm`ncn+a`1-+s?ce1*wx^xe1>3DdtLuqce`k(T z^x%oZSUh4p;_^-~`(?iSU=UrNZEadQd16`z^CotIv4@#N8b=8WFV+35Li?h6m+ z=UVKu-}6m<;?;a_NA~#ITO`CvsY zVT^6hQ_V0{kx$0 z+w}Yo^Qd3`ZQlP)9)CAn>px8XA+O)encNuO-jcR_F#E#KqNw-x8NDmc6aykKW@UaJ z*1nZvZ-^RP7&}CBm*l+X2Zt&G%cpnl8W43Q(5ouoaCO4o-y5zS6ZP4uB3tL#TVWLq zUzwvUjM_b!z2N0*ISX3pdysiO?elD~dolm-2}93JpL}!JU`tKlYoE{MJGZBIT^#+Z zY7}Ly`&t-sVy@`foQKbz4yKPpWv`fgGH0Ci-{t~K~+wHyl=Arz4MYc0=rzLe`>(ct@P+TyWnTf@LV zjAeWw=ZvL)o5!lduXCNQ1y_DGD8hSblTE>6O|X3;d!ay(Et0U*r`ZqwkQSj2wJ1D? zU*!37VaQcE-tvLBYhuMtOCP=T9_!NB+ftvR}cIQft9}0(EWvR&Zgh@vy#D zHCT^+Jxt=8y$}Y?$?;aEhb$j_N1SAXU(C1g_~ks~6}*_M+8zzPHGDfK%yahi4vTjV z*Xt-SzH;yvC=*^o0>W>ng_njYcV@4z<+$wN6M6N?^q*fuQGb=Ko)>G4x4+C){5*K` z61IioSELWz9^Sr`W9kF?*q-Q8tlIQ?y|{ucuzzdtS=K~tSEU#6zZM1A*?F}&XX5+% z?XLVEEf_RCFT&8}ci?W-=AG>OnJ{ig{=@V2T$i)LO53wJ=NEF#+r!_Px$+f3<$}C& zPP_68IJ0FCswc*OeJYr&k8NBK--UM6cXkAAr!8pFntbEVJTF&^FOTW*DA8HzpL`~J zdm>l&$+YdRVED@cU7pv-#g}v4)OK6Eh|AJy8wU%Cv-NJAp=);SdNd_~5*F*d&J7+S zfg6M9s<4)SdtuIgcDQ?a_{93Yk^T66v8^9QrSu)IYR}+Wd>dal3;ca9S0cK=8XJWu zYOMZ4Vd4|PRxeqRYhRXg!OI=_EpApzMixJtD||BjdR^LLLC$X1znz<-?Fhaw-jQ!D z&*yh%A2bQ8{nu^}MvoR`4-kGioVsVgSM=0gh^@a2KHFjuu1f1~jUBoo+xl2d(az5N zy_?_Kv#Z**8wKooFUJw7xKiI@@kJ;%rgc`O&9BTp&&`?7N-JNNbJ&&E`(Sr#Xr*02 z=7StvoAlRX>lNX5^NG*+d^>Y)&*f-O=R9`gcWs}3ziL9S-pV;Xoxf*u+>?XJh0%(| zxe}axDQ604>I$EivxyHLPoKCy=X}=W{9s*;#~D`6_xjt1vLD{CHhebw^1hz&b~JQm zj>JQyD)$fm17GylQQ6-`Cw`N&dLXEt9~HVf$X*>q+L|kQHph21>=eDP^{o|$WEtDO z$?{X%2f~C)!_#ZC7YcYzww<5zxi!CjVK@`V7CXN;JW@Qq7G{ll$eOV_THv{OvG}?sOua7bogYto zep>6{{7&qLt#ko~wOaO4TlR|_K|4<68yZp5_WCA2&mMjk1phhPKMjw4_D^~BPr-NG zeCqY@g7l7HdukA&PK$Hh)Xcd~A;*)JE7%h3wuZ$U(-KR==y$@bDhdCjt->vJG;OE10 zzRceTvk|!wNu0&pkK)@;``-VSJzYP<-7*skgD?x(h@(EPi1*1bgJpa?ym~ag*u0?g ze69-@==x|8U&*^Kgo|_Y`5pN_g`l0bcY~5>`@a1D>ENGj&z`Q%_S2v+e{w=B!M3#H ztNErr=yz}C>T12J+jAjz2g2dnI_VBRE-)Kx<#(10bJWnC%!t9ef(!J@Nd@{?ko6rw(wwsFbmZ-B4NF%e(gVIW=GJfU4ppd6 zA#!BJMn;aToO-?IJ0s2?|8(SV-~aod`@aV1Ejh;(>HBAw?sE0~T=UCPr&?{B%@tL6}ZO8cWg<+`NA9&)yEd!1MD zdIg@1+lcV^zc_vz%{^cHrEmLRTlf3C`dzpoo?AF@xUoZ{U{(&oPX({%<3rsspnY`^ zzA*@~hp?led|dI{Yw4RX)v$ct(M9Y++I!h_WXYjP=;@Wtv499Ff*obHUkNAw%YDx`Y zhmXbE|EcF6-A7+R`0s=6zvS4fa^A*$=0wlLzTz{H^K)S@uU72+co6*X<(iuTy(fx2H{5F8q6~&*BMw|$5<6&2 zgoVP$H{xZ>??A8WBO3)*sIrn;0b=h5d3oDaX`|Hx!tLAdC$X>c^jb9N#JQr?@-f3*mnbJLd(n^TXjYqOUvv zaf3LHAN*AQzZk6f4NvF#jYrOnGU6STsQ+5kmOXeV9KJStvGp4d1>Y3|ciRf0DRBf( zS{txX@1*_i&+jgbTJaJtiQh3Js&Iak-xxD~Vt6_0(N=_;D<88I5Y+!il;I9ptYr^> zk!$`aws09y95ME~&p(Ri_|2(z$6t*9i!JsxH$me!C!U%UkNE5`x!4#9vOSlLH!L?T z=kahN;?0B4ac^E@F7C)m)prwfJs7X|=HS&#(ChilCxaT>$*&l_vueki1EZcvbaF{l z3bt1S<9V@V3*)V*KH%HxCsRJx6t=|;KcNb3?VuH!E5}{!yEA(|L=+P z6plr63e!FD-4pk*eMaA*?1k^|^7q?Z$9w7Va}y(+6Hh`cW^_gd_W1!LBbEBT5u``r z)ye*x6CHRsi0R>aE@gypeS~Y%7dGw2_u=?w7Y=PEe8$_-A7$D$4(Q4vHjg`Ia*Oha&&b5$ zI==G1819Mb)bpudkNDPmz+mI#ALe{S>rV#V)51JVx+u7GY?!~MVuY{zVlaa61Mx)f z2!hM=8@^Sejp~H#m$>fod=9Til0TatHs2PdkbQX}Ou8_b%0%V+uBwlMv^ zoc*IY>vPi%&*d7c!g2`B%HaN!xw_@?TDOP4^O7UFC>(k^f0w48EYD{b4cN{K0=K6n zS@$syS$kCuj{L0!!&X|P4LTmzN3Y}{WFcQk>))K#y(q^}!*^4x%Toi3sKtlj@9%eW zc*J8*WMNnh=(F+l#~C~_d{2BouD{fw_^L$Gm){L~ss+wZ-#;_w_h{b3T;qF`fKtc` z%lSNeHE()J68I)*t+VJY^@7>+3 zr7i8{)l>GYx+~J1QU>fD@%10x9aDQFdayBjKQDR&*_#uatB0bzA4Ylk5wM-oh*2e` zP@F<_L~dm3NB>WKZ$w@zjV*@3xI5ze+qAx{ps|OoL8A!I3)dbEzA6CaxSG$w;3sk= z+ryJBvHpv~mFI&f4nw&039`el1l0@DR(}|O>%r_}ZQ4#o{FL~E=Y_RT<+^{HYi5Jk zJjfMWk1%bXoHtv4p-P|)dmL5_`tp|)T{L}3wC0wu_KG}Tnd7WZ@8dOmn06f5JB9Fw z$bZD)5%rHE3-=Lw-x!%ZwI>QR;@di<57V=Drl+&;uLrqvg2BSU&sC}O=AaV%s@6w7 zo?b8eLnofd-+A#>pU)LH-rz^5TG1-6#TG6~{`Z9NYJ0q{IT@Fu2%EFVU;bMrp?!YD zw-yUC%U;UMVo`|p4xBC9`MIy;7}pQTUKS--AJ$%#Ua~aDU6^f2_=3AUrCu|u^Y8Kf zsQyl&_U|#?^Rp4#k&TzG`IFjko`~5nvG%C`Dw8=QI)sTnlPiItXI9PDfW&EP#wTr zd@Zw$aqx(FxNoKh+?73^pBB3!jJ_OtSMPbOn9q(={}9x!y(FdM^EgC+yBEBLKUuU-(#J~Lnm zEDU}!zu6LnSU5y}IL^ZPkHccPt&>kALu2G&U4Bu29}oHd(Df)PfNB}qEMpl9*S=rd zDZ=d-hzf#BqXz2tu1o95VV;vsCS_jE=aP(gJe6Oo$r4jwaJMS(h%YooWZK6^HWs^& z#%dqOdu#{4nNo=ALyuMq_uE{Bs%SL`Dz4rQiYo_i?eut1tD?7!9cYAn1dVts>a4c? z;ath;^v7HBJIEGY@i0w)DEvJs{C#mivN;;q!mk+^@3kJM`1)k@bv@G~>eb5T?1+}% z5M)=URgLPY);TYD%473<&W`3^pEiFry+kcbD`Cstk0Lz!X+^mBjMz^79})G=_`Il& zkKQl7qrLq19@>XFb0h9DznjBX{z4IU-r}llVPtDmAuL*;!p4Ul557014bBdVvg&fU z;u7DN6Z=H`TqER09qEUSCg|n33soE~mQ@Gu*{H~vQINa5HF}`F?E0wEHA8H|mpmie zoJ8dpM0@YeHRvVXGnHrCK929G5>psYv2RaoKZDXek?2tBw`%3!Y z8G{a=96R#hz+1f@zC18B4=)G#J92e*Mt{&yWUK&tZ7oUoy zJ1xHJ-BARUBt|&+8*+{!iMztDyAp3)kni4}YZaArbnVgI_}2d_OQAdEb9vqx2woT8 ztJ$?(X~R3x<5x#*uZ|wv9PTb3te6=7jGWQ3;JqzpFj`IXKRSY}6~)T%h3ts0KKmcx zJ1Xah-<}AM&qpkaC={o3c0_UPEp5%Vo;wDy{xZJw){sulCejud4`JrDO7PGiJ72mS0{L zJy;*+u1?!6%63l9!5oFP(W>Y2J@rs+m*yfy>!_NCZL9BF{jEk!;X88I|8j189D6@) zz|Qhh1&KI?EvcW+55-EO;#F7d#7((bOwrDNJGAky^A6_v9{;wzU*_zM^w8PaDBk(O z(fl^`j;i|P+4;lI2I0-&@ZIUjvw}@Wg5Qph0bex_`fZgJySlY6*NUl!YAkklj(0_v zdtLZ>b36n!YYT!Z4`Eqe|1`%_*VvkA?byAIU)hgOy+`5K|S8c-X2LG zxFWXdy7Yt%gNJ~%X9wANd0ZYYKQVAzCPyUBbNguCV)X3f=i$qujkaxyzay$s$c|nB z)vP^&DVB<^QB?9TMpJBJ>7Pk43zkdN`5M+S7a<`=dR zYshcUzFr6xMic3)ks|$^=SGLG558xHM!&n zBCNVJytqDn&?t|Q2yJnH`t^gmQOE9WVZL$QM#CMclx3%Lw?^1R@=B#1W0F+V-)v_z*T=I`WSa%#rdX6L>6`OfOJ$*lvr_Xer^ zf;MJD_mQxATR7Od8j4;Pi_{(u(~gf4pPfCPlV4C@{58h>Z1_)A%ogx`WAI%OeBdiD zA+~GA@!fpm;rMaa4L0n?{J(tQuZq_B!PZ>RHR*59=2zx%HN(cE237dM@q*yHEZYTnXUTx;IlI;BjG#Cx z$j%J1$4rjQ_LEd^9h>U78Oar%kjE2)wEWUlvBh@}_}-gi;IYg(gddLT@PDYJyqP)# zR_yh(@9}A=dC9Pylke+Oa%!!7XD#K3D3`fe=jM#6zm3(5L_HSd&HKA5Uf1=();PzK z!HS6&Rty^Pc#g}zY9t0#8Vk9iT9v8ZKnbJULwwKt<@#pgCNq{ONbSFvU2o)jK9y^} zE$3oh^`aoTB1*Jkz;s!#wdwzh!V?zu%%FPufa&oWFFY>D{&;f49UuM+YOEo}X*GJeK63`I9MD?RIhA zTN=hv0oIjmh3cun>clW}dT>2^J#Fu>M~D?-#5O3-(8c z<^0NXqBpdH6)e8&ASSb#Fg7~z#?Y&ba_}Xl<+$d@&ku%_OYTj+2d+knt$J{O^ya$Y zyELC{2v64pL94EM2W0 z-=Vx^shf$GaZ)+>Ky22f;ic<_>!O?oML0L@i@nPS%{@EkH7Dr6mA=jhAE)PU#%=|` zP1{Msrzhr_aoQt;^8VrSHzwZ<&fg05r^k-WOuL+({qx(Y#FjAF=09H!#*e1o%bv?p z$}PMTglFXV#((EV2VNLF;_boMTrhLj%zb+#+lJ)NP6~c(*p*@Jh!@hw_M@ATeiP`YA#A=rH%d$-cq& zkSNGc@^^IF=Cok7BB)V{qB|lzrN`%Oj8LD?bvvI{!+I@9W_veOZyS zJ~yrMMA&K`{&V@P{<-+@zUaf5*{kX&Y9Ld)D6j6y_t+s-cjBir_3Er9;V$}PE^u7%_8sGY&V7M@-%3|1zFm_X)dwMpR*m?P1e+#}) z<8_@H)XG%Yofrg9%~s_34AaBYBg4~a`4sNO_?XF|QRZ(?zLvkk^Zwas*Q!s>Z=Zjr#qXazpj4NYiD2iU=Kv+F9^1)(|6WJiHvekT)bVF zGh$ii42+`#Ck+fdC8)u#+?*8#m7gbtfv|%m9X=rpJvyi!5eyFwM}H8m@*jSj&$SDF zs|!CS@BYo?>-l6xc(ye0$L-OG`f9arwRQBEt)s}axOs}5(Ow?);(`6E@)^|Sj%0y( znGa?B_~=}VnN(s0RgCnY{yp^ITXZ~wD+@<4rtBFEyG^x+R>pj7;P*1}Mp&L1{DjA& z`SWsC7ew*RJQ_1ZBV*?e$ij6-5aGKXzZ*9yi_Nps*Z2>HDPV$r^nS3k_VN9H>-^P3a%D++#GxO-UsjtSmYk+M?G1Hq+nw0MI*jK^E@e|s>N z+hLFWy%W@&>$&0aIZ<1AVKE1<6w=#*@2T-Ij4!Fu;pfxOk(XoSRo|fxF9OC#2CYLT2L#RU58lGp<1u_I&-TkZ2MktidiEpwsK(Hg8PSf@g3zk?{C5m~ z+ScGG3*-@R8+B|QJcV%<0H*HBxAimCmoo5rXJaE#$@anjwYtu(!OyEF0HemZ#jAaO z&%AKCTeh|h+A7}ZKFZsXzv?$s4ZWVDT#;)#X+V;C)tXX+jwa0vN{tN;N#FfW`ZaHL z`hYD9c6wMZ3t*d-SG8Oojr8#>V5M!28oUD@@WDa#M}s9hXux;cpfS*;G_+z?o}V5I zx;lQpwVg0{%>8t%c>C~L4(`!hiEIduQjYEIpz6r=CGN_*_-nmeC`TRI@p zY<{(N?BiaE!9{A-Tf7xr;H7tqs_GVa`X(>J`V92KXcE8Nh?RWxwxBBma7qxUCWs^O zw%8sKrcuqkg6v;qWwtMey@$mnO&gG)4dpHEos&(zknYu|V@24-d3jd`s?i5!ZqA`` zvrGhhYvt;JV6uHL3cfeRgSa;f=IY&0mK7+z{j|}ncOVDl;_CbODihJXH~g*8rpCJ&sHfta&{p2m zte`*}Fc-Gd!mdNY+waGge}3}0p!@lt`;90jd&Cx5sRypIAU29E5)t3=m+R44*@Z8} zZyQy)Q5_6V%*Ml@%Q)&RUkK#Id81St{gMLTde3vid zXm`arFAPHZE}S4?rt*yRSbo%jgC=}&@`WJ#*ORXV>F*7eZAK8luDRg{EwH+Pzw%a` zr&f$lMAoSha#dCiTNYbUkx=-V9|tv)1w^W)Xro|vHM(%Q#kFP=+oKvq{K z4`%Fi6{74dm(9z%YdA+Un^(pjYKg7Em;Qa3VYj# z^3I&mU1>vo$jVHYhiUrsnK=*s_MBWbU9=q<9|~)~6l9^hcWnL-qL;@7l~Z@4i?IvB zRBN-yNywYa_lQcwc=kARU?b1Bxw0ej>?hd^{{x3%Om~Yd2J_K=63=4)d}D3SSA=W@ z2vw3+PTY|9;QC~ALj<8t4P$Uu{AmyI1E}X4AYFlyA-1o6AQfMT+=%e6C)8_ONjE%agy% z2H&sj_E--Lr^ExN=ezw~^F8MU+4F;|oP>-|v7H-ie;l0lO-t<)4gXpm_swfQL!-Ab z@@ov&cv`$SEyp=1{@cN6vrPlO&9pxdTX$h}>Fluj#bEn*Slr4TzHPO^m^1#sSuSOJ z&TDRXG(W9=XIQL0ia!9+DSSu&pqz!>o@zVAa$ecUSh+}ZJV2Z2inIf*#x1h zyZ8oG5LOPdHd$WkVhueN-EhS#EBIU;Z)aWpZ_Sx(3l}cWCyRrKbsqV-M!aVP-5-T* zU(ANLdk5cd29@syBbXeYp6q)aN5ffpcE|8+MAzK?AEZSNOn;D9KP;R*AbsMi=@nle zl;)st_}Cl`!qkC3CXN&RiA@gAulGxvTV?m*a1~m2r(LmNVZLKb)9NuYy{!erTh^^r z6Hqqh`eHD8I_EYg2wV5j`jfB~&B}1X_uYKc`b^L*-qmIZk9JFhC*n{!`*;-U+96M4 z6!Ee|5LX3TGd))YgR_IHd64%6YqMs|uw59fJ0{r83=ZEKJl!vbzk6l-YW}_%boExf zTJF$@lyNpIY?+~ZQTDSkdp4SJZazIN-}_!R@miH(@3htD(_&v56lbrYwGK^7@)P)n z@RhsJi)ga=@5r>!8R>g8wsYCv&6O<)r|G-)xFWCBOqfCYLav)GuyiBR#aG_ushr!a z;q2(IIJ(*}KI-+&nibjdRXcQ4fjMT})E%ll^s zlM{w-o)G*F$nU46-m!&VN?tySoEz0x)mu*Hptsl5LXSO8q+p@gVh^u}AL`NT%U8`5n z`31i5axCkkQK?&klvxxP=bEkP?RqarkC06?ufBf0oPhDFn{r-{1>tr1re7_|yFUqQ zs+{}9XMpeC!CN-wySdf_(`Szgc4ve=wAOJh%aLT;*5>(&DDjf?1U}hzLnx07>|J95T{q&M&e#AJFfzrcz^SLWL+fZX$n zaLO7dG?+Gw`XHlTuR_bn5JLFmL4P|I-B`A91;wo*vtBqW%6&Dx^P5xAyST*sQ zY{qrX-<_4S%^29u0~{gWm@H#craH-ndaxnr{urBzAD?Iu$fwZHCV7w2LxT? z3&-SpbA#QoeD9JRuNwwanN?}iDuz{5L~i`Ij&<_8jt;K;k^QnS-r7+)%8Xz{9aP1r zcM_2wkxxW*({he96>9L6MX)x{j$E(vSd?o#F&w%-*qH%7)~itqep@|bW3=}1S`fc% zxBq~rw6zg}5qC2}$Ea9l@zMB!R+v_genGBoZM-k*L|MNWU-_2xF4^a58dRG>q;;nR zgS*p0KaHNr2k@|PUGB-)$;n~dw_^7X8@5C8_>)|=`S~=|EP~wS5l;{ zFX_0=0#PRU-2(@u8ApPeB^C*)#9_G(N}u_`;HjJx^frY>kZs*g@m2S~BmHl7u7M|L zl{(`OY#IHnZ-&pjVOcB@o^~=PO?=U{OB?l$UI7oyDDX3}rnja2SQJdd-OF>OysyP+ zQ~4@v#Yb1x(E;NJ#=V;VhOuWe!!0XbugjR6y(*<$(hd{_C+@i&2=A^RyaBD z;;uC(E)SaP;y>M%7P>uuwvB1^_4&I!k5*Y)I2>hq_-JIUk&CPjuR^3!%cd$_Y<*tX z@#9#)z0+Tg%CXKLqAoa%`Q4&arrn+M)Y=R3Xf^=;LbXbPzrO9){0sGI?i5gbjcC|L z>-BcN`9v(f`Bdi4tDs&p_*&HA;^>q}i4HW{6zeyd$Jo(6doIJ(JgsNEPL)6EvCqsa z#SVO@*G=T({h*2{7>=jCj~ zN^lu-CXIxe^X}LoM%KH;Us-UvY%A7kK-?Q*RlX-mHj=7lmN%^4L6vRCCg$getlH&6 z&DiGPdQWm&w+CDGOmYM=S{@%vzrHK~x8)lf!tzx?&K%cS!?B!)swkW6rL6P0LmRW2 z7X|sR#z#Ih=eaqoqP?|jA~XuYR;dt?Nqjc!lKW%ptSI$#J~e94G5LBC%?xWBs{^#Y zq$9eIh9B#)2iD`t{Q8ER?JYt2;@wD4134%btX87BN(->pW7*matQc7C5#Bt_dY!D` zvTzcMbB5b@W=Wu6mQ(<#o{;Oxg+h!1r-rOIZxHrt}x_jz6 z@PTj4N0bi|Rm)k(QixQ}ON**st5>-w4E{>Ae{oufW#T>X0rYl{aO&zg{H`6rg>`iA zhuP^@vhB2hGT^LV^jB=7^$C{7=TjqT9PGk8V(cwZ1`&gJOQss4GE(e=aTWO~D$ppC zMQgsVOopz_|MHeMPlc-(C+9fpa<%HlR>eZC3YJTT>m94hd}8eL`e3IjTn(O0HQqh> z+mbUkqR`6xhl1_1L0F~lGhvh3FY92cmw+tqSzK-J{OCp}@a>H2wOVaILhQPo_?-j(OL{9v%9xweO6qsO(DjflHeHLPZ=cRwQO<@syPn&|qX{NeA? zwCtt1`bANP8Htoc4j3pOg{!TF!(cNqJ12;ZdMoEKFIU6YKQ(*0H@{TVJu(=&lx1fa z@QGZvO8?t}?R7a?Ef>u)i(*5VLV4tk+#}}jAr9g}@h6XtO|FF-E0TB*MRVk96=d`+ zAPnEd!T0i9|3yLJy0oa-p4LplGjW>hUzHwY2EvBHQ`i(_Hw|M8TLyHE;~E*0Z80kC z`s#s-#UBf^t+v2B7EQyq?N)S)FZ;$HTAedmmY(i@LGFs6f4gdGEXGh5$NGB|eHj1U zl50>4EQ8Ho(|71_^FGk^EBb3ap38GxYBwm0RlGLFE}Ok9Yy3{mTxI6Z^NXuuk42yR zMz7@k4v&8WW%kb23}7>VFcn*{ZS~j2$CGj_^%(2I-Hky{kC~bM&CXd^zkvSZnKi}5 z+&An-d2@8k$T{B1AU>9J6sb;OtBuipzMt?*;--Pfh_zH8&^IhzOc2kGtBrkb7hF#4~4 zLVjGtVO)~+v=TmcQ2$$k;x}Ut_D&4@{p7gBS>o7Zq8nnMsxy72($(dvLlvT^TSsVf zwp;SICG4G>9&uD2t-4~x81+1~(KEHO*Qa+;;w=NN#=(q>J(aUE)1sPScDu0#b-GsN z+?1ZtdP|W^tA#z%1q#fczc`z?@TOcN%Vv&@%Kz#otA($vUf17caB$A7BL3QvTp^$2 zv>?1G*pGRi>H~cLQ&71q*ZELd-kibK<91BgY+5-r`3CLQ3d~!g%s&jmUrgTQTd}>Z z;W{+ThO!8>s>}}?F|I9~s$)7nTC291-fxMzpO`)%%X)6GyeWTtY@78yc)yh5)^G&A z*lYDdnujy$ZYgkAgrm!RiaQ3qyEr|9t!&nqy<_p%PFZC0O>fGzo87-H*md4pGv?KY zW_tK+(4`dB)vCq&Ig6#aT0XqlapwDJbKkE98$X_(vsMwMZl_~EvSTu4yy{(P&CX^r zC%<-ObI$Lm_>9IO|0Zl!yRm;1;I`~g_-H~_eEQ8OPsJK z=ctxewVc^AW+Ip)K4MHKuF2VN$}jl6Fz$>J3T!1#wRFaKJJ;p@w8X~jU(QLLU`LYF zW7adPwYxZ|-k0BO%^7TpjiQ~++0+l%8K#<5H0ILaTMkoUPPyW9Co#!#L8qd6b8VlJ0D9WAmcj8Ru6?^r*!zM02S6^s*i<}2-xuce=I=dWjP+>VeqXTJA>sThrdFv#(Rb)9)t-_?cv~ ztH=B9wtDoKTCjT%w&5}#LtUK_-j%totAp~kTp@M6CTHm0epcs@tCIn}BwO>K=8s=L z@J}9sAKaNgzJFiNXH)iZOST*G9r>uaIf~WiTj^e$XXofA=DbC2?$dK)esg=a4f&NC z{mw0xsTu2Yim%5}AF99*HVVOGGd{T`iq3+tbu049Xx+=XGUFF&TH$LgBXhqfFB>V# z^GL828L4;QGT^K(z=~5hWEY)ZS_n?IZ_u0Gc; z%X6it1V?QnGkj87S)IjRxu4)?lF2_fd@~ZbDlNhLyCMBiuETvAS;{fs7PuAKd{97j$-p756RsAr-T^nu^d(Kh2h#yPc&k&72|tyZKmFGi;F-kd?b z&--&-*QCuN5qdFwV6l1~Rr(9UoMq9+)wxFX z-wy}jwb}FLUjLLz;uRh^p_ zJu>I>#mT29f1K@4W0OzLw-yFt5u_PC%hSUasQR* zO3Pf9y{d>iF^|^=GwUlaAK0rxr=ypx+GXY3TZc!<7hT-MKT8h8lp z@vf)WhGS-GdX2@`=UcvKo*sX$k(n{&`9XJn-jOZdkiA)J?XvJn#rnSKUtdda`}5cf z>pq(8FIM*JIcb0Q0%%sOnR9lkRq52329NO_?Zx5x5?y8LrJSJ=17i+n=csoN_^M)U zg&zLmt!$_2YaTDZ@@LUs)k0f$+io5bw6DvVsSr3b`#U~%ZNHrLUxgiiwp%NV_VC>f z9JE3%miIP2j|b-c@1?&U7Y1%gOKr@#sAJv|>vKakGsTQSxx3WbAPQY8`cQx?^Pael zVi-eN6@;1@;w7--^KxEmcjGGuxITYZ2Km|H@i941z3q$fD6vhB&#VZumga_eGlI41 zCXvL8>Gk3bKJhr_hS509Z=8irJli!mOX-^b#qGe0y6vQP3>-^%YF z%r&S=k;4%$-xEcxh1289;B`Uv+Tae+DnV!I$sTB9nPcn58sQY<%NxsAnd#4?Fc0<0 zSPa)KnmH=laXI7W{iyM|`K8?Z%$(h#oCB2*8M1b&c`0#6-0Ho-weg1k%~;3;y^-U| zfve%1IpAykht}k@*33Kr_rlOK^jHy6=ZH7<#fI-+Hk21B(iYA=k&7GaV3;ygDZrHajb zq8oCJMt!WaEYriI=$IQHd9-@va%~^>QslSNI_^kqqyT@_M|CW-75`>f7&X;e@+I(v zvK*@%lRSW!*{p2TvY#QRPj4O_4Cn*KaC=PDUAMlyb3i9hvm~5!S~1@ zy>EKy-{kMuU~xzESFVv~{(N3tI^Zk&0$D1-#$EGomW)V z!+R>t4$_G0^ZUzlwdM{x1Lq(9))Cq{R+1_2B%!XoHqFgqGV zi+O9XW(ym$d2d&)WO?{DYd5|k<6_$tAgp9Ijz70zP@alfJQIe&RW#@BfUJ>~B+Xh^ zZ>nk>!s-uHd3-J`{?q8i7sFN+PevNe_EB@kuahS_IKNsEeyI%=yWAP2TrgO&wZkK( zvTI(Mx!bh}I&Rz=G*Ozlhjdgtg5PG&Tpb3>4E;11uitIw)MV*FKgoF->)|V&mfy|G zf1b`!1AEU3_Q$4A@F#5J_!^!;6Zaa?jTTQEP!iFHb)lZid7Kh_SA@IfSlfE16=>Uo zUtn=F&SDb%*DPl2l>>zI1L;X}NaPG3)9nn4+Sf>jzaRInrAl3FoXd#>I%NmrIKo&ya?)J z+yp;i-EIqJy_nAam-SUipqkxyp}1AOPUnp4CGs&{Wy7lAGC@`};Mb^3`Ei~dm9sr6 zpDm14QJZpASXHh;nqTFfWaThKDJpz1SO$k5@%W&>)+#h+VP3oWnN|j*-ql{CS}zCP z=CWDSdK5B0X5ES89!&eHI_OLY>&vM<5K9~zbenm1{>SFbRpJ|6+CSdeUq?4S8&CO* zxfcGo=-u3vcCyz8=RV37k9{% zJwD`j;H#e3U9;hAl;0kz4;?q;2U1}F-Cs(i@kmrcq_I7AU%rP%SmBqYGxjWxb9~N= zMRPSr2VL17cJF}rzk7$hpN;k7VeFT07u$tF=!)!(8nlj3F4#D)j60uzKP)1)N-M0b z7uH&-H97k1aP~(z0v1|BK*pJ7IfH4z4}v~3kH{T<#GiBX3D2B9u)s$R*!q^pUKO0x zr>E!Jn}Px_ts~XgTjhl;Ps@m%O?3C_z~ANJYwy3-@e7rGViQ_z!~BjrQ3^WG6MibF znt?a23iM!NIqRvz*KA^ydk}6tiaFV;#9bY2=o~^($e#z@Kbd?g{;|BFN|5uzOrt?m zLHwYauRNxvR%5(oK)CrIBO6AKF!;t`z9>ije!P`^g3-}=e%^r1Wl?0cuzYm(jQ66R z@`N0dCi5LShS>SbtzX08L$itQk53!S%C}BUZ|Rsd1#X;yy?T6e$N1c_rX7QQTM@>Z zcT5|a)p0*m`awrL`o5|SEvJ8|&a~c)yaa9N%<@N~f)_`FSrBvmzp&Wb+su+ zg%d~RyblV0_leJjy`M@(>nr(O#htkY?zwzxtQG%(2S#b(d~s~W%3v-V!B5~jtPQ@b znq1Gi;BSn0c69wu61RLeD4sCb*rnOCTJD*HrpN%z&e`yQtv#cL2CnsNcsH1<4Qeq& z_6O&4wO&SWSLLeR4Z}(>^%=$(fsQ_r&tNT9=0CXqv6aR;zgX7HJo|A(x?HB*_D3Q7 z)mJgXXl$7;Ay&RGS8Jr!9S&3`wKl5r8)S9Fc04w|=a1s~el_|lQ}LETif$(MJ zx_%#gt+_s~-Pfp(I=89~f1zCNwvt1vAtT(VSv`kJM&k~xi($RH%1kk)4Lr$aC%&62 z-zSVU2L0JwHGTNYoHcw^PTiiqu0E?HaII!i&(mrK^$~Iljd$ToFQ~G**)@6p*2LFW z1dF40>(lgjhMk@Ch4?Cag~^BH6Zc#^Ip?X`+nktcLOloCQl;<-jexTncy)58=Q>Ww zdGdqB3C+ff@>*wIj{twSg}*C$Hyw&f1xGH+m<^Iv&B9Xqymz_gtA5joHA<|@Aye7`VcVn02^cZY{Z%Yue)H1!pXNri!p|dp;Ex_YXyM-J z!Dho!h)-wRKm5HYSlt>MtqM*YR!uNhQ&k8LTP;IJ&uFl$1P@K_l3p0UWE(d`KV%-( zryr~f&h9^I2AA~?eiUEj3(1S@9aK-w`{(5h<SgK&6^e-Hug9z z7US-Lk*x%7uCZQLa}gas=93j+81t1Otop%Na%YOuah`);cMfBHHLXS~R3E6R*%oYM zWaX;6bvCSOL8J4$2N7K}6BPJ!QHW1Z{wUGLS^4Gl!RFTBa!t@}?U

os|gNV(qS8 zG`ILdxN)#bAK7$!pwR1r^38e2dhqT)e|YTF5ozNCg0vOT_$&*9C~w001@e3K+EpW! z$*}HPGdL`MR^@1XLi9X4`!m7TDiI`l3Ba&M64J@B0=4G+!M zT7l-4F!;%6u=)It#h);PMI_ht;fyqzb=)en`k?ag-%M}aD;i;}=F?H&eWGmURbZV+ zY;~eoT2qgDtP^3G)Btuin2{kBKCM=kLszRPtG+Q98v~IcvgUw_6WT?uW)57HDLy4< zdBzayd_R^@yv}cirus$CsBoi>#&u@HSXD(U@z;6ytHY}u!4$qlrnc_Yu&wvs>|KOj z4{EIhYQ?3=+)SFeULGhX5&wrAJ02(70y_h$EL~gh?|X8sC2?tEBgO**vl1Z z33Exke_JqC9VFvsFBi!lv`vFw$#Si~K(vRAcdD&tswO7U`hD)@@u#1^6unF(u~sP3T*EL}&M8Uf0H zin-Y~@8GjKA{C=mVEEpXYv<|f-Ni=^%SfV#L-vv%Y!3EGdG=@F?|+SzoBnV3ZVWc+ zqm5d&w#f(y7E>Lo&Z%R?+e?F7qdGITjMn3S5ndm)X};L}I8~HtUOO5hhiP1Cc3220 zYxMX#BS-FGrk5|FPDxvv(Paer zps?iNoYU30owKb zDw=zAX8w-Nx!xaKWfv*&j-a7_XIJ=Z#G<}Kb8E%bT~UoXL$-5{a29JDrPLRx4*_#^ zcJO^L2+7Bo3ENq-X7q_T%z`t%cvQ~yD>=g@Vc5KImjWLcRe`U{oZG^|YjSQl+u1%v z*sKj1gLBeuTCg>i=H-czH{_R_^IOri@r`CKF3vgVMO}50gS=}j9L62(65|~n&X~(=+~u~RhtmkvPxvc$Ms-Dc{IX_@ zd1P`s<4A_rHhx6^L%KN)wN2`%#;Q>a-k7Vyt=S97jxmXFfT+53{^gZ zT)O`LZ;a7A*CTmT2tBPLr zllnfZnu>pI*xR_RXsQc(Hh4Bi#G`2T0>)mAHcq}BzHD4qTY5NLlu^WB8X-44W_|d< z)5DO{ga4A?I4cp{ALYzfhJh+a%^{<|Nk1(h(Dd0Mo$84C7} z=X`qhIy-H)DCch-1M8Pu5c_8Rv*L@tB3?L-u?1ekw<>|pteQKQY|nolA6Cm8^?nDB z4QjkS<8qe|{NYoLn6M}!c$MwbgZCxrqejZUIpBL>CR>{hr;}Bbvh6h zBlAqu=RuwR%C;NLEQ2?P!<+Kl>{It?9ap!O!81q1y1no{ZNRtn7|pBma;;ejUy5O- zx%?SbIAC}jt)Cd$Z_57WJIzN|gMrJ}hiBvZJ4P(3vt}jNcVoxKJSJpqRbSqNoQ6B0 zi9DL)kk@ae#N*LlGt{=`3a#}Zsx|Wg>-YgOYwBiBj$*9I)h|h(J~dvjdcSLfs(RB0 z@@GDlx+W?PzeXILts<7QPkwe+idmU`$+gQr(1<&-Cs_T9| z(Y(9p%tn#@@r@4-#_;`N9`X76(d2`|UD^0X)fC*AsLZyk^StQs$vNWXKzHJhnyPQ}Kcdv~_m^SpOH zSdim%-py%I2Qx=i>`Y-h<{7FItX^}IQ(P{-`$r$THjr=1Vaify3oSy!`4j4Zc@@Tw z#lgG@o|)=uuj{{!mBw&}eD$`GGCe7G;V;IpL70`J!)9x2iuJNazKjpMB>Qd?)y)jf z#>-~p)yANHK>Ul#g9in+hMtV_?Xh@9+o-I0=G~KhH7lWKK#E6kL6pM$Q}>_S7@ZLn zQ}Wgao6#wn6y;dm$^8)21ULHipBA^j%l_8)!pq_YJKnMAju#;g**83vTdT)r6qlz= zQDi30$o}~cy|YnAm7AAzpW?@ZXgvkd8uY0I|Cvpz2Qeay-&94^qDD=~=nv^TXq#a~3zH#b?JNU7t3&I|y5c?EWzLw!Gt>7xE0|#Wm_y&t%=n zu6QjH{R73XUvS$CaicI6C0lRb%+pM}s+Zp9y#6Ahriz z5%Xig>WW+ym8ew`;dTbM^=QR-GT|qrUtbw^xfAGZ1Ml>1HqQMQTQyQ!R(sX;5qsw& zzMuR=<2q}9s6tVBcKd*=X!zDJTHa`^b6*$lX7PY3T$}H$FVRPw?AAB}cU9zf}@-;QU{hWZa>E={1n)rz7l+MvNA z408aRk$5!Mrhj)uqRvHQ>C}?CYvi)<)+#Lvg6oyx-!%g+d|)<8-&VhG6&h`%l@>+a ztQ=v5I`?Mt*i}@|%4g@NPZ-s&QiwRLzHv+Tqi$QJ-_3(UuTMKy6FO1>*Vd(ZR*#K0 zL0w^1rQo^o*1jKfzmriXfBOY@HnA$vR_WN%;e-CNR#1diJzN;#CcTxDXq8}dJ=I)~ z#oIGjQ<2&@v(rT3vOTbVApC9U5%~N6S^Xpgyl)t{muF-^Q zb3HcgK_TFawQ5mB%c>*Py0daddsu&?XdR8V-x zsW&cSB$)RhQs`WAxT>t;@rli>%&OJ2;)?vv$R~fIV@7PB(P=C`Yv|XF-Q*h~*pZ+; z(d{TQhVl`bpWqdO1`tAe$W++cxb?2OM*A9xN%nn$ZvR^ut` z>4CrM@K{~hjAf$?qka6~dc*F}S1YGtq;q{lU+1MyTjfpF)FC;a&Rsq(S8u%?v%v0% zc8bH+1Yg*emve%rQAsQO7HRdG6wqt4p3PYm7pvZDzL+Abs4}LpI_)ZpQ$>ZV74d%% zE}G@rsI5FLu6#0h)8Hxtg`mXy=Re)0BOSr>xijHU{6RRea~Q`o2boUz!_ab5oYq>( za#%&1Jn{HBb*LXX$^Wqqs+>73E7VWYmKe3t#SriRrYtPd%}FQ=;> zN8LYHm*uB+BxWJ6Ra3*nyMu0W;Xjy+3BxLj>t{y))`&(!Mb_} zdsgm_2-h>KPW1W6WIk}mGg6y z?vx;xcKcwDcxbEg-QJ-{{qBtPLN$5g{Q|^Zu70#q)}xSbP@C1PjydkUWUDG387$3V z>fJG{8Uo#}6DngiD#DnNVBaW0?VOwsCR;;S#euOU_-hrO1RwZ_w9_YZ#_liK`O514 z+y#U!tMamR#x$xLQx=U6K_!}ppbsn`4+5%QiIwFl%}x~4JsLjB))?_^WxpDni_`OO zjt(C;oUK??js8!9$n@qiA|^ zeE>G!`yKJE*FP!==xDq!%fO{>*|FGT(IcWfKqqB$O!2stRz#>EUCbK=Z*!50Xd=VH|o9Ck-gkSkQBRhO8 zxV6`dXrXyfP?;ITtfFr$!{yZ}w5Ozj0nO zYw%^=TIm2WVVVU@h#t=u`-yR)8THlaPXB2hBPIHua!dX3$pIrD)v>+ z-vM6IRH@2_`Y z#rwZb&6#peY5^#SK^rRX|dM&lq^{DJG9p;mJfoBAv%tM?P9lTF+R)k7|poltnF)P=&!d|Js`b;pkw!776)q~8*Q9hm8Ece^6COM3uJl+S2O0iG#x}hRr+H?^nen;V_~Kf>xS34)OLObb(o%5+J&^H*?YQzxy<+;|8)nb2W^5Zg zJBPG9_C7Re29CF))wqT=4jv3z?pxo?OUHSQuV0XBwl+^|YWB<38%JbKP6(>^hQ(H; zweo1=Z1KjNFcHEB2baCV&T;E1A9}&z>Bq+h?@tBauLVU_Pi9n@XFP4dT1C;hqII$w^Zqs)F&`=K?gTUPy*Uw~SBzOhRE9xq2P7uxmF zFdM4)D$CPHcUi8kP>E*;e7ytf*8lO7_J6N-Dyb7x(hy_}`@17WR zQPgpEzH77-@68+Ny5v}|u48(x(%k}h04mjOe#!SYOa9of`HzF|SHe%~eq@k6F|B!G zkY5_lhHqMsAel6s_R9%5u50l_KWoD7i$;y?wB^<%NL`p zkfH$1kx^S%iqERUW8CHyjD6Zn8q4-b6k)V^{4$;|4-SK$3;%8pwnf)GRdWI5y!p1q zK+U93GdwG7QbV8ug!fe~JtQq4%23TKhj2-_+jW#u-*%ssFLxz-)WvW8gPi2v3&ufN=GY zykpnH{$}+ zTyuM^+`wK#{UCg)FMQ3r**|z%{ptrf(go=Y=j0Q2CF#m5Dpu{KI(qPcEIjL#jNZwQ z(uVc0A*?N^0>&SeBUB%7xAB+0jit3wt!$vL%}AEF_25Qk*Or;F)Rlo)Jb9tkx4XBw zb!9vI&kUbygOW__4M9nqVJulT-1@D{!&>8MXNFTJrEkeMs$;$^T5>~LWpjRMPJntR za~Va-P-Yot=KDM(>#f7ry&!o@;?y4p$Nv(2_-a(3cO`)DG5MAV?khp~v*GLXoQYLq zj|$trl3(mIv=aZJb`8$hjRV3epYYHXFRaR!K@v|+C&p(cA7h@PGnPw>R4y+LrO zvBgmUYrClN>FVg_6PT+!D-p4kWUWtb?J`w8m*t%$gU4(w7_o*sYSLlVkL8^xu5&Pt z2(terp2F9I7*-#X?^;>RNINgXItpLNT!n80H#6|h%IDw8Yj;S~R(eCTX!eJrhvv12 zjz8?44_0q?7u>b+H)Z*63NF8$d>HKH&v34c<56KGt3-7%xv$)pbP8F|)t5XGJo&af zT`{nHx-~cIOT|A(IVROz^EPPck zv{JH_aGQIe6kXxnsKWZ-J~wA#PUiC9EHkX$eNNCd&OxiK{=O(Yot^j8sjUk_MuUvL zsVuW{k)A2S5&<+v(DBI6r~1ko)$$2O8_e6jG{583elz%r;qZQ+aQ5r@bV1I;IzaQ& zCOl)Yv^$=-56PVLCAmB~kNLUMe@ynq3LaJ(J2k$|&ders&ma-N=HPo?I0z?kEKPtf zWT9xBvKfZ@5%u68Ol4(pFxLEeqh)soU0HGKcw5QKJzmxX7nNbg)KrdM5e1OjIX9me zvAif}#Fw}zD5}|Stq4W1hKYI1D)ChBsMIs+Y{bW`wcFDsX9QPk!duZphMOYt?wgOP zCyJwAhyr{&kDt!{a{oAdo|iMYC@sSFt5qNKXZ*6S=6mKA&COA*I*F;&_K(PaRwK(s! z_T1de>vNUzW@qJ^I&*VvP~MO;SR2OhV&U6SuW>Y8zK8G6-t)EgNtOKn6ffd{>_ZO2 z9ePd}_^!&gl}}=|NV7FKyVt) +#include +#include +#include +#include + +#include "gpu_info.h" + +void compute_active_thread(size_t *thread, + size_t *grid, + int task, + int pad, + int major, + int minor, + int sm) +{ + int max_thread; + int max_block=8; + if(major==1) + { + if(minor>=2) + max_thread=1024; + else + max_thread=768; + } + else if(major==2) + max_thread=1536; + else + //newer GPU //keep using 2.0 + max_thread=1536; + + int _grid; + int _thread; + + if(task*pad>sm*max_thread) + { + _thread=max_thread/max_block; + _grid = ((task*pad+_thread-1)/_thread)*_thread; + } + else + { + _thread=pad; + _grid=task*pad; + } + + thread[0]=_thread; + grid[0]=_grid; +} diff --git a/benchmarks/opencl/sad/gpu_info.h b/benchmarks/opencl/sad/gpu_info.h new file mode 100644 index 00000000..4219cda9 --- /dev/null +++ b/benchmarks/opencl/sad/gpu_info.h @@ -0,0 +1,20 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#ifndef __GPUINFOH__ +#define __GPUINFOH__ + +void compute_active_thread(size_t *thread, + size_t *grid, + int task, + int pad, + int major, + int minor, + int sm); + +#endif diff --git a/benchmarks/opencl/sad/image.c b/benchmarks/opencl/sad/image.c new file mode 100644 index 00000000..d7ed0fcc --- /dev/null +++ b/benchmarks/opencl/sad/image.c @@ -0,0 +1,56 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#include +#include +#include "file.h" +#include "image.h" + +struct image_i16 * +load_image(char *filename) +{ + FILE *infile; + short *data; + int w; + int h; + + infile = fopen(filename, "r"); + + if (!infile) + { + fprintf(stderr, "Cannot find file '%s'\n", filename); + exit(-1); + } + + /* Read image dimensions */ + w = read16u(infile); + h = read16u(infile); + + /* Read image contents */ + data = (short *)malloc(w * h * sizeof(short)); + fread(data, sizeof(short), w * h, infile); + + fclose(infile); + + /* Create the return data structure */ + { + struct image_i16 *ret = + (struct image_i16 *)malloc(sizeof(struct image_i16)); + ret->width = w; + ret->height = h; + ret->data = data; + return ret; + } +} + +void +free_image(struct image_i16 *img) +{ + free(img->data); + free(img); +} diff --git a/benchmarks/opencl/sad/image.h b/benchmarks/opencl/sad/image.h new file mode 100644 index 00000000..27fc3e0b --- /dev/null +++ b/benchmarks/opencl/sad/image.h @@ -0,0 +1,25 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +struct image_i16 +{ + int width; + int height; + short *data; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +struct image_i16 * load_image(char *filename); +void free_image(struct image_i16 *); + +#ifdef __cplusplus +} +#endif diff --git a/benchmarks/opencl/sad/kernel.cl b/benchmarks/opencl/sad/kernel.cl new file mode 100644 index 00000000..f0e1c2e0 --- /dev/null +++ b/benchmarks/opencl/sad/kernel.cl @@ -0,0 +1,326 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#ifndef MAX_POS +#define MAX_POS 1089 +#define CEIL_POS 61 +#define POS_PER_THREAD 18 +#define MAX_POS_PADDED 1096 +#define THREADS_W 1 +#define THREADS_H 1 +#define SEARCH_RANGE 16 +#define SEARCH_DIMENSION 33 +#endif + +/* The compute kernel. */ +/* The macros THREADS_W and THREADS_H specify the width and height of the + * area to be processed by one thread, measured in 4-by-4 pixel blocks. + * Larger numbers mean more computation per thread block. + * + * The macro POS_PER_THREAD specifies the number of search positions for which + * an SAD is computed. A larger value indicates more computation per thread, + * and fewer threads per thread block. It must be a multiple of 3 and also + * must be at most 33 because the loop to copy from shared memory uses + * 32 threads per 4-by-4 pixel block. + * + */ + +// AMD OpenCL fails UINT_CUDA_V +#define SHORT2_V 0 +#define UINT_CUDA_V 0 + +// Either works +#define VEC_LOAD 0 + +// CAST_STORE is only method that works for all implementations of OpenCL tested +#define VEC_STORE 0 +#define CAST_STORE 0 +#define SCALAR_STORE 1 + +__kernel void mb_sad_calc(__global unsigned short *blk_sad, + __global unsigned short *frame, + int mb_width, + int mb_height, + __global unsigned short* img_ref) // __read_only image2d_t img_ref) +{ + int tx = (get_local_id(0) / CEIL_POS) % THREADS_W; + int ty = (get_local_id(0) / CEIL_POS) / THREADS_W; + int bx = get_group_id(0); + int by = get_group_id(1); + int img_width = mb_width*16; + int lidx = get_local_id(0); + + // Macroblock and sub-block coordinates + int mb_x = (tx + bx * THREADS_W) >> 2; + int mb_y = (ty + by * THREADS_H) >> 2; + int block_x = (tx + bx * THREADS_W) & 0x03; + int block_y = (ty + by * THREADS_H) & 0x03; + + // If this thread is assigned to an invalid 4x4 block, do nothing + if ((mb_x < mb_width) && (mb_y < mb_height)) + { + // Pixel offset of the origin of the current 4x4 block + int frame_x = ((mb_x << 2) + block_x) << 2; + int frame_y = ((mb_y << 2) + block_y) << 2; + + // Origin of the search area for this 4x4 block + int ref_x = frame_x - SEARCH_RANGE; + int ref_y = frame_y - SEARCH_RANGE; + + // Origin in the current frame for this 4x4 block + int cur_o = frame_y * img_width + frame_x; + + int search_pos; + int search_pos_base = + (lidx % CEIL_POS) * POS_PER_THREAD; + int search_pos_end = search_pos_base + POS_PER_THREAD; + + // Don't go past bounds + if (search_pos_end > MAX_POS) { + search_pos_end = MAX_POS; + } + + // For each search position, within the range allocated to this thread + for (search_pos = search_pos_base; + search_pos < search_pos_end; + search_pos++) { + unsigned short sad4x4 = 0; + int search_off_x = ref_x + (search_pos % SEARCH_DIMENSION); + int search_off_y = ref_y + (search_pos / SEARCH_DIMENSION); + + // 4x4 SAD computation + for(int y=0; y<4; y++) { + for (int x=0; x<4; x++) { + + // ([unsigned] short)read_imageui or + // read_imagei is required for correct calculation. + // Though read_imagei() is shorter, its results are undefined by specification since the input + // is an unsigned type, CL_UNSIGNED_INT16 + + int sx = search_off_x + x; + sx = (sx < 0) ? 0 : sx; + sx = (sx >= img_width) ? img_width - 1 : sx; + int sy = search_off_y + y; + sy = (sy < 0) ? 0 : sy; + sy = (sy >= mb_height * 16) ? mb_height * 16 - 1 : sy; + sad4x4 += abs((unsigned short) img_ref[(sx) + (sy) * img_width] - + frame[cur_o + y * img_width + x]); + } + } + + // Save this value into the local SAD array + blk_sad[mb_width * mb_height * MAX_POS_PADDED * (9 + 16) + + (mb_y * mb_width + mb_x) * MAX_POS_PADDED * 16 + + (4 * block_y + block_x) * MAX_POS_PADDED+search_pos] = sad4x4; + } + } + +} + + +//typedef unsigned int uint; + +__kernel void larger_sad_calc_8(__global unsigned short *blk_sad, + int mb_width, + int mb_height) +{ + int tx = get_local_id(1) & 1; + int ty = get_local_id(1) >> 1; + + // Macroblock and sub-block coordinates + int mb_x = get_group_id(0); + int mb_y = get_group_id(1); + int lidx = get_local_id(0); + + // Number of macroblocks in a frame + int macroblocks = mul24(mb_width, mb_height); + int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED; + + __global unsigned short *bi; + __global unsigned short *bo_6, *bo_5, *bo_4; + + // MXPA + bo_4 = (__global unsigned short *) tx; + bo_5 = (__global unsigned short *) tx; + + + bi = blk_sad + + (mul24(macroblocks, 25) + (ty * 8 + tx * 2)) * MAX_POS_PADDED + + macroblock_index * 16; + + // Block type 6: 4x8 + bo_6 = blk_sad + + ((macroblocks << 4) + macroblocks + (ty * 4 + tx * 2)) * MAX_POS_PADDED + + macroblock_index * 8; + + if (ty < 100) // always true, but improves register allocation + { + // Block type 5: 8x4 + bo_5 = blk_sad + + ((macroblocks << 3) + macroblocks + (ty * 4 + tx)) * MAX_POS_PADDED + + macroblock_index * 8; + + // Block type 4: 8x8 + bo_4 = blk_sad + + ((macroblocks << 2) + macroblocks + (ty * 2 + tx)) * MAX_POS_PADDED + + macroblock_index * 4; + } + + for (int search_pos = lidx; search_pos < (MAX_POS+1)/2; search_pos += 32) + { +#if SHORT2_V + #if VEC_LOAD + ushort2 s00 = vload2(search_pos, bi); + ushort2 s01 = vload2(search_pos+ MAX_POS_PADDED/2, bi); + ushort2 s10 = vload2(search_pos+4*MAX_POS_PADDED/2, bi); + ushort2 s11 = vload2(search_pos+5*MAX_POS_PADDED/2, bi); + #else + ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]); + ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]); + ushort2 s10 = (ushort2) (bi[(search_pos + 4*MAX_POS_PADDED/2)*2], bi[(search_pos + 4*MAX_POS_PADDED/2)*2+1]); + ushort2 s11 = (ushort2) (bi[(search_pos + 5*MAX_POS_PADDED/2)*2], bi[(search_pos + 5*MAX_POS_PADDED/2)*2+1]); + #endif + + #if VEC_STORE + ushort2 s0010 = s00 + s10; + ushort2 s0111 = s01 + s11; + ushort2 s0001 = s00 + s01; + ushort2 s1011 = s10 + s11; + ushort2 s00011011 = s0001 + s1011; + + vstore2(s0010, search_pos, bo_6); + vstore2(s0111, search_pos+MAX_POS_PADDED/2, bo_6); + vstore2(s0001, search_pos, bo_5); + vstore2(s1011, search_pos+2*MAX_POS_PADDED/2, bo_5); + vstore2(s00011011, search_pos, bo_4); + #elif CAST_STORE + ((__global ushort2 *)bo_6)[search_pos] = s00 + s10; + ((__global ushort2 *)bo_6)[search_pos+MAX_POS_PADDED/2] = s01 + s11; + ((__global ushort2 *)bo_5)[search_pos] = s00 + s01; + ((__global ushort2 *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = s10 + s11; + ((__global ushort2 *)bo_4)[search_pos] = (s00 + s01) + (s10 + s11); + #else // SCALAR_STORE + bo_6[search_pos*2] = s00.x + s10.x; + bo_6[search_pos*2+1] = s00.y + s10.y; + bo_6[(search_pos+MAX_POS_PADDED/2)*2] = s01.x + s11.x; + bo_6[(search_pos+MAX_POS_PADDED/2)*2+1] = s01.y + s11.y; + bo_5[search_pos*2] = s00.x + s01.x; + bo_5[search_pos*2+1] = s00.y + s01.y; + bo_5[(search_pos+2*MAX_POS_PADDED/2)*2] = s10.x + s11.x; + bo_5[(search_pos+2*MAX_POS_PADDED/2)*2+1] = s10.y + s11.y; + bo_4[search_pos*2] = (s00.x + s01.x) + (s10.x + s11.x); + bo_4[search_pos*2+1] = (s00.y + s01.y) + (s10.y + s11.y); + #endif +#else // UINT_CUDA_V + uint i00 = ((__global uint *)bi)[search_pos]; + uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2]; + uint i10 = ((__global uint *)bi)[search_pos + 4*MAX_POS_PADDED/2]; + uint i11 = ((__global uint *)bi)[search_pos + 5*MAX_POS_PADDED/2]; + + ((__global uint *)bo_6)[search_pos] = i00 + i10; + ((__global uint *)bo_6)[search_pos+MAX_POS_PADDED/2] = i01 + i11; + ((__global uint *)bo_5)[search_pos] = i00 + i01; + ((__global uint *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = i10 + i11; + ((__global uint *)bo_4)[search_pos] = (i00 + i01) + (i10 + i11); +#endif + } + +} + + + +__kernel void larger_sad_calc_16(__global unsigned short *blk_sad, + int mb_width, + int mb_height) +{ + // Macroblock coordinates + int mb_x = get_group_id(0); + int mb_y = get_group_id(1); + int search_pos = get_local_id(0); + + // Number of macroblocks in a frame + int macroblocks = mul24(mb_width, mb_height) * MAX_POS_PADDED; + int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED; + + __global unsigned short *bi; + __global unsigned short *bo_3, *bo_2, *bo_1; + + //bi = blk_sad + macroblocks * 5 + macroblock_index * 4; + bi = blk_sad + ((macroblocks + macroblock_index) << 2) + macroblocks; + + // Block type 3: 8x16 + //bo_3 = blk_sad + macroblocks * 3 + macroblock_index * 2; + bo_3 = blk_sad + ((macroblocks + macroblock_index) << 1) + macroblocks; + + // Block type 5: 8x4 + bo_2 = blk_sad + macroblocks + macroblock_index * 2; + + // Block type 4: 8x8 + bo_1 = blk_sad + macroblock_index; + + for ( ; search_pos < (MAX_POS+1)/2; search_pos += 32) + { +#if SHORT2_V + #if VEC_LOAD + ushort2 s00 = vload2(search_pos, bi); + ushort2 s01 = vload2(search_pos+ MAX_POS_PADDED/2, bi); + ushort2 s10 = vload2(search_pos+2*MAX_POS_PADDED/2, bi); + ushort2 s11 = vload2(search_pos+3*MAX_POS_PADDED/2, bi); + #else + ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]); + ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]); + ushort2 s10 = (ushort2) (bi[(search_pos + 2*MAX_POS_PADDED/2)*2], bi[(search_pos + 2*MAX_POS_PADDED/2)*2+1]); + ushort2 s11 = (ushort2) (bi[(search_pos + 3*MAX_POS_PADDED/2)*2], bi[(search_pos + 3*MAX_POS_PADDED/2)*2+1]); + #endif + + #if VEC_STORE + ushort2 s0010 = s00 + s10; + ushort2 s0111 = s01 + s11; + ushort2 s0001 = s00 + s01; + ushort2 s1011 = s10 + s11; + ushort2 s00011011 = s0001 + s1011; + + vstore2(s0010, search_pos, bo_3); + vstore2(s0111, search_pos+MAX_POS_PADDED/2, bo_3); + vstore2(s0001, search_pos, bo_2); + vstore2(s1011, search_pos+MAX_POS_PADDED/2, bo_2); + vstore2(s00011011, search_pos, bo_1); + #elif CAST_STORE + ((__global ushort2 *)bo_3)[search_pos] = s00 + s10; + ((__global ushort2 *)bo_3)[search_pos+MAX_POS_PADDED/2] = s01 + s11; + ((__global ushort2 *)bo_2)[search_pos] = s00 + s01; + ((__global ushort2 *)bo_2)[search_pos+MAX_POS_PADDED/2] = s10 + s11; + ((__global ushort2 *)bo_1)[search_pos] = (s00 + s01) + (s10 + s11); + #else // SCALAR_STORE + bo_3[search_pos*2] = s00.x + s10.x; + bo_3[search_pos*2+1] = s00.y + s10.y; + bo_3[(search_pos+MAX_POS_PADDED/2)*2] = s01.x + s11.x; + bo_3[(search_pos+MAX_POS_PADDED/2)*2+1] = s01.y + s11.y; + bo_2[search_pos*2] = s00.x + s01.x; + bo_2[search_pos*2+1] = s00.y + s01.y; + bo_2[(search_pos+MAX_POS_PADDED/2)*2] = s10.x + s11.x; + bo_2[(search_pos+MAX_POS_PADDED/2)*2+1] = s10.y + s11.y; + bo_1[search_pos*2] = (s00.x + s01.x) + (s10.x + s11.x); + bo_1[search_pos*2+1] = (s00.y + s01.y) + (s10.y + s11.y); + #endif +#else // UINT_CUDA_V + uint i00 = ((__global uint *)bi)[search_pos]; + uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2]; + uint i10 = ((__global uint *)bi)[search_pos + 2*MAX_POS_PADDED/2]; + uint i11 = ((__global uint *)bi)[search_pos + 3*MAX_POS_PADDED/2]; + + ((__global uint *)bo_3)[search_pos] = i00 + i10; + ((__global uint *)bo_3)[search_pos+MAX_POS_PADDED/2] = i01 + i11; + ((__global uint *)bo_2)[search_pos] = i00 + i01; + ((__global uint *)bo_2)[search_pos+MAX_POS_PADDED/2] = i10 + i11; + ((__global uint *)bo_1)[search_pos] = (i00 + i01) + (i10 + i11); +#endif + } +} + + diff --git a/benchmarks/opencl/sad/libsad.a b/benchmarks/opencl/sad/libsad.a new file mode 100644 index 0000000000000000000000000000000000000000..fa6e1a00111d41f1dd0ed5a5b181e61b95017516 GIT binary patch literal 17384 zcmeHO3v^t?d7inieMqv6Z)6^ZWTPv|w$QG%x>~K2L#YIkAz(;2#coQQu2-v-QIL@% z$&N{yR-4tX{NmM%4U|W4YnjHF2KQn+B{`>T4~B%3lDa1b%4tp#R6~;HVG`Rh#h`xw zeXQ=v4+)1r5B1n{=bxE>=FXj&@1J|WfA#Upk^`xmFApvbm4*gS)|S23P)KiaHC6-} zV*$p3_e{pnI>y}ii6~s~0FZ2)GvT~q* z&FUGbBzlsAL#29s$$^#Wf#Q7i|SG=$SFB@LsgaI7_) z>wVQ{g#-JH>Tv)-NA+|*=gL|dC%Y}FOPzv^7NgmaEKD0G* zGK(wF^C?zpAXSx5GXrHa?elbc^l0=vUPvL!SgJ31>&npD^uSOYMYgdI=^$ra54=lFYL{J&(N_Xq=?xwD2ODf&m+N$@Y z8e`4zbW3--Ioi|Qr8o3+H>A4au|?DNN&Q=0rW$)k1eu>@4XCp7EEGqO#IxbEJw1^kombdD+F3?hf#XW;C{Q2N-*2p(eVKz~p3@(dP@dvN&ORbpN2Kr(#Q9D+ z$!_@tkmpLcAdu{qcR;>Kpi;(2cFV7ZJl`oN*)3lR`65_f3M9MbpN2f&DJR)2r{}uJ zhYJG9Zuw^*&v(j6cFX?^@%$>PhHWzG?BPI5p8Ttt{Li2 z45U}~4i3S>J7>z!m5rTio#sR7bwiAH_p0=*I2IDAb?Xu<`}(_*eTlWnfduu=B7Q-->4_zFaas% zR)w2e!>JVgZ`8wf6%BAG*@elvZIzvNCx?vk=|; z9)axk0kJ5PuDAR_G@$k@osiN`CL0sGzw~ub;I^9{X#~NmQ)<5G+ zFzp#SEfkDy>e+29JKC(rs_qVIp^ukok!V1xZx3j7Hw3hW z$$(b-`2gwzs1InNZ2>K^+lP0EGWGL?2`zM5Jo#4#V~F>&lUjy%#)uGLe!h4ij6 zp+&YaEpk9mw=AK1D5`vP0{V(pR}hT(OPIRxt%>;lnk!NMN9bRQ16XKxHTq!j-0aI$ zx$<(flhm>{W?-ZDvo@ZsDd1hEl8H4LmO3_8rPc1X>a@DuC~Ft9HC#;n%b1X@Y3H^c z=i|r5PI&9b|9$M8$^J5u)1OFs=tI(~cT4)7F+67{Lw}NqIZ_YSw28F^ar`s;Pl&bc zlDHrv>GPfwjOE9;x_n8SuDpnMsecoZ3cB)Ko1uib?t4)%l*3o3ijSo_Uc&np$+cXI z6r}MRpF56zpu1O4=gni;+Jd0xpJiHoK{B?~FpO6*j_zk-Ef>r~`_Xo|!_4g$&_>i{ zN;##d&u}}R5hzdlMyuN|m-acL4h5KfFbl+6RBUopKd_hTdV2{^i6tjGdsNP<| z3;UUG(Yinj3dAwIa-n4HvTUk={40 zwod=!SFCwkm4mGR@DJZoxpXueF8IVS_h;+d{bGH_pS`r;7cb?3Y+ZXmtjh$liwXg8 z5f5go+k+zTl?@ewz-HOXVLTh>XN5bh>VTB3#Xj1B_wDnWnVEkLm^vWrTZ?z{eEtb> z!G4xq;A7%~9I)rq|0mDw7LUAc85&m=xaYmo@K{!*h{+w zqq3IX?hpjO^UdjdG_w(zTk-jLKSSJ~JmOahx)eMV3(>0oN$ z?MX~ldoT$s7cmLjVN`fA$sMc}Sn7X($%PIk>j{$?V6xr8WX92Jw=wCdFU2GFj+WzOkz*lnB*Qz0%L?p;MKxpy^G1r6ij9uOtw3i1okY9 z0@E%g>j{(X=-b6)2AIrvFbRB;EuNDDrrSN3D~1d3{bvky9X%vg_y*1;Z=gOXY)ABd?~&dLluXFSA7P&~))+MMn=e)seQG?By8@Vh6$*>jfI=hJ(` z@;0Uu@pK~Dlt`u$@n{Ll+n9z=E}U!%lb6n_ntBFB27f;+?-t<7duEx)eBXmLR|FMh zofF?RWZf zIbPBuo8|^;Lvbp{OPs9txm=laT^S;K1OvdaKP|_50y?KU%Nw2XQC}3kK{xOOH+uyH zFUPwJ_dONI`w;X>&V1ybK!4Ov3CG)F`)TcSJQSLa<85k=&iKr~;5_>Q+%x}F95250 z1#nDLeYj`EF}d8XG2#P%<^!)PgU12?Gp0EF?6btnO5#|aYv5m1_<{{tas0kdA2Wi` z&q*i?I%UI0^!Bn1yg!T*cXHD zCi(GwHK^MpSDWQu6is<6_?}#c`cNhMp2N-DQ{b<5gS+{hXy&$X@KxXzwlMSdt+;;> zI(KoK*BkzY#arb9cj@v@A$|e+l>Kv!`#!?d{78qHf0C)A`408`^BHlc##Y38FJ`9n zH9Y6tg6Vq`-`#H@y%4-KQsqCuKOX@1@*VJL;AE7ag3m4ps?WbcSJrUwnJ7yWFH8KG zL_8^R&=?;$nP1R1Jo({e;@WiiTg00&W7eE1;*hn4KL$@Zg_nM3EJD0A@g)W}DI61m zxPZ&x8Wp|lzrptvWMh>0){H!UC+-_NU3*Tny2?1!<-%xpu^4lUX1yG0vYoQ8T_#+zeYT8a2W@$v>#j=aq6+@Ygwe4l%PKlWn7xL^_1@Zy#oD$QOKfyC7 zDBsG=virIA$TzuJ_7r$j?19i1Fh5%$-v)Ufbf1U(HOS8~Q{IdGWmx;YO!fQy#(Kn2 zpWpb3;8#V_Z)`628(Gnh{qHw61pG!`5yoG5@wfel6TiZoVNR+2Yc%&ZM_6yoDfoZP zZym`6(R}rL#N`R5-31g!GjLl}*OEK?RKd>M&Lz)Hd9%Yz!W0d%}E4f%NqyG+`wbD*5hLM+-SbUUO zF~;@v*W$Uq!FB(G+*Gc@oGG06c{6_v-}19q9C__-1-v-RK8{$E z18)x7SkuYGI@rCU&1>5Mz6N$E;IV^j%na6W*Dqzrxeil)Qiw0dS}DqbeYHq8DAneM zI@q(AW$WU9gSAwG7n;iVVN;$F%#odf@jxZ+GY50JT>yq8y=*JfH^b)1WOH4c*Y+^k zXpHAB6E-zY_EeT1w``x!KUdEmLBIJ9<39LmMn^oh?<2s%cHE~4w*79!xaUHqk9--= zo%cn1@QtFMI(NFB2ixUrUAtm#{2A;ZS8O{7 zzta>usJ;n1SczvT+QCez9jv;+hjA#W<75YG9Xpuu*ul&cJJ{~AgW%^aOahzbgh?*w z6-eRtGRw#YwsUhf{9SN*TbR7*V2KS>{Z~T=7$iGLwr}%N_@J(%c17D~Y`T`YHqp4_ zqtv%D{_B4@Y+?DDgKz1ZKFh^A?lTlW<{{%V)|WFws33a>Y?m=3JDcvIs3WtOKKfZE zF7h#R@D_Ayfcb^J zlSJ%&#QZGSz1esN@iQZbv%%NI#m6u|zGj}KbhMd^yC@&$*+-FX9tHMjeztJ4{7YQV z0V|Mc51YPEz{Wj_Hk*(?iqy_q^Fns7qQZ~ow``qj_pJHK$*_6*Et{A90Q`#6+dJFN zWnueEY@Nji5C3tAt+P4eychl4$)CzUbOxgCOkSchG&||{nu(k6 zQ%w9Z#G1awrrzE&dWA|{S0~*Ak2B-&*U)r%D{6Ej$nAosHE}f*D5dD-;Bf30gUdtV zRoT*OaB5nhb1JTBrK9I|aKzIWmAUuDD?CC^{jED9p#dCh6bAF@g z{&3+e{6?qvyRrAian!6QdSZ#L)S&a~=>V0w?Z3V8pXBPR5 z4tlWWik%jYOV6zL!*4`?UqoYY$M_?U4AH8ibT#5k{6^0@{dxUHWYgS0_oDbg`HggE zeC~Ko=Qp|%dNc7GJq$hXnNR&a;XGrB-zeG?b$v#ZPIu-v>J5&{^uHA;r-eZ{3%}6` zEpncq;WxZm-QD0AKQ3yuQBkXI7qzMzL@ksQwaDk;8_D2z!*|nf^*Q_#9DXB-{w6@( zFzGkK??)&7M!TNn*F>Mv^ht>rJm@5LtlMeLX^$2j?bzz09BXWY}(UcJTHYp1rid#Y?#Wj#LoqGD3DyK@b;lF^jbntMOavP(j3R(WRBcm%Qwu{P@!^*?vOrx(~mo%QeMyC;#3)yy&Q-{s+wb&x-4= zyX!8|x{rXX+)CVCwW%Bw%sUS{>;5Zy-GkqoTz5aKGK4u;d#t&&_TWgd{;BW2jCF65 zR8eKAb*ZwFx))xy)|~dl_~F3`b(YiSm6tjBqKJzf|MkOI^Oq+K_?Kwi&HNs&kH#!6 zm42shjvnIr$Wbm`_8TP`?uB~Caz>X9coUX zZ=~BCxf&^PCEJmbd>m&kaU*{S`C=T#hn+leCFK&h5+^?suEhIt`fkV)Z@b?cTT=WG z#)C~0?m<0FUol*b$#|xin|!as)D+*xsG8m(j)QM<1E$Xk^a)P74R)5*;|18gysMM literal 0 HcmV?d00001 diff --git a/benchmarks/opencl/sad/main.cc b/benchmarks/opencl/sad/main.cc new file mode 100644 index 00000000..a156bd3b --- /dev/null +++ b/benchmarks/opencl/sad/main.cc @@ -0,0 +1,545 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "OpenCL_common.h" +#include "file.h" +#include "image.h" +#include "sad.h" +#include "sad_kernel.h" + +static unsigned short *load_sads(char *filename); +static void write_sads(char *filename, int image_width_macroblocks, + int image_height_macroblocks, unsigned short *sads); +static void write_sads_directly(char *filename, int width, int height, + unsigned short *sads); + +/* FILE I/O */ + +unsigned short *load_sads(char *filename) { + FILE *infile; + unsigned short *sads; + int w; + int h; + int sads_per_block; + + infile = fopen(filename, "r"); + + if (!infile) { + fprintf(stderr, "Cannot find file '%s'\n", filename); + exit(-1); + } + + /* Read image dimensions (measured in macroblocks) */ + w = read16u(infile); + h = read16u(infile); + + /* Read SAD values. Only interested in the 4x4 SAD values, which are + * at the end of the file. */ + sads_per_block = MAX_POS_PADDED * (w * h); + fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR); + + sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short)); + fread(sads, sizeof(unsigned short), sads_per_block * 16, infile); + fclose(infile); + + return sads; +} + +/* Compare the reference SADs to the expected SADs. + */ +void check_sads(unsigned short *sads_reference, unsigned short *sads_computed, + int image_size_macroblocks) { + int block; + + /* Check the 4x4 SAD values. These are in sads_reference. + * Ignore the data at the beginning of sads_computed. */ + sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks; + + for (block = 0; block < image_size_macroblocks; block++) { + int subblock; + + for (subblock = 0; subblock < 16; subblock++) { + int sad_index; + + for (sad_index = 0; sad_index < MAX_POS; sad_index++) { + int index = (block * 16 + subblock) * MAX_POS_PADDED + sad_index; + + if (sads_reference[index] != sads_computed[index]) { +#if 0 + /* Print exactly where the mismatch was seen */ + printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]); +#else + goto mismatch; +#endif + } + } + } + } + + printf("Success.\n"); + return; + +mismatch: + printf("Computed SADs do not match expected values.\n"); +} + +/* Extract the SAD data for a particular block type for a particular + * macroblock from the array of SADs of that block type. */ +static inline void write_subblocks(FILE *outfile, + unsigned short *subblock_array, + int macroblock, int count) { + int block; + int pos; + + for (block = 0; block < count; block++) { + unsigned short *vec = + subblock_array + (block + macroblock * count) * MAX_POS_PADDED; + + /* Write all SADs for this sub-block */ + for (pos = 0; pos < MAX_POS; pos++) + write16u(outfile, *vec++); + } +} + +/* Write some SAD data to a file for output checking. + * + * All SAD values for six rows of macroblocks are written. + * The six rows consist of the top two, middle two, and bottom two image rows. + */ +void write_sads(char *filename, int mb_width, int mb_height, + unsigned short *sads) { + FILE *outfile = fopen(filename, "w"); + int mbs = mb_width * mb_height; + int row_indir; + int row_indices[6] = { + 0, 1, mb_height / 2 - 1, mb_height / 2, mb_height - 2, mb_height - 1}; + + if (outfile == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + + /* Write the number of output macroblocks */ + write32u(outfile, mb_width * 6); + + /* Write zeros */ + write32u(outfile, 0); + + /* Each row */ + for (row_indir = 0; row_indir < 6; row_indir++) { + int row = row_indices[row_indir]; + + /* Each block in row */ + int block; + for (block = mb_width * row; block < mb_width * (row + 1); block++) { + int blocktype; + + /* Write SADs for all sub-block types */ + for (blocktype = 1; blocktype <= 7; blocktype++) + write_subblocks(outfile, sads + SAD_TYPE_IX(blocktype, mbs), block, + SAD_TYPE_CT(blocktype)); + } + } + + fclose(outfile); +} + +/* FILE I/O for debugging */ + +static void write_sads_directly(char *filename, int width, int height, + unsigned short *sads) { + FILE *f = fopen(filename, "w"); + int n; + + write16u(f, width); + write16u(f, height); + for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) { + write16u(f, sads[n]); + } + fclose(f); +} + +static void print_test_sad_vector(unsigned short *base, int macroblock, + int count) { + int n; + int searchpos = 17 * 33 + 17; + for (n = 0; n < count; n++) + printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]); +} + +static void print_test_sads(unsigned short *sads_computed, int mbs) { + int macroblock = 5; + int blocktype; + + for (blocktype = 1; blocktype <= 7; blocktype++) { + printf("%d:", blocktype); + print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs), + macroblock, SAD_TYPE_CT(blocktype)); + puts("\n"); + } +} + +/* MAIN */ + +int main(int argc, char **argv) { + struct image_i16 *ref_image; + struct image_i16 *cur_image; + unsigned short *sads_computed; /* SADs generated by the program */ + + int image_size_bytes; + int image_width_macroblocks, image_height_macroblocks; + int image_size_macroblocks; + + struct pb_TimerSet timers; + struct pb_Parameters *params; + + char oclOverhead[] = "OpenCL Overhead"; + + pb_InitializeTimerSet(&timers); + pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); + + params = pb_ReadParameters(&argc, argv); + params->inpFiles = (char **)malloc(sizeof(char *) * 3); + params->inpFiles[0] = (char *)malloc(100); + params->inpFiles[1] = (char *)malloc(100); + params->inpFiles[2] = NULL; + strncpy(params->inpFiles[0], "reference.bin", 100); + strncpy(params->inpFiles[1], "frame.bin", 100); + + if (pb_Parameters_CountInputs(params) != 2) { + fprintf(stderr, "Expecting two input filenames\n"); + exit(-1); + } + + /* Read input files */ + pb_SwitchToTimer(&timers, pb_TimerID_IO); + ref_image = load_image(params->inpFiles[0]); + cur_image = load_image(params->inpFiles[1]); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + printf("Ok\n"); + + if ((ref_image->width != cur_image->width) || + (ref_image->height != cur_image->height)) { + fprintf(stderr, "Input images must be the same size\n"); + exit(-1); + } + if ((ref_image->width % 16) || (ref_image->height % 16)) { + fprintf(stderr, "Input image size must be an integral multiple of 16\n"); + exit(-1); + } + + printf("Ok\n"); + + /* Compute parameters, allocate memory */ + image_size_bytes = ref_image->width * ref_image->height * sizeof(short); + image_width_macroblocks = ref_image->width >> 4; + image_height_macroblocks = ref_image->height >> 4; + image_size_macroblocks = image_width_macroblocks * image_height_macroblocks; + + sads_computed = (unsigned short *)malloc( + 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short)); + + // Run the kernel code + // ************************************************************************ + + cl_int ciErrNum; + cl_command_queue clCommandQueue; + + cl_kernel mb_sad_calc; + cl_kernel larger_sad_calc_8; + cl_kernel larger_sad_calc_16; + + cl_mem imgRef; /* Reference image on the device */ + cl_mem d_cur_image; /* Current image on the device */ + cl_mem d_sads; /* SADs on the device */ + + // x : image_width_macroblocks + // y : image_height_macroblocks + + pb_Context *pb_context; + pb_context = pb_InitOpenCLContext(params); + if (pb_context == NULL) { + fprintf(stderr, "Error: No OpenCL platform/device can be found."); + return -1; + } + + printf("Ok+\n"); + + cl_int clStatus; + cl_device_id clDevice = (cl_device_id)pb_context->clDeviceId; + cl_platform_id clPlatform = (cl_platform_id)pb_context->clPlatformId; + cl_context clContext = (cl_context)pb_context->clContext; + + clCommandQueue = clCreateCommandQueue(clContext, clDevice, + CL_QUEUE_PROFILING_ENABLE, &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + + printf("Ok!\n"); + + pb_SetOpenCL(&clContext, &clCommandQueue); + + printf("Ok!\n"); + + pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); + + // Read Source Code File + /*size_t program_length; +const char* source_path = "src/opencl_base/kernel.cl"; +char* source = oclLoadProgSource(source_path, "", &program_length); +if(!source) { + fprintf(stderr, "Could not load program source\n"); exit(1); +} + + cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char +**)&source, &program_length, &ciErrNum);*/ +printf("Ok//-\n"); + cl_program clProgram = clCreateProgramWithBuiltInKernels( + clContext, 1, &clDevice, "mb_sad_calc;larger_sad_calc_8;larger_sad_calc_16", &ciErrNum); + printf("Ok//+\n"); + OCL_ERRCK_VAR(ciErrNum); + + printf("Ok+\n"); + + //free(source); + + // JIT Compilation Options + char compileOptions[1024]; + // -cl-nv-verbose + sprintf(compileOptions, "\ + -D MAX_POS=%u -D CEIL_POS=%u\ + -D POS_PER_THREAD=%u -D MAX_POS_PADDED=%u\ + -D THREADS_W=%u -D THREADS_H=%u\ + -D SEARCH_RANGE=%u -D SEARCH_DIMENSION=%u\ + \0", + MAX_POS, CEIL(MAX_POS, POS_PER_THREAD), POS_PER_THREAD, + MAX_POS_PADDED, THREADS_W, THREADS_H, SEARCH_RANGE, SEARCH_DIMENSION); + printf("options = %s\n", compileOptions); + + OCL_ERRCK_RETVAL( + clBuildProgram(clProgram, 1, &clDevice, compileOptions, NULL, NULL)); + + /* + char *build_log; + size_t ret_val_size; + OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, + CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) ); + build_log = (char *)malloc(ret_val_size+1); + OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, + CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) ); + + // Null terminate (original writer wasn't sure) + build_log[ret_val_size] = '\0'; + + fprintf(stderr, "%s\n", build_log ); + */ + + mb_sad_calc = clCreateKernel(clProgram, "mb_sad_calc", &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + larger_sad_calc_8 = clCreateKernel(clProgram, "larger_sad_calc_8", &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + larger_sad_calc_16 = + clCreateKernel(clProgram, "larger_sad_calc_16", &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + + size_t wgSize; + size_t comp_wgSize[3]; + cl_ulong localMemSize; + size_t prefwgSizeMult; + cl_ulong privateMemSize; + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + printf("Ok++\n"); + +#if 0 + cl_image_format img_format; + img_format.image_channel_order = CL_R; + img_format.image_channel_data_type = CL_UNSIGNED_INT16; + + /* Transfer reference image to device */ + imgRef = clCreateImage2D(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &img_format, + ref_image->width /** sizeof(unsigned short)*/, // width + ref_image->height, // height + ref_image->width * sizeof(unsigned short), // row_pitch + ref_image->data, &ciErrNum); +#endif + +#if 1 + imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + ref_image->width * ref_image->height * + sizeof(unsigned short), + NULL, &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, imgRef, CL_TRUE, 0, + ref_image->width * ref_image->height * + sizeof(unsigned short), + ref_image->data, 0, NULL, NULL)); +#else + imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + ref_image->width * ref_image->height * + sizeof(unsigned short), + ref_image->data, &ciErrNum); + printf("Allocating %d bytes\n", + ref_image->width * ref_image->height * sizeof(unsigned short)); + +#endif + OCL_ERRCK_VAR(ciErrNum); + + /* Allocate SAD data on the device */ + + unsigned short *tmpZero = (unsigned short *)calloc( + 41 * MAX_POS_PADDED * image_size_macroblocks, sizeof(unsigned short)); + + /* + size_t max_alloc_size = 0; + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(max_alloc_size), &max_alloc_size, NULL); + if (max_alloc_size < (41 * MAX_POS_PADDED * + image_size_macroblocks * sizeof(unsigned short))) { + fprintf(stderr, "Can't allocate sad buffer: max alloc size is %dMB\n", + (int) (max_alloc_size >> 20)); + exit(-1); + } + */ + + d_sads = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, + 41 * MAX_POS_PADDED * image_size_macroblocks * + sizeof(unsigned short), + tmpZero, &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + free(tmpZero); + + d_cur_image = + clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + image_size_bytes, cur_image->data, &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + + /* Set Kernel Parameters */ + + OCL_ERRCK_RETVAL( + clSetKernelArg(mb_sad_calc, 0, sizeof(cl_mem), (void *)&d_sads)); + OCL_ERRCK_RETVAL( + clSetKernelArg(mb_sad_calc, 1, sizeof(cl_mem), (void *)&d_cur_image)); + OCL_ERRCK_RETVAL( + clSetKernelArg(mb_sad_calc, 2, sizeof(int), &image_width_macroblocks)); + OCL_ERRCK_RETVAL( + clSetKernelArg(mb_sad_calc, 3, sizeof(int), &image_height_macroblocks)); + OCL_ERRCK_RETVAL( + clSetKernelArg(mb_sad_calc, 4, sizeof(cl_mem), (void *)&imgRef)); + + OCL_ERRCK_RETVAL( + clSetKernelArg(larger_sad_calc_8, 0, sizeof(cl_mem), (void *)&d_sads)); + OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_8, 1, sizeof(int), + &image_width_macroblocks)); + OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_8, 2, sizeof(int), + &image_height_macroblocks)); + + OCL_ERRCK_RETVAL( + clSetKernelArg(larger_sad_calc_16, 0, sizeof(cl_mem), (void *)&d_sads)); + OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_16, 1, sizeof(int), + &image_width_macroblocks)); + OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_16, 2, sizeof(int), + &image_height_macroblocks)); + + size_t mb_sad_calc_localWorkSize[2] = { + CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H, 1}; + size_t mb_sad_calc_globalWorkSize[2] = { + mb_sad_calc_localWorkSize[0] * CEIL(ref_image->width / 4, THREADS_W), + mb_sad_calc_localWorkSize[1] * CEIL(ref_image->height / 4, THREADS_H)}; + + size_t larger_sad_calc_8_localWorkSize[2] = {32, 4}; + size_t larger_sad_calc_8_globalWorkSize[2] = {image_width_macroblocks * 32, + image_height_macroblocks * 4}; + + size_t larger_sad_calc_16_localWorkSize[2] = {32, 1}; + size_t larger_sad_calc_16_globalWorkSize[2] = {image_width_macroblocks * 32, + image_height_macroblocks * 1}; + + printf("Ok+++\n"); + + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + /* Run the 4x4 kernel */ + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, mb_sad_calc, 2, 0, + mb_sad_calc_globalWorkSize, + mb_sad_calc_localWorkSize, 0, 0, 0)); + + /* Run the larger-blocks kernels */ + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel( + clCommandQueue, larger_sad_calc_8, 2, 0, larger_sad_calc_8_globalWorkSize, + larger_sad_calc_8_localWorkSize, 0, 0, 0)); + + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_16, 2, + 0, larger_sad_calc_16_globalWorkSize, + larger_sad_calc_16_localWorkSize, 0, + 0, 0)); + + OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + /* Transfer SAD data to the host */ + OCL_ERRCK_RETVAL(clEnqueueReadBuffer( + clCommandQueue, d_sads, CL_TRUE, 0, + 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), + sads_computed, 0, NULL, NULL)); + + /* Free GPU memory */ + OCL_ERRCK_RETVAL(clReleaseKernel(larger_sad_calc_8)); + OCL_ERRCK_RETVAL(clReleaseKernel(larger_sad_calc_16)); + OCL_ERRCK_RETVAL(clReleaseProgram(clProgram)); + + OCL_ERRCK_RETVAL(clReleaseMemObject(d_sads)); + OCL_ERRCK_RETVAL(clReleaseMemObject(imgRef)); + OCL_ERRCK_RETVAL(clReleaseMemObject(d_cur_image)); + + OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // ************************************************************************ + // End GPU Code + + /* Print output */ + if (params->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + write_sads(params->outFile, image_width_macroblocks, + image_height_macroblocks, sads_computed); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + } + +#if 0 /* Debugging */ + print_test_sads(sads_computed, image_size_macroblocks); + write_sads_directly("sad-debug.bin", + ref_image->width / 16, ref_image->height / 16, + sads_computed); +#endif + + /* Free memory */ + free(sads_computed); + free_image(ref_image); + free_image(cur_image); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); + + OCL_ERRCK_RETVAL(clReleaseCommandQueue(clCommandQueue)); + OCL_ERRCK_RETVAL(clReleaseContext(clContext)); + + pb_DestroyTimerSet(&timers); + + return 0; +} diff --git a/benchmarks/opencl/sad/ocl.c b/benchmarks/opencl/sad/ocl.c new file mode 100644 index 00000000..9ce9a2f5 --- /dev/null +++ b/benchmarks/opencl/sad/ocl.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include "ocl.h" + +char* readFile(const char* fileName) +{ + FILE* fp; + fp = fopen(fileName,"r"); + if(fp == NULL) + { + printf("Error 1!\n"); + exit(1); + } + + fseek(fp,0,SEEK_END); + long size = ftell(fp); + rewind(fp); + + char* buffer = (char*)malloc(sizeof(char)*(size+1)); + if(buffer == NULL) + { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } + + size_t res = fread(buffer,1,size,fp); + if(res != size) + { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } + + buffer[size] = 0; + fclose(fp); + return buffer; +} + +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) +{ + cl_int clStatus; + char* temp = (char*)malloc(size); + memset(temp,val,size); + clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); +} diff --git a/benchmarks/opencl/sad/ocl.h b/benchmarks/opencl/sad/ocl.h new file mode 100644 index 00000000..8840a868 --- /dev/null +++ b/benchmarks/opencl/sad/ocl.h @@ -0,0 +1,21 @@ +#ifndef __OCLH__ +#define __OCLH__ + +typedef struct { + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; +} OpenCLDeviceProp; + +void clMemSet(cl_command_queue, cl_mem, int, size_t); +char* readFile(const char*); + +#define CHECK_ERROR(errorMessage) \ + if(clStatus != CL_SUCCESS) \ + { \ + printf("Error: %s!\n",errorMessage); \ + printf("Line: %d\n",__LINE__); \ + exit(1); \ + } + +#endif diff --git a/benchmarks/opencl/sad/parboil.c b/benchmarks/opencl/sad/parboil.c new file mode 100644 index 00000000..54fca9d0 --- /dev/null +++ b/benchmarks/opencl/sad/parboil.c @@ -0,0 +1,427 @@ +/* + * (c) 2007 The Board of Trustees of the University of Illinois. + */ + +#include +#include +#include +#include + +#if _POSIX_VERSION >= 200112L +# include +#endif + + +/*****************************************************************************/ +/* Timer routines */ + +static void +accumulate_time(pb_Timestamp *accum, + pb_Timestamp start, + pb_Timestamp end) +{ +#if _POSIX_VERSION >= 200112L + *accum += end - start; +#else +# error "Timestamps not implemented for this system" +#endif +} + +#if _POSIX_VERSION >= 200112L +static pb_Timestamp get_time() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (pb_Timestamp) (tv.tv_sec * 1000000LL + tv.tv_usec); +} +#else +# error "no supported time libraries are available on this platform" +#endif + +void +pb_ResetTimer(struct pb_Timer *timer) +{ + timer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + timer->elapsed = 0; +#else +# error "pb_ResetTimer: not implemented for this system" +#endif +} + +void +pb_StartTimer(struct pb_Timer *timer) +{ + if (timer->state != pb_Timer_STOPPED) { + fputs("Ignoring attempt to start a running timer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif +} + +void +pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) +{ + unsigned int numNotStopped = 0x3; // 11 + if (timer->state != pb_Timer_STOPPED) { + fputs("Warning: Timer was not stopped\n", stderr); + numNotStopped &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_STOPPED) { + fputs("Warning: Subtimer was not stopped\n", stderr); + numNotStopped &= 0x2; // Zero out 2^0 + } + if (numNotStopped == 0x0) { + fputs("Ignoring attempt to start running timer and subtimer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + subtimer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + + if (numNotStopped & 0x2) { + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + + if (numNotStopped & 0x1) { + subtimer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif + +} + +void +pb_StopTimer(struct pb_Timer *timer) +{ + + pb_Timestamp fini; + + if (timer->state != pb_Timer_RUNNING) { + fputs("Ignoring attempt to stop a stopped timer\n", stderr); + return; + } + + timer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + +} + +void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { + + pb_Timestamp fini; + + unsigned int numNotRunning = 0x3; // 0b11 + if (timer->state != pb_Timer_RUNNING) { + fputs("Warning: Timer was not running\n", stderr); + numNotRunning &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_RUNNING) { + fputs("Warning: Subtimer was not running\n", stderr); + numNotRunning &= 0x2; // Zero out 2^0 + } + if (numNotRunning == 0x0) { + fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr); + return; + } + + + timer->state = pb_Timer_STOPPED; + subtimer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + if (numNotRunning & 0x2) { + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + } + + if (numNotRunning & 0x1) { + accumulate_time(&subtimer->elapsed, subtimer->init, fini); + subtimer->init = fini; + } + +} + +/* Get the elapsed time in seconds. */ +double +pb_GetElapsedTime(struct pb_Timer *timer) +{ + double ret; + + if (timer->state != pb_Timer_STOPPED) { + fputs("Elapsed time from a running timer is inaccurate\n", stderr); + } + +#if _POSIX_VERSION >= 200112L + ret = timer->elapsed / 1e6; +#else +# error "pb_GetElapsedTime: not implemented for this system" +#endif + return ret; +} + +void +pb_InitializeTimerSet(struct pb_TimerSet *timers) +{ + int n; + + timers->wall_begin = get_time(); + + timers->current = pb_TimerID_NONE; + + timers->async_markers = NULL; + + + for (n = 0; n < pb_TimerID_LAST; n++) { + pb_ResetTimer(&timers->timers[n]); + timers->sub_timer_list[n] = NULL; // free first? + } +} + +void +pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) { + + struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc + (sizeof(struct pb_SubTimer)); + + int len = strlen(label); + + subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + sprintf(subtimer->label, "%s\0", label); + + pb_ResetTimer(&subtimer->timer); + subtimer->next = NULL; + + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category]; + if (subtimerlist == NULL) { + subtimerlist = (struct pb_SubTimerList *) malloc + (sizeof(struct pb_SubTimerList)); + subtimerlist->subtimer_list = subtimer; + timers->sub_timer_list[pb_Category] = subtimerlist; + } else { + // Append to list + struct pb_SubTimer *element = subtimerlist->subtimer_list; + while (element->next != NULL) { + element = element->next; + } + element->next = subtimer; + } + +} + +void +pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) +{ + +// switchToSub( NULL, NONE +// switchToSub( NULL, some +// switchToSub( some, some +// switchToSub( some, NONE -- tries to find "some" in NONE's sublist, which won't be printed + + struct pb_Timer *topLevelToStop = NULL; + if (timers->current != category && timers->current != pb_TimerID_NONE) { + // Switching to subtimer in a different category needs to stop the top-level current, different categoried timer. + // NONE shouldn't have a timer associated with it, so exclude from branch + topLevelToStop = &timers->timers[timers->current]; + } + + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct pb_SubTimer *curr = (subtimerlist == NULL) ? NULL : subtimerlist->current; + + if (timers->current != pb_TimerID_NONE) { + if (curr != NULL && topLevelToStop != NULL) { + pb_StopTimerAndSubTimer(topLevelToStop, &curr->timer); + } else if (curr != NULL) { + pb_StopTimer(&curr->timer); + } else { + pb_StopTimer(topLevelToStop); + } + } + + subtimerlist = timers->sub_timer_list[category]; + struct pb_SubTimer *subtimer = NULL; + + if (label != NULL) { + subtimer = subtimerlist->subtimer_list; + while (subtimer != NULL) { + if (strcmp(subtimer->label, label) == 0) { + break; + } else { + subtimer = subtimer->next; + } + } + } + + if (category != pb_TimerID_NONE) { + + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + if (category != timers->current && subtimer != NULL) { + pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); + } else if (subtimer != NULL) { + // Same category, different non-NULL subtimer + pb_StartTimer(&subtimer->timer); + } else{ + // Different category, but no subtimer (not found or specified as NULL) -- unprefered way of setting topLevel timer + pb_StartTimer(&timers->timers[category]); + } + } + + timers->current = category; + +} + +void +pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) +{ + /* Stop the currently running timer */ + /*if (timers->current != pb_TimerID_NONE) { + struct pb_SubTimer *currSubTimer = NULL; + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + + if ( subtimerlist != NULL) { + currSubTimer = timers->sub_timer_list[timers->current]->current; + } + if ( currSubTimer!= NULL) { + pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + } else { + pb_StopTimer(&timers->timers[timers->current]); + } + } + + timers->current = timer; + + if (timer != pb_TimerID_NONE) { + pb_StartTimer(&timers->timers[timer]); + }*/ +} + +void +pb_PrintTimerSet(struct pb_TimerSet *timers) +{ + + pb_Timestamp wall_end = get_time(); + + struct pb_Timer *t = timers->timers; + struct pb_SubTimer* sub = NULL; + + int maxSubLength; + + const char *categories[] = { + "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" + }; + + const int maxCategoryLength = 10; + + int i; + for(i = 1; i < pb_TimerID_LAST-1; ++i) { // exclude NONE and OVRELAP from this format + if(pb_GetElapsedTime(&t[i]) != 0) { + + // Print Category Timer + printf("%-*s: %f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i])); + + if (timers->sub_timer_list[i] != NULL) { + sub = timers->sub_timer_list[i]->subtimer_list; + maxSubLength = 0; + while (sub != NULL) { + // Find longest SubTimer label + if (strlen(sub->label) > maxSubLength) { + maxSubLength = strlen(sub->label); + } + sub = sub->next; + } + + // Fit to Categories + if (maxSubLength <= maxCategoryLength) { + maxSubLength = maxCategoryLength; + } + + sub = timers->sub_timer_list[i]->subtimer_list; + + // Print SubTimers + while (sub != NULL) { + printf(" -%-*s: %f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer)); + sub = sub->next; + } + } + } + } + + if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); + + float walltime = (wall_end - timers->wall_begin)/ 1e6; + printf("Timer Wall Time: %f\n", walltime); + +} + +void pb_DestroyTimerSet(struct pb_TimerSet * timers) +{ + /* clean up all of the async event markers */ + struct pb_async_time_marker_list ** event = &(timers->async_markers); + while( *event != NULL) { + struct pb_async_time_marker_list ** next = &((*event)->next); + free(*event); + (*event) = NULL; + event = next; + } + + int i = 0; + for(i = 0; i < pb_TimerID_LAST; ++i) { + if (timers->sub_timer_list[i] != NULL) { + struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; + struct pb_SubTimer *prev = NULL; + while (subtimer != NULL) { + free(subtimer->label); + prev = subtimer; + subtimer = subtimer->next; + free(prev); + } + free(timers->sub_timer_list[i]); + } + } +} + + diff --git a/benchmarks/opencl/sad/parboil.h b/benchmarks/opencl/sad/parboil.h new file mode 100644 index 00000000..4c9a8b5e --- /dev/null +++ b/benchmarks/opencl/sad/parboil.h @@ -0,0 +1,348 @@ +/* + * (c) 2010 The Board of Trustees of the University of Illinois. + */ +#ifndef PARBOIL_HEADER +#define PARBOIL_HEADER + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* A platform as specified by the user on the command line */ +struct pb_PlatformParam { + char *name; /* The platform name. This string is owned. */ + char *version; /* The platform version; may be NULL. + * This string is owned. */ +}; + +/* Create a PlatformParam from the given strings. + * 'name' must not be NULL. 'version' may be NULL. + * If not NULL, the strings should have been allocated by malloc(), + * and they will be owned by the returned object. + */ +struct pb_PlatformParam * +pb_PlatformParam(char *name, char *version); + +void +pb_FreePlatformParam(struct pb_PlatformParam *); + +/* A criterion for how to select a device */ +enum pb_DeviceSelectionCriterion { + pb_Device_INDEX, /* Enumerate the devices and select one + * by its number */ + pb_Device_CPU, /* Select a CPU device */ + pb_Device_GPU, /* Select a GPU device */ + pb_Device_ACCELERATOR, /* Select an accelerator device */ + pb_Device_NAME /* Select a device by name */ +}; + +/* A device as specified by the user on the command line */ +struct pb_DeviceParam { + enum pb_DeviceSelectionCriterion criterion; + union { + int index; /* If criterion == pb_Device_INDEX, + * the index of the device */ + char *name; /* If criterion == pb_Device_NAME, + * the name of the device. + * This string is owned. */ + }; +}; + +struct pb_DeviceParam * +pb_DeviceParam_index(int index); + +struct pb_DeviceParam * +pb_DeviceParam_cpu(void); + +struct pb_DeviceParam * +pb_DeviceParam_gpu(void); + +struct pb_DeviceParam * +pb_DeviceParam_accelerator(void); + +/* Create a by-name device selection criterion. + * The string should have been allocated by malloc(), and it will will be + * owned by the returned object. + */ +struct pb_DeviceParam * +pb_DeviceParam_name(char *name); + +void +pb_FreeDeviceParam(struct pb_DeviceParam *); + +/* Command line parameters for benchmarks */ +struct pb_Parameters { + char *outFile; /* If not NULL, the raw output of the + * computation should be saved to this + * file. The string is owned. */ + char **inpFiles; /* A NULL-terminated array of strings + * holding the input file(s) for the + * computation. The array and strings + * are owned. */ + struct pb_PlatformParam *platform; /* If not NULL, the platform + * specified on the command line. */ + struct pb_DeviceParam *device; /* If not NULL, the device + * specified on the command line. */ +}; + +/* Read command-line parameters. + * + * The argc and argv parameters to main are read, and any parameters + * interpreted by this function are removed from the argument list. + * + * A new instance of struct pb_Parameters is returned. + * If there is an error, then an error message is printed on stderr + * and NULL is returned. + */ +struct pb_Parameters * +pb_ReadParameters(int *_argc, char **argv); + +/* Free an instance of struct pb_Parameters. + */ +void +pb_FreeParameters(struct pb_Parameters *p); + +void +pb_FreeStringArray(char **); + +/* Count the number of input files in a pb_Parameters instance. + */ +int +pb_Parameters_CountInputs(struct pb_Parameters *p); + +/* A time or duration. */ +//#if _POSIX_VERSION >= 200112L +typedef unsigned long long pb_Timestamp; /* time in microseconds */ +//#else +//# error "Timestamps not implemented" +//#endif + +enum pb_TimerState { + pb_Timer_STOPPED, + pb_Timer_RUNNING, +}; + +struct pb_Timer { + enum pb_TimerState state; + pb_Timestamp elapsed; /* Amount of time elapsed so far */ + pb_Timestamp init; /* Beginning of the current time interval, + * if state is RUNNING. End of the last + * recorded time interfal otherwise. */ +}; + +/* Reset a timer. + * Use this to initialize a timer or to clear + * its elapsed time. The reset timer is stopped. + */ +void +pb_ResetTimer(struct pb_Timer *timer); + +/* Start a timer. The timer is set to RUNNING mode and + * time elapsed while the timer is running is added to + * the timer. + * The timer should not already be running. + */ +void +pb_StartTimer(struct pb_Timer *timer); + +/* Stop a timer. + * This stops adding elapsed time to the timer. + * The timer should not already be stopped. + */ +void +pb_StopTimer(struct pb_Timer *timer); + +/* Get the elapsed time in seconds. */ +double +pb_GetElapsedTime(struct pb_Timer *timer); + +/* Execution time is assigned to one of these categories. */ +enum pb_TimerID { + pb_TimerID_NONE = 0, + pb_TimerID_IO, /* Time spent in input/output */ + pb_TimerID_KERNEL, /* Time spent computing on the device, + * recorded asynchronously */ + pb_TimerID_COPY, /* Time spent synchronously moving data + * to/from device and allocating/freeing + * memory on the device */ + pb_TimerID_DRIVER, /* Time spent in the host interacting with the + * driver, primarily for recording the time + * spent queueing asynchronous operations */ + pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ + pb_TimerID_COMPUTE, /* Time for all program execution other + * than parsing command line arguments, + * I/O, kernel, and copy */ + pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and + * host activity: automatically filled in, + * not intended for direct usage */ + pb_TimerID_LAST /* Number of timer IDs */ +}; + +/* Dynamic list of asynchronously tracked times between events */ +struct pb_async_time_marker_list { + char *label; // actually just a pointer to a string + enum pb_TimerID timerID; /* The ID to which the interval beginning + * with this marker should be attributed */ + void * marker; + //cudaEvent_t marker; /* The driver event for this marker */ + struct pb_async_time_marker_list *next; +}; + +struct pb_SubTimer { + char *label; + struct pb_Timer timer; + struct pb_SubTimer *next; +}; + +struct pb_SubTimerList { + struct pb_SubTimer *current; + struct pb_SubTimer *subtimer_list; +}; + +/* A set of timers for recording execution times. */ +struct pb_TimerSet { + enum pb_TimerID current; + struct pb_async_time_marker_list* async_markers; + pb_Timestamp async_begin; + pb_Timestamp wall_begin; + struct pb_Timer timers[pb_TimerID_LAST]; + struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST]; +}; + +/* Reset all timers in the set. */ +void +pb_InitializeTimerSet(struct pb_TimerSet *timers); + +void +pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category); + +/* Select which timer the next interval of time should be accounted + * to. The selected timer is started and other timers are stopped. + * Using pb_TimerID_NONE stops all timers. */ +void +pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer); + +void +pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category); + +/* Print timer values to standard output. */ +void +pb_PrintTimerSet(struct pb_TimerSet *timers); + +/* Release timer resources */ +void +pb_DestroyTimerSet(struct pb_TimerSet * timers); + +void +pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr); + + +typedef struct pb_Device_tag { + char* name; + void* clDevice; + int id; + unsigned int in_use; + unsigned int available; +} pb_Device; + +struct pb_Context_tag; +typedef struct pb_Context_tag pb_Context; + +typedef struct pb_Platform_tag { + char* name; + char* version; + void* clPlatform; + unsigned int in_use; + pb_Context** contexts; + pb_Device** devices; +} pb_Platform; + +struct pb_Context_tag { + void* clPlatformId; + void* clContext; + void* clDeviceId; + pb_Platform* pb_platform; + pb_Device* pb_device; +}; + +// verbosely print out list of platforms and their devices to the console. +pb_Platform** +pb_GetPlatforms(); + +// Choose a platform according to the given platform specification +pb_Platform* +pb_GetPlatform(struct pb_PlatformParam *platform); + +// choose a platform: by name, name & version +pb_Platform* +pb_GetPlatformByName(const char* name); + +pb_Platform* +pb_GetPlatformByNameAndVersion(const char* name, const char* version); + +// Choose a device according to the given device specification +pb_Device* +pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device); + +pb_Device** +pb_GetDevices(pb_Platform* pb_platform); + +// choose a device by name. +pb_Device* +pb_GetDeviceByName(pb_Platform* pb_platform, const char* name); + +pb_Platform* +pb_GetPlatformByEnvVars(); + +pb_Context* +pb_InitOpenCLContext(struct pb_Parameters* parameters); + +void +pb_ReleasePlatforms(); + +void +pb_ReleaseContext(pb_Context* c); + +void +pb_PrintPlatformInfo(pb_Context* c); + +void +perf_init(); + +//#define MEASURE_KERNEL_TIME + +#include + +#ifdef MEASURE_KERNEL_TIME +#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c)) +cl_int +pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); +#endif + +enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR }; +void pb_sig_float(char*, float*, int); +void pb_sig_double(char*, double*, int); +void pb_sig_short(char*, short*, int); +void pb_sig_int(char*, int*, int); +void pb_sig_uchar(char*, unsigned char*, unsigned int); +void pb_sig_clmem(char*, cl_command_queue, cl_mem, int); + +#ifdef __cplusplus +} +#endif + +#endif //PARBOIL_HEADER + diff --git a/benchmarks/opencl/sad/parboil_opencl.c b/benchmarks/opencl/sad/parboil_opencl.c new file mode 100644 index 00000000..a4db1680 --- /dev/null +++ b/benchmarks/opencl/sad/parboil_opencl.c @@ -0,0 +1,1394 @@ +/* + * (c) 2007 The Board of Trustees of the University of Illinois. + */ + +#include +#include +#include +#include +#include +#include + +#if _POSIX_VERSION >= 200112L +# include +#endif + +//#include "perfmon.h" + +cl_context *clContextPtr; +cl_command_queue *clCommandQueuePtr; + +// #define DISABLE_PARBOIL_TIMER + +/*****************************************************************************/ +/* Timer routines */ + +static int is_async(enum pb_TimerID timer) +{ + return (timer == pb_TimerID_KERNEL) || + (timer == pb_TimerID_COPY_ASYNC); +} + +static int is_blocking(enum pb_TimerID timer) +{ + return (timer == pb_TimerID_COPY) || (timer == pb_TimerID_NONE); +} + +#define INVALID_TIMERID pb_TimerID_LAST + +static int asyncs_outstanding(struct pb_TimerSet* timers) +{ + return (timers->async_markers != NULL) && + (timers->async_markers->timerID != INVALID_TIMERID); +} + +static struct pb_async_time_marker_list * +get_last_async(struct pb_TimerSet* timers) +{ + /* Find the last event recorded thus far */ + struct pb_async_time_marker_list * last_event = timers->async_markers; + if(last_event != NULL && last_event->timerID != INVALID_TIMERID) { + while(last_event->next != NULL && + last_event->next->timerID != INVALID_TIMERID) + last_event = last_event->next; + return last_event; + } else + return NULL; +} + +static void insert_marker(struct pb_TimerSet* tset, enum pb_TimerID timer) +{ + cl_int ciErrNum = CL_SUCCESS; + struct pb_async_time_marker_list ** new_event = &(tset->async_markers); + + while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + new_event = &((*new_event)->next); + } + + if(*new_event == NULL) { + *new_event = (struct pb_async_time_marker_list *) + malloc(sizeof(struct pb_async_time_marker_list)); + (*new_event)->marker = calloc(1, sizeof(cl_event)); + /* + // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event +#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) +fprintf(stderr, "Creating Marker [%d]\n", timer); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Creating User Event Object!\n"); + } + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Setting User Event Status!\n"); + } +#endif +*/ + (*new_event)->next = NULL; + } + + /* valid event handle now aquired: insert the event record */ + (*new_event)->label = NULL; + (*new_event)->timerID = timer; + ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Enqueueing Marker!\n"); + } + +} + +static void insert_submarker(struct pb_TimerSet* tset, char *label, enum pb_TimerID timer) +{ + cl_int ciErrNum = CL_SUCCESS; + struct pb_async_time_marker_list ** new_event = &(tset->async_markers); + + while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + new_event = &((*new_event)->next); + } + + if(*new_event == NULL) { + *new_event = (struct pb_async_time_marker_list *) + malloc(sizeof(struct pb_async_time_marker_list)); + (*new_event)->marker = calloc(1, sizeof(cl_event)); + /* +#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) +fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Creating User Event Object!\n"); + } + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Setting User Event Status!\n"); + } +#endif +*/ + (*new_event)->next = NULL; + } + + /* valid event handle now aquired: insert the event record */ + (*new_event)->label = label; + (*new_event)->timerID = timer; + ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Enqueueing Marker!\n"); + } + +} + + +/* Assumes that all recorded events have completed */ +static pb_Timestamp record_async_times(struct pb_TimerSet* tset) +{ + struct pb_async_time_marker_list * next_interval = NULL; + struct pb_async_time_marker_list * last_marker = get_last_async(tset); + pb_Timestamp total_async_time = 0; + enum pb_TimerID timer; + + for(next_interval = tset->async_markers; next_interval != last_marker; + next_interval = next_interval->next) { + cl_ulong command_start=0, command_end=0; + cl_int ciErrNum = CL_SUCCESS; + + ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum); + } + + ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum); + } + + pb_Timestamp interval = (pb_Timestamp) (((double)(command_end - command_start)) / 1e3); + tset->timers[next_interval->timerID].elapsed += interval; + if (next_interval->label != NULL) { + struct pb_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list; + while (subtimer != NULL) { + if ( strcmp(subtimer->label, next_interval->label) == 0) { + subtimer->timer.elapsed += interval; + break; + } + subtimer = subtimer->next; + } + } + total_async_time += interval; + next_interval->timerID = INVALID_TIMERID; + } + + if(next_interval != NULL) + next_interval->timerID = INVALID_TIMERID; + + return total_async_time; +} + +static void +accumulate_time(pb_Timestamp *accum, + pb_Timestamp start, + pb_Timestamp end) +{ +//#if _POSIX_VERSION >= 200112L + *accum += end - start; +//#else +//# error "Timestamps not implemented for this system" +//#endif +} + +//#if _POSIX_VERSION >= 200112L +static pb_Timestamp get_time() +{ + //struct timeval tv; + //gettimeofday(&tv, NULL); + //return (pb_Timestamp) (tv.tv_sec * 1000000LL + tv.tv_usec); + return 0; +} +//#else +//# error "no supported time libraries are available on this platform" +//#endif + +void +pb_ResetTimer(struct pb_Timer *timer) +{ +//#ifndef DISABLE_PARBOIL_TIMER + timer->state = pb_Timer_STOPPED; + +//#if _POSIX_VERSION >= 200112L + timer->elapsed = 0; +//#else +//# error "pb_ResetTimer: not implemented for this system" +//#endif +//#endif +} + +void +pb_StartTimer(struct pb_Timer *timer) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + if (timer->state != pb_Timer_STOPPED) { + fputs("Ignoring attempt to start a running timer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif +#endif*/ +} + +void +pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + + unsigned int numNotStopped = 0x3; // 11 + if (timer->state != pb_Timer_STOPPED) { + fputs("Warning: Timer was not stopped\n", stderr); + numNotStopped &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_STOPPED) { + fputs("Warning: Subtimer was not stopped\n", stderr); + numNotStopped &= 0x2; // Zero out 2^0 + } + if (numNotStopped == 0x0) { + fputs("Ignoring attempt to start running timer and subtimer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + subtimer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + + if (numNotStopped & 0x2) { + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + + if (numNotStopped & 0x1) { + subtimer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif + +#endif*/ +} + +void +pb_StopTimer(struct pb_Timer *timer) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + + pb_Timestamp fini; + + if (timer->state != pb_Timer_RUNNING) { + fputs("Ignoring attempt to stop a stopped timer\n", stderr); + return; + } + + timer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + +#endif*/ +} + +void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { +/*#ifndef DISABLE_PARBOIL_TIMER + + pb_Timestamp fini; + + unsigned int numNotRunning = 0x3; // 11 + if (timer->state != pb_Timer_RUNNING) { + fputs("Warning: Timer was not running\n", stderr); + numNotRunning &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_RUNNING) { + fputs("Warning: Subtimer was not running\n", stderr); + numNotRunning &= 0x2; // Zero out 2^0 + } + if (numNotRunning == 0x0) { + fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr); + return; + } + + + timer->state = pb_Timer_STOPPED; + subtimer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + if (numNotRunning & 0x2) { + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + } + + if (numNotRunning & 0x1) { + accumulate_time(&subtimer->elapsed, subtimer->init, fini); + subtimer->init = fini; + } + +#endif*/ +} + +/* Get the elapsed time in seconds. */ +double +pb_GetElapsedTime(struct pb_Timer *timer) +{ + /*double ret; +#ifndef DISABLE_PARBOIL_TIMER + + if (timer->state != pb_Timer_STOPPED) { + fputs("Elapsed time from a running timer is inaccurate\n", stderr); + } + +#if _POSIX_VERSION >= 200112L + ret = timer->elapsed / 1e6; +#else +# error "pb_GetElapsedTime: not implemented for this system" +#endif +#endif + return ret;*/ + return 0; +} + +void +pb_InitializeTimerSet(struct pb_TimerSet *timers) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + int n; + + timers->wall_begin = 0; //get_time(); + timers->current = pb_TimerID_NONE; + + timers->async_markers = NULL; + + for (n = 0; n < pb_TimerID_LAST; n++) { + pb_ResetTimer(&timers->timers[n]); + timers->sub_timer_list[n] = NULL; + } +#endif*/ +} + +void pb_SetOpenCL(void *p_clContextPtr, void *p_clCommandQueuePtr) { + clContextPtr = ((cl_context *)p_clContextPtr); + clCommandQueuePtr = ((cl_command_queue *)p_clCommandQueuePtr); +} + +void +pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) { +/*#ifndef DISABLE_PARBOIL_TIMER + + struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc + (sizeof(struct pb_SubTimer)); + + int len = strlen(label); + + subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + sprintf(subtimer->label, "%s\0", label); + + pb_ResetTimer(&subtimer->timer); + subtimer->next = NULL; + + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category]; + if (subtimerlist == NULL) { + subtimerlist = (struct pb_SubTimerList *) calloc + (1, sizeof(struct pb_SubTimerList)); + subtimerlist->subtimer_list = subtimer; + timers->sub_timer_list[pb_Category] = subtimerlist; + } else { + // Append to list + struct pb_SubTimer *element = subtimerlist->subtimer_list; + while (element->next != NULL) { + element = element->next; + } + element->next = subtimer; + } + +#endif*/ +} + +void +pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) +{ +#if 0 +#ifndef DISABLE_PARBOIL_TIMER + + /* Stop the currently running timer */ + if (timers->current != pb_TimerID_NONE) { + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct pb_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL; + + if (!is_async(timers->current) ) { + if (timers->current != timer) { + if (currSubTimer != NULL) { + pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + } else { + pb_StopTimer(&timers->timers[timers->current]); + } + } else { + if (currSubTimer != NULL) { + pb_StopTimer(&currSubTimer->timer); + } + } + } else { + insert_marker(timers, timer); + if (!is_async(timer)) { // if switching to async too, keep driver going + pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + + pb_Timestamp currentTime = 0; //get_time(); + + /* The only cases we check for asynchronous task completion is + * when an overlapping CPU operation completes, or the next + * segment blocks on completion of previous async operations */ + if( asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(timer) ) ) { + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + /* CL_COMPLETE if completed */ + + cl_int ciErrNum = CL_SUCCESS; + cl_int async_done = CL_COMPLETE; + + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Querying EventInfo!\n"); + } + + + if(is_blocking(timer)) { + /* Async operations completed after previous CPU operations: + * overlapped time is the total CPU time since this set of async + * operations were first issued */ + + // timer to switch to is COPY or NONE + if(async_done != CL_COMPLETE) { + accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), + timers->async_begin,currentTime); + } + + /* Wait on async operation completion */ + ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Waiting for Events!\n"); + } + + pb_Timestamp total_async_time = record_async_times(timers); + + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + if(async_done == CL_COMPLETE) { + //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time); + timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time; + } + + } else + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding + // so something is deeper in stack + if(async_done == CL_COMPLETE ) { + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers); + } + } + + /* Start the new timer */ + if (timer != pb_TimerID_NONE) { + if(!is_async(timer)) { + pb_StartTimer(&timers->timers[timer]); + } else { + // toSwitchTo Is Async (KERNEL/COPY_ASYNC) + if (!asyncs_outstanding(timers)) { + /* No asyncs outstanding, insert a fresh async marker */ + + insert_marker(timers, timer); + timers->async_begin = currentTime; + } else if(!is_async(timers->current)) { + /* Previous asyncs still in flight, but a previous SwitchTo + * already marked the end of the most recent async operation, + * so we can rename that marker as the beginning of this async + * operation */ + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + last_event->label = NULL; + last_event->timerID = timer; + } + if (!is_async(timers->current)) { + pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + timers->current = timer; + +#endif +#endif +} + +void +pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) +{ +#if 0 +#ifndef DISABLE_PARBOIL_TIMER + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct pb_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL; + + if (timers->current != pb_TimerID_NONE) { + if (!is_async(timers->current) ) { + if (timers->current != category) { + if (curr != NULL) { + pb_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer); + } else { + pb_StopTimer(&timers->timers[timers->current]); + } + } else { + if (curr != NULL) { + pb_StopTimer(&curr->timer); + } + } + } else { + insert_submarker(timers, label, category); + if (!is_async(category)) { // if switching to async too, keep driver going + pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + + pb_Timestamp currentTime = 0; //get_time(); + + /* The only cases we check for asynchronous task completion is + * when an overlapping CPU operation completes, or the next + * segment blocks on completion of previous async operations */ + if( asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(category) ) ) { + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + /* CL_COMPLETE if completed */ + + cl_int ciErrNum = CL_SUCCESS; + cl_int async_done = CL_COMPLETE; + + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Querying EventInfo!\n"); + } + + if(is_blocking(category)) { + /* Async operations completed after previous CPU operations: + * overlapped time is the total CPU time since this set of async + * operations were first issued */ + + // timer to switch to is COPY or NONE + // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP + // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization + // it seems that the extra sync wall time isn't being recorded anywhere + if(async_done != CL_COMPLETE) + accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), + timers->async_begin,currentTime); + + /* Wait on async operation completion */ + ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Waiting for Events!\n"); + } + pb_Timestamp total_async_time = record_async_times(timers); + + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + // If it did finish, then accumulate all the async time that did happen into OVERLAP + // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed. + if(async_done == CL_COMPLETE /*cudaSuccess*/) + timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time; + + } else + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding + // so something is deeper in stack + if(async_done == CL_COMPLETE /*cudaSuccess*/) { + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers); + } + // else, this isn't blocking, so just check the next time around + } + + subtimerlist = timers->sub_timer_list[category]; + struct pb_SubTimer *subtimer = NULL; + + if (label != NULL) { + subtimer = subtimerlist->subtimer_list; + while (subtimer != NULL) { + if (strcmp(subtimer->label, label) == 0) { + break; + } else { + subtimer = subtimer->next; + } + } + } + + /* Start the new timer */ + if (category != pb_TimerID_NONE) { + if(!is_async(category)) { + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + if (category != timers->current && subtimer != NULL) { + pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); + } else if (subtimer != NULL) { + pb_StartTimer(&subtimer->timer); + } else { + pb_StartTimer(&timers->timers[category]); + } + } else { + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + // toSwitchTo Is Async (KERNEL/COPY_ASYNC) + if (!asyncs_outstanding(timers)) { + /* No asyncs outstanding, insert a fresh async marker */ + insert_submarker(timers, label, category); + timers->async_begin = currentTime; + } else if(!is_async(timers->current)) { + /* Previous asyncs still in flight, but a previous SwitchTo + * already marked the end of the most recent async operation, + * so we can rename that marker as the beginning of this async + * operation */ + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + last_event->timerID = category; + last_event->label = label; + } // else, marker for switchToThis was already inserted + + //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running + if (!is_async(timers->current)) { + pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + + timers->current = category; +#endif +#endif +} + +void +pb_PrintTimerSet(struct pb_TimerSet *timers) +{ +#if 0 +#ifndef DISABLE_PARBOIL_TIMER + pb_Timestamp wall_end = 0; //get_time(); + + struct pb_Timer *t = timers->timers; + struct pb_SubTimer* sub = NULL; + + int maxSubLength; + + const char *categories[] = { + "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" + }; + + const int maxCategoryLength = 10; + + int i; + for(i = 1; i < pb_TimerID_LAST-1; ++i) { // exclude NONE and OVRELAP from this format + if(pb_GetElapsedTime(&t[i]) != 0) { + + // Print Category Timer + printf("%-*s: %f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i])); + + if (timers->sub_timer_list[i] != NULL) { + sub = timers->sub_timer_list[i]->subtimer_list; + maxSubLength = 0; + while (sub != NULL) { + // Find longest SubTimer label + if (strlen(sub->label) > maxSubLength) { + maxSubLength = strlen(sub->label); + } + sub = sub->next; + } + + // Fit to Categories + if (maxSubLength <= maxCategoryLength) { + maxSubLength = maxCategoryLength; + } + + sub = timers->sub_timer_list[i]->subtimer_list; + + // Print SubTimers + while (sub != NULL) { + printf(" -%-*s: %f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer)); + sub = sub->next; + } + } + } + } + + if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); + + float walltime = (wall_end - timers->wall_begin)/ 1e6; + printf("Timer Wall Time: %f\n", walltime); + +#endif +#endif +} + +void pb_DestroyTimerSet(struct pb_TimerSet * timers) +{ +#ifndef DISABLE_PARBOIL_TIMER + /* clean up all of the async event markers */ + struct pb_async_time_marker_list* event = timers->async_markers; + while(event != NULL) { + + cl_int ciErrNum = CL_SUCCESS; + ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker); + if (ciErrNum != CL_SUCCESS) { + //fprintf(stderr, "Error Waiting for Events!\n"); + } + + ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) ); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Release Events!\n"); + } + + free((event)->marker); + struct pb_async_time_marker_list* next = ((event)->next); + + free(event); + + // (*event) = NULL; + event = next; + } + + int i = 0; + for(i = 0; i < pb_TimerID_LAST; ++i) { + if (timers->sub_timer_list[i] != NULL) { + struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; + struct pb_SubTimer *prev = NULL; + while (subtimer != NULL) { + free(subtimer->label); + prev = subtimer; + subtimer = subtimer->next; + free(prev); + } + free(timers->sub_timer_list[i]); + } + } +#endif +} + +static pb_Platform** ptr = NULL; + +// verbosely print out list of platforms and their devices to the console. +pb_Platform** +pb_GetPlatforms() { + if (ptr == NULL) { + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + if (num_platforms == 0) return NULL; + + ptr = (pb_Platform **) malloc(sizeof(pb_Platform *) * (num_platforms + 1)); + cl_platform_id* ids = (cl_platform_id *) malloc(num_platforms * sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, ids, NULL); + + unsigned int i; + for (i = 0; i < num_platforms; i++) { + ptr[i] = (pb_Platform *) malloc(sizeof(pb_Platform)); + ptr[i]->clPlatform = ids[i]; + ptr[i]->contexts = NULL; + ptr[i]->in_use = 0; + ptr[i]->devices = NULL; + + size_t sz; + clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, 0, NULL, &sz); + char* name = (char *) malloc(sz + 1); + clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, sz, name, NULL); + name[sz] = '\0'; + ptr[i]->name = name; + + clGetPlatformInfo(ids[i], CL_PLATFORM_VERSION, 0, NULL, &sz); + char* version = (char *) malloc(sz + 1); + clGetPlatformInfo(ids[i], CL_PLATFORM_VERSION, sz, version, NULL); + version[sz] = '\0'; + ptr[i]->version = version; + } + ptr[i] = NULL; + + free(ids); + } + + return (pb_Platform**) ptr; +} + +pb_Context* +createContext(pb_Platform* pb_platform, pb_Device* pb_device) { + pb_Context* c = (pb_Context*) malloc(sizeof(pb_Context)); + cl_int clStatus; + cl_context_properties clCps[3] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)(pb_platform->clPlatform), 0 + }; + c->clContext = + clCreateContext(clCps, 1, (cl_device_id*)&pb_device->clDevice, NULL, NULL, &clStatus); + c->clPlatformId = pb_platform->clPlatform; + c->clDeviceId = pb_device->clDevice; + c->pb_platform = pb_platform; + c->pb_device = pb_device; + pb_platform->in_use = 1; + pb_device->in_use = 1; + unsigned int i = 0; + if (pb_platform->contexts == NULL) { + pb_platform->contexts = (pb_Context**) malloc(2*sizeof(pb_Context*)); + } else { + for (i = 0; pb_platform->contexts[i] != NULL; i++) {}; + pb_platform->contexts = (pb_Context**) realloc(pb_platform->contexts, + (i+1)*sizeof(pb_Context*)); + } + pb_platform->contexts[i+1] = NULL; + pb_platform->contexts[i] = c; + return c; +} + +// choose a platform by name. +pb_Platform* +pb_GetPlatformByName(const char* name) { + pb_Platform** ps = (pb_Platform **) pb_GetPlatforms(); + if (ps == NULL) return NULL; + if (name == NULL) { + return *ps; + } + + while (*ps) { + if (strstr((*ps)->name, name)) break; + ps++; + } + return (pb_Platform*) *ps; +} + +pb_Device** +pb_GetDevices(pb_Platform* pb_platform) { + if (pb_platform->devices == NULL) { + cl_uint num_devs; + cl_device_id* dev_ids; + clGetDeviceIDs((cl_platform_id) pb_platform->clPlatform, + CL_DEVICE_TYPE_ALL, 0, NULL, &num_devs); + if (num_devs == 0) return NULL; + + pb_platform->devices = + (pb_Device **) malloc((num_devs + 1) * sizeof(pb_Device *)); + dev_ids = (cl_device_id *) malloc(sizeof(cl_device_id) * num_devs); + clGetDeviceIDs((cl_platform_id) pb_platform->clPlatform, + CL_DEVICE_TYPE_ALL, num_devs, dev_ids, NULL); + + unsigned int i; + for (i = 0; i < num_devs; i++) { + pb_platform->devices[i] = (pb_Device *) malloc(sizeof(pb_Device)); + + pb_platform->devices[i]->clDevice = dev_ids[i]; + pb_platform->devices[i]->id = i; + + size_t sz; + clGetDeviceInfo(dev_ids[i], CL_DEVICE_NAME, 0, NULL, &sz); + char* name = (char *) malloc(sz + 1); + clGetDeviceInfo(dev_ids[i], CL_DEVICE_NAME, sz, name, NULL); + name[sz] = '\0'; + pb_platform->devices[i]->name = (char *) name; + + cl_bool available; + clGetDeviceInfo(dev_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); + pb_platform->devices[i]->available = (int) available; + + pb_platform->devices[i]->in_use = 0; + } + pb_platform->devices[i] = NULL; + } + return (pb_Device **) pb_platform->devices; +} + +// choose a device by name. +static pb_Device* +pb_SelectDeviceByName(pb_Device **ds, const char* name) { + if (ds == NULL) return NULL; + if (name == NULL) return *ds; + while (*ds) { + if (strstr((*ds)->name, name)) break; + ds++; + } + + return *ds; +} + +// choose a device by name and set the device's 'in_use' flag. +pb_Device* +pb_GetDeviceByName(pb_Platform* pb_platform, const char* name) { + pb_Device** ds = (pb_Device **) pb_GetDevices(pb_platform); + pb_Device *d = pb_SelectDeviceByName(ds, name); + + if (d) d->in_use = 1; + + return d; +} + +void +pb_ReleasePlatforms() { + if (!ptr) return; + pb_Platform** cur_ptr = ptr; + while (*cur_ptr) { + pb_Platform* pfptr = *cur_ptr++; + if (pfptr->devices) { + pb_Device** dvptr = pfptr->devices; + while (*dvptr) { + pb_Device* d = *dvptr++; + free(d->name); + free(d); + } + free(pfptr->devices); + } + if (pfptr->contexts) { + pb_Context** cptr = pfptr->contexts; + while (*cptr) { + free(*cptr++); + } + free(pfptr->contexts); + } + free(pfptr->name); + free(pfptr); + } + free(ptr); + ptr = NULL; +} + +pb_Platform* +pb_GetPlatformByNameAndVersion(const char* name, const char* version) { + pb_Platform** ps = (pb_Platform **) pb_GetPlatforms(); + if (ps == NULL) return NULL; + if (name == NULL) return *ps; + while (*ps) { + if (strstr((*ps)->name, name) && strstr((*ps)->version, version)) break; + ps++; + } + return (pb_Platform*) *ps; +} + +/* Return a pointer to the device at the specified index, or NULL. + * Used by pb_GetDevice. */ +static pb_Device * +select_device_by_index(pb_Device** ds, int id) +{ + int i = 0; + pb_Device** p = ds; + while (*p && (i < id)) { p++; i++; } + return *p; +} + +/* Return a pointer to the device with the specified type, or NULL. + * Used by pb_GetDevice. */ +static pb_Device * +select_device_by_type(pb_Device** ds, + enum pb_DeviceSelectionCriterion criterion) +{ + cl_device_type sought_type; + + /* Determine the OpenCL device type to search for */ + switch(criterion) { + case pb_Device_CPU: + sought_type = CL_DEVICE_TYPE_CPU; + break; + case pb_Device_GPU: + sought_type = CL_DEVICE_TYPE_GPU; + break; + case pb_Device_ACCELERATOR: + sought_type = CL_DEVICE_TYPE_ACCELERATOR; + break; + default: + fprintf(stderr, "pb_GetDevice: Invalid device type"); + exit(-1); + } + + /* Find the device */ + { + pb_Device** p = ds; + cl_device_type type; + while (*p) { + clGetDeviceInfo(((cl_device_id) ((*p)->clDevice)), CL_DEVICE_TYPE, + sizeof(cl_device_type), &type, NULL); + if (type == sought_type) break; + } + + return *p; + } +} + +pb_Device* +pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device) +{ + pb_Device** ds = (pb_Device **) pb_GetDevices(pb_platform); + + // The list of devices must be nonempty + if (ds == NULL || *ds == NULL) { + fprintf(stderr, "Error: No device is found in platform: name = %s, version = %s\n.", pb_platform->name, pb_platform->version); + exit(-1); + } + + pb_Device *selected_device = NULL; + + if (device != NULL) { + /* Use 'device' to select and return a device. + * If unable to select a device, fall + * back on the default selection mechanism. */ + switch(device->criterion) { + case pb_Device_INDEX: + selected_device = select_device_by_index(ds, device->index); + break; + case pb_Device_GPU: + case pb_Device_CPU: + case pb_Device_ACCELERATOR: + selected_device = select_device_by_type(ds, device->criterion); + break; + case pb_Device_NAME: + selected_device = pb_SelectDeviceByName(ds, device->name); + break; + default: + fprintf(stderr, "pb_GetDevice: Invalid argument"); + exit(-1); + } + } + + /* By default or if user-specified selection failed, + * select the first device */ + if (selected_device == NULL) + selected_device = *ds; + + /* Set the in_use flag */ + selected_device->in_use = 1; + + return selected_device; +} + +pb_Device* +pb_GetDeviceByEnvVars(pb_Platform* pb_platform) { + + /* Convert environment variables to a 'pb_DeviceParam' */ + struct pb_DeviceParam *param = NULL; + + char* device_num = getenv("PARBOIL_DEVICE_NUMBER"); + if (device_num && strcmp(device_num, "")) { + int id = atoi(device_num); + param = pb_DeviceParam_index(id); + } + else { + char* device_name = getenv("PARBOIL_DEVICE_NAME"); + if (device_name && strcmp(device_name, "")) { + param = pb_DeviceParam_name(strdup(device_name)); + } + else { + char* device_type = getenv("PARBOIL_DEVICE_TYPE"); + if (device_type && strcmp(device_type, "")) { + if (strcmp(device_type, "CPU") == 0) + param = pb_DeviceParam_cpu(); + else if (strcmp(device_type, "GPU") == 0) + param = pb_DeviceParam_gpu(); + else if (strcmp(device_type, "ACCELERATOR") == 0) + param = pb_DeviceParam_accelerator(); + } + } + } + + /* Get a device */ + pb_Device *d = pb_GetDevice(pb_platform, param); + pb_FreeDeviceParam(param); + + return d; +} + +pb_Platform* +pb_GetPlatformByEnvVars() { + char* name = getenv("PARBOIL_PLATFORM_NAME"); + char* version = getenv("PARBOIL_PLATFORM_VERSION"); + + /* Create a pb_PlatformParam object (or NULL) representing the data from the + * environment variables */ + struct pb_PlatformParam *platform; + + if (name) { + if (version) { + platform = pb_PlatformParam(strdup(name), strdup(version)); + } + else { + platform = pb_PlatformParam(strdup(name), NULL); + } + } + else { + platform = NULL; + } + + /* Convert to a platform */ + pb_Platform *p = pb_GetPlatform(platform); + pb_FreePlatformParam(platform); + + return p; +} + +/* Choose an OpenCL platform based on the given command-line parameters. + * If NULL, use the default OpenCL platform. */ +pb_Platform* +pb_GetPlatform(struct pb_PlatformParam *platform) { + if (platform != NULL) { + /* Try to use command-line parameters to choose platform */ + char *name = platform->name; + char *version = platform->version; + + if (!name) { + fprintf(stderr, "Internal error: NULL pointer"); + exit(-1); + } + + if (version) { + pb_Platform* p = pb_GetPlatformByNameAndVersion(name, version); + if (p) return p; + } + + pb_Platform* p = pb_GetPlatformByName(name); + if (p) return p; + } + + pb_Platform* p = pb_GetPlatformByName(NULL); + if (p == NULL) { + fprintf(stderr, "Error: No OpenCL platform in this system. Exiting."); + exit(-1); + } + return p; +} + +//extern void perf_init(); +//extern void mxpa_scheduler_init(); + +pb_Context* +pb_InitOpenCLContext(struct pb_Parameters* parameters) { +#if 0 + pb_Platform* ps = pb_GetPlatform(parameters->platform); + if (!ps) return NULL; + pb_Device* ds = pb_GetDevice(ps, parameters->device); + if (!ds) return NULL; + + /* HERE INITIALIZE TIMER */ + //perf_init(); + //mxpa_scheduler_init(); + + pb_Context* c = createContext(ps, ds); + pb_PrintPlatformInfo(c); + return c; +#endif + cl_int _err; + cl_platform_id platform_id; + cl_device_id device_id; + cl_context context; + clGetPlatformIDs(1, &platform_id, NULL); + clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL); + context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err); + + pb_Context* c = (pb_Context*)malloc(sizeof(pb_Context)); + c->clContext = context; + c->clDeviceId = device_id; + c->clPlatformId = platform_id; + c->pb_platform = (pb_Platform*)malloc(sizeof(pb_Platform)); + c->pb_device = (pb_Device*)malloc(sizeof(pb_Device)); + c->pb_platform->devices = (pb_Device**)malloc(sizeof(pb_Device*) * 2); + c->pb_platform->devices[0] = c->pb_device; + c->pb_platform->devices[1] = NULL; + c->pb_platform->contexts = (pb_Context**)malloc(sizeof(pb_Context*) * 2); + c->pb_platform->contexts[0] = c; + c->pb_platform->contexts[1] = NULL; + c->pb_platform->in_use = 1; + c->pb_device->in_use = 1; + return c; +} + +void +pb_ReleaseOpenCLContext(pb_Context* c) { + pb_ReleasePlatforms(); +} + +void +pb_PrintPlatformInfo(pb_Context* c) { + /*pb_Platform** ps = pb_GetPlatforms(); + if (!ps) { + fprintf (stderr, "No platform found"); + return; + } + + printf ("********************************************************\n"); + printf ("DETECTED OPENCL PLATFORMS AND DEVICES:\n"); + printf ("--------------------------------------------------------\n"); + + while (*ps) { + printf ("PLATFORM = %s, %s", (*ps)->name, (*ps)->version); + if (c->pb_platform == *ps) printf (" (SELECTED)"); + printf ("\n"); + + pb_Device** ds = (pb_Device **) pb_GetDevices((*ps)); + if (ds == NULL) { + printf (" + (No devices)\n"); + } else { + while (*ds) { + printf (" + %d: %s", (*ds)->id, (*ds)->name); + if (c->pb_device == *ds) printf (" (SELECTED)"); + printf ("\n"); + ds++; + } + } + + ps++; + } + printf ("********************************************************\n");*/ +} + +#ifdef MEASURE_KERNEL_TIME + +#undef clEnqueueNDRangeKernel + +//extern void pin_trace_enable(char*); +//extern void pin_trace_disable(char*); + +cl_int +pb_clEnqueueNDRangeKernel(cl_command_queue q/* command_queue */, + cl_kernel k/* kernel */, + cl_uint d/* work_dim */, + const size_t * o/* global_work_offset */, + const size_t * gws/* global_work_size */, + const size_t * lws/* local_work_size */, + cl_uint n/* num_events_in_wait_list */, + const cl_event * w/* event_wait_list */, + cl_event * e/* event */) { + + char buf[128]; + struct timeval begin, end; + clGetKernelInfo(k, CL_KERNEL_FUNCTION_NAME, 128, buf, NULL); + +#if 0 + int i; + for (i = 0; i < d; i++) { + printf ("%s: %d: %d / %d\n", buf, i, gws[i], (lws == NULL ? 0 : lws[i])); + } +#endif + + clFinish(q); clFlush(q); + //pin_trace_enable(buf); + //gettimeofday(&begin, NULL); + cl_int result = clEnqueueNDRangeKernel(q, k, d, o, gws, lws, n, w, e); + clFinish(q); clFlush(q); + //gettimeofday(&end, NULL); + //pin_trace_disable(buf); + //float t = (float)(end.tv_sec - begin.tv_sec) + (end.tv_usec - begin.tv_usec) / 1000000.0f; + fflush(stdout); + fflush(stderr); + //printf ("PBTIMER: %s: %f\n", buf, t); + return result; +} + +#endif + +void +pb_sig_float(char* c, float* p, int sz) { + int i; + double s = 0.0; + for (i = 0; i < sz; i++) s += p[i] * (float)(i+1); + printf ("[Signature] %s = %lf\n", c, s); +} + +void +pb_sig_double(char* c, double* p, int sz) { + int i; + double s = 0.0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lf\n", c, s); +} + +void +pb_sig_short(char* c, short* p, int sz) { + int i; + long long int s = 0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lld\n", c, s); +} + +void +pb_sig_int(char* c, int* p, int sz) { + int i; + long long int s = 0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lld\n", c, s); +} + +void +pb_sig_uchar(char* c, unsigned char* p, unsigned int sz) { + int i; + unsigned long long int s = 0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lld\n", c, s); +} + +void pb_sig_clmem(char* s, cl_command_queue command_queue, cl_mem memobj, int ty) { + size_t sz; + if (clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &sz, NULL) != CL_SUCCESS) { + printf ("Something wrong.\n"); + assert(0); + } else { + printf ("size = %d\n", sz); + } + char* hp; // = (char*) malloc(sz); + //posix_memalign((void**)&hp, 64, sz); + hp = (char*)malloc(sz); + + clEnqueueReadBuffer (command_queue, + memobj, + CL_TRUE, + 0, + sz, + hp, + 0, + NULL, + NULL); + + if (ty == T_FLOAT) pb_sig_float(s, (float*)hp, sz/sizeof(float)); + if (ty == T_DOUBLE) pb_sig_double(s, (double*)hp, sz/sizeof(double)); + if (ty == T_INT) pb_sig_int(s, (int*)hp, sz/sizeof(int)); + if (ty == T_SHORT) pb_sig_short(s, (short*)hp, sz/sizeof(short)); + if (ty == T_UCHAR) pb_sig_uchar(s, (unsigned char*)hp, sz/sizeof(char)); + + free(hp); +} + diff --git a/benchmarks/opencl/sad/reference.bin b/benchmarks/opencl/sad/reference.bin new file mode 100755 index 0000000000000000000000000000000000000000..94fb04f5014bcdfc10c383c6fddc6e29daa3ef30 GIT binary patch literal 50692 zcmch=hqF~zmgaf>hmMKpvY9>IQ{7Y5*;$oY>1EQ$d+$IXfdmMI7a+V7-W%bKFaiX~ z0ErXc2oT;#0=bk+SDBSnT|E;MGZEd>^ZT9eZJmcvm9o2P;>5f6zI%_`-~RTu*IIk+ zeO{}kSKp{ERR6vDJO14MYxRHg`rml}ue|>E>i?+U|6BF{^8Wwu-`}u(Qhi)~R{etQ z=e+;C`c?H?UQbmoSM#cc)l1d>>U4FsdZ#+iU)j!cl?&CoZ0}vReNg?3{Vr4=anz1# zMKzbZdcIm*EvP0}1FL7NVf_DEb&T_U$etgzY#&v7xzgKgXR9+@-R&$_US0L8hE&ry zYB1+IQJv$+(=GS$7XM}M)BN&Ab+9^G9j-Q3ld93xI`(;wGo9zvJAb?0_Y}t-rd&O% zQPn2)KFbd~<-S{gR{J ztF~81_`IX~&wP8TI$3R@+()Zj)!*}{{KGPS+2N}b4;&h@cKLmjRdKIpuLf6f6D!R47zSX_@kDyzg43uy2nA{ z46k#mXDIV?9Niy;Pf_-h^}cU`M?qM84^!4-yzZ&ifX5`texBdG``rJzy53H*=k98L z)uS2?Zbv!7E4dxv@2zSrR~yggclhNXb-tG~9;jZZdR2X4#lh-p9Q_)#zJootS5x_Y z8?RgVd#!pE2JNlZQ{r8mVOh0-(tQNJpVY6v2H&4in_D>BV%U3ttG&;4tYPo-%i#My z=w9I58)54-t~niaW>*to;LvI;rP{)|KB5*st*`tMuLa+qf%hj|>AoM0@JpzXh@%<6*dnVrw1;fAOTpxnaCJ^4s-5ln~SF0Ck zF9+fKDz0{jyTQ{2fwU7VBgI!WJUE;&MUUw-8uFX-@@FypXVw?BSh5l ze!#ij+(a+t%``Pe;=@Y$Qf)OZ*jGgHBz$l z6nA86oJ)%yL_PIKUsi#~S?;UwSA=bwGEDgV$S8aZ&o|2t?b(cj{G&O{1_yU()NGN{hp!pGD8*ksD-Vc z!l2J6>&La~s<|J6`pY0PkGns>5%1P@E|#*^l8a_h;Gi3Xtpn@gL+*Gxyr0ZAiP~NW z2Zn&gNRZylZ|~J8e!#oDdyBJ)vDzTIZWrqB&w*8NI>A*>QTF{DeFj$bL4hZL@*rBC z>g`#)yWlGWZRN6UFJ;(YiSQ&oeOTY2ED%+--Sdm{ylMb{d+PsA@v1tj5y#<2cX;O6 zq96O}b~Kq5^smsK{qX*5jiwy=r0%usyFRTqu@-H=_q&g&i&tR$QrgTqnEM{bcsJP! zmi~7+zTxmY+{rdt!%P@6gPI?YS`Ovj$Dk5>DF3I_<0q8!qZ&oq!#U2@aJb+rR%hzc zr_DCJya2-%p_zkFoZ+bPO3EEhMkmx?H#vM1{vP1>gJ9R2dyy}xUzuQ^>0YYsANyc=_G$@f&JZK>VF{hHicF?6gC{^Tyj?y zTPrGrM;xa;YYiNvJ+J|UQE%5&e@2_!!1=uY&p^?>%Q9HbXr5@>PS0@6I_g-h-oyRaGZ#&DPcg7Af^AUp zch4;{A5gMgFlj0pFuAr6Mo{NnxT~4e)?rGdWptkXttXH1DDNxhwU$*Z&-6a$wGR@n z=;#SPrR|@o5%#xJlx#TVo`MxMkor5q5vTaG4-;L{5g(P}7#g`7OlE@cbpF2pQr3kn zB>Ntr7S2-(6RHukzIW>5JVVs!9KBQ@O170_PEwi!V5Bbm8|wBwTBMlDR~c)mv{6)C z?b}a4b3N^17A#o7U7V=vPi>c3!8Hm|uq}%0|04B1*W8NQPNc=mz%H1J7WA$d&@~bClOUF#Vex)PfMZGxcv;w@2XG2(X<1!q1{U zhxpyQ7BjI8z9K9a4}!^75FS8}d!(iQY$58bJl8rq4`w4M$LpUTt->7TjPGIvbfbK4 z@Yz0aKm6TLeU&S}OWA^}43Vwp+2<7Bf6D(y!FD0o&VywKxepc0tDb=`g=@0)y}E32 zI;=j$9?tl6?wy?jc{+8{x_1|yn_uo~&tA0bdRP@cXlO9}oILuoLhtE@z z;H#FXrQxR}}Sf~n6^me=`D<<=%U%Vz&Hwdz1E>f^o1 zubaSkB-is!f5m3MeHu)bb0<^i#pczmc^$QSj4PEbiWGce8{tI^b-+|=s z>i2y59Vq{%-s?BKi}r6R>tZxwEU0Kv9Io$8g(x0JY5#h2o3ZC#MgazZ!fDDOd)-fr z)!H}t<~ZmNp(L;K%|6a&snzQDsi9$bH*0Ey^)uw}ht$6@gcG#aT`+b&ZFw?1`XpNL zYc+4}1*{`&$S7WXh@zOW8XAl2DEGJuCD0#R3dX}~Z*>rNwSzOoqBG_YeTm%>o~jFP zQ;K(Z&lb(H<%{YsvC{vx1ygbT$F}hKH`PB>{})G|re4QWzr(nTJzUu~pVlvO>8B1; z+U+2`tyYPlAbgCwILUtR!@_8>HtuPZe<8Islh0arq9HP8Vd@_0@EOXlKeqvVC&G%) z_^cABkZu#WlbP62d#Nc~h>9C@tw;5~(b@E6wsS51oYfkd1GWpOrzD_)K{Ys_Db3juY=GG_MFfCzRz*C$;LO)_B{Z5^uKpfC+ea-(pgaWh}!7`yZ3#IjEwy?EzB*gz2Mjy+tw&IpBsczGyf;oeU%p5 zos#M`TNaT?|D`2uR83258whWpBtzh^eeWCKA0II7SL=5VHP8=~H&Z$-knqladKEZM z<@a^fpThE2(CeRay|+1wJ;5?+M?S{au}6y?YtQDr#Geu)tvGG8^L)3P<0fN`j04+6 z=(qjYi2CaK+#j-+m=?Vb!c7mR);63K+Y+Y}+oGevtHHN?Qgh3uDwOZbcj^2778d_K zDzlT;)QkS*4N7LqbhA|Adx&ojfUx}C0?Vdjdn~729);JZ*x$_d z6sey?YK6*MZQJG4R)*(5sZwNSgMw$k@}l_Ze6<3?6MlTeQE$ z*S7s0pQocPOTc#wpYiNH!L|au8zWeU z;M%j`JGnk4KA^9*{>1Oa)-5(_+C|u_PxTIG*a)_hPy{2lBe1PIajm)3=?Ab@wJp7s zNCv~aU+GnALH2$Ezii7kBE1D`KdJd0h6cAFeI;DWckUs+pP>k1 zAbW#FYplNbbE$=bVC(nvZk{h~;v+1I!}R0h=_N+NQQ4>!Cek8&7=#a_0Qj7Lw@bV->vKVxOVZmsY#yk-;K=26Yi%Ssm?=*-?bx>wo@ljoF4o_ZaH63^kaJ z9!y2w`@!4J@b(4n=p%ZMpVfP(M=VxBEZKtOrSFL_#9q+KE%rhf_r-{Q(J!r$&F6w| z(A5(84GOe_`yY*`HU{KG#*qrSBZD1@&|2FLJ7wmpsL4=Do_;AB9IGQZ8+~8OS>k!y z!^znlwLXY+{D*?4{aX2)cNO^@x}+jZqfTbhJB)#`Pt$*n#-2M-+y6iNybp_=5VZ}u z@y9O3*8hz+WZX&JjV%^Lg3cG=_(k7eif>Sjmj6v%d)DA@xauSrJi3m1ZTTEw#~@!lNQrp+<#Xg(F=R_2u%mvxPWfj7B z=s{Y+e2^W_-!xwPgKZzQe;s%8KIj^ul(q35TH6X=jBhkSosPv2Ya#p+m46O=Q&Z)& zHNO8qzdw|gKZ+JO3JyDdcNm1kH|W~W9e~-6BJH71974JG)G8rE<(wEh24c%zhP`iG z#eUMh?;OYeDf(&LCJIro&3o}7j@3Fp6Sba<=QD$ztZO}bHy`D4-v(dp6Z^t~tTvc&VC}Ev@O2C|Hv@DhQsaHO zpQT*Q5ieVd^$;ZEmE|?P+m-YnR$0b3G9nRc*9endaPijuM;UyvKBT4!s(CNh_$_6g z3|EG+b;1hR2$r#I)!#!X?{3Pew{-;cx1z@LP>HFa?MP*b7|3HIKSo5SqM2JkB$mQ{ z_&b4LO1~$%rCv^Rc4Ix`!FD1DPv9P&;q3YiCu^?S>jhgG8YIKpg7B5lbr1bMk99;! zo43K&lK;!%n=9lh=E$6cHQni-hcK4XpZ0CMQ41p35&LMn_Umt=rCZUHt;BUi*wLcK zM~GUS1K~YjHGx{u);a@%e}evcM#tqJQjukC ze1eSGh^FWjbc@y78s+#UDv2@C^W4cPJ|!cl;cwH&6)t{JUv{PM%Q1Psl)t~OTZz~D zCFts1o(8kt;5(`=V{#OdNn%8JKX>>xh#Py~j76}kMtC~kyxYQHtpPnQt%|e!J`OhS z2PY#Ge+0fN`av+(dppZ(>Tfy=kL7j@`V= z{X6C~g?c`~88fbG|MoA7Z==28@Gr5o&E>GKIYqgwM$X`fgqamTr?mh7;%kngIbQk# zj`|w&8jO`RfNe3@I413H>ZUfAHkuqZ?z0Vq2ZG!Q*bhd5&XIz|6&Lh z#bRP7SzJpytc+S``Yd@#-ASSEoyxgx?{G^ zT-dvq*z_S#*JG1MzxWs7Z#?4STgc!>C4#S+f_h^5zAI{Mhw(ZLxFbrD@ zdoRW}2+P~jzm@pTIqq~bh)$;d%!FFX-*mPeoZS&h?cQiZQD5~p$|*i2vhYQEd+GPB z^|yS^*@AOH*sK1W8a8^ZeQNKt5^fEEU43EMO8O;vZPZ14?WIImn@yG24MQE5ahx~f zpjt)x#YcHPMkz;NpY^IOj|sGlUg&}OF5B3{)|?1fgQlMf(uKo$FPa;#SP$+!?%c7` zH;CTPfwSi7I2Jdr{%;5+SjN4WZ0#RFA95bEa)!g) z=cutglv&ysKdp!*TwiIzI2QlUIen^`I`l*bKtJ=lp$QPCsyJf>Q8kj_+HB4 z;80i`lp55&{HK2$-(PSC_EFjj;qWO?>5U~k8XosUC5)vXsg+n;BI6rI-*>{DL-b@5 zD2u*zY}<@Mn1N>Q#eVFw?)*L$1YYJ$w&;S@#SAU~HTY(~%lT`@<9>RsMd*PZpUBSU zE=Gc^InRA)dn2*Tj&r4tD4n`nw%AT#@INoUqMWu+IIIn!1);rp0c?{`W{&n*P<^Jp z%bvV?r^d)s))M*Ik3#IM(cTKj&86E$9To3P-@-`7A(;LrL~fUY|4B;v8E4i8F8!0( zeG%G;-^lL|IYQ4x9+*3^9%QGZGIM!16Uy--^{g*^=tJ*06@PmdcWWEbHnV5TRxF5E z5&x3-suICh1^$F`%2MMvW>A~o^AdX5xAv+WBQ|Tv8qhXKMz$IuM|Z%>#qeb!Wm0GC z3lsG@!SAnEe?aMD9cy2d=vP{Q!PfV}-@;eNjvcesqE)LeP+uERbYsHiN-yV6WX%QW zMW3nVGl}oEaDQsEwpx0(f^RH{E8DE(K$W-suo{?PtkfhU0j#eTlxsM)!9LJ1ZYh5gN#0%kJG3Dqz)fs1XsQIw zcyIcwq6zVG&++?vobe=WC37dtdYF%z8xJz0w=dZC1mRJ9>uiL#!8A6sN+f$*^S4-N zrH^aP;MO|a;P&M{Il5d%lpS|!jjsyv3+_a}H8Tbrzc4%WEO$H{J#$382W{MbEn`4$ zpvNL?-1ki)1}o^n7S+DVL9jH|E5iQmBY$!<+_y(7+%4+R8sGe;auoi)4+lR5ucOrS zY&aXcYY85milvX=OOJ@=ac95&-qj;V~F|Ukow7Bsyik}>$n{j&4x)Qbx!y7#P zByZh6$A!&N2sQXul*N*$!1n#-LmuOf-=VdQfF1f;y(s057JWEe?~x2$x%D#mz5;{# zfaMmBD$#Z0lV5>b!8Zs;S4GhnL*lujqI!sojFoxjW$mNgWG>YbG-wfbFs^3pv*7zI zcQpeo-p$#KE|@8uaenJ4x>7Rbi$Ywfm$R=c=*DIXR+l2||K_5Vd^9l?H!~L#i;AyP z?Co;a;QLET=-6hmA^qfLymmAA7?16R9WV`qPr+FEtM>Z)DgN7mk1NItVEYJAG4syR zytDA#{M3SPLAdCH_!j1B*%?9jfY$T@=igQ{HTJ=Lyt1jBX9(LMI6IggbrITbj$X#+ zQ!hE2_$KeE%%amnwv`rqqrt6JqM#vz!@|F34%vSuTSOdXVu^*>+or8&6xdO;Y|c}W zmzF)zSz}(w%r`eb8R-RK-;MjeLpg@QUvpf0!kB{`yO&=40B6&0))F%-@MS#W4X~^i zR_+0qs9wv!%keF_VGV!9JqS65Y{uy#`mh;n3pnE}&>RVG9R)BhG@ol7hO?h?jFxa= zezYMPlibqQmT$0)eO8c_yU|KfH7imJ>+itzAHdb9kkKZuk<1LET|w13L3#qlx?X3~ zn>A|SwuA5Varf2;inaMgMgSa(NSmn2}xx@N|Ul!j)hRjo21#jo$7cR&DG9S=r#b7q0r?cxUi?_K~XI>>|O;j7NF7=e$ zU3<3lF2z18{hPQJ8|`;+)a*RBzpZ(ialSJ(mfDyH>z+p>3v#>Rz#j113l>IV^kuUJ z=WTqqm2C(38%x**4$)g92xs~1CfDuh%fx@-jB-YTDL| z^!3hdvPW3RwwPXGIrhyO_HhJa6US@@i)?25ZsOi$_C8QiH_bGSPWWn8)CV9iiYWL{ zdb@70Y72Wh#%6rY`Ca>3?#UTHhiTP4*vIJbuFLS%k`QJ4G4<&r$8N$)Feb0Zu$a1W zHp4)+!L@}jiu>PNUn}--iCH$bUBT9K+P9S)C>fkBpMt3Q{f;Jo#NN@A#y8A%8OOaD-LQx44;p=7oEck21czgbPGax*)Wmw~CEEtxH`Q%> zC*|15RrGW-Zzev6c_VM}%LLA3wntaav77G_uQ1kNHq~J;HvVJXJ9~3FSRchcYx=k3 zL1olO_8p*JS5gY^X%_6YEM}BAd%~Qn3E=I#STkhZf?nf67hKDGgRb)s%$OEYKQ;4~ zRCwblDoLWL^23Ps%b;r3g>k;|AT<^n`#D<9C{%wq+c4PWHV9<<&=+-QWcW$E(`--j zxijB%uRR^-FD<2>#PzkB|3Nv-cIKHrm|CpQrroCZwvDTFr0hLl@@&d|vbH2Le@=YW zDq|8yKzTNGF^Bhm!`&sKV-F?+E#YBWjW)BHrVGG!CdiukpxVlbG4N+Qd!2`W9ubQp zufe)_)&<-2kN%cihag<$E15YFmg@P~<8P>ppUtV)zW)y5ft z=10Zn(2_7C%=R;cS{eWby*PR&-<$goRQ2jaIoVaaslh%l@~hmVna^6JKLgnlaOFi* zz^KezuD_5LH-c?o-8M$TpIQ8F+}-G{V2K?`$>qFpldErcSj>%uN^z(i3SvV?fs0cBV{! z@a$Tv;nVf&Q()V<-qZrOj&QXN+aqlEfO0mGz8~zLq?JF#Uq8-13yfa|;lkmKa9X@& zwY>HIocXYyd)&i)b>I#LbL@ng!z%6pkQLwkl;H%7-VWC{a^|maw|l@-pb?>G;79D~ zk?7Y*IIHHWPNnB^oTV#hs<2Oz8qJFgS*y;ilQZaloA~UQ!T>f z#MgU*topVHeCL9!^L33V4uB!X-CU8x`75_!0|rz(nYDYDI8 z4Zme<&`pN2D(iXGS!d!u50)ZktSVZo!Uo~k1ZLl=2qM+1)fSGHR}mN6zI81`k@K6s zbpL$oO#5zAYx8Z@4e0ZOU}g5hbKJ>l`nJMg-_>L{E#~bY>3T88Xv2rtyL-)qo*=gm z-pcYsh#Ut|m5mMmA$TqYUE_&zBUu&;sSQV-RfFOuUyR(9|GU)Y4f^TVGQXf?Y98fo zSHl=*{f*&12ZEoh6;+ulhqYtvzxvkq6>N>h_5@{d_BD71OA!o`UC|3OzB{16kJZZ1 zj&HiaZrg9*2nXX1wN~80uTRs~$D;o)(xSvScC`%E~W?YxWXbqc4d!r{%1r9(%$~SAdv7 zk2aDvq>ZAb;+UdZ@pKEr+JaSEP?LA9@r|lBD%=IWcBq*eWX1JXTL1O+b|atf0_}&{ zt1D*`;jUc4+Up39ALjiTG=2s>()!vSmaQdf5sbs?9iV9BQjI$gN2gH|dt&n*xAI-A zp;(*8urB_9D=wr}xYm`C0mmF2p){xZe`ZThXf(adw$aCQH6-USJ^_NZ#xbaXbCQSF zYG8cIeyAIGwgcCPtB2s>BYf`kIlfv0GB^0@N!iEA+b7to1HD)5v4>&iU7&g++fBU7 z(fc@C8x*4}H5HZ6zU$5zdvR<}?qU+X!CH8x+P)4tWiw|dXlse759Ui}zQsKB%e*RA zm(k;JjLvqdJ->_mhrztdGW9^OI8k(GUb~g(*YAgQ(AA>NOa<%p6!*1;tJ%vN5lW;W zvG|PA#h>i}o)3eDsN9W0wgr)=YVJmPV-eU_q%O1^RMuDyYK1i<>S}O15PyVvx)1i= z&-=r0yc53`9absq6Fc)h2o5f#ovRS9gUxO>BbjbS-b7n}!;y|4=xm~c*acnSjCR5E zcqF#bpnr_xzKTj4lXhG>W5+=?k+k0*-NIYz$*89BKI_z||0-Hhd&=Dl6>!}lZCU^H zHammGW3cOCFuk$*VfBORYToY#k@ox|28FkdL>sT^3%;V+12+4={&9ac`$PB6Zz{9A zZeJta7PLF?f0r8H=!B}H8g=EaCeSaugg$HqP0@8TU&mKRH;t-mfn~OpD4w8Fi54}*p`43e9mjN8xT}f@(9qPT$p(;tMDVphhN3chNAvG})YYppW~kK!y#s1e6eT;=F1;A{Nr zQiSsV?~AYKW=tsAgvpLO#dTIu!vkyXYG3sMjlrPRow6B;X@lQ-T?@KDfVn?L8*WD{ zI#TYgHQR%%sH!QdOVAwwx?0MPSSK66sDhSStY7)s*uAQ)IIF)ZOV66y?a}6YVfGMg zs8_IA3%TZ^8F+?tWSRrY4FW zWVB6>&G@|e0gke4CCWIHy44%7o%f*~4dC+==wc!;H=+pNfxGJNwbf5x?_DTHN7&ts zqukrE{%l5lTtU%Ii`h88{ayE(i}sF2Bp*b(AL1NZ4C-z0jO7qN#BbCMt>D|xde=Ez zjsEIQ7(LrTTOWxZt6!YC-v?`a#ubBgfr9*p;php%-t|Eiwo&A2IPS>fSu}nlBcHRuaT;;X=RtTDiWYoV@yYB| zXMc`>x$XEHL>cu1ZmxNIRjmy_1Yg;E512e&w`z64Ha7xfjkUXFRLDH@ew?E_d)^7U zdT#fD(p?~|9dZ?l^DXq|X1+11uH`F-ty}v+Ee#b}#cqqWaueru{mx^xPqZIK7*}%^ zN++!K?dY8RbyPzwb}QKuel|DYC_UX>hFO__SARLAk^$_pIS9YAz z%t=S4j4&>ymd%9~+xhU<82vmvv?=I>E3Ga7Usp0PU&3|!9;HlogTmEyzozb94S&V= z`WDZk9n5hRl)>EXC{P`PojH!99mQ~5V>rJ#o-6yr^;VAk0cViAKdQ0)CYIhcD8-#% zuM!)7P;U}55@T(}UhujTzdpiwMM(d^6+{kGf?ix>JbxXz-bU;&^LE?{!ofNCYP+4L z6`w|1hSm5EX@zfQ{H5P7C^z_ut=x82MeMor)Y%TOTLx37gR8T-9Q$w_)Ump(=w>T) z-KfJT+VkM=Na;*C(G!lzn5)r+Z?cN^_iAkjd$k?zfpL%4Xbh^+b^fTeH;K)$4C8Ji zK)z31Q;(pqkAb1_oI5%C=K9RvV0B~9d>uOc5Vfgq5j|J^{p($~5cNf!eS-5ETX6i( z{C+cfR&xbcP=1!{4CO!P37Xv}%Uf@a?dDt=SF?4Lb=*IRuRPYTlegwanH#I^ycc8_ z!p4d4(^X;IM0GfAwhwomSe9{tF5G!9uIK#674&#!{*0pq>G40q>*HW@HEsD?m}k#+ zL)|aw%?4kide3oYQFVQ*DQvE3JAoQ>jcbvLw_ya+QH{qy^L{YBnm+TJ@K{UQC`NpX zD0(cC-tf|z>CC-oDKDUO#-dzNKy|h3FLC}pl&e2y9zZ`WwqJ~|eO@+KEa>}>;489G zjjR}F{?Ji$e5bCs-%V@3pBAe=xYoDmx^9@)bF9U+Cqy}$Sq~YXPh>M=M^D1%+vqE= zM}^u@SC7F@V?fk%VJ3UojQPo_b=+Y$t}v7ucm^Fbf~NNn zOCl@|e`8IEudCK>qJ^b@lb5kvRf)_<3~P;zpXZ9kG96cTwce4i#}&BD=$S^z9bMMX z)pi$8xA>;9Iqj#r!{dIa#}dlbk#cIg8a2>cu%un;l3G^Bc~#SU>HGAlo~yA<3o)0( zytg2nHY1bGhj672=MifO_W}=NI}dWE`(X2zw~->SZ`3>D7X8GkRatH+UERYL~jcr~aFsaOMwWes}SeRicA6vTCX& zF=JBB%AiR#M(Km&%LiLugRJVC$c&cIlYBb}6#9VafVwR^zrogRCic^C%h9oiK-$Q- zG1&oN8~q8gS|xMq@olrX=dwM|{e;cq*xQo|4TiQ|mn^kL84{I1PVY>D)Dja{sUI12Yw+Nq(q>dtFGZ~9?xW2wR2U}I@LAfiWwCs2p4=sS4$}4Vqvvw z*XJ8!JdP6DqsP{I3XIKMcnqY5ab3~X-picCrKr;y6!B$r#`X8t@!9`Z)(qC(9!Tl! zsUCy9S}2e5u6JSFp<8_~{kW#s4x`9+DcYT8L+d77E zg(py?o$TG(K9j|bMV#8%&HTarxVc%Yu5hoyI$KRkOUp{FQ!`z$*w$_oN3B5LeO`OK#Qz5PXr^Hun80c2cr! zGS<6|Z9*#AXsCazd zsDunQ)6De;-{CIwiiU8Pu0ggDL|a?K1>Y!yR^FcKe}#)?41NN}Z_+;2qLBIp;_8U~ z65gkA?eqzxA`1my=WCgNG? zbZu!z=98zV7p#9ak*m)}pI)xF_1xe3I=bbEx$&25Tlj7x*czqX#rGTQa_Ga12F=(B ziCl=Ob(r^{Yu?H*5Wa)fsO43dS5!C(tOlF2VP?YxYH}2NJ0|3c7)z;@2I1&KYkdCz zCOK=>k=)rWY;kt*7>>0C_u=}kr)o}+@hsK0#8AzY@&7P6HbZNSi;o}FMLrrak@n#X zYR8DuR?H8`=)C0~3p#o$Gr6A?aCc43-F2|#6*#&P6wQ}0E8UsX=FOPlEW&1b?8aAI z&#`(qsj=j%cz)v+=2pbh)(@0Hy;13r9IKjMim>XCbzmH&^=S<6bDfEl*-@BoAheLX zD02^5D@6GFuQ|K^*9&Y**rrg&juYsYj^h{C$~D^Y8ti@=J4UO&O>OOHt(UAMBdk$6 zIbHaiC}8=oAm6J#uYJ*EYSf%<^Dpez#UbeGNtj7Euhys+QLXi$8~$c=A?TX5o^hM~ zV0@4}bza(bYGDawQi-*%Mb*DxYg`~XgL;HoMP?2;)7X)pY>h$`e4TA2n;j84%bpAC zwrmc8&&0@FLv0A_sKhVThp(b1D=3Alb2_KFCoT6muzU%YINIcsAWm@4j@Q4+*^EsY z+cIa)kw-^djDnZUuQ0ml^NLjwtYvT~t}u^H9WqCA7N7cHlg0WON15HkSA8(2dnI>c z+_3a*^0(k?4BWV3HZz?Mz-yz6u7pq`K{7U(Y%XM){(Mi(!c|ByD0f$4m-KAkwEP*BG!h z`gc~fh&giZ-MQ>Q&jdHEv{(K^oxxLR&R}Zo8NJ^|8PlkDV z!c(b%+0?|c+6vQlNLJ$(N>lL7_{~1zUM^R zu9(I)3%ao(qQi1HdC3_Y)}Ao3wFKf*%O;sma7lD$S-M6`*59zSKz=w zEMYx1{R=Y}%%d>c;9Q)&{Cb=tCV|FHAnYiPc`=TyBzn@mW^&Su#O3%p52(y#aYl>dyv|qIzIlh3GwZX zm19<~b5k;VK9RDFt!G`717K^G>n>h*QsV9)at(I*)kI_x!G3^m9PNGlbBrCcP4B0b z96MQzpGHl3Q>vG1y$?FSiL@;^$3 zf^&FYLrGq&^})<=H9ZQf^$-O%Mn3``&*TnPgRSiwvRK06Amxs-W&hT+-JAEn|Hneey^>ua#n*+ zk;`0{GIpo6t-jl9E`i~%P#(wOf67rW@So4Ma7KLDz8GH-7Fly=)X`VL)o~_AJ4R8i z{jgFcb`0{5(RuYEt0+fzjYT-#BXVhgV#yos`oMSHry*J~Mr`6hoqV9dZ? z?@pNf0N)!EY+r9;opJK?dR>SKwCB3cT(O+1xu+GJ&wTDVlwHp}3XyoPm^ky!nQX=% zw9}mN=4-ZB_)aZf#?cEo$1M6T^Y4wIJI`EhIkH;jJ&EvA>Q|3j#_RRH46^o4GB{gf zM})Icmr3%+tXs-ewh_I*~<5UaJI+X*}z-E-|N{gu7& zcLICn$yFsMB=KM8)@rT0slL0wcT3%;xHifb%INGD*G+6kjP=GEKyqpie=vyT{d6mK8{@oYhA%$IW=|!F3twuWgu=`S&h(JPh?U*dkeGgd4YB4SKuj-zI*JuUe9xP20?s{#;>Li(Pzu zwp78_{@Q0Y*+c7xcLBXSuqeL8y?>ojd=I6E@0PA+NW%o@qSbgpR!j57(J6?Oj zaUd}+IA4ly^fw5bCF;sfQ^D6c*}bW|6`$kl2$B4i!Hzk(7P$UKvuUUKObGK}MR*tW zi+y|tznH5Od>`l2&ES4L7Cqkman zE2$Q#v0U>fG8UvY6?|uK_x60+6*JkxeG=`@+7sqaiPH%7cb2~vVCE|8k6SN}ekN1% z0kAYOY$VLl3}Xp{`0Kzvb1A!HVtQ8gQ^B>Mt6qe^1>eRxb{!PgyX^rc#@C8II1?q= zd>I>Z#am}3W>Y6r3ZKnmUY6T-?1*(BVWcfNyiaq-526Xz!`*M!J)SJSmj3U?+H$+M z_CT~IY^%4x?`!yG9O&+&ce9ssRfab}+@8vjpCCN5)&|wrvFALqpo}ETTxX&MS#wc_ zaK`BzuX-48fWJ21k#h z7p;Bf;(_?C5+9=N7}M2mDfniNPv*3Uvnz%=hE%q)UbZWLC0E{g9~;rj8I;fbMRQ_$ zbIUK+>U-YCalfo2#{)c16v2m?TExC&V0D;t2cm-{1sm}xhq%Q z#K>_#b0OS(HmXr`>)25cv%h@$x5#Ft3emM6T*5hL!-ip$ybV~|)0(&K7_}otedrG! zM!mGMdY~l((JFP=C|CNrSa0H+wI70K>sNg|W5g%XxB*0Ty)6MZCb=UmVY>p?hv zw16|@c>&tyZK+x3+&LC&1in|@u8awqZPF8jeF~Jb*h^n$zo#a99Iym1Ga#zoSYVy|_gypcdl=wO#U-G1!?`;cmJY*cd z`UJ{FZ02Z*@xuB)`%u4gB(Wd&p{3XvOwECC2GC$$wc)J!_T0Z#>=S%5g8DX_M61|W z=OEkrX{B9kTlio6Xyfq48k~Pz^r4`eS;AQ_^iqT~KU`}q`0DxF=f6lB*3ulrJ!J-v zv-ixLeh53^hML9K!qodHmpz>LI=B69`f{@t&66}U*$B8cdM7>`nN)4%Z?=qdXfH&8 z#n%yttOH@>(flmgD$AdzCLGDnCgO|vWeI2X`L$-8nOiKr_Rh{Ff4aV}AZ#{@sQ2RD zdZFFv?aV1TR9jBYF;1*KD3{mxA*Hu_n|6LNzRqJcm%XfyVoMO^)Lig=1ANoVsS!cg zagVW}$ze4)tJ)TPo0Zh_G`FJYLAb$Jd*Z<7 z_&O(SE0*hO?%Mh7W{-%l8K0R~ky&iXBX@-h@qHAlL3Oy1xX^d8Fn$8=TIIG>`^bTu zEBJ~j`|KmVHC5PAP-B;gL>Q6F8USXq+Vds1LWVjbpGdH3?OaCdax_TmZFRf|UlDd( z<`Ix}C7d4oIsg`_#AX1i6V7=N-x3Wf)^RjKf6n#g3V$2@4Zi6Ut(n*n)|s}N_`Xrw z!D?{q2oZjTa>-zGM2sRi!_~34WKcQ-Es=-E!S`Xb;SN~)1CafC_2<=p#M0g$XrSz?#}9IgF4)fP5eD9x37clUx0Ia zN|8+U=j!&JSWZ?{DjBK9+s%Bj&oax{)yC(b48|E9UtUSQj0UYvV0tI2e=G0i5gYYL z>o+saR%JWV4imW%)P*wy>~)(>JHk164{^8p9md6s!aau-DbKW3o4FXRF)k>p5PL7e zSKHUVEwdA&(OE<3Qhc@3O5|OP9hcca|2&x%XjVr!+#3!xbJ6Vaf^S>y`ZhRgX0`bK z@4P<>=e6=(;m00H8z7N`jAg`|FiXSnI4vzBX6AD*0as(fj=9YOEqkOpIODY-+Zq0~ zqkenV?OordtXSZRN6uU{Q|cMMO}~3NzH+|}_v?HHXVJFjE?0xC>*+X;Q2mSzpdH|+ ze3SJN8+_v*8+Xt?FqZFDRupiRB=sSyP{n7?3WTEUn4-4CS`aa(Lw`zy)nT(9GFLtG z_?(aRAo_4?3%-Aeb^PaOu=#biRoC?py~J%2Z7>R~r=~q-2Im4W)q@seqYQ}$ujCt_ zP^qUp9>g54YR?@E=A1rn!XB&>Eh(dcGTDq(X9POmJbl~6_=dqo{+#h;eds?ufvP-@ zPPlTPo<}m!iY;4Ez7oDh9$X2>*>bMMm`!Ug3Sq0rbC06LiAspD$1IfLe4nTPDDfQ0$;s8 z*FI7?;#(FwM@FZY6JIl3i%;#GnoiW#a>`z`A^lTcW$2~&8lOx)NutKdSSg-J=76dk z!8Zs;CuDKf(li_FMS7jA^pt*1gwUIe~os3jqNAwVNRcGt|bzq z&oHk3)6z;NgI=1MX>&j?;}&YQ{(~bJdUmrodMU_fG(+3j3>C*oj0>CZm(6JP9pF0% z20Js+xO8D}!C6~UUn9uIcGvId#!-FQyAv$PJlNoBt9J9htP#ll(cybnulA(&+Dkal}!1xJOoBHGoqnd4yBVn2MFIPqU%Kl~@q z?gECB!NQd}jV0=#>HUgra{nA>&pfSJHOn*J77s0ot?jmyI$jL7eEOO>9fZ5-=BcdU2C2?N?5kw94o`m`966E`kt<_U`B~ifNAx3hki#aSM^hjT?OqF zh!p!Q_@-C$m!c2it2L+ZXO^*bQ|hU_XN_1@!cmR2l*_S%a3z+6mVCTsdlF-lH?v(2 zs^5mce-6HX4Sz<0#$0f4^kPO`OX01#=CNA!5kxy0kx@6Th{PJ^Q@6&;^x}-s>uKoU zP2+ri@0$DO>m;{u38Cu`?JE@!sxtGl!yzZF_ zB3HsUtK^8L7F<~cKl}~8&N#RdzA{-3yWXyAaV|mq<2`AsWhJ$6#<>yBvS>@|ybXk7 zqx}aE{tHSs9EK%|Hn+w%*^il1tDeU<@c+^5sWK$m6FWHQ8dVR*sGeew^oygspN_+d-XZjcPzo`BL9>E~ECcYW5obb8omdy6V5426}tIWQf12-LGHd;|eEXumS zvEMR+G?pWr_N5I#3i1>$eO%U!-a$OTavQU>W$WQ5 z?xQU%zK(&3?=6(-2jKhXt;KHU8i) zz9){+Si^owB(oze(>X=GxCeFOFl{)owO!0F6yJAiJjFC|wOA5OjHg72f^XOxKhSUF zuj(8AYDa9OtQlv}Zgp%UK67H1D)2+}ZnuH0_UXOVzk|VlOo%kNO2wzvWG($f&R`O}==`zdHsM)fci_{vqaAQ+3Y ze$YMq>4lgJa|he)^(NNIn32EHV(aBe%3~?F;plLQTRH;QnG%iygK1#>N5s&c=JN`+ zRW&|~I9`N}IoOA(=knNf3B<;=UCTIzK8LgQme$A1U?agItgYzG>@qfD{+n6-j>sEz zEa(QOf^X~z*&B34*m?QR`M-@%u1*nU4ug%eJqUkopHFf21@uSe6Yp=qH+oyf^b;Yr z2BNto7psXBWwcKVBA&M~%Zz;{lgja-lmEi1p##++T*D^4_pmjkp(ln@xW&oA{dPY|q!!kBGaqRb6kw zS$v(bTRQM+ZjCdE2g7R-YQ4$F#C;#AmG>U7EL+hEGZ)QAie(|fi391GwCC&tsZXD} zz5)p{U-iJ2U+jYt(`?P(SPkm9d--}XzCNEKa~6$UWt7+aFjvfR^$?ZHam*>y z_h`7*ja;h7DYMT`$Xp>a(H!s17)Hj@)Cs+X^q9^`EN9lfRU?vJsikJr(p)R^1xr3~ zX62d>Wwy0z)i>B`=lWk-yo&0YY}rIu%Q&04`oUM5_~N5nPrnPbsc-0v@vm|n@hv0E z_E{=JuoT5aG+VPb49+iM@1^)^XB!p%DWx>CHrWdHPbH^Z%f1(_bP}4;9e&Q_P9Nj< zc^t7C4*S%EwQ$k6f(Sc0?^vbdZrWP*md0i>*Hdjb2R^IQ_>4585I*H%J!f&2x-0e@ zc~YZXlg>ENVA^xRxA7u^ajY}#H7&OIbn)<9-zT$rTtT6UtcAr!ip^5$0mf$R|0#DE zJFM6XuH}9NX1gM16gVnew7B$gBAZQgKa>H!OoGQVmriwD5Lf=I4rh*^HWqJ<5K!^;;T2eg_4Si?b`m|e$IJ3uAP#nIS=Db zp5)iLU^bL;FM@66?XTliOD*~Sj<%U+s@m8ynH7@tIjpU$J?WZ@dd~XG&OBWPhgY?1 z+HBes=9wg8Y8HDJe@v9aV12a0elrXgK^5tQ4Iv*vOh*nH(MnqC?RtpSIE$+rbqaoK0x|FUHQS&{HtuLGJ30xsq|> zomyR4n^wD1-`mV)J!kD?ZHjRy!NU4%qFcOY&%c`Q2ZLPf-e8#3IGT~p2H#{ulo*8! z*84D5+m)M*w1vT8e$JZLi}4lJ!r8*-pdXteDv`dgd5w>(7AI1f*`D#2ol~hsh;TCa zOU{^)o2+xu8|5|6;5pi^(XV)6_WhZ;;%uvAN2}x`sfjW!?YvucysSrUUbDVK)=%{8 z6FJs(EX?jP`lB|e63aopFZ`94i7DihtV>(ZxJ6;F*p}Et;%q8~O6>fHqQQ-_lyh8; zZ!j+0Eoe6=H~n0=9Gs)Z(FjMDqrs}Mxv1km1;IfFg9A|Od_8lZhmWK3z?%b40}6~nv7eBt&B8&V+6uYoPBrArHlESXmKL) zt|s7EXg2Gl%=~-w3iuXWn|HaKzoNtquFa<~*gj4bF8!Q5RwFjU9W^xbGR}1S#e4G!l=G$ z0}SHm%;>fqdHsxBBpT^!;_77`{9rD({nTLF?C&QP!|1&8&l-fIEZQM$F0mgjWy#vR7jirsFP1~2*@_nX43!^AE@p@}Lnj*fLLv0_XvIgmk@%6uk-}Z+^ zr`3~edM2(7`$hN$*~X_UNEd%Yn<0NexHZ1XL3Y+$Gy8K7zMN~U>OY0Uj-NSqdmgxT zrX|@Do$nb|YE5Vd8%d5rsq~9#gZLI-uPC=MJO{@Z)t!Ty@gk#g_gA2rt2l z%Z$!qqxl5utaGobcBF-Co5~gYAdhy-9RAo=!9QxPmOP5xDw;l}%uUWFvRm-g=F$(= zHt?^UwXim-pe|@Vr!OnHR6#d=;{B*k;yY!GEeICd>{5g;##gnCZ(Mq|XoLNmW7LTV zXWpFn?rE{pj5WG$hpc*i#}RwValdG>G=0FIR7#%cDyI+Za8M@dMvIJGypDY zN9Acli4BNGJlNnOHW|xFR=2A*w*^sG$-bU3V%HFNOXOVs>i1?m$S6+k=VBg8?rs*KF?UCF z^{Qr}rdeN3Z_AZtWtOGWYQK-WHD`P&aWtR7sSV+Xr9Pa|q4IQ*CpjulXGu?0;^B$a zrtfklLgtJ9ginv6++DcR)iuJ-l2#ew?K%dO6~^9yo($>U4$mB)q2#(R@_5=FK}xDt-GU{+~y#8zu8+!a@A!+Q8~{0+YH zTJ;T{vb99uON>1VQ7nkU-`K+HgE4obx><>2HFlS)a~WGRqU#v^W0b0_0qhFa9r$G_ z_!_G$G2HOIKl;%Zg)sldrxj(pfp>GSR9H7>=|2dPf5rIpHS}h;PF2}=NZHT*Sl@ZSpRWI*Fk;`np5^oE-(f`zgy7U$FO!bWwP`tCIw`)EnlIDI5 zzWL;9VX+98I7D*YMA_^=JEIKr0xOg(Qsdf{jFLJcl<+2zXXv06g0&(u`6pj@KrMU7j( z%im&)71pW_1z)oV8y_Qz>|6rZdNQsizALfGoZaTLz|>=-?`AByK0ro#WpSQ?VO#9U zzFHB>sTCt^BJ7yGvk_gH-Z?Wqi|l4Fmc>z2XD&W~oon{=Rq*t$d6mCDOQ0PJ+@0Dn z3t~Fwxet7OCQs_cnbH}du$70yVk(29?_!*uv#}RqGsKorC$!BU|0nUixZjI))*9b% zwcuMkx8!K%TeEvyfnaNm@n-CS^j7o1cRdWA16F2+O{3p(MV~Ib_6D_wKrBzidl2N+ zYa^9r_Bhwh7^lx^&sy)!+rNj_tuhpw%Q5se=%Ufj*7)8DzK?Q#=jger-E8V@V*S~? zWLI8EzC-#oaZMyA*p_}%e!CT0xZqskl$lZU#P5Y~QIFWXu^-fiqQl{>wnLD$9(*FO ztaarKS3+I`qANhk?EYko%>}DbHNJYSj;Xj7rSq#?Pr~Qv-U1#|u+g+3%<1#VCB}L( zSILYDJqA|;cWmtT7G-cX1o8D*8{e-pV6Q?ud@{i!=z~vedx-q`@9|wvIP9$19-#F@ zn0#Z4m1e(Y&lYtksQaXi)>fNJP|(eIocH3BWFF)?iBGDjZpGTVSdG<$f^GD{{c=PZ zc?-7YL*&@x{Oy6aK0|mT{gd;8=h4FT%6w+5YiGE^_z>`Qoj%*?fZ9JRtI~;@u^ZQ) z9#?C2*sGWAnlI)_IUb$d;yicLF|ycG(LbNa&=wT`f=o@3(=PFOlSb+uuF?D}a**%f zO3tm(M!W-y;wpF?OW3>;BfRQViSuX;#{vt3^)zDhCR(o#*#{OZraXVleP;GbQPF~P z(c;Uo4!W_)f^RGZN9&z=pl0l=ZG}z55Z0mxuE5~TN=F{;*H(b1s{qdePgl!#{AdPP zyK;@|;<}DkV!0Wkn+|)Oq3KAw84y}@S{u%uNsLU3%XK@77bmB(D&`Y>`VRT8-$7q* z0V%UUT$7*^Ekn!974CiV(e1F!rxiFJco$0T%7j`JjsfXY#!Jx`*VA_VN4(XJyV=91 zO+3x91K~qA%CdtMtA9sK;EVAsOl*y_?2S6e*C>IX^oI2#?8&rqWv%O@ySA94w$Au* zbjG#goWJ%WE!~K$IZS4f8dq>N*?#cRv34UkW__xWy};6$S&mp*PudVuVTGe^J_pA2 zF`S{)nchSTKi;b?wH@EMJqUOI41BeH^$fd$f=?{C4HQLKE8<6W4|fB+GjDqm=k$qB zJ_*K^u;OJD-$hSGg|IIhP`5o}D$c@N!T#)$zWNfuR}aFB zhu7$}^lG!7jyX7v7QBG&%0J^fQ^EBGUd={zR3T5QvaM%^aMl|#itY?{|1(PL+U%q0 zjhtI#euGRg(<&O`Gx8DxNLv+OBLmLLydT@)5BYOG=ffOhMs%VV_J=-WHa4^K6@JKZ zgTT!7ktT9{Rwn#5nvr#y%!}$w&ANKweZ0R`{Q@krQr#S|ok|=22tVNf-yVf&e*|CB zqT?^5)d#_%#=bW!F5i~X;$%ZP3bdK4iE37wcD16c@>1|!4VF{s`yD|yn&1kauH2_* zHl90n9bT+)hPYVPeVXAt45lK?)qltt2+N;4J~qb z&EF2Nw0Kd82S*o-y89gTd+^^}DeU`r8DE3P_k)siHip1?S1J7=+|;wyA~TEc$7soo z)WI{fvUOZ@2<6*UE150Nb$<6y=2s}yOvywthPT1jb&%I{N5*%rjIWtY=AKUgeV;#{=PtP>hZg5UAZC9h4}ET%eSMy@ z+YttvmG>}Q8qOZ(As7oYlIs)S{Opr<@@j &get_ref(void); + +extern "C" __global__ void larger_sad_calc_8(unsigned short*, int, int); +extern "C" __global__ void larger_sad_calc_16(unsigned short*, int, int);*/