Merge branch 'master' of https://github.gatech.edu/casl/Vortex
This commit is contained in:
101
benchmarks/opencl/BlackScholes/BlackScholes.cl
Normal file
101
benchmarks/opencl/BlackScholes/BlackScholes.cl
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Math function selection for the kernel.
// The first branch (currently disabled) maps the helpers onto the native_*
// built-ins; the active branch uses the standard exp/log/sqrt built-ins.
#if 0
    #define EXP(a)  native_exp(a)
    #define LOG(a)  native_log(a)
    #define SQRT(a) native_sqrt(a)
#else
    #define EXP(a)  exp(a)
    #define LOG(a)  log(a)
    #define SQRT(a) sqrt(a)
#endif
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Predefine functions to avoid bug in OpenCL compiler on Mac OSX 10.7 systems
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Cumulative normal distribution approximation (defined below).
float CND(float d);

// Prices one option and stores the call/put results through the pointers
// (defined below).
void BlackScholesBody(
    __global float *call,
    __global float *put,
    float S,
    float X,
    float T,
    float R,
    float V
);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Rational approximation of cumulative normal distribution function
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
float CND(float d){
    // Coefficients of the degree-5 polynomial in K
    const float A1 =  0.31938153f;
    const float A2 = -0.356563782f;
    const float A3 =  1.781477937f;
    const float A4 = -1.821255978f;
    const float A5 =  1.330274429f;
    const float RSQRT2PI = 0.39894228040143267793994605993438f;

    const float K = 1.0f / (1.0f + 0.2316419f * fabs(d));

    // Polynomial evaluated in nested (Horner) form
    const float poly = K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))));

    // Value used directly for d <= 0
    const float base = RSQRT2PI * EXP(- 0.5f * d * d) * poly;

    // For positive d the complement is returned
    return (d > 0) ? 1.0f - base : base;
}
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Black-Scholes formula for both call and put
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
void BlackScholesBody(
    __global float *call, //Call option price (output)
    __global float *put,  //Put option price (output)
    float S,              //Current stock price
    float X,              //Option strike price
    float T,              //Option years
    float R,              //Riskless rate of return
    float V               //Stock volatility
){
    const float rootT  = SQRT(T);
    const float vRootT = V * rootT;

    const float d1 = (LOG(S / X) + (R + 0.5f * V * V) * T) / vRootT;
    const float d2 = d1 - vRootT;

    const float cndD1 = CND(d1);
    const float cndD2 = CND(d2);

    // Strike scaled by exp(-R*T); shared by the call and the put formula
    const float scaledX = X * EXP(- R * T);

    *call = (S * cndD1 - scaledX * cndD2);
    *put  = (scaledX * (1.0f - cndD2) - S * (1.0f - cndD1));
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
__kernel void BlackScholes(
    __global float *d_Call, //Call option price
    __global float *d_Put,  //Put option price
    __global float *d_S,    //Current stock price
    __global float *d_X,    //Option strike price
    __global float *d_T,    //Option years
    float R,                //Riskless rate of return
    float V,                //Stock volatility
    unsigned int optN
){
    // Each work-item starts at its global id and advances by the global
    // size until it runs past optN.
    const unsigned int step = get_global_size(0);
    unsigned int idx = get_global_id(0);

    while(idx < optN){
        BlackScholesBody(&d_Call[idx], &d_Put[idx], d_S[idx], d_X[idx], d_T[idx], R, V);
        idx += step;
    }
}
|
||||||
66
benchmarks/opencl/BlackScholes/Makefile
Normal file
66
benchmarks/opencl/BlackScholes/Makefile
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
# Build rules for the BlackScholes OpenCL benchmark (RISC-V toolchain + POCL).
PROJECT = BlackScholes

# Toolchain and runtime locations
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH    = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH   = $(wildcard ../include)
POCL_LIB_PATH   = $(wildcard ../lib)
VX_RT_PATH      = $(wildcard ../../../runtime)
VX_SIMX_PATH    = $(wildcard ../../../simX/obj_dir)

CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb

# Runtime sources linked into the .elf image
VX_SRCS  = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# These variables are not set in this file; they expand to nothing unless
# supplied by the caller.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld

CXXFLAGS  = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

VX_LIBS   = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a

# Host-side benchmark sources. Headers are prerequisites only; they must not
# be passed to the compiler as inputs.
SRCS = main.cpp oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp
HDRS = oclBlackScholes_common.h oclUtils.h

.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static library with poclcc.
lib$(PROJECT).a: $(PROJECT).cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a $(PROJECT).cl

$(PROJECT).elf: $(SRCS) $(HDRS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf

# The launcher and gold sources are required here too: main.cpp calls
# functions defined in them.
$(PROJECT).qemu: $(SRCS) $(HDRS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $(PROJECT).elf > $(PROJECT).dump

run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.elf *.dump *.hex lib$(PROJECT).a $(PROJECT).qemu
|
||||||
248
benchmarks/opencl/BlackScholes/main.cpp
Normal file
248
benchmarks/opencl/BlackScholes/main.cpp
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// standard utilities and systems includes
|
||||||
|
#include <oclUtils.h>
|
||||||
|
#include <shrQATest.h>
|
||||||
|
#include "oclBlackScholes_common.h"
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Helper functions
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Returns the time in seconds between the profiling START and END
// timestamps of `event`. Returns 0.0 when either timestamp query fails,
// so the result is never computed from uninitialised values.
double executionTime(cl_event &event){
    cl_ulong start = 0, end = 0;

    cl_int ciErrNum = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
    ciErrNum |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
    if(ciErrNum != CL_SUCCESS)
        return 0.0;

    return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Random float helper
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Draws one value from rand() and uses it to blend linearly between `low`
// (weight 1 - w) and `high` (weight w), where w = rand() / RAND_MAX.
float randFloat(float low, float high){
    const float w = (float)rand() / (float)RAND_MAX;

    return (1.0f - w) * low + w * high;
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Main program
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
cl_platform_id cpPlatform; //OpenCL platform
|
||||||
|
cl_device_id* cdDevices = NULL; //OpenCL devices list (array)
|
||||||
|
cl_context cxGPUContext; //OpenCL context
|
||||||
|
cl_command_queue cqCommandQueue; //OpenCL command que
|
||||||
|
cl_mem //OpenCL memory buffer objects
|
||||||
|
d_Call,
|
||||||
|
d_Put,
|
||||||
|
d_S,
|
||||||
|
d_X,
|
||||||
|
d_T;
|
||||||
|
|
||||||
|
cl_int ciErrNum;
|
||||||
|
|
||||||
|
float
|
||||||
|
*h_CallCPU,
|
||||||
|
*h_PutCPU,
|
||||||
|
*h_CallGPU,
|
||||||
|
*h_PutGPU,
|
||||||
|
*h_S,
|
||||||
|
*h_X,
|
||||||
|
*h_T;
|
||||||
|
|
||||||
|
const unsigned int optionCount = 4000000;
|
||||||
|
const float R = 0.02f;
|
||||||
|
const float V = 0.30f;
|
||||||
|
|
||||||
|
shrQAStart(argc, argv);
|
||||||
|
|
||||||
|
// Get the NVIDIA platform
|
||||||
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||||
|
shrLog("clGetPlatformID...\n");
|
||||||
|
|
||||||
|
//Get all the devices
|
||||||
|
cl_uint uiNumDevices = 0; // Number of devices available
|
||||||
|
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||||
|
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
|
||||||
|
shrLog("Get the Device info and select Device...\n");
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||||
|
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||||
|
|
||||||
|
// Get command line device options and config accordingly
|
||||||
|
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||||
|
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
|
||||||
|
{
|
||||||
|
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||||
|
}
|
||||||
|
shrLog(" Using Device %u: ", uiTargetDevice);
|
||||||
|
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||||
|
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||||
|
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
||||||
|
|
||||||
|
// set logfile name and start logs
|
||||||
|
shrSetLogFileName ("oclBlackScholes.txt");
|
||||||
|
shrLog("%s Starting...\n\n", argv[0]);
|
||||||
|
|
||||||
|
shrLog("Allocating and initializing host memory...\n");
|
||||||
|
h_CallCPU = (float *)malloc(optionCount * sizeof(float));
|
||||||
|
h_PutCPU = (float *)malloc(optionCount * sizeof(float));
|
||||||
|
h_CallGPU = (float *)malloc(optionCount * sizeof(float));
|
||||||
|
h_PutGPU = (float *)malloc(optionCount * sizeof(float));
|
||||||
|
h_S = (float *)malloc(optionCount * sizeof(float));
|
||||||
|
h_X = (float *)malloc(optionCount * sizeof(float));
|
||||||
|
h_T = (float *)malloc(optionCount * sizeof(float));
|
||||||
|
|
||||||
|
srand(2009);
|
||||||
|
for(unsigned int i = 0; i < optionCount; i++){
|
||||||
|
h_CallCPU[i] = -1.0f;
|
||||||
|
h_PutCPU[i] = -1.0f;
|
||||||
|
h_S[i] = randFloat(5.0f, 30.0f);
|
||||||
|
h_X[i] = randFloat(1.0f, 100.0f);
|
||||||
|
h_T[i] = randFloat(0.25f, 10.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
shrLog("Initializing OpenCL...\n");
|
||||||
|
// Get the NVIDIA platform
|
||||||
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// Get a GPU device
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// Create the context
|
||||||
|
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
//Create a command-queue
|
||||||
|
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
shrLog("Creating OpenCL memory objects...\n");
|
||||||
|
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
shrLog("Starting up BlackScholes...\n");
|
||||||
|
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
|
||||||
|
|
||||||
|
shrLog("Running OpenCL BlackScholes...\n\n");
|
||||||
|
//Just a single run or a warmup iteration
|
||||||
|
BlackScholes(
|
||||||
|
NULL,
|
||||||
|
d_Call,
|
||||||
|
d_Put,
|
||||||
|
d_S,
|
||||||
|
d_X,
|
||||||
|
d_T,
|
||||||
|
R,
|
||||||
|
V,
|
||||||
|
optionCount
|
||||||
|
);
|
||||||
|
|
||||||
|
#ifdef GPU_PROFILING
|
||||||
|
const int numIterations = 16;
|
||||||
|
cl_event startMark, endMark;
|
||||||
|
ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
|
||||||
|
ciErrNum |= clFinish(cqCommandQueue);
|
||||||
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
shrDeltaT(0);
|
||||||
|
|
||||||
|
for(int i = 0; i < numIterations; i++){
|
||||||
|
BlackScholes(
|
||||||
|
cqCommandQueue,
|
||||||
|
d_Call,
|
||||||
|
d_Put,
|
||||||
|
d_S,
|
||||||
|
d_X,
|
||||||
|
d_T,
|
||||||
|
R,
|
||||||
|
V,
|
||||||
|
optionCount
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark);
|
||||||
|
ciErrNum |= clFinish(cqCommandQueue);
|
||||||
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
//Calculate performance metrics by wallclock time
|
||||||
|
double gpuTime = shrDeltaT(0) / numIterations;
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclBlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %i, Workgroup = %u\n",
|
||||||
|
(double)(2.0 * optionCount * 1.0e-9)/gpuTime, gpuTime, (2 * optionCount), 1, 0);
|
||||||
|
|
||||||
|
//Get profiling info
|
||||||
|
cl_ulong startTime = 0, endTime = 0;
|
||||||
|
ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
|
||||||
|
ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
|
||||||
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
shrLog("\nReading back OpenCL BlackScholes results...\n");
|
||||||
|
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
shrLog("Comparing against Host/C++ computation...\n");
|
||||||
|
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
|
||||||
|
double deltaCall = 0, deltaPut = 0, sumCall = 0, sumPut = 0;
|
||||||
|
double L1call, L1put;
|
||||||
|
for(unsigned int i = 0; i < optionCount; i++)
|
||||||
|
{
|
||||||
|
sumCall += fabs(h_CallCPU[i]);
|
||||||
|
sumPut += fabs(h_PutCPU[i]);
|
||||||
|
deltaCall += fabs(h_CallCPU[i] - h_CallGPU[i]);
|
||||||
|
deltaPut += fabs(h_PutCPU[i] - h_PutGPU[i]);
|
||||||
|
}
|
||||||
|
L1call = deltaCall / sumCall;
|
||||||
|
L1put = deltaPut / sumPut;
|
||||||
|
shrLog("Relative L1 (call, put) = (%.3e, %.3e)\n\n", L1call, L1put);
|
||||||
|
|
||||||
|
shrLog("Shutting down...\n");
|
||||||
|
closeBlackScholes();
|
||||||
|
ciErrNum = clReleaseMemObject(d_T);
|
||||||
|
ciErrNum |= clReleaseMemObject(d_X);
|
||||||
|
ciErrNum |= clReleaseMemObject(d_S);
|
||||||
|
ciErrNum |= clReleaseMemObject(d_Put);
|
||||||
|
ciErrNum |= clReleaseMemObject(d_Call);
|
||||||
|
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
|
||||||
|
ciErrNum |= clReleaseContext(cxGPUContext);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
free(h_T);
|
||||||
|
free(h_X);
|
||||||
|
free(h_S);
|
||||||
|
free(h_PutGPU);
|
||||||
|
free(h_CallGPU);
|
||||||
|
free(h_PutCPU);
|
||||||
|
free(h_CallCPU);
|
||||||
|
|
||||||
|
if(cdDevices)free(cdDevices);
|
||||||
|
|
||||||
|
shrQAFinishExit(argc, (const char **)argv, ((L1call < 1E-6) && (L1put < 1E-6)) ? QA_PASSED : QA_FAILED );
|
||||||
|
}
|
||||||
BIN
benchmarks/opencl/BlackScholes/oclBlackScholes.pdf
Normal file
BIN
benchmarks/opencl/BlackScholes/oclBlackScholes.pdf
Normal file
Binary file not shown.
50
benchmarks/opencl/BlackScholes/oclBlackScholes_common.h
Normal file
50
benchmarks/opencl/BlackScholes/oclBlackScholes_common.h
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef OCL_BLACKSCHOLES_COMMON_H
#define OCL_BLACKSCHOLES_COMMON_H

#include <oclUtils.h>

////////////////////////////////////////////////////////////////////////////////
// Process an array of optN options on CPU
////////////////////////////////////////////////////////////////////////////////
extern "C" void BlackScholesCPU(
    float *h_Call, //Call option price
    float *h_Put,  //Put option price
    float *h_S,    //Current stock price
    float *h_X,    //Option strike price
    float *h_T,    //Option years
    float R,       //Riskless rate of return
    float V,       //Stock volatility
    unsigned int optionCount
);

////////////////////////////////////////////////////////////////////////////////
// OpenCL Black-Scholes kernel launcher
////////////////////////////////////////////////////////////////////////////////

// Creates the program and kernel objects and stores the command queue that
// BlackScholes() uses when it is called with a NULL queue.
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv);

// Releases the kernel and program objects created by initBlackScholes().
extern "C" void closeBlackScholes(void);

// Sets the kernel arguments and enqueues one kernel launch.
extern "C" void BlackScholes(
    cl_command_queue cqCommandQueue,
    cl_mem d_Call, //Call option price
    cl_mem d_Put,  //Put option price
    cl_mem d_S,    //Current stock price
    cl_mem d_X,    //Option strike price
    cl_mem d_T,    //Option years
    cl_float R,    //Riskless rate of return
    cl_float V,    //Stock volatility
    cl_uint optionCount
);

#endif // OCL_BLACKSCHOLES_COMMON_H
|
||||||
92
benchmarks/opencl/BlackScholes/oclBlackScholes_gold.cpp
Normal file
92
benchmarks/opencl/BlackScholes/oclBlackScholes_gold.cpp
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
|
#include "oclBlackScholes_common.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Rational approximation of cumulative normal distribution function
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Double-precision host version of the cumulative normal distribution
// approximation: a Gaussian factor times a degree-5 polynomial in K,
// complemented for positive arguments.
static double CND(double d){
    // Polynomial coefficients A1..A5, lowest order first
    const double coeff[5] = {
         0.31938153,
        -0.356563782,
         1.781477937,
        -1.821255978,
         1.330274429
    };
    const double RSQRT2PI = 0.39894228040143267793994605993438;

    const double K = 1.0 / (1.0 + 0.2316419 * fabs(d));

    // Nested (Horner) evaluation, innermost term first:
    // K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))
    double acc = coeff[4];
    for(int i = 3; i >= 0; i--)
        acc = coeff[i] + K * acc;
    const double poly = K * acc;

    const double base = RSQRT2PI * exp(- 0.5 * d * d) * poly;

    return (d > 0) ? 1.0 - base : base;
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Black-Scholes formula for both call and put
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
static void BlackScholesBodyCPU(
|
||||||
|
float& call, //Call option price
|
||||||
|
float& put, //Put option price
|
||||||
|
float Sf, //Current stock price
|
||||||
|
float Xf, //Option strike price
|
||||||
|
float Tf, //Option years
|
||||||
|
float Rf, //Riskless rate of return
|
||||||
|
float Vf //Stock volatility
|
||||||
|
){
|
||||||
|
double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf;
|
||||||
|
|
||||||
|
double sqrtT = sqrt(T);
|
||||||
|
double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
|
||||||
|
double d2 = d1 - V * sqrtT;
|
||||||
|
double CNDD1 = CND(d1);
|
||||||
|
double CNDD2 = CND(d2);
|
||||||
|
|
||||||
|
//Calculate Call and Put simultaneously
|
||||||
|
double expRT = exp(- R * T);
|
||||||
|
call = (float)(S * CNDD1 - X * expRT * CNDD2);
|
||||||
|
put = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1));
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Process an array of optN options
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void BlackScholesCPU(
|
||||||
|
float *h_Call, //Call option price
|
||||||
|
float *h_Put, //Put option price
|
||||||
|
float *h_S, //Current stock price
|
||||||
|
float *h_X, //Option strike price
|
||||||
|
float *h_T, //Option years
|
||||||
|
float R, //Riskless rate of return
|
||||||
|
float V, //Stock volatility
|
||||||
|
unsigned int optionCount
|
||||||
|
){
|
||||||
|
for(unsigned int i = 0; i < optionCount; i++)
|
||||||
|
BlackScholesBodyCPU(
|
||||||
|
h_Call[i],
|
||||||
|
h_Put[i],
|
||||||
|
h_S[i],
|
||||||
|
h_X[i],
|
||||||
|
h_T[i],
|
||||||
|
R,
|
||||||
|
V
|
||||||
|
);
|
||||||
|
}
|
||||||
125
benchmarks/opencl/BlackScholes/oclBlackScholes_launcher.cpp
Normal file
125
benchmarks/opencl/BlackScholes/oclBlackScholes_launcher.cpp
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#include <oclUtils.h>
|
||||||
|
#include "oclBlackScholes_common.h"
|
||||||
|
|
||||||
|
static cl_program cpBlackScholes; //OpenCL program
|
||||||
|
static cl_kernel ckBlackScholes; //OpenCL kernel
|
||||||
|
static cl_command_queue cqDefaultCommandQueue;
|
||||||
|
|
||||||
|
// Creates and builds the BlackScholes program, creates its kernel object and
// remembers cqParamCommandQueue as the default queue for BlackScholes().
//
// cxGPUContext        - context the program is created in
// cqParamCommandQueue - queue stored in cqDefaultCommandQueue
// argv                - argv[0] is passed to shrFindFilePath to locate BlackScholes.cl
//
// On a build failure the build log is printed and the process exits.
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum = CL_SUCCESS;
    size_t kernelLength;

    shrLog("...loading BlackScholes.cl\n");
    char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
    shrCheckError(cPathAndName != NULL, shrTRUE);
    char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
    shrCheckError(cBlackScholes != NULL, shrTRUE);

    shrLog("...creating BlackScholes program\n");
    // The program is created from the built-in kernel named "BlackScholes"
    // rather than from the loaded source text. The device comes from the
    // context, and the status is written to ciErrNum so that the check
    // below tests a real value.
    cl_device_id cdDevice = oclGetFirstDev(cxGPUContext);
    cpBlackScholes = clCreateProgramWithBuiltInKernels(cxGPUContext, 1, &cdDevice, "BlackScholes", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...building BlackScholes program\n");
    ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);

    // clBuildProgram reports success as CL_SUCCESS
    if(ciErrNum != CL_SUCCESS){
        shrLog("*** Compilation failure ***\n");

        // CL_CONTEXT_DEVICES reports its size in bytes, so that value is
        // used directly as the allocation and query size.
        size_t devicesBytes = 0;
        cl_device_id *cdDevices;
        ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &devicesBytes);
        shrCheckError(ciErrNum, CL_SUCCESS);

        cdDevices = (cl_device_id *)malloc(devicesBytes);
        shrCheckError(cdDevices != NULL, shrTRUE);

        ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, devicesBytes, cdDevices, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        size_t logSize = 0;
        char *logTxt;

        ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        shrCheckError(ciErrNum, CL_SUCCESS);

        logTxt = (char *)malloc(logSize);
        shrCheckError(logTxt != NULL, shrTRUE);

        ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, logSize, logTxt, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        shrLog("%s\n", logTxt);
        shrLog("*** Exiting ***\n");
        free(logTxt);
        free(cdDevices);
        exit(666);
    }

    //Save ptx code to separate file
    oclLogPtx(cpBlackScholes, cdDevice, "BlackScholes.ptx");

    shrLog("...creating BlackScholes kernels\n");
    ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cBlackScholes);
    free(cPathAndName);
}
|
||||||
|
|
||||||
|
// Releases the kernel and then the program held in the file-local state,
// and checks the combined status.
extern "C" void closeBlackScholes(void){
    cl_int ciErrNum = clReleaseKernel(ckBlackScholes);
    ciErrNum |= clReleaseProgram(cpBlackScholes);

    shrCheckError(ciErrNum, CL_SUCCESS);
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// OpenCL Black-Scholes kernel launcher
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Binds the eight kernel arguments and enqueues one 1-D launch of the
// BlackScholes kernel. A NULL cqCommandQueue selects the queue stored by
// initBlackScholes().
extern "C" void BlackScholes(
    cl_command_queue cqCommandQueue,
    cl_mem d_Call, //Call option price
    cl_mem d_Put,  //Put option price
    cl_mem d_S,    //Current stock price
    cl_mem d_X,    //Option strike price
    cl_mem d_T,    //Option years
    cl_float R,    //Riskless rate of return
    cl_float V,    //Stock volatility
    cl_uint optionCount
){
    cl_command_queue queue = cqCommandQueue ? cqCommandQueue : cqDefaultCommandQueue;

    // Argument indices follow the kernel's parameter order
    cl_int ciErrNum = clSetKernelArg(ckBlackScholes, 0, sizeof(cl_mem), (void *)&d_Call);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 1, sizeof(cl_mem), (void *)&d_Put);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 2, sizeof(cl_mem), (void *)&d_S);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 3, sizeof(cl_mem), (void *)&d_X);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 4, sizeof(cl_mem), (void *)&d_T);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 5, sizeof(cl_float), (void *)&R);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 6, sizeof(cl_float), (void *)&V);
    ciErrNum |= clSetKernelArg(ckBlackScholes, 7, sizeof(cl_uint), (void *)&optionCount);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Run the kernel with a fixed launch geometry
    const size_t localWorkSize  = 128;
    const size_t globalWorkSize = 60 * 1024;
    ciErrNum = clEnqueueNDRangeKernel(queue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
}
|
||||||
198
benchmarks/opencl/BlackScholes/oclUtils.h
Normal file
198
benchmarks/opencl/BlackScholes/oclUtils.h
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef OCL_UTILS_H
|
||||||
|
#define OCL_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// Common headers: Cross-API utilities and OpenCL header
|
||||||
|
#include <shrUtils.h>
|
||||||
|
|
||||||
|
// All OpenCL headers
|
||||||
|
#if defined (__APPLE__) || defined(MACOSX)
|
||||||
|
#include <OpenCL/opencl.h>
|
||||||
|
#else
|
||||||
|
#include <CL/opencl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Includes
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||||
|
// extensions from <CL/cl_ext.h>
|
||||||
|
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||||
|
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||||
|
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||||
|
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||||
|
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||||
|
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||||
|
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// reminders for build output window and log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including shrUtils.h")
|
||||||
|
#pragma message ("Note: including opencl.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// SDK Revision #
|
||||||
|
#define OCL_SDKREVISION "7027912"
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param clSelectedPlatformID OpenCL platform ID
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print info about the device
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and return device capability
|
||||||
|
//!
|
||||||
|
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" int oclGetDevCap(cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print the device name
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the first device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the nth device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id or -1 when out of range
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//! @param device_idx index of the device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of device with maximal FLOPS from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Loads a Program file and prepends the cPreamble to the code.
|
||||||
|
//!
|
||||||
|
//! @return the source string if succeeded, 0 otherwise
|
||||||
|
//! @param cFilename program filename
|
||||||
|
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||||
|
//! @param szFinalLength returned length of the code string
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the binary (PTX) of the program associated with the device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param binary returned code
|
||||||
|
//! @param length length of returned code
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param const char* cPtxFileName optional PTX file name
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||||
|
|
||||||
|
// Helper function for De-allocating cl objects
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL error string from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclErrorString(cl_int error);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // The check passes when the tested value equals the reference value
    if (iSample == iReference)
    {
        return;
    }

    // A mismatch is an error even if the tested value is 0, so report -9999 in that case
    const cl_int iErrCode = (iSample != 0) ? iSample : -9999;

    // Report the error code, its string form and the call site
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iErrCode, oclErrorString(iErrCode), iLine, cFile);

    // Without a cleanup callback: close the log and terminate, using the error code as exit code
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iErrCode);
    }

    // With a cleanup callback: hand it the error code; this function does not call exit() itself
    pCleanup(iErrCode);
}
|
||||||
|
|
||||||
|
#endif
|
||||||
238
benchmarks/opencl/BlackScholes/shrQATest.h
Normal file
238
benchmarks/opencl/BlackScholes/shrQATest.h
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_QATEST_H
|
||||||
|
#define SHR_QATEST_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#pragma message ("Note: including time.h")
|
||||||
|
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#include <time.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <time.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRCASECMP _stricmp
|
||||||
|
#else
|
||||||
|
#define STRCASECMP strcasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRNCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRNCASECMP _strnicmp
|
||||||
|
#else
|
||||||
|
#define STRNCASECMP strncasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Standardized QA Start/Finish for CUDA SDK tests
|
||||||
|
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||||
|
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||||
|
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||||
|
|
||||||
|
// Returns the index of the first character after the last path separator
// ('/' or '\\') in exec_name, or 0 when the string contains no separator.
inline int findExeNameStart(const char *exec_name)
{
    int name_offset = 0;

    // Walk the string once, remembering the position just past each separator seen
    for (int pos = 0; exec_name[pos] != '\0'; ++pos)
    {
        const char ch = exec_name[pos];
        if (ch == '/' || ch == '\\')
        {
            name_offset = pos + 1;
        }
    }

    return name_offset;
}
|
||||||
|
|
||||||
|
inline int __shrQAStart(int argc, char **argv)
|
||||||
|
{
|
||||||
|
bool bQATest = false;
|
||||||
|
// First clear the output buffer
|
||||||
|
fflush(stdout);
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
char *string_argv = &argv[i][string_start];
|
||||||
|
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We don't want to print the entire path, so we search for the first
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
return exename_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test outcome codes; the values index the status-name tables used by the
// shrQAFinish helpers ("FAILED", "PASSED", "WAIVED").
enum eQAstatus {
    QA_FAILED = 0,   // reported as "FAILED"
    QA_PASSED = 1,   // reported as "PASSED"
    QA_WAIVED = 2    // reported as "WAIVED"
};
|
||||||
|
|
||||||
|
// Prints a countdown to stdout and blocks for roughly `seconds` seconds.
//
// @param seconds  number of seconds to wait; for values <= 0 the loop body
//                 never runs and only the header and "done!" lines are printed
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    const time_t deadline = time(0) + seconds;
    for (int count = seconds; time(0) < deadline; count--) {
        fprintf(stdout, "%d...", count);
        // The tick has no trailing newline, so flush explicitly; otherwise a
        // line-buffered stdout shows the whole countdown only after the wait.
        fflush(stdout);
        // _WIN32 is the compiler-defined macro used by every other platform
        // switch in this header; WIN32 is only set by project/SDK settings.
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }

    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||||
|
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bNoPrompt = true;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bNoPrompt = false;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
} else {
|
||||||
|
if (!bNoPrompt) {
|
||||||
|
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||||
|
fflush(stdout);
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
bool bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish(argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
642
benchmarks/opencl/BlackScholes/shrUtils.h
Normal file
642
benchmarks/opencl/BlackScholes/shrUtils.h
Normal file
@@ -0,0 +1,642 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_UTILS_H
|
||||||
|
#define SHR_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// reminders for output window and build log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Other headers needed for both Windows and Linux
|
||||||
|
#include <math.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// Un-comment the following #define to enable profiling code in SDK apps
|
||||||
|
//#define GPU_PROFILING
|
||||||
|
|
||||||
|
// Beginning of GPU Architecture definitions
|
||||||
|
// Maps an SM (compute capability) version to the number of cores per SM.
//
// @param major  SM major version
// @param minor  SM minor version
// @return cores per SM for a known version; otherwise prints a diagnostic to
//         stdout and returns -1
inline int ConvertSMVer2Cores(int major, int minor)
{
    // Lookup entry: SM version encoded as 0xMm (hexadecimal notation),
    // M = SM major version, m = SM minor version
    typedef struct {
        int SM;
        int Cores;
    } sSMtoCores;

    // Read-only table, built once instead of on every call; { -1, -1 } terminates it
    static const sSMtoCores nGpuArchCoresPerSM[] =
    { { 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
      { 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
      { 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
      { 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
      { 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
      { 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
      { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
      {   -1,  -1 }
    };

    // Same key as (major << 4) + minor, without left-shifting a possibly negative value
    const int smVersion = major * 16 + minor;

    for (int index = 0; nGpuArchCoresPerSM[index].SM != -1; index++) {
        if (nGpuArchCoresPerSM[index].SM == smVersion) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||||
|
// end of GPU Architecture definitions
|
||||||
|
|
||||||
|
|
||||||
|
// Defines and enum for use with logging functions
|
||||||
|
// *********************************************************************
|
||||||
|
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||||
|
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||||
|
// Bit flags for the iLogMode argument of the logging functions; combine with |
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // log to console
    LOGFILE    = 1 << 1,                // log to file
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // log to console and file
    APPENDMODE = 1 << 2,                // open the log file in append mode rather than replace mode
    MASTER     = 1 << 3,                // also write to the master .csv log
    ERRORMSG   = 1 << 4,                // pre-pend "Error" to the message
    CLOSELOG   = 1 << 5                 // close the log file, if open, after any requested file write
};
|
||||||
|
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||||
|
|
||||||
|
// Standardized boolean
|
||||||
|
// Boolean result type returned by the shrUtils helper functions
enum shrBOOL
{
    shrFALSE = 0,   // failure / false
    shrTRUE  = 1    // success / true
};
|
||||||
|
|
||||||
|
// Standardized MAX, MIN and CLAMP
|
||||||
|
// Every argument is parenthesized so that expression arguments (e.g. x ^ 1, c ? p : q)
// keep their meaning after expansion. Arguments are still evaluated more than once,
// so do not pass expressions with side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX((a), (b)), (c))      // double sided clip of input a to [b, c]
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))    // single top side clip of input a to at most b
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... extended version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... short version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||||
|
|
||||||
|
// Simple argument checker macro
|
||||||
|
// Returns shrFALSE from the calling function unless (a) equals shrTRUE.
// Wrapped in do { } while (0) so it acts as a single statement and cannot
// capture a following `else`; invoke it with a trailing semicolon.
#define ARGCHECK(a) do { if ((a) != shrTRUE) return shrFALSE; } while (0)
|
||||||
|
|
||||||
|
// Define for user-customized error handling
|
||||||
|
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||||
|
|
||||||
|
// Function to deallocate memory allocated within shrUtils
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFree(void* ptr);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Helper function to log standardized information to Console, to File or to both
|
||||||
|
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||||
|
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||||
|
//!
|
||||||
|
//! Automatically opens file and stores handle if needed and not done yet
|
||||||
|
//! Closes file and nulls handle on request
|
||||||
|
//!
|
||||||
|
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||||
|
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||||
|
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||||
|
//! @param 2 dValue:
|
||||||
|
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||||
|
//! Negative val is an error code and this give error preformatting.
|
||||||
|
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||||
|
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||||
|
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||||
|
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||||
|
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
|
||||||
|
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLog(const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||||
|
// Maintains state for 3 independent counters
|
||||||
|
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||||
|
//!
|
||||||
|
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||||
|
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" double shrDeltaT(int iCounterID);
|
||||||
|
|
||||||
|
// Optional LogFileNameOverride function
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||||
|
|
||||||
|
// Helper function to init data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
// Helper function to print data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Find the path for a filename
|
||||||
|
//! @return the path if succeeded, otherwise 0
|
||||||
|
//! @param filename name of the file
|
||||||
|
//! @param executablePath optional absolute path of the executable
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing single precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing double precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
//!       within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
//!       within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing single precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||||
|
const float epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing double precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||||
|
const double epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PPM image file (with unsigned char as data element type), padding
|
||||||
|
//! 4th component
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param OutData handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//!
|
||||||
|
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||||
|
unsigned int *w, unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||||
|
//! 4 bytes)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||||
|
unsigned int *w,unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Command line arguments: General notes
|
||||||
|
// * All command line arguments begin with '--' followed by the token;
|
||||||
|
// token and value are separated by '='; example --samples=50
|
||||||
|
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||||
|
// (without whitespaces)
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Check if command line argument \a flag_name is given
|
||||||
|
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||||
|
//! otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param flag_name name of command line flag
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||||
|
const char* flag_name);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||||
|
const char* arg_name, int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type unsigned int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||||
|
const char* arg_name, unsigned int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type float
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||||
|
const char* arg_name, float* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type string
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument list whose elements are strings
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val command line argument list
|
||||||
|
//! @param len length of the list / number of elements
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val,
|
||||||
|
unsigned int* len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||||
|
const unsigned int len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integer arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned char arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned char arrays with a tolerance for # of byte errors
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned char arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||||
|
//! threshold for # pixel errors
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||||
|
//! equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param ref_file filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! $param verboseErrors output details of image mismatch to std::err
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param ref_file filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! $param verboseErrors output details of image mismatch to std::err
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||||
|
|
||||||
|
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||||
|
{
|
||||||
|
if (iReference != iSample)
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||||
|
if (pCleanup != NULL)
|
||||||
|
{
|
||||||
|
pCleanup(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Standardized Exit
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||||
|
{
|
||||||
|
#ifdef WIN32
|
||||||
|
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#else
|
||||||
|
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||||
|
}
|
||||||
|
fflush(stderr);
|
||||||
|
exit(iExitCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
29
benchmarks/opencl/DotProduct/DotProduct.cl
Normal file
29
benchmarks/opencl/DotProduct/DotProduct.cl
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Computes one 4-component dot product per work-item:
//   c[i] = a[4i]*b[4i] + a[4i+1]*b[4i+1] + a[4i+2]*b[4i+2] + a[4i+3]*b[4i+3]
// a, b          input arrays, read at indices 4*i .. 4*i+3
// c             output array, written at index i
// iNumElements  number of outputs to produce; work-items at or past it do nothing
__kernel void DotProduct (__global float* a, __global float* b, __global float* c, int iNumElements)
{
    // index of the output element handled by this work-item
    const int iOut = get_global_id(0);

    // bound check (equivalent to the limit on a 'for' loop in standard/serial C code)
    if (iOut < iNumElements)
    {
        // each output consumes four consecutive inputs, so the input offset is 4 * iOut
        const int iBase = iOut << 2;

        c[iOut] = a[iBase] * b[iBase]
                + a[iBase + 1] * b[iBase + 1]
                + a[iBase + 2] * b[iBase + 2]
                + a[iBase + 3] * b[iBase + 3];
    }
}
|
||||||
66
benchmarks/opencl/DotProduct/Makefile
Normal file
66
benchmarks/opencl/DotProduct/Makefile
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
|
||||||
|
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
|
||||||
|
POCL_INC_PATH = $(wildcard ../include)
|
||||||
|
POCL_LIB_PATH = $(wildcard ../lib)
|
||||||
|
VX_RT_PATH = $(wildcard ../../../runtime)
|
||||||
|
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
|
||||||
|
|
||||||
|
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
|
||||||
|
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
|
||||||
|
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
|
||||||
|
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||||
|
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
|
||||||
|
|
||||||
|
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
|
||||||
|
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
|
||||||
|
|
||||||
|
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
|
||||||
|
|
||||||
|
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||||
|
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||||
|
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||||
|
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||||
|
CXXFLAGS += -I$(POCL_INC_PATH) -I.
|
||||||
|
|
||||||
|
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||||
|
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
|
||||||
|
|
||||||
|
PROJECT=DotProduct
|
||||||
|
|
||||||
|
all: $(PROJECT).dump $(PROJECT).hex
|
||||||
|
|
||||||
|
# Build the kernel library by running poclcc on the OpenCL source listed as the
# prerequisite. The recipe uses $< so it compiles that same file; it previously
# named kernel.cl, which did not match the DotProduct.cl prerequisite.
lib$(PROJECT).a: DotProduct.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a $<
|
||||||
|
|
||||||
|
$(PROJECT).elf: main.cc lib$(PROJECT).a
|
||||||
|
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
|
||||||
|
|
||||||
|
$(PROJECT).qemu: main.cc lib$(PROJECT).a
|
||||||
|
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||||
|
|
||||||
|
$(PROJECT).hex: $(PROJECT).elf
|
||||||
|
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||||
|
|
||||||
|
$(PROJECT).dump: $(PROJECT).elf
|
||||||
|
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||||
|
|
||||||
|
run: $(PROJECT).hex
|
||||||
|
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
|
||||||
|
|
||||||
|
qemu: $(PROJECT).qemu
|
||||||
|
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||||
|
|
||||||
|
gdb-s: $(PROJECT).qemu
|
||||||
|
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||||
|
|
||||||
|
gdb-c: $(PROJECT).qemu
|
||||||
|
$(GDB) $(PROJECT).qemu
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf *.elf *.dump *.hex
|
||||||
270
benchmarks/opencl/DotProduct/main.cc
Normal file
270
benchmarks/opencl/DotProduct/main.cc
Normal file
@@ -0,0 +1,270 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// oclDotProduct Notes:
|
||||||
|
//
|
||||||
|
// A simple OpenCL API demo application that implements a
|
||||||
|
// vector dot product computation between 2 float arrays.
|
||||||
|
//
|
||||||
|
// Runs computations with OpenCL on the GPU device and then checks results
|
||||||
|
// against basic host CPU/C++ computation.
|
||||||
|
//
|
||||||
|
// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness.
|
||||||
|
// But these are NOT required libs for OpenCL development in general.
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// standard utilities and systems includes
|
||||||
|
#include <oclUtils.h>
|
||||||
|
#include <shrQATest.h>
|
||||||
|
|
||||||
|
// Name of the file with the source code for the computation kernel
|
||||||
|
// *********************************************************************
|
||||||
|
const char* cSourceFile = "DotProduct.cl";
|
||||||
|
|
||||||
|
// Host buffers for demo
|
||||||
|
// *********************************************************************
|
||||||
|
void *srcA, *srcB, *dst; // Host buffers for OpenCL test
|
||||||
|
void* Golden; // Host buffer for host golden processing cross check
|
||||||
|
|
||||||
|
// OpenCL Vars
|
||||||
|
cl_platform_id cpPlatform; // OpenCL platform
|
||||||
|
cl_device_id *cdDevices; // OpenCL device
|
||||||
|
cl_context cxGPUContext; // OpenCL context
|
||||||
|
cl_command_queue cqCommandQueue;// OpenCL command que
|
||||||
|
cl_program program; // OpenCL program
|
||||||
|
cl_kernel ckKernel; // OpenCL kernel
|
||||||
|
cl_mem cmDevSrcA; // OpenCL device source buffer A
|
||||||
|
cl_mem cmDevSrcB; // OpenCL device source buffer B
|
||||||
|
cl_mem cmDevDst; // OpenCL device destination buffer
|
||||||
|
size_t szGlobalWorkSize; // Total # of work items in the 1D range
|
||||||
|
size_t szLocalWorkSize; // # of work items in the 1D work group
|
||||||
|
size_t szParmDataBytes; // Byte size of context information
|
||||||
|
size_t szKernelLength; // Byte size of kernel code
|
||||||
|
cl_int ciErrNum; // Error code var
|
||||||
|
char* cPathAndName = NULL; // var for full paths to data, src, etc.
|
||||||
|
char* cSourceCL = NULL; // Buffer to hold source for compilation
|
||||||
|
const char* cExecutableName = NULL;
|
||||||
|
|
||||||
|
// demo config vars
|
||||||
|
int iNumElements= 1277944; // Length of float arrays to process (odd # for illustration)
|
||||||
|
shrBOOL bNoPrompt = shrFALSE;
|
||||||
|
|
||||||
|
// Forward Declarations
|
||||||
|
// *********************************************************************
|
||||||
|
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements);
|
||||||
|
void Cleanup (int iExitCode);
|
||||||
|
void (*pCleanup)(int) = &Cleanup;
|
||||||
|
|
||||||
|
int *gp_argc = NULL;
|
||||||
|
char ***gp_argv = NULL;
|
||||||
|
|
||||||
|
// Main function
|
||||||
|
// *********************************************************************
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
gp_argc = &argc;
|
||||||
|
gp_argv = &argv;
|
||||||
|
|
||||||
|
shrQAStart(argc, argv);
|
||||||
|
|
||||||
|
// Get the NVIDIA platform
|
||||||
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||||
|
shrLog("clGetPlatformID...\n");
|
||||||
|
|
||||||
|
// Get the NVIDIA platform
|
||||||
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||||
|
shrLog("clGetPlatformID...\n");
|
||||||
|
|
||||||
|
//Get all the devices
|
||||||
|
cl_uint uiNumDevices = 0; // Number of devices available
|
||||||
|
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||||
|
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
|
||||||
|
shrLog("Get the Device info and select Device...\n");
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||||
|
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||||
|
|
||||||
|
// Get command line device options and config accordingly
|
||||||
|
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||||
|
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
|
||||||
|
{
|
||||||
|
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||||
|
}
|
||||||
|
shrLog(" Using Device %u: ", uiTargetDevice);
|
||||||
|
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||||
|
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||||
|
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
||||||
|
|
||||||
|
// get command line arg for quick test, if provided
|
||||||
|
bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
|
||||||
|
|
||||||
|
// start logs
|
||||||
|
cExecutableName = argv[0];
|
||||||
|
shrSetLogFileName ("oclDotProduct.txt");
|
||||||
|
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
|
||||||
|
|
||||||
|
// set and log Global and Local work size dimensions
|
||||||
|
szLocalWorkSize = 256;
|
||||||
|
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
|
||||||
|
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
|
||||||
|
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
|
||||||
|
|
||||||
|
// Allocate and initialize host arrays
|
||||||
|
shrLog( "Allocate and Init Host Mem...\n");
|
||||||
|
srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
|
||||||
|
srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
|
||||||
|
dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
|
||||||
|
Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
|
||||||
|
shrFillArray((float*)srcA, 4 * iNumElements);
|
||||||
|
shrFillArray((float*)srcB, 4 * iNumElements);
|
||||||
|
|
||||||
|
// Get the NVIDIA platform
|
||||||
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Get a GPU device
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Create the context
|
||||||
|
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Create a command-queue
|
||||||
|
shrLog("clCreateCommandQueue...\n");
|
||||||
|
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
|
||||||
|
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
|
||||||
|
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Read the OpenCL kernel in from source file
|
||||||
|
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
|
||||||
|
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
|
||||||
|
//oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
|
||||||
|
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
|
||||||
|
//oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
|
||||||
|
|
||||||
|
// Create the program
|
||||||
|
shrLog("clCreateProgramWithSource...\n");
|
||||||
|
//program = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
|
||||||
|
cl_program program =
|
||||||
|
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL);
|
||||||
|
// Build the program with 'mad' Optimization option
|
||||||
|
#ifdef MAC
|
||||||
|
char* flags = "-cl-fast-relaxed-math -DMAC";
|
||||||
|
#else
|
||||||
|
char* flags = "-cl-fast-relaxed-math";
|
||||||
|
#endif
|
||||||
|
shrLog("clBuildProgram...\n");
|
||||||
|
ciErrNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
// write out standard error, Build Log and PTX, then cleanup and exit
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||||
|
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
|
||||||
|
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
|
||||||
|
Cleanup(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the kernel
|
||||||
|
shrLog("clCreateKernel (DotProduct)...\n");
|
||||||
|
ckKernel = clCreateKernel(program, "DotProduct", &ciErrNum);
|
||||||
|
|
||||||
|
// Set the Argument values
|
||||||
|
shrLog("clSetKernelArg 0 - 3...\n\n");
|
||||||
|
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// --------------------------------------------------------
|
||||||
|
// Core sequence... copy input data to GPU, compute, copy results back
|
||||||
|
|
||||||
|
// Asynchronous write of data to GPU device
|
||||||
|
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
|
||||||
|
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
|
||||||
|
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Launch kernel
|
||||||
|
shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
|
||||||
|
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Read back results and check accumulated errors
|
||||||
|
shrLog("clEnqueueReadBuffer (Dst)...\n\n");
|
||||||
|
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Compute and compare results for golden-host and report errors and pass/fail
|
||||||
|
shrLog("Comparing against Host/C++ computation...\n\n");
|
||||||
|
DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
|
||||||
|
shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
|
||||||
|
|
||||||
|
// Cleanup and leave
|
||||||
|
Cleanup (EXIT_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
// "Golden" Host processing dot product function for comparison purposes
|
||||||
|
// *********************************************************************
|
||||||
|
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements)
{
    // Output element n is the dot product of the n-th group of four
    // consecutive floats taken from each input array.
    for (int out = 0; out < iNumElements; ++out)
    {
        const int base = 4 * out;   // index of the first float of this group

        pfResult[out] = 0.0f;
        for (int lane = 0; lane < 4; ++lane)
        {
            pfResult[out] += pfData1[base + lane] * pfData2[base + lane];
        }
    }
}
|
||||||
|
|
||||||
|
// Cleanup and exit code
|
||||||
|
// *********************************************************************
|
||||||
|
void Cleanup(int iExitCode)
|
||||||
|
{
|
||||||
|
// Cleanup allocated objects
|
||||||
|
shrLog("Starting Cleanup...\n\n");
|
||||||
|
if(cPathAndName)free(cPathAndName);
|
||||||
|
if(cSourceCL)free(cSourceCL);
|
||||||
|
if(ckKernel)clReleaseKernel(ckKernel);
|
||||||
|
if(program)clReleaseProgram(program);
|
||||||
|
if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
|
||||||
|
if(cxGPUContext)clReleaseContext(cxGPUContext);
|
||||||
|
if (cmDevSrcA)clReleaseMemObject(cmDevSrcA);
|
||||||
|
if (cmDevSrcB)clReleaseMemObject(cmDevSrcB);
|
||||||
|
if (cmDevDst)clReleaseMemObject(cmDevDst);
|
||||||
|
|
||||||
|
// Free host memory
|
||||||
|
free(srcA);
|
||||||
|
free(srcB);
|
||||||
|
free (dst);
|
||||||
|
free(Golden);
|
||||||
|
|
||||||
|
if (cdDevices) free(cdDevices);
|
||||||
|
|
||||||
|
shrQAFinishExit(*gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED);
|
||||||
|
}
|
||||||
198
benchmarks/opencl/DotProduct/oclUtils.h
Normal file
198
benchmarks/opencl/DotProduct/oclUtils.h
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef OCL_UTILS_H
|
||||||
|
#define OCL_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// Common headers:  Cross-API utilities and OpenCL header
|
||||||
|
#include <shrUtils.h>
|
||||||
|
|
||||||
|
// All OpenCL headers
|
||||||
|
#if defined (__APPLE__) || defined(MACOSX)
|
||||||
|
#include <OpenCL/opencl.h>
|
||||||
|
#else
|
||||||
|
#include <CL/opencl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Includes
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||||
|
// extensions from <CL/cl_ext.h>
|
||||||
|
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||||
|
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||||
|
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||||
|
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||||
|
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||||
|
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||||
|
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// reminders for build output window and log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including shrUtils.h")
|
||||||
|
#pragma message ("Note: including opencl.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// SDK Revision #
|
||||||
|
#define OCL_SDKREVISION "7027912"
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param clSelectedPlatformID OpenCL platform ID
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print info about the device
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and return device capability
|
||||||
|
//!
|
||||||
|
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" int oclGetDevCap(cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print the device name
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the first device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the nth device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id or -1 when out of range
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//! @param device_idx index of the device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of device with maximal FLOPS from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Loads a Program file and prepends the cPreamble to the code.
|
||||||
|
//!
|
||||||
|
//! @return the source string if succeeded, 0 otherwise
|
||||||
|
//! @param cFilename program filename
|
||||||
|
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||||
|
//! @param szFinalLength returned length of the code string
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the binary (PTX) of the program associated with the device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param binary returned code
|
||||||
|
//! @param length length of returned code
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param const char* cPtxFileName optional PTX file name
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||||
|
|
||||||
|
// Helper function for De-allocating cl objects
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL error string from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclErrorString(cl_int error);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Nothing to report when the sample/test value matches the reference.
    if (iReference == iSample)
    {
        return;
    }

    // A mismatch is an error by definition, so a 0 sample/test value is
    // replaced with a non-zero code before it is logged and used.
    if (iSample == 0)
    {
        iSample = -9999;
    }

    // Log the error info
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // Without a cleanup callback, close the log and exit with the error code.
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }

    // Otherwise hand the error code to the supplied cleanup callback.
    pCleanup(iSample);
}
|
||||||
|
|
||||||
|
#endif
|
||||||
238
benchmarks/opencl/DotProduct/shrQATest.h
Normal file
238
benchmarks/opencl/DotProduct/shrQATest.h
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_QATEST_H
|
||||||
|
#define SHR_QATEST_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#pragma message ("Note: including time.h")
|
||||||
|
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#include <time.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <time.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRCASECMP _stricmp
|
||||||
|
#else
|
||||||
|
#define STRCASECMP strcasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRNCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRNCASECMP _strnicmp
|
||||||
|
#else
|
||||||
|
#define STRNCASECMP strncasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Standardized QA Start/Finish for CUDA SDK tests
|
||||||
|
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||||
|
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||||
|
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||||
|
|
||||||
|
inline int findExeNameStart(const char *exec_name)
{
    // Walk backwards from the terminating NUL until a path separator
    // ('\\' or '/') is found or the start of the string is reached.
    int pos = (int)strlen(exec_name);
    for (; pos > 0; --pos)
    {
        const char c = exec_name[pos];
        if (c == '\\' || c == '/')
        {
            break;
        }
    }

    // When stopped on a separator the name begins just after it;
    // otherwise it begins at the position reached (index 0).
    const int onSeparator = (exec_name[pos] == '\\') || (exec_name[pos] == '/');
    return onSeparator ? pos + 1 : pos;
}
|
||||||
|
|
||||||
|
inline int __shrQAStart(int argc, char **argv)
|
||||||
|
{
|
||||||
|
bool bQATest = false;
|
||||||
|
// First clear the output buffer
|
||||||
|
fflush(stdout);
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
char *string_argv = &argv[i][string_start];
|
||||||
|
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We don't want to print the entire path, so we search for the first
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
return exename_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test outcome codes. The numeric values are used as indices into the
// { "FAILED", "PASSED", "WAIVED" } status-name tables of the QA finish helpers.
enum eQAstatus {
    QA_FAILED = 0,   // test failed
    QA_PASSED = 1,   // test passed
    QA_WAIVED = 2    // test waived
};
|
||||||
|
|
||||||
|
inline void __ExitInTime(int seconds)
{
    // Prints a countdown on stdout, sleeping about one second per step, and
    // returns once 'seconds' seconds have elapsed. Returns immediately
    // (after printing the header and "done!") when 'seconds' is <= 0.
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    const time_t deadline = time(0) + seconds;
    for (int count = seconds; time(0) < deadline; count--) {
        fprintf(stdout, "%d...", count);
        // The countdown text has no newline; flush so it appears before sleeping.
        fflush(stdout);
        // Test _WIN32 as well: it is the macro this header uses to pick
        // <windows.h> (which declares Sleep) over <unistd.h> (sleep).
#if defined(_WIN32) || defined(WIN32)
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||||
|
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bNoPrompt = true;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bNoPrompt = false;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
} else {
|
||||||
|
if (!bNoPrompt) {
|
||||||
|
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||||
|
fflush(stdout);
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
bool bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish(argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
642
benchmarks/opencl/DotProduct/shrUtils.h
Normal file
642
benchmarks/opencl/DotProduct/shrUtils.h
Normal file
@@ -0,0 +1,642 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_UTILS_H
|
||||||
|
#define SHR_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// reminders for output window and build log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Other headers needed for both Windows and Linux
|
||||||
|
#include <math.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// Un-comment the following #define to enable profiling code in SDK apps
|
||||||
|
//#define GPU_PROFILING
|
||||||
|
|
||||||
|
// Beginning of GPU Architecture definitions
|
||||||
|
// Looks up the number of cores per SM for a given SM version.
//   major, minor : SM version as major.minor
// Returns the core count for a version listed in the table below; for any other
// version a diagnostic is printed and -1 is returned.
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One table row per known SM version
    typedef struct {
        int SM;     // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
        int Cores;  // cores per SM for that version
    } sSMtoCores;

    const sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x10,   8 },  // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 },  // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 },  // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 },  // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 },  // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 },  // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 },  // Kepler Generation (SM 3.0) GK10x class
        {   -1,  -1 }   // sentinel: end of table
    };

    // Pack major.minor the same way the table keys are encoded
    const int smVersion = (major << 4) + minor;

    for (const sSMtoCores *pEntry = nGpuArchCoresPerSM; pEntry->SM != -1; ++pEntry) {
        if (pEntry->SM == smVersion) {
            return pEntry->Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||||
|
// end of GPU Architecture definitions
|
||||||
|
|
||||||
|
|
||||||
|
// Defines and enum for use with logging functions
|
||||||
|
// *********************************************************************
|
||||||
|
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||||
|
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||||
|
// Bit flags accepted by the logging functions; combine with bitwise OR.
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // bit to signal "log to console"
    LOGFILE    = 1 << 1,                // bit to signal "log to file"
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // convenience union of first 2 bits to signal "log to both"
    APPENDMODE = 1 << 2,                // bit to set "file append" mode instead of "replace mode" on open
    MASTER     = 1 << 3,                // bit to signal master .csv log output
    ERRORMSG   = 1 << 4,                // bit to signal "pre-pend Error"
    CLOSELOG   = 1 << 5                 // bit to close log file, if open, after any requested file write
};
|
||||||
|
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||||
|
|
||||||
|
// Standardized boolean
|
||||||
|
// Boolean result type used by the shr* helper functions.
enum shrBOOL { shrFALSE = 0, shrTRUE = 1 };
|
||||||
|
|
||||||
|
// Standardized MAX, MIN and CLAMP
|
||||||
|
// Every argument and every expansion is parenthesized so that arguments
// containing lower-precedence operators (e.g. ?:, |, &) are compared and
// returned as a unit. Arguments may still be evaluated more than once, so
// avoid passing expressions with side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) (MIN(MAX((a), (b)), (c)))       // double sided clip of input a
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))       // single top side clip of input a
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... extended version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... short version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||||
|
|
||||||
|
// Simple argument checker macro
|
||||||
|
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
|
||||||
|
|
||||||
|
// Define for user-customized error handling
|
||||||
|
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||||
|
|
||||||
|
// Function to deallocate memory allocated within shrUtils
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFree(void* ptr);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Helper function to log standardized information to Console, to File or to both
|
||||||
|
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||||
|
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||||
|
//!
|
||||||
|
//! Automatically opens file and stores handle if needed and not done yet
|
||||||
|
//! Closes file and nulls handle on request
|
||||||
|
//!
|
||||||
|
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||||
|
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||||
|
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||||
|
//! @param 2 dValue:
|
||||||
|
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||||
|
//! Negative val is an error code and this gives error preformatting.
|
||||||
|
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||||
|
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||||
|
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||||
|
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||||
|
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
|
||||||
|
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLog(const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||||
|
// Maintains state for 3 independent counters
|
||||||
|
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||||
|
//!
|
||||||
|
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||||
|
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" double shrDeltaT(int iCounterID);
|
||||||
|
|
||||||
|
// Optional LogFileNameOverride function
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||||
|
|
||||||
|
// Helper function to init data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
// Helper function to print data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Find the path for a filename
|
||||||
|
//! @return the path if succeeded, otherwise 0
|
||||||
|
//! @param filename name of the file
|
||||||
|
//! @param executablePath optional absolute path of the executable
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing single precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing double precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing single precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||||
|
const float epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing double precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||||
|
const double epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PPM image file (with unsigned char as data element type), padding
|
||||||
|
//! 4th component
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param OutData handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//!
|
||||||
|
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||||
|
unsigned int *w, unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||||
|
//! 4 bytes)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||||
|
unsigned int *w,unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Command line arguments: General notes
|
||||||
|
// * All command line arguments begin with '--' followed by the token;
|
||||||
|
// token and value are separated by '='; example --samples=50
|
||||||
|
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||||
|
// (without whitespaces)
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Check if command line argument \a flag-name is given
|
||||||
|
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||||
|
//! otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param flag_name name of command line flag
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||||
|
const char* flag_name);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||||
|
const char* arg_name, int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type unsigned int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||||
|
const char* arg_name, unsigned int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type float
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||||
|
const char* arg_name, float* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type string
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument list whose elements are strings
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val command line argument list
|
||||||
|
//! @param len length of the list / number of elements
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val,
|
||||||
|
unsigned int* len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||||
|
const unsigned int len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integer arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned char arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integers with a tolerance for # of byte errors
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integer arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||||
|
//! threshold for # pixel errors
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||||
|
//! equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param ref_file filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! $param verboseErrors output details of image mismatch to std::err
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param ref_file filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! $param verboseErrors output details of image mismatch to std::err
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||||
|
|
||||||
|
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||||
|
{
|
||||||
|
if (iReference != iSample)
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||||
|
if (pCleanup != NULL)
|
||||||
|
{
|
||||||
|
pCleanup(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Standardized Exit
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||||
|
{
|
||||||
|
#ifdef WIN32
|
||||||
|
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#else
|
||||||
|
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||||
|
}
|
||||||
|
fflush(stderr);
|
||||||
|
exit(iExitCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
66
benchmarks/opencl/VectorHypot/Makefile
Normal file
66
benchmarks/opencl/VectorHypot/Makefile
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
|
||||||
|
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
|
||||||
|
POCL_INC_PATH = $(wildcard ../include)
|
||||||
|
POCL_LIB_PATH = $(wildcard ../lib)
|
||||||
|
VX_RT_PATH = $(wildcard ../../../runtime)
|
||||||
|
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
|
||||||
|
|
||||||
|
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
|
||||||
|
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
|
||||||
|
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
|
||||||
|
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||||
|
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
|
||||||
|
|
||||||
|
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
|
||||||
|
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
|
||||||
|
|
||||||
|
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
|
||||||
|
|
||||||
|
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||||
|
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||||
|
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||||
|
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||||
|
CXXFLAGS += -I$(POCL_INC_PATH) -I.
|
||||||
|
|
||||||
|
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||||
|
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
|
||||||
|
|
||||||
|
PROJECT=VectorHypot
|
||||||
|
|
||||||
|
all: $(PROJECT).dump $(PROJECT).hex
|
||||||
|
|
||||||
|
# Compile the OpenCL kernel source into a static library with poclcc.
# The recipe builds from the rule's own prerequisite ($<) and writes the rule's
# target ($@), so the file that triggers a rebuild is the file that is compiled.
lib$(PROJECT).a: VectorHypot.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<
|
||||||
|
|
||||||
|
$(PROJECT).elf: main.cc lib$(PROJECT).a
|
||||||
|
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
|
||||||
|
|
||||||
|
$(PROJECT).qemu: main.cc lib$(PROJECT).a
|
||||||
|
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||||
|
|
||||||
|
$(PROJECT).hex: $(PROJECT).elf
|
||||||
|
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||||
|
|
||||||
|
$(PROJECT).dump: $(PROJECT).elf
|
||||||
|
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||||
|
|
||||||
|
run: $(PROJECT).hex
|
||||||
|
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
|
||||||
|
|
||||||
|
qemu: $(PROJECT).qemu
|
||||||
|
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||||
|
|
||||||
|
gdb-s: $(PROJECT).qemu
|
||||||
|
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||||
|
|
||||||
|
gdb-c: $(PROJECT).qemu
|
||||||
|
$(GDB) $(PROJECT).qemu
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf *.elf *.dump *.hex
|
||||||
41
benchmarks/opencl/VectorHypot/VectorHypot.cl
Normal file
41
benchmarks/opencl/VectorHypot/VectorHypot.cl
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Naive OpenCL kernel computing element-wise hypotenuses.
// Each work item reads one float4 from fg4A and one from fg4B, computes the
// four hypotenuses of the corresponding components, and stores the float4
// result in fg4Hypot at the same index.
//   fg4A, fg4B       input 'leg' vectors (global memory)
//   fg4Hypot         output vector (global memory)
//   uiOffset         added to the work item's global id to form the index
//   iInnerLoopCount  number of times the computation is repeated; values <= 0
//                    leave the stored result at zero
//   uiNumElements    upper bound (exclusive) for the index
// NOTE(review): the index addresses float4 elements; confirm the host passes
// uiNumElements in the same units.
__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
{
    // Index of this work item's float4 within the global arrays
    const size_t szIndex = get_global_id(0) + uiOffset;

    // Only in-range work items touch memory
    if (szIndex < uiNumElements)
    {
        // 4 elements are processed per work item: fetch both legs from GMEM
        const float4 f4LegA = fg4A[szIndex];
        const float4 f4LegB = fg4B[szIndex];
        float4 f4Result = (float4)0.0f;

        // The same result is recomputed on every pass; the repetition only
        // lengthens the kernel's run time
        for (int iPass = iInnerLoopCount; iPass > 0; --iPass)
        {
            f4Result = (float4)(hypot(f4LegA.x, f4LegB.x),
                                hypot(f4LegA.y, f4LegB.y),
                                hypot(f4LegA.z, f4LegB.z),
                                hypot(f4LegA.w, f4LegB.w));
        }

        // Store the 4 results back to GMEM
        fg4Hypot[szIndex] = f4Result;
    }
}
|
||||||
686
benchmarks/opencl/VectorHypot/main.cc
Normal file
686
benchmarks/opencl/VectorHypot/main.cc
Normal file
@@ -0,0 +1,686 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// oclCopyComputeOverlap Notes:
|
||||||
|
//
|
||||||
|
// OpenCL API demo application for NVIDIA CUDA GPU's that implements a
|
||||||
|
// element by element vector hypotenuse computation using 2 input float arrays
|
||||||
|
// and 1 output float array.
|
||||||
|
//
|
||||||
|
// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
|
||||||
|
// with respect to GPU computation (and with respect to host thread).
|
||||||
|
//
|
||||||
|
// Because the overlap achievable for this computation and data set on a given system depends upon the GPU being used and the
|
||||||
|
// GPU/Host bandwidth, the sample adjusts the computation duration to test the most ideal case and test against a consistent standard.
|
||||||
|
// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
|
||||||
|
//
|
||||||
|
// After setup, warmup and calibration to the system, the sample runs 4 scenarios:
|
||||||
|
// A) Computations with 2 command queues on GPU
|
||||||
|
// A multiple-cycle sequence is executed, timed and compared against the host
|
||||||
|
// B) Computations with 1 command queue on GPU
|
||||||
|
// A multiple-cycle sequence is executed, timed and compared against the host
|
||||||
|
//
|
||||||
|
// The 2-command queue approach ought to be substantially faster
|
||||||
|
//
|
||||||
|
// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently
|
||||||
|
// increases compute time without increasing data size (via a loop inside the kernel)
|
||||||
|
//
|
||||||
|
// At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
|
||||||
|
// (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
|
||||||
|
//
|
||||||
|
// If we name the time to copy single input vector H2D (or output vector D2H) as "T", then the optimum comparison case is:
|
||||||
|
//
|
||||||
|
// Single Queue with all the data and all the work
|
||||||
|
// Ttot (serial) = 4T + 4T + 2T = 10T
|
||||||
|
//
|
||||||
|
// Dual Queue, where each queue has 1/2 the data and 1/2 the work
|
||||||
|
// Tq0 (overlap) = 2T + 2T + T ....
|
||||||
|
// Tq1 (overlap) = .... 2T + 2T + T
|
||||||
|
//
|
||||||
|
// Ttot (elapsed, wall) = 2T + 2T + 2T + T = 7T
|
||||||
|
//
|
||||||
|
// Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 % (Tesla arch 1.2 or 1.3, single copy engine)
|
||||||
|
//
|
||||||
|
// For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
|
||||||
|
// This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// common SDK header for standard utilities and system libs
|
||||||
|
#include <oclUtils.h>
|
||||||
|
#include <shrQATest.h>
|
||||||
|
|
||||||
|
// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
|
||||||
|
// values greater than 0.0f represent a speed-up relative to non-overlapped
|
||||||
|
#define EXPECTED_OVERLAP 30.0f
|
||||||
|
#define EXPECTED_OVERLAP_FERMI 45.0f
|
||||||
|
#define PASS_FACTOR 0.60f
|
||||||
|
#define RETRIES_ON_FAILURE 1
|
||||||
|
|
||||||
|
// Base sizes for parameters manipulated dynamically or on the command line
|
||||||
|
#define BASE_WORK_ITEMS 64
|
||||||
|
#define BASE_ARRAY_LENGTH 40000
|
||||||
|
#define BASE_LOOP_COUNT 32
|
||||||
|
|
||||||
|
// Vars
|
||||||
|
// *********************************************************************
|
||||||
|
cl_platform_id cpPlatform; // OpenCL platform
|
||||||
|
cl_context cxGPUContext; // OpenCL context
|
||||||
|
cl_command_queue cqCommandQueue[2]; // OpenCL command queues
|
||||||
|
cl_device_id* cdDevices; // OpenCL device list
|
||||||
|
cl_program cpProgram; // OpenCL program
|
||||||
|
cl_kernel ckKernel[2]; // OpenCL kernel, 1 per queue
|
||||||
|
cl_mem cmPinnedSrcA; // OpenCL pinned host source buffer A
|
||||||
|
cl_mem cmPinnedSrcB; // OpenCL pinned host source buffer B
|
||||||
|
cl_mem cmPinnedResult; // OpenCL pinned host result buffer
|
||||||
|
float* fSourceA = NULL; // Mapped pointer for pinned Host source A buffer
|
||||||
|
float* fSourceB = NULL; // Mapped pointer for pinned Host source B buffer
|
||||||
|
float* fResult = NULL; // Mapped pointer for pinned Host result buffer
|
||||||
|
cl_mem cmDevSrcA; // OpenCL device source buffer A
|
||||||
|
cl_mem cmDevSrcB; // OpenCL device source buffer B
|
||||||
|
cl_mem cmDevResult; // OpenCL device result buffer
|
||||||
|
size_t szBuffBytes; // Size of main buffers
|
||||||
|
size_t szGlobalWorkSize; // 1D var for Total # of work items in the launched ND range
|
||||||
|
size_t szLocalWorkSize = BASE_WORK_ITEMS; // initial # of work items in the work group
|
||||||
|
cl_int ciErrNum; // Error code var
|
||||||
|
char* cPathAndName = NULL; // Var for full paths to data, src, etc.
|
||||||
|
char* cSourceCL = NULL; // Buffer to hold source for compilation
|
||||||
|
const char* cExecutableName = NULL;
|
||||||
|
|
||||||
|
// demo config vars
|
||||||
|
const char* cSourceFile = "VectorHypot.cl"; // OpenCL computation kernel source code
|
||||||
|
float* Golden = NULL; // temp buffer to hold golden results for cross check
|
||||||
|
bool bNoPrompt = false; // Command line switch to skip exit prompt
|
||||||
|
bool bQATest = false; // Command line switch to test
|
||||||
|
|
||||||
|
// Forward Declarations
|
||||||
|
// *********************************************************************
|
||||||
|
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
|
||||||
|
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
|
||||||
|
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles);
|
||||||
|
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
|
||||||
|
void Cleanup (int iExitCode);
|
||||||
|
void (*pCleanup)(int) = &Cleanup;
|
||||||
|
|
||||||
|
int *gp_argc = 0;
|
||||||
|
const char *** gp_argv = NULL;
|
||||||
|
|
||||||
|
// Main function
|
||||||
|
// *********************************************************************
|
||||||
|
int main(int argc, const char **argv)
|
||||||
|
{
|
||||||
|
//Locals
|
||||||
|
size_t szKernelLength; // Byte size of kernel code
|
||||||
|
double dBuildTime; // Compile time
|
||||||
|
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||||
|
cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
|
||||||
|
cl_uint uiNumDevices; // Number of devices available
|
||||||
|
int iDevCap = -1; // Capability of device
|
||||||
|
int iInnerLoopCount = BASE_LOOP_COUNT; // Varies "compute intensity" per data within the kernel
|
||||||
|
const int iTestCycles = 10; // How many times to run the external test loop
|
||||||
|
const int iWarmupCycles = 8; // How many times to run the warmup sequence
|
||||||
|
cl_uint uiWorkGroupMultiple = 4; // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
|
||||||
|
cl_uint uiNumElements = BASE_ARRAY_LENGTH; // initial # of elements per array to process (note: procesing 4 per work item)
|
||||||
|
cl_uint uiSizeMultiple = 4; // Command line var (using "sizemult=<n>") to optionally increase vector sizes
|
||||||
|
bool bPassFlag = false; // Var to accumulate test pass/fail
|
||||||
|
shrBOOL bMatch = shrFALSE; // Cross check result
|
||||||
|
shrBOOL bTestOverlap = shrFALSE;
|
||||||
|
double dAvgGPUTime[2] = {0.0, 0.0}; // Average time of iTestCycles calls for 2-Queue and 1-Queue test
|
||||||
|
double dHostTime[2] = {0.0, 0.0}; // Host computation time (2nd test is redundant but a good stability indicator)
|
||||||
|
float fMinPassCriteria[2] = {0.0f, 0.0f}; // Test pass cireria, adjusted dependant on GPU arch
|
||||||
|
|
||||||
|
gp_argc = &argc;
|
||||||
|
gp_argv = &argv;
|
||||||
|
|
||||||
|
shrQAStart(argc, (char **)argv);
|
||||||
|
|
||||||
|
// start logs
|
||||||
|
cExecutableName = argv[0];
|
||||||
|
shrSetLogFileName ("oclCopyComputeOverlap.txt");
|
||||||
|
shrLog("%s Starting...\n\n", argv[0]);
|
||||||
|
|
||||||
|
// get basic command line args
|
||||||
|
bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
|
||||||
|
bQATest = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
|
||||||
|
shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
|
||||||
|
|
||||||
|
// Optional Command-line multiplier for vector size
|
||||||
|
// Default val of 4 gives 10.24 million float elements per vector
|
||||||
|
// Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
|
||||||
|
shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
|
||||||
|
uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);
|
||||||
|
uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
|
||||||
|
shrLog("Array sizes = %u float elements\n", uiNumElements);
|
||||||
|
|
||||||
|
// Optional Command-line multiplier for workgroup size (x 64 work items)
|
||||||
|
// Default val of 4 gives szLocalWorkSize of 256.
|
||||||
|
// Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
|
||||||
|
shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
|
||||||
|
uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10);
|
||||||
|
szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
|
||||||
|
shrLog("Workgroup Size = %u\n\n", szLocalWorkSize);
|
||||||
|
|
||||||
|
// Get the NVIDIA platform if available, otherwise use default
|
||||||
|
shrLog("Get the Platform ID...\n\n");
|
||||||
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Get OpenCL platform name and version
|
||||||
|
char cBuffer[256];
|
||||||
|
ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
shrLog("Platform Name = %s\n\n", cBuffer);
|
||||||
|
|
||||||
|
// Get all the devices
|
||||||
|
shrLog("Get the Device info and select Device...\n");
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
|
||||||
|
|
||||||
|
// Ethans changes
|
||||||
|
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||||
|
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
|
||||||
|
//ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Set target device and check capabilities
|
||||||
|
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||||
|
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||||
|
shrLog(" Using Device %u, ", uiTargetDevice);
|
||||||
|
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||||
|
iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
|
||||||
|
if (iDevCap > 0) {
|
||||||
|
shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
|
||||||
|
} else {
|
||||||
|
shrLog("\n\n", iDevCap);
|
||||||
|
}
|
||||||
|
if (strstr(cBuffer, "NVIDIA") != NULL)
|
||||||
|
{
|
||||||
|
if (iDevCap < 12)
|
||||||
|
{
|
||||||
|
shrLog("Device doesn't have overlap capability. Skipping test...\n");
|
||||||
|
Cleanup (EXIT_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Device and Platform eligible for overlap testing
|
||||||
|
bTestOverlap = shrTRUE;
|
||||||
|
|
||||||
|
// If device has overlap capability, proceed
|
||||||
|
fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP; // 1st cycle overlap is same for 1 or 2 copy engines
|
||||||
|
if (iDevCap != 20)
|
||||||
|
{
|
||||||
|
// Single copy engine
|
||||||
|
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // avg of many cycles
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
char cDevName[1024];
|
||||||
|
clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
|
||||||
|
if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
|
||||||
|
{
|
||||||
|
// Tesla or Quadro (arch = 2.0) ... Dual copy engine
|
||||||
|
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI; // average of many cycles
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Geforce ... Single copy engine
|
||||||
|
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // average of many cycles
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the context
|
||||||
|
shrLog("clCreateContext...\n");
|
||||||
|
cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Create 2 command-queues
|
||||||
|
cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
shrLog("clCreateCommandQueue [0]...\n");
|
||||||
|
cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
shrLog("clCreateCommandQueue [1]...\n");
|
||||||
|
|
||||||
|
// Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
|
||||||
|
szBuffBytes = sizeof(cl_float) * uiNumElements;
|
||||||
|
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements);
|
||||||
|
|
||||||
|
// Allocate pinned source and result host buffers:
|
||||||
|
// Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
|
||||||
|
cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements);
|
||||||
|
|
||||||
|
// Get mapped pointers to pinned input host buffers
|
||||||
|
// Note: This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
|
||||||
|
fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||||
|
//oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n");
|
||||||
|
|
||||||
|
// Alloc temp golden buffer for cross checks
|
||||||
|
Golden = (float*)malloc(szBuffBytes);
|
||||||
|
//oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
|
||||||
|
|
||||||
|
// Read the OpenCL kernel in from source file
|
||||||
|
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
|
||||||
|
//oclCheckError(cPathAndName != NULL, shrTRUE);
|
||||||
|
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
|
||||||
|
// oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
|
||||||
|
|
||||||
|
// Create the program object
|
||||||
|
//cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
shrLog("clCreateProgramWithSource...\n");
|
||||||
|
cl_program program =
|
||||||
|
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "VectorHypot", NULL);
|
||||||
|
// Build the program for the target device
|
||||||
|
clFinish(cqCommandQueue[0]);
|
||||||
|
shrDeltaT(0);
|
||||||
|
ciErrNum = clBuildProgram(program, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
|
||||||
|
shrLog("clBuildProgram...");
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
// write out standard error, Build Log and PTX, then cleanup and exit
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
|
||||||
|
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
|
||||||
|
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
|
||||||
|
Cleanup(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
dBuildTime = shrDeltaT(0);
|
||||||
|
|
||||||
|
// Ethan - Kernel Addition
|
||||||
|
|
||||||
|
if (program == NULL) {
|
||||||
|
std::cerr << "Failed to write program binary" << std::endl;
|
||||||
|
Cleanup(context, queue, program, kernel, memObjects);
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
std::cout << "Read program from binary." << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the kernel
|
||||||
|
ckKernel[0] = clCreateKernel(program, "VectorHypot", &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
ckKernel[1] = clCreateKernel(program, "VectorHypot", &ciErrNum);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
shrLog("clCreateKernel (ckKernel[2])...\n");
|
||||||
|
|
||||||
|
// Offsets for 2 queues
|
||||||
|
cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
|
||||||
|
|
||||||
|
// Set the Argument values for the 1st kernel instance (queue 0)
|
||||||
|
ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
|
||||||
|
shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n");
|
||||||
|
|
||||||
|
// Set the Argument values for the 2d kernel instance (queue 1)
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
|
||||||
|
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n");
|
||||||
|
|
||||||
|
//*******************************************
|
||||||
|
// Warmup the driver with dual queue sequence
|
||||||
|
//*******************************************
|
||||||
|
|
||||||
|
// Warmup with dual queue sequence for iTestCycles
|
||||||
|
shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
|
||||||
|
DualQueueSequence(iWarmupCycles, uiNumElements, false);
|
||||||
|
|
||||||
|
// Use single queue config to adjust compute intensity
|
||||||
|
shrLog("Adjust compute for GPU / system...\n");
|
||||||
|
iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles);
|
||||||
|
shrLog(" Kernel inner loop count = %d\n", iInnerLoopCount);
|
||||||
|
|
||||||
|
//*******************************************
|
||||||
|
// Run and time with 2 command-queues
|
||||||
|
//*******************************************
|
||||||
|
for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
|
||||||
|
|
||||||
|
// Run the sequence iTestCycles times
|
||||||
|
dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
|
||||||
|
|
||||||
|
// Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
|
||||||
|
shrLog(" Device vs Host Result Comparison\t: ");
|
||||||
|
VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
|
||||||
|
shrDeltaT(0);
|
||||||
|
for (int i = 0; i < iTestCycles; i++)
|
||||||
|
{
|
||||||
|
VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
|
||||||
|
}
|
||||||
|
dHostTime[0] = shrDeltaT(0)/iTestCycles;
|
||||||
|
|
||||||
|
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
|
||||||
|
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
|
||||||
|
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
|
||||||
|
bPassFlag = (bMatch == shrTRUE);
|
||||||
|
|
||||||
|
//*******************************************
|
||||||
|
// Run and time with 1 command queue
|
||||||
|
//*******************************************
|
||||||
|
// Run the sequence iTestCycles times
|
||||||
|
dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
|
||||||
|
|
||||||
|
// Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
|
||||||
|
shrLog(" Device vs Host Result Comparison\t: ");
|
||||||
|
shrDeltaT(0);
|
||||||
|
for (int i = 0; i < iTestCycles; i++)
|
||||||
|
{
|
||||||
|
VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);
|
||||||
|
}
|
||||||
|
dHostTime[1] = shrDeltaT(0)/iTestCycles;
|
||||||
|
|
||||||
|
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
|
||||||
|
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
|
||||||
|
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
|
||||||
|
bPassFlag &= (bMatch == shrTRUE);
|
||||||
|
|
||||||
|
//*******************************************
|
||||||
|
|
||||||
|
// Compare Single and Dual queue timing
|
||||||
|
shrLog("\nResult Summary:\n");
|
||||||
|
|
||||||
|
// Log GPU and CPU Time for 2-queue scenario
|
||||||
|
shrLog(" Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
|
||||||
|
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
|
||||||
|
|
||||||
|
// Log GPU and CPU Time for 1-queue scenario
|
||||||
|
shrLog(" Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
|
||||||
|
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
|
||||||
|
|
||||||
|
// Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
|
||||||
|
double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
|
||||||
|
|
||||||
|
if( bTestOverlap ) {
|
||||||
|
bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
|
||||||
|
if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
|
||||||
|
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
|
||||||
|
|
||||||
|
// Log info to master log in standard format
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize);
|
||||||
|
|
||||||
|
bPassFlag &= bAvgOverlapOK;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//*******************************************
|
||||||
|
// Report pass/fail, cleanup and exit
|
||||||
|
Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run 1 queue sequence for n cycles
|
||||||
|
// *********************************************************************
|
||||||
|
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
|
||||||
|
{
|
||||||
|
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
|
||||||
|
shrFillArray(fSourceA, (int)uiNumElements);
|
||||||
|
shrFillArray(fSourceB, (int)uiNumElements);
|
||||||
|
|
||||||
|
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
|
||||||
|
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
|
||||||
|
|
||||||
|
// *** Make sure queues are empty and then start timer
|
||||||
|
double dAvgTime = 0.0;
|
||||||
|
clFinish(cqCommandQueue[0]);
|
||||||
|
clFinish(cqCommandQueue[1]);
|
||||||
|
shrDeltaT(0);
|
||||||
|
|
||||||
|
// Run the sequence iCycles times
|
||||||
|
for (int i = 0; i < iCycles; i++)
|
||||||
|
{
|
||||||
|
// Nonblocking Write of all of input data from host to device in command-queue 0
|
||||||
|
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||||
|
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||||
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// Launch kernel computation, command-queue 0
|
||||||
|
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Non Blocking Read of output data from device to host, command-queue 0
|
||||||
|
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
|
||||||
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
|
||||||
|
clFlush(cqCommandQueue[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// *** Assure sync to host and return average sequence time
|
||||||
|
clFinish(cqCommandQueue[0]);
|
||||||
|
dAvgTime = shrDeltaT(0)/(double)iCycles;
|
||||||
|
|
||||||
|
// Log config if asked for
|
||||||
|
if (bShowConfig)
|
||||||
|
{
|
||||||
|
shrLog("\n1-Queue sequence Configuration:\n");
|
||||||
|
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 1\n",
|
||||||
|
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
|
||||||
|
}
|
||||||
|
return dAvgTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run 2 queue sequence for n cycles
|
||||||
|
// *********************************************************************
|
||||||
|
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
|
||||||
|
{
|
||||||
|
// Locals
|
||||||
|
size_t szHalfBuffer = szBuffBytes / 2;
|
||||||
|
size_t szHalfOffset = szHalfBuffer / sizeof(float);
|
||||||
|
double dAvgTime = 0.0;
|
||||||
|
|
||||||
|
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
|
||||||
|
shrFillArray(fSourceA, (int)uiNumElements);
|
||||||
|
shrFillArray(fSourceB, (int)uiNumElements);
|
||||||
|
|
||||||
|
// Set Global work size for 2 command-queues, and log work sizes & dimensions
|
||||||
|
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));
|
||||||
|
|
||||||
|
// Make sure queues are empty and then start timer
|
||||||
|
clFinish(cqCommandQueue[0]);
|
||||||
|
clFinish(cqCommandQueue[1]);
|
||||||
|
shrDeltaT(0);
|
||||||
|
|
||||||
|
for (int i = 0; i < iCycles; i++)
|
||||||
|
{
|
||||||
|
// Mid Phase 0
|
||||||
|
// Nonblocking Write of 1st half of input data from host to device in command-queue 0
|
||||||
|
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||||
|
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||||
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver
|
||||||
|
// (not necessary on Linux, Mac OSX or WinXP)
|
||||||
|
clFlush(cqCommandQueue[0]);
|
||||||
|
clFlush(cqCommandQueue[1]);
|
||||||
|
|
||||||
|
// Start Phase 1 ***********************************
|
||||||
|
|
||||||
|
// Launch kernel computation, command-queue 0
|
||||||
|
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
|
||||||
|
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||||
|
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Nonblocking Write of 2nd half of input data from host to device in command-queue 1
|
||||||
|
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
|
||||||
|
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
|
||||||
|
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
|
||||||
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// Push out the compute for queue 0 and write for queue 1 to the driver
|
||||||
|
// (not necessary on Linux, Mac OSX or WinXP)
|
||||||
|
clFlush(cqCommandQueue[0]);
|
||||||
|
clFlush(cqCommandQueue[1]);
|
||||||
|
|
||||||
|
// Start Phase 2 ***********************************
|
||||||
|
|
||||||
|
// Launch kernel computation, command-queue 1
|
||||||
|
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||||
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
|
||||||
|
// Non Blocking Read of 1st half of output data from device to host, command-queue 0
|
||||||
|
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
|
||||||
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// Push out the compute for queue 1 and the read for queue 0 to the driver
|
||||||
|
// (not necessary on Linux, Mac OSX or WinXP)
|
||||||
|
clFlush(cqCommandQueue[0]);
|
||||||
|
clFlush(cqCommandQueue[1]);
|
||||||
|
|
||||||
|
// Start Phase 0 (Rolls over) ***********************************
|
||||||
|
|
||||||
|
// Non Blocking Read of 2nd half of output data from device to host, command-queue 1
|
||||||
|
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
|
||||||
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
// *** Sync to host and get average sequence time
|
||||||
|
clFinish(cqCommandQueue[0]);
|
||||||
|
clFinish(cqCommandQueue[1]);
|
||||||
|
dAvgTime = shrDeltaT(0)/(double)iCycles;
|
||||||
|
|
||||||
|
// Log config if asked for
|
||||||
|
if (bShowConfig)
|
||||||
|
{
|
||||||
|
shrLog("\n2-Queue sequence Configuration:\n");
|
||||||
|
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 2\n",
|
||||||
|
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
return dAvgTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Function to adjust compute task according to device capability
|
||||||
|
// This allows a consistent overlap % across a wide variety of GPU's for test purposes
|
||||||
|
// It also implitly illustrates the relationship between compute capability and overlap at fixed work size
|
||||||
|
// *********************************************************************
|
||||||
|
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
|
||||||
|
{
|
||||||
|
// Locals
|
||||||
|
double dCopyTime, dComputeTime;
|
||||||
|
int iComputedLoopCount;
|
||||||
|
|
||||||
|
// Change Source Data
|
||||||
|
shrFillArray(fSourceA, (int)uiNumElements);
|
||||||
|
shrFillArray(fSourceB, (int)uiNumElements);
|
||||||
|
|
||||||
|
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
|
||||||
|
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
|
||||||
|
|
||||||
|
// *** Make sure queues are empty and then start timer
|
||||||
|
clFinish(cqCommandQueue[0]);
|
||||||
|
clFinish(cqCommandQueue[1]);
|
||||||
|
shrDeltaT(0);
|
||||||
|
|
||||||
|
// Run the copy iCycles times and measure copy time on this system
|
||||||
|
for (int i = 0; i < iCycles; i++)
|
||||||
|
{
|
||||||
|
// Nonblocking Write of all of input data from host to device in command-queue 0
|
||||||
|
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||||
|
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||||
|
ciErrNum |= clFlush(cqCommandQueue[0]);
|
||||||
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
}
|
||||||
|
clFinish(cqCommandQueue[0]);
|
||||||
|
dCopyTime = shrDeltaT(0);
|
||||||
|
|
||||||
|
// Run the compute iCycles times and measure compute time on this system
|
||||||
|
for (int i = 0; i < iCycles; i++)
|
||||||
|
{
|
||||||
|
// Launch kernel computation, command-queue 0
|
||||||
|
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||||
|
ciErrNum |= clFlush(cqCommandQueue[0]);
|
||||||
|
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
}
|
||||||
|
clFinish(cqCommandQueue[0]);
|
||||||
|
dComputeTime = shrDeltaT(0);
|
||||||
|
|
||||||
|
// Determine number of core loop cycles proportional to copy/compute time ratio
|
||||||
|
dComputeTime = MAX(dComputeTime, 1.0e-6);
|
||||||
|
iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
|
||||||
|
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||||
|
return (iComputedLoopCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup/Exit function
|
||||||
|
// *********************************************************************
|
||||||
|
void Cleanup (int iExitCode)
|
||||||
|
{
|
||||||
|
// Cleanup allocated objects
|
||||||
|
shrLog("Starting Cleanup...\n\n");
|
||||||
|
if(cPathAndName)free(cPathAndName);
|
||||||
|
if(cSourceCL)free(cSourceCL);
|
||||||
|
if(Golden)free(Golden);
|
||||||
|
if(ckKernel[0])clReleaseKernel(ckKernel[0]);
|
||||||
|
if(ckKernel[1])clReleaseKernel(ckKernel[1]);
|
||||||
|
if(program)clReleaseProgram(program);
|
||||||
|
if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
|
||||||
|
if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
|
||||||
|
if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
|
||||||
|
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
|
||||||
|
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
|
||||||
|
if(cmDevResult)clReleaseMemObject(cmDevResult);
|
||||||
|
if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
|
||||||
|
if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
|
||||||
|
if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
|
||||||
|
if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
|
||||||
|
if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
|
||||||
|
if(cxGPUContext)clReleaseContext(cxGPUContext);
|
||||||
|
if(cdDevices)free(cdDevices);
|
||||||
|
|
||||||
|
// Master status Pass/Fail (all tests)
|
||||||
|
shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
|
||||||
|
}
|
||||||
|
|
||||||
|
// "Golden" Host processing vector hypotenuse function for comparison purposes
// *********************************************************************
// For each of the first uiNumElements entries, stores
// sqrtf(a*a + b*b) of the paired inputs into pfResult.
//
//   pfData1, pfData2  input arrays, at least uiNumElements floats each
//   pfResult          output array, at least uiNumElements floats
//   uiNumElements     number of elements to process
//   iInnerLoopCount   accepted for signature compatibility; not used by the host version
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
{
    (void)iInnerLoopCount;

    const float* pfLeft = pfData1;
    const float* pfRight = pfData2;
    float* pfOut = pfResult;

    // Walk the three arrays in lock-step; both inputs are read before the output is written
    for (unsigned int uiRemaining = uiNumElements; uiRemaining > 0; --uiRemaining)
    {
        const float fX = *pfLeft++;
        const float fY = *pfRight++;
        *pfOut++ = sqrtf(fX * fX + fY * fY);
    }
}
|
||||||
198
benchmarks/opencl/VectorHypot/oclUtils.h
Normal file
198
benchmarks/opencl/VectorHypot/oclUtils.h
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef OCL_UTILS_H
|
||||||
|
#define OCL_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// Common headers: Cross-API utilities and OpenCL header
|
||||||
|
#include <shrUtils.h>
|
||||||
|
|
||||||
|
// All OpenCL headers
|
||||||
|
#if defined (__APPLE__) || defined(MACOSX)
|
||||||
|
#include <OpenCL/opencl.h>
|
||||||
|
#else
|
||||||
|
#include <CL/opencl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Includes
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||||
|
// extensions from <CL/cl_ext.h>
|
||||||
|
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||||
|
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||||
|
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||||
|
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||||
|
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||||
|
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||||
|
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// reminders for build output window and log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including shrUtils.h")
|
||||||
|
#pragma message ("Note: including opencl.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// SDK Revision #
|
||||||
|
#define OCL_SDKREVISION "7027912"
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param clSelectedPlatformID OpenCL platform ID
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print info about the device
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and return device capability
|
||||||
|
//!
|
||||||
|
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" int oclGetDevCap(cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print the device name
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the first device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the nth device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id or -1 when out of range
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//! @param device_idx index of the device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of device with maximal FLOPS from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Loads a Program file and prepends the cPreamble to the code.
|
||||||
|
//!
|
||||||
|
//! @return the source string if succeeded, 0 otherwise
|
||||||
|
//! @param cFilename program filename
|
||||||
|
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||||
|
//! @param szFinalLength returned length of the code string
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the binary (PTX) of the program associated with the device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param binary returned code
|
||||||
|
//! @param length length of returned code
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param const char* cPtxFileName optional PTX file name
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||||
|
|
||||||
|
// Helper function for De-allocating cl objects
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL error string from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclErrorString(cl_int error);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||||
|
{
|
||||||
|
// An error condition is defined by the sample/test value not equal to the reference
|
||||||
|
if (iReference != iSample)
|
||||||
|
{
|
||||||
|
// If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
|
||||||
|
iSample = (iSample == 0) ? -9999 : iSample;
|
||||||
|
|
||||||
|
// Log the error info
|
||||||
|
shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
|
||||||
|
|
||||||
|
// Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code.
|
||||||
|
if (pCleanup != NULL)
|
||||||
|
{
|
||||||
|
pCleanup(iSample);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||||
|
exit(iSample);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
238
benchmarks/opencl/VectorHypot/shrQATest.h
Normal file
238
benchmarks/opencl/VectorHypot/shrQATest.h
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_QATEST_H
|
||||||
|
#define SHR_QATEST_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#pragma message ("Note: including time.h")
|
||||||
|
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#include <time.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <time.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRCASECMP _stricmp
|
||||||
|
#else
|
||||||
|
#define STRCASECMP strcasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRNCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRNCASECMP _strnicmp
|
||||||
|
#else
|
||||||
|
#define STRNCASECMP strncasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Standardized QA Start/Finish for CUDA SDK tests
|
||||||
|
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||||
|
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||||
|
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||||
|
|
||||||
|
// Find where the bare executable name starts inside a path string.
//
// Walks backwards from the end of exec_name looking for the last path
// separator ('/' or '\\').
//
// @param exec_name NUL-terminated path (typically argv[0]); must not be NULL
// @return index of the character just after the last separator, or 0 when
//         the string contains no separator
inline int findExeNameStart(const char *exec_name)
{
    for (int idx = (int)strlen(exec_name) - 1; idx >= 0; --idx) {
        const char ch = exec_name[idx];
        if (ch == '/' || ch == '\\') {
            // Name begins right after the separator
            return idx + 1;
        }
    }

    // No separator at all: the whole string is the name
    return 0;
}
|
||||||
|
|
||||||
|
inline int __shrQAStart(int argc, char **argv)
|
||||||
|
{
|
||||||
|
bool bQATest = false;
|
||||||
|
// First clear the output buffer
|
||||||
|
fflush(stdout);
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
char *string_argv = &argv[i][string_start];
|
||||||
|
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We don't want to print the entire path, so we search for the first
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
return exename_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Result codes reported at the end of a QA test run.
// The numeric values match the positions of "FAILED", "PASSED" and "WAIVED"
// in the status-name tables of the __shrQAFinish helpers.
enum eQAstatus {
    QA_FAILED = 0,  // test ran and failed
    QA_PASSED,      // = 1, test ran and passed
    QA_WAIVED       // = 2, test was waived
};
|
||||||
|
|
||||||
|
// Print a countdown on stdout and return once `seconds` seconds of
// wall-clock time have passed.
//
// One "<count>..." token is printed per loop iteration, followed by a
// one-second sleep; the loop ends when time(0) reaches the deadline.
//
// @param seconds length of the delay in seconds; values <= 0 return
//                immediately after printing the header and "done!"
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);
    time_t t;
    int count;
    for (t = time(0) + seconds, count = seconds; time(0) < t; count--) {
        fprintf(stdout, "%d...", count);
        // _WIN32 is the compiler-predefined macro used for every other
        // platform switch in this header; plain WIN32 is only defined by
        // project settings or windows headers.
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout, "done!\n\n");
    fflush(stdout);
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||||
|
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bNoPrompt = true;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bNoPrompt = false;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
} else {
|
||||||
|
if (!bNoPrompt) {
|
||||||
|
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||||
|
fflush(stdout);
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
bool bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish(argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
642
benchmarks/opencl/VectorHypot/shrUtils.h
Normal file
642
benchmarks/opencl/VectorHypot/shrUtils.h
Normal file
@@ -0,0 +1,642 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_UTILS_H
|
||||||
|
#define SHR_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// reminders for output window and build log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Other headers needed for both Windows and Linux
|
||||||
|
#include <math.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// Un-comment the following #define to enable profiling code in SDK apps
|
||||||
|
//#define GPU_PROFILING
|
||||||
|
|
||||||
|
// Beginning of GPU Architecture definitions
|
||||||
|
// Look up the number of cores per multiprocessor for a given SM version.
//
// @param major SM major version
// @param minor SM minor version
// @return cores per SM for a known version; otherwise prints a diagnostic
//         to stdout and returns -1
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One row per known GPU architecture. The SM version is encoded as 0xMm
    // (hexadecimal notation): M = SM major version, m = SM minor version.
    struct SMCoreEntry {
        int sm_version;
        int core_count;
    };

    const SMCoreEntry known_archs[] = {
        { 0x10,   8 },  // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 },  // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 },  // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 },  // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 },  // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 },  // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 }   // Kepler Generation (SM 3.0) GK10x class
    };
    const int arch_count = (int)(sizeof(known_archs) / sizeof(known_archs[0]));

    // Pack the requested version the same way the table keys are packed
    const int wanted = (major << 4) + minor;

    for (int i = 0; i < arch_count; ++i) {
        if (known_archs[i].sm_version == wanted) {
            return known_archs[i].core_count;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||||
|
// end of GPU Architecture definitions
|
||||||
|
|
||||||
|
|
||||||
|
// Defines and enum for use with logging functions
|
||||||
|
// *********************************************************************
|
||||||
|
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||||
|
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||||
|
// Bit flags selecting the destination and behavior of the logging functions.
// Flags may be combined with bitwise OR.
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // bit to signal "log to console"
    LOGFILE    = 1 << 1,                // bit to signal "log to file"
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // convenience union of first 2 bits to signal "log to both"
    APPENDMODE = 1 << 2,                // bit to set "file append" mode instead of "replace mode" on open
    MASTER     = 1 << 3,                // bit to signal master .csv log output
    ERRORMSG   = 1 << 4,                // bit to signal "pre-pend Error"
    CLOSELOG   = 1 << 5                 // bit to close log file, if open, after any requested file write
};
|
||||||
|
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||||
|
|
||||||
|
// Standardized boolean
|
||||||
|
// Two-valued boolean type used as the return type of the shr* helper functions.
enum shrBOOL
{
    shrFALSE = 0,   // failure / condition not met
    shrTRUE         // = 1, success / condition met
};
|
||||||
|
|
||||||
|
// Standardized MAX, MIN and CLAMP
// Every argument is parenthesized so that operands containing lower-precedence
// operators (e.g. MAX(x & 1, y)) expand correctly.
// NOTE: arguments are still evaluated more than once; do not pass
// expressions with side effects (e.g. MAX(i++, j)).
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX((a), (b)), (c))      // double sided clip of input a
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))    // single top side clip of input a
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... extended version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... short version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||||
|
|
||||||
|
// Simple argument checker macro
|
||||||
|
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
|
||||||
|
|
||||||
|
// Define for user-customized error handling
|
||||||
|
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||||
|
|
||||||
|
// Function to deallocate memory allocated within shrUtils
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFree(void* ptr);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Helper function to log standardized information to Console, to File or to both
|
||||||
|
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||||
|
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||||
|
//!
|
||||||
|
//! Automatically opens file and stores handle if needed and not done yet
|
||||||
|
//! Closes file and nulls handle on request
|
||||||
|
//!
|
||||||
|
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||||
|
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||||
|
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||||
|
//! @param 2 dValue:
|
||||||
|
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||||
|
//! Negative val is an error code and this give error preformatting.
|
||||||
|
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||||
|
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||||
|
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||||
|
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||||
|
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
|
||||||
|
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, cFormatString, ...)
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLog(const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||||
|
// Maintains state for 3 independent counters
|
||||||
|
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||||
|
//!
|
||||||
|
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||||
|
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" double shrDeltaT(int iCounterID);
|
||||||
|
|
||||||
|
// Optional LogFileNameOverride function
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||||
|
|
||||||
|
// Helper function to init data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
// Helper function to print data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Find the path for a filename
|
||||||
|
//! @return the path if succeeded, otherwise 0
|
||||||
|
//! @param filename name of the file
|
||||||
|
//! @param executablePath optional absolute path of the executable
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing single precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing double precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing single precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||||
|
const float epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing double precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||||
|
const double epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PPM image file (with unsigned char as data element type), padding
|
||||||
|
//! 4th component
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param OutData handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//!
|
||||||
|
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||||
|
unsigned int *w, unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||||
|
//! 4 bytes)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||||
|
unsigned int *w,unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Command line arguments: General notes
|
||||||
|
// * All command line arguments begin with '--' followed by the token;
|
||||||
|
// token and value are separated by '='; example --samples=50
|
||||||
|
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||||
|
// (without whitespaces)
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Check if command line argument \a flag_name is given
|
||||||
|
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||||
|
//! otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param flag_name name of command line flag
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||||
|
const char* flag_name);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||||
|
const char* arg_name, int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type unsigned int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||||
|
const char* arg_name, unsigned int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type float
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||||
|
const char* arg_name, float* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type string
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument list whose elements are strings
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val command line argument list
|
||||||
|
//! @param len length of the list / number of elements
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val,
|
||||||
|
unsigned int* len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||||
|
const unsigned int len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integer arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned char arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integers with a tolerance for # of byte errors
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integer arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||||
|
//! threshold for # pixel errors
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||||
|
//! equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param ref_file filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! $param verboseErrors output details of image mismatch to std::err
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param ref_file filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! $param verboseErrors output details of image mismatch to std::err
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||||
|
|
||||||
|
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||||
|
{
|
||||||
|
if (iReference != iSample)
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||||
|
if (pCleanup != NULL)
|
||||||
|
{
|
||||||
|
pCleanup(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Standardized Exit
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||||
|
{
|
||||||
|
#ifdef WIN32
|
||||||
|
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#else
|
||||||
|
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||||
|
}
|
||||||
|
fflush(stderr);
|
||||||
|
exit(iExitCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
66
benchmarks/opencl/reduce0/Makefile
Normal file
66
benchmarks/opencl/reduce0/Makefile
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
# Build rules for the reduce0 OpenCL benchmark.
# The kernel is compiled with poclcc into lib$(PROJECT).a and the host program
# is linked against it, either with the Vortex runtime sources (.elf/.hex) or
# as a stand-alone binary for qemu (.qemu).

# Tool and library locations (each expands to empty if the path does not exist).
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH    = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH   = $(wildcard ../include)
POCL_LIB_PATH   = $(wildcard ../lib)
VX_RT_PATH      = $(wildcard ../../../runtime)
VX_SIMX_PATH    = $(wildcard ../../../simX/obj_dir)

CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb

# Vortex runtime sources compiled into the .elf image.
VX_SRCS  = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld

CXXFLAGS  = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

VX_LIBS   = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a

PROJECT=reduce0

# These targets name actions, not files.
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# Compile the kernel named as the prerequisite ($<) into the library ($@), so
# the file that triggers a rebuild is the file that actually gets compiled.
lib$(PROJECT).a: oclReduction_kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf

$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $(PROJECT).elf > $(PROJECT).dump

run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

# gdb-s starts qemu waiting for a debugger on port 1234; gdb-c launches gdb on the same binary.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.elf *.dump *.hex
|
||||||
638
benchmarks/opencl/reduce0/main.cc
Normal file
638
benchmarks/opencl/reduce0/main.cc
Normal file
@@ -0,0 +1,638 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
Parallel reduction
|
||||||
|
|
||||||
|
This sample shows how to perform a reduction operation on an array of values
|
||||||
|
to produce a single value.
|
||||||
|
|
||||||
|
Reductions are a very common computation in parallel algorithms. Any time
|
||||||
|
an array of values needs to be reduced to a single value using a binary
|
||||||
|
associative operator, a reduction can be used. Example applications include
|
||||||
|
statistics computations such as mean and standard deviation, and image
|
||||||
|
processing applications such as finding the total luminance of an
|
||||||
|
image.
|
||||||
|
|
||||||
|
This code performs sum reductions, but any associative operator such as
|
||||||
|
min() or max() could also be used.
|
||||||
|
|
||||||
|
It assumes the input size is a power of 2.
|
||||||
|
|
||||||
|
COMMAND LINE ARGUMENTS
|
||||||
|
|
||||||
|
"--shmoo": Test performance for 1 to 32M elements with each of the 7 different kernels
|
||||||
|
"--n=<N>": Specify the number of elements to reduce (default 1048576)
|
||||||
|
"--threads=<N>": Specify the number of threads per block (default 128)
|
||||||
|
"--kernel=<N>": Specify which kernel to run (0-6, default 6)
|
||||||
|
"--maxblocks=<N>": Specify the maximum number of thread blocks to launch (kernel 6 only, default 64)
|
||||||
|
"--cpufinal": Read back the per-block results and do final sum of block sums on CPU (default false)
|
||||||
|
"--cputhresh=<N>": The threshold of number of blocks sums below which to perform a CPU final reduction (default 1)
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Common system and utility includes
|
||||||
|
#include <oclUtils.h>
|
||||||
|
#include <shrQATest.h>
|
||||||
|
|
||||||
|
// additional includes
|
||||||
|
#include <sstream>
|
||||||
|
#include <oclReduction.h>
|
||||||
|
|
||||||
|
// Forward declarations and sample-specific defines
|
||||||
|
// *********************************************************************
|
||||||
|
// Element type of the array being reduced.
// The numeric values are the ones the compiler assigns by default (0, 1, 2);
// they are spelled out here only for readability.
enum ReduceType
{
    REDUCE_INT    = 0,  // int elements
    REDUCE_FLOAT  = 1,  // float elements
    REDUCE_DOUBLE = 2   // double elements
};
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
bool runTest( int argc, const char** argv, ReduceType datatype);
|
||||||
|
|
||||||
|
#define MAX_BLOCK_DIM_SIZE 65535
|
||||||
|
|
||||||
|
// Returns true when x is an exact power of two (1, 2, 4, 8, ...).
// A power of two has exactly one bit set, so clearing its lowest set bit with
// x & (x - 1) leaves zero. Zero has no bit set and would also pass that test,
// so it is rejected explicitly.
extern "C"
bool isPow2(unsigned int x)
{
    return (x != 0) && ((x & (x - 1)) == 0);
}
|
||||||
|
|
||||||
|
cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2);
|
||||||
|
|
||||||
|
// Main function
|
||||||
|
// *********************************************************************
|
||||||
|
int main( int argc, const char** argv)
|
||||||
|
{
|
||||||
|
shrQAStart(argc, (char **)argv);
|
||||||
|
|
||||||
|
// start logs
|
||||||
|
shrSetLogFileName ("oclReduction.txt");
|
||||||
|
shrLog("%s Starting...\n\n", argv[0]);
|
||||||
|
|
||||||
|
char *typeChoice;
|
||||||
|
shrGetCmdLineArgumentstr(argc, argv, "type", &typeChoice);
|
||||||
|
|
||||||
|
// determine type of array from command line args
|
||||||
|
if (0 == typeChoice)
|
||||||
|
{
|
||||||
|
typeChoice = (char*)malloc(7 * sizeof(char));
|
||||||
|
#ifdef WIN32
|
||||||
|
strcpy_s(typeChoice, 7 * sizeof(char) + 1, "int");
|
||||||
|
#else
|
||||||
|
strcpy(typeChoice, "int");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
ReduceType datatype = REDUCE_INT;
|
||||||
|
|
||||||
|
#ifdef WIN32
|
||||||
|
if (!_strcmpi(typeChoice, "float"))
|
||||||
|
datatype = REDUCE_FLOAT;
|
||||||
|
else if (!_strcmpi(typeChoice, "double"))
|
||||||
|
datatype = REDUCE_DOUBLE;
|
||||||
|
else
|
||||||
|
datatype = REDUCE_INT;
|
||||||
|
#else
|
||||||
|
if (!strcmp(typeChoice, "float"))
|
||||||
|
datatype = REDUCE_FLOAT;
|
||||||
|
else if (!strcmp(typeChoice, "double"))
|
||||||
|
datatype = REDUCE_DOUBLE;
|
||||||
|
else
|
||||||
|
datatype = REDUCE_INT;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
shrLog("Reducing array of type %s.\n", typeChoice);
|
||||||
|
|
||||||
|
//Get the NVIDIA platform
|
||||||
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
//Get the devices
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
//Create the context
|
||||||
|
cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// get and log the device info
|
||||||
|
if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
|
||||||
|
int device_nr = 0;
|
||||||
|
shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &device_nr);
|
||||||
|
if( device_nr < uiNumDevices ) {
|
||||||
|
device = oclGetDev(cxGPUContext, device_nr);
|
||||||
|
} else {
|
||||||
|
shrLog("Invalid Device %d Requested.\n", device_nr);
|
||||||
|
shrExitEX(argc, argv, EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
device = oclGetMaxFlopsDev(cxGPUContext);
|
||||||
|
}
|
||||||
|
oclPrintDevName(LOGBOTH, device);
|
||||||
|
shrLog("\n");
|
||||||
|
|
||||||
|
// create a command-queue
|
||||||
|
cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
source_path = shrFindFilePath("oclReduction_kernel.cl", argv[0]);
|
||||||
|
|
||||||
|
bool bSuccess = false;
|
||||||
|
switch (datatype)
|
||||||
|
{
|
||||||
|
default:
|
||||||
|
case REDUCE_INT:
|
||||||
|
bSuccess = runTest<int>( argc, argv, datatype);
|
||||||
|
break;
|
||||||
|
case REDUCE_FLOAT:
|
||||||
|
bSuccess = runTest<float>( argc, argv, datatype);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// finish
|
||||||
|
shrQAFinishExit(argc, (const char **)argv, bSuccess ? QA_PASSED : QA_FAILED);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
//! Compute sum reduction on CPU
//! We use Kahan summation for an accurate sum of large arrays.
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
//!
//! @return sum of the first \a size elements of \a data; zero when
//!         \a size is not positive (no element is read in that case)
//! @param data   pointer to input data
//! @param size   number of input data elements
////////////////////////////////////////////////////////////////////////////////
template<class T>
T reduceCPU(T *data, int size)
{
    // Empty input: there is no data[0] to start from.
    if (size <= 0)
    {
        return (T)0;
    }

    T sum = data[0];
    T c = (T)0.0;   // running compensation for lost low-order bits
    for (int i = 1; i < size; i++)
    {
        T y = data[i] - c;      // apply the correction carried from the last step
        T t = sum + y;
        c = (t - sum) - y;      // what was actually added minus what should have been
        sum = t;
    }
    return sum;
}
|
||||||
|
|
||||||
|
// Rounds x up to a power of two by copying the highest set bit of (x - 1)
// into every lower bit position and then adding one. Shifts of 1, 2, 4, 8
// and 16 are applied, which covers 32 bits. A value that is already a power
// of two is returned unchanged.
unsigned int nextPow2( unsigned int x ) {
    unsigned int v = x - 1;
    for (unsigned int shift = 1; shift <= 16; shift <<= 1)
    {
        v |= v >> shift;
    }
    return v + 1;
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Compute the number of threads and blocks to use for the given reduction kernel
|
||||||
|
// For the kernels >= 3, we set threads / block to the minimum of maxThreads and
|
||||||
|
// n/2. For kernels < 3, we set to the minimum of maxThreads and n. For kernel
|
||||||
|
// 6, we observe the maximum specified number of blocks, because each thread in
|
||||||
|
// that kernel can process a variable number of elements.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
|
||||||
|
{
|
||||||
|
if (whichKernel < 3)
|
||||||
|
{
|
||||||
|
threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
|
||||||
|
blocks = (n + threads - 1) / threads;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
|
||||||
|
blocks = (n + (threads * 2 - 1)) / (threads * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (whichKernel == 6)
|
||||||
|
blocks = MIN(maxBlocks, blocks);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// This function performs a reduction of the input data multiple times and
|
||||||
|
// measures the average reduction time.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template <class T>
|
||||||
|
T profileReduce(ReduceType datatype,
|
||||||
|
cl_int n,
|
||||||
|
int numThreads,
|
||||||
|
int numBlocks,
|
||||||
|
int maxThreads,
|
||||||
|
int maxBlocks,
|
||||||
|
int whichKernel,
|
||||||
|
int testIterations,
|
||||||
|
bool cpuFinalReduction,
|
||||||
|
int cpuFinalThreshold,
|
||||||
|
double* dTotalTime,
|
||||||
|
T* h_odata,
|
||||||
|
cl_mem d_idata,
|
||||||
|
cl_mem d_odata)
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
T gpu_result = 0;
|
||||||
|
bool needReadBack = true;
|
||||||
|
cl_kernel finalReductionKernel[10];
|
||||||
|
int finalReductionIterations=0;
|
||||||
|
|
||||||
|
//shrLog("Profile Kernel %d\n", whichKernel);
|
||||||
|
|
||||||
|
cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) );
|
||||||
|
clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata);
|
||||||
|
clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata);
|
||||||
|
clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n);
|
||||||
|
clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL);
|
||||||
|
|
||||||
|
if( !cpuFinalReduction ) {
|
||||||
|
int s=numBlocks;
|
||||||
|
int threads = 0, blocks = 0;
|
||||||
|
int kernel = (whichKernel == 6) ? 5 : whichKernel;
|
||||||
|
|
||||||
|
while(s > cpuFinalThreshold)
|
||||||
|
{
|
||||||
|
getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
|
||||||
|
|
||||||
|
finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) );
|
||||||
|
clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata);
|
||||||
|
clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata);
|
||||||
|
clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n);
|
||||||
|
clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL);
|
||||||
|
|
||||||
|
if (kernel < 3)
|
||||||
|
s = (s + threads - 1) / threads;
|
||||||
|
else
|
||||||
|
s = (s + (threads*2-1)) / (threads*2);
|
||||||
|
|
||||||
|
finalReductionIterations++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t globalWorkSize[1];
|
||||||
|
size_t localWorkSize[1];
|
||||||
|
|
||||||
|
for (int i = 0; i < testIterations; ++i)
|
||||||
|
{
|
||||||
|
gpu_result = 0;
|
||||||
|
|
||||||
|
clFinish(cqCommandQueue);
|
||||||
|
if(i>0) shrDeltaT(1);
|
||||||
|
|
||||||
|
// execute the kernel
|
||||||
|
globalWorkSize[0] = numBlocks * numThreads;
|
||||||
|
localWorkSize[0] = numThreads;
|
||||||
|
|
||||||
|
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue,reductionKernel, 1, 0, globalWorkSize, localWorkSize,
|
||||||
|
0, NULL, NULL);
|
||||||
|
|
||||||
|
// check if kernel execution generated an error
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
if (cpuFinalReduction)
|
||||||
|
{
|
||||||
|
// sum partial sums from each block on CPU
|
||||||
|
// copy result from device to host
|
||||||
|
clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T),
|
||||||
|
h_odata, 0, NULL, NULL);
|
||||||
|
|
||||||
|
for(int i=0; i<numBlocks; i++)
|
||||||
|
{
|
||||||
|
gpu_result += h_odata[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
needReadBack = false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// sum partial block sums on GPU
|
||||||
|
int s=numBlocks;
|
||||||
|
int kernel = (whichKernel == 6) ? 5 : whichKernel;
|
||||||
|
int it = 0;
|
||||||
|
|
||||||
|
|
||||||
|
while(s > cpuFinalThreshold)
|
||||||
|
{
|
||||||
|
int threads = 0, blocks = 0;
|
||||||
|
getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
|
||||||
|
|
||||||
|
globalWorkSize[0] = threads * blocks;
|
||||||
|
localWorkSize[0] = threads;
|
||||||
|
|
||||||
|
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0,
|
||||||
|
globalWorkSize, localWorkSize, 0, NULL, NULL);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
if (kernel < 3)
|
||||||
|
s = (s + threads - 1) / threads;
|
||||||
|
else
|
||||||
|
s = (s + (threads*2-1)) / (threads*2);
|
||||||
|
|
||||||
|
it++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s > 1)
|
||||||
|
{
|
||||||
|
// copy result from device to host
|
||||||
|
clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T),
|
||||||
|
h_odata, 0, NULL, NULL);
|
||||||
|
|
||||||
|
for(int i=0; i < s; i++)
|
||||||
|
{
|
||||||
|
gpu_result += h_odata[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
needReadBack = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
clFinish(cqCommandQueue);
|
||||||
|
if(i>0) *dTotalTime += shrDeltaT(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (needReadBack)
|
||||||
|
{
|
||||||
|
// copy final sum from device to host
|
||||||
|
clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T),
|
||||||
|
&gpu_result, 0, NULL, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Release the kernels
|
||||||
|
clReleaseKernel(reductionKernel);
|
||||||
|
if( !cpuFinalReduction ) {
|
||||||
|
for(int it=0; it<finalReductionIterations; ++it) {
|
||||||
|
clReleaseKernel(finalReductionKernel[it]);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return gpu_result;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// This function calls profileReduce multple times for a range of array sizes
|
||||||
|
// and prints a report in CSV (comma-separated value) format that can be used for
|
||||||
|
// generating a "shmoo" plot showing the performance for each kernel variation
|
||||||
|
// over a wide range of input sizes.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template <class T>
|
||||||
|
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
|
||||||
|
{
|
||||||
|
// create random input data on CPU
|
||||||
|
unsigned int bytes = maxN * sizeof(T);
|
||||||
|
|
||||||
|
T* h_idata = (T*)malloc(bytes);
|
||||||
|
|
||||||
|
for(int i = 0; i < maxN; i++) {
|
||||||
|
// Keep the numbers small so we don't get truncation error in the sum
|
||||||
|
if (datatype == REDUCE_INT)
|
||||||
|
h_idata[i] = (T)(rand() & 0xFF);
|
||||||
|
else
|
||||||
|
h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
|
int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);
|
||||||
|
|
||||||
|
// allocate mem for the result on host side
|
||||||
|
T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));
|
||||||
|
|
||||||
|
// allocate device memory and data
|
||||||
|
cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
|
||||||
|
cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, maxNumBlocks * sizeof(T), NULL, NULL);
|
||||||
|
|
||||||
|
int testIterations = 100;
|
||||||
|
double dTotalTime = 0.0;
|
||||||
|
|
||||||
|
// print headers
|
||||||
|
shrLog("Time in seconds for various numbers of elements for each kernel\n");
|
||||||
|
shrLog("\n\n");
|
||||||
|
shrLog("Kernel");
|
||||||
|
for (int i = minN; i <= maxN; i *= 2)
|
||||||
|
{
|
||||||
|
shrLog(", %d", i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int kernel = 0; kernel < 7; kernel++)
|
||||||
|
{
|
||||||
|
shrLog("\n");
|
||||||
|
shrLog("%d", kernel);
|
||||||
|
for (int i = minN; i <= maxN; i *= 2)
|
||||||
|
{
|
||||||
|
int numBlocks = 0;
|
||||||
|
int numThreads = 0;
|
||||||
|
getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);
|
||||||
|
|
||||||
|
double reduceTime;
|
||||||
|
if( numBlocks <= MAX_BLOCK_DIM_SIZE ) {
|
||||||
|
profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks, kernel,
|
||||||
|
testIterations, false, 1, &dTotalTime, h_odata, d_idata, d_odata);
|
||||||
|
reduceTime = dTotalTime/(double)testIterations;
|
||||||
|
} else {
|
||||||
|
reduceTime = -1.0;
|
||||||
|
}
|
||||||
|
shrLog(", %.4f m", reduceTime);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cleanup
|
||||||
|
free(h_idata);
|
||||||
|
free(h_odata);
|
||||||
|
clReleaseMemObject(d_idata);
|
||||||
|
clReleaseMemObject(d_odata);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// The main function whihc runs the reduction test.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template <class T>
|
||||||
|
bool
|
||||||
|
runTest( int argc, const char** argv, ReduceType datatype)
|
||||||
|
{
|
||||||
|
int size = 1<<24; // number of elements to reduce
|
||||||
|
int maxThreads;
|
||||||
|
|
||||||
|
cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1);
|
||||||
|
clReleaseKernel(reductionKernel);
|
||||||
|
|
||||||
|
if (smallBlock)
|
||||||
|
maxThreads = 64; // number of threads per block
|
||||||
|
else
|
||||||
|
maxThreads = 128;
|
||||||
|
|
||||||
|
int whichKernel = 6;
|
||||||
|
int maxBlocks = 64;
|
||||||
|
bool cpuFinalReduction = false;
|
||||||
|
int cpuFinalThreshold = 1;
|
||||||
|
|
||||||
|
shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
|
||||||
|
shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
|
||||||
|
shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
|
||||||
|
shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
|
||||||
|
|
||||||
|
shrLog(" %d elements\n", size);
|
||||||
|
shrLog(" %d threads (max)\n", maxThreads);
|
||||||
|
|
||||||
|
cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE);
|
||||||
|
shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);
|
||||||
|
|
||||||
|
bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE);
|
||||||
|
|
||||||
|
#ifdef GPU_PROFILING
|
||||||
|
if (runShmoo)
|
||||||
|
{
|
||||||
|
shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
// create random input data on CPU
|
||||||
|
unsigned int bytes = size * sizeof(T);
|
||||||
|
T* h_idata = (T*)malloc(bytes);
|
||||||
|
|
||||||
|
for(int i=0; i<size; i++)
|
||||||
|
{
|
||||||
|
// Keep the numbers small so we don't get truncation error in the sum
|
||||||
|
if (datatype == REDUCE_INT)
|
||||||
|
h_idata[i] = (T)(rand() & 0xFF);
|
||||||
|
else
|
||||||
|
h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
|
int numBlocks = 0;
|
||||||
|
int numThreads = 0;
|
||||||
|
getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
|
||||||
|
if (numBlocks == 1) cpuFinalThreshold = 1;
|
||||||
|
shrLog(" %d blocks\n\n", numBlocks);
|
||||||
|
|
||||||
|
// allocate mem for the result on host side
|
||||||
|
T* h_odata = (T*)malloc(numBlocks * sizeof(T));
|
||||||
|
|
||||||
|
// allocate device memory and data
|
||||||
|
cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
|
||||||
|
cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, numBlocks * sizeof(T), NULL, NULL);
|
||||||
|
|
||||||
|
int testIterations = 100;
|
||||||
|
double dTotalTime = 0.0;
|
||||||
|
T gpu_result = 0;
|
||||||
|
gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks,
|
||||||
|
whichKernel, testIterations, cpuFinalReduction,
|
||||||
|
cpuFinalThreshold, &dTotalTime,
|
||||||
|
h_odata, d_idata, d_odata);
|
||||||
|
|
||||||
|
#ifdef GPU_PROFILING
|
||||||
|
double reduceTime = dTotalTime/(double)testIterations;
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
|
||||||
|
1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// compute reference solution
|
||||||
|
shrLog("\nComparing against Host/C++ computation...\n");
|
||||||
|
T cpu_result = reduceCPU<T>(h_idata, size);
|
||||||
|
if (datatype == REDUCE_INT)
|
||||||
|
{
|
||||||
|
shrLog(" GPU result = %d\n", gpu_result);
|
||||||
|
shrLog(" CPU result = %d\n\n", cpu_result);
|
||||||
|
shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLog(" GPU result = %.9f\n", gpu_result);
|
||||||
|
shrLog(" CPU result = %.9f\n\n", cpu_result);
|
||||||
|
|
||||||
|
double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12;
|
||||||
|
double diff = abs((double)gpu_result - (double)cpu_result);
|
||||||
|
shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED");
|
||||||
|
}
|
||||||
|
|
||||||
|
// cleanup
|
||||||
|
free(h_idata);
|
||||||
|
free(h_odata);
|
||||||
|
clReleaseMemObject(d_idata);
|
||||||
|
clReleaseMemObject(d_odata);
|
||||||
|
|
||||||
|
return (gpu_result == cpu_result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to create and build program and kernel
|
||||||
|
// *********************************************************************
|
||||||
|
cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2)
|
||||||
|
{
|
||||||
|
// compile cl program
|
||||||
|
size_t program_length;
|
||||||
|
char *source;
|
||||||
|
|
||||||
|
std::ostringstream preamble;
|
||||||
|
|
||||||
|
// create the program
|
||||||
|
// with type specification depending on datatype argument
|
||||||
|
switch (datatype)
|
||||||
|
{
|
||||||
|
default:
|
||||||
|
case REDUCE_INT:
|
||||||
|
preamble << "#define T int" << std::endl;
|
||||||
|
break;
|
||||||
|
case REDUCE_FLOAT:
|
||||||
|
preamble << "#define T float" << std::endl;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// set blockSize at compile time
|
||||||
|
preamble << "#define blockSize " << blockSize << std::endl;
|
||||||
|
|
||||||
|
// set isPow2 at compile time
|
||||||
|
preamble << "#define nIsPow2 " << isPowOf2 << std::endl;
|
||||||
|
|
||||||
|
// Load the source code and prepend the preamble
|
||||||
|
source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length);
|
||||||
|
//oclCheckError(source != NULL, shrTRUE);
|
||||||
|
|
||||||
|
program =
|
||||||
|
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "reduce0", NULL);
|
||||||
|
//cl_program rv_program = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source,
|
||||||
|
// &program_length, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
free(source);
|
||||||
|
|
||||||
|
// build the program
|
||||||
|
ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
// write out standard error, Build Log and PTX, then cleanup and exit
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||||
|
oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
|
||||||
|
oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclReduction.ptx");
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
// create Kernel
|
||||||
|
std::ostringstream kernelName;
|
||||||
|
kernelName << "reduce" << whichKernel;
|
||||||
|
cl_kernel ckKernel = clCreateKernel(rv_program, kernelName.str().c_str(), &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
size_t wgSize;
|
||||||
|
ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
|
||||||
|
if (wgSize == 64)
|
||||||
|
smallBlock = true;
|
||||||
|
else smallBlock = false;
|
||||||
|
|
||||||
|
// NOTE: the program will get deleted when the kernel is also released
|
||||||
|
clReleaseProgram(rv_program);
|
||||||
|
|
||||||
|
return ckKernel;
|
||||||
|
}
|
||||||
34
benchmarks/opencl/reduce0/oclReduction.h
Normal file
34
benchmarks/opencl/reduce0/oclReduction.h
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __REDUCTION_H__
|
||||||
|
#define __REDUCTION_H__
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void reduce_sm10(int size, int threads, int blocks,
|
||||||
|
int whichKernel, T *d_idata, T *d_odata);
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void reduce_sm13(int size, int threads, int blocks,
|
||||||
|
int whichKernel, T *d_idata, T *d_odata);
|
||||||
|
|
||||||
|
// CL objects
|
||||||
|
cl_platform_id cpPlatform;
|
||||||
|
cl_uint uiNumDevices;
|
||||||
|
cl_device_id* cdDevices;
|
||||||
|
cl_context cxGPUContext;
|
||||||
|
cl_command_queue cqCommandQueue;
|
||||||
|
cl_device_id device;
|
||||||
|
cl_int ciErrNum;
|
||||||
|
const char* source_path;
|
||||||
|
bool smallBlock = true;
|
||||||
|
|
||||||
|
#endif
|
||||||
273
benchmarks/opencl/reduce0/oclReduction_kernel.cl
Normal file
273
benchmarks/opencl/reduce0/oclReduction_kernel.cl
Normal file
@@ -0,0 +1,273 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
Parallel reduction kernels
|
||||||
|
*/
|
||||||
|
|
||||||
|
// The following defines are set during runtime compilation, see reduction.cpp
|
||||||
|
// #define T float
|
||||||
|
// #define blockSize 128
|
||||||
|
// #define nIsPow2 1
|
||||||
|
|
||||||
|
#ifndef _REDUCE_KERNEL_H_
|
||||||
|
#define _REDUCE_KERNEL_H_
|
||||||
|
|
||||||
|
/*
|
||||||
|
Parallel sum reduction using shared memory
|
||||||
|
- takes log(n) steps for n input elements
|
||||||
|
- uses n threads
|
||||||
|
- only works for power-of-2 arrays
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Interleaved addressing selected with the modulo operator: in each pass only
   work-items whose local id is a multiple of 2*stride add in the element
   `stride` slots away. The modulo operator is very expensive on GPUs, and the
   interleaved inactivity means that no whole warps are active, which is also
   very inefficient. */
__kernel void reduce0(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
    const unsigned int tid = get_local_id(0);
    const unsigned int gid = get_global_id(0);

    // stage one input element per work-item in local memory, zero past the end
    if (gid < n)
        sdata[tid] = g_idata[gid];
    else
        sdata[tid] = 0;

    barrier(CLK_LOCAL_MEM_FENCE);

    // tree reduction in local memory, doubling the stride every pass
    unsigned int stride = 1;
    while (stride < get_local_size(0))
    {
        // modulo arithmetic is slow!
        if ((tid % (2*stride)) == 0)
            sdata[tid] += sdata[tid + stride];

        barrier(CLK_LOCAL_MEM_FENCE);
        stride *= 2;
    }

    // work-item 0 writes this work-group's partial sum to global memory
    if (tid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Contiguous work-items do the additions (work-item tid handles slot
   2*stride*tid), but the interleaved addressing results in many shared
   memory bank conflicts. */
__kernel void reduce1(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
    const unsigned int tid = get_local_id(0);
    const unsigned int gid = get_global_id(0);

    // stage one input element per work-item in local memory, zero past the end
    if (gid < n)
        sdata[tid] = g_idata[gid];
    else
        sdata[tid] = 0;

    barrier(CLK_LOCAL_MEM_FENCE);

    // tree reduction in local memory, doubling the stride every pass
    unsigned int stride = 1;
    while (stride < get_local_size(0))
    {
        int index = 2 * stride * tid;

        if (index < get_local_size(0))
            sdata[index] += sdata[index + stride];

        barrier(CLK_LOCAL_MEM_FENCE);
        stride *= 2;
    }

    // work-item 0 writes this work-group's partial sum to global memory
    if (tid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||||
|
|
||||||
|
/*
    Sequential addressing: the lower half of the active range adds in the upper
    half, halving the range every pass -- no divergence or bank conflicts.
*/
__kernel void reduce2(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
    const unsigned int tid = get_local_id(0);
    const unsigned int gid = get_global_id(0);

    // stage one input element per work-item in local memory, zero past the end
    if (gid < n)
        sdata[tid] = g_idata[gid];
    else
        sdata[tid] = 0;

    barrier(CLK_LOCAL_MEM_FENCE);

    // tree reduction in local memory, halving the stride every pass
    unsigned int stride = get_local_size(0)/2;
    while (stride > 0)
    {
        if (tid < stride)
            sdata[tid] += sdata[tid + stride];

        barrier(CLK_LOCAL_MEM_FENCE);
        stride >>= 1;
    }

    // work-item 0 writes this work-group's partial sum to global memory
    if (tid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||||
|
|
||||||
|
/*
    Uses n/2 work-items: each one adds two input elements (one work-group
    width apart) while loading from global memory, so the first reduction
    level happens during the load.
*/
__kernel void reduce3(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
    const unsigned int tid = get_local_id(0);
    // each work-group covers 2 * local_size consecutive input elements
    const unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);

    // first level of reduction: global memory -> local memory
    if (i < n)
        sdata[tid] = g_idata[i];
    else
        sdata[tid] = 0;

    if (i + get_local_size(0) < n)
        sdata[tid] += g_idata[i+get_local_size(0)];

    barrier(CLK_LOCAL_MEM_FENCE);

    // tree reduction in local memory, halving the stride every pass
    unsigned int stride = get_local_size(0)/2;
    while (stride > 0)
    {
        if (tid < stride)
            sdata[tid] += sdata[tid + stride];

        barrier(CLK_LOCAL_MEM_FENCE);
        stride >>= 1;
    }

    // work-item 0 writes this work-group's partial sum to global memory
    if (tid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||||
|
|
||||||
|
/*
    Like reduce3, but the last steps (strides 32 and below) are written out
    without barriers to avoid synchronization where it isn't needed.
    NOTE(review): the barrier-free tail presumably relies on the 32 lowest
    work-items executing in lockstep -- confirm this holds on the target device.
*/
__kernel void reduce4(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
{
    const unsigned int tid = get_local_id(0);
    // each work-group covers 2 * local_size consecutive input elements
    const unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);

    // first level of reduction: global memory -> local memory
    if (i < n)
        sdata[tid] = g_idata[i];
    else
        sdata[tid] = 0;

    if (i + get_local_size(0) < n)
        sdata[tid] += g_idata[i+get_local_size(0)];

    barrier(CLK_LOCAL_MEM_FENCE);

    // barrier-synchronised passes for strides larger than 32
    #pragma unroll 1
    for (unsigned int stride = get_local_size(0)/2; stride > 32; stride >>= 1)
    {
        if (tid < stride)
            sdata[tid] += sdata[tid + stride];

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // remaining passes, written out in order with no barrier between them
    if (tid < 32)
    {
        if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
        if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
        if (blockSize >= 16) { sdata[tid] += sdata[tid +  8]; }
        if (blockSize >=  8) { sdata[tid] += sdata[tid +  4]; }
        if (blockSize >=  4) { sdata[tid] += sdata[tid +  2]; }
        if (blockSize >=  2) { sdata[tid] += sdata[tid +  1]; }
    }

    // work-item 0 writes this work-group's partial sum to global memory
    if (tid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||||
|
|
||||||
|
/*
    Completely unrolled version. Every reduction step is guarded by a test on
    the compile-time blockSize define, so steps that do not apply to the chosen
    (power of 2) work-group size can be dropped by the compiler. The host code
    has to build the kernel for each work-group size it wants to use.
    NOTE(review): the barrier-free tail presumably relies on the 32 lowest
    work-items executing in lockstep -- confirm this holds on the target device.
*/
__kernel void reduce5(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
{
    const unsigned int tid = get_local_id(0);
    // each work-group covers 2 * local_size consecutive input elements
    const unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);

    // first level of reduction: global memory -> local memory
    if (i < n)
        sdata[tid] = g_idata[i];
    else
        sdata[tid] = 0;

    if (i + blockSize < n)
        sdata[tid] += g_idata[i+blockSize];

    barrier(CLK_LOCAL_MEM_FENCE);

    // barrier-synchronised steps for strides 256, 128 and 64
    if (blockSize >= 512)
    {
        if (tid < 256) { sdata[tid] += sdata[tid + 256]; }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (blockSize >= 256)
    {
        if (tid < 128) { sdata[tid] += sdata[tid + 128]; }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (blockSize >= 128)
    {
        if (tid <  64) { sdata[tid] += sdata[tid +  64]; }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // remaining steps, written out in order with no barrier between them
    if (tid < 32)
    {
        if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
        if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
        if (blockSize >= 16) { sdata[tid] += sdata[tid +  8]; }
        if (blockSize >=  8) { sdata[tid] += sdata[tid +  4]; }
        if (blockSize >=  4) { sdata[tid] += sdata[tid +  2]; }
        if (blockSize >=  2) { sdata[tid] += sdata[tid +  1]; }
    }

    // work-item 0 writes this work-group's partial sum to global memory
    if (tid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||||
|
|
||||||
|
/*
    Each work-item adds multiple input elements sequentially before the
    local-memory reduction starts. This reduces the overall cost of the
    algorithm while keeping the work complexity O(n) and the step complexity
    O(log n). (Brent's Theorem optimization)
    NOTE(review): the barrier-free tail presumably relies on the 32 lowest
    work-items executing in lockstep -- confirm this holds on the target device.
*/
__kernel void reduce6(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
{
    const unsigned int tid = get_local_id(0);
    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
    // distance between two consecutive elements handled by the same work-item
    const unsigned int gridSize = blockSize*2*get_num_groups(0);

    sdata[tid] = 0;

    // we reduce multiple elements per work-item. The number is determined by
    // the number of work-groups: more groups result in a larger gridSize and
    // therefore fewer elements per work-item
    for (; i < n; i += gridSize)
    {
        sdata[tid] += g_idata[i];

        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
        if (nIsPow2 || i + blockSize < n)
            sdata[tid] += g_idata[i+blockSize];
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // barrier-synchronised steps for strides 256, 128 and 64
    if (blockSize >= 512)
    {
        if (tid < 256) { sdata[tid] += sdata[tid + 256]; }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (blockSize >= 256)
    {
        if (tid < 128) { sdata[tid] += sdata[tid + 128]; }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (blockSize >= 128)
    {
        if (tid <  64) { sdata[tid] += sdata[tid +  64]; }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // remaining steps, written out in order with no barrier between them
    if (tid < 32)
    {
        if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
        if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
        if (blockSize >= 16) { sdata[tid] += sdata[tid +  8]; }
        if (blockSize >=  8) { sdata[tid] += sdata[tid +  4]; }
        if (blockSize >=  4) { sdata[tid] += sdata[tid +  2]; }
        if (blockSize >=  2) { sdata[tid] += sdata[tid +  1]; }
    }

    // work-item 0 writes this work-group's partial sum to global memory
    if (tid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||||
|
|
||||||
|
#endif // #ifndef _REDUCE_KERNEL_H_
|
||||||
198
benchmarks/opencl/reduce0/oclUtils.h
Normal file
198
benchmarks/opencl/reduce0/oclUtils.h
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef OCL_UTILS_H
|
||||||
|
#define OCL_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// Common headers: Cross-API utilities and OpenCL header
|
||||||
|
#include <shrUtils.h>
|
||||||
|
|
||||||
|
// All OpenCL headers
|
||||||
|
#if defined (__APPLE__) || defined(MACOSX)
|
||||||
|
#include <OpenCL/opencl.h>
|
||||||
|
#else
|
||||||
|
#include <CL/opencl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Includes
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||||
|
// extensions from <CL/cl_ext.h>
|
||||||
|
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||||
|
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||||
|
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||||
|
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||||
|
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||||
|
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||||
|
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// reminders for build output window and log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including shrUtils.h")
|
||||||
|
#pragma message ("Note: including opencl.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// SDK Revision #
|
||||||
|
#define OCL_SDKREVISION "7027912"
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param clSelectedPlatformID OpenCL platform ID
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print info about the device
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and return device capability
|
||||||
|
//!
|
||||||
|
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" int oclGetDevCap(cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print the device name
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the first device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the nth device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id or -1 when out of range
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//! @param device_idx index of the device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of device with maximal FLOPS from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Loads a Program file and prepends the cPreamble to the code.
|
||||||
|
//!
|
||||||
|
//! @return the source string if succeeded, 0 otherwise
|
||||||
|
//! @param cFilename program filename
|
||||||
|
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||||
|
//! @param szFinalLength returned length of the code string
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the binary (PTX) of the program associated with the device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param binary returned code
|
||||||
|
//! @param length length of returned code
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param const char* cPtxFileName optional PTX file name
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||||
|
|
||||||
|
// Helper function for De-allocating cl objects
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL error string from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclErrorString(cl_int error);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
// Compares a sample/test value against a reference value. On a mismatch the
// error is logged and then either the cleanup callback is invoked or, when no
// callback is supplied, the log is closed and the process exits.
//
//! @param iSample     value under test; also passed to oclErrorString() for the log text
//! @param iReference  value iSample is expected to equal
//! @param pCleanup    optional callback invoked with the error code; may be NULL
//! @param cFile       source file name reported in the log message
//! @param iLine       source line number reported in the log message
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Sample equals reference: not an error, nothing to do
    if (iReference == iSample)
    {
        return;
    }

    // A mismatch is an error by definition, so a sample value of 0 is replaced by -9999
    const cl_int iErrCode = (iSample != 0) ? iSample : -9999;

    // Log the error info
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iErrCode, oclErrorString(iErrCode), iLine, cFile);

    // No cleanup callback: close the log and use the error code as the process exit code
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iErrCode);
    }

    // Cleanup callback supplied: hand it the error code and return to the caller afterwards
    pCleanup(iErrCode);
}
|
||||||
|
|
||||||
|
#endif
|
||||||
238
benchmarks/opencl/reduce0/shrQATest.h
Normal file
238
benchmarks/opencl/reduce0/shrQATest.h
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_QATEST_H
|
||||||
|
#define SHR_QATEST_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#pragma message ("Note: including time.h")
|
||||||
|
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#include <time.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <time.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRCASECMP _stricmp
|
||||||
|
#else
|
||||||
|
#define STRCASECMP strcasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRNCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRNCASECMP _strnicmp
|
||||||
|
#else
|
||||||
|
#define STRNCASECMP strncasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Standardized QA Start/Finish for CUDA SDK tests
|
||||||
|
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||||
|
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||||
|
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||||
|
|
||||||
|
// Locate where the executable's base name begins inside a path string.
//
//! @param exec_name NUL-terminated path; both '/' and '\\' count as separators
//! @return index of the character following the last separator,
//!         or 0 when the string contains no separator
inline int findExeNameStart(const char *exec_name)
{
    int name_offset = 0;

    // Walk forward and remember the position just past every separator seen;
    // the last one remembered is the start of the base name.
    for (int pos = 0; exec_name[pos] != '\0'; ++pos) {
        const char ch = exec_name[pos];
        if (ch == '/' || ch == '\\') {
            name_offset = pos + 1;
        }
    }

    return name_offset;
}
|
||||||
|
|
||||||
|
inline int __shrQAStart(int argc, char **argv)
|
||||||
|
{
|
||||||
|
bool bQATest = false;
|
||||||
|
// First clear the output buffer
|
||||||
|
fflush(stdout);
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
char *string_argv = &argv[i][string_start];
|
||||||
|
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We don't want to print the entire path, so we search for the first
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
return exename_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Result codes for a QA test run. The values line up with the positions of
// "FAILED", "PASSED" and "WAIVED" in the sStatus[] name tables of this header.
enum eQAstatus {
    QA_FAILED,   // 0
    QA_PASSED,   // 1
    QA_WAIVED    // 2
};
|
||||||
|
|
||||||
|
// Print a countdown on stdout and block until roughly `seconds` seconds of
// wall-clock time have passed.
//
//! @param seconds length of the delay; with 0 (or a negative value) only the
//!                header and "done!" lines are printed
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    const time_t deadline = time(0) + seconds;
    for (int count = seconds; time(0) < deadline; count--) {
        fprintf(stdout, "%d...", count);
        // Flush every tick so the countdown is visible while waiting
        fflush(stdout);
        // _WIN32 is what the rest of this header tests (and what the Windows
        // includes are guarded by); WIN32 is kept for builds that define it.
#if defined(_WIN32) || defined(WIN32)
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||||
|
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bNoPrompt = true;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bNoPrompt = false;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
} else {
|
||||||
|
if (!bNoPrompt) {
|
||||||
|
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||||
|
fflush(stdout);
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
bool bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish(argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
642
benchmarks/opencl/reduce0/shrUtils.h
Normal file
642
benchmarks/opencl/reduce0/shrUtils.h
Normal file
@@ -0,0 +1,642 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_UTILS_H
|
||||||
|
#define SHR_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// reminders for output window and build log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Other headers needed for both Windows and Linux
|
||||||
|
#include <math.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// Un-comment the following #define to enable profiling code in SDK apps
|
||||||
|
//#define GPU_PROFILING
|
||||||
|
|
||||||
|
// Beginning of GPU Architecture definitions
|
||||||
|
// Map an SM (compute capability) version to the number of cores per SM.
//
//! @param major SM major version
//! @param minor SM minor version
//! @return cores per SM for a version listed in the table below; otherwise a
//!         message is printed to stdout and -1 is returned
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One table row per known SM version
    typedef struct {
        int SM;    // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores; // number of cores per SM for that version
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
        {   -1,  -1 }  // end-of-table sentinel
    };

    // Pack major/minor into the same 0xMm form used by the table
    const int sm_key = (major << 4) + minor;

    for (int row = 0; nGpuArchCoresPerSM[row].SM != -1; ++row) {
        if (nGpuArchCoresPerSM[row].SM == sm_key) {
            return nGpuArchCoresPerSM[row].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||||
|
// end of GPU Architecture definitions
|
||||||
|
|
||||||
|
|
||||||
|
// Defines and enum for use with logging functions
|
||||||
|
// *********************************************************************
|
||||||
|
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||||
|
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||||
|
// Bit flags selecting where and how the logging functions write.
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // bit to signal "log to console"
    LOGFILE    = 1 << 1,                // bit to signal "log to file"
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // convenience union of first 2 bits to signal "log to both"
    APPENDMODE = 1 << 2,                // bit to set "file append" mode instead of "replace mode" on open
    MASTER     = 1 << 3,                // bit to signal master .csv log output
    ERRORMSG   = 1 << 4,                // bit to signal "pre-pend Error"
    CLOSELOG   = 1 << 5                 // bit to close log file, if open, after any requested file write
};
|
||||||
|
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||||
|
|
||||||
|
// Standardized boolean
|
||||||
|
// Two-valued boolean type used as the return type of the shr* file helpers.
enum shrBOOL
{
    shrFALSE,   // 0
    shrTRUE     // 1
};
|
||||||
|
|
||||||
|
// Standardized MAX, MIN and CLAMP
|
||||||
|
// Every argument is parenthesized so that expressions containing operators of
// lower precedence than the comparison (e.g. x & mask) expand correctly.
// Note: arguments are still evaluated more than once; avoid side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) (MIN(MAX((a), (b)), (c))) // double sided clip of input a
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b)) // single top side clip of input a
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... extended version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... short version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||||
|
|
||||||
|
// Simple argument checker macro
|
||||||
|
// Returns shrFALSE from the enclosing function unless (a) equals shrTRUE.
// Wrapped in do { } while (0) so the macro behaves as a single statement and
// cannot capture an `else` that follows it at the call site.
#define ARGCHECK(a) do { if ((a) != shrTRUE) return shrFALSE; } while (0)
|
||||||
|
|
||||||
|
// Define for user-customized error handling
|
||||||
|
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||||
|
|
||||||
|
// Function to deallocate memory allocated within shrUtils
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFree(void* ptr);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Helper function to log standardized information to Console, to File or to both
|
||||||
|
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||||
|
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||||
|
//!
|
||||||
|
//! Automatically opens file and stores handle if needed and not done yet
|
||||||
|
//! Closes file and nulls handle on request
|
||||||
|
//!
|
||||||
|
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||||
|
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||||
|
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||||
|
//! @param 2 dValue:
|
||||||
|
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||||
|
//! Negative val is an error code and this gives error preformatting.
|
||||||
|
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||||
|
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||||
|
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||||
|
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||||
|
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
|
||||||
|
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLog(const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||||
|
// Maintains state for 3 independent counters
|
||||||
|
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||||
|
//!
|
||||||
|
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||||
|
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" double shrDeltaT(int iCounterID);
|
||||||
|
|
||||||
|
// Optional LogFileNameOverride function
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||||
|
|
||||||
|
// Helper function to init data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
// Helper function to print data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Find the path for a filename
|
||||||
|
//! @return the path if succeeded, otherwise 0
|
||||||
|
//! @param filename name of the file
|
||||||
|
//! @param executablePath optional absolute path of the executable
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing single precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing double precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing single precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||||
|
const float epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing double precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
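//! @param data pointer to data to write (NOTE(review): declared as const float* although this function is described as writing double precision data; confirm against the implementation)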
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||||
|
const double epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PPM image file (with unsigned char as data element type), padding
|
||||||
|
//! 4th component
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param OutData handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//!
|
||||||
|
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||||
|
unsigned int *w, unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||||
|
//! 4 bytes)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||||
|
unsigned int *w,unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Command line arguments: General notes
|
||||||
|
// * All command line arguments begin with '--' followed by the token;
|
||||||
|
// token and value are separated by '='; example --samples=50
|
||||||
|
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||||
|
// (without whitespaces)
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Check if command line argument \a flag_name is given
|
||||||
|
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||||
|
//! otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param flag_name name of command line flag
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||||
|
const char* flag_name);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||||
|
const char* arg_name, int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type unsigned int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||||
|
const char* arg_name, unsigned int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type float
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||||
|
const char* arg_name, float* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type string
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument list whose elements are strings
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val command line argument list
|
||||||
|
//! @param len length of the list / number of elements
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val,
|
||||||
|
unsigned int* len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||||
|
const unsigned int len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integer arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned char arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integers with a tolerance for # of byte errors
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integer arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||||
|
//! threshold for # pixel errors
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||||
|
//! equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param ref_file filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! $param verboseErrors output details of image mismatch to std::err
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param ref_file filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! $param verboseErrors output details of image mismatch to std::err
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||||
|
|
||||||
|
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||||
|
{
|
||||||
|
if (iReference != iSample)
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||||
|
if (pCleanup != NULL)
|
||||||
|
{
|
||||||
|
pCleanup(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Standardized Exit
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||||
|
{
|
||||||
|
#ifdef WIN32
|
||||||
|
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#else
|
||||||
|
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||||
|
}
|
||||||
|
fflush(stderr);
|
||||||
|
exit(iExitCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
66
benchmarks/opencl/transpose/Makefile
Normal file
66
benchmarks/opencl/transpose/Makefile
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
# Build rules for the OpenCL "transpose" benchmark.
#   all     - Vortex image (.elf) plus its disassembly (.dump) and Intel-hex (.hex)
#   run     - run the .hex image in the simX emulator
#   qemu    - build and run the QEMU user-mode variant
#   gdb-s   - start the QEMU variant waiting for a debugger on port 1234
#   gdb-c   - start gdb on the QEMU binary
PROJECT = transpose

# Tool and library locations. $(wildcard ...) expands to empty when the
# directory does not exist.
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH = $(wildcard ../include)
POCL_LIB_PATH = $(wildcard ../lib)
VX_RT_PATH = $(wildcard ../../../runtime)
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)

CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb

# Vortex runtime sources linked into the bare-metal image.
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# These variables are not set in this file; they expand to nothing unless
# supplied from the command line or environment.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a

# None of these targets produce a file of the same name.
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# Compile the kernel source named in the prerequisite list ($<). The recipe
# previously compiled kernel.cl, which did not match the transpose.cl
# prerequisite.
lib$(PROJECT).a: transpose.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a $<

$(PROJECT).elf: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf

# transpose_gold.cpp is compiled by the recipe, so it is listed as a prerequisite.
$(PROJECT).qemu: main.cc transpose_gold.cpp lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc transpose_gold.cpp $(QEMU_LIBS) -o $(PROJECT).qemu

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $(PROJECT).elf > $(PROJECT).dump

run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.elf *.dump *.hex
|
||||||
365
benchmarks/opencl/transpose/main.cc
Normal file
365
benchmarks/opencl/transpose/main.cc
Normal file
@@ -0,0 +1,365 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Matrix transpose with OpenCL
|
||||||
|
* Host code.
|
||||||
|
|
||||||
|
* This example transposes arbitrary-size matrices. It compares a naive
|
||||||
|
* transpose kernel that suffers from non-coalesced writes, to an optimized
|
||||||
|
* transpose with fully coalesced memory access and no bank conflicts. On
|
||||||
|
* a G80 GPU, the optimized transpose can be more than 10x faster for large
|
||||||
|
* matrices.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// standard utility and system includes
|
||||||
|
#include <oclUtils.h>
|
||||||
|
#include <shrQATest.h>
|
||||||
|
|
||||||
|
#define BLOCK_DIM 16
|
||||||
|
|
||||||
|
// Maximum number of GPUs managed for multi-GPU parallel compute.
// Sizes the ckKernel and commandQueue arrays below as well as the
// per-device arrays declared inside transposeGPU().
|
||||||
|
const unsigned int MAX_GPU_COUNT = 8;
|
||||||
|
|
||||||
|
// Global OpenCL state shared between runTest() and transposeGPU()
|
||||||
|
// Platform handle; filled in by oclGetPlatformID() in runTest().
cl_platform_id cpPlatform;
|
||||||
|
// Number of devices reported by clGetDeviceIDs() in runTest().
cl_uint uiNumDevices;
|
||||||
|
// Device list; malloc'd in runTest() with uiNumDevices entries.
cl_device_id* cdDevices;
|
||||||
|
// Context created over cdDevices in runTest(); transposeGPU() allocates its buffers in it.
cxGPUContext_placeholder_removed
|
||||||
|
|
||||||
|
// forward declarations
|
||||||
|
// *********************************************************************
|
||||||
|
int runTest( int argc, const char** argv);
|
||||||
|
extern "C" void computeGold( float* reference, float* idata,
|
||||||
|
const unsigned int size_x, const unsigned int size_y );
|
||||||
|
|
||||||
|
// Main Program
|
||||||
|
// *********************************************************************
|
||||||
|
int main( int argc, const char** argv)
|
||||||
|
{
|
||||||
|
shrQAStart(argc, (char **)argv);
|
||||||
|
|
||||||
|
// set logfile name and start logs
|
||||||
|
shrSetLogFileName ("oclTranspose.txt");
|
||||||
|
shrLog("%s Starting...\n\n", argv[0]);
|
||||||
|
|
||||||
|
// run the main test
|
||||||
|
int result = runTest(argc, argv);
|
||||||
|
//oclCheckError(result, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
|
||||||
|
{
|
||||||
|
cl_mem d_odata[MAX_GPU_COUNT];
|
||||||
|
cl_mem d_idata[MAX_GPU_COUNT];
|
||||||
|
cl_kernel ckKernel[MAX_GPU_COUNT];
|
||||||
|
|
||||||
|
size_t szGlobalWorkSize[2];
|
||||||
|
size_t szLocalWorkSize[2];
|
||||||
|
cl_int ciErrNum;
|
||||||
|
|
||||||
|
// Create buffers for each GPU
|
||||||
|
// Each GPU will compute sizePerGPU rows of the result
|
||||||
|
size_t sizePerGPU = shrRoundUp(BLOCK_DIM, (size_x+ciDeviceCount-1) / ciDeviceCount);
|
||||||
|
|
||||||
|
// size of memory required to store the matrix
|
||||||
|
const size_t mem_size = sizeof(float) * size_x * size_y;
|
||||||
|
|
||||||
|
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||||
|
// allocate device memory and copy host to device memory
|
||||||
|
d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||||
|
mem_size, h_idata, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// create buffer to store output
|
||||||
|
d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY ,
|
||||||
|
sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// create the naive transpose kernel
|
||||||
|
ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// set the args values for the naive kernel
|
||||||
|
size_t offset = i * sizePerGPU;
|
||||||
|
ciErrNum = clSetKernelArg(ckKernel[i], 0, sizeof(cl_mem), (void *) &d_odata[i]);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[i], 1, sizeof(cl_mem), (void *) &d_idata[0]);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(int), &offset);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x);
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y);
|
||||||
|
if(useLocalMem)
|
||||||
|
{
|
||||||
|
ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// set up execution configuration
|
||||||
|
szLocalWorkSize[0] = BLOCK_DIM;
|
||||||
|
szLocalWorkSize[1] = BLOCK_DIM;
|
||||||
|
szGlobalWorkSize[0] = sizePerGPU;
|
||||||
|
szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y);
|
||||||
|
|
||||||
|
// execute the kernel numIterations times
|
||||||
|
int numIterations = 100;
|
||||||
|
shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y);
|
||||||
|
for (int i = -1; i < numIterations; ++i)
|
||||||
|
{
|
||||||
|
// Start time measurement after warmup
|
||||||
|
if( i == 0 ) shrDeltaT(0);
|
||||||
|
|
||||||
|
for(unsigned int k=0; k < ciDeviceCount; ++k){
|
||||||
|
ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL,
|
||||||
|
szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
|
||||||
|
}
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Block CPU till GPU is done
|
||||||
|
for(unsigned int k=0; k < ciDeviceCount; ++k){
|
||||||
|
ciErrNum |= clFinish(commandQueue[k]);
|
||||||
|
}
|
||||||
|
double time = shrDeltaT(0)/(double)numIterations;
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// Copy back to host
|
||||||
|
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||||
|
size_t offset = i * sizePerGPU;
|
||||||
|
size_t size = MIN(size_x - i * sizePerGPU, sizePerGPU);
|
||||||
|
|
||||||
|
ciErrNum |= clEnqueueReadBuffer(commandQueue[i], d_odata[i], CL_TRUE, 0,
|
||||||
|
size * size_y * sizeof(float), &h_odata[offset * size_y],
|
||||||
|
0, NULL, NULL);
|
||||||
|
}
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||||
|
ciErrNum |= clReleaseMemObject(d_idata[i]);
|
||||||
|
ciErrNum |= clReleaseMemObject(d_odata[i]);
|
||||||
|
ciErrNum |= clReleaseKernel(ckKernel[i]);
|
||||||
|
}
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
return time;
|
||||||
|
}
|
||||||
|
|
||||||
|
//! Run a simple test for OpenCL
|
||||||
|
// *********************************************************************
|
||||||
|
int runTest( const int argc, const char** argv)
|
||||||
|
{
|
||||||
|
cl_int ciErrNum;
|
||||||
|
cl_uint ciDeviceCount;
|
||||||
|
unsigned int size_x = 2048;
|
||||||
|
unsigned int size_y = 2048;
|
||||||
|
|
||||||
|
int temp;
|
||||||
|
if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){
|
||||||
|
size_x = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( shrGetCmdLineArgumenti( argc, argv,"height", &temp) ){
|
||||||
|
size_y = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// size of memory required to store the matrix
|
||||||
|
const size_t mem_size = sizeof(float) * size_x * size_y;
|
||||||
|
|
||||||
|
//Get the NVIDIA platform
|
||||||
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
//Get the devices
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||||
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
//Create the context
|
||||||
|
cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
|
||||||
|
{
|
||||||
|
ciDeviceCount = 0;
|
||||||
|
// User specified GPUs
|
||||||
|
char* deviceList;
|
||||||
|
char* deviceStr;
|
||||||
|
|
||||||
|
shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);
|
||||||
|
|
||||||
|
#ifdef WIN32
|
||||||
|
char* next_token;
|
||||||
|
deviceStr = strtok_s (deviceList," ,.-", &next_token);
|
||||||
|
#else
|
||||||
|
deviceStr = strtok (deviceList," ,.-");
|
||||||
|
#endif
|
||||||
|
ciDeviceCount = 0;
|
||||||
|
while(deviceStr != NULL)
|
||||||
|
{
|
||||||
|
// get and print the device for this queue
|
||||||
|
cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
|
||||||
|
if( device == (cl_device_id)-1 ) {
|
||||||
|
shrLog(" Invalid Device: %s\n\n", deviceStr);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
shrLog("Device %d: ", atoi(deviceStr));
|
||||||
|
oclPrintDevName(LOGBOTH, device);
|
||||||
|
shrLog("\n");
|
||||||
|
|
||||||
|
// create command queue
|
||||||
|
commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
|
||||||
|
return ciErrNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
++ciDeviceCount;
|
||||||
|
|
||||||
|
#ifdef WIN32
|
||||||
|
deviceStr = strtok_s (NULL," ,.-", &next_token);
|
||||||
|
#else
|
||||||
|
deviceStr = strtok (NULL," ,.-");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
free(deviceList);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Find out how many GPU's to compute on all available GPUs
|
||||||
|
size_t nDeviceBytes;
|
||||||
|
ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
|
||||||
|
ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);
|
||||||
|
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
|
||||||
|
return ciErrNum;
|
||||||
|
}
|
||||||
|
else if (ciDeviceCount == 0)
|
||||||
|
{
|
||||||
|
shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create command-queues
|
||||||
|
for(unsigned int i = 0; i < ciDeviceCount; ++i)
|
||||||
|
{
|
||||||
|
// get and print the device for this queue
|
||||||
|
cl_device_id device = oclGetDev(cxGPUContext, i);
|
||||||
|
shrLog("Device %d: ", i);
|
||||||
|
oclPrintDevName(LOGBOTH, device);
|
||||||
|
shrLog("\n");
|
||||||
|
|
||||||
|
// create command queue
|
||||||
|
commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
|
||||||
|
return ciErrNum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate and initialize host memory
|
||||||
|
float* h_idata = (float*)malloc(mem_size);
|
||||||
|
float* h_odata = (float*) malloc(mem_size);
|
||||||
|
srand(15235911);
|
||||||
|
shrFillArray(h_idata, (size_x * size_y));
|
||||||
|
|
||||||
|
// Program Setup
|
||||||
|
size_t program_length;
|
||||||
|
char* source_path = shrFindFilePath("transpose.cl", argv[0]);
|
||||||
|
//oclCheckError(source_path != NULL, shrTRUE);
|
||||||
|
char *source = oclLoadProgSource(source_path, "", &program_length);
|
||||||
|
//oclCheckError(source != NULL, shrTRUE);
|
||||||
|
|
||||||
|
// create the program
|
||||||
|
rv_program =
|
||||||
|
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "transpose", NULL);
|
||||||
|
//rv_program = clCreateProgramWithSource(cxGPUContext, 1,
|
||||||
|
// (const char **)&source, &program_length, &ciErrNum);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// build the program
|
||||||
|
ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
|
||||||
|
if (ciErrNum != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
// write out standard error, Build Log and PTX, then return error
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||||
|
oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
|
||||||
|
oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclTranspose.ptx");
|
||||||
|
return(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run Naive Kernel
|
||||||
|
#ifdef GPU_PROFILING
|
||||||
|
// Matrix Copy kernel runs to measure reference performance.
|
||||||
|
double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||||
|
double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||||
|
double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||||
|
double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||||
|
|
||||||
|
#ifdef GPU_PROFILING
|
||||||
|
// log times
|
||||||
|
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-simple copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/simpleCopyTime), simpleCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||||
|
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-shared memory copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/sharedCopyTime), sharedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||||
|
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-uncoalesced copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/uncoalescedCopyTime), uncoalescedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||||
|
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-naive, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/naiveTime), naiveTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||||
|
|
||||||
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-optimized, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||||
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/optimizedTime), optimizedTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// compute reference solution and cross check results
|
||||||
|
float* reference = (float*)malloc( mem_size);
|
||||||
|
computeGold( reference, h_idata, size_x, size_y);
|
||||||
|
shrLog("\nComparing results with CPU computation... \n\n");
|
||||||
|
shrBOOL res = shrComparef( reference, h_odata, size_x * size_y);
|
||||||
|
|
||||||
|
// cleanup memory
|
||||||
|
free(h_idata);
|
||||||
|
free(h_odata);
|
||||||
|
free(reference);
|
||||||
|
free(source);
|
||||||
|
free(source_path);
|
||||||
|
|
||||||
|
// cleanup OpenCL
|
||||||
|
ciErrNum = clReleaseProgram(rv_program);
|
||||||
|
for(unsigned int i = 0; i < ciDeviceCount; ++i)
|
||||||
|
{
|
||||||
|
ciErrNum |= clReleaseCommandQueue(commandQueue[i]);
|
||||||
|
}
|
||||||
|
ciErrNum |= clReleaseContext(cxGPUContext);
|
||||||
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||||
|
|
||||||
|
// pass or fail (cumulative... all tests in the loop)
|
||||||
|
shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
198
benchmarks/opencl/transpose/oclUtils.h
Normal file
198
benchmarks/opencl/transpose/oclUtils.h
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef OCL_UTILS_H
|
||||||
|
#define OCL_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// Common headers: Cross-API utilities and OpenCL header
|
||||||
|
#include <shrUtils.h>
|
||||||
|
|
||||||
|
// All OpenCL headers
|
||||||
|
#if defined (__APPLE__) || defined(MACOSX)
|
||||||
|
#include <OpenCL/opencl.h>
|
||||||
|
#else
|
||||||
|
#include <CL/opencl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Includes
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||||
|
// extensions from <CL/cl_ext.h>
|
||||||
|
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||||
|
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||||
|
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||||
|
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||||
|
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||||
|
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||||
|
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||||
|
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// reminders for build output window and log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including shrUtils.h")
|
||||||
|
#pragma message ("Note: including opencl.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// SDK Revision #
|
||||||
|
#define OCL_SDKREVISION "7027912"
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param clSelectedPlatformID OpenCL platform ID
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print info about the device
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and return device capability
|
||||||
|
//!
|
||||||
|
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" int oclGetDevCap(cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Print the device name
|
||||||
|
//!
|
||||||
|
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||||
|
//! @param device OpenCL id of the device
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the first device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of the nth device from the context
|
||||||
|
//!
|
||||||
|
//! @return the id or -1 when out of range
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//! @param device_idx index of the device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Gets the id of device with maximal FLOPS from the context
|
||||||
|
//!
|
||||||
|
//! @return the id
|
||||||
|
//! @param cxGPUContext OpenCL context
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Loads a Program file and prepends the cPreamble to the code.
|
||||||
|
//!
|
||||||
|
//! @return the source string if succeeded, 0 otherwise
|
||||||
|
//! @param cFilename program filename
|
||||||
|
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||||
|
//! @param szFinalLength returned length of the code string
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the binary (PTX) of the program associated with the device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param binary returned code
|
||||||
|
//! @param length length of returned code
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//! @param const char* cPtxFileName optional PTX file name
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||||
|
//!
|
||||||
|
//! @param cpProgram OpenCL program
|
||||||
|
//! @param cdDevice device of interest
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||||
|
|
||||||
|
// Helper function for De-allocating cl objects
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL error string from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclErrorString(cl_int error);
|
||||||
|
|
||||||
|
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
// Compares a sample value against a reference value. On mismatch it logs the
// error together with the reporting file/line, then either hands the error
// code to the cleanup callback or, when no callback is given, closes the log
// and exits the process with that code.
//   iSample     value under test
//   iReference  value iSample is expected to equal
//   pCleanup    optional callback invoked with the error code (may be NULL)
//   cFile       file name printed in the log line
//   iLine       line number printed in the log line
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Equal values mean there is nothing to report.
    if (iReference == iSample)
    {
        return;
    }

    // A mismatching sample of 0 would not look like an error code, so report -9999 instead.
    if (iSample == 0)
    {
        iSample = -9999;
    }

    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // Without a cleanup callback the error code becomes the process exit code.
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }
    else
    {
        pCleanup(iSample);
    }
}
|
||||||
|
|
||||||
|
#endif
|
||||||
238
benchmarks/opencl/transpose/shrQATest.h
Normal file
238
benchmarks/opencl/transpose/shrQATest.h
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_QATEST_H
|
||||||
|
#define SHR_QATEST_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#pragma message ("Note: including time.h")
|
||||||
|
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#include <time.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <time.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRCASECMP _stricmp
|
||||||
|
#else
|
||||||
|
#define STRCASECMP strcasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef STRNCASECMP
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define STRNCASECMP _strnicmp
|
||||||
|
#else
|
||||||
|
#define STRNCASECMP strncasecmp
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Standardized QA Start/Finish for CUDA SDK tests
|
||||||
|
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||||
|
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||||
|
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||||
|
|
||||||
|
// Returns the index in exec_name at which the bare executable name begins,
// i.e. one past the last '/' or '\\' separator, or 0 when there is none.
inline int findExeNameStart(const char *exec_name)
{
    // Scan backwards from the terminator towards (but not including) index 0.
    for (int pos = (int)strlen(exec_name); pos > 0; --pos)
    {
        const char ch = exec_name[pos];
        if (ch == '\\' || ch == '/')
        {
            return pos + 1;
        }
    }

    // Only index 0 is left: a leading separator puts the name at index 1.
    const char first = exec_name[0];
    return (first == '\\' || first == '/') ? 1 : 0;
}
|
||||||
|
|
||||||
|
inline int __shrQAStart(int argc, char **argv)
|
||||||
|
{
|
||||||
|
bool bQATest = false;
|
||||||
|
// First clear the output buffer
|
||||||
|
fflush(stdout);
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
char *string_argv = &argv[i][string_start];
|
||||||
|
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We don't want to print the entire path, so we search for the first
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
return exename_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test outcome codes passed to the shrQAFinish* helpers, which use the value
// as an index into their status-name tables.
enum eQAstatus
{
    QA_FAILED = 0,  // printed as "FAILED"
    QA_PASSED = 1,  // printed as "PASSED"
    QA_WAIVED = 2   // printed as "WAIVED"
};
|
||||||
|
|
||||||
|
// Prints a countdown on stdout and blocks until `seconds` seconds of wall-clock
// time have elapsed, sleeping one second between ticks.
//   seconds  length of the countdown; a value <= 0 prints the banner and returns
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    const time_t deadline = time(0) + seconds;
    for (int count = seconds; time(0) < deadline; count--) {
        fprintf(stdout, "%d...", count);
        // The tick has no newline, so flush explicitly to show it before sleeping.
        fflush(stdout);
        // Use _WIN32, the same macro that guards this header's <windows.h>/<unistd.h> includes.
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||||
|
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
if (!STRCASECMP(string_argv, "qatest")) {
|
||||||
|
bQATest = true;
|
||||||
|
}
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bNoPrompt = true;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bNoPrompt = false;
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
printf("\n"); fflush(stdout);
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
} else {
|
||||||
|
if (!bNoPrompt) {
|
||||||
|
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||||
|
fflush(stdout);
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
bool bQuitInTime = true;
|
||||||
|
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||||
|
|
||||||
|
for (int i=1; i < argc; i++) {
|
||||||
|
int string_start = 0;
|
||||||
|
while (argv[i][string_start] == '-')
|
||||||
|
string_start++;
|
||||||
|
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||||
|
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||||
|
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
if (!STRCASECMP(string_argv, "prompt")) {
|
||||||
|
bQuitInTime = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int exename_start = findExeNameStart(argv[0]);
|
||||||
|
if (bQATest) {
|
||||||
|
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||||
|
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
if (bQuitInTime) {
|
||||||
|
__ExitInTime(3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish(argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||||
|
{
|
||||||
|
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||||
|
|
||||||
|
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
642
benchmarks/opencl/transpose/shrUtils.h
Normal file
642
benchmarks/opencl/transpose/shrUtils.h
Normal file
@@ -0,0 +1,642 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef SHR_UTILS_H
|
||||||
|
#define SHR_UTILS_H
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Generic utilities for NVIDIA GPU Computing SDK
|
||||||
|
// *********************************************************************
|
||||||
|
|
||||||
|
// reminders for output window and build log
|
||||||
|
#ifdef _WIN32
|
||||||
|
#pragma message ("Note: including windows.h")
|
||||||
|
#pragma message ("Note: including math.h")
|
||||||
|
#pragma message ("Note: including assert.h")
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// OS dependent includes
|
||||||
|
#ifdef _WIN32
|
||||||
|
// Headers needed for Windows
|
||||||
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
// Headers needed for Linux
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Other headers needed for both Windows and Linux
|
||||||
|
#include <math.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// Un-comment the following #define to enable profiling code in SDK apps
|
||||||
|
//#define GPU_PROFILING
|
||||||
|
|
||||||
|
// Beginning of GPU Architecture definitions
|
||||||
|
// Maps an SM version (major.minor) to the number of cores per SM using a
// built-in table. Returns the table value, or -1 after printing a notice when
// the version is not listed.
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One table row: SM version packed as 0xMm (hexadecimal; M = major, m = minor)
    // and the matching cores-per-SM count.
    typedef struct {
        int smVersion;
        int coresPerSM;
    } SMCoreEntry;

    static const SMCoreEntry kCoresBySM[] = {
        { 0x10,   8 },  // Tesla generation  (SM 1.0) G80 class
        { 0x11,   8 },  // Tesla generation  (SM 1.1) G8x class
        { 0x12,   8 },  // Tesla generation  (SM 1.2) G9x class
        { 0x13,   8 },  // Tesla generation  (SM 1.3) GT200 class
        { 0x20,  32 },  // Fermi generation  (SM 2.0) GF100 class
        { 0x21,  48 },  // Fermi generation  (SM 2.1) GF10x class
        { 0x30, 192 }   // Kepler generation (SM 3.0) GK10x class
    };
    const int entryCount = (int)(sizeof(kCoresBySM) / sizeof(kCoresBySM[0]));

    // Pack the requested version the same way the table keys are packed.
    const int wanted = (major << 4) + minor;

    for (int row = 0; row < entryCount; ++row)
    {
        if (kCoresBySM[row].smVersion == wanted)
        {
            return kCoresBySM[row].coresPerSM;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||||
|
// end of GPU Architecture definitions
|
||||||
|
|
||||||
|
|
||||||
|
// Defines and enum for use with logging functions
|
||||||
|
// *********************************************************************
|
||||||
|
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||||
|
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||||
|
// Bit flags accepted by the logging helpers' iLogMode argument; values may be
// OR'd together.
enum LOGMODES
{
    LOGCONSOLE = 0x01,  // bit to signal "log to console"
    LOGFILE    = 0x02,  // bit to signal "log to file"
    LOGBOTH    = 0x03,  // convenience union of the first 2 bits: "log to both"
    APPENDMODE = 0x04,  // bit to set "file append" mode instead of "replace mode" on open
    MASTER     = 0x08,  // bit to signal master .csv log output
    ERRORMSG   = 0x10,  // bit to signal "pre-pend Error"
    CLOSELOG   = 0x20   // bit to close log file, if open, after any requested file write
};
|
||||||
|
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||||
|
|
||||||
|
// Standardized boolean
|
||||||
|
// Two-valued boolean type shared by the shr* utilities.
enum shrBOOL { shrFALSE = 0, shrTRUE = 1 };
|
||||||
|
|
||||||
|
// Standardized MAX, MIN and CLAMP
|
||||||
|
// Every argument is parenthesized so that operands containing lower-precedence
// operators (e.g. MAX(x & mask, y)) expand correctly.
// NOTE: arguments are still evaluated more than once; avoid side effects such as i++.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX((a), (b)), (c))    // double sided clip of input a
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))  // single top side clip of input a
|
||||||
|
|
||||||
|
// Error and Exit Handling Macros...
|
||||||
|
// *********************************************************************
|
||||||
|
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||||
|
|
||||||
|
// Short version without Cleanup() callback pointer
|
||||||
|
// Both Input (a) and Reference (b) are specified as args
|
||||||
|
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... extended version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||||
|
|
||||||
|
// Standardized Exit Macro for leaving main()... short version
|
||||||
|
// (Companion Inline Function lower on page)
|
||||||
|
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||||
|
|
||||||
|
// Simple argument checker macro
|
||||||
|
// Returns shrFALSE from the enclosing function unless (a) equals shrTRUE.
// Wrapped in do { } while (0) so the macro acts as a single statement and
// cannot capture a following `else`.
#define ARGCHECK(a) do { if ((a) != shrTRUE) return shrFALSE; } while (0)
|
||||||
|
|
||||||
|
// Define for user-customized error handling
|
||||||
|
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||||
|
|
||||||
|
// Function to deallocate memory allocated within shrUtils
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFree(void* ptr);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Helper function to log standardized information to Console, to File or to both
|
||||||
|
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||||
|
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||||
|
//!
|
||||||
|
//! Automatically opens file and stores handle if needed and not done yet
|
||||||
|
//! Closes file and nulls handle on request
|
||||||
|
//!
|
||||||
|
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||||
|
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||||
|
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||||
|
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||||
|
//! @param 2 dValue:
|
||||||
|
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||||
|
//! Negative val is an error code and this gives error preformatting.
|
||||||
|
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||||
|
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||||
|
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||||
|
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||||
|
//! @param 4... variable args: like printf or fprintf. Must match format specifer type above.
|
||||||
|
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, ...)
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" int shrLog(const char* cFormatString, ...);
|
||||||
|
|
||||||
|
// *********************************************************************
|
||||||
|
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||||
|
// Maintains state for 3 independent counters
|
||||||
|
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||||
|
//!
|
||||||
|
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||||
|
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" double shrDeltaT(int iCounterID);
|
||||||
|
|
||||||
|
// Optional LogFileNameOverride function
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||||
|
|
||||||
|
// Helper function to init data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
// Helper function to print data arrays
|
||||||
|
// *********************************************************************
|
||||||
|
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Find the path for a filename
|
||||||
|
//! @return the path if succeeded, otherwise 0
|
||||||
|
//! @param filename name of the file
|
||||||
|
//! @param executablePath optional absolute path of the executable
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing single precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing double precision floating point data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Read file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the source file
|
||||||
|
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||||
|
//! the data read
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||||
|
unsigned int* len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing single precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||||
|
const float epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing double precision floating point
|
||||||
|
//! data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
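//! @param data pointer to data to write (NOTE(review): declared as const float* although this is the double precision variant -- confirm against the definition)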
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
//! @param epsilon epsilon for comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||||
|
const double epsilon, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned integer data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||||
|
bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Write a data file \filename containing unsigned char / byte data
|
||||||
|
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param filename name of the file to write
|
||||||
|
//! @param data pointer to data to write
|
||||||
|
//! @param len number of data elements in data, -1 on error
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||||
|
unsigned int len, bool verbose = false);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PPM image file (with unsigned char as data element type), padding
|
||||||
|
//! 4th component
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param OutData handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//!
|
||||||
|
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||||
|
unsigned int *w, unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||||
|
//! 4 bytes)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Save PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||||
|
unsigned int w, unsigned int h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Load PGM image file (with unsigned char as data element type)
|
||||||
|
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||||
|
//! @param file name of the image file
|
||||||
|
//! @param data handle to the data read
|
||||||
|
//! @param w width of the image
|
||||||
|
//! @param h height of the image
|
||||||
|
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||||
|
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||||
|
unsigned int *w,unsigned int *h);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Command line arguments: General notes
|
||||||
|
// * All command line arguments begin with '--' followed by the token;
|
||||||
|
// token and value are separated by '='; example --samples=50
|
||||||
|
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||||
|
// (without whitespaces)
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Check if command line argument \a flag_name is given
|
||||||
|
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||||
|
//! otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param flag_name name of command line flag
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||||
|
const char* flag_name);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||||
|
const char* arg_name, int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type unsigned int
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||||
|
const char* arg_name, unsigned int* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type float
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||||
|
const char* arg_name, float* val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument of type string
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val value of the command line argument
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Get the value of a command line argument list whose elements are strings
|
||||||
|
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||||
|
//! is of the requested type, otherwise shrFALSE
|
||||||
|
//! @param argc argc as passed to main()
|
||||||
|
//! @param argv argv as passed to main()
|
||||||
|
//! @param arg_name name of the command line argument
|
||||||
|
//! @param val command line argument list
|
||||||
|
//! @param len length of the list / number of elements
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||||
|
const char* arg_name, char** val,
|
||||||
|
unsigned int* len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||||
|
const unsigned int len);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two integer arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned char arrays
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned char arrays with a tolerance for # of byte errors
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two unsigned char arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||||
|
//! threshold for # pixel errors
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon, const float threshold );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||||
|
//! equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param reference handle to the reference data / gold image
|
||||||
|
//! @param data handle to the computed data
|
||||||
|
//! @param len number of elements in reference and data
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||||
|
const unsigned int len, const float epsilon );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param ref_file filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! @param verboseErrors output details of image mismatch to std::err (NOTE(review): no such parameter in the prototype below -- confirm)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||||
|
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||||
|
//! @param src_file filename for the image to be compared
|
||||||
|
//! @param ref_file filename for the reference data / gold image
|
||||||
|
//! @param epsilon epsilon to use for the comparison
|
||||||
|
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||||
|
//! @param verboseErrors output details of image mismatch to std::err (NOTE(review): no such parameter in the prototype below -- confirm)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||||
|
|
||||||
|
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||||
|
|
||||||
|
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||||
|
|
||||||
|
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||||
|
{
|
||||||
|
if (iReference != iSample)
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||||
|
if (pCleanup != NULL)
|
||||||
|
{
|
||||||
|
pCleanup(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Standardized Exit
|
||||||
|
// *********************************************************************
|
||||||
|
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||||
|
{
|
||||||
|
#ifdef WIN32
|
||||||
|
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#else
|
||||||
|
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||||
|
getchar();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||||
|
}
|
||||||
|
fflush(stderr);
|
||||||
|
exit(iExitCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
108
benchmarks/opencl/transpose/transpose.cl
Normal file
108
benchmarks/opencl/transpose/transpose.cl
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Matrix transpose with OpenCL
|
||||||
|
* Device code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define BLOCK_DIM 16
|
||||||
|
|
||||||
|
// This kernel is optimized to ensure all global reads and writes are coalesced,
|
||||||
|
// and to avoid bank conflicts in shared memory. This kernel is up to 11x faster
|
||||||
|
// than the naive kernel below. Note that the shared memory array is sized to
|
||||||
|
// (BLOCK_DIM+1)*BLOCK_DIM. This pads each row of the 2D block in shared memory
|
||||||
|
// so that bank conflicts do not occur when threads address the array column-wise.
|
||||||
|
__kernel void transpose(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
{
    // Position of this work-item inside its work-group. The local tile is
    // addressed with a row pitch of BLOCK_DIM+1 floats.
    const size_t lx = get_local_id(0);
    const size_t ly = get_local_id(1);

    // Phase 1: every in-range work-item copies one input element into the tile.
    const unsigned int xIn = get_global_id(0);
    const unsigned int yIn = get_global_id(1);
    if ((xIn + offset < width) && (yIn < height))
    {
        block[ly * (BLOCK_DIM + 1) + lx] = idata[yIn * width + xIn + offset];
    }

    // The barrier sits outside both conditionals so that every work-item
    // reaches it before any tile element is read back.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: the group coordinates are swapped for the output position and
    // the tile is read with swapped local indices.
    const unsigned int xOut = get_group_id(1) * BLOCK_DIM + lx;
    const unsigned int yOut = get_group_id(0) * BLOCK_DIM + ly;
    if ((xOut < height) && (yOut + offset < width))
    {
        odata[yOut * height + xOut] = block[lx * (BLOCK_DIM + 1) + ly];
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// This naive transpose kernel suffers from completely non-coalesced writes.
|
||||||
|
// It can be up to 10x slower than the kernel above for large matrices.
|
||||||
|
__kernel void transpose_naive(__global float *odata, __global float* idata, int offset, int width, int height)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    // Work-items that fall outside the matrix do nothing.
    if (!(col + offset < width) || !(row < height))
    {
        return;
    }

    // Element (row, col + offset) of the input lands at (col, row) of the output.
    odata[row + height * col] = idata[col + offset + width * row];
}
|
||||||
|
|
||||||
|
|
||||||
|
// Straight element-wise copy: each in-range work-item copies the element at
// (row, col + offset) from idata to the same position in odata.
__kernel void simple_copy(__global float *odata, __global float* idata, int offset, int width, int height)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    const bool inside = (col + offset < width) && (row < height);
    if (inside)
    {
        const unsigned int idx = col + offset + width * row;
        odata[idx] = idata[idx];
    }
}
|
||||||
|
|
||||||
|
// Copies idata to odata element-for-element, staging each value through the
// work-group's local tile (row pitch BLOCK_DIM+1 floats).
//   odata, idata  - destination / source buffers, indexed as row * width + col
//   offset        - added to the column index
//   width, height - matrix extent used for the bounds check
//   block         - local scratch tile supplied by the host
__kernel void shared_copy(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
{
    const unsigned int xIndex = get_global_id(0);
    const unsigned int yIndex = get_global_id(1);

    const unsigned int index_in = yIndex * width + xIndex + offset;
    const size_t tile_slot = get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0);

    // The load and the store both address index_in, so they share a single
    // bounds predicate. Guarding the store with the transposed bounds
    // (xIndex < height, yIndex + offset < width) is only equivalent for
    // square matrices; otherwise it can store through an out-of-range index
    // or read a tile slot that was never written.
    const bool in_range = (xIndex + offset < width) && (yIndex < height);

    // read the matrix tile into shared memory
    if (in_range)
    {
        block[tile_slot] = idata[index_in];
    }

    // Kept outside the conditionals so every work-item reaches it.
    barrier(CLK_LOCAL_MEM_FENCE);

    if (in_range)
    {
        odata[index_in] = block[tile_slot];
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
// Element-wise copy that indexes both buffers as row + height * (col + offset),
// i.e. neighbouring work-items along dimension 0 are `height` elements apart.
__kernel void uncoalesced_copy(__global float *odata, __global float* idata, int offset, int width, int height)
{
    const unsigned int col = get_global_id(0);
    const unsigned int row = get_global_id(1);

    // Work-items outside the matrix do nothing.
    if (!(col + offset < width) || !(row < height))
    {
        return;
    }

    const unsigned int idx = row + height * (col + offset);
    odata[idx] = idata[idx];
}
|
||||||
38
benchmarks/opencl/transpose/transpose_gold.cpp
Normal file
38
benchmarks/opencl/transpose/transpose_gold.cpp
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Small Matrix transpose with Cuda (Example for a 16x16 matrix)
|
||||||
|
* Reference solution.
|
||||||
|
*/
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// export C interface
|
||||||
|
extern "C"
|
||||||
|
void computeGold( float* reference, float* idata,
|
||||||
|
const unsigned int size_x, const unsigned int size_y );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Compute reference data set
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void
computeGold( float* reference, float* idata,
             const unsigned int size_x, const unsigned int size_y )
{
    // Walk the input in storage order (row after row). `src` is the running
    // input index; `dst` starts at the row number and advances by size_y so
    // that input element (row, col) is written to reference[col * size_y + row].
    unsigned int src = 0;
    for( unsigned int row = 0; row < size_y; ++row)
    {
        unsigned int dst = row;
        for( unsigned int col = 0; col < size_x; ++col)
        {
            reference[dst] = idata[src];
            ++src;
            dst += size_y;
        }
    }
}
|
||||||
|
|
||||||
33
benchmarks/vector/saxpy/Makefile
Normal file
33
benchmarks/vector/saxpy/Makefile
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Builds the saxpy vector benchmark: compiles the runtime sources together
# with the benchmark into an ELF, then derives an Intel-HEX image (objcopy)
# and a full disassembly (objdump) from it.
LIB_PATH = ../../../runtime

# Root of the RISC-V GNU toolchain install. The default is the original
# author's location; override it on other machines, e.g.
#   make RISCV_TOOLCHAIN=/opt/riscv
RISCV_TOOLCHAIN ?= /nethome/ekim79/riscv-gnu-toolchain/drops

COMP = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-gcc

CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/mains/vortex_link.ld -march=rv32imv -mabi=ilp32

DMP = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-objdump
CPY = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-objcopy

NEWLIB = $(LIB_PATH)/newlib/newlib.c
VX_STR = $(LIB_PATH)/startup/vx_start.s
VX_INT = $(LIB_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(LIB_PATH)/io/vx_io.s $(LIB_PATH)/io/vx_io.c
VX_API = $(LIB_PATH)/vx_api/vx_api.c
VX_TEST = $(LIB_PATH)/tests/tests.c
VX_FIO = $(LIB_PATH)/fileio/fileio.s
VX_VEC = vx_vec_saxpy.s #float --> int
LIBS = $(RISCV_TOOLCHAIN)/riscv32-unknown-elf/lib/libc.a $(RISCV_TOOLCHAIN)/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc

VX_MAIN = vx_vec_saxpy

# These targets are command names, not files on disk.
.PHONY: all HEX DUMP ELF

all: HEX DUMP ELF

DUMP: ELF
	$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump

HEX: ELF
	$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex

ELF:
	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
||||||
69
benchmarks/vector/saxpy/vx_vec_saxpy.c
Normal file
69
benchmarks/vector/saxpy/vx_vec_saxpy.c
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include "../../../runtime/intrinsics/vx_intrinsics.h"
|
||||||
|
#include "vx_vec_saxpy.h"
|
||||||
|
|
||||||
|
//---------------------------------------------------------------
|
||||||
|
/* # void saxpy(size_t n, const float a, const float *x, float *y)
|
||||||
|
# ==> convert to int!!
|
||||||
|
# void saxpy(size_t n, const int a, const int *x, int *y)
|
||||||
|
# { size_t i;
|
||||||
|
# for (i=0; i<n; i++) y[i] = a * x[i] + y[i]; } */
|
||||||
|
//---------------------------------------------------------------
|
||||||
|
|
||||||
|
/*
 * Driver for the integer saxpy vector kernel.
 *
 * Fills a[] with values derived from rand(), zeroes b[] and c[], runs
 * vx_vec_saxpy(n, factor, a, b) and then checks every b[i] against
 * a[i] * factor + c[i], where c[] still holds the values b[] had before the
 * call.
 *
 * Returns 0 when all elements match, 1 on a mismatch or allocation failure.
 */
int main()
{
    vx_tmc(1);

    int n = 4; //#define NUM_DATA 65536

    int *a = (int*)malloc(sizeof(int) * n);
    int *b = (int*)malloc(sizeof(int) * n);
    int *c = (int*)malloc(sizeof(int) * n); // pre-call copy of b, used for verification

    // Bail out before touching any buffer if an allocation failed.
    if (a == NULL || b == NULL || c == NULL) {
        printf("\n<saxpy> FAILED: out of memory\n");
        free(a); free(b); free(c);
        return 1;
    }

    int factor = ((float)rand()/(RAND_MAX)) * 100.0;

    for (int i = 0; i < n; ++i) {
        a[i] = ((float)rand()/(RAND_MAX)) * 100.0;
        b[i] = 0;
        c[i] = 0;
    }

#if 1
    printf("saxpy\nfactor: %d\na[%d]: ", factor, n);
    for(int i = 0; i < n; ++i) printf("%d ", a[i]);
#endif

    vx_vec_saxpy(n, factor, a, b);

#if 1
    printf("\nsaxpy\na[%d]: ", n);
    for(int i = 0; i < n; ++i) printf("%d ", a[i]);
    printf("\n\nb[%d]: ", n);
    for(int i = 0; i < n; ++i) printf("%d ", b[i]);
#endif

    // Stop at the first mismatch, but fall through to the common cleanup so
    // the buffers are released on the failure path as well.
    int status = 0;
    for(int i = 0; i < n; ++i)
    {
        if(b[i] != ((a[i] * factor) + c[i]))
        {
            printf("\n<saxpy> FAILED at <index: %d>! \n", i);
            status = 1;
            break;
        }
    }

    if (status == 0)
    {
        printf("\nPASSED.......................... <saxpy> \n");
    }

    free(a); free(b); free(c);

    // As before, vx_tmc(0) is only issued on the success path.
    if (status != 0)
    {
        return status;
    }

    vx_tmc(0);

    return 0;
}
|
||||||
92950
benchmarks/vector/saxpy/vx_vec_saxpy.dump
Normal file
92950
benchmarks/vector/saxpy/vx_vec_saxpy.dump
Normal file
File diff suppressed because it is too large
Load Diff
BIN
benchmarks/vector/saxpy/vx_vec_saxpy.elf
Executable file
BIN
benchmarks/vector/saxpy/vx_vec_saxpy.elf
Executable file
Binary file not shown.
12
benchmarks/vector/saxpy/vx_vec_saxpy.h
Normal file
12
benchmarks/vector/saxpy/vx_vec_saxpy.h
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void vx_vec_saxpy(int n, int scalar, int* a, int* b);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
5697
benchmarks/vector/saxpy/vx_vec_saxpy.hex
Normal file
5697
benchmarks/vector/saxpy/vx_vec_saxpy.hex
Normal file
File diff suppressed because it is too large
Load Diff
26
benchmarks/vector/saxpy/vx_vec_saxpy.s
Normal file
26
benchmarks/vector/saxpy/vx_vec_saxpy.s
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
.type vx_vec_saxpy, @function
.global vx_vec_saxpy
# void vx_vec_saxpy(int n, int scalar, int *a, int *b)
# { for (int i = 0; i < n; i++) { b[i] = scalar * a[i] + b[i]; } }
#
# register arguments:
# a0 n       (elements still to process; counts down to 0)
# a1 scalar
# a2 a       (advanced past the elements already processed)
# a3 b       (advanced past the elements already processed)
# scratch: a4, v0, v1
vx_vec_saxpy:
loop:
    # vsetvli must be re-issued on every pass: it yields the number of
    # elements this pass handles out of the a0 still remaining. Below, a4 is
    # converted to a byte stride, so it cannot be reused as a count.
    vsetvli a4, a0, e32
    vlw.v v0, (a2)          # v0 = a[...]
    sub a0, a0, a4          # remaining -= elements handled
    slli a4, a4, 2          # elements -> bytes (4 bytes per int)
    add a2, a2, a4          # bump a
    vlw.v v1, (a3)          # v1 = b[...]
    vmul.vx v0, v0, a1      # v0 = scalar * a[...]
    vadd.vv v1, v0, v1      # v1 = scalar * a[...] + b[...]
    vsw.v v1, (a3)          # store back into b
    add a3, a3, a4          # bump b
    bnez a0, loop
    ret
|
||||||
33
benchmarks/vector/sgemm_nn/Makefile
Normal file
33
benchmarks/vector/sgemm_nn/Makefile
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Builds the sgemm_nn vector benchmark: compiles the runtime sources together
# with the benchmark into an ELF, then derives an Intel-HEX image (objcopy)
# and a full disassembly (objdump) from it.
LIB_PATH = ../../../runtime

# Root of the RISC-V GNU toolchain install. The default is the original
# author's location; override it on other machines, e.g.
#   make RISCV_TOOLCHAIN=/opt/riscv
RISCV_TOOLCHAIN ?= /home/priya/dev/riscv_vec/riscv-gnu

COMP = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-gcc

CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/mains/vortex_link.ld -march=rv32imv -mabi=ilp32

DMP = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-objdump
CPY = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-objcopy

NEWLIB = $(LIB_PATH)/newlib/newlib.c
VX_STR = $(LIB_PATH)/startup/vx_start.s
VX_INT = $(LIB_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(LIB_PATH)/io/vx_io.s $(LIB_PATH)/io/vx_io.c
VX_API = $(LIB_PATH)/vx_api/vx_api.c
VX_TEST = $(LIB_PATH)/tests/tests.c
VX_FIO = $(LIB_PATH)/fileio/fileio.s
VX_VEC = vx_vec_sgemm_nn.s #float --> int
LIBS = $(RISCV_TOOLCHAIN)/riscv32-unknown-elf/lib/libc.a $(RISCV_TOOLCHAIN)/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc

VX_MAIN = vx_vec_sgemm_nn

# These targets are command names, not files on disk.
.PHONY: all HEX DUMP ELF

all: HEX DUMP ELF

DUMP: ELF
	$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump

HEX: ELF
	$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex

ELF:
	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
||||||
38
benchmarks/vector/sgemm_nn/test_asm.s
Normal file
38
benchmarks/vector/sgemm_nn/test_asm.s
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
.type vx_vec_sgemm_nn, @function
.global vx_vec_sgemm_nn
# One strip of an integer matrix multiply-accumulate on row-major int arrays
# with a row stride of ldc elements.
#
# register arguments:
#   a0 = first column handled by this call
#   a1 = column of a[] / row of b[]
#   a2 = row of a[] and of c[]
#   a3 = a, a4 = b, a5 = c
#   a6 = ldc (also used as the requested vector length)
#
# Effect, for j in [0, vl):
#   c[a2*ldc + a0 + j] += a[a2*ldc + a1] * b[a1*ldc + a0 + j]
#
# Only t0-t6 and v0, v2, v3 are used as scratch. x1..x4 (ra, sp, gp, tp) are
# left alone: ret needs ra intact and the caller needs sp/gp/tp.
vx_vec_sgemm_nn:
    vsetvli t0, a6, e32
    mul t1, a6, a2          # a2*ldc
    add t2, t1, a1          # a1 + a2*ldc
    slli t2, t2, 2          # element index -> byte offset (4-byte ints)
    add a3, t2, a3          # &a[a1 + a2*ldc]
    lw t3, (a3)             # scalar multiplier

    mul t4, a1, a6          # a1*ldc
    add t5, a0, t4          # a0 + a1*ldc
    slli t5, t5, 2
    add a4, t5, a4          # &b[a0 + a1*ldc]

    vlw.v v0, (a4)
    vmul.vx v2, v0, t3      # multiply the vector that was just loaded

    add t0, a0, t1          # a0 + a2*ldc (t1 still holds a2*ldc)
    slli t0, t0, 2
    add a5, t0, a5          # &c[a0 + a2*ldc]

    vlw.v v3, (a5)          # c
    vadd.vv v3, v3, v2
    vsw.v v3, (a5)          # write the accumulated strip back

    ret
|
||||||
110
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.c
Normal file
110
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.c
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "../../../runtime/intrinsics/vx_intrinsics.h"
|
||||||
|
#include "vx_vec_sgemm_nn.h"
|
||||||
|
|
||||||
|
//---------------------------------------------------------------
|
||||||
|
/* # void sgemm_nn(size_t n, size_t m, size_t k,
|
||||||
|
# int *a, // m * k matrix size_t lda,
|
||||||
|
# int *b, // k * n matrix size_t ldb,
|
||||||
|
# int *c, // m * n matrix size_t ldc)
|
||||||
|
# c += a*b (alpha=1, no transpose on input matrices)
|
||||||
|
# matrices stored in C row-major order */
|
||||||
|
//---------------------------------------------------------------
|
||||||
|
|
||||||
|
/*
 * Driver for the integer sgemm_nn vector kernel.
 *
 * Builds a1 (m x k, a1[i] = i), b1 (k x n, all ones) and zeroed c1/d1
 * (m x n). c1 is accumulated by repeated calls to vx_vec_sgemm_nn, d1 by the
 * scalar reference loop below; the two are then compared element by element.
 *
 * Returns 0 when c1 equals d1, 1 on a mismatch or allocation failure.
 */
int main()
{
    vx_tmc(1);

    int m = 4;
    int k = 4;
    int n = 4;

    int* a1 = (int*)malloc(sizeof(int) * m * k);
    int* b1 = (int*)malloc(sizeof(int) * k * n);
    int* c1 = (int*)malloc(sizeof(int) * m * n);
    int* d1 = (int*)malloc(sizeof(int) * m * n); // scalar reference, used for verification

    // Bail out before touching any buffer if an allocation failed.
    if (a1 == NULL || b1 == NULL || c1 == NULL || d1 == NULL) {
        printf("\n<sgemm_nn> FAILED: out of memory\n");
        free(a1); free(b1); free(c1); free(d1);
        return 1;
    }

    for (int i = 0; i < (m * k); ++i) a1[i] = i;
    for (int i = 0; i < (k * n); ++i) b1[i] = 1;
    for (int i = 0; i < (m * n); ++i) c1[i] = 0;
    for (int i = 0; i < (m * n); ++i) d1[i] = 0;

#if 1
    printf("sgemm_nn\na[%d]:", m*k);
    for (int i = 0; i < m*k; ++i) {
        if(!(i % k)) printf("\n");
        printf("%d ", a1[i]);
    }
    printf("\n\nb[%d]:", k*n);
    for (int i = 0; i < k*n; ++i) {
        if (!(i % n)) printf("\n");
        printf("%d ", b1[i]);
    }
#endif

    int ldc = 4; //64;
    int vsize = 4;

    // Each call handles one strip of vsize columns starting at column i.
    for (int r = 0; r < m; r++) {
        for (int c = 0; c < n; c++) {
            for (int i = 0; i < k;) {
                vx_vec_sgemm_nn(i, r, c, a1, b1, c1, ldc, vsize);
                i = i + vsize;
            }
        }
    }

#if 1
    printf("\n\nc[%d]:", m*n);
    for (int i = 0; i < m*n; ++i) {
        if (!(i % n)) printf("\n");
        printf("%d ", c1[i]);
    }
#endif

    // Scalar reference using the same index expressions as the kernel calls.
    for (int r = 0; r < m; r++) {
        for (int c = 0; c < n; c++) {
            for (int i = 0; i < k; i++) {
                d1[c*ldc+i] += a1[c*ldc+r]*b1[i + (r*ldc)];
            }
        }
    }

#if 1
    printf("\n\nc[%d]:\n", m*n);
    for(int i = 0; i < m; ++i) {
        for(int j = 0; j < n; ++j) {
            // d1 has n columns, so the row stride is n.
            printf("%d ", d1[i*n+j]);
        }
        printf("\n");
    }
#endif

    // Stop at the first mismatch, but fall through to the common cleanup so
    // all four buffers are released on the failure path as well.
    int status = 0;
    for(int i = 0; i < m*n; ++i)
    {
        if(c1[i] != d1[i])
        {
            printf("\n<sgemm_nn> FAILED at <index: %d>! \n", i);
            status = 1;
            break;
        }
    }

    if (status == 0)
    {
        printf("\nPASS.......................... <sgemm_nn> \n");
    }

    free(a1); free(b1); free(c1); free(d1);

    // As before, vx_tmc(0) is only issued on the success path.
    if (status != 0)
    {
        return status;
    }

    vx_tmc(0);

    return 0;
}
|
||||||
87276
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.dump
Normal file
87276
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.dump
Normal file
File diff suppressed because it is too large
Load Diff
BIN
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.elf
Executable file
BIN
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.elf
Executable file
Binary file not shown.
13
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.h
Normal file
13
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.h
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc);
|
||||||
|
void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int* b1, int* c1, int ldc, int vsize);
|
||||||
|
//void vx_vec_sgemm_nn(int n, int* a1, int* b1, int* c1);
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
5644
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.hex
Normal file
5644
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.hex
Normal file
File diff suppressed because it is too large
Load Diff
42
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.s
Normal file
42
benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.s
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
.type vx_vec_sgemm_nn, @function
.global vx_vec_sgemm_nn
# One strip of an integer matrix multiply-accumulate on row-major int arrays
# with a row stride of ldc elements. The driver invokes it as
#   vx_vec_sgemm_nn(i, r, c, a1, b1, c1, ldc, vsize);
#
# register arguments:
#   a0 = i      first column handled by this call
#   a1 = r
#   a2 = c
#   a3 = a1 array, a4 = b1 array, a5 = c1 array
#   a6 = ldc
#   a7 = vsize  requested vector length in elements
#
# Effect, for j in [0, vl):
#   c1[c*ldc + i + j] += a1[c*ldc + r] * b1[r*ldc + i + j]
#
# scratch: t0-t5, v0-v2
vx_vec_sgemm_nn:
    vsetvli t0, a7, e32
    mul t1, a6, a2          # c*ldc
    add t2, t1, a1          # r + c*ldc
    slli t2, t2, 2          # element index -> byte offset (4-byte ints)
    add a3, t2, a3          # &a1[r + c*ldc]
    lw t3, (a3)             # scalar multiplier

    mul t4, a1, a6          # r*ldc
    add t5, a0, t4          # i + r*ldc
    slli t5, t5, 2
    add a4, t5, a4          # &b1[i + r*ldc]

    vlw.v v0, (a4)
    vmul.vx v1, v0, t3

    add t0, a0, t1          # i + c*ldc (t1 still holds c*ldc)
    slli t0, t0, 2
    add a5, t0, a5          # &c1[i + c*ldc]

    vlw.v v2, (a5)          # current c1 strip
    vadd.vv v2, v2, v1
    vsw.v v2, (a5)

    ret
|
||||||
41
benchmarks/vector/vecadd/Makefile
Normal file
41
benchmarks/vector/vecadd/Makefile
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
# Builds the vecadd vector benchmark: compiles the runtime sources together
# with the benchmark into an ELF, then derives an Intel-HEX image (objcopy)
# and a full disassembly (objdump) from it.
LIB_PATH = ../../../runtime

# Root of the RISC-V GNU toolchain install. The default is the original
# author's location; override it on other machines, e.g.
#   make RISCV_TOOLCHAIN=/opt/riscv
RISCV_TOOLCHAIN ?= /nethome/ekim79/riscv-gnu-toolchain/drops

COMP = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-gcc

CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/mains/vortex_link.ld -march=rv32imv -mabi=ilp32

DMP = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-objdump
CPY = $(RISCV_TOOLCHAIN)/bin/riscv32-unknown-elf-objcopy

NEWLIB = $(LIB_PATH)/newlib/newlib.c
VX_STR = $(LIB_PATH)/startup/vx_start.s
VX_INT = $(LIB_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(LIB_PATH)/io/vx_io.s $(LIB_PATH)/io/vx_io.c
VX_API = $(LIB_PATH)/vx_api/vx_api.c
VX_TEST = $(LIB_PATH)/tests/tests.c
VX_FIO = $(LIB_PATH)/fileio/fileio.s
# Vector kernel linked into the benchmark. Earlier alternatives are kept for
# reference; to build one, point VX_VEC1 at it.
VX_VEC1 = vx_vec_vvaddint32.s
#VX_VEC2 = vx_vec_saxpy.s #float --> int
#VX_VEC3 = vx_vec_sgemm.s #float --> int
#VX_VEC4 = vx_vec_vsadd.s
#VX_VEC5 = vx_vec_memcpy.s
LIBS = $(RISCV_TOOLCHAIN)/riscv32-unknown-elf/lib/libc.a $(RISCV_TOOLCHAIN)/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc

VX_MAIN = vx_vec_vecadd

# These targets are command names, not files on disk.
.PHONY: all HEX DUMP ELF

all: HEX DUMP ELF

DUMP: ELF
	$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump

HEX: ELF
	$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex

ELF:
	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC1) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
||||||
57
benchmarks/vector/vecadd/vx_vec_vecadd.c
Normal file
57
benchmarks/vector/vecadd/vx_vec_vecadd.c
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "../../../runtime/intrinsics/vx_intrinsics.h"
|
||||||
|
#include "vx_vec_vecadd.h"
|
||||||
|
|
||||||
|
//---------------------------------------------------------------
|
||||||
|
/* vvaddint32
|
||||||
|
* # vector-vector add routine of 32-bit integers
|
||||||
|
* # void vvaddint32(size_t n, const int*x, const int*y, int*z)
|
||||||
|
* # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } } */
|
||||||
|
//---------------------------------------------------------------
|
||||||
|
|
||||||
|
/*
 * Driver for the 32-bit integer vector-add kernel.
 *
 * Fills a[i] = 2*i and b[i] = 2*i + 1, zeroes c[], runs
 * vx_vec_vvaddint32(n, a, b, c) and checks that every c[i] equals
 * a[i] + b[i].
 *
 * Returns 0 when all elements match, 1 on a mismatch or allocation failure.
 */
int main()
{
    vx_tmc(1);

    int n = 4; //SIZE

    int *a = (int*)malloc(sizeof(int) * n);
    int *b = (int*)malloc(sizeof(int) * n);
    int *c = (int*)malloc(sizeof(int) * n);

    // Bail out before touching any buffer if an allocation failed.
    if (a == NULL || b == NULL || c == NULL) {
        printf("\n<vddint32> FAILED: out of memory\n");
        free(a); free(b); free(c);
        return 1;
    }

    // Initialize values for array members.
    for (int i = 0; i < n; ++i) {
        a[i] = i * 2 + 0;
        b[i] = i * 2 + 1;
        c[i] = 0;
    }

#if 0
    printf("vvaddint...\na[%d]: ", n);
    for(int i = 0; i < n; ++i) printf("%d ", a[i]);
    printf("\nb[%d]: ", n);
    for(int i = 0; i < n; ++i) printf("%d ", b[i]);
    printf("\nc[%d] = a[%d] + b[%d]: ", n, n, n);
    for(int i = 0; i < n; ++i) printf("%d ", c[i]);
#endif

    vx_vec_vvaddint32(n, a, b, c);

    // Stop at the first mismatch, but fall through to the common cleanup so
    // the buffers are released on the failure path as well.
    int status = 0;
    for(int i = 0; i < n; ++i)
    {
        if(c[i] != (a[i]+b[i]))
        {
            printf("\n<vddint32> FAILED at <index: %d>! \n", i);
            status = 1;
            break;
        }
    }

    if (status == 0)
    {
        printf("\nPASSED.......................... <vddint32> \n");
    }

    free(a); free(b); free(c);

    // As before, vx_tmc(0) is only issued on the success path.
    if (status != 0)
    {
        return status;
    }

    vx_tmc(0);

    return 0;
}
|
||||||
86984
benchmarks/vector/vecadd/vx_vec_vecadd.dump
Normal file
86984
benchmarks/vector/vecadd/vx_vec_vecadd.dump
Normal file
File diff suppressed because it is too large
Load Diff
BIN
benchmarks/vector/vecadd/vx_vec_vecadd.elf
Executable file
BIN
benchmarks/vector/vecadd/vx_vec_vecadd.elf
Executable file
Binary file not shown.
17
benchmarks/vector/vecadd/vx_vec_vecadd.h
Normal file
17
benchmarks/vector/vecadd/vx_vec_vecadd.h
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void vx_vec_vvaddint32(int n, int* a, int* b, int *c);
|
||||||
|
//void vx_vec_vsadd(int n, int* a, int scalar);
|
||||||
|
//void vx_vec_memcpy(int* a, int* b, int n);
|
||||||
|
//void vx_vec_saxpy(int n, int scalar, int* a, int* b);
|
||||||
|
//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc);
|
||||||
|
//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int* b1, int* c1);
|
||||||
|
//void vx_vec_sgemm_nn(int n, int* a1, int* b1, int* c1);
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
5567
benchmarks/vector/vecadd/vx_vec_vecadd.hex
Normal file
5567
benchmarks/vector/vecadd/vx_vec_vecadd.hex
Normal file
File diff suppressed because it is too large
Load Diff
22
benchmarks/vector/vecadd/vx_vec_vvaddint32.s
Normal file
22
benchmarks/vector/vecadd/vx_vec_vvaddint32.s
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
.type vx_vec_vvaddint32, @function
.global vx_vec_vvaddint32
# vector-vector add routine of 32-bit integers
# void vx_vec_vvaddint32(int n, int *a, int *b, int *c)
# { for (int i = 0; i < n; i++) { c[i] = a[i] + b[i]; } }
#
# a0 = n, a1 = a, a2 = b, a3 = c
# scratch: t0, v0-v2
# Non-vector instructions are indented
vx_vec_vvaddint32:
loop:
# vsetvli is inside the loop: it must be re-evaluated on every pass because
# t0 is turned into a byte stride below and a0 shrinks each pass.
vsetvli t0, a0, e32         # Set vector length based on 32-bit vectors
vlw.v v0, (a1)              # Get first vector
    sub a0, a0, t0          # Decrement number done
    slli t0, t0, 2          # Multiply number done by 4 bytes
    add a1, a1, t0          # Bump pointer
vlw.v v1, (a2)              # Get second vector
    add a2, a2, t0          # Bump pointer
vadd.vv v2, v0, v1          # Sum vectors
vsw.v v2, (a3)              # Store result
    add a3, a3, t0          # Bump pointer
    bnez a0, loop           # Loop back
    ret                     # Finished
|
||||||
22
runtime/mains/vector_test/vx_vec_original.s
Normal file
22
runtime/mains/vector_test/vx_vec_original.s
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
.type vx_vec_test, @function
.global vx_vec_test
# vector-vector add routine of 32-bit integers
# void vvaddint32(size_t n, const int*x, const int*y, int*z)
# { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
#
# a0 = n, a1 = x, a2 = y, a3 = z
# scratch: t0, v0-v2
# Non-vector instructions are indented
vx_vec_test:
loop:
# vsetvli is inside the loop: it must be re-evaluated on every pass because
# t0 is turned into a byte stride below and a0 shrinks each pass.
vsetvli t0, a0, e32         # Set vector length based on 32-bit vectors
vlw.v v0, (a1)              # Get first vector
    sub a0, a0, t0          # Decrement number done
    slli t0, t0, 2          # Multiply number done by 4 bytes
    add a1, a1, t0          # Bump pointer
vlw.v v1, (a2)              # Get second vector
    add a2, a2, t0          # Bump pointer
vadd.vv v2, v0, v1          # Sum vectors
vsw.v v2, (a3)              # Store result
    add a3, a3, t0          # Bump pointer
    bnez a0, loop           # Loop back
    ret                     # Finished
|
||||||
30
rvvector/basic/_1_vx_vec.s
Normal file
30
rvvector/basic/_1_vx_vec.s
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
.type vx_vec_test, @function
.global vx_vec_test
# Scalar smoke test: stores the constant 7 into the 32-bit word addressed
# by a0. a1 is overwritten as scratch.
vx_vec_test:
    li a1, 7                # value to store
    sw a1, 0(a0)            # *a0 = 7
    ret
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# slli a0, a0, 2
|
||||||
|
# add a0, a0, a3
|
||||||
|
# vmv.v.x vv0, a2
|
||||||
|
# # vsplat4 vv0, a2
|
||||||
|
# stripmine_loop:
|
||||||
|
# vlb4 vv1, (a1)
|
||||||
|
# vcmpez4 vp0, vv1
|
||||||
|
# !vp0 vlw4 vv1, (a3)
|
||||||
|
# !vp0 vlw4 vv2, (a4)
|
||||||
|
# !vp0 vfma4 vv1, vv0, vv1, vv2
|
||||||
|
# !vp0 vsw4 vv1, (a4)
|
||||||
|
# addi a1, a1, 4
|
||||||
|
# addi a3, a3, 16
|
||||||
|
# addi a4, a4, 16
|
||||||
|
# bleu a3, a0, stripmine_loop
|
||||||
|
# handle edge cases
|
||||||
|
# when (n % 4) != 0 ...
|
||||||
32
rvvector/basic/_1_vx_vector_main.c
Normal file
32
rvvector/basic/_1_vx_vector_main.c
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
|
||||||
|
#include <stdio.h>
#include <stdlib.h>

#include "../../runtime/intrinsics/vx_intrinsics.h"
#include "vx_vec.h"
|
||||||
|
|
||||||
|
/*
 * Minimal smoke test: allocates one int, sets it to 5, prints it, passes its
 * address to vx_vec_test and prints the value again.
 *
 * Returns 0 normally, 1 if the allocation fails.
 */
int main()
{
    vx_tmc(1);

    int * a = malloc(sizeof *a);
    // Do not write through a failed allocation.
    if (a == NULL)
    {
        printf("Value of a: allocation failed\n");
        return 1;
    }

    *a = 5;
    printf("Value of a: %d\n", *a);

    vx_vec_test(a);

    printf("Value of a: %d\n", *a);

    free(a);

    vx_tmc(0);

    return 0;
}
|
||||||
91
rvvector/basic/__vx_vector_main.c
Normal file
91
rvvector/basic/__vx_vector_main.c
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "../../runtime/intrinsics/vx_intrinsics.h"
|
||||||
|
#include "vx_vec.h"
|
||||||
|
|
||||||
|
/*
 * Driver for the vx_vec_test() assembly routine.
 *
 * Reference for the routine under test (vector-vector add of 32-bit integers):
 *
 *     void vvaddint32(size_t n, const int *x, const int *y, int *z)
 *     { for (size_t i = 0; i < n; i++) { z[i] = x[i] + y[i]; } }
 *
 *     a0 = n, a1 = x, a2 = y, a3 = z
 *
 * Exactly one of the #if sections below is meant to be enabled at a time.
 * Returns 0 on success, 1 if an allocation fails.
 */
int main()
{
	vx_tmc(1);
#if 1
	int n = 5;
	int *a = malloc((size_t)n * sizeof *a);
	int *b = malloc((size_t)n * sizeof *b);
	int *c = malloc((size_t)n * sizeof *c);
	/*
	 * d is passed where the routine expects the n-element x operand.
	 * It used to be an uninitialized pointer that was written through
	 * (undefined behaviour); give it real storage for n elements.
	 */
	int *d = malloc((size_t)n * sizeof *d);

	if (a == NULL || b == NULL || c == NULL || d == NULL)
	{
		printf("malloc failed\n");
		free(a);
		free(b);
		free(c);
		free(d);
		return 1;
	}

	for (int i = 0; i < n; ++i)
	{
		a[i] = b[i] = c[i] = d[i] = 1;
	}

	for (int i = 0; i < n; ++i) printf("%d, ", a[i]);
	printf("\n");

	vx_vec_test(n, d, b, c);

	printf("(after: n = %d, %d)\n", n, *d);
	for (int i = 0; i < n; ++i) printf("%d, ", a[i]);

	free(a);
	free(b);
	free(c);
	free(d);
#endif
#if 0
	/* Variant: single-pointer form of vx_vec_test on a 10-element buffer. */
	int * a = malloc(sizeof(int) * 10);
	for(int i = 0; i < 10; ++i) a[i] = 5;

	for(int i = 0; i < 10; ++i)
		printf("%d, ", a[i]);

	vx_vec_test(a);
	//vx_vec_test(2, a, a, a);

	printf("after--------\n");
	for(int i = 0; i < 10; ++i)
		printf("%d, ", a[i]);
#endif
#if 0
	/* Variant: c starts at 0 so the effect of the add on c[0] is visible. */
	int n = 5;
	int *a = (int*)malloc(sizeof(int) * 5);
	int *b = (int*)malloc(sizeof(int) * 5);
	int *c = (int*)malloc(sizeof(int) * 5);

	for(int i = 0; i < n; ++i)
	{
		a[i] = 1;
		b[i] = 1;
		c[i] = 0;
	}

	printf("Value of a: %d, b: %d, c: %d, n: %d\n", a[0], b[0], c[0], n);
	vx_vec_test(n, a, b, c);
	printf("Value of a: %d, b: %d, c: %d, n: %d\n", a[0], b[0], c[0], n);
#endif

	vx_tmc(0);
	return 0;
}
|
||||||
27
rvvector/basic/vx_vec_main.c
Normal file
27
rvvector/basic/vx_vec_main.c
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
#include "../../runtime/intrinsics/vx_intrinsics.h"
|
||||||
|
#include "vx_vec.h"
|
||||||
|
|
||||||
|
/*
 * Runs vx_vec_test(n, a, b, c) on three 8-element int arrays that are all
 * initialised to 1, then prints every element of c.
 *
 * Returns 0 on success, 1 if an allocation fails.
 */
int main()
{
	vx_tmc(1);
	printf("----------------hello!!! \n");

	int n = 8;
	int *a = malloc((size_t)n * sizeof *a);
	int *b = malloc((size_t)n * sizeof *b);
	int *c = malloc((size_t)n * sizeof *c);

	/* Bail out before touching the buffers if any allocation failed. */
	if (a == NULL || b == NULL || c == NULL)
	{
		printf("malloc failed\n");
		free(a);
		free(b);
		free(c);
		return 1;
	}

	printf("hello!!! \n");

	for (int i = 0; i < n; ++i)
	{
		a[i] = b[i] = c[i] = 1;
	}

	vx_vec_test(n, a, b, c);

	for (int i = 0; i < n; ++i)
		printf("%d ", c[i]);

	free(a);
	free(b);
	free(c);

	vx_tmc(0);
	return 0;
}
|
||||||
166
rvvector/benchmark_temp/1
Normal file
166
rvvector/benchmark_temp/1
Normal file
@@ -0,0 +1,166 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "../../runtime/intrinsics/vx_intrinsics.h"
|
||||||
|
#include "vx_vec_benchmark.h"
|
||||||
|
|
||||||
|
/*
 * Benchmark driver for the vector assembly kernels. Each kernel has its own
 * #if section; only the saxpy section is enabled. Every section fills the
 * input arrays, prints them, calls the kernel and checks the result against
 * a scalar reference expression.
 *
 * Returns 0 when the enabled checks pass, 1 on the first mismatch or if an
 * allocation fails.
 */
int main()
{
	vx_tmc(1);

	int n = 65536;
	int scalar = 10;

	int *a = malloc((size_t)n * sizeof *a);
	int *b = malloc((size_t)n * sizeof *b);
	int *c = malloc((size_t)n * sizeof *c);

	if (a == NULL || b == NULL || c == NULL)
	{
		printf("malloc failed\n");
		free(a);
		free(b);
		free(c);
		return 1;
	}

	for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 5; }

#if 0
	//---------------------------------------------------------------
	/* vvaddint32: vector-vector add of 32-bit integers
	 *   void vvaddint32(size_t n, const int *x, const int *y, int *z)
	 *   { for (size_t i = 0; i < n; i++) { z[i] = x[i] + y[i]; } } */
	printf("vvaddint...\na[%d]: ", n);
	for (int i = 0; i < n; ++i) printf("%d ", a[i]);
	printf("\nb[%d]: ", n);
	for (int i = 0; i < n; ++i) printf("%d ", b[i]);
	printf("\nc[%d] = a[%d] + b[%d]: ", n, n, n);
	for (int i = 0; i < n; ++i) printf("%d ", c[i]);

	vx_vec_vvaddint32(n, a, b, c);

	for (int i = 0; i < n; ++i)
	{
		if (c[i] != (a[i] + b[i]))
		{
			printf("\n<vddint32> failed at <index: %d>! \n", i);
			return 1;
		}
	}
	printf("\nPASSED.......................... <vddint32> \n");
#endif
#if 0
	//---------------------------------------------------------------
	/* vector-scalar add
	 *   for (i = 0; i < N; i++) { C[i] = A[i] + B; }   // 32-bit ints */
	for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 1; }
	printf("vsadd...scalar:%d\na[%d]: ", scalar, n);
	for (int i = 0; i < n; ++i) printf("%d \n", a[i]);
	printf("\nb: %d", scalar);

	vx_vec_vsadd(n, a, scalar);

	for (int i = 0; i < n; ++i)
	{
		/* b still holds the pre-call value of a, so the expected result
		 * of an add is b[i] + scalar (the check used to multiply). */
		if (a[i] != (b[i] + scalar))
		{
			printf("\n<vsadd> failed at <index: %d>! \n", i);
			return 1;
		}
	}
	printf("\nPASSED.......................... <vsadd> \n");
#endif
#if 0
	//---------------------------------------------------------------
	/* memory copy
	 *   void *memcpy(void *dest, const void *src, size_t n) */
	for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; }
	printf("memcpy\na[%d]: ", n);
	for (int i = 0; i < n; ++i) printf("%d \n", a[i]);
	printf("\nb[%d]: ", n);
	for (int i = 0; i < n; ++i) printf("%d \n", b[i]);

	vx_vec_memcpy(a, b, n);

	for (int i = 0; i < n; ++i)
	{
		if (a[i] != b[i])
		{
			printf("\n<memcpy> failed at <index: %d>! \n", i);
			return 1;
		}
	}
	printf("\nPASSED.......................... <memcpy> \n");
#endif
#if 1
	//---------------------------------------------------------------
	/* saxpy, converted from float to int:
	 *   void saxpy(size_t n, const int a, const int *x, int *y)
	 *   { for (size_t i = 0; i < n; i++) y[i] = a * x[i] + y[i]; } */
	/* c mirrors the initial contents of b so the result can be checked
	 * after b has been overwritten. */
	for (int i = 0; i < n; ++i) { a[i] = 4; b[i] = 2; c[i] = 2; }

	printf("saxpy\na[%d]: ", n);
	for (int i = 0; i < n; ++i) printf("%d \n", a[i]);
	printf("\nb[%d]: ", n);
	for (int i = 0; i < n; ++i) printf("%d \n", b[i]);

	vx_vec_saxpy(n, scalar, a, b);

	printf("saxpy\na[%d]: ", n);
	for (int i = 0; i < n; ++i) printf("%d \n", a[i]);
	printf("\nb[%d]: ", n);
	for (int i = 0; i < n; ++i) printf("%d \n", b[i]);

	for (int i = 0; i < n; ++i)
	{
		if (b[i] != ((a[i] * scalar) + c[i]))
		{
			printf("\n<saxpy> failed at <index: %d>! \n", i);
			return 1;
		}
	}
	printf("\nPASSED.......................... <saxpy> \n");
#endif
#if 0
	//---------------------------------------------------------------
	/* sgemm_nn: c += a * b (alpha = 1, no transpose on the inputs),
	 * matrices stored in C row-major order.
	 *   a: m x k (leading dimension lda)
	 *   b: k x n (leading dimension ldb)
	 *   c: m x n (leading dimension ldc) */
	{
		/* Own scope: this section uses a matrix dimension n that differs
		 * from the array length n declared at the top of main. */
		int m = 8;
		int k = 8;
		int n = 8;
		/* NOTE(review): leading dimensions of 4 look too small for
		 * 8-column row-major matrices - confirm against the kernel. */
		int lda = 4;
		int ldb = 4;
		int ldc = 4;

		/* Allocate whole matrices; sizeof(m * k) would only be the size
		 * of a single int. */
		int *a1 = malloc((size_t)(m * k) * sizeof *a1);
		int *b1 = malloc((size_t)(k * n) * sizeof *b1);
		int *c1 = malloc((size_t)(m * n) * sizeof *c1);

		if (a1 == NULL || b1 == NULL || c1 == NULL)
		{
			printf("malloc failed\n");
			free(a1);
			free(b1);
			free(c1);
			return 1;
		}

		for (int i = 0; i < (m * k); ++i) a1[i] = 1;
		for (int i = 0; i < (k * n); ++i) b1[i] = 1;
		for (int i = 0; i < (m * n); ++i) c1[i] = 1;

		printf("sgemm_nn\na[%d]: ", n);
		for (int i = 0; i < n; ++i) printf("%d \n", a1[i]);
		printf("\nb[%d]: ", n);
		for (int i = 0; i < n; ++i) printf("%d \n", b1[i]);

		vx_vec_sgemm_nn(n, m, k, a1, lda, b1, ldb, c1, ldc);

		/* No result check has been written for this kernel yet. */
		printf("\nNOT TESTED.......................... <sgemm_nn> \n");

		free(a1);
		free(b1);
		free(c1);
	}
	//---------------------------------------------------------------
#endif

	free(a);
	free(b);
	free(c);

	vx_tmc(0);
	return 0;
}
|
||||||
@@ -1,11 +1,11 @@
|
|||||||
LIB_PATH = ../../runtime
|
LIB_PATH = ../../runtime
|
||||||
|
|
||||||
COMP = /nethome/ekim79/riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-gcc
|
COMP = /home/priya/dev/riscv_vec/riscv-gnu/bin/riscv32-unknown-elf-gcc
|
||||||
|
|
||||||
CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/mains/vortex_link.ld -march=rv32imv -mabi=ilp32
|
CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/mains/vortex_link.ld -march=rv32imv -mabi=ilp32
|
||||||
|
|
||||||
DMP = /nethome/ekim79/riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objdump
|
DMP = /home/priya/dev/riscv_vec/riscv-gnu/bin/riscv32-unknown-elf-objdump
|
||||||
CPY = /nethome/ekim79/riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objcopy
|
CPY = /home/priya/dev/riscv_vec/riscv-gnu/bin/riscv32-unknown-elf-objcopy
|
||||||
|
|
||||||
# VX_STR = ../../startup/vx_start.s
|
# VX_STR = ../../startup/vx_start.s
|
||||||
|
|
||||||
@@ -21,7 +21,7 @@ VX_VEC2 = vx_vec_saxpy.s #float --> int
|
|||||||
VX_VEC3 = vx_vec_sgemm_float.s #float --> int
|
VX_VEC3 = vx_vec_sgemm_float.s #float --> int
|
||||||
VX_VEC4 = vx_vec_vsadd.s
|
VX_VEC4 = vx_vec_vsadd.s
|
||||||
VX_VEC5 = vx_vec_memcpy.s
|
VX_VEC5 = vx_vec_memcpy.s
|
||||||
LIBS = /nethome/ekim79/riscv-gnu-toolchain/drops/riscv32-unknown-elf/lib/libc.a /nethome/ekim79/riscv-gnu-toolchain/drops/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
|
LIBS = /home/priya/dev/riscv_vec/riscv-gnu/riscv32-unknown-elf/lib/libc.a /home/priya/dev/riscv_vec/riscv-gnu/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
|
||||||
|
|
||||||
VX_MAIN = vx_vec_benchmark
|
VX_MAIN = vx_vec_benchmark
|
||||||
|
|
||||||
@@ -34,7 +34,6 @@ HEX: ELF
|
|||||||
$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
|
$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
|
||||||
|
|
||||||
ELF:
|
ELF:
|
||||||
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC1) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
|
||||||
$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
||||||
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC3) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC3) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
||||||
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC4) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC4) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
||||||
|
|||||||
@@ -80,6 +80,12 @@ int main()
|
|||||||
if(a[i] != b[i])
|
if(a[i] != b[i])
|
||||||
{
|
{
|
||||||
printf("\n<memcpy> failed at <index: %d>! \n", i);
|
printf("\n<memcpy> failed at <index: %d>! \n", i);
|
||||||
|
<<<<<<< HEAD
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("\nPASSED.......................... <memcpy> \n");
|
||||||
|
=======
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -113,6 +119,11 @@ int main()
|
|||||||
if(b[i] != ((a[i] * scalar) + c[i]))
|
if(b[i] != ((a[i] * scalar) + c[i]))
|
||||||
{
|
{
|
||||||
printf("\n<saxpy> failed at <index: %d>! \n", i);
|
printf("\n<saxpy> failed at <index: %d>! \n", i);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("\nPASSED.......................... <saxpy> \n");
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
87070
rvvector/benchmark_temp/vx_vec_benchmark.dump
Normal file
87070
rvvector/benchmark_temp/vx_vec_benchmark.dump
Normal file
File diff suppressed because it is too large
Load Diff
BIN
rvvector/benchmark_temp/vx_vec_benchmark.elf
Executable file
BIN
rvvector/benchmark_temp/vx_vec_benchmark.elf
Executable file
Binary file not shown.
5595
rvvector/benchmark_temp/vx_vec_benchmark.hex
Normal file
5595
rvvector/benchmark_temp/vx_vec_benchmark.hex
Normal file
File diff suppressed because it is too large
Load Diff
@@ -13,6 +13,19 @@
|
|||||||
# fa0 a
|
# fa0 a
|
||||||
# a1 x
|
# a1 x
|
||||||
# a2 y
|
# a2 y
|
||||||
|
vx_vec_saxpy:
|
||||||
|
vsetvli a4, a0, e32, m8
|
||||||
|
saxpy:
|
||||||
|
vlw.v v0, (a1)
|
||||||
|
sub a0, a0, a4
|
||||||
|
slli a4, a4, 2
|
||||||
|
add a1, a1, a4
|
||||||
|
vlw.v v8, (a2)
|
||||||
|
vfmacc.vf v8, fa0, v0
|
||||||
|
vsw.v v8, (a2)
|
||||||
|
add a2, a2, a4
|
||||||
|
bnez a0, saxpy
|
||||||
|
ret
|
||||||
#vx_vec_saxpy:
|
#vx_vec_saxpy:
|
||||||
# vsetvli a4, a0, e32, m8
|
# vsetvli a4, a0, e32, m8
|
||||||
#saxpy:
|
#saxpy:
|
||||||
@@ -28,15 +41,21 @@
|
|||||||
# ret
|
# ret
|
||||||
|
|
||||||
# a0 n, rs1 a, a2 x, a3 y
|
# a0 n, rs1 a, a2 x, a3 y
|
||||||
|
|
||||||
|
# a0 n, a1 a, a2 x, a3 y
|
||||||
vx_vec_saxpy:
|
vx_vec_saxpy:
|
||||||
vsetvli a4, a0, e32, m8
|
vsetvli a4, a0, e32, m1
|
||||||
saxpy:
|
saxpy:
|
||||||
vlw.v v0, (a2)
|
vlw.v v0, (a2)
|
||||||
sub a0, a0, a4
|
sub a0, a0, a4
|
||||||
slli a4, a4, 2
|
slli a4, a4, 2
|
||||||
add a2, a2, a4
|
add a2, a2, a4
|
||||||
vlw.v v1, (a3)
|
vlw.v v1, (a3)
|
||||||
vmacc.vx v1, rs1, v0
|
vmul.vx v0, v0, a1
|
||||||
|
# vmul.vv v0, v0, v1
|
||||||
|
# li x1, 10
|
||||||
|
# vmul.vx v0, v0, x1
|
||||||
|
vadd.vv v1, v0, v1
|
||||||
vsw.v v1, (a3)
|
vsw.v v1, (a3)
|
||||||
add a3, a3, a4
|
add a3, a3, a4
|
||||||
bnez a0, saxpy
|
bnez a0, saxpy
|
||||||
|
|||||||
@@ -285,8 +285,8 @@ Instruction *WordDecoder::decode(const std::vector<Byte> &v, Size &idx, trace_in
|
|||||||
inst.setVlsWidth((code>>shift_func3) & func3_mask);
|
inst.setVlsWidth((code>>shift_func3) & func3_mask);
|
||||||
inst.setSrcReg((code>>shift_rs2) & reg_mask);
|
inst.setSrcReg((code>>shift_rs2) & reg_mask);
|
||||||
inst.setVmask((code>>shift_vmask));
|
inst.setVmask((code>>shift_vmask));
|
||||||
inst.setVmop((code>>shift_vmop) && func3_mask);
|
inst.setVmop((code>>shift_vmop) & func3_mask);
|
||||||
inst.setVnf((code>>shift_vnf) && func3_mask);
|
inst.setVnf((code>>shift_vnf) & func3_mask);
|
||||||
|
|
||||||
trace_inst->valid_inst = true;
|
trace_inst->valid_inst = true;
|
||||||
trace_inst->rs1 = ((code>>shift_rs1) & reg_mask);
|
trace_inst->rs1 = ((code>>shift_rs1) & reg_mask);
|
||||||
@@ -300,8 +300,8 @@ Instruction *WordDecoder::decode(const std::vector<Byte> &v, Size &idx, trace_in
|
|||||||
inst.setVlsWidth((code>>shift_func3) & func3_mask);
|
inst.setVlsWidth((code>>shift_func3) & func3_mask);
|
||||||
inst.setSrcReg((code>>shift_rs2) & reg_mask);
|
inst.setSrcReg((code>>shift_rs2) & reg_mask);
|
||||||
inst.setVmask((code>>shift_vmask));
|
inst.setVmask((code>>shift_vmask));
|
||||||
inst.setVmop((code>>shift_vmop) && func3_mask);
|
inst.setVmop((code>>shift_vmop) & func3_mask);
|
||||||
inst.setVnf((code>>shift_vnf) && func3_mask);
|
inst.setVnf((code>>shift_vnf) & func3_mask);
|
||||||
|
|
||||||
trace_inst->valid_inst = true;
|
trace_inst->valid_inst = true;
|
||||||
trace_inst->rs1 = ((code>>shift_rs1) & reg_mask);
|
trace_inst->rs1 = ((code>>shift_rs1) & reg_mask);
|
||||||
|
|||||||
@@ -2016,6 +2016,64 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 37: //vmul
|
||||||
|
{
|
||||||
|
D(3, "vmul");
|
||||||
|
uint8_t *result_ptr;
|
||||||
|
|
||||||
|
vector<Reg<char *>> & vr1 = c.vreg[rsrc[0]];
|
||||||
|
vector<Reg<char *>> & vr2 = c.vreg[rsrc[1]];
|
||||||
|
vector<Reg<char *>> & vd = c.vreg[rdest];
|
||||||
|
if(c.vtype.vsew == 8){
|
||||||
|
for(uint8_t i = 0; i < c.vl; i++){
|
||||||
|
uint8_t *first_ptr = (uint8_t *)vr1[i].val;
|
||||||
|
uint8_t *second_ptr = (uint8_t *)vr2[i].val;
|
||||||
|
uint8_t result = (*first_ptr * *second_ptr);
|
||||||
|
D(3,"Comparing " << *first_ptr << " + " << *second_ptr << " = " << result);
|
||||||
|
|
||||||
|
result_ptr = (uint8_t *) vd[i].val;
|
||||||
|
*result_ptr = result;
|
||||||
|
}
|
||||||
|
for(uint8_t i = c.vl; i < VLMAX; i++){
|
||||||
|
result_ptr = (uint8_t *) vd[i].val;
|
||||||
|
*result_ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if(c.vtype.vsew == 16) {
|
||||||
|
uint16_t *result_ptr;
|
||||||
|
for(uint16_t i = 0; i < c.vl; i++){
|
||||||
|
uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
||||||
|
uint16_t *second_ptr = (uint16_t *)vr2[i].val;
|
||||||
|
uint16_t result = (*first_ptr * *second_ptr);
|
||||||
|
D(3,"Comparing " << *first_ptr << " + " << *second_ptr << " = " << result);
|
||||||
|
|
||||||
|
result_ptr = (uint16_t *) vd[i].val;
|
||||||
|
*result_ptr = result;
|
||||||
|
}
|
||||||
|
for(uint16_t i = c.vl; i < VLMAX; i++){
|
||||||
|
result_ptr = (uint16_t *) vd[i].val;
|
||||||
|
*result_ptr = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if(c.vtype.vsew == 32) {
|
||||||
|
uint32_t *result_ptr;
|
||||||
|
|
||||||
|
for(uint32_t i = 0; i < c.vl; i++){
|
||||||
|
uint32_t *first_ptr = (uint32_t *)vr1[i].val;
|
||||||
|
uint32_t *second_ptr = (uint32_t *)vr2[i].val;
|
||||||
|
uint32_t result = (*first_ptr * *second_ptr);
|
||||||
|
D(3,"Comparing " << *first_ptr << " + " << *second_ptr << " = " << result);
|
||||||
|
|
||||||
|
result_ptr = (uint32_t *) vd[i].val;
|
||||||
|
*result_ptr = result;
|
||||||
|
}
|
||||||
|
for(Word i = c.vl; i < VLMAX; i++){
|
||||||
|
result_ptr = (uint32_t *) vd[i].val;
|
||||||
|
*result_ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
case 45: //vmacc
|
case 45: //vmacc
|
||||||
{
|
{
|
||||||
D(3, "vmacc");
|
D(3, "vmacc");
|
||||||
@@ -2077,6 +2135,129 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 6:
|
||||||
|
{
|
||||||
|
switch(func6)
|
||||||
|
{
|
||||||
|
case 0:
|
||||||
|
{
|
||||||
|
D(3, "vmadd.vx");
|
||||||
|
uint8_t *result_ptr;
|
||||||
|
|
||||||
|
//vector<Reg<char *>> & vr1 = c.vreg[rsrc[0]];
|
||||||
|
vector<Reg<char *>> & vr2 = c.vreg[rsrc[1]];
|
||||||
|
vector<Reg<char *>> & vd = c.vreg[rdest];
|
||||||
|
if(c.vtype.vsew == 8){
|
||||||
|
for(uint8_t i = 0; i < c.vl; i++){
|
||||||
|
//uint8_t *first_ptr = (uint8_t *)vr1[i].val;
|
||||||
|
uint8_t *second_ptr = (uint8_t *)vr2[i].val;
|
||||||
|
uint8_t result = (reg[rsrc[0]] + *second_ptr);
|
||||||
|
D(3,"Comparing " << reg[rsrc[0]] << " + " << *second_ptr << " = " << result);
|
||||||
|
|
||||||
|
result_ptr = (uint8_t *) vd[i].val;
|
||||||
|
*result_ptr = result;
|
||||||
|
}
|
||||||
|
for(uint8_t i = c.vl; i < VLMAX; i++){
|
||||||
|
result_ptr = (uint8_t *) vd[i].val;
|
||||||
|
*result_ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if(c.vtype.vsew == 16) {
|
||||||
|
uint16_t *result_ptr;
|
||||||
|
for(uint16_t i = 0; i < c.vl; i++){
|
||||||
|
//uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
||||||
|
uint16_t *second_ptr = (uint16_t *)vr2[i].val;
|
||||||
|
uint16_t result = (reg[rsrc[0]] + *second_ptr);
|
||||||
|
D(3,"Comparing " << reg[rsrc[0]] << " + " << *second_ptr << " = " << result);
|
||||||
|
|
||||||
|
result_ptr = (uint16_t *) vd[i].val;
|
||||||
|
*result_ptr = result;
|
||||||
|
}
|
||||||
|
for(uint16_t i = c.vl; i < VLMAX; i++){
|
||||||
|
result_ptr = (uint16_t *) vd[i].val;
|
||||||
|
*result_ptr = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if(c.vtype.vsew == 32) {
|
||||||
|
uint32_t *result_ptr;
|
||||||
|
|
||||||
|
for(uint32_t i = 0; i < c.vl; i++){
|
||||||
|
//uint32_t *first_ptr = (uint32_t *)vr1[i].val;
|
||||||
|
uint32_t *second_ptr = (uint32_t *)vr2[i].val;
|
||||||
|
uint32_t result = (reg[rsrc[0]] + *second_ptr);
|
||||||
|
D(3,"Comparing " << reg[rsrc[0]] << " + " << *second_ptr << " = " << result);
|
||||||
|
|
||||||
|
result_ptr = (uint32_t *) vd[i].val;
|
||||||
|
*result_ptr = result;
|
||||||
|
}
|
||||||
|
for(Word i = c.vl; i < VLMAX; i++){
|
||||||
|
result_ptr = (uint32_t *) vd[i].val;
|
||||||
|
*result_ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 37: //vmul.vx
|
||||||
|
{
|
||||||
|
D(3, "vmul.vx");
|
||||||
|
uint8_t *result_ptr;
|
||||||
|
|
||||||
|
//vector<Reg<char *>> & vr1 = c.vreg[rsrc[0]];
|
||||||
|
vector<Reg<char *>> & vr2 = c.vreg[rsrc[1]];
|
||||||
|
vector<Reg<char *>> & vd = c.vreg[rdest];
|
||||||
|
if(c.vtype.vsew == 8){
|
||||||
|
for(uint8_t i = 0; i < c.vl; i++){
|
||||||
|
//uint8_t *first_ptr = (uint8_t *)vr1[i].val;
|
||||||
|
uint8_t *second_ptr = (uint8_t *)vr2[i].val;
|
||||||
|
uint8_t result = (reg[rsrc[0]] * *second_ptr);
|
||||||
|
D(3,"Comparing " << reg[rsrc[0]] << " + " << *second_ptr << " = " << result);
|
||||||
|
|
||||||
|
result_ptr = (uint8_t *) vd[i].val;
|
||||||
|
*result_ptr = result;
|
||||||
|
}
|
||||||
|
for(uint8_t i = c.vl; i < VLMAX; i++){
|
||||||
|
result_ptr = (uint8_t *) vd[i].val;
|
||||||
|
*result_ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if(c.vtype.vsew == 16) {
|
||||||
|
uint16_t *result_ptr;
|
||||||
|
for(uint16_t i = 0; i < c.vl; i++){
|
||||||
|
//uint16_t *first_ptr = (uint16_t *)vr1[i].val;
|
||||||
|
uint16_t *second_ptr = (uint16_t *)vr2[i].val;
|
||||||
|
uint16_t result = (reg[rsrc[0]] * *second_ptr);
|
||||||
|
D(3,"Comparing " << reg[rsrc[0]] << " + " << *second_ptr << " = " << result);
|
||||||
|
|
||||||
|
result_ptr = (uint16_t *) vd[i].val;
|
||||||
|
*result_ptr = result;
|
||||||
|
}
|
||||||
|
for(uint16_t i = c.vl; i < VLMAX; i++){
|
||||||
|
result_ptr = (uint16_t *) vd[i].val;
|
||||||
|
*result_ptr = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if(c.vtype.vsew == 32) {
|
||||||
|
uint32_t *result_ptr;
|
||||||
|
|
||||||
|
for(uint32_t i = 0; i < c.vl; i++){
|
||||||
|
//uint32_t *first_ptr = (uint32_t *)vr1[i].val;
|
||||||
|
uint32_t *second_ptr = (uint32_t *)vr2[i].val;
|
||||||
|
uint32_t result = (reg[rsrc[0]] * *second_ptr);
|
||||||
|
D(3,"Comparing " << reg[rsrc[0]] << " + " << *second_ptr << " = " << result);
|
||||||
|
|
||||||
|
result_ptr = (uint32_t *) vd[i].val;
|
||||||
|
*result_ptr = result;
|
||||||
|
}
|
||||||
|
for(Word i = c.vl; i < VLMAX; i++){
|
||||||
|
result_ptr = (uint32_t *) vd[i].val;
|
||||||
|
*result_ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
case 7:
|
case 7:
|
||||||
{
|
{
|
||||||
is_vec = true;
|
is_vec = true;
|
||||||
|
|||||||
2
simX/out
Normal file
2
simX/out
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
verilator --compiler gcc -cc cache_simX.v -I. -I../rtl/shared_memory -I../rtl/cache -I../rtl/interfaces -Isimulate -I../rtl --exe simX.cpp args.cpp mem.cpp core.cpp instruction.cpp enc.cpp util.cpp -CFLAGS '-std=c++11 -fPIC -O3' -Wno-UNOPTFLAT -Wno-WIDTH --trace -DVL_DEBUG=1
|
||||||
|
Makefile:26: recipe for target 'simX' failed
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
echo start > results.txt
|
echo start > results.txt
|
||||||
|
|
||||||
# echo ../kernel/vortex_test.hex
|
|
||||||
make
|
make
|
||||||
printf "Fasten your seatbelts ladies and gentelmen!!\n\n\n\n"
|
printf "Fasten your seatbelts ladies and gentelmen!!\n\n\n\n"
|
||||||
cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../rvvector/benchmark_temp/vx_vec_benchmark.hex -s -b 1> emulator.debug
|
#cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../benchmarks/vector/vecadd/vx_vec_vecadd.hex -s -b 1> emulator.debug
|
||||||
|
#cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../benchmarks/vector/saxpy/vx_vec_saxpy.hex -s -b 1> emulator.debug
|
||||||
|
cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.hex -s -b 1> emulator.debug
|
||||||
|
|||||||
@@ -3,4 +3,5 @@ echo start > results.txt
|
|||||||
# echo ../kernel/vortex_test.hex
|
# echo ../kernel/vortex_test.hex
|
||||||
make
|
make
|
||||||
printf "Fasten your seatbelts ladies and gentelmen!!\n\n\n\n"
|
printf "Fasten your seatbelts ladies and gentelmen!!\n\n\n\n"
|
||||||
cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../runtime/mains/simple/vx_simple_main.hex -s -b 1> emulator.debug
|
#cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../runtime/mains/simple/vx_simple_main.hex -s -b 1> emulator.debug
|
||||||
|
cd obj_dir && ./Vcache_simX -E -a rv32i --core /home/priya/Desktop/new_vortex/Vortex/rvvector/benchmark_temp/vx_vec_benchmark.hex -s -b 1> emulator.debug
|
||||||
|
|||||||
6
simX/test_vec.sh
Executable file
6
simX/test_vec.sh
Executable file
@@ -0,0 +1,6 @@
|
|||||||
|
#!/bin/sh
# Builds the simulator with make and runs the basic vector test image
# (rvvector/basic/vx_vector_main.hex) through it. The emulator's stdout is
# written to obj_dir/emulator.debug.

echo start > results.txt

# echo ../kernel/vortex_test.hex

# Stop if the build fails so a stale or missing simulator binary is never run.
make || exit 1

printf "Fasten your seatbelts ladies and gentelmen!!\n\n\n\n"
cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../rvvector/basic/vx_vector_main.hex -s -b 1> emulator.debug
|
||||||
Reference in New Issue
Block a user