diff --git a/benchmarks/opencl/VectorHypot/VectorHypot.cl b/benchmarks/opencl/VectorHypot/VectorHypot.cl new file mode 100644 index 00000000..1983a862 --- /dev/null +++ b/benchmarks/opencl/VectorHypot/VectorHypot.cl @@ -0,0 +1,41 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// OpenCL Kernel Function Naive Implementation for hyptenuse +__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements) +{ + // get index into global data array + size_t szGlobalOffset = get_global_id(0) + uiOffset; + + // bound check + if (szGlobalOffset >= uiNumElements) + { + return; + } + + // Processing 4 elements per work item, so read fgA and fgB source values from GMEM + float4 f4A = fg4A[szGlobalOffset]; + float4 f4B = fg4B[szGlobalOffset]; + float4 f4H = (float4)0.0f; + + // Get the hypotenuses the vectors of 'legs', but exaggerate the time needed with loop + for (int i = 0; i < iInnerLoopCount; i++) + { + // compute the 4 hypotenuses using built-in function + f4H.x = hypot (f4A.x, f4B.x); + f4H.y = hypot (f4A.y, f4B.y); + f4H.z = hypot (f4A.z, f4B.z); + f4H.w = hypot (f4A.w, f4B.w); + } + + // Write 4 result values back out to GMEM + fg4Hypot[szGlobalOffset] = f4H; +} \ No newline at end of file diff --git a/benchmarks/opencl/VectorHypot/main.cc b/benchmarks/opencl/VectorHypot/main.cc new file mode 100644 index 00000000..2e703e69 --- /dev/null +++ b/benchmarks/opencl/VectorHypot/main.cc @@ -0,0 +1,686 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// ********************************************************************* +// oclCopyComputeOverlap Notes: +// +// OpenCL API demo application for NVIDIA CUDA GPU's that implements a +// element by element vector hyptenuse computation using 2 input float arrays +// and 1 output float array. +// +// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped +// with respect to GPU computation (and with respect to host thread). +// +// Because the overlap acheivable for this computation and data set on a given system depends upon the GPU being used and the +// GPU/Host bandwidth, the sample adjust the computation duration to test the most ideal case and test against a consistent standard. +// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's. +// +// After setup, warmup and calibration to the system, the sample runs 4 scenarios: +// A) Computations with 2 command queues on GPU +// A multiple-cycle sequence is executed, timed and compared against the host +// B) Computations with 1 command queue on GPU +// A multiple-cycle sequence is executed, timed and compared against the host +// +// The 2-command queue approach ought to be substantially faster +// +// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently +// increases compute time without increasing data size (via a loop inside the kernel) +// +// At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%: +// (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems) +// +// If we name the time to copy single input vector H2D (or outpute vector D2H) as "T", then the optimum comparison case is: +// +// Single Queue with all the data and all the work +// Ttot (serial) = 4T + 4T + 2T = 10T +// +// Dual Queue, where each queue has 1/2 the data and 1/2 the work +// Tq0 (overlap) = 2T + 2T + T .... +// Tq1 (overlap) = .... 2T + 2T + T +// +// Ttot (elapsed, wall) = 2T + 2T + 2T + T = 7T +// +// Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 % (Tesla arch 1.2 or 1.3, single copy engine) +// +// For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped. +// This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles. +// ********************************************************************* + +// common SDK header for standard utilities and system libs +#include +#include + +// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test +// values greater than 0.0f represent a speed-up relative to non-overlapped +#define EXPECTED_OVERLAP 30.0f +#define EXPECTED_OVERLAP_FERMI 45.0f +#define PASS_FACTOR 0.60f +#define RETRIES_ON_FAILURE 1 + +// Base sizes for parameters manipulated dynamically or on the command line +#define BASE_WORK_ITEMS 64 +#define BASE_ARRAY_LENGTH 40000 +#define BASE_LOOP_COUNT 32 + +// Vars +// ********************************************************************* +cl_platform_id cpPlatform; // OpenCL platform +cl_context cxGPUContext; // OpenCL context +cl_command_queue cqCommandQueue[2]; // OpenCL command queues +cl_device_id* cdDevices; // OpenCL device list +cl_program cpProgram; // OpenCL program +cl_kernel ckKernel[2]; // OpenCL kernel, 1 per queue +cl_mem cmPinnedSrcA; // OpenCL pinned host source buffer A +cl_mem cmPinnedSrcB; // OpenCL pinned host source buffer B +cl_mem cmPinnedResult; // OpenCL pinned host result buffer +float* fSourceA = NULL; // Mapped pointer for pinned Host source A buffer +float* fSourceB = NULL; // Mapped pointer for pinned Host source B buffer +float* fResult = NULL; // Mapped pointer for pinned Host result buffer +cl_mem cmDevSrcA; // OpenCL device source buffer A +cl_mem cmDevSrcB; // OpenCL device source buffer B +cl_mem cmDevResult; // OpenCL device result buffer +size_t szBuffBytes; // Size of main buffers +size_t szGlobalWorkSize; // 1D var for Total # of work items in the launched ND range +size_t szLocalWorkSize = BASE_WORK_ITEMS; // initial # of work items in the work group +cl_int ciErrNum; // Error code var +char* cPathAndName = NULL; // Var for full paths to data, src, etc. +char* cSourceCL = NULL; // Buffer to hold source for compilation +const char* cExecutableName = NULL; + +// demo config vars +const char* cSourceFile = "VectorHypot.cl"; // OpenCL computation kernel source code +float* Golden = NULL; // temp buffer to hold golden results for cross check +bool bNoPrompt = false; // Command line switch to skip exit prompt +bool bQATest = false; // Command line switch to test + +// Forward Declarations +// ********************************************************************* +double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig); +double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig); +int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles); +void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount); +void Cleanup (int iExitCode); +void (*pCleanup)(int) = &Cleanup; + +int *gp_argc = 0; +const char *** gp_argv = NULL; + +// Main function +// ********************************************************************* +int main(int argc, const char **argv) +{ + //Locals + size_t szKernelLength; // Byte size of kernel code + double dBuildTime; // Compile time + cl_uint uiTargetDevice = 0; // Default Device to compute on + cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample + cl_uint uiNumDevices; // Number of devices available + int iDevCap = -1; // Capability of device + int iInnerLoopCount = BASE_LOOP_COUNT; // Varies "compute intensity" per data within the kernel + const int iTestCycles = 10; // How many times to run the external test loop + const int iWarmupCycles = 8; // How many times to run the warmup sequence + cl_uint uiWorkGroupMultiple = 4; // Command line var (using "workgroupmult=") to optionally increase workgroup size + cl_uint uiNumElements = BASE_ARRAY_LENGTH; // initial # of elements per array to process (note: procesing 4 per work item) + cl_uint uiSizeMultiple = 4; // Command line var (using "sizemult=") to optionally increase vector sizes + bool bPassFlag = false; // Var to accumulate test pass/fail + shrBOOL bMatch = shrFALSE; // Cross check result + shrBOOL bTestOverlap = shrFALSE; + double dAvgGPUTime[2] = {0.0, 0.0}; // Average time of iTestCycles calls for 2-Queue and 1-Queue test + double dHostTime[2] = {0.0, 0.0}; // Host computation time (2nd test is redundant but a good stability indicator) + float fMinPassCriteria[2] = {0.0f, 0.0f}; // Test pass cireria, adjusted dependant on GPU arch + + gp_argc = &argc; + gp_argv = &argv; + + shrQAStart(argc, (char **)argv); + + // start logs + cExecutableName = argv[0]; + shrSetLogFileName ("oclCopyComputeOverlap.txt"); + shrLog("%s Starting...\n\n", argv[0]); + + // get basic command line args + bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt")); + bQATest = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest")); + shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice); + + // Optional Command-line multiplier for vector size + // Default val of 4 gives 10.24 million float elements per vector + // Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory) + shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple); + uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50); + uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS; + shrLog("Array sizes = %u float elements\n", uiNumElements); + + // Optional Command-line multiplier for workgroup size (x 64 work items) + // Default val of 4 gives szLocalWorkSize of 256. + // Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range + shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple); + uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10); + szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS; + shrLog("Workgroup Size = %u\n\n", szLocalWorkSize); + + // Get the NVIDIA platform if available, otherwise use default + shrLog("Get the Platform ID...\n\n"); + ciErrNum = oclGetPlatformID(&cpPlatform); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + + // Get OpenCL platform name and version + char cBuffer[256]; + ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + shrLog("Platform Name = %s\n\n", cBuffer); + + // Get all the devices + shrLog("Get the Device info and select Device...\n"); + ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id)); + + // Ethans changes + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + //ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + + // Set target device and check capabilities + shrLog(" # of Devices Available = %u\n", uiNumDevices); + uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1)); + shrLog(" Using Device %u, ", uiTargetDevice); + oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]); + iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]); + if (iDevCap > 0) { + shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10); + } else { + shrLog("\n\n", iDevCap); + } + if (strstr(cBuffer, "NVIDIA") != NULL) + { + if (iDevCap < 12) + { + shrLog("Device doesn't have overlap capability. Skipping test...\n"); + Cleanup (EXIT_SUCCESS); + } + + // Device and Platform eligible for overlap testing + bTestOverlap = shrTRUE; + + // If device has overlap capability, proceed + fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP; // 1st cycle overlap is same for 1 or 2 copy engines + if (iDevCap != 20) + { + // Single copy engine + fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // avg of many cycles + } + else + { + char cDevName[1024]; + clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL); + if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0) + { + // Tesla or Quadro (arch = 2.0) ... Dual copy engine + fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI; // average of many cycles + } + else + { + // Geforce ... Single copy engine + fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // average of many cycles + } + } + } + + // Create the context + shrLog("clCreateContext...\n"); + cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + + // Create 2 command-queues + cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + shrLog("clCreateCommandQueue [0]...\n"); + cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + shrLog("clCreateCommandQueue [1]...\n"); + + // Allocate the OpenCL source and result buffer memory objects on GPU device GMEM + szBuffBytes = sizeof(cl_float) * uiNumElements; + cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements); + + // Allocate pinned source and result host buffers: + // Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations *** + cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements); + + // Get mapped pointers to pinned input host buffers + // Note: This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers + fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); + //oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup); + shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n"); + + // Alloc temp golden buffer for cross checks + Golden = (float*)malloc(szBuffBytes); + //oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup); + + // Read the OpenCL kernel in from source file + cPathAndName = shrFindFilePath(cSourceFile, argv[0]); + //oclCheckError(cPathAndName != NULL, shrTRUE); + cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); + // oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + shrLog("oclLoadProgSource (%s)...\n", cSourceFile); + + // Create the program object + cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + shrLog("clCreateProgramWithSource...\n"); + + // Build the program for the target device + clFinish(cqCommandQueue[0]); + shrDeltaT(0); + ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL); + shrLog("clBuildProgram..."); + if (ciErrNum != CL_SUCCESS) + { + // write out standard error, Build Log and PTX, then cleanup and exit + shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR); + oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); + oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx"); + Cleanup(EXIT_FAILURE); + } + dBuildTime = shrDeltaT(0); + + // Ethan - Kernel Addition + cl_program program = + clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL); + if (program == NULL) { + std::cerr << "Failed to write program binary" << std::endl; + Cleanup(context, queue, program, kernel, memObjects); + return 1; + } else { + std::cout << "Read program from binary." << std::endl; + } + + // Create the kernel + ckKernel[0] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + ckKernel[1] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + shrLog("clCreateKernel (ckKernel[2])...\n"); + + // Offsets for 2 queues + cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)}; + + // Set the Argument values for the 1st kernel instance (queue 0) + ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA); + ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB); + ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult); + ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]); + ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount); + ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements); + shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n"); + + // Set the Argument values for the 2d kernel instance (queue 1) + ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA); + ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB); + ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult); + ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]); + ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount); + ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements); + oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n"); + + //******************************************* + // Warmup the driver with dual queue sequence + //******************************************* + + // Warmup with dual queue sequence for iTestCycles + shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles); + DualQueueSequence(iWarmupCycles, uiNumElements, false); + + // Use single queue config to adjust compute intensity + shrLog("Adjust compute for GPU / system...\n"); + iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles); + shrLog(" Kernel inner loop count = %d\n", iInnerLoopCount); + + //******************************************* + // Run and time with 2 command-queues + //******************************************* + for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) { + + // Run the sequence iTestCycles times + dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false); + + // Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer) + shrLog(" Device vs Host Result Comparison\t: "); + VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount); + shrDeltaT(0); + for (int i = 0; i < iTestCycles; i++) + { + VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount); + } + dHostTime[0] = shrDeltaT(0)/iTestCycles; + + // Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer) + bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0); + shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); + bPassFlag = (bMatch == shrTRUE); + + //******************************************* + // Run and time with 1 command queue + //******************************************* + // Run the sequence iTestCycles times + dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false); + + // Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer) + shrLog(" Device vs Host Result Comparison\t: "); + shrDeltaT(0); + for (int i = 0; i < iTestCycles; i++) + { + VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount); + } + dHostTime[1] = shrDeltaT(0)/iTestCycles; + + // Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer) + bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0); + shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); + bPassFlag &= (bMatch == shrTRUE); + + //******************************************* + + // Compare Single and Dual queue timing + shrLog("\nResult Summary:\n"); + + // Log GPU and CPU Time for 2-queue scenario + shrLog(" Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]); + shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]); + + // Log GPU and CPU Time for 1-queue scenario + shrLog(" Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]); + shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]); + + // Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status + double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]); + + if( bTestOverlap ) { + bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]); + if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) { + shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable"); + + // Log info to master log in standard format + shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", + dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize); + + bPassFlag &= bAvgOverlapOK; + break; + } + } + + shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun); + } + + + //******************************************* + // Report pass/fail, cleanup and exit + Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE); +} + +// Run 1 queue sequence for n cycles +// ********************************************************************* +double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig) +{ + // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer) + shrFillArray(fSourceA, (int)uiNumElements); + shrFillArray(fSourceB, (int)uiNumElements); + + // Reset Global work size for 1 command-queue, and log work sizes & dimensions + szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4)); + + // *** Make sure queues are empty and then start timer + double dAvgTime = 0.0; + clFinish(cqCommandQueue[0]); + clFinish(cqCommandQueue[1]); + shrDeltaT(0); + + // Run the sequence iCycles times + for (int i = 0; i < iCycles; i++) + { + // Nonblocking Write of all of input data from host to device in command-queue 0 + ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL); + ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL); + shrCheckError(ciErrNum, CL_SUCCESS); + + // Launch kernel computation, command-queue 0 + ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + + // Non Blocking Read of output data from device to host, command-queue 0 + ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL); + shrCheckError(ciErrNum, CL_SUCCESS); + + // Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver) + clFlush(cqCommandQueue[0]); + } + + // *** Assure sync to host and return average sequence time + clFinish(cqCommandQueue[0]); + dAvgTime = shrDeltaT(0)/(double)iCycles; + + // Log config if asked for + if (bShowConfig) + { + shrLog("\n1-Queue sequence Configuration:\n"); + shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 1\n", + szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize); + } + return dAvgTime; +} + +// Run 2 queue sequence for n cycles +// ********************************************************************* +double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig) +{ + // Locals + size_t szHalfBuffer = szBuffBytes / 2; + size_t szHalfOffset = szHalfBuffer / sizeof(float); + double dAvgTime = 0.0; + + // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer) + shrFillArray(fSourceA, (int)uiNumElements); + shrFillArray(fSourceB, (int)uiNumElements); + + // Set Global work size for 2 command-queues, and log work sizes & dimensions + szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4))); + + // Make sure queues are empty and then start timer + clFinish(cqCommandQueue[0]); + clFinish(cqCommandQueue[1]); + shrDeltaT(0); + + for (int i = 0; i < iCycles; i++) + { + // Mid Phase 0 + // Nonblocking Write of 1st half of input data from host to device in command-queue 0 + ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL); + ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL); + shrCheckError(ciErrNum, CL_SUCCESS); + + // Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver + // (not necessary on Linux, Mac OSX or WinXP) + clFlush(cqCommandQueue[0]); + clFlush(cqCommandQueue[1]); + + // Start Phase 1 *********************************** + + // Launch kernel computation, command-queue 0 + // (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE) + ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); + oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + + // Nonblocking Write of 2nd half of input data from host to device in command-queue 1 + // (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE) + ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL); + ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL); + shrCheckError(ciErrNum, CL_SUCCESS); + + // Push out the compute for queue 0 and write for queue 1 to the driver + // (not necessary on Linux, Mac OSX or WinXP) + clFlush(cqCommandQueue[0]); + clFlush(cqCommandQueue[1]); + + // Start Phase 2 *********************************** + + // Launch kernel computation, command-queue 1 + ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); + //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + + // Non Blocking Read of 1st half of output data from device to host, command-queue 0 + ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL); + shrCheckError(ciErrNum, CL_SUCCESS); + + // Push out the compute for queue 1 and the read for queue 0 to the driver + // (not necessary on Linux, Mac OSX or WinXP) + clFlush(cqCommandQueue[0]); + clFlush(cqCommandQueue[1]); + + // Start Phase 0 (Rolls over) *********************************** + + // Non Blocking Read of 2nd half of output data from device to host, command-queue 1 + ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL); + shrCheckError(ciErrNum, CL_SUCCESS); + } + + // *** Sync to host and get average sequence time + clFinish(cqCommandQueue[0]); + clFinish(cqCommandQueue[1]); + dAvgTime = shrDeltaT(0)/(double)iCycles; + + // Log config if asked for + if (bShowConfig) + { + shrLog("\n2-Queue sequence Configuration:\n"); + shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 2\n", + szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize); + } + + return dAvgTime; +} + +// Function to adjust compute task according to device capability +// This allows a consistent overlap % across a wide variety of GPU's for test purposes +// It also implitly illustrates the relationship between compute capability and overlap at fixed work size +// ********************************************************************* +int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles) +{ + // Locals + double dCopyTime, dComputeTime; + int iComputedLoopCount; + + // Change Source Data + shrFillArray(fSourceA, (int)uiNumElements); + shrFillArray(fSourceB, (int)uiNumElements); + + // Reset Global work size for 1 command-queue, and log work sizes & dimensions + szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4)); + + // *** Make sure queues are empty and then start timer + clFinish(cqCommandQueue[0]); + clFinish(cqCommandQueue[1]); + shrDeltaT(0); + + // Run the copy iCycles times and measure copy time on this system + for (int i = 0; i < iCycles; i++) + { + // Nonblocking Write of all of input data from host to device in command-queue 0 + ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL); + ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL); + ciErrNum |= clFlush(cqCommandQueue[0]); + shrCheckError(ciErrNum, CL_SUCCESS); + } + clFinish(cqCommandQueue[0]); + dCopyTime = shrDeltaT(0); + + // Run the compute iCycles times and measure compute time on this system + for (int i = 0; i < iCycles; i++) + { + // Launch kernel computation, command-queue 0 + ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); + ciErrNum |= clFlush(cqCommandQueue[0]); + oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + } + clFinish(cqCommandQueue[0]); + dComputeTime = shrDeltaT(0); + + // Determine number of core loop cycles proportional to copy/compute time ratio + dComputeTime = MAX(dComputeTime, 1.0e-6); + iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4)); + ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount); + ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount); + oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); + return (iComputedLoopCount); +} + +// Cleanup/Exit function +// ********************************************************************* +void Cleanup (int iExitCode) +{ + // Cleanup allocated objects + shrLog("Starting Cleanup...\n\n"); + if(cPathAndName)free(cPathAndName); + if(cSourceCL)free(cSourceCL); + if(Golden)free(Golden); + if(ckKernel[0])clReleaseKernel(ckKernel[0]); + if(ckKernel[1])clReleaseKernel(ckKernel[1]); + if(cpProgram)clReleaseProgram(cpProgram); + if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL); + if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL); + if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL); + if(cmDevSrcA)clReleaseMemObject(cmDevSrcA); + if(cmDevSrcB)clReleaseMemObject(cmDevSrcB); + if(cmDevResult)clReleaseMemObject(cmDevResult); + if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA); + if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB); + if(cmPinnedResult)clReleaseMemObject(cmPinnedResult); + if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]); + if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]); + if(cxGPUContext)clReleaseContext(cxGPUContext); + if(cdDevices)free(cdDevices); + + // Master status Pass/Fail (all tests) + shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED ); +} + +// "Golden" Host processing vector hyptenuse function for comparison purposes +// ********************************************************************* +void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount) +{ + for (unsigned int i = 0; i < uiNumElements; i++) + { + float fA = pfData1[i]; + float fB = pfData2[i]; + float fC = sqrtf(fA * fA + fB * fB); + + pfResult[i] = fC; + } +} \ No newline at end of file diff --git a/benchmarks/opencl/VectorHypot/oclUtils.h b/benchmarks/opencl/VectorHypot/oclUtils.h new file mode 100644 index 00000000..2b109e18 --- /dev/null +++ b/benchmarks/opencl/VectorHypot/oclUtils.h @@ -0,0 +1,198 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +#ifndef OCL_UTILS_H +#define OCL_UTILS_H + +// ********************************************************************* +// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK +// ********************************************************************* + +// Common headers: Cross-API utililties and OpenCL header +#include + +// All OpenCL headers +#if defined (__APPLE__) || defined(MACOSX) + #include +#else + #include +#endif + +// Includes +#include +#include +#include + +// For systems with CL_EXT that are not updated with these extensions, we copied these +// extensions from +#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV + /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ + #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 + #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 + #define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 + #define CL_DEVICE_WARP_SIZE_NV 0x4003 + #define CL_DEVICE_GPU_OVERLAP_NV 0x4004 + #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 + #define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 +#endif + +// reminders for build output window and log +#ifdef _WIN32 + #pragma message ("Note: including shrUtils.h") + #pragma message ("Note: including opencl.h") +#endif + +// SDK Revision # +#define OCL_SDKREVISION "7027912" + +// Error and Exit Handling Macros... +// ********************************************************************* +// Full error handling macro with Cleanup() callback (if supplied)... +// (Companion Inline Function lower on page) +#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) + +// Short version without Cleanup() callback pointer +// Both Input (a) and Reference (b) are specified as args +#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) + +////////////////////////////////////////////////////////////////////////////// +//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0 +//! +//! @return the id +//! @param clSelectedPlatformID OpenCL platform ID +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID); + +////////////////////////////////////////////////////////////////////////////// +//! Print info about the device +//! +//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE +//! @param device OpenCL id of the device +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device); + +////////////////////////////////////////////////////////////////////////////// +//! Get and return device capability +//! +//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA +//! @param device OpenCL id of the device +////////////////////////////////////////////////////////////////////////////// +extern "C" int oclGetDevCap(cl_device_id device); + +////////////////////////////////////////////////////////////////////////////// +//! Print the device name +//! +//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE +//! @param device OpenCL id of the device +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclPrintDevName(int iLogMode, cl_device_id device); + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of the first device from the context +//! +//! @return the id +//! @param cxGPUContext OpenCL context +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext); + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of the nth device from the context +//! +//! @return the id or -1 when out of range +//! @param cxGPUContext OpenCL context +//! @param device_idx index of the device of interest +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx); + +////////////////////////////////////////////////////////////////////////////// +//! Gets the id of device with maximal FLOPS from the context +//! +//! @return the id +//! @param cxGPUContext OpenCL context +////////////////////////////////////////////////////////////////////////////// +extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext); + +////////////////////////////////////////////////////////////////////////////// +//! Loads a Program file and prepends the cPreamble to the code. +//! +//! @return the source string if succeeded, 0 otherwise +//! @param cFilename program filename +//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header +//! @param szFinalLength returned length of the code string +////////////////////////////////////////////////////////////////////////////// +extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength); + +////////////////////////////////////////////////////////////////////////////// +//! Get the binary (PTX) of the program associated with the device +//! +//! @param cpProgram OpenCL program +//! @param cdDevice device of interest +//! @param binary returned code +//! @param length length of returned code +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length); + +////////////////////////////////////////////////////////////////////////////// +//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device +//! +//! @param cpProgram OpenCL program +//! @param cdDevice device of interest +//! @param const char* cPtxFileName optional PTX file name +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName); + +////////////////////////////////////////////////////////////////////////////// +//! Get and log the Build Log from the OpenCL compiler for the requested program & device +//! +//! @param cpProgram OpenCL program +//! @param cdDevice device of interest +////////////////////////////////////////////////////////////////////////////// +extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice); + +// Helper function for De-allocating cl objects +// ********************************************************************* +extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs); + +// Helper function to get OpenCL error string from constant +// ********************************************************************* +extern "C" const char* oclErrorString(cl_int error); + +// Helper function to get OpenCL image format string (channel order and type) from constant +// ********************************************************************* +extern "C" const char* oclImageFormatString(cl_uint uiImageFormat); + +// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied) +// ********************************************************************* +inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine) +{ + // An error condition is defined by the sample/test value not equal to the reference + if (iReference != iSample) + { + // If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value + iSample = (iSample == 0) ? -9999 : iSample; + + // Log the error info + shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile); + + // Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code. + if (pCleanup != NULL) + { + pCleanup(iSample); + } + else + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n"); + exit(iSample); + } + } +} + +#endif \ No newline at end of file diff --git a/benchmarks/opencl/VectorHypot/shrQATest.h b/benchmarks/opencl/VectorHypot/shrQATest.h new file mode 100644 index 00000000..245cf8dc --- /dev/null +++ b/benchmarks/opencl/VectorHypot/shrQATest.h @@ -0,0 +1,238 @@ +/* +* Copyright 1993-2010 NVIDIA Corporation. All rights reserved. +* +* Please refer to the NVIDIA end user license agreement (EULA) associated +* with this source code for terms and conditions that govern your use of +* this software. Any use, reproduction, disclosure, or distribution of +* this software and related documentation outside the terms of the EULA +* is strictly prohibited. +* +*/ + +#ifndef SHR_QATEST_H +#define SHR_QATEST_H + +// ********************************************************************* +// Generic utilities for NVIDIA GPU Computing SDK +// ********************************************************************* + +// OS dependent includes +#ifdef _WIN32 + #pragma message ("Note: including windows.h") + #pragma message ("Note: including math.h") + #pragma message ("Note: including assert.h") + #pragma message ("Note: including time.h") + +// Headers needed for Windows + #include + #include +#else + // Headers needed for Linux + #include + #include + #include + #include + #include + #include + #include + #include + #include +#endif + +#ifndef STRCASECMP +#ifdef _WIN32 +#define STRCASECMP _stricmp +#else +#define STRCASECMP strcasecmp +#endif +#endif + +#ifndef STRNCASECMP +#ifdef _WIN32 +#define STRNCASECMP _strnicmp +#else +#define STRNCASECMP strncasecmp +#endif +#endif + + +// Standardized QA Start/Finish for CUDA SDK tests +#define shrQAStart(a, b) __shrQAStart(a, b) +#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c) +#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d) + +inline int findExeNameStart(const char *exec_name) +{ + int exename_start = (int)strlen(exec_name); + + while( (exename_start > 0) && + (exec_name[exename_start] != '\\') && + (exec_name[exename_start] != '/') ) + { + exename_start--; + } + if (exec_name[exename_start] == '\\' || + exec_name[exename_start] == '/') + { + return exename_start+1; + } else { + return exename_start; + } +} + +inline int __shrQAStart(int argc, char **argv) +{ + bool bQATest = false; + // First clear the output buffer + fflush(stdout); + fflush(stdout); + + for (int i=1; i < argc; i++) { + int string_start = 0; + while (argv[i][string_start] == '-') + string_start++; + char *string_argv = &argv[i][string_start]; + + if (!STRCASECMP(string_argv, "qatest")) { + bQATest = true; + } + } + + // We don't want to print the entire path, so we search for the first + int exename_start = findExeNameStart(argv[0]); + if (bQATest) { + fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start])); + for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]); + fprintf(stdout, "\n"); + } else { + fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start])); + } + fflush(stdout); + printf("\n"); fflush(stdout); + return exename_start; +} + +enum eQAstatus { + QA_FAILED = 0, + QA_PASSED = 1, + QA_WAIVED = 2 +}; + +inline void __ExitInTime(int seconds) +{ + fprintf(stdout, "> exiting in %d seconds: ", seconds); + fflush(stdout); + time_t t; + int count; + for (t=time(0)+seconds, count=seconds; time(0) < t; count--) { + fprintf(stdout, "%d...", count); +#ifdef WIN32 + Sleep(1000); +#else + sleep(1); +#endif + } + fprintf(stdout,"done!\n\n"); + fflush(stdout); +} + + +inline void __shrQAFinish(int argc, const char **argv, int iStatus) +{ + // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() ) + bool bQATest = false, bNoPrompt = true, bQuitInTime = true; + const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL }; + + for (int i=1; i < argc; i++) { + int string_start = 0; + while (argv[i][string_start] == '-') + string_start++; + + const char *string_argv = &argv[i][string_start]; + if (!STRCASECMP(string_argv, "qatest")) { + bQATest = true; + } + // For SDK individual samples that don't specify -noprompt or -prompt, + // a 3 second delay will happen before exiting, giving a user time to view results + if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) { + bNoPrompt = true; + bQuitInTime = false; + } + if (!STRCASECMP(string_argv, "prompt")) { + bNoPrompt = false; + bQuitInTime = false; + } + } + + int exename_start = findExeNameStart(argv[0]); + if (bQATest) { + fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start])); + for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]); + fprintf(stdout, "\n"); + } else { + fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]); + } + fflush(stdout); + printf("\n"); fflush(stdout); + if (bQuitInTime) { + __ExitInTime(3); + } else { + if (!bNoPrompt) { + fprintf(stdout, "\nPress to exit...\n"); + fflush(stdout); + getchar(); + } + } +} + +inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus) +{ + bool bQuitInTime = true; + const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL }; + + for (int i=1; i < argc; i++) { + int string_start = 0; + while (argv[i][string_start] == '-') + string_start++; + + const char *string_argv = &argv[i][string_start]; + // For SDK individual samples that don't specify -noprompt or -prompt, + // a 3 second delay will happen before exiting, giving a user time to view results + if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) { + bQuitInTime = false; + } + if (!STRCASECMP(string_argv, "prompt")) { + bQuitInTime = false; + } + } + + int exename_start = findExeNameStart(argv[0]); + if (bQATest) { + fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start])); + for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]); + fprintf(stdout, "\n"); + } else { + fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]); + } + fflush(stdout); + + if (bQuitInTime) { + __ExitInTime(3); + } +} + +inline void shrQAFinishExit(int argc, const char **argv, int iStatus) +{ + __shrQAFinish(argc, argv, iStatus); + + exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE); +} + +inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus) +{ + __shrQAFinish2(bQAtest, argc, argv, iStatus); + + exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE); +} + +#endif \ No newline at end of file diff --git a/benchmarks/opencl/VectorHypot/shrUtils.h b/benchmarks/opencl/VectorHypot/shrUtils.h new file mode 100644 index 00000000..0f2795d4 --- /dev/null +++ b/benchmarks/opencl/VectorHypot/shrUtils.h @@ -0,0 +1,642 @@ +/* +* Copyright 1993-2010 NVIDIA Corporation. All rights reserved. +* +* Please refer to the NVIDIA end user license agreement (EULA) associated +* with this source code for terms and conditions that govern your use of +* this software. Any use, reproduction, disclosure, or distribution of +* this software and related documentation outside the terms of the EULA +* is strictly prohibited. +* +*/ + +#ifndef SHR_UTILS_H +#define SHR_UTILS_H + +// ********************************************************************* +// Generic utilities for NVIDIA GPU Computing SDK +// ********************************************************************* + +// reminders for output window and build log +#ifdef _WIN32 + #pragma message ("Note: including windows.h") + #pragma message ("Note: including math.h") + #pragma message ("Note: including assert.h") +#endif + +// OS dependent includes +#ifdef _WIN32 + // Headers needed for Windows + #include +#else + // Headers needed for Linux + #include + #include + #include + #include + #include + #include + #include +#endif + +// Other headers needed for both Windows and Linux +#include +#include +#include +#include +#include + +// Un-comment the following #define to enable profiling code in SDK apps +//#define GPU_PROFILING + +// Beginning of GPU Architecture definitions +inline int ConvertSMVer2Cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = + { { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class + { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class + { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class + { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class + { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class + { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class + { 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class + { -1, -1 } + }; + + int index = 0; + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) { + return nGpuArchCoresPerSM[index].Cores; + } + index++; + } + printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor); + return -1; +} +// end of GPU Architecture definitions + + +// Defines and enum for use with logging functions +// ********************************************************************* +#define DEFAULTLOGFILE "SdkConsoleLog.txt" +#define MASTERLOGFILE "SdkMasterLog.csv" +enum LOGMODES +{ + LOGCONSOLE = 1, // bit to signal "log to console" + LOGFILE = 2, // bit to signal "log to file" + LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both" + APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open + MASTER = 8, // bit to signal master .csv log output + ERRORMSG = 16, // bit to signal "pre-pend Error" + CLOSELOG = 32 // bit to close log file, if open, after any requested file write +}; +#define HDASHLINE "-----------------------------------------------------------\n" + +// Standardized boolean +enum shrBOOL +{ + shrFALSE = 0, + shrTRUE = 1 +}; + +// Standardized MAX, MIN and CLAMP +#define MAX(a, b) ((a > b) ? a : b) +#define MIN(a, b) ((a < b) ? a : b) +#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip of input a +#define TOPCLAMP(a, b) (a < b ? a:b) // single top side clip of input a + +// Error and Exit Handling Macros... +// ********************************************************************* +// Full error handling macro with Cleanup() callback (if supplied)... +// (Companion Inline Function lower on page) +#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) + +// Short version without Cleanup() callback pointer +// Both Input (a) and Reference (b) are specified as args +#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) + +// Standardized Exit Macro for leaving main()... extended version +// (Companion Inline Function lower on page) +#define shrExitEX(a, b, c) __shrExitEX(a, b, c) + +// Standardized Exit Macro for leaving main()... short version +// (Companion Inline Function lower on page) +#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS) + +// Simple argument checker macro +#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE + +// Define for user-customized error handling +#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__ + +// Function to deallocate memory allocated within shrUtils +// ********************************************************************* +extern "C" void shrFree(void* ptr); + +// ********************************************************************* +// Helper function to log standardized information to Console, to File or to both +//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); +//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); +//! +//! Automatically opens file and stores handle if needed and not done yet +//! Closes file and nulls handle on request +//! +//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG. +//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode +//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close" +//! First 3 options may be | 'd with MASTER to enable independent write to master data log file +//! First 3 options may be | 'd with ERRORMSG to start line with standard error message +//! @param 2 dValue: +//! Positive val = double value for time in secs to be formatted to 6 decimals. +//! Negative val is an error code and this give error preformatting. +//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf. +//! ALL printf flags, width, precision and type specifiers are supported with this exception: +//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported +//! Single byte char type specifiers (%s and %c) ARE supported +//! @param 4... variable args: like printf or fprintf. Must match format specifer type above. +//! @return 0 if OK, negative value on error or if error occurs or was passed in. +// ********************************************************************* +extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...); + +// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, +// ********************************************************************* +extern "C" int shrLog(const char* cFormatString, ...); + +// ********************************************************************* +// Delta timer function for up to 3 independent timers using host high performance counters +// Maintains state for 3 independent counters +//! Example: double dElapsedTime = shrDeltaTime(0); +//! +//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2) +//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error +// ********************************************************************* +extern "C" double shrDeltaT(int iCounterID); + +// Optional LogFileNameOverride function +// ********************************************************************* +extern "C" void shrSetLogFileName (const char* cOverRideName); + +// Helper function to init data arrays +// ********************************************************************* +extern "C" void shrFillArray(float* pfData, int iSize); + +// Helper function to print data arrays +// ********************************************************************* +extern "C" void shrPrintArray(float* pfData, int iSize); + +//////////////////////////////////////////////////////////////////////////// +//! Find the path for a filename +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! @param executablePath optional absolute path of the executable +//////////////////////////////////////////////////////////////////////////// +extern "C" char* shrFindFilePath(const char* filename, const char* executablePath); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing single precision floating point data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing double precision floating point data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing integer data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing unsigned integer data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, + unsigned int* len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing char / byte data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Read file \filename containing unsigned char / byte data +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +//! @note If a NULL pointer is passed to this function and it is +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, + unsigned int* len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing single precision floating point +//! data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len, + const float epsilon, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing double precision floating point +//! data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len, + const double epsilon, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing integer data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing unsigned integer data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, + unsigned int len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing char / byte data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, + bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename containing unsigned char / byte data +//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE +//! @param filename name of the file to write +//! @param data pointer to data to write +//! @param len number of data elements in data, -1 on error +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data, + unsigned int len, bool verbose = false); + +//////////////////////////////////////////////////////////////////////////// +//! Load PPM image file (with unsigned char as data element type), padding +//! 4th component +//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param OutData handle to the data read +//! @param w width of the image +//! @param h height of the image +//! +//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, + unsigned int *w, unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +//! Save PPM image file (with unsigned char as data element type, padded to +//! 4 bytes) +//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////////// +//! Save PGM image file (with unsigned char as data element type) +//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, + unsigned int w, unsigned int h); + +//////////////////////////////////////////////////////////////////////////// +//! Load PGM image file (with unsigned char as data element type) +//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE +//! @param file name of the image file +//! @param data handle to the data read +//! @param w width of the image +//! @param h height of the image +//! @note If a NULL pointer is passed to this function and it is initialized +//! within shrUtils, then free() has to be used to deallocate the memory +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data, + unsigned int *w,unsigned int *h); + +//////////////////////////////////////////////////////////////////////////// +// Command line arguments: General notes +// * All command line arguments begin with '--' followed by the token; +// token and value are seperated by '='; example --samples=50 +// * Arrays have the form --model=[one.obj,two.obj,three.obj] +// (without whitespaces) +//////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////// +//! Check if command line argument \a flag-name is given +//! @return shrTRUE if command line argument \a flag_name has been given, +//! otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param flag_name name of command line flag +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, + const char* flag_name); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type int +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, + const char* arg_name, int* val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type unsigned int +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, + const char* arg_name, unsigned int* val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type float +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, + const char* arg_name, float* val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument of type string +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val value of the command line argument +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, + const char* arg_name, char** val); + +//////////////////////////////////////////////////////////////////////////// +//! Get the value of a command line argument list those element are strings +//! @return shrTRUE if command line argument \a arg_name has been given and +//! is of the requested type, otherwise shrFALSE +//! @param argc argc as passed to main() +//! @param argv argv as passed to main() +//! @param arg_name name of the command line argument +//! @param val command line argument list +//! @param len length of the list / number of elements +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, + const char* arg_name, char** val, + unsigned int* len); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparef( const float* reference, const float* data, + const unsigned int len); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two integer arrays +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparei( const int* reference, const int* data, + const unsigned int len ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two unsigned integer arrays, with epsilon and threshold +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param threshold tolerance % # of comparison errors (0.15f = 15%) +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data, + const unsigned int len, const float epsilon, const float threshold ); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two unsigned char arrays +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data, + const unsigned int len ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two integers with a tolernance for # of byte errors +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//! @param threshold tolerance % # of comparison errors (0.15f = 15%) +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data, + const unsigned int len, const float epsilon, const float threshold ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two integer arrays witha n epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data, + const unsigned int len, const float epsilon ); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays with an epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparefe( const float* reference, const float* data, + const unsigned int len, const float epsilon ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays with an epsilon tolerance for equality and a +//! threshold for # pixel errors +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparefet( const float* reference, const float* data, + const unsigned int len, const float epsilon, const float threshold ); + +//////////////////////////////////////////////////////////////////////////// +//! Compare two float arrays using L2-norm with an epsilon tolerance for +//! equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data, + const unsigned int len, const float epsilon ); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two PPM image files with an epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param src_file filename for the image to be compared +//! @param data filename for the reference data / gold image +//! @param epsilon epsilon to use for the comparison +//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass) +//! $param verboseErrors output details of image mismatch to std::err +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold); + +//////////////////////////////////////////////////////////////////////////////// +//! Compare two PGM image files with an epsilon tolerance for equality +//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE +//! @param src_file filename for the image to be compared +//! @param data filename for the reference data / gold image +//! @param epsilon epsilon to use for the comparison +//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass) +//! $param verboseErrors output details of image mismatch to std::err +//////////////////////////////////////////////////////////////////////////////// +extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold); + +extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size); + +extern "C" size_t shrRoundUp(int group_size, int global_size); + +// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied) +// ********************************************************************* +inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine) +{ + if (iReference != iSample) + { + shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile); + if (pCleanup != NULL) + { + pCleanup(EXIT_FAILURE); + } + else + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n"); + exit(EXIT_FAILURE); + } + } +} + +// Standardized Exit +// ********************************************************************* +inline void __shrExitEX(int argc, const char** argv, int iExitCode) +{ +#ifdef WIN32 + if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) +#else + if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) +#endif + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress to Quit...\n"); + getchar(); + } + else + { + shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]); + } + fflush(stderr); + exit(iExitCode); +} + +#endif \ No newline at end of file