249 lines
9.9 KiB
C++
249 lines
9.9 KiB
C++
/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

// standard utilities and systems includes
#include <oclUtils.h>
|
|
#include <shrQATest.h>
|
|
#include "oclBlackScholes_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
// Returns the time between the COMMAND_START and COMMAND_END profiling
// counters of `event`, converted from nanoseconds to seconds.
//
// event   : event whose profiling counters are queried.
// returns : elapsed seconds, or 0.0 when either profiling query fails or the
//           reported interval is reversed, so that indeterminate or wrapped
//           counter values are never turned into a bogus duration.
double executionTime(cl_event &event){
    // Zero-initialised so a failed query can never leave these indeterminate.
    cl_ulong start = 0, end = 0;

    cl_int status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
    status |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);

    // cl_ulong is unsigned: end < start would wrap to a huge positive value.
    if(status != CL_SUCCESS || end < start){
        return 0.0;
    }

    return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return
}

////////////////////////////////////////////////////////////////////////////////
// Random float helper
////////////////////////////////////////////////////////////////////////////////
// Draws one value from rand() and maps it linearly onto [low, high]:
// a draw of 0 yields `low`, a draw of RAND_MAX yields `high`.
// The sequence is controlled by the caller through srand().
float randFloat(float low, float high)
{
    // Blend weight in [0, 1] derived from the next rand() sample.
    const float weight = (float)rand() / (float)RAND_MAX;

    // Linear interpolation between the two bounds.
    return (1.0f - weight) * low + weight * high;
}

////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
|
|
{
|
|
cl_platform_id cpPlatform; //OpenCL platform
|
|
cl_device_id* cdDevices = NULL; //OpenCL devices list (array)
|
|
cl_context cxGPUContext; //OpenCL context
|
|
cl_command_queue cqCommandQueue; //OpenCL command que
|
|
cl_mem //OpenCL memory buffer objects
|
|
d_Call,
|
|
d_Put,
|
|
d_S,
|
|
d_X,
|
|
d_T;
|
|
|
|
cl_int ciErrNum;
|
|
|
|
float
|
|
*h_CallCPU,
|
|
*h_PutCPU,
|
|
*h_CallGPU,
|
|
*h_PutGPU,
|
|
*h_S,
|
|
*h_X,
|
|
*h_T;
|
|
|
|
const unsigned int optionCount = 4000000;
|
|
const float R = 0.02f;
|
|
const float V = 0.30f;
|
|
|
|
shrQAStart(argc, argv);
|
|
|
|
// Get the NVIDIA platform
|
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
|
shrLog("clGetPlatformID...\n");
|
|
|
|
//Get all the devices
|
|
cl_uint uiNumDevices = 0; // Number of devices available
|
|
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
|
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
|
|
shrLog("Get the Device info and select Device...\n");
|
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
|
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
|
|
|
// Get command line device options and config accordingly
|
|
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
|
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
|
|
{
|
|
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
|
}
|
|
shrLog(" Using Device %u: ", uiTargetDevice);
|
|
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
|
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
|
|
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
|
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
|
|
|
// set logfile name and start logs
|
|
shrSetLogFileName ("oclBlackScholes.txt");
|
|
shrLog("%s Starting...\n\n", argv[0]);
|
|
|
|
shrLog("Allocating and initializing host memory...\n");
|
|
h_CallCPU = (float *)malloc(optionCount * sizeof(float));
|
|
h_PutCPU = (float *)malloc(optionCount * sizeof(float));
|
|
h_CallGPU = (float *)malloc(optionCount * sizeof(float));
|
|
h_PutGPU = (float *)malloc(optionCount * sizeof(float));
|
|
h_S = (float *)malloc(optionCount * sizeof(float));
|
|
h_X = (float *)malloc(optionCount * sizeof(float));
|
|
h_T = (float *)malloc(optionCount * sizeof(float));
|
|
|
|
srand(2009);
|
|
for(unsigned int i = 0; i < optionCount; i++){
|
|
h_CallCPU[i] = -1.0f;
|
|
h_PutCPU[i] = -1.0f;
|
|
h_S[i] = randFloat(5.0f, 30.0f);
|
|
h_X[i] = randFloat(1.0f, 100.0f);
|
|
h_T[i] = randFloat(0.25f, 10.0f);
|
|
}
|
|
|
|
shrLog("Initializing OpenCL...\n");
|
|
// Get the NVIDIA platform
|
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
// Get a GPU device
|
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
// Create the context
|
|
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
//Create a command-queue
|
|
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
shrLog("Creating OpenCL memory objects...\n");
|
|
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
shrLog("Starting up BlackScholes...\n");
|
|
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
|
|
|
|
shrLog("Running OpenCL BlackScholes...\n\n");
|
|
//Just a single run or a warmup iteration
|
|
BlackScholes(
|
|
NULL,
|
|
d_Call,
|
|
d_Put,
|
|
d_S,
|
|
d_X,
|
|
d_T,
|
|
R,
|
|
V,
|
|
optionCount
|
|
);
|
|
|
|
#ifdef GPU_PROFILING
|
|
const int numIterations = 16;
|
|
cl_event startMark, endMark;
|
|
ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
|
|
ciErrNum |= clFinish(cqCommandQueue);
|
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
|
shrDeltaT(0);
|
|
|
|
for(int i = 0; i < numIterations; i++){
|
|
BlackScholes(
|
|
cqCommandQueue,
|
|
d_Call,
|
|
d_Put,
|
|
d_S,
|
|
d_X,
|
|
d_T,
|
|
R,
|
|
V,
|
|
optionCount
|
|
);
|
|
}
|
|
|
|
ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark);
|
|
ciErrNum |= clFinish(cqCommandQueue);
|
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
//Calculate performance metrics by wallclock time
|
|
double gpuTime = shrDeltaT(0) / numIterations;
|
|
shrLogEx(LOGBOTH | MASTER, 0, "oclBlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %i, Workgroup = %u\n",
|
|
(double)(2.0 * optionCount * 1.0e-9)/gpuTime, gpuTime, (2 * optionCount), 1, 0);
|
|
|
|
//Get profiling info
|
|
cl_ulong startTime = 0, endTime = 0;
|
|
ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
|
|
ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
|
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
|
shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
|
|
#endif
|
|
|
|
shrLog("\nReading back OpenCL BlackScholes results...\n");
|
|
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
shrLog("Comparing against Host/C++ computation...\n");
|
|
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
|
|
double deltaCall = 0, deltaPut = 0, sumCall = 0, sumPut = 0;
|
|
double L1call, L1put;
|
|
for(unsigned int i = 0; i < optionCount; i++)
|
|
{
|
|
sumCall += fabs(h_CallCPU[i]);
|
|
sumPut += fabs(h_PutCPU[i]);
|
|
deltaCall += fabs(h_CallCPU[i] - h_CallGPU[i]);
|
|
deltaPut += fabs(h_PutCPU[i] - h_PutGPU[i]);
|
|
}
|
|
L1call = deltaCall / sumCall;
|
|
L1put = deltaPut / sumPut;
|
|
shrLog("Relative L1 (call, put) = (%.3e, %.3e)\n\n", L1call, L1put);
|
|
|
|
shrLog("Shutting down...\n");
|
|
closeBlackScholes();
|
|
ciErrNum = clReleaseMemObject(d_T);
|
|
ciErrNum |= clReleaseMemObject(d_X);
|
|
ciErrNum |= clReleaseMemObject(d_S);
|
|
ciErrNum |= clReleaseMemObject(d_Put);
|
|
ciErrNum |= clReleaseMemObject(d_Call);
|
|
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
|
|
ciErrNum |= clReleaseContext(cxGPUContext);
|
|
//oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
free(h_T);
|
|
free(h_X);
|
|
free(h_S);
|
|
free(h_PutGPU);
|
|
free(h_CallGPU);
|
|
free(h_PutCPU);
|
|
free(h_CallCPU);
|
|
|
|
if(cdDevices)free(cdDevices);
|
|
|
|
shrQAFinishExit(argc, (const char **)argv, ((L1call < 1E-6) && (L1put < 1E-6)) ? QA_PASSED : QA_FAILED );
|
|
}
|