274 lines
9.1 KiB
Common Lisp
274 lines
9.1 KiB
Common Lisp
/*
|
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
|
*
|
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
|
* with this source code for terms and conditions that govern your use of
|
|
* this software. Any use, reproduction, disclosure, or distribution of
|
|
* this software and related documentation outside the terms of the EULA
|
|
* is strictly prohibited.
|
|
*
|
|
*/
|
|
|
|
/*
|
|
Parallel reduction kernels
|
|
*/
|
|
|
|
// The following defines are set during runtime compilation, see reduction.cpp
|
|
// #define T float
|
|
// #define blockSize 128
|
|
// #define nIsPow2 1
|
|
|
|
#ifndef _REDUCE_KERNEL_H_
|
|
#define _REDUCE_KERNEL_H_
|
|
|
|
/*
|
|
Parallel sum reduction using shared memory
|
|
- takes log(n) steps for n input elements
|
|
- uses n threads
|
|
- only works for power-of-2 arrays
|
|
*/
|
|
|
|
/* This reduction interleaves which threads are active by using the modulo
|
|
operator. This operator is very expensive on GPUs, and the interleaved
|
|
inactivity means that no whole warps are active, which is also very
|
|
inefficient */
|
|
__kernel void reduce0(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
|
|
{
|
|
// load shared mem
|
|
unsigned int tid = get_local_id(0);
|
|
unsigned int i = get_global_id(0);
|
|
|
|
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// do reduction in shared mem
|
|
for(unsigned int s=1; s < get_local_size(0); s *= 2) {
|
|
// modulo arithmetic is slow!
|
|
if ((tid % (2*s)) == 0) {
|
|
sdata[tid] += sdata[tid + s];
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
|
|
// write result for this block to global mem
|
|
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
|
}
|
|
|
|
|
|
/* This version uses contiguous threads, but its interleaved
|
|
addressing results in many shared memory bank conflicts. */
|
|
__kernel void reduce1(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
|
|
{
|
|
// load shared mem
|
|
unsigned int tid = get_local_id(0);
|
|
unsigned int i = get_global_id(0);
|
|
|
|
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// do reduction in shared mem
|
|
for(unsigned int s=1; s < get_local_size(0); s *= 2)
|
|
{
|
|
int index = 2 * s * tid;
|
|
|
|
if (index < get_local_size(0))
|
|
{
|
|
sdata[index] += sdata[index + s];
|
|
}
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
|
|
// write result for this block to global mem
|
|
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
|
}
|
|
|
|
/*
|
|
This version uses sequential addressing -- no divergence or bank conflicts.
|
|
*/
|
|
__kernel void reduce2(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
|
|
{
|
|
// load shared mem
|
|
unsigned int tid = get_local_id(0);
|
|
unsigned int i = get_global_id(0);
|
|
|
|
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// do reduction in shared mem
|
|
for(unsigned int s=get_local_size(0)/2; s>0; s>>=1)
|
|
{
|
|
if (tid < s)
|
|
{
|
|
sdata[tid] += sdata[tid + s];
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
|
|
// write result for this block to global mem
|
|
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
|
}
|
|
|
|
/*
|
|
This version uses n/2 threads --
|
|
it performs the first level of reduction when reading from global memory
|
|
*/
|
|
__kernel void reduce3(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
|
|
{
|
|
// perform first level of reduction,
|
|
// reading from global memory, writing to shared memory
|
|
unsigned int tid = get_local_id(0);
|
|
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
|
|
|
|
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
|
if (i + get_local_size(0) < n)
|
|
sdata[tid] += g_idata[i+get_local_size(0)];
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// do reduction in shared mem
|
|
for(unsigned int s=get_local_size(0)/2; s>0; s>>=1)
|
|
{
|
|
if (tid < s)
|
|
{
|
|
sdata[tid] += sdata[tid + s];
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
|
|
// write result for this block to global mem
|
|
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
|
}
|
|
|
|
/*
|
|
This version unrolls the last warp to avoid synchronization where it
|
|
isn't needed
|
|
*/
|
|
__kernel void reduce4(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
|
|
{
|
|
// perform first level of reduction,
|
|
// reading from global memory, writing to shared memory
|
|
unsigned int tid = get_local_id(0);
|
|
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
|
|
|
|
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
|
if (i + get_local_size(0) < n)
|
|
sdata[tid] += g_idata[i+get_local_size(0)];
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// do reduction in shared mem
|
|
#pragma unroll 1
|
|
for(unsigned int s=get_local_size(0)/2; s>32; s>>=1)
|
|
{
|
|
if (tid < s)
|
|
{
|
|
sdata[tid] += sdata[tid + s];
|
|
}
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
}
|
|
|
|
if (tid < 32)
|
|
{
|
|
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
|
|
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
|
|
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; }
|
|
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; }
|
|
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; }
|
|
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; }
|
|
}
|
|
|
|
// write result for this block to global mem
|
|
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
|
}
|
|
|
|
/*
|
|
This version is completely unrolled. It uses a template parameter to achieve
|
|
optimal code for any (power of 2) number of threads. This requires a switch
|
|
statement in the host code to handle all the different thread block sizes at
|
|
compile time.
|
|
*/
|
|
__kernel void reduce5(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
|
|
{
|
|
// perform first level of reduction,
|
|
// reading from global memory, writing to shared memory
|
|
unsigned int tid = get_local_id(0);
|
|
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
|
|
|
|
sdata[tid] = (i < n) ? g_idata[i] : 0;
|
|
if (i + blockSize < n)
|
|
sdata[tid] += g_idata[i+blockSize];
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// do reduction in shared mem
|
|
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
|
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
|
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
|
|
|
if (tid < 32)
|
|
{
|
|
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
|
|
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
|
|
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; }
|
|
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; }
|
|
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; }
|
|
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; }
|
|
}
|
|
|
|
// write result for this block to global mem
|
|
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
|
}
|
|
|
|
/*
|
|
This version adds multiple elements per thread sequentially. This reduces the overall
|
|
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
|
|
(Brent's Theorem optimization)
|
|
*/
|
|
__kernel void reduce6(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
|
|
{
|
|
// perform first level of reduction,
|
|
// reading from global memory, writing to shared memory
|
|
unsigned int tid = get_local_id(0);
|
|
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
|
|
unsigned int gridSize = blockSize*2*get_num_groups(0);
|
|
sdata[tid] = 0;
|
|
|
|
// we reduce multiple elements per thread. The number is determined by the
|
|
// number of active thread blocks (via gridDim). More blocks will result
|
|
// in a larger gridSize and therefore fewer elements per thread
|
|
while (i < n)
|
|
{
|
|
sdata[tid] += g_idata[i];
|
|
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
|
|
if (nIsPow2 || i + blockSize < n)
|
|
sdata[tid] += g_idata[i+blockSize];
|
|
i += gridSize;
|
|
}
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// do reduction in shared mem
|
|
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
|
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
|
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
|
|
|
|
if (tid < 32)
|
|
{
|
|
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
|
|
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
|
|
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; }
|
|
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; }
|
|
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; }
|
|
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; }
|
|
}
|
|
|
|
// write result for this block to global mem
|
|
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
|
|
}
|
|
|
|
#endif // #ifndef _REDUCE_KERNEL_H_
|