PageRenderTime 17ms CodeModel.GetById 11ms app.highlight 2ms RepoModel.GetById 2ms app.codeStats 0ms

/extra/cuda/demos/prefix-sum/prefix-sum.cu

http://github.com/abeaumont/factor
CUDA | 103 lines | 79 code | 24 blank | 0 comment | 0 complexity | bf0aea855a8422e5cfd479c316dce00c MD5 | raw file
  1#include <stdio.h>
  2#include <stdlib.h>
  3#include <cuda_runtime.h>
  4
  5static const int LOG_BANK_COUNT = 4;
  6
  7static inline __device__ __host__ unsigned shared_offset(unsigned i)
  8{
  9    return i + (i >> LOG_BANK_COUNT);
 10}
 11
 12static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i)
 13{
 14    return shared_offset(offset * (2*i + 1) - 1);
 15}
 16
 17static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i)
 18{
 19    return shared_offset(offset * (2*i + 2) - 1);
 20}
 21
 22static inline __device__ __host__ unsigned lpot(unsigned x)
 23{
 24    --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x;
 25}
 26
 27template<typename T>
 28__global__ void prefix_sum_block(T *in, T *out, unsigned n)
 29{
 30    extern __shared__ T temp[];
 31
 32    int idx = threadIdx.x;
 33    int blocksize = blockDim.x;
 34
 35    temp[shared_offset(idx            )] = (idx             < n) ? in[idx            ] : 0;
 36    temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0;
 37
 38    int offset, d;
 39    for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) {
 40        __syncthreads();
 41        if (idx < d) {
 42            unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
 43            temp[b] += temp[a];
 44        }
 45    }
 46
 47    if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0;
 48
 49    for (d = 1; d <= blocksize; d <<= 1) {
 50        offset >>= 1;
 51        __syncthreads();
 52
 53        if (idx < d) {
 54            unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
 55            unsigned t = temp[a];
 56            temp[a] = temp[b];
 57            temp[b] += t;
 58        }
 59    }
 60    __syncthreads();
 61
 62    if (idx             < n) out[idx            ] = temp[shared_offset(idx            )];
 63    if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)];
 64}
 65
 66template<typename T>
 67void prefix_sum(T *in, T *out, unsigned n)
 68{
 69    char *device_values;
 70    unsigned n_lpot = lpot(n);
 71    size_t n_pitch;
 72
 73    cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2);
 74    if (error != 0) {
 75        printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2);
 76        exit(1);
 77    }
 78
 79    cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice);
 80
 81    prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>>
 82        ((T*)device_values, (T*)(device_values + n_pitch), n);
 83
 84    cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost);
 85    cudaFree(device_values);
 86}
 87
 88int main()
 89{
 90    sranddev();
 91
 92    static unsigned in_values[1024], out_values[1024];
 93
 94    for (int i = 0; i < 1024; ++i)
 95        in_values[i] = rand() >> 21;
 96
 97    prefix_sum(in_values, out_values, 1024);
 98
 99    for (int i = 0; i < 1024; ++i)
100        printf("%5d => %5d\n", in_values[i], out_values[i]);
101
102    return 0;
103}