/extra/cuda/demos/prefix-sum/prefix-sum.cu

http://github.com/abeaumont/factor · Cuda · 103 lines · 79 code · 24 blank · 0 comment · 12 complexity · bf0aea855a8422e5cfd479c316dce00c MD5 · raw file

  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <cuda_runtime.h>
  4. static const int LOG_BANK_COUNT = 4;
  5. static inline __device__ __host__ unsigned shared_offset(unsigned i)
  6. {
  7. return i + (i >> LOG_BANK_COUNT);
  8. }
  9. static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i)
  10. {
  11. return shared_offset(offset * (2*i + 1) - 1);
  12. }
  13. static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i)
  14. {
  15. return shared_offset(offset * (2*i + 2) - 1);
  16. }
  17. static inline __device__ __host__ unsigned lpot(unsigned x)
  18. {
  19. --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x;
  20. }
  21. template<typename T>
  22. __global__ void prefix_sum_block(T *in, T *out, unsigned n)
  23. {
  24. extern __shared__ T temp[];
  25. int idx = threadIdx.x;
  26. int blocksize = blockDim.x;
  27. temp[shared_offset(idx )] = (idx < n) ? in[idx ] : 0;
  28. temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0;
  29. int offset, d;
  30. for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) {
  31. __syncthreads();
  32. if (idx < d) {
  33. unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
  34. temp[b] += temp[a];
  35. }
  36. }
  37. if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0;
  38. for (d = 1; d <= blocksize; d <<= 1) {
  39. offset >>= 1;
  40. __syncthreads();
  41. if (idx < d) {
  42. unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
  43. unsigned t = temp[a];
  44. temp[a] = temp[b];
  45. temp[b] += t;
  46. }
  47. }
  48. __syncthreads();
  49. if (idx < n) out[idx ] = temp[shared_offset(idx )];
  50. if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)];
  51. }
  52. template<typename T>
  53. void prefix_sum(T *in, T *out, unsigned n)
  54. {
  55. char *device_values;
  56. unsigned n_lpot = lpot(n);
  57. size_t n_pitch;
  58. cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2);
  59. if (error != 0) {
  60. printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2);
  61. exit(1);
  62. }
  63. cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice);
  64. prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>>
  65. ((T*)device_values, (T*)(device_values + n_pitch), n);
  66. cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost);
  67. cudaFree(device_values);
  68. }
  69. int main()
  70. {
  71. sranddev();
  72. static unsigned in_values[1024], out_values[1024];
  73. for (int i = 0; i < 1024; ++i)
  74. in_values[i] = rand() >> 21;
  75. prefix_sum(in_values, out_values, 1024);
  76. for (int i = 0; i < 1024; ++i)
  77. printf("%5d => %5d\n", in_values[i], out_values[i]);
  78. return 0;
  79. }