
/thrust/detail/backend/cuda/reduce_intervals.inl

https://code.google.com/p/thrust/
Possible License(s): Apache-2.0
/*
 *  Copyright 2008-2011 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
#include <thrust/detail/config.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/detail/minmax.h>
#include <thrust/detail/backend/decompose.h>
#include <thrust/detail/backend/cuda/extern_shared_ptr.h>
#include <thrust/detail/backend/cuda/block/reduce.h>
#include <thrust/detail/backend/cuda/detail/launch_closure.h>
#include <thrust/detail/backend/cuda/detail/launch_calculator.h>

namespace thrust
{
namespace detail
{
namespace backend
{
namespace cuda
{

template <typename InputIterator,
          typename OutputIterator,
          typename BinaryFunction,
          typename Decomposition,
          typename Context>
struct commutative_reduce_intervals_closure
{
  InputIterator  input;
  OutputIterator output;
  BinaryFunction binary_op;
  Decomposition  decomposition;
  unsigned int   shared_array_size;

  typedef Context context_type;
  context_type context;

  commutative_reduce_intervals_closure(InputIterator input, OutputIterator output, BinaryFunction binary_op, Decomposition decomposition, unsigned int shared_array_size, Context context = Context())
    : input(input), output(output), binary_op(binary_op), decomposition(decomposition), shared_array_size(shared_array_size), context(context) {}

  __device__ __thrust_forceinline__
  void operator()(void)
  {
    typedef typename thrust::iterator_value<OutputIterator>::type OutputType;
    thrust::detail::backend::cuda::extern_shared_ptr<OutputType> shared_array;

    typedef typename Decomposition::index_type index_type;

    // this block processes results in [range.begin(), range.end())
    thrust::detail::backend::index_range<index_type> range = decomposition[context.block_index()];

    index_type i = range.begin() + context.thread_index();

    input += i;

    if (range.size() < context.block_dimension())
    {
      // compute reduction with the first shared_array_size threads
      if (context.thread_index() < thrust::min<index_type>(shared_array_size, range.size()))
      {
        OutputType sum = backend::dereference(input);

        i     += shared_array_size;
        input += shared_array_size;

        while (i < range.end())
        {
          OutputType val = backend::dereference(input);

          sum = binary_op(sum, val);

          i     += shared_array_size;
          input += shared_array_size;
        }

        shared_array[context.thread_index()] = sum;
      }
    }
    else
    {
      // compute reduction with all blockDim.x threads
      OutputType sum = backend::dereference(input);

      i     += context.block_dimension();
      input += context.block_dimension();

      while (i < range.end())
      {
        OutputType val = backend::dereference(input);

        sum = binary_op(sum, val);

        i     += context.block_dimension();
        input += context.block_dimension();
      }

      // write first shared_array_size values into shared memory
      if (context.thread_index() < shared_array_size)
        shared_array[context.thread_index()] = sum;

      // accumulate remaining values (if any) to shared memory in stages
      if (context.block_dimension() > shared_array_size)
      {
        unsigned int lb = shared_array_size;
        unsigned int ub = shared_array_size + lb;

        while (lb < context.block_dimension())
        {
          context.barrier();

          if (lb <= context.thread_index() && context.thread_index() < ub)
          {
            OutputType tmp = shared_array[context.thread_index() - lb];
            shared_array[context.thread_index() - lb] = binary_op(tmp, sum);
          }

          lb += shared_array_size;
          ub += shared_array_size;
        }
      }
    }

    context.barrier();
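
    // block-wide reduction of the first min(range.size(), shared_array_size)
    // partial sums; the interval's result ends up in shared_array[0]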
    thrust::detail::backend::cuda::block::reduce_n(context, shared_array, thrust::min<index_type>(range.size(), shared_array_size), binary_op);

    if (context.thread_index() == 0)
    {
      output += context.block_index();
      backend::dereference(output) = shared_array[0];
    }
  }
};
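
// For reference, a minimal sequential sketch of what the closure above computes
// (illustrative only; the loop below is not part of Thrust). Each interval of the
// decomposition is seeded with its first element and the remaining elements are
// folded in with binary_op, so the operator is assumed commutative and associative:
//
//   for (size_t b = 0; b < decomposition.size(); ++b)
//   {
//     index_range r = decomposition[b];
//     OutputType sum = input[r.begin()];
//     for (index_type i = r.begin() + 1; i < r.end(); ++i)
//       sum = binary_op(sum, input[i]);
//     output[b] = sum;   // one result per interval (i.e. per CUDA block)
//   }
//
// The device code reaches the same answer by striding each interval with up to
// min(blockDim.x, shared_array_size) threads and finishing with block::reduce_n.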

__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN

template <typename InputIterator,
          typename OutputIterator,
          typename BinaryFunction,
          typename Decomposition>
void reduce_intervals(InputIterator input,
                      OutputIterator output,
                      BinaryFunction binary_op,
                      Decomposition decomp)
{
  // we're attempting to launch a kernel, assert we're compiling with nvcc
  // ========================================================================
  // X Note to the user: If you've found this line due to a compiler error, X
  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
  // ========================================================================
  THRUST_STATIC_ASSERT( (depend_on_instantiation<InputIterator, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );

  if (decomp.size() == 0)
    return;

  // TODO if (decomp.size() > deviceProperties.maxGridSize[0]) throw cuda exception (or handle general case)

  typedef cuda::detail::blocked_thread_array Context;
  typedef commutative_reduce_intervals_closure<InputIterator,OutputIterator,BinaryFunction,Decomposition,Context> Closure;
  typedef typename thrust::iterator_value<OutputIterator>::type OutputType;

  thrust::detail::backend::cuda::detail::launch_calculator<Closure> calculator;

  thrust::tuple<size_t,size_t,size_t> config = calculator.with_variable_block_size_available_smem();
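
  // the calculator reports (max_blocks, block_size, available shared memory in bytes);
  // only the block size and the per-block shared memory budget are used below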
  //size_t max_blocks = thrust::get<0>(config);
  size_t block_size  = thrust::get<1>(config);
  size_t max_memory  = thrust::get<2>(config);

  // determine shared array size
  size_t shared_array_size  = thrust::min(max_memory / sizeof(OutputType), block_size);
  size_t shared_array_bytes = sizeof(OutputType) * shared_array_size;

  // TODO if (shared_array_size < 1) throw cuda exception "insufficient shared memory"

  Closure closure(input, output, binary_op, decomp, shared_array_size);

  thrust::detail::backend::cuda::detail::launch_closure(closure, decomp.size(), block_size, shared_array_bytes);
}

__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END

} // end namespace cuda
} // end namespace backend
} // end namespace detail
} // end namespace thrust
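
// Usage sketch (illustrative only, not part of this file): reduce_intervals() is
// an internal primitive that writes one partial result per interval of decomp.
// A caller supplies a Decomposition that splits [0, n) into contiguous intervals,
// such as the uniform_decomposition declared in thrust/detail/backend/decompose.h;
// the constructor arguments shown here are assumptions for illustration only:
//
//   // thrust::device_vector<float> data(n);
//   // thrust::device_vector<float> partials(num_intervals);
//   // thrust::detail::backend::uniform_decomposition<unsigned int>
//   //   decomp(n, /*granularity=*/1, /*max_intervals=*/num_intervals);
//   // thrust::detail::backend::cuda::reduce_intervals(
//   //     data.begin(), partials.begin(), thrust::plus<float>(), decomp);
//
// After the launch, partials[b] holds the reduction of interval b; combining the
// partials (e.g. with a second, single-block pass) yields the full reduction.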