GooFit  v2.1.3
ThrustOverride.h
Go to the documentation of this file.
1 #pragma once
2 
3 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
4 
5 #include <thrust/detail/config.h>
6 #include <thrust/detail/seq.h>
7 #include <thrust/detail/temporary_array.h>
8 #include <thrust/detail/type_traits.h>
9 #include <thrust/execution_policy.h>
10 #include <thrust/reduce.h>
11 #include <thrust/system/cuda/detail/bulk.h>
12 #include <thrust/system/cuda/detail/decomposition.h>
13 #include <thrust/system/cuda/detail/execute_on_stream.h>
14 #include <thrust/system/cuda/detail/execution_policy.h>
15 
16 #include <thrust/system/cuda/detail/bulk.h>
17 
18 #include <goofit/detail/ThrustOverrideConfig.h>
19 
// Default launch-tuning constants for the goofit_policy reduce override.
// Either value may be predefined (for example by the build or by the
// configuration header included above); the defaults below apply only when
// no definition is already visible.

// Group size: second template argument of the bulk concurrent_group.
#if !defined(OR_GROUPSIZE)
#define OR_GROUPSIZE 128
#endif

// Grain size: template argument of the bulk agent inside that group.
#if !defined(OR_GRAINSIZE)
#define OR_GRAINSIZE 7
#endif
27 
28 // We are defining a new policy. This will allow us to override the internal thread/block distribution
29 struct goofit_policy : thrust::device_execution_policy<goofit_policy> {};
30 
31 template <typename InputIterator, typename UnaryFunction, typename OutputType, typename BinaryFunction>
32 __host__ __device__ OutputType transform_reduce(goofit_policy &exec,
33  InputIterator first,
34  InputIterator last,
35  UnaryFunction unary_op,
36  OutputType init,
37  BinaryFunction binary_op) {
38  // printf ("goofit transform_reduce\n");
39  thrust::transform_iterator<UnaryFunction, InputIterator, OutputType> xfrm_first(first, unary_op);
40  thrust::transform_iterator<UnaryFunction, InputIterator, OutputType> xfrm_last(last, unary_op);
41 
42  return thrust::reduce(exec, xfrm_first, xfrm_last, init, binary_op);
43 } // end transform_reduce()
44 
45 template <typename InputIterator, typename OutputType, typename BinaryFunction>
46 __host__ __device__ OutputType
47 reduce(goofit_policy &exec, InputIterator first, InputIterator last, OutputType init, BinaryFunction binary_op) {
48  typedef typename thrust::iterator_difference<InputIterator>::type size_type;
49 
50  const size_type n = last - first;
51 
52  if(n <= 0)
53  return init;
54 
55  // We are no longer expecting a default stream to be passed
56  cudaStream_t s = stream(thrust::detail::derived_cast(exec));
57 
58  // OR_GROUPSIZE=128
59  // OR_GRAINSIZE=7
60  // printf ("groupsize:%i grainsize:%i\n", OR_GROUPSIZE, OR_GRAINSIZE);
61  const size_type groupsize = OR_GROUPSIZE;
62  // const size_type groupsize = 128;
63  // const size_type grainsize = 7;
64  const size_type grainsize = OR_GRAINSIZE;
65  const size_type tile_size = groupsize * grainsize;
66  const size_type num_tiles = (n + tile_size - 1) / tile_size;
67  const size_type subscription = 10;
68 
69  thrust::system::cuda::detail::bulk_::concurrent_group<thrust::system::cuda::detail::bulk_::agent<grainsize>,
70  groupsize>
71  g;
72 
73  const size_type num_groups = thrust::min<size_type>(subscription * g.hardware_concurrency(), num_tiles);
74 
75  thrust::system::cuda::detail::aligned_decomposition<size_type> decomp(n, num_groups, tile_size);
76 
77  thrust::detail::temporary_array<OutputType, goofit_policy> partial_sums(exec, decomp.size());
78 
79  // reduce into partial sums
80  thrust::system::cuda::detail::bulk_::async(thrust::system::cuda::detail::bulk_::par(s, g, decomp.size()),
81  thrust::system::cuda::detail::reduce_detail::reduce_partitions(),
82  thrust::system::cuda::detail::bulk_::root.this_exec,
83  first,
84  decomp,
85  partial_sums.begin(),
86  init,
87  binary_op)
88  .wait();
89 
90  if(partial_sums.size() > 1) {
91  // reduce the partial sums
92  thrust::system::cuda::detail::bulk_::async(thrust::system::cuda::detail::bulk_::par(s, g, 1),
93  thrust::system::cuda::detail::reduce_detail::reduce_partitions(),
94  thrust::system::cuda::detail::bulk_::root.this_exec,
95  partial_sums.begin(),
96  partial_sums.end(),
97  partial_sums.begin(),
98  binary_op);
99  } // end if
100 
101  return get_value(exec, &partial_sums[0]);
102 } // end goofit_reduce()
103 
104 #endif
#define OR_GRAINSIZE
__host__ __device__ OutputType transform_reduce(goofit_policy &exec, InputIterator first, InputIterator last, UnaryFunction unary_op, OutputType init, BinaryFunction binary_op)
__host__ __device__ OutputType reduce(goofit_policy &exec, InputIterator first, InputIterator last, OutputType init, BinaryFunction binary_op)
#define OR_GROUPSIZE