3 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA 5 #include <thrust/detail/config.h> 6 #include <thrust/detail/seq.h> 7 #include <thrust/detail/temporary_array.h> 8 #include <thrust/detail/type_traits.h> 9 #include <thrust/execution_policy.h> 10 #include <thrust/reduce.h> 11 #include <thrust/system/cuda/detail/bulk.h> 12 #include <thrust/system/cuda/detail/decomposition.h> 13 #include <thrust/system/cuda/detail/execute_on_stream.h> 14 #include <thrust/system/cuda/detail/execution_policy.h> 16 #include <thrust/system/cuda/detail/bulk.h> 18 #include <goofit/detail/ThrustOverrideConfig.h> 21 #define OR_GROUPSIZE 128 25 #define OR_GRAINSIZE 7 29 struct goofit_policy : thrust::device_execution_policy<goofit_policy> {};
31 template <
typename InputIterator,
typename UnaryFunction,
typename OutputType,
typename BinaryFunction>
35 UnaryFunction unary_op,
37 BinaryFunction binary_op) {
39 thrust::transform_iterator<UnaryFunction, InputIterator, OutputType> xfrm_first(first, unary_op);
40 thrust::transform_iterator<UnaryFunction, InputIterator, OutputType> xfrm_last(last, unary_op);
42 return thrust::reduce(exec, xfrm_first, xfrm_last, init, binary_op);
45 template <
typename InputIterator,
typename OutputType,
typename BinaryFunction>
46 __host__ __device__ OutputType
47 reduce(
goofit_policy &exec, InputIterator first, InputIterator last, OutputType init, BinaryFunction binary_op) {
48 typedef typename thrust::iterator_difference<InputIterator>::type size_type;
50 const size_type n = last - first;
56 cudaStream_t s = stream(thrust::detail::derived_cast(exec));
65 const size_type tile_size = groupsize * grainsize;
66 const size_type num_tiles = (n + tile_size - 1) / tile_size;
67 const size_type subscription = 10;
69 thrust::system::cuda::detail::bulk_::concurrent_group<thrust::system::cuda::detail::bulk_::agent<grainsize>,
73 const size_type num_groups = thrust::min<size_type>(subscription * g.hardware_concurrency(), num_tiles);
75 thrust::system::cuda::detail::aligned_decomposition<size_type> decomp(n, num_groups, tile_size);
77 thrust::detail::temporary_array<OutputType, goofit_policy> partial_sums(exec, decomp.size());
80 thrust::system::cuda::detail::bulk_::async(thrust::system::cuda::detail::bulk_::par(s, g, decomp.size()),
81 thrust::system::cuda::detail::reduce_detail::reduce_partitions(),
82 thrust::system::cuda::detail::bulk_::root.this_exec,
90 if(partial_sums.size() > 1) {
92 thrust::system::cuda::detail::bulk_::async(thrust::system::cuda::detail::bulk_::par(s, g, 1),
93 thrust::system::cuda::detail::reduce_detail::reduce_partitions(),
94 thrust::system::cuda::detail::bulk_::root.this_exec,
101 return get_value(exec, &partial_sums[0]);
__host__ __device__ OutputType transform_reduce(goofit_policy &exec, InputIterator first, InputIterator last, UnaryFunction unary_op, OutputType init, BinaryFunction binary_op)
__host__ __device__ OutputType reduce(goofit_policy &exec, InputIterator first, InputIterator last, OutputType init, BinaryFunction binary_op)