3#include "../cudaflow.hpp"
// cudaBlockReduce<nt, T>: block-wide reduction functor over nt threads.
// NOTE(review): this span is damaged text extraction -- the original source
// line numbers ("17", "18", "20", ...) are fused onto the code, and interior
// lines are missing (e.g. the `static_assert(` opener before the nt check,
// the `struct Storage {` wrapper around `data`, and the closing braces).
// Restore this block from the original reduce.hpp before compiling.
17template<
unsigned nt,
typename T>
18struct cudaBlockReduce {
// the per-step reduction group is capped at one warp
20 static const unsigned group_size =
std::min(nt, CUDA_WARP_SIZE);
// number of tree-reduction passes over group_size values
21 static const unsigned num_passes = log2(group_size);
// values each group thread folds serially in the strided pre-pass
22 static const unsigned num_items = nt / group_size;
// condition/message of a static_assert whose opening token was lost in extraction
25 nt && (0 == nt % CUDA_WARP_SIZE),
26 "cudaBlockReduce requires num threads to be a multiple of warp_size (32)"
// shared scratch array -- presumably a member of a nested Storage struct
// (that declaration line is missing here); sized to allow the ping-pong
// buffer halves used by the pass loop in operator()
31 T data[
std::max(nt, 2 * group_size)];
// reduce `count` staged values with the binary op; out-of-line definition below
34 template<
typename op_t>
35 __device__ T operator()(
unsigned, T, Storage&,
unsigned, op_t,
bool =
true)
const;
// Out-of-line definition of cudaBlockReduce<nt, T>::operator().
// NOTE(review): incomplete extraction -- the block-synchronization barriers,
// closing braces/parentheses, and the final return statement are missing
// from this span; the fused numeric prefixes are the original file's line
// numbers. Do not treat the visible statements as the full body.
39template<
unsigned nt,
typename T>
40template<
typename op_t>
41__device__ T cudaBlockReduce<nt, T>::operator ()(
42 unsigned tid, T x, Storage& storage,
unsigned count, op_t op,
bool ret
// stage every thread's value into shared storage
46 storage.data[tid] = x;
// first group_size threads fold the strided values into their own partial
49 if(tid < group_size) {
51 cuda_strided_iterate<group_size, num_items>([&](
auto i,
auto j) {
53 x = op(x, storage.data[j]);
56 storage.data[tid] = x;
// clamp the number of live elements to the group width
60 auto count2 =
count < group_size ?
count : group_size;
// starting half of the ping-pong buffer; chosen from the parity of
// num_passes so the final write lands at offset 0
61 auto first = (1 & num_passes) ? group_size : 0;
62 if(tid < group_size) {
63 storage.data[first + tid] = x;
// tree reduction: partner stride is 1 << pass, alternating buffer halves
67 cuda_iterate<num_passes>([&](
auto pass) {
68 if(tid < group_size) {
69 if(
auto offset = 1 << pass; tid + offset < count2) {
70 x = op(x, storage.data[first + offset + tid]);
// flip to the other half of the double buffer for the next pass
72 first = group_size - first;
73 storage.data[first + tid] = x;
// Fragment of detail::cuda_reduce_loop: recursive multi-pass device
// reduction that honors an initial value -- on the final pass it combines
// with the existing contents of *res via op(*res, s).
// NOTE(review): incomplete extraction -- the function name/return-type line,
// the declaration of the accumulator `s` and of the types U/E, several
// closing braces/parentheses, and the single-pass early exit are missing.
86template <
typename P,
typename I,
typename T,
typename O>
88 P&& p, I input,
unsigned count, T* res, O op,
void* ptr
// caller-provided scratch space holds the per-block partial results
94 auto buf =
static_cast<U*
>(ptr);
// number of tiles / thread blocks: ceil(count / values-per-block)
95 auto B = (
count + E::nv - 1) / E::nv;
97 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (
auto tid,
auto bid) {
// shared scratch for the block-wide reduction
98 __shared__
typename cudaBlockReduce<E::nt, U>::Storage shm;
99 auto tile = cuda_get_tile(bid, E::nv, count);
// load this block's tile into registers, strided across threads
100 auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
101 input + tile.begin, tid, tile.count()
// per-thread serial fold over the register values
105 cuda_strided_iterate<E::nt, E::vt>(
106 [&] (
auto i,
auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count()
// block-wide reduction of the per-thread partials
109 s = cudaBlockReduce<E::nt, U>()(
110 tid, s, shm, (tile.count() < E::nt ? tile.count() : E::nt), op,
false
// single block: fold into the existing *res; otherwise stash the partial
113 (1 == B) ? *res = op(*res, s) : buf[bid] = s;
// recurse over the B partials, using the tail of the scratch buffer
118 cuda_reduce_loop(p, buf, B, res, op, buf+B);
// Fragment of detail::cuda_uninitialized_reduce_loop: identical structure to
// cuda_reduce_loop above except the final pass OVERWRITES *res (`*res = s`)
// instead of combining with it -- i.e. no initial value is assumed.
// NOTE(review): incomplete extraction -- the declarations of `s`, U and E,
// closing braces/parentheses, and the single-pass early exit are missing.
123template <
typename P,
typename I,
typename T,
typename O>
124void cuda_uninitialized_reduce_loop(
125 P&& p, I input,
unsigned count, T* res, O op,
void* ptr
// caller-provided scratch space holds the per-block partial results
131 auto buf =
static_cast<U*
>(ptr);
// number of tiles / thread blocks: ceil(count / values-per-block)
132 auto B = (
count + E::nv - 1) / E::nv;
134 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (
auto tid,
auto bid) {
// shared scratch for the block-wide reduction
135 __shared__
typename cudaBlockReduce<E::nt, U>::Storage shm;
136 auto tile = cuda_get_tile(bid, E::nv, count);
// load this block's tile into registers, strided across threads
137 auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
138 input + tile.begin, tid, tile.count()
// per-thread serial fold over the register values
142 cuda_strided_iterate<E::nt, E::vt>(
143 [&] (
auto i,
auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count()
// block-wide reduction of the per-thread partials
146 s = cudaBlockReduce<E::nt, U>()(
147 tid, s, shm, (tile.count() < E::nt ? tile.count() : E::nt), op,
false
// single block: write the result directly; otherwise stash the partial
150 (1 == B) ? *res = s : buf[bid] = s;
// recurse over the B partials, using the tail of the scratch buffer
155 cuda_uninitialized_reduce_loop(p, buf, B, res, op, buf+B);
// Fragment of cuda_reduce_buffer_size (per the tooltip text later in this
// file): sums the partial-result counts produced by each recursive pass of
// the reduce loops above -- a shrinking geometric series -- to size the
// scratch buffer.
// NOTE(review): incomplete extraction -- the function signature and the
// declarations/initialization of `count` and `n`, plus the returned
// byte-size expression, are missing from this span.
175template <
typename P,
typename T>
178 unsigned B = (count + E::nv - 1) / E::nv;
// accumulate the number of partials emitted by each recursive pass
180 for(
auto b=B; b>1; n += (b=(b+E::nv-1)/E::nv));
211template <
typename P,
typename I,
typename T,
typename O>
213 P&& p, I first, I last, T* res, O op,
void* buf
219 detail::cuda_reduce_loop(p, first, count, res, op, buf);
252template <
typename P,
typename I,
typename T,
typename O>
254 P&& p, I first, I last, T* res, O op,
void* buf
260 detail::cuda_uninitialized_reduce_loop(p, first, count, res, op, buf);
293template<
typename P,
typename I,
typename T,
typename O,
typename U>
295 P&& p, I first, I last, T* res, O bop, U uop,
void* buf
305 detail::cuda_reduce_loop(p,
306 cuda_make_load_iterator<T>([=]__device__(
auto i){
307 return uop(*(first+i));
345template<
typename P,
typename I,
typename T,
typename O,
typename U>
347 P&& p, I first, I last, T* res, O bop, U uop,
void* buf
360 detail::cuda_uninitialized_reduce_loop(p,
361 cuda_make_load_iterator<T>([=]__device__(
auto i){
return uop(*(first+i)); }),
426template <
typename I,
typename T,
typename C>
430 auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
434 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
435 (cudaStream_t stream)
mutable {
437 cuda_reduce(p, first, last, result, c, buf.get().data());
442template <
typename I,
typename T,
typename C>
446 auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
450 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
451 (cudaStream_t stream)
mutable {
458template <
typename I,
typename T,
typename C,
typename U>
460 I first, I last, T* result, C bop, U uop
464 auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
468 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
469 (cudaStream_t stream)
mutable {
472 p, first, last, result, bop, uop, buf.get().data()
478template <
typename I,
typename T,
typename C,
typename U>
480 I first, I last, T* result, C bop, U uop) {
483 auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
487 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
488 (cudaStream_t stream)
mutable {
491 p, first, last, result, bop, uop, buf.get().data()
497template <
typename I,
typename T,
typename C>
499 cudaTask task, I first, I last, T* result, C c
503 auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
507 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
508 (cudaStream_t stream)
mutable {
510 cuda_reduce(p, first, last, result, c, buf.get().data());
515template <
typename I,
typename T,
typename C>
517 cudaTask task, I first, I last, T* result, C c
520 auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
524 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
525 (cudaStream_t stream)
mutable {
532template <
typename I,
typename T,
typename C,
typename U>
534 cudaTask task, I first, I last, T* result, C bop, U uop
538 auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
542 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
543 (cudaStream_t stream)
mutable {
546 p, first, last, result, bop, uop, buf.get().data()
552template <
typename I,
typename T,
typename C,
typename U>
554 cudaTask task, I first, I last, T* result, C bop, U uop
558 auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
562 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
563 (cudaStream_t stream)
mutable {
566 p, first, last, result, bop, uop, buf.get().data()
577template <
typename I,
typename T,
typename B>
581 cap.
reduce(first, last, result, bop);
586template <
typename I,
typename T,
typename B>
595template <
typename I,
typename T,
typename B,
typename U>
604template <
typename I,
typename T,
typename B,
typename U>
606 I first, I last, T* result, B bop, U uop
615template <
typename I,
typename T,
typename C>
619 cap.
reduce(first, last, result, op);
624template <
typename I,
typename T,
typename C>
626 cudaTask task, I first, I last, T* result, C op
635template <
typename I,
typename T,
typename B,
typename U>
637 cudaTask task, I first, I last, T* result, B bop, U uop
646template <
typename I,
typename T,
typename B,
typename U>
648 cudaTask task, I first, I last, T* result, B bop, U uop
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask transform_reduce(I first, I last, T *result, C bop, U uop)
captures kernels that perform parallel reduction over a range of transformed items
Definition reduce.hpp:459
cudaTask reduce(I first, I last, T *result, C op)
captures kernels that perform parallel reduction over a range of items
Definition reduce.hpp:427
cudaTask transform_uninitialized_reduce(I first, I last, T *result, C bop, U uop)
similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce
Definition reduce.hpp:479
cudaTask uninitialized_reduce(I first, I last, T *result, C op)
similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce
Definition reduce.hpp:443
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask on(C &&callable)
captures a sequential CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask transform_reduce(I first, I last, T *result, B bop, U uop)
performs parallel reduction over a range of transformed items
Definition reduce.hpp:596
cudaTask uninitialized_reduce(I first, I last, T *result, B bop)
similar to tf::cudaFlow::reduce but does not assume any initial value to reduce
Definition reduce.hpp:587
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask reduce(I first, I last, T *result, B bop)
performs parallel reduction over a range of items
Definition reduce.hpp:578
cudaTask transform_uninitialized_reduce(I first, I last, T *result, B bop, U uop)
similar to tf::cudaFlow::transform_reduce but does not assume any initial value to reduce
Definition reduce.hpp:605
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
taskflow namespace
Definition small_vector.hpp:27
unsigned cuda_reduce_buffer_size(unsigned count)
queries the buffer size in bytes needed to call reduce kernels
Definition reduce.hpp:176
void cuda_transform_reduce(P &&p, I first, I last, T *res, O bop, U uop, void *buf)
performs asynchronous parallel reduction over a range of transformed items with an initial value
Definition reduce.hpp:294
void cuda_uninitialized_reduce(P &&p, I first, I last, T *res, O op, void *buf)
performs asynchronous parallel reduction over a range of items without an initial value
Definition reduce.hpp:253
void cuda_transform_uninitialized_reduce(P &&p, I first, I last, T *res, O bop, U uop, void *buf)
performs asynchronous parallel reduction over a range of transformed items without an initial value
Definition reduce.hpp:346
void cuda_reduce(P &&p, I first, I last, T *res, O op, void *buf)
performs asynchronous parallel reduction over a range of items
Definition reduce.hpp:212