Taskflow 3.2.0-Master-Branch
reduce.hpp
#pragma once

#include "../cudaflow.hpp"

namespace tf::detail {

// ----------------------------------------------------------------------------
// reduction helper functions
// ----------------------------------------------------------------------------

template<unsigned nt, typename T>
struct cudaBlockReduce {

  static const unsigned group_size = std::min(nt, CUDA_WARP_SIZE);
  static const unsigned num_passes = log2(group_size);
  static const unsigned num_items = nt / group_size;

  static_assert(
    nt && (0 == nt % CUDA_WARP_SIZE),
    "cudaBlockReduce requires num threads to be a multiple of warp_size (32)"
  );

  struct Storage {
    T data[std::max(nt, 2 * group_size)];
  };

  template<typename op_t>
  __device__ T operator()(unsigned, T, Storage&, unsigned, op_t, bool = true) const;
};

// function: block-wide reduction, called collectively by all threads of a block
template<unsigned nt, typename T>
template<typename op_t>
__device__ T cudaBlockReduce<nt, T>::operator ()(
  unsigned tid, T x, Storage& storage, unsigned count, op_t op, bool ret
) const {

  // store the thread-local value into shared memory
  storage.data[tid] = x;
  __syncthreads();

  // each thread of the first group reduces the values strided across its lane
  if(tid < group_size) {
    cuda_strided_iterate<group_size, num_items>([&](auto i, auto j) {
      if(i > 0) {
        x = op(x, storage.data[j]);
      }
    }, tid, count);
    storage.data[tid] = x;
  }
  __syncthreads();

  auto count2 = count < group_size ? count : group_size;
  auto first = (1 & num_passes) ? group_size : 0;
  if(tid < group_size) {
    storage.data[first + tid] = x;
  }
  __syncthreads();

  // tree reduction among the first group_size threads, ping-ponging between
  // the two halves of shared memory so that the result lands in data[0]
  cuda_iterate<num_passes>([&](auto pass) {
    if(tid < group_size) {
      if(auto offset = 1 << pass; tid + offset < count2) {
        x = op(x, storage.data[first + offset + tid]);
      }
      first = group_size - first;
      storage.data[first + tid] = x;
    }
    __syncthreads();
  });

  // optionally broadcast the result to all threads
  if(ret) {
    x = storage.data[0];
    __syncthreads();
  }
  return x;
}
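
// Usage sketch (illustrative only, not part of the library): a kernel in
// which each block reduces its nt inputs into one output value; the kernel
// name and indexing are hypothetical.
//
//   template <unsigned nt, typename T, typename O>
//   __global__ void block_reduce_kernel(const T* in, T* out, O op) {
//     __shared__ typename cudaBlockReduce<nt, T>::Storage shm;
//     T x = in[blockIdx.x * nt + threadIdx.x];
//     // with ret == true (the default), all threads receive the result
//     x = cudaBlockReduce<nt, T>()(threadIdx.x, x, shm, nt, op);
//     if(threadIdx.x == 0) {
//       out[blockIdx.x] = x;
//     }
//   }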

// Function: cuda_reduce_loop
template <typename P, typename I, typename T, typename O>
void cuda_reduce_loop(
  P&& p, I input, unsigned count, T* res, O op, void* ptr
) {

  using U = typename std::iterator_traits<I>::value_type;
  using E = std::decay_t<P>;

  auto buf = static_cast<U*>(ptr);
  auto B = (count + E::nv - 1) / E::nv;

  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
    __shared__ typename cudaBlockReduce<E::nt, U>::Storage shm;
    auto tile = cuda_get_tile(bid, E::nv, count);
    auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
      input + tile.begin, tid, tile.count()
    );
    // reduce multiple values per thread into a scalar
    U s;
    cuda_strided_iterate<E::nt, E::vt>(
      [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count()
    );
    // reduce to a scalar per block
    s = cudaBlockReduce<E::nt, U>()(
      tid, s, shm, (tile.count() < E::nt ? tile.count() : E::nt), op, false
    );
    // the final pass combines with *res; earlier passes emit block partials
    if(!tid) {
      (1 == B) ? *res = op(*res, s) : buf[bid] = s;
    }
  });

  if(B > 1) {
    cuda_reduce_loop(p, buf, B, res, op, buf+B);
  }
}
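
// Note on the recursion above: each pass launches B = ceil(count / E::nv)
// blocks; when B > 1, every block stores its partial result in buf and the
// loop recurses on those B partials using the remaining buffer space
// (buf + B), until a single block suffices and the result reaches res.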

// Function: cuda_uninitialized_reduce_loop
template <typename P, typename I, typename T, typename O>
void cuda_uninitialized_reduce_loop(
  P&& p, I input, unsigned count, T* res, O op, void* ptr
) {

  using U = typename std::iterator_traits<I>::value_type;
  using E = std::decay_t<P>;

  auto buf = static_cast<U*>(ptr);
  auto B = (count + E::nv - 1) / E::nv;

  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
    __shared__ typename cudaBlockReduce<E::nt, U>::Storage shm;
    auto tile = cuda_get_tile(bid, E::nv, count);
    auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
      input + tile.begin, tid, tile.count()
    );
    // reduce multiple values per thread into a scalar
    U s;
    cuda_strided_iterate<E::nt, E::vt>(
      [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count()
    );
    // reduce to a scalar per block
    s = cudaBlockReduce<E::nt, U>()(
      tid, s, shm, (tile.count() < E::nt ? tile.count() : E::nt), op, false
    );
    // unlike cuda_reduce_loop, the final result overwrites *res rather than
    // combining it with an initial value
    if(!tid) {
      (1 == B) ? *res = s : buf[bid] = s;
    }
  });

  if(B > 1) {
    cuda_uninitialized_reduce_loop(p, buf, B, res, op, buf+B);
  }
}

} // namespace tf::detail ----------------------------------------------------

namespace tf {

// Function: cuda_reduce_buffer_size
// queries the buffer size in bytes needed to call reduce kernels
template <typename P, typename T>
unsigned cuda_reduce_buffer_size(unsigned count) {
  using E = std::decay_t<P>;
  unsigned B = (count + E::nv - 1) / E::nv;
  unsigned n = 0;
  for(auto b=B; b>1; n += (b=(b+E::nv-1)/E::nv));
  return n*sizeof(T);
}
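
// Worked trace of the loop above, assuming a hypothetical policy with
// E::nv = 512 and count = 1'000'000: B = ceil(1'000'000/512) = 1954; the
// loop then accumulates ceil(1954/512) = 4 and ceil(4/512) = 1 before b
// reaches 1, so the function returns 5 * sizeof(T).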

// ----------------------------------------------------------------------------
// cuda_reduce
// ----------------------------------------------------------------------------

// Function: cuda_reduce
// performs asynchronous parallel reduction over a range of items
template <typename P, typename I, typename T, typename O>
void cuda_reduce(
  P&& p, I first, I last, T* res, O op, void* buf
) {
  unsigned count = std::distance(first, last);
  if(count == 0) {
    return;
  }
  detail::cuda_reduce_loop(p, first, count, res, op, buf);
}
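
// Usage sketch (illustrative only, assuming a device array data of N ints,
// a device-accessible result res already holding the initial value, and a
// stream my_stream; all names are hypothetical):
//
//   cudaDefaultExecutionPolicy policy(my_stream);
//   auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, int>(N);
//   void* buf;
//   cudaMalloc(&buf, bufsz);
//   cuda_reduce(policy, data, data + N, res,
//     [] __device__ (int a, int b) { return a + b; }, buf
//   );
//   cudaStreamSynchronize(my_stream);
//   cudaFree(buf);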

// ----------------------------------------------------------------------------
// cuda_uninitialized_reduce
// ----------------------------------------------------------------------------

// Function: cuda_uninitialized_reduce
// performs asynchronous parallel reduction over a range of items without an
// initial value
template <typename P, typename I, typename T, typename O>
void cuda_uninitialized_reduce(
  P&& p, I first, I last, T* res, O op, void* buf
) {
  unsigned count = std::distance(first, last);
  if(count == 0) {
    return;
  }
  detail::cuda_uninitialized_reduce_loop(p, first, count, res, op, buf);
}

// ----------------------------------------------------------------------------
// cuda_transform_reduce
// ----------------------------------------------------------------------------

// Function: cuda_transform_reduce
// performs asynchronous parallel reduction over a range of transformed items
template<typename P, typename I, typename T, typename O, typename U>
void cuda_transform_reduce(
  P&& p, I first, I last, T* res, O bop, U uop, void* buf
) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // reduction loop over the unary-transformed inputs
  detail::cuda_reduce_loop(p,
    cuda_make_load_iterator<T>([=]__device__(auto i){
      return uop(*(first+i));
    }),
    count, res, bop, buf
  );
}
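
// Usage sketch (illustrative only): sum of squares over N device floats;
// policy, data, res, and buf are hypothetical and set up as for cuda_reduce.
//
//   cuda_transform_reduce(policy, data, data + N, res,
//     [] __device__ (float a, float b) { return a + b; },  // binary op
//     [] __device__ (float x) { return x * x; },           // unary op
//     buf
//   );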

// ----------------------------------------------------------------------------
// cuda_transform_uninitialized_reduce
// ----------------------------------------------------------------------------

// Function: cuda_transform_uninitialized_reduce
// performs asynchronous parallel reduction over a range of transformed items
// without an initial value
template<typename P, typename I, typename T, typename O, typename U>
void cuda_transform_uninitialized_reduce(
  P&& p, I first, I last, T* res, O bop, U uop, void* buf
) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // reduction loop over the unary-transformed inputs
  //detail::cuda_transform_reduce_loop(
  //  p, first, count, res, bop, uop, false, s, buf
  //);
  detail::cuda_uninitialized_reduce_loop(p,
    cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
    count, res, bop, buf
  );
}

// ----------------------------------------------------------------------------

//template <typename T, typename C>
//__device__ void cuda_warp_reduce(
//  volatile T* shm, size_t N, size_t tid, C op
//) {
//  if(tid + 32 < N) shm[tid] = op(shm[tid], shm[tid+32]);
//  if(tid + 16 < N) shm[tid] = op(shm[tid], shm[tid+16]);
//  if(tid + 8 < N) shm[tid] = op(shm[tid], shm[tid+8]);
//  if(tid + 4 < N) shm[tid] = op(shm[tid], shm[tid+4]);
//  if(tid + 2 < N) shm[tid] = op(shm[tid], shm[tid+2]);
//  if(tid + 1 < N) shm[tid] = op(shm[tid], shm[tid+1]);
//}
//
//template <typename I, typename T, typename C, bool uninitialized>
//__global__ void cuda_reduce(I first, size_t N, T* res, C op) {
//
//  size_t tid = threadIdx.x;
//
//  if(tid >= N) {
//    return;
//  }
//
//  cudaSharedMemory<T> shared_memory;
//  T* shm = shared_memory.get();
//
//  shm[tid] = *(first+tid);
//
//  for(size_t i=tid+blockDim.x; i<N; i+=blockDim.x) {
//    shm[tid] = op(shm[tid], *(first+i));
//  }
//
//  __syncthreads();
//
//  for(size_t s = blockDim.x / 2; s > 32; s >>= 1) {
//    if(tid < s && tid + s < N) {
//      shm[tid] = op(shm[tid], shm[tid+s]);
//    }
//    __syncthreads();
//  }
//
//  if(tid < 32) {
//    cuda_warp_reduce(shm, N, tid, op);
//  }
//
//  if(tid == 0) {
//    if constexpr (uninitialized) {
//      *res = shm[0];
//    }
//    else {
//      *res = op(*res, shm[0]);
//    }
//  }
//}

// ----------------------------------------------------------------------------
// cudaFlowCapturer
// ----------------------------------------------------------------------------

// Function: reduce
// captures kernels that perform parallel reduction over a range of items
template <typename I, typename T, typename C>
cudaTask cudaFlowCapturer::reduce(I first, I last, T* result, C c) {

  // TODO
  auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
    std::distance(first, last)
  );

  return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
    (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_reduce(p, first, last, result, c, buf.get().data());
  });
}
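
// Usage sketch (illustrative only): create a capturer task that reduces N
// device integers into res; taskflow, gpu_data, N, and res are hypothetical.
//
//   tf::Taskflow taskflow;
//   taskflow.emplace([=](tf::cudaFlowCapturer& cap){
//     cap.reduce(gpu_data, gpu_data + N, res,
//       [] __device__ (int a, int b) { return a + b; }
//     );
//   });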

// Function: uninitialized_reduce
// similar to reduce but does not assume any initial value in *result
template <typename I, typename T, typename C>
cudaTask cudaFlowCapturer::uninitialized_reduce(I first, I last, T* result, C c) {

  // TODO
  auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
    std::distance(first, last)
  );

  return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
    (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_uninitialized_reduce(p, first, last, result, c, buf.get().data());
  });
}

// Function: transform_reduce
// captures kernels that perform parallel reduction over a range of
// transformed items
template <typename I, typename T, typename C, typename U>
cudaTask cudaFlowCapturer::transform_reduce(
  I first, I last, T* result, C bop, U uop
) {

  // TODO
  auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
    std::distance(first, last)
  );

  return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
    (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_transform_reduce(
      p, first, last, result, bop, uop, buf.get().data()
    );
  });
}

// Function: transform_uninitialized_reduce
// similar to transform_reduce but does not assume any initial value in *result
template <typename I, typename T, typename C, typename U>
cudaTask cudaFlowCapturer::transform_uninitialized_reduce(
  I first, I last, T* result, C bop, U uop) {

  // TODO
  auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
    std::distance(first, last)
  );

  return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
    (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_transform_uninitialized_reduce(
      p, first, last, result, bop, uop, buf.get().data()
    );
  });
}

// Function: reduce
template <typename I, typename T, typename C>
void cudaFlowCapturer::reduce(
  cudaTask task, I first, I last, T* result, C c
) {

  // TODO
  auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
    std::distance(first, last)
  );

  on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
    (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_reduce(p, first, last, result, c, buf.get().data());
  });
}

// Function: uninitialized_reduce
template <typename I, typename T, typename C>
void cudaFlowCapturer::uninitialized_reduce(
  cudaTask task, I first, I last, T* result, C c
) {
  // TODO
  auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
    std::distance(first, last)
  );

  on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
    (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_uninitialized_reduce(p, first, last, result, c, buf.get().data());
  });
}

// Function: transform_reduce
template <typename I, typename T, typename C, typename U>
void cudaFlowCapturer::transform_reduce(
  cudaTask task, I first, I last, T* result, C bop, U uop
) {

  // TODO
  auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
    std::distance(first, last)
  );

  on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
    (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_transform_reduce(
      p, first, last, result, bop, uop, buf.get().data()
    );
  });
}

// Function: transform_uninitialized_reduce
template <typename I, typename T, typename C, typename U>
void cudaFlowCapturer::transform_uninitialized_reduce(
  cudaTask task, I first, I last, T* result, C bop, U uop
) {

  // TODO
  auto bufsz = cuda_reduce_buffer_size<cudaDefaultExecutionPolicy, T>(
    std::distance(first, last)
  );

  on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
    (cudaStream_t stream) mutable {
    cudaDefaultExecutionPolicy p(stream);
    cuda_transform_uninitialized_reduce(
      p, first, last, result, bop, uop, buf.get().data()
    );
  });
}


// ----------------------------------------------------------------------------
// cudaFlow
// ----------------------------------------------------------------------------

// Function: reduce
// performs parallel reduction over a range of items
template <typename I, typename T, typename B>
cudaTask cudaFlow::reduce(I first, I last, T* result, B bop) {
  return capture([=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.reduce(first, last, result, bop);
  });
}
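
// Usage sketch (illustrative only): the same reduction expressed through a
// cudaFlow task; taskflow, gpu_data, N, and res are hypothetical.
//
//   taskflow.emplace([=](tf::cudaFlow& cf){
//     cf.reduce(gpu_data, gpu_data + N, res,
//       [] __device__ (int a, int b) { return a + b; }
//     );
//   });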

// Function: uninitialized_reduce
template <typename I, typename T, typename B>
cudaTask cudaFlow::uninitialized_reduce(I first, I last, T* result, B bop) {
  return capture([=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.uninitialized_reduce(first, last, result, bop);
  });
}

// Function: transform_reduce
template <typename I, typename T, typename B, typename U>
cudaTask cudaFlow::transform_reduce(I first, I last, T* result, B bop, U uop) {
  return capture([=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.transform_reduce(first, last, result, bop, uop);
  });
}

// Function: transform_uninitialized_reduce
template <typename I, typename T, typename B, typename U>
cudaTask cudaFlow::transform_uninitialized_reduce(
  I first, I last, T* result, B bop, U uop
) {
  return capture([=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.transform_uninitialized_reduce(first, last, result, bop, uop);
  });
}

// Function: reduce
template <typename I, typename T, typename C>
void cudaFlow::reduce(cudaTask task, I first, I last, T* result, C op) {
  capture(task, [=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.reduce(first, last, result, op);
  });
}

// Function: uninitialized_reduce
template <typename I, typename T, typename C>
void cudaFlow::uninitialized_reduce(
  cudaTask task, I first, I last, T* result, C op
) {
  capture(task, [=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.uninitialized_reduce(first, last, result, op);
  });
}

// Function: transform_reduce
template <typename I, typename T, typename B, typename U>
void cudaFlow::transform_reduce(
  cudaTask task, I first, I last, T* result, B bop, U uop
) {
  capture(task, [=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.transform_reduce(first, last, result, bop, uop);
  });
}

// Function: transform_uninitialized_reduce
template <typename I, typename T, typename B, typename U>
void cudaFlow::transform_uninitialized_reduce(
  cudaTask task, I first, I last, T* result, B bop, U uop
) {
  capture(task, [=](cudaFlowCapturer& cap){
    cap.make_optimizer<cudaLinearCapturing>();
    cap.transform_uninitialized_reduce(first, last, result, bop, uop);
  });
}

} // end of namespace tf -----------------------------------------------------