Taskflow 3.2.0-Master-Branch
scan.hpp
1#pragma once
2
3#include "reduce.hpp"
4
10namespace tf::detail {
11
12// ----------------------------------------------------------------------------
13// scan
14// ----------------------------------------------------------------------------
15
17inline constexpr unsigned cudaScanRecursionThreshold = 8;
18
20enum class cudaScanType : int {
21 EXCLUSIVE = 1,
22 INCLUSIVE
23};
24
26template<typename T, unsigned vt = 0, bool is_array = (vt > 0)>
27struct cudaScanResult {
28 T scan;
29 T reduction;
30};
31
33template<typename T, unsigned vt>
34struct cudaScanResult<T, vt, true> {
35 cudaArray<T, vt> scan;
36 T reduction;
37};
38
39//-----------------------------------------------------------------------------
40
42template<unsigned nt, typename T>
43struct cudaBlockScan {
44
45 const static unsigned num_warps = nt / CUDA_WARP_SIZE;
46 const static unsigned num_passes = log2(nt);
47 const static unsigned capacity = nt + num_warps;
48
50 union storage_t {
51 T data[2 * nt];
52 struct { T threads[nt], warps[num_warps]; };
53 };
54
55 // standard scan
56 template<typename op_t>
57 __device__ cudaScanResult<T> operator ()(
58 unsigned tid,
59 T x,
60 storage_t& storage,
61 unsigned count = nt,
62 op_t op = op_t(),
63 T init = T(),
64 cudaScanType type = cudaScanType::EXCLUSIVE
65 ) const;
66
67 // vectorized scan. accepts multiple values per thread and adds in
68 // optional global carry-in
69 template<unsigned vt, typename op_t>
70 __device__ cudaScanResult<T, vt> operator()(
71 unsigned tid,
72 cudaArray<T, vt> x,
73 storage_t& storage,
74 T carry_in = T(),
75 bool use_carry_in = false,
76 unsigned count = nt,
77 op_t op = op_t(),
78 T init = T(),
79 cudaScanType type = cudaScanType::EXCLUSIVE
80 ) const;
81};
82
83// standard scan
84template <unsigned nt, typename T>
85template<typename op_t>
86__device__ cudaScanResult<T> cudaBlockScan<nt, T>::operator () (
87 unsigned tid, T x, storage_t& storage, unsigned count, op_t op,
88 T init, cudaScanType type
89) const {
90
91 unsigned first = 0;
92 storage.data[first + tid] = x;
93 __syncthreads();
94
95 cuda_iterate<num_passes>([&](auto pass) {
96 if(auto offset = 1<<pass; tid >= offset) {
97 x = op(storage.data[first + tid - offset], x);
98 }
99 first = nt - first;
100 storage.data[first + tid] = x;
101 __syncthreads();
102 });
103
104 cudaScanResult<T> result;
105 result.reduction = storage.data[first + count - 1];
106 result.scan = (tid < count) ?
107 (cudaScanType::INCLUSIVE == type ? x :
108 (tid ? storage.data[first + tid - 1] : init)) :
109 result.reduction;
110 __syncthreads();
111
112 return result;
113}
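// Editorial note (not part of the original source): a small worked trace of
// the double-buffered pass above, assuming nt = 4 and an addition operator,
// so num_passes = log2(4) = 2.
//
//   input x per thread:   [a,  b,    c,      d      ]
//   pass 0 (offset 1):    [a,  a+b,  b+c,    c+d    ]
//   pass 1 (offset 2):    [a,  a+b,  a+b+c,  a+b+c+d]
//
// After the passes, x holds the inclusive scan; an EXCLUSIVE request shifts
// the result right by one and substitutes init for thread 0, while
// result.reduction picks up the total of the first `count` elements.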
114
115// vectorized scan block
116template <unsigned nt, typename T>
117template<unsigned vt, typename op_t>
118__device__ cudaScanResult<T, vt> cudaBlockScan<nt, T>::operator()(
119 unsigned tid,
120 cudaArray<T, vt> x,
121 storage_t& storage,
122 T carry_in,
123 bool use_carry_in,
124 unsigned count, op_t op,
125 T init,
126 cudaScanType type
127) const {
128
129 // Start with an inclusive scan of the in-range elements.
130 if(count >= nt * vt) {
131 cuda_iterate<vt>([&](auto i) {
132 x[i] = i ? op(x[i], x[i - 1]) : x[i];
133 });
134 } else {
135 cuda_iterate<vt>([&](auto i) {
136 auto index = vt * tid + i;
137 x[i] = i ?
138 ((index < count) ? op(x[i], x[i - 1]) : x[i - 1]) :
139 (x[i] = (index < count) ? x[i] : init);
140 });
141 }
142
143 // Scan the thread-local reductions for a carry-in for each thread.
144 auto result = operator()(
145 tid, x[vt - 1], storage,
146 (count + vt - 1) / vt, op, init, cudaScanType::EXCLUSIVE
147 );
148
149 // Perform the scan downsweep and add both the global carry-in and the
150 // thread carry-in to the values.
151 if(use_carry_in) {
152 result.reduction = op(carry_in, result.reduction);
153 result.scan = tid ? op(carry_in, result.scan) : carry_in;
154 } else {
155 use_carry_in = tid > 0;
156 }
157
158 cudaArray<T, vt> y;
159 cuda_iterate<vt>([&](auto i) {
160 if(cudaScanType::EXCLUSIVE == type) {
161 y[i] = i ? x[i - 1] : result.scan;
162 if(use_carry_in && i > 0) y[i] = op(result.scan, y[i]);
163 } else {
164 y[i] = use_carry_in ? op(x[i], result.scan) : x[i];
165 }
166 });
167
168 return cudaScanResult<T, vt> { y, result.reduction };
169}
170
175template <typename P, typename I, typename O, typename C>
176void cuda_single_pass_scan(
177 P&& p,
178 cudaScanType scan_type,
179 I input,
180 unsigned count,
181 O output,
182 C op
183 //reduction_it reduction,
184) {
185
186 using T = typename std::iterator_traits<O>::value_type;
187 using E = std::decay_t<P>;
188
189 // Small input specialization. This is the non-recursive branch.
190 cuda_kernel<<<1, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
191
192 using scan_t = cudaBlockScan<E::nt, T>;
193
194 __shared__ union {
195 typename scan_t::storage_t scan;
196 T values[E::nv];
197 } shared;
198
199 auto carry_in = T();
200 for(unsigned cur = 0; cur < count; cur += E::nv) {
201 // Cooperatively load values into register.
202 auto count2 = min(count - cur, E::nv);
203
204 auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(input + cur,
205 tid, count2, shared.values);
206
207 auto result = scan_t()(tid, x, shared.scan,
208 carry_in, cur > 0, count2, op, T(), scan_type);
209
210 // Store the scanned values back to global memory.
211 cuda_reg_to_mem_thread<E::nt, E::vt>(result.scan, tid, count2,
212 output + cur, shared.values);
213
214 // Roll the reduction into carry_in.
215 carry_in = result.reduction;
216 }
217
218 // Store the carry-out to the reduction pointer. This may be a
219 // discard_iterator_t if no reduction is wanted.
220 //if(!tid) *reduction = carry_in;
221 });
222}
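// Editorial note (not part of the original source): the single-pass kernel
// above serializes the tiles inside one block. With a hypothetical tile size
// E::nv = 1024 and count = 3000, the loop runs for cur = 0, 1024, 2048 with
// count2 = 1024, 1024, 952; each iteration scans its tile in registers and
// forwards result.reduction as carry_in to the next tile, so later tiles
// receive the running total of everything scanned before them.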
223
229template<typename P, typename I, typename O, typename C>
230void cuda_scan_loop(
231 P&& p,
232 cudaScanType scan_type,
233 I input,
234 unsigned count,
235 O output,
236 C op,
237 //reduction_it reduction,
238 void* ptr
239) {
240
241 using E = std::decay_t<P>;
242 using T = typename std::iterator_traits<O>::value_type;
243
244 T* buffer = static_cast<T*>(ptr);
245
246 //launch_t::cta_dim(context).B(count);
247 unsigned B = (count + E::nv - 1) / E::nv;
248
249 if(B > cudaScanRecursionThreshold) {
250
251 //cudaDeviceVector<T> partials(B);
252 //auto buffer = partials.data();
253
254 // upsweep phase
255 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
256
257 __shared__ typename cudaBlockReduce<E::nt, T>::Storage shm;
258
259 // Load the tile's data into register.
260 auto tile = cuda_get_tile(bid, E::nv, count);
261 auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
262 input + tile.begin, tid, tile.count()
263 );
264
265 // Reduce the thread's values into a scalar.
266 T scalar;
267 cuda_strided_iterate<E::nt, E::vt>(
268 [&] (auto i, auto j) { scalar = i ? op(scalar, x[i]) : x[0]; },
269 tid, tile.count()
270 );
271
272 // Reduce across all threads.
273 auto all_reduce = cudaBlockReduce<E::nt, T>()(
274 tid, scalar, shm, tile.count(), op
275 );
276
277 // Store the final reduction to the partials.
278 if(!tid) {
279 buffer[bid] = all_reduce;
280 }
281 });
282
283 // recursively call scan
284 //cuda_scan_loop(p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, S);
285 cuda_scan_loop(
286 p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, buffer+B
287 );
288
289 // downsweep: perform an intra-tile scan and add the scan of the partials
290 // as carry-in
291 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
292
293 using scan_t = cudaBlockScan<E::nt, T>;
294
295 __shared__ union {
296 typename scan_t::storage_t scan;
297 T values[E::nv];
298 } shared;
299
300 // Load a tile to register in thread order.
301 auto tile = cuda_get_tile(bid, E::nv, count);
302 auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(
303 input + tile.begin, tid, tile.count(), shared.values
304 );
305
306 // Scan the array with carry-in from the partials.
307 auto y = scan_t()(tid, x, shared.scan,
308 buffer[bid], bid > 0, tile.count(), op, T(),
309 scan_type).scan;
310
311 // Store the scanned values to the output.
312 cuda_reg_to_mem_thread<E::nt, E::vt>(
313 y, tid, tile.count(), output + tile.begin, shared.values
314 );
315 });
316 }
317 // Small input specialization. This is the non-recursive branch.
318 else {
319 cuda_single_pass_scan(p, scan_type, input, count, output, op);
320 }
321}
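// Editorial note (not part of the original source): for large inputs the
// loop above follows the classic three-phase decomposition. The upsweep
// reduces every tile of E::nv elements into buffer[bid]; the recursive call
// turns those B partial sums into an exclusive scan of tile offsets, using
// buffer+B as scratch for the next level; the downsweep then rescans each
// tile and injects buffer[bid] as the per-block carry-in (skipped for bid == 0).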
322
323} // namespace tf::detail ----------------------------------------------------
324
325namespace tf {
326
339template <typename P, typename T>
340unsigned cuda_scan_buffer_size(unsigned count) {
341 using E = std::decay_t<P>;
342 unsigned B = (count + E::nv - 1) / E::nv;
343 unsigned n = 0;
344 for(auto b=B; b>detail::cudaScanRecursionThreshold; b=(b+E::nv-1)/E::nv) {
345 n += b;
346 }
347 return n*sizeof(T);
348}
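// Editorial note (not part of the original source): a worked sizing example
// under a hypothetical tile size of E::nv = 4096 elements. For
// count = 1,000,000, the first level produces B = ceil(1000000/4096) = 245
// partials; 245 > cudaScanRecursionThreshold, so 245 slots are reserved, and
// the next level needs ceil(245/4096) = 1 <= 8, so the loop stops. The
// buffer therefore holds 245 values of T, matching the buffer / buffer+B
// consumption pattern in cuda_scan_loop above.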
349
350// ----------------------------------------------------------------------------
351// inclusive scan
352// ----------------------------------------------------------------------------
353
354//template<typename P, typename I, typename O, typename C>
355//void cuda_inclusive_scan(P&& p, I first, I last, O output, C op) {
356//
357// unsigned count = std::distance(first, last);
358//
359// if(count == 0) {
360// return;
361// }
362//
363// using T = typename std::iterator_traits<O>::value_type;
364//
365// // allocate temporary buffer
366// cudaDeviceVector<std::byte> temp(cuda_scan_buffer_size<P, T>(count));
367//
368// // launch the scan loop
369// detail::cuda_scan_loop(
370// p, detail::cudaScanType::INCLUSIVE, first, count, output, op, temp.data()
371// );
372//
373// // synchronize the execution
374// p.synchronize();
375//}
376
393template<typename P, typename I, typename O, typename C>
394void cuda_inclusive_scan(
395 P&& p, I first, I last, O output, C op, void* buf
396) {
397
398 unsigned count = std::distance(first, last);
399
400 if(count == 0) {
401 return;
402 }
403
404 // launch the scan loop
405 detail::cuda_scan_loop(
406 p, detail::cudaScanType::INCLUSIVE, first, count, output, op, buf
407 );
408}
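// Editorial usage sketch (not part of the original source), mirroring the
// commented-out convenience version above: the caller owns the temporary
// buffer and synchronizes explicitly. The stream, device pointers
// d_in/d_out, and the count N are hypothetical; constructing the policy over
// an existing stream follows the cudaFlowCapturer methods further below.
//
//   tf::cudaDefaultExecutionPolicy p(stream);
//   auto bytes = tf::cuda_scan_buffer_size<tf::cudaDefaultExecutionPolicy, int>(N);
//   tf::cudaDeviceVector<std::byte> temp(bytes);
//   tf::cuda_inclusive_scan(
//     p, d_in, d_in + N, d_out,
//     [] __device__ (int a, int b) { return a + b; },
//     temp.data()
//   );
//   p.synchronize();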
409
410// ----------------------------------------------------------------------------
411// transform inclusive_scan
412// ----------------------------------------------------------------------------
413
414//template<typename P, typename I, typename O, typename C, typename U>
415//void cuda_transform_inclusive_scan(
416// P&& p, I first, I last, O output, C bop, U uop
417//) {
418//
419// unsigned count = std::distance(first, last);
420//
421// if(count == 0) {
422// return;
423// }
424//
425// using T = typename std::iterator_traits<O>::value_type;
426//
427// // allocate temporary buffer
428// cudaDeviceVector<std::byte> temp(cuda_scan_buffer_size<P, T>(count));
429// auto buf = temp.data();
430//
431// // launch the scan loop
432// detail::cuda_scan_loop(
433// p, detail::cudaScanType::INCLUSIVE,
434// cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
435// count, output, bop, buf
436// );
437//
438// // synchronize the execution
439// p.synchronize();
440//}
441
460template<typename P, typename I, typename O, typename C, typename U>
461void cuda_transform_inclusive_scan(
462 P&& p, I first, I last, O output, C bop, U uop, void* buf
463) {
464
465 using T = typename std::iterator_traits<O>::value_type;
466
467 unsigned count = std::distance(first, last);
468
469 if(count == 0) {
470 return;
471 }
472
473 // launch the scan loop
474 detail::cuda_scan_loop(
475 p, detail::cudaScanType::INCLUSIVE,
476 cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
477 count, output, bop, buf
478 );
479}
480
481// ----------------------------------------------------------------------------
482// exclusive scan
483// ----------------------------------------------------------------------------
484
485//template<typename P, typename I, typename O, typename C>
486//void cuda_exclusive_scan(P&& p, I first, I last, O output, C op) {
487//
488// unsigned count = std::distance(first, last);
489//
490// if(count == 0) {
491// return;
492// }
493//
494// using T = typename std::iterator_traits<O>::value_type;
495//
496// // allocate temporary buffer
497// cudaDeviceVector<std::byte> temp(cuda_scan_buffer_size<P, T>(count));
498// auto buf = temp.data();
499//
500// // launch the scan loop
501// detail::cuda_scan_loop(
502// p, detail::cudaScanType::EXCLUSIVE, first, count, output, op, buf
503// );
504//
505// // synchronize the execution
506// p.synchronize();
507//}
508
525template<typename P, typename I, typename O, typename C>
526void cuda_exclusive_scan(
527 P&& p, I first, I last, O output, C op, void* buf
528) {
529
530 unsigned count = std::distance(first, last);
531
532 if(count == 0) {
533 return;
534 }
535
536 // launch the scan loop
537 detail::cuda_scan_loop(
538 p, detail::cudaScanType::EXCLUSIVE, first, count, output, op, buf
539 );
540}
541
542// ----------------------------------------------------------------------------
543// transform exclusive scan
544// ----------------------------------------------------------------------------
545
546//template<typename P, typename I, typename O, typename C, typename U>
547//void cuda_transform_exclusive_scan(
548// P&& p, I first, I last, O output, C bop, U uop
549//) {
550//
551// unsigned count = std::distance(first, last);
552//
553// if(count == 0) {
554// return;
555// }
556//
557// using T = typename std::iterator_traits<O>::value_type;
558//
559// // allocate temporary buffer
560// cudaDeviceVector<std::byte> temp(cuda_scan_buffer_size<P, T>(count));
561// auto buf = temp.data();
562//
563// // launch the scan loop
564// detail::cuda_scan_loop(
565// p, detail::cudaScanType::EXCLUSIVE,
566// cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
567// count, output, bop, buf
568// );
569//
570// // synchronize the execution
571// p.synchronize();
572//}
573
592template<typename P, typename I, typename O, typename C, typename U>
593void cuda_transform_exclusive_scan(
594 P&& p, I first, I last, O output, C bop, U uop, void* buf
595) {
596
597 using T = typename std::iterator_traits<O>::value_type;
598
599 unsigned count = std::distance(first, last);
600
601 if(count == 0) {
602 return;
603 }
604
605 // launch the scan loop
606 detail::cuda_scan_loop(
607 p, detail::cudaScanType::EXCLUSIVE,
608 cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
609 count, output, bop, buf
610 );
611}
612
613// ----------------------------------------------------------------------------
614// cudaFlow
615// ----------------------------------------------------------------------------
616
617// Function: inclusive_scan
618template <typename I, typename O, typename C>
619cudaTask cudaFlow::inclusive_scan(I first, I last, O output, C op) {
620 return capture([=](cudaFlowCapturer& cap) {
621 cap.make_optimizer<cudaLinearCapturing>();
622 cap.inclusive_scan(first, last, output, op);
623 });
624}
625
626// Function: inclusive_scan
627template <typename I, typename O, typename C>
628void cudaFlow::inclusive_scan(cudaTask task, I first, I last, O output, C op) {
629 capture(task, [=](cudaFlowCapturer& cap) {
630 cap.make_optimizer<cudaLinearCapturing>();
631 cap.inclusive_scan(first, last, output, op);
632 });
633}
634
635// Function: exclusive_scan
636template <typename I, typename O, typename C>
637cudaTask cudaFlow::exclusive_scan(I first, I last, O output, C op) {
638 return capture([=](cudaFlowCapturer& cap) {
639 cap.make_optimizer<cudaLinearCapturing>();
640 cap.exclusive_scan(first, last, output, op);
641 });
642}
643
644// Function: exclusive_scan
645template <typename I, typename O, typename C>
646void cudaFlow::exclusive_scan(cudaTask task, I first, I last, O output, C op) {
647 capture(task, [=](cudaFlowCapturer& cap) {
648 cap.make_optimizer<cudaLinearCapturing>();
649 cap.exclusive_scan(first, last, output, op);
650 });
651}
652
653// Function: transform_inclusive_scan
654template <typename I, typename O, typename B, typename U>
655cudaTask cudaFlow::transform_inclusive_scan(
656 I first, I last, O output, B bop, U uop
657) {
658 return capture([=](cudaFlowCapturer& cap) {
659 cap.make_optimizer<cudaLinearCapturing>();
660 cap.transform_inclusive_scan(first, last, output, bop, uop);
661 });
662}
663
664// Function: transform_inclusive_scan
665template <typename I, typename O, typename B, typename U>
666void cudaFlow::transform_inclusive_scan(
667 cudaTask task, I first, I last, O output, B bop, U uop
668) {
669 capture(task, [=](cudaFlowCapturer& cap) {
670 cap.make_optimizer<cudaLinearCapturing>();
671 cap.transform_inclusive_scan(first, last, output, bop, uop);
672 });
673}
674
675// Function: transform_exclusive_scan
676template <typename I, typename O, typename B, typename U>
677cudaTask cudaFlow::transform_exclusive_scan(
678 I first, I last, O output, B bop, U uop
679) {
680 return capture([=](cudaFlowCapturer& cap) {
681 cap.make_optimizer<cudaLinearCapturing>();
682 cap.transform_exclusive_scan(first, last, output, bop, uop);
683 });
684}
685
686// Function: transform_exclusive_scan
687template <typename I, typename O, typename B, typename U>
688void cudaFlow::transform_exclusive_scan(
689 cudaTask task, I first, I last, O output, B bop, U uop
690) {
691 capture(task, [=](cudaFlowCapturer& cap) {
692 cap.make_optimizer<cudaLinearCapturing>();
693 cap.transform_exclusive_scan(first, last, output, bop, uop);
694 });
695}
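// Editorial usage sketch (not part of the original source): the cudaFlow
// methods above are typically reached through a taskflow; the executor,
// device buffers, and N below are hypothetical.
//
//   tf::Executor executor;
//   tf::Taskflow taskflow;
//   taskflow.emplace([=](tf::cudaFlow& cf) {
//     cf.inclusive_scan(
//       d_in, d_in + N, d_out,
//       [] __device__ (int a, int b) { return a + b; }
//     );
//   });
//   executor.run(taskflow).wait();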
696
697// ----------------------------------------------------------------------------
698// cudaFlowCapturer
699// ----------------------------------------------------------------------------
700
701// Function: inclusive_scan
702template <typename I, typename O, typename C>
703cudaTask cudaFlowCapturer::inclusive_scan(I first, I last, O output, C op) {
704
705 using T = typename std::iterator_traits<O>::value_type;
706
707 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
708 std::distance(first, last)
709 );
710
711 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
712 (cudaStream_t stream) mutable {
713 cudaDefaultExecutionPolicy p(stream);
714 cuda_inclusive_scan(p, first, last, output, op, buf.get().data());
715 });
716}
717
718// Function: inclusive_scan
719template <typename I, typename O, typename C>
720void cudaFlowCapturer::inclusive_scan(
721 cudaTask task, I first, I last, O output, C op
722) {
723
724 using T = typename std::iterator_traits<O>::value_type;
725
726 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
727 std::distance(first, last)
728 );
729
730 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
731 (cudaStream_t stream) mutable {
732 cudaDefaultExecutionPolicy p(stream);
733 cuda_inclusive_scan(p, first, last, output, op, buf.get().data());
734 });
735}
736
737// Function: exclusive_scan
738template <typename I, typename O, typename C>
739cudaTask cudaFlowCapturer::exclusive_scan(I first, I last, O output, C op) {
740
741 using T = typename std::iterator_traits<O>::value_type;
742
743 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
744 std::distance(first, last)
745 );
746
747 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
748 (cudaStream_t stream) mutable {
749 cudaDefaultExecutionPolicy p(stream);
750 cuda_exclusive_scan(p, first, last, output, op, buf.get().data());
751 });
752}
753
754// Function: exclusive_scan
755template <typename I, typename O, typename C>
756void cudaFlowCapturer::exclusive_scan(
757 cudaTask task, I first, I last, O output, C op
758) {
759
760 using T = typename std::iterator_traits<O>::value_type;
761
762 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
763 std::distance(first, last)
764 );
765
766 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
767 (cudaStream_t stream) mutable {
768 cudaDefaultExecutionPolicy p(stream);
769 cuda_exclusive_scan(p, first, last, output, op, buf.get().data());
770 });
771}
772
773// Function: transform_inclusive_scan
774template <typename I, typename O, typename B, typename U>
775cudaTask cudaFlowCapturer::transform_inclusive_scan(
776 I first, I last, O output, B bop, U uop
777) {
778
779 using T = typename std::iterator_traits<O>::value_type;
780
781 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
782 std::distance(first, last)
783 );
784
785 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
786 (cudaStream_t stream) mutable {
787 cudaDefaultExecutionPolicy p(stream);
788 cuda_transform_inclusive_scan(
789 p, first, last, output, bop, uop, buf.get().data()
790 );
791 });
792}
793
794// Function: transform_inclusive_scan
795template <typename I, typename O, typename B, typename U>
796void cudaFlowCapturer::transform_inclusive_scan(
797 cudaTask task, I first, I last, O output, B bop, U uop
798) {
799
800 using T = typename std::iterator_traits<O>::value_type;
801
802 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
803 std::distance(first, last)
804 );
805
806 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
807 (cudaStream_t stream) mutable {
808 cudaDefaultExecutionPolicy p(stream);
809 cuda_transform_inclusive_scan(
810 p, first, last, output, bop, uop, buf.get().data()
811 );
812 });
813}
814
815// Function: transform_exclusive_scan
816template <typename I, typename O, typename B, typename U>
817cudaTask cudaFlowCapturer::transform_exclusive_scan(
818 I first, I last, O output, B bop, U uop
819) {
820
821 using T = typename std::iterator_traits<O>::value_type;
822
823 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
824 std::distance(first, last)
825 );
826
827 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
828 (cudaStream_t stream) mutable {
829 cudaDefaultExecutionPolicy p(stream);
830 cuda_transform_exclusive_scan(
831 p, first, last, output, bop, uop, buf.get().data()
832 );
833 });
834}
835
836// Function: transform_exclusive_scan
837template <typename I, typename O, typename B, typename U>
838void cudaFlowCapturer::transform_exclusive_scan(
839 cudaTask task, I first, I last, O output, B bop, U uop
840) {
841
842 using T = typename std::iterator_traits<O>::value_type;
843
844 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
845 std::distance(first, last)
846 );
847
848 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
849 (cudaStream_t stream) mutable {
850 cudaDefaultExecutionPolicy p(stream);
851 cuda_transform_exclusive_scan(
852 p, first, last, output, bop, uop, buf.get().data()
853 );
854 });
855}
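// Editorial usage sketch (not part of the original source): the capturer
// methods above can also be used directly through stream capture; the
// taskflow object, device buffers, and N below are hypothetical.
//
//   taskflow.emplace([=](tf::cudaFlowCapturer& cap) {
//     cap.exclusive_scan(
//       d_in, d_in + N, d_out,
//       [] __device__ (int a, int b) { return a + b; }
//     );
//   });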
856
857
858} // end of namespace tf -----------------------------------------------------
859
860
861
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlowCapturer::inclusive_scan but excludes the first value
Definition scan.hpp:739
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
captures kernels that perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:775
cudaTask inclusive_scan(I first, I last, O output, C op)
captures kernels that perform parallel inclusive scan over a range of items
Definition scan.hpp:703
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlowCapturer::transform_inclusive_scan but excludes the first value
Definition scan.hpp:817
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask on(C &&callable)
captures sequential CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask inclusive_scan(I first, I last, O output, C op)
creates a task to perform parallel inclusive scan over a range of items
Definition scan.hpp:619
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
creates a task to perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:655
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlow::inclusive_scan but excludes the first value
Definition scan.hpp:637
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlow::transform_inclusive_scan but excludes the first value
Definition scan.hpp:677
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
T count(T... args)
T distance(T... args)
T forward(T... args)
T min(T... args)
taskflow namespace
Definition small_vector.hpp:27
void cuda_inclusive_scan(P &&p, I first, I last, O output, C op, void *buf)
performs asynchronous inclusive scan over a range of items
Definition scan.hpp:394
void cuda_transform_exclusive_scan(P &&p, I first, I last, O output, C bop, U uop, void *buf)
performs asynchronous exclusive scan over a range of transformed items
Definition scan.hpp:593
void cuda_exclusive_scan(P &&p, I first, I last, O output, C op, void *buf)
performs asynchronous exclusive scan over a range of items
Definition scan.hpp:526
void cuda_transform_inclusive_scan(P &&p, I first, I last, O output, C bop, U uop, void *buf)
performs asynchronous inclusive scan over a range of transformed items
Definition scan.hpp:461
unsigned cuda_scan_buffer_size(unsigned count)
queries the buffer size in bytes needed to call scan kernels
Definition scan.hpp:340
cuda reduce algorithms include file