// NOTE(review): this page is a text extraction of a rendered source listing —
// the digits fused onto line starts (17, 20, ...) are original source line
// numbers, and many interior lines are missing throughout. Code below is
// fragmentary; comments describe only what is visible.
// Tile counts above this threshold select the recursive multi-block scan path
// (see the `B > cudaScanRecursionThreshold` branch further down this file).
17inline constexpr unsigned cudaScanRecursionThreshold = 8;
// Scan flavor selector: EXCLUSIVE vs INCLUSIVE (enumerator list not visible
// in this extraction).
20enum class cudaScanType :
int {
// Result of a block scan: the per-thread scan value(s) plus (presumably) the
// tile-wide reduction — the `reduction` member is assigned later in this file
// but its declaration line is missing from this extraction.
// Primary template (vt == 0): scalar scan value per thread.
26template<
typename T,
unsigned vt = 0,
bool is_array = (vt > 0)>
27struct cudaScanResult {
// Specialization for vt > 0 (is_array == true): each thread carries an array
// of vt scanned values, used by the vectorized operator() below.
33template<
typename T,
unsigned vt>
34struct cudaScanResult<T, vt, true> {
35 cudaArray<T, vt> scan;
// Block-wide scan primitive: nt cooperating threads over element type T.
// (The `struct cudaBlockScan {` line itself is missing from this extraction.)
42template<
unsigned nt,
typename T>
// Storage geometry: one warp partial per CUDA_WARP_SIZE threads; num_passes
// up-sweep iterations; capacity covers thread slots plus warp slots.
// NOTE(review): `log2(nt)` here is presumably a constexpr helper, not the
// <cmath> function — TODO confirm against the missing surrounding code.
45 const static unsigned num_warps = nt / CUDA_WARP_SIZE;
46 const static unsigned num_passes = log2(nt);
47 const static unsigned capacity = nt + num_warps;
// Anonymous struct laying out per-thread and per-warp slots — likely one
// member of a shared-memory storage union; the enclosing declaration is not
// visible here.
52 struct { T threads[nt], warps[num_warps]; };
// Scalar overload: scans one value per thread; defaults to exclusive scan.
56 template<
typename op_t>
57 __device__ cudaScanResult<T> operator ()(
64 cudaScanType type = cudaScanType::EXCLUSIVE
// Vectorized overload: scans vt values per thread; optional carry-in value
// (use_carry_in defaults to false); defaults to exclusive scan.
69 template<
unsigned vt,
typename op_t>
70 __device__ cudaScanResult<T, vt> operator()(
75 bool use_carry_in =
false,
79 cudaScanType type = cudaScanType::EXCLUSIVE
// Out-of-class definition of the scalar block scan. Visible shape: stage the
// value in shared storage, run num_passes combine steps (a Hillis–Steele
// style inclusive scan), then derive the requested inclusive/exclusive
// per-thread result and the tile reduction.
84template <
unsigned nt,
typename T>
85template<
typename op_t>
86__device__ cudaScanResult<T> cudaBlockScan<nt, T>::operator () (
87 unsigned tid, T x, storage_t& storage,
unsigned count, op_t op,
88 T init, cudaScanType type
// Stage this thread's value. NOTE(review): `first` is defined on a line
// missing from this extraction — presumably a pass-dependent base offset
// into storage.data; confirm against the full source.
92 storage.data[first + tid] = x;
// Up-sweep: on pass p, combine with the partner `1<<p` slots back.
// Synchronization between passes is on lines not shown here.
95 cuda_iterate<num_passes>([&](
auto pass) {
96 if(
auto offset = 1<<pass; tid >= offset) {
97 x = op(storage.data[first + tid - offset], x);
100 storage.data[first + tid] = x;
// Tile reduction = last valid element. Per-thread scan value: inclusive x,
// or for exclusive scans the predecessor's slot (init for tid == 0);
// threads at or beyond `count` take the (not visible) else branch.
104 cudaScanResult<T> result;
105 result.reduction = storage.data[first +
count - 1];
106 result.scan = (tid <
count) ?
107 (cudaScanType::INCLUSIVE == type ? x :
108 (tid ? storage.data[first + tid - 1] : init)) :
// Vectorized block scan: (1) each thread sequentially scans its vt local
// values, (2) the per-thread totals are scanned exclusively across the block
// via the scalar overload, (3) a downsweep rebuilds the final per-element
// outputs y[i] from both.
116template <
unsigned nt,
typename T>
117template<
unsigned vt,
typename op_t>
118__device__ cudaScanResult<T, vt> cudaBlockScan<nt, T>::operator()(
124 unsigned count, op_t op,
// Full-tile fast path: all nt*vt slots valid — unguarded sequential scan.
130 if(count >= nt * vt) {
131 cuda_iterate<vt>([&](
auto i) {
132 x[i] = i ? op(x[i], x[i - 1]) : x[i];
// Partial tile: guard each element index against `count`; out-of-range
// slots propagate the running value or take `init`.
135 cuda_iterate<vt>([&](
auto i) {
136 auto index = vt * tid + i;
138 ((index <
count) ? op(x[i], x[i - 1]) : x[i - 1]) :
139 (x[i] = (index <
count) ? x[i] : init);
// Exclusive block scan of the per-thread totals (x[vt - 1]); the effective
// count is the number of participating threads, ceil(count / vt).
144 auto result = operator()(
145 tid, x[vt - 1], storage,
146 (count + vt - 1) / vt, op, init, cudaScanType::EXCLUSIVE
// Fold an external carry-in into the reduction and the thread offsets;
// thread 0's offset becomes the carry itself. After this, every thread
// except tid 0 has a valid carry for the downsweep.
152 result.reduction = op(carry_in, result.reduction);
153 result.scan = tid ? op(carry_in, result.scan) : carry_in;
155 use_carry_in = tid > 0;
// Downsweep: emit exclusive (predecessor + offset) or inclusive
// (local value + offset) results per element.
159 cuda_iterate<vt>([&](
auto i) {
160 if(cudaScanType::EXCLUSIVE == type) {
161 y[i] = i ? x[i - 1] : result.scan;
162 if(use_carry_in && i > 0) y[i] = op(result.scan, y[i]);
164 y[i] = use_carry_in ? op(x[i], result.scan) : x[i];
168 return cudaScanResult<T, vt> { y, result.reduction };
// Single-block scan: one CUDA block iterates over the whole input in
// E::nv-sized tiles, carrying the running reduction (`carry_in`, declared on
// a line not shown here) from tile to tile. Used as the recursion base case.
175template <
typename P,
typename I,
typename O,
typename C>
176void cuda_single_pass_scan(
178 cudaScanType scan_type,
// Launch exactly one block of E::nt threads on the policy's stream.
190 cuda_kernel<<<1, E::nt, 0, p.stream()>>>([=] __device__ (
auto tid,
auto bid) {
192 using scan_t = cudaBlockScan<E::nt, T>;
195 typename scan_t::storage_t scan;
// Tile loop: load tile to registers, block-scan it (carry-in applies to
// every tile after the first: `cur > 0`), store, propagate the reduction.
200 for(
unsigned cur = 0; cur <
count; cur += E::nv) {
202 auto count2 =
min(count - cur, E::nv);
204 auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(input + cur,
205 tid, count2, shared.values);
207 auto result = scan_t()(tid, x, shared.scan,
208 carry_in, cur > 0, count2, op, T(), scan_type);
211 cuda_reg_to_mem_thread<E::nt, E::vt>(result.scan, tid, count2,
212 output + cur, shared.values);
215 carry_in = result.reduction;
// Recursive scan driver (presumably cuda_scan_loop — the signature line,
// original line ~231, is missing from this extraction). Small inputs fall
// through to cuda_single_pass_scan; otherwise: per-tile upsweep reductions
// into `buffer`, a recursive EXCLUSIVE scan of those partials, then a
// downsweep pass using buffer[bid] as each tile's carry-in.
229template<
typename P,
typename I,
typename O,
typename C>
232 cudaScanType scan_type,
// Reinterpret the caller's scratch area as T; B = number of E::nv tiles.
244 T* buffer =
static_cast<T*
>(ptr);
247 unsigned B = (
count + E::nv - 1) / E::nv;
249 if(B > cudaScanRecursionThreshold) {
// Upsweep kernel: each of the B blocks reduces its tile into buffer[bid].
255 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (
auto tid,
auto bid) {
257 __shared__
typename cudaBlockReduce<E::nt, T>::Storage shm;
260 auto tile = cuda_get_tile(bid, E::nv, count);
261 auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
262 input + tile.begin, tid, tile.count()
// Thread-local sequential reduce of the strided-loaded values.
267 cuda_strided_iterate<E::nt, E::vt>(
268 [&] (
auto i,
auto j) { scalar = i ? op(scalar, x[i]) : x[0]; },
273 auto all_reduce = cudaBlockReduce<E::nt, T>()(
274 tid, scalar, shm, tile.count(), op
279 buffer[bid] = all_reduce;
// Recurse on the B tile partials (in-place, scratch continues at buffer+B).
// The call's name/line is missing here — presumably a self-recursion.
286 p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, buffer+B
// Downsweep kernel: rescan each tile, feeding buffer[bid] as carry-in for
// every block after the first (bid > 0), with the caller's scan_type.
291 cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (
auto tid,
auto bid) {
293 using scan_t = cudaBlockScan<E::nt, T>;
296 typename scan_t::storage_t scan;
301 auto tile = cuda_get_tile(bid, E::nv, count);
302 auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(
303 input + tile.begin, tid, tile.count(), shared.values
307 auto y = scan_t()(tid, x, shared.scan,
308 buffer[bid], bid > 0, tile.count(), op, T(),
312 cuda_reg_to_mem_thread<E::nt, E::vt>(
313 y, tid, tile.count(), output + tile.begin, shared.values
// Base case: few enough tiles — run the single-block, single-pass scan.
319 cuda_single_pass_scan(p, scan_type, input, count, output, op);
// Scratch-size query (per the tooltip text later in this page: returns the
// buffer size in bytes needed by the scan kernels). Visible logic: start
// with B = ceil(count / E::nv) tile partials, and accumulate one level of
// partials per recursion step until the tile count drops to the threshold.
339template <
typename P,
typename T>
342 unsigned B = (count + E::nv - 1) / E::nv;
344 for(
auto b=B; b>detail::cudaScanRecursionThreshold; b=(b+E::nv-1)/E::nv) {
// Public entry point: asynchronous inclusive scan of [first, last) into
// output on execution policy p, using caller-provided scratch `buf`
// (sized via cuda_scan_buffer_size). Delegates to the detail scan driver.
393template<
typename P,
typename I,
typename O,
typename C>
395 P&& p, I first, I last, O output, C op,
void* buf
405 detail::cuda_scan_loop(
406 p, detail::cudaScanType::INCLUSIVE, first, count, output, op, buf
// Public entry point: inclusive scan over uop(*(first+i)) — the unary
// transform is fused into the load via a transforming load iterator, so no
// intermediate buffer of transformed values is materialized.
460template<
typename P,
typename I,
typename O,
typename C,
typename U>
462 P&& p, I first, I last, O output, C bop, U uop,
void* buf
474 detail::cuda_scan_loop(
475 p, detail::cudaScanType::INCLUSIVE,
476 cuda_make_load_iterator<T>([=]__device__(
auto i){
return uop(*(first+i)); }),
477 count, output, bop, buf
// Public entry point: asynchronous exclusive scan of [first, last) into
// output. Identical shape to cuda_inclusive_scan, differing only in the
// cudaScanType passed to the detail driver.
525template<
typename P,
typename I,
typename O,
typename C>
527 P&& p, I first, I last, O output, C op,
void* buf
537 detail::cuda_scan_loop(
538 p, detail::cudaScanType::EXCLUSIVE, first, count, output, op, buf
// Public entry point: exclusive scan over uop(*(first+i)), fusing the unary
// transform into the load iterator — the exclusive counterpart of
// cuda_transform_inclusive_scan above.
592template<
typename P,
typename I,
typename O,
typename C,
typename U>
594 P&& p, I first, I last, O output, C bop, U uop,
void* buf
606 detail::cuda_scan_loop(
607 p, detail::cudaScanType::EXCLUSIVE,
608 cuda_make_load_iterator<T>([=]__device__(
auto i){
return uop(*(first+i)); }),
609 count, output, bop, buf
// Member declarations (original lines 618-689); the function names and bodies
// were stripped by the extraction. Per the tooltip text later in this page
// these correspond to inclusive_scan / exclusive_scan (original lines 618-651,
// each in a creating variant and a cudaTask-updating variant) ...
618template <
typename I,
typename O,
typename C>
627template <
typename I,
typename O,
typename C>
636template <
typename I,
typename O,
typename C>
645template <
typename I,
typename O,
typename C>
// ... and transform_inclusive_scan / transform_exclusive_scan (original lines
// 654-689), which additionally take a binary op B and a unary transform U.
// The variants whose first parameter is `cudaTask task` rebind an existing
// task rather than creating a new one — TODO confirm against the full source.
654template <
typename I,
typename O,
typename B,
typename U>
656 I first, I last, O output, B bop, U uop
665template <
typename I,
typename O,
typename B,
typename U>
667 cudaTask task, I first, I last, O output, B bop, U uop
676template <
typename I,
typename O,
typename B,
typename U>
678 I first, I last, O output, B bop, U uop
687template <
typename I,
typename O,
typename B,
typename U>
689 cudaTask task, I first, I last, O output, B bop, U uop
// Capturer-side member definitions (original lines 702-852). Common shape in
// every fragment below: compute the scratch size with cuda_scan_buffer_size
// under the default execution policy, then capture a stream callable that
// owns a cudaDeviceVector<std::byte> scratch buffer (moved into the lambda
// via the MoC move-on-copy wrapper, hence `mutable`) and runs the
// corresponding cuda_*_scan free function on the given stream.
244template <
typename I,
typename O,
typename C>
707 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
711 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
712 (cudaStream_t stream)
mutable {
// Task-updating variant: same capture, rebinding an existing cudaTask.
719template <
typename I,
typename O,
typename C>
721 cudaTask task, I first, I last, O output, C op
726 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
730 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
731 (cudaStream_t stream)
mutable {
// Exclusive-scan pair (creating + task-updating), per the tooltip text.
738template <
typename I,
typename O,
typename C>
743 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
747 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
748 (cudaStream_t stream)
mutable {
755template <
typename I,
typename O,
typename C>
757 cudaTask task, I first, I last, O output, C op
762 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
766 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
767 (cudaStream_t stream)
mutable {
// Transform-inclusive pair: forwards bop/uop plus the owned scratch buffer.
774template <
typename I,
typename O,
typename B,
typename U>
776 I first, I last, O output, B bop, U uop
781 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
785 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
786 (cudaStream_t stream)
mutable {
789 p, first, last, output, bop, uop, buf.get().data()
795template <
typename I,
typename O,
typename B,
typename U>
797 cudaTask task, I first, I last, O output, B bop, U uop
802 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
806 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
807 (cudaStream_t stream)
mutable {
810 p, first, last, output, bop, uop, buf.get().data()
// Transform-exclusive pair: same structure as the inclusive pair above.
816template <
typename I,
typename O,
typename B,
typename U>
818 I first, I last, O output, B bop, U uop
823 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
827 return on([=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
828 (cudaStream_t stream)
mutable {
831 p, first, last, output, bop, uop, buf.get().data()
837template <
typename I,
typename O,
typename B,
typename U>
839 cudaTask task, I first, I last, O output, B bop, U uop
844 auto bufsz = cuda_scan_buffer_size<cudaDefaultExecutionPolicy, T>(
848 on(task, [=, buf=MoC{cudaDeviceVector<std::byte>(bufsz)}]
849 (cudaStream_t stream)
mutable {
852 p, first, last, output, bop, uop, buf.get().data()
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlowCapturer::inclusive_scan but excludes the first value
Definition scan.hpp:739
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
captures kernels that perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:775
cudaTask inclusive_scan(I first, I last, O output, C op)
captures kernels that perform parallel inclusive scan over a range of items
Definition scan.hpp:703
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlowCapturer::transform_inclusive_scan but excludes the first value
Definition scan.hpp:817
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask on(C &&callable)
captures sequential CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask inclusive_scan(I first, I last, O output, C op)
creates a task to perform parallel inclusive scan over a range of items
Definition scan.hpp:619
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
creates a task to perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:655
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlow::inclusive_scan but excludes the first value
Definition scan.hpp:637
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlow::transform_inclusive_scan but excludes the first value
Definition scan.hpp:677
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
taskflow namespace
Definition small_vector.hpp:27
void cuda_inclusive_scan(P &&p, I first, I last, O output, C op, void *buf)
performs asynchronous inclusive scan over a range of items
Definition scan.hpp:394
void cuda_transform_exclusive_scan(P &&p, I first, I last, O output, C bop, U uop, void *buf)
performs asynchronous exclusive scan over a range of transformed items
Definition scan.hpp:593
void cuda_exclusive_scan(P &&p, I first, I last, O output, C op, void *buf)
performs asynchronous exclusive scan over a range of items
Definition scan.hpp:526
void cuda_transform_inclusive_scan(P &&p, I first, I last, O output, C bop, U uop, void *buf)
performs asynchronous inclusive scan over a range of transformed items
Definition scan.hpp:461
unsigned cuda_scan_buffer_size(unsigned count)
queries the buffer size in bytes needed to call scan kernels
Definition scan.hpp:340
cuda reduce algorithms include file