3#include "../cudaflow.hpp"
15template <
typename P,
typename I,
typename C>
16void cuda_for_each_loop(P&& p, I first,
unsigned count, C c) {
20 unsigned B = (
count + E::nv - 1) / E::nv;
22 cuda_kernel<<<B, E::nt, 0, p.stream()>>>(
23 [=] __device__ (
auto tid,
auto bid) {
24 auto tile = cuda_get_tile(bid, E::nv, count);
25 cuda_strided_iterate<E::nt, E::vt>([=](
auto,
auto j) {
26 c(*(first + tile.begin + j));
27 }, tid, tile.count());
32template <
typename P,
typename I,
typename C>
33void cuda_for_each_index_loop(
34 P&& p, I first, I inc,
unsigned count, C c
39 unsigned B = (
count + E::nv - 1) / E::nv;
41 cuda_kernel<<<B, E::nt, 0, p.stream()>>>(
42 [=]__device__(
auto tid,
auto bid) {
43 auto tile = cuda_get_tile(bid, E::nv, count);
44 cuda_strided_iterate<E::nt, E::vt>([=]__device__(
auto,
auto j) {
45 c(first + inc*(tile.begin+j));
46 }, tid, tile.count());
68template <
typename P,
typename C>
70 cuda_kernel<<<1, 1, 0, p.stream()>>>(
71 [=]__device__(
auto,
auto)
mutable { c(); }
96template <
typename P,
typename I,
typename C>
105 detail::cuda_for_each_loop(p, first, count, c);
137template <
typename P,
typename I,
typename C>
140 if(is_range_invalid(first, last, inc)) {
141 TF_THROW(
"invalid range [", first,
", ", last,
") with inc size ", inc);
144 unsigned count = distance(first, last, inc);
150 detail::cuda_for_each_index_loop(p, first, inc, count, c);
170 return kernel(1, 1, 0, cuda_single_task<C>, c);
176 return kernel(task, 1, 1, 0, cuda_single_task<C>, c);
180template <
typename I,
typename C>
189template <
typename I,
typename C>
198template <
typename I,
typename C>
207template <
typename I,
typename C>
220template <
typename I,
typename C>
222 return on([=](cudaStream_t stream)
mutable {
229template <
typename I,
typename C>
231 return on([=] (cudaStream_t stream)
mutable {
238template <
typename I,
typename C>
240 on(task, [=](cudaStream_t stream)
mutable {
247template <
typename I,
typename C>
249 cudaTask task, I beg, I end, I inc, C c
251 on(task, [=] (cudaStream_t stream)
mutable {
260 return on([=] (cudaStream_t stream)
mutable {
269 on(task, [=] (cudaStream_t stream)
mutable {
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask for_each(I first, I last, C callable)
captures a kernel that applies a callable to each dereferenced element of the data array
Definition for_each.hpp:221
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask single_task(C c)
captures a kernel that runs the given callable with only one thread
Definition for_each.hpp:259
cudaTask on(C &&callable)
captures a sequence of CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask for_each_index(I first, I last, I step, C callable)
captures a kernel that applies a callable to each index in the range with the given step size
Definition for_each.hpp:230
cudaTask for_each(I first, I last, C callable)
applies a callable to each dereferenced element of the data array
Definition for_each.hpp:181
cudaTask for_each_index(I first, I last, I step, C callable)
applies a callable to each index in the range with the given step size
Definition for_each.hpp:190
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
creates a kernel task
Definition cudaflow.hpp:1272
cudaTask single_task(C c)
runs a callable with only a single kernel thread
Definition for_each.hpp:169
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
taskflow namespace
Definition small_vector.hpp:27
void cuda_for_each_index(P &&p, I first, I last, I inc, C c)
performs asynchronous parallel iterations over an index-based range of items
Definition for_each.hpp:138
void cuda_single_task(P &&p, C c)
runs a callable asynchronously using one kernel thread
Definition for_each.hpp:69
void cuda_for_each(P &&p, I first, I last, C c)
performs asynchronous parallel iterations over a range of items
Definition for_each.hpp:97