Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
for_each.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "../cudaflow.hpp"
4
10namespace tf {
11
12namespace detail {
13
// Launches cuda_kernel on p.stream() so that the callable is applied to
// dereferenced items of the range starting at first.
//
// P     - execution policy type; its decayed type E supplies the constants
//         E::nt (threads per block), E::vt and E::nv (items per tile)
// first - iterator to the first item
// count - number of items in the range
// c     - callable invoked as c(*(first + offset))
//
// Each block obtains its tile from cuda_get_tile(bid, E::nv, count) and hands
// the per-item work to cuda_strided_iterate, which supplies the in-tile
// offset j; the item visited is first + tile.begin + j.
template <typename P, typename I, typename C>
void cuda_for_each_loop(P&& p, I first, unsigned count, C c) {

  using E = std::decay_t<P>;

  // ceil(count / E::nv) computed without forming count + E::nv - 1: that sum
  // wraps around in unsigned arithmetic when count is within E::nv of the
  // unsigned maximum, which would silently launch far too few blocks
  unsigned B = count / E::nv + (count % E::nv != 0 ? 1u : 0u);

  cuda_kernel<<<B, E::nt, 0, p.stream()>>>(
    [=] __device__ (auto tid, auto bid) {
      auto tile = cuda_get_tile(bid, E::nv, count);
      // the inner lambda is annotated __device__ explicitly, matching
      // cuda_for_each_index_loop
      cuda_strided_iterate<E::nt, E::vt>([=] __device__ (auto, auto j) {
        c(*(first + tile.begin + j));
      }, tid, tile.count());
    }
  );
}
30
// Launches cuda_kernel on p.stream() so that the callable is applied to
// indices of the form first + inc*k.
//
// P     - execution policy type; its decayed type E supplies the constants
//         E::nt (threads per block), E::vt and E::nv (items per tile)
// first - first index of the range
// inc   - step between consecutive indices
// count - number of indices in the range
// c     - callable invoked with one index of type I
//
// Each block obtains its tile from cuda_get_tile(bid, E::nv, count) and hands
// the per-index work to cuda_strided_iterate, which supplies the in-tile
// offset j; k above is tile.begin + j.
template <typename P, typename I, typename C>
void cuda_for_each_index_loop(
  P&& p, I first, I inc, unsigned count, C c
) {

  using E = std::decay_t<P>;

  // ceil(count / E::nv) computed without forming count + E::nv - 1: that sum
  // wraps around in unsigned arithmetic when count is within E::nv of the
  // unsigned maximum, which would silently launch far too few blocks
  unsigned B = count / E::nv + (count % E::nv != 0 ? 1u : 0u);

  cuda_kernel<<<B, E::nt, 0, p.stream()>>>(
    [=] __device__ (auto tid, auto bid) {
      auto tile = cuda_get_tile(bid, E::nv, count);
      cuda_strided_iterate<E::nt, E::vt>([=] __device__ (auto, auto j) {
        // NOTE(review): the types of tile.begin and j are not visible here.
        // If either is unsigned, mixing them with a signed I promotes the
        // whole expression to unsigned, and a generic callable (auto
        // parameter) would see negative indices as huge positive values.
        // Converting the result back to I hands the callable the index type
        // the caller asked for; it is a no-op when the types already agree.
        c(static_cast<I>(first + inc*(tile.begin + j)));
      }, tid, tile.count());
    }
  );
}
49
50} // end of namespace detail -------------------------------------------------
51
52// ----------------------------------------------------------------------------
53// cuda standard algorithms: single_task/for_each/for_each_index
54// ----------------------------------------------------------------------------
55
// Runs the callable once on the device: launches cuda_kernel with one block
// of one thread on the stream returned by p.stream().
//
// p - execution policy; only p.stream() is read
// c - callable invoked with no arguments
//
// The two arguments cuda_kernel passes to the lambda are unused. The lambda
// is mutable so that c, captured by copy, can be invoked through a non-const
// call operator.
template <typename P, typename C>
void cuda_single_task(P&& p, C c) {
  auto run_once = [=] __device__ (auto, auto) mutable { c(); };
  cuda_kernel<<<1, 1, 0, p.stream()>>>(run_once);
}
74
// Applies the callable to each dereferenced item in [first, last) by
// delegating to detail::cuda_for_each_loop with the given execution policy.
//
// p     - execution policy forwarded to the loop helper
// first - iterator to the beginning of the range
// last  - iterator to the end of the range
// c     - callable applied to each dereferenced item
//
// An empty or reversed range is a no-op: nothing is launched.
template <typename P, typename I, typename C>
void cuda_for_each(P&& p, I first, I last, C c) {

  // keep the distance in its signed difference type until it is validated;
  // assigning a negative distance straight to unsigned would turn a reversed
  // range into a huge item count
  const auto n = std::distance(first, last);

  if(n <= 0) {
    return;
  }

  detail::cuda_for_each_loop(p, first, static_cast<unsigned>(n), c);
}
107
// Applies the callable to each index of the range described by
// (first, last, inc) by delegating to detail::cuda_for_each_index_loop.
//
// p     - execution policy forwarded to the loop helper
// first - first index of the range
// last  - end of the range
// inc   - step between consecutive indices
// c     - callable applied to each index
//
// Throws through TF_THROW when is_range_invalid rejects the range.
// Nothing is launched when the range holds no indices.
template <typename P, typename I, typename C>
void cuda_for_each_index(P&& p, I first, I last, I inc, C c) {

  // validate before measuring the range
  if(is_range_invalid(first, last, inc)) {
    TF_THROW("invalid range [", first, ", ", last, ") with inc size ", inc);
  }

  // number of indices, as reported by the three-argument distance helper
  const unsigned count = distance(first, last, inc);

  if(count != 0) {
    detail::cuda_for_each_index_loop(p, first, inc, count, c);
  }
}
152
153// ----------------------------------------------------------------------------
154// single_task
155// ----------------------------------------------------------------------------
156
// Kernel entry point that does nothing but invoke the callable it receives.
// The kernel does not inspect thread or block indices, so every thread of
// the launch configuration chosen by the caller runs callable() once.
template <typename C>
__global__
void cuda_single_task(C callable) {
  callable();
}
162
163// ----------------------------------------------------------------------------
164// cudaFlow
165// ----------------------------------------------------------------------------
166
167// Function: single_task
168template <typename C>
170 return kernel(1, 1, 0, cuda_single_task<C>, c);
171}
172
173// Function: single_task
174template <typename C>
176 return kernel(task, 1, 1, 0, cuda_single_task<C>, c);
177}
178
179// Function: for_each
// Creates a capture task whose body calls cudaFlowCapturer::for_each on the
// range [first, last) with callable c, and returns the task handle produced
// by capture().
// NOTE(review): the scraped listing this definition came from skips one
// source line at the top of the lambda body - confirm against upstream that
// no statement was lost.
template <typename I, typename C>
cudaTask cudaFlow::for_each(I first, I last, C c) {
  return capture(
    [=] (cudaFlowCapturer& capturer) mutable {
      capturer.for_each(first, last, c);
    }
  );
}
187
188// Function: for_each_index
// Creates a capture task whose body calls cudaFlowCapturer::for_each_index
// on the range (first, last, inc) with callable c, and returns the task
// handle produced by capture().
// NOTE(review): the scraped listing this definition came from skips one
// source line at the top of the lambda body - confirm against upstream that
// no statement was lost.
template <typename I, typename C>
cudaTask cudaFlow::for_each_index(I first, I last, I inc, C c) {
  return capture(
    [=] (cudaFlowCapturer& capturer) mutable {
      capturer.for_each_index(first, last, inc, c);
    }
  );
}
196
197// Function: for_each
// Overload taking an existing task: passes that task to capture() together
// with a body that calls cudaFlowCapturer::for_each on [first, last) with
// callable c. Returns nothing.
// NOTE(review): the scraped listing this definition came from skips one
// source line at the top of the lambda body - confirm against upstream that
// no statement was lost.
template <typename I, typename C>
void cudaFlow::for_each(cudaTask task, I first, I last, C c) {
  capture(
    task,
    [=] (cudaFlowCapturer& capturer) mutable {
      capturer.for_each(first, last, c);
    }
  );
}
205
206// Function: for_each_index
// Overload taking an existing task: passes that task to capture() together
// with a body that calls cudaFlowCapturer::for_each_index on the range
// (first, last, inc) with callable c. Returns nothing.
// NOTE(review): the scraped listing this definition came from skips one
// source line at the top of the lambda body - confirm against upstream that
// no statement was lost.
template <typename I, typename C>
void cudaFlow::for_each_index(cudaTask task, I first, I last, I inc, C c) {
  capture(
    task,
    [=] (cudaFlowCapturer& capturer) mutable {
      capturer.for_each_index(first, last, inc, c);
    }
  );
}
214
215// ----------------------------------------------------------------------------
216// cudaFlowCapturer
217// ----------------------------------------------------------------------------
218
219// Function: for_each
220template <typename I, typename C>
222 return on([=](cudaStream_t stream) mutable {
224 cuda_for_each(p, first, last, c);
225 });
226}
227
228// Function: for_each_index
229template <typename I, typename C>
231 return on([=] (cudaStream_t stream) mutable {
233 cuda_for_each_index(p, beg, end, inc, c);
234 });
235}
236
237// Function: for_each
238template <typename I, typename C>
239void cudaFlowCapturer::for_each(cudaTask task, I first, I last, C c) {
240 on(task, [=](cudaStream_t stream) mutable {
242 cuda_for_each(p, first, last, c);
243 });
244}
245
246// Function: for_each_index
247template <typename I, typename C>
249 cudaTask task, I beg, I end, I inc, C c
250) {
251 on(task, [=] (cudaStream_t stream) mutable {
253 cuda_for_each_index(p, beg, end, inc, c);
254 });
255}
256
257// Function: single_task
258template <typename C>
260 return on([=] (cudaStream_t stream) mutable {
262 cuda_single_task(p, callable);
263 });
264}
265
266// Function: single_task
267template <typename C>
269 on(task, [=] (cudaStream_t stream) mutable {
271 cuda_single_task(p, callable);
272 });
273}
274
275} // end of namespace tf -----------------------------------------------------
276
277
278
279
280
281
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask for_each(I first, I last, C callable)
captures a kernel that applies a callable to each dereferenced element of the data array
Definition for_each.hpp:221
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask single_task(C c)
captures a kernel that runs the given callable with only one thread
Definition for_each.hpp:259
cudaTask on(C &&callable)
captures a sequence of CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask for_each_index(I first, I last, I step, C callable)
captures a kernel that applies a callable to each index in the range with the step size
Definition for_each.hpp:230
cudaTask for_each(I first, I last, C callable)
applies a callable to each dereferenced element of the data array
Definition for_each.hpp:181
cudaTask for_each_index(I first, I last, I step, C callable)
applies a callable to each index in the range with the step size
Definition for_each.hpp:190
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
creates a kernel task
Definition cudaflow.hpp:1272
cudaTask single_task(C c)
runs a callable with only a single kernel thread
Definition for_each.hpp:169
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
T count(T... args)
T distance(T... args)
T forward(T... args)
taskflow namespace
Definition small_vector.hpp:27
void cuda_for_each_index(P &&p, I first, I last, I inc, C c)
performs asynchronous parallel iterations over an index-based range of items
Definition for_each.hpp:138
void cuda_single_task(P &&p, C c)
runs a callable asynchronously using one kernel thread
Definition for_each.hpp:69
void cuda_for_each(P &&p, I first, I last, C c)
performs asynchronous parallel iterations over a range of items
Definition for_each.hpp:97