#include <type_traits>

#include "../cudaflow.hpp"
/**
@brief launches a kernel that applies a unary operator to a range of items

For every offset `i` in `[0, count)` the kernel evaluates
`*(output + i) = op(*(first + i))`.

@tparam P execution policy type; its decayed type must provide the
          compile-time constants `nt`, `vt` and `nv`
@tparam I input iterator type
@tparam O output iterator type
@tparam C unary operator type

@param p execution policy; the kernel is enqueued on `p.stream()`
@param first iterator to the beginning of the input range
@param count number of items to transform
@param output iterator to the beginning of the output range
@param op unary operator invoked in device code on each input item

The launch is asynchronous with respect to the host; this function neither
synchronizes the stream nor checks the launch status.
*/
template <typename P, typename I, typename O, typename C>
void cuda_transform_loop(P&& p, I first, unsigned count, O output, C op) {

  // P may be deduced as a reference type; strip it to reach the static members
  using E = std::decay_t<P>;

  // an empty range would yield a grid of zero blocks, which is an invalid
  // launch configuration
  if(count == 0) {
    return;
  }

  // number of blocks: one block per E::nv items, rounded up
  unsigned B = (count + E::nv - 1) / E::nv;

  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) {
    // NOTE(review): tile presumably describes the slice of [0, count) owned
    // by block `bid` - confirm against cuda_get_tile
    auto tile = cuda_get_tile(bid, E::nv, count);
    cuda_strided_iterate<E::nt, E::vt>([=]__device__(auto, auto j) {
      // j is relative to the tile; shift it to an offset into the full range
      auto offset = j + tile.begin;
      *(output + offset) = op(*(first+offset));
    }, tid, tile.count());
  });
}
/**
@brief launches a kernel that applies a binary operator to two ranges of items

For every offset `i` in `[0, count)` the kernel evaluates
`*(output + i) = op(*(first1 + i), *(first2 + i))`.

@tparam P execution policy type; its decayed type must provide the
          compile-time constants `nt`, `vt` and `nv`
@tparam I1 first input iterator type
@tparam I2 second input iterator type
@tparam O output iterator type
@tparam C binary operator type

@param p execution policy; the kernel is enqueued on `p.stream()`
@param first1 iterator to the beginning of the first input range
@param first2 iterator to the beginning of the second input range
@param count number of items to transform
@param output iterator to the beginning of the output range
@param op binary operator invoked in device code on each pair of input items

The launch is asynchronous with respect to the host; this function neither
synchronizes the stream nor checks the launch status.
*/
template <typename P, typename I1, typename I2, typename O, typename C>
void cuda_transform_loop(
  P&& p, I1 first1, I2 first2, unsigned count, O output, C op
) {

  // P may be deduced as a reference type; strip it to reach the static members
  using E = std::decay_t<P>;

  // an empty range would yield a grid of zero blocks, which is an invalid
  // launch configuration
  if(count == 0) {
    return;
  }

  // number of blocks: one block per E::nv items, rounded up
  unsigned B = (count + E::nv - 1) / E::nv;

  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=]__device__(auto tid, auto bid) {
    // NOTE(review): tile presumably describes the slice of [0, count) owned
    // by block `bid` - confirm against cuda_get_tile
    auto tile = cuda_get_tile(bid, E::nv, count);
    cuda_strided_iterate<E::nt, E::vt>([=]__device__(auto, auto j) {
      // j is relative to the tile; shift it to an offset into the full range
      auto offset = j + tile.begin;
      *(output + offset) = op(*(first1+offset), *(first2+offset));
    }, tid, tile.count());
  });
}
83template <
typename P,
typename I,
typename O,
typename C>
92 detail::cuda_transform_loop(p, first, count, output, op);
119template <
typename P,
typename I1,
typename I2,
typename O,
typename C>
121 P&& p, I1 first1, I1 last1, I2 first2, O output, C op
130 detail::cuda_transform_loop(p, first1, first2, count, output, op);
138template <
typename I,
typename O,
typename C>
147template <
typename I1,
typename I2,
typename O,
typename C>
151 cap.
transform(first1, last1, first2, output, c);
156template <
typename I,
typename O,
typename C>
165template <
typename I1,
typename I2,
typename O,
typename C>
167 cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
171 cap.
transform(first1, last1, first2, output, c);
180template <
typename I,
typename O,
typename C>
182 return on([=](cudaStream_t stream)
mutable {
189template <
typename I1,
typename I2,
typename O,
typename C>
191 I1 first1, I1 last1, I2 first2, O output, C op
193 return on([=](cudaStream_t stream)
mutable {
200template <
typename I,
typename O,
typename C>
202 cudaTask task, I first, I last, O output, C op
204 on(task, [=] (cudaStream_t stream)
mutable {
211template <
typename I1,
typename I2,
typename O,
typename C>
213 cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op
215 on(task, [=] (cudaStream_t stream)
mutable {
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask transform(I first, I last, O output, C op)
captures a kernel that transforms an input range to an output range
Definition transform.hpp:181
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask on(C &&callable)
captures a sequence of CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:139
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
taskflow namespace
Definition small_vector.hpp:27
void cuda_transform(P &&p, I first, I last, O output, C op)
performs asynchronous parallel transforms over a range of items
Definition transform.hpp:84