Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
transform.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "../cudaflow.hpp"
4
10namespace tf {
11
12// ----------------------------------------------------------------------------
13// transform
14// ----------------------------------------------------------------------------
15
16namespace detail {
17
// Unary transform kernel launcher: writes op(*(first + i)) to *(output + i)
// for the indices handed out by the tiling helpers below.
//
//   p      - execution policy; its decayed type supplies the compile-time
//            constants nt, vt and nv, and p.stream() is the launch stream
//   first  - start of the input sequence
//   count  - number of elements to process
//   output - start of the destination sequence
//   op     - unary callable invoked in device code
//
// The grid has ceil(count / nv) blocks of nt threads each. A count of zero
// would yield an empty grid; the cuda_transform wrappers in this file return
// before calling here in that case.
template <typename P, typename I, typename O, typename C>
void cuda_transform_loop(P&& p, I first, unsigned count, O output, C op) {

  using Policy = std::decay_t<P>;

  // round up so a partially filled last tile still gets a block
  const unsigned num_blocks = (count + Policy::nv - 1) / Policy::nv;

  cuda_kernel<<<num_blocks, Policy::nt, 0, p.stream()>>>(
    [=] __device__ (auto tid, auto bid) {
      // sub-range of [0, count) that cuda_get_tile assigns to block `bid`
      auto tile = cuda_get_tile(bid, Policy::nv, count);
      const auto base = tile.begin;

      cuda_strided_iterate<Policy::nt, Policy::vt>(
        [=] __device__ (auto, auto j) {
          // j is relative to the tile; shift it to an absolute element index
          auto idx = j + base;
          *(output + idx) = op(*(first + idx));
        },
        tid,
        tile.count()
      );
    }
  );
}
34
// Binary transform kernel launcher: writes
// op(*(first1 + i), *(first2 + i)) to *(output + i) for the indices handed
// out by the tiling helpers below.
//
//   p      - execution policy; its decayed type supplies the compile-time
//            constants nt, vt and nv, and p.stream() is the launch stream
//   first1 - start of the first input sequence
//   first2 - start of the second input sequence
//   count  - number of element pairs to process
//   output - start of the destination sequence
//   op     - binary callable invoked in device code
//
// The grid has ceil(count / nv) blocks of nt threads each. A count of zero
// would yield an empty grid; the cuda_transform wrappers in this file return
// before calling here in that case.
template <typename P, typename I1, typename I2, typename O, typename C>
void cuda_transform_loop(
  P&& p, I1 first1, I2 first2, unsigned count, O output, C op
) {

  using Policy = std::decay_t<P>;

  // round up so a partially filled last tile still gets a block
  const unsigned num_blocks = (count + Policy::nv - 1) / Policy::nv;

  cuda_kernel<<<num_blocks, Policy::nt, 0, p.stream()>>>(
    [=] __device__ (auto tid, auto bid) {
      // sub-range of [0, count) that cuda_get_tile assigns to block `bid`
      auto tile = cuda_get_tile(bid, Policy::nv, count);
      const auto base = tile.begin;

      cuda_strided_iterate<Policy::nt, Policy::vt>(
        [=] __device__ (auto, auto j) {
          // j is relative to the tile; shift it to an absolute element index
          auto idx = j + base;
          *(output + idx) = op(*(first1 + idx), *(first2 + idx));
        },
        tid,
        tile.count()
      );
    }
  );
}
53
54} // end of namespace detail -------------------------------------------------
55
56// ----------------------------------------------------------------------------
57// CUDA standard algorithms: transform
58// ----------------------------------------------------------------------------
59
// Performs an asynchronous parallel transform over [first, last): every
// element is passed through `op` and the result is stored at the matching
// position starting from `output`.
//
//   p      - execution policy forwarded to detail::cuda_transform_loop
//   first  - start of the input range
//   last   - end of the input range
//   output - start of the destination range
//   op     - unary callable applied to each element
//
// Nothing is launched when the range is empty or reversed.
template <typename P, typename I, typename O, typename C>
void cuda_transform(P&& p, I first, I last, O output, C op) {

  // Keep the length signed until it has been checked: converting a negative
  // distance (last before first) straight to unsigned would wrap around to a
  // huge element count and drive the kernel far out of bounds.
  const auto length = std::distance(first, last);

  if(length <= 0) {
    return;
  }

  detail::cuda_transform_loop(
    p, first, static_cast<unsigned>(length), output, op
  );
}
94
// Performs an asynchronous parallel transform over two input ranges: each
// pair (*(first1 + i), *(first2 + i)) is passed through `op` and the result
// is stored at the matching position starting from `output`.
//
//   p      - execution policy forwarded to detail::cuda_transform_loop
//   first1 - start of the first input range
//   last1  - end of the first input range; fixes the number of elements
//   first2 - start of the second input range
//   output - start of the destination range
//   op     - binary callable applied to each pair of elements
//
// Nothing is launched when the first range is empty or reversed.
template <typename P, typename I1, typename I2, typename O, typename C>
void cuda_transform(
  P&& p, I1 first1, I1 last1, I2 first2, O output, C op
) {

  // Keep the length signed until it has been checked: converting a negative
  // distance (last1 before first1) straight to unsigned would wrap around to
  // a huge element count and drive the kernel far out of bounds.
  const auto length = std::distance(first1, last1);

  if(length <= 0) {
    return;
  }

  detail::cuda_transform_loop(
    p, first1, first2, static_cast<unsigned>(length), output, op
  );
}
132
133// ----------------------------------------------------------------------------
134// cudaFlow
135// ----------------------------------------------------------------------------
136
137// Function: transform
// Creates a cudaFlow task for a unary transform: the callback handed to
// capture() calls cudaFlowCapturer::transform(first, last, output, c), and the
// cudaTask returned by capture() is returned to the caller. All arguments are
// copied into the callback (by-value capture).
// NOTE(review): the embedded listing numbers jump from 140 to 142, so one
// statement of the callback body appears to be missing from this copy -
// confirm against the upstream header before relying on this text.
138template <typename I, typename O, typename C>
139cudaTask cudaFlow::transform(I first, I last, O output, C c) {
140 return capture([=](cudaFlowCapturer& cap) mutable {
142 cap.transform(first, last, output, c);
143 });
144}
145
146// Function: transform
// Creates a cudaFlow task for a binary transform: the callback handed to
// capture() calls cudaFlowCapturer::transform(first1, last1, first2, output, c),
// and the cudaTask returned by capture() is returned to the caller. All
// arguments are copied into the callback (by-value capture).
// NOTE(review): the embedded listing numbers jump from 149 to 151, so one
// statement of the callback body appears to be missing from this copy -
// confirm against the upstream header before relying on this text.
147template <typename I1, typename I2, typename O, typename C>
148cudaTask cudaFlow::transform(I1 first1, I1 last1, I2 first2, O output, C c) {
149 return capture([=](cudaFlowCapturer& cap) mutable {
151 cap.transform(first1, last1, first2, output, c);
152 });
153}
154
155// Function: update transform
// Update overload for a unary transform task: passes the existing `task` and a
// fresh callback to capture(task, ...). The callback calls
// cudaFlowCapturer::transform(first, last, output, c) with by-value copies of
// the arguments. Returns nothing.
// NOTE(review): the embedded listing numbers jump from 158 to 160, so one
// statement of the callback body appears to be missing from this copy -
// confirm against the upstream header before relying on this text.
156template <typename I, typename O, typename C>
157void cudaFlow::transform(cudaTask task, I first, I last, O output, C c) {
158 capture(task, [=](cudaFlowCapturer& cap) mutable {
160 cap.transform(first, last, output, c);
161 });
162}
163
164// Function: update transform
// Update overload for a binary transform task: passes the existing `task` and
// a fresh callback to capture(task, ...). The callback calls
// cudaFlowCapturer::transform(first1, last1, first2, output, c) with by-value
// copies of the arguments. Nothing is returned from the body.
// NOTE(review): the declarator line (listing line 166) is missing from this
// copy, so the parameter list below has no return type or function name.
// Judging by the unary update overload it is presumably
// `void cudaFlow::transform(` - confirm against the upstream header.
// NOTE(review): listing line 170 is also absent, so one statement of the
// callback body may be missing as well.
165template <typename I1, typename I2, typename O, typename C>
167 cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
168) {
169 capture(task, [=](cudaFlowCapturer& cap) mutable {
171 cap.transform(first1, last1, first2, output, c);
172 });
173}
174
175// ----------------------------------------------------------------------------
176// cudaFlowCapturer
177// ----------------------------------------------------------------------------
178
179// Function: transform
// Captures a unary transform: hands on() a callback that receives a
// cudaStream_t and calls cuda_transform(p, first, last, output, op) with
// by-value copies of the arguments. The cudaTask returned by on() is returned
// to the caller.
// NOTE(review): `p` is not declared anywhere in the visible body and the
// embedded listing numbers jump from 182 to 184. The missing statement
// presumably constructs an execution policy from `stream` - confirm against
// the upstream header; as shown, this block does not compile.
180template <typename I, typename O, typename C>
181cudaTask cudaFlowCapturer::transform(I first, I last, O output, C op) {
182 return on([=](cudaStream_t stream) mutable {
184 cuda_transform(p, first, last, output, op);
185 });
186}
187
188// Function: transform
// Captures a binary transform: hands on() a callback that receives a
// cudaStream_t and calls cuda_transform(p, first1, last1, first2, output, op)
// with by-value copies of the arguments. The result of on() is returned.
// NOTE(review): the declarator line (listing line 190) is missing from this
// copy. Judging by the unary overload it is presumably
// `cudaTask cudaFlowCapturer::transform(` - confirm against the upstream
// header.
// NOTE(review): `p` is not declared in the visible body and listing line 194
// is absent. The missing statement presumably constructs an execution policy
// from `stream` - confirm.
189template <typename I1, typename I2, typename O, typename C>
191 I1 first1, I1 last1, I2 first2, O output, C op
192) {
193 return on([=](cudaStream_t stream) mutable {
195 cuda_transform(p, first1, last1, first2, output, op);
196 });
197}
198
199// Function: update transform
// Task-updating unary transform: passes the existing `task` and a fresh
// callback to on(task, ...). The callback receives a cudaStream_t and calls
// cuda_transform(p, first, last, output, op) with by-value copies of the
// arguments. Nothing is returned from the body.
// NOTE(review): the declarator line (listing line 201) is missing from this
// copy; it is presumably `void cudaFlowCapturer::transform(` - confirm against
// the upstream header.
// NOTE(review): `p` is not declared in the visible body and listing line 205
// is absent. The missing statement presumably constructs an execution policy
// from `stream` - confirm.
200template <typename I, typename O, typename C>
202 cudaTask task, I first, I last, O output, C op
203) {
204 on(task, [=] (cudaStream_t stream) mutable {
206 cuda_transform(p, first, last, output, op);
207 });
208}
209
210// Function: update transform
// Task-updating binary transform: passes the existing `task` and a fresh
// callback to on(task, ...). The callback receives a cudaStream_t and calls
// cuda_transform(p, first1, last1, first2, output, op) with by-value copies of
// the arguments. Nothing is returned from the body.
// NOTE(review): the declarator line (listing line 212) is missing from this
// copy; it is presumably `void cudaFlowCapturer::transform(` - confirm against
// the upstream header.
// NOTE(review): `p` is not declared in the visible body and listing line 216
// is absent. The missing statement presumably constructs an execution policy
// from `stream` - confirm.
211template <typename I1, typename I2, typename O, typename C>
213 cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op
214) {
215 on(task, [=] (cudaStream_t stream) mutable {
217 cuda_transform(p, first1, last1, first2, output, op);
218 });
219}
220
221} // end of namespace tf -----------------------------------------------------
222
223
224
225
226
227
class to define execution policy for CUDA standard algorithms
Definition cuda_execution_policy.hpp:29
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask transform(I first, I last, O output, C op)
captures a kernel that transforms an input range to an output range
Definition transform.hpp:181
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaTask on(C &&callable)
captures a sequence of CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:139
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
T count(T... args)
T distance(T... args)
T forward(T... args)
taskflow namespace
Definition small_vector.hpp:27
void cuda_transform(P &&p, I first, I last, O output, C op)
performs asynchronous parallel transforms over a range of items
Definition transform.hpp:84