![]() |
Taskflow
3.2.0-Master-Branch
|
class to create a cudaFlow graph using stream capture More...
#include <cuda_capturer.hpp>
Public Member Functions | |
cudaFlowCapturer () | |
constructs a standalone cudaFlowCapturer | |
virtual | ~cudaFlowCapturer () |
destructs the cudaFlowCapturer | |
bool | empty () const |
queries the emptiness of the graph | |
size_t | num_tasks () const |
queries the number of tasks | |
void | clear () |
clears this cudaFlow capturer | |
void | dump (std::ostream &os) const |
dumps the capture graph into a DOT format through an output stream | |
template<typename OPT , typename... ArgsT> | |
OPT & | make_optimizer (ArgsT &&... args) |
selects a different optimization algorithm | |
template<typename C , std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * = nullptr> | |
cudaTask | on (C &&callable) |
captures a sequence of CUDA operations from the given callable | |
template<typename C , std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * = nullptr> | |
void | on (cudaTask task, C &&callable) |
updates a capture task to another sequence of CUDA operations | |
cudaTask | noop () |
captures a no-operation task | |
void | noop (cudaTask task) |
updates a task to a no-operation task | |
cudaTask | memcpy (void *dst, const void *src, size_t count) |
copies data between host and device asynchronously through a stream | |
void | memcpy (cudaTask task, void *dst, const void *src, size_t count) |
updates a capture task to a memcpy operation | |
template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr> | |
cudaTask | copy (T *tgt, const T *src, size_t num) |
captures a copy task of typed data | |
template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr> | |
void | copy (cudaTask task, T *tgt, const T *src, size_t num) |
updates a capture task to a copy operation | |
cudaTask | memset (void *ptr, int v, size_t n) |
initializes or sets GPU memory to the given value byte by byte | |
void | memset (cudaTask task, void *ptr, int value, size_t n) |
updates a capture task to a memset operation | |
template<typename F , typename... ArgsT> | |
cudaTask | kernel (dim3 g, dim3 b, size_t s, F f, ArgsT &&... args) |
captures a kernel | |
template<typename F , typename... ArgsT> | |
void | kernel (cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT &&... args) |
updates a capture task to a kernel operation | |
template<typename C > | |
cudaTask | single_task (C c) |
captures a kernel that runs the given callable with only one thread | |
template<typename C > | |
void | single_task (cudaTask task, C c) |
updates a capture task to a single-threaded kernel | |
template<typename I , typename C > | |
cudaTask | for_each (I first, I last, C callable) |
captures a kernel that applies a callable to each dereferenced element of the data array | |
template<typename I , typename C > | |
void | for_each (cudaTask task, I first, I last, C callable) |
updates a capture task to a for-each kernel task | |
template<typename I , typename C > | |
cudaTask | for_each_index (I first, I last, I step, C callable) |
captures a kernel that applies a callable to each index in the range with the step size | |
template<typename I , typename C > | |
void | for_each_index (cudaTask task, I first, I last, I step, C callable) |
updates a capture task to a for-each-index kernel task | |
template<typename I , typename O , typename C > | |
cudaTask | transform (I first, I last, O output, C op) |
captures a kernel that transforms an input range to an output range | |
template<typename I , typename O , typename C > | |
void | transform (cudaTask task, I first, I last, O output, C op) |
updates a capture task to a transform kernel task | |
template<typename I1 , typename I2 , typename O , typename C > | |
cudaTask | transform (I1 first1, I1 last1, I2 first2, O output, C op) |
captures a kernel that transforms two input ranges to an output range | |
template<typename I1 , typename I2 , typename O , typename C > | |
void | transform (cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op) |
updates a capture task to a transform kernel task | |
template<typename I , typename T , typename C > | |
cudaTask | reduce (I first, I last, T *result, C op) |
captures kernels that perform parallel reduction over a range of items | |
template<typename I , typename T , typename C > | |
void | reduce (cudaTask task, I first, I last, T *result, C op) |
updates a capture task to a reduction task | |
template<typename I , typename T , typename C > | |
cudaTask | uninitialized_reduce (I first, I last, T *result, C op) |
similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce | |
template<typename I , typename T , typename C > | |
void | uninitialized_reduce (cudaTask task, I first, I last, T *result, C op) |
updates a capture task to an uninitialized-reduction task | |
template<typename I , typename T , typename C , typename U > | |
cudaTask | transform_reduce (I first, I last, T *result, C bop, U uop) |
captures kernels that perform parallel reduction over a range of transformed items | |
template<typename I , typename T , typename C , typename U > | |
void | transform_reduce (cudaTask task, I first, I last, T *result, C bop, U uop) |
updates a capture task to a transform-reduce task | |
template<typename I , typename T , typename C , typename U > | |
cudaTask | transform_uninitialized_reduce (I first, I last, T *result, C bop, U uop) |
similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce | |
template<typename I , typename T , typename C , typename U > | |
void | transform_uninitialized_reduce (cudaTask task, I first, I last, T *result, C bop, U uop) |
updates a capture task to a transform-reduce task of no initialized value | |
template<typename I , typename O , typename C > | |
cudaTask | inclusive_scan (I first, I last, O output, C op) |
captures kernels that perform parallel inclusive scan over a range of items | |
template<typename I , typename O , typename C > | |
void | inclusive_scan (cudaTask task, I first, I last, O output, C op) |
updates a capture task to an inclusive scan task | |
template<typename I , typename O , typename C > | |
cudaTask | exclusive_scan (I first, I last, O output, C op) |
similar to cudaFlowCapturer::inclusive_scan but excludes the first value | |
template<typename I , typename O , typename C > | |
void | exclusive_scan (cudaTask task, I first, I last, O output, C op) |
updates a capture task to an exclusive scan task | |
template<typename I , typename O , typename B , typename U > | |
cudaTask | transform_inclusive_scan (I first, I last, O output, B bop, U uop) |
captures kernels that perform parallel inclusive scan over a range of transformed items | |
template<typename I , typename O , typename B , typename U > | |
void | transform_inclusive_scan (cudaTask task, I first, I last, O output, B bop, U uop) |
updates a capture task to a transform-inclusive scan task | |
template<typename I , typename O , typename B , typename U > | |
cudaTask | transform_exclusive_scan (I first, I last, O output, B bop, U uop) |
similar to cudaFlowCapturer::transform_inclusive_scan but excludes the first value | |
template<typename I , typename O , typename B , typename U > | |
void | transform_exclusive_scan (cudaTask task, I first, I last, O output, B bop, U uop) |
updates a capture task to a transform-exclusive scan task | |
template<typename A , typename B , typename C , typename Comp > | |
cudaTask | merge (A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp) |
captures kernels that perform parallel merge on two sorted arrays | |
template<typename A , typename B , typename C , typename Comp > | |
void | merge (cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp) |
updates a capture task to a merge task | |
template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C > | |
cudaTask | merge_by_key (a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp) |
captures kernels that perform parallel key-value merge | |
template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C > | |
void | merge_by_key (cudaTask task, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp) |
updates a capture task to a key-value merge task | |
template<typename I , typename C > | |
cudaTask | sort (I first, I last, C comp) |
captures kernels that sort the given array | |
template<typename I , typename C > | |
void | sort (cudaTask task, I first, I last, C comp) |
updates a capture task to a sort task | |
template<typename K_it , typename V_it , typename C > | |
cudaTask | sort_by_key (K_it k_first, K_it k_last, V_it v_first, C comp) |
captures kernels that sort the given key-value arrays by key | |
template<typename K_it , typename V_it , typename C > | |
void | sort_by_key (cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp) |
updates a capture task to a key-value sort task | |
template<typename I , typename U > | |
cudaTask | find_if (I first, I last, unsigned *idx, U op) |
creates a task to find the index of the first element in a range | |
template<typename I , typename U > | |
void | find_if (cudaTask task, I first, I last, unsigned *idx, U op) |
updates the parameters of a find-if task | |
template<typename I , typename O > | |
cudaTask | min_element (I first, I last, unsigned *idx, O op) |
finds the index of the minimum element in a range | |
template<typename I , typename O > | |
void | min_element (cudaTask task, I first, I last, unsigned *idx, O op) |
updates the parameters of a min-element task | |
template<typename I , typename O > | |
cudaTask | max_element (I first, I last, unsigned *idx, O op) |
finds the index of the maximum element in a range | |
template<typename I , typename O > | |
void | max_element (cudaTask task, I first, I last, unsigned *idx, O op) |
updates the parameters of a max-element task | |
template<typename P > | |
void | offload_until (P &&predicate) |
offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true | |
void | offload_n (size_t n) |
offloads the captured cudaFlow and executes it the given number of times | |
void | offload () |
offloads the captured cudaFlow and executes it once | |
Friends | |
class | cudaFlow |
class | Executor |
class to create a cudaFlow graph using stream capture
The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous CUDA operations through the given stream. The following example creates a CUDA graph that captures two kernel tasks, task_1
and task_2
, where task_1
runs before task_2
.
Similar to tf::cudaFlow, a cudaFlowCapturer is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor. That is, the callable that describes a cudaFlowCapturer will be executed sequentially. Inside a cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run in parallel depending on the selected optimization algorithm. By default, we use tf::cudaRoundRobinCapturing to transform a user-level graph into a native CUDA graph.
Please refer to GPU Tasking (cudaFlowCapturer) for details.
|
inline |
constructs a standalone cudaFlowCapturer
A standalone cudaFlow capturer does not go through any taskflow and can be run by the caller thread using explicit offload methods (e.g., tf::cudaFlowCapturer::offload).
void tf::cudaFlowCapturer::copy | ( | cudaTask | task, |
T * | tgt, | ||
const T * | src, | ||
size_t | num | ||
) |
updates a capture task to a copy operation
The method is similar to cudaFlowCapturer::copy but operates on an existing task.
cudaTask tf::cudaFlowCapturer::copy | ( | T * | tgt, |
const T * | src, | ||
size_t | num | ||
) |
captures a copy task of typed data
T | element type (non-void) |
tgt | pointer to the target memory block |
src | pointer to the source memory block |
num | number of elements to copy |
A copy task transfers num*sizeof(T)
bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
void tf::cudaFlowCapturer::exclusive_scan | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
O | output, | ||
C | op | ||
) |
updates a capture task to an exclusive scan task
This method is similar to cudaFlowCapturer::exclusive_scan but operates on an existing task.
void tf::cudaFlowCapturer::find_if | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
unsigned * | idx, | ||
U | op | ||
) |
updates the parameters of a find-if task
This method is similar to tf::cudaFlowCapturer::find_if but operates on an existing task.
cudaTask tf::cudaFlowCapturer::find_if | ( | I | first, |
I | last, | ||
unsigned * | idx, | ||
U | op | ||
) |
creates a task to find the index of the first element in a range
I | input iterator type |
U | unary operator type |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | pointer to the index of the found element |
op | unary operator which returns true for the required element |
Finds the index idx
of the first element in the range [first, last)
such that op(*(first+idx))
is true. This is equivalent to the parallel execution of the following loop:
void tf::cudaFlowCapturer::for_each | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
C | callable | ||
) |
updates a capture task to a for-each kernel task
This method is similar to cudaFlowCapturer::for_each but operates on an existing task.
cudaTask tf::cudaFlowCapturer::for_each | ( | I | first, |
I | last, | ||
C | callable | ||
) |
captures a kernel that applies a callable to each dereferenced element of the data array
I | iterator type |
C | callable type |
first | iterator to the beginning |
last | iterator to the end |
callable | a callable object to apply to the dereferenced iterator |
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlowCapturer::for_each_index | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
I | step, | ||
C | callable | ||
) |
updates a capture task to a for-each-index kernel task
This method is similar to cudaFlowCapturer::for_each_index but operates on an existing task.
cudaTask tf::cudaFlowCapturer::for_each_index | ( | I | first, |
I | last, | ||
I | step, | ||
C | callable | ||
) |
captures a kernel that applies a callable to each index in the range with the step size
I | index type |
C | callable type |
first | beginning index |
last | last index |
step | step size |
callable | the callable to apply to each index in the range |
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlowCapturer::inclusive_scan | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
O | output, | ||
C | op | ||
) |
updates a capture task to an inclusive scan task
This method is similar to cudaFlowCapturer::inclusive_scan but operates on an existing task.
cudaTask tf::cudaFlowCapturer::inclusive_scan | ( | I | first, |
I | last, | ||
O | output, | ||
C | op | ||
) |
captures kernels that perform parallel inclusive scan over a range of items
I | input iterator type |
O | output iterator type |
C | binary operator type |
first | iterator to the beginning |
last | iterator to the end |
output | iterator to the beginning of the output |
op | binary operator |
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlowCapturer::kernel | ( | cudaTask | task, |
dim3 | g, | ||
dim3 | b, | ||
size_t | s, | ||
F | f, | ||
ArgsT &&... | args | ||
) |
updates a capture task to a kernel operation
The method is similar to cudaFlowCapturer::kernel but operates on an existing task.
cudaTask tf::cudaFlowCapturer::kernel | ( | dim3 | g, |
dim3 | b, | ||
size_t | s, | ||
F | f, | ||
ArgsT &&... | args | ||
) |
captures a kernel
F | kernel function type |
ArgsT | kernel function parameters type |
g | configured grid |
b | configured block |
s | configured shared memory size in bytes |
f | kernel function |
args | arguments to forward to the kernel function by copy |
OPT & tf::cudaFlowCapturer::make_optimizer | ( | ArgsT &&... | args | ) |
selects a different optimization algorithm
OPT | optimizer type |
ArgsT | arguments types |
args | arguments to forward to construct the optimizer |
We currently support the following optimization algorithms to capture a user-described cudaFlow:
By default, tf::cudaFlowCapturer uses the round-robin optimization algorithm with four streams to transform a user-level graph into a native CUDA graph.
void tf::cudaFlowCapturer::max_element | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
unsigned * | idx, | ||
O | op | ||
) |
updates the parameters of a max-element task
This method is similar to cudaFlowCapturer::max_element but operates on an existing task.
cudaTask tf::cudaFlowCapturer::max_element | ( | I | first, |
I | last, | ||
unsigned * | idx, | ||
O | op | ||
) |
finds the index of the maximum element in a range
I | input iterator type |
O | comparator type |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the maximum element |
op | comparison function object |
The function launches kernels asynchronously to find the largest element in the range [first, last)
using the given comparator op
. The function is equivalent to a parallel execution of the following loop:
|
inline |
updates a capture task to a memcpy operation
The method is similar to cudaFlowCapturer::memcpy but operates on an existing task.
|
inline |
copies data between host and device asynchronously through a stream
dst | destination memory address |
src | source memory address |
count | size in bytes to copy |
The method captures a cudaMemcpyAsync
operation through an internal stream.
|
inline |
updates a capture task to a memset operation
The method is similar to cudaFlowCapturer::memset but operates on an existing task.
|
inline |
initializes or sets GPU memory to the given value byte by byte
ptr | pointer to GPU memory |
v | value to set for each byte of the specified memory |
n | size in bytes to set |
The method captures a cudaMemsetAsync
operation through an internal stream to fill the first n
bytes of the memory area pointed to by ptr
with the constant byte value v
.
cudaTask tf::cudaFlowCapturer::merge | ( | A | a_first, |
A | a_last, | ||
B | b_first, | ||
B | b_last, | ||
C | c_first, | ||
Comp | comp | ||
) |
captures kernels that perform parallel merge on two sorted arrays
A | iterator type of the first input array |
B | iterator type of the second input array |
C | iterator type of the output array |
Comp | comparator type |
a_first | iterator to the beginning of the first input array |
a_last | iterator to the end of the first input array |
b_first | iterator to the beginning of the second input array |
b_last | iterator to the end of the second input array |
c_first | iterator to the beginning of the output array |
comp | binary comparator |
Merges two sorted ranges [a_first, a_last)
and [b_first, b_last)
into one sorted range beginning at c_first
.
A sequence is said to be sorted with respect to a comparator comp
if for any iterator it pointing to the sequence and any non-negative integer n
such that it + n
is a valid iterator pointing to an element of the sequence, comp(*(it + n), *it)
evaluates to false
.
void tf::cudaFlowCapturer::merge | ( | cudaTask | task, |
A | a_first, | ||
A | a_last, | ||
B | b_first, | ||
B | b_last, | ||
C | c_first, | ||
Comp | comp | ||
) |
updates a capture task to a merge task
This method is similar to cudaFlowCapturer::merge but operates on an existing task.
cudaTask tf::cudaFlowCapturer::merge_by_key | ( | a_keys_it | a_keys_first, |
a_keys_it | a_keys_last, | ||
a_vals_it | a_vals_first, | ||
b_keys_it | b_keys_first, | ||
b_keys_it | b_keys_last, | ||
b_vals_it | b_vals_first, | ||
c_keys_it | c_keys_first, | ||
c_vals_it | c_vals_first, | ||
C | comp | ||
) |
captures kernels that perform parallel key-value merge
a_keys_it | first key iterator type |
a_vals_it | first value iterator type |
b_keys_it | second key iterator type |
b_vals_it | second value iterator type |
c_keys_it | output key iterator type |
c_vals_it | output value iterator type |
C | comparator type |
a_keys_first | iterator to the beginning of the first key range |
a_keys_last | iterator to the end of the first key range |
a_vals_first | iterator to the beginning of the first value range |
b_keys_first | iterator to the beginning of the second key range |
b_keys_last | iterator to the end of the second key range |
b_vals_first | iterator to the beginning of the second value range |
c_keys_first | iterator to the beginning of the output key range |
c_vals_first | iterator to the beginning of the output value range |
comp | comparator |
Performs a key-value merge that copies elements from [a_keys_first, a_keys_last)
and [b_keys_first, b_keys_last)
into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))
such that the resulting range is in ascending key order.
At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first))
and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first))
into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))
such that the resulting range is in ascending order implied by each input element's associated key.
For example, assume:
a_keys
= {1, 8}
a_vals
= {2, 1}
b_keys
= {3, 7}
b_vals
= {3, 4}
After the merge, we have:
c_keys
= {1, 3, 7, 8}
c_vals
= {2, 3, 4, 1}
void tf::cudaFlowCapturer::merge_by_key | ( | cudaTask | task, |
a_keys_it | a_keys_first, | ||
a_keys_it | a_keys_last, | ||
a_vals_it | a_vals_first, | ||
b_keys_it | b_keys_first, | ||
b_keys_it | b_keys_last, | ||
b_vals_it | b_vals_first, | ||
c_keys_it | c_keys_first, | ||
c_vals_it | c_vals_first, | ||
C | comp | ||
) |
updates a capture task to a key-value merge task
This method is similar to tf::cudaFlowCapturer::merge_by_key but operates on an existing task.
void tf::cudaFlowCapturer::min_element | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
unsigned * | idx, | ||
O | op | ||
) |
updates the parameters of a min-element task
This method is similar to cudaFlowCapturer::min_element but operates on an existing task.
cudaTask tf::cudaFlowCapturer::min_element | ( | I | first, |
I | last, | ||
unsigned * | idx, | ||
O | op | ||
) |
finds the index of the minimum element in a range
I | input iterator type |
O | comparator type |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the minimum element |
op | comparison function object |
The function launches kernels asynchronously to find the smallest element in the range [first, last)
using the given comparator op
. The function is equivalent to a parallel execution of the following loop:
|
inline |
captures a no-operation task
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n
nodes with a barrier between them can be represented using an empty node and 2*n
dependency edges, rather than no empty node and n^2
dependency edges.
|
inline |
updates a task to a no-operation task
The method is similar to tf::cudaFlowCapturer::noop but operates on an existing task.
|
inline |
offloads the captured cudaFlow and executes it the given number of times
n | number of executions |
void tf::cudaFlowCapturer::offload_until | ( | P && | predicate | ) |
offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
P | predicate type (a callable that returns a boolean) |
predicate | a predicate that returns true to stop |
Immediately offloads the cudaFlow captured so far onto a GPU and repeatedly runs it until the predicate returns true
.
By default, if users do not offload the cudaFlow capturer, the executor will offload it once.
cudaTask tf::cudaFlowCapturer::on | ( | C && | callable | ) |
captures a sequence of CUDA operations from the given callable
C | callable type constructible with std::function<void(cudaStream_t)> |
callable | a callable to capture CUDA operations with the stream |
This method applies a stream created by the flow to capture a sequence of CUDA operations defined in the callable.
void tf::cudaFlowCapturer::on | ( | cudaTask | task, |
C && | callable | ||
) |
updates a capture task to another sequence of CUDA operations
The method is similar to cudaFlowCapturer::on but operates on an existing task.
void tf::cudaFlowCapturer::reduce | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
T * | result, | ||
C | op | ||
) |
updates a capture task to a reduction task
This method is similar to cudaFlowCapturer::reduce but operates on an existing task.
cudaTask tf::cudaFlowCapturer::reduce | ( | I | first, |
I | last, | ||
T * | result, | ||
C | op | ||
) |
captures kernels that perform parallel reduction over a range of items
I | input iterator type |
T | value type |
C | binary operator type |
first | iterator to the beginning |
last | iterator to the end |
result | pointer to the result with an initialized value |
op | binary reduction operator |
This method is equivalent to the parallel execution of the following loop on a GPU:
cudaTask tf::cudaFlowCapturer::single_task | ( | C | c | ) |
captures a kernel that runs the given callable with only one thread
C | callable type |
c | callable to run by a single kernel thread |
void tf::cudaFlowCapturer::single_task | ( | cudaTask | task, |
C | c | ||
) |
updates a capture task to a single-threaded kernel
This method is similar to cudaFlowCapturer::single_task but operates on an existing task.
void tf::cudaFlowCapturer::sort | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
C | comp | ||
) |
updates a capture task to a sort task
This method is similar to cudaFlowCapturer::sort but operates on an existing task.
cudaTask tf::cudaFlowCapturer::sort | ( | I | first, |
I | last, | ||
C | comp | ||
) |
captures kernels that sort the given array
I | iterator type of the input array |
C | comparator type |
first | iterator to the beginning of the input array |
last | iterator to the end of the input array |
comp | binary comparator |
Sorts elements in the range [first, last)
with the given comparator.
void tf::cudaFlowCapturer::sort_by_key | ( | cudaTask | task, |
K_it | k_first, | ||
K_it | k_last, | ||
V_it | v_first, | ||
C | comp | ||
) |
updates a capture task to a key-value sort task
This method is similar to tf::cudaFlowCapturer::sort_by_key but operates on an existing task.
cudaTask tf::cudaFlowCapturer::sort_by_key | ( | K_it | k_first, |
K_it | k_last, | ||
V_it | v_first, | ||
C | comp | ||
) |
captures kernels that sort the given key-value arrays by key
K_it | iterator type of the key |
V_it | iterator type of the value |
C | comparator type |
k_first | iterator to the beginning of the key array |
k_last | iterator to the end of the key array |
v_first | iterator to the beginning of the value array |
comp | binary comparator |
Sorts key-value elements in [k_first, k_last)
and [v_first, v_first + (k_last - k_first))
into ascending key order using the given comparator comp
. If i
and j
are any two valid iterators in [k_first, k_last)
such that i
precedes j
, and p
and q
are iterators in [v_first, v_first + (k_last - k_first))
corresponding to i
and j
respectively, then comp(*j, *i)
evaluates to false
.
For example, assume:
keys
are {1, 4, 2, 8, 5, 7}
values
are {'a', 'b', 'c', 'd', 'e', 'f'}
After sort:
keys
are {1, 2, 4, 5, 7, 8}
values
are {'a', 'c', 'b', 'e', 'f', 'd'}
void tf::cudaFlowCapturer::transform | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
O | output, | ||
C | op | ||
) |
updates a capture task to a transform kernel task
This method is similar to cudaFlowCapturer::transform but operates on an existing task.
void tf::cudaFlowCapturer::transform | ( | cudaTask | task, |
I1 | first1, | ||
I1 | last1, | ||
I2 | first2, | ||
O | output, | ||
C | op | ||
) |
updates a capture task to a transform kernel task
This method is similar to cudaFlowCapturer::transform but operates on an existing task.
cudaTask tf::cudaFlowCapturer::transform | ( | I | first, |
I | last, | ||
O | output, | ||
C | op | ||
) |
captures a kernel that transforms an input range to an output range
I | input iterator type |
O | output iterator type |
C | unary operator type |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | unary operator to apply to transform each item in the range |
This method is equivalent to the parallel execution of the following loop on a GPU:
cudaTask tf::cudaFlowCapturer::transform | ( | I1 | first1, |
I1 | last1, | ||
I2 | first2, | ||
O | output, | ||
C | op | ||
) |
captures a kernel that transforms two input ranges to an output range
I1 | first input iterator type |
I2 | second input iterator type |
O | output iterator type |
C | binary operator type |
first1 | iterator to the beginning of the first input range |
last1 | iterator to the end of the first input range |
first2 | iterator to the beginning of the second input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to transform each pair of items in the two input ranges |
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlowCapturer::transform_exclusive_scan | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
O | output, | ||
B | bop, | ||
U | uop | ||
) |
updates a capture task to a transform-exclusive scan task
This method is similar to cudaFlowCapturer::transform_exclusive_scan but operates on an existing task.
void tf::cudaFlowCapturer::transform_inclusive_scan | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
O | output, | ||
B | bop, | ||
U | uop | ||
) |
updates a capture task to a transform-inclusive scan task
This method is similar to cudaFlowCapturer::transform_inclusive_scan but operates on an existing task.
cudaTask tf::cudaFlowCapturer::transform_inclusive_scan | ( | I | first, |
I | last, | ||
O | output, | ||
B | bop, | ||
U | uop | ||
) |
captures kernels that perform parallel inclusive scan over a range of transformed items
I | input iterator type |
O | output iterator type |
B | binary operator type |
U | unary operator type |
first | iterator to the beginning |
last | iterator to the end |
output | iterator to the beginning of the output |
bop | binary operator |
uop | unary operator |
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlowCapturer::transform_reduce | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
T * | result, | ||
C | bop, | ||
U | uop | ||
) |
updates a capture task to a transform-reduce task
This method is similar to cudaFlowCapturer::transform_reduce but operates on an existing task.
cudaTask tf::cudaFlowCapturer::transform_reduce | ( | I | first, |
I | last, | ||
T * | result, | ||
C | bop, | ||
U | uop | ||
) |
captures kernels that perform parallel reduction over a range of transformed items
I | input iterator type |
T | value type |
C | binary operator type |
U | unary operator type |
first | iterator to the beginning |
last | iterator to the end |
result | pointer to the result with an initialized value |
bop | binary reduce operator |
uop | unary transform operator |
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlowCapturer::transform_uninitialized_reduce | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
T * | result, | ||
C | bop, | ||
U | uop | ||
) |
updates a capture task to a transform-reduce task of no initialized value
This method is similar to cudaFlowCapturer::transform_uninitialized_reduce but operates on an existing task.
cudaTask tf::cudaFlowCapturer::transform_uninitialized_reduce | ( | I | first, |
I | last, | ||
T * | result, | ||
C | bop, | ||
U | uop | ||
) |
similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlowCapturer::uninitialized_reduce | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
T * | result, | ||
C | op | ||
) |
updates a capture task to an uninitialized-reduction task
This method is similar to cudaFlowCapturer::uninitialized_reduce but operates on an existing task.
cudaTask tf::cudaFlowCapturer::uninitialized_reduce | ( | I | first, |
I | last, | ||
T * | result, | ||
C | op | ||
) |
similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce
This method is equivalent to the parallel execution of the following loop on a GPU: