![]() |
Taskflow
3.2.0-Master-Branch
|
class to create a cudaFlow task dependency graph More...
#include <cudaflow.hpp>
Public Member Functions | |
cudaFlow () | |
constructs a standalone cudaFlow | |
~cudaFlow () | |
destroys the cudaFlow and its associated native CUDA graph and executable graph | |
bool | empty () const |
queries the emptiness of the graph | |
size_t | num_tasks () const |
queries the number of tasks | |
void | clear () |
clears the cudaFlow object | |
void | dump (std::ostream &os) const |
dumps the cudaFlow graph into a DOT format through an output stream | |
void | dump_native_graph (std::ostream &os) const |
dumps the native CUDA graph into a DOT format through an output stream | |
cudaTask | noop () |
creates a no-operation task | |
template<typename C > | |
cudaTask | host (C &&callable) |
creates a host task that runs a callable on the host | |
template<typename C > | |
void | host (cudaTask task, C &&callable) |
updates parameters of a host task | |
template<typename F , typename... ArgsT> | |
cudaTask | kernel (dim3 g, dim3 b, size_t s, F f, ArgsT &&... args) |
creates a kernel task | |
template<typename F , typename... ArgsT> | |
void | kernel (cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT &&... args) |
updates parameters of a kernel task | |
cudaTask | memset (void *dst, int v, size_t count) |
creates a memset task that fills untyped data with a byte value | |
void | memset (cudaTask task, void *dst, int ch, size_t count) |
updates parameters of a memset task | |
cudaTask | memcpy (void *tgt, const void *src, size_t bytes) |
creates a memcpy task that copies untyped data in bytes | |
void | memcpy (cudaTask task, void *tgt, const void *src, size_t bytes) |
updates parameters of a memcpy task | |
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr> | |
cudaTask | zero (T *dst, size_t count) |
creates a memset task that sets a typed memory block to zero | |
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr> | |
void | zero (cudaTask task, T *dst, size_t count) |
updates parameters of a memset task to a zero task | |
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr> | |
cudaTask | fill (T *dst, T value, size_t count) |
creates a memset task that fills a typed memory block with a value | |
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr> | |
void | fill (cudaTask task, T *dst, T value, size_t count) |
updates parameters of a memset task to a fill task | |
template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr> | |
cudaTask | copy (T *tgt, const T *src, size_t num) |
creates a memcopy task that copies typed data | |
template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr> | |
void | copy (cudaTask task, T *tgt, const T *src, size_t num) |
updates parameters of a memcpy task to a copy task | |
template<typename P > | |
void | offload_until (P &&predicate) |
offloads the cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true | |
void | offload_n (size_t N) |
offloads the cudaFlow and executes it the given number of times | |
void | offload () |
offloads the cudaFlow and executes it once | |
template<typename C > | |
cudaTask | single_task (C c) |
runs a callable with only a single kernel thread | |
template<typename C > | |
void | single_task (cudaTask task, C c) |
updates a single-threaded kernel task | |
template<typename I , typename C > | |
cudaTask | for_each (I first, I last, C callable) |
applies a callable to each dereferenced element of the data array | |
template<typename I , typename C > | |
void | for_each (cudaTask task, I first, I last, C callable) |
updates parameters of a kernel task created from tf::cudaFlow::for_each | |
template<typename I , typename C > | |
cudaTask | for_each_index (I first, I last, I step, C callable) |
applies a callable to each index in the range with the step size | |
template<typename I , typename C > | |
void | for_each_index (cudaTask task, I first, I last, I step, C callable) |
updates parameters of a kernel task created from tf::cudaFlow::for_each_index | |
template<typename I , typename O , typename C > | |
cudaTask | transform (I first, I last, O output, C op) |
applies a callable to a source range and stores the result in a target range | |
template<typename I , typename O , typename C > | |
void | transform (cudaTask task, I first, I last, O output, C c) |
updates parameters of a kernel task created from tf::cudaFlow::transform | |
template<typename I1 , typename I2 , typename O , typename C > | |
cudaTask | transform (I1 first1, I1 last1, I2 first2, O output, C op) |
creates a task to perform parallel transforms over two ranges of items | |
template<typename I1 , typename I2 , typename O , typename C > | |
void | transform (cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c) |
updates parameters of a kernel task created from tf::cudaFlow::transform | |
template<typename I , typename T , typename B > | |
cudaTask | reduce (I first, I last, T *result, B bop) |
performs parallel reduction over a range of items | |
template<typename I , typename T , typename C > | |
void | reduce (cudaTask task, I first, I last, T *result, C op) |
updates parameters of a kernel task created from tf::cudaFlow::reduce | |
template<typename I , typename T , typename B > | |
cudaTask | uninitialized_reduce (I first, I last, T *result, B bop) |
similar to tf::cudaFlow::reduce but does not assume any initial value to reduce | |
template<typename I , typename T , typename C > | |
void | uninitialized_reduce (cudaTask task, I first, I last, T *result, C op) |
updates parameters of a kernel task created from tf::cudaFlow::uninitialized_reduce | |
template<typename I , typename T , typename B , typename U > | |
cudaTask | transform_reduce (I first, I last, T *result, B bop, U uop) |
performs parallel reduction over a range of transformed items | |
template<typename I , typename T , typename B , typename U > | |
void | transform_reduce (cudaTask, I first, I last, T *result, B bop, U uop) |
updates parameters of a kernel task created from tf::cudaFlow::transform_reduce | |
template<typename I , typename T , typename B , typename U > | |
cudaTask | transform_uninitialized_reduce (I first, I last, T *result, B bop, U uop) |
similar to tf::cudaFlow::transform_reduce but does not assume any initial value to reduce | |
template<typename I , typename T , typename B , typename U > | |
void | transform_uninitialized_reduce (cudaTask task, I first, I last, T *result, B bop, U uop) |
updates parameters of a kernel task created from tf::cudaFlow::transform_uninitialized_reduce | |
template<typename I , typename O , typename C > | |
cudaTask | inclusive_scan (I first, I last, O output, C op) |
creates a task to perform parallel inclusive scan over a range of items | |
template<typename I , typename O , typename C > | |
void | inclusive_scan (cudaTask task, I first, I last, O output, C op) |
updates the parameters of a task created from tf::cudaFlow::inclusive_scan | |
template<typename I , typename O , typename C > | |
cudaTask | exclusive_scan (I first, I last, O output, C op) |
similar to cudaFlow::inclusive_scan but excludes the first value | |
template<typename I , typename O , typename C > | |
void | exclusive_scan (cudaTask task, I first, I last, O output, C op) |
updates the parameters of a task created from tf::cudaFlow::exclusive_scan | |
template<typename I , typename O , typename B , typename U > | |
cudaTask | transform_inclusive_scan (I first, I last, O output, B bop, U uop) |
creates a task to perform parallel inclusive scan over a range of transformed items | |
template<typename I , typename O , typename B , typename U > | |
void | transform_inclusive_scan (cudaTask task, I first, I last, O output, B bop, U uop) |
updates the parameters of a task created from tf::cudaFlow::transform_inclusive_scan | |
template<typename I , typename O , typename B , typename U > | |
cudaTask | transform_exclusive_scan (I first, I last, O output, B bop, U uop) |
similar to cudaFlow::transform_inclusive_scan but excludes the first value | |
template<typename I , typename O , typename B , typename U > | |
void | transform_exclusive_scan (cudaTask task, I first, I last, O output, B bop, U uop) |
updates the parameters of a task created from tf::cudaFlow::transform_exclusive_scan | |
template<typename A , typename B , typename C , typename Comp > | |
cudaTask | merge (A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp) |
creates a task to perform parallel merge on two sorted arrays | |
template<typename A , typename B , typename C , typename Comp > | |
void | merge (cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp) |
updates the parameters of a task created from tf::cudaFlow::merge | |
template<typename I , typename C > | |
cudaTask | sort (I first, I last, C comp) |
creates a task to perform a parallel sort on an array | |
template<typename I , typename C > | |
void | sort (cudaTask task, I first, I last, C comp) |
updates the parameters of the task created from tf::cudaFlow::sort | |
template<typename K_it , typename V_it , typename C > | |
cudaTask | sort_by_key (K_it k_first, K_it k_last, V_it v_first, C comp) |
creates kernels that sort the given array | |
template<typename K_it , typename V_it , typename C > | |
void | sort_by_key (cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp) |
updates the parameters of a task created from tf::cudaFlow::sort_by_key | |
template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C > | |
cudaTask | merge_by_key (a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp) |
creates a task to perform parallel key-value merge | |
template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C > | |
void | merge_by_key (cudaTask task, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp) |
updates the parameters of a task created from tf::cudaFlow::merge_by_key | |
template<typename I , typename U > | |
cudaTask | find_if (I first, I last, unsigned *idx, U op) |
creates a task to find the index of the first element in a range | |
template<typename I , typename U > | |
void | find_if (cudaTask task, I first, I last, unsigned *idx, U op) |
updates the parameters of the task created from tf::cudaFlow::find_if | |
template<typename I , typename O > | |
cudaTask | min_element (I first, I last, unsigned *idx, O op) |
finds the index of the minimum element in a range | |
template<typename I , typename O > | |
void | min_element (cudaTask task, I first, I last, unsigned *idx, O op) |
updates the parameters of the task created from tf::cudaFlow::min_element | |
template<typename I , typename O > | |
cudaTask | max_element (I first, I last, unsigned *idx, O op) |
finds the index of the maximum element in a range | |
template<typename I , typename O > | |
void | max_element (cudaTask task, I first, I last, unsigned *idx, O op) |
updates the parameters of the task created from tf::cudaFlow::max_element | |
template<typename C > | |
cudaTask | capture (C &&callable) |
constructs a subflow graph through tf::cudaFlowCapturer | |
template<typename C > | |
void | capture (cudaTask task, C callable) |
updates the captured child graph | |
Friends | |
class | Executor |
class to create a cudaFlow task dependency graph
A cudaFlow is a high-level interface over CUDA Graph to perform GPU operations using the task dependency graph model. The class provides a set of methods for creating and launching different tasks on one or multiple CUDA devices, for instance, kernel tasks, data transfer tasks, and memory operation tasks. The following example creates a cudaFlow of two kernel tasks, task1
and task2
, where task1
runs before task2
.
A cudaFlow is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor. That is, the callable that describes a cudaFlow will be executed sequentially. Inside a cudaFlow task, different GPU tasks (tf::cudaTask) may run in parallel scheduled by the CUDA runtime.
Please refer to GPU Tasking (cudaFlow) for details.
|
inline |
constructs a standalone cudaFlow
A standalone cudaFlow does not go through any taskflow and can be run by the caller thread using explicit offload methods (e.g., tf::cudaFlow::offload).
cudaTask tf::cudaFlow::capture | ( | C && | callable | ) |
constructs a subflow graph through tf::cudaFlowCapturer
C | callable type constructible from std::function<void(tf::cudaFlowCapturer&)> |
callable | the callable to construct a capture flow |
A captured subflow forms a sub-graph to the cudaFlow and can be used to capture custom (or third-party) kernels that cannot be directly constructed from the cudaFlow.
Example usage:
void tf::cudaFlow::capture | ( | cudaTask | task, |
C | callable | ||
) |
updates the captured child graph
The method is similar to tf::cudaFlow::capture but operates on a task of type tf::cudaTaskType::SUBFLOW. The new captured graph must be topologically identical to the original captured graph.
void tf::cudaFlow::copy | ( | cudaTask | task, |
T * | tgt, | ||
const T * | src, | ||
size_t | num | ||
) |
updates parameters of a memcpy task to a copy task
The method is similar to tf::cudaFlow::copy but operates on a task of type tf::cudaTaskType::MEMCPY. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
cudaTask tf::cudaFlow::copy | ( | T * | tgt, |
const T * | src, | ||
size_t | num | ||
) |
creates a memcopy task that copies typed data
T | element type (non-void) |
tgt | pointer to the target memory block |
src | pointer to the source memory block |
num | number of elements to copy |
A copy task transfers num*sizeof(T)
bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
|
inline |
dumps the native CUDA graph into a DOT format through an output stream
The native CUDA graph may be different from the upper-level cudaFlow graph when flow capture is involved.
void tf::cudaFlow::exclusive_scan | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
O | output, | ||
C | op | ||
) |
updates the parameters of a task created from tf::cudaFlow::exclusive_scan
This method is similar to tf::cudaFlow::exclusive_scan but operates on an existing task.
void tf::cudaFlow::fill | ( | cudaTask | task, |
T * | dst, | ||
T | value, | ||
size_t | count | ||
) |
updates parameters of a memset task to a fill task
The method is similar to tf::cudaFlow::fill but operates on a task of type tf::cudaTaskType::MEMSET.
The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
cudaTask tf::cudaFlow::fill | ( | T * | dst, |
T | value, | ||
size_t | count | ||
) |
creates a memset task that fills a typed memory block with a value
T | element type (size of T must be either 1, 2, or 4) |
dst | pointer to the destination device memory area |
value | value to fill for each element of type T |
count | number of elements |
A fill task fills the first count
elements of type T
with value
in a device memory area pointed by dst
. The value to fill is interpreted in type T
rather than byte.
cudaTask tf::cudaFlow::find_if | ( | I | first, |
I | last, | ||
unsigned * | idx, | ||
U | op | ||
) |
creates a task to find the index of the first element in a range
I | input iterator type |
U | unary operator type |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | pointer to the index of the found element |
op | unary operator which returns true for the required element |
Finds the index idx
of the first element in the range [first, last)
such that op(*(first+idx))
is true. This is equivalent to the parallel execution of the following loop:
void tf::cudaFlow::for_each | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
C | callable | ||
) |
updates parameters of a kernel task created from tf::cudaFlow::for_each
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each.
cudaTask tf::cudaFlow::for_each | ( | I | first, |
I | last, | ||
C | callable | ||
) |
applies a callable to each dereferenced element of the data array
I | iterator type |
C | callable type |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
callable | a callable object to apply to the dereferenced iterator |
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlow::for_each_index | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
I | step, | ||
C | callable | ||
) |
updates parameters of a kernel task created from tf::cudaFlow::for_each_index
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each_index.
cudaTask tf::cudaFlow::for_each_index | ( | I | first, |
I | last, | ||
I | step, | ||
C | callable | ||
) |
applies a callable to each index in the range with the step size
I | index type |
C | callable type |
first | beginning index |
last | last index |
step | step size |
callable | the callable to apply to each index in the range |
This method is equivalent to the parallel execution of the following loop on a GPU:
cudaTask tf::cudaFlow::host | ( | C && | callable | ) |
creates a host task that runs a callable on the host
C | callable type |
callable | a callable object with neither arguments nor return (i.e., constructible from std::function<void()> ) |
A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc
).
void tf::cudaFlow::host | ( | cudaTask | task, |
C && | callable | ||
) |
updates parameters of a host task
The method is similar to tf::cudaFlow::host but operates on a task of type tf::cudaTaskType::HOST.
void tf::cudaFlow::inclusive_scan | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
O | output, | ||
C | op | ||
) |
updates the parameters of a task created from tf::cudaFlow::inclusive_scan
This method is similar to tf::cudaFlow::inclusive_scan but operates on an existing task.
cudaTask tf::cudaFlow::inclusive_scan | ( | I | first, |
I | last, | ||
O | output, | ||
C | op | ||
) |
creates a task to perform parallel inclusive scan over a range of items
I | input iterator type |
O | output iterator type |
C | binary operator type |
first | iterator to the beginning |
last | iterator to the end |
output | iterator to the beginning of the output |
op | binary operator |
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlow::kernel | ( | cudaTask | task, |
dim3 | g, | ||
dim3 | b, | ||
size_t | shm, | ||
F | f, | ||
ArgsT &&... | args | ||
) |
updates parameters of a kernel task
The method is similar to tf::cudaFlow::kernel but operates on a task of type tf::cudaTaskType::KERNEL. The kernel function name must NOT change.
cudaTask tf::cudaFlow::kernel | ( | dim3 | g, |
dim3 | b, | ||
size_t | s, | ||
F | f, | ||
ArgsT &&... | args | ||
) |
creates a kernel task
F | kernel function type |
ArgsT | kernel function parameters type |
g | configured grid |
b | configured block |
s | configured shared memory size in bytes |
f | kernel function |
args | arguments to forward to the kernel function by copy |
cudaTask tf::cudaFlow::max_element | ( | I | first, |
I | last, | ||
unsigned * | idx, | ||
O | op | ||
) |
finds the index of the maximum element in a range
I | input iterator type |
O | comparator type |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the maximum element |
op | comparison function object |
The function launches kernels asynchronously to find the largest element in the range [first, last)
using the given comparator op
. The function is equivalent to a parallel execution of the following loop:
|
inline |
updates parameters of a memcpy task
The method is similar to tf::cudaFlow::memcpy but operates on a task of type tf::cudaTaskType::MEMCPY. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
|
inline |
creates a memcpy task that copies untyped data in bytes
tgt | pointer to the target memory block |
src | pointer to the source memory block |
bytes | bytes to copy |
A memcpy task transfers bytes
of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
|
inline |
updates parameters of a memset task
The method is similar to tf::cudaFlow::memset but operates on a task of type tf::cudaTaskType::MEMSET. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
|
inline |
creates a memset task that fills untyped data with a byte value
dst | pointer to the destination device memory area |
v | value to set for each byte of specified memory |
count | size in bytes to set |
A memset task fills the first count
bytes of device memory area pointed by dst
with the byte value v
.
cudaTask tf::cudaFlow::merge | ( | A | a_first, |
A | a_last, | ||
B | b_first, | ||
B | b_last, | ||
C | c_first, | ||
Comp | comp | ||
) |
creates a task to perform parallel merge on two sorted arrays
A | iterator type of the first input array |
B | iterator type of the second input array |
C | iterator type of the output array |
Comp | comparator type |
a_first | iterator to the beginning of the first input array |
a_last | iterator to the end of the first input array |
b_first | iterator to the beginning of the second input array |
b_last | iterator to the end of the second input array |
c_first | iterator to the beginning of the output array |
comp | binary comparator |
Merges two sorted ranges [a_first, a_last)
and [b_first, b_last)
into one sorted range beginning at c_first
.
A sequence is said to be sorted with respect to a comparator comp
if for any iterator it pointing to the sequence and any non-negative integer n
such that it + n
is a valid iterator pointing to an element of the sequence, comp(*(it + n), *it)
evaluates to false.
void tf::cudaFlow::merge | ( | cudaTask | task, |
A | a_first, | ||
A | a_last, | ||
B | b_first, | ||
B | b_last, | ||
C | c_first, | ||
Comp | comp | ||
) |
updates the parameters of a task created from tf::cudaFlow::merge
This method is similar to tf::cudaFlow::merge but operates on an existing task.
cudaTask tf::cudaFlow::merge_by_key | ( | a_keys_it | a_keys_first, |
a_keys_it | a_keys_last, | ||
a_vals_it | a_vals_first, | ||
b_keys_it | b_keys_first, | ||
b_keys_it | b_keys_last, | ||
b_vals_it | b_vals_first, | ||
c_keys_it | c_keys_first, | ||
c_vals_it | c_vals_first, | ||
C | comp | ||
) |
creates a task to perform parallel key-value merge
a_keys_it | first key iterator type |
a_vals_it | first value iterator type |
b_keys_it | second key iterator type |
b_vals_it | second value iterator type |
c_keys_it | output key iterator type |
c_vals_it | output value iterator type |
C | comparator type |
a_keys_first | iterator to the beginning of the first key range |
a_keys_last | iterator to the end of the first key range |
a_vals_first | iterator to the beginning of the first value range |
b_keys_first | iterator to the beginning of the second key range |
b_keys_last | iterator to the end of the second key range |
b_vals_first | iterator to the beginning of the second value range |
c_keys_first | iterator to the beginning of the output key range |
c_vals_first | iterator to the beginning of the output value range |
comp | comparator |
Performs a key-value merge that copies elements from [a_keys_first, a_keys_last)
and [b_keys_first, b_keys_last)
into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))
such that the resulting range is in ascending key order.
At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first))
and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first))
into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))
such that the resulting range is in ascending order implied by each input element's associated key.
For example, assume:
a_keys
= {1, 8}
a_vals
= {2, 1}
b_keys
= {3, 7}
b_vals
= {3, 4}
After the merge, we have:
c_keys
= {1, 3, 7, 8}
c_vals
= {2, 3, 4, 1}
void tf::cudaFlow::merge_by_key | ( | cudaTask | task, |
a_keys_it | a_keys_first, | ||
a_keys_it | a_keys_last, | ||
a_vals_it | a_vals_first, | ||
b_keys_it | b_keys_first, | ||
b_keys_it | b_keys_last, | ||
b_vals_it | b_vals_first, | ||
c_keys_it | c_keys_first, | ||
c_vals_it | c_vals_first, | ||
C | comp | ||
) |
updates the parameters of a task created from tf::cudaFlow::merge_by_key
This method is similar to tf::cudaFlow::merge_by_key but operates on an existing task.
cudaTask tf::cudaFlow::min_element | ( | I | first, |
I | last, | ||
unsigned * | idx, | ||
O | op | ||
) |
finds the index of the minimum element in a range
I | input iterator type |
O | comparator type |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the minimum element |
op | comparison function object |
The function launches kernels asynchronously to find the smallest element in the range [first, last)
using the given comparator op
. The function is equivalent to a parallel execution of the following loop:
|
inline |
creates a no-operation task
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n
nodes with a barrier between them can be represented using an empty node and 2*n
dependency edges, rather than no empty node and n^2
dependency edges.
|
inline |
offloads the cudaFlow and executes it the given number of times
N | number of executions |
void tf::cudaFlow::offload_until | ( | P && | predicate | ) |
offloads the cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
P | predicate type (a callable returning a boolean) |
predicate | a predicate that returns true to stop the repeated execution |
Immediately offloads the present cudaFlow onto a GPU and repeatedly runs it until the predicate returns true
.
An offloaded cudaFlow forces the underlying graph to be instantiated. After the instantiation, you should not modify the graph topology; you may only update node parameters.
By default, if users do not offload the cudaFlow, the executor will offload it once.
void tf::cudaFlow::reduce | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
T * | result, | ||
C | op | ||
) |
updates parameters of a kernel task created from tf::cudaFlow::reduce
The type of the iterators, result, and callable must be the same as the task created from tf::cudaFlow::reduce.
cudaTask tf::cudaFlow::reduce | ( | I | first, |
I | last, | ||
T * | result, | ||
B | bop | ||
) |
performs parallel reduction over a range of items
I | input iterator type |
T | value type |
B | binary operator type |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
result | pointer to the result with an initialized value |
bop | binary operator to apply to reduce items |
This method is equivalent to the parallel execution of the following loop on a GPU:
cudaTask tf::cudaFlow::single_task | ( | C | c | ) |
runs a callable with only a single kernel thread
C | callable type |
c | callable to run by a single kernel thread |
void tf::cudaFlow::single_task | ( | cudaTask | task, |
C | c | ||
) |
updates a single-threaded kernel task
This method is similar to cudaFlow::single_task but operates on an existing task.
void tf::cudaFlow::sort | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
C | comp | ||
) |
updates the parameters of the task created from tf::cudaFlow::sort
This method is similar to tf::cudaFlow::sort but operates on an existing task.
cudaTask tf::cudaFlow::sort | ( | I | first, |
I | last, | ||
C | comp | ||
) |
creates a task to perform a parallel sort on an array
I | iterator type of the first input array |
C | comparator type |
first | iterator to the beginning of the input array |
last | iterator to the end of the input array |
comp | binary comparator |
Sorts elements in the range [first, last)
with the given comparator comp
.
void tf::cudaFlow::sort_by_key | ( | cudaTask | task, |
K_it | k_first, | ||
K_it | k_last, | ||
V_it | v_first, | ||
C | comp | ||
) |
updates the parameters of a task created from tf::cudaFlow::sort_by_key
This method is similar to tf::cudaFlow::sort_by_key but operates on an existing task.
cudaTask tf::cudaFlow::sort_by_key | ( | K_it | k_first, |
K_it | k_last, | ||
V_it | v_first, | ||
C | comp | ||
) |
creates kernels that sort the given array
K_it | iterator type of the key |
V_it | iterator type of the value |
C | comparator type |
k_first | iterator to the beginning of the key array |
k_last | iterator to the end of the key array |
v_first | iterator to the beginning of the value array |
comp | binary comparator |
Sorts key-value elements in [k_first, k_last)
and [v_first, v_first + (k_last - k_first))
into ascending key order using the given comparator comp
. If i
and j
are any two valid iterators in [k_first, k_last)
such that i
precedes j
, and p
and q
are iterators in [v_first, v_first + (k_last - k_first))
corresponding to i
and j
respectively, then comp(*j, *i)
evaluates to false
.
For example, assume:
keys
are {1, 4, 2, 8, 5, 7}
values
are {'a', 'b', 'c', 'd', 'e', 'f'}
After sort:
keys
are {1, 2, 4, 5, 7, 8}
values
are {'a', 'c', 'b', 'e', 'f', 'd'}
void tf::cudaFlow::transform | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
O | output, | ||
C | c | ||
) |
updates parameters of a kernel task created from tf::cudaFlow::transform
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.
void tf::cudaFlow::transform | ( | cudaTask | task, |
I1 | first1, | ||
I1 | last1, | ||
I2 | first2, | ||
O | output, | ||
C | c | ||
) |
updates parameters of a kernel task created from tf::cudaFlow::transform
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.
cudaTask tf::cudaFlow::transform | ( | I | first, |
I | last, | ||
O | output, | ||
C | op | ||
) |
applies a callable to a source range and stores the result in a target range
I | input iterator type |
O | output iterator type |
C | unary operator type |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | the operator to apply to transform each element in the range |
This method is equivalent to the parallel execution of the following loop on a GPU:
cudaTask tf::cudaFlow::transform | ( | I1 | first1, |
I1 | last1, | ||
I2 | first2, | ||
O | output, | ||
C | op | ||
) |
creates a task to perform parallel transforms over two ranges of items
I1 | first input iterator type |
I2 | second input iterator type |
O | output iterator type |
C | binary operator type |
first1 | iterator to the beginning of the input range |
last1 | iterator to the end of the input range |
first2 | iterator to the beginning of the second input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to transform each pair of items in the two input ranges |
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlow::transform_exclusive_scan | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
O | output, | ||
B | bop, | ||
U | uop | ||
) |
updates the parameters of a task created from tf::cudaFlow::transform_exclusive_scan
This method is similar to tf::cudaFlow::transform_exclusive_scan but operates on an existing task.
void tf::cudaFlow::transform_inclusive_scan | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
O | output, | ||
B | bop, | ||
U | uop | ||
) |
updates the parameters of a task created from tf::cudaFlow::transform_inclusive_scan
This method is similar to tf::cudaFlow::transform_inclusive_scan but operates on an existing task.
cudaTask tf::cudaFlow::transform_inclusive_scan | ( | I | first, |
I | last, | ||
O | output, | ||
B | bop, | ||
U | uop | ||
) |
creates a task to perform parallel inclusive scan over a range of transformed items
I | input iterator type |
O | output iterator type |
B | binary operator type |
U | unary operator type |
first | iterator to the beginning |
last | iterator to the end |
output | iterator to the beginning of the output |
bop | binary operator |
uop | unary operator |
This method is equivalent to the parallel execution of the following loop on a GPU:
cudaTask tf::cudaFlow::transform_reduce | ( | I | first, |
I | last, | ||
T * | result, | ||
B | bop, | ||
U | uop | ||
) |
performs parallel reduction over a range of transformed items
I | input iterator type |
T | value type |
B | binary operator type |
U | unary operator type |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
result | pointer to the result with an initialized value |
bop | binary operator to apply to reduce items |
uop | unary operator to transform each item before reduction |
This method is equivalent to the parallel execution of the following loop on a GPU:
cudaTask tf::cudaFlow::transform_uninitialized_reduce | ( | I | first, |
I | last, | ||
T * | result, | ||
B | bop, | ||
U | uop | ||
) |
similar to tf::cudaFlow::transform_reduce but does not assume any initial value to reduce
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlow::uninitialized_reduce | ( | cudaTask | task, |
I | first, | ||
I | last, | ||
T * | result, | ||
C | op | ||
) |
updates parameters of a kernel task created from tf::cudaFlow::uninitialized_reduce
The type of the iterators, result, and callable must be the same as the task created from tf::cudaFlow::uninitialized_reduce.
cudaTask tf::cudaFlow::uninitialized_reduce | ( | I | first, |
I | last, | ||
T * | result, | ||
B | bop | ||
) |
similar to tf::cudaFlow::reduce but does not assume any initial value to reduce
This method is equivalent to the parallel execution of the following loop on a GPU:
void tf::cudaFlow::zero | ( | cudaTask | task, |
T * | dst, | ||
size_t | count | ||
) |
updates parameters of a memset task to a zero task
The method is similar to tf::cudaFlow::zero but operates on a task of type tf::cudaTaskType::MEMSET.
The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
cudaTask tf::cudaFlow::zero | ( | T * | dst, |
size_t | count | ||
) |
creates a memset task that sets a typed memory block to zero
T | element type (size of T must be either 1, 2, or 4) |
dst | pointer to the destination device memory area |
count | number of elements |
A zero task zeroes the first count
elements of type T
in a device memory area pointed by dst
.