Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
tf::cudaFlow Class Reference

class to create a cudaFlow task dependency graph More...

#include <cudaflow.hpp>

Public Member Functions

 cudaFlow ()
 constructs a standalone cudaFlow
 
 ~cudaFlow ()
 destroys the cudaFlow and its associated native CUDA graph and executable graph
 
bool empty () const
 queries the emptiness of the graph
 
size_t num_tasks () const
 queries the number of tasks
 
void clear ()
 clears the cudaFlow object
 
void dump (std::ostream &os) const
 dumps the cudaFlow graph into a DOT format through an output stream
 
void dump_native_graph (std::ostream &os) const
 dumps the native CUDA graph into a DOT format through an output stream
 
cudaTask noop ()
 creates a no-operation task
 
template<typename C >
cudaTask host (C &&callable)
 creates a host task that runs a callable on the host
 
template<typename C >
void host (cudaTask task, C &&callable)
 updates parameters of a host task
 
template<typename F , typename... ArgsT>
cudaTask kernel (dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
 creates a kernel task
 
template<typename F , typename... ArgsT>
void kernel (cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT &&... args)
 updates parameters of a kernel task
 
cudaTask memset (void *dst, int v, size_t count)
 creates a memset task that fills untyped data with a byte value
 
void memset (cudaTask task, void *dst, int ch, size_t count)
 updates parameters of a memset task
 
cudaTask memcpy (void *tgt, const void *src, size_t bytes)
 creates a memcpy task that copies untyped data in bytes
 
void memcpy (cudaTask task, void *tgt, const void *src, size_t bytes)
 updates parameters of a memcpy task
 
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr>
cudaTask zero (T *dst, size_t count)
 creates a memset task that sets a typed memory block to zero
 
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr>
void zero (cudaTask task, T *dst, size_t count)
 updates parameters of a memset task to a zero task
 
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr>
cudaTask fill (T *dst, T value, size_t count)
 creates a memset task that fills a typed memory block with a value
 
template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * = nullptr>
void fill (cudaTask task, T *dst, T value, size_t count)
 updates parameters of a memset task to a fill task
 
template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr>
cudaTask copy (T *tgt, const T *src, size_t num)
 creates a memcopy task that copies typed data
 
template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr>
void copy (cudaTask task, T *tgt, const T *src, size_t num)
 updates parameters of a memcpy task to a copy task
 
template<typename P >
void offload_until (P &&predicate)
 offloads the cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
 
void offload_n (size_t N)
 offloads the cudaFlow and executes it the given number of times
 
void offload ()
 offloads the cudaFlow and executes it once
 
template<typename C >
cudaTask single_task (C c)
 runs a callable with only a single kernel thread
 
template<typename C >
void single_task (cudaTask task, C c)
 updates a single-threaded kernel task
 
template<typename I , typename C >
cudaTask for_each (I first, I last, C callable)
 applies a callable to each dereferenced element of the data array
 
template<typename I , typename C >
void for_each (cudaTask task, I first, I last, C callable)
 updates parameters of a kernel task created from tf::cudaFlow::for_each
 
template<typename I , typename C >
cudaTask for_each_index (I first, I last, I step, C callable)
 applies a callable to each index in the range with the step size
 
template<typename I , typename C >
void for_each_index (cudaTask task, I first, I last, I step, C callable)
 updates parameters of a kernel task created from tf::cudaFlow::for_each_index
 
template<typename I , typename O , typename C >
cudaTask transform (I first, I last, O output, C op)
 applies a callable to a source range and stores the result in a target range
 
template<typename I , typename O , typename C >
void transform (cudaTask task, I first, I last, O output, C c)
 updates parameters of a kernel task created from tf::cudaFlow::transform
 
template<typename I1 , typename I2 , typename O , typename C >
cudaTask transform (I1 first1, I1 last1, I2 first2, O output, C op)
 creates a task to perform parallel transforms over two ranges of items
 
template<typename I1 , typename I2 , typename O , typename C >
void transform (cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c)
 updates parameters of a kernel task created from tf::cudaFlow::transform
 
template<typename I , typename T , typename B >
cudaTask reduce (I first, I last, T *result, B bop)
 performs parallel reduction over a range of items
 
template<typename I , typename T , typename C >
void reduce (cudaTask task, I first, I last, T *result, C op)
 updates parameters of a kernel task created from tf::cudaFlow::reduce
 
template<typename I , typename T , typename B >
cudaTask uninitialized_reduce (I first, I last, T *result, B bop)
 similar to tf::cudaFlow::reduce but does not assume any initial value to reduce
 
template<typename I , typename T , typename C >
void uninitialized_reduce (cudaTask task, I first, I last, T *result, C op)
 updates parameters of a kernel task created from tf::cudaFlow::uninitialized_reduce
 
template<typename I , typename T , typename B , typename U >
cudaTask transform_reduce (I first, I last, T *result, B bop, U uop)
 performs parallel reduction over a range of transformed items
 
template<typename I , typename T , typename B , typename U >
void transform_reduce (cudaTask, I first, I last, T *result, B bop, U uop)
 updates parameters of a kernel task created from tf::cudaFlow::transform_reduce
 
template<typename I , typename T , typename B , typename U >
cudaTask transform_uninitialized_reduce (I first, I last, T *result, B bop, U uop)
 similar to tf::cudaFlow::transform_reduce but does not assume any initial value to reduce
 
template<typename I , typename T , typename B , typename U >
void transform_uninitialized_reduce (cudaTask task, I first, I last, T *result, B bop, U uop)
 updates parameters of a kernel task created from tf::cudaFlow::transform_uninitialized_reduce
 
template<typename I , typename O , typename C >
cudaTask inclusive_scan (I first, I last, O output, C op)
 creates a task to perform parallel inclusive scan over a range of items
 
template<typename I , typename O , typename C >
void inclusive_scan (cudaTask task, I first, I last, O output, C op)
 updates the parameters of a task created from tf::cudaFlow::inclusive_scan
 
template<typename I , typename O , typename C >
cudaTask exclusive_scan (I first, I last, O output, C op)
 similar to cudaFlow::inclusive_scan but excludes the first value
 
template<typename I , typename O , typename C >
void exclusive_scan (cudaTask task, I first, I last, O output, C op)
 updates the parameters of a task created from tf::cudaFlow::exclusive_scan
 
template<typename I , typename O , typename B , typename U >
cudaTask transform_inclusive_scan (I first, I last, O output, B bop, U uop)
 creates a task to perform parallel inclusive scan over a range of transformed items
 
template<typename I , typename O , typename B , typename U >
void transform_inclusive_scan (cudaTask task, I first, I last, O output, B bop, U uop)
 updates the parameters of a task created from tf::cudaFlow::transform_inclusive_scan
 
template<typename I , typename O , typename B , typename U >
cudaTask transform_exclusive_scan (I first, I last, O output, B bop, U uop)
 similar to cudaFlow::transform_inclusive_scan but excludes the first value
 
template<typename I , typename O , typename B , typename U >
void transform_exclusive_scan (cudaTask task, I first, I last, O output, B bop, U uop)
 updates the parameters of a task created from tf::cudaFlow::transform_exclusive_scan
 
template<typename A , typename B , typename C , typename Comp >
cudaTask merge (A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
 creates a task to perform parallel merge on two sorted arrays
 
template<typename A , typename B , typename C , typename Comp >
void merge (cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
 updates the parameters of a task created from tf::cudaFlow::merge
 
template<typename I , typename C >
cudaTask sort (I first, I last, C comp)
 creates a task to perform a parallel sort on an array
 
template<typename I , typename C >
void sort (cudaTask task, I first, I last, C comp)
 updates the parameters of the task created from tf::cudaFlow::sort
 
template<typename K_it , typename V_it , typename C >
cudaTask sort_by_key (K_it k_first, K_it k_last, V_it v_first, C comp)
 creates kernels that sort the given array
 
template<typename K_it , typename V_it , typename C >
void sort_by_key (cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp)
 updates the parameters of a task created from tf::cudaFlow::sort_by_key
 
template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
cudaTask merge_by_key (a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
 creates a task to perform parallel key-value merge
 
template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
void merge_by_key (cudaTask task, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
 updates the parameters of a task created from tf::cudaFlow::merge_by_key
 
template<typename I , typename U >
cudaTask find_if (I first, I last, unsigned *idx, U op)
 creates a task to find the index of the first element in a range
 
template<typename I , typename U >
void find_if (cudaTask task, I first, I last, unsigned *idx, U op)
 updates the parameters of the task created from tf::cudaFlow::find_if
 
template<typename I , typename O >
cudaTask min_element (I first, I last, unsigned *idx, O op)
 finds the index of the minimum element in a range
 
template<typename I , typename O >
void min_element (cudaTask task, I first, I last, unsigned *idx, O op)
 updates the parameters of the task created from tf::cudaFlow::min_element
 
template<typename I , typename O >
cudaTask max_element (I first, I last, unsigned *idx, O op)
 finds the index of the maximum element in a range
 
template<typename I , typename O >
void max_element (cudaTask task, I first, I last, unsigned *idx, O op)
 updates the parameters of the task created from tf::cudaFlow::max_element
 
template<typename C >
cudaTask capture (C &&callable)
 constructs a subflow graph through tf::cudaFlowCapturer
 
template<typename C >
void capture (cudaTask task, C callable)
 updates the captured child graph
 

Friends

class Executor
 

Detailed Description

class to create a cudaFlow task dependency graph

A cudaFlow is a high-level interface over CUDA Graph to perform GPU operations using the task dependency graph model. The class provides a set of methods for creating and launching different tasks on one or multiple CUDA devices, for instance, kernel tasks, data transfer tasks, and memory operation tasks. The following example creates a cudaFlow of two kernel tasks, task1 and task2, where task1 runs before task2.

tf::Taskflow taskflow;
tf::Executor executor;
taskflow.emplace([&](tf::cudaFlow& cf){
// create two kernel tasks
tf::cudaTask task1 = cf.kernel(grid1, block1, shm_size1, kernel1, args1);
tf::cudaTask task2 = cf.kernel(grid2, block2, shm_size2, kernel2, args2);
// kernel1 runs before kernel2
task1.precede(task2);
});
executor.run(taskflow).wait();
class to create an executor for running a taskflow graph
Definition executor.hpp:50
tf::Future< void > run(Taskflow &taskflow)
runs a taskflow once
Definition executor.hpp:1573
Task emplace(C &&callable)
creates a static task
Definition flow_builder.hpp:742
class to create a taskflow object
Definition core/taskflow.hpp:73
class to create a cudaFlow task dependency graph
Definition cudaflow.hpp:56
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
creates a kernel task
Definition cudaflow.hpp:1272
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
cudaTask & precede(Ts &&... tasks)
adds precedence links from this to other tasks
Definition cuda_task.hpp:182

A cudaFlow is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor. That is, the callable that describes a cudaFlow will be executed sequentially. Inside a cudaFlow task, different GPU tasks (tf::cudaTask) may run in parallel scheduled by the CUDA runtime.

Please refer to GPU Tasking (cudaFlow) for details.

Constructor & Destructor Documentation

◆ cudaFlow()

tf::cudaFlow::cudaFlow ( )
inline

constructs a standalone cudaFlow

A standalone cudaFlow does not go through any taskflow and can be run by the caller thread using explicit offload methods (e.g., tf::cudaFlow::offload).

Member Function Documentation

◆ capture() [1/2]

template<typename C >
cudaTask tf::cudaFlow::capture ( C &&  callable)

constructs a subflow graph through tf::cudaFlowCapturer

Template Parameters
Ccallable type constructible from std::function<void(tf::cudaFlowCapturer&)>
Parameters
callablethe callable to construct a capture flow
Returns
a tf::cudaTask handle

A captured subflow forms a sub-graph of the cudaFlow and can be used to capture custom (or third-party) kernels that cannot be directly constructed from the cudaFlow.

Example usage:

taskflow.emplace([&](tf::cudaFlow& cf){
tf::cudaTask my_kernel = cf.kernel(my_arguments);
// create a flow capturer to capture custom kernels
tf::cudaTask my_subflow = cf.capture([&](tf::cudaFlowCapturer& capturer){
capturer.on([&](cudaStream_t stream){
invoke_custom_kernel_with_stream(stream, custom_arguments);
});
});
my_kernel.precede(my_subflow);
});
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask on(C &&callable)
captures sequential CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582

◆ capture() [2/2]

template<typename C >
void tf::cudaFlow::capture ( cudaTask  task,
callable 
)

updates the captured child graph

The method is similar to tf::cudaFlow::capture but operates on a task of type tf::cudaTaskType::SUBFLOW. The new captured graph must be topologically identical to the original captured graph.

◆ copy() [1/2]

template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * >
void tf::cudaFlow::copy ( cudaTask  task,
T *  tgt,
const T *  src,
size_t  num 
)

updates parameters of a memcpy task to a copy task

The method is similar to tf::cudaFlow::copy but operates on a task of type tf::cudaTaskType::MEMCPY. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.

◆ copy() [2/2]

template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * >
cudaTask tf::cudaFlow::copy ( T *  tgt,
const T *  src,
size_t  num 
)

creates a memcopy task that copies typed data

Template Parameters
Telement type (non-void)
Parameters
tgtpointer to the target memory block
srcpointer to the source memory block
numnumber of elements to copy
Returns
a tf::cudaTask handle

A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

◆ dump_native_graph()

void tf::cudaFlow::dump_native_graph ( std::ostream &  os) const
inline

dumps the native CUDA graph into a DOT format through an output stream

The native CUDA graph may be different from the upper-level cudaFlow graph when flow capture is involved.

◆ exclusive_scan()

template<typename I , typename O , typename C >
void tf::cudaFlow::exclusive_scan ( cudaTask  task,
first,
last,
output,
op 
)

updates the parameters of a task created from tf::cudaFlow::exclusive_scan

This method is similar to tf::cudaFlow::exclusive_scan but operates on an existing task.

◆ fill() [1/2]

template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * >
void tf::cudaFlow::fill ( cudaTask  task,
T *  dst,
value,
size_t  count 
)

updates parameters of a memset task to a fill task

The method is similar to tf::cudaFlow::fill but operates on a task of type tf::cudaTaskType::MEMSET.

The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.

◆ fill() [2/2]

template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * >
cudaTask tf::cudaFlow::fill ( T *  dst,
value,
size_t  count 
)

creates a memset task that fills a typed memory block with a value

Template Parameters
Telement type (size of T must be either 1, 2, or 4)
Parameters
dstpointer to the destination device memory area
valuevalue to fill for each element of type T
countnumber of elements
Returns
a tf::cudaTask handle

A fill task fills the first count elements of type T with value in a device memory area pointed to by dst. The value to fill is interpreted in type T rather than byte.

◆ find_if()

template<typename I , typename U >
cudaTask tf::cudaFlow::find_if ( first,
last,
unsigned *  idx,
op 
)

creates a task to find the index of the first element in a range

Template Parameters
Iinput iterator type
Uunary operator type
Parameters
firstiterator to the beginning of the range
lastiterator to the end of the range
idxpointer to the index of the found element
opunary operator which returns true for the required element

Finds the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. This is equivalent to the parallel execution of the following loop:

unsigned idx = 0;
for(; first != last; ++first, ++idx) {
if (op(*first)) {
return idx;
}
}
return idx;

◆ for_each() [1/2]

template<typename I , typename C >
void tf::cudaFlow::for_each ( cudaTask  task,
first,
last,
callable 
)

updates parameters of a kernel task created from tf::cudaFlow::for_each

The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each.

◆ for_each() [2/2]

template<typename I , typename C >
cudaTask tf::cudaFlow::for_each ( first,
last,
callable 
)

applies a callable to each dereferenced element of the data array

Template Parameters
Iiterator type
Ccallable type
Parameters
firstiterator to the beginning (inclusive)
lastiterator to the end (exclusive)
callablea callable object to apply to the dereferenced iterator
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; itr++) {
callable(*itr);
}

◆ for_each_index() [1/2]

template<typename I , typename C >
void tf::cudaFlow::for_each_index ( cudaTask  task,
first,
last,
step,
callable 
)

updates parameters of a kernel task created from tf::cudaFlow::for_each_index

The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each_index.

◆ for_each_index() [2/2]

template<typename I , typename C >
cudaTask tf::cudaFlow::for_each_index ( first,
last,
step,
callable 
)

applies a callable to each index in the range with the step size

Template Parameters
Iindex type
Ccallable type
Parameters
firstbeginning index
lastlast index
stepstep size
callablethe callable to apply to each index in the range
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
callable(i);
}
// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
callable(i);
}

◆ host() [1/2]

template<typename C >
cudaTask tf::cudaFlow::host ( C &&  callable)

creates a host task that runs a callable on the host

Template Parameters
Ccallable type
Parameters
callablea callable object with neither arguments nor return (i.e., constructible from std::function<void()>)
Returns
a tf::cudaTask handle

A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc).

◆ host() [2/2]

template<typename C >
void tf::cudaFlow::host ( cudaTask  task,
C &&  callable 
)

updates parameters of a host task

The method is similar to tf::cudaFlow::host but operates on a task of type tf::cudaTaskType::HOST.

◆ inclusive_scan() [1/2]

template<typename I , typename O , typename C >
void tf::cudaFlow::inclusive_scan ( cudaTask  task,
first,
last,
output,
op 
)

updates the parameters of a task created from tf::cudaFlow::inclusive_scan

This method is similar to tf::cudaFlow::inclusive_scan but operates on an existing task.

◆ inclusive_scan() [2/2]

template<typename I , typename O , typename C >
cudaTask tf::cudaFlow::inclusive_scan ( first,
last,
output,
op 
)

creates a task to perform parallel inclusive scan over a range of items

Template Parameters
Iinput iterator type
Ooutput iterator type
Cbinary operator type
Parameters
firstiterator to the beginning
lastiterator to the end
outputiterator to the beginning of the output
opbinary operator
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(size_t i=0; i<std::distance(first, last); i++) {
*(output + i) = i ? op(*(first+i), *(output+i-1)) : *(first+i);
}
T distance(T... args)

◆ kernel() [1/2]

template<typename F , typename... ArgsT>
void tf::cudaFlow::kernel ( cudaTask  task,
dim3  g,
dim3  b,
size_t  shm,
f,
ArgsT &&...  args 
)

updates parameters of a kernel task

The method is similar to tf::cudaFlow::kernel but operates on a task of type tf::cudaTaskType::KERNEL. The kernel function name must NOT change.

◆ kernel() [2/2]

template<typename F , typename... ArgsT>
cudaTask tf::cudaFlow::kernel ( dim3  g,
dim3  b,
size_t  s,
f,
ArgsT &&...  args 
)

creates a kernel task

Template Parameters
Fkernel function type
ArgsTkernel function parameters type
Parameters
gconfigured grid
bconfigured block
sconfigured shared memory size in bytes
fkernel function
argsarguments to forward to the kernel function by copy
Returns
a tf::cudaTask handle

◆ max_element()

template<typename I , typename O >
cudaTask tf::cudaFlow::max_element ( first,
last,
unsigned *  idx,
op 
)

finds the index of the maximum element in a range

Template Parameters
Iinput iterator type
Ocomparator type
Parameters
firstiterator to the beginning of the range
lastiterator to the end of the range
idxsolution index of the maximum element
opcomparison function object

The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
return 0;
}
auto largest = first;
for (auto itr = std::next(first); itr != last; ++itr) {
if (op(*largest, *itr)) {
largest = itr;
}
}
return std::distance(first, largest);

◆ memcpy() [1/2]

void tf::cudaFlow::memcpy ( cudaTask  task,
void *  tgt,
const void *  src,
size_t  bytes 
)
inline

updates parameters of a memcpy task

The method is similar to tf::cudaFlow::memcpy but operates on a task of type tf::cudaTaskType::MEMCPY. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.

◆ memcpy() [2/2]

cudaTask tf::cudaFlow::memcpy ( void *  tgt,
const void *  src,
size_t  bytes 
)
inline

creates a memcpy task that copies untyped data in bytes

Parameters
tgtpointer to the target memory block
srcpointer to the source memory block
bytesbytes to copy
Returns
a tf::cudaTask handle

A memcpy task transfers bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

◆ memset() [1/2]

void tf::cudaFlow::memset ( cudaTask  task,
void *  dst,
int  ch,
size_t  count 
)
inline

updates parameters of a memset task

The method is similar to tf::cudaFlow::memset but operates on a task of type tf::cudaTaskType::MEMSET. The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.

◆ memset() [2/2]

cudaTask tf::cudaFlow::memset ( void *  dst,
int  v,
size_t  count 
)
inline

creates a memset task that fills untyped data with a byte value

Parameters
dstpointer to the destination device memory area
vvalue to set for each byte of specified memory
countsize in bytes to set
Returns
a tf::cudaTask handle

A memset task fills the first count bytes of device memory area pointed to by dst with the byte value v.

◆ merge() [1/2]

template<typename A , typename B , typename C , typename Comp >
cudaTask tf::cudaFlow::merge ( a_first,
a_last,
b_first,
b_last,
c_first,
Comp  comp 
)

creates a task to perform parallel merge on two sorted arrays

Template Parameters
Aiterator type of the first input array
Biterator type of the second input array
Citerator type of the output array
Compcomparator type
Parameters
a_firstiterator to the beginning of the first input array
a_lastiterator to the end of the first input array
b_firstiterator to the beginning of the second input array
b_lastiterator to the end of the second input array
c_firstiterator to the beginning of the output array
compbinary comparator
Returns
a tf::cudaTask handle

Merges two sorted ranges [a_first, a_last) and [b_first, b_last) into one sorted range beginning at c_first.

A sequence is said to be sorted with respect to a comparator comp if for any iterator it pointing to the sequence and any non-negative integer n such that it + n is a valid iterator pointing to an element of the sequence, comp(*(it + n), *it) evaluates to false.

◆ merge() [2/2]

template<typename A , typename B , typename C , typename Comp >
void tf::cudaFlow::merge ( cudaTask  task,
a_first,
a_last,
b_first,
b_last,
c_first,
Comp  comp 
)

updates the parameters of a task created from tf::cudaFlow::merge

This method is similar to tf::cudaFlow::merge but operates on an existing task.

◆ merge_by_key() [1/2]

template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
cudaTask tf::cudaFlow::merge_by_key ( a_keys_it  a_keys_first,
a_keys_it  a_keys_last,
a_vals_it  a_vals_first,
b_keys_it  b_keys_first,
b_keys_it  b_keys_last,
b_vals_it  b_vals_first,
c_keys_it  c_keys_first,
c_vals_it  c_vals_first,
comp 
)

creates a task to perform parallel key-value merge

Template Parameters
a_keys_itfirst key iterator type
a_vals_itfirst value iterator type
b_keys_itsecond key iterator type
b_vals_itsecond value iterator type
c_keys_itoutput key iterator type
c_vals_itoutput value iterator type
Ccomparator type
Parameters
a_keys_firstiterator to the beginning of the first key range
a_keys_lastiterator to the end of the first key range
a_vals_firstiterator to the beginning of the first value range
b_keys_firstiterator to the beginning of the second key range
b_keys_lastiterator to the end of the second key range
b_vals_firstiterator to the beginning of the second value range
c_keys_firstiterator to the beginning of the output key range
c_vals_firstiterator to the beginning of the output value range
compcomparator

Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending key order.

At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending order implied by each input element's associated key.

For example, assume:

  • a_keys = {1, 8}
  • a_vals = {2, 1}
  • b_keys = {3, 7}
  • b_vals = {3, 4}

After the merge, we have:

  • c_keys = {1, 3, 7, 8}
  • c_vals = {2, 3, 4, 1}

◆ merge_by_key() [2/2]

template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
void tf::cudaFlow::merge_by_key ( cudaTask  task,
a_keys_it  a_keys_first,
a_keys_it  a_keys_last,
a_vals_it  a_vals_first,
b_keys_it  b_keys_first,
b_keys_it  b_keys_last,
b_vals_it  b_vals_first,
c_keys_it  c_keys_first,
c_vals_it  c_vals_first,
comp 
)

updates the parameters of a task created from tf::cudaFlow::merge_by_key

This method is similar to tf::cudaFlow::merge_by_key but operates on an existing task.

◆ min_element()

template<typename I , typename O >
cudaTask tf::cudaFlow::min_element ( first,
last,
unsigned *  idx,
op 
)

finds the index of the minimum element in a range

Template Parameters
Iinput iterator type
Ocomparator type
Parameters
firstiterator to the beginning of the range
lastiterator to the end of the range
idxsolution index of the minimum element
opcomparison function object

The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
return 0;
}
auto smallest = first;
for (auto itr = std::next(first); itr != last; ++itr) {
if (op(*itr, *smallest)) {
smallest = itr;
}
}
return std::distance(first, smallest);

◆ noop()

cudaTask tf::cudaFlow::noop ( )
inline

creates a no-operation task

Returns
a tf::cudaTask handle

An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.

◆ offload_n()

void tf::cudaFlow::offload_n ( size_t  N)
inline

offloads the cudaFlow and executes it the given number of times

Parameters
Nnumber of executions

◆ offload_until()

template<typename P >
void tf::cudaFlow::offload_until ( P &&  predicate)

offloads the cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true

Template Parameters
Ppredicate type (a callable that returns a boolean)
Parameters
predicatea predicate callable (returns true to stop)

Immediately offloads the present cudaFlow onto a GPU and repeatedly runs it until the predicate returns true.

An offloaded cudaFlow forces the underlying graph to be instantiated. After the instantiation, you should not modify the graph topology; you may only update node parameters.

By default, if users do not offload the cudaFlow, the executor will offload it once.

◆ reduce() [1/2]

template<typename I , typename T , typename C >
void tf::cudaFlow::reduce ( cudaTask  task,
first,
last,
T *  result,
op 
)

updates parameters of a kernel task created from tf::cudaFlow::reduce

The type of the iterators, result, and callable must be the same as the task created from tf::cudaFlow::reduce.

◆ reduce() [2/2]

template<typename I , typename T , typename B >
cudaTask tf::cudaFlow::reduce ( I  first,
I  last,
T *  result,
B  bop 
)

performs parallel reduction over a range of items

Template Parameters
Iinput iterator type
Tvalue type
Bbinary operator type
Parameters
firstiterator to the beginning (inclusive)
lastiterator to the end (exclusive)
resultpointer to the result with an initialized value
bopbinary operator to apply to reduce items
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
*result = bop(*result, *first++);
}

◆ single_task() [1/2]

template<typename C >
cudaTask tf::cudaFlow::single_task ( C  c)

runs a callable with only a single kernel thread

Template Parameters
Ccallable type
Parameters
ccallable to run by a single kernel thread
Returns
a tf::cudaTask handle

◆ single_task() [2/2]

template<typename C >
void tf::cudaFlow::single_task ( cudaTask  task,
C  c 
)

updates a single-threaded kernel task

This method is similar to cudaFlow::single_task but operates on an existing task.

◆ sort() [1/2]

template<typename I , typename C >
void tf::cudaFlow::sort ( cudaTask  task,
I  first,
I  last,
C  comp 
)

updates the parameters of the task created from tf::cudaFlow::sort

This method is similar to tf::cudaFlow::sort but operates on an existing task.

◆ sort() [2/2]

template<typename I , typename C >
cudaTask tf::cudaFlow::sort ( I  first,
I  last,
C  comp 
)

creates a task to perform a parallel sort on an array

Template Parameters
Iiterator type of the first input array
Ccomparator type
Parameters
firstiterator to the beginning of the input array
lastiterator to the end of the input array
compbinary comparator
Returns
a tf::cudaTask handle

Sorts elements in the range [first, last) with the given comparator comp.

◆ sort_by_key() [1/2]

template<typename K_it , typename V_it , typename C >
void tf::cudaFlow::sort_by_key ( cudaTask  task,
K_it  k_first,
K_it  k_last,
V_it  v_first,
C  comp 
)

updates the parameters of a task created from tf::cudaFlow::sort_by_key

This method is similar to tf::cudaFlow::sort_by_key but operates on an existing task.

◆ sort_by_key() [2/2]

template<typename K_it , typename V_it , typename C >
cudaTask tf::cudaFlow::sort_by_key ( K_it  k_first,
K_it  k_last,
V_it  v_first,
C  comp 
)

creates kernels that sort the given array

Template Parameters
K_ititerator type of the key
V_ititerator type of the value
Ccomparator type
Parameters
k_firstiterator to the beginning of the key array
k_lastiterator to the end of the key array
v_firstiterator to the beginning of the value array
compbinary comparator
Returns
a tf::cudaTask handle

Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false.

For example, assume:

  • keys are {1, 4, 2, 8, 5, 7}
  • values are {'a', 'b', 'c', 'd', 'e', 'f'}

After sort:

  • keys are {1, 2, 4, 5, 7, 8}
  • values are {'a', 'c', 'b', 'e', 'f', 'd'}

◆ transform() [1/4]

template<typename I , typename O , typename C >
void tf::cudaFlow::transform ( cudaTask  task,
I  first,
I  last,
O  output,
C  c 
)

updates parameters of a kernel task created from tf::cudaFlow::transform

The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.

◆ transform() [2/4]

template<typename I1 , typename I2 , typename O , typename C >
void tf::cudaFlow::transform ( cudaTask  task,
I1  first1,
I1  last1,
I2  first2,
O  output,
C  c 
)

updates parameters of a kernel task created from tf::cudaFlow::transform

The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.

◆ transform() [3/4]

template<typename I , typename O , typename C >
cudaTask tf::cudaFlow::transform ( I  first,
I  last,
O  output,
C  op 
)

applies a callable to a source range and stores the result in a target range

Template Parameters
Iinput iterator type
Ooutput iterator type
Cunary operator type
Parameters
firstiterator to the beginning of the input range
lastiterator to the end of the input range
outputiterator to the beginning of the output range
opthe operator to apply to transform each element in the range
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
*output++ = op(*first++);
}

◆ transform() [4/4]

template<typename I1 , typename I2 , typename O , typename C >
cudaTask tf::cudaFlow::transform ( I1  first1,
I1  last1,
I2  first2,
O  output,
C  op 
)

creates a task to perform parallel transforms over two ranges of items

Template Parameters
I1first input iterator type
I2second input iterator type
Ooutput iterator type
Cbinary operator type
Parameters
first1iterator to the beginning of the input range
last1iterator to the end of the input range
first2iterator to the beginning of the second input range
outputiterator to the beginning of the output range
opbinary operator to apply to transform each pair of items in the two input ranges
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
*output++ = op(*first1++, *first2++);
}

◆ transform_exclusive_scan()

template<typename I , typename O , typename B , typename U >
void tf::cudaFlow::transform_exclusive_scan ( cudaTask  task,
I  first,
I  last,
O  output,
B  bop,
U  uop 
)

updates the parameters of a task created from tf::cudaFlow::transform_exclusive_scan

This method is similar to tf::cudaFlow::transform_exclusive_scan but operates on an existing task.

◆ transform_inclusive_scan() [1/2]

template<typename I , typename O , typename B , typename U >
void tf::cudaFlow::transform_inclusive_scan ( cudaTask  task,
I  first,
I  last,
O  output,
B  bop,
U  uop 
)

updates the parameters of a task created from tf::cudaFlow::transform_inclusive_scan

This method is similar to tf::cudaFlow::transform_inclusive_scan but operates on an existing task.

◆ transform_inclusive_scan() [2/2]

template<typename I , typename O , typename B , typename U >
cudaTask tf::cudaFlow::transform_inclusive_scan ( I  first,
I  last,
O  output,
B  bop,
U  uop 
)

creates a task to perform parallel inclusive scan over a range of transformed items

Template Parameters
Iinput iterator type
Ooutput iterator type
Bbinary operator type
Uunary operator type
Parameters
firstiterator to the beginning
lastiterator to the end
outputiterator to the beginning of the output
bopbinary operator
uopunary operator
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(size_t i=0; i<std::distance(first, last); i++) {
*(output + i) = i ? bop(uop(*(first+i)), *(output+i-1)) : uop(*(first+i));
}

◆ transform_reduce()

template<typename I , typename T , typename B , typename U >
cudaTask tf::cudaFlow::transform_reduce ( I  first,
I  last,
T *  result,
B  bop,
U  uop 
)

performs parallel reduction over a range of transformed items

Template Parameters
Iinput iterator type
Tvalue type
Bbinary operator type
Uunary operator type
Parameters
firstiterator to the beginning (inclusive)
lastiterator to the end (exclusive)
resultpointer to the result with an initialized value
bopbinary operator to apply to reduce items
uopunary operator to transform each item before reduction
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
*result = bop(*result, uop(*first++));
}

◆ transform_uninitialized_reduce()

template<typename I , typename T , typename B , typename U >
cudaTask tf::cudaFlow::transform_uninitialized_reduce ( I  first,
I  last,
T *  result,
B  bop,
U  uop 
)

similar to tf::cudaFlow::transform_reduce but does not assume any initial value to reduce

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = uop(*first++); // no initial values participate in the loop
while (first != last) {
*result = bop(*result, uop(*first++));
}

◆ uninitialized_reduce() [1/2]

template<typename I , typename T , typename C >
void tf::cudaFlow::uninitialized_reduce ( cudaTask  task,
I  first,
I  last,
T *  result,
C  op 
)

updates parameters of a kernel task created from tf::cudaFlow::uninitialized_reduce

The type of the iterators, result, and callable must be the same as the task created from tf::cudaFlow::uninitialized_reduce.

◆ uninitialized_reduce() [2/2]

template<typename I , typename T , typename B >
cudaTask tf::cudaFlow::uninitialized_reduce ( I  first,
I  last,
T *  result,
B  bop 
)

similar to tf::cudaFlow::reduce but does not assume any initial value to reduce

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = *first++; // no initial values participate in the loop
while (first != last) {
*result = bop(*result, *first++);
}

◆ zero() [1/2]

template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * >
void tf::cudaFlow::zero ( cudaTask  task,
T *  dst,
size_t  count 
)

updates parameters of a memset task to a zero task

The method is similar to tf::cudaFlow::zero but operates on a task of type tf::cudaTaskType::MEMSET.

The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.

◆ zero() [2/2]

template<typename T , std::enable_if_t< is_pod_v< T > &&(sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void > * >
cudaTask tf::cudaFlow::zero ( T *  dst,
size_t  count 
)

creates a memset task that sets a typed memory block to zero

Template Parameters
Telement type (size of T must be either 1, 2, or 4)
Parameters
dstpointer to the destination device memory area
countnumber of elements
Returns
a tf::cudaTask handle

A zero task zeroes the first count elements of type T in a device memory area pointed to by dst.


The documentation for this class was generated from the following files: