Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
tf::cudaFlowCapturer Class Reference

class to create a cudaFlow graph using stream capture More...

#include <cuda_capturer.hpp>

Public Member Functions

 cudaFlowCapturer ()
 constructs a standalone cudaFlowCapturer
 
virtual ~cudaFlowCapturer ()
 destructs the cudaFlowCapturer
 
bool empty () const
 queries the emptiness of the graph
 
size_t num_tasks () const
 queries the number of tasks
 
void clear ()
 clears this cudaFlow capturer
 
void dump (std::ostream &os) const
 dumps the capture graph into a DOT format through an output stream
 
template<typename OPT , typename... ArgsT>
OPT & make_optimizer (ArgsT &&... args)
 selects a different optimization algorithm
 
template<typename C , std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * = nullptr>
cudaTask on (C &&callable)
 captures a sequence of CUDA operations from the given callable
 
template<typename C , std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * = nullptr>
void on (cudaTask task, C &&callable)
 updates a capture task to another sequence of CUDA operations
 
cudaTask noop ()
 captures a no-operation task
 
void noop (cudaTask task)
 updates a task to a no-operation task
 
cudaTask memcpy (void *dst, const void *src, size_t count)
 copies data between host and device asynchronously through a stream
 
void memcpy (cudaTask task, void *dst, const void *src, size_t count)
 updates a capture task to a memcpy operation
 
template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr>
cudaTask copy (T *tgt, const T *src, size_t num)
 captures a copy task of typed data
 
template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr>
void copy (cudaTask task, T *tgt, const T *src, size_t num)
 updates a capture task to a copy operation
 
cudaTask memset (void *ptr, int v, size_t n)
 initializes or sets GPU memory to the given value byte by byte
 
void memset (cudaTask task, void *ptr, int value, size_t n)
 updates a capture task to a memset operation
 
template<typename F , typename... ArgsT>
cudaTask kernel (dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
 captures a kernel
 
template<typename F , typename... ArgsT>
void kernel (cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
 updates a capture task to a kernel operation
 
template<typename C >
cudaTask single_task (C c)
 captures a kernel that runs the given callable with only one thread
 
template<typename C >
void single_task (cudaTask task, C c)
 updates a capture task to a single-threaded kernel
 
template<typename I , typename C >
cudaTask for_each (I first, I last, C callable)
 captures a kernel that applies a callable to each dereferenced element of the data array
 
template<typename I , typename C >
void for_each (cudaTask task, I first, I last, C callable)
 updates a capture task to a for-each kernel task
 
template<typename I , typename C >
cudaTask for_each_index (I first, I last, I step, C callable)
 captures a kernel that applies a callable to each index in the range with the step size
 
template<typename I , typename C >
void for_each_index (cudaTask task, I first, I last, I step, C callable)
 updates a capture task to a for-each-index kernel task
 
template<typename I , typename O , typename C >
cudaTask transform (I first, I last, O output, C op)
 captures a kernel that transforms an input range to an output range
 
template<typename I , typename O , typename C >
void transform (cudaTask task, I first, I last, O output, C op)
 updates a capture task to a transform kernel task
 
template<typename I1 , typename I2 , typename O , typename C >
cudaTask transform (I1 first1, I1 last1, I2 first2, O output, C op)
 captures a kernel that transforms two input ranges to an output range
 
template<typename I1 , typename I2 , typename O , typename C >
void transform (cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op)
 updates a capture task to a transform kernel task
 
template<typename I , typename T , typename C >
cudaTask reduce (I first, I last, T *result, C op)
 captures kernels that perform parallel reduction over a range of items
 
template<typename I , typename T , typename C >
void reduce (cudaTask task, I first, I last, T *result, C op)
 updates a capture task to a reduction task
 
template<typename I , typename T , typename C >
cudaTask uninitialized_reduce (I first, I last, T *result, C op)
 similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce
 
template<typename I , typename T , typename C >
void uninitialized_reduce (cudaTask task, I first, I last, T *result, C op)
 updates a capture task to an uninitialized-reduction task
 
template<typename I , typename T , typename C , typename U >
cudaTask transform_reduce (I first, I last, T *result, C bop, U uop)
 captures kernels that perform parallel reduction over a range of transformed items
 
template<typename I , typename T , typename C , typename U >
void transform_reduce (cudaTask task, I first, I last, T *result, C bop, U uop)
 updates a capture task to a transform-reduce task
 
template<typename I , typename T , typename C , typename U >
cudaTask transform_uninitialized_reduce (I first, I last, T *result, C bop, U uop)
 similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce
 
template<typename I , typename T , typename C , typename U >
void transform_uninitialized_reduce (cudaTask task, I first, I last, T *result, C bop, U uop)
 updates a capture task to a transform-reduce task of no initialized value
 
template<typename I , typename O , typename C >
cudaTask inclusive_scan (I first, I last, O output, C op)
 captures kernels that perform parallel inclusive scan over a range of items
 
template<typename I , typename O , typename C >
void inclusive_scan (cudaTask task, I first, I last, O output, C op)
 updates a capture task to an inclusive scan task
 
template<typename I , typename O , typename C >
cudaTask exclusive_scan (I first, I last, O output, C op)
 similar to cudaFlowCapturer::inclusive_scan but excludes the first value
 
template<typename I , typename O , typename C >
void exclusive_scan (cudaTask task, I first, I last, O output, C op)
 updates a capture task to an exclusive scan task
 
template<typename I , typename O , typename B , typename U >
cudaTask transform_inclusive_scan (I first, I last, O output, B bop, U uop)
 captures kernels that perform parallel inclusive scan over a range of transformed items
 
template<typename I , typename O , typename B , typename U >
void transform_inclusive_scan (cudaTask task, I first, I last, O output, B bop, U uop)
 updates a capture task to a transform-inclusive scan task
 
template<typename I , typename O , typename B , typename U >
cudaTask transform_exclusive_scan (I first, I last, O output, B bop, U uop)
 similar to cudaFlowCapturer::transform_inclusive_scan but excludes the first value
 
template<typename I , typename O , typename B , typename U >
void transform_exclusive_scan (cudaTask task, I first, I last, O output, B bop, U uop)
 updates a capture task to a transform-exclusive scan task
 
template<typename A , typename B , typename C , typename Comp >
cudaTask merge (A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
 captures kernels that perform parallel merge on two sorted arrays
 
template<typename A , typename B , typename C , typename Comp >
void merge (cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
 updates a capture task to a merge task
 
template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
cudaTask merge_by_key (a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
 captures kernels that perform parallel key-value merge
 
template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
void merge_by_key (cudaTask task, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
 updates a capture task to a key-value merge task
 
template<typename I , typename C >
cudaTask sort (I first, I last, C comp)
 captures kernels that sort the given array
 
template<typename I , typename C >
void sort (cudaTask task, I first, I last, C comp)
 updates a capture task to a sort task
 
template<typename K_it , typename V_it , typename C >
cudaTask sort_by_key (K_it k_first, K_it k_last, V_it v_first, C comp)
 captures kernels that sort the given array
 
template<typename K_it , typename V_it , typename C >
void sort_by_key (cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp)
 updates a capture task to a key-value sort task
 
template<typename I , typename U >
cudaTask find_if (I first, I last, unsigned *idx, U op)
 creates a task to find the index of the first element in a range
 
template<typename I , typename U >
void find_if (cudaTask task, I first, I last, unsigned *idx, U op)
 updates the parameters of a find-if task
 
template<typename I , typename O >
cudaTask min_element (I first, I last, unsigned *idx, O op)
 finds the index of the minimum element in a range
 
template<typename I , typename O >
void min_element (cudaTask task, I first, I last, unsigned *idx, O op)
 updates the parameters of a min-element task
 
template<typename I , typename O >
cudaTask max_element (I first, I last, unsigned *idx, O op)
 finds the index of the maximum element in a range
 
template<typename I , typename O >
void max_element (cudaTask task, I first, I last, unsigned *idx, O op)
 updates the parameters of a max-element task
 
template<typename P >
void offload_until (P &&predicate)
 offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
 
void offload_n (size_t n)
 offloads the captured cudaFlow and executes it the given number of times
 
void offload ()
 offloads the captured cudaFlow and executes it once
 

Friends

class cudaFlow
 
class Executor
 

Detailed Description

class to create a cudaFlow graph using stream capture

The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous CUDA operations through the given stream. The following example creates a CUDA graph that captures two kernel tasks, task_1 and task_2, where task_1 runs before task_2.

taskflow.emplace([](tf::cudaFlowCapturer& capturer){
// capture my_kernel_1 through the given stream managed by the capturer
auto task_1 = capturer.on([&](cudaStream_t stream){
my_kernel_1<<<grid_1, block_1, shm_size_1, stream>>>(my_parameters_1);
});
// capture my_kernel_2 through the given stream managed by the capturer
auto task_2 = capturer.on([&](cudaStream_t stream){
my_kernel_2<<<grid_2, block_2, shm_size_2, stream>>>(my_parameters_2);
});
task_1.precede(task_2);
});
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask on(C &&callable)
captures a sequence of CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask & precede(Ts &&... tasks)
adds precedence links from this to other tasks
Definition cuda_task.hpp:182

Similar to tf::cudaFlow, a cudaFlowCapturer is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor. That is, the callable that describes a cudaFlowCapturer will be executed sequentially. Inside a cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run in parallel depending on the selected optimization algorithm. By default, we use tf::cudaRoundRobinCapturing to transform a user-level graph into a native CUDA graph.

Please refer to GPU Tasking (cudaFlowCapturer) for details.

Constructor & Destructor Documentation

◆ cudaFlowCapturer()

tf::cudaFlowCapturer::cudaFlowCapturer ( )
inline

constructs a standalone cudaFlowCapturer

A standalone cudaFlow capturer does not go through any taskflow and can be run by the caller thread using explicit offload methods (e.g., tf::cudaFlowCapturer::offload).

Member Function Documentation

◆ copy() [1/2]

template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * >
void tf::cudaFlowCapturer::copy ( cudaTask  task,
T *  tgt,
const T *  src,
size_t  num 
)

updates a capture task to a copy operation

The method is similar to cudaFlowCapturer::copy but operates on an existing task.

◆ copy() [2/2]

template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * >
cudaTask tf::cudaFlowCapturer::copy ( T *  tgt,
const T *  src,
size_t  num 
)

captures a copy task of typed data

Template Parameters
Telement type (non-void)
Parameters
tgtpointer to the target memory block
srcpointer to the source memory block
numnumber of elements to copy
Returns
cudaTask handle

A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

◆ exclusive_scan()

template<typename I , typename O , typename C >
void tf::cudaFlowCapturer::exclusive_scan ( cudaTask  task,
I  first,
I  last,
O  output,
C  op 
)

updates a capture task to an exclusive scan task

This method is similar to cudaFlowCapturer::exclusive_scan but operates on an existing task.

◆ find_if() [1/2]

template<typename I , typename U >
void tf::cudaFlowCapturer::find_if ( cudaTask  task,
I  first,
I  last,
unsigned *  idx,
U  op 
)

updates the parameters of a find-if task

This method is similar to tf::cudaFlowCapturer::find_if but operates on an existing task.

◆ find_if() [2/2]

template<typename I , typename U >
cudaTask tf::cudaFlowCapturer::find_if ( I  first,
I  last,
unsigned *  idx,
U  op 
)

creates a task to find the index of the first element in a range

Template Parameters
Iinput iterator type
Uunary operator type
Parameters
firstiterator to the beginning of the range
lastiterator to the end of the range
idxpointer to the index of the found element
opunary operator which returns true for the required element

Finds the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. This is equivalent to the parallel execution of the following loop:

unsigned idx = 0;
for(; first != last; ++first, ++idx) {
if (op(*first)) {
return idx;
}
}
return idx;

◆ for_each() [1/2]

template<typename I , typename C >
void tf::cudaFlowCapturer::for_each ( cudaTask  task,
I  first,
I  last,
C  callable 
)

updates a capture task to a for-each kernel task

This method is similar to cudaFlowCapturer::for_each but operates on an existing task.

◆ for_each() [2/2]

template<typename I , typename C >
cudaTask tf::cudaFlowCapturer::for_each ( I  first,
I  last,
C  callable 
)

captures a kernel that applies a callable to each dereferenced element of the data array

Template Parameters
Iiterator type
Ccallable type
Parameters
firstiterator to the beginning
lastiterator to the end
callablea callable object to apply to the dereferenced iterator
Returns
cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; itr++) {
callable(*itr);
}

◆ for_each_index() [1/2]

template<typename I , typename C >
void tf::cudaFlowCapturer::for_each_index ( cudaTask  task,
I  first,
I  last,
I  step,
C  callable 
)

updates a capture task to a for-each-index kernel task

This method is similar to cudaFlowCapturer::for_each_index but operates on an existing task.

◆ for_each_index() [2/2]

template<typename I , typename C >
cudaTask tf::cudaFlowCapturer::for_each_index ( I  first,
I  last,
I  step,
C  callable 
)

captures a kernel that applies a callable to each index in the range with the step size

Template Parameters
Iindex type
Ccallable type
Parameters
firstbeginning index
lastlast index
stepstep size
callablethe callable to apply to each index in the range
Returns
cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
callable(i);
}
// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
callable(i);
}

◆ inclusive_scan() [1/2]

template<typename I , typename O , typename C >
void tf::cudaFlowCapturer::inclusive_scan ( cudaTask  task,
I  first,
I  last,
O  output,
C  op 
)

updates a capture task to an inclusive scan task

This method is similar to cudaFlowCapturer::inclusive_scan but operates on an existing task.

◆ inclusive_scan() [2/2]

template<typename I , typename O , typename C >
cudaTask tf::cudaFlowCapturer::inclusive_scan ( I  first,
I  last,
O  output,
C  op 
)

captures kernels that perform parallel inclusive scan over a range of items

Template Parameters
Iinput iterator type
Ooutput iterator type
Cbinary operator type
Parameters
firstiterator to the beginning
lastiterator to the end
outputiterator to the beginning of the output
opbinary operator
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(size_t i=0; i<std::distance(first, last); i++) {
*(output + i) = i ? op(*(first+i), *(output+i-1)) : *(first+i);
}
T distance(T... args)

◆ kernel() [1/2]

template<typename F , typename... ArgsT>
void tf::cudaFlowCapturer::kernel ( cudaTask  task,
dim3  g,
dim3  b,
size_t  s,
F  f,
ArgsT &&...  args 
)

updates a capture task to a kernel operation

The method is similar to cudaFlowCapturer::kernel but operates on an existing task.

◆ kernel() [2/2]

template<typename F , typename... ArgsT>
cudaTask tf::cudaFlowCapturer::kernel ( dim3  g,
dim3  b,
size_t  s,
F  f,
ArgsT &&...  args 
)

captures a kernel

Template Parameters
Fkernel function type
ArgsTkernel function parameters type
Parameters
gconfigured grid
bconfigured block
sconfigured shared memory size in bytes
fkernel function
argsarguments to forward to the kernel function by copy
Returns
cudaTask handle

◆ make_optimizer()

template<typename OPT , typename ... ArgsT>
OPT & tf::cudaFlowCapturer::make_optimizer ( ArgsT &&...  args)

selects a different optimization algorithm

Template Parameters
OPToptimizer type
ArgsTarguments types
Parameters
argsarguments to forward to construct the optimizer
Returns
a reference to the optimizer

We currently support the following optimization algorithms to capture a user-described cudaFlow:

By default, tf::cudaFlowCapturer uses the round-robin optimization algorithm with four streams to transform a user-level graph into a native CUDA graph.

◆ max_element() [1/2]

template<typename I , typename O >
void tf::cudaFlowCapturer::max_element ( cudaTask  task,
I  first,
I  last,
unsigned *  idx,
O  op 
)

updates the parameters of a max-element task

This method is similar to cudaFlowCapturer::max_element but operates on an existing task.

◆ max_element() [2/2]

template<typename I , typename O >
cudaTask tf::cudaFlowCapturer::max_element ( I  first,
I  last,
unsigned *  idx,
O  op 
)

finds the index of the maximum element in a range

Template Parameters
Iinput iterator type
Ocomparator type
Parameters
firstiterator to the beginning of the range
lastiterator to the end of the range
idxsolution index of the maximum element
opcomparison function object

The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
return 0;
}
auto largest = first;
for (++first; first != last; ++first) {
if (op(*largest, *first)) {
largest = first;
}
}
return std::distance(first, largest);

◆ memcpy() [1/2]

void tf::cudaFlowCapturer::memcpy ( cudaTask  task,
void *  dst,
const void *  src,
size_t  count 
)
inline

updates a capture task to a memcpy operation

The method is similar to cudaFlowCapturer::memcpy but operates on an existing task.

◆ memcpy() [2/2]

cudaTask tf::cudaFlowCapturer::memcpy ( void *  dst,
const void *  src,
size_t  count 
)
inline

copies data between host and device asynchronously through a stream

Parameters
dstdestination memory address
srcsource memory address
countsize in bytes to copy

The method captures a cudaMemcpyAsync operation through an internal stream.

◆ memset() [1/2]

void tf::cudaFlowCapturer::memset ( cudaTask  task,
void *  ptr,
int  value,
size_t  n 
)
inline

updates a capture task to a memset operation

The method is similar to cudaFlowCapturer::memset but operates on an existing task.

◆ memset() [2/2]

cudaTask tf::cudaFlowCapturer::memset ( void *  ptr,
int  v,
size_t  n 
)
inline

initializes or sets GPU memory to the given value byte by byte

Parameters
ptrpointer to GPU memory
vvalue to set for each byte of the specified memory
nsize in bytes to set

The method captures a cudaMemsetAsync operation through an internal stream to fill the first n bytes of the memory area pointed to by ptr with the constant byte value v.

◆ merge() [1/2]

template<typename A , typename B , typename C , typename Comp >
cudaTask tf::cudaFlowCapturer::merge ( A  a_first,
A  a_last,
B  b_first,
B  b_last,
C  c_first,
Comp  comp 
)

captures kernels that perform parallel merge on two sorted arrays

Template Parameters
Aiterator type of the first input array
Biterator type of the second input array
Citerator type of the output array
Compcomparator type
Parameters
a_firstiterator to the beginning of the first input array
a_lastiterator to the end of the first input array
b_firstiterator to the beginning of the second input array
b_lastiterator to the end of the second input array
c_firstiterator to the beginning of the output array
compbinary comparator
Returns
a tf::cudaTask handle

Merges two sorted ranges [a_first, a_last) and [b_first, b_last) into one sorted range beginning at c_first.

A sequence is said to be sorted with respect to a comparator comp if for any iterator it pointing to the sequence and any non-negative integer n such that it + n is a valid iterator pointing to an element of the sequence, comp(*(it + n), *it) evaluates to false.

◆ merge() [2/2]

template<typename A , typename B , typename C , typename Comp >
void tf::cudaFlowCapturer::merge ( cudaTask  task,
A  a_first,
A  a_last,
B  b_first,
B  b_last,
C  c_first,
Comp  comp 
)

updates a capture task to a merge task

This method is similar to cudaFlowCapturer::merge but operates on an existing task.

◆ merge_by_key() [1/2]

template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
cudaTask tf::cudaFlowCapturer::merge_by_key ( a_keys_it  a_keys_first,
a_keys_it  a_keys_last,
a_vals_it  a_vals_first,
b_keys_it  b_keys_first,
b_keys_it  b_keys_last,
b_vals_it  b_vals_first,
c_keys_it  c_keys_first,
c_vals_it  c_vals_first,
C  comp 
)

captures kernels that perform parallel key-value merge

Template Parameters
a_keys_itfirst key iterator type
a_vals_itfirst value iterator type
b_keys_itsecond key iterator type
b_vals_itsecond value iterator type
c_keys_itoutput key iterator type
c_vals_itoutput value iterator type
Ccomparator type
Parameters
a_keys_firstiterator to the beginning of the first key range
a_keys_lastiterator to the end of the first key range
a_vals_firstiterator to the beginning of the first value range
b_keys_firstiterator to the beginning of the second key range
b_keys_lastiterator to the end of the second key range
b_vals_firstiterator to the beginning of the second value range
c_keys_firstiterator to the beginning of the output key range
c_vals_firstiterator to the beginning of the output value range
compcomparator

Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending key order.

At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending order implied by each input element's associated key.

For example, assume:

  • a_keys = {1, 8}
  • a_vals = {2, 1}
  • b_keys = {3, 7}
  • b_vals = {3, 4}

After the merge, we have:

  • c_keys = {1, 3, 7, 8}
  • c_vals = {2, 3, 4, 1}

◆ merge_by_key() [2/2]

template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
void tf::cudaFlowCapturer::merge_by_key ( cudaTask  task,
a_keys_it  a_keys_first,
a_keys_it  a_keys_last,
a_vals_it  a_vals_first,
b_keys_it  b_keys_first,
b_keys_it  b_keys_last,
b_vals_it  b_vals_first,
c_keys_it  c_keys_first,
c_vals_it  c_vals_first,
C  comp 
)

updates a capture task to a key-value merge task

This method is similar to tf::cudaFlowCapturer::merge_by_key but operates on an existing task.

◆ min_element() [1/2]

template<typename I , typename O >
void tf::cudaFlowCapturer::min_element ( cudaTask  task,
I  first,
I  last,
unsigned *  idx,
O  op 
)

updates the parameters of a min-element task

This method is similar to cudaFlowCapturer::min_element but operates on an existing task.

◆ min_element() [2/2]

template<typename I , typename O >
cudaTask tf::cudaFlowCapturer::min_element ( I  first,
I  last,
unsigned *  idx,
O  op 
)

finds the index of the minimum element in a range

Template Parameters
Iinput iterator type
Ocomparator type
Parameters
firstiterator to the beginning of the range
lastiterator to the end of the range
idxsolution index of the minimum element
opcomparison function object

The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
return 0;
}
auto smallest = first;
for (++first; first != last; ++first) {
if (op(*first, *smallest)) {
smallest = first;
}
}
return std::distance(first, smallest);

◆ noop() [1/2]

cudaTask tf::cudaFlowCapturer::noop ( )
inline

captures a no-operation task

Returns
a tf::cudaTask handle

An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.

◆ noop() [2/2]

void tf::cudaFlowCapturer::noop ( cudaTask  task)
inline

updates a task to a no-operation task

The method is similar to tf::cudaFlowCapturer::noop but operates on an existing task.

◆ offload_n()

void tf::cudaFlowCapturer::offload_n ( size_t  n)
inline

offloads the captured cudaFlow and executes it the given number of times

Parameters
nnumber of executions

◆ offload_until()

template<typename P >
void tf::cudaFlowCapturer::offload_until ( P &&  predicate)

offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true

Template Parameters
Ppredicate type (a binary callable)
Parameters
predicatea binary predicate (returns true for stop)

Immediately offloads the cudaFlow captured so far onto a GPU and repeatedly runs it until the predicate returns true.

By default, if users do not offload the cudaFlow capturer, the executor will offload it once.

◆ on() [1/2]

template<typename C , std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * >
cudaTask tf::cudaFlowCapturer::on ( C &&  callable)

captures a sequence of CUDA operations from the given callable

Template Parameters
Ccallable type constructible with std::function<void(cudaStream_t)>
Parameters
callablea callable to capture CUDA operations with the stream

This method applies a stream created by the flow to capture a sequence of CUDA operations defined in the callable.

◆ on() [2/2]

template<typename C , std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * >
void tf::cudaFlowCapturer::on ( cudaTask  task,
C &&  callable 
)

updates a capture task to another sequence of CUDA operations

The method is similar to cudaFlowCapturer::on but operates on an existing task.

◆ reduce() [1/2]

template<typename I , typename T , typename C >
void tf::cudaFlowCapturer::reduce ( cudaTask  task,
I  first,
I  last,
T *  result,
C  op 
)

updates a capture task to a reduction task

This method is similar to cudaFlowCapturer::reduce but operates on an existing task.

◆ reduce() [2/2]

template<typename I , typename T , typename C >
cudaTask tf::cudaFlowCapturer::reduce ( I  first,
I  last,
T *  result,
C  op 
)

captures kernels that perform parallel reduction over a range of items

Template Parameters
Iinput iterator type
Tvalue type
Cbinary operator type
Parameters
firstiterator to the beginning
lastiterator to the end
resultpointer to the result with an initialized value
opbinary reduction operator
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
*result = op(*result, *first++);
}

◆ single_task() [1/2]

template<typename C >
cudaTask tf::cudaFlowCapturer::single_task ( C  c)

captures a kernel that runs the given callable with only one thread

Template Parameters
Ccallable type
Parameters
ccallable to run by a single kernel thread

◆ single_task() [2/2]

template<typename C >
void tf::cudaFlowCapturer::single_task ( cudaTask  task,
C  c 
)

updates a capture task to a single-threaded kernel

This method is similar to cudaFlowCapturer::single_task but operates on an existing task.

◆ sort() [1/2]

template<typename I , typename C >
void tf::cudaFlowCapturer::sort ( cudaTask  task,
I  first,
I  last,
C  comp 
)

updates a capture task to a sort task

This method is similar to cudaFlowCapturer::sort but operates on an existing task.

◆ sort() [2/2]

template<typename I , typename C >
cudaTask tf::cudaFlowCapturer::sort ( I  first,
I  last,
C  comp 
)

captures kernels that sort the given array

Template Parameters
Iiterator type of the input array
Ccomparator type
Parameters
firstiterator to the beginning of the input array
lastiterator to the end of the input array
compbinary comparator
Returns
a tf::cudaTask handle

Sorts elements in the range [first, last) with the given comparator.

◆ sort_by_key() [1/2]

template<typename K_it , typename V_it , typename C >
void tf::cudaFlowCapturer::sort_by_key ( cudaTask  task,
K_it  k_first,
K_it  k_last,
V_it  v_first,
C  comp 
)

updates a capture task to a key-value sort task

This method is similar to tf::cudaFlowCapturer::sort_by_key but operates on an existing task.

◆ sort_by_key() [2/2]

template<typename K_it , typename V_it , typename C >
cudaTask tf::cudaFlowCapturer::sort_by_key ( K_it  k_first,
K_it  k_last,
V_it  v_first,
C  comp 
)

captures kernels that sort the given array

Template Parameters
K_ititerator type of the key
V_ititerator type of the value
Ccomparator type
Parameters
k_firstiterator to the beginning of the key array
k_lastiterator to the end of the key array
v_firstiterator to the beginning of the value array
compbinary comparator
Returns
a tf::cudaTask handle

Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false.

For example, assume:

  • keys are {1, 4, 2, 8, 5, 7}
  • values are {'a', 'b', 'c', 'd', 'e', 'f'}

After sort:

  • keys are {1, 2, 4, 5, 7, 8}
  • values are {'a', 'c', 'b', 'e', 'f', 'd'}

◆ transform() [1/4]

template<typename I , typename O , typename C >
void tf::cudaFlowCapturer::transform ( cudaTask  task,
I  first,
I  last,
O  output,
C  op 
)

updates a capture task to a transform kernel task

This method is similar to cudaFlowCapturer::transform but operates on an existing task.

◆ transform() [2/4]

template<typename I1 , typename I2 , typename O , typename C >
void tf::cudaFlowCapturer::transform ( cudaTask  task,
I1  first1,
I1  last1,
I2  first2,
O  output,
C  op 
)

updates a capture task to a transform kernel task

This method is similar to cudaFlowCapturer::transform but operates on an existing task.

◆ transform() [3/4]

template<typename I , typename O , typename C >
cudaTask tf::cudaFlowCapturer::transform ( I  first,
I  last,
O  output,
C  op 
)

captures a kernel that transforms an input range to an output range

Template Parameters
I: input iterator type
O: output iterator type
C: unary operator type
Parameters
first: iterator to the beginning of the input range
last: iterator to the end of the input range
output: iterator to the beginning of the output range
op: unary operator to apply to transform each item in the range
Returns
cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
*output++ = op(*first++);
}

◆ transform() [4/4]

template<typename I1 , typename I2 , typename O , typename C >
cudaTask tf::cudaFlowCapturer::transform ( I1  first1,
I1  last1,
I2  first2,
O  output,
C  op 
)

captures a kernel that transforms two input ranges to an output range

Template Parameters
I1: first input iterator type
I2: second input iterator type
O: output iterator type
C: binary operator type
Parameters
first1: iterator to the beginning of the first input range
last1: iterator to the end of the first input range
first2: iterator to the beginning of the second input range
output: iterator to the beginning of the output range
op: binary operator to apply to transform each pair of items in the two input ranges
Returns
cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
*output++ = op(*first1++, *first2++);
}

◆ transform_exclusive_scan()

template<typename I , typename O , typename B , typename U >
void tf::cudaFlowCapturer::transform_exclusive_scan ( cudaTask  task,
I  first,
I  last,
O  output,
B  bop,
U  uop 
)

updates a capture task to a transform-exclusive scan task

This method is similar to cudaFlowCapturer::transform_exclusive_scan but operates on an existing task.

◆ transform_inclusive_scan() [1/2]

template<typename I , typename O , typename B , typename U >
void tf::cudaFlowCapturer::transform_inclusive_scan ( cudaTask  task,
I  first,
I  last,
O  output,
B  bop,
U  uop 
)

updates a capture task to a transform-inclusive scan task

This method is similar to cudaFlowCapturer::transform_inclusive_scan but operates on an existing task.

◆ transform_inclusive_scan() [2/2]

template<typename I , typename O , typename B , typename U >
cudaTask tf::cudaFlowCapturer::transform_inclusive_scan ( I  first,
I  last,
O  output,
B  bop,
U  uop 
)

captures kernels that perform parallel inclusive scan over a range of transformed items

Template Parameters
I: input iterator type
O: output iterator type
B: binary operator type
U: unary operator type
Parameters
first: iterator to the beginning
last: iterator to the end
output: iterator to the beginning of the output
bop: binary operator
uop: unary operator
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(size_t i=0; i<std::distance(first, last); i++) {
*(output + i) = i ? bop(uop(*(first+i)), *(output+i-1)) : uop(*(first+i));
}

◆ transform_reduce() [1/2]

template<typename I , typename T , typename C , typename U >
void tf::cudaFlowCapturer::transform_reduce ( cudaTask  task,
I  first,
I  last,
T *  result,
C  bop,
U  uop 
)

updates a capture task to a transform-reduce task

This method is similar to cudaFlowCapturer::transform_reduce but operates on an existing task.

◆ transform_reduce() [2/2]

template<typename I , typename T , typename C , typename U >
cudaTask tf::cudaFlowCapturer::transform_reduce ( I  first,
I  last,
T *  result,
C  bop,
U  uop 
)

captures kernels that perform parallel reduction over a range of transformed items

Template Parameters
I: input iterator type
T: value type
C: binary operator type
U: unary operator type
Parameters
first: iterator to the beginning
last: iterator to the end
result: pointer to the result with an initialized value
bop: binary reduce operator
uop: unary transform operator
Returns
a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
*result = bop(*result, uop(*first++));
}

◆ transform_uninitialized_reduce() [1/2]

template<typename I , typename T , typename C , typename U >
void tf::cudaFlowCapturer::transform_uninitialized_reduce ( cudaTask  task,
I  first,
I  last,
T *  result,
C  bop,
U  uop 
)

updates a capture task to a transform-reduce task with no initial value

This method is similar to cudaFlowCapturer::transform_uninitialized_reduce but operates on an existing task.

◆ transform_uninitialized_reduce() [2/2]

template<typename I , typename T , typename C , typename U >
cudaTask tf::cudaFlowCapturer::transform_uninitialized_reduce ( I  first,
I  last,
T *  result,
C  bop,
U  uop 
)

similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = uop(*first++); // the initial value of *result does not participate in the loop
while (first != last) {
*result = bop(*result, uop(*first++));
}

◆ uninitialized_reduce() [1/2]

template<typename I , typename T , typename C >
void tf::cudaFlowCapturer::uninitialized_reduce ( cudaTask  task,
I  first,
I  last,
T *  result,
C  op 
)

updates a capture task to an uninitialized-reduction task

This method is similar to cudaFlowCapturer::uninitialized_reduce but operates on an existing task.

◆ uninitialized_reduce() [2/2]

template<typename I , typename T , typename C >
cudaTask tf::cudaFlowCapturer::uninitialized_reduce ( I  first,
I  last,
T *  result,
C  op 
)

similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = *first++; // the initial value of *result does not participate in the loop
while (first != last) {
*result = op(*result, *first++);
}

The documentation for this class was generated from the following files: