Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
tf Namespace Reference

taskflow namespace More...

Classes

class  ChromeObserver
 class to create an observer based on Chrome tracing format More...
 
class  CriticalSection
 class to create a critical region of limited workers to run tasks More...
 
class  cudaDeviceAllocator
 class to create a CUDA device allocator More...
 
class  cudaEvent
 class to create an RAII-styled wrapper over a native CUDA event More...
 
class  cudaExecutionPolicy
 class to define execution policy for CUDA standard algorithms More...
 
class  cudaFlow
 class to create a cudaFlow task dependency graph More...
 
class  cudaFlowCapturer
 class to create a cudaFlow graph using stream capture More...
 
class  cudaLinearCapturing
 class to capture a linear CUDA graph using a sequential stream More...
 
class  cudaRoundRobinCapturing
 class to capture a CUDA graph using a round-robin algorithm More...
 
class  cudaScopedDevice
 class to create an RAII-styled context switch More...
 
class  cudaSequentialCapturing
 class to capture a CUDA graph using a sequential stream More...
 
class  cudaStream
 class to create an RAII-styled wrapper over a native CUDA stream More...
 
class  cudaTask
 class to create a task handle over an internal node of a cudaFlow graph More...
 
class  cudaUSMAllocator
 class to create a unified shared memory (USM) allocator More...
 
class  Executor
 class to create an executor for running a taskflow graph More...
 
class  FlowBuilder
 class to build a task dependency graph More...
 
class  Future
 class to access the result of an execution More...
 
class  Graph
 class to create a graph object More...
 
class  ObserverInterface
 class to derive an executor observer More...
 
class  Pipe
 class to create a pipe object for a pipeline stage More...
 
class  Pipeflow
 class to create a pipeflow object used by the pipe callable More...
 
class  Pipeline
 class to create a pipeline scheduling framework More...
 
class  Runtime
 class to create a runtime object used by a runtime task More...
 
class  ScalablePipeline
 class to create a scalable pipeline object More...
 
class  Semaphore
 class to create a semaphore object for building a concurrency constraint More...
 
class  SmallVector
 class to define a vector optimized for small array More...
 
class  Subflow
 class to construct a subflow graph from the execution of a dynamic task More...
 
class  syclFlow
 class for building a SYCL task dependency graph More...
 
class  syclTask
 handle to a node of the internal SYCL graph More...
 
class  Task
 class to create a task handle over a node in a taskflow graph More...
 
class  Taskflow
 class to create a taskflow object More...
 
class  TaskView
 class to access task information from the observer interface More...
 
class  TFProfObserver
 class to create an observer based on the built-in taskflow profiler format More...
 
class  WorkerView
 class to create an immutable view of a worker in an executor More...
 

Typedefs

using observer_stamp_t = std::chrono::time_point< std::chrono::steady_clock >
 default time point type of observers
 
using cudaDefaultExecutionPolicy = cudaExecutionPolicy< 512, 9 >
 default execution policy
 

Enumerations

enum class  TaskType : int {
  PLACEHOLDER = 0 , CUDAFLOW , SYCLFLOW , STATIC ,
  DYNAMIC , CONDITION , MULTI_CONDITION , MODULE ,
  ASYNC , RUNTIME , UNDEFINED
}
 enumeration of all task types More...
 
enum class  ObserverType : int { TFPROF = 0 , CHROME , UNDEFINED }
 enumeration of all observer types
 
enum class  PipeType : int { PARALLEL = 1 , SERIAL = 2 }
 enumeration of all pipe types More...
 
enum class  cudaTaskType : int {
  EMPTY = 0 , HOST , MEMSET , MEMCPY ,
  KERNEL , SUBFLOW , CAPTURE , UNDEFINED
}
 enumeration of all cudaTask types More...
 

Functions

const char * to_string (TaskType type)
 convert a task type to a human-readable string
 
std::ostream & operator<< (std::ostream &os, const Task &task)
 overload of ostream inserter operator for Task
 
const char * to_string (ObserverType type)
 convert an observer type to a human-readable string
 
size_t cuda_get_num_devices ()
 queries the number of available devices
 
int cuda_get_device ()
 gets the current device associated with the caller thread
 
void cuda_set_device (int id)
 switches to a given device context
 
void cuda_get_device_property (int i, cudaDeviceProp &p)
 obtains the device property
 
cudaDeviceProp cuda_get_device_property (int i)
 obtains the device property
 
void cuda_dump_device_property (std::ostream &os, const cudaDeviceProp &p)
 dumps the device property
 
size_t cuda_get_device_max_threads_per_block (int d)
 queries the maximum threads per block on a device
 
size_t cuda_get_device_max_x_dim_per_block (int d)
 queries the maximum x-dimension per block on a device
 
size_t cuda_get_device_max_y_dim_per_block (int d)
 queries the maximum y-dimension per block on a device
 
size_t cuda_get_device_max_z_dim_per_block (int d)
 queries the maximum z-dimension per block on a device
 
size_t cuda_get_device_max_x_dim_per_grid (int d)
 queries the maximum x-dimension per grid on a device
 
size_t cuda_get_device_max_y_dim_per_grid (int d)
 queries the maximum y-dimension per grid on a device
 
size_t cuda_get_device_max_z_dim_per_grid (int d)
 queries the maximum z-dimension per grid on a device
 
size_t cuda_get_device_max_shm_per_block (int d)
 queries the maximum shared memory size in bytes per block on a device
 
size_t cuda_get_device_warp_size (int d)
 queries the warp size on a device
 
int cuda_get_device_compute_capability_major (int d)
 queries the major number of compute capability of a device
 
int cuda_get_device_compute_capability_minor (int d)
 queries the minor number of compute capability of a device
 
bool cuda_get_device_unified_addressing (int d)
 queries if the device supports unified addressing
 
int cuda_get_driver_version ()
 queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
 
int cuda_get_runtime_version ()
 queries the CUDA Runtime version (1000 * major + 10 * minor)
 
size_t cuda_get_free_mem (int d)
 queries the free memory (expensive call)
 
size_t cuda_get_total_mem (int d)
 queries the total available memory (expensive call)
 
template<typename T >
T * cuda_malloc_device (size_t N, int d)
 allocates memory on the given device for holding N elements of type T
 
template<typename T >
T * cuda_malloc_device (size_t N)
 allocates memory on the current device associated with the caller
 
template<typename T >
T * cuda_malloc_shared (size_t N)
 allocates shared memory for holding N elements of type T
 
template<typename T >
void cuda_free (T *ptr, int d)
 frees memory on the GPU device
 
template<typename T >
void cuda_free (T *ptr)
 frees memory on the GPU device
 
void cuda_memcpy_async (cudaStream_t stream, void *dst, const void *src, size_t count)
 copies data between host and device asynchronously through a stream
 
void cuda_memset_async (cudaStream_t stream, void *devPtr, int value, size_t count)
 initializes or sets GPU memory to the given value byte by byte
 
constexpr const char * to_string (cudaTaskType type)
 convert a cudaTask type to a human-readable string
 
std::ostream & operator<< (std::ostream &os, const cudaTask &ct)
 overload of ostream inserter operator for cudaTask
 
template<typename P , typename C >
void cuda_single_task (P &&p, C c)
 runs a callable asynchronously using one kernel thread
 
template<typename P , typename I , typename C >
void cuda_for_each (P &&p, I first, I last, C c)
 performs asynchronous parallel iterations over a range of items
 
template<typename P , typename I , typename C >
void cuda_for_each_index (P &&p, I first, I last, I inc, C c)
 performs asynchronous parallel iterations over an index-based range of items
 
template<typename P , typename I , typename O , typename C >
void cuda_transform (P &&p, I first, I last, O output, C op)
 performs asynchronous parallel transforms over a range of items
 
template<typename P , typename I1 , typename I2 , typename O , typename C >
void cuda_transform (P &&p, I1 first1, I1 last1, I2 first2, O output, C op)
 performs asynchronous parallel transforms over two ranges of items
 
template<typename P , typename T >
unsigned cuda_reduce_buffer_size (unsigned count)
 queries the buffer size in bytes needed to call reduce kernels
 
template<typename P , typename I , typename T , typename O >
void cuda_reduce (P &&p, I first, I last, T *res, O op, void *buf)
 performs asynchronous parallel reduction over a range of items
 
template<typename P , typename I , typename T , typename O >
void cuda_uninitialized_reduce (P &&p, I first, I last, T *res, O op, void *buf)
 performs asynchronous parallel reduction over a range of items without an initial value
 
template<typename P , typename I , typename T , typename O , typename U >
void cuda_transform_reduce (P &&p, I first, I last, T *res, O bop, U uop, void *buf)
 performs asynchronous parallel reduction over a range of transformed items with an initial value
 
template<typename P , typename I , typename T , typename O , typename U >
void cuda_transform_uninitialized_reduce (P &&p, I first, I last, T *res, O bop, U uop, void *buf)
 performs asynchronous parallel reduction over a range of transformed items without an initial value
 
template<typename P , typename T >
unsigned cuda_scan_buffer_size (unsigned count)
 queries the buffer size in bytes needed to call scan kernels
 
template<typename P , typename I , typename O , typename C >
void cuda_inclusive_scan (P &&p, I first, I last, O output, C op, void *buf)
 performs asynchronous inclusive scan over a range of items
 
template<typename P , typename I , typename O , typename C , typename U >
void cuda_transform_inclusive_scan (P &&p, I first, I last, O output, C bop, U uop, void *buf)
 performs asynchronous inclusive scan over a range of transformed items
 
template<typename P , typename I , typename O , typename C >
void cuda_exclusive_scan (P &&p, I first, I last, O output, C op, void *buf)
 performs asynchronous exclusive scan over a range of items
 
template<typename P , typename I , typename O , typename C , typename U >
void cuda_transform_exclusive_scan (P &&p, I first, I last, O output, C bop, U uop, void *buf)
 performs asynchronous exclusive scan over a range of items
 
template<typename P >
unsigned cuda_merge_buffer_size (unsigned a_count, unsigned b_count)
 queries the buffer size in bytes needed to call merge kernels
 
template<typename P , typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
void cuda_merge_by_key (P &&p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void *buf)
 performs asynchronous key-value merge over a range of keys and values
 
template<typename P , typename a_keys_it , typename b_keys_it , typename c_keys_it , typename C >
void cuda_merge (P &&p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void *buf)
 performs asynchronous key-only merge over a range of keys
 
template<typename P , typename K , typename V = cudaEmpty>
unsigned cuda_sort_buffer_size (unsigned count)
 queries the buffer size in bytes needed to call sort kernels for the given number of elements
 
template<typename P , typename K_it , typename V_it , typename C >
void cuda_sort_by_key (P &&p, K_it k_first, K_it k_last, V_it v_first, C comp, void *buf)
 performs asynchronous key-value sort on a range of items
 
template<typename P , typename K_it , typename C >
void cuda_sort (P &&p, K_it k_first, K_it k_last, C comp, void *buf)
 performs asynchronous key-only sort on a range of items
 
template<typename P , typename I , typename U >
void cuda_find_if (P &&p, I first, I last, unsigned *idx, U op)
 finds the index of the first element that satisfies the given criteria
 
template<typename P , typename T >
unsigned cuda_min_element_buffer_size (unsigned count)
 queries the buffer size in bytes needed to call tf::cuda_min_element
 
template<typename P , typename I , typename O >
void cuda_min_element (P &&p, I first, I last, unsigned *idx, O op, void *buf)
 finds the index of the minimum element in a range
 
template<typename P , typename T >
unsigned cuda_max_element_buffer_size (unsigned count)
 queries the buffer size in bytes needed to call tf::cuda_max_element
 
template<typename P , typename I , typename O >
void cuda_max_element (P &&p, I first, I last, unsigned *idx, O op, void *buf)
 finds the index of the maximum element in a range
 
std::ostream & operator<< (std::ostream &os, const syclTask &ct)
 overload of ostream inserter operator for syclTask
 
constexpr const char * version ()
 queries the version information in a string format major.minor.patch
 

Variables

template<typename C >
constexpr bool is_static_task_v
 determines if a callable is a static task
 
template<typename C >
constexpr bool is_dynamic_task_v = std::is_invocable_r_v<void, C, Subflow&>
 determines if a callable is a dynamic task
 
template<typename C >
constexpr bool is_condition_task_v = std::is_invocable_r_v<int, C>
 determines if a callable is a condition task
 
template<typename C >
constexpr bool is_multi_condition_task_v
 determines if a callable is a multi-condition task
 
template<typename C >
constexpr bool is_cudaflow_task_v
 determines if a callable is a cudaFlow task
 
template<typename C >
constexpr bool is_syclflow_task_v = std::is_invocable_r_v<void, C, syclFlow&>
 determines if a callable is a syclFlow task
 
template<typename C >
constexpr bool is_runtime_task_v = std::is_invocable_r_v<void, C, Runtime&>
 determines if a callable is a runtime task
 

Detailed Description

taskflow namespace

Enumeration Type Documentation

◆ cudaTaskType

enum class tf::cudaTaskType : int
strong

enumeration of all cudaTask types

Enumerator
EMPTY 

empty task type

HOST 

host task type

MEMSET 

memory set task type

MEMCPY 

memory copy task type

KERNEL 

kernel task type

SUBFLOW 

subflow (child graph) task type

CAPTURE 

capture task type

UNDEFINED 

undefined task type

◆ PipeType

enum class tf::PipeType : int
strong

enumeration of all pipe types

Enumerator
PARALLEL 

parallel type

SERIAL 

serial type

◆ TaskType

enum class tf::TaskType : int
strong

enumeration of all task types

Enumerator
PLACEHOLDER 

placeholder task type

CUDAFLOW 

cudaFlow task type

SYCLFLOW 

syclFlow task type

STATIC 

static task type

DYNAMIC 

dynamic (subflow) task type

CONDITION 

condition task type

MULTI_CONDITION 

multi-condition task type

MODULE 

module task type

ASYNC 

asynchronous task type

RUNTIME 

runtime task type

UNDEFINED 

undefined task type (for internal use only)

Function Documentation

◆ cuda_exclusive_scan()

template<typename P , typename I , typename O , typename C >
void tf::cuda_exclusive_scan ( P &&  p,
first,
last,
output,
op,
void *  buf 
)

performs asynchronous exclusive scan over a range of items

Template Parameters
Pexecution policy type
Iinput iterator
Ooutput iterator
Cbinary operator type
Parameters
pexecution policy
firstiterator to the beginning of the input range
lastiterator to the end of the input range
outputiterator to the beginning of the output range
opbinary operator to apply to scan
bufpointer to the temporary buffer

◆ cuda_find_if()

template<typename P , typename I , typename U >
void tf::cuda_find_if ( P &&  p,
first,
last,
unsigned *  idx,
op 
)

finds the index of the first element that satisfies the given criteria

Template Parameters
Pexecution policy type
Iinput iterator type
Uunary operator type
Parameters
pexecution policy
firstiterator to the beginning of the range
lastiterator to the end of the range
idxpointer to the index of the found element
opunary operator which returns true for the required element

The function launches kernels asynchronously to find the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. This is equivalent to the parallel execution of the following loop:

unsigned idx = 0;
for(; first != last; ++first, ++idx) {
if (op(*first)) {
return idx;
}
}
return idx;

◆ cuda_for_each()

template<typename P , typename I , typename C >
void tf::cuda_for_each ( P &&  p,
first,
last,
c 
)

performs asynchronous parallel iterations over a range of items

Template Parameters
Pexecution policy type
Iinput iterator type
Cunary operator type
Parameters
pexecution policy object
firstiterator to the beginning of the range
lastiterator to the end of the range
cunary operator to apply to each dereferenced iterator

This function is equivalent to a parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; itr++) {
c(*itr);
}

◆ cuda_for_each_index()

template<typename P , typename I , typename C >
void tf::cuda_for_each_index ( P &&  p,
first,
last,
inc,
c 
)

performs asynchronous parallel iterations over an index-based range of items

Template Parameters
Pexecution policy type
Iinput index type
Cunary operator type
Parameters
pexecution policy object
firstindex to the beginning of the range
lastindex to the end of the range
incstep size between successive iterations
cunary operator to apply to each index

This function is equivalent to a parallel execution of the following loop on a GPU:

// inc is positive [first, last)
for(auto i=first; i<last; i+=inc) {
c(i);
}
// inc is negative [first, last)
for(auto i=first; i>last; i+=inc) {
c(i);
}

◆ cuda_free() [1/2]

template<typename T >
void tf::cuda_free ( T *  ptr)

frees memory on the GPU device

Template Parameters
Tpointer type
Parameters
ptrdevice pointer to memory to free

This method calls cudaFree to free the memory space pointed to by ptr using the current device context of the caller.

◆ cuda_free() [2/2]

template<typename T >
void tf::cuda_free ( T *  ptr,
int  d 
)

frees memory on the GPU device

Template Parameters
Tpointer type
Parameters
ptrdevice pointer to memory to free
ddevice context identifier

This method calls cudaFree to free the memory space pointed to by ptr using the given device context.

◆ cuda_inclusive_scan()

template<typename P , typename I , typename O , typename C >
void tf::cuda_inclusive_scan ( P &&  p,
first,
last,
output,
op,
void *  buf 
)

performs asynchronous inclusive scan over a range of items

Template Parameters
Pexecution policy type
Iinput iterator
Ooutput iterator
Cbinary operator type
Parameters
pexecution policy
firstiterator to the beginning of the input range
lastiterator to the end of the input range
outputiterator to the beginning of the output range
opbinary operator to apply to scan
bufpointer to the temporary buffer

◆ cuda_malloc_device() [1/2]

template<typename T >
T * tf::cuda_malloc_device ( size_t  N)

allocates memory on the current device associated with the caller

The function calls malloc_device from the current device associated with the caller.

◆ cuda_malloc_device() [2/2]

template<typename T >
T * tf::cuda_malloc_device ( size_t  N,
int  d 
)

allocates memory on the given device for holding N elements of type T

The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.

◆ cuda_malloc_shared()

template<typename T >
T * tf::cuda_malloc_shared ( size_t  N)

allocates shared memory for holding N elements of type T

The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.

◆ cuda_max_element()

template<typename P , typename I , typename O >
void tf::cuda_max_element ( P &&  p,
first,
last,
unsigned *  idx,
op,
void *  buf 
)

finds the index of the maximum element in a range

Template Parameters
Pexecution policy type
Iinput iterator type
Ocomparator type
Parameters
pexecution policy object
firstiterator to the beginning of the range
lastiterator to the end of the range
idxsolution index of the maximum element
opcomparison function object
bufpointer to the buffer

The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_max_element_buffer_size bytes for internal use. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
return 0;
}
auto largest = first;
for (++first; first != last; ++first) {
if (op(*largest, *first)) {
largest = first;
}
}
return std::distance(first, largest);

◆ cuda_max_element_buffer_size()

template<typename P , typename T >
unsigned tf::cuda_max_element_buffer_size ( unsigned  count)

queries the buffer size in bytes needed to call tf::cuda_max_element

Template Parameters
Pexecution policy type
Tvalue type
Parameters
countnumber of elements to search

The function is used to decide the buffer size in bytes for calling tf::cuda_max_element.

◆ cuda_memcpy_async()

void tf::cuda_memcpy_async ( cudaStream_t  stream,
void *  dst,
const void *  src,
size_t  count 
)
inline

copies data between host and device asynchronously through a stream

Parameters
streamstream identifier
dstdestination memory address
srcsource memory address
countsize in bytes to copy

The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap.

◆ cuda_memset_async()

void tf::cuda_memset_async ( cudaStream_t  stream,
void *  devPtr,
int  value,
size_t  count 
)
inline

initializes or sets GPU memory to the given value byte by byte

Parameters
streamstream identifier
devPtrpointer to GPU memory
valuevalue to set for each byte of the specified memory
countsize in bytes to set

The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value.

◆ cuda_merge()

template<typename P , typename a_keys_it , typename b_keys_it , typename c_keys_it , typename C >
void tf::cuda_merge ( P &&  p,
a_keys_it  a_keys_first,
a_keys_it  a_keys_last,
b_keys_it  b_keys_first,
b_keys_it  b_keys_last,
c_keys_it  c_keys_first,
comp,
void *  buf 
)

performs asynchronous key-only merge over a range of keys

Template Parameters
Pexecution policy type
a_keys_itfirst key iterator type
b_keys_itsecond key iterator type
c_keys_itoutput key iterator type
Ccomparator type
Parameters
pexecution policy
a_keys_firstiterator to the beginning of the first key range
a_keys_lastiterator to the end of the first key range
b_keys_firstiterator to the beginning of the second key range
b_keys_lastiterator to the end of the second key range
c_keys_firstiterator to the beginning of the output key range
compcomparator
bufpointer to the temporary buffer

This function is equivalent to tf::cuda_merge_by_key without values.

◆ cuda_merge_buffer_size()

template<typename P >
unsigned tf::cuda_merge_buffer_size ( unsigned  a_count,
unsigned  b_count 
)

queries the buffer size in bytes needed to call merge kernels

Template Parameters
Pexecution policy type
Parameters
a_countnumber of elements in the first input array
b_countnumber of elements in the second input array

The function is used to allocate a buffer for calling tf::cuda_merge.

◆ cuda_merge_by_key()

template<typename P , typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
void tf::cuda_merge_by_key ( P &&  p,
a_keys_it  a_keys_first,
a_keys_it  a_keys_last,
a_vals_it  a_vals_first,
b_keys_it  b_keys_first,
b_keys_it  b_keys_last,
b_vals_it  b_vals_first,
c_keys_it  c_keys_first,
c_vals_it  c_vals_first,
comp,
void *  buf 
)

performs asynchronous key-value merge over a range of keys and values

Template Parameters
Pexecution policy type
a_keys_itfirst key iterator type
a_vals_itfirst value iterator type
b_keys_itsecond key iterator type
b_vals_itsecond value iterator type
c_keys_itoutput key iterator type
c_vals_itoutput value iterator type
Ccomparator type
Parameters
pexecution policy
a_keys_firstiterator to the beginning of the first key range
a_keys_lastiterator to the end of the first key range
a_vals_firstiterator to the beginning of the first value range
b_keys_firstiterator to the beginning of the second key range
b_keys_lastiterator to the end of the second key range
b_vals_firstiterator to the beginning of the second value range
c_keys_firstiterator to the beginning of the output key range
c_vals_firstiterator to the beginning of the output value range
compcomparator
bufpointer to the temporary buffer

Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending key order.

At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending order implied by each input element's associated key.

For example, assume:

  • a_keys = {8, 1};
  • a_vals = {1, 2};
  • b_keys = {3, 7};
  • b_vals = {3, 4};

After the merge, we have:

  • c_keys = {1, 3, 7, 8}
  • c_vals = {2, 3, 4, 1}

◆ cuda_min_element()

template<typename P , typename I , typename O >
void tf::cuda_min_element ( P &&  p,
first,
last,
unsigned *  idx,
op,
void *  buf 
)

finds the index of the minimum element in a range

Template Parameters
Pexecution policy type
Iinput iterator type
Ocomparator type
Parameters
pexecution policy object
firstiterator to the beginning of the range
lastiterator to the end of the range
idxsolution index of the minimum element
opcomparison function object
bufpointer to the buffer

The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_min_element_buffer_size bytes for internal use. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
return 0;
}
auto smallest = first;
for (++first; first != last; ++first) {
if (op(*first, *smallest)) {
smallest = first;
}
}
return std::distance(first, smallest);

◆ cuda_min_element_buffer_size()

template<typename P , typename T >
unsigned tf::cuda_min_element_buffer_size ( unsigned  count)

queries the buffer size in bytes needed to call tf::cuda_min_element

Template Parameters
Pexecution policy type
Tvalue type
Parameters
countnumber of elements to search

The function is used to decide the buffer size in bytes for calling tf::cuda_min_element.

◆ cuda_reduce()

template<typename P , typename I , typename T , typename O >
void tf::cuda_reduce ( P &&  p,
first,
last,
T *  res,
op,
void *  buf 
)

performs asynchronous parallel reduction over a range of items

Template Parameters
Pexecution policy type
Iinput iterator type
Tvalue type
Obinary operator type
Parameters
pexecution policy
firstiterator to the beginning of the range
lastiterator to the end of the range
respointer to the result
opbinary operator to apply to reduce elements
bufpointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
*result = op(*result, *first++);
}

◆ cuda_reduce_buffer_size()

template<typename P , typename T >
unsigned tf::cuda_reduce_buffer_size ( unsigned  count)

queries the buffer size in bytes needed to call reduce kernels

Template Parameters
Pexecution policy type
Tvalue type
Parameters
countnumber of elements to reduce

The function is used to allocate a buffer for calling tf::cuda_reduce, tf::cuda_uninitialized_reduce, tf::cuda_transform_reduce, and tf::cuda_transform_uninitialized_reduce.

◆ cuda_scan_buffer_size()

template<typename P , typename T >
unsigned tf::cuda_scan_buffer_size ( unsigned  count)

queries the buffer size in bytes needed to call scan kernels

Template Parameters
Pexecution policy type
Tvalue type
Parameters
countnumber of elements to scan

The function is used to allocate a buffer for calling tf::cuda_inclusive_scan, tf::cuda_exclusive_scan, tf::cuda_transform_inclusive_scan, and tf::cuda_transform_exclusive_scan.

◆ cuda_single_task()

template<typename P , typename C >
void tf::cuda_single_task ( P &&  p,
c 
)

runs a callable asynchronously using one kernel thread

Template Parameters
Pexecution policy type
Cclosure type
Parameters
pexecution policy
cclosure to run by one kernel thread

The function launches a single kernel thread to run the given callable through the stream in the execution policy object.

◆ cuda_sort()

template<typename P , typename K_it , typename C >
void tf::cuda_sort ( P &&  p,
K_it  k_first,
K_it  k_last,
comp,
void *  buf 
)

performs asynchronous key-only sort on a range of items

Template Parameters
Pexecution policy type
K_itkey iterator type
Ccomparator type
Parameters
pexecution policy
k_firstiterator to the beginning of the key range
k_lastiterator to the end of the key range
compbinary comparator
bufpointer to the temporary buffer

This method is equivalent to tf::cuda_sort_by_key without values.

◆ cuda_sort_buffer_size()

template<typename P , typename K , typename V = cudaEmpty>
unsigned tf::cuda_sort_buffer_size ( unsigned  count)

queries the buffer size in bytes needed to call sort kernels for the given number of elements

Template Parameters
Pexecution policy type
Kkey type
Vvalue type (default tf::cudaEmpty)
Parameters
countnumber of keys/values to sort

The function is used to allocate a buffer for calling tf::cuda_sort.

◆ cuda_sort_by_key()

template<typename P , typename K_it , typename V_it , typename C >
void tf::cuda_sort_by_key ( P &&  p,
K_it  k_first,
K_it  k_last,
V_it  v_first,
comp,
void *  buf 
)

performs asynchronous key-value sort on a range of items

Template Parameters
Pexecution policy type
K_itkey iterator type
V_itvalue iterator type
Ccomparator type
Parameters
pexecution policy
k_firstiterator to the beginning of the key range
k_lastiterator to the end of the key range
v_firstiterator to the beginning of the value range
compbinary comparator
bufpointer to the temporary buffer

Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false.

For example, assume:

  • keys are {1, 4, 2, 8, 5, 7}
  • values are {'a', 'b', 'c', 'd', 'e', 'f'}

After sort:

  • keys are {1, 2, 4, 5, 7, 8}
  • values are {'a', 'c', 'b', 'e', 'f', 'd'}

◆ cuda_transform() [1/2]

template<typename P , typename I , typename O , typename C >
void tf::cuda_transform ( P &&  p,
first,
last,
output,
op 
)

performs asynchronous parallel transforms over a range of items

Template Parameters
Pexecution policy type
Iinput iterator type
Ooutput iterator type
Cunary operator type
Parameters
pexecution policy
firstiterator to the beginning of the range
lastiterator to the end of the range
outputiterator to the beginning of the output range
opunary operator to apply to transform each item

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
*output++ = op(*first++);
}

◆ cuda_transform() [2/2]

template<typename P , typename I1 , typename I2 , typename O , typename C >
void tf::cuda_transform ( P &&  p,
I1  first1,
I1  last1,
I2  first2,
O  output,
C  op 
)

performs asynchronous parallel transforms over two ranges of items

Template Parameters
Pexecution policy type
I1first input iterator type
I2second input iterator type
Ooutput iterator type
Cbinary operator type
Parameters
pexecution policy
first1iterator to the beginning of the first range
last1iterator to the end of the first range
first2iterator to the beginning of the second range
outputiterator to the beginning of the output range
opbinary operator to apply to transform each pair of items

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
*output++ = op(*first1++, *first2++);
}

◆ cuda_transform_exclusive_scan()

template<typename P , typename I , typename O , typename C , typename U >
void tf::cuda_transform_exclusive_scan ( P &&  p,
I  first,
I  last,
O  output,
C  bop,
U  uop,
void *  buf 
)

performs asynchronous exclusive scan over a range of items

Template Parameters
Pexecution policy type
Iinput iterator
Ooutput iterator
Cbinary operator type
Uunary operator type
Parameters
pexecution policy
firstiterator to the beginning of the input range
lastiterator to the end of the input range
outputiterator to the beginning of the output range
bopbinary operator to apply to scan
uopunary operator to apply to transform each item before scan
bufpointer to the temporary buffer

◆ cuda_transform_inclusive_scan()

template<typename P , typename I , typename O , typename C , typename U >
void tf::cuda_transform_inclusive_scan ( P &&  p,
I  first,
I  last,
O  output,
C  bop,
U  uop,
void *  buf 
)

performs asynchronous inclusive scan over a range of transformed items

Template Parameters
Pexecution policy type
Iinput iterator
Ooutput iterator
Cbinary operator type
Uunary operator type
Parameters
pexecution policy
firstiterator to the beginning of the input range
lastiterator to the end of the input range
outputiterator to the beginning of the output range
bopbinary operator to apply to scan
uopunary operator to apply to transform each item before scan
bufpointer to the temporary buffer

◆ cuda_transform_reduce()

template<typename P , typename I , typename T , typename O , typename U >
void tf::cuda_transform_reduce ( P &&  p,
I  first,
I  last,
T *  res,
O  bop,
U  uop,
void *  buf 
)

performs asynchronous parallel reduction over a range of transformed items with an initial value

Template Parameters
Pexecution policy type
Iinput iterator type
Tvalue type
Obinary operator type
Uunary operator type
Parameters
pexecution policy
firstiterator to the beginning of the range
lastiterator to the end of the range
respointer to the result
bopbinary operator to apply to reduce elements
uopunary operator to apply to transform elements
bufpointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
*result = bop(*result, uop(*first++));
}

◆ cuda_transform_uninitialized_reduce()

template<typename P , typename I , typename T , typename O , typename U >
void tf::cuda_transform_uninitialized_reduce ( P &&  p,
I  first,
I  last,
T *  res,
O  bop,
U  uop,
void *  buf 
)

performs asynchronous parallel reduction over a range of transformed items without an initial value

Template Parameters
Pexecution policy type
Iinput iterator type
Tvalue type
Obinary operator type
Uunary operator type
Parameters
pexecution policy
firstiterator to the beginning of the range
lastiterator to the end of the range
respointer to the result
bopbinary operator to apply to reduce elements
uopunary operator to apply to transform elements
bufpointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = uop(*first++); // no initial values participate in the loop
while (first != last) {
*result = bop(*result, uop(*first++));
}

◆ cuda_uninitialized_reduce()

template<typename P , typename I , typename T , typename O >
void tf::cuda_uninitialized_reduce ( P &&  p,
I  first,
I  last,
T *  res,
O  op,
void *  buf 
)

performs asynchronous parallel reduction over a range of items without an initial value

Template Parameters
Pexecution policy type
Iinput iterator type
Tvalue type
Obinary operator type
Parameters
pexecution policy
firstiterator to the beginning of the range
lastiterator to the end of the range
respointer to the result
opbinary operator to apply to reduce elements
bufpointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = *first++; // no initial values participate in the loop
while (first != last) {
*result = op(*result, *first++);
}

◆ to_string()

const char * tf::to_string ( TaskType  type)
inline

convert a task type to a human-readable string

The name of each task type is the lowercase string of its characters.

TaskType::PLACEHOLDER -> "placeholder"
TaskType::CUDAFLOW -> "cudaflow"
TaskType::SYCLFLOW -> "syclflow"
TaskType::STATIC -> "static"
TaskType::DYNAMIC -> "subflow"
TaskType::CONDITION -> "condition"
TaskType::MULTI_CONDITION -> "multi_condition"
TaskType::MODULE -> "module"
TaskType::ASYNC -> "async"
TaskType::RUNTIME -> "runtime"
@ DYNAMIC
dynamic (subflow) task type
@ MODULE
module task type
@ CUDAFLOW
cudaFlow task type
@ MULTI_CONDITION
multi-condition task type
@ CONDITION
condition task type
@ SYCLFLOW
syclFlow task type
@ ASYNC
asynchronous task type
@ PLACEHOLDER
placeholder task type
@ RUNTIME
runtime task type
@ STATIC
static task type

Variable Documentation

◆ is_condition_task_v

template<typename C >
constexpr bool tf::is_condition_task_v = std::is_invocable_r_v<int, C>
constexpr

determines if a callable is a condition task

A condition task is a callable object constructible from std::function<int()>.

◆ is_cudaflow_task_v

template<typename C >
constexpr bool tf::is_cudaflow_task_v
constexpr
Initial value:

determines if a callable is a cudaFlow task

A cudaFlow task is a callable object constructible from std::function<void(tf::cudaFlow&)> or std::function<void(tf::cudaFlowCapturer&)>.

◆ is_dynamic_task_v

template<typename C >
constexpr bool tf::is_dynamic_task_v = std::is_invocable_r_v<void, C, Subflow&>
constexpr

determines if a callable is a dynamic task

A dynamic task is a callable object constructible from std::function<void(Subflow&)>.

◆ is_multi_condition_task_v

template<typename C >
constexpr bool tf::is_multi_condition_task_v
constexpr
Initial value:

determines if a callable is a multi-condition task

A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()>.

◆ is_runtime_task_v

template<typename C >
constexpr bool tf::is_runtime_task_v = std::is_invocable_r_v<void, C, Runtime&>
constexpr

determines if a callable is a runtime task

A runtime task is a callable object constructible from std::function<void(tf::Runtime&)>.

◆ is_static_task_v

template<typename C >
constexpr bool tf::is_static_task_v
constexpr
Initial value:

determines if a callable is a static task

A static task is a callable object constructible from std::function<void()>.

◆ is_syclflow_task_v

template<typename C >
constexpr bool tf::is_syclflow_task_v = std::is_invocable_r_v<void, C, syclFlow&>
constexpr

determines if a callable is a syclFlow task

A syclFlow task is a callable object constructible from std::function<void(tf::syclFlow&)>.