taskflow namespace More...

Classes
class	ChromeObserver
	class to create an observer based on Chrome tracing format More...

class	CriticalSection
	class to create a critical region of limited workers to run tasks More...

class	cudaDeviceAllocator
	class to create a CUDA device allocator More...

class	cudaEvent
	‍** More...

class	cudaExecutionPolicy
	class to define execution policy for CUDA standard algorithms More...

class	cudaFlow
	class to create a cudaFlow task dependency graph More...

class	cudaFlowCapturer
	class to create a cudaFlow graph using stream capture More...

class	cudaLinearCapturing
	class to capture a linear CUDA graph using a sequential stream More...

class	cudaRoundRobinCapturing
	class to capture a CUDA graph using a round-robin algorithm More...

class	cudaScopedDevice
	class to create an RAII-styled context switch More...

class	cudaSequentialCapturing
	class to capture a CUDA graph using a sequential stream More...

class	cudaStream
	‍** More...

class	cudaTask
	class to create a task handle over an internal node of a cudaFlow graph More...

class	cudaUSMAllocator
	class to create a unified shared memory (USM) allocator More...

class	Executor
	class to create an executor for running a taskflow graph More...

class	FlowBuilder
	class to build a task dependency graph More...

class	Future
	class to access the result of an execution More...

class	Graph
	class to create a graph object More...

class	ObserverInterface
	class to derive an executor observer More...

class	Pipe
	class to create a pipe object for a pipeline stage More...

class	Pipeflow
	class to create a pipeflow object used by the pipe callable More...

class	Pipeline
	class to create a pipeline scheduling framework More...

class	Runtime
	class to create a runtime object used by a runtime task More...

class	ScalablePipeline
	class to create a scalable pipeline object More...

class	Semaphore
	class to create a semophore object for building a concurrency constraint More...

class	SmallVector
	class to define a vector optimized for small array More...

class	Subflow
	class to construct a subflow graph from the execution of a dynamic task More...

class	syclFlow
	class for building a SYCL task dependency graph More...

class	syclTask
	handle to a node of the internal CUDA graph More...

class	Task
	class to create a task handle over a node in a taskflow graph More...

class	Taskflow
	class to create a taskflow object More...

class	TaskView
	class to access task information from the observer interface More...

class	TFProfObserver
	class to create an observer based on the built-in taskflow profiler format More...

class	WorkerView
	class to create an immutable view of a worker in an executor More...

Typedefs
using	observer_stamp_t = std::chrono::time_point< std::chrono::steady_clock >
	default time point type of observers

using	cudaDefaultExecutionPolicy = cudaExecutionPolicy< 512, 9 >
	default execution policy

Enumerations
enum class	TaskType : int { PLACEHOLDER = 0 , CUDAFLOW , SYCLFLOW , STATIC , DYNAMIC , CONDITION , MULTI_CONDITION , MODULE , ASYNC , RUNTIME , UNDEFINED }
	enumeration of all task types More...

enum class	ObserverType : int { TFPROF = 0 , CHROME , UNDEFINED }
	enumeration of all observer types

enum class	PipeType : int { PARALLEL = 1 , SERIAL = 2 }
	enumeration of all pipe types More...

enum class	cudaTaskType : int { EMPTY = 0 , HOST , MEMSET , MEMCPY , KERNEL , SUBFLOW , CAPTURE , UNDEFINED }
	enumeration of all cudaTask types More...

Functions
const char *	to_string (TaskType type)
	convert a task type to a human-readable string

std::ostream &	operator<< (std::ostream &os, const Task &task)
	overload of ostream inserter operator for cudaTask

const char *	to_string (ObserverType type)
	convert an observer type to a human-readable string

size_t	cuda_get_num_devices ()
	queries the number of available devices

int	cuda_get_device ()
	gets the current device associated with the caller thread

void	cuda_set_device (int id)
	switches to a given device context

void	cuda_get_device_property (int i, cudaDeviceProp &p)
	obtains the device property

cudaDeviceProp	cuda_get_device_property (int i)
	obtains the device property

void	cuda_dump_device_property (std::ostream &os, const cudaDeviceProp &p)
	dumps the device property

size_t	cuda_get_device_max_threads_per_block (int d)
	queries the maximum threads per block on a device

size_t	cuda_get_device_max_x_dim_per_block (int d)
	queries the maximum x-dimension per block on a device

size_t	cuda_get_device_max_y_dim_per_block (int d)
	queries the maximum y-dimension per block on a device

size_t	cuda_get_device_max_z_dim_per_block (int d)
	queries the maximum z-dimension per block on a device

size_t	cuda_get_device_max_x_dim_per_grid (int d)
	queries the maximum x-dimension per grid on a device

size_t	cuda_get_device_max_y_dim_per_grid (int d)
	queries the maximum y-dimension per grid on a device

size_t	cuda_get_device_max_z_dim_per_grid (int d)
	queries the maximum z-dimension per grid on a device

size_t	cuda_get_device_max_shm_per_block (int d)
	queries the maximum shared memory size in bytes per block on a device

size_t	cuda_get_device_warp_size (int d)
	queries the warp size on a device

int	cuda_get_device_compute_capability_major (int d)
	queries the major number of compute capability of a device

int	cuda_get_device_compute_capability_minor (int d)
	queries the minor number of compute capability of a device

bool	cuda_get_device_unified_addressing (int d)
	queries if the device supports unified addressing

int	cuda_get_driver_version ()
	queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver

int	cuda_get_runtime_version ()
	queries the CUDA Runtime version (1000 * major + 10 * minor)

size_t	cuda_get_free_mem (int d)
	queries the free memory (expensive call)

size_t	cuda_get_total_mem (int d)
	queries the total available memory (expensive call)

template<typename T >
T *	cuda_malloc_device (size_t N, int d)
	allocates memory on the given device for holding `N` elements of type `T`

template<typename T >
T *	cuda_malloc_device (size_t N)
	allocates memory on the current device associated with the caller

template<typename T >
T *	cuda_malloc_shared (size_t N)
	allocates shared memory for holding `N` elements of type `T`

template<typename T >
void	cuda_free (T *ptr, int d)
	frees memory on the GPU device

template<typename T >
void	cuda_free (T *ptr)
	frees memory on the GPU device

void	cuda_memcpy_async (cudaStream_t stream, void dst, const void src, size_t count)
	copies data between host and device asynchronously through a stream

void	cuda_memset_async (cudaStream_t stream, void *devPtr, int value, size_t count)
	initializes or sets GPU memory to the given value byte by byte

constexpr const char *	to_string (cudaTaskType type)
	convert a cuda_task type to a human-readable string

std::ostream &	operator<< (std::ostream &os, const cudaTask &ct)
	overload of ostream inserter operator for cudaTask

template<typename P , typename C >
void	cuda_single_task (P &&p, C c)
	runs a callable asynchronously using one kernel thread

template<typename P , typename I , typename C >
void	cuda_for_each (P &&p, I first, I last, C c)
	performs asynchronous parallel iterations over a range of items

template<typename P , typename I , typename C >
void	cuda_for_each_index (P &&p, I first, I last, I inc, C c)
	performs asynchronous parallel iterations over an index-based range of items

template<typename P , typename I , typename O , typename C >
void	cuda_transform (P &&p, I first, I last, O output, C op)
	performs asynchronous parallel transforms over a range of items

template<typename P , typename I1 , typename I2 , typename O , typename C >
void	cuda_transform (P &&p, I1 first1, I1 last1, I2 first2, O output, C op)
	performs asynchronous parallel transforms over two ranges of items

template<typename P , typename T >
unsigned	cuda_reduce_buffer_size (unsigned count)
	queries the buffer size in bytes needed to call reduce kernels

template<typename P , typename I , typename T , typename O >
void	cuda_reduce (P &&p, I first, I last, T res, O op, void buf)
	performs asynchronous parallel reduction over a range of items

template<typename P , typename I , typename T , typename O >
void	cuda_uninitialized_reduce (P &&p, I first, I last, T res, O op, void buf)
	performs asynchronous parallel reduction over a range of items without an initial value

template<typename P , typename I , typename T , typename O , typename U >
void	cuda_transform_reduce (P &&p, I first, I last, T res, O bop, U uop, void buf)
	performs asynchronous parallel reduction over a range of transformed items without an initial value

template<typename P , typename I , typename T , typename O , typename U >
void	cuda_transform_uninitialized_reduce (P &&p, I first, I last, T res, O bop, U uop, void buf)
	performs asynchronous parallel reduction over a range of transformed items with an initial value

template<typename P , typename T >
unsigned	cuda_scan_buffer_size (unsigned count)
	queries the buffer size in bytes needed to call scan kernels

template<typename P , typename I , typename O , typename C >
void	cuda_inclusive_scan (P &&p, I first, I last, O output, C op, void *buf)
	performs asynchronous inclusive scan over a range of items

template<typename P , typename I , typename O , typename C , typename U >
void	cuda_transform_inclusive_scan (P &&p, I first, I last, O output, C bop, U uop, void *buf)
	performs asynchronous inclusive scan over a range of transformed items

template<typename P , typename I , typename O , typename C >
void	cuda_exclusive_scan (P &&p, I first, I last, O output, C op, void *buf)
	performs asynchronous exclusive scan over a range of items

template<typename P , typename I , typename O , typename C , typename U >
void	cuda_transform_exclusive_scan (P &&p, I first, I last, O output, C bop, U uop, void *buf)
	performs asynchronous exclusive scan over a range of items

template<typename P >
unsigned	cuda_merge_buffer_size (unsigned a_count, unsigned b_count)
	queries the buffer size in bytes needed to call merge kernels

template<typename P , typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
void	cuda_merge_by_key (P &&p, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp, void *buf)
	performs asynchronous key-value merge over a range of keys and values

template<typename P , typename a_keys_it , typename b_keys_it , typename c_keys_it , typename C >
void	cuda_merge (P &&p, a_keys_it a_keys_first, a_keys_it a_keys_last, b_keys_it b_keys_first, b_keys_it b_keys_last, c_keys_it c_keys_first, C comp, void *buf)
	performs asynchronous key-only merge over a range of keys

template<typename P , typename K , typename V = cudaEmpty>
unsigned	cuda_sort_buffer_size (unsigned count)
	queries the buffer size in bytes needed to call sort kernels for the given number of elements

template<typename P , typename K_it , typename V_it , typename C >
void	cuda_sort_by_key (P &&p, K_it k_first, K_it k_last, V_it v_first, C comp, void *buf)
	performs asynchronous key-value sort on a range of items

template<typename P , typename K_it , typename C >
void	cuda_sort (P &&p, K_it k_first, K_it k_last, C comp, void *buf)
	performs asynchronous key-only sort on a range of items

template<typename P , typename I , typename U >
void	cuda_find_if (P &&p, I first, I last, unsigned *idx, U op)
	finds the index of the first element that satisfies the given criteria

template<typename P , typename T >
unsigned	cuda_min_element_buffer_size (unsigned count)
	queries the buffer size in bytes needed to call tf::cuda_min_element

template<typename P , typename I , typename O >
void	cuda_min_element (P &&p, I first, I last, unsigned idx, O op, void buf)
	finds the index of the minimum element in a range

template<typename P , typename T >
unsigned	cuda_max_element_buffer_size (unsigned count)
	queries the buffer size in bytes needed to call tf::cuda_max_element

template<typename P , typename I , typename O >
void	cuda_max_element (P &&p, I first, I last, unsigned idx, O op, void buf)
	finds the index of the maximum element in a range

std::ostream &	operator<< (std::ostream &os, const syclTask &ct)
	overload of ostream inserter operator for syclTask

constexpr const char *	version ()
	queries the version information in a string format `major.minor.patch`

Variables
template<typename C >
constexpr bool	is_static_task_v
	determines if a callable is a static task

template<typename C >
constexpr bool	is_dynamic_task_v = std::is_invocable_r_v<void, C, Subflow&>
	determines if a callable is a dynamic task

template<typename C >
constexpr bool	is_condition_task_v = std::is_invocable_r_v<int, C>
	determines if a callable is a condition task

template<typename C >
constexpr bool	is_multi_condition_task_v
	determines if a callable is a multi-condition task

template<typename C >
constexpr bool	is_cudaflow_task_v
	determines if a callable is a cudaFlow task

template<typename C >
constexpr bool	is_syclflow_task_v = std::is_invocable_r_v<void, C, syclFlow&>
	determines if a callable is a syclFlow task

template<typename C >
constexpr bool	is_runtime_task_v = std::is_invocable_r_v<void, C, Runtime&>
	determines if a callable is a runtime task

Detailed Description

taskflow namespace

Enumeration Type Documentation

◆ cudaTaskType

enum class tf::cudaTaskType : int

strong

enumeration of all cudaTask types

Enumerator
EMPTY	empty task type
HOST	host task type
MEMSET	memory set task type
MEMCPY	memory copy task type
KERNEL	memory copy task type
SUBFLOW	subflow (child graph) task type
CAPTURE	capture task type
UNDEFINED	undefined task type

◆ PipeType

enum class tf::PipeType : int

strong

enumeration of all pipe types

Enumerator
PARALLEL	parallel type
SERIAL	serial type

◆ TaskType

enum class tf::TaskType : int

strong

enumeration of all task types

Enumerator
PLACEHOLDER	placeholder task type
CUDAFLOW	cudaFlow task type
SYCLFLOW	syclFlow task type
STATIC	static task type
DYNAMIC	dynamic (subflow) task type
CONDITION	condition task type
MULTI_CONDITION	multi-condition task type
MODULE	module task type
ASYNC	asynchronous task type
RUNTIME	runtime task type
UNDEFINED	undefined task type (for internal use only)

Function Documentation

◆ cuda_exclusive_scan()

template<typename P , typename I , typename O , typename C >

void tf::cuda_exclusive_scan	(	P &&	p,
		I	first,
		I	last,
		O	output,
		C	op,
		void *	buf
	)

performs asynchronous exclusive scan over a range of items

Template Parameters

P	execution policy type
I	input iterator
O	output iterator
C	binary operator type

Parameters

p	execution policy
first	iterator to the beginning of the input range
last	iterator to the end of the input range
output	iterator to the beginning of the output range
op	binary operator to apply to scan
buf	pointer to the temporary buffer

◆ cuda_find_if()

template<typename P , typename I , typename U >

void tf::cuda_find_if	(	P &&	p,
		I	first,
		I	last,
		unsigned *	idx,
		U	op
	)

finds the index of the first element that satisfies the given criteria

Template Parameters

P	execution policy type
I	input iterator type
U	unary operator type

Parameters

p	execution policy
first	iterator to the beginning of the range
last	iterator to the end of the range
idx	pointer to the index of the found element
op	unary operator which returns `true` for the required element

The function launches kernels asynchronously to find the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. This is equivalent to the parallel execution of the following loop:

unsigned idx = 0;
for(; first != last; ++first, ++idx) {
  if (p(*first)) {
    return idx;
  }
}
return idx;

◆ cuda_for_each()

template<typename P , typename I , typename C >

void tf::cuda_for_each	(	P &&	p,
		I	first,
		I	last,
		C	c
	)

performs asynchronous parallel iterations over a range of items

Template Parameters

P	execution policy type
I	input iterator type
C	unary operator type

Parameters

p	execution policy object
first	iterator to the beginning of the range
last	iterator to the end of the range
c	unary operator to apply to each dereferenced iterator

This function is equivalent to a parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; itr++) {
  c(*itr);
}

◆ cuda_for_each_index()

template<typename P , typename I , typename C >

void tf::cuda_for_each_index	(	P &&	p,
		I	first,
		I	last,
		I	inc,
		C	c
	)

performs asynchronous parallel iterations over an index-based range of items

Template Parameters

P	execution policy type
I	input index type
C	unary operator type

Parameters

p	execution policy object
first	index to the beginning of the range
last	index to the end of the range
inc	step size between successive iterations
c	unary operator to apply to each index

This function is equivalent to a parallel execution of the following loop on a GPU:

// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
  c(i);
}
 
// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
  c(i);
}

◆ cuda_free() [1/2]

template<typename T >

void tf::cuda_free ( T * ptr )

frees memory on the GPU device

Template Parameters

T	pointer type

Parameters

ptr	device pointer to memory to free

This methods call cudaFree to free the memory space pointed to by ptr using the current device context of the caller.

◆ cuda_free() [2/2]

template<typename T >

void tf::cuda_free	(	T *	ptr,
		int	d
	)

frees memory on the GPU device

Template Parameters

T	pointer type

Parameters

ptr	device pointer to memory to free
d	device context identifier

This methods call cudaFree to free the memory space pointed to by ptr using the given device context.

◆ cuda_inclusive_scan()

template<typename P , typename I , typename O , typename C >

void tf::cuda_inclusive_scan	(	P &&	p,
		I	first,
		I	last,
		O	output,
		C	op,
		void *	buf
	)

performs asynchronous inclusive scan over a range of items

Template Parameters

P	execution policy type
I	input iterator
O	output iterator
C	binary operator type

Parameters

p	execution policy
first	iterator to the beginning of the input range
last	iterator to the end of the input range
output	iterator to the beginning of the output range
op	binary operator to apply to scan
buf	pointer to the temporary buffer

◆ cuda_malloc_device() [1/2]

template<typename T >

T * tf::cuda_malloc_device ( size_t N )

allocates memory on the current device associated with the caller

The function calls malloc_device from the current device associated with the caller.

◆ cuda_malloc_device() [2/2]

template<typename T >

T * tf::cuda_malloc_device	(	size_t	N,
		int	d
	)

allocates memory on the given device for holding N elements of type T

The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.

◆ cuda_malloc_shared()

template<typename T >

T * tf::cuda_malloc_shared ( size_t N )

allocates shared memory for holding N elements of type T

The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.

◆ cuda_max_element()

template<typename P , typename I , typename O >

void tf::cuda_max_element	(	P &&	p,
		I	first,
		I	last,
		unsigned *	idx,
		O	op,
		void *	buf
	)

finds the index of the maximum element in a range

Template Parameters

P	execution policy type
I	input iterator type
O	comparator type

Parameters

p	execution policy object
first	iterator to the beginning of the range
last	iterator to the end of the range
idx	solution index of the maximum element
op	comparison function object
buf	pointer to the buffer

The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_max_element_buffer_size bytes for internal use. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
  return 0;
}
auto largest = first;
for (++first; first != last; ++first) {
  if (op(*largest, *first)) {
    largest = first;
  }
}
return std::distance(first, largest);

◆ cuda_max_element_buffer_size()

template<typename P , typename T >

unsigned tf::cuda_max_element_buffer_size ( unsigned count )

queries the buffer size in bytes needed to call tf::cuda_max_element

Template Parameters

P	execution policy type
T	value type

Parameters

count number of elements to search

The function is used to decide the buffer size in bytes for calling tf::cuda_max_element.

◆ cuda_memcpy_async()

void tf::cuda_memcpy_async	(	cudaStream_t	stream,
		void *	dst,
		const void *	src,
		size_t	count
	)

inline

copies data between host and device asynchronously through a stream

Parameters

stream	stream identifier
dst	destination memory address
src	source memory address
count	size in bytes to copy

The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap.

◆ cuda_memset_async()

void tf::cuda_memset_async	(	cudaStream_t	stream,
		void *	devPtr,
		int	value,
		size_t	count
	)

inline

initializes or sets GPU memory to the given value byte by byte

Parameters

stream	stream identifier
devPtr	pointer to GPU mempry
value	value to set for each byte of the specified memory
count	size in bytes to set

The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value.

◆ cuda_merge()

template<typename P , typename a_keys_it , typename b_keys_it , typename c_keys_it , typename C >

void tf::cuda_merge	(	P &&	p,
		a_keys_it	a_keys_first,
		a_keys_it	a_keys_last,
		b_keys_it	b_keys_first,
		b_keys_it	b_keys_last,
		c_keys_it	c_keys_first,
		C	comp,
		void *	buf
	)

performs asynchronous key-only merge over a range of keys

Template Parameters

P	execution policy type
a_keys_it	first key iterator type
b_keys_it	second key iterator type
c_keys_it	output key iterator type
C	comparator type

Parameters

p	execution policy
a_keys_first	iterator to the beginning of the first key range
a_keys_last	iterator to the end of the first key range
b_keys_first	iterator to the beginning of the second key range
b_keys_last	iterator to the end of the second key range
c_keys_first	iterator to the beginning of the output key range
comp	comparator
buf	pointer to the temporary buffer

This function is equivalent to tf::cuda_merge_by_key without values.

◆ cuda_merge_buffer_size()

template<typename P >

unsigned tf::cuda_merge_buffer_size	(	unsigned	a_count,
		unsigned	b_count
	)

queries the buffer size in bytes needed to call merge kernels

Template Parameters

P	execution polity type

Parameters

a_count	number of elements in the first input array
b_count	number of elements in the second input array

The function is used to allocate a buffer for calling tf::cuda_merge.

◆ cuda_merge_by_key()

template<typename P , typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >

void tf::cuda_merge_by_key	(	P &&	p,
		a_keys_it	a_keys_first,
		a_keys_it	a_keys_last,
		a_vals_it	a_vals_first,
		b_keys_it	b_keys_first,
		b_keys_it	b_keys_last,
		b_vals_it	b_vals_first,
		c_keys_it	c_keys_first,
		c_vals_it	c_vals_first,
		C	comp,
		void *	buf
	)

performs asynchronous key-value merge over a range of keys and values

Template Parameters

P	execution policy type
a_keys_it	first key iterator type
a_vals_it	first value iterator type
b_keys_it	second key iterator type
b_vals_it	second value iterator type
c_keys_it	output key iterator type
c_vals_it	output value iterator type
C	comparator type

Parameters

p	execution policy
a_keys_first	iterator to the beginning of the first key range
a_keys_last	iterator to the end of the first key range
a_vals_first	iterator to the beginning of the first value range
b_keys_first	iterator to the beginning of the second key range
b_keys_last	iterator to the end of the second key range
b_vals_first	iterator to the beginning of the second value range
c_keys_first	iterator to the beginning of the output key range
c_vals_first	iterator to the beginning of the output value range
comp	comparator
buf	pointer to the temporary buffer

Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_last + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending key order.

At the same time, the merge copies elements from the two associated ranges [a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending order implied by each input element's associated key.

For example, assume:

a_keys = {8, 1};
a_vals = {1, 2};
b_keys = {3, 7};
b_vals = {3, 4};

After the merge, we have:

c_keys = {1, 3, 7, 8}
c_vals = {2, 3, 4, 1}

◆ cuda_min_element()

template<typename P , typename I , typename O >

void tf::cuda_min_element	(	P &&	p,
		I	first,
		I	last,
		unsigned *	idx,
		O	op,
		void *	buf
	)

finds the index of the minimum element in a range

Template Parameters

P	execution policy type
I	input iterator type
O	comparator type

Parameters

p	execution policy object
first	iterator to the beginning of the range
last	iterator to the end of the range
idx	solution index of the minimum element
op	comparison function object
buf	pointer to the buffer

The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. You need to provide a buffer that holds at least tf::cuda_min_element_buffer_size bytes for internal use. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
  return 0;
}
auto smallest = first;
for (++first; first != last; ++first) {
  if (op(*first, *smallest)) {
    smallest = first;
  }
}
return std::distance(first, smallest);

◆ cuda_min_element_buffer_size()

template<typename P , typename T >

unsigned tf::cuda_min_element_buffer_size ( unsigned count )

queries the buffer size in bytes needed to call tf::cuda_min_element

Template Parameters

P	execution policy type
T	value type

Parameters

count number of elements to search

The function is used to decide the buffer size in bytes for calling tf::cuda_min_element.

◆ cuda_reduce()

template<typename P , typename I , typename T , typename O >

void tf::cuda_reduce	(	P &&	p,
		I	first,
		I	last,
		T *	res,
		O	op,
		void *	buf
	)

performs asynchronous parallel reduction over a range of items

Template Parameters

P	execution policy type
I	input iterator type
T	value type
O	binary operator type

Parameters

p	execution policy
first	iterator to the beginning of the range
last	iterator to the end of the range
res	pointer to the result
op	binary operator to apply to reduce elements
buf	pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *result = op(*result, *first++);
}

◆ cuda_reduce_buffer_size()

template<typename P , typename T >

unsigned tf::cuda_reduce_buffer_size ( unsigned count )

queries the buffer size in bytes needed to call reduce kernels

Template Parameters

P	execution policy type
T	value type

Parameters

count number of elements to reduce

The function is used to allocate a buffer for calling tf::cuda_reduce, tf::cuda_uninitialized_reduce, tf::cuda_transform_reduce, and tf::cuda_transform_uninitialized_reduce.

◆ cuda_scan_buffer_size()

template<typename P , typename T >

unsigned tf::cuda_scan_buffer_size ( unsigned count )

queries the buffer size in bytes needed to call scan kernels

Template Parameters

P	execution policy type
T	value type

Parameters

count number of elements to scan

The function is used to allocate a buffer for calling tf::cuda_inclusive_scan, tf::cuda_exclusive_scan, tf::cuda_transform_inclusive_scan, and tf::cuda_transform_exclusive_scan.

◆ cuda_single_task()

template<typename P , typename C >

void tf::cuda_single_task	(	P &&	p,
		C	c
	)

runs a callable asynchronously using one kernel thread

Template Parameters

P	execution policy type
C	closure type

Parameters

p	execution policy
c	closure to run by one kernel thread

The function launches a single kernel thread to run the given callable through the stream in the execution policy object.

◆ cuda_sort()

template<typename P , typename K_it , typename C >

void tf::cuda_sort	(	P &&	p,
		K_it	k_first,
		K_it	k_last,
		C	comp,
		void *	buf
	)

performs asynchronous key-only sort on a range of items

Template Parameters

P	execution policy type
K_it	key iterator type
C	comparator type

Parameters

p	execution policy
k_first	iterator to the beginning of the key range
k_last	iterator to the end of the key range
comp	binary comparator
buf	pointer to the temporary buffer

This method is equivalent to tf::cuda_sort_by_key without values.

◆ cuda_sort_buffer_size()

template<typename P , typename K , typename V = cudaEmpty>

unsigned tf::cuda_sort_buffer_size ( unsigned count )

queries the buffer size in bytes needed to call sort kernels for the given number of elements

Template Parameters

P	execution policy type
K	key type
V	value type (default tf::cudaEmpty)

Parameters

count number of keys/values to sort

The function is used to allocate a buffer for calling tf::cuda_sort.

◆ cuda_sort_by_key()

template<typename P , typename K_it , typename V_it , typename C >

void tf::cuda_sort_by_key	(	P &&	p,
		K_it	k_first,
		K_it	k_last,
		V_it	v_first,
		C	comp,
		void *	buf
	)

performs asynchronous key-value sort on a range of items

Template Parameters

P	execution policy type
K_it	key iterator type
V_it	value iterator type
C	comparator type

Parameters

p	execution policy
k_first	iterator to the beginning of the key range
k_last	iterator to the end of the key range
v_first	iterator to the beginning of the value range
comp	binary comparator
buf	pointer to the temporary buffer

Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false.

For example, assume:

keys are {1, 4, 2, 8, 5, 7}
values are {'a', 'b', 'c', 'd', 'e', 'f'}

After sort:

keys are {1, 2, 4, 5, 7, 8}
values are {'a', 'c', 'b', 'e', 'f', 'd'}

◆ cuda_transform() [1/2]

template<typename P , typename I , typename O , typename C >

void tf::cuda_transform	(	P &&	p,
		I	first,
		I	last,
		O	output,
		C	op
	)

performs asynchronous parallel transforms over a range of items

Template Parameters

P	execution policy type
I	input iterator type
O	output iterator type
C	unary operator type

Parameters

p	execution policy
first	iterator to the beginning of the range
last	iterator to the end of the range
output	iterator to the beginning of the output range
op	unary operator to apply to transform each item

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *output++ = op(*first++);
}

◆ cuda_transform() [2/2]

template<typename P , typename I1 , typename I2 , typename O , typename C >

void tf::cuda_transform	(	P &&	p,
		I1	first1,
		I1	last1,
		I2	first2,
		O	output,
		C	op
	)

performs asynchronous parallel transforms over two ranges of items

Template Parameters

P	execution policy type
I1	first input iterator type
I2	second input iterator type
O	output iterator type
C	binary operator type

Parameters

p	execution policy
first1	iterator to the beginning of the first range
last1	iterator to the end of the first range
first2	iterator to the beginning of the second range
output	iterator to the beginning of the output range
op	binary operator to apply to transform each pair of items

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
  *output++ = op(*first1++, *first2++);
}

◆ cuda_transform_exclusive_scan()

template<typename P , typename I , typename O , typename C , typename U >

void tf::cuda_transform_exclusive_scan	(	P &&	p,
		I	first,
		I	last,
		O	output,
		C	bop,
		U	uop,
		void *	buf
	)

performs asynchronous exclusive scan over a range of items

Template Parameters

P	execution policy type
I	input iterator
O	output iterator
C	binary operator type
U	unary operator type

Parameters

p	execution policy
first	iterator to the beginning of the input range
last	iterator to the end of the input range
output	iterator to the beginning of the output range
bop	binary operator to apply to scan
uop	unary operator to apply to transform each item before scan
buf	pointer to the temporary buffer

◆ cuda_transform_inclusive_scan()

template<typename P , typename I , typename O , typename C , typename U >

void tf::cuda_transform_inclusive_scan	(	P &&	p,
		I	first,
		I	last,
		O	output,
		C	bop,
		U	uop,
		void *	buf
	)

performs asynchronous inclusive scan over a range of transformed items

Template Parameters

P	execution policy type
I	input iterator
O	output iterator
C	binary operator type
U	unary operator type

Parameters

p	execution policy
first	iterator to the beginning of the input range
last	iterator to the end of the input range
output	iterator to the beginning of the output range
bop	binary operator to apply to scan
uop	unary operator to apply to transform each item before scan
buf	pointer to the temporary buffer

◆ cuda_transform_reduce()

template<typename P , typename I , typename T , typename O , typename U >

void tf::cuda_transform_reduce	(	P &&	p,
		I	first,
		I	last,
		T *	res,
		O	bop,
		U	uop,
		void *	buf
	)

performs asynchronous parallel reduction over a range of transformed items without an initial value

Template Parameters

P	execution policy type
I	input iterator type
T	value type
O	binary operator type
U	unary operator type

Parameters

p	execution policy
first	iterator to the beginning of the range
last	iterator to the end of the range
res	pointer to the result
bop	binary operator to apply to reduce elements
uop	unary operator to apply to transform elements
buf	pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *result = bop(*result, uop(*first++));
}

◆ cuda_transform_uninitialized_reduce()

template<typename P , typename I , typename T , typename O , typename U >

void tf::cuda_transform_uninitialized_reduce	(	P &&	p,
		I	first,
		I	last,
		T *	res,
		O	bop,
		U	uop,
		void *	buf
	)

performs asynchronous parallel reduction over a range of transformed items with an initial value

Template Parameters

P	execution policy type
I	input iterator type
T	value type
O	binary operator type
U	unary operator type

Parameters

p	execution policy
first	iterator to the beginning of the range
last	iterator to the end of the range
res	pointer to the result
bop	binary operator to apply to reduce elements
uop	unary operator to apply to transform elements
buf	pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = uop(*first++);  // no initial values partitipcate in the loop
while (first != last) {
  *result = bop(*result, uop(*first++));
}

◆ cuda_uninitialized_reduce()

template<typename P , typename I , typename T , typename O >

void tf::cuda_uninitialized_reduce	(	P &&	p,
		I	first,
		I	last,
		T *	res,
		O	op,
		void *	buf
	)

performs asynchronous parallel reduction over a range of items without an initial value

Template Parameters

P	execution policy type
I	input iterator type
T	value type
O	binary operator type

Parameters

p	execution policy
first	iterator to the beginning of the range
last	iterator to the end of the range
res	pointer to the result
op	binary operator to apply to reduce elements
buf	pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = *first++;  // no initial values partitipcate in the loop
while (first != last) {
  *result = op(*result, *first++);
}

◆ to_string()

const char * tf::to_string ( TaskType type )

inline

convert a task type to a human-readable string

The name of each task type is the litte-case string of its characters.

TaskType::PLACEHOLDER     ->  "placeholder"
TaskType::CUDAFLOW        ->  "cudaflow"
TaskType::SYCLFLOW        ->  "syclflow"
TaskType::STATIC          ->  "static"
TaskType::DYNAMIC         ->  "subflow"
TaskType::CONDITION       ->  "condition"
TaskType::MULTI_CONDITION ->  "multi_condition"
TaskType::MODULE          ->  "module"
TaskType::ASYNC           ->  "async"
TaskType::RUNTIME         ->  "runtime"

Variable Documentation

◆ is_condition_task_v

template<typename C >

constexpr bool tf::is_condition_task_v = std::is_invocable_r_v<int, C>

constexpr

determines if a callable is a condition task

A condition task is a callable object constructible from std::function<int()>.

◆ is_cudaflow_task_v

template<typename C >

constexpr bool tf::is_cudaflow_task_v

constexpr

Initial value:

= std::is_invocable_r_v<void, C, cudaFlow&> ||

std::is_invocable_r_v<void, C, cudaFlowCapturer&>

std::forward

T forward(T... args)

determines if a callable is a cudaFlow task

A cudaFlow task is a callable object constructible from std::function<void(tf::cudaFlow&)> or std::function<void(tf::cudaFlowCapturer&)>.

◆ is_dynamic_task_v

template<typename C >

constexpr bool tf::is_dynamic_task_v = std::is_invocable_r_v<void, C, Subflow&>

constexpr

determines if a callable is a dynamic task

A dynamic task is a callable object constructible from std::function<void(Subflow&)>.

◆ is_multi_condition_task_v

template<typename C >

constexpr bool tf::is_multi_condition_task_v

constexpr

Initial value:

=

std::is_invocable_r_v<SmallVector<int>, C>

determines if a callable is a multi-condition task

A multi-condition task is a callable object constructible from std::function<tf::SmallVector<int>()>.

◆ is_runtime_task_v

template<typename C >

constexpr bool tf::is_runtime_task_v = std::is_invocable_r_v<void, C, Runtime&>

constexpr

determines if a callable is a runtime task

A runtime task is a callable object constructible from std::function<void(tf::Runtime&)>.

◆ is_static_task_v

template<typename C >

constexpr bool tf::is_static_task_v

constexpr

Initial value:

=
  std::is_invocable_r_v<void, C> &&
  !std::is_invocable_r_v<int, C> &&
  !std::is_invocable_r_v<tf::SmallVector<int>, C>

determines if a callable is a static task

A static task is a callable object constructible from std::function<void()>.

◆ is_syclflow_task_v

template<typename C >

constexpr bool tf::is_syclflow_task_v = std::is_invocable_r_v<void, C, syclFlow&>

constexpr

determines if a callable is a syclFlow task

A syclFlow task is a callable object constructible from std::function<void(tf::syclFlow&)>.

Classes

Typedefs

Enumerations

Functions

Variables

Detailed Description

Enumeration Type Documentation

◆ cudaTaskType

◆ PipeType

◆ TaskType

Function Documentation

◆ cuda_exclusive_scan()

◆ cuda_find_if()

◆ cuda_for_each()

◆ cuda_for_each_index()

◆ cuda_free() [1/2]

◆ cuda_free() [2/2]

◆ cuda_inclusive_scan()

◆ cuda_malloc_device() [1/2]

◆ cuda_malloc_device() [2/2]

◆ cuda_malloc_shared()

◆ cuda_max_element()

◆ cuda_max_element_buffer_size()

◆ cuda_memcpy_async()

◆ cuda_memset_async()

◆ cuda_merge()

◆ cuda_merge_buffer_size()

◆ cuda_merge_by_key()

◆ cuda_min_element()

◆ cuda_min_element_buffer_size()

◆ cuda_reduce()

◆ cuda_reduce_buffer_size()

◆ cuda_scan_buffer_size()

◆ cuda_single_task()

◆ cuda_sort()

◆ cuda_sort_buffer_size()

◆ cuda_sort_by_key()

◆ cuda_transform() [1/2]

◆ cuda_transform() [2/2]

◆ cuda_transform_exclusive_scan()

◆ cuda_transform_inclusive_scan()

◆ cuda_transform_reduce()

◆ cuda_transform_uninitialized_reduce()

◆ cuda_uninitialized_reduce()

◆ to_string()

Variable Documentation

◆ is_condition_task_v

◆ is_cudaflow_task_v

◆ is_dynamic_task_v

◆ is_multi_condition_task_v

◆ is_runtime_task_v

◆ is_static_task_v

◆ is_syclflow_task_v