class to create a cudaFlow graph using stream capture More...

#include <cuda_capturer.hpp>

Public Member Functions
	cudaFlowCapturer ()
	constructs a standalone cudaFlowCapturer

virtual	~cudaFlowCapturer ()
	destructs the cudaFlowCapturer

bool	empty () const
	queries the emptiness of the graph

size_t	num_tasks () const
	queries the number of tasks

void	clear ()
	clears this cudaFlow capturer

void	dump (std::ostream &os) const
	dumps the capture graph into a DOT format through an output stream

template<typename OPT , typename... ArgsT>
OPT &	make_optimizer (ArgsT &&... args)
	selects a different optimization algorithm

template<typename C , std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * = nullptr>
cudaTask	on (C &&callable)
	captures a sequence of CUDA operations from the given callable

template<typename C , std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * = nullptr>
void	on (cudaTask task, C &&callable)
	updates a capture task to another sequence of CUDA operations

cudaTask	noop ()
	captures a no-operation task

void	noop (cudaTask task)
	updates a task to a no-operation task

cudaTask	memcpy (void *dst, const void *src, size_t count)
	copies data between host and device asynchronously through a stream

void	memcpy (cudaTask task, void *dst, const void *src, size_t count)
	updates a capture task to a memcpy operation

template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr>
cudaTask	copy (T *tgt, const T *src, size_t num)
	captures a copy task of typed data

template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * = nullptr>
void	copy (cudaTask task, T *tgt, const T *src, size_t num)
	updates a capture task to a copy operation

cudaTask	memset (void *ptr, int v, size_t n)
	initializes or sets GPU memory to the given value byte by byte

void	memset (cudaTask task, void *ptr, int value, size_t n)
	updates a capture task to a memset operation

template<typename F , typename... ArgsT>
cudaTask	kernel (dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
	captures a kernel

template<typename F , typename... ArgsT>
void	kernel (cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
	updates a capture task to a kernel operation

template<typename C >
cudaTask	single_task (C c)
	captures a kernel that runs the given callable with only one thread

template<typename C >
void	single_task (cudaTask task, C c)
	updates a capture task to a single-threaded kernel

template<typename I , typename C >
cudaTask	for_each (I first, I last, C callable)
	captures a kernel that applies a callable to each dereferenced element of the data array

template<typename I , typename C >
void	for_each (cudaTask task, I first, I last, C callable)
	updates a capture task to a for-each kernel task

template<typename I , typename C >
cudaTask	for_each_index (I first, I last, I step, C callable)
	captures a kernel that applies a callable to each index in the range with the step size

template<typename I , typename C >
void	for_each_index (cudaTask task, I first, I last, I step, C callable)
	updates a capture task to a for-each-index kernel task

template<typename I , typename O , typename C >
cudaTask	transform (I first, I last, O output, C op)
	captures a kernel that transforms an input range to an output range

template<typename I , typename O , typename C >
void	transform (cudaTask task, I first, I last, O output, C op)
	updates a capture task to a transform kernel task

template<typename I1 , typename I2 , typename O , typename C >
cudaTask	transform (I1 first1, I1 last1, I2 first2, O output, C op)
	captures a kernel that transforms two input ranges to an output range

template<typename I1 , typename I2 , typename O , typename C >
void	transform (cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op)
	updates a capture task to a transform kernel task

template<typename I , typename T , typename C >
cudaTask	reduce (I first, I last, T *result, C op)
	captures kernels that perform parallel reduction over a range of items

template<typename I , typename T , typename C >
void	reduce (cudaTask task, I first, I last, T *result, C op)
	updates a capture task to a reduction task

template<typename I , typename T , typename C >
cudaTask	uninitialized_reduce (I first, I last, T *result, C op)
	similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce

template<typename I , typename T , typename C >
void	uninitialized_reduce (cudaTask task, I first, I last, T *result, C op)
	updates a capture task to an uninitialized-reduction task

template<typename I , typename T , typename C , typename U >
cudaTask	transform_reduce (I first, I last, T *result, C bop, U uop)
	captures kernels that perform parallel reduction over a range of transformed items

template<typename I , typename T , typename C , typename U >
void	transform_reduce (cudaTask task, I first, I last, T *result, C bop, U uop)
	updates a capture task to a transform-reduce task

template<typename I , typename T , typename C , typename U >
cudaTask	transform_uninitialized_reduce (I first, I last, T *result, C bop, U uop)
	similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce

template<typename I , typename T , typename C , typename U >
void	transform_uninitialized_reduce (cudaTask task, I first, I last, T *result, C bop, U uop)
	updates a capture task to a transform-reduce task of no initialized value

template<typename I , typename O , typename C >
cudaTask	inclusive_scan (I first, I last, O output, C op)
	captures kernels that perform parallel inclusive scan over a range of items

template<typename I , typename O , typename C >
void	inclusive_scan (cudaTask task, I first, I last, O output, C op)
	updates a capture task to an inclusive scan task

template<typename I , typename O , typename C >
cudaTask	exclusive_scan (I first, I last, O output, C op)
	similar to cudaFlowCapturer::inclusive_scan but excludes the first value

template<typename I , typename O , typename C >
void	exclusive_scan (cudaTask task, I first, I last, O output, C op)
	updates a capture task to an exclusive scan task

template<typename I , typename O , typename B , typename U >
cudaTask	transform_inclusive_scan (I first, I last, O output, B bop, U uop)
	captures kernels that perform parallel inclusive scan over a range of transformed items

template<typename I , typename O , typename B , typename U >
void	transform_inclusive_scan (cudaTask task, I first, I last, O output, B bop, U uop)
	updates a capture task to a transform-inclusive scan task

template<typename I , typename O , typename B , typename U >
cudaTask	transform_exclusive_scan (I first, I last, O output, B bop, U uop)
	similar to cudaFlowCapturer::transform_inclusive_scan but excludes the first value

template<typename I , typename O , typename B , typename U >
void	transform_exclusive_scan (cudaTask task, I first, I last, O output, B bop, U uop)
	updates a capture task to a transform-exclusive scan task

template<typename A , typename B , typename C , typename Comp >
cudaTask	merge (A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
	captures kernels that perform parallel merge on two sorted arrays

template<typename A , typename B , typename C , typename Comp >
void	merge (cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
	updates a capture task to a merge task

template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
cudaTask	merge_by_key (a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
	captures kernels that perform parallel key-value merge

template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >
void	merge_by_key (cudaTask task, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
	updates a capture task to a key-value merge task

template<typename I , typename C >
cudaTask	sort (I first, I last, C comp)
	captures kernels that sort the given array

template<typename I , typename C >
void	sort (cudaTask task, I first, I last, C comp)
	updates a capture task to a sort task

template<typename K_it , typename V_it , typename C >
cudaTask	sort_by_key (K_it k_first, K_it k_last, V_it v_first, C comp)
	captures kernels that sort the given array

template<typename K_it , typename V_it , typename C >
void	sort_by_key (cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp)
	updates a capture task to a key-value sort task

template<typename I , typename U >
cudaTask	find_if (I first, I last, unsigned *idx, U op)
	creates a task to find the index of the first element in a range

template<typename I , typename U >
void	find_if (cudaTask task, I first, I last, unsigned *idx, U op)
	updates the parameters of a find-if task

template<typename I , typename O >
cudaTask	min_element (I first, I last, unsigned *idx, O op)
	finds the index of the minimum element in a range

template<typename I , typename O >
void	min_element (cudaTask task, I first, I last, unsigned *idx, O op)
	updates the parameters of a min-element task

template<typename I , typename O >
cudaTask	max_element (I first, I last, unsigned *idx, O op)
	finds the index of the maximum element in a range

template<typename I , typename O >
void	max_element (cudaTask task, I first, I last, unsigned *idx, O op)
	updates the parameters of a max-element task

template<typename P >
void	offload_until (P &&predicate)
	offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true

void	offload_n (size_t n)
	offloads the captured cudaFlow and executes it the given number of times

void	offload ()
	offloads the captured cudaFlow and executes it once

Friends
class	cudaFlow

class	Executor

Detailed Description

class to create a cudaFlow graph using stream capture

The usage of tf::cudaFlowCapturer is similar to tf::cudaFlow, except users can call the method tf::cudaFlowCapturer::on to capture a sequence of asynchronous CUDA operations through the given stream. The following example creates a CUDA graph that captures two kernel tasks, task_1 and task_2, where task_1 runs before task_2.

taskflow.emplace([](tf::cudaFlowCapturer& capturer){
 
  // capture my_kernel_1 through the given stream managed by the capturer
  auto task_1 = capturer.on([&](cudaStream_t stream){
    my_kernel_1<<<grid_1, block_1, shm_size_1, stream>>>(my_parameters_1);
  });
 
  // capture my_kernel_2 through the given stream managed by the capturer
  auto task_2 = capturer.on([&](cudaStream_t stream){
    my_kernel_2<<<grid_2, block_2, shm_size_2, stream>>>(my_parameters_2);
  });
 
  task_1.precede(task_2);
});

Similar to tf::cudaFlow, a cudaFlowCapturer is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor. That is, the callable that describes a cudaFlowCapturer will be executed sequentially. Inside a cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run in parallel depending on the selected optimization algorithm. By default, we use tf::cudaRoundRobinCapturing to transform a user-level graph into a native CUDA graph.

Please refer to GPU Tasking (cudaFlowCapturer) for details.

Constructor & Destructor Documentation

◆ cudaFlowCapturer()

tf::cudaFlowCapturer::cudaFlowCapturer ( )

inline

constructs a standalone cudaFlowCapturer

A standalone cudaFlow capturer does not go through any taskflow and can be run by the caller thread using explicit offload methods (e.g., tf::cudaFlowCapturer::offload).

Member Function Documentation

◆ copy() [1/2]

template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * >

void tf::cudaFlowCapturer::copy	(	cudaTask	task,
		T *	tgt,
		const T *	src,
		size_t	num
	)

updates a capture task to a copy operation

The method is similar to cudaFlowCapturer::copy but operates on an existing task.

◆ copy() [2/2]

template<typename T , std::enable_if_t<!std::is_same_v< T, void >, void > * >

cudaTask tf::cudaFlowCapturer::copy	(	T *	tgt,
		const T *	src,
		size_t	num
	)

captures a copy task of typed data

Template Parameters

T	element type (non-void)

Parameters

tgt	pointer to the target memory block
src	pointer to the source memory block
num	number of elements to copy

Returns: cudaTask handle

A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.

◆ exclusive_scan()

template<typename I , typename O , typename C >

void tf::cudaFlowCapturer::exclusive_scan	(	cudaTask	task,
		I	first,
		I	last,
		O	output,
		C	op
	)

updates a capture task to an exclusive scan task

This method is similar to cudaFlowCapturer::exclusive_scan but operates on an existing task.

◆ find_if() [1/2]

template<typename I , typename U >

void tf::cudaFlowCapturer::find_if	(	cudaTask	task,
		I	first,
		I	last,
		unsigned *	idx,
		U	op
	)

updates the parameters of a find-if task

This method is similar to tf::cudaFlowCapturer::find_if but operates on an existing task.

◆ find_if() [2/2]

template<typename I , typename U >

cudaTask tf::cudaFlowCapturer::find_if	(	I	first,
		I	last,
		unsigned *	idx,
		U	op
	)

creates a task to find the index of the first element in a range

Template Parameters

I	input iterator type
U	unary operator type

Parameters

first	iterator to the beginning of the range
last	iterator to the end of the range
idx	pointer to the index of the found element
op	unary operator which returns `true` for the required element

Finds the index idx of the first element in the range [first, last) such that op(*(first+idx)) is true. This is equivalent to the parallel execution of the following loop:

unsigned idx = 0;
for(; first != last; ++first, ++idx) {
  if (op(*first)) {
    return idx;
  }
}
return idx;

◆ for_each() [1/2]

template<typename I , typename C >

void tf::cudaFlowCapturer::for_each	(	cudaTask	task,
		I	first,
		I	last,
		C	callable
	)

updates a capture task to a for-each kernel task

This method is similar to cudaFlowCapturer::for_each but operates on an existing task.

◆ for_each() [2/2]

template<typename I , typename C >

cudaTask tf::cudaFlowCapturer::for_each	(	I	first,
		I	last,
		C	callable
	)

captures a kernel that applies a callable to each dereferenced element of the data array

Template Parameters

I	iterator type
C	callable type

Parameters

first	iterator to the beginning
last	iterator to the end
callable	a callable object to apply to the dereferenced iterator

Returns: cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(auto itr = first; itr != last; itr++) {
  callable(*itr);
}

◆ for_each_index() [1/2]

template<typename I , typename C >

void tf::cudaFlowCapturer::for_each_index	(	cudaTask	task,
		I	first,
		I	last,
		I	step,
		C	callable
	)

updates a capture task to a for-each-index kernel task

This method is similar to cudaFlowCapturer::for_each_index but operates on an existing task.

◆ for_each_index() [2/2]

template<typename I , typename C >

cudaTask tf::cudaFlowCapturer::for_each_index	(	I	first,
		I	last,
		I	step,
		C	callable
	)

captures a kernel that applies a callable to each index in the range with the step size

Template Parameters

I	index type
C	callable type

Parameters

first	beginning index
last	last index
step	step size
callable	the callable to apply to each index in the range

Returns: cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
  callable(i);
}
 
// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
  callable(i);
}

◆ inclusive_scan() [1/2]

template<typename I , typename O , typename C >

void tf::cudaFlowCapturer::inclusive_scan	(	cudaTask	task,
		I	first,
		I	last,
		O	output,
		C	op
	)

updates a capture task to an inclusive scan task

This method is similar to cudaFlowCapturer::inclusive_scan but operates on an existing task.

◆ inclusive_scan() [2/2]

template<typename I , typename O , typename C >

cudaTask tf::cudaFlowCapturer::inclusive_scan	(	I	first,
		I	last,
		O	output,
		C	op
	)

captures kernels that perform parallel inclusive scan over a range of items

Template Parameters

I	input iterator type
O	output iterator type
C	binary operator type

Parameters

first	iterator to the beginning
last	iterator to the end
output	iterator to the beginning of the output
op	binary operator

Returns: a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(size_t i=0; i<std::distance(first, last); i++) {
  *(output + i) = i ? op(*(first+i), *(output+i-1)) : *(first+i);
}

◆ kernel() [1/2]

template<typename F , typename... ArgsT>

void tf::cudaFlowCapturer::kernel	(	cudaTask	task,
		dim3	g,
		dim3	b,
		size_t	s,
		F	f,
		ArgsT &&...	args
	)

updates a capture task to a kernel operation

The method is similar to cudaFlowCapturer::kernel but operates on an existing task.

◆ kernel() [2/2]

template<typename F , typename... ArgsT>

cudaTask tf::cudaFlowCapturer::kernel	(	dim3	g,
		dim3	b,
		size_t	s,
		F	f,
		ArgsT &&...	args
	)

captures a kernel

Template Parameters

F	kernel function type
ArgsT	kernel function parameters type

Parameters

g	configured grid
b	configured block
s	configured shared memory size in bytes
f	kernel function
args	arguments to forward to the kernel function by copy

Returns: cudaTask handle

◆ make_optimizer()

template<typename OPT , typename ... ArgsT>

OPT & tf::cudaFlowCapturer::make_optimizer ( ArgsT &&... args )

selects a different optimization algorithm

Template Parameters

OPT	optimizer type
ArgsT	arguments types

Parameters

args	arguments to forward to construct the optimizer

Returns: a reference to the optimizer

We currently support the following optimization algorithms to capture a user-described cudaFlow:

tf::cudaSequentialCapturing
tf::cudaRoundRobinCapturing
tf::cudaLinearCapturing

By default, tf::cudaFlowCapturer uses the round-robin optimization algorithm with four streams to transform a user-level graph into a native CUDA graph.

◆ max_element() [1/2]

template<typename I , typename O >

void tf::cudaFlowCapturer::max_element	(	cudaTask	task,
		I	first,
		I	last,
		unsigned *	idx,
		O	op
	)

updates the parameters of a max-element task

This method is similar to cudaFlowCapturer::max_element but operates on an existing task.

◆ max_element() [2/2]

template<typename I , typename O >

cudaTask tf::cudaFlowCapturer::max_element	(	I	first,
		I	last,
		unsigned *	idx,
		O	op
	)

finds the index of the maximum element in a range

Template Parameters

I	input iterator type
O	comparator type

Parameters

first	iterator to the beginning of the range
last	iterator to the end of the range
idx	solution index of the maximum element
op	comparison function object

The function launches kernels asynchronously to find the largest element in the range [first, last) using the given comparator op. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
  return 0;
}
auto begin = first;
auto largest = first;
for (++first; first != last; ++first) {
  if (op(*largest, *first)) {
    largest = first;
  }
}
return std::distance(begin, largest);

◆ memcpy() [1/2]

void tf::cudaFlowCapturer::memcpy	(	cudaTask	task,
		void *	dst,
		const void *	src,
		size_t	count
	)

inline

updates a capture task to a memcpy operation

The method is similar to cudaFlowCapturer::memcpy but operates on an existing task.

◆ memcpy() [2/2]

cudaTask tf::cudaFlowCapturer::memcpy	(	void *	dst,
		const void *	src,
		size_t	count
	)

inline

copies data between host and device asynchronously through a stream

Parameters

dst	destination memory address
src	source memory address
count	size in bytes to copy

The method captures a cudaMemcpyAsync operation through an internal stream.

◆ memset() [1/2]

void tf::cudaFlowCapturer::memset	(	cudaTask	task,
		void *	ptr,
		int	value,
		size_t	n
	)

inline

updates a capture task to a memset operation

The method is similar to cudaFlowCapturer::memset but operates on an existing task.

◆ memset() [2/2]

cudaTask tf::cudaFlowCapturer::memset	(	void *	ptr,
		int	v,
		size_t	n
	)

inline

initializes or sets GPU memory to the given value byte by byte

Parameters

ptr	pointer to GPU memory
v	value to set for each byte of the specified memory
n	size in bytes to set

The method captures a cudaMemsetAsync operation through an internal stream to fill the first n bytes of the memory area pointed to by ptr with the constant byte value v.

◆ merge() [1/2]

template<typename A , typename B , typename C , typename Comp >

cudaTask tf::cudaFlowCapturer::merge	(	A	a_first,
		A	a_last,
		B	b_first,
		B	b_last,
		C	c_first,
		Comp	comp
	)

captures kernels that perform parallel merge on two sorted arrays

Template Parameters

A	iterator type of the first input array
B	iterator type of the second input array
C	iterator type of the output array
Comp	comparator type

Parameters

a_first	iterator to the beginning of the first input array
a_last	iterator to the end of the first input array
b_first	iterator to the beginning of the second input array
b_last	iterator to the end of the second input array
c_first	iterator to the beginning of the output array
comp	binary comparator

Returns: a tf::cudaTask handle

Merges two sorted ranges [a_first, a_last) and [b_first, b_last) into one sorted range beginning at c_first.

A sequence is said to be sorted with respect to a comparator comp if for any iterator it pointing to the sequence and any non-negative integer n such that it + n is a valid iterator pointing to an element of the sequence, comp(*(it + n), *it) evaluates to false.

◆ merge() [2/2]

template<typename A , typename B , typename C , typename Comp >

void tf::cudaFlowCapturer::merge	(	cudaTask	task,
		A	a_first,
		A	a_last,
		B	b_first,
		B	b_last,
		C	c_first,
		Comp	comp
	)

updates a capture task to a merge task

This method is similar to cudaFlowCapturer::merge but operates on an existing task.

◆ merge_by_key() [1/2]

template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >

cudaTask tf::cudaFlowCapturer::merge_by_key	(	a_keys_it	a_keys_first,
		a_keys_it	a_keys_last,
		a_vals_it	a_vals_first,
		b_keys_it	b_keys_first,
		b_keys_it	b_keys_last,
		b_vals_it	b_vals_first,
		c_keys_it	c_keys_first,
		c_vals_it	c_vals_first,
		C	comp
	)

captures kernels that perform parallel key-value merge

Template Parameters

a_keys_it	first key iterator type
a_vals_it	first value iterator type
b_keys_it	second key iterator type
b_vals_it	second value iterator type
c_keys_it	output key iterator type
c_vals_it	output value iterator type
C	comparator type

Parameters

a_keys_first	iterator to the beginning of the first key range
a_keys_last	iterator to the end of the first key range
a_vals_first	iterator to the beginning of the first value range
b_keys_first	iterator to the beginning of the second key range
b_keys_last	iterator to the end of the second key range
b_vals_first	iterator to the beginning of the second value range
c_keys_first	iterator to the beginning of the output key range
c_vals_first	iterator to the beginning of the output value range
comp	comparator

Performs a key-value merge that copies elements from [a_keys_first, a_keys_last) and [b_keys_first, b_keys_last) into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending key order.

At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first)) and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first)) into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first)) such that the resulting range is in ascending order implied by each input element's associated key.

For example, assume:

a_keys = {1, 8}
a_vals = {2, 1}
b_keys = {3, 7}
b_vals = {3, 4}

After the merge, we have:

c_keys = {1, 3, 7, 8}
c_vals = {2, 3, 4, 1}

◆ merge_by_key() [2/2]

template<typename a_keys_it , typename a_vals_it , typename b_keys_it , typename b_vals_it , typename c_keys_it , typename c_vals_it , typename C >

void tf::cudaFlowCapturer::merge_by_key	(	cudaTask	task,
		a_keys_it	a_keys_first,
		a_keys_it	a_keys_last,
		a_vals_it	a_vals_first,
		b_keys_it	b_keys_first,
		b_keys_it	b_keys_last,
		b_vals_it	b_vals_first,
		c_keys_it	c_keys_first,
		c_vals_it	c_vals_first,
		C	comp
	)

updates a capture task to a key-value merge task

This method is similar to tf::cudaFlowCapturer::merge_by_key but operates on an existing task.

◆ min_element() [1/2]

template<typename I , typename O >

void tf::cudaFlowCapturer::min_element	(	cudaTask	task,
		I	first,
		I	last,
		unsigned *	idx,
		O	op
	)

updates the parameters of a min-element task

This method is similar to cudaFlowCapturer::min_element but operates on an existing task.

◆ min_element() [2/2]

template<typename I , typename O >

cudaTask tf::cudaFlowCapturer::min_element	(	I	first,
		I	last,
		unsigned *	idx,
		O	op
	)

finds the index of the minimum element in a range

Template Parameters

I	input iterator type
O	comparator type

Parameters

first	iterator to the beginning of the range
last	iterator to the end of the range
idx	solution index of the minimum element
op	comparison function object

The function launches kernels asynchronously to find the smallest element in the range [first, last) using the given comparator op. The function is equivalent to a parallel execution of the following loop:

if(first == last) {
  return 0;
}
auto begin = first;
auto smallest = first;
for (++first; first != last; ++first) {
  if (op(*first, *smallest)) {
    smallest = first;
  }
}
return std::distance(begin, smallest);

◆ noop() [1/2]

cudaTask tf::cudaFlowCapturer::noop ( )

inline

captures a no-operation task

Returns: a tf::cudaTask handle

An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.

◆ noop() [2/2]

void tf::cudaFlowCapturer::noop ( cudaTask task )

inline

updates a task to a no-operation task

The method is similar to tf::cudaFlowCapturer::noop but operates on an existing task.

◆ offload_n()

void tf::cudaFlowCapturer::offload_n ( size_t n )

inline

offloads the captured cudaFlow and executes it the given number of times

Parameters

n	number of executions

◆ offload_until()

template<typename P >

void tf::cudaFlowCapturer::offload_until ( P && predicate )

offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true

Template Parameters

P	predicate type (a callable that returns a boolean)

Parameters

predicate	a predicate that returns a boolean (true to stop)

Immediately offloads the cudaFlow captured so far onto a GPU and repeatedly runs it until the predicate returns true.

By default, if users do not offload the cudaFlow capturer, the executor will offload it once.

◆ on() [1/2]

template<typename C , std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * >

cudaTask tf::cudaFlowCapturer::on ( C && callable )

captures a sequence of CUDA operations from the given callable

Template Parameters

C	callable type constructible with `std::function<void(cudaStream_t)>`

Parameters

callable a callable to capture CUDA operations with the stream

This method applies a stream created by the flow to capture a sequence of CUDA operations defined in the callable.

◆ on() [2/2]

template<typename C , std::enable_if_t< std::is_invocable_r_v< void, C, cudaStream_t >, void > * >

void tf::cudaFlowCapturer::on	(	cudaTask	task,
		C &&	callable
	)

updates a capture task to another sequence of CUDA operations

The method is similar to cudaFlowCapturer::on but operates on an existing task.

◆ reduce() [1/2]

template<typename I , typename T , typename C >

void tf::cudaFlowCapturer::reduce	(	cudaTask	task,
		I	first,
		I	last,
		T *	result,
		C	op
	)

updates a capture task to a reduction task

This method is similar to cudaFlowCapturer::reduce but operates on an existing task.

◆ reduce() [2/2]

template<typename I , typename T , typename C >

cudaTask tf::cudaFlowCapturer::reduce	(	I	first,
		I	last,
		T *	result,
		C	op
	)

captures kernels that perform parallel reduction over a range of items

Template Parameters

I	input iterator type
T	value type
C	binary operator type

Parameters

first	iterator to the beginning
last	iterator to the end
result	pointer to the result with an initialized value
op	binary reduction operator

Returns: a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *result = op(*result, *first++);
}

◆ single_task() [1/2]

template<typename C >

cudaTask tf::cudaFlowCapturer::single_task ( C c )

captures a kernel that runs the given callable with only one thread

Template Parameters

C	callable type

Parameters

c	callable to run by a single kernel thread

◆ single_task() [2/2]

template<typename C >

void tf::cudaFlowCapturer::single_task	(	cudaTask	task,
		C	c
	)

updates a capture task to a single-threaded kernel

This method is similar to cudaFlowCapturer::single_task but operates on an existing task.

◆ sort() [1/2]

template<typename I , typename C >

void tf::cudaFlowCapturer::sort	(	cudaTask	task,
		I	first,
		I	last,
		C	comp
	)

updates a capture task to a sort task

This method is similar to cudaFlowCapturer::sort but operates on an existing task.

◆ sort() [2/2]

template<typename I , typename C >

cudaTask tf::cudaFlowCapturer::sort	(	I	first,
		I	last,
		C	comp
	)

captures kernels that sort the given array

Template Parameters

I	iterator type of the input array
C	comparator type

Parameters

first	iterator to the beginning of the input array
last	iterator to the end of the input array
comp	binary comparator

Returns: a tf::cudaTask handle

Sorts elements in the range [first, last) with the given comparator.

◆ sort_by_key() [1/2]

template<typename K_it , typename V_it , typename C >

void tf::cudaFlowCapturer::sort_by_key	(	cudaTask	task,
		K_it	k_first,
		K_it	k_last,
		V_it	v_first,
		C	comp
	)

updates a capture task to a key-value sort task

This method is similar to tf::cudaFlowCapturer::sort_by_key but operates on an existing task.

◆ sort_by_key() [2/2]

template<typename K_it , typename V_it , typename C >

cudaTask tf::cudaFlowCapturer::sort_by_key	(	K_it	k_first,
		K_it	k_last,
		V_it	v_first,
		C	comp
	)

captures kernels that sort the given array

Template Parameters

K_it	iterator type of the key
V_it	iterator type of the value
C	comparator type

Parameters

k_first	iterator to the beginning of the key array
k_last	iterator to the end of the key array
v_first	iterator to the beginning of the value array
comp	binary comparator

Returns: a tf::cudaTask handle

Sorts key-value elements in [k_first, k_last) and [v_first, v_first + (k_last - k_first)) into ascending key order using the given comparator comp. If i and j are any two valid iterators in [k_first, k_last) such that i precedes j, and p and q are iterators in [v_first, v_first + (k_last - k_first)) corresponding to i and j respectively, then comp(*j, *i) evaluates to false.

For example, assume:

keys are {1, 4, 2, 8, 5, 7}
values are {'a', 'b', 'c', 'd', 'e', 'f'}

After sort:

keys are {1, 2, 4, 5, 7, 8}
values are {'a', 'c', 'b', 'e', 'f', 'd'}

◆ transform() [1/4]

template<typename I , typename O , typename C >

void tf::cudaFlowCapturer::transform	(	cudaTask	task,
		I	first,
		I	last,
		O	output,
		C	op
	)

updates a capture task to a transform kernel task

This method is similar to cudaFlowCapturer::transform but operates on an existing task.

◆ transform() [2/4]

template<typename I1 , typename I2 , typename O , typename C >

void tf::cudaFlowCapturer::transform	(	cudaTask	task,
		I1	first1,
		I1	last1,
		I2	first2,
		O	output,
		C	op
	)

updates a capture task to a transform kernel task

This method is similar to cudaFlowCapturer::transform but operates on an existing task.

◆ transform() [3/4]

template<typename I , typename O , typename C >

cudaTask tf::cudaFlowCapturer::transform	(	I	first,
		I	last,
		O	output,
		C	op
	)

captures a kernel that transforms an input range to an output range

Template Parameters

I	input iterator type
O	output iterator type
C	unary operator type

Parameters

first	iterator to the beginning of the input range
last	iterator to the end of the input range
output	iterator to the beginning of the output range
op	unary operator to apply to transform each item in the range

Returns: cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *output++ = op(*first++);
}

◆ transform() [4/4]

template<typename I1 , typename I2 , typename O , typename C >

cudaTask tf::cudaFlowCapturer::transform	(	I1	first1,
		I1	last1,
		I2	first2,
		O	output,
		C	op
	)

captures a kernel that transforms two input ranges to an output range

Template Parameters

I1	first input iterator type
I2	second input iterator type
O	output iterator type
C	binary operator type

Parameters

first1	iterator to the beginning of the first input range
last1	iterator to the end of the first input range
first2	iterator to the beginning of the second input range
output	iterator to the beginning of the output range
op	binary operator to apply to transform each pair of items in the two input ranges

Returns: cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first1 != last1) {
  *output++ = op(*first1++, *first2++);
}

◆ transform_exclusive_scan()

template<typename I , typename O , typename B , typename U >

void tf::cudaFlowCapturer::transform_exclusive_scan	(	cudaTask	task,
		I	first,
		I	last,
		O	output,
		B	bop,
		U	uop
	)

updates a capture task to a transform-exclusive scan task

This method is similar to cudaFlowCapturer::transform_exclusive_scan but operates on an existing task.

◆ transform_inclusive_scan() [1/2]

template<typename I , typename O , typename B , typename U >

void tf::cudaFlowCapturer::transform_inclusive_scan	(	cudaTask	task,
		I	first,
		I	last,
		O	output,
		B	bop,
		U	uop
	)

updates a capture task to a transform-inclusive scan task

This method is similar to cudaFlowCapturer::transform_inclusive_scan but operates on an existing task.

◆ transform_inclusive_scan() [2/2]

template<typename I , typename O , typename B , typename U >

cudaTask tf::cudaFlowCapturer::transform_inclusive_scan	(	I	first,
		I	last,
		O	output,
		B	bop,
		U	uop
	)

captures kernels that perform parallel inclusive scan over a range of transformed items

Template Parameters

I	input iterator type
O	output iterator type
B	binary operator type
U	unary operator type

Parameters

first	iterator to the beginning
last	iterator to the end
output	iterator to the beginning of the output
bop	binary operator
uop	unary operator

Returns: a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

for(size_t i=0; i<std::distance(first, last); i++) {
  *(output + i) = i ? bop(uop(*(first+i)), *(output+i-1)) : uop(*(first+i));
}

◆ transform_reduce() [1/2]

template<typename I , typename T , typename C , typename U >

void tf::cudaFlowCapturer::transform_reduce	(	cudaTask	task,
		I	first,
		I	last,
		T *	result,
		C	bop,
		U	uop
	)

updates a capture task to a transform-reduce task

This method is similar to cudaFlowCapturer::transform_reduce but operates on an existing task.

◆ transform_reduce() [2/2]

template<typename I , typename T , typename C , typename U >

cudaTask tf::cudaFlowCapturer::transform_reduce	(	I	first,
		I	last,
		T *	result,
		C	bop,
		U	uop
	)

captures kernels that perform parallel reduction over a range of transformed items

Template Parameters

I	input iterator type
T	value type
C	binary operator type
U	unary operator type

Parameters

first	iterator to the beginning
last	iterator to the end
result	pointer to the result with an initialized value
bop	binary reduce operator
uop	unary transform operator

Returns: a tf::cudaTask handle

This method is equivalent to the parallel execution of the following loop on a GPU:

while (first != last) {
  *result = bop(*result, uop(*first++));
}

◆ transform_uninitialized_reduce() [1/2]

template<typename I , typename T , typename C , typename U >

void tf::cudaFlowCapturer::transform_uninitialized_reduce	(	cudaTask	task,
		I	first,
		I	last,
		T *	result,
		C	bop,
		U	uop
	)

updates a capture task to a transform-reduce task without an initial value

This method is similar to cudaFlowCapturer::transform_uninitialized_reduce but operates on an existing task.

◆ transform_uninitialized_reduce() [2/2]

template<typename I , typename T , typename C , typename U >

cudaTask tf::cudaFlowCapturer::transform_uninitialized_reduce	(	I	first,
		I	last,
		T *	result,
		C	bop,
		U	uop
	)

similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = uop(*first++);  // the initial value of *result is not involved in the loop
while (first != last) {
  *result = bop(*result, uop(*first++));
}

◆ uninitialized_reduce() [1/2]

template<typename I , typename T , typename C >

void tf::cudaFlowCapturer::uninitialized_reduce	(	cudaTask	task,
		I	first,
		I	last,
		T *	result,
		C	op
	)

updates a capture task to an uninitialized-reduction task

This method is similar to cudaFlowCapturer::uninitialized_reduce but operates on an existing task.

◆ uninitialized_reduce() [2/2]

template<typename I , typename T , typename C >

cudaTask tf::cudaFlowCapturer::uninitialized_reduce	(	I	first,
		I	last,
		T *	result,
		C	op
	)

similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce

This method is equivalent to the parallel execution of the following loop on a GPU:

*result = *first++;  // the initial value of *result is not involved in the loop
while (first != last) {
  *result = op(*result, *first++);
}

The documentation for this class was generated from the following files:

Public Member Functions

Friends

Detailed Description

Constructor & Destructor Documentation

◆ cudaFlowCapturer()

Member Function Documentation

◆ copy() [1/2]

◆ copy() [2/2]

◆ exclusive_scan()

◆ find_if() [1/2]

◆ find_if() [2/2]

◆ for_each() [1/2]

◆ for_each() [2/2]

◆ for_each_index() [1/2]

◆ for_each_index() [2/2]

◆ inclusive_scan() [1/2]

◆ inclusive_scan() [2/2]

◆ kernel() [1/2]

◆ kernel() [2/2]

◆ make_optimizer()

◆ max_element() [1/2]

◆ max_element() [2/2]

◆ memcpy() [1/2]

◆ memcpy() [2/2]

◆ memset() [1/2]

◆ memset() [2/2]

◆ merge() [1/2]

◆ merge() [2/2]

◆ merge_by_key() [1/2]

◆ merge_by_key() [2/2]

◆ min_element() [1/2]

◆ min_element() [2/2]

◆ noop() [1/2]

◆ noop() [2/2]

◆ offload_n()

◆ offload_until()

◆ on() [1/2]

◆ on() [2/2]

◆ reduce() [1/2]

◆ reduce() [2/2]

◆ single_task() [1/2]

◆ single_task() [2/2]

◆ sort() [1/2]

◆ sort() [2/2]

◆ sort_by_key() [1/2]

◆ sort_by_key() [2/2]

◆ transform() [1/4]

◆ transform() [2/4]

◆ transform() [3/4]

◆ transform() [4/4]

◆ transform_exclusive_scan()

◆ transform_inclusive_scan() [1/2]

◆ transform_inclusive_scan() [2/2]

◆ transform_reduce() [1/2]

◆ transform_reduce() [2/2]

◆ transform_uninitialized_reduce() [1/2]

◆ transform_uninitialized_reduce() [2/2]

◆ uninitialized_reduce() [1/2]

◆ uninitialized_reduce() [2/2]