3#include "../taskflow.hpp"
147 template <
typename C>
156 template <
typename C>
173 template <
typename F,
typename... ArgsT>
183 template <
typename F,
typename... ArgsT>
185 cudaTask task, dim3 g, dim3 b,
size_t shm, F f, ArgsT&&... args
236 void memcpy(
cudaTask task,
void* tgt,
const void* src,
size_t bytes);
251 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>* =
nullptr
266 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>* =
nullptr
286 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>* =
nullptr
301 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>* =
nullptr
303 void fill(
cudaTask task, T* dst, T value,
size_t count);
319 template <
typename T,
333 template <
typename T,
336 void copy(
cudaTask task, T* tgt,
const T* src,
size_t num);
360 template <
typename P>
388 template <
typename C>
397 template <
typename C>
420 template <
typename I,
typename C>
430 template <
typename I,
typename C>
460 template <
typename I,
typename C>
470 template <
typename I,
typename C>
472 cudaTask task, I first, I last, I step, C callable
497 template <
typename I,
typename O,
typename C>
507 template <
typename I,
typename O,
typename C>
535 template <
typename I1,
typename I2,
typename O,
typename C>
545 template <
typename I1,
typename I2,
typename O,
typename C>
547 cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
572 template <
typename I,
typename T,
typename B>
582 template <
typename I,
typename T,
typename C>
599 template <
typename I,
typename T,
typename B>
609 template <
typename I,
typename T,
typename C>
611 cudaTask task, I first, I last, T* result, C op
638 template <
typename I,
typename T,
typename B,
typename U>
645 template <
typename I,
typename T,
typename B,
typename U>
662 template <
typename I,
typename T,
typename B,
typename U>
664 I first, I last, T* result, B bop, U uop
671 template <
typename I,
typename T,
typename B,
typename U>
673 cudaTask task, I first, I last, T* result, B bop, U uop
699 template <
typename I,
typename O,
typename C>
709 template <
typename I,
typename O,
typename C>
715 template <
typename I,
typename O,
typename C>
725 template <
typename I,
typename O,
typename C>
754 template <
typename I,
typename O,
typename B,
typename U>
764 template <
typename I,
typename O,
typename B,
typename U>
766 cudaTask task, I first, I last, O output, B bop, U uop
773 template <
typename I,
typename O,
typename B,
typename U>
783 template <
typename I,
typename O,
typename B,
typename U>
785 cudaTask task, I first, I last, O output, B bop, U uop
814 template <
typename A,
typename B,
typename C,
typename Comp>
815 cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp);
824 template <
typename A,
typename B,
typename C,
typename Comp>
826 cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp
844 template <
typename I,
typename C>
854 template <
typename I,
typename C>
887 template <
typename K_it,
typename V_it,
typename C>
897 template <
typename K_it,
typename V_it,
typename C>
899 cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp
946 typename a_keys_it,
typename a_vals_it,
947 typename b_keys_it,
typename b_vals_it,
948 typename c_keys_it,
typename c_vals_it,
952 a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
953 b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
954 c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
965 typename a_keys_it,
typename a_vals_it,
966 typename b_keys_it,
typename b_vals_it,
967 typename c_keys_it,
typename c_vals_it,
972 a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
973 b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
974 c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
1002 template <
typename I,
typename U>
1009 template <
typename I,
typename U>
1041 template <
typename I,
typename O>
1048 template <
typename I,
typename O>
1080 template <
typename I,
typename O>
1087 template <
typename I,
typename O>
1126 template <
typename C>
1137 template <
typename C>
1146 cudaGraphExec_t _executable {
nullptr};
1153 _handle {
std::in_place_type_t<External>{}},
1157 cudaGraphCreate(&_graph._native_handle, 0),
1158 "cudaFlow failed to create a native graph (external mode)"
1163inline cudaFlow::cudaFlow(cudaGraph& g) :
1164 _handle {
std::in_place_type_t<Internal>{}},
1167 assert(_graph._native_handle ==
nullptr);
1170 cudaGraphCreate(&_graph._native_handle, 0),
1171 "failed to create a native graph (internal mode)"
1178 cudaGraphExecDestroy(_executable);
1180 cudaGraphDestroy(_graph._native_handle);
1181 _graph._native_handle =
nullptr;
1189 cudaGraphExecDestroy(_executable),
"failed to destroy executable graph"
1191 _executable =
nullptr;
1195 cudaGraphDestroy(_graph._native_handle),
"failed to destroy native graph"
1199 cudaGraphCreate(&_graph._native_handle, 0),
"failed to create native graph"
1202 _graph._nodes.clear();
1207 return _graph._nodes.empty();
1212 return _graph._nodes.size();
1217 _graph.dump(os,
nullptr,
"");
1222 cuda_dump_graph(os, _graph._native_handle);
1232 auto node = _graph.emplace_back(
1237 cudaGraphAddEmptyNode(
1238 &node->_native_handle, _graph._native_handle,
nullptr, 0
1240 "failed to create a no-operation (empty) node"
1247template <
typename C>
1250 auto node = _graph.emplace_back(
1256 cudaHostNodeParams p;
1257 p.fn = cudaNode::Host::callback;
1261 cudaGraphAddHostNode(
1262 &node->_native_handle, _graph._native_handle,
nullptr, 0, &p
1264 "failed to create a host node"
1271template <
typename F,
typename... ArgsT>
1273 dim3 g, dim3 b,
size_t s, F f, ArgsT&&... args
1276 auto node = _graph.emplace_back(
1280 cudaKernelNodeParams p;
1281 void* arguments[
sizeof...(ArgsT)] = { (
void*)(&args)... };
1285 p.sharedMemBytes = s;
1286 p.kernelParams = arguments;
1290 cudaGraphAddKernelNode(
1291 &node->_native_handle, _graph._native_handle,
nullptr, 0, &p
1293 "failed to create a kernel task"
1301 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>*
1305 auto node = _graph.emplace_back(
1309 auto p = cuda_get_zero_parms(dst, count);
1312 cudaGraphAddMemsetNode(
1313 &node->_native_handle, _graph._native_handle,
nullptr, 0, &p
1315 "failed to create a memset (zero) task"
1323 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>*
1327 auto node = _graph.emplace_back(
1331 auto p = cuda_get_fill_parms(dst, value, count);
1334 cudaGraphAddMemsetNode(
1335 &node->_native_handle, _graph._native_handle,
nullptr, 0, &p
1337 "failed to create a memset (fill) task"
1350 auto node = _graph.emplace_back(
1354 auto p = cuda_get_copy_parms(tgt, src, num);
1357 cudaGraphAddMemcpyNode(
1358 &node->_native_handle, _graph._native_handle,
nullptr, 0, &p
1360 "failed to create a memcpy (copy) task"
1369 auto node = _graph.emplace_back(
1373 auto p = cuda_get_memset_parms(dst, ch, count);
1376 cudaGraphAddMemsetNode(
1377 &node->_native_handle, _graph._native_handle,
nullptr, 0, &p
1379 "failed to create a memset task"
1388 auto node = _graph.emplace_back(
1392 auto p = cuda_get_memcpy_parms(tgt, src, bytes);
1395 cudaGraphAddMemcpyNode(
1396 &node->_native_handle, _graph._native_handle,
nullptr, 0, &p
1398 "failed to create a memcpy task"
1409template <
typename C>
1413 TF_THROW(task,
" is not a host task");
1422template <
typename F,
typename... ArgsT>
1424 cudaTask task, dim3 g, dim3 b,
size_t s, F f, ArgsT&&... args
1428 TF_THROW(task,
" is not a kernel task");
1431 cudaKernelNodeParams p;
1433 void* arguments[
sizeof...(ArgsT)] = { (
void*)(&args)... };
1437 p.sharedMemBytes = s;
1438 p.kernelParams = arguments;
1442 cudaGraphExecKernelNodeSetParams(
1443 _executable, task._node->_native_handle, &p
1445 "failed to update kernel parameters on ", task
1457 TF_THROW(task,
" is not a memcpy task");
1460 auto p = cuda_get_copy_parms(tgt, src, num);
1463 cudaGraphExecMemcpyNodeSetParams(
1464 _executable, task._node->_native_handle, &p
1466 "failed to update memcpy parameters on ", task
1472 cudaTask task,
void* tgt,
const void* src,
size_t bytes
1476 TF_THROW(task,
" is not a memcpy task");
1479 auto p = cuda_get_memcpy_parms(tgt, src, bytes);
1482 cudaGraphExecMemcpyNodeSetParams(_executable, task._node->_native_handle, &p),
1483 "failed to update memcpy parameters on ", task
1492 TF_THROW(task,
" is not a memset task");
1495 auto p = cuda_get_memset_parms(dst, ch, count);
1498 cudaGraphExecMemsetNodeSetParams(
1499 _executable, task._node->_native_handle, &p
1501 "failed to update memset parameters on ", task
1507 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>*
1512 TF_THROW(task,
" is not a memset task");
1515 auto p = cuda_get_fill_parms(dst, value, count);
1518 cudaGraphExecMemsetNodeSetParams(
1519 _executable, task._node->_native_handle, &p
1521 "failed to update memset parameters on ", task
1527 is_pod_v<T> && (
sizeof(T)==1 ||
sizeof(T)==2 ||
sizeof(T)==4),
void>*
1532 TF_THROW(task,
" is not a memset task");
1535 auto p = cuda_get_zero_parms(dst, count);
1538 cudaGraphExecMemsetNodeSetParams(
1539 _executable, task._node->_native_handle, &p
1541 "failed to update memset parameters on ", task
1546template <
typename C>
1550 TF_THROW(task,
" is not a subflow task");
1556 node_handle->graph.clear();
1563 auto captured = capturer._capture();
1567 cudaGraphExecChildGraphNodeSetParams(
1568 _executable, task._node->_native_handle, captured
1570 "failed to update a captured child graph"
1573 TF_CHECK_CUDA(cudaGraphDestroy(captured),
"failed to destroy captured graph");
1581template <
typename C>
1585 auto node = _graph.emplace_back(
1591 node_handle->graph.clear();
1597 auto captured = capturer._capture();
1601 cudaGraphAddChildGraphNode(
1602 &node->_native_handle, _graph._native_handle,
nullptr, 0, captured
1604 "failed to add a cudaFlow capturer task"
1607 TF_CHECK_CUDA(cudaGraphDestroy(captured),
"failed to destroy captured graph");
1617template <
typename P>
1622 if(_executable ==
nullptr) {
1624 cudaGraphInstantiate(
1625 &_executable, _graph._native_handle,
nullptr,
nullptr, 0
1627 "failed to create an executable graph"
1635 while(!predicate()) {
1637 cudaGraphLaunch(_executable, s),
"failed to execute cudaFlow"
1645 _graph._state = cudaGraph::OFFLOADED;
1650 offload_until([repeat=n] ()
mutable {
return repeat-- == 0; });
1655 offload_until([repeat=1] ()
mutable {
return repeat-- == 0; });
1663template <
typename C,
typename D,
1667 auto n =
_graph._emplace_back(
1671 e._invoke_cudaflow_task_entry(p, c);
1679template <
typename C, std::enable_if_t<is_cudaflow_task_v<C>,
void>*>
1689template <
typename C, std::enable_if_t<is_cudaflow_task_v<C>,
void>*>
1690void Executor::_invoke_cudaflow_task_entry(Node* node, C&& c) {
1698 cudaGraph* g =
dynamic_cast<cudaGraph*
>(h->graph.get());
1708 if(!(g->_state & cudaGraph::OFFLOADED)) {
class to create an executor for running a taskflow graph
Definition executor.hpp:50
Task emplace(C &&callable)
creates a static task
Definition flow_builder.hpp:742
Graph & _graph
associated graph object
Definition flow_builder.hpp:727
Task emplace_on(C &&callable, D &&device)
creates a cudaFlow task on the given device
Definition cudaflow.hpp:1666
class to create a task handle over a node in a taskflow graph
Definition task.hpp:187
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
class to create a cudaFlow task dependency graph
Definition cudaflow.hpp:56
cudaTask host(C &&callable)
creates a host task that runs a callable on the host
Definition cudaflow.hpp:1248
cudaTask inclusive_scan(I first, I last, O output, C op)
creates a task to perform parallel inclusive scan over a range of items
Definition scan.hpp:619
cudaTask memset(void *dst, int v, size_t count)
creates a memset task that fills untyped data with a byte value
Definition cudaflow.hpp:1367
bool empty() const
queries the emptiness of the graph
Definition cudaflow.hpp:1206
~cudaFlow()
destroys the cudaFlow and its associated native CUDA graph and executable graph
Definition cudaflow.hpp:1176
cudaTask for_each(I first, I last, C callable)
applies a callable to each dereferenced element of the data array
Definition for_each.hpp:181
cudaTask transform_reduce(I first, I last, T *result, B bop, U uop)
performs parallel reduction over a range of transformed items
Definition reduce.hpp:596
cudaTask fill(T *dst, T value, size_t count)
creates a memset task that fills a typed memory block with a value
Definition cudaflow.hpp:1325
cudaTask noop()
creates a no-operation task
Definition cudaflow.hpp:1230
cudaTask for_each_index(I first, I last, I step, C callable)
applies a callable to each index in the range with the step size
Definition for_each.hpp:190
cudaTask uninitialized_reduce(I first, I last, T *result, B bop)
similar to tf::cudaFlow::reduce but does not assume any initial value to reduce
Definition reduce.hpp:587
cudaTask zero(T *dst, size_t count)
creates a memset task that sets a typed memory block to zero
Definition cudaflow.hpp:1303
void dump_native_graph(std::ostream &os) const
dumps the native CUDA graph into a DOT format through an output stream
Definition cudaflow.hpp:1221
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range that satisfies the given unary predicate
Definition find.hpp:193
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
creates a task to perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:655
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:340
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:465
void dump(std::ostream &os) const
dumps the cudaFlow graph into a DOT format through an output stream
Definition cudaflow.hpp:1216
void offload()
offloads the cudaFlow and executes it once
Definition cudaflow.hpp:1654
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlow::inclusive_scan but excludes the first value
Definition scan.hpp:637
cudaTask reduce(I first, I last, T *result, B bop)
performs parallel reduction over a range of items
Definition reduce.hpp:578
cudaTask transform_uninitialized_reduce(I first, I last, T *result, B bop, U uop)
similar to tf::cudaFlow::transform_reduce but does not assume any initial value to reduce
Definition reduce.hpp:605
cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp)
creates kernels that sort the given array
Definition sort.hpp:533
void offload_until(P &&predicate)
offloads the cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
Definition cudaflow.hpp:1618
cudaTask merge_by_key(a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
creates a task to perform parallel key-value merge
Definition merge.hpp:679
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
creates a kernel task
Definition cudaflow.hpp:1272
void clear()
clears the cudaFlow object
Definition cudaflow.hpp:1185
void offload_n(size_t N)
offloads the cudaFlow and executes it by the given times
Definition cudaflow.hpp:1649
cudaTask single_task(C c)
runs a callable with only a single kernel thread
Definition for_each.hpp:169
cudaTask memcpy(void *tgt, const void *src, size_t bytes)
creates a memcpy task that copies untyped data in bytes
Definition cudaflow.hpp:1386
cudaFlow()
constructs a standalone cudaFlow
Definition cudaflow.hpp:1152
cudaTask sort(I first, I last, C comp)
creates a task to perform a parallel sort on an array
Definition sort.hpp:515
size_t num_tasks() const
queries the number of tasks
Definition cudaflow.hpp:1211
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlow::transform_inclusive_scan but excludes the first value
Definition scan.hpp:677
cudaTask copy(T *tgt, const T *src, size_t num)
creates a memcpy task that copies typed data
Definition cudaflow.hpp:1348
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:139
cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
creates a task to perform parallel merge on two sorted arrays
Definition merge.hpp:652
class to create an RAII-styled context switch
Definition cuda_device.hpp:293
class to create a wrapper over a CUDA stream
Definition cuda_stream.hpp:174
void synchronize() const
synchronizes the associated stream
Definition cuda_stream.hpp:253
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
cudaTaskType type() const
queries the task type
Definition cuda_task.hpp:221
cudaFlow capturer include file
taskflow namespace
Definition small_vector.hpp:27
int cuda_get_device()
gets the current device associated with the caller thread
Definition cuda_device.hpp:24
@ KERNEL
kernel task type
@ MEMSET
memory set task type
@ SUBFLOW
subflow (child graph) task type
@ MEMCPY
memory copy task type