Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
cudaflow.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "../taskflow.hpp"
4#include "cuda_task.hpp"
5#include "cuda_capturer.hpp"
6
12namespace tf {
13
14// ----------------------------------------------------------------------------
15// class definition: cudaFlow
16// ----------------------------------------------------------------------------
17
56class cudaFlow {
57
58 friend class Executor;
59
60 struct External {
61 cudaGraph graph;
62 };
63
64 struct Internal {
65 };
66
67 using handle_t = std::variant<External, Internal>;
68
69 public:
70
78 cudaFlow();
79
84 ~cudaFlow();
85
89 bool empty() const;
90
94 size_t num_tasks() const;
95
99 void clear();
100
105 void dump(std::ostream& os) const;
106
114 void dump_native_graph(std::ostream& os) const;
115
116 // ------------------------------------------------------------------------
117 // Graph building routines
118 // ------------------------------------------------------------------------
119
132 cudaTask noop();
133
147 template <typename C>
148 cudaTask host(C&& callable);
149
156 template <typename C>
157 void host(cudaTask task, C&& callable);
158
173 template <typename F, typename... ArgsT>
174 cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT&&... args);
175
183 template <typename F, typename... ArgsT>
184 void kernel(
185 cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT&&... args
186 );
187
200 cudaTask memset(void* dst, int v, size_t count);
201
211 void memset(cudaTask task, void* dst, int ch, size_t count);
212
225 cudaTask memcpy(void* tgt, const void* src, size_t bytes);
226
236 void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes);
237
250 template <typename T, std::enable_if_t<
251 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
252 >
253 cudaTask zero(T* dst, size_t count);
254
265 template <typename T, std::enable_if_t<
266 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
267 >
268 void zero(cudaTask task, T* dst, size_t count);
269
285 template <typename T, std::enable_if_t<
286 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
287 >
288 cudaTask fill(T* dst, T value, size_t count);
289
300 template <typename T, std::enable_if_t<
301 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
302 >
303 void fill(cudaTask task, T* dst, T value, size_t count);
304
319 template <typename T,
321 >
322 cudaTask copy(T* tgt, const T* src, size_t num);
323
333 template <typename T,
335 >
336 void copy(cudaTask task, T* tgt, const T* src, size_t num);
337
338 // ------------------------------------------------------------------------
339 // offload methods
340 // ------------------------------------------------------------------------
341
360 template <typename P>
361 void offload_until(P&& predicate);
362
368 void offload_n(size_t N);
369
373 void offload();
374
375 // ------------------------------------------------------------------------
376 // generic algorithms
377 // ------------------------------------------------------------------------
378
388 template <typename C>
390
397 template <typename C>
398 void single_task(cudaTask task, C c);
399
420 template <typename I, typename C>
421 cudaTask for_each(I first, I last, C callable);
422
430 template <typename I, typename C>
431 void for_each(cudaTask task, I first, I last, C callable);
432
460 template <typename I, typename C>
461 cudaTask for_each_index(I first, I last, I step, C callable);
462
470 template <typename I, typename C>
471 void for_each_index(
472 cudaTask task, I first, I last, I step, C callable
473 );
474
497 template <typename I, typename O, typename C>
498 cudaTask transform(I first, I last, O output, C op);
499
507 template <typename I, typename O, typename C>
508 void transform(cudaTask task, I first, I last, O output, C c);
509
535 template <typename I1, typename I2, typename O, typename C>
536 cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);
537
545 template <typename I1, typename I2, typename O, typename C>
546 void transform(
547 cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
548 );
549
572 template <typename I, typename T, typename B>
573 cudaTask reduce(I first, I last, T* result, B bop);
574
582 template <typename I, typename T, typename C>
583 void reduce(cudaTask task, I first, I last, T* result, C op);
584
599 template <typename I, typename T, typename B>
600 cudaTask uninitialized_reduce(I first, I last, T* result, B bop);
601
609 template <typename I, typename T, typename C>
611 cudaTask task, I first, I last, T* result, C op
612 );
613
638 template <typename I, typename T, typename B, typename U>
639 cudaTask transform_reduce(I first, I last, T* result, B bop, U uop);
640
645 template <typename I, typename T, typename B, typename U>
646 void transform_reduce(cudaTask, I first, I last, T* result, B bop, U uop);
647
662 template <typename I, typename T, typename B, typename U>
664 I first, I last, T* result, B bop, U uop
665 );
666
671 template <typename I, typename T, typename B, typename U>
673 cudaTask task, I first, I last, T* result, B bop, U uop
674 );
675
699 template <typename I, typename O, typename C>
700 cudaTask inclusive_scan(I first, I last, O output, C op);
701
709 template <typename I, typename O, typename C>
710 void inclusive_scan(cudaTask task, I first, I last, O output, C op);
711
715 template <typename I, typename O, typename C>
716 cudaTask exclusive_scan(I first, I last, O output, C op);
717
725 template <typename I, typename O, typename C>
726 void exclusive_scan(cudaTask task, I first, I last, O output, C op);
727
754 template <typename I, typename O, typename B, typename U>
755 cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop);
756
764 template <typename I, typename O, typename B, typename U>
766 cudaTask task, I first, I last, O output, B bop, U uop
767 );
768
773 template <typename I, typename O, typename B, typename U>
774 cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop);
775
783 template <typename I, typename O, typename B, typename U>
785 cudaTask task, I first, I last, O output, B bop, U uop
786 );
787
814 template <typename A, typename B, typename C, typename Comp>
815 cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp);
816
824 template <typename A, typename B, typename C, typename Comp>
825 void merge(
826 cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp
827 );
828
844 template <typename I, typename C>
845 cudaTask sort(I first, I last, C comp);
846
854 template <typename I, typename C>
855 void sort(cudaTask task, I first, I last, C comp);
856
887 template <typename K_it, typename V_it, typename C>
888 cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp);
889
897 template <typename K_it, typename V_it, typename C>
898 void sort_by_key(
899 cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp
900 );
901
945 template<
946 typename a_keys_it, typename a_vals_it,
947 typename b_keys_it, typename b_vals_it,
948 typename c_keys_it, typename c_vals_it,
949 typename C
950 >
952 a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
953 b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
954 c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
955 );
956
964 template<
965 typename a_keys_it, typename a_vals_it,
966 typename b_keys_it, typename b_vals_it,
967 typename c_keys_it, typename c_vals_it,
968 typename C
969 >
970 void merge_by_key(
971 cudaTask task,
972 a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
973 b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
974 c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
975 );
976
1002 template <typename I, typename U>
1003 cudaTask find_if(I first, I last, unsigned* idx, U op);
1004
1009 template <typename I, typename U>
1010 void find_if(cudaTask task, I first, I last, unsigned* idx, U op);
1011
1041 template <typename I, typename O>
1042 cudaTask min_element(I first, I last, unsigned* idx, O op);
1043
1048 template <typename I, typename O>
1049 void min_element(cudaTask task, I first, I last, unsigned* idx, O op);
1050
1080 template <typename I, typename O>
1081 cudaTask max_element(I first, I last, unsigned* idx, O op);
1082
1087 template <typename I, typename O>
1088 void max_element(cudaTask task, I first, I last, unsigned* idx, O op);
1089
1090 // ------------------------------------------------------------------------
1091 // subflow
1092 // ------------------------------------------------------------------------
1093
1126 template <typename C>
1127 cudaTask capture(C&& callable);
1128
1137 template <typename C>
1138 void capture(cudaTask task, C callable);
1139
1140 private:
1141
1142 handle_t _handle;
1143
1144 cudaGraph& _graph;
1145
1146 cudaGraphExec_t _executable {nullptr};
1147
1148 cudaFlow(cudaGraph&);
1149};
1150
1151// Construct a standalone cudaFlow
1153 _handle {std::in_place_type_t<External>{}},
1154 _graph {std::get_if<External>(&_handle)->graph} {
1155
1156 TF_CHECK_CUDA(
1157 cudaGraphCreate(&_graph._native_handle, 0),
1158 "cudaFlow failed to create a native graph (external mode)"
1159 );
1160}
1161
1162// Construct the cudaFlow from executor (internal graph)
1163inline cudaFlow::cudaFlow(cudaGraph& g) :
1164 _handle {std::in_place_type_t<Internal>{}},
1165 _graph {g} {
1166
1167 assert(_graph._native_handle == nullptr);
1168
1169 TF_CHECK_CUDA(
1170 cudaGraphCreate(&_graph._native_handle, 0),
1171 "failed to create a native graph (internal mode)"
1172 );
1173}
1174
1175// Destructor
1177 if(_executable) {
1178 cudaGraphExecDestroy(_executable);
1179 }
1180 cudaGraphDestroy(_graph._native_handle);
1181 _graph._native_handle = nullptr;
1182}
1183
1184// Procedure: clear
1185inline void cudaFlow::clear() {
1186
1187 if(_executable) {
1188 TF_CHECK_CUDA(
1189 cudaGraphExecDestroy(_executable), "failed to destroy executable graph"
1190 );
1191 _executable = nullptr;
1192 }
1193
1194 TF_CHECK_CUDA(
1195 cudaGraphDestroy(_graph._native_handle), "failed to destroy native graph"
1196 );
1197
1198 TF_CHECK_CUDA(
1199 cudaGraphCreate(&_graph._native_handle, 0), "failed to create native graph"
1200 );
1201
1202 _graph._nodes.clear();
1203}
1204
1205// Function: empty
1206inline bool cudaFlow::empty() const {
1207 return _graph._nodes.empty();
1208}
1209
1210// Function: num_tasks
1211inline size_t cudaFlow::num_tasks() const {
1212 return _graph._nodes.size();
1213}
1214
1215// Procedure: dump
1216inline void cudaFlow::dump(std::ostream& os) const {
1217 _graph.dump(os, nullptr, "");
1218}
1219
1220// Procedure: dump
1222 cuda_dump_graph(os, _graph._native_handle);
1223}
1224
1225// ----------------------------------------------------------------------------
1226// Graph building methods
1227// ----------------------------------------------------------------------------
1228
1229// Function: noop
1231
1232 auto node = _graph.emplace_back(
1234 );
1235
1236 TF_CHECK_CUDA(
1237 cudaGraphAddEmptyNode(
1238 &node->_native_handle, _graph._native_handle, nullptr, 0
1239 ),
1240 "failed to create a no-operation (empty) node"
1241 );
1242
1243 return cudaTask(node);
1244}
1245
1246// Function: host
1247template <typename C>
1249
1250 auto node = _graph.emplace_back(
1252 );
1253
1254 auto h = std::get_if<cudaNode::Host>(&node->_handle);
1255
1256 cudaHostNodeParams p;
1257 p.fn = cudaNode::Host::callback;
1258 p.userData = h;
1259
1260 TF_CHECK_CUDA(
1261 cudaGraphAddHostNode(
1262 &node->_native_handle, _graph._native_handle, nullptr, 0, &p
1263 ),
1264 "failed to create a host node"
1265 );
1266
1267 return cudaTask(node);
1268}
1269
1270// Function: kernel
1271template <typename F, typename... ArgsT>
1273 dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
1274) {
1275
1276 auto node = _graph.emplace_back(
1277 _graph, std::in_place_type_t<cudaNode::Kernel>{}, (void*)f
1278 );
1279
1280 cudaKernelNodeParams p;
1281 void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
1282 p.func = (void*)f;
1283 p.gridDim = g;
1284 p.blockDim = b;
1285 p.sharedMemBytes = s;
1286 p.kernelParams = arguments;
1287 p.extra = nullptr;
1288
1289 TF_CHECK_CUDA(
1290 cudaGraphAddKernelNode(
1291 &node->_native_handle, _graph._native_handle, nullptr, 0, &p
1292 ),
1293 "failed to create a kernel task"
1294 );
1295
1296 return cudaTask(node);
1297}
1298
1299// Function: zero
1300template <typename T, std::enable_if_t<
1301 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
1302>
1303cudaTask cudaFlow::zero(T* dst, size_t count) {
1304
1305 auto node = _graph.emplace_back(
1307 );
1308
1309 auto p = cuda_get_zero_parms(dst, count);
1310
1311 TF_CHECK_CUDA(
1312 cudaGraphAddMemsetNode(
1313 &node->_native_handle, _graph._native_handle, nullptr, 0, &p
1314 ),
1315 "failed to create a memset (zero) task"
1316 );
1317
1318 return cudaTask(node);
1319}
1320
1321// Function: fill
1322template <typename T, std::enable_if_t<
1323 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
1324>
1325cudaTask cudaFlow::fill(T* dst, T value, size_t count) {
1326
1327 auto node = _graph.emplace_back(
1329 );
1330
1331 auto p = cuda_get_fill_parms(dst, value, count);
1332
1333 TF_CHECK_CUDA(
1334 cudaGraphAddMemsetNode(
1335 &node->_native_handle, _graph._native_handle, nullptr, 0, &p
1336 ),
1337 "failed to create a memset (fill) task"
1338 );
1339
1340 return cudaTask(node);
1341}
1342
1343// Function: copy
1344template <
1345 typename T,
1347>
1348cudaTask cudaFlow::copy(T* tgt, const T* src, size_t num) {
1349
1350 auto node = _graph.emplace_back(
1352 );
1353
1354 auto p = cuda_get_copy_parms(tgt, src, num);
1355
1356 TF_CHECK_CUDA(
1357 cudaGraphAddMemcpyNode(
1358 &node->_native_handle, _graph._native_handle, nullptr, 0, &p
1359 ),
1360 "failed to create a memcpy (copy) task"
1361 );
1362
1363 return cudaTask(node);
1364}
1365
1366// Function: memset
1367inline cudaTask cudaFlow::memset(void* dst, int ch, size_t count) {
1368
1369 auto node = _graph.emplace_back(
1371 );
1372
1373 auto p = cuda_get_memset_parms(dst, ch, count);
1374
1375 TF_CHECK_CUDA(
1376 cudaGraphAddMemsetNode(
1377 &node->_native_handle, _graph._native_handle, nullptr, 0, &p
1378 ),
1379 "failed to create a memset task"
1380 );
1381
1382 return cudaTask(node);
1383}
1384
1385// Function: memcpy
1386inline cudaTask cudaFlow::memcpy(void* tgt, const void* src, size_t bytes) {
1387
1388 auto node = _graph.emplace_back(
1390 );
1391
1392 auto p = cuda_get_memcpy_parms(tgt, src, bytes);
1393
1394 TF_CHECK_CUDA(
1395 cudaGraphAddMemcpyNode(
1396 &node->_native_handle, _graph._native_handle, nullptr, 0, &p
1397 ),
1398 "failed to create a memcpy task"
1399 );
1400
1401 return cudaTask(node);
1402}
1403
1404// ------------------------------------------------------------------------
1405// update methods
1406// ------------------------------------------------------------------------
1407
1408// Function: host
1409template <typename C>
1410void cudaFlow::host(cudaTask task, C&& c) {
1411
1412 if(task.type() != cudaTaskType::HOST) {
1413 TF_THROW(task, " is not a host task");
1414 }
1415
1416 auto h = std::get_if<cudaNode::Host>(&task._node->_handle);
1417
1418 h->func = std::forward<C>(c);
1419}
1420
1421// Function: update kernel parameters
1422template <typename F, typename... ArgsT>
1424 cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
1425) {
1426
1427 if(task.type() != cudaTaskType::KERNEL) {
1428 TF_THROW(task, " is not a kernel task");
1429 }
1430
1431 cudaKernelNodeParams p;
1432
1433 void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
1434 p.func = (void*)f;
1435 p.gridDim = g;
1436 p.blockDim = b;
1437 p.sharedMemBytes = s;
1438 p.kernelParams = arguments;
1439 p.extra = nullptr;
1440
1441 TF_CHECK_CUDA(
1442 cudaGraphExecKernelNodeSetParams(
1443 _executable, task._node->_native_handle, &p
1444 ),
1445 "failed to update kernel parameters on ", task
1446 );
1447}
1448
1449// Function: update copy parameters
1450template <
1451 typename T,
1453>
1454void cudaFlow::copy(cudaTask task, T* tgt, const T* src, size_t num) {
1455
1456 if(task.type() != cudaTaskType::MEMCPY) {
1457 TF_THROW(task, " is not a memcpy task");
1458 }
1459
1460 auto p = cuda_get_copy_parms(tgt, src, num);
1461
1462 TF_CHECK_CUDA(
1463 cudaGraphExecMemcpyNodeSetParams(
1464 _executable, task._node->_native_handle, &p
1465 ),
1466 "failed to update memcpy parameters on ", task
1467 );
1468}
1469
1470// Function: update memcpy parameters
1472 cudaTask task, void* tgt, const void* src, size_t bytes
1473) {
1474
1475 if(task.type() != cudaTaskType::MEMCPY) {
1476 TF_THROW(task, " is not a memcpy task");
1477 }
1478
1479 auto p = cuda_get_memcpy_parms(tgt, src, bytes);
1480
1481 TF_CHECK_CUDA(
1482 cudaGraphExecMemcpyNodeSetParams(_executable, task._node->_native_handle, &p),
1483 "failed to update memcpy parameters on ", task
1484 );
1485}
1486
1487// Procedure: memset
1488inline
1489void cudaFlow::memset(cudaTask task, void* dst, int ch, size_t count) {
1490
1491 if(task.type() != cudaTaskType::MEMSET) {
1492 TF_THROW(task, " is not a memset task");
1493 }
1494
1495 auto p = cuda_get_memset_parms(dst, ch, count);
1496
1497 TF_CHECK_CUDA(
1498 cudaGraphExecMemsetNodeSetParams(
1499 _executable, task._node->_native_handle, &p
1500 ),
1501 "failed to update memset parameters on ", task
1502 );
1503}
1504
1505// Procedure: fill
1506template <typename T, std::enable_if_t<
1507 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
1508>
1509void cudaFlow::fill(cudaTask task, T* dst, T value, size_t count) {
1510
1511 if(task.type() != cudaTaskType::MEMSET) {
1512 TF_THROW(task, " is not a memset task");
1513 }
1514
1515 auto p = cuda_get_fill_parms(dst, value, count);
1516
1517 TF_CHECK_CUDA(
1518 cudaGraphExecMemsetNodeSetParams(
1519 _executable, task._node->_native_handle, &p
1520 ),
1521 "failed to update memset parameters on ", task
1522 );
1523}
1524
1525// Procedure: zero
1526template <typename T, std::enable_if_t<
1527 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
1528>
1529void cudaFlow::zero(cudaTask task, T* dst, size_t count) {
1530
1531 if(task.type() != cudaTaskType::MEMSET) {
1532 TF_THROW(task, " is not a memset task");
1533 }
1534
1535 auto p = cuda_get_zero_parms(dst, count);
1536
1537 TF_CHECK_CUDA(
1538 cudaGraphExecMemsetNodeSetParams(
1539 _executable, task._node->_native_handle, &p
1540 ),
1541 "failed to update memset parameters on ", task
1542 );
1543}
1544
1545// Function: capture
1546template <typename C>
1548
1549 if(task.type() != cudaTaskType::SUBFLOW) {
1550 TF_THROW(task, " is not a subflow task");
1551 }
1552
1553 // insert a subflow node
1554 // construct a captured flow from the callable
1555 auto node_handle = std::get_if<cudaNode::Subflow>(&task._node->_handle);
1556 node_handle->graph.clear();
1557
1558 cudaFlowCapturer capturer(node_handle->graph);
1559
1560 c(capturer);
1561
1562 // obtain the optimized captured graph
1563 auto captured = capturer._capture();
1564 //cuda_dump_graph(std::cout, captured);
1565
1566 TF_CHECK_CUDA(
1567 cudaGraphExecChildGraphNodeSetParams(
1568 _executable, task._node->_native_handle, captured
1569 ),
1570 "failed to update a captured child graph"
1571 );
1572
1573 TF_CHECK_CUDA(cudaGraphDestroy(captured), "failed to destroy captured graph");
1574}
1575
1576// ----------------------------------------------------------------------------
1577// captured flow
1578// ----------------------------------------------------------------------------
1579
1580// Function: capture
1581template <typename C>
1583
1584 // insert a subflow node
1585 auto node = _graph.emplace_back(
1587 );
1588
1589 // construct a captured flow from the callable
1590 auto node_handle = std::get_if<cudaNode::Subflow>(&node->_handle);
1591 node_handle->graph.clear();
1592 cudaFlowCapturer capturer(node_handle->graph);
1593
1594 c(capturer);
1595
1596 // obtain the optimized captured graph
1597 auto captured = capturer._capture();
1598 //cuda_dump_graph(std::cout, captured);
1599
1600 TF_CHECK_CUDA(
1601 cudaGraphAddChildGraphNode(
1602 &node->_native_handle, _graph._native_handle, nullptr, 0, captured
1603 ),
1604 "failed to add a cudaFlow capturer task"
1605 );
1606
1607 TF_CHECK_CUDA(cudaGraphDestroy(captured), "failed to destroy captured graph");
1608
1609 return cudaTask(node);
1610}
1611
1612// ----------------------------------------------------------------------------
1613// Offload methods
1614// ----------------------------------------------------------------------------
1615
1616// Procedure: offload_until
1617template <typename P>
1618void cudaFlow::offload_until(P&& predicate) {
1619
1620 // transforms cudaFlow to a native cudaGraph under the specified device
1621 // and launches the graph through a given or an internal device stream
1622 if(_executable == nullptr) {
1623 TF_CHECK_CUDA(
1624 cudaGraphInstantiate(
1625 &_executable, _graph._native_handle, nullptr, nullptr, 0
1626 ),
1627 "failed to create an executable graph"
1628 );
1629 //cuda_dump_graph(std::cout, cf._graph._native_handle);
1630 }
1631
1632 //cudaScopedPerThreadStream s;
1633 cudaStream s;
1634
1635 while(!predicate()) {
1636 TF_CHECK_CUDA(
1637 cudaGraphLaunch(_executable, s), "failed to execute cudaFlow"
1638 );
1639 s.synchronize();
1640 //TF_CHECK_CUDA(
1641 // cudaStreamSynchronize(s), "failed to synchronize cudaFlow execution"
1642 //);
1643 }
1644
1645 _graph._state = cudaGraph::OFFLOADED;
1646}
1647
1648// Procedure: offload_n
1649inline void cudaFlow::offload_n(size_t n) {
1650 offload_until([repeat=n] () mutable { return repeat-- == 0; });
1651}
1652
1653// Procedure: offload
1654inline void cudaFlow::offload() {
1655 offload_until([repeat=1] () mutable { return repeat-- == 0; });
1656}
1657
1658// ############################################################################
1659// Forward declaration: FlowBuilder
1660// ############################################################################
1661
1662// FlowBuilder::emplace_on
1663template <typename C, typename D,
1665>
1667 auto n = _graph._emplace_back(
1669 [c=std::forward<C>(c), d=std::forward<D>(d)] (Executor& e, Node* p) mutable {
1670 cudaScopedDevice ctx(d);
1671 e._invoke_cudaflow_task_entry(p, c);
1672 },
1674 );
1675 return Task(n);
1676}
1677
1678// FlowBuilder::emplace
1679template <typename C, std::enable_if_t<is_cudaflow_task_v<C>, void>*>
1682}
1683
1684// ############################################################################
1685// Forward declaration: Executor
1686// ############################################################################
1687
1688// Procedure: _invoke_cudaflow_task_entry
1689template <typename C, std::enable_if_t<is_cudaflow_task_v<C>, void>*>
1690void Executor::_invoke_cudaflow_task_entry(Node* node, C&& c) {
1691
1692 using T = std::conditional_t<
1693 std::is_invocable_r_v<void, C, cudaFlow&>, cudaFlow, cudaFlowCapturer
1694 >;
1695
1696 auto h = std::get_if<Node::cudaFlow>(&node->_handle);
1697
1698 cudaGraph* g = dynamic_cast<cudaGraph*>(h->graph.get());
1699
1700 g->clear();
1701
1702 T cf(*g);
1703
1704 c(cf);
1705
1706 // TODO: change it to _graph.state
1707 //if(cf._executable == nullptr) {
1708 if(!(g->_state & cudaGraph::OFFLOADED)) {
1709 cf.offload();
1710 }
1711}
1712
1713/*// Procedure: _invoke_cudaflow_task_entry (cudaFlow)
1714template <typename C,
1715 std::enable_if_t<std::is_invocable_r_v<void, C, cudaFlow&>, void>*
1716>
1717void Executor::_invoke_cudaflow_task_entry(Node* node, C&& c) {
1718
1719 auto h = std::get_if<Node::cudaFlow>(&node->_handle);
1720
1721 cudaGraph* g = dynamic_cast<cudaGraph*>(h->graph.get());
1722
1723 g->clear();
1724
1725 cudaFlow cf(*g);
1726
1727 c(cf);
1728
1729 if(cf._executable == nullptr) {
1730 cf.offload();
1731 }
1732}
1733
1734// Procedure: _invoke_cudaflow_task_entry (cudaFlowCapturer)
1735template <typename C,
1736 std::enable_if_t<std::is_invocable_r_v<void, C, cudaFlowCapturer&>, void>*
1737>
1738void Executor::_invoke_cudaflow_task_entry(Node* node, C&& c) {
1739
1740 auto h = std::get_if<Node::cudaFlow>(&node->_handle);
1741
1742 cudaGraph* g = dynamic_cast<cudaGraph*>(h->graph.get());
1743
1744 g->clear();
1745
1746 cudaFlowCapturer fc(*g);
1747
1748 c(fc);
1749
1750 if(fc._executable == nullptr) {
1751 fc.offload();
1752 }
1753}*/
1754
1755
1756} // end of namespace tf -----------------------------------------------------
1757
1758
class to create an executor for running a taskflow graph
Definition executor.hpp:50
Task emplace(C &&callable)
creates a static task
Definition flow_builder.hpp:742
Graph & _graph
associated graph object
Definition flow_builder.hpp:727
Task emplace_on(C &&callable, D &&device)
creates a cudaFlow task on the given device
Definition cudaflow.hpp:1666
class to create a task handle over a node in a taskflow graph
Definition task.hpp:187
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
class to create a cudaFlow task dependency graph
Definition cudaflow.hpp:56
cudaTask host(C &&callable)
creates a host task that runs a callable on the host
Definition cudaflow.hpp:1248
cudaTask inclusive_scan(I first, I last, O output, C op)
creates a task to perform parallel inclusive scan over a range of items
Definition scan.hpp:619
cudaTask memset(void *dst, int v, size_t count)
creates a memset task that fills untyped data with a byte value
Definition cudaflow.hpp:1367
bool empty() const
queries the emptiness of the graph
Definition cudaflow.hpp:1206
~cudaFlow()
destroys the cudaFlow and its associated native CUDA graph and executable graph
Definition cudaflow.hpp:1176
cudaTask for_each(I first, I last, C callable)
applies a callable to each dereferenced element of the data array
Definition for_each.hpp:181
cudaTask transform_reduce(I first, I last, T *result, B bop, U uop)
performs parallel reduction over a range of transformed items
Definition reduce.hpp:596
cudaTask fill(T *dst, T value, size_t count)
creates a memset task that fills a typed memory block with a value
Definition cudaflow.hpp:1325
cudaTask noop()
creates a no-operation task
Definition cudaflow.hpp:1230
cudaTask for_each_index(I first, I last, I step, C callable)
applies a callable to each index in the range with the step size
Definition for_each.hpp:190
cudaTask uninitialized_reduce(I first, I last, T *result, B bop)
similar to tf::cudaFlow::reduce but does not assume any initial value to reduce
Definition reduce.hpp:587
cudaTask zero(T *dst, size_t count)
creates a memset task that sets a typed memory block to zero
Definition cudaflow.hpp:1303
void dump_native_graph(std::ostream &os) const
dumps the native CUDA graph into a DOT format through an output stream
Definition cudaflow.hpp:1221
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range
Definition find.hpp:193
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
creates a task to perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:655
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:340
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:465
void dump(std::ostream &os) const
dumps the cudaFlow graph into a DOT format through an output stream
Definition cudaflow.hpp:1216
void offload()
offloads the cudaFlow and executes it once
Definition cudaflow.hpp:1654
cudaTask capture(C &&callable)
constructs a subflow graph through tf::cudaFlowCapturer
Definition cudaflow.hpp:1582
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlow::inclusive_scan but excludes the first value
Definition scan.hpp:637
cudaTask reduce(I first, I last, T *result, B bop)
performs parallel reduction over a range of items
Definition reduce.hpp:578
cudaTask transform_uninitialized_reduce(I first, I last, T *result, B bop, U uop)
similar to tf::cudaFlow::transform_reduce but does not assume any initial value to reduce
Definition reduce.hpp:605
cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp)
creates kernels that sort the given array
Definition sort.hpp:533
void offload_until(P &&predicate)
offloads the cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
Definition cudaflow.hpp:1618
cudaTask merge_by_key(a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
creates a task to perform parallel key-value merge
Definition merge.hpp:679
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
creates a kernel task
Definition cudaflow.hpp:1272
void clear()
clears the cudaFlow object
Definition cudaflow.hpp:1185
void offload_n(size_t N)
offloads the cudaFlow and executes it by the given times
Definition cudaflow.hpp:1649
cudaTask single_task(C c)
runs a callable with only a single kernel thread
Definition for_each.hpp:169
cudaTask memcpy(void *tgt, const void *src, size_t bytes)
creates a memcpy task that copies untyped data in bytes
Definition cudaflow.hpp:1386
cudaFlow()
constructs a standalone cudaFlow
Definition cudaflow.hpp:1152
cudaTask sort(I first, I last, C comp)
creates a task to perform parallel sort an array
Definition sort.hpp:515
size_t num_tasks() const
queries the number of tasks
Definition cudaflow.hpp:1211
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlow::transform_inclusive_scan but excludes the first value
Definition scan.hpp:677
cudaTask copy(T *tgt, const T *src, size_t num)
creates a memcpy task that copies typed data
Definition cudaflow.hpp:1348
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:139
cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
creates a task to perform parallel merge on two sorted arrays
Definition merge.hpp:652
class to create an RAII-styled context switch
Definition cuda_device.hpp:293
class to create an RAII-styled wrapper over a native CUDA stream
Definition cuda_stream.hpp:174
void synchronize() const
synchronizes the associated stream
Definition cuda_stream.hpp:253
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
cudaTaskType type() const
queries the task type
Definition cuda_task.hpp:221
cudaFlow capturer include file
cudaTask include file
T forward(T... args)
taskflow namespace
Definition small_vector.hpp:27
int cuda_get_device()
gets the current device associated with the caller thread
Definition cuda_device.hpp:24
@ KERNEL
kernel task type
@ MEMSET
memory set task type
@ SUBFLOW
subflow (child graph) task type
@ HOST
host task type
@ MEMCPY
memory copy task type