Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
cuda_capturer.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "cuda_task.hpp"
4#include "cuda_optimizer.hpp"
5
11namespace tf {
12
13// ----------------------------------------------------------------------------
14// class definition: cudaFlowCapturer
15// ----------------------------------------------------------------------------
16
58
59 friend class cudaFlow;
60 friend class Executor;
61
62 struct External {
63 cudaGraph graph;
64 };
65
66 struct Internal {
67 };
68
69 using handle_t = std::variant<External, Internal>;
70
71 using Optimizer = std::variant<
75 >;
76
77 public:
78
87
91 virtual ~cudaFlowCapturer();
92
96 bool empty() const;
97
101 size_t num_tasks() const;
102
106 void clear();
107
112 void dump(std::ostream& os) const;
113
134 template <typename OPT, typename... ArgsT>
135 OPT& make_optimizer(ArgsT&&... args);
136
137 // ------------------------------------------------------------------------
138 // basic methods
139 // ------------------------------------------------------------------------
140
150 template <typename C, std::enable_if_t<
152 >
153 cudaTask on(C&& callable);
154
161 template <typename C, std::enable_if_t<
163 >
164 void on(cudaTask task, C&& callable);
165
178 cudaTask noop();
179
186 void noop(cudaTask task);
187
198 cudaTask memcpy(void* dst, const void* src, size_t count);
199
206 void memcpy(cudaTask task, void* dst, const void* src, size_t count);
207
222 template <typename T,
224 >
225 cudaTask copy(T* tgt, const T* src, size_t num);
226
233 template <typename T,
235 >
236 void copy(cudaTask task, T* tgt, const T* src, size_t num);
237
249 cudaTask memset(void* ptr, int v, size_t n);
250
257 void memset(cudaTask task, void* ptr, int value, size_t n);
258
273 template <typename F, typename... ArgsT>
274 cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT&&... args);
275
282 template <typename F, typename... ArgsT>
283 void kernel(
284 cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
285 );
286
287 // ------------------------------------------------------------------------
288 // generic algorithms
289 // ------------------------------------------------------------------------
290
298 template <typename C>
300
307 template <typename C>
308 void single_task(cudaTask task, C c);
309
331 template <typename I, typename C>
332 cudaTask for_each(I first, I last, C callable);
333
340 template <typename I, typename C>
341 void for_each(cudaTask task, I first, I last, C callable);
342
371 template <typename I, typename C>
372 cudaTask for_each_index(I first, I last, I step, C callable);
373
380 template <typename I, typename C>
381 void for_each_index(
382 cudaTask task, I first, I last, I step, C callable
383 );
384
407 template <typename I, typename O, typename C>
408 cudaTask transform(I first, I last, O output, C op);
409
416 template <typename I, typename O, typename C>
417 void transform(cudaTask task, I first, I last, O output, C op);
418
444 template <typename I1, typename I2, typename O, typename C>
445 cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);
446
453 template <typename I1, typename I2, typename O, typename C>
454 void transform(
455 cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op
456 );
457
480 template <typename I, typename T, typename C>
481 cudaTask reduce(I first, I last, T* result, C op);
482
489 template <typename I, typename T, typename C>
490 void reduce(cudaTask task, I first, I last, T* result, C op);
491
506 template <typename I, typename T, typename C>
507 cudaTask uninitialized_reduce(I first, I last, T* result, C op);
508
515 template <typename I, typename T, typename C>
517 cudaTask task, I first, I last, T* result, C op
518 );
519
545 template <typename I, typename T, typename C, typename U>
546 cudaTask transform_reduce(I first, I last, T* result, C bop, U uop);
547
554 template <typename I, typename T, typename C, typename U>
555 void transform_reduce(
556 cudaTask task, I first, I last, T* result, C bop, U uop
557 );
558
573 template <typename I, typename T, typename C, typename U>
574 cudaTask transform_uninitialized_reduce(I first, I last, T* result, C bop, U uop);
575
582 template <typename I, typename T, typename C, typename U>
584 cudaTask task, I first, I last, T* result, C bop, U uop
585 );
586
610 template <typename I, typename O, typename C>
611 cudaTask inclusive_scan(I first, I last, O output, C op);
612
619 template <typename I, typename O, typename C>
620 void inclusive_scan(cudaTask task, I first, I last, O output, C op);
621
626 template <typename I, typename O, typename C>
627 cudaTask exclusive_scan(I first, I last, O output, C op);
628
635 template <typename I, typename O, typename C>
636 void exclusive_scan(cudaTask task, I first, I last, O output, C op);
637
664 template <typename I, typename O, typename B, typename U>
665 cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop);
666
673 template <typename I, typename O, typename B, typename U>
675 cudaTask task, I first, I last, O output, B bop, U uop
676 );
677
682 template <typename I, typename O, typename B, typename U>
683 cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop);
684
691 template <typename I, typename O, typename B, typename U>
693 cudaTask task, I first, I last, O output, B bop, U uop
694 );
695
722 template <typename A, typename B, typename C, typename Comp>
723 cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp);
724
731 template <typename A, typename B, typename C, typename Comp>
732 void merge(
733 cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp
734 );
735
779 template<
780 typename a_keys_it, typename a_vals_it,
781 typename b_keys_it, typename b_vals_it,
782 typename c_keys_it, typename c_vals_it,
783 typename C
784 >
786 a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
787 b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
788 c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
789 );
790
797 template<
798 typename a_keys_it, typename a_vals_it,
799 typename b_keys_it, typename b_vals_it,
800 typename c_keys_it, typename c_vals_it,
801 typename C
802 >
803 void merge_by_key(
804 cudaTask task,
805 a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
806 b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
807 c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
808 );
809
825 template <typename I, typename C>
826 cudaTask sort(I first, I last, C comp);
827
834 template <typename I, typename C>
835 void sort(cudaTask task, I first, I last, C comp);
836
867 template <typename K_it, typename V_it, typename C>
868 cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp);
869
876 template <typename K_it, typename V_it, typename C>
877 void sort_by_key(
878 cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp
879 );
880
906 template <typename I, typename U>
907 cudaTask find_if(I first, I last, unsigned* idx, U op);
908
915 template <typename I, typename U>
916 void find_if(cudaTask task, I first, I last, unsigned* idx, U op);
917
947 template <typename I, typename O>
948 cudaTask min_element(I first, I last, unsigned* idx, O op);
949
956 template <typename I, typename O>
957 void min_element(cudaTask task, I first, I last, unsigned* idx, O op);
958
988 template <typename I, typename O>
989 cudaTask max_element(I first, I last, unsigned* idx, O op);
990
997 template <typename I, typename O>
998 void max_element(cudaTask task, I first, I last, unsigned* idx, O op);
999
1000 // ------------------------------------------------------------------------
1001 // offload methods
1002 // ------------------------------------------------------------------------
1003
1018 template <typename P>
1019 void offload_until(P&& predicate);
1020
1026 void offload_n(size_t n);
1027
1031 void offload();
1032
1033 private:
1034
1035 handle_t _handle;
1036
1037 cudaGraph& _graph;
1038
1039 Optimizer _optimizer;
1040
1041 cudaGraphExec_t _executable {nullptr};
1042
1043 cudaFlowCapturer(cudaGraph&);
1044
1045 cudaGraph_t _capture();
1046
1047 void _destroy_executable();
1048
1049};
1050
1051// constructs a cudaFlow capturer from a taskflow
1052inline cudaFlowCapturer::cudaFlowCapturer(cudaGraph& g) :
1053 _handle {std::in_place_type_t<Internal>{}},
1054 _graph {g} {
1055}
1056
1057// constructs a standalone cudaFlow capturer
1058inline cudaFlowCapturer::cudaFlowCapturer() :
1059 _handle {std::in_place_type_t<External>{}},
1060 _graph {std::get_if<External>(&_handle)->graph} {
1061}
1062
1064
1065 if(_executable != nullptr) {
1066 cudaGraphExecDestroy(_executable);
1067 }
1068}
1069
1070// Function: empty
1071inline bool cudaFlowCapturer::empty() const {
1072 return _graph.empty();
1073}
1074
1075// Function: num_tasks
1076inline size_t cudaFlowCapturer::num_tasks() const {
1077 return _graph._nodes.size();
1078}
1079
1080// Procedure: clear
1082 _destroy_executable();
1083 _graph._nodes.clear();
1084}
1085
1086// Procedure: dump
1087inline void cudaFlowCapturer::dump(std::ostream& os) const {
1088 _graph.dump(os, nullptr, "");
1089}
1090
1091// Procedure: _destroy_executable
1092inline void cudaFlowCapturer::_destroy_executable() {
1093 if(_executable != nullptr) {
1094 TF_CHECK_CUDA(
1095 cudaGraphExecDestroy(_executable), "failed to destroy executable graph"
1096 );
1097 _executable = nullptr;
1098 }
1099}
1100
1101// Function: capture
1102template <typename C, std::enable_if_t<
1104>
1106 auto node = _graph.emplace_back(_graph,
1108 );
1109 return cudaTask(node);
1110}
1111
1112// Function: noop
1114 return on([](cudaStream_t){});
1115}
1116
1117// Function: noop
1119 on(task, [](cudaStream_t){});
1120}
1121
1122// Function: memcpy
1124 void* dst, const void* src, size_t count
1125) {
1126 return on([dst, src, count] (cudaStream_t stream) mutable {
1127 TF_CHECK_CUDA(
1128 cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
1129 "failed to capture memcpy"
1130 );
1131 });
1132}
1133
1134// Function: copy
1135template <typename T, std::enable_if_t<!std::is_same_v<T, void>, void>*>
1136cudaTask cudaFlowCapturer::copy(T* tgt, const T* src, size_t num) {
1137 return on([tgt, src, num] (cudaStream_t stream) mutable {
1138 TF_CHECK_CUDA(
1139 cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream),
1140 "failed to capture copy"
1141 );
1142 });
1143}
1144
1145// Function: memset
1146inline cudaTask cudaFlowCapturer::memset(void* ptr, int v, size_t n) {
1147 return on([ptr, v, n] (cudaStream_t stream) mutable {
1148 TF_CHECK_CUDA(
1149 cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset"
1150 );
1151 });
1152}
1153
1154// Function: kernel
1155template <typename F, typename... ArgsT>
1157 dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
1158) {
1159 return on([g, b, s, f, args...] (cudaStream_t stream) mutable {
1160 f<<<g, b, s, stream>>>(args...);
1161 });
1162}
1163
1164// Function: _capture
1165inline cudaGraph_t cudaFlowCapturer::_capture() {
1166 return std::visit(
1167 [this](auto&& opt){ return opt._optimize(_graph); }, _optimizer
1168 );
1169}
1170
1171// Procedure: offload_until
1172template <typename P>
1174
1175 // If the topology got changed, we need to destroy the executable
1176 // and create a new one
1177 if(_graph._state & cudaGraph::CHANGED) {
1178
1179 _destroy_executable();
1180
1181 auto g = _capture();
1182 TF_CHECK_CUDA(
1183 cudaGraphInstantiate(&_executable, g, nullptr, nullptr, 0),
1184 "failed to create an executable graph"
1185 );
1186
1187 //cuda_dump_graph(std::cout, g);
1188
1189 // TODO: store the native graph?
1190 TF_CHECK_CUDA(cudaGraphDestroy(g), "failed to destroy captured graph");
1191 }
1192 // if the graph is just updated (i.e., topology does not change),
1193 // we can skip part of the optimization and just update the executable
1194 // with the new captured graph
1195 else if(_graph._state & cudaGraph::UPDATED) {
1196
1197 // TODO: skip part of the optimization (e.g., levelization)
1198 auto g = _capture();
1199
1200 assert(_executable != nullptr);
1201
1202 cudaGraphNode_t error_node;
1203 cudaGraphExecUpdateResult error_result;
1204 cudaGraphExecUpdate(_executable, g, &error_node, &error_result);
1205
1206 if(error_result != cudaGraphExecUpdateSuccess) {
1207 _destroy_executable();
1208 TF_CHECK_CUDA(
1209 cudaGraphInstantiate(&_executable, g, nullptr, nullptr, 0),
1210 "failed to re-create an executable graph after updates fail"
1211 );
1212 }
1213 // TODO: store the native graph?
1214 TF_CHECK_CUDA(cudaGraphDestroy(g), "failed to destroy captured graph");
1215 }
1216
1217 // offload the executable
1218 if(_executable) {
1219 //cudaScopedPerThreadStream s;
1220 cudaStream s;
1221
1222 while(!predicate()) {
1223 TF_CHECK_CUDA(
1224 cudaGraphLaunch(_executable, s), "failed to launch the exec graph"
1225 );
1226
1227 s.synchronize();
1228
1229 //TF_CHECK_CUDA(cudaStreamSynchronize(s), "failed to synchronize stream");
1230 }
1231 }
1232
1233 _graph._state = cudaGraph::OFFLOADED;
1234}
1235
1236// Procedure: offload_n
1237inline void cudaFlowCapturer::offload_n(size_t n) {
1238 offload_until([repeat=n] () mutable { return repeat-- == 0; });
1239}
1240
1241// Procedure: offload
1243 offload_until([repeat=1] () mutable { return repeat-- == 0; });
1244}
1245
1246// Function: on
1247template <typename C, std::enable_if_t<
1249>
1250void cudaFlowCapturer::on(cudaTask task, C&& callable) {
1251
1252 if(task.type() != cudaTaskType::CAPTURE) {
1253 TF_THROW("invalid cudaTask type (must be CAPTURE)");
1254 }
1255
1256 _graph._state |= cudaGraph::UPDATED;
1257
1258 std::get_if<cudaNode::Capture>(&task._node->_handle)->work =
1259 std::forward<C>(callable);
1260}
1261
1262// Function: memcpy
1264 cudaTask task, void* dst, const void* src, size_t count
1265) {
1266 on(task, [dst, src, count](cudaStream_t stream) mutable {
1267 TF_CHECK_CUDA(
1268 cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
1269 "failed to capture memcpy"
1270 );
1271 });
1272}
1273
1274// Function: copy
1275template <typename T,
1277>
1279 cudaTask task, T* tgt, const T* src, size_t num
1280) {
1281 on(task, [tgt, src, num] (cudaStream_t stream) mutable {
1282 TF_CHECK_CUDA(
1283 cudaMemcpyAsync(tgt, src, sizeof(T)*num, cudaMemcpyDefault, stream),
1284 "failed to capture copy"
1285 );
1286 });
1287}
1288
1289// Function: memset
1291 cudaTask task, void* ptr, int v, size_t n
1292) {
1293 on(task, [ptr, v, n] (cudaStream_t stream) mutable {
1294 TF_CHECK_CUDA(
1295 cudaMemsetAsync(ptr, v, n, stream), "failed to capture memset"
1296 );
1297 });
1298}
1299
1300// Function: kernel
1301template <typename F, typename... ArgsT>
1303 cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT&&... args
1304) {
1305 on(task, [g, b, s, f, args...] (cudaStream_t stream) mutable {
1306 f<<<g, b, s, stream>>>(args...);
1307 });
1308}
1309
1310// Function: make_optimizer
1311template <typename OPT, typename ...ArgsT>
1313 return _optimizer.emplace<OPT>(std::forward<ArgsT>(args)...);
1314}
1315
1316} // end of namespace tf -----------------------------------------------------
1317
class to create an executor for running a taskflow graph
Definition executor.hpp:50
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask transform_reduce(I first, I last, T *result, C bop, U uop)
captures kernels that perform parallel reduction over a range of transformed items
Definition reduce.hpp:459
void clear()
clear this cudaFlow capturer
Definition cuda_capturer.hpp:1081
cudaTask for_each(I first, I last, C callable)
captures a kernel that applies a callable to each dereferenced element of the data array
Definition for_each.hpp:221
void offload_n(size_t n)
offloads the captured cudaFlow and executes it by the given times
Definition cuda_capturer.hpp:1237
cudaTask memset(void *ptr, int v, size_t n)
initializes or sets GPU memory to the given value byte by byte
Definition cuda_capturer.hpp:1146
cudaTask merge_by_key(a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
captures kernels that perform parallel key-value merge
Definition merge.hpp:766
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlowCapturer::inclusive_scan but excludes the first value
Definition scan.hpp:739
cudaTask reduce(I first, I last, T *result, C op)
captures kernels that perform parallel reduction over a range of items
Definition reduce.hpp:427
bool empty() const
queries the emptiness of the graph
Definition cuda_capturer.hpp:1071
cudaTask sort(I first, I last, C comp)
captures kernels that sort the given array
Definition sort.hpp:557
cudaTask transform_uninitialized_reduce(I first, I last, T *result, C bop, U uop)
similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce
Definition reduce.hpp:479
cudaTask noop()
captures a no-operation task
Definition cuda_capturer.hpp:1113
void offload()
offloads the captured cudaFlow and executes it once
Definition cuda_capturer.hpp:1242
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
captures kernels that perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:775
cudaTask inclusive_scan(I first, I last, O output, C op)
captures kernels that perform parallel inclusive scan over a range of items
Definition scan.hpp:703
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
captures a kernel
Definition cuda_capturer.hpp:1156
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range
Definition find.hpp:215
void offload_until(P &&predicate)
offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
Definition cuda_capturer.hpp:1173
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:300
void dump(std::ostream &os) const
dumps the capture graph into a DOT format through an output stream
Definition cuda_capturer.hpp:1087
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlowCapturer::transform_inclusive_scan but excludes the first value
Definition scan.hpp:817
cudaTask uninitialized_reduce(I first, I last, T *result, C op)
similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce
Definition reduce.hpp:443
cudaTask transform(I first, I last, O output, C op)
captures a kernel that transforms an input range to an output range
Definition transform.hpp:181
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaFlowCapturer()
constrcts a standalone cudaFlowCapturer
Definition cuda_capturer.hpp:1058
virtual ~cudaFlowCapturer()
destructs the cudaFlowCapturer
Definition cuda_capturer.hpp:1063
cudaTask copy(T *tgt, const T *src, size_t num)
captures a copy task of typed data
Definition cuda_capturer.hpp:1136
cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp)
captures kernels that sort the given array
Definition sort.hpp:593
cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
captures kernels that perform parallel merge on two sorted arrays
Definition merge.hpp:725
cudaTask single_task(C c)
capturers a kernel to runs the given callable with only one thread
Definition for_each.hpp:259
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:425
cudaTask on(C &&callable)
captures a sequential CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask memcpy(void *dst, const void *src, size_t count)
copies data between host and device asynchronously through a stream
Definition cuda_capturer.hpp:1123
size_t num_tasks() const
queries the number of tasks
Definition cuda_capturer.hpp:1076
cudaTask for_each_index(I first, I last, I step, C callable)
captures a kernel that applies a callable to each index in the range with the step size
Definition for_each.hpp:230
class to create a cudaFlow task dependency graph
Definition cudaflow.hpp:56
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to capture a CUDA graph using a round-robin algorithm
Definition cuda_optimizer.hpp:243
class to capture a CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:134
‍**
Definition cuda_stream.hpp:174
void synchronize() const
synchronizes the associated stream
Definition cuda_stream.hpp:253
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
cudaTaskType type() const
queries the task type
Definition cuda_task.hpp:221
cudaFlow capturing algorithms include file
cudaTask include file
T forward(T... args)
taskflow namespace
Definition small_vector.hpp:27
@ CAPTURE
capture task type