134 template <
typename OPT,
typename... ArgsT>
206 void memcpy(
cudaTask task,
void* dst,
const void* src,
size_t count);
222 template <
typename T,
233 template <
typename T,
236 void copy(
cudaTask task, T* tgt,
const T* src,
size_t num);
273 template <
typename F,
typename... ArgsT>
282 template <
typename F,
typename... ArgsT>
284 cudaTask task, dim3 g, dim3 b,
size_t s, F f, ArgsT&&... args
298 template <
typename C>
307 template <
typename C>
331 template <
typename I,
typename C>
340 template <
typename I,
typename C>
371 template <
typename I,
typename C>
380 template <
typename I,
typename C>
382 cudaTask task, I first, I last, I step, C callable
407 template <
typename I,
typename O,
typename C>
416 template <
typename I,
typename O,
typename C>
444 template <
typename I1,
typename I2,
typename O,
typename C>
453 template <
typename I1,
typename I2,
typename O,
typename C>
455 cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op
480 template <
typename I,
typename T,
typename C>
489 template <
typename I,
typename T,
typename C>
506 template <
typename I,
typename T,
typename C>
515 template <
typename I,
typename T,
typename C>
517 cudaTask task, I first, I last, T* result, C op
545 template <
typename I,
typename T,
typename C,
typename U>
554 template <
typename I,
typename T,
typename C,
typename U>
556 cudaTask task, I first, I last, T* result, C bop, U uop
573 template <
typename I,
typename T,
typename C,
typename U>
582 template <
typename I,
typename T,
typename C,
typename U>
584 cudaTask task, I first, I last, T* result, C bop, U uop
610 template <
typename I,
typename O,
typename C>
619 template <
typename I,
typename O,
typename C>
626 template <
typename I,
typename O,
typename C>
635 template <
typename I,
typename O,
typename C>
664 template <
typename I,
typename O,
typename B,
typename U>
673 template <
typename I,
typename O,
typename B,
typename U>
675 cudaTask task, I first, I last, O output, B bop, U uop
682 template <
typename I,
typename O,
typename B,
typename U>
691 template <
typename I,
typename O,
typename B,
typename U>
693 cudaTask task, I first, I last, O output, B bop, U uop
722 template <
typename A,
typename B,
typename C,
typename Comp>
723 cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp);
731 template <
typename A,
typename B,
typename C,
typename Comp>
733 cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp
780 typename a_keys_it,
typename a_vals_it,
781 typename b_keys_it,
typename b_vals_it,
782 typename c_keys_it,
typename c_vals_it,
786 a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
787 b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
788 c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
798 typename a_keys_it,
typename a_vals_it,
799 typename b_keys_it,
typename b_vals_it,
800 typename c_keys_it,
typename c_vals_it,
805 a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first,
806 b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first,
807 c_keys_it c_keys_first, c_vals_it c_vals_first, C comp
825 template <
typename I,
typename C>
834 template <
typename I,
typename C>
867 template <
typename K_it,
typename V_it,
typename C>
876 template <
typename K_it,
typename V_it,
typename C>
878 cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp
906 template <
typename I,
typename U>
915 template <
typename I,
typename U>
947 template <
typename I,
typename O>
956 template <
typename I,
typename O>
988 template <
typename I,
typename O>
997 template <
typename I,
typename O>
1018 template <
typename P>
1039 Optimizer _optimizer;
1041 cudaGraphExec_t _executable {
nullptr};
1045 cudaGraph_t _capture();
1047 void _destroy_executable();
1053 _handle {
std::in_place_type_t<Internal>{}},
1058inline cudaFlowCapturer::cudaFlowCapturer() :
1059 _handle {
std::in_place_type_t<External>{}},
1065 if(_executable !=
nullptr) {
1066 cudaGraphExecDestroy(_executable);
1072 return _graph.empty();
1077 return _graph._nodes.size();
1082 _destroy_executable();
1083 _graph._nodes.clear();
1088 _graph.dump(os,
nullptr,
"");
1092inline void cudaFlowCapturer::_destroy_executable() {
1093 if(_executable !=
nullptr) {
1095 cudaGraphExecDestroy(_executable),
"failed to destroy executable graph"
1097 _executable =
nullptr;
1106 auto node = _graph.emplace_back(_graph,
1114 return on([](cudaStream_t){});
1119 on(task, [](cudaStream_t){});
1124 void* dst,
const void* src,
size_t count
1126 return on([dst, src, count] (cudaStream_t stream)
mutable {
1128 cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
1129 "failed to capture memcpy"
1135template <
typename T, std::enable_if_t<!std::is_same_v<T,
void>,
void>*>
1137 return on([tgt, src, num] (cudaStream_t stream)
mutable {
1139 cudaMemcpyAsync(tgt, src,
sizeof(T)*num, cudaMemcpyDefault, stream),
1140 "failed to capture copy"
1147 return on([ptr, v, n] (cudaStream_t stream)
mutable {
1149 cudaMemsetAsync(ptr, v, n, stream),
"failed to capture memset"
1155template <
typename F,
typename... ArgsT>
1157 dim3 g, dim3 b,
size_t s, F f, ArgsT&&... args
1159 return on([g, b, s, f, args...] (cudaStream_t stream)
mutable {
1160 f<<<g, b, s, stream>>>(args...);
1165inline cudaGraph_t cudaFlowCapturer::_capture() {
1167 [
this](
auto&& opt){
return opt._optimize(_graph); }, _optimizer
1172template <
typename P>
1177 if(_graph._state & cudaGraph::CHANGED) {
1179 _destroy_executable();
1181 auto g = _capture();
1183 cudaGraphInstantiate(&_executable, g,
nullptr,
nullptr, 0),
1184 "failed to create an executable graph"
1190 TF_CHECK_CUDA(cudaGraphDestroy(g),
"failed to destroy captured graph");
1195 else if(_graph._state & cudaGraph::UPDATED) {
1198 auto g = _capture();
1200 assert(_executable !=
nullptr);
1202 cudaGraphNode_t error_node;
1203 cudaGraphExecUpdateResult error_result;
1204 cudaGraphExecUpdate(_executable, g, &error_node, &error_result);
1206 if(error_result != cudaGraphExecUpdateSuccess) {
1207 _destroy_executable();
1209 cudaGraphInstantiate(&_executable, g,
nullptr,
nullptr, 0),
1210 "failed to re-create an executable graph after updates fail"
1214 TF_CHECK_CUDA(cudaGraphDestroy(g),
"failed to destroy captured graph");
1222 while(!predicate()) {
1224 cudaGraphLaunch(_executable, s),
"failed to launch the exec graph"
1233 _graph._state = cudaGraph::OFFLOADED;
1238 offload_until([repeat=n] ()
mutable {
return repeat-- == 0; });
1243 offload_until([repeat=1] ()
mutable {
return repeat-- == 0; });
1253 TF_THROW(
"invalid cudaTask type (must be CAPTURE)");
1256 _graph._state |= cudaGraph::UPDATED;
1264 cudaTask task,
void* dst,
const void* src,
size_t count
1266 on(task, [dst, src, count](cudaStream_t stream)
mutable {
1268 cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
1269 "failed to capture memcpy"
1275template <
typename T,
1279 cudaTask task, T* tgt,
const T* src,
size_t num
1281 on(task, [tgt, src, num] (cudaStream_t stream)
mutable {
1283 cudaMemcpyAsync(tgt, src,
sizeof(T)*num, cudaMemcpyDefault, stream),
1284 "failed to capture copy"
1291 cudaTask task,
void* ptr,
int v,
size_t n
1293 on(task, [ptr, v, n] (cudaStream_t stream)
mutable {
1295 cudaMemsetAsync(ptr, v, n, stream),
"failed to capture memset"
1301template <
typename F,
typename... ArgsT>
1303 cudaTask task, dim3 g, dim3 b,
size_t s, F f, ArgsT&&... args
1305 on(task, [g, b, s, f, args...] (cudaStream_t stream)
mutable {
1306 f<<<g, b, s, stream>>>(args...);
1311template <
typename OPT,
typename ...ArgsT>
class to create an executor for running a taskflow graph
Definition executor.hpp:50
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
cudaTask transform_reduce(I first, I last, T *result, C bop, U uop)
captures kernels that perform parallel reduction over a range of transformed items
Definition reduce.hpp:459
void clear()
clear this cudaFlow capturer
Definition cuda_capturer.hpp:1081
cudaTask for_each(I first, I last, C callable)
captures a kernel that applies a callable to each dereferenced element of the data array
Definition for_each.hpp:221
void offload_n(size_t n)
offloads the captured cudaFlow and executes it by the given times
Definition cuda_capturer.hpp:1237
cudaTask memset(void *ptr, int v, size_t n)
initializes or sets GPU memory to the given value byte by byte
Definition cuda_capturer.hpp:1146
cudaTask merge_by_key(a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
captures kernels that perform parallel key-value merge
Definition merge.hpp:766
cudaTask exclusive_scan(I first, I last, O output, C op)
similar to cudaFlowCapturer::inclusive_scan but excludes the first value
Definition scan.hpp:739
cudaTask reduce(I first, I last, T *result, C op)
captures kernels that perform parallel reduction over a range of items
Definition reduce.hpp:427
bool empty() const
queries the emptiness of the graph
Definition cuda_capturer.hpp:1071
cudaTask sort(I first, I last, C comp)
captures kernels that sort the given array
Definition sort.hpp:557
cudaTask transform_uninitialized_reduce(I first, I last, T *result, C bop, U uop)
similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce
Definition reduce.hpp:479
cudaTask noop()
captures a no-operation task
Definition cuda_capturer.hpp:1113
void offload()
offloads the captured cudaFlow and executes it once
Definition cuda_capturer.hpp:1242
cudaTask transform_inclusive_scan(I first, I last, O output, B bop, U uop)
captures kernels that perform parallel inclusive scan over a range of transformed items
Definition scan.hpp:775
cudaTask inclusive_scan(I first, I last, O output, C op)
captures kernels that perform parallel inclusive scan over a range of items
Definition scan.hpp:703
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT &&... args)
captures a kernel
Definition cuda_capturer.hpp:1156
cudaTask find_if(I first, I last, unsigned *idx, U op)
creates a task to find the index of the first element in a range
Definition find.hpp:215
void offload_until(P &&predicate)
offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
Definition cuda_capturer.hpp:1173
cudaTask min_element(I first, I last, unsigned *idx, O op)
finds the index of the minimum element in a range
Definition find.hpp:300
void dump(std::ostream &os) const
dumps the capture graph into a DOT format through an output stream
Definition cuda_capturer.hpp:1087
cudaTask transform_exclusive_scan(I first, I last, O output, B bop, U uop)
similar to cudaFlowCapturer::transform_inclusive_scan but excludes the first value
Definition scan.hpp:817
cudaTask uninitialized_reduce(I first, I last, T *result, C op)
similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce
Definition reduce.hpp:443
cudaTask transform(I first, I last, O output, C op)
captures a kernel that transforms an input range to an output range
Definition transform.hpp:181
OPT & make_optimizer(ArgsT &&... args)
selects a different optimization algorithm
Definition cuda_capturer.hpp:1312
cudaFlowCapturer()
constrcts a standalone cudaFlowCapturer
Definition cuda_capturer.hpp:1058
virtual ~cudaFlowCapturer()
destructs the cudaFlowCapturer
Definition cuda_capturer.hpp:1063
cudaTask copy(T *tgt, const T *src, size_t num)
captures a copy task of typed data
Definition cuda_capturer.hpp:1136
cudaTask sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp)
captures kernels that sort the given array
Definition sort.hpp:593
cudaTask merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
captures kernels that perform parallel merge on two sorted arrays
Definition merge.hpp:725
cudaTask single_task(C c)
capturers a kernel to runs the given callable with only one thread
Definition for_each.hpp:259
cudaTask max_element(I first, I last, unsigned *idx, O op)
finds the index of the maximum element in a range
Definition find.hpp:425
cudaTask on(C &&callable)
captures a sequential CUDA operations from the given callable
Definition cuda_capturer.hpp:1105
cudaTask memcpy(void *dst, const void *src, size_t count)
copies data between host and device asynchronously through a stream
Definition cuda_capturer.hpp:1123
size_t num_tasks() const
queries the number of tasks
Definition cuda_capturer.hpp:1076
cudaTask for_each_index(I first, I last, I step, C callable)
captures a kernel that applies a callable to each index in the range with the step size
Definition for_each.hpp:230
class to create a cudaFlow task dependency graph
Definition cudaflow.hpp:56
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
class to capture a CUDA graph using a round-robin algorithm
Definition cuda_optimizer.hpp:243
class to capture a CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:134
**
Definition cuda_stream.hpp:174
void synchronize() const
synchronizes the associated stream
Definition cuda_stream.hpp:253
class to create a task handle over an internal node of a cudaFlow graph
Definition cuda_task.hpp:65
cudaTaskType type() const
queries the task type
Definition cuda_task.hpp:221
cudaFlow capturing algorithms include file
taskflow namespace
Definition small_vector.hpp:27
@ CAPTURE
capture task type