3#include "cuda_graph.hpp"
21class cudaCapturingBase {
35 res.reserve(graph._nodes.size());
38 for(
auto& u : graph._nodes) {
41 hu->level = u->_dependents.size();
56 for(
auto v : u->_successors) {
58 if(--hv->level == 0) {
69cudaCapturingBase::_levelize(cudaGraph& graph) {
76 for(
auto& u : graph._nodes) {
79 hu->level = u->_dependents.size();
94 for(
auto v : u->_successors) {
96 if(--hv->level == 0) {
97 hv->level = hu->level + 1;
98 if(hv->level > max_level) {
99 max_level = hv->level;
108 for(
auto& u : graph._nodes) {
110 hu->lid = level_graph[hu->level].size();
111 level_graph[hu->level].emplace_back(u.get());
147 cudaGraph_t _optimize(cudaGraph& graph);
150inline cudaGraph_t cudaSequentialCapturing::_optimize(cudaGraph& graph) {
159 auto ordered = _toposort(graph);
160 for(
auto node : ordered) {
195 cudaGraph_t _optimize(cudaGraph& graph);
198inline cudaGraph_t cudaLinearCapturing::_optimize(cudaGraph& graph) {
207 cudaNode* src {
nullptr};
208 for(
auto& u : graph._nodes) {
209 if(u->_dependents.size() == 0) {
213 src = src->_successors.empty() ? nullptr : src->_successors[0];
271 size_t _num_streams {4};
273 cudaGraph_t _optimize(cudaGraph& graph);
281 _num_streams {num_streams} {
284 TF_THROW(
"number of streams must be at least one");
296 TF_THROW(
"number of streams must be at least one");
301inline void cudaRoundRobinCapturing::_reset(
307 for(
auto& each_level: graph) {
308 for(
auto& node: each_level) {
311 hn->idx = _num_streams;
318inline cudaGraph_t cudaRoundRobinCapturing::_optimize(cudaGraph& graph) {
321 auto levelized = _levelize(graph);
329 streams[0].begin_capture(cudaStreamCaptureModeThreadLocal);
333 events.reserve((_num_streams >> 1) + levelized.size());
336 cudaEvent_t fork_event = events.emplace_back();
337 streams[0].record(fork_event);
339 for(
size_t i = 1; i < streams.size(); ++i) {
340 streams[i].wait(fork_event);
344 for(
auto& each_level: levelized) {
345 for(
auto& node: each_level) {
347 size_t sid = hn->lid % _num_streams;
350 cudaNode* wait_node{
nullptr};
351 for(
auto& pn: node->_dependents) {
353 size_t psid = phn->lid % _num_streams;
357 if(psid == hn->idx) {
358 if(wait_node ==
nullptr ||
363 else if(psid != sid) {
364 streams[sid].wait(phn->event);
368 if(wait_node !=
nullptr) {
374 hn->work(streams[sid]);
377 for(
auto& sn: node->_successors) {
379 size_t ssid = shn->lid % _num_streams;
382 hn->event = events.emplace_back();
383 streams[sid].record(hn->event);
393 for(
size_t i=1; i<_num_streams; ++i) {
394 cudaEvent_t join_event = events.emplace_back();
395 streams[i].record(join_event);
396 streams[0].wait(join_event);
399 return streams[0].end_capture();
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
cudaLinearCapturing()=default
constructs a linear optimizer
class to capture a CUDA graph using a round-robin algorithm
Definition cuda_optimizer.hpp:243
size_t num_streams() const
queries the number of streams used by the optimizer
Definition cuda_optimizer.hpp:289
cudaRoundRobinCapturing()=default
constructs a round-robin optimizer with 4 streams by default
class to capture a CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:134
cudaSequentialCapturing()=default
constructs a sequential optimizer
**
Definition cuda_stream.hpp:174
cudaGraph_t end_capture() const
ends graph capturing on the stream
Definition cuda_stream.hpp:299
void begin_capture(cudaStreamCaptureMode m=cudaStreamCaptureModeGlobal) const
begins graph capturing on the stream
Definition cuda_stream.hpp:283
taskflow namespace
Definition small_vector.hpp:27