Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
cuda_optimizer.hpp
Go to the documentation of this file.
1#pragma once
2
#include <queue>

#include "cuda_graph.hpp"
4
10namespace tf {
11
12// ----------------------------------------------------------------------------
13// cudaCapturingBase
14// ----------------------------------------------------------------------------
15
21class cudaCapturingBase {
22
23 protected:
24
25 std::vector<cudaNode*> _toposort(cudaGraph&);
26 std::vector<std::vector<cudaNode*>> _levelize(cudaGraph&);
27};
28
29// Function: _toposort
30inline std::vector<cudaNode*> cudaCapturingBase::_toposort(cudaGraph& graph) {
31
34
35 res.reserve(graph._nodes.size());
36
37 // insert the first level of nodes into the queue
38 for(auto& u : graph._nodes) {
39
40 auto hu = std::get_if<cudaNode::Capture>(&u->_handle);
41 hu->level = u->_dependents.size();
42
43 if(hu->level == 0) {
44 bfs.push(u.get());
45 }
46 }
47
48 // levelize the graph using bfs
49 while(!bfs.empty()) {
50
51 auto u = bfs.front();
52 bfs.pop();
53
54 res.push_back(u);
55
56 for(auto v : u->_successors) {
57 auto hv = std::get_if<cudaNode::Capture>(&v->_handle);
58 if(--hv->level == 0) {
59 bfs.push(v);
60 }
61 }
62 }
63
64 return res;
65}
66
67// Function: _levelize
69cudaCapturingBase::_levelize(cudaGraph& graph) {
70
72
73 size_t max_level = 0;
74
75 // insert the first level of nodes into the queue
76 for(auto& u : graph._nodes) {
77
78 auto hu = std::get_if<cudaNode::Capture>(&u->_handle);
79 hu->level = u->_dependents.size();
80
81 if(hu->level == 0) {
82 bfs.push(u.get());
83 }
84 }
85
86 // levelize the graph using bfs
87 while(!bfs.empty()) {
88
89 auto u = bfs.front();
90 bfs.pop();
91
92 auto hu = std::get_if<cudaNode::Capture>(&u->_handle);
93
94 for(auto v : u->_successors) {
95 auto hv = std::get_if<cudaNode::Capture>(&v->_handle);
96 if(--hv->level == 0) {
97 hv->level = hu->level + 1;
98 if(hv->level > max_level) {
99 max_level = hv->level;
100 }
101 bfs.push(v);
102 }
103 }
104 }
105
106 // set level_graph and each node's idx
107 std::vector<std::vector<cudaNode*>> level_graph(max_level+1);
108 for(auto& u : graph._nodes) {
109 auto hu = std::get_if<cudaNode::Capture>(&u->_handle);
110 hu->lid = level_graph[hu->level].size();
111 level_graph[hu->level].emplace_back(u.get());
112
113 //for(auto s : u->_successors) {
114 // assert(hu.level < std::get_if<cudaNode::Capture>(&s->_handle)->level);
115 //}
116 }
117
118 return level_graph;
119}
120
121// ----------------------------------------------------------------------------
122// class definition: cudaSequentialCapturing
123// ----------------------------------------------------------------------------
124
134class cudaSequentialCapturing : public cudaCapturingBase {
135
136 friend class cudaFlowCapturer;
137
138 public:
139
144
145 private:
146
147 cudaGraph_t _optimize(cudaGraph& graph);
148};
149
150inline cudaGraph_t cudaSequentialCapturing::_optimize(cudaGraph& graph) {
151
152 // acquire per-thread stream and turn it into capture mode
153 // we must use ThreadLocal mode to avoid clashing with CUDA global states
154
155 cudaStream stream;
156
157 stream.begin_capture(cudaStreamCaptureModeThreadLocal);
158
159 auto ordered = _toposort(graph);
160 for(auto node : ordered) {
161 std::get_if<cudaNode::Capture>(&node->_handle)->work(stream);
162 }
163
164 return stream.end_capture();
165}
166
167// ----------------------------------------------------------------------------
168// class definition: cudaLinearCapturing
169// ----------------------------------------------------------------------------
170
182class cudaLinearCapturing : public cudaCapturingBase {
183
184 friend class cudaFlowCapturer;
185
186 public:
187
192
193 private:
194
195 cudaGraph_t _optimize(cudaGraph& graph);
196};
197
198inline cudaGraph_t cudaLinearCapturing::_optimize(cudaGraph& graph) {
199
200 // acquire per-thread stream and turn it into capture mode
201 // we must use ThreadLocal mode to avoid clashing with CUDA global states
202 cudaStream stream;
203
204 stream.begin_capture(cudaStreamCaptureModeThreadLocal);
205
206 // find the source node
207 cudaNode* src {nullptr};
208 for(auto& u : graph._nodes) {
209 if(u->_dependents.size() == 0) {
210 src = u.get();
211 while(src) {
212 std::get_if<cudaNode::Capture>(&src->_handle)->work(stream);
213 src = src->_successors.empty() ? nullptr : src->_successors[0];
214 }
215 break;
216 }
217 // ideally, there should be only one source
218 }
219
220 return stream.end_capture();
221}
222
223// ----------------------------------------------------------------------------
224// class definition: cudaRoundRobinCapturing
225// ----------------------------------------------------------------------------
226
243class cudaRoundRobinCapturing : public cudaCapturingBase {
244
245 friend class cudaFlowCapturer;
246
247 public:
248
253
257 explicit cudaRoundRobinCapturing(size_t num_streams);
258
262 size_t num_streams() const;
263
267 void num_streams(size_t n);
268
269 private:
270
271 size_t _num_streams {4};
272
273 cudaGraph_t _optimize(cudaGraph& graph);
274
275 void _reset(std::vector<std::vector<cudaNode*>>& graph);
276
277};
278
279// Constructor
281 _num_streams {num_streams} {
282
283 if(num_streams == 0) {
284 TF_THROW("number of streams must be at least one");
285 }
286}
287
288// Function: num_streams
290 return _num_streams;
291}
292
293// Procedure: num_streams
295 if(n == 0) {
296 TF_THROW("number of streams must be at least one");
297 }
298 _num_streams = n;
299}
300
301inline void cudaRoundRobinCapturing::_reset(
303) {
304 //level == global id
305 //idx == stream id we want to skip
306 size_t id{0};
307 for(auto& each_level: graph) {
308 for(auto& node: each_level) {
309 auto hn = std::get_if<cudaNode::Capture>(&node->_handle);
310 hn->level = id++;
311 hn->idx = _num_streams;
312 hn->event = nullptr;
313 }
314 }
315}
316
317// Function: _optimize
318inline cudaGraph_t cudaRoundRobinCapturing::_optimize(cudaGraph& graph) {
319
320 // levelize the graph
321 auto levelized = _levelize(graph);
322
323 // initialize the data structure
324 _reset(levelized);
325
326 // begin to capture
327 std::vector<cudaStream> streams(_num_streams);
328
329 streams[0].begin_capture(cudaStreamCaptureModeThreadLocal);
330
331 // reserve space for scoped events
333 events.reserve((_num_streams >> 1) + levelized.size());
334
335 // fork
336 cudaEvent_t fork_event = events.emplace_back();
337 streams[0].record(fork_event);
338
339 for(size_t i = 1; i < streams.size(); ++i) {
340 streams[i].wait(fork_event);
341 }
342
343 // assign streams to levelized nodes in a round-robin manner
344 for(auto& each_level: levelized) {
345 for(auto& node: each_level) {
346 auto hn = std::get_if<cudaNode::Capture>(&node->_handle);
347 size_t sid = hn->lid % _num_streams;
348
349 //wait events
350 cudaNode* wait_node{nullptr};
351 for(auto& pn: node->_dependents) {
352 auto phn = std::get_if<cudaNode::Capture>(&pn->_handle);
353 size_t psid = phn->lid % _num_streams;
354
355 //level == global id
356 //idx == stream id we want to skip
357 if(psid == hn->idx) {
358 if(wait_node == nullptr ||
359 std::get_if<cudaNode::Capture>(&wait_node->_handle)->level < phn->level) {
360 wait_node = pn;
361 }
362 }
363 else if(psid != sid) {
364 streams[sid].wait(phn->event);
365 }
366 }
367
368 if(wait_node != nullptr) {
369 assert(std::get_if<cudaNode::Capture>(&wait_node->_handle)->event);
370 streams[sid].wait(std::get_if<cudaNode::Capture>(&wait_node->_handle)->event);
371 }
372
373 //capture
374 hn->work(streams[sid]);
375
376 //create/record stream
377 for(auto& sn: node->_successors) {
378 auto shn = std::get_if<cudaNode::Capture>(&sn->_handle);
379 size_t ssid = shn->lid % _num_streams;
380 if(ssid != sid) {
381 if(!hn->event) {
382 hn->event = events.emplace_back();
383 streams[sid].record(hn->event);
384 }
385 //idx == stream id we want to skip
386 shn->idx = sid;
387 }
388 }
389 }
390 }
391
392 // join
393 for(size_t i=1; i<_num_streams; ++i) {
394 cudaEvent_t join_event = events.emplace_back();
395 streams[i].record(join_event);
396 streams[0].wait(join_event);
397 }
398
399 return streams[0].end_capture();
400}
401
402
403} // end of namespace tf -----------------------------------------------------
404
class to create a cudaFlow graph using stream capture
Definition cuda_capturer.hpp:57
class to capture a linear CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:182
cudaLinearCapturing()=default
constructs a linear optimizer
class to capture a CUDA graph using a round-robin algorithm
Definition cuda_optimizer.hpp:243
size_t num_streams() const
queries the number of streams used by the optimizer
Definition cuda_optimizer.hpp:289
cudaRoundRobinCapturing()=default
constructs a round-robin optimizer with 4 streams by default
class to capture a CUDA graph using a sequential stream
Definition cuda_optimizer.hpp:134
cudaSequentialCapturing()=default
constructs a sequential optimizer
‍**
Definition cuda_stream.hpp:174
cudaGraph_t end_capture() const
ends graph capturing on the stream
Definition cuda_stream.hpp:299
void begin_capture(cudaStreamCaptureMode m=cudaStreamCaptureModeGlobal) const
begins graph capturing on the stream
Definition cuda_stream.hpp:283
T forward(T... args)
taskflow namespace
Definition small_vector.hpp:27