Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
cuda_stream.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "cuda_pool.hpp"
4
10namespace tf {
11
12
13// ----------------------------------------------------------------------------
14// cudaStreamCreator and cudaStreamDeleter for per-thread stream pool
15// ----------------------------------------------------------------------------
16
18struct cudaStreamCreator {
19
23 cudaStream_t operator () () const {
24 cudaStream_t stream;
25 TF_CHECK_CUDA(cudaStreamCreate(&stream), "failed to create a CUDA stream");
26 return stream;
27 }
28};
29
31struct cudaStreamDeleter {
32
36 void operator () (cudaStream_t stream) const {
37 if(stream) {
38 cudaStreamDestroy(stream);
39 }
40 }
41};
42
44struct cudaStreamSynchronizer {
45
46 void operator () (cudaStream_t stream) const {
47 TF_CHECK_CUDA(
48 cudaStreamSynchronize(stream), "failed to synchronize a CUDA stream"
49 );
50 }
51
52};
53
55//@brief alias of per-thread stream pool type
56// */
57//using cudaPerThreadStreamPool = cudaPerThreadDeviceObjectPool<
58// cudaStream_t, cudaStreamCreator, cudaStreamDeleter
59//>;
60//
62//@brief acquires the per-thread cuda stream pool
63//*/
64//inline cudaPerThreadStreamPool& cuda_per_thread_stream_pool() {
65// thread_local cudaPerThreadStreamPool pool;
66// return pool;
67//}
68//
72//
74//@brief class to create an RAII-styled guard of stream acquisition
75//
76//Sample usage:
77//
78//@code{.cpp}
79//{
80// tf::cudaScopedPerThreadStream stream(1); // acquires a stream on device 1
81//
82// // use stream as a normal cuda stream (cudaStream_t)
83// cudaStreamWaitEvent(stream, ...);
84//
85//} // leaving the scope releases the stream back to the pool on device 1
86//@endcode
87//
88//The scoped per-thread stream is primarily used by tf::Executor to execute
89//CUDA tasks (e.g., tf::cudaFlow, tf::cudaFlowCapturer).
90//
91//%cudaScopedPerThreadStream is non-copyable.
92//*/
93//class cudaScopedPerThreadStream {
94//
95// public:
96//
97// /**
98// @brief constructs a scoped stream under the given device
99//
100// The constructor acquires a stream from a per-thread stream pool.
101//
102// @param device device context of the requested stream
103// */
104// explicit cudaScopedPerThreadStream(int device) :
105// _ptr {cuda_per_thread_stream_pool().acquire(device)} {
106// }
107//
108// /**
109// @brief constructs a scoped stream under the current device.
110//
111// The constructor acquires a stream from a per-thread stream pool.
112// */
113// cudaScopedPerThreadStream() :
114// _ptr {cuda_per_thread_stream_pool().acquire(cuda_get_device())} {
115// }
116//
117// /**
118// @brief destructs the scoped stream guard
119//
120// The destructor releases the stream to the per-thread stream pool.
121// */
122// ~cudaScopedPerThreadStream() {
123// if(_ptr) {
124// cuda_per_thread_stream_pool().release(std::move(_ptr));
125// }
126// }
127//
128// /**
129// @brief implicit conversion to the native CUDA stream (cudaStream_t)
130// */
131// operator cudaStream_t () const {
132// return _ptr->value;
133// }
134//
135// /**
136// @brief disabled copy constructor
137// */
138// cudaScopedPerThreadStream(const cudaScopedPerThreadStream&) = delete;
139//
140// /**
141// @brief default move constructor
142// */
143// cudaScopedPerThreadStream(cudaScopedPerThreadStream&&) = default;
144//
145// /**
146// @brief disabled copy assignment
147// */
148// cudaScopedPerThreadStream& operator = (const cudaScopedPerThreadStream&) = delete;
149//
150// /**
151// @brief default move assignment
152// */
153// cudaScopedPerThreadStream& operator = (cudaScopedPerThreadStream&&) = delete;
154//
155// private:
156//
157// std::shared_ptr<cudaPerThreadStreamPool::Object> _ptr;
158//
159//};
160
161// ----------------------------------------------------------------------------
162// cudaStream
163// ----------------------------------------------------------------------------
164
175
176 public:
177
183 explicit cudaStream(cudaStream_t stream) : _stream(stream) {
184 }
185
191 cudaStream() : _stream{ cudaStreamCreator{}() } {
192 }
193
197 cudaStream(const cudaStream&) = delete;
198
202 cudaStream(cudaStream&& rhs) : _stream{rhs._stream} {
203 rhs._stream = nullptr;
204 }
205
210 cudaStreamDeleter {} (_stream);
211 }
212
217
222 cudaStreamDeleter {} (_stream);
223 _stream = rhs._stream;
224 rhs._stream = nullptr;
225 return *this;
226 }
227
233 void reset(cudaStream_t stream = nullptr) {
234 cudaStreamDeleter {} (_stream);
235 _stream = stream;
236 }
237
243 operator cudaStream_t () const {
244 return _stream;
245 }
246
253 void synchronize() const {
254 cudaStreamSynchronizer{}(_stream);
255 }
256
283 void begin_capture(cudaStreamCaptureMode m = cudaStreamCaptureModeGlobal) const {
284 TF_CHECK_CUDA(
285 cudaStreamBeginCapture(_stream, m),
286 "failed to begin capture on stream ", _stream, " with thread mode ", m
287 );
288 }
289
299 cudaGraph_t end_capture() const {
300 cudaGraph_t native_g;
301 TF_CHECK_CUDA(
302 cudaStreamEndCapture(_stream, &native_g),
303 "failed to end capture on stream ", _stream
304 );
305 return native_g;
306 }
307
314 void record(cudaEvent_t event) const {
315 TF_CHECK_CUDA(
316 cudaEventRecord(event, _stream),
317 "failed to record event ", event, " on stream ", _stream
318 );
319 }
320
327 void wait(cudaEvent_t event) const {
328 TF_CHECK_CUDA(
329 cudaStreamWaitEvent(_stream, event, 0),
330 "failed to wait for event ", event, " on stream ", _stream
331 );
332 }
333
334 private:
335
336 cudaStream_t _stream {nullptr};
337};
338
339
340
341// ----------------------------------------------------------------------------
342// cudaEventCreator and cudaEventDeleter for per-thread event pool
343// ----------------------------------------------------------------------------
344
346struct cudaEventCreator {
347
351 cudaEvent_t operator () () const {
352 cudaEvent_t event;
353 TF_CHECK_CUDA(cudaEventCreate(&event), "failed to create a CUDA event");
354 return event;
355 }
356};
357
359struct cudaEventDeleter {
360
364 void operator () (cudaEvent_t event) const {
365 cudaEventDestroy(event);
366 }
367};
368
369
371//@brief alias of per-thread event pool type
372// */
373//using cudaPerThreadEventPool = cudaPerThreadDeviceObjectPool<
374// cudaEvent_t, cudaEventCreator, cudaEventDeleter
375//>;
376//
378//@brief per-thread cuda event pool
379//*/
380//inline cudaPerThreadEventPool& cuda_per_thread_event_pool() {
381// thread_local cudaPerThreadEventPool pool;
382// return pool;
383//}
384//
388//
390//@brief class to create an RAII-styled guard of event acquisition
391//
392//Sample usage:
393//
394//@code{.cpp}
395//{
396// tf::cudaScopedPerThreadEvent event(1); // acquires a event on device 1
397//
398// // use event as a normal cuda event (cudaEvent_t)
399// cudaStreamWaitEvent(stream, event);
400//
401//} // leaving the scope releases the event back to the pool on device 1
402//@endcode
403//
404//The scoped per-thread event is primarily used by tf::Executor to execute
405//CUDA tasks (e.g., tf::cudaFlow, tf::cudaFlowCapturer).
406//
407//%cudaScopedPerThreadEvent is non-copyable.
408//*/
409//class cudaScopedPerThreadEvent {
410//
411// public:
412//
413// /**
414// @brief constructs a scoped event under the given device
415//
416// The constructor acquires a event from a per-thread event pool.
417//
418// @param device device context of the requested event
419// */
420// explicit cudaScopedPerThreadEvent(int device) :
421// _ptr {cuda_per_thread_event_pool().acquire(device)} {
422// }
423//
424// /**
425// @brief constructs a scoped event under the current device.
426//
427// The constructor acquires a event from a per-thread event pool.
428// */
429// cudaScopedPerThreadEvent() :
430// _ptr {cuda_per_thread_event_pool().acquire(cuda_get_device())} {
431// }
432//
433// /**
434// @brief destructs the scoped event guard
435//
436// The destructor releases the event to the per-thread event pool.
437// */
438// ~cudaScopedPerThreadEvent() {
439// if(_ptr) {
440// cuda_per_thread_event_pool().release(std::move(_ptr));
441// }
442// }
443//
444// /**
445// @brief implicit conversion to the native CUDA event (cudaEvent_t)
446// */
447// operator cudaEvent_t () const {
448// return _ptr->value;
449// }
450//
451// /**
452// @brief disabled copy constructor
453// */
454// cudaScopedPerThreadEvent(const cudaScopedPerThreadEvent&) = delete;
455//
456// /**
457// @brief default move constructor
458// */
459// cudaScopedPerThreadEvent(cudaScopedPerThreadEvent&&) = default;
460//
461// /**
462// @brief disabled copy assignment
463// */
464// cudaScopedPerThreadEvent& operator = (const cudaScopedPerThreadEvent&) = delete;
465//
466// /**
467// @brief default move assignment
468// */
469// cudaScopedPerThreadEvent& operator = (cudaScopedPerThreadEvent&&) = delete;
470//
471// private:
472//
473// std::shared_ptr<cudaPerThreadEventPool::Object> _ptr;
474//
475//};
476
477// ----------------------------------------------------------------------------
478// cudaEvent
479// ----------------------------------------------------------------------------
480
491
492 public:
493
497 explicit cudaEvent(cudaEvent_t event) : _event(event) {
498 }
499
503 cudaEvent() : _event{ cudaEventCreator{}() } {
504 }
505
509 cudaEvent(const cudaEvent&) = delete;
510
514 cudaEvent(cudaEvent&& rhs) : _event{rhs._event} {
515 rhs._event = nullptr;
516 }
517
522 cudaEventDeleter {} (_event);
523 }
524
528 cudaEvent& operator = (const cudaEvent&) = delete;
529
534 cudaEventDeleter {} (_event);
535 _event = rhs._event;
536 rhs._event = nullptr;
537 return *this;
538 }
539
545 operator cudaEvent_t () const {
546 return _event;
547 }
548
554 void reset(cudaEvent_t event = nullptr) {
555 cudaEventDeleter {} (_event);
556 _event = event;
557 }
558
559 private:
560
561 cudaEvent_t _event {nullptr};
562};
563
564
565} // end of namespace tf -----------------------------------------------------
566
567
568
‍**
Definition cuda_stream.hpp:490
cudaEvent()
constructs an RAII-styled object for a new CUDA event
Definition cuda_stream.hpp:503
void reset(cudaEvent_t event=nullptr)
replaces the managed event
Definition cuda_stream.hpp:554
cudaEvent(cudaEvent &&rhs)
move constructor
Definition cuda_stream.hpp:514
~cudaEvent()
destructs the CUDA event
Definition cuda_stream.hpp:521
cudaEvent & operator=(const cudaEvent &)=delete
disabled copy assignment
cudaEvent(cudaEvent_t event)
constructs an RAII-styled object from the given CUDA event
Definition cuda_stream.hpp:497
cudaEvent(const cudaEvent &)=delete
disabled copy constructor
‍**
Definition cuda_stream.hpp:174
void synchronize() const
synchronizes the associated stream
Definition cuda_stream.hpp:253
cudaStream & operator=(const cudaStream &)=delete
disabled copy assignment
void reset(cudaStream_t stream=nullptr)
replaces the managed stream
Definition cuda_stream.hpp:233
void wait(cudaEvent_t event) const
waits on an event
Definition cuda_stream.hpp:327
void record(cudaEvent_t event) const
records an event on the stream
Definition cuda_stream.hpp:314
cudaGraph_t end_capture() const
ends graph capturing on the stream
Definition cuda_stream.hpp:299
cudaStream(cudaStream_t stream)
constructs an RAII-styled object from the given CUDA stream
Definition cuda_stream.hpp:183
~cudaStream()
destructs the CUDA stream
Definition cuda_stream.hpp:209
void begin_capture(cudaStreamCaptureMode m=cudaStreamCaptureModeGlobal) const
begins graph capturing on the stream
Definition cuda_stream.hpp:283
cudaStream()
constructs an RAII-styled object for a new CUDA stream
Definition cuda_stream.hpp:191
cudaStream(const cudaStream &)=delete
disabled copy constructor
cudaStream(cudaStream &&rhs)
move constructor
Definition cuda_stream.hpp:202
taskflow namespace
Definition small_vector.hpp:27