Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
cuda_memory.hpp
Go to the documentation of this file.
#pragma once

#include <cstddef>
#include <limits>
#include <memory>
#include <new>

#include "cuda_device.hpp"
4
10namespace tf {
11
12// ----------------------------------------------------------------------------
13// memory
14// ----------------------------------------------------------------------------
15
// Queries the free memory, in bytes, currently available on device d.
// Expensive call: temporarily switches the calling thread to device d
// through the RAII context cudaScopedDevice.
inline size_t cuda_get_free_mem(int d) {
  cudaScopedDevice ctx(d);
  size_t avail, capacity;
  TF_CHECK_CUDA(
    cudaMemGetInfo(&avail, &capacity), "failed to get mem info on device ", d
  );
  return avail;
}
27
// Queries the total memory, in bytes, installed on device d.
// Expensive call: temporarily switches the calling thread to device d
// through the RAII context cudaScopedDevice.
inline size_t cuda_get_total_mem(int d) {
  cudaScopedDevice ctx(d);
  size_t avail, capacity;
  TF_CHECK_CUDA(
    cudaMemGetInfo(&avail, &capacity), "failed to get mem info on device ", d
  );
  return capacity;
}
39
// Allocates global memory on device d large enough to hold N elements of
// type T and returns the raw device pointer. The caller owns the memory
// and must release it with cuda_free(ptr, d). Throws via TF_CHECK_CUDA on
// allocation failure.
template <typename T>
T* cuda_malloc_device(size_t N, int d) {
  cudaScopedDevice ctx(d);
  T* buffer {nullptr};
  TF_CHECK_CUDA(
    cudaMalloc(&buffer, N*sizeof(T)),
    "failed to allocate memory (", N*sizeof(T), "bytes) on device ", d
  )
  return buffer;
}
57
// Allocates global memory on the caller's current device large enough to
// hold N elements of type T. The caller owns the returned pointer and must
// release it with cuda_free. Throws via TF_CHECK_CUDA on allocation failure.
template <typename T>
T* cuda_malloc_device(size_t N) {
  T* buffer {nullptr};
  TF_CHECK_CUDA(
    cudaMalloc(&buffer, N*sizeof(T)),
    "failed to allocate memory (", N*sizeof(T), "bytes)"
  )
  return buffer;
}
73
// Allocates unified (managed) memory for N elements of type T via
// cudaMallocManaged; the returned pointer is accessible from both host and
// device. The caller owns the memory and must release it with cuda_free.
// Throws via TF_CHECK_CUDA on allocation failure.
template <typename T>
T* cuda_malloc_shared(size_t N) {
  T* buffer {nullptr};
  TF_CHECK_CUDA(
    cudaMallocManaged(&buffer, N*sizeof(T)),
    "failed to allocate shared memory (", N*sizeof(T), "bytes)"
  )
  return buffer;
}
89
// Frees device memory previously allocated on device d.
// Switches to device d for the duration of the call; throws via
// TF_CHECK_CUDA if cudaFree reports an error.
template <typename T>
void cuda_free(T* ptr, int d) {
  cudaScopedDevice ctx(d);
  TF_CHECK_CUDA(cudaFree(ptr), "failed to free memory ", ptr, " on GPU ", d);
}
105
// Frees device memory previously allocated on the caller's current device.
// Throws via TF_CHECK_CUDA if cudaFree reports an error.
template <typename T>
void cuda_free(T* ptr) {
  TF_CHECK_CUDA(cudaFree(ptr), "failed to free memory ", ptr);
}
119
// Copies count bytes between host and device asynchronously through the
// given stream. The signature line was lost in extraction and is restored
// from this file's documentation index. cudaMemcpyDefault lets the runtime
// infer the copy direction from the pointer attributes (requires unified
// virtual addressing). Throws via TF_CHECK_CUDA on failure.
inline void cuda_memcpy_async(
  cudaStream_t stream, void* dst, const void* src, size_t count
) {
  TF_CHECK_CUDA(
    cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
    "failed to perform cudaMemcpyAsync"
  );
}
140
// Sets count bytes of GPU memory starting at devPtr to the given value,
// byte by byte, asynchronously through the given stream. The signature line
// was lost in extraction and is restored from this file's documentation
// index. Throws via TF_CHECK_CUDA on failure.
inline void cuda_memset_async(
  cudaStream_t stream, void* devPtr, int value, size_t count
) {
  TF_CHECK_CUDA(
    cudaMemsetAsync(devPtr, value, count, stream),
    "failed to perform cudaMemsetAsync"
  );
}
161
162// ----------------------------------------------------------------------------
163// Shared Memory
164// ----------------------------------------------------------------------------
165//
166// Because dynamically sized shared memory arrays are declared "extern",
167// we can't templatize them directly. To get around this, we declare a
168// simple wrapper struct that will declare the extern array with a different
169// name depending on the type. This avoids compiler errors about duplicate
170// definitions.
171//
172// To use dynamically allocated shared memory in a templatized __global__ or
173// __device__ function, just replace code like this:
174//
175// template<class T>
176// __global__ void
177// foo( T* g_idata, T* g_odata)
178// {
179// // Shared mem size is determined by the host app at run time
180// extern __shared__ T sdata[];
181// ...
182// doStuff(sdata);
183// ...
184// }
185//
186// With this:
187//
188// template<class T>
189// __global__ void
190// foo( T* g_idata, T* g_odata)
191// {
192// // Shared mem size is determined by the host app at run time
193// cudaSharedMemory<T> smem;
194// T* sdata = smem.get();
195// ...
196// doStuff(sdata);
197// ...
198// }
199// ----------------------------------------------------------------------------
200
201// This is the un-specialized struct. Note that we prevent instantiation of this
202// struct by putting an undefined symbol in the function body so it won't compile.
// Primary (un-specialized) template of cudaSharedMemory.
// Calling get() on a type without a specialization references the
// undefined symbol error(), so such instantiations fail to link.
template <typename T>
struct cudaSharedMemory {
  __device__ T* get() {
    // undefined on purpose: blocks use of un-specialized element types
    extern __device__ void error(void);
    error();
    return nullptr;
  }
};
217
218// Following are the specializations for the following types.
219// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
220// One could also specialize it for user-defined types.
221
// cudaSharedMemory specialization for int
template <>
struct cudaSharedMemory<int> {
  // returns the kernel's dynamic shared-memory buffer typed as int*
  __device__ int* get() {
    extern __shared__ int s_int[];
    return s_int;
  }
};
234
// cudaSharedMemory specialization for unsigned int
template <>
struct cudaSharedMemory<unsigned int> {
  // returns the kernel's dynamic shared-memory buffer typed as unsigned int*
  __device__ unsigned int* get() {
    extern __shared__ unsigned int s_uint[];
    return s_uint;
  }
};
247
// cudaSharedMemory specialization for char
template <>
struct cudaSharedMemory<char> {
  // returns the kernel's dynamic shared-memory buffer typed as char*
  __device__ char* get() {
    extern __shared__ char s_char[];
    return s_char;
  }
};
260
// cudaSharedMemory specialization for unsigned char
template <>
struct cudaSharedMemory<unsigned char> {
  // returns the kernel's dynamic shared-memory buffer typed as unsigned char*
  __device__ unsigned char* get() {
    extern __shared__ unsigned char s_uchar[];
    return s_uchar;
  }
};
273
// cudaSharedMemory specialization for short
template <>
struct cudaSharedMemory<short> {
  // returns the kernel's dynamic shared-memory buffer typed as short*
  __device__ short* get() {
    extern __shared__ short s_short[];
    return s_short;
  }
};
286
// cudaSharedMemory specialization for unsigned short
template <>
struct cudaSharedMemory<unsigned short> {
  // returns the kernel's dynamic shared-memory buffer typed as unsigned short*
  __device__ unsigned short* get() {
    extern __shared__ unsigned short s_ushort[];
    return s_ushort;
  }
};
299
// cudaSharedMemory specialization for long
template <>
struct cudaSharedMemory<long> {
  // returns the kernel's dynamic shared-memory buffer typed as long*
  __device__ long* get() {
    extern __shared__ long s_long[];
    return s_long;
  }
};
312
// cudaSharedMemory specialization for unsigned long
template <>
struct cudaSharedMemory<unsigned long> {
  // returns the kernel's dynamic shared-memory buffer typed as unsigned long*
  __device__ unsigned long* get() {
    extern __shared__ unsigned long s_ulong[];
    return s_ulong;
  }
};
325
326//template <>
327//struct cudaSharedMemory <size_t>
328//{
329// __device__ size_t *get()
330// {
331// extern __shared__ size_t s_sizet[];
332// return s_sizet;
333// }
334//};
335
// cudaSharedMemory specialization for bool
template <>
struct cudaSharedMemory<bool> {
  // returns the kernel's dynamic shared-memory buffer typed as bool*
  __device__ bool* get() {
    extern __shared__ bool s_bool[];
    return s_bool;
  }
};
348
// cudaSharedMemory specialization for float
template <>
struct cudaSharedMemory<float> {
  // returns the kernel's dynamic shared-memory buffer typed as float*
  __device__ float* get() {
    extern __shared__ float s_float[];
    return s_float;
  }
};
361
// cudaSharedMemory specialization for double
template <>
struct cudaSharedMemory<double> {
  // returns the kernel's dynamic shared-memory buffer typed as double*
  __device__ double* get() {
    extern __shared__ double s_double[];
    return s_double;
  }
};
374
375
376
377// ----------------------------------------------------------------------------
378// cudaDeviceAllocator
379// ----------------------------------------------------------------------------
380
/**
@class cudaDeviceAllocator
@brief STL-compatible allocator that allocates device (global) memory

The class header, typedefs, constructors, and member-function signatures
were lost in extraction and are restored from this file's documentation
index. construct/destroy are intentional no-ops: the storage lives on the
device and must not be dereferenced from the host.
*/
template<typename T>
class cudaDeviceAllocator {

  public:

  // element type
  using value_type = T;
  // element pointer type (a device pointer)
  using pointer = T*;
  // element reference type
  using reference = T&;
  // const element pointer type
  using const_pointer = const T*;
  // constant element reference type
  using const_reference = const T&;
  // unsigned size type
  using size_type = std::size_t;
  // signed difference type
  using difference_type = std::ptrdiff_t;

  // its member type other is the equivalent allocator type for elements of U
  template<typename U>
  struct rebind {
    using other = cudaDeviceAllocator<U>;
  };

  // constructs a device allocator object
  cudaDeviceAllocator() noexcept {}

  // constructs a device allocator object from another device allocator
  cudaDeviceAllocator( const cudaDeviceAllocator& ) noexcept {}

  // constructs from a device allocator of a different element type
  template<typename U>
  cudaDeviceAllocator( const cudaDeviceAllocator<U>& ) noexcept {}

  // destructs the device allocator object
  ~cudaDeviceAllocator() noexcept {}

  // returns the address of x
  pointer address( reference x ) { return &x; }

  // returns the address of x
  const_pointer address( const_reference x ) const { return &x; }

  // allocates device storage for n elements of T;
  // throws via TF_CHECK_CUDA on failure (the hint parameter is ignored)
  pointer allocate( size_type n, std::allocator<void>::const_pointer = 0 )
  {
    void* ptr = NULL;
    TF_CHECK_CUDA(
      cudaMalloc( &ptr, n*sizeof(T) ),
      "failed to allocate ", n, " elements (", n*sizeof(T), "bytes)"
    )
    return static_cast<pointer>(ptr);
  }

  // releases a block of storage previously obtained from allocate;
  // null pointers are ignored
  void deallocate( pointer ptr, size_type )
  {
    if(ptr){
      cudaFree(ptr);
    }
  }

  // returns the maximum number of elements that could potentially be
  // allocated (was `size_type {-1}`, a narrowing brace-init that is
  // ill-formed; replaced with the equivalent well-formed expression)
  size_type max_size() const noexcept {
    return std::numeric_limits<size_type>::max();
  }

  // ignored to avoid de-referencing a device pointer from the host
  void construct( pointer, const_reference) { }

  // ignored to avoid de-referencing a device pointer from the host
  void destroy( pointer) { }

  // compares two allocators of different types using ==
  // (stateless allocators always compare equal)
  template <typename U>
  bool operator == (const cudaDeviceAllocator<U>&) const noexcept {
    return true;
  }

  // compares two allocators of different types using !=
  template <typename U>
  bool operator != (const cudaDeviceAllocator<U>&) const noexcept {
    return false;
  }

};
572
573// ----------------------------------------------------------------------------
574// cudaUSMAllocator
575// ----------------------------------------------------------------------------
576
/**
@class cudaUSMAllocator
@brief STL-compatible allocator that allocates unified shared memory (USM)

The class header, typedefs, constructors, and member-function signatures
were lost in extraction and are restored from this file's documentation
index. Unlike cudaDeviceAllocator, USM is host-accessible, so construct
and destroy really run the element's constructor/destructor in place.
*/
template<typename T>
class cudaUSMAllocator {

  public:

  // element type
  using value_type = T;
  // element pointer type
  using pointer = T*;
  // element reference type
  using reference = T&;
  // const element pointer type
  using const_pointer = const T*;
  // constant element reference type
  using const_reference = const T&;
  // unsigned size type
  using size_type = std::size_t;
  // signed difference type
  using difference_type = std::ptrdiff_t;

  // its member type other is the equivalent allocator type for elements of U
  template<typename U>
  struct rebind {
    using other = cudaUSMAllocator<U>;
  };

  // constructs a USM allocator object
  cudaUSMAllocator() noexcept {}

  // constructs a USM allocator object from another USM allocator
  cudaUSMAllocator( const cudaUSMAllocator& ) noexcept {}

  // constructs from a USM allocator of a different element type
  template<typename U>
  cudaUSMAllocator( const cudaUSMAllocator<U>& ) noexcept {}

  // destructs the USM allocator object
  ~cudaUSMAllocator() noexcept {}

  // returns the address of x
  pointer address( reference x ) { return &x; }

  // returns the address of x
  const_pointer address( const_reference x ) const { return &x; }

  // allocates managed storage for n elements of T;
  // throws via TF_CHECK_CUDA on failure (the hint parameter is ignored)
  pointer allocate( size_type n, std::allocator<void>::const_pointer = 0 )
  {
    void* ptr {nullptr};
    TF_CHECK_CUDA(
      cudaMallocManaged( &ptr, n*sizeof(T) ),
      "failed to allocate ", n, " elements (", n*sizeof(T), "bytes)"
    )
    return static_cast<pointer>(ptr);
  }

  // releases a block of storage previously obtained from allocate;
  // null pointers are ignored
  void deallocate( pointer ptr, size_type )
  {
    if(ptr){
      cudaFree(ptr);
    }
  }

  // returns the maximum number of elements that could potentially be
  // allocated (was `size_type {-1}`, a narrowing brace-init that is
  // ill-formed; replaced with the equivalent well-formed expression)
  size_type max_size() const noexcept {
    return std::numeric_limits<size_type>::max();
  }

  // constructs an element object in place at ptr (USM is host-accessible)
  void construct( pointer ptr, const_reference val ) {
    new ((void*)ptr) value_type(val);
  }

  // destroys in-place the object pointed to by ptr
  void destroy( pointer ptr ) {
    ptr->~value_type();
  }

  // compares two allocators of different types using ==
  // (stateless allocators always compare equal)
  template <typename U>
  bool operator == (const cudaUSMAllocator<U>&) const noexcept {
    return true;
  }

  // compares two allocators of different types using !=
  template <typename U>
  bool operator != (const cudaUSMAllocator<U>&) const noexcept {
    return false;
  }

};
781
782// ----------------------------------------------------------------------------
783// GPU vector object
784// ----------------------------------------------------------------------------
785
786//template <typename T>
787//using cudaDeviceVector = std::vector<NoInit<T>, cudaDeviceAllocator<NoInit<T>>>;
788
789//template <typename T>
790//using cudaUSMVector = std::vector<T, cudaUSMAllocator<T>>;
791
/**
@class cudaDeviceVector
@brief RAII wrapper around a raw device buffer of N elements of type T

Move-only: copying is disabled because the device memory has unique
ownership. Fix: the move-assignment operator now guards against self-move;
previously it freed _data and then re-adopted the dangling pointer
(and zeroed it) when assigned to itself.
*/
template <typename T>
class cudaDeviceVector {

  public:

    // constructs an empty vector (no device allocation)
    cudaDeviceVector() = default;

    // constructs a vector of N elements on the caller's current device;
    // throws via TF_CHECK_CUDA if the allocation fails (N == 0 allocates
    // nothing and leaves data() null)
    cudaDeviceVector(size_t N) : _N {N} {
      if(N) {
        TF_CHECK_CUDA(
          cudaMalloc(&_data, N*sizeof(T)),
          "failed to allocate device memory (", N*sizeof(T), " bytes)"
        );
      }
    }

    // move constructor: steals the buffer from rhs and leaves rhs empty
    cudaDeviceVector(cudaDeviceVector&& rhs) :
      _data{rhs._data}, _N {rhs._N} {
      rhs._data = nullptr;
      rhs._N = 0;
    }

    // releases the owned device memory, if any
    ~cudaDeviceVector() {
      if(_data) {
        cudaFree(_data);
      }
    }

    // move assignment: frees the current buffer, steals rhs's buffer,
    // and leaves rhs empty; self-move is a no-op
    cudaDeviceVector& operator = (cudaDeviceVector&& rhs) {
      if(this != &rhs) {
        if(_data) {
          cudaFree(_data);
        }
        _data = rhs._data;
        _N = rhs._N;
        rhs._data = nullptr;
        rhs._N = 0;
      }
      return *this;
    }

    // number of elements held
    size_t size() const { return _N; }

    // raw device pointer (nullptr when empty)
    T* data() { return _data; }
    const T* data() const { return _data; }

    cudaDeviceVector(const cudaDeviceVector&) = delete;
    cudaDeviceVector& operator = (const cudaDeviceVector&) = delete;

  private:

    T* _data {nullptr};  // device pointer owned by this vector
    size_t _N {0};       // number of elements
};
847
848
849} // end of namespace tf -----------------------------------------------------
850
851
852
853
854
855
class to create a CUDA device allocator
Definition cuda_memory.hpp:393
size_type max_size() const noexcept
returns the maximum number of elements that could potentially be allocated by this allocator
Definition cuda_memory.hpp:535
bool operator==(const cudaDeviceAllocator< U > &) const noexcept
compares two allocators of different types using ==
Definition cuda_memory.hpp:555
~cudaDeviceAllocator() noexcept
Destructs the device allocator object.
Definition cuda_memory.hpp:463
void construct(pointer, const_reference)
ignored to avoid de-referencing device pointer from the host
Definition cuda_memory.hpp:540
T value_type
element type
Definition cuda_memory.hpp:400
pointer address(reference x)
Returns the address of x.
Definition cuda_memory.hpp:473
const T * const_pointer
const element pointer type
Definition cuda_memory.hpp:415
const T & const_reference
constant element reference type
Definition cuda_memory.hpp:420
cudaDeviceAllocator() noexcept
Constructs a device allocator object.
Definition cuda_memory.hpp:446
T * pointer
element pointer type
Definition cuda_memory.hpp:405
cudaDeviceAllocator(const cudaDeviceAllocator &) noexcept
Constructs a device allocator object from another device allocator object.
Definition cuda_memory.hpp:451
const_pointer address(const_reference x) const
Returns the address of x.
Definition cuda_memory.hpp:483
cudaDeviceAllocator(const cudaDeviceAllocator< U > &) noexcept
Constructs a device allocator object from another device allocator object with a different element type.
Definition cuda_memory.hpp:458
pointer allocate(size_type n, std::allocator< void >::const_pointer=0)
allocates block of storage.
Definition cuda_memory.hpp:501
void deallocate(pointer ptr, size_type)
Releases a block of storage previously allocated with member allocate and not yet released.
Definition cuda_memory.hpp:518
bool operator!=(const cudaDeviceAllocator< U > &) const noexcept
compares two allocators of different types using !=
Definition cuda_memory.hpp:567
T & reference
element reference type
Definition cuda_memory.hpp:410
void destroy(pointer)
ignored to avoid de-referencing device pointer from the host
Definition cuda_memory.hpp:545
class to create an RAII-styled context switch
Definition cuda_device.hpp:293
class to create a unified shared memory (USM) allocator
Definition cuda_memory.hpp:589
void deallocate(pointer ptr, size_type)
Releases a block of storage previously allocated with member allocate and not yet released.
Definition cuda_memory.hpp:714
cudaUSMAllocator() noexcept
Constructs a USM allocator object.
Definition cuda_memory.hpp:642
pointer address(reference x)
Returns the address of x.
Definition cuda_memory.hpp:669
cudaUSMAllocator(const cudaUSMAllocator< U > &) noexcept
Constructs a USM allocator object from another USM allocator object with a different element type.
Definition cuda_memory.hpp:654
const T * const_pointer
const element pointer type
Definition cuda_memory.hpp:611
void destroy(pointer ptr)
destroys in-place the object pointed by ptr
Definition cuda_memory.hpp:752
pointer allocate(size_type n, std::allocator< void >::const_pointer=0)
allocates block of storage.
Definition cuda_memory.hpp:697
T value_type
element type
Definition cuda_memory.hpp:596
T * pointer
element pointer type
Definition cuda_memory.hpp:601
cudaUSMAllocator(const cudaUSMAllocator &) noexcept
Constructs a USM allocator object from another USM allocator object.
Definition cuda_memory.hpp:647
~cudaUSMAllocator() noexcept
Destructs the device allocator object.
Definition cuda_memory.hpp:659
void construct(pointer ptr, const_reference val)
Constructs an element object on the location pointed by ptr.
Definition cuda_memory.hpp:740
size_type max_size() const noexcept
returns the maximum number of elements that could potentially be allocated by this allocator
Definition cuda_memory.hpp:731
bool operator!=(const cudaUSMAllocator< U > &) const noexcept
compares two allocators of different types using !=
Definition cuda_memory.hpp:776
T & reference
element reference type
Definition cuda_memory.hpp:606
const_pointer address(const_reference x) const
Returns the address of x.
Definition cuda_memory.hpp:679
bool operator==(const cudaUSMAllocator< U > &) const noexcept
compares two allocators of different types using ==
Definition cuda_memory.hpp:764
const T & const_reference
constant element reference type
Definition cuda_memory.hpp:616
CUDA device utilities include file.
taskflow namespace
Definition small_vector.hpp:27
size_t cuda_get_free_mem(int d)
queries the free memory (expensive call)
Definition cuda_memory.hpp:19
T * cuda_malloc_device(size_t N, int d)
allocates memory on the given device for holding N elements of type T
Definition cuda_memory.hpp:48
size_t cuda_get_total_mem(int d)
queries the total available memory (expensive call)
Definition cuda_memory.hpp:31
void cuda_memset_async(cudaStream_t stream, void *devPtr, int value, size_t count)
initializes or sets GPU memory to the given value byte by byte
Definition cuda_memory.hpp:153
void cuda_memcpy_async(cudaStream_t stream, void *dst, const void *src, size_t count)
copies data between host and device asynchronously through a stream
Definition cuda_memory.hpp:132
void cuda_free(T *ptr, int d)
frees memory on the GPU device
Definition cuda_memory.hpp:101
T * cuda_malloc_shared(size_t N)
allocates shared memory for holding N elements of type T
Definition cuda_memory.hpp:81
its member type U is the equivalent allocator type to allocate elements of type U
Definition cuda_memory.hpp:436
its member type U is the equivalent allocator type to allocate elements of type U
Definition cuda_memory.hpp:632