23 cudaMemGetInfo(&free, &total),
"failed to get mem info on device ", d
35 cudaMemGetInfo(&free, &total),
"failed to get mem info on device ", d
52 cudaMalloc(&ptr, N*
sizeof(T)),
53 "failed to allocate memory (", N*
sizeof(T),
"bytes) on device ", d
68 cudaMalloc(&ptr, N*
sizeof(T)),
69 "failed to allocate memory (", N*
sizeof(T),
"bytes)"
84 cudaMallocManaged(&ptr, N*
sizeof(T)),
85 "failed to allocate shared memory (", N*
sizeof(T),
"bytes)"
103 TF_CHECK_CUDA(cudaFree(ptr),
"failed to free memory ", ptr,
" on GPU ", d);
117 TF_CHECK_CUDA(cudaFree(ptr),
"failed to free memory ", ptr);
133 cudaStream_t stream,
void* dst,
const void* src,
size_t count
136 cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),
137 "failed to perform cudaMemcpyAsync"
154 cudaStream_t stream,
void* devPtr,
int value,
size_t count
157 cudaMemsetAsync(devPtr, value, count, stream),
158 "failed to perform cudaMemsetAsync"
207struct cudaSharedMemory
212 extern __device__
void error(
void);
226struct cudaSharedMemory <int>
228 __device__
int *get()
230 extern __shared__
int s_int[];
239struct cudaSharedMemory <unsigned int>
241 __device__
unsigned int *get()
243 extern __shared__
unsigned int s_uint[];
252struct cudaSharedMemory <char>
254 __device__
char *get()
256 extern __shared__
char s_char[];
265struct cudaSharedMemory <unsigned char>
267 __device__
unsigned char *get()
269 extern __shared__
unsigned char s_uchar[];
278struct cudaSharedMemory <short>
280 __device__
short *get()
282 extern __shared__
short s_short[];
291struct cudaSharedMemory <unsigned short>
293 __device__
unsigned short *get()
295 extern __shared__
unsigned short s_ushort[];
304struct cudaSharedMemory <long>
306 __device__
long *get()
308 extern __shared__
long s_long[];
317struct cudaSharedMemory <unsigned long>
319 __device__
unsigned long *get()
321 extern __shared__
unsigned long s_ulong[];
340struct cudaSharedMemory <bool>
342 __device__
bool *get()
344 extern __shared__
bool s_bool[];
353struct cudaSharedMemory <float>
355 __device__
float *get()
357 extern __shared__
float s_float[];
366struct cudaSharedMemory <double>
368 __device__
double *get()
370 extern __shared__
double s_double[];
505 cudaMalloc( &ptr, n*
sizeof(T) ),
506 "failed to allocate ", n,
" elements (", n*
sizeof(T),
"bytes)"
508 return static_cast<pointer>(ptr);
554 template <
typename U>
566 template <
typename U>
701 cudaMallocManaged( &ptr, n*
sizeof(T) ),
702 "failed to allocate ", n,
" elements (", n*
sizeof(T),
"bytes)"
704 return static_cast<pointer>(ptr);
763 template <
typename U>
775 template <
typename U>
796class cudaDeviceVector {
800 cudaDeviceVector() =
default;
802 cudaDeviceVector(
size_t N) : _N {N} {
805 cudaMalloc(&_data, N*
sizeof(T)),
806 "failed to allocate device memory (", N*
sizeof(T),
" bytes)"
811 cudaDeviceVector(cudaDeviceVector&& rhs) :
812 _data{rhs._data}, _N {rhs._N} {
817 ~cudaDeviceVector() {
823 cudaDeviceVector& operator = (cudaDeviceVector&& rhs) {
834 size_t size()
const {
return _N; }
836 T* data() {
return _data; }
837 const T* data()
const {
return _data; }
839 cudaDeviceVector(
const cudaDeviceVector&) =
delete;
840 cudaDeviceVector& operator = (
const cudaDeviceVector&) =
delete;
class to create a CUDA device allocator
Definition cuda_memory.hpp:393
size_type max_size() const noexcept
returns the maximum number of elements that could potentially be allocated by this allocator
Definition cuda_memory.hpp:535
bool operator==(const cudaDeviceAllocator< U > &) const noexcept
compares two allocators of different types using ==
Definition cuda_memory.hpp:555
~cudaDeviceAllocator() noexcept
Destructs the device allocator object.
Definition cuda_memory.hpp:463
void construct(pointer, const_reference)
ignored to avoid de-referencing device pointer from the host
Definition cuda_memory.hpp:540
T value_type
element type
Definition cuda_memory.hpp:400
pointer address(reference x)
Returns the address of x.
Definition cuda_memory.hpp:473
const T * const_pointer
const element pointer type
Definition cuda_memory.hpp:415
const T & const_reference
constant element reference type
Definition cuda_memory.hpp:420
cudaDeviceAllocator() noexcept
Constructs a device allocator object.
Definition cuda_memory.hpp:446
T * pointer
element pointer type
Definition cuda_memory.hpp:405
cudaDeviceAllocator(const cudaDeviceAllocator &) noexcept
Constructs a device allocator object from another device allocator object.
Definition cuda_memory.hpp:451
const_pointer address(const_reference x) const
Returns the address of x.
Definition cuda_memory.hpp:483
cudaDeviceAllocator(const cudaDeviceAllocator< U > &) noexcept
Constructs a device allocator object from another device allocator object with a different element type.
Definition cuda_memory.hpp:458
pointer allocate(size_type n, std::allocator< void >::const_pointer=0)
allocates block of storage.
Definition cuda_memory.hpp:501
void deallocate(pointer ptr, size_type)
Releases a block of storage previously allocated with member allocate and not yet released.
Definition cuda_memory.hpp:518
bool operator!=(const cudaDeviceAllocator< U > &) const noexcept
compares two allocators of different types using !=
Definition cuda_memory.hpp:567
T & reference
element reference type
Definition cuda_memory.hpp:410
void destroy(pointer)
ignored to avoid de-referencing device pointer from the host
Definition cuda_memory.hpp:545
class to create an RAII-styled context switch
Definition cuda_device.hpp:293
class to create a unified shared memory (USM) allocator
Definition cuda_memory.hpp:589
void deallocate(pointer ptr, size_type)
Releases a block of storage previously allocated with member allocate and not yet released.
Definition cuda_memory.hpp:714
cudaUSMAllocator() noexcept
Constructs a device allocator object.
Definition cuda_memory.hpp:642
pointer address(reference x)
Returns the address of x.
Definition cuda_memory.hpp:669
cudaUSMAllocator(const cudaUSMAllocator< U > &) noexcept
Constructs a device allocator object from another device allocator object with a different element type.
Definition cuda_memory.hpp:654
const T * const_pointer
const element pointer type
Definition cuda_memory.hpp:611
void destroy(pointer ptr)
destroys in-place the object pointed by ptr
Definition cuda_memory.hpp:752
pointer allocate(size_type n, std::allocator< void >::const_pointer=0)
allocates block of storage.
Definition cuda_memory.hpp:697
T value_type
element type
Definition cuda_memory.hpp:596
T * pointer
element pointer type
Definition cuda_memory.hpp:601
cudaUSMAllocator(const cudaUSMAllocator &) noexcept
Constructs a device allocator object from another device allocator object.
Definition cuda_memory.hpp:647
~cudaUSMAllocator() noexcept
Destructs the device allocator object.
Definition cuda_memory.hpp:659
void construct(pointer ptr, const_reference val)
Constructs an element object on the location pointed by ptr.
Definition cuda_memory.hpp:740
size_type max_size() const noexcept
returns the maximum number of elements that could potentially be allocated by this allocator
Definition cuda_memory.hpp:731
bool operator!=(const cudaUSMAllocator< U > &) const noexcept
compares two allocators of different types using !=
Definition cuda_memory.hpp:776
T & reference
element reference type
Definition cuda_memory.hpp:606
const_pointer address(const_reference x) const
Returns the address of x.
Definition cuda_memory.hpp:679
bool operator==(const cudaUSMAllocator< U > &) const noexcept
compares two allocators of different types using ==
Definition cuda_memory.hpp:764
const T & const_reference
constant element reference type
Definition cuda_memory.hpp:616
CUDA device utilities include file.
taskflow namespace
Definition small_vector.hpp:27
size_t cuda_get_free_mem(int d)
queries the free memory (expensive call)
Definition cuda_memory.hpp:19
T * cuda_malloc_device(size_t N, int d)
allocates memory on the given device for holding N elements of type T
Definition cuda_memory.hpp:48
size_t cuda_get_total_mem(int d)
queries the total available memory (expensive call)
Definition cuda_memory.hpp:31
void cuda_memset_async(cudaStream_t stream, void *devPtr, int value, size_t count)
initializes or sets GPU memory to the given value byte by byte
Definition cuda_memory.hpp:153
void cuda_memcpy_async(cudaStream_t stream, void *dst, const void *src, size_t count)
copies data between host and device asynchronously through a stream
Definition cuda_memory.hpp:132
void cuda_free(T *ptr, int d)
frees memory on the GPU device
Definition cuda_memory.hpp:101
T * cuda_malloc_shared(size_t N)
allocates shared memory for holding N elements of type T
Definition cuda_memory.hpp:81
its member type U is the equivalent allocator type to allocate elements of type U
Definition cuda_memory.hpp:436
its member type U is the equivalent allocator type to allocate elements of type U
Definition cuda_memory.hpp:632