Taskflow  3.2.0-Master-Branch
Loading...
Searching...
No Matches
cuda_device.hpp
Go to the documentation of this file.
1#pragma once
2
3#include "cuda_error.hpp"
4
10namespace tf {
11
15inline size_t cuda_get_num_devices() {
16 int N = 0;
17 TF_CHECK_CUDA(cudaGetDeviceCount(&N), "failed to get device count");
18 return static_cast<size_t>(N);
19}
20
24inline int cuda_get_device() {
25 int id;
26 TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id");
27 return id;
28}
29
33inline void cuda_set_device(int id) {
34 TF_CHECK_CUDA(cudaSetDevice(id), "failed to switch to device ", id);
35}
36
40inline void cuda_get_device_property(int i, cudaDeviceProp& p) {
41 TF_CHECK_CUDA(
42 cudaGetDeviceProperties(&p, i), "failed to get property of device ", i
43 );
44}
45
49inline cudaDeviceProp cuda_get_device_property(int i) {
50 cudaDeviceProp p;
51 TF_CHECK_CUDA(
52 cudaGetDeviceProperties(&p, i), "failed to get property of device ", i
53 );
54 return p;
55}
56
60inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) {
61
62 os << "Major revision number: " << p.major << '\n'
63 << "Minor revision number: " << p.minor << '\n'
64 << "Name: " << p.name << '\n'
65 << "Total global memory: " << p.totalGlobalMem << '\n'
66 << "Total shared memory per block: " << p.sharedMemPerBlock << '\n'
67 << "Total registers per block: " << p.regsPerBlock << '\n'
68 << "Warp size: " << p.warpSize << '\n'
69 << "Maximum memory pitch: " << p.memPitch << '\n'
70 << "Maximum threads per block: " << p.maxThreadsPerBlock << '\n';
71
72 os << "Maximum dimension of block: ";
73 for (int i = 0; i < 3; ++i) {
74 if(i) os << 'x';
75 os << p.maxThreadsDim[i];
76 }
77 os << '\n';
78
79 os << "Maximum dimenstion of grid: ";
80 for (int i = 0; i < 3; ++i) {
81 if(i) os << 'x';
82 os << p.maxGridSize[i];;
83 }
84 os << '\n';
85
86 os << "Clock rate: " << p.clockRate << '\n'
87 << "Total constant memory: " << p.totalConstMem << '\n'
88 << "Texture alignment: " << p.textureAlignment << '\n'
89 << "Concurrent copy and execution: " << p.deviceOverlap << '\n'
90 << "Number of multiprocessors: " << p.multiProcessorCount << '\n'
91 << "Kernel execution timeout: " << p.kernelExecTimeoutEnabled << '\n'
92 << "GPU sharing Host Memory: " << p.integrated << '\n'
93 << "Host page-locked mem mapping: " << p.canMapHostMemory << '\n'
94 << "Alignment for Surfaces: " << p.surfaceAlignment << '\n'
95 << "Device has ECC support: " << p.ECCEnabled << '\n'
96 << "Unified Addressing (UVA): " << p.unifiedAddressing << '\n';
97}
98
103 int threads = 0;
104 TF_CHECK_CUDA(
105 cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, d),
106 "failed to query the maximum threads per block on device ", d
107 )
108 return threads;
109}
110
115 int dim = 0;
116 TF_CHECK_CUDA(
117 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimX, d),
118 "failed to query the maximum x-dimension per block on device ", d
119 )
120 return dim;
121}
122
127 int dim = 0;
128 TF_CHECK_CUDA(
129 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimY, d),
130 "failed to query the maximum y-dimension per block on device ", d
131 )
132 return dim;
133}
134
139 int dim = 0;
140 TF_CHECK_CUDA(
141 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimZ, d),
142 "failed to query the maximum z-dimension per block on device ", d
143 )
144 return dim;
145}
146
151 int dim = 0;
152 TF_CHECK_CUDA(
153 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimX, d),
154 "failed to query the maximum x-dimension per grid on device ", d
155 )
156 return dim;
157}
158
163 int dim = 0;
164 TF_CHECK_CUDA(
165 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimY, d),
166 "failed to query the maximum y-dimension per grid on device ", d
167 )
168 return dim;
169}
170
175 int dim = 0;
176 TF_CHECK_CUDA(
177 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimZ, d),
178 "failed to query the maximum z-dimension per grid on device ", d
179 )
180 return dim;
181}
182
187 int num = 0;
188 TF_CHECK_CUDA(
189 cudaDeviceGetAttribute(&num, cudaDevAttrMaxSharedMemoryPerBlock, d),
190 "failed to query the maximum shared memory per block on device ", d
191 )
192 return num;
193}
194
198inline size_t cuda_get_device_warp_size(int d) {
199 int num = 0;
200 TF_CHECK_CUDA(
201 cudaDeviceGetAttribute(&num, cudaDevAttrWarpSize, d),
202 "failed to query the warp size per block on device ", d
203 )
204 return num;
205}
206
211 int num = 0;
212 TF_CHECK_CUDA(
213 cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMajor, d),
214 "failed to query the major number of compute capability of device ", d
215 )
216 return num;
217}
218
223 int num = 0;
224 TF_CHECK_CUDA(
225 cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMinor, d),
226 "failed to query the minor number of compute capability of device ", d
227 )
228 return num;
229}
230
235 int num = 0;
236 TF_CHECK_CUDA(
237 cudaDeviceGetAttribute(&num, cudaDevAttrUnifiedAddressing, d),
238 "failed to query unified addressing status on device ", d
239 )
240 return num;
241}
242
243// ----------------------------------------------------------------------------
244// CUDA Version
245// ----------------------------------------------------------------------------
246
251 int num = 0;
252 TF_CHECK_CUDA(
253 cudaDriverGetVersion(&num),
254 "failed to query the latest cuda version supported by the driver"
255 );
256 return num;
257}
258
263 int num = 0;
264 TF_CHECK_CUDA(
265 cudaRuntimeGetVersion(&num), "failed to query cuda runtime version"
266 );
267 return num;
268}
269
270// ----------------------------------------------------------------------------
271// cudaScopedDevice
272// ----------------------------------------------------------------------------
273
/**
@class cudaScopedDevice

@brief class to create an RAII-styled context switch

The constructor switches the calling thread to the requested device and
the destructor switches back to the device that was current before.
Default construction and copy construction are disabled.
*/
class cudaScopedDevice {

  public:

    /**
    @brief constructs a guard that switches to the given device

    @param device identifier of the device to switch to
    */
    explicit cudaScopedDevice(int device);

    /**
    @brief destructs the guard and switches back to the previous device context
    */
    ~cudaScopedDevice();

  private:

    cudaScopedDevice() = delete;
    cudaScopedDevice(const cudaScopedDevice&) = delete;

    // Device that was current at construction, or -1 when no switch back
    // is required.
    int _p;
};
317
318// Constructor
319inline cudaScopedDevice::cudaScopedDevice(int dev) {
320 TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope");
321 if(_p == dev) {
322 _p = -1;
323 }
324 else {
325 TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev);
326 }
327}
328
329// Destructor
331 if(_p != -1) {
332 cudaSetDevice(_p);
333 //TF_CHECK_CUDA(cudaSetDevice(_p), "failed to scope back to device ", _p);
334 }
335}
336
337} // end of namespace tf -----------------------------------------------------
338
339
340
341
342
class to create an RAII-styled context switch
Definition cuda_device.hpp:293
~cudaScopedDevice()
destructs the guard and switches back to the previous device context
Definition cuda_device.hpp:330
taskflow namespace
Definition small_vector.hpp:27
size_t cuda_get_device_max_z_dim_per_grid(int d)
queries the maximum z-dimension per grid on a device
Definition cuda_device.hpp:174
int cuda_get_device_compute_capability_major(int d)
queries the major number of compute capability of a device
Definition cuda_device.hpp:210
int cuda_get_device()
gets the current device associated with the caller thread
Definition cuda_device.hpp:24
int cuda_get_runtime_version()
queries the CUDA Runtime version (1000 * major + 10 * minor)
Definition cuda_device.hpp:262
void cuda_get_device_property(int i, cudaDeviceProp &p)
obtains the device property
Definition cuda_device.hpp:40
int cuda_get_driver_version()
queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
Definition cuda_device.hpp:250
size_t cuda_get_device_max_z_dim_per_block(int d)
queries the maximum z-dimension per block on a device
Definition cuda_device.hpp:138
size_t cuda_get_device_max_x_dim_per_grid(int d)
queries the maximum x-dimension per grid on a device
Definition cuda_device.hpp:150
int cuda_get_device_compute_capability_minor(int d)
queries the minor number of compute capability of a device
Definition cuda_device.hpp:222
size_t cuda_get_device_max_y_dim_per_grid(int d)
queries the maximum y-dimension per grid on a device
Definition cuda_device.hpp:162
size_t cuda_get_device_max_y_dim_per_block(int d)
queries the maximum y-dimension per block on a device
Definition cuda_device.hpp:126
size_t cuda_get_device_max_threads_per_block(int d)
queries the maximum threads per block on a device
Definition cuda_device.hpp:102
size_t cuda_get_num_devices()
queries the number of available devices
Definition cuda_device.hpp:15
bool cuda_get_device_unified_addressing(int d)
queries if the device supports unified addressing
Definition cuda_device.hpp:234
void cuda_set_device(int id)
switches to a given device context
Definition cuda_device.hpp:33
size_t cuda_get_device_warp_size(int d)
queries the warp size on a device
Definition cuda_device.hpp:198
size_t cuda_get_device_max_shm_per_block(int d)
queries the maximum shared memory size in bytes per block on a device
Definition cuda_device.hpp:186
size_t cuda_get_device_max_x_dim_per_block(int d)
queries the maximum x-dimension per block on a device
Definition cuda_device.hpp:114
void cuda_dump_device_property(std::ostream &os, const cudaDeviceProp &p)
dumps the device property
Definition cuda_device.hpp:60