hello-world/docs/cuda__device_8hpp_source.html

#pragma once


#include "cuda_error.hpp"


namespace tf {


/**
@brief queries the number of available devices

@return the count written by @c cudaGetDeviceCount, converted to @c size_t

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_num_devices() {
  int count{0};
  TF_CHECK_CUDA(
    cudaGetDeviceCount(&count), "failed to get device count"
  );
  return static_cast<size_t>(count);
}


/**
@brief gets the current device associated with the caller thread

@return the device id written by @c cudaGetDevice

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_device() {
  // Zero-initialized like every other query in this file, so the returned
  // value is never indeterminate.
  int id = 0;
  TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id");
  return id;
}


/**
@brief switches to a given device context

@param id device id passed to @c cudaSetDevice

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline void cuda_set_device(int id) {
  TF_CHECK_CUDA(
    cudaSetDevice(id),
    "failed to switch to device ", id
  );
}


/**
@brief obtains the device property

@param i device id passed to @c cudaGetDeviceProperties
@param p structure that receives the properties of device @c i

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline void cuda_get_device_property(int i, cudaDeviceProp& p) {
  TF_CHECK_CUDA(cudaGetDeviceProperties(&p, i),
                "failed to get property of device ", i);
}


/**
@brief obtains the device property and returns it by value

@param i device id passed to @c cudaGetDeviceProperties

@return the @c cudaDeviceProp structure filled in for device @c i

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline cudaDeviceProp cuda_get_device_property(int i) {
  cudaDeviceProp prop;
  TF_CHECK_CUDA(cudaGetDeviceProperties(&prop, i),
                "failed to get property of device ", i);
  return prop;
}


/**
@brief dumps the device property

@param os output stream that receives the report
@param p device property structure to print

Writes one "label: value" line per field. Every label is padded to the same
width so the values line up in a column. The block and grid limits are
printed as three numbers joined by 'x'.
*/
inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) {

  os << "Major revision number:         " << p.major << '\n'
     << "Minor revision number:         " << p.minor << '\n'
     << "Name:                          " << p.name  << '\n'
     << "Total global memory:           " << p.totalGlobalMem << '\n'
     << "Total shared memory per block: " << p.sharedMemPerBlock << '\n'
     << "Total registers per block:     " << p.regsPerBlock << '\n'
     << "Warp size:                     " << p.warpSize << '\n'
     << "Maximum memory pitch:          " << p.memPitch << '\n'
     << "Maximum threads per block:     " << p.maxThreadsPerBlock << '\n';

  os << "Maximum dimension of block:    ";
  for (int i = 0; i < 3; ++i) {
    // separator goes between components only, not before the first one
    if(i) os << 'x';
    os << p.maxThreadsDim[i];
  }
  os << '\n';

  // label spelling fixed ("dimenstion" -> "dimension"); one extra space of
  // padding keeps the value column aligned with the other lines
  os << "Maximum dimension of grid:     ";
  for (int i = 0; i < 3; ++i) {
    if(i) os << 'x';
    os << p.maxGridSize[i];
  }
  os << '\n';

  os << "Clock rate:                    " << p.clockRate << '\n'
     << "Total constant memory:         " << p.totalConstMem << '\n'
     << "Texture alignment:             " << p.textureAlignment << '\n'
     << "Concurrent copy and execution: " << p.deviceOverlap << '\n'
     << "Number of multiprocessors:     " << p.multiProcessorCount << '\n'
     << "Kernel execution timeout:      " << p.kernelExecTimeoutEnabled << '\n'
     << "GPU sharing Host Memory:       " << p.integrated << '\n'
     << "Host page-locked mem mapping:  " << p.canMapHostMemory << '\n'
     << "Alignment for Surfaces:        " << p.surfaceAlignment << '\n'
     << "Device has ECC support:        " << p.ECCEnabled << '\n'
     << "Unified Addressing (UVA):      " << p.unifiedAddressing << '\n';
}


/**
@brief queries the maximum threads per block on a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrMaxThreadsPerBlock, converted to @c size_t

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_threads_per_block(int d) {
  int threads = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, d),
    "failed to query the maximum threads per block on device ", d
  );
  return static_cast<size_t>(threads);
}


/**
@brief queries the maximum x-dimension per block on a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrMaxBlockDimX, converted to @c size_t

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_x_dim_per_block(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimX, d),
    "failed to query the maximum x-dimension per block on device ", d
  );
  return static_cast<size_t>(dim);
}


/**
@brief queries the maximum y-dimension per block on a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrMaxBlockDimY, converted to @c size_t

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_y_dim_per_block(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimY, d),
    "failed to query the maximum y-dimension per block on device ", d
  );
  return static_cast<size_t>(dim);
}


/**
@brief queries the maximum z-dimension per block on a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrMaxBlockDimZ, converted to @c size_t

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_z_dim_per_block(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimZ, d),
    "failed to query the maximum z-dimension per block on device ", d
  );
  return static_cast<size_t>(dim);
}


/**
@brief queries the maximum x-dimension per grid on a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrMaxGridDimX, converted to @c size_t

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_x_dim_per_grid(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimX, d),
    "failed to query the maximum x-dimension per grid on device ", d
  );
  return static_cast<size_t>(dim);
}


/**
@brief queries the maximum y-dimension per grid on a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrMaxGridDimY, converted to @c size_t

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_y_dim_per_grid(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimY, d),
    "failed to query the maximum y-dimension per grid on device ", d
  );
  return static_cast<size_t>(dim);
}


/**
@brief queries the maximum z-dimension per grid on a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrMaxGridDimZ, converted to @c size_t

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_z_dim_per_grid(int d) {
  int dim = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimZ, d),
    "failed to query the maximum z-dimension per grid on device ", d
  );
  return static_cast<size_t>(dim);
}


/**
@brief queries the maximum shared memory size in bytes per block on a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrMaxSharedMemoryPerBlock, converted to
        @c size_t

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_max_shm_per_block(int d) {
  int num = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&num, cudaDevAttrMaxSharedMemoryPerBlock, d),
    "failed to query the maximum shared memory per block on device ", d
  );
  return static_cast<size_t>(num);
}


/**
@brief queries the warp size on a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrWarpSize, converted to @c size_t

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline size_t cuda_get_device_warp_size(int d) {
  int num = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&num, cudaDevAttrWarpSize, d),
    "failed to query the warp size per block on device ", d
  );
  return static_cast<size_t>(num);
}


/**
@brief queries the major number of compute capability of a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrComputeCapabilityMajor

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_device_compute_capability_major(int d) {
  int num = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMajor, d),
    "failed to query the major number of compute capability of device ", d
  );
  return num;
}


/**
@brief queries the minor number of compute capability of a device

@param d device id passed to @c cudaDeviceGetAttribute

@return the value of @c cudaDevAttrComputeCapabilityMinor

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_device_compute_capability_minor(int d) {
  int num = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMinor, d),
    "failed to query the minor number of compute capability of device ", d
  );
  return num;
}


/**
@brief queries if the device supports unified addressing

@param d device id passed to @c cudaDeviceGetAttribute

@return @c true when the value of @c cudaDevAttrUnifiedAddressing is non-zero

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline bool cuda_get_device_unified_addressing(int d) {
  int num = 0;
  TF_CHECK_CUDA(
    cudaDeviceGetAttribute(&num, cudaDevAttrUnifiedAddressing, d),
    "failed to query unified addressing status on device ", d
  );
  // the attribute is reported as an int; make the conversion to bool explicit
  return num != 0;
}


// ----------------------------------------------------------------------------

// CUDA Version

// ----------------------------------------------------------------------------


/**
@brief queries the latest CUDA version (1000 * major + 10 * minor)
       supported by the driver

@return the value written by @c cudaDriverGetVersion

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_driver_version() {
  int version{0};
  TF_CHECK_CUDA(cudaDriverGetVersion(&version),
                "failed to query the latest cuda version supported by the driver");
  return version;
}


/**
@brief queries the CUDA Runtime version (1000 * major + 10 * minor)

@return the value written by @c cudaRuntimeGetVersion

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline int cuda_get_runtime_version() {
  int version{0};
  TF_CHECK_CUDA(cudaRuntimeGetVersion(&version),
                "failed to query cuda runtime version");
  return version;
}


// ----------------------------------------------------------------------------

// cudaScopedDevice

// ----------------------------------------------------------------------------


/**
@class cudaScopedDevice

@brief class to create an RAII-styled context switch

The constructor makes the requested device current and the destructor
switches back to the device that was current before, if they differ.
*/
class cudaScopedDevice {

  public:

    /**
    @brief constructs a guard that switches to the given device

    @param device device context to scope to
    */
    explicit cudaScopedDevice(int device);

    /**
    @brief destructs the guard and switches back to the previous device context
    */
    ~cudaScopedDevice();

  private:

    // a guard cannot be default-constructed, copied, or moved
    cudaScopedDevice(cudaScopedDevice&&) = delete;
    cudaScopedDevice(const cudaScopedDevice&) = delete;
    cudaScopedDevice() = delete;

    // device that was current when the guard was constructed, or -1 when the
    // requested device was already current and nothing has to be restored
    int _p;
};


// Constructor


/**
Records the currently active device in @c _p and, when it differs from
@c dev, switches to @c dev. If @c dev is already active, @c _p is set to -1
so that the destructor has nothing to restore.

A failing CUDA call is reported through @c TF_CHECK_CUDA.
*/
inline cudaScopedDevice::cudaScopedDevice(int dev) {
  TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope");

  if(_p != dev) {
    // a different device is active: make the requested one current
    TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev);
  }
  else {
    // already on the requested device: mark "no switch back needed"
    _p = -1;
  }
}


// Destructor


/**
Switches back to the device recorded by the constructor, unless the
constructor stored -1 because no switch took place.
*/
inline cudaScopedDevice::~cudaScopedDevice() {
  if(_p == -1) {
    return;
  }
  // The status returned by cudaSetDevice is deliberately not checked here;
  // the TF_CHECK_CUDA wrapper is intentionally not used in the destructor.
  cudaSetDevice(_p);
}


}  // end of namespace tf -----------------------------------------------------


std::ostream

tf::cudaScopedDevice
class to create an RAII-styled context switch
Definition cuda_device.hpp:293

tf::cudaScopedDevice::~cudaScopedDevice
~cudaScopedDevice()
destructs the guard and switches back to the previous device context
Definition cuda_device.hpp:330

tf
taskflow namespace
Definition small_vector.hpp:27

tf::cuda_get_device_max_z_dim_per_grid
size_t cuda_get_device_max_z_dim_per_grid(int d)
queries the maximum z-dimension per grid on a device
Definition cuda_device.hpp:174

tf::cuda_get_device_compute_capability_major
int cuda_get_device_compute_capability_major(int d)
queries the major number of compute capability of a device
Definition cuda_device.hpp:210

tf::cuda_get_device
int cuda_get_device()
gets the current device associated with the caller thread
Definition cuda_device.hpp:24

tf::cuda_get_runtime_version
int cuda_get_runtime_version()
queries the CUDA Runtime version (1000 * major + 10 * minor)
Definition cuda_device.hpp:262

tf::cuda_get_device_property
void cuda_get_device_property(int i, cudaDeviceProp &p)
obtains the device property
Definition cuda_device.hpp:40

tf::cuda_get_driver_version
int cuda_get_driver_version()
queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
Definition cuda_device.hpp:250

tf::cuda_get_device_max_z_dim_per_block
size_t cuda_get_device_max_z_dim_per_block(int d)
queries the maximum z-dimension per block on a device
Definition cuda_device.hpp:138

tf::cuda_get_device_max_x_dim_per_grid
size_t cuda_get_device_max_x_dim_per_grid(int d)
queries the maximum x-dimension per grid on a device
Definition cuda_device.hpp:150

tf::cuda_get_device_compute_capability_minor
int cuda_get_device_compute_capability_minor(int d)
queries the minor number of compute capability of a device
Definition cuda_device.hpp:222

tf::cuda_get_device_max_y_dim_per_grid
size_t cuda_get_device_max_y_dim_per_grid(int d)
queries the maximum y-dimension per grid on a device
Definition cuda_device.hpp:162

tf::cuda_get_device_max_y_dim_per_block
size_t cuda_get_device_max_y_dim_per_block(int d)
queries the maximum y-dimension per block on a device
Definition cuda_device.hpp:126

tf::cuda_get_device_max_threads_per_block
size_t cuda_get_device_max_threads_per_block(int d)
queries the maximum threads per block on a device
Definition cuda_device.hpp:102

tf::cuda_get_num_devices
size_t cuda_get_num_devices()
queries the number of available devices
Definition cuda_device.hpp:15

tf::cuda_get_device_unified_addressing
bool cuda_get_device_unified_addressing(int d)
queries if the device supports unified addressing
Definition cuda_device.hpp:234

tf::cuda_set_device
void cuda_set_device(int id)
switches to a given device context
Definition cuda_device.hpp:33

tf::cuda_get_device_warp_size
size_t cuda_get_device_warp_size(int d)
queries the warp size on a device
Definition cuda_device.hpp:198

tf::cuda_get_device_max_shm_per_block
size_t cuda_get_device_max_shm_per_block(int d)
queries the maximum shared memory size in bytes per block on a device
Definition cuda_device.hpp:186

tf::cuda_get_device_max_x_dim_per_block
size_t cuda_get_device_max_x_dim_per_block(int d)
queries the maximum x-dimension per block on a device
Definition cuda_device.hpp:114

tf::cuda_dump_device_property
void cuda_dump_device_property(std::ostream &os, const cudaDeviceProp &p)
dumps the device property
Definition cuda_device.hpp:60