Dragon - C++ API
A Computation Graph Virtual Machine Based Deep Learning Framework
cuda_device.h
Go to the documentation of this file.
1 
13 #ifndef DRAGON_UTILS_CUDA_DEVICE_H_
14 #define DRAGON_UTILS_CUDA_DEVICE_H_
15 
16 #ifdef WITH_CUDA
17 #include <cuda.h>
18 #include <cublas.h>
19 #include <curand.h>
20 #include <cuda_runtime.h>
21 #include <device_launch_parameters.h>
22 #endif
23 
24 #ifdef WITH_NCCL
25 #include <nccl.h>
26 #endif
27 
28 #include "core/common.h"
29 
30 namespace dragon {
31 
32 #ifdef WITH_CUDA
33 
41 const int CUDA_THREADS = 1024;
42 
49 const int CUDA_MAX_BLOCKS = 65535;
50 
51 // You really need a NVIDIA DGX-2 !!! :-)
52 #define CUDA_MAX_DEVICES 16
53 
54 #define CUDA_VERSION_MIN(major, minor, patch) \
55  (CUDA_VERSION >= (major * 1000 + minor * 100 + patch))
56 
57 #define CUDA_VERSION_MAX(major, minor, patch) \
58  (CUDA_VERSION < (major * 1000 + minor * 100 + patch))
59 
60 #define CUDA_CHECK(condition) \
61  do { \
62  cudaError_t error = condition; \
63  CHECK_EQ(error, cudaSuccess) \
64  << "\n" << cudaGetErrorString(error); \
65  } while (0)
66 
67 #define CUBLAS_CHECK(condition) \
68  do { \
69  cublasStatus_t status = condition; \
70  CHECK_EQ(status, CUBLAS_STATUS_SUCCESS); \
71  } while (0)
72 
73 #define CURAND_CHECK(condition) \
74  do { \
75  curandStatus_t status = condition; \
76  CHECK_EQ(status, CURAND_STATUS_SUCCESS); \
77  } while (0)
78 
79 #ifdef WITH_NCCL
80 #define NCCL_CHECK(condition) \
81  do { \
82  ncclResult_t status = condition; \
83  CHECK_EQ(status, ncclSuccess) \
84  << "\n" << ncclGetErrorString(status); \
85  } while (0)
86 #endif // WITH_NCCL
87 
88 #define CUDA_1D_KERNEL_LOOP(i, n) \
89  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; \
90  i < n; i += blockDim.x * gridDim.x)
91 
92 #define CUDA_2D_KERNEL_LOOP1(i, n) \
93  for (size_t i = blockIdx.x; i < n; i += gridDim.x)
94 
95 #define CUDA_2D_KERNEL_LOOP2(j, m) \
96  for (size_t j = threadIdx.x; j < m; j += blockDim.x)
97 
98 inline int CUDA_BLOCKS(const int N) {
99  return std::max(
100  std::min(
101  (N + CUDA_THREADS - 1) / CUDA_THREADS,
103  ), 1);
104 }
105 
106 inline int CUDA_2D_BLOCKS(const int N) {
107  return std::max(std::min(N, CUDA_MAX_BLOCKS), 1);
108 }
109 
110 #if CUDA_VERSION_MAX(9, 0, 0)
111 #define __hdiv hdiv
112 #endif
113 
114 inline int CUDA_NUM_DEVICES() {
115  static int count = -1;
116  if (count < 0) {
117  auto err = cudaGetDeviceCount(&count);
118  if (err == cudaErrorNoDevice ||
119  err == cudaErrorInsufficientDriver) count = 0;
120  }
121  return count;
122 }
123 
124 inline int CUDA_GET_DEVICE() {
125  int device_id;
126  cudaGetDevice(&device_id);
127  return device_id;
128 }
129 
132  for (int i = 0; i < CUDA_NUM_DEVICES(); ++i)
133  CUDA_CHECK(cudaGetDeviceProperties(&props[i], i));
134  }
135  vector<cudaDeviceProp> props;
136 };
137 
138 inline const cudaDeviceProp& GetCUDADeviceProp(
139  int device_id) {
140  static CUDADeviceProps props;
141  CHECK_LT(device_id, (int)props.props.size())
142  << "\nInvalid device id: " << device_id
143  << "\nDetected " << props.props.size()
144  << " devices.";
145  return props.props[device_id];
146 }
147 
149  int device = CUDA_GET_DEVICE();
150  auto& prop = GetCUDADeviceProp(device);
151  return prop.major >= 6;
152 }
153 
154 inline bool TENSOR_CORE_AVAILABLE() {
155 #if CUDA_VERSION < 9000
156  return false;
157 #else
158  int device = CUDA_GET_DEVICE();
159  auto& prop = GetCUDADeviceProp(device);
160  return prop.major >= 7;
161 #endif
162 }
163 
165  public:
166  CUDADeviceGuard(int new_id)
167  : prev_id_(CUDA_GET_DEVICE()) {
168  if (prev_id_ != new_id) {
169  CUDA_CHECK(cudaSetDevice(new_id));
170  }
171  }
172 
174  CUDA_CHECK(cudaSetDevice(prev_id_));
175  }
176 
177  private:
178  int prev_id_;
179 };
180 
181 #else
182 
183 #define CUDA_NOT_COMPILED \
184  LOG(FATAL) << "CUDA was not compiled."
185 
186 #endif // WITH_CUDA
187 
188 } // namespace dragon
189 
190 #endif // DRAGON_UTILS_CUDA_DEVICE_H_
int CUDA_GET_DEVICE()
Definition: cuda_device.h:124
~CUDADeviceGuard()
Definition: cuda_device.h:173
bool TENSOR_CORE_AVAILABLE()
Definition: cuda_device.h:154
int CUDA_BLOCKS(const int N)
Definition: cuda_device.h:98
#define CUDA_CHECK(condition)
Definition: cuda_device.h:60
CUDADeviceProps()
Definition: cuda_device.h:131
bool CUDA_TRUE_FP16_AVAILABLE()
Definition: cuda_device.h:148
Definition: cuda_device.h:164
#define CHECK_LT(val1, val2)
Definition: logging.h:52
const cudaDeviceProp & GetCUDADeviceProp(int device_id)
Definition: cuda_device.h:138
CUDADeviceGuard(int new_id)
Definition: cuda_device.h:166
Definition: cuda_device.h:130
vector< cudaDeviceProp > props
Definition: cuda_device.h:135
int CUDA_2D_BLOCKS(const int N)
Definition: cuda_device.h:106
int CUDA_NUM_DEVICES()
Definition: cuda_device.h:114
Definition: common.h:41
const int CUDA_THREADS
Definition: cuda_device.h:41
const int CUDA_MAX_BLOCKS
Definition: cuda_device.h:49