// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
namespace Eigen {
// Default device for the machine (typically a single CPU core)
struct DefaultDevice {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
internal::aligned_free(buffer);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
::memcpy(dst, src, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
::memset(buffer, c, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
#ifndef __CUDA_ARCH__
// Running on the host CPU
return 1;
#else
// Running on a CUDA device
return 32;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const {
return 2 * numThreads();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
#ifndef __CUDA_ARCH__
// Running on the host CPU
return l1CacheSize();
#else
// Running on a CUDA device, return the amount of shared memory available.
return 48*1024;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
#ifndef __CUDA_ARCH__
// Running single threaded on the host CPU
return l3CacheSize();
#else
// Running on a CUDA device
return firstLevelCacheSize();
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
// Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU
return 1;
#else
// Running on a CUDA device
return __CUDA_ARCH__ / 100;
#endif
}
};
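// Usage sketch (illustrative only): a DefaultDevice manages plain host
// buffers; the buffer size below is arbitrary.
//
//   DefaultDevice device;
//   const size_t n = 128 * sizeof(float);
//   void* src = device.allocate(n);
//   void* dst = device.allocate(n);
//   device.memset(src, 0, n);
//   device.memcpy(dst, src, n);
//   device.deallocate(src);
//   device.deallocate(dst);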
// Multiple CPU cores
#ifdef EIGEN_USE_THREADS
#ifdef EIGEN_USE_CUSTOM_THREAD_POOL
typedef std::mutex mutex;
typedef std::condition_variable condition_variable;
typedef std::unique_lock<std::mutex> mutex_lock;
#else
typedef tensorflow::mutex mutex;
typedef tensorflow::condition_variable condition_variable;
typedef tensorflow::mutex_lock mutex_lock;
#endif
#ifdef EIGEN_USE_CUSTOM_THREAD_POOL
struct StlThreadEnvironment {
struct Task {
std::function<void()> f;
};
// The EnvThread constructor must start the thread and
// its destructor must join the thread.
class EnvThread {
public:
EnvThread(std::function<void()> f) : thr_(f) {}
~EnvThread() { thr_.join(); }
private:
std::thread thr_;
};
EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
void ExecuteTask(const Task& t) { t.f(); }
};
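// Usage sketch (illustrative only): StlThreadEnvironment is intended as the
// environment type of the custom thread pool; on its own it simply wraps
// std::thread creation and task execution.
//
//   StlThreadEnvironment env;
//   StlThreadEnvironment::Task task = env.CreateTask([] { /* work */ });
//   env.ExecuteTask(task);                    // runs inline
//   StlThreadEnvironment::EnvThread* t =
//       env.CreateThread([] { /* work */ });  // runs on a new thread
//   delete t;                                 // joins the thread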
#endif // EIGEN_USE_CUSTOM_THREAD_POOL
// Barrier is an object that allows one or more threads to wait until
// Notify has been called a specified number of times.
class Barrier {
public:
explicit Barrier(unsigned int count) : state_(count << 1), notified_(false) {
eigen_assert(((count << 1) >> 1) == count);
}
~Barrier() {
eigen_assert((state_>>1) == 0);
}
void Notify() {
unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
if (v != 1) {
eigen_assert(((v + 2) & ~1) != 0);
return; // either count has not dropped to 0, or waiter is not waiting
}
mutex_lock l(mu_);
eigen_assert(!notified_);
notified_ = true;
cv_.notify_all();
}
void Wait() {
unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
if ((v >> 1) == 0) return;
mutex_lock l(mu_);
while (!notified_) {
cv_.wait(l);
}
}
private:
mutex mu_;
condition_variable cv_;
std::atomic<unsigned int> state_; // low bit is waiter flag
bool notified_;
};
// Notification is an object that allows a user to wait for another
// thread to signal a notification that an event has occurred.
//
// Multiple threads can wait on the same Notification object,
// but only one thread may call Notify() on the object.
struct Notification : Barrier {
Notification() : Barrier(1) {}
};
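// Usage sketch (illustrative only): one thread signals the Notification while
// another blocks in Wait() until Notify() has been called. A Barrier
// generalizes this to N producers: construct Barrier(N) and have each
// producer call Notify() exactly once before Wait() returns.
//
//   Notification done;
//   std::thread worker([&done] {
//     /* ... do work ... */
//     done.Notify();
//   });
//   done.Wait();     // returns once the worker has called Notify()
//   worker.join();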
// Runs an arbitrary function and then calls Notify() on the passed in
// Notification.
template <typename Function, typename... Args> struct FunctionWrapperWithNotification
{
static void run(Notification* n, Function f, Args... args) {
f(args...);
if (n) {
n->Notify();
}
}
};
template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
{
static void run(Barrier* b, Function f, Args... args) {
f(args...);
if (b) {
b->Notify();
}
}
};
template <typename SyncType>
static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
if (n) {
n->Wait();
}
}
struct MemcpyExecutor {
typedef MemcpyExecutor Self;
MemcpyExecutor(void *dst, const void *src) :
m_dst(static_cast<char *>(dst)), m_src(static_cast<const char *>(src)) { }
static EIGEN_STRONG_INLINE void run(const MemcpyExecutor* exec, size_t idx, size_t block_size) {
::memcpy(&(exec->m_dst[idx]), &(exec->m_src[idx]), block_size);
}
private:
char* m_dst;
const char* m_src;
};
struct MemsetExecutor {
typedef MemsetExecutor Self;
MemsetExecutor(void *buffer, int val) :
m_buffer(static_cast<char *>(buffer)), m_val(val) { }
static EIGEN_STRONG_INLINE void run(const MemsetExecutor* exec, size_t idx, size_t block_size) {
::memset(&(exec->m_buffer[idx]), exec->m_val, block_size);
}
private:
char* m_buffer;
const int m_val;
};
struct ThreadPoolDevice {
// The ownership of the thread pool remains with the caller.
ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores)
: num_threads_(num_cores), pool_(pool) {}
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
}
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
internal::aligned_free(buffer);
}
EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifdef __ANDROID__
::memcpy(dst, src, n);
#else
if (n <= 32768) {
::memcpy(dst, src, n);
} else {
MemcpyExecutor memcpy_executor(dst, src);
execute(memcpy_executor, n);
}
#endif
}
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifdef __ANDROID__
::memset(buffer, c, n);
#else
if (n <= 32768) {
::memset(buffer, c, n);
} else {
MemsetExecutor memset_executor(buffer, c);
execute(memset_executor, n);
}
#endif
}
EIGEN_STRONG_INLINE size_t numThreads() const {
return num_threads_;
}
EIGEN_STRONG_INLINE size_t memcpyThreshold() const {
return 2 * numThreads();
}
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
return l1CacheSize();
}
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// The l3 cache size is shared between all the cores.
return l3CacheSize() / num_threads_;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
// Should return an enum that encodes the ISA supported by the CPU
return 1;
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
Notification* n = new Notification();
std::function<void()> func =
std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
pool_->Schedule(func);
return n;
}
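// Usage sketch (illustrative only; `device` is assumed to be an existing
// ThreadPoolDevice): enqueue() returns a heap-allocated Notification that the
// caller is responsible for waiting on and deleting.
//
//   Notification* n = device.enqueue([](int x) { /* use x */ }, 42);
//   wait_until_ready(n);
//   delete n;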
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
Function&& f,
Args&&... args) const {
std::function<void()> func = std::bind(
&FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
pool_->Schedule(func);
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueue_and_forget(Function&& f, Args&&... args) const {
std::function<void()> func = std::bind(f, args...);
pool_->Schedule(func);
}
template <class Function>
EIGEN_STRONG_INLINE void enqueue_function(Function&& f) const {
pool_->Schedule(f);
}
// Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
// called from one of the threads in pool_. Returns -1 otherwise.
EIGEN_STRONG_INLINE int currentThreadId() const {
return pool_->CurrentThreadId();
}
// parallelFor executes f with [0, n) arguments in parallel and waits for
// completion. F accepts a half-open interval [first, last).
// Block size is chosen based on the iteration cost and resulting parallel
// efficiency. If block_align is not nullptr, it is called to round up the
// block size.
void parallelFor(Index n, const TensorOpCost& cost,
std::function<Index(Index)> block_align,
std::function<void(Index, Index)> f) const {
typedef TensorCostModel<ThreadPoolDevice> CostModel;
if (n <= 1 || numThreads() == 1 ||
CostModel::numThreads(n, cost, numThreads()) == 1) {
f(0, n);
return;
}
// Calculate block size based on (1) the iteration cost and (2) parallel
// efficiency. We want blocks that are not too small, to amortize
// parallelization overheads, and not too large, to limit the tail
// effect and potential load imbalance; we also want the number
// of blocks to divide evenly across the threads.
double block_size_f = 1.0 / CostModel::taskSize(1, cost);
Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
const Index max_block_size =
numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
if (block_align) {
Index new_block_size = block_align(block_size);
eigen_assert(new_block_size >= block_size);
block_size = numext::mini(n, new_block_size);
}
Index block_count = divup(n, block_size);
// Calculate parallel efficiency as fraction of total CPU time used for
// computations:
double max_efficiency =
static_cast<double>(block_count) /
(divup<int>(block_count, numThreads()) * numThreads());
// Now try to increase block size up to max_block_size as long as it
// doesn't decrease parallel efficiency.
for (Index prev_block_count = block_count; prev_block_count > 1;) {
// This is the next block size that divides n into a smaller number
// of blocks than the current block_size.
Index coarser_block_size = divup(n, prev_block_count - 1);
if (block_align) {
Index new_block_size = block_align(coarser_block_size);
eigen_assert(new_block_size >= coarser_block_size);
coarser_block_size = numext::mini(n, new_block_size);
}
if (coarser_block_size > max_block_size) {
break; // Reached max block size. Stop.
}
// Recalculate parallel efficiency.
const Index coarser_block_count = divup(n, coarser_block_size);
eigen_assert(coarser_block_count < prev_block_count);
prev_block_count = coarser_block_count;
const double coarser_efficiency =
static_cast<double>(coarser_block_count) /
(divup<int>(coarser_block_count, numThreads()) * numThreads());
if (coarser_efficiency + 0.01 >= max_efficiency) {
// Taking it.
block_size = coarser_block_size;
block_count = coarser_block_count;
if (max_efficiency < coarser_efficiency) {
max_efficiency = coarser_efficiency;
}
}
}
// Recursively divide size into halves until we reach block_size.
// Division code rounds mid to block_size, so we are guaranteed to get
// block_count leaves that do actual computations.
Barrier barrier(block_count);
std::function<void(Index, Index)> handleRange;
handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
while (last - first > block_size) {
// Split into halves and schedule the second half on a different thread.
const Index mid = first + divup((last - first) / 2, block_size) * block_size;
pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
last = mid;
}
// Single block or less, execute directly.
f(first, last);
barrier.Notify();
};
if (block_count <= numThreads()) {
// Avoid a thread hop by running the root of the tree and one block on the
// main thread.
handleRange(0, n);
} else {
// Execute the root in the thread pool to avoid running work on more than
// numThreads() threads.
pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
}
barrier.Wait();
}
// Convenience wrapper for parallelFor that does not align blocks.
void parallelFor(Index n, const TensorOpCost& cost,
std::function<void(Index, Index)> f) const {
parallelFor(n, cost, nullptr, std::move(f));
}
private:
template<typename Executor>
EIGEN_STRONG_INLINE void execute(const Executor& exec, size_t n) const {
parallelFor(n, TensorOpCost(1, 0, 0), [&exec](Index first, Index last) {
Executor::run(&exec, first, last - first);
});
}
// TODO: NUMA, ...
size_t num_threads_;
ThreadPoolInterface* pool_;
};
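// Usage sketch (illustrative only): the thread pool type and its constructor
// arguments depend on the ThreadPoolInterface implementation linked in;
// `SomeThreadPool` below is a placeholder.
//
//   SomeThreadPool pool(/*num_threads=*/4);
//   ThreadPoolDevice device(&pool, /*num_cores=*/4);
//   std::vector<float> data(1 << 20, 1.0f);
//   device.parallelFor(
//       data.size(),
//       TensorOpCost(/*bytes_loaded=*/sizeof(float),
//                    /*bytes_stored=*/sizeof(float),
//                    /*compute_cycles=*/1),
//       [&data](Index first, Index last) {
//         for (Index i = first; i < last; ++i) data[i] *= 2.0f;
//       });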
#endif
// GPU offloading
#ifdef EIGEN_USE_GPU
static const int kCudaScratchSize = 1024;
// An interface abstracting away device specific memory allocator.
class Allocator {
public:
virtual ~Allocator() {}
EIGEN_DEVICE_FUNC virtual void* allocate(size_t num_bytes) const = 0;
EIGEN_DEVICE_FUNC virtual void deallocate(void* buffer) const = 0;
};
// This defines an interface that GPUDevice can take to use
// CUDA streams underneath.
class StreamInterface {
public:
virtual ~StreamInterface() {}
virtual const cudaStream_t& stream() const = 0;
virtual const cudaDeviceProp& deviceProperties() const = 0;
// Allocate memory on the actual device where the computation will run.
virtual void* allocate(size_t num_bytes) const = 0;
virtual void deallocate(void* buffer) const = 0;
// Return a scratchpad buffer of kCudaScratchSize bytes (1 KiB).
virtual void* scratchpad() const = 0;
// Return a semaphore. The semaphore is initialized to 0, and each kernel
// using it is responsible for resetting it to 0 upon completion, so that
// the semaphore is always equal to 0 when a kernel starts.
virtual unsigned int* semaphore() const = 0;
};
static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;
#ifndef __CUDA_ARCH__
static tensorflow::mutex m_devicePropInitMutex(tensorflow::LINKER_INITIALIZED);
static void initializeDeviceProp() {
if (!m_devicePropInitialized) {
tensorflow::mutex_lock l(m_devicePropInitMutex);
if (!m_devicePropInitialized) {
int num_devices;
cudaError_t status = cudaGetDeviceCount(&num_devices);
eigen_check(status == cudaSuccess);
m_deviceProperties = new cudaDeviceProp[num_devices];
for (int i = 0; i < num_devices; ++i) {
status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
eigen_check(status == cudaSuccess);
}
m_devicePropInitialized = true;
}
}
}
#else
static void initializeDeviceProp() {
assert(false && "This function should never be called from within a CUDA kernel");
}
#endif // __CUDA_ARCH__
static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
public:
// Use the default stream on the current device
CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
cudaGetDevice(&device_);
initializeDeviceProp();
}
// Use the default stream on the specified device
CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
initializeDeviceProp();
}
// Use the specified stream. Note that it's the
// caller's responsibility to ensure that the stream can run on
// the specified device. If no device is specified, the code
// assumes that the stream is associated with the current GPU device.
CudaStreamDevice(const cudaStream_t* stream, int device = -1)
: stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
if (device < 0) {
cudaGetDevice(&device_);
} else {
int num_devices;
cudaError_t err = cudaGetDeviceCount(&num_devices);
eigen_check(err == cudaSuccess);
eigen_check(device < num_devices);
device_ = device;
}
initializeDeviceProp();
}
const cudaStream_t& stream() const { return *stream_; }
const cudaDeviceProp& deviceProperties() const {
return m_deviceProperties[device_];
}
virtual void* allocate(size_t num_bytes) const {
cudaError_t err = cudaSetDevice(device_);
eigen_check(err == cudaSuccess);
void* result;
err = cudaMalloc(&result, num_bytes);
eigen_check(err == cudaSuccess);
eigen_check(result != NULL);
return result;
}
virtual void deallocate(void* buffer) const {
cudaError_t err = cudaSetDevice(device_);
eigen_check(err == cudaSuccess);
assert(buffer != NULL);
err = cudaFree(buffer);
assert(err == cudaSuccess);
}
virtual void* scratchpad() const {
if (scratch_ == NULL) {
scratch_ = allocate(kCudaScratchSize+sizeof(unsigned int));
}
return scratch_;
}
virtual unsigned int* semaphore() const {
if (semaphore_ == NULL) {
char* semaphore_start = static_cast<char*>(scratchpad()) + kCudaScratchSize;
semaphore_ = reinterpret_cast<unsigned int*>(semaphore_start);
cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
}
return semaphore_;
}
private:
const cudaStream_t* stream_;
int device_;
mutable void* scratch_;
mutable unsigned int* semaphore_;
};
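// Usage sketch (illustrative only): the three constructors cover wrapping the
// default stream (on the current or an explicit device) and wrapping a
// user-created stream, which must outlive the CudaStreamDevice.
//
//   CudaStreamDevice default_on_current;      // default stream, current device
//   CudaStreamDevice default_on_device1(1);   // default stream, device 1
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   CudaStreamDevice wrapped(&stream);        // caller still owns the stream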
static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
eigen_check(status == cudaSuccess);
}
struct GpuDevice {
// Neither the CUDA stream nor the allocator is owned: the caller is
// responsible for their initialization and eventual destruction.
explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
eigen_assert(stream);
}
// TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
return stream_->stream();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
#ifndef __CUDA_ARCH__
return stream_->allocate(num_bytes);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return NULL;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
#ifndef __CUDA_ARCH__
stream_->deallocate(buffer);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
#ifndef __CUDA_ARCH__
return stream_->scratchpad();
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return NULL;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
#ifndef __CUDA_ARCH__
return stream_->semaphore();
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return NULL;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
stream_->stream());
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
// FIXME
return 32;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const {
return 4 * 1024 * 1024;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
// FIXME
return 48*1024;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the L2 cache for the time being, and
// there is no L3 cache on CUDA devices.
return firstLevelCacheSize();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef __CUDA_ARCH__
cudaError_t err = cudaStreamSynchronize(stream_->stream());
assert(err == cudaSuccess);
#else
assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().multiProcessorCount;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerBlock;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().sharedMemPerBlock;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().major;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
// This function checks if the CUDA runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
cudaError_t error = cudaStreamQuery(stream_->stream());
return (error == cudaSuccess) || (error == cudaErrorNotReady);
}
private:
const StreamInterface* stream_;
};
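// Usage sketch (illustrative only; error handling omitted): a GpuDevice wraps
// a StreamInterface and issues asynchronous operations on its stream.
//
//   CudaStreamDevice stream;
//   GpuDevice device(&stream);
//   const size_t n = 256 * sizeof(float);
//   std::vector<float> host(256, 1.0f);
//   void* dev_buf = device.allocate(n);
//   device.memcpyHostToDevice(dev_buf, host.data(), n);
//   device.memcpyDeviceToHost(host.data(), dev_buf, n);
//   device.synchronize();    // the copies above are asynchronous
//   device.deallocate(dev_buf);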
inline void assertCudaOk() {
cudaError_t err = cudaGetLastError();
assert(err != cudaErrorMissingConfiguration);
assert(err != cudaErrorMemoryAllocation);
assert(err != cudaErrorInitializationError);
assert(err != cudaErrorLaunchFailure);
assert(err != cudaErrorPriorLaunchFailure);
assert(err != cudaErrorLaunchTimeout);
assert(err != cudaErrorLaunchOutOfResources);
assert(err != cudaErrorInvalidDeviceFunction);
assert(err != cudaErrorInvalidConfiguration);
assert(err != cudaErrorInvalidDevice);
assert(err != cudaErrorInvalidValue);
assert(err != cudaErrorInvalidPitchValue);
assert(err != cudaErrorInvalidSymbol);
assert(err != cudaErrorMapBufferObjectFailed);
assert(err != cudaErrorUnmapBufferObjectFailed);
assert(err != cudaErrorInvalidHostPointer);
assert(err != cudaErrorInvalidDevicePointer);
assert(err != cudaErrorInvalidTexture);
assert(err != cudaErrorInvalidTextureBinding);
assert(err != cudaErrorInvalidChannelDescriptor);
assert(err != cudaErrorInvalidMemcpyDirection);
assert(err != cudaErrorAddressOfConstant);
assert(err != cudaErrorTextureFetchFailed);
assert(err != cudaErrorTextureNotBound);
assert(err != cudaErrorSynchronizationError);
assert(err != cudaErrorInvalidFilterSetting);
assert(err != cudaErrorInvalidNormSetting);
assert(err != cudaErrorMixedDeviceExecution);
assert(err != cudaErrorCudartUnloading);
assert(err != cudaErrorUnknown);
assert(err != cudaErrorNotYetImplemented);
assert(err != cudaErrorMemoryValueTooLarge);
assert(err != cudaErrorInvalidResourceHandle);
assert(err != cudaErrorNotReady);
assert(err != cudaErrorInsufficientDriver);
assert(err != cudaErrorSetOnActiveProcess);
assert(err != cudaErrorInvalidSurface);
assert(err != cudaErrorNoDevice);
assert(err != cudaErrorECCUncorrectable);
assert(err != cudaErrorSharedObjectSymbolNotFound);
assert(err != cudaErrorSharedObjectInitFailed);
assert(err != cudaErrorUnsupportedLimit);
assert(err != cudaErrorDuplicateVariableName);
assert(err != cudaErrorDuplicateTextureName);
assert(err != cudaErrorDuplicateSurfaceName);
assert(err != cudaErrorDevicesUnavailable);
assert(err != cudaErrorInvalidKernelImage);
assert(err != cudaErrorNoKernelImageForDevice);
assert(err != cudaErrorIncompatibleDriverContext);
assert(err != cudaErrorPeerAccessAlreadyEnabled);
assert(err != cudaErrorPeerAccessNotEnabled);
assert(err != cudaErrorDeviceAlreadyInUse);
assert(err != cudaErrorProfilerDisabled);
assert(err != cudaErrorProfilerNotInitialized);
assert(err != cudaErrorProfilerAlreadyStarted);
assert(err != cudaErrorProfilerAlreadyStopped);
assert(err != cudaErrorAssert);
assert(err != cudaErrorTooManyPeers);
assert(err != cudaErrorHostMemoryAlreadyRegistered);
assert(err != cudaErrorHostMemoryNotRegistered);
assert(err != cudaErrorOperatingSystem);
assert(err != cudaErrorStartupFailure);
assert(err != cudaErrorApiFailureBase);
// catch error types introduced after this function was written
assert(err == cudaSuccess);
}
#ifndef __CUDA_ARCH__
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
#else
#define LAUNCH_CUDA_KERNEL(kernel, ...) \
{ const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
eigen_assert(false && "Cannot launch a kernel from another kernel");
#endif
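// Usage sketch (illustrative only; `MyKernel`, `device` and `dev_ptr` are
// hypothetical): on the host the macro expands to a <<<...>>> launch on the
// device's stream followed by an error check.
//
//   // __global__ void MyKernel(float* data, int n);
//   LAUNCH_CUDA_KERNEL(MyKernel, /*gridsize=*/32, /*blocksize=*/256,
//                      /*sharedmem=*/0, device, dev_ptr, 1024);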
#endif // EIGEN_USE_GPU
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H