// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
namespace Eigen {
// Default device for the machine (typically a single CPU core)
struct DefaultDevice {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
internal::aligned_free(buffer);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
::memcpy(dst, src, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
::memset(buffer, c, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
#ifndef __CUDA_ARCH__
// Running on the host CPU
return 1;
#else
// Running on a CUDA device
return 32;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const {
return 2 * numThreads();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
#ifndef __CUDA_ARCH__
// Running on the host CPU
return l1CacheSize();
#else
// Running on a CUDA device, return the amount of shared memory available.
return 48*1024;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
#ifndef __CUDA_ARCH__
// Running single threaded on the host CPU
return l3CacheSize();
#else
// Running on a CUDA device
return firstLevelCacheSize();
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
// Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU
return 1;
#else
// Running on a CUDA device
return __CUDA_ARCH__ / 100;
#endif
}
};
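// Usage sketch (illustrative only): a DefaultDevice manages plain host
// buffers; the buffer size below is arbitrary.
//
//   DefaultDevice device;
//   const size_t n = 128 * sizeof(float);
//   void* src = device.allocate(n);
//   void* dst = device.allocate(n);
//   device.memset(src, 0, n);
//   device.memcpy(dst, src, n);
//   device.deallocate(src);
//   device.deallocate(dst);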
// Multiple CPU cores
#ifdef EIGEN_USE_THREADS
#ifdef EIGEN_USE_CUSTOM_THREAD_POOL
typedef std::mutex mutex;
typedef std::condition_variable condition_variable;
typedef std::unique_lock<std::mutex> mutex_lock;
#else
typedef tensorflow::mutex mutex;
typedef tensorflow::condition_variable condition_variable;
typedef tensorflow::mutex_lock mutex_lock;
#endif
#ifdef EIGEN_USE_CUSTOM_THREAD_POOL
struct StlThreadEnvironment {
struct Task {
std::function<void()> f;
};
// The EnvThread constructor must start the thread and
// its destructor must join the thread.
class EnvThread {
public:
EnvThread(std::function<void()> f) : thr_(f) {}
~EnvThread() { thr_.join(); }
private:
std::thread thr_;
};
EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); }
Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
void ExecuteTask(const Task& t) { t.f(); }
};
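// Usage sketch (illustrative only): StlThreadEnvironment is intended as the
// environment type of the custom thread pool; on its own it simply wraps
// std::thread creation and task execution.
//
//   StlThreadEnvironment env;
//   StlThreadEnvironment::Task task = env.CreateTask([] { /* work */ });
//   env.ExecuteTask(task);                    // runs inline
//   StlThreadEnvironment::EnvThread* t =
//       env.CreateThread([] { /* work */ });  // runs on a new thread
//   delete t;                                 // joins the thread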
#endif // EIGEN_USE_CUSTOM_THREAD_POOL
// Barrier is an object that allows one or more threads to wait until
// Notify has been called a specified number of times.
class Barrier {
public:
explicit Barrier(unsigned int count) : state_(count << 1), notified_(false) {
eigen_assert(((count << 1) >> 1) == count);
}
~Barrier() {
eigen_assert((state_>>1) == 0);
}
void Notify() {
unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
if (v != 1) {
eigen_assert(((v + 2) & ~1) != 0);
return; // either count has not dropped to 0, or waiter is not waiting
}
mutex_lock l(mu_);
eigen_assert(!notified_);
notified_ = true;
cv_.notify_all();
}
void Wait() {
unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
if ((v >> 1) == 0) return;
mutex_lock l(mu_);
while (!notified_) {
cv_.wait(l);
}
}
private:
mutex mu_;
condition_variable cv_;
std::atomic<unsigned int> state_; // low bit is waiter flag
bool notified_;
};
// Notification is an object that allows a user to wait for another
// thread to signal a notification that an event has occurred.
//
// Multiple threads can wait on the same Notification object,
// but only one thread may call Notify() on the object.
struct Notification : Barrier {
Notification() : Barrier(1) {}
};
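// Usage sketch (illustrative only): one thread signals the Notification while
// another blocks in Wait() until Notify() has been called. A Barrier
// generalizes this to N producers: construct Barrier(N) and have each
// producer call Notify() exactly once before Wait() returns.
//
//   Notification done;
//   std::thread worker([&done] {
//     /* ... do work ... */
//     done.Notify();
//   });
//   done.Wait();     // returns once the worker has called Notify()
//   worker.join();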
// Runs an arbitrary function and then calls Notify() on the passed in
// Notification.
template <typename Function, typename... Args> struct FunctionWrapperWithNotification
{
static void run(Notification* n, Function f, Args... args) {
f(args...);
if (n) {
n->Notify();
}
}
};
template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
{
static void run(Barrier* b, Function f, Args... args) {
f(args...);
if (b) {
b->Notify();
}
}
};
template <typename SyncType>
static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
if (n) {
n->Wait();
}
}
struct MemcpyExecutor {
typedef MemcpyExecutor Self;
MemcpyExecutor(void *dst, const void *src) :
m_dst(static_cast<char *>(dst)), m_src(static_cast<const char *>(src)) { }
static EIGEN_STRONG_INLINE void run(const MemcpyExecutor* exec, size_t idx, size_t block_size) {
::memcpy(&(exec->m_dst[idx]), &(exec->m_src[idx]), block_size);
}
private:
char* m_dst;
const char* m_src;
};
struct MemsetExecutor {
typedef MemsetExecutor Self;
MemsetExecutor(void *buffer, int val) :
m_buffer(static_cast<char *>(buffer)), m_val(val) { }
static EIGEN_STRONG_INLINE void run(const MemsetExecutor* exec, size_t idx, size_t block_size) {
::memset(&(exec->m_buffer[idx]), exec->m_val, block_size);
}
private:
char* m_buffer;
const int m_val;
};
struct ThreadPoolDevice {
// The ownership of the thread pool remains with the caller.
ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores)
: num_threads_(num_cores), pool_(pool) {}
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
}
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
internal::aligned_free(buffer);
}
EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifdef __ANDROID__
::memcpy(dst, src, n);
#else
if (n <= 32768) {
::memcpy(dst, src, n);
} else {
MemcpyExecutor memcpy_executor(dst, src);
execute(memcpy_executor, n);
}
#endif
}
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifdef __ANDROID__
::memset(buffer, c, n);
#else
if (n <= 32768) {
::memset(buffer, c, n);
} else {
MemsetExecutor memset_executor(buffer, c);
execute(memset_executor, n);
}
#endif
}
EIGEN_STRONG_INLINE size_t numThreads() const {
return num_threads_;
}
EIGEN_STRONG_INLINE size_t memcpyThreshold() const {
return 2 * numThreads();
}
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
return l1CacheSize();
}
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// The l3 cache size is shared between all the cores.
return l3CacheSize() / num_threads_;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
// Should return an enum that encodes the ISA supported by the CPU
return 1;
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
Notification* n = new Notification();
std::function<void()> func =
std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
pool_->Schedule(func);
return n;
}
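// Usage sketch (illustrative only; `device` is assumed to be an existing
// ThreadPoolDevice): enqueue() returns a heap-allocated Notification that the
// caller is responsible for waiting on and deleting.
//
//   Notification* n = device.enqueue([](int x) { /* use x */ }, 42);
//   wait_until_ready(n);
//   delete n;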
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
Function&& f,
Args&&... args) const {
std::function<void()> func = std::bind(
&FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
pool_->Schedule(func);
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueue_and_forget(Function&& f, Args&&... args) const {
std::function<void()> func = std::bind(f, args...);
pool_->Schedule(func);
}
template <class Function>
EIGEN_STRONG_INLINE void enqueue_function(Function&& f) const {
pool_->Schedule(f);
}
// Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
// called from one of the threads in pool_. Returns -1 otherwise.
EIGEN_STRONG_INLINE int currentThreadId() const {
return pool_->CurrentThreadId();
}
// parallelFor executes f with [0, n) arguments in parallel and waits for
// completion. F accepts a half-open interval [first, last).
// Block size is chosen based on the iteration cost and resulting parallel
// efficiency. If block_align is not nullptr, it is called to round up the
// block size.
void parallelFor(Index n, const TensorOpCost& cost,
std::function<Index(Index)> block_align,
std::function<void(Index, Index)> f) const {
typedef TensorCostModel<ThreadPoolDevice> CostModel;
if (n <= 1 || numThreads() == 1 ||
CostModel::numThreads(n, cost, numThreads()) == 1) {
f(0, n);
return;
}
// Calculate block size based on (1) the iteration cost and (2) parallel
// efficiency. We want blocks that are not too small, to amortize
// parallelization overheads, and not too large, to limit the tail
// effect and potential load imbalance; we also want the number
// of blocks to divide evenly across the threads.
double block_size_f = 1.0 / CostModel::taskSize(1, cost);
Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
const Index max_block_size =
numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
if (block_align) {
Index new_block_size = block_align(block_size);
eigen_assert(new_block_size >= block_size);
block_size = numext::mini(n, new_block_size);
}
Index block_count = divup(n, block_size);
// Calculate parallel efficiency as fraction of total CPU time used for
// computations:
double max_efficiency =
static_cast<double>(block_count) /
(divup<int>(block_count, numThreads()) * numThreads());
// Now try to increase block size up to max_block_size as long as it
// doesn't decrease parallel efficiency.
for (Index prev_block_count = block_count; prev_block_count > 1;) {
// This is the next block size that divides n into a smaller number
// of blocks than the current block_size.
Index coarser_block_size = divup(n, prev_block_count - 1);
if (block_align) {
Index new_block_size = block_align(coarser_block_size);
eigen_assert(new_block_size >= coarser_block_size);
coarser_block_size = numext::mini(n, new_block_size);
}
if (coarser_block_size > max_block_size) {
break; // Reached max block size. Stop.
}
// Recalculate parallel efficiency.
const Index coarser_block_count = divup(n, coarser_block_size);
eigen_assert(coarser_block_count < prev_block_count);
prev_block_count = coarser_block_count;
const double coarser_efficiency =
static_cast<double>(coarser_block_count) /
(divup<int>(coarser_block_count, numThreads()) * numThreads());
if (coarser_efficiency + 0.01 >= max_efficiency) {
// Taking it.
block_size = coarser_block_size;
block_count = coarser_block_count;
if (max_efficiency < coarser_efficiency) {
max_efficiency = coarser_efficiency;
}
}
}
// Recursively divide size into halves until we reach block_size.
// Division code rounds mid to block_size, so we are guaranteed to get
// block_count leaves that do actual computations.
Barrier barrier(block_count);
std::function<void(Index, Index)> handleRange;
handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
while (last - first > block_size) {
// Split into halves and schedule the second half on a different thread.
const Index mid = first + divup((last - first) / 2, block_size) * block_size;
pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
last = mid;
}
// Single block or less, execute directly.
f(first, last);
barrier.Notify();
};
if (block_count <= numThreads()) {
// Avoid a thread hop by running the root of the tree and one block on the
// main thread.
handleRange(0, n);
} else {
// Execute the root in the thread pool to avoid running work on more than
// numThreads() threads.
pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
}
barrier.Wait();
}
// Convenience wrapper for parallelFor that does not align blocks.
void parallelFor(Index n, const TensorOpCost& cost,
std::function<void(Index, Index)> f) const {
parallelFor(n, cost, nullptr, std::move(f));
}
private:
template<typename Executor>
EIGEN_STRONG_INLINE void execute(const Executor& exec, size_t n) const {
parallelFor(n, TensorOpCost(1, 0, 0), [&exec](Index first, Index last) {
Executor::run(&exec, first, last - first);
});
}
// TODO: NUMA, ...
size_t num_threads_;
ThreadPoolInterface* pool_;
};
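// Usage sketch (illustrative only): the thread pool type and its constructor
// arguments depend on the ThreadPoolInterface implementation linked in;
// `SomeThreadPool` below is a placeholder.
//
//   SomeThreadPool pool(/*num_threads=*/4);
//   ThreadPoolDevice device(&pool, /*num_cores=*/4);
//   std::vector<float> data(1 << 20, 1.0f);
//   device.parallelFor(
//       data.size(),
//       TensorOpCost(/*bytes_loaded=*/sizeof(float),
//                    /*bytes_stored=*/sizeof(float),
//                    /*compute_cycles=*/1),
//       [&data](Index first, Index last) {
//         for (Index i = first; i < last; ++i) data[i] *= 2.0f;
//       });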
#endif
// GPU offloading
#ifdef EIGEN_USE_GPU
static const int kCudaScratchSize = 1024;
// An interface abstracting away device specific memory allocator.
class Allocator {
public:
virtual ~Allocator() {}
EIGEN_DEVICE_FUNC virtual void* allocate(size_t num_bytes) const = 0;
EIGEN_DEVICE_FUNC virtual void deallocate(void* buffer) const = 0;
};
// This defines an interface that GPUDevice can take to use
// CUDA streams underneath.
class StreamInterface {
public:
virtual ~StreamInterface() {}
virtual const cudaStream_t& stream() const = 0;
virtual const cudaDeviceProp& deviceProperties() const = 0;
// Allocate memory on the actual device where the computation will run.
virtual void* allocate(size_t num_bytes) const = 0;
virtual void deallocate(void* buffer) const = 0;
// Return a scratchpad buffer of kCudaScratchSize bytes (1 KiB).
virtual void* scratchpad() const = 0;
// Return a semaphore. The semaphore is initialized to 0, and each kernel
// using it is responsible for resetting it to 0 upon completion, so that
// the semaphore is always equal to 0 when a kernel starts.
virtual unsigned int* semaphore() const = 0;
};
static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;
#ifndef __CUDA_ARCH__
static tensorflow::mutex m_devicePropInitMutex(tensorflow::LINKER_INITIALIZED);
static void initializeDeviceProp() {
if (!m_devicePropInitialized) {
tensorflow::mutex_lock l(m_devicePropInitMutex);
if (!m_devicePropInitialized) {
int num_devices;
cudaError_t status = cudaGetDeviceCount(&num_devices);
eigen_check(status == cudaSuccess);
m_deviceProperties = new cudaDeviceProp[num_devices];
for (int i = 0; i < num_devices; ++i) {
status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
eigen_check(status == cudaSuccess);
}
m_devicePropInitialized = true;
}
}
}
#else
static void initializeDeviceProp() {
assert(false && "This function should never be called from within a CUDA kernel");
}
#endif // __CUDA_ARCH__
static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
public:
// Use the default stream on the current device
CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
cudaGetDevice(&device_);
initializeDeviceProp();
}
// Use the default stream on the specified device
CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
initializeDeviceProp();
}
// Use the specified stream. Note that it's the
// caller's responsibility to ensure that the stream can run on
// the specified device. If no device is specified, the code
// assumes that the stream is associated with the current GPU device.
CudaStreamDevice(const cudaStream_t* stream, int device = -1)
: stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
if (device < 0) {
cudaGetDevice(&device_);
} else {
int num_devices;
cudaError_t err = cudaGetDeviceCount(&num_devices);
eigen_check(err == cudaSuccess);
eigen_check(device < num_devices);
device_ = device;
}
initializeDeviceProp();
}
const cudaStream_t& stream() const { return *stream_; }
const cudaDeviceProp& deviceProperties() const {
return m_deviceProperties[device_];
}
virtual void* allocate(size_t num_bytes) const {
cudaError_t err = cudaSetDevice(device_);
eigen_check(err == cudaSuccess);
void* result;
err = cudaMalloc(&result, num_bytes);
eigen_check(err == cudaSuccess);
eigen_check(result != NULL);
return result;
}
virtual void deallocate(void* buffer) const {
cudaError_t err = cudaSetDevice(device_);
eigen_check(err == cudaSuccess);
assert(buffer != NULL);
err = cudaFree(buffer);
assert(err == cudaSuccess);
}
virtual void* scratchpad() const {
if (scratch_ == NULL) {
scratch_ = allocate(kCudaScratchSize+sizeof(unsigned int));
}
return scratch_;
}
virtual unsigned int* semaphore() const {
if (semaphore_ == NULL) {
char* semaphore_start = static_cast<char*>(scratchpad()) + kCudaScratchSize;
semaphore_ = reinterpret_cast<unsigned int*>(semaphore_start);
cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
}
return semaphore_;
}
private:
const cudaStream_t* stream_;
int device_;
mutable void* scratch_;
mutable unsigned int* semaphore_;
};
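// Usage sketch (illustrative only): the three constructors cover wrapping the
// default stream (on the current or an explicit device) and wrapping a
// user-created stream, which must outlive the CudaStreamDevice.
//
//   CudaStreamDevice default_on_current;      // default stream, current device
//   CudaStreamDevice default_on_device1(1);   // default stream, device 1
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   CudaStreamDevice wrapped(&stream);        // caller still owns the stream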
static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
eigen_check(status == cudaSuccess);
}
struct GpuDevice {
// Neither the CUDA stream nor the allocator is owned: the caller is
// responsible for their initialization and eventual destruction.
explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
eigen_assert(stream);
}
// TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
return stream_->stream();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
#ifndef __CUDA_ARCH__
return stream_->allocate(num_bytes);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return NULL;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
#ifndef __CUDA_ARCH__
stream_->deallocate(buffer);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
#ifndef __CUDA_ARCH__
return stream_->scratchpad();
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return NULL;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
#ifndef __CUDA_ARCH__
return stream_->semaphore();
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return NULL;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
stream_->stream());
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
// FIXME
return 32;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const {
return 4 * 1024 * 1024;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
// FIXME
return 48*1024;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the L2 cache for the time being, and
// there is no L3 cache on CUDA devices.
return firstLevelCacheSize();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef __CUDA_ARCH__
cudaError_t err = cudaStreamSynchronize(stream_->stream());
assert(err == cudaSuccess);
#else
assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().multiProcessorCount;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerBlock;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().sharedMemPerBlock;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
return stream_->deviceProperties().major;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
// This function checks if the CUDA runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
cudaError_t error = cudaStreamQuery(stream_->stream());
return (error == cudaSuccess) || (error == cudaErrorNotReady);
}
private:
const StreamInterface* stream_;
};
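// Usage sketch (illustrative only; error handling omitted): a GpuDevice wraps
// a StreamInterface and issues asynchronous operations on its stream.
//
//   CudaStreamDevice stream;
//   GpuDevice device(&stream);
//   const size_t n = 256 * sizeof(float);
//   std::vector<float> host(256, 1.0f);
//   void* dev_buf = device.allocate(n);
//   device.memcpyHostToDevice(dev_buf, host.data(), n);
//   device.memcpyDeviceToHost(host.data(), dev_buf, n);
//   device.synchronize();    // the copies above are asynchronous
//   device.deallocate(dev_buf);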
inline void assertCudaOk() {
cudaError_t err = cudaGetLastError();
assert(err != cudaErrorMissingConfiguration);
assert(err != cudaErrorMemoryAllocation);
assert(err != cudaErrorInitializationError);
assert(err != cudaErrorLaunchFailure);
assert(err != cudaErrorPriorLaunchFailure);
assert(err != cudaErrorLaunchTimeout);
assert(err != cudaErrorLaunchOutOfResources);
assert(err != cudaErrorInvalidDeviceFunction);
assert(err != cudaErrorInvalidConfiguration);
assert(err != cudaErrorInvalidDevice);
assert(err != cudaErrorInvalidValue);
assert(err != cudaErrorInvalidPitchValue);
assert(err != cudaErrorInvalidSymbol);
assert(err != cudaErrorMapBufferObjectFailed);
assert(err != cudaErrorUnmapBufferObjectFailed);
assert(err != cudaErrorInvalidHostPointer);
assert(err != cudaErrorInvalidDevicePointer);
assert(err != cudaErrorInvalidTexture);
assert(err != cudaErrorInvalidTextureBinding);
assert(err != cudaErrorInvalidChannelDescriptor);
assert(err != cudaErrorInvalidMemcpyDirection);
assert(err != cudaErrorAddressOfConstant);
assert(err != cudaErrorTextureFetchFailed);
assert(err != cudaErrorTextureNotBound);
assert(err != cudaErrorSynchronizationError);
assert(err != cudaErrorInvalidFilterSetting);
assert(err != cudaErrorInvalidNormSetting);
assert(err != cudaErrorMixedDeviceExecution);
assert(err != cudaErrorCudartUnloading);
assert(err != cudaErrorUnknown);
assert(err != cudaErrorNotYetImplemented);
assert(err != cudaErrorMemoryValueTooLarge);
assert(err != cudaErrorInvalidResourceHandle);
assert(err != cudaErrorNotReady);
assert(err != cudaErrorInsufficientDriver);
assert(err != cudaErrorSetOnActiveProcess);
assert(err != cudaErrorInvalidSurface);
assert(err != cudaErrorNoDevice);
assert(err != cudaErrorECCUncorrectable);
assert(err != cudaErrorSharedObjectSymbolNotFound);
assert(err != cudaErrorSharedObjectInitFailed);
assert(err != cudaErrorUnsupportedLimit);
assert(err != cudaErrorDuplicateVariableName);
assert(err != cudaErrorDuplicateTextureName);
assert(err != cudaErrorDuplicateSurfaceName);
assert(err != cudaErrorDevicesUnavailable);
assert(err != cudaErrorInvalidKernelImage);
assert(err != cudaErrorNoKernelImageForDevice);
assert(err != cudaErrorIncompatibleDriverContext);
assert(err != cudaErrorPeerAccessAlreadyEnabled);
assert(err != cudaErrorPeerAccessNotEnabled);
assert(err != cudaErrorDeviceAlreadyInUse);
assert(err != cudaErrorProfilerDisabled);
assert(err != cudaErrorProfilerNotInitialized);
assert(err != cudaErrorProfilerAlreadyStarted);
assert(err != cudaErrorProfilerAlreadyStopped);
assert(err != cudaErrorAssert);
assert(err != cudaErrorTooManyPeers);
assert(err != cudaErrorHostMemoryAlreadyRegistered);
assert(err != cudaErrorHostMemoryNotRegistered);
assert(err != cudaErrorOperatingSystem);
assert(err != cudaErrorStartupFailure);
assert(err != cudaErrorApiFailureBase);
// catch error types introduced after this function was written
assert(err == cudaSuccess);
}
#ifndef __CUDA_ARCH__
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
#else
#define LAUNCH_CUDA_KERNEL(kernel, ...) \
{ const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
eigen_assert(false && "Cannot launch a kernel from another kernel");
#endif
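// Usage sketch (illustrative only; `MyKernel`, `device` and `dev_ptr` are
// hypothetical): on the host the macro expands to a <<<...>>> launch on the
// device's stream followed by an error check.
//
//   // __global__ void MyKernel(float* data, int n);
//   LAUNCH_CUDA_KERNEL(MyKernel, /*gridsize=*/32, /*blocksize=*/256,
//                      /*sharedmem=*/0, device, dev_ptr, 1024);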
#endif // EIGEN_USE_GPU
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H