| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| #ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H |
| #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H |
| |
| namespace Eigen { |
| |
| // Default device for the machine (typically a single cpu core) |
| struct DefaultDevice { |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { |
| return internal::aligned_malloc(num_bytes); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { |
| internal::aligned_free(buffer); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { |
| ::memcpy(dst, src, n); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { |
| memcpy(dst, src, n); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { |
| memcpy(dst, src, n); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { |
| ::memset(buffer, c, n); |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { |
| #ifndef __CUDA_ARCH__ |
| // Running on the host CPU |
| return 1; |
| #else |
| // Running on a CUDA device |
| return 32; |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { |
| return 2 * numThreads(); |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { |
| #ifndef __CUDA_ARCH__ |
| // Running on the host CPU |
| return l1CacheSize(); |
| #else |
| // Running on a CUDA device, return the amount of shared memory available. |
| return 48*1024; |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { |
| #ifndef __CUDA_ARCH__ |
| // Running single threaded on the host CPU |
| return l3CacheSize(); |
| #else |
| // Running on a CUDA device |
| return firstLevelCacheSize(); |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { |
| #ifndef __CUDA_ARCH__ |
| // Running single threaded on the host CPU |
| // Should return an enum that encodes the ISA supported by the CPU |
| return 1; |
| #else |
| // Running on a CUDA device |
| return __CUDA_ARCH__ / 100; |
| #endif |
| } |
| }; |
| |
| // Multiple cpu cores |
| #ifdef EIGEN_USE_THREADS |
| |
| #ifdef EIGEN_USE_CUSTOM_THREAD_POOL |
| typedef std::mutex mutex; |
| typedef std::condition_variable condition_variable; |
| typedef std::unique_lock<std::mutex> mutex_lock; |
| #else |
| typedef tensorflow::mutex mutex; |
| typedef tensorflow::condition_variable condition_variable; |
| typedef tensorflow::mutex_lock mutex_lock; |
| #endif |
| |
| #ifdef EIGEN_USE_CUSTOM_THREAD_POOL |
| |
| struct StlThreadEnvironment { |
| struct Task { |
| std::function<void()> f; |
| }; |
| |
| // EnvThread constructor must start the thread, |
| // destructor must join the thread. |
| class EnvThread { |
| public: |
| EnvThread(std::function<void()> f) : thr_(f) {} |
| ~EnvThread() { thr_.join(); } |
| |
| private: |
| std::thread thr_; |
| }; |
| |
| EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); } |
| Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; } |
| void ExecuteTask(const Task& t) { t.f(); } |
| }; |
| |
| #endif // EIGEN_USE_CUSTOM_THREAD_POOL |
| |
| // Barrier is an object that allows one or more threads to wait until |
| // Notify has been called a specified number of times. |
| class Barrier { |
| public: |
| explicit Barrier(unsigned int count) : state_(count << 1), notified_(false) { |
| eigen_assert(((count << 1) >> 1) == count); |
| } |
| ~Barrier() { |
| eigen_assert((state_>>1) == 0); |
| } |
| |
| void Notify() { |
| unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; |
| if (v != 1) { |
| eigen_assert(((v + 2) & ~1) != 0); |
| return; // either count has not dropped to 0, or waiter is not waiting |
| } |
| mutex_lock l(mu_); |
| eigen_assert(!notified_); |
| notified_ = true; |
| cv_.notify_all(); |
| } |
| |
| void Wait() { |
| unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); |
| if ((v >> 1) == 0) return; |
| mutex_lock l(mu_); |
| while (!notified_) { |
| cv_.wait(l); |
| } |
| } |
| |
| private: |
| mutex mu_; |
| condition_variable cv_; |
| std::atomic<unsigned int> state_; // low bit is waiter flag |
| bool notified_; |
| }; |
| |
// Notification is an object that allows a user to wait for another
// thread to signal that an event has occurred.
//
// Multiple threads can wait on the same Notification object,
// but exactly one caller must call Notify() on the object.
| struct Notification : Barrier { |
  Notification() : Barrier(1) {}
| }; |
| |
| // Runs an arbitrary function and then calls Notify() on the passed in |
| // Notification. |
| template <typename Function, typename... Args> struct FunctionWrapperWithNotification |
| { |
| static void run(Notification* n, Function f, Args... args) { |
| f(args...); |
| if (n) { |
| n->Notify(); |
| } |
| } |
| }; |
| |
| template <typename Function, typename... Args> struct FunctionWrapperWithBarrier |
| { |
| static void run(Barrier* b, Function f, Args... args) { |
| f(args...); |
| if (b) { |
| b->Notify(); |
| } |
| } |
| }; |
| |
| template <typename SyncType> |
| static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { |
| if (n) { |
| n->Wait(); |
| } |
| } |
| |
| |
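// The executors below are used by ThreadPoolDevice::memcpy and
// ThreadPoolDevice::memset (through the private execute() helper) to split a
// large copy or fill into blocks that are processed in parallel on the thread
// pool.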
| struct MemcpyExecutor { |
| typedef MemcpyExecutor Self; |
| |
| MemcpyExecutor(void *dst, const void *src) : |
| m_dst(static_cast<char *>(dst)), m_src(static_cast<const char *>(src)) { } |
| |
| static EIGEN_STRONG_INLINE void run(const MemcpyExecutor* exec, size_t idx, size_t block_size) { |
| ::memcpy(&(exec->m_dst[idx]), &(exec->m_src[idx]), block_size); |
| } |
| |
| private: |
| char* m_dst; |
| const char* m_src; |
| }; |
| |
| struct MemsetExecutor { |
| typedef MemsetExecutor Self; |
| |
| MemsetExecutor(void *buffer, int val) : |
| m_buffer(static_cast<char *>(buffer)), m_val(val) { } |
| |
| static EIGEN_STRONG_INLINE void run(const MemsetExecutor* exec, size_t idx, size_t block_size) { |
| ::memset(&(exec->m_buffer[idx]), exec->m_val, block_size); |
| } |
| |
| private: |
| char* m_buffer; |
| const int m_val; |
| }; |
| |
| |
| struct ThreadPoolDevice { |
| // The ownership of the thread pool remains with the caller. |
| ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) |
| : num_threads_(num_cores), pool_(pool) {} |
| |
| EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { |
| return internal::aligned_malloc(num_bytes); |
| } |
| |
| EIGEN_STRONG_INLINE void deallocate(void* buffer) const { |
| internal::aligned_free(buffer); |
| } |
| |
| EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { |
| #ifdef __ANDROID__ |
| ::memcpy(dst, src, n); |
| #else |
| if (n <= 32768) { |
| ::memcpy(dst, src, n); |
| } else { |
| MemcpyExecutor memcpy_executor(dst, src); |
| execute(memcpy_executor, n); |
| } |
| #endif |
| } |
| |
| EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { |
| memcpy(dst, src, n); |
| } |
| |
| EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { |
| memcpy(dst, src, n); |
| } |
| |
| EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { |
| #ifdef __ANDROID__ |
| ::memset(buffer, c, n); |
| #else |
| if (n <= 32768) { |
| ::memset(buffer, c, n); |
| } else { |
| MemsetExecutor memset_executor(buffer, c); |
| execute(memset_executor, n); |
| } |
| #endif |
| } |
| |
| EIGEN_STRONG_INLINE size_t numThreads() const { |
| return num_threads_; |
| } |
| |
| EIGEN_STRONG_INLINE size_t memcpyThreshold() const { |
| return 2 * numThreads(); |
| } |
| |
| EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { |
| return l1CacheSize(); |
| } |
| |
| EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { |
| // The l3 cache size is shared between all the cores. |
| return l3CacheSize() / num_threads_; |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { |
| // Should return an enum that encodes the ISA supported by the CPU |
| return 1; |
| } |
| |
| template <class Function, class... Args> |
| EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { |
| Notification* n = new Notification(); |
| std::function<void()> func = |
| std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...); |
| pool_->Schedule(func); |
| return n; |
| } |
| |
| template <class Function, class... Args> |
| EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, |
| Function&& f, |
| Args&&... args) const { |
| std::function<void()> func = std::bind( |
| &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...); |
| pool_->Schedule(func); |
| } |
| |
| template <class Function, class... Args> |
| EIGEN_STRONG_INLINE void enqueue_and_forget(Function&& f, Args&&... args) const { |
| std::function<void()> func = std::bind(f, args...); |
| pool_->Schedule(func); |
| } |
| |
| template <class Function> |
| EIGEN_STRONG_INLINE void enqueue_function(Function&& f) const { |
| pool_->Schedule(f); |
| } |
| |
| // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if |
| // called from one of the threads in pool_. Returns -1 otherwise. |
| EIGEN_STRONG_INLINE int currentThreadId() const { |
| return pool_->CurrentThreadId(); |
| } |
| |
  // parallelFor executes f over the range [0, n) in parallel and waits for
  // completion. f accepts a half-open interval [first, last).
  // The block size is chosen based on the iteration cost and the resulting
  // parallel efficiency. If block_align is not nullptr, it is called to round
  // up the block size.
| void parallelFor(Index n, const TensorOpCost& cost, |
| std::function<Index(Index)> block_align, |
| std::function<void(Index, Index)> f) const { |
| typedef TensorCostModel<ThreadPoolDevice> CostModel; |
| if (n <= 1 || numThreads() == 1 || |
| CostModel::numThreads(n, cost, numThreads()) == 1) { |
| f(0, n); |
| return; |
| } |
| |
    // Calculate block size based on (1) the iteration cost and (2) parallel
    // efficiency. We want blocks to be neither too small (to mitigate
    // parallelization overheads) nor too large (to mitigate tail effects and
    // potential load imbalance), and we also want the number of blocks to
    // divide evenly across threads.
| |
| double block_size_f = 1.0 / CostModel::taskSize(1, cost); |
| Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f)); |
| const Index max_block_size = |
| numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f)); |
| if (block_align) { |
| Index new_block_size = block_align(block_size); |
| eigen_assert(new_block_size >= block_size); |
| block_size = numext::mini(n, new_block_size); |
| } |
| Index block_count = divup(n, block_size); |
| // Calculate parallel efficiency as fraction of total CPU time used for |
| // computations: |
| double max_efficiency = |
| static_cast<double>(block_count) / |
| (divup<int>(block_count, numThreads()) * numThreads()); |
| // Now try to increase block size up to max_block_size as long as it |
| // doesn't decrease parallel efficiency. |
| for (Index prev_block_count = block_count; prev_block_count > 1;) { |
| // This is the next block size that divides size into a smaller number |
| // of blocks than the current block_size. |
| Index coarser_block_size = divup(n, prev_block_count - 1); |
| if (block_align) { |
| Index new_block_size = block_align(coarser_block_size); |
| eigen_assert(new_block_size >= coarser_block_size); |
| coarser_block_size = numext::mini(n, new_block_size); |
| } |
| if (coarser_block_size > max_block_size) { |
| break; // Reached max block size. Stop. |
| } |
| // Recalculate parallel efficiency. |
| const Index coarser_block_count = divup(n, coarser_block_size); |
| eigen_assert(coarser_block_count < prev_block_count); |
| prev_block_count = coarser_block_count; |
| const double coarser_efficiency = |
| static_cast<double>(coarser_block_count) / |
| (divup<int>(coarser_block_count, numThreads()) * numThreads()); |
| if (coarser_efficiency + 0.01 >= max_efficiency) { |
| // Taking it. |
| block_size = coarser_block_size; |
| block_count = coarser_block_count; |
| if (max_efficiency < coarser_efficiency) { |
| max_efficiency = coarser_efficiency; |
| } |
| } |
| } |
| |
| // Recursively divide size into halves until we reach block_size. |
| // Division code rounds mid to block_size, so we are guaranteed to get |
| // block_count leaves that do actual computations. |
| Barrier barrier(block_count); |
| std::function<void(Index, Index)> handleRange; |
| handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { |
| while (last - first > block_size) { |
| // Split into halves and schedule the second half on a different thread. |
| const Index mid = first + divup((last - first) / 2, block_size) * block_size; |
| pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); |
| last = mid; |
| } |
| // Single block or less, execute directly. |
| f(first, last); |
| barrier.Notify(); |
| }; |
| if (block_count <= numThreads()) { |
| // Avoid a thread hop by running the root of the tree and one block on the |
| // main thread. |
| handleRange(0, n); |
| } else { |
| // Execute the root in the thread pool to avoid running work on more than |
| // numThreads() threads. |
| pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); |
| } |
| barrier.Wait(); |
| } |
| |
  // Convenience wrapper for parallelFor that does not align blocks.
| void parallelFor(Index n, const TensorOpCost& cost, |
| std::function<void(Index, Index)> f) const { |
| parallelFor(n, cost, nullptr, std::move(f)); |
| } |
| |
| private: |
| template<typename Executor> |
| EIGEN_STRONG_INLINE void execute(const Executor& exec, size_t n) const { |
| parallelFor(n, TensorOpCost(1, 0, 0), [&exec](Index first, Index last) { |
| Executor::run(&exec, first, last - first); |
| }); |
| } |
| |
| // todo: NUMA, ... |
| size_t num_threads_; |
| ThreadPoolInterface* pool_; |
| }; |
| #endif |
| |
| |
| // GPU offloading |
| #ifdef EIGEN_USE_GPU |
| |
| static const int kCudaScratchSize = 1024; |
| |
| |
| // An interface abstracting away device specific memory allocator. |
| class Allocator { |
| public: |
| virtual ~Allocator() {} |
| EIGEN_DEVICE_FUNC virtual void* allocate(size_t num_bytes) const = 0; |
| EIGEN_DEVICE_FUNC virtual void deallocate(void* buffer) const = 0; |
| }; |
| |
| // This defines an interface that GPUDevice can take to use |
| // CUDA streams underneath. |
| class StreamInterface { |
| public: |
| virtual ~StreamInterface() {} |
| |
| virtual const cudaStream_t& stream() const = 0; |
| virtual const cudaDeviceProp& deviceProperties() const = 0; |
| |
| // Allocate memory on the actual device where the computation will run. |
| virtual void* allocate(size_t num_bytes) const = 0; |
| virtual void deallocate(void* buffer) const = 0; |
| |
  // Return a scratchpad buffer of kCudaScratchSize (1 KiB) bytes.
| virtual void* scratchpad() const = 0; |
| |
  // Return a semaphore. The semaphore is initialized to 0, and each kernel
  // using it is responsible for resetting it to 0 upon completion, to
  // maintain the invariant that the semaphore is always equal to 0 when a
  // kernel starts.
| virtual unsigned int* semaphore() const = 0; |
| }; |
| |
| static cudaDeviceProp* m_deviceProperties; |
| static bool m_devicePropInitialized = false; |
| |
| #ifndef __CUDA_ARCH__ |
| static tensorflow::mutex m_devicePropInitMutex(tensorflow::LINKER_INITIALIZED); |
| |
| static void initializeDeviceProp() { |
| if (!m_devicePropInitialized) { |
| tensorflow::mutex_lock l(m_devicePropInitMutex); |
| if (!m_devicePropInitialized) { |
| int num_devices; |
| cudaError_t status = cudaGetDeviceCount(&num_devices); |
| eigen_check(status == cudaSuccess); |
| m_deviceProperties = new cudaDeviceProp[num_devices]; |
| for (int i = 0; i < num_devices; ++i) { |
| status = cudaGetDeviceProperties(&m_deviceProperties[i], i); |
| eigen_check(status == cudaSuccess); |
| } |
| m_devicePropInitialized = true; |
| } |
| } |
| } |
| #else |
| static void initializeDeviceProp() { |
| assert(false && "This function should never be called from within a CUDA kernel"); |
| } |
| #endif // __CUDA_ARCH__ |
| |
| static const cudaStream_t default_stream = cudaStreamDefault; |
| |
| class CudaStreamDevice : public StreamInterface { |
| public: |
| // Use the default stream on the current device |
| CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { |
| cudaGetDevice(&device_); |
| initializeDeviceProp(); |
| } |
| // Use the default stream on the specified device |
| CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { |
| initializeDeviceProp(); |
| } |
  // Use the specified stream. Note that it's the caller's
  // responsibility to ensure that the stream can run on
  // the specified device. If no device is specified, the code
  // assumes that the stream is associated with the current GPU device.
| CudaStreamDevice(const cudaStream_t* stream, int device = -1) |
| : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { |
| if (device < 0) { |
| cudaGetDevice(&device_); |
| } else { |
| int num_devices; |
| cudaError_t err = cudaGetDeviceCount(&num_devices); |
| eigen_check(err == cudaSuccess); |
| eigen_check(device < num_devices); |
| device_ = device; |
| } |
| initializeDeviceProp(); |
| } |
| |
| const cudaStream_t& stream() const { return *stream_; } |
| const cudaDeviceProp& deviceProperties() const { |
| return m_deviceProperties[device_]; |
| } |
| virtual void* allocate(size_t num_bytes) const { |
| cudaError_t err = cudaSetDevice(device_); |
| eigen_check(err == cudaSuccess); |
| void* result; |
| err = cudaMalloc(&result, num_bytes); |
| eigen_check(err == cudaSuccess); |
| eigen_check(result != NULL); |
| return result; |
| } |
| virtual void deallocate(void* buffer) const { |
| cudaError_t err = cudaSetDevice(device_); |
| eigen_check(err == cudaSuccess); |
| assert(buffer != NULL); |
| err = cudaFree(buffer); |
| assert(err == cudaSuccess); |
| } |
| |
| virtual void* scratchpad() const { |
| if (scratch_ == NULL) { |
| scratch_ = allocate(kCudaScratchSize+sizeof(unsigned int)); |
| } |
| return scratch_; |
| } |
| |
| virtual unsigned int* semaphore() const { |
| if (semaphore_ == NULL) { |
| char* semaphore_start = static_cast<char*>(scratchpad()) + kCudaScratchSize; |
| semaphore_ = reinterpret_cast<unsigned int*>(semaphore_start); |
| cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); |
| EIGEN_UNUSED_VARIABLE(err) |
| assert(err == cudaSuccess); |
| } |
| return semaphore_; |
| } |
| |
| private: |
| const cudaStream_t* stream_; |
| int device_; |
| mutable void* scratch_; |
| mutable unsigned int* semaphore_; |
| }; |
| |
| static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { |
| cudaError_t status = cudaDeviceSetSharedMemConfig(config); |
| eigen_check(status == cudaSuccess); |
| } |
| |
| struct GpuDevice { |
  // Neither the CUDA stream nor the allocator is owned: the caller is
  // responsible for their initialization and eventual destruction.
| explicit GpuDevice(const StreamInterface* stream) : stream_(stream) { |
| eigen_assert(stream); |
| } |
| |
| // TODO(bsteiner): This is an internal API, we should not expose it. |
| EIGEN_STRONG_INLINE const cudaStream_t& stream() const { |
| return stream_->stream(); |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { |
| #ifndef __CUDA_ARCH__ |
| return stream_->allocate(num_bytes); |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| return NULL; |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { |
| #ifndef __CUDA_ARCH__ |
| stream_->deallocate(buffer); |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const { |
| #ifndef __CUDA_ARCH__ |
| return stream_->scratchpad(); |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| return NULL; |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const { |
| #ifndef __CUDA_ARCH__ |
| return stream_->semaphore(); |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| return NULL; |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { |
| #ifndef __CUDA_ARCH__ |
| cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, |
| stream_->stream()); |
| assert(err == cudaSuccess); |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { |
| #ifndef __CUDA_ARCH__ |
| cudaError_t err = |
| cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); |
| assert(err == cudaSuccess); |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { |
| #ifndef __CUDA_ARCH__ |
| cudaError_t err = |
| cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); |
| assert(err == cudaSuccess); |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { |
| #ifndef __CUDA_ARCH__ |
| cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); |
| assert(err == cudaSuccess); |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { |
| // FIXME |
| return 32; |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t memcpyThreshold() const { |
| return 4 * 1024 * 1024; |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { |
| // FIXME |
| return 48*1024; |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { |
| // We won't try to take advantage of the l2 cache for the time being, and |
| // there is no l3 cache on cuda devices. |
| return firstLevelCacheSize(); |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { |
| #ifndef __CUDA_ARCH__ |
| cudaError_t err = cudaStreamSynchronize(stream_->stream()); |
| assert(err == cudaSuccess); |
| #else |
| assert(false && "The default device should be used instead to generate kernel code"); |
| #endif |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { |
| #ifndef __CUDA_ARCH__ |
| return stream_->deviceProperties().multiProcessorCount; |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| return 0; |
| #endif |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { |
| #ifndef __CUDA_ARCH__ |
| return stream_->deviceProperties().maxThreadsPerBlock; |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| return 0; |
| #endif |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { |
| #ifndef __CUDA_ARCH__ |
| return stream_->deviceProperties().maxThreadsPerMultiProcessor; |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| return 0; |
| #endif |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const { |
| #ifndef __CUDA_ARCH__ |
| return stream_->deviceProperties().sharedMemPerBlock; |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| return 0; |
| #endif |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { |
| #ifndef __CUDA_ARCH__ |
| return stream_->deviceProperties().major; |
| #else |
| eigen_assert(false && "The default device should be used instead to generate kernel code"); |
| return 0; |
| #endif |
| } |
| |
| // This function checks if the CUDA runtime recorded an error for the |
| // underlying stream device. |
| inline bool ok() const { |
| cudaError_t error = cudaStreamQuery(stream_->stream()); |
| return (error == cudaSuccess) || (error == cudaErrorNotReady); |
| } |
| |
| private: |
| const StreamInterface* stream_; |
| }; |
| |
| inline void assertCudaOk() { |
| cudaError_t err = cudaGetLastError(); |
| |
| assert(err != cudaErrorMissingConfiguration); |
| assert(err != cudaErrorMemoryAllocation); |
| assert(err != cudaErrorInitializationError); |
| assert(err != cudaErrorLaunchFailure); |
| assert(err != cudaErrorPriorLaunchFailure); |
| assert(err != cudaErrorLaunchTimeout); |
| assert(err != cudaErrorLaunchOutOfResources); |
| assert(err != cudaErrorInvalidDeviceFunction); |
| assert(err != cudaErrorInvalidConfiguration); |
| assert(err != cudaErrorInvalidDevice); |
| assert(err != cudaErrorInvalidValue); |
| assert(err != cudaErrorInvalidPitchValue); |
| assert(err != cudaErrorInvalidSymbol); |
| assert(err != cudaErrorMapBufferObjectFailed); |
| assert(err != cudaErrorUnmapBufferObjectFailed); |
| assert(err != cudaErrorInvalidHostPointer); |
| assert(err != cudaErrorInvalidDevicePointer); |
| assert(err != cudaErrorInvalidTexture); |
| assert(err != cudaErrorInvalidTextureBinding); |
| assert(err != cudaErrorInvalidChannelDescriptor); |
| assert(err != cudaErrorInvalidMemcpyDirection); |
| assert(err != cudaErrorAddressOfConstant); |
| assert(err != cudaErrorTextureFetchFailed); |
| assert(err != cudaErrorTextureNotBound); |
| assert(err != cudaErrorSynchronizationError); |
| assert(err != cudaErrorInvalidFilterSetting); |
| assert(err != cudaErrorInvalidNormSetting); |
| assert(err != cudaErrorMixedDeviceExecution); |
| assert(err != cudaErrorCudartUnloading); |
| assert(err != cudaErrorUnknown); |
| assert(err != cudaErrorNotYetImplemented); |
| assert(err != cudaErrorMemoryValueTooLarge); |
| assert(err != cudaErrorInvalidResourceHandle); |
| assert(err != cudaErrorNotReady); |
| assert(err != cudaErrorInsufficientDriver); |
| assert(err != cudaErrorSetOnActiveProcess); |
| assert(err != cudaErrorInvalidSurface); |
| assert(err != cudaErrorNoDevice); |
| assert(err != cudaErrorECCUncorrectable); |
| assert(err != cudaErrorSharedObjectSymbolNotFound); |
| assert(err != cudaErrorSharedObjectInitFailed); |
| assert(err != cudaErrorUnsupportedLimit); |
| assert(err != cudaErrorDuplicateVariableName); |
| assert(err != cudaErrorDuplicateTextureName); |
| assert(err != cudaErrorDuplicateSurfaceName); |
| assert(err != cudaErrorDevicesUnavailable); |
| assert(err != cudaErrorInvalidKernelImage); |
| assert(err != cudaErrorNoKernelImageForDevice); |
| assert(err != cudaErrorIncompatibleDriverContext); |
| assert(err != cudaErrorPeerAccessAlreadyEnabled); |
| assert(err != cudaErrorPeerAccessNotEnabled); |
| assert(err != cudaErrorDeviceAlreadyInUse); |
| assert(err != cudaErrorProfilerDisabled); |
| assert(err != cudaErrorProfilerNotInitialized); |
| assert(err != cudaErrorProfilerAlreadyStarted); |
| assert(err != cudaErrorProfilerAlreadyStopped); |
| assert(err != cudaErrorAssert); |
| assert(err != cudaErrorTooManyPeers); |
| assert(err != cudaErrorHostMemoryAlreadyRegistered); |
| assert(err != cudaErrorHostMemoryNotRegistered); |
| assert(err != cudaErrorOperatingSystem); |
| assert(err != cudaErrorStartupFailure); |
| assert(err != cudaErrorApiFailureBase); |
| |
  // catch error types introduced after this function was written
| assert(err == cudaSuccess); |
| } |
| |
| #ifndef __CUDA_ARCH__ |
| #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ |
| (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ |
| assert(cudaGetLastError() == cudaSuccess); |
| #else |
#define LAUNCH_CUDA_KERNEL(kernel, ...) \
    { const auto __attribute__((__unused__)) __makeTheKernelInstantiate = &(kernel); } \
    eigen_assert(false && "Cannot launch a kernel from another kernel");
| #endif |
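
// Usage sketch (illustrative only), assuming a hypothetical kernel such as:
//
//   __global__ void scale_kernel(float* data, int n, float factor) {
//     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
//          i += blockDim.x * gridDim.x) {
//       data[i] *= factor;
//     }
//   }
//
// it can be launched on a GpuDevice with:
//
//   LAUNCH_CUDA_KERNEL(scale_kernel, num_blocks, block_size, 0, device,
//                      gpu_buf, n, 2.0f);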
| |
| #endif // EIGEN_USE_GPU |
| } // end namespace Eigen |
| |
| #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H |