| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H |
| #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H |
| |
| namespace Eigen { |
| |
| /** \class TensorExecutor |
| * \ingroup CXX11_Tensor_Module |
| * |
| * \brief The tensor executor class. |
| * |
| * This class is responsible for launching the evaluation of the expression |
| * on the specified computing device. |
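| * |
| * A rough sketch of how an expression typically reaches this executor |
| * (illustrative only, not a public API): assigning one tensor expression to |
| * another builds a TensorAssignOp, which the assignment operator hands to |
| * TensorExecutor::run() for the chosen device: |
| * \code |
| * Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64); |
| * a.setRandom(); b.setRandom(); |
| * c = a + b;  // dispatched internally through TensorExecutor::run() |
| * \endcode |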
| */ |
| namespace internal { |
| |
| // Default strategy: the expression is evaluated with a single CPU thread. |
| template <typename Expression, typename Device, |
| bool Vectorizable, bool Tileable> |
| class TensorExecutor { |
| public: |
| typedef typename Expression::Index Index; |
| EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device()) |
| { |
| TensorEvaluator<Expression, Device> evaluator(expr, device); |
| const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); |
| if (needs_assign) |
| { |
| const Index size = array_prod(evaluator.dimensions()); |
| for (Index i = 0; i < size; ++i) { |
| evaluator.evalScalar(i); |
| } |
| } |
| evaluator.cleanup(); |
| } |
| }; |
| |
| template <typename Expression> |
| class TensorExecutor<Expression, DefaultDevice, true, false> { |
| public: |
| typedef typename Expression::Index Index; |
| EIGEN_DEVICE_FUNC |
| static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) |
| { |
| TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); |
| const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); |
| if (needs_assign) |
| { |
| const Index size = array_prod(evaluator.dimensions()); |
| const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; |
| |
| // Give the compiler a strong hint that it can unroll the loop, but don't |
| // insist on it: if evaluating a coefficient is expensive, the compiler |
| // should not unroll the loop at the expense of inlining. |
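| // For example (hypothetical numbers): with PacketSize == 4 and size == 103, |
| // UnrolledSize == 96 and VectorizedSize == 100, leaving the last 3 |
| // coefficients to the scalar loop below. |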
| const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; |
| for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { |
| for (Index j = 0; j < 4; j++) { |
| evaluator.evalPacket(i + j * PacketSize); |
| } |
| } |
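| // Vectorized tail: fewer than 4 * PacketSize coefficients remain. |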
| const Index VectorizedSize = (size / PacketSize) * PacketSize; |
| for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { |
| evaluator.evalPacket(i); |
| } |
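| // Scalar tail: fewer than PacketSize coefficients remain. |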
| for (Index i = VectorizedSize; i < size; ++i) { |
| evaluator.evalScalar(i); |
| } |
| } |
| evaluator.cleanup(); |
| } |
| }; |
| |
| template <typename Expression, bool Vectorizable> |
| class TensorExecutor<Expression, DefaultDevice, Vectorizable, true> { |
| public: |
| typedef typename Expression::Index Index; |
| EIGEN_DEVICE_FUNC |
| static inline void run(const Expression& expr, |
| const DefaultDevice& device = DefaultDevice()) { |
| typedef TensorEvaluator<Expression, DefaultDevice> Evaluator; |
| typedef typename traits<Expression>::Scalar Scalar; |
| typedef typename traits<Expression>::Index Index; |
| const std::size_t NumDims = traits<Expression>::NumDimensions; |
| |
| typedef TensorBlockMapper<Index, |
| typename internal::remove_const<Scalar>::type, |
| NumDims, Evaluator::Layout> TensorBlockMapper; |
| typedef TensorBlock<Index, typename internal::remove_const<Scalar>::type, |
| NumDims, Evaluator::Layout> TensorBlock; |
| |
| Evaluator evaluator(expr, device); |
| std::size_t total_size = array_prod(evaluator.dimensions()); |
| std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); |
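| // For example (hypothetical numbers): a 32 KB L1 cache and a float Scalar |
| // give a cache_size of 8192 coefficients. |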
| if (total_size < cache_size) { |
| // TODO(andydavis) Reduce block management overhead for small tensors. |
| internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, |
| false>::run(expr, device); |
| return; |
| } |
| |
| const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); |
| if (needs_assign) { |
| // Size tensor blocks to fit in cache (or requested target block size). |
| size_t block_total_size = numext::mini(cache_size, total_size); |
| TensorBlockShapeType block_shape = kUniformAllDims; |
| // Query expression tree for desired block size/shape. |
| std::vector<internal::TensorOpResourceRequirements> resources; |
| evaluator.getResourceRequirements(&resources); |
| if (!resources.empty()) { |
| // TODO(andydavis) Implement different policies (i.e. revert to a |
| // default policy if block shapes/sizes conflict). |
| block_shape = resources[0].block_shape; |
| block_total_size = resources[0].block_total_size; |
| } |
| |
| TensorBlockMapper block_mapper(evaluator.dimensions(), |
| block_shape, |
| block_total_size); |
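| // The mapper may adjust the requested size to whole block dimensions, so |
| // re-query the actual number of coefficients per block before allocating |
| // the scratch buffer. |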
| block_total_size = block_mapper.block_dims_total_size(); |
| |
| Scalar* data = static_cast<Scalar*>(device.allocate( |
| block_total_size * sizeof(Scalar))); |
| |
| const Index total_block_count = block_mapper.total_block_count(); |
| for (Index i = 0; i < total_block_count; ++i) { |
| TensorBlock block = block_mapper.GetBlockForIndex(i, data); |
| evaluator.evalBlock(&block); |
| } |
| device.deallocate(data); |
| } |
| evaluator.cleanup(); |
| } |
| }; |
| |
| // Multicore strategy: the index space is partitioned and each partition is executed on a single core. |
| #ifdef EIGEN_USE_THREADS |
| template <typename Evaluator, typename Index, bool Vectorizable> |
| struct EvalRange { |
| static void run(void* evaluator_in, const Index first, const Index last) { |
| Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in)); |
| eigen_assert(last >= first); |
| for (Index i = first; i < last; ++i) { |
| evaluator.evalScalar(i); |
| } |
| } |
| |
| static Index alignBlockSize(Index size) { |
| return size; |
| } |
| }; |
| |
| template <typename Evaluator, typename Index> |
| struct EvalRange<Evaluator, Index, true> { |
| static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; |
| |
| static void run(void* evaluator_in, const Index first, const Index last) { |
| Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in)); |
| eigen_assert(last >= first); |
| |
| Index i = first; |
| if (last - first >= PacketSize) { |
| eigen_assert(first % PacketSize == 0); |
| Index last_chunk_offset = last - 4 * PacketSize; |
| // Give the compiler a strong hint that it can unroll the loop, but don't |
| // insist on it: if evaluating a coefficient is expensive, the compiler |
| // should not unroll the loop at the expense of inlining. |
| for (; i <= last_chunk_offset; i += 4*PacketSize) { |
| for (Index j = 0; j < 4; j++) { |
| evaluator.evalPacket(i + j * PacketSize); |
| } |
| } |
| last_chunk_offset = last - PacketSize; |
| for (; i <= last_chunk_offset; i += PacketSize) { |
| evaluator.evalPacket(i); |
| } |
| } |
| for (; i < last; ++i) { |
| evaluator.evalScalar(i); |
| } |
| } |
| |
| static Index alignBlockSize(Index size) { |
| // Align block size to packet size and account for unrolling in run above. |
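| // For example (hypothetical numbers): with PacketSize == 4, a size of 1000 |
| // rounds up to 1008 (a multiple of 4 * PacketSize == 16), while a size of |
| // 40 (< 16 * PacketSize) is only rounded to a multiple of PacketSize and |
| // stays at 40. |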
| if (size >= 16 * PacketSize) { |
| return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); |
| } |
| // Aligning to 4 * PacketSize would increase block size by more than 25%. |
| return (size + PacketSize - 1) & ~(PacketSize - 1); |
| } |
| }; |
| |
| template <typename Expression, bool Vectorizable, bool Tileable> |
| class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> { |
| public: |
| typedef typename Expression::Index Index; |
| static inline void run(const Expression& expr, const ThreadPoolDevice& device) |
| { |
| typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; |
| typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange; |
| Evaluator evaluator(expr, device); |
| const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); |
| if (needs_assign) |
| { |
| const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; |
| const Index size = array_prod(evaluator.dimensions()); |
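| // parallelFor splits [0, size) into contiguous ranges whose length is |
| // chosen by the cost model and rounded by EvalRange::alignBlockSize (a |
| // multiple of the packet size when vectorizing), then evaluates each range |
| // on the thread pool. |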
| device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), |
| EvalRange::alignBlockSize, |
| [&evaluator](Index first, Index last) { |
| EvalRange::run(&evaluator, first, last); |
| }); |
| } |
| evaluator.cleanup(); |
| } |
| }; |
| |
| |
| template <typename Expression, bool Vectorizable> |
| class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, true> { |
| public: |
| typedef typename Expression::Index Index; |
| static inline void run(const Expression& expr, |
| const ThreadPoolDevice& device) { |
| typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; |
| typedef typename internal::remove_const< |
| typename traits<Expression>::Scalar>::type Scalar; |
| typedef typename traits<Expression>::Index Index; |
| static const std::size_t NumDims = traits<Expression>::NumDimensions; |
| typedef TensorBlockMapper<Index, Scalar, NumDims, Evaluator::Layout> |
| TensorBlockMapper; |
| typedef TensorBlock<Index, Scalar, NumDims, Evaluator::Layout> |
| TensorBlock; |
| |
| Evaluator evaluator(expr, device); |
| std::size_t total_size = array_prod(evaluator.dimensions()); |
| std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); |
| if (total_size < cache_size) { |
| // TODO(andydavis) Reduce block management overhead for small tensors. |
| internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, |
| false>::run(expr, device); |
| evaluator.cleanup(); |
| return; |
| } |
| |
| const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); |
| if (needs_assign) { |
| TensorBlockShapeType block_shape = kUniformAllDims; |
| size_t block_total_size = 0; |
| // Query expression tree for desired block size/shape. |
| std::vector<internal::TensorOpResourceRequirements> resources; |
| evaluator.getResourceRequirements(&resources); |
| if (!resources.empty()) { |
| // TODO(andydavis) Implement different shape/size policies. |
| block_shape = resources[0].block_shape; |
| block_total_size = resources[0].block_total_size; |
| } |
| int num_threads = device.numThreads(); |
| |
| // Estimate the minimum block size based on the per-coefficient cost: |
| // 1.0 / taskSize(1, cost) is the number of coefficients whose combined |
| // cost amounts to one standard-sized task. |
| TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); |
| double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost); |
| size_t block_size = static_cast<size_t>(1.0 / taskSize); |
| TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, |
| block_size); |
| block_size = block_mapper.block_dims_total_size(); |
| const size_t aligned_blocksize = |
| EIGEN_MAX_ALIGN_BYTES * |
| divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES); |
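| // For example (hypothetical numbers): with a float Scalar, a block of 1000 |
| // coefficients needs 4000 bytes, which EIGEN_MAX_ALIGN_BYTES == 64 rounds |
| // up to 4032 bytes of scratch space per thread. |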
| void* buf = internal::aligned_malloc((num_threads+1) * aligned_blocksize); |
| device.parallelFor( |
| block_mapper.total_block_count(), cost * block_size, |
| [=, &device, &evaluator, &block_mapper](Index first, Index last) { |
| // currentThreadId() returns -1 if called from a thread not in the |
| // threadpool, such as the main thread dispatching Eigen expressions. |
| const int thread_idx = device.currentThreadId(); |
| eigen_assert(thread_idx >= -1 && thread_idx < num_threads); |
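| // Slot 0 of the scratch buffer is reserved for calls from outside the |
| // pool (thread_idx == -1); pool threads 0..n-1 use slots 1..n, hence the |
| // (num_threads + 1) blocks allocated above. |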
| Scalar* thread_buf = reinterpret_cast<Scalar*>( |
| static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1)); |
| for (Index i = first; i < last; ++i) { |
| auto block = block_mapper.GetBlockForIndex(i, thread_buf); |
| evaluator.evalBlock(&block); |
| } |
| }); |
| internal::aligned_free(buf); |
| } |
| evaluator.cleanup(); |
| } |
| }; |
| |
| #endif  // EIGEN_USE_THREADS |
| |
| |
| // GPU: the evaluation of the expression is offloaded to a GPU. |
| #if defined(EIGEN_USE_GPU) |
| |
| template <typename Expression, bool Vectorizable, bool Tileable> |
| class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> { |
| public: |
| typedef typename Expression::Index Index; |
| static void run(const Expression& expr, const GpuDevice& device); |
| }; |
| |
| |
| #if defined(__CUDACC__) |
| template <typename Evaluator, typename Index, bool Vectorizable> |
| struct EigenMetaKernelEval { |
| static __device__ EIGEN_ALWAYS_INLINE |
| void run(Evaluator eval, Index first, Index last, Index step_size) { |
| for (Index i = first; i < last; i += step_size) { |
| eval.evalScalar(i); |
| } |
| } |
| }; |
| |
| template <typename Evaluator, typename Index> |
| struct EigenMetaKernelEval<Evaluator, Index, true> { |
| static __device__ EIGEN_ALWAYS_INLINE |
| void run(Evaluator eval, Index first, Index last, Index step_size) { |
| const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; |
| const Index vectorized_size = (last / PacketSize) * PacketSize; |
| const Index vectorized_step_size = step_size * PacketSize; |
| |
| // Use the vector path |
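| // (each GPU thread owns every step_size-th packet: thread `first` |
| // evaluates the packets starting at (first + k * step_size) * PacketSize, |
| // and the tail that does not fill a whole packet is swept below with |
| // scalar evaluations, also strided by step_size). |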
| for (Index i = first * PacketSize; i < vectorized_size; |
| i += vectorized_step_size) { |
| eval.evalPacket(i); |
| } |
| for (Index i = vectorized_size + first; i < last; i += step_size) { |
| eval.evalScalar(i); |
| } |
| } |
| }; |
| |
| template <typename Evaluator, typename Index> |
| __global__ void |
| __launch_bounds__(1024) |
| EigenMetaKernel(Evaluator memcopied_eval, Index size) { |
| |
| const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; |
| const Index step_size = blockDim.x * gridDim.x; |
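| // Standard grid-stride loop parameters: each thread starts at its global |
| // index and advances by the total number of launched threads, so the whole |
| // index space is covered for any launch configuration. |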
| |
| // CUDA memcopies the kernel arguments. That's fine for POD types, but for |
| // more complex types such as evaluators we should really conform to the |
| // C++ standard and call a proper copy constructor. |
| Evaluator eval(memcopied_eval); |
| |
| const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; |
| EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size); |
| } |
| |
| /*static*/ |
| template <typename Expression, bool Vectorizable, bool Tileable> |
| inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run( |
| const Expression& expr, const GpuDevice& device) { |
| TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); |
| const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); |
| if (needs_assign) { |
| const int block_size = device.maxCudaThreadsPerBlock(); |
| const int max_blocks = device.getNumCudaMultiProcessors() * |
| device.maxCudaThreadsPerMultiProcessor() / block_size; |
| const Index size = array_prod(evaluator.dimensions()); |
| // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0. |
| const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1); |
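| // For example (hypothetical numbers): 20 multiprocessors with 2048 threads |
| // each and block_size == 1024 give max_blocks == 40; a tensor of 10000 |
| // coefficients then launches min(40, 10) == 10 blocks, and an empty tensor |
| // still launches a single block. |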
| |
| LAUNCH_CUDA_KERNEL( |
| (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), |
| num_blocks, block_size, 0, device, evaluator, size); |
| } |
| evaluator.cleanup(); |
| } |
| |
| #endif // __CUDACC__ |
| #endif // EIGEN_USE_GPU |
| |
| } // end namespace internal |
| |
| } // end namespace Eigen |
| |
| #endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H |