// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
namespace Eigen {
/** \class TensorExecutor
  * \ingroup CXX11_Tensor_Module
  *
  * \brief The tensor executor class.
  *
  * This class is responsible for launching the evaluation of the expression on
  * the specified computing device.
  */
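//
// Illustrative usage (not part of this file's API surface): device-scoped
// tensor assignment is what ultimately routes through TensorExecutor. A
// minimal sketch, assuming float tensors evaluated on the default device:
//
//   Eigen::Tensor<float, 2> A(64, 64), B(64, 64), C(64, 64);
//   A.setRandom(); B.setRandom();
//   C = A + B;  // dispatched via TensorExecutor<..., DefaultDevice, ...>::run
//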
namespace internal {
// Default strategy: the expression is evaluated with a single CPU thread.
template <typename Expression, typename Device,
          bool Vectorizable, bool Tileable>
class TensorExecutor {
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device())
  {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      for (Index i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};

template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, true, false> {
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
  {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
      // Give the compiler a strong hint to unroll the loop, but don't insist:
      // if the evaluation functor is expensive, the compiler should not unroll
      // the loop at the expense of inlining.
      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
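      // Worked example (illustrative numbers): with size == 1003 and
      // PacketSize == 4, UnrolledSize == (1003 / 16) * 16 == 992, so the
      // unrolled loop below covers [0, 992). VectorizedSize == 1000 covers
      // [992, 1000) packet-wise, and the final scalar loop handles 1000..1002.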
      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const Index VectorizedSize = (size / PacketSize) * PacketSize;
      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable, true> {
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr,
                         const DefaultDevice& device = DefaultDevice()) {
    typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
    typedef typename traits<Expression>::Scalar Scalar;
    typedef typename traits<Expression>::Index Index;
    const std::size_t NumDims = traits<Expression>::NumDimensions;
    typedef TensorBlockMapper<Index,
                              typename internal::remove_const<Scalar>::type,
                              NumDims, Evaluator::Layout> TensorBlockMapper;
    typedef TensorBlock<Index, typename internal::remove_const<Scalar>::type,
                        NumDims, Evaluator::Layout> TensorBlock;
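    // The block mapper partitions the output's index space into blocks of
    // roughly block_total_size coefficients; each TensorBlock describes one
    // such block (its offset, per-dimension sizes and strides) over a scratch
    // buffer.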
    Evaluator evaluator(expr, device);
    std::size_t total_size = array_prod(evaluator.dimensions());
    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
    if (total_size < cache_size) {
      // TODO(andydavis) Reduce block management overhead for small tensors.
      internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
                               false>::run(expr, device);
      return;
    }
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      // Size tensor blocks to fit in cache (or requested target block size).
      size_t block_total_size = numext::mini(cache_size, total_size);
      TensorBlockShapeType block_shape = kUniformAllDims;
      // Query expression tree for desired block size/shape.
      std::vector<internal::TensorOpResourceRequirements> resources;
      evaluator.getResourceRequirements(&resources);
      if (!resources.empty()) {
        // TODO(andydavis) Implement different policies (i.e. revert to a
        // default policy if block shapes/sizes conflict).
        block_shape = resources[0].block_shape;
        block_total_size = resources[0].block_total_size;
      }
      TensorBlockMapper block_mapper(evaluator.dimensions(),
                                     block_shape,
                                     block_total_size);
      block_total_size = block_mapper.block_dims_total_size();
      Scalar* data = static_cast<Scalar*>(device.allocate(
          block_total_size * sizeof(Scalar)));
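      // A single scratch buffer is reused for every block: blocks are
      // evaluated sequentially on this device, so no two blocks are live at
      // the same time.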
      const Index total_block_count = block_mapper.total_block_count();
      for (Index i = 0; i < total_block_count; ++i) {
        TensorBlock block = block_mapper.GetBlockForIndex(i, data);
        evaluator.evalBlock(&block);
      }
      device.deallocate(data);
    }
    evaluator.cleanup();
  }
};

// Multicore strategy: the index space is partitioned and each partition is
// executed on a single core.
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(void* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
    eigen_assert(last >= first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
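  // With vectorization disabled there is no alignment requirement, so keep
  // whatever block size the cost model chose.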
  static Index alignBlockSize(Index size) {
    return size;
  }
};

template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
  static void run(void* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator(*static_cast<Evaluator*>(evaluator_in));
    eigen_assert(last >= first);
    Index i = first;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
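      // parallelFor partitions the range using alignBlockSize() below, so
      // `first` is always a multiple of PacketSize here.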
      Index last_chunk_offset = last - 4 * PacketSize;
      // Give the compiler a strong hint to unroll the loop, but don't insist:
      // if the evaluation functor is expensive, the compiler should not unroll
      // the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4*PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = last - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
  static Index alignBlockSize(Index size) {
    // Align block size to packet size and account for unrolling in run above.
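    // Worked example (illustrative numbers, PacketSize == 8): size == 200 is
    // >= 16 * 8 == 128, so round up to a multiple of 4 * 8 == 32:
    // (200 + 31) & ~31 == 224. A smaller size == 100 only rounds to a
    // multiple of 8: (100 + 7) & ~7 == 104.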
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};

template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
  {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
      const Index size = array_prod(evaluator.dimensions());
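      // parallelFor uses the per-coefficient cost estimate to pick a partition
      // granularity, and alignBlockSize to round each partition to a
      // packet-friendly size before dispatching it to the pool.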
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](Index first, Index last) {
                           EvalRange::run(&evaluator, first, last);
                         });
    }
    evaluator.cleanup();
  }
};

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, true> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr,
                         const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef typename internal::remove_const<
        typename traits<Expression>::Scalar>::type Scalar;
    typedef typename traits<Expression>::Index Index;
    static const std::size_t NumDims = traits<Expression>::NumDimensions;
    typedef TensorBlockMapper<Index, Scalar, NumDims, Evaluator::Layout>
        TensorBlockMapper;
    typedef TensorBlock<Index, Scalar, NumDims, Evaluator::Layout>
        TensorBlock;
    Evaluator evaluator(expr, device);
    std::size_t total_size = array_prod(evaluator.dimensions());
    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
    if (total_size < cache_size) {
      // TODO(andydavis) Reduce block management overhead for small tensors.
      internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                               false>::run(expr, device);
      evaluator.cleanup();
      return;
    }
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      TensorBlockShapeType block_shape = kUniformAllDims;
      size_t block_total_size = 0;
      // Query expression tree for desired block size/shape.
      std::vector<internal::TensorOpResourceRequirements> resources;
      evaluator.getResourceRequirements(&resources);
      if (!resources.empty()) {
        // TODO(andydavis) Implement different shape/size policies.
        block_shape = resources[0].block_shape;
        block_total_size = resources[0].block_total_size;
      }
      int num_threads = device.numThreads();
      // Estimate minimum block size based on cost.
      TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
      double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
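      // taskSize(1, cost) is roughly the fraction of a well-sized task that a
      // single coefficient represents, so its inverse approximates how many
      // coefficients one block should contain.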
      size_t block_size = static_cast<size_t>(1.0 / taskSize);
      TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
                                     block_size);
      block_size = block_mapper.block_dims_total_size();
      const size_t aligned_blocksize =
          EIGEN_MAX_ALIGN_BYTES *
          divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES);
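      // Worked example (illustrative numbers): block_size == 1000 floats is
      // 4000 bytes; with EIGEN_MAX_ALIGN_BYTES == 64 this rounds up to
      // 64 * divup(4000, 64) == 64 * 63 == 4032 bytes per scratch slot.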
      void* buf = internal::aligned_malloc((num_threads+1) * aligned_blocksize);
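      // One scratch slot per pool thread plus one for the caller (for which
      // currentThreadId() returns -1), hence num_threads + 1 slots.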
      device.parallelFor(
          block_mapper.total_block_count(), cost * block_size,
          [=, &device, &evaluator, &block_mapper](Index first, Index last) {
            // currentThreadId() returns -1 if called from a thread not in the
            // threadpool, such as the main thread dispatching Eigen
            // expressions.
            const int thread_idx = device.currentThreadId();
            eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
            Scalar* thread_buf = reinterpret_cast<Scalar*>(
                static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
            for (Index i = first; i < last; ++i) {
              auto block = block_mapper.GetBlockForIndex(i, thread_buf);
              evaluator.evalBlock(&block);
            }
          });
      internal::aligned_free(buf);
    }
    evaluator.cleanup();
  }
};

#endif  // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable, bool Tileable>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
 public:
  typedef typename Expression::Index Index;
  static void run(const Expression& expr, const GpuDevice& device);
};

#if defined(__CUDACC__)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator eval, Index first, Index last, Index step_size) {
    for (Index i = first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
struct EigenMetaKernelEval<Evaluator, Index, true> {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator eval, Index first, Index last, Index step_size) {
    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const Index vectorized_size = (last / PacketSize) * PacketSize;
    const Index vectorized_step_size = step_size * PacketSize;
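    // Thread `first` handles the packets starting at first * PacketSize,
    // (first + step_size) * PacketSize, ..., so consecutive threads touch
    // consecutive packets, keeping GPU memory accesses coalesced.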
    // Use the vector path.
    for (Index i = first * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (Index i = vectorized_size + first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator memcopied_eval, Index size) {
  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;
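  // Standard grid-stride loop setup: each thread starts at its global id and
  // advances by the total number of threads in the grid.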
  // CUDA passes kernel arguments by copying their raw bytes. That's fine for
  // POD types, but for more complex types such as evaluators we should
  // conform to the C++ standard and invoke a proper copy constructor.
  Evaluator eval(memcopied_eval);
  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}

/*static*/
template <typename Expression, bool Vectorizable, bool Tileable>
inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign) {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when tensorflow
    // calls with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
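    // Worked example (illustrative numbers): block_size == 1024 on a device
    // with 80 multiprocessors and 2048 threads each gives max_blocks == 160;
    // for size == 1e6, (size + 1023) / 1024 == 977, so num_blocks == 160.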
    LAUNCH_CUDA_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}

#endif // __CUDACC__
#endif // EIGEN_USE_GPU
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H