// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#include "./InternalHeaderCheck.h"
namespace Eigen {
/**
* \class TensorExecutor
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor executor class.
*
* This class is responsible for launching the evaluation of the expression on
* the specified computing device.
*
* @tparam Vectorizable whether packet math (SSE/AVX/etc. registers and
* instructions) can be used
* @tparam Tiling whether block based tensor evaluation can be used
* (see TensorBlock.h)
*/
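//
// A minimal usage sketch (the executor is an internal class and is normally
// reached through a tensor assignment rather than called directly; the tensor
// shapes and names below are illustrative assumptions):
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
//   a.setRandom();
//   b.setRandom();
//   // The assignment builds a TensorAssignOp expression tree and hands it to
//   // TensorExecutor<Expr, DefaultDevice, ...>::run for evaluation.
//   c = a + b;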
namespace internal {
/**
* Evaluating TensorBroadcastingOp via the coefficient or packet path is
* extremely expensive. If an expression has at least one broadcast op in it,
* and it supports block based evaluation, we always prefer it, even for small
* tensors. For all other tileable ops, the block evaluation overhead for small
* tensors (those that fit into L1) is too large, and we fall back on
* vectorized evaluation.
*/
// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
template<typename Expression>
struct ExpressionHasTensorBroadcastingOp {
enum { value = false };
};
template<typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
const TensorAssignOp<LhsXprType, RhsXprType> > {
enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};
template<typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
const TensorCwiseUnaryOp<UnaryOp, XprType> > {
enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
enum {
value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
ExpressionHasTensorBroadcastingOp<RhsXprType>::value
};
};
template<typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
const TensorBroadcastingOp<Broadcast, XprType> > {
enum { value = true };
};
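// For illustration only (the expression spelling below is a rough sketch, not
// the exact Eigen type names): an assignment such as
//
//   c = a + b.broadcast(bcast);
//
// lowers to roughly
//
//   TensorAssignOp<C, TensorCwiseBinaryOp<sum_op, A,
//                                         TensorBroadcastingOp<Bcast, B>>>
//
// and the specializations above propagate `value = true` through the assign
// and binary nodes, so block evaluation is preferred even for tensors that
// fit into L1.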
// -------------------------------------------------------------------------- //
/**
* Default strategy: the expression is evaluated sequentially with a single CPU
* thread, without vectorization or block evaluation.
*/
template <typename Expression, typename Device, bool Vectorizable,
TiledEvaluation Tiling>
class TensorExecutor {
public:
typedef typename Expression::Index StorageIndex;
// Including `unsupported/Eigen/CXX11/Tensor` in different translation units
// with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
// violation. If this template is instantiated with a non-default device, it
// means that this header file was included without defining
// `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
static_assert(std::is_same<Device, DefaultDevice>::value,
"Default executor instantiated with non-default device. "
"You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
"EIGEN_USE_SYCL before including Eigen headers.");
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const Device& device = Device()) {
TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const StorageIndex size = array_prod(evaluator.dimensions());
for (StorageIndex i = 0; i < size; ++i) {
evaluator.evalScalar(i);
}
}
evaluator.cleanup();
}
};
/**
* The default async execution strategy is not implemented. Currently it is
* only available for ThreadPoolDevice (see the definition below).
*/
template <typename Expression, typename Device, typename DoneCallback,
bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};
/**
* Process all the data with a single CPU thread, using vectorized instructions.
*/
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
/*Tiling=*/TiledEvaluation::Off> {
public:
typedef typename Expression::Index StorageIndex;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(
const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const StorageIndex size = array_prod(evaluator.dimensions());
const int PacketSize = unpacket_traits<typename TensorEvaluator<
Expression, DefaultDevice>::PacketReturnType>::size;
// Give the compiler a good opportunity to unroll the loop. But don't insist
// on unrolling, because if the evaluated function is expensive the compiler
// should not unroll the loop at the expense of inlining.
const StorageIndex UnrolledSize =
(size / (4 * PacketSize)) * 4 * PacketSize;
for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
evaluator.evalPacket(i);
}
for (StorageIndex i = VectorizedSize; i < size; ++i) {
evaluator.evalScalar(i);
}
}
evaluator.cleanup();
}
};
/**
* Process all the data with a single CPU thread, using blocks of data. By
* sizing a block to fit in the L1 cache we get better cache performance.
*/
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Scalar Scalar;
typedef std::remove_const_t<Scalar> ScalarNoConst;
typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
typedef typename traits<Expression>::Index StorageIndex;
static constexpr int NumDims = traits<Expression>::NumDimensions;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const DefaultDevice& device = DefaultDevice()) {
typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
TensorBlockMapper;
typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<DefaultDevice>
TensorBlockScratch;
Evaluator evaluator(expr, device);
// TODO(ezhulenev): Do not use tiling for small tensors?
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
// Query expression tree for desired block size/shape.
const TensorBlockResourceRequirements requirements =
evaluator.getResourceRequirements();
const TensorBlockMapper block_mapper(
typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
requirements);
// Share scratch memory allocator between all blocks.
TensorBlockScratch scratch(device);
const StorageIndex total_block_count = block_mapper.blockCount();
for (StorageIndex i = 0; i < total_block_count; ++i) {
TensorBlockDesc desc = block_mapper.blockDescriptor(i);
evaluator.evalBlock(desc, scratch);
scratch.reset();
}
}
evaluator.cleanup();
}
};
/**
* Multicore strategy: the index space is partitioned and each partition is
* executed on a single core.
*
* (1) TensorExecutor will submit work to the thread pool managed by the
* ThreadPoolDevice, and will block the caller thread until all tasks are
* finished.
*
* (2) TensorAsyncExecutor is a non-blocking version that submits work to the
* thread pool managed by the ThreadPoolDevice and returns immediately. It
* invokes the 'done' callback after all tasks are finished.
*/
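// A minimal usage sketch of the two entry points (assumes EIGEN_USE_THREADS is
// defined before including the Tensor header and that the asynchronous
// `device(dev, done)` overload is available; pool and tensor sizes are
// illustrative):
//
//   #define EIGEN_USE_THREADS
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, 4);
//
//   Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
//   a.setRandom();
//   b.setRandom();
//
//   // (1) Blocking: returns only after all partitions have been evaluated.
//   c.device(device) = a + b;
//
//   // (2) Non-blocking: returns immediately; the callback runs when done.
//   Eigen::Barrier done(1);
//   c.device(device, [&done]() { done.Notify(); }) = a * b;
//   done.Wait();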
#ifdef EIGEN_USE_THREADS
template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
TensorExecutorTilingContext() = default;
TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
const TensorOpCost& b_cost, size_t b_aligned_size)
: block_mapper(b_mapper),
cost(b_cost),
aligned_blocksize(b_aligned_size) {}
TensorBlockMapper block_mapper; // navigate through blocks
TensorOpCost cost; // cost of computing a single block
size_t aligned_blocksize; // block size after memory alignment
};
// Computes block evaluation parameters and the aligned size of the temporary
// buffer needed for a single block. See TensorExecutor/TensorAsyncExecutor
// (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
const Evaluator& evaluator) {
// Query expression tree for desired block size/shape.
TensorBlockResourceRequirements requirements =
evaluator.getResourceRequirements();
// Update target block size based on cost model.
double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
1, requirements.cost_per_coeff);
requirements.size = static_cast<size_t>(1.0 / taskSize);
TensorBlockMapper block_mapper(
typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
requirements);
size_t block_size = block_mapper.blockTotalSize();
const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
const size_t aligned_blocksize =
align *
divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
return {block_mapper, requirements.cost_per_coeff * block_size,
aligned_blocksize};
}
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
const StorageIndex lastIdx) {
Evaluator evaluator = *evaluator_in;
eigen_assert(lastIdx >= firstIdx);
for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
evaluator.evalScalar(i);
}
}
static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
static constexpr int PacketSize =
unpacket_traits<typename Evaluator::PacketReturnType>::size;
static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
const StorageIndex lastIdx) {
Evaluator evaluator = *evaluator_in;
eigen_assert(lastIdx >= firstIdx);
StorageIndex i = firstIdx;
if (lastIdx - firstIdx >= PacketSize) {
eigen_assert(firstIdx % PacketSize == 0);
StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
// Give the compiler a good opportunity to unroll the loop. But don't insist
// on unrolling, because if the evaluated function is expensive the compiler
// should not unroll the loop at the expense of inlining.
for (; i <= last_chunk_offset; i += 4 * PacketSize) {
for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
last_chunk_offset = lastIdx - PacketSize;
for (; i <= last_chunk_offset; i += PacketSize) {
evaluator.evalPacket(i);
}
}
for (; i < lastIdx; ++i) {
evaluator.evalScalar(i);
}
}
static StorageIndex alignBlockSize(StorageIndex size) {
// Align block size to packet size and account for unrolling in run above.
if (size >= 16 * PacketSize) {
return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
}
// Aligning to 4 * PacketSize would increase block size by more than 25%.
return (size + PacketSize - 1) & ~(PacketSize - 1);
}
};
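// For example, with PacketSize == 8 (an illustrative AVX-float assumption),
// the alignBlockSize above behaves as follows:
//   alignBlockSize(200) == 224   // 200 >= 16 * 8, round up to a multiple of 32
//   alignBlockSize(100) == 104   // 100 <  16 * 8, round up to a multiple of 8
//                                // to avoid growing the block by more than 25%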
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const ThreadPoolDevice& device) {
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
const StorageIndex size = array_prod(evaluator.dimensions());
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
EvalRange::alignBlockSize,
[&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
EvalRange::run(&evaluator, firstIdx, lastIdx);
});
}
evaluator.cleanup();
}
};
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Index IndexType;
typedef typename traits<Expression>::Scalar Scalar;
typedef std::remove_const_t<Scalar> ScalarNoConst;
static constexpr int NumDims = traits<Expression>::NumDimensions;
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
typedef internal::TensorBlockDescriptor<NumDims, IndexType>
TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
TensorBlockScratch;
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const ThreadPoolDevice& device) {
Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
const TilingContext tiling =
internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
Vectorizable>(evaluator);
auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
IndexType lastBlockIdx) {
TensorBlockScratch scratch(device);
for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
++block_idx) {
TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
evaluator.evalBlock(desc, scratch);
scratch.reset();
}
};
// Evaluate small expressions directly as a single block.
if (tiling.block_mapper.blockCount() == 1) {
TensorBlockScratch scratch(device);
TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
evaluator.evalBlock(desc, scratch);
} else {
device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
eval_block);
}
}
evaluator.cleanup();
}
};
template <typename Expression, typename DoneCallback, bool Vectorizable,
TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
const ThreadPoolDevice& device,
DoneCallback done) {
TensorAsyncExecutorContext* const ctx =
new TensorAsyncExecutorContext(expr, device, std::move(done));
const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
if (!need_assign) {
delete ctx;
return;
}
typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
const StorageIndex size = array_prod(ctx->evaluator.dimensions());
device.parallelForAsync(
size, ctx->evaluator.costPerCoeff(Vectorizable),
EvalRange::alignBlockSize,
[ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
},
[ctx]() { delete ctx; });
};
ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
}
private:
struct TensorAsyncExecutorContext {
TensorAsyncExecutorContext(const Expression& expr,
const ThreadPoolDevice& thread_pool,
DoneCallback done)
: evaluator(expr, thread_pool), on_done(std::move(done)) {}
~TensorAsyncExecutorContext() {
evaluator.cleanup();
on_done();
}
Evaluator evaluator;
private:
DoneCallback on_done;
};
};
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
Vectorizable, /*Tileable*/ TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Index IndexType;
typedef typename traits<Expression>::Scalar Scalar;
typedef std::remove_const_t<Scalar> ScalarNoConst;
static constexpr int NumDims = traits<Expression>::NumDimensions;
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
TensorBlockScratch;
static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
const ThreadPoolDevice& device,
DoneCallback done) {
TensorAsyncExecutorContext* const ctx =
new TensorAsyncExecutorContext(expr, device, std::move(done));
const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
if (!need_assign) {
delete ctx;
return;
}
ctx->tiling = internal::GetTensorExecutorTilingContext<
Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
TensorBlockScratch scratch(ctx->device);
for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
++block_idx) {
TensorBlockDesc desc =
ctx->tiling.block_mapper.blockDescriptor(block_idx);
ctx->evaluator.evalBlock(desc, scratch);
scratch.reset();
}
};
// Evaluate small expressions directly as a single block.
if (ctx->tiling.block_mapper.blockCount() == 1) {
TensorBlockScratch scratch(ctx->device);
TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
ctx->evaluator.evalBlock(desc, scratch);
delete ctx;
} else {
ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
ctx->tiling.cost, eval_block,
[ctx]() { delete ctx; });
}
};
ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
}
private:
struct TensorAsyncExecutorContext {
TensorAsyncExecutorContext(const Expression& expr,
const ThreadPoolDevice& thread_pool,
DoneCallback done)
: device(thread_pool),
evaluator(expr, thread_pool),
on_done(std::move(done)) {}
~TensorAsyncExecutorContext() {
evaluator.cleanup();
on_done();
}
const ThreadPoolDevice& device;
Evaluator evaluator;
TilingContext tiling;
private:
DoneCallback on_done;
};
};
#endif // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device);
};
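// A minimal usage sketch (assumes EIGEN_USE_GPU, a CUDA/HIP compiler so the
// definition guarded by EIGEN_GPUCC below is available, and device buffers
// already filled with data; names and sizes are illustrative):
//
//   #define EIGEN_USE_GPU
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//
//   const std::size_t bytes = 64 * 64 * sizeof(float);
//   float* d_a = static_cast<float*>(gpu_device.allocate(bytes));
//   float* d_b = static_cast<float*>(gpu_device.allocate(bytes));
//   float* d_c = static_cast<float*>(gpu_device.allocate(bytes));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > A(d_a, 64, 64);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > B(d_b, 64, 64);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > C(d_c, 64, 64);
//
//   // Launches EigenMetaKernel (defined below) on the device's stream.
//   C.device(gpu_device) = A + B;
//
//   gpu_device.synchronize();
//   gpu_device.deallocate(d_a);
//   gpu_device.deallocate(d_b);
//   gpu_device.deallocate(d_c);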
#if defined(EIGEN_GPUCC)
// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs,
Index rhs) {
const Index highest = NumTraits<Index>::highest();
const Index lowest = NumTraits<Index>::lowest();
if (lhs > 0 && rhs > 0) {
return lhs > highest - rhs ? 1 : 0;
} else if (lhs < 0 && rhs < 0) {
return lhs < lowest - rhs ? -1 : 0;
} else {
return 0;
}
}
// Returns lhs + rhs, saturating to the highest/lowest representable value on
// overflow/underflow respectively.
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
const Index highest = NumTraits<Index>::highest();
const Index lowest = NumTraits<Index>::lowest();
int overflow = sum_will_overflow(lhs, rhs);
return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
}
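// For example, with a hypothetical 8-bit Index type:
//   saturate_add<int8_t>( 120,  20) ==  127  // saturates at highest()
//   saturate_add<int8_t>(-120, -20) == -128  // saturates at lowest()
//   saturate_add<int8_t>( 100,  20) ==  120  // no overflow, plain addition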
// A functor that adds step_size to a given index, saturating to avoid
// overflow/underflow. If overflow/underflow is not possible, regular addition
// is used (for efficiency).
template <typename Index>
struct SafeStep {
// lastIdx is one past the end of the possible indexes.
// step_size is the value that will be added to the given index when the
// functor is called.
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
: can_overflow_(sum_will_overflow(lastIdx, step_size)),
step_size_(step_size) {}
// Adds step_size to index, saturating on overflow (if overflow is possible).
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
}
private:
const bool can_overflow_;
const Index step_size_;
};
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
SafeStep<StorageIndex> safe_step(lastIdx, step_size);
for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
eval.evalScalar(i);
}
}
};
template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
const StorageIndex vectorized_step_size = step_size * PacketSize;
SafeStep<StorageIndex> safe_vectorized_step(vectorized_size,
vectorized_step_size);
// Use the vector path
for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
i = safe_vectorized_step(i)) {
eval.evalPacket(i);
}
SafeStep<StorageIndex> safe_step(lastIdx, step_size);
for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
i = safe_step(i)) {
eval.evalScalar(i);
}
}
};
template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {
const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
const StorageIndex step_size = blockDim.x * gridDim.x;
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}
/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
const int block_size = device.maxGpuThreadsPerBlock();
const int max_blocks =
numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor(),
NumTraits<StorageIndex>::highest()) /
block_size;
const StorageIndex size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash when TensorFlow calls
// with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
LAUNCH_GPU_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
}
#endif // EIGEN_GPUCC
#endif // EIGEN_USE_GPU
// SYCL Executor policy
#ifdef EIGEN_USE_SYCL
template <typename Evaluator>
struct ExecExprFunctorKernel {
typedef typename Evaluator::Index Index;
Evaluator evaluator;
const Index range;
template <typename Scratch>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
const Scratch, Evaluator evaluator_, const Index range_)
: evaluator(evaluator_), range(range_) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
cl::sycl::nd_item<1> itemID) const {
compute(itemID);
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec>
compute(const cl::sycl::nd_item<1>& itemID) const {
Index gId = static_cast<Index>(itemID.get_global_linear_id());
Index total_threads = itemID.get_global_range(0);
for (Index i = gId; i < range; i += total_threads) {
evaluator.evalScalar(i);
}
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec>
compute(const cl::sycl::nd_item<1>& itemID) const {
const Index vectorizedRange =
(range / Evaluator::PacketSize) * Evaluator::PacketSize;
Index gId = static_cast<Index>(itemID.get_global_linear_id());
const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
const Index start = Evaluator::PacketSize * gId;
for (Index i = start; i < vectorizedRange; i += step) {
evaluator.evalPacket(i);
}
gId += vectorizedRange;
for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
evaluator.evalScalar(i);
}
}
};
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index Index;
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const Eigen::SyclDevice& dev) {
typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
Evaluator evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
Index range, GRange, tileSize;
Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
total_size = (total_size == 0) ? 1 : total_size;
const int PacketSize =
Eigen::PacketType<typename Evaluator::CoeffReturnType,
Eigen::SyclDevice>::size;
Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
range = total_size;
dev.template nullary_kernel_launcher<
typename Evaluator::CoeffReturnType,
ExecExprFunctorKernel<Evaluator> >(
evaluator,
cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
cl::sycl::range<1>(tileSize)),
Index(1), range);
}
evaluator.cleanup();
}
};
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H