// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Andy Davis <andydavis@google.com>
// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
namespace Eigen {
namespace internal {
namespace {
// Helper template to choose between ColMajor and RowMajor values.
template <int Layout>
struct cond;
template <>
struct cond<ColMajor> {
template <typename T>
EIGEN_STRONG_INLINE const T& operator()(const T& col,
const T& /*row*/) const {
return col;
}
};
template <>
struct cond<RowMajor> {
template <typename T>
EIGEN_STRONG_INLINE const T& operator()(const T& /*col*/,
const T& row) const {
return row;
}
};
} // namespace
/**
* \enum TensorBlockShapeType
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block shape type.
*
* Tensor block shape type defines the shape preference for the blocks
* extracted from the larger tensor.
*
* Example:
*
* We want to extract blocks of 100 elements from the large 100x100 tensor:
* - tensor: 100x100
* - target_block_size: 100
*
* TensorBlockShapeType:
* - kUniformAllDims: 100 blocks of size 10x10
* - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100, depending on
* column- or row-major layout)
*/
enum TensorBlockShapeType {
kUniformAllDims,
kSkewedInnerDims
};
struct TensorOpResourceRequirements {
TensorBlockShapeType block_shape;
Index block_total_size;
// TODO(andydavis) Add 'target_num_threads' to support communication of
// thread-resource requirements. This will allow ops deep in the
// expression tree (like reductions) to communicate resource
// requirements based on local state (like the total number of reductions
// to be computed).
TensorOpResourceRequirements(TensorBlockShapeType shape,
const Index size)
: block_shape(shape), block_total_size(size) {}
};
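// Illustrative sketch: an op that prefers skewed inner-dimension blocks of
// roughly 1024 coefficients could express that preference as follows (the
// name `req` is hypothetical):
//   TensorOpResourceRequirements req(kSkewedInnerDims, 1024);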
// Tries to merge multiple resource requirements.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MergeResourceRequirements(
const std::vector<TensorOpResourceRequirements>& resources,
TensorBlockShapeType* block_shape, Index* block_total_size) {
if (resources.empty()) {
return;
}
// TODO(andydavis) Implement different policies (i.e. revert to a default
// policy if block shapes/sizes conflict).
*block_shape = resources[0].block_shape;
*block_total_size = resources[0].block_total_size;
for (std::vector<TensorOpResourceRequirements>::size_type i = 1; i < resources.size(); ++i) {
if (resources[i].block_shape == kSkewedInnerDims &&
*block_shape != kSkewedInnerDims) {
*block_shape = kSkewedInnerDims;
}
*block_total_size =
numext::maxi(*block_total_size, resources[i].block_total_size);
}
}
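// Illustrative sketch of the merging policy above, with hypothetical variable
// names: a request for uniform 512-coefficient blocks merged with a request
// for skewed 1024-coefficient blocks yields (kSkewedInnerDims, 1024), since
// kSkewedInnerDims takes precedence and the largest size wins.
//
//   std::vector<TensorOpResourceRequirements> resources;
//   resources.push_back(TensorOpResourceRequirements(kUniformAllDims, 512));
//   resources.push_back(TensorOpResourceRequirements(kSkewedInnerDims, 1024));
//   TensorBlockShapeType block_shape;
//   Index block_total_size;
//   MergeResourceRequirements(resources, &block_shape, &block_total_size);
//   // block_shape == kSkewedInnerDims, block_total_size == 1024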
/**
* \class TensorBlock
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block class.
*
* This class represents a tensor block specified by the index of the
* first block coefficient, and the size of the block in each dimension.
*/
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlock {
public:
typedef DSizes<StorageIndex, NumDims> Dimensions;
TensorBlock(const StorageIndex first_coeff_index, const Dimensions& block_sizes,
const Dimensions& block_strides, const Dimensions& tensor_strides,
Scalar* data)
: m_first_coeff_index(first_coeff_index),
m_block_sizes(block_sizes),
m_block_strides(block_strides),
m_tensor_strides(tensor_strides),
m_data(data) {}
StorageIndex first_coeff_index() const { return m_first_coeff_index; }
const Dimensions& block_sizes() const { return m_block_sizes; }
const Dimensions& block_strides() const { return m_block_strides; }
const Dimensions& tensor_strides() const { return m_tensor_strides; }
Scalar* data() { return m_data; }
const Scalar* data() const { return m_data; }
private:
StorageIndex m_first_coeff_index;
Dimensions m_block_sizes;
Dimensions m_block_strides;
Dimensions m_tensor_strides;
Scalar* m_data; // Not owned.
};
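// Illustrative sketch with hypothetical names: describing a 2x3 column-major
// block that starts at coefficient (0, 0) of a 10x10 float tensor and is
// backed by a caller-owned `buffer` of at least 6 floats.
//
//   DSizes<Index, 2> block_sizes(2, 3);
//   DSizes<Index, 2> block_strides(1, 2);    // strides within the block
//   DSizes<Index, 2> tensor_strides(1, 10);  // strides of the full tensor
//   TensorBlock<float, Index, 2, ColMajor> block(
//       /*first_coeff_index=*/0, block_sizes, block_strides, tensor_strides,
//       buffer);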
template <typename Scalar, typename StorageIndex>
struct TensorBlockCopyOp {
typedef typename packet_traits<Scalar>::type Packet;
enum {
Vectorizable = packet_traits<Scalar>::Vectorizable,
PacketSize = packet_traits<Scalar>::size
};
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
const StorageIndex src_index, const StorageIndex src_stride,
const Scalar* EIGEN_RESTRICT src_data) {
const Scalar* src = &src_data[src_index];
Scalar* dst = &dst_data[dst_index];
if (!Vectorizable) {
for (Index i = 0; i < num_coeff_to_copy; ++i) {
dst[i * dst_stride] = src[i * src_stride];
}
return;
}
if (src_stride == 1) {
const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
if (dst_stride == 1) {
// LINEAR
for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
Packet p = ploadu<Packet>(src + i);
pstoreu<Scalar, Packet>(dst + i, p);
}
for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
dst[i] = src[i];
}
} else {
// SCATTER
for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
Packet p = ploadu<Packet>(src + i);
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
}
for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
dst[i * dst_stride] = src[i];
}
}
} else if (src_stride == 0) {
const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
if (dst_stride == 1) {
// LINEAR
for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
Packet p = pload1<Packet>(src);
pstoreu<Scalar, Packet>(dst + i, p);
}
for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
dst[i] = *src;
}
} else {
// SCATTER
for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
Packet p = pload1<Packet>(src);
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
}
for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
dst[i * dst_stride] = *src;
}
}
} else {
if (dst_stride == 1) {
// GATHER
const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
pstoreu<Scalar, Packet>(dst + i, p);
}
for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
dst[i] = src[i * src_stride];
}
} else {
// RANDOM
for (StorageIndex i = 0; i < num_coeff_to_copy; ++i) {
dst[i * dst_stride] = src[i * src_stride];
}
}
}
}
};
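// Illustrative sketch with hypothetical `src`/`dst` arrays: gathering the 10
// coefficients of one row of a 10x10 column-major tensor (source stride 10)
// into a contiguous destination (destination stride 1); this exercises the
// GATHER path above.
//
//   TensorBlockCopyOp<float, Index>::Run(
//       /*num_coeff_to_copy=*/10, /*dst_index=*/0, /*dst_stride=*/1, dst,
//       /*src_index=*/0, /*src_stride=*/10, src);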
/**
* \class TensorBlockIO
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block IO class.
*
* This class is responsible for copying data between a tensor and a tensor
* block.
*/
template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
bool BlockRead>
class TensorBlockIO {
public:
typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
typedef TensorBlockCopyOp<Scalar, StorageIndex> BlockCopyOp;
protected:
typedef array<StorageIndex, NumDims> Dimensions;
struct BlockIteratorState {
StorageIndex input_stride;
StorageIndex output_stride;
StorageIndex input_span;
StorageIndex output_span;
StorageIndex size;
StorageIndex count;
BlockIteratorState()
: input_stride(0),
output_stride(0),
input_span(0),
output_span(0),
size(0),
count(0) {}
};
// Computes how many inner dimensions can be squeezed when doing IO
// between a tensor and a block. It is safe to squeeze inner dimensions only
// if they are not reordered.
static int NumSqueezableInnerDims(const Dimensions& tensor_to_block_dim_map) {
int num_squeezable_dims = 0;
if (Layout == ColMajor) {
for (int i = 0; i < NumDims; ++i) {
if (tensor_to_block_dim_map[i] == i) num_squeezable_dims++;
else break;
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
if (tensor_to_block_dim_map[i] == i) num_squeezable_dims++;
else break;
}
}
return num_squeezable_dims;
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
const Block& block, StorageIndex first_coeff_index,
const Dimensions& tensor_to_block_dim_map,
const Dimensions& tensor_strides,
const Scalar* src_data,
Scalar* dst_data) {
// Do not squeeze reordered inner dimensions.
int num_squeezable_dims = NumSqueezableInnerDims(tensor_to_block_dim_map);
// Find the innermost tensor dimension whose size is not 1. This is the
// effective inner dim. If all dimensions are of size 1, then fall back to
// using the actual innermost dim to avoid out-of-bound access.
StorageIndex num_size_one_inner_dims = 0;
for (int i = 0; i < num_squeezable_dims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
num_size_one_inner_dims = i;
break;
}
}
// Calculate strides and dimensions.
const StorageIndex tensor_stride1_dim = cond<Layout>()(
num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
const StorageIndex block_dim_for_tensor_stride1_dim =
NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
StorageIndex block_inner_dim_size =
NumDims == 0 ? 1
: block.block_sizes()[block_dim_for_tensor_stride1_dim];
// Squeeze multiple inner dims into one for larger inner dim size.
for (Index i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
const Index dim = cond<Layout>()(i, NumDims - i - 1);
const StorageIndex block_stride =
block.block_strides()[tensor_to_block_dim_map[dim]];
if (block_inner_dim_size == block_stride &&
block_stride == tensor_strides[dim]) {
block_inner_dim_size *=
block.block_sizes()[tensor_to_block_dim_map[dim]];
++num_size_one_inner_dims;
} else {
break;
}
}
StorageIndex inputIndex;
StorageIndex outputIndex;
StorageIndex input_stride;
StorageIndex output_stride;
// Setup strides to read/write along the tensor's stride1 dimension.
if (BlockRead) {
inputIndex = first_coeff_index;
outputIndex = 0;
input_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
output_stride =
NumDims == 0
? 1
: block.block_strides()[block_dim_for_tensor_stride1_dim];
} else {
inputIndex = 0;
outputIndex = first_coeff_index;
input_stride =
NumDims == 0
? 1
: block.block_strides()[block_dim_for_tensor_stride1_dim];
output_stride = NumDims == 0 ? 1 : tensor_strides[tensor_stride1_dim];
}
const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
array<BlockIteratorState, at_least_1_dim> block_iter_state;
// Initialize block iterator state. Squeeze away any dimension of size 1.
Index num_squeezed_dims = 0;
for (Index i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
const Index dim = cond<Layout>()(i + 1, NumDims - i - 2);
const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
if (size == 1) {
continue;
}
block_iter_state[num_squeezed_dims].size = size;
if (BlockRead) {
block_iter_state[num_squeezed_dims].input_stride = tensor_strides[dim];
block_iter_state[num_squeezed_dims].output_stride =
block.block_strides()[tensor_to_block_dim_map[dim]];
} else {
block_iter_state[num_squeezed_dims].input_stride =
block.block_strides()[tensor_to_block_dim_map[dim]];
block_iter_state[num_squeezed_dims].output_stride = tensor_strides[dim];
}
block_iter_state[num_squeezed_dims].input_span =
block_iter_state[num_squeezed_dims].input_stride *
(block_iter_state[num_squeezed_dims].size - 1);
block_iter_state[num_squeezed_dims].output_span =
block_iter_state[num_squeezed_dims].output_stride *
(block_iter_state[num_squeezed_dims].size - 1);
++num_squeezed_dims;
}
// Iterate copying data from src to dst.
const StorageIndex block_total_size =
NumDims == 0 ? 1 : block.block_sizes().TotalSize();
for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
BlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
dst_data, inputIndex, input_stride, src_data);
// Update index.
for (int j = 0; j < num_squeezed_dims; ++j) {
if (++block_iter_state[j].count < block_iter_state[j].size) {
inputIndex += block_iter_state[j].input_stride;
outputIndex += block_iter_state[j].output_stride;
break;
}
block_iter_state[j].count = 0;
inputIndex -= block_iter_state[j].input_span;
outputIndex -= block_iter_state[j].output_span;
}
}
}
};
/**
* \class TensorBlockReader
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block reader class.
*
* This class is responsible for reading a tensor block.
*
*/
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
Layout, /*BlockRead=*/true> {
public:
typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true> Base;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
Block* block, const Scalar* src_data) {
array<StorageIndex, NumDims> tensor_to_block_dim_map;
for (int i = 0; i < NumDims; ++i) {
tensor_to_block_dim_map[i] = i;
}
Base::Copy(*block, block->first_coeff_index(), tensor_to_block_dim_map,
block->tensor_strides(), src_data, block->data());
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
Block* block, StorageIndex first_coeff_index,
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
tensor_strides, src_data, block->data());
}
};
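// Illustrative sketch with hypothetical names: filling `block` (e.g. the
// TensorBlock sketched above) with a copy of the corresponding region of a
// tensor stored at `tensor_data`.
//
//   TensorBlockReader<float, Index, 2, ColMajor>::Run(&block, tensor_data);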
/**
* \class TensorBlockWriter
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block writer class.
*
* This class is responsible for writing a tensor block.
*
*/
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
Layout, /*BlockRead=*/false> {
public:
typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false> Base;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const Block& block, Scalar* dst_data) {
array<StorageIndex, NumDims> tensor_to_block_dim_map;
for (int i = 0; i < NumDims; ++i) {
tensor_to_block_dim_map[i] = i;
}
Base::Copy(block, block.first_coeff_index(), tensor_to_block_dim_map,
block.tensor_strides(), block.data(), dst_data);
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const Block& block, StorageIndex first_coeff_index,
const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
tensor_strides, block.data(), dst_data);
}
};
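// Illustrative sketch with hypothetical names: writing the contents of
// `block` back into the tensor buffer `tensor_data`.
//
//   TensorBlockWriter<float, Index, 2, ColMajor>::Run(block, tensor_data);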
/**
* \class TensorBlockCwiseUnaryOp
* \ingroup CXX11_Tensor_Module
*
* \brief Carries out a cwise unary op on a number of coefficients.
*
* This class reads strided input from the argument, and writes the
* result of the cwise unary op to the strided output array.
*
*/
template <bool Vectorizable>
struct TensorBlockCwiseUnaryOp {
template <typename StorageIndex, typename UnaryFunctor,
typename OutputScalar, typename InputScalar>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const UnaryFunctor& functor, const StorageIndex num_coeff,
const StorageIndex output_index, const StorageIndex output_stride,
OutputScalar* output_data, const StorageIndex input_index,
const StorageIndex input_stride, const InputScalar* input_data) {
typedef const Array<InputScalar, Dynamic, 1> Input;
typedef Array<OutputScalar, Dynamic, 1> Output;
typedef Map<Input, 0, InnerStride<> > InputMap;
typedef Map<Output, 0, InnerStride<> > OutputMap;
const InputScalar* input_base = &input_data[input_index];
OutputScalar* output_base = &output_data[output_index];
const InputMap input(input_base, num_coeff, InnerStride<>(input_stride));
OutputMap output(output_base, num_coeff, InnerStride<>(output_stride));
output = CwiseUnaryOp<UnaryFunctor, InputMap>(input, functor);
}
};
template<>
struct TensorBlockCwiseUnaryOp<true> {
template <typename StorageIndex, typename UnaryFunctor,
typename OutputScalar, typename InputScalar>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const UnaryFunctor& functor, const StorageIndex num_coeff,
const StorageIndex output_index, const StorageIndex output_stride,
OutputScalar* output_data, const StorageIndex input_index,
const StorageIndex input_stride, const InputScalar* input_data) {
if (input_stride == 1 && output_stride == 1) {
typedef const Array<InputScalar, Dynamic, 1> Input;
typedef Array<OutputScalar, Dynamic, 1> Output;
const Map<Input> input(&input_data[input_index], num_coeff);
Map<Output> output(&output_data[output_index], num_coeff);
output = CwiseUnaryOp<UnaryFunctor, Map<Input> >(input, functor);
} else {
TensorBlockCwiseUnaryOp<false>::Run(
functor, num_coeff, output_index, output_stride, output_data,
input_index, input_stride, input_data);
}
}
};
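// Illustrative sketch with hypothetical `in`/`out` arrays: negating 8
// contiguous coefficients. With unit strides the specialization above maps
// both arrays linearly; otherwise it falls back to the strided version.
//
//   scalar_opposite_op<float> neg;
//   TensorBlockCwiseUnaryOp<true>::Run(
//       neg, /*num_coeff=*/8, /*output_index=*/0, /*output_stride=*/1, out,
//       /*input_index=*/0, /*input_stride=*/1, in);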
/**
* \class TensorBlockCwiseUnaryIO
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block IO class for carrying out cwise unary ops.
*
* This class carries out the unary op on given blocks.
*/
template <typename UnaryFunctor, typename StorageIndex, typename OutputScalar,
int NumDims, int Layout>
struct TensorBlockCwiseUnaryIO {
typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims,
Layout>::Dimensions Dimensions;
typedef TensorBlockCwiseUnaryOp<
packet_traits<OutputScalar>::Vectorizable &&
functor_traits<UnaryFunctor>::PacketAccess>
TensorBlockCwiseUnaryOpImpl;
struct BlockIteratorState {
StorageIndex output_stride, output_span;
StorageIndex input_stride, input_span;
StorageIndex size, count;
};
template <typename InputScalar>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const UnaryFunctor& functor, const Dimensions& block_sizes,
const Dimensions& block_strides, OutputScalar* output_data,
const array<StorageIndex, NumDims>& input_strides,
const InputScalar* input_data) {
// Find the innermost dimension whose size is not 1. This is the effective
// inner dim. If all dimensions are of size 1, fall back to using the actual
// innermost dim to avoid out-of-bound access.
int num_size_one_inner_dims = 0;
for (int i = 0; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
if (block_sizes[dim] != 1) {
num_size_one_inner_dims = i;
break;
}
}
// Calculate strides and dimensions.
const int inner_dim =
NumDims == 0 ? 1
: cond<Layout>()(num_size_one_inner_dims,
NumDims - num_size_one_inner_dims - 1);
StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
// Merge multiple inner dims into one for larger inner dim size (i.e.
// fewer calls to TensorBlockCwiseUnaryOp::Run()).
if (inner_dim_size == block_strides[dim] &&
block_strides[dim] == input_strides[dim]) {
inner_dim_size *= block_sizes[dim];
++num_size_one_inner_dims;
} else {
break;
}
}
StorageIndex output_index = 0, input_index = 0;
const StorageIndex output_stride =
NumDims == 0 ? 1 : block_strides[inner_dim];
const StorageIndex input_stride =
NumDims == 0 ? 1 : input_strides[inner_dim];
const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
array<BlockIteratorState, at_least_1_dim> block_iter_state;
// Initialize block iterator state. Squeeze away any dimension of size 1.
int num_squeezed_dims = 0;
for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
const StorageIndex size = block_sizes[dim];
if (size == 1) {
continue;
}
BlockIteratorState& state = block_iter_state[num_squeezed_dims];
state.output_stride = block_strides[dim];
state.input_stride = input_strides[dim];
state.size = size;
state.output_span = state.output_stride * (size - 1);
state.input_span = state.input_stride * (size - 1);
state.count = 0;
++num_squeezed_dims;
}
// Compute cwise unary op.
const StorageIndex block_total_size =
NumDims == 0 ? 1 : block_sizes.TotalSize();
for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
TensorBlockCwiseUnaryOpImpl::Run(functor, inner_dim_size, output_index,
output_stride, output_data, input_index,
input_stride, input_data);
// Update index.
for (int j = 0; j < num_squeezed_dims; ++j) {
BlockIteratorState& state = block_iter_state[j];
if (++state.count < state.size) {
output_index += state.output_stride;
input_index += state.input_stride;
break;
}
state.count = 0;
output_index -= state.output_span;
input_index -= state.input_span;
}
}
}
};
/**
* \class TensorBlockCwiseBinaryOp
* \ingroup CXX11_Tensor_Module
*
* \brief Carries out a cwise binary op on a number of coefficients.
*
* This class reads strided inputs from left and right operands, and writes the
* result of the cwise binary op to the strided output array.
*
*/
template<bool Vectorizable>
struct TensorBlockCwiseBinaryOp {
template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
typename LeftScalar, typename RightScalar>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const BinaryFunctor& functor, const StorageIndex num_coeff,
const StorageIndex output_index, const StorageIndex output_stride,
OutputScalar* output_data, const StorageIndex left_index,
const StorageIndex left_stride, const LeftScalar* left_data,
const StorageIndex right_index, const StorageIndex right_stride,
const RightScalar* right_data) {
typedef const Array<LeftScalar, Dynamic, 1> Lhs;
typedef const Array<RightScalar, Dynamic, 1> Rhs;
typedef Array<OutputScalar, Dynamic, 1> Out;
typedef Map<Lhs, 0, InnerStride<> > LhsMap;
typedef Map<Rhs, 0, InnerStride<> > RhsMap;
typedef Map<Out, 0, InnerStride<> > OutMap;
const LeftScalar* lhs_base = &left_data[left_index];
const RightScalar* rhs_base = &right_data[right_index];
OutputScalar* out_base = &output_data[output_index];
const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride));
const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
OutMap out(out_base, num_coeff, InnerStride<>(output_stride));
out = CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
}
};
template<>
struct TensorBlockCwiseBinaryOp<true> {
template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
typename LeftScalar, typename RightScalar>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const BinaryFunctor& functor, const StorageIndex num_coeff,
const StorageIndex output_index, const StorageIndex output_stride,
OutputScalar* output_data, const StorageIndex left_index,
const StorageIndex left_stride, const LeftScalar* left_data,
const StorageIndex right_index, const StorageIndex right_stride,
const RightScalar* right_data) {
if (left_stride == 1 && right_stride == 1 && output_stride == 1) {
typedef const Array<LeftScalar, Dynamic, 1> Lhs;
typedef const Array<RightScalar, Dynamic, 1> Rhs;
typedef Array<OutputScalar, Dynamic, 1> Out;
const LeftScalar* lhs_base = &left_data[left_index];
const RightScalar* rhs_base = &right_data[right_index];
OutputScalar* out_base = &output_data[output_index];
const Map<Lhs> lhs(lhs_base, num_coeff);
const Map<Rhs> rhs(rhs_base, num_coeff);
Map<Out> out(out_base, num_coeff);
out = CwiseBinaryOp<BinaryFunctor, Map<Lhs>, Map<Rhs> >(lhs, rhs, functor);
} else {
TensorBlockCwiseBinaryOp<false>::Run(
functor, num_coeff, output_index, output_stride, output_data,
left_index, left_stride, left_data, right_index, right_stride,
right_data);
}
}
};
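// Illustrative sketch with hypothetical `lhs`/`rhs`/`out` arrays: summing 8
// contiguous coefficients element-wise. With unit strides the specialization
// above takes the packet path; otherwise it delegates to the strided version.
//
//   scalar_sum_op<float> sum;
//   TensorBlockCwiseBinaryOp<true>::Run(
//       sum, /*num_coeff=*/8, /*output_index=*/0, /*output_stride=*/1, out,
//       /*left_index=*/0, /*left_stride=*/1, lhs,
//       /*right_index=*/0, /*right_stride=*/1, rhs);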
/**
* \class TensorBlockCwiseBinaryIO
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block IO class for carrying out cwise binary ops.
*
* This class carries out the binary op on given blocks.
*
*/
template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
int NumDims, int Layout>
struct TensorBlockCwiseBinaryIO {
typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims, Layout>::Dimensions Dimensions;
typedef TensorBlockCwiseBinaryOp<
packet_traits<OutputScalar>::Vectorizable &&
functor_traits<BinaryFunctor>::PacketAccess>
TensorBlockCwiseBinaryOpImpl;
struct BlockIteratorState {
StorageIndex output_stride, output_span;
StorageIndex left_stride, left_span;
StorageIndex right_stride, right_span;
StorageIndex size, count;
};
template <typename LeftScalar, typename RightScalar>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const BinaryFunctor& functor, const Dimensions& block_sizes,
const Dimensions& block_strides, OutputScalar* output_data,
const array<StorageIndex, NumDims>& left_strides,
const LeftScalar* left_data,
const array<StorageIndex, NumDims>& right_strides,
const RightScalar* right_data) {
// Find the innermost dimension whose size is not 1. This is the effective
// inner dim. If all dimensions are of size 1, fall back to using the actual
// innermost dim to avoid out-of-bound access.
int num_size_one_inner_dims = 0;
for (int i = 0; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
if (block_sizes[dim] != 1) {
num_size_one_inner_dims = i;
break;
}
}
// Calculate strides and dimensions.
const int inner_dim =
NumDims == 0 ? 1
: cond<Layout>()(num_size_one_inner_dims,
NumDims - num_size_one_inner_dims - 1);
StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
// Merge multiple inner dims into one for larger inner dim size (i.e.
// fewer calls to TensorBlockCwiseBinaryOp::Run()).
if (inner_dim_size == block_strides[dim] &&
block_strides[dim] == left_strides[dim] &&
block_strides[dim] == right_strides[dim]) {
inner_dim_size *= block_sizes[dim];
++num_size_one_inner_dims;
} else {
break;
}
}
StorageIndex output_index = 0, left_index = 0, right_index = 0;
const StorageIndex output_stride =
NumDims == 0 ? 1 : block_strides[inner_dim];
const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
const StorageIndex right_stride =
NumDims == 0 ? 1 : right_strides[inner_dim];
const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
array<BlockIteratorState, at_least_1_dim> block_iter_state;
// Initialize block iterator state. Squeeze away any dimension of size 1.
int num_squeezed_dims = 0;
for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
const StorageIndex size = block_sizes[dim];
if (size == 1) {
continue;
}
BlockIteratorState& state = block_iter_state[num_squeezed_dims];
state.output_stride = block_strides[dim];
state.left_stride = left_strides[dim];
state.right_stride = right_strides[dim];
state.size = size;
state.output_span = state.output_stride * (size - 1);
state.left_span = state.left_stride * (size - 1);
state.right_span = state.right_stride * (size - 1);
state.count = 0;
++num_squeezed_dims;
}
// Compute cwise binary op.
const StorageIndex block_total_size =
NumDims == 0 ? 1 : block_sizes.TotalSize();
for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
TensorBlockCwiseBinaryOpImpl::Run(functor, inner_dim_size, output_index,
output_stride, output_data, left_index,
left_stride, left_data, right_index,
right_stride, right_data);
// Update index.
for (int j = 0; j < num_squeezed_dims; ++j) {
BlockIteratorState& state = block_iter_state[j];
if (++state.count < state.size) {
output_index += state.output_stride;
left_index += state.left_stride;
right_index += state.right_stride;
break;
}
state.count = 0;
output_index -= state.output_span;
left_index -= state.left_span;
right_index -= state.right_span;
}
}
}
};
/**
* \class TensorBlockView
* \ingroup CXX11_Tensor_Module
*
* \brief Read-only view into a block of data.
*
* This class provides read-only access to a block of data in impl. It may need
* to allocate space for holding the intermediate result.
*
*/
template <class ArgType, class Device>
struct TensorBlockView {
typedef TensorEvaluator<ArgType, Device> Impl;
typedef typename Impl::Index StorageIndex;
typedef typename remove_const<typename Impl::Scalar>::type Scalar;
static const int NumDims = array_size<typename Impl::Dimensions>::value;
typedef DSizes<StorageIndex, NumDims> Dimensions;
// Constructs a TensorBlockView for `impl`. `block` is only used for
// specifying the start offset, shape, and strides of the block.
template <typename OtherTensorBlock>
TensorBlockView(const Device& device,
const TensorEvaluator<ArgType, Device>& impl,
const OtherTensorBlock& block)
: m_device(device),
m_block_sizes(block.block_sizes()),
m_data(NULL),
m_allocated_data(NULL) {
if (Impl::RawAccess && impl.data() != NULL) {
m_data = impl.data() + block.first_coeff_index();
m_block_strides = block.tensor_strides();
} else {
// Actually make a copy.
// TODO(wuke): This sometimes puts a lot of pressure on the heap allocator.
// Consider allowing ops to request additional temporary block memory in
// TensorOpResourceRequirements.
m_allocated_data = static_cast<Scalar*>(
m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar)));
m_data = m_allocated_data;
if (NumDims > 0) {
if (static_cast<int>(Impl::Layout) == static_cast<int>(ColMajor)) {
m_block_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1];
}
} else {
m_block_strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1];
}
}
}
TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block(
block.first_coeff_index(), m_block_sizes, m_block_strides,
block.tensor_strides(), m_allocated_data);
impl.block(&input_block);
}
}
~TensorBlockView() {
if (m_allocated_data != NULL) {
m_device.deallocate(m_allocated_data);
}
}
const Dimensions& block_sizes() const { return m_block_sizes; }
const Dimensions& block_strides() const { return m_block_strides; }
const Scalar* data() const { return m_data; }
private:
const Device EIGEN_DEVICE_REF m_device;
Dimensions m_block_sizes, m_block_strides;
const Scalar* m_data; // Not owned.
Scalar* m_allocated_data; // Owned.
};
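// Illustrative sketch, assuming an enclosing evaluator with an `m_device`
// member, an argument evaluator `m_argImpl`, and a `block` being evaluated
// (all of these names are hypothetical): the view either aliases the
// argument's raw data or materializes the block into temporary storage.
//
//   TensorBlockView<ArgType, Device> arg_view(m_device, m_argImpl, *block);
//   const Scalar* arg_data = arg_view.data();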
/**
* \class TensorBlockMapper
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor block mapper class.
*
* This class is responsible for iterating over the blocks of a tensor.
*/
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorBlockMapper {
public:
typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
typedef DSizes<StorageIndex, NumDims> Dimensions;
TensorBlockMapper(const Dimensions& dims,
const TensorBlockShapeType block_shape,
Index min_target_size)
: m_dimensions(dims),
m_block_dim_sizes(BlockDimensions(dims, block_shape, convert_index<StorageIndex>(min_target_size))) {
// Calculate block counts by dimension and total block count.
DSizes<StorageIndex, NumDims> block_count;
for (Index i = 0; i < block_count.rank(); ++i) {
block_count[i] = divup(m_dimensions[i], m_block_dim_sizes[i]);
}
m_total_block_count = array_prod(block_count);
// Calculate block strides (used for enumerating blocks).
if (NumDims > 0) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_block_strides[0] = 1;
m_tensor_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1];
m_tensor_strides[i] = m_tensor_strides[i - 1] * m_dimensions[i - 1];
}
} else {
m_block_strides[NumDims - 1] = 1;
m_tensor_strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1];
m_tensor_strides[i] = m_tensor_strides[i + 1] * m_dimensions[i + 1];
}
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block
GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
StorageIndex first_coeff_index = 0;
DSizes<StorageIndex, NumDims> coords;
DSizes<StorageIndex, NumDims> sizes;
DSizes<StorageIndex, NumDims> strides;
if (NumDims > 0) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const StorageIndex idx = block_index / m_block_strides[i];
coords[i] = idx * m_block_dim_sizes[i];
sizes[i] =
numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
block_index -= idx * m_block_strides[i];
first_coeff_index += coords[i] * m_tensor_strides[i];
}
coords[0] = block_index * m_block_dim_sizes[0];
sizes[0] =
numext::mini((m_dimensions[0] - coords[0]), m_block_dim_sizes[0]);
first_coeff_index += coords[0] * m_tensor_strides[0];
strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
strides[i] = strides[i - 1] * sizes[i - 1];
}
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const StorageIndex idx = block_index / m_block_strides[i];
coords[i] = idx * m_block_dim_sizes[i];
sizes[i] =
numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
block_index -= idx * m_block_strides[i];
first_coeff_index += coords[i] * m_tensor_strides[i];
}
coords[NumDims - 1] = block_index * m_block_dim_sizes[NumDims - 1];
sizes[NumDims - 1] =
numext::mini((m_dimensions[NumDims - 1] - coords[NumDims - 1]),
m_block_dim_sizes[NumDims - 1]);
first_coeff_index +=
coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * sizes[i + 1];
}
}
}
return Block(first_coeff_index, sizes, strides, m_tensor_strides, data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
return m_total_block_count;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex
block_dims_total_size() const {
return m_block_dim_sizes.TotalSize();
}
private:
static Dimensions BlockDimensions(const Dimensions& tensor_dims,
const TensorBlockShapeType block_shape,
StorageIndex min_target_size) {
min_target_size = numext::maxi<StorageIndex>(1, min_target_size);
// If the tensor fully fits into the target size, we'll treat it as a
// single block.
Dimensions block_dim_sizes = tensor_dims;
if (tensor_dims.TotalSize() == 0) {
// Corner case: one of the dimensions is zero. Logic below is too complex
// to handle this case on a general basis, so just use unit block size.
// Note: we must not yield blocks with zero dimensions (recipe for
// overflows/underflows, divisions by zero and NaNs later).
for (int i = 0; i < NumDims; ++i) {
block_dim_sizes[i] = 1;
}
} else if (block_dim_sizes.TotalSize() > min_target_size) {
if (block_shape == kUniformAllDims) {
// The tensor will not fit within the 'min_target_size' budget: calculate
// tensor block dimension sizes based on a "square" dimension size target.
const StorageIndex dim_size_target = convert_index<StorageIndex>(
std::pow(static_cast<float>(min_target_size),
1.0f / static_cast<float>(block_dim_sizes.rank())));
for (Index i = 0; i < block_dim_sizes.rank(); ++i) {
// TODO(andydavis) Adjust the innermost 'block_dim_size' to make it
// a multiple of the packet size. Note that reducing
// 'block_dim_size' in this manner can increase the number of
// blocks, and so will amplify any per-block overhead.
block_dim_sizes[i] = numext::mini(dim_size_target, tensor_dims[i]);
}
// Add any un-allocated coefficients to inner dimension(s).
StorageIndex total_size = block_dim_sizes.TotalSize();
for (int i = 0; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
if (block_dim_sizes[dim] < tensor_dims[dim]) {
const StorageIndex total_size_other_dims =
total_size / block_dim_sizes[dim];
const StorageIndex alloc_avail =
divup<StorageIndex>(min_target_size, total_size_other_dims);
if (alloc_avail == block_dim_sizes[dim]) {
// Insufficient excess coefficients to allocate.
break;
}
block_dim_sizes[dim] = numext::mini(tensor_dims[dim], alloc_avail);
total_size = total_size_other_dims * block_dim_sizes[dim];
}
}
} else if (block_shape == kSkewedInnerDims) {
StorageIndex coeff_to_allocate = min_target_size;
for (int i = 0; i < NumDims; ++i) {
const int dim = cond<Layout>()(i, NumDims - i - 1);
block_dim_sizes[dim] =
numext::mini(coeff_to_allocate, tensor_dims[dim]);
coeff_to_allocate = divup(
coeff_to_allocate,
numext::maxi(static_cast<StorageIndex>(1), block_dim_sizes[dim]));
}
eigen_assert(coeff_to_allocate == 1);
} else {
eigen_assert(false); // someone added a new block shape type
}
}
eigen_assert(
block_dim_sizes.TotalSize() >=
numext::mini<Index>(min_target_size, tensor_dims.TotalSize()));
return block_dim_sizes;
}
Dimensions m_dimensions;
Dimensions m_block_dim_sizes;
Dimensions m_block_strides;
Dimensions m_tensor_strides;
StorageIndex m_total_block_count;
};
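// Illustrative sketch with hypothetical names: enumerating the blocks of a
// 100x100 float tensor with a target of roughly 100 coefficients per block;
// `buffer` must hold at least mapper.block_dims_total_size() scalars.
//
//   typedef TensorBlockMapper<float, Index, 2, ColMajor> BlockMapper;
//   BlockMapper mapper(BlockMapper::Dimensions(100, 100), kUniformAllDims,
//                      100);
//   for (Index b = 0; b < mapper.total_block_count(); ++b) {
//     BlockMapper::Block block = mapper.GetBlockForIndex(b, buffer);
//     // e.g. TensorBlockReader<float, Index, 2, ColMajor>::Run(&block, src);
//   }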
/**
* \class TensorSliceBlockMapper
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor slice block mapper class.
*
* This class is responsible for iterating over the blocks of
* a slice of a tensor. Supports shuffling of the block strides
* for callers that want to reduce strides for dimensions to be
* processed together.
*
*/
template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
class TensorSliceBlockMapper {
public:
typedef TensorBlock<Scalar, StorageIndex, NumDims, Layout> Block;
typedef DSizes<StorageIndex, NumDims> Dimensions;
TensorSliceBlockMapper(const Dimensions& tensor_dims,
const Dimensions& tensor_slice_offsets,
const Dimensions& tensor_slice_extents,
const Dimensions& block_dim_sizes,
const Dimensions& block_stride_order)
: m_tensor_dimensions(tensor_dims),
m_tensor_slice_offsets(tensor_slice_offsets),
m_tensor_slice_extents(tensor_slice_extents),
m_block_dim_sizes(block_dim_sizes),
m_block_stride_order(block_stride_order),
m_total_block_count(1) {
// Calculate block counts by dimension and total block count.
DSizes<StorageIndex, NumDims> block_count;
for (Index i = 0; i < block_count.rank(); ++i) {
block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
}
m_total_block_count = array_prod(block_count);
// Calculate block strides (used for enumerating blocks).
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_block_strides[0] = 1;
m_tensor_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_block_strides[i] = m_block_strides[i - 1] * block_count[i - 1];
m_tensor_strides[i] =
m_tensor_strides[i - 1] * m_tensor_dimensions[i - 1];
}
} else {
m_block_strides[NumDims - 1] = 1;
m_tensor_strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_block_strides[i] = m_block_strides[i + 1] * block_count[i + 1];
m_tensor_strides[i] =
m_tensor_strides[i + 1] * m_tensor_dimensions[i + 1];
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block
GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
StorageIndex first_coeff_index = 0;
DSizes<StorageIndex, NumDims> coords;
DSizes<StorageIndex, NumDims> sizes;
DSizes<StorageIndex, NumDims> strides;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = block_index / m_block_strides[i];
coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
sizes[i] = numext::mini(
m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
m_block_dim_sizes[i]);
block_index -= idx * m_block_strides[i];
first_coeff_index += coords[i] * m_tensor_strides[i];
}
coords[0] =
m_tensor_slice_offsets[0] + block_index * m_block_dim_sizes[0];
sizes[0] = numext::mini(
m_tensor_slice_offsets[0] + m_tensor_slice_extents[0] - coords[0],
m_block_dim_sizes[0]);
first_coeff_index += coords[0] * m_tensor_strides[0];
StorageIndex prev_dim = m_block_stride_order[0];
strides[prev_dim] = 1;
for (int i = 1; i < NumDims; ++i) {
const StorageIndex curr_dim = m_block_stride_order[i];
strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
prev_dim = curr_dim;
}
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const StorageIndex idx = block_index / m_block_strides[i];
coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
sizes[i] = numext::mini(
m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
m_block_dim_sizes[i]);
block_index -= idx * m_block_strides[i];
first_coeff_index += coords[i] * m_tensor_strides[i];
}
coords[NumDims - 1] = m_tensor_slice_offsets[NumDims - 1] +
block_index * m_block_dim_sizes[NumDims - 1];
sizes[NumDims - 1] = numext::mini(
m_tensor_slice_offsets[NumDims - 1] +
m_tensor_slice_extents[NumDims - 1] - coords[NumDims - 1],
m_block_dim_sizes[NumDims - 1]);
first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
StorageIndex prev_dim = m_block_stride_order[NumDims - 1];
strides[prev_dim] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
const StorageIndex curr_dim = m_block_stride_order[i];
strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
prev_dim = curr_dim;
}
}
return Block(first_coeff_index, sizes, strides, m_tensor_strides, data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
return m_total_block_count;
}
private:
Dimensions m_tensor_dimensions;
Dimensions m_tensor_slice_offsets;
Dimensions m_tensor_slice_extents;
Dimensions m_tensor_strides;
Dimensions m_block_dim_sizes;
Dimensions m_block_stride_order;
Dimensions m_block_strides;
StorageIndex m_total_block_count;
};
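// Illustrative sketch with hypothetical names: iterating over 10x10 blocks of
// the 20x30 slice that starts at offset (5, 5) of a 100x100 tensor, using the
// identity block stride order; `buffer` is a caller-owned scratch array.
//
//   typedef TensorSliceBlockMapper<float, Index, 2, ColMajor> SliceMapper;
//   SliceMapper mapper(SliceMapper::Dimensions(100, 100),
//                      SliceMapper::Dimensions(5, 5),
//                      SliceMapper::Dimensions(20, 30),
//                      SliceMapper::Dimensions(10, 10),
//                      SliceMapper::Dimensions(0, 1));
//   for (Index b = 0; b < mapper.total_block_count(); ++b) {
//     SliceMapper::Block block = mapper.GetBlockForIndex(b, buffer);
//     // ...
//   }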
} // namespace internal
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H